Top

goulash.parsing module

goulash.parsing

""" goulash.parsing
"""
import re
from HTMLParser import HTMLParser

# Matches one or more consecutive characters that are either non-word
# characters or underscores.  Written as a raw string so the regex escape \W
# reaches the re module untouched instead of being an invalid string escape.
R_SPLIT_DELIM = re.compile(r'[\W_]+')

class MLStripper(HTMLParser):
    """Collect only the text content of an HTML document.

    Utility to strip html which only requires the python stdlib.  Taken from:
      http://stackoverflow.com/questions/753052/strip-html-from-strings-in-python

    Only ``handle_data`` is overridden, so text chunks are recorded while
    every other parser callback keeps its inherited behaviour.
    """

    def __init__(self):
        # Run the base initializer rather than calling reset() directly, so
        # any state the parser sets up in __init__ exists before feed() runs.
        # Called explicitly (not via super()) because the Python 2 HTMLParser
        # is an old-style class.
        HTMLParser.__init__(self)
        # Text chunks seen so far, in document order.
        self.fed = []

    def handle_data(self, d):
        """ record one chunk of text found between tags """
        self.fed.append(d)

    def get_data(self):
        """ return all recorded text chunks joined into one string """
        return ''.join(self.fed)

def strip_tags(html):
    """ return the text of `html` as collected by an MLStripper """
    stripper = MLStripper()
    stripper.feed(html)
    text = stripper.get_data()
    return text

def smart_split(x):
    """ break `x` apart at every run of delimiters, dropping empty pieces """
    pieces = R_SPLIT_DELIM.split(x)
    # filter(None, ...) keeps only the truthy (non-empty) pieces
    return list(filter(None, pieces))

def sanitize_txt(x):
    """ lowercase `x` and join its delimiter-separated pieces with '_',
        making the text suitable for href linking, etc
    """
    words = smart_split(x.lower())
    return '_'.join(words)

Module variables

var R_SPLIT_DELIM

Functions

def sanitize_txt(

x)

Make text suitable for href linking and similar uses: lowercase it and join its delimiter-separated pieces with underscores.

def sanitize_txt(x):
    """ lowercase `x` and join its delimiter-separated pieces with '_',
        making the text suitable for href linking, etc
    """
    words = smart_split(x.lower())
    return '_'.join(words)

def smart_split(

x)

Split a string on runs of delimiter characters (anything matching `[\W_]+`) and drop the empty pieces.

def smart_split(x):
    """ break `x` apart at every run of delimiters, dropping empty pieces """
    pieces = R_SPLIT_DELIM.split(x)
    # filter(None, ...) keeps only the truthy (non-empty) pieces
    return list(filter(None, pieces))

def strip_tags(

html)

def strip_tags(html):
    """ return the text of `html` as collected by an MLStripper """
    stripper = MLStripper()
    stripper.feed(html)
    text = stripper.get_data()
    return text

Classes

class MLStripper

class MLStripper(HTMLParser):
    """Collect only the text content of an HTML document.

    Utility to strip html which only requires the python stdlib.  Taken from:
      http://stackoverflow.com/questions/753052/strip-html-from-strings-in-python

    Only ``handle_data`` is overridden, so text chunks are recorded while
    every other parser callback keeps its inherited behaviour.
    """

    def __init__(self):
        # Run the base initializer rather than calling reset() directly, so
        # any state the parser sets up in __init__ exists before feed() runs.
        # Called explicitly (not via super()) because the Python 2 HTMLParser
        # is an old-style class.
        HTMLParser.__init__(self)
        # Text chunks seen so far, in document order.
        self.fed = []

    def handle_data(self, d):
        """ record one chunk of text found between tags """
        self.fed.append(d)

    def get_data(self):
        """ return all recorded text chunks joined into one string """
        return ''.join(self.fed)

Ancestors (in MRO)

  • MLStripper
  • HTMLParser.HTMLParser
  • markupbase.ParserBase

Class variables

var CDATA_CONTENT_ELEMENTS

var entitydefs

Instance variables

var fed

Methods

def __init__(

self)

def __init__(self):
    """ initialize the underlying parser and start with no collected text """
    # Run the base initializer rather than calling reset() directly, so any
    # state the parser sets up in __init__ exists before feed() runs.
    # Called explicitly (not via super()) because the Python 2 HTMLParser is
    # an old-style class.
    HTMLParser.__init__(self)
    # Text chunks seen so far, in document order.
    self.fed = []

def check_for_whole_start_tag(

self, i)

def check_for_whole_start_tag(self, i):
    """Locate the end of the start tag that begins at index i of the buffer.

    Returns the index just past the closing '>' (or '/>'), or -1 when the
    buffered data ends before the tag is complete.  If the character after
    the matched portion is none of the recognised cases, returns the end of
    the match when it lies beyond i, otherwise i + 1.  A '/' that is followed
    by something other than '>' is reported through self.error().
    """
    rawdata = self.rawdata
    found = locatestarttagend.match(rawdata, i)
    if not found:
        raise AssertionError("we should not get here!")

    j = found.end()
    following = rawdata[j:j+1]  # the character right after the match, or ""
    if following == ">":
        return j + 1
    if following == "/":
        if rawdata.startswith("/>", j):
            return j + 2
        if rawdata.startswith("/", j):
            # buffer boundary
            return -1
        # else bogus input
        self.updatepos(i, j + 1)
        self.error("malformed empty start tag")
    if following == "":
        # end of input
        return -1
    attribute_chars = ("abcdefghijklmnopqrstuvwxyz=/"
                       "ABCDEFGHIJKLMNOPQRSTUVWXYZ")
    if following in attribute_chars:
        # end of input in or before attribute value, or we have the
        # '/' from a '/>' ending
        return -1
    return j if j > i else i + 1

def clear_cdata_mode(

self)

def clear_cdata_mode(self):
    """Switch back to the normal 'interesting' pattern and forget the CDATA element."""
    self.interesting, self.cdata_elem = interesting_normal, None

def close(

self)

Handle any buffered data.

def close(self):
    """Process whatever input is still buffered by running a final parse pass."""
    final_pass = 1  # truthy `end` flag for goahead()
    self.goahead(final_pass)

def error(

self, message)

def error(self, message):
    """Raise HTMLParseError carrying message and the current parse position."""
    raise HTMLParseError(
        message,
        self.getpos(),
    )

def feed(

self, data)

Feed data to the parser.

Call this as often as you want, with as little or as much text as you want (may include '\n').

def feed(self, data):
    r"""Append data to the parser's buffer and parse what is available.

    May be called any number of times, with chunks of any size
    (the text may include '\n').
    """
    pending = self.rawdata
    self.rawdata = pending + data
    self.goahead(0)  # 0: not a final pass, unlike close()

def get_data(

self)

def get_data(self):
    """ concatenate every recorded text chunk into one string """
    collected = self.fed
    return ''.join(collected)

def get_starttag_text(

self)

Return full source of start tag: '<...>'.

def get_starttag_text(self):
    """Return full source of start tag: '<...>'."""
    # Reads a double-underscore attribute; inside a class body Python
    # name-mangles such names with the defining class's name.
    # NOTE(review): the code that assigns this attribute is not part of this
    # listing -- presumably the start-tag parser sets it; confirm there.
    return self.__starttag_text

def getpos(

self)

Return current line number and offset.

def getpos(self):
    """Give the parser's current position as a (lineno, offset) tuple."""
    line, column = self.lineno, self.offset
    return line, column

def goahead(

self, end)

def goahead(self, end):
    """Parse as much of the buffered input as possible.

    Walks self.rawdata, passing plain text to handle_data() and dispatching
    markup and character/entity references to the matching parse_* and
    handle_* methods.  Input that cannot be parsed yet stays in self.rawdata
    for the next call.

    end is true when this is the final pass (close() passes 1, feed()
    passes 0).  On the final pass, markup that cannot be completed is
    flushed through handle_data(), and an entity or character reference cut
    off at the very end of the input is reported through self.error().
    """
    rawdata = self.rawdata
    i = 0
    n = len(rawdata)
    while i < n:
        match = self.interesting.search(rawdata, i) # < or &
        if match:
            j = match.start()
        else:
            if self.cdata_elem:
                break
            j = n
        # Everything before the next interesting character is plain data.
        if i < j: self.handle_data(rawdata[i:j])
        i = self.updatepos(i, j)
        if i == n: break
        startswith = rawdata.startswith
        if startswith('<', i):
            # NOTE(review): this branch chain was garbled in the extracted
            # listing and is restored from the CPython 2.7 HTMLParser
            # source -- verify against the installed version.
            if starttagopen.match(rawdata, i): # < + letter
                k = self.parse_starttag(i)
            elif startswith("</", i):
                k = self.parse_endtag(i)
            elif startswith("<!--", i):
                k = self.parse_comment(i)
            elif startswith("<?", i):
                k = self.parse_pi(i)
            elif startswith("<!", i):
                k = self.parse_html_declaration(i)
            elif (i + 1) < n:
                self.handle_data("<")
                k = i + 1
            else:
                break
            if k < 0:
                # The construct is incomplete: wait for more input unless
                # this is the final pass, in which case emit it as data.
                if not end:
                    break
                k = rawdata.find('>', i + 1)
                if k < 0:
                    k = rawdata.find('<', i + 1)
                    if k < 0:
                        k = i + 1
                else:
                    k += 1
                self.handle_data(rawdata[i:k])
            i = self.updatepos(i, k)
        elif startswith("&#", i):
            match = charref.match(rawdata, i)
            if match:
                # Strip the leading '&#' and the trailing terminator.
                name = match.group()[2:-1]
                self.handle_charref(name)
                k = match.end()
                # The terminator is only consumed when it is a ';'.
                if not startswith(';', k-1):
                    k = k - 1
                i = self.updatepos(i, k)
                continue
            else:
                if ";" in rawdata[i:]:  # bail by consuming '&#'
                    self.handle_data(rawdata[i:i+2])
                    i = self.updatepos(i, i+2)
                break
        elif startswith('&', i):
            match = entityref.match(rawdata, i)
            if match:
                name = match.group(1)
                self.handle_entityref(name)
                k = match.end()
                # The terminator is only consumed when it is a ';'.
                if not startswith(';', k-1):
                    k = k - 1
                i = self.updatepos(i, k)
                continue
            match = incomplete.match(rawdata, i)
            if match:
                # match.group() will contain at least 2 chars
                if end and match.group() == rawdata[i:]:
                    self.error("EOF in middle of entity or char ref")
                # incomplete
                break
            elif (i + 1) < n:
                # not the end of the buffer, and can't be confused
                # with some other construct
                self.handle_data("&")
                i = self.updatepos(i, i + 1)
            else:
                break
        else:
            assert 0, "interesting.search() lied"
    # end while
    if end and i < n and not self.cdata_elem:
        self.handle_data(rawdata[i:n])
        i = self.updatepos(i, n)
    # Keep only the unparsed tail for the next call.
    self.rawdata = rawdata[i:]

def handle_charref(

self, name)

def handle_charref(self, name):
    """Ignore a numeric character reference; a subclass may override this.

    goahead() passes the matched reference text with its leading '&#' and
    its final character removed.
    """
    return None

def handle_comment(

self, data)

def handle_comment(self, data):
    """Ignore comment text; a subclass may override this to process it."""
    return None

def handle_data(

self, d)

def handle_data(self, d):
    """ record one chunk of text so get_data() can join it later """
    chunks = self.fed
    chunks.append(d)

def handle_decl(

self, decl)

def handle_decl(self, decl):
    """Ignore a declaration; a subclass may override this to process it."""
    return None

def handle_endtag(

self, tag)

def handle_endtag(self, tag):
    """Ignore an end tag; a subclass may override this to process it."""
    return None

def handle_entityref(

self, name)

def handle_entityref(self, name):
    """Ignore a named entity reference; a subclass may override this.

    goahead() passes group 1 of the entity-reference match as name.
    """
    return None

def handle_pi(

self, data)

def handle_pi(self, data):
    """Ignore a processing instruction; a subclass may override this."""
    return None

def handle_startendtag(

self, tag, attrs)

def handle_startendtag(self, tag, attrs):
    """Treat a combined start/end tag as a start tag followed by an end tag."""
    # Report the opening half first, with its attributes ...
    self.handle_starttag(tag, attrs)
    # ... then the closing half, which carries only the tag name.
    self.handle_endtag(tag)

def handle_starttag(

self, tag, attrs)

def handle_starttag(self, tag, attrs):
    """Ignore a start tag and its attributes; a subclass may override this."""
    return None

def parse_bogus_comment(

self, i, report=1)

def parse_bogus_comment(self, i, report=1):
    """Consume a bogus comment that starts at index i of the buffer.

    The construct must begin with '<!' or '</'; anything else is reported
    through self.error().  Returns the index just past the closing '>', or
    -1 when no '>' has been buffered yet.  When report is true, the text
    between the two-character opener and the '>' is passed to
    handle_comment().
    """
    rawdata = self.rawdata
    # NOTE(review): the guard, the error() call and the find() below were
    # garbled in the extracted listing and are restored from the CPython 2.7
    # HTMLParser source -- verify against the installed version.
    if rawdata[i:i+2] not in ('<!', '</'):
        self.error('unexpected call to parse_comment()')
    pos = rawdata.find('>', i+2)
    if pos == -1:
        return -1
    if report:
        self.handle_comment(rawdata[i+2:pos])
    return pos + 1

def parse_comment(

self, i, report=1)

def parse_comment(self, i, report=1):
    rawdata = self.rawdata
    if rawdata[i:i+4] != '