goulash.parsing module

goulash.parsing

""" goulash.parsing
"""
import re
from HTMLParser import HTMLParser

# Delimiter pattern: one or more characters that are either non-word
# characters or underscores.  Written as a raw string so that `\W` reaches
# the regex engine untouched instead of being an invalid string escape.
R_SPLIT_DELIM = re.compile(r'[\W_]+')

class MLStripper(HTMLParser):
    """ utility to strip html which only requires the python stdlib.

    taken from:
      http://stackoverflow.com/questions/753052/strip-html-from-strings-in-python

    Feed markup with ``feed()``; the text chunks handed to ``handle_data``
    are collected in ``self.fed`` and returned joined by ``get_data()``.
    """

    def __init__(self):
        # Run the base initializer rather than calling reset() directly, so
        # that any extra state the base class sets up in its own __init__
        # is present.  An explicit base-class call is used (not super())
        # so this also works where HTMLParser is an old-style class.
        HTMLParser.__init__(self)
        # Text chunks received so far, in the order they were seen.
        self.fed = []

    def handle_data(self, d):
        """ record one chunk of text data handed over by the parser """
        self.fed.append(d)

    def get_data(self):
        """ return every chunk collected so far joined into one string """
        return ''.join(self.fed)

def strip_tags(html):
    """ return the text data of `html` as collected by MLStripper """
    stripper = MLStripper()
    stripper.feed(html)
    text = stripper.get_data()
    return text

def smart_split(x):
    """ split `x` on every match of R_SPLIT_DELIM, dropping empty pieces """
    pieces = R_SPLIT_DELIM.split(x)
    return list(filter(None, pieces))

def sanitize_txt(x):
    """ lowercase `x`, split it with smart_split and rejoin the pieces
        with underscores (text suitable for href linking, etc) """
    lowered = x.lower()
    words = smart_split(lowered)
    return '_'.join(words)

Module variables

var R_SPLIT_DELIM

Functions

def sanitize_txt(

x)

make text suitable for href linking, etc

Show source ≡

def sanitize_txt(x):
    """ lowercase `x`, split it with smart_split and rejoin the pieces
        with underscores (text suitable for href linking, etc) """
    lowered = x.lower()
    words = smart_split(lowered)
    return '_'.join(words)

def smart_split(

x)

splits on most delims

Show source ≡

def smart_split(x):
    """ split `x` on every match of R_SPLIT_DELIM, dropping empty pieces """
    pieces = R_SPLIT_DELIM.split(x)
    return list(filter(None, pieces))

def strip_tags(

html)

Show source ≡

def strip_tags(html):
    """ return the text data of `html` as collected by MLStripper """
    stripper = MLStripper()
    stripper.feed(html)
    text = stripper.get_data()
    return text

Classes

class MLStripper

Show source ≡

class MLStripper(HTMLParser):
    """ utility to strip html which only requires the python stdlib.

    taken from:
      http://stackoverflow.com/questions/753052/strip-html-from-strings-in-python

    Feed markup with ``feed()``; the text chunks handed to ``handle_data``
    are collected in ``self.fed`` and returned joined by ``get_data()``.
    """

    def __init__(self):
        # Run the base initializer rather than calling reset() directly, so
        # that any extra state the base class sets up in its own __init__
        # is present.  An explicit base-class call is used (not super())
        # so this also works where HTMLParser is an old-style class.
        HTMLParser.__init__(self)
        # Text chunks received so far, in the order they were seen.
        self.fed = []

    def handle_data(self, d):
        """ record one chunk of text data handed over by the parser """
        self.fed.append(d)

    def get_data(self):
        """ return every chunk collected so far joined into one string """
        return ''.join(self.fed)

Ancestors (in MRO)

MLStripper
HTMLParser.HTMLParser
markupbase.ParserBase

Class variables

var CDATA_CONTENT_ELEMENTS

var entitydefs

Instance variables

var fed

Methods

def __init__(

self)

Show source ≡

def __init__(self):
    """ initialise the parser state and start with no collected text """
    # Run the base initializer rather than calling reset() directly, so
    # that any extra state the base class sets up in its own __init__ is
    # present.  Explicit base-class call (not super()) so this also works
    # where HTMLParser is an old-style class.
    HTMLParser.__init__(self)
    # Text chunks received so far, in the order they were seen.
    self.fed = []

def check_for_whole_start_tag(

self, i)

Show source ≡

def check_for_whole_start_tag(self, i):
    """Find where the start tag beginning at index i of self.rawdata ends.

    Matches ``locatestarttagend`` at ``i`` and inspects the single
    character following the match:

    * ``>``  -> returns the index just past it.
    * ``/>`` -> returns the index just past both characters.
    * nothing left, a lone ``/``, a letter or ``=`` -> returns -1.
    * anything else -> returns the match end, or ``i + 1`` when the match
      was empty.

    Raises AssertionError when ``locatestarttagend`` does not match at
    ``i`` at all.
    """
    rawdata = self.rawdata
    m = locatestarttagend.match(rawdata, i)
    if m:
        j = m.end()
        # One-character slice: empty string when j is at the end of the
        # buffer.  NOTE: this local shadows the builtin ``next``.
        next = rawdata[j:j+1]
        if next == ">":
            return j + 1
        if next == "/":
            if rawdata.startswith("/>", j):
                return j + 2
            if rawdata.startswith("/", j):
                # buffer boundary
                return -1
            # NOTE(review): next == "/" already implies the startswith("/")
            # test above is true, so the two lines below look unreachable.
            # else bogus input
            self.updatepos(i, j + 1)
            self.error("malformed empty start tag")
        if next == "":
            # end of input
            return -1
        # The parenthesised literals concatenate into a single string, so
        # this is a "is this character one of these" test.
        if next in ("abcdefghijklmnopqrstuvwxyz=/"
                    "ABCDEFGHIJKLMNOPQRSTUVWXYZ"):
            # end of input in or before attribute value, or we have the
            # '/' from a '/>' ending
            return -1
        # Always report a position past i, even when the match was empty.
        if j > i:
            return j
        else:
            return i + 1
    # Reached only when locatestarttagend failed to match at i.
    raise AssertionError("we should not get here!")

def clear_cdata_mode(

self)

Show source ≡

def clear_cdata_mode(self):
    """Point ``interesting`` back at ``interesting_normal`` and clear ``cdata_elem``."""
    self.interesting, self.cdata_elem = interesting_normal, None

def close(

self)

Handle any buffered data.

Show source ≡

def close(self):
    """Process any data still buffered by calling ``goahead`` with end=1."""
    run = self.goahead
    run(1)

def error(

self, message)

Show source ≡

def error(self, message):
    """Raise HTMLParseError built from ``message`` and the result of ``getpos()``."""
    position = self.getpos()
    raise HTMLParseError(message, position)

def feed(

self, data)

Feed data to the parser.

Call this as often as you want, with as little or as much text as you want (may include '\n').

Show source ≡

def feed(self, data):
    r"""Append ``data`` to the pending buffer and parse what is available.

    May be called repeatedly with pieces of any size (the text may
    include '\n'); each call runs ``goahead`` with end=0.
    """
    self.rawdata += data
    self.goahead(0)

def get_data(

self)

Show source ≡

def get_data(self):
    """ return every chunk in ``self.fed`` joined into one string """
    chunks = self.fed
    return ''.join(chunks)

def get_starttag_text(

self)

Return full source of start tag: '<...>'.

Show source ≡

def get_starttag_text(self):
    """Return full source of start tag: '<...>'."""
    # NOTE: a double-underscore attribute is name-mangled when this method
    # is defined inside a class body, so the attribute actually read
    # depends on the enclosing class; keep the spelling exactly as is.
    return self.__starttag_text

def getpos(

self)

Return current line number and offset.

Show source ≡

def getpos(self):
    """Give the current position as a ``(lineno, offset)`` tuple."""
    line, off = self.lineno, self.offset
    return line, off

def goahead(

self, end)

Show source ≡

def goahead(self, end):
    rawdata = self.rawdata
    i = 0
    n = len(rawdata)
    while i < n:
        match = self.interesting.search(rawdata, i) # < or &
        if match:
            j = match.start()
        else:
            if self.cdata_elem:
                break
            j = n
        if i < j: self.handle_data(rawdata[i:j])
        i = self.updatepos(i, j)
        if i == n: break
        startswith = rawdata.startswith
        if startswith('<', i):
            if starttagopen.match(rawdata, i): # < + letter
                k = self.parse_starttag(i)
            elif startswith("', i + 1)
                if k < 0:
                    k = rawdata.find('<', i + 1)
                    if k < 0:
                        k = i + 1
                else:
                    k += 1
                self.handle_data(rawdata[i:k])
            i = self.updatepos(i, k)
        elif startswith("&#", i):
            match = charref.match(rawdata, i)
            if match:
                name = match.group()[2:-1]
                self.handle_charref(name)
                k = match.end()
                if not startswith(';', k-1):
                    k = k - 1
                i = self.updatepos(i, k)
                continue
            else:
                if ";" in rawdata[i:]:  # bail by consuming '&#'
                    self.handle_data(rawdata[i:i+2])
                    i = self.updatepos(i, i+2)
                break
        elif startswith('&', i):
            match = entityref.match(rawdata, i)
            if match:
                name = match.group(1)
                self.handle_entityref(name)
                k = match.end()
                if not startswith(';', k-1):
                    k = k - 1
                i = self.updatepos(i, k)
                continue
            match = incomplete.match(rawdata, i)
            if match:
                # match.group() will contain at least 2 chars
                if end and match.group() == rawdata[i:]:
                    self.error("EOF in middle of entity or char ref")
                # incomplete
                break
            elif (i + 1) < n:
                # not the end of the buffer, and can't be confused
                # with some other construct
                self.handle_data("&")
                i = self.updatepos(i, i + 1)
            else:
                break
        else:
            assert 0, "interesting.search() lied"
    # end while
    if end and i < n and not self.cdata_elem:
        self.handle_data(rawdata[i:n])
        i = self.updatepos(i, n)
    self.rawdata = rawdata[i:]

def handle_charref(

self, name)

Show source ≡

def handle_charref(self, name):
    """Default character-reference handler: ignores ``name`` and does nothing."""
    return None

def handle_comment(

self, data)

Show source ≡

def handle_comment(self, data):
    """Default comment handler: ignores ``data`` and does nothing."""
    return None

def handle_data(

self, d)

Show source ≡

def handle_data(self, d):
    """ record one chunk of text data by appending it to ``self.fed`` """
    chunks = self.fed
    chunks.append(d)

def handle_decl(

self, decl)

Show source ≡

def handle_decl(self, decl):
    """Default declaration handler: ignores ``decl`` and does nothing."""
    return None

def handle_endtag(

self, tag)

Show source ≡

def handle_endtag(self, tag):
    """Default end-tag handler: ignores ``tag`` and does nothing."""
    return None

def handle_entityref(

self, name)

Show source ≡

def handle_entityref(self, name):
    """Default entity-reference handler: ignores ``name`` and does nothing."""
    return None

def handle_pi(

self, data)

Show source ≡

def handle_pi(self, data):
    """Default processing-instruction handler: ignores ``data`` and does nothing."""
    return None

def handle_startendtag(

self, tag, attrs)

Show source ≡

def handle_startendtag(self, tag, attrs):
    """Forward to ``handle_starttag(tag, attrs)`` and then ``handle_endtag(tag)``."""
    on_start = self.handle_starttag
    on_start(tag, attrs)
    on_end = self.handle_endtag
    on_end(tag)

def handle_starttag(

self, tag, attrs)

Show source ≡

def handle_starttag(self, tag, attrs):
    """Default start-tag handler: ignores ``tag`` and ``attrs`` and does nothing."""
    return None

def parse_bogus_comment(

self, i, report=1)

Show source ≡

def parse_bogus_comment(self, i, report=1):
    rawdata = self.rawdata
    if rawdata[i:i+2] not in ('', i+2)
    if pos == -1:
        return -1
    if report:
        self.handle_comment(rawdata[i+2:pos])
    return pos + 1

def parse_comment(

self, i, report=1)

Show source ≡

def parse_comment(self, i, report=1):
    rawdata = self.rawdata
    if rawdata[i:i+4] != '

Index

Module variables

Functions

Classes

Module variables

Functions

Classes

Ancestors (in MRO)

Class variables

Instance variables

Methods