goulash.parsing module
goulash.parsing
""" goulash.parsing
"""
import re
from HTMLParser import HTMLParser
# Matches a run of one or more delimiter characters: anything that is not a
# word character (\W) plus the underscore.  Written as a raw string so the
# backslash reaches the regex engine instead of relying on an invalid
# string escape.
R_SPLIT_DELIM = re.compile(r'[\W_]+')
class MLStripper(HTMLParser):
    """Collect the text content of an HTML document, discarding the markup.

    Utility to strip HTML which only requires the Python stdlib.  Taken from:
    http://stackoverflow.com/questions/753052/strip-html-from-strings-in-python
    """

    def __init__(self):
        # Run the full base-class initialiser instead of only reset(): newer
        # Python versions set parser attributes (e.g. convert_charrefs) in
        # HTMLParser.__init__ that feed() relies on.  The base class is
        # called explicitly rather than via super() so this also works if
        # HTMLParser is an old-style class.
        HTMLParser.__init__(self)
        # Text fragments seen so far, in document order.
        self.fed = []

    def handle_data(self, d):
        """Record a run of text found outside of tags."""
        self.fed.append(d)

    def get_data(self):
        """Return all text collected so far as a single string."""
        return ''.join(self.fed)
def strip_tags(html):
    """ return the text content of `html` with the markup removed """
    stripper = MLStripper()
    stripper.feed(html)
    text = stripper.get_data()
    return text
def smart_split(x):
    """ break `x` apart on runs of delimiters, dropping empty pieces """
    tokens = []
    for piece in R_SPLIT_DELIM.split(x):
        # leading/trailing delimiters produce empty strings; skip those
        if piece:
            tokens.append(piece)
    return tokens
def sanitize_txt(x):
    """ lowercase `x` and join its pieces with underscores (for hrefs, etc) """
    lowered = x.lower()
    words = smart_split(lowered)
    return '_'.join(words)
Module variables
var R_SPLIT_DELIM
Functions
def sanitize_txt(
x)
make text suitable for href linking, etc
def sanitize_txt(x):
    """ lowercase `x` and join its pieces with underscores (for hrefs, etc) """
    lowered = x.lower()
    words = smart_split(lowered)
    return '_'.join(words)
def smart_split(
x)
splits on most delims
def smart_split(x):
    """ break `x` apart on runs of delimiters, dropping empty pieces """
    tokens = []
    for piece in R_SPLIT_DELIM.split(x):
        # leading/trailing delimiters produce empty strings; skip those
        if piece:
            tokens.append(piece)
    return tokens
def strip_tags(
html)
Classes
class MLStripper
class MLStripper(HTMLParser):
    """Collect the text content of an HTML document, discarding the markup.

    Utility to strip HTML which only requires the Python stdlib.  Taken from:
    http://stackoverflow.com/questions/753052/strip-html-from-strings-in-python
    """

    def __init__(self):
        # Run the full base-class initialiser instead of only reset(): newer
        # Python versions set parser attributes (e.g. convert_charrefs) in
        # HTMLParser.__init__ that feed() relies on.  The base class is
        # called explicitly rather than via super() so this also works if
        # HTMLParser is an old-style class.
        HTMLParser.__init__(self)
        # Text fragments seen so far, in document order.
        self.fed = []

    def handle_data(self, d):
        """Record a run of text found outside of tags."""
        self.fed.append(d)

    def get_data(self):
        """Return all text collected so far as a single string."""
        return ''.join(self.fed)
Ancestors (in MRO)
- MLStripper
- HTMLParser.HTMLParser
- markupbase.ParserBase
Class variables
var CDATA_CONTENT_ELEMENTS
var entitydefs
Instance variables
var fed
Methods
def __init__(
self)
def __init__(self):
    # Run the full base-class initialiser instead of only reset(): newer
    # Python versions set parser attributes (e.g. convert_charrefs) in
    # HTMLParser.__init__ that feed() relies on.  The base class is called
    # explicitly rather than via super() so this also works if HTMLParser
    # is an old-style class.
    HTMLParser.__init__(self)
    # Text fragments seen so far, in document order.
    self.fed = []
def check_for_whole_start_tag(
self, i)
def check_for_whole_start_tag(self, i):
    """Work out where the start tag at offset `i` of self.rawdata ends.

    Returns the index just past a closing '>' or '/>', -1 when the buffered
    data stops before the tag can be completed, or otherwise the offset at
    which matching stopped (always greater than `i`).  Raises AssertionError
    if `locatestarttagend` does not match at `i`.
    """
    rawdata = self.rawdata
    # NOTE(review): locatestarttagend is defined outside this listing;
    # presumably a compiled regex covering the tag name and attributes.
    m = locatestarttagend.match(rawdata, i)
    if m:
        j = m.end()
        # The single character following the match ("" at end of buffer).
        next = rawdata[j:j+1]
        if next == ">":
            return j + 1
        if next == "/":
            if rawdata.startswith("/>", j):
                return j + 2
            # NOTE(review): next == "/" already implies this check succeeds,
            # so the "bogus input" lines below look unreachable -- confirm.
            if rawdata.startswith("/", j):
                # buffer boundary
                return -1
            # else bogus input
            self.updatepos(i, j + 1)
            self.error("malformed empty start tag")
        if next == "":
            # end of input
            return -1
        # Substring test against one long string: true when `next` is an
        # ASCII letter, '=' or '/'.
        if next in ("abcdefghijklmnopqrstuvwxyz=/"
                    "ABCDEFGHIJKLMNOPQRSTUVWXYZ"):
            # end of input in or before attribute value, or we have the
            # '/' from a '/>' ending
            return -1
        # Any other character: report where matching stopped, always moving
        # at least one character past `i`.
        if j > i:
            return j
        else:
            return i + 1
    raise AssertionError("we should not get here!")
def clear_cdata_mode(
self)
def clear_cdata_mode(self):
    """Leave CDATA mode: forget the CDATA element and restore normal scanning."""
    # The two assignments are independent of each other.
    self.cdata_elem = None
    self.interesting = interesting_normal
def close(
self)
Handle any buffered data.
def close(self):
    """Process whatever data is still buffered."""
    # A true `end` argument is passed to goahead() for this final call.
    final_pass = 1
    self.goahead(final_pass)
def error(
self, message)
def error(self, message):
    """Raise HTMLParseError carrying `message` and the current position."""
    position = self.getpos()
    raise HTMLParseError(message, position)
def feed(
self, data)
Feed data to the parser.
Call this as often as you want, with as little or as much text as you want (may include '\n').
def feed(self, data):
    r"""Append `data` to the parser's buffer and process what is available.

    May be called repeatedly with chunks of any size; the text may
    contain '\n'.
    """
    buffered = self.rawdata + data
    self.rawdata = buffered
    # 0: more input may still follow, so this is not the final pass.
    self.goahead(0)
def get_data(
self)
def get_data(self):
    """Return every collected text fragment concatenated into one string."""
    fragments = self.fed
    return ''.join(fragments)
def get_starttag_text(
self)
Return full source of start tag: '<...>'.
def get_starttag_text(self):
    """Return full source of start tag: '<...>'."""
    # NOTE(review): the double-underscore attribute is subject to Python
    # name mangling, so it resolves relative to the class whose body holds
    # this method -- presumably HTMLParser (listed as an ancestor), not
    # MLStripper; confirm against the stdlib source.
    return self.__starttag_text
def getpos(
self)
Return current line number and offset.
def getpos(self):
    """Report the parser's position as a (line number, offset) tuple."""
    line, column = self.lineno, self.offset
    return (line, column)
def goahead(
self, end)
def goahead(self, end):
rawdata = self.rawdata
i = 0
n = len(rawdata)
while i < n:
match = self.interesting.search(rawdata, i) # < or &
if match:
j = match.start()
else:
if self.cdata_elem:
break
j = n
if i < j: self.handle_data(rawdata[i:j])
i = self.updatepos(i, j)
if i == n: break
startswith = rawdata.startswith
if startswith('<', i):
if starttagopen.match(rawdata, i): # < + letter
k = self.parse_starttag(i)
elif startswith("", i):
k = self.parse_endtag(i)
elif startswith("