55,59c55,83
< def unescape(s):
<     p = htmllib.HTMLParser(None)
<     p.save_bgn()
<     p.feed(s)
<     return p.save_end()
---
>
> import re, htmlentitydefs
>
> ##
> # Removes HTML or XML character references and entities from a text string.
> #
> # @param text The HTML (or XML) source text.
> # @return The plain text, as a Unicode string, if necessary.
>
> def unescape(text):
>     def fixup(m):
>         text = m.group(0)
>         if text[:2] == "&#":
>             # character reference
>             try:
>                 if text[:3] == "&#x":
>                     return unichr(int(text[3:-1], 16))
>                 else:
>                     return unichr(int(text[2:-1]))
>             except ValueError:
>                 pass
>         else:
>             # named entity
>             try:
>                 text = unichr(htmlentitydefs.name2codepoint[text[1:-1]])
>             except KeyError:
>                 pass
>         return text # leave as is
>     return re.sub("&#?\w+;", fixup, text)