Index: utils.py
===================================================================
--- utils.py (revision 5620)
+++ utils.py (revision 5621)
@@ -20,7 +20,15 @@
 
 
 xml_preamble_reg = re.compile(r'^<\?xml.*?encoding="(.*?)".*?\?>', re.M)
-http_equiv_reg = re.compile(r'(<meta.*?http\-equiv.*?content-type.*?>)', re.I|re.M|re.S)
+# This regular expression is defined extremely carelessly. It starts
+# with a tag beginning with 'meta' and extends until an arbitrary
+# 'content-type' (maybe in a completely unrelated element).
+# Tighten the expression a bit.
+# Note that using a regular expression at all is unreliable as it does
+# not know about e.g. HTML comments. A robust solution would need to
+# use an HTML parser to locate the 'meta' tag.
+#http_equiv_reg = re.compile(r'(<meta.*?http\-equiv.*?content-type.*?>)', re.I|re.M|re.S)
+http_equiv_reg = re.compile(r'(<meta[^>]*?http\-equiv[^>]*?content-type.*?>)', re.I|re.M|re.S)
 http_equiv_reg2 = re.compile(r'charset.*?=.*?(?P<charset>[\w\-]*)', re.I|re.M|re.S)
 
 def encodingFromXMLPreamble(xml):