This non-Beautiful Soup test code exposes the problem. It only occurs when parsing Unicode data longer than 512 bytes with XMLParser. HTMLParser works fine. Bizarrely, a document 4096 bytes long passes the test, even though 2048-byte and 8192-byte documents fail.
---
from lxml.etree import XMLParser, HTMLParser
class TestTarget:
    """This target considers the test a success if it receives a start tag.

    An instance is handed to a parser via its ``target=`` argument; the
    ``success`` flag records whether ``start()`` was ever invoked.
    """

    def __init__(self):
        # Stays False until start() is called at least once.
        self.success = False

    def start(self, tag, attrib):
        """Record that a start tag was delivered; the arguments are ignored."""
        self.success = True
def document_of_size(size, as_unicode=False):
    """Return an XML document of the given size, either as string or Unicode.

    The document is a single ``<root>`` element padded with "0" characters
    so that the whole thing, wrapper tags included, is ``size`` characters
    long.  If ``size`` is smaller than the wrapper itself, the padding is
    empty and the bare ``<root></root>`` is returned.

    When ``as_unicode`` is true the result is passed through ``unicode()``
    (the Python 2 built-in) before being returned.
    """
    # Account for the wrapper tags so the total length matches the request.
    size -= len("<root></root>")
    doc = "<root>%s</root>" % ("0" * size)
    if as_unicode:
        doc = unicode(doc)
    return doc
def test_with_document_of_size(size, parser, as_unicode=False):
    """Create a document of the given size and see if feed() handles it.

    ``parser`` is a parser class (not an instance); it is instantiated with
    a fresh TestTarget as its ``target``.  The generated document is pushed
    through ``feed()`` followed by ``close()``.

    Returns one of three strings:
      * "Exception: <message>" if feed() or close() raised,
      * "Success" if the target saw a start tag,
      * "Failure" otherwise.
    """
    target = TestTarget()
    parser = parser(target=target)
    try:
        parser.feed(document_of_size(size, as_unicode))
        parser.close()
    except Exception as e:
        # Deliberately broad: any parser error is reported as the result
        # rather than aborting the whole run.
        return "Exception: %s" % e
    if target.success:
        return "Success"
    return "Failure"
# Try every combination of parser class, input type and document size
# (powers of two from 32 to 32768) and print one result line per combination.
for parser in (XMLParser, HTMLParser):
    for as_unicode in (True, False):
        for power in range(5, 16):
            size = 2 ** power
            result = test_with_document_of_size(size, parser, as_unicode)
            # "u" marks a Unicode input document, "s" a plain string one.
            if as_unicode:
                label = "u"
            else:
                label = "s"
            print("%.5d %s %s: %s" % (size, label, parser.__name__, result))
This non-Beautiful Soup test code exposes the problem. It only occurs when parsing Unicode data longer than 512 bytes with XMLParser. HTMLParser works fine. Bizarrely, a document 4096 bytes long passes the test, even though 2048-byte and 8192-byte documents fail.
---
from lxml.etree import XMLParser, HTMLParser
class TestTarget:
    """This target considers the test a success if it receives a start tag.

    An instance is handed to a parser via its ``target=`` argument; the
    ``success`` flag records whether ``start()`` was ever invoked.
    """

    def __init__(self):
        # Stays False until start() is called at least once.
        self.success = False

    def start(self, tag, attrib):
        """Record that a start tag was delivered; the arguments are ignored."""
        self.success = True
def document_of_size(size, as_unicode=False):
    """Return an XML document of the given size, either as string or Unicode.

    The document is a single ``<root>`` element padded with "0" characters
    so that the whole thing, wrapper tags included, is ``size`` characters
    long.  If ``size`` is smaller than the wrapper itself, the padding is
    empty and the bare ``<root></root>`` is returned.

    When ``as_unicode`` is true the result is passed through ``unicode()``
    (the Python 2 built-in) before being returned.
    """
    # Account for the wrapper tags so the total length matches the request.
    size -= len("<root></root>")
    doc = "<root>%s</root>" % ("0" * size)
    if as_unicode:
        doc = unicode(doc)
    return doc
def test_with_document_of_size(size, parser, as_unicode=False):
    """Create a document of the given size and see if feed() handles it.

    ``parser`` is a parser class (not an instance); it is instantiated with
    a fresh TestTarget as its ``target``.  The generated document is pushed
    through ``feed()`` followed by ``close()``.

    Returns one of three strings:
      * "Exception: <message>" if feed() or close() raised,
      * "Success" if the target saw a start tag,
      * "Failure" otherwise.
    """
    target = TestTarget()
    parser = parser(target=target)
    try:
        parser.feed(document_of_size(size, as_unicode))
        parser.close()
    except Exception as e:
        # Deliberately broad: any parser error is reported as the result
        # rather than aborting the whole run.
        return "Exception: %s" % e
    if target.success:
        return "Success"
    return "Failure"
# Try every combination of parser class, input type and document size
# (powers of two from 32 to 32768) and print one result line per combination.
for parser in (XMLParser, HTMLParser):
    for as_unicode in (True, False):
        for power in range(5, 16):
            size = 2 ** power
            result = test_with_document_of_size(size, parser, as_unicode)
            # "u" marks a Unicode input document, "s" a plain string one.
            if as_unicode:
                label = "u"
            else:
                label = "s"
            print("%.5d %s %s: %s" % (size, label, parser.__name__, result))