Comment 1 for bug 963880

Revision history for this message
Leonard Richardson (leonardr) wrote :

This non-Beautiful Soup test code exposes the problem. It only occurs when parsing Unicode data longer than 512 bytes with XMLParser. HTMLParser works fine. Bizarrely, a document 4096 bytes long passes the test, even though 2048-byte and 8192-byte documents fail.

---

from lxml.etree import XMLParser, HTMLParser

class TestTarget:
    """Parser target that records whether a start tag was ever reported.

    The test counts as a success once ``start`` has been called at least
    once; ``success`` stays False otherwise.
    """

    def __init__(self):
        # No start tag has been seen yet.
        self.success = False

    def start(self, tag, attrib):
        """Flag the test as successful; the tag and attributes are ignored."""
        self.success = True

def document_of_size(size, as_unicode=False):
    """Return an XML document of the given size, either as string or Unicode.

    The document is a single ``<root>`` element padded with "0" characters
    so that its total length equals ``size``. If ``size`` is smaller than
    the empty ``<root></root>`` element, the padding is empty and that
    bare element is returned (so the result is longer than ``size``).

    On Python 2, ``as_unicode=True`` converts the result with ``unicode()``.
    On Python 3, where ``str`` is already Unicode text, the flag leaves the
    document unchanged instead of failing with a NameError.
    """
    # A negative repeat count yields an empty string, so tiny sizes are safe.
    padding = "0" * (size - len("<root></root>"))
    doc = "<root>%s</root>" % padding
    if as_unicode:
        try:
            text_type = unicode
        except NameError:
            # Python 3 has no ``unicode`` builtin; ``str`` is the text type.
            text_type = str
        doc = text_type(doc)
    return doc

def test_with_document_of_size(size, parser, as_unicode=False):
    """Create a document of the given size and see if feed() handles it.

    ``parser`` is a parser class (or any callable accepting a ``target``
    keyword) whose instances provide ``feed()`` and ``close()``. A fresh
    TestTarget is attached, the generated document is fed in one call, and
    the parser is closed.

    Returns one of three strings:
      * "Success" if the target received a start tag,
      * "Failure" if parsing finished without the target seeing one,
      * "Exception: <message>" if building, feeding or closing raised.
    """
    target = TestTarget()
    # Keep the instance under its own name rather than rebinding ``parser``.
    parser_instance = parser(target=target)
    try:
        parser_instance.feed(document_of_size(size, as_unicode))
        parser_instance.close()
    except Exception as e:
        # Deliberately broad: any error is reported as a result string so
        # that the remaining size/parser combinations still get exercised.
        return "Exception: %s" % e
    if target.success:
        return "Success"
    return "Failure"

# Exercise both parsers with Unicode ("u") and plain-string ("s") documents
# whose sizes are the powers of two from 2**5 (32) through 2**15 (32768),
# printing one result line per combination.
for parser in (XMLParser, HTMLParser):
    for as_unicode in (True, False):
        label = "u" if as_unicode else "s"
        for power in range(5, 16):
            size = 2 ** power
            result = test_with_document_of_size(size, parser, as_unicode)
            # A single parenthesised argument prints identically whether
            # ``print`` is a statement (Python 2) or a function (Python 3).
            print("%.5d %s %s: %s" % (size, label, parser.__name__, result))