>>> from lxml.html import parse
>>> tree = parse('690110.html.gz')
>>> body = tree.getroot()[1]
>>> body.text
Traceback (most recent call last):
File "<stdin>", line 1, in <module>
File "lxml.etree.pyx", line 897, in lxml.etree._Element.text.__get__ (src/lxml/lxml.etree.c:37022)
File "apihelpers.pxi", line 691, in lxml.etree._collectText (src/lxml/lxml.etree.c:16626)
File "apihelpers.pxi", line 1344, in lxml.etree.funicode (src/lxml/lxml.etree.c:21864)
UnicodeDecodeError: 'utf8' codec can't decode byte 0xf3 in position 0: unexpected end of data
I attached a minimal test-case:
>>> from lxml.html import parse
>>> tree = parse('690110.html.gz')
>>> body = tree.getroot()[1]
>>> body.text
Traceback (most recent call last):
File "<stdin>", line 1, in <module>
File "lxml.etree.pyx", line 897, in lxml.etree._Element.text.__get__ (src/lxml/lxml.etree.c:37022)
File "apihelpers.pxi", line 691, in lxml.etree._collectText (src/lxml/lxml.etree.c:16626)
File "apihelpers.pxi", line 1344, in lxml.etree.funicode (src/lxml/lxml.etree.c:21864)
UnicodeDecodeError: 'utf8' codec can't decode byte 0xf3 in position 0: unexpected end of data