I can also reproduce this on lxml 4.9.2, MacOS 12.6.2, Python 3.11.1, with a very small example.
This took me too long to figure out. The fact that etree.HTML(s) sometimes returns `None` and sometimes returns a broken HTML object is a bit confounding.
**There's a near-trivial workaround - call `.encode()` on any `str` that you pass in.**
Here's the code to reproduce, which also shows the workaround.
```python
from lxml import etree

def round_trip(s):
    html = etree.HTML(s)
    assert html is not None, 'etree.HTML(s) returned None!'
    result = etree.tostring(html, pretty_print=True, method="html")
```
```
Result:
<html><body><p>h t m l > </p></body></html>
Traceback (most recent call last):
  File "<frozen runpy>", line 198, in _run_module_as_main
  File "<frozen runpy>", line 88, in _run_code
  File "/Users/tom/synthetic/code/multi/multi/tweak_index.py", line 93, in <module>
    compare_both()
  File "/Users/tom/synthetic/code/multi/multi/tweak_index.py", line 64, in compare_both
    round_trip('\n' + p2)  # Returns None
    ^^^^^^^^^^^^^^^^^^^^^
  File "/Users/tom/synthetic/code/multi/multi/tweak_index.py", line 41, in round_trip
    assert html is not None, 'etree.HTML(s) returned None!'
           ^^^^^^^^^^^^^^^^
AssertionError: etree.HTML(s) returned None!
        0.03 real         0.02 user         0.00 sys
```
I can also reproduce this on lxml 4.9.2, MacOS 12.6.2, Python 3.11.1, with a very small example.
This took me too long to figure out. The fact that etree.HTML(s) sometimes returns `None` and sometimes returns a broken HTML object is a bit confounding.
**There's a near-trivial workaround - call `.encode()` on any `str` that you pass in.**
Here's the code to reproduce, which also shows the workaround.
```python
from lxml import etree

def round_trip(s):
    html = etree.HTML(s)
    assert html is not None, 'etree.HTML(s) returned None!'
    result = etree.tostring(html, pretty_print=True, method="html")

def compare_both():
    p1 = '<html> <head><title>ANT</title> </head> <body></body></html>'

    # Works
    round_trip(p1)
    round_trip('\n' + p1)

    p2 = '<html> <head><title>🐜 </title> </head> <body></body></html>'

    # The workaround!
    round_trip(p2.encode())
    round_trip(('\n' + p2).encode())

    # Fails
    round_trip(p2)  # Wrong answer
    round_trip('\n' + p2)  # etree.HTML(s) returns None
```
and here are the results:
```
Result:
<html>
<head><title>ANT</title> </head>
<body></body>
</html>
Result:
<html>
<head><title>ANT</title> </head>
<body></body>
</html>
Result:
<html>
<head><title>ð&#159;&#144;œ </title> </head>
<body></body>
</html>
Result:
<html>
<head><title>ð&#159;&#144;œ </title> </head>
<body></body>
</html>
Result:
<html><body><p>h t m l > </p></body></html>
Traceback (most recent call last):
  File "<frozen runpy>", line 198, in _run_module_as_main
  File "<frozen runpy>", line 88, in _run_code
  File "/Users/tom/synthetic/code/multi/multi/tweak_index.py", line 93, in <module>
    compare_both()
  File "/Users/tom/synthetic/code/multi/multi/tweak_index.py", line 64, in compare_both
    round_trip('\n' + p2)  # Returns None
    ^^^^^^^^^^^^^^^^^^^^^
  File "/Users/tom/synthetic/code/multi/multi/tweak_index.py", line 41, in round_trip
    assert html is not None, 'etree.HTML(s) returned None!'
           ^^^^^^^^^^^^^^^^
AssertionError: etree.HTML(s) returned None!
        0.03 real         0.02 user         0.00 sys
```