From 62b8c345779bdfb8b261a6f7de3287f4fe800bb6 Mon Sep 17 00:00:00 2001 From: Koert van der Veer Date: Thu, 16 Mar 2017 09:13:18 +0100 Subject: [PATCH] Perform full-document detection on decoded bytes. Closes #1673355 --- src/lxml/html/html5parser.py | 9 ++++++++- src/lxml/html/tests/test_html5parser.py | 6 ++++++ 2 files changed, 14 insertions(+), 1 deletion(-) diff --git a/src/lxml/html/html5parser.py b/src/lxml/html/html5parser.py index 7188c7e..ba9d41b 100644 --- a/src/lxml/html/html5parser.py +++ b/src/lxml/html/html5parser.py @@ -147,7 +147,14 @@ def fromstring(html, guess_charset=True, parser=None): guess_charset=guess_charset) # document starts with doctype or <html>, full document! - start = html[:50].lstrip().lower() + start = html[:50] + if hasattr(start, 'decode'): + # In python3, we may have been presented with a bytes object. + # Decode in ascii, that also covers latin-1 and utf-8 for the + # characters we need + start = start.decode('ascii', 'replace') + + start = start.lstrip().lower() if start.startswith('', parser=parser), 'the doc') + def test_returns_whole_doc_if_input_is_encoded(self): + parser = DummyParser(root='the doc') + input = '<!DOCTYPE html>'.encode('ascii') + self.assertEqual(self.call_it(input, parser=parser), + 'the doc') + def test_returns_whole_doc_if_head_not_empty(self, use_ns=True): E = HTMLElementMaker(namespaceHTMLElements=use_ns) root = E.html(E.head(E.title())) -- 2.9.3