From 432340af8dca83a4ddb5d4599f2ae8276669ab1c Mon Sep 17 00:00:00 2001
From: Koert van der Veer
Date: Thu, 16 Mar 2017 10:07:38 +0100
Subject: [PATCH 1/2] Build a retry mechanism around html5lib's unpredictable useChardet support

The retry wrapper looks up the class that defines it in the MRO before
delegating, so that subclasses of the lxml parsers do not recurse back
into the wrapper.

Closes LP1654544
---
 src/lxml/html/html5parser.py | 41 +++++++++++++++++++++++++++++++++++++++--
 1 file changed, 39 insertions(+), 2 deletions(-)

diff --git a/src/lxml/html/html5parser.py b/src/lxml/html/html5parser.py
index 7188c7e..3524535 100644
--- a/src/lxml/html/html5parser.py
+++ b/src/lxml/html/html5parser.py
@@ -7,9 +7,8 @@ import string
 
 from html5lib import HTMLParser as _HTMLParser
 from html5lib.treebuilders.etree_lxml import TreeBuilder
-
 from lxml import etree
-from lxml.html import _contains_block_level_tag, XHTML_NAMESPACE, Element
+from lxml.html import Element, XHTML_NAMESPACE, _contains_block_level_tag
 
 # python3 compatibility
 try:
@@ -25,12 +24,47 @@ try:
 except ImportError:
     from urllib.parse import urlparse
 
+def _dodgeUseChardet(method_name):
+    # html5lib does not accept useChardet as an argument, if it
+    # detected the html argument would produce unicode objects.
+    # However, there is no reasonable way to predict if html5lib will
+    # detect the argument to be unicode (all of that code is private),
+    # so we'll have to settle for a retry.
+
+    # this function creates a wrapper around the specified object, which
+    # retries when html5lib complains about the useChardet argument
+
+    def inner(self, *args, **kwargs):
+        # Find the class that defines this wrapper and delegate to the
+        # method it overrides.  super(type(self), self) would resolve
+        # back to this wrapper for subclass instances and recurse forever.
+        for owner in type(self).__mro__:
+            if owner.__dict__.get(method_name) is inner:
+                break
+        callee = getattr(super(owner, self), method_name)
+        try:
+            return callee(*args, **kwargs)
+        except TypeError:
+            exception = sys.exc_info()[1]
+            if "'useChardet'" not in str(exception):
+                # Some other issue caused the exception. Tell the caller
+                raise
+            kwargs.pop('useChardet')
+            return callee(*args, **kwargs)
+    inner.__name__ = method_name
+    return inner
+
+
+
 class HTMLParser(_HTMLParser):
     """An html5lib HTML parser with lxml as tree."""
 
     def __init__(self, strict=False, **kwargs):
         _HTMLParser.__init__(self, strict=strict, tree=TreeBuilder, **kwargs)
 
+    parse = _dodgeUseChardet('parse')
+    parseFragment = _dodgeUseChardet('parseFragment')
+
 
 try:
     from html5lib import XHTMLParser as _XHTMLParser
@@ -43,6 +77,9 @@ else:
         def __init__(self, strict=False, **kwargs):
             _XHTMLParser.__init__(self, strict=strict, tree=TreeBuilder, **kwargs)
 
+        parse = _dodgeUseChardet('parse')
+        parseFragment = _dodgeUseChardet('parseFragment')
+
     xhtml_parser = XHTMLParser()
 
 
-- 
2.9.3