From cfcbd71c264233aed8757b1144032e159f055e5d Mon Sep 17 00:00:00 2001 From: Chris Mayo Date: Sun, 21 Jul 2019 17:35:04 +0100 Subject: [PATCH] Support line numbers and offsets with html.parser Signed-off-by: Chris Mayo --- bs4/__init__.py | 6 ++++-- bs4/builder/_htmlparser.py | 3 ++- bs4/element.py | 4 +++- 3 files changed, 9 insertions(+), 4 deletions(-) diff --git a/bs4/__init__.py b/bs4/__init__.py index 9cd01c8..84f5f8c 100644 --- a/bs4/__init__.py +++ b/bs4/__init__.py @@ -531,7 +531,8 @@ class BeautifulSoup(Tag): return most_recently_popped - def handle_starttag(self, name, namespace, nsprefix, attrs): + def handle_starttag(self, name, namespace, nsprefix, attrs, + lineno=None, offset=None): """Push a start tag on to the stack. If this method returns None, the tag was rejected by the @@ -549,7 +550,8 @@ class BeautifulSoup(Tag): return None tag = Tag(self, self.builder, name, namespace, nsprefix, attrs, - self.currentTag, self._most_recent_element) + self.currentTag, self._most_recent_element, lineno=lineno, + offset=offset) if tag is None: return tag if self._most_recent_element is not None: diff --git a/bs4/builder/_htmlparser.py b/bs4/builder/_htmlparser.py index 56b8b91..52bfcd4 100644 --- a/bs4/builder/_htmlparser.py +++ b/bs4/builder/_htmlparser.py @@ -99,7 +99,8 @@ class BeautifulSoupHTMLParser(HTMLParser): attr_dict[key] = value attrvalue = '""' #print "START", name - tag = self.soup.handle_starttag(name, None, None, attr_dict) + tag = self.soup.handle_starttag(name, None, None, attr_dict, + *self.getpos()) if tag and tag.is_empty_element and handle_empty_element: # Unlike other parsers, html.parser doesn't send separate end tag # events for empty-element tags. (It's handled in diff --git a/bs4/element.py b/bs4/element.py index 73e3867..016e4e1 100644 --- a/bs4/element.py +++ b/bs4/element.py @@ -724,7 +724,7 @@ class Tag(PageElement): def __init__(self, parser=None, builder=None, name=None, namespace=None, prefix=None, attrs=None, parent=None, previous=None, - is_xml=None): + is_xml=None, lineno=None, offset=None): "Basic constructor." if parser is None: @@ -738,6 +738,8 @@ class Tag(PageElement): self.name = name self.namespace = namespace self.prefix = prefix + self.lineno = lineno + self.offset = offset if attrs is None: attrs = {} elif attrs: -- 2.21.0