An exception does occur sometimes (in my experience, `_self_index+1` is out of bounds), but so far I have had no problems with the following patch (which adds a `strip=False` argument for backward compatibility):
# PATCH for BeautifulSoup extract() method
from bs4 import PageElement
def extract_patched(self, _self_index=None, strip=False):
    """Destructively rip this element out of the tree.

    Replacement for ``PageElement.extract`` that can additionally remove
    the blank line an extraction tends to leave behind.

    :param _self_index: The location of this element in its parent's
        ``.contents``, if known. Passing this in allows for a performance
        optimization.
    :param strip: When true, also remove at most one whitespace-only
        sibling that was directly adjacent to this element: the preceding
        sibling if it is blank, otherwise the following one. Defaults to
        ``False`` so existing callers keep the unpatched behaviour.
    :return: ``self``, no longer part of the tree.
    """
    parent = self.parent
    blank_neighbour = None
    if parent is not None:
        if _self_index is None:
            _self_index = parent.index(self)
        siblings = parent.contents
        del siblings[_self_index]
        if strip:
            # After the deletion the former previous sibling sits at
            # _self_index - 1 and the former next sibling at _self_index.
            # Bounds are checked explicitly: a negative index would wrap
            # around to the parent's *last* child, and an index past the
            # end (this element was the last child) would raise IndexError.
            for position in (_self_index - 1, _self_index):
                if not 0 <= position < len(siblings):
                    continue
                if not str(siblings[position]).strip():
                    # Only remember the node here; it is unlinked once
                    # this element's own links have been repaired.
                    blank_neighbour = siblings[position]
                    break

    # Find the two elements that would be next to each other if
    # this element (and any children) hadn't been parsed. Connect
    # the two.
    last_child = self._last_descendant()
    next_element = last_child.next_element
    previous_element = self.previous_element
    if previous_element is not None and previous_element is not next_element:
        previous_element.next_element = next_element
    if next_element is not None and next_element is not previous_element:
        next_element.previous_element = previous_element
    self.previous_element = None
    last_child.next_element = None

    # Detach from the parent and close the gap in the sibling chain.
    self.parent = None
    previous_sibling = self.previous_sibling
    next_sibling = self.next_sibling
    if previous_sibling is not None and previous_sibling is not next_sibling:
        previous_sibling.next_sibling = next_sibling
    if next_sibling is not None and next_sibling is not previous_sibling:
        next_sibling.previous_sibling = previous_sibling
    self.previous_sibling = self.next_sibling = None

    if blank_neighbour is not None:
        # Unlink the blank node through its own extract() so that its
        # sibling/element links are repaired too, instead of merely
        # dropping it from parent.contents.
        blank_neighbour.extract()
    return self
# ! APPLY PATCH !
# Monkey-patch: rebind the `extract` attribute of PageElement to
# extract_patched, so later `.extract()` lookups on PageElement resolve to
# the patched function defined above.
PageElement.extract = extract_patched
An exception does occur sometimes (in my experience, `_self_index+1` is out of bounds), but so far I have had no problems with the following patch (which adds a `strip=False` argument for backward compatibility):
# PATCH for BeautifulSoup extract() method
from bs4 import PageElement
def extract_patched(self, _self_index=None, strip=False):
    """Destructively rip this element out of the tree.

    Replacement for ``PageElement.extract`` that can additionally remove
    the blank line an extraction tends to leave behind.

    :param _self_index: The location of this element in its parent's
        ``.contents``, if known. Passing this in allows for a performance
        optimization.
    :param strip: When true, also remove at most one whitespace-only
        sibling that was directly adjacent to this element: the preceding
        sibling if it is blank, otherwise the following one. Defaults to
        ``False`` so existing callers keep the unpatched behaviour.
    :return: ``self``, no longer part of the tree.
    """
    parent = self.parent
    blank_neighbour = None
    if parent is not None:
        if _self_index is None:
            _self_index = parent.index(self)
        siblings = parent.contents
        del siblings[_self_index]
        if strip:
            # After the deletion the former previous sibling sits at
            # _self_index - 1 and the former next sibling at _self_index.
            # Bounds are checked explicitly: a negative index would wrap
            # around to the parent's *last* child, and an index past the
            # end (this element was the last child) would raise IndexError.
            for position in (_self_index - 1, _self_index):
                if not 0 <= position < len(siblings):
                    continue
                if not str(siblings[position]).strip():
                    # Only remember the node here; it is unlinked once
                    # this element's own links have been repaired.
                    blank_neighbour = siblings[position]
                    break

    # Find the two elements that would be next to each other if
    # this element (and any children) hadn't been parsed. Connect
    # the two.
    last_child = self._last_descendant()
    next_element = last_child.next_element
    previous_element = self.previous_element
    if previous_element is not None and previous_element is not next_element:
        previous_element.next_element = next_element
    if next_element is not None and next_element is not previous_element:
        next_element.previous_element = previous_element
    self.previous_element = None
    last_child.next_element = None

    # Detach from the parent and close the gap in the sibling chain.
    self.parent = None
    previous_sibling = self.previous_sibling
    next_sibling = self.next_sibling
    if previous_sibling is not None and previous_sibling is not next_sibling:
        previous_sibling.next_sibling = next_sibling
    if next_sibling is not None and next_sibling is not previous_sibling:
        next_sibling.previous_sibling = previous_sibling
    self.previous_sibling = self.next_sibling = None

    if blank_neighbour is not None:
        # Unlink the blank node through its own extract() so that its
        # sibling/element links are repaired too, instead of merely
        # dropping it from parent.contents.
        blank_neighbour.extract()
    return self
# ! APPLY PATCH !
# Monkey-patch: rebind the `extract` attribute of PageElement to
# extract_patched, so later `.extract()` lookups on PageElement resolve to
# the patched function.
PageElement.extract = extract_patched