In the meantime, here is the code without explanation (sorry - it still uses
L_res, but the reason should be clearer here):
def fulltext(self):
    """Return the full text content of this object.

    Returns:
        The empty string ``''`` when ``self.version_status()`` is
        ``'unapproved'``.  Otherwise a list whose first item is
        ``self.get_title()``, followed by every non-empty text fragment
        that ``self._get_textContents`` collects from
        ``self.content.documentElement``.
    """
    if self.version_status() == 'unapproved':
        return ''
    parts = [self.get_title()]
    text = []
    # The helper fills ``text`` in place rather than returning a value.
    self._get_textContents(self.content.documentElement, text)
    # Drop falsy fragments such as text nodes that stripped down to ''.
    parts.extend(fragment for fragment in text if fragment)
    return parts
def _get_textContents(self, node, L_res, textStrip=True):
    """Append the text found beneath *node* to *L_res*, in document order.

    Args:
        node: DOM node whose ``childNodes`` are walked recursively.
        L_res: list receiving the text fragments; it is mutated in place
            and nothing is returned.
        textStrip: when true, each text node's value is stripped before it
            is appended; when false it is appended unchanged.

    Comment, processing-instruction, attribute, document-type, notation and
    entity children are skipped.  An element for which ``tag_is_p`` is true
    contributes exactly one entry: the unstripped text of its descendants
    joined with no separator.  Every other child is descended into with the
    current ``textStrip`` setting.
    """
    for child in node.childNodes:
        nodeType = child.nodeType
        if nodeType in (
            child.COMMENT_NODE,
            child.PROCESSING_INSTRUCTION_NODE,
            child.ATTRIBUTE_NODE,
            child.DOCUMENT_TYPE_NODE,
            child.NOTATION_NODE,
            child.ENTITY_NODE,
        ):
            # or you might prefer, e.g., to look inside attributes
            continue
        if nodeType == child.TEXT_NODE:
            value = child.nodeValue
            L_res.append(value.strip() if textStrip else value)
        elif nodeType == child.ELEMENT_NODE and tag_is_p(child.tagName):
            # Text in a paragraph should be joined with no space but not
            # stripped, so collect it separately and append it as one entry.
            # Can alternatively call a mutually recursive helper function if
            # we want to do something more complicated or, e.g., to enforce
            # a schema.
            P_text = []
            self._get_textContents(child, P_text, textStrip=False)
            L_res.append(''.join(P_text))
        else:
            # Non-paragraph elements and any remaining node types: recurse.
            # NOTE(review): a CDATA section lands here; if the DOM gives it
            # no children its text is not collected - confirm intended.
            self._get_textContents(child, L_res, textStrip=textStrip)
Here tag_is_p() would look, I suppose, something like:
def tag_is_p(tagName):
    """Return True when *tagName* names a ``p`` element.

    A name without a colon is compared to ``"p"`` directly.  A name with a
    colon is split on it and the part after the colon is compared instead;
    the prefix itself is not validated.

    Args:
        tagName: element tag name, optionally of the form ``prefix:local``.

    Raises:
        ValueError: if *tagName* contains a colon but does not split into
            exactly two parts with a non-empty second part.
    """
    if ":" in tagName:
        parts = tagName.split(":")
        if len(parts) != 2 or not parts[1]:
            # The original raised an undefined placeholder name here, which
            # would surface as a NameError instead of a meaningful error.
            raise ValueError("malformed qualified tag name: %r" % (tagName,))
        # check parts[0] is a suitable prefix if you want
        tagName = parts[1]
    return tagName == "p"
In the meantime, here is the code without explanation (sorry - it still uses
L_res, but the reason should be clearer here):
def fulltext(self):
    """Return the full text content of this object.

    Returns:
        The empty string ``''`` when ``self.version_status()`` is
        ``'unapproved'``.  Otherwise a list whose first item is
        ``self.get_title()``, followed by every non-empty text fragment
        that ``self._get_textContents`` collects from
        ``self.content.documentElement``.
    """
    if self.version_status() == 'unapproved':
        return ''
    parts = [self.get_title()]
    text = []
    # The helper fills ``text`` in place rather than returning a value.
    self._get_textContents(self.content.documentElement, text)
    # Drop falsy fragments such as text nodes that stripped down to ''.
    parts.extend(fragment for fragment in text if fragment)
    return parts
def _get_textContents(self, node, L_res, textStrip=True):
    """Append the text found beneath *node* to *L_res*, in document order.

    Args:
        node: DOM node whose ``childNodes`` are walked recursively.
        L_res: list receiving the text fragments; it is mutated in place
            and nothing is returned.
        textStrip: when true, each text node's value is stripped before it
            is appended; when false it is appended unchanged.

    Comment, processing-instruction, attribute, document-type, notation and
    entity children are skipped.  An element for which ``tag_is_p`` is true
    contributes exactly one entry: the unstripped text of its descendants
    joined with no separator.  Every other child is descended into with the
    current ``textStrip`` setting.
    """
    for child in node.childNodes:
        nodeType = child.nodeType
        if nodeType in (
            child.COMMENT_NODE,
            child.PROCESSING_INSTRUCTION_NODE,
            child.ATTRIBUTE_NODE,
            child.DOCUMENT_TYPE_NODE,
            child.NOTATION_NODE,
            child.ENTITY_NODE,
        ):
            # or you might prefer, e.g., to look inside attributes
            continue
        if nodeType == child.TEXT_NODE:
            value = child.nodeValue
            L_res.append(value.strip() if textStrip else value)
        elif nodeType == child.ELEMENT_NODE and tag_is_p(child.tagName):
            # Text in a paragraph should be joined with no space but not
            # stripped, so collect it separately and append it as one entry.
            # Can alternatively call a mutually recursive helper function if
            # we want to do something more complicated or, e.g., to enforce
            # a schema.
            P_text = []
            self._get_textContents(child, P_text, textStrip=False)
            L_res.append(''.join(P_text))
        else:
            # Non-paragraph elements and any remaining node types: recurse.
            self._get_textContents(child, L_res, textStrip=textStrip)
Here tag_is_p() would look, I suppose, something like:
def tag_is_p(tagName):
    """Return True when *tagName* names a ``p`` element.

    A name without a colon is compared to ``"p"`` directly.  A name with a
    colon is split on it and the part after the colon is compared instead;
    the prefix itself is not validated.

    Args:
        tagName: element tag name, optionally of the form ``prefix:local``.

    Raises:
        ValueError: if *tagName* contains a colon but does not split into
            exactly two parts with a non-empty second part.
    """
    if ":" in tagName:
        parts = tagName.split(":")
        if len(parts) != 2 or not parts[1]:
            # The original raised an undefined placeholder name here, which
            # would surface as a NameError instead of a meaningful error.
            raise ValueError("malformed qualified tag name: %r" % (tagName,))
        # check parts[0] is a suitable prefix if you want
        tagName = parts[1]
    return tagName == "p"