Comment 18 for bug 101440

Revision history for this message
sacco (timothy-heap) wrote :

In the meantime, without explanation (sorry - it still uses L_res but the reason
should be clearer here):

    def fulltext(self):
        """Return the full text content of this object."""
        if self.version_status() == 'unapproved':
            return ''
        fulltext = [self.get_title()]
        text = list()
        self._get_textContents(self.content.documentElement, text)
        fulltext.extend(filter(None, text))
        return fulltext

    def _get_textContents(self, node, L_res, textStrip=True):
        for child in node.childNodes:
            nodeType = child.nodeType
            if ( nodeType == child.COMMENT_NODE
                 or nodeType == child.PROCESSING_INSTRUCTION_NODE
                 ):
                continue
            if nodeType == child.TEXT_NODE:
                if textStrip:
                    L_res.append(child.nodeValue.strip())
                else:
                    L_res.append(child.nodeValue)
            elif ( nodeType in [
                child.ATTRIBUTE_NODE,
                child.DOCUMENT_TYPE_NODE,
                child.NOTATION_NODE,
                child.ENTITY_NODE,
                ] ):
                continue # or you might prefer, e.g, to look inside attributes
            elif nodeType == child.ELEMENT_NODE:
                if tag_is_p(child.tagName) :
                    # text in a paragraph should be joined with no space but not
stripped
                    P_text = list()
                    self._get_textContents(child, P_text, textStrip=False)
                    L_res.append(''.join(P_text))
                    # can alternatively call a mutually recursive helper
function if we want
                    # to do something more complicated or e.g. to enforce schema
                else:
                    self._get_textContents(child, L_res, textStrip=textStrip)
            else:
                self._get_textContents(child, L_res, textStrip=textStrip)

Here tag_is_p() would look, I suppose, something like:

def tag_is_p(tagName):
    """Return True when ``tagName`` names a ``p`` element, ignoring any prefix.

    ``tagName`` may be a bare name (``"p"``) or a prefixed one
    (``"doc:p"``); only the part after the colon is compared with ``"p"``.

    Raises ValueError when a name containing a colon does not split into
    exactly two parts, or has nothing after the colon.
    """
    if ":" in tagName:
        parts = tagName.split(":")
        if len(parts) != 2 or not parts[1]:
            raise ValueError("malformed qualified tag name: %r" % (tagName,))
        # parts[0] could be checked here if only certain prefixes are
        # acceptable; currently any prefix (even an empty one) is allowed.
        tagName = parts[1]
    return tagName == "p"