Comment 4 for bug 101440

sacco (timothy-heap) wrote :

Here is a new version of fulltext(), along the lines of johnny's suggestion.
It strips the XML markup and returns a list of strings, each of which
represents the content of a top-level element of the DocumentVersion.

    def fulltext(self):
        """Return the full text content of this object."""
        if self.version_status() == 'unapproved':
            return ''
        fulltext = [self.get_title()]
        ### text = self._flattenxml(self.content_xml())
        text = self._get_textContents(self.content.documentElement, [])
        if isinstance(text, (list, tuple, )):
            fulltext.extend(map(self._stringify, filter(None, text)))
        else:
            text = self._stringify(text)
            if text:
                fulltext.append(text)
        return fulltext

    def _get_textContents(self, node, L_res=False, textStrip=True):
        if not L_res and not isinstance(L_res, list): L_res = []
        for child in node.childNodes:
            nodeType = child.nodeType
            if ( nodeType == child.COMMENT_NODE
                 or nodeType == child.PROCESSING_INSTRUCTION_NODE
                 ):
                continue
            if nodeType == child.TEXT_NODE:
                if textStrip:
                    L_res.append(child.nodeValue.strip())
                else:
                    L_res.append(child.nodeValue)
            elif ( nodeType in [
                child.ATTRIBUTE_NODE,
                child.DOCUMENT_TYPE_NODE,
                child.NOTATION_NODE,
                child.ENTITY_NODE,
                ] ):
                continue # or you might prefer, e.g, to look inside attributes
            else:
                ### This (first) call would simply grow the list recursively
                ### self._get_textContents(child, L_res, textStrip=textStrip)
                # or one could decide to join or not (and how) based on, e.g,
                # the element tag type --- for now we just join:
                L_res.append(' '.join(
                    self._get_textContents(child, [], textStrip=textStrip) ))
        return L_res