Comment 6 for bug 101440

sacco (timothy-heap) wrote :

Finally, one could combine the recursive and iterator-based approaches:
the idea would be to start at the top with a recursive function, which is
clear, flexible, and easy to adapt to particular requirements; then,
once it has selected and divided the text from the various elements as
required, an iterator is used to extract the text more efficiently
from the lower levels.

Below is a continuation of the previous example, showing how this might
be done (the desired output, a list of strings representing top-level elements,
is as before).

I'd welcome any comments, especially from somebody who knows more
about either ParsedXML or Silva document structure; my choices of
_nodeAcceptMap and whatToShow, in particular, are just what I happened
to type in or cut and paste on the spur of the moment while I was experimenting.

This does, however, all seem to work. Can we include something like this,
at least as a stop-gap until a better approach is ready?

   def _get_textContents(self, node, L_res=False, textStrip=True):
        if not L_res and not isinstance(L_res, list): L_res = []
        for child in node.childNodes:
            nodeType = child.nodeType
            if ( nodeType == child.COMMENT_NODE
                 or nodeType == child.PROCESSING_INSTRUCTION_NODE
                 ):
                continue
            if nodeType == child.TEXT_NODE:
                if textStrip:
                    L_res.append(child.nodeValue.strip())
                else:
                    L_res.append(child.nodeValue)
            elif ( nodeType in [
                child.ATTRIBUTE_NODE,
                child.DOCUMENT_TYPE_NODE,
                child.NOTATION_NODE,
                child.ENTITY_NODE,
                ] ):
                continue # or you might prefer, e.g, to look inside attributes
            else:
                ### This (first) call would simply grow the list recursively
                ### self._get_textContents(child, L_res, textStrip=textStrip)
                # or one could decide to join or not (and how) based on, e.g,
                # the element tag type --- for now we just join:
                ##L_res.append(' '.join(
                ## self._get_textContents(child, [], textStrip=textStrip) ))
                L_res.append(' '.join(
                    self._get_textFlat(child, textStrip=textStrip, ) ))
        return L_res

    _nodeAcceptMap = dict( { # REJECT some sub-trees we don't want to see
        Node.COMMENT_NODE : NodeFilter.FILTER_REJECT,
        Node.PROCESSING_INSTRUCTION_NODE : NodeFilter.FILTER_REJECT,
        Node.TEXT_NODE : NodeFilter.FILTER_ACCEPT,
        Node.ATTRIBUTE_NODE : NodeFilter.FILTER_REJECT,
        Node.DOCUMENT_TYPE_NODE : NodeFilter.FILTER_REJECT,
        Node.NOTATION_NODE : NodeFilter.FILTER_REJECT,
        Node.ENTITY_NODE : NodeFilter.FILTER_REJECT,
        Node.ENTITY_REFERENCE_NODE : NodeFilter.FILTER_ACCEPT,
        Node.CDATA_SECTION_NODE : NodeFilter.FILTER_ACCEPT,
        Node.ELEMENT_NODE : NodeFilter.FILTER_SKIP,
        Node.DOCUMENT_FRAGMENT_NODE : NodeFilter.FILTER_SKIP,
        } )

    class myNodeFilter(NodeFilter):
        def __init__(self, nodeAcceptMap):
            self._nodeAcceptMap = nodeAcceptMap
        def acceptNode(self, node):
            return self._nodeAcceptMap.get(node.nodeType) or \
                   self.FILTER_ACCEPT

    def _get_textFlat(self, document, textJoin=False, textStrip=True,
                      nodeFilter=myNodeFilter(_nodeAcceptMap),
                      nodeAcceptMap=None):
        _document = document.cloneNode(True)
        _document.normalize()
        L_ret = []
        if nodeAcceptMap is not None:
            nodeFilter = self.myNodeFilter(nodeAcceptMap)
        elif ( not isinstance(nodeFilter, NodeFilter) ) \
                 and nodeFilter is not None and not nodeFilter:
            nodeFilter = self.myNodeFilter(self._nodeAcceptMap)

        whatToShow = NodeFilter.SHOW_ALL \
                     & ~ NodeFilter.SHOW_DOCUMENT_FRAGMENT \
                     & ~ NodeFilter.SHOW_CDATA_SECTION

        nodeIter = _document.createTreeWalker(_document,
                                              whatToShow, nodeFilter,
                                              entityReferenceExpansion=1)
        child = nodeIter._get_currentNode()
        while child is not None:
            nodeType = child.nodeType
            if nodeType == child.TEXT_NODE:
                L_ret.append(child.nodeValue)
            child = nodeIter.nextNode()
        _document = None
        del nodeIter, _document
        if textStrip:
            L_ret = map(lambda s : s.strip(), L_ret)
        if isinstance(textJoin, basestring):
            return textJoin.join(L_ret)
        elif textJoin:
            return ' '.join(L_ret)
        else:
            if L_ret:
                return L_ret
        return []