# Finally, one could combine the recursive and iterator-based approaches:
# the idea would be to start at the top with a recursive function which is
# clear, flexible, and easy to modify to particular requirements, but, once
# one has used this to select and divide as required the text from the
# various elements, an iterator is used to extract the text more efficiently
# from the lower levels.
#
# Below is a continuation of the previous example, showing how this might be
# done (the desired output, a list of strings representing top-level
# elements, is as before).
#
# I'd welcome any comments, especially from somebody who knows more about
# either ParsedXML or about Silva document structure; my choices of
# _nodeAcceptMap and whatToShow, in particular, are just what I happened to
# type in/cut and paste on the spur of the moment while I was experimenting.
# This does, however, all seem to work.
#
# Can we include something like this, at least as a stop-gap until a better
# approach is ready?
#
# NOTE: the definitions below use ``self`` (``self._get_textFlat``,
# ``self.myNodeFilter``, ``self._nodeAcceptMap``), i.e. they are written as
# members of a class body; indent them to match the enclosing class when
# pasting them in.  ``Node`` and ``NodeFilter`` must already be in scope.


def _get_textContents(self, node, L_res=False, textStrip=True):
    """Return a list with one string per relevant direct child of *node*.

    node      -- DOM node whose ``childNodes`` are examined.
    L_res     -- list to append to; any falsy non-list value (the default,
                 ``False``) means "start a new list".  A list passed in by
                 the caller, even an empty one, is extended in place.
    textStrip -- if true, surrounding whitespace is stripped from the text.

    Text children contribute their own value.  Comment, processing
    instruction, attribute, document type, notation and entity children are
    ignored.  Every other child contributes the text found below it by
    ``self._get_textFlat``, joined with single spaces.  Returns the list.
    """
    if not L_res and not isinstance(L_res, list):
        L_res = []
    for child in node.childNodes:
        nodeType = child.nodeType
        if nodeType == child.TEXT_NODE:
            if textStrip:
                L_res.append(child.nodeValue.strip())
            else:
                L_res.append(child.nodeValue)
        elif nodeType in (
            child.COMMENT_NODE,
            child.PROCESSING_INSTRUCTION_NODE,
            child.ATTRIBUTE_NODE,
            child.DOCUMENT_TYPE_NODE,
            child.NOTATION_NODE,
            child.ENTITY_NODE,
        ):
            continue  # or you might prefer, e.g., to look inside attributes
        else:
            ### This (first) call would simply grow the list recursively
            ###  self._get_textContents(child, L_res, textStrip=textStrip)
            # or one could decide to join or not (and how) based on, e.g.,
            # the element tag type --- for now we just join:
            ##L_res.append(' '.join(
            ##    self._get_textContents(child, [], textStrip=textStrip) ))
            L_res.append(' '.join(
                self._get_textFlat(child, textStrip=textStrip)))
    return L_res


# Default node-type -> filter-verdict table used by myNodeFilter.
# Node types missing from the table are accepted (see acceptNode).
_nodeAcceptMap = {
    # REJECT some sub-trees we don't want to see
    Node.COMMENT_NODE: NodeFilter.FILTER_REJECT,
    Node.PROCESSING_INSTRUCTION_NODE: NodeFilter.FILTER_REJECT,
    Node.TEXT_NODE: NodeFilter.FILTER_ACCEPT,
    Node.ATTRIBUTE_NODE: NodeFilter.FILTER_REJECT,
    Node.DOCUMENT_TYPE_NODE: NodeFilter.FILTER_REJECT,
    Node.NOTATION_NODE: NodeFilter.FILTER_REJECT,
    Node.ENTITY_NODE: NodeFilter.FILTER_REJECT,
    Node.ENTITY_REFERENCE_NODE: NodeFilter.FILTER_ACCEPT,
    Node.CDATA_SECTION_NODE: NodeFilter.FILTER_ACCEPT,
    Node.ELEMENT_NODE: NodeFilter.FILTER_SKIP,
    Node.DOCUMENT_FRAGMENT_NODE: NodeFilter.FILTER_SKIP,
}


class myNodeFilter(NodeFilter):
    """Node filter whose verdict is looked up by node type in a mapping."""

    def __init__(self, nodeAcceptMap):
        """Remember *nodeAcceptMap*, a mapping of node type -> verdict."""
        self._nodeAcceptMap = nodeAcceptMap

    def acceptNode(self, node):
        """Return the verdict mapped to ``node.nodeType``.

        Unmapped node types, and types mapped to a false value, yield
        ``FILTER_ACCEPT``.
        """
        return self._nodeAcceptMap.get(node.nodeType) or self.FILTER_ACCEPT


def _get_textFlat(self, document, textJoin=False, textStrip=True,
                  nodeFilter=myNodeFilter(_nodeAcceptMap),
                  nodeAcceptMap=None):
    """Return the text found below *document* using a tree walker.

    document      -- node to extract text from; it is deep-copied first and
                     only the copy is normalized and walked.
    textJoin      -- a string: return the pieces joined with that string;
                     any other true value: return them joined with a single
                     space; otherwise return the list of pieces.
    textStrip     -- if true, strip surrounding whitespace from each piece.
    nodeFilter    -- filter handed to ``createTreeWalker``.  ``None`` is
                     passed through unchanged; any other false value that
                     is not a ``NodeFilter`` is replaced by a filter built
                     from ``self._nodeAcceptMap``.
    nodeAcceptMap -- if not ``None``, overrides *nodeFilter* with a filter
                     built from this mapping.
    """
    _document = document.cloneNode(True)
    _document.normalize()

    if nodeAcceptMap is not None:
        nodeFilter = self.myNodeFilter(nodeAcceptMap)
    elif (not isinstance(nodeFilter, NodeFilter)
          and nodeFilter is not None and not nodeFilter):
        nodeFilter = self.myNodeFilter(self._nodeAcceptMap)

    # NOTE(review): CDATA sections are masked out here although the default
    # accept map lists them as FILTER_ACCEPT -- confirm which is intended.
    whatToShow = (NodeFilter.SHOW_ALL
                  & ~NodeFilter.SHOW_DOCUMENT_FRAGMENT
                  & ~NodeFilter.SHOW_CDATA_SECTION)
    nodeIter = _document.createTreeWalker(_document, whatToShow, nodeFilter,
                                          entityReferenceExpansion=1)

    L_ret = []
    child = nodeIter._get_currentNode()
    while child is not None:
        if child.nodeType == child.TEXT_NODE:
            L_ret.append(child.nodeValue)
        child = nodeIter.nextNode()
    # Drop the walker and the working copy as soon as the text is collected.
    del nodeIter, _document

    if textStrip:
        L_ret = [s.strip() for s in L_ret]
    if isinstance(textJoin, basestring):
        return textJoin.join(L_ret)
    if textJoin:
        return ' '.join(L_ret)
    return L_ret