"""Demo: two ways of extracting text from a small DOM-like tree.

``_get_textContents`` is a recursive generator yielding every text fragment.
``_get_textContents2`` fills caller-supplied lists and additionally glues
together all the text found below each ``<p>`` element.
"""


class Node(object):
    """Minimal stand-in for a DOM node, just enough for the demo below."""

    # Node-type constants.  The traversal functions read these from the node
    # they are inspecting (``child.TEXT_NODE``...), not from this class.
    COMMENT_NODE = 1
    PROCESSING_INSTRUCTION_NODE = 2
    TEXT_NODE = 3
    ATTRIBUTE_NODE = 4
    DOCUMENT_TYPE_NODE = 5
    NOTATION_NODE = 6
    ENTITY_NODE = 7
    ELEMENT_NODE = 8

    def __init__(self, nodeType, nodeName='', nodeValue=''):
        """Create a childless node.

        Args:
            nodeType: one of the ``*_NODE`` constants.
            nodeName: tag name for elements, e.g. ``'#text'`` for text nodes.
            nodeValue: the text carried by the node, if any.
        """
        self.nodeType = nodeType
        self.nodeName = nodeName
        self.nodeValue = nodeValue
        self.childNodes = []
        # Only element nodes get a ``localName`` attribute.
        if nodeType == Node.ELEMENT_NODE:
            self.localName = nodeName


# Event log filled by _get_textContents:
#   "T:<depth>"  a text node was visited,
#   "Y:<depth>"  a value was yielded by the generator running at <depth>,
#   "R:<depth>"  the generator running at <depth> finished its children.
trace = []


def _get_textContents(node, depth=0):
    """Yield the value of every text node below *node*, in document order.

    Comments, processing instructions, attributes, doctypes, notations and
    entities are skipped; every other non-text child is descended into.
    Events are appended to the module-level ``trace`` list as a side effect.

    Args:
        node: object with a ``childNodes`` sequence; each child must expose
            ``nodeType``, ``nodeValue`` and the ``*_NODE`` constants.
        depth: recursion depth recorded in the trace entries.
    """
    for child in node.childNodes:
        nodeType = child.nodeType
        if nodeType in (child.COMMENT_NODE,
                        child.PROCESSING_INSTRUCTION_NODE):
            continue
        if nodeType == child.TEXT_NODE:
            trace.append("T:%i" % depth)
            trace.append("Y:%i" % depth)
            yield child.nodeValue
        elif nodeType in (child.ATTRIBUTE_NODE,
                          child.DOCUMENT_TYPE_NODE,
                          child.NOTATION_NODE,
                          child.ENTITY_NODE):
            continue  # or you might prefer, e.g., to look inside attributes
        else:
            # Only elements (or other container nodes) are left: recurse and
            # re-yield, logging one "Y" per level the value passes through.
            for text in _get_textContents(child, depth + 1):
                trace.append("Y:%i" % depth)
                yield text
    trace.append("R:%i" % depth)


# Event log filled by _get_textContents2 ("T:<depth>" / "R:<depth>" as above).
trace2 = []


def _get_textContents2(node, L_res, L_test, depth=0):
    """Collect the text below *node* into caller-supplied lists.

    Args:
        node: object with a ``childNodes`` sequence (see _get_textContents).
        L_res: list receiving the result.  Text outside any ``p`` element is
            appended fragment by fragment; all the text found below a ``p``
            element is joined and appended as a single string.
        L_test: list receiving every text fragment individually, i.e. the
            same sequence _get_textContents yields.
        depth: recursion depth recorded in the ``trace2`` entries.
    """
    for child in node.childNodes:
        nodeType = child.nodeType
        if nodeType in (child.COMMENT_NODE,
                        child.PROCESSING_INSTRUCTION_NODE):
            continue
        if nodeType == child.TEXT_NODE:
            trace2.append("T:%i" % depth)
            L_test.append(child.nodeValue)
            L_res.append(child.nodeValue)
        elif nodeType in (child.ATTRIBUTE_NODE,
                          child.DOCUMENT_TYPE_NODE,
                          child.NOTATION_NODE,
                          child.ENTITY_NODE):
            continue  # or you might prefer, e.g., to look inside attributes
        elif nodeType == child.ELEMENT_NODE and child.localName == "p":
            # Gather the paragraph's fragments separately so they can be
            # emitted as one string.
            P_text = []
            _get_textContents2(child, P_text, L_test, depth + 1)
            L_res.append(''.join(P_text))
        else:
            # Any other element / container: keep filling the current list.
            _get_textContents2(child, L_res, L_test, depth + 1)
    trace2.append("R:%i" % depth)


def main():
    """Build a sample tree, run both extractors, print and check results."""
    top = Node(Node.ELEMENT_NODE, 'top')

    # <p>foo<strong>bar</strong>baz</p>
    p1 = Node(Node.ELEMENT_NODE, 'p')
    strong = Node(Node.ELEMENT_NODE, 'strong')
    strong.childNodes.append(Node(Node.TEXT_NODE, '#text', 'bar'))
    p1.childNodes.append(Node(Node.TEXT_NODE, '#text', 'foo'))
    p1.childNodes.append(strong)
    p1.childNodes.append(Node(Node.TEXT_NODE, '#text', 'baz'))

    # <p>qux</p>
    p2 = Node(Node.ELEMENT_NODE, 'p')
    p2.childNodes.append(Node(Node.TEXT_NODE, '#text', 'qux'))

    # <l><i><p>one <even>two</even> <odd>three</odd> <even>four</even> five</p></i></l>
    qa = Node(Node.ELEMENT_NODE, 'l', '#list')
    qb = Node(Node.ELEMENT_NODE, 'i', '#item')
    p3 = Node(Node.ELEMENT_NODE, 'p')
    qa.childNodes.append(qb)
    qb.childNodes.append(p3)

    two = Node(Node.ELEMENT_NODE, 'even')
    two.childNodes.append(Node(Node.TEXT_NODE, '#text', 'two'))
    three = Node(Node.ELEMENT_NODE, 'odd')
    three.childNodes.append(Node(Node.TEXT_NODE, '#text', 'three'))
    four = Node(Node.ELEMENT_NODE, 'even')
    four.childNodes.append(Node(Node.TEXT_NODE, '#text', 'four'))

    p3.childNodes.append(Node(Node.TEXT_NODE, '#text', 'one '))
    p3.childNodes.append(two)
    p3.childNodes.append(Node(Node.TEXT_NODE, '#text', ' '))
    p3.childNodes.append(three)
    p3.childNodes.append(Node(Node.TEXT_NODE, '#text', ' '))
    p3.childNodes.append(four)
    p3.childNodes.append(Node(Node.TEXT_NODE, '#text', ' five'))

    top.childNodes.append(p1)
    top.childNodes.append(p2)
    top.childNodes.append(qa)

    # Generator version.  print(x) with a single argument and a list
    # comprehension (instead of filter) behave the same on Python 2 and 3.
    text1 = list(_get_textContents(top))
    print(text1)
    print([s for s in trace if not s.startswith("T")])

    # List-filling version.
    text = []
    text2 = []
    _get_textContents2(top, text, text2)
    print(text2)
    print(text)
    print([s for s in trace2 if not s.startswith("T")])

    assert text1 == text2
    assert text == ['foobarbaz', 'qux', 'one two three four five']


if __name__ == '__main__':
    main()