class Node:
    """Minimal DOM-like tree node used to exercise ``_get_textContents``.

    Each instance carries a ``nodeType`` tag, a ``nodeName``, a ``nodeValue``
    and a ``childNodes`` list that starts out empty.
    """

    # Node-type tags.  Within this file they are only ever compared for
    # equality against ``nodeType``, so the numbers just need to be distinct.
    COMMENT_NODE = 1
    PROCESSING_INSTRUCTION_NODE = 2
    TEXT_NODE = 3
    ATTRIBUTE_NODE = 4
    DOCUMENT_TYPE_NODE = 5
    NOTATION_NODE = 6
    ENTITY_NODE = 7
    ELEMENT_NODE = 8

    def __init__(self, nodeType, nodeName='', nodeValue=''):
        """Create a node of the given type with no children."""
        self.nodeType = nodeType
        self.nodeName = nodeName
        self.nodeValue = nodeValue
        self.childNodes = []


def _get_textContents(node):
    """Yield the ``nodeValue`` of every text node below *node*.

    Descendants are visited depth-first in document order.  Comment,
    processing-instruction, attribute, document-type, notation and entity
    nodes are skipped together with anything beneath them; every other
    non-text node is descended into.

    The traversal keeps an explicit stack of child iterators instead of
    recursing, so arbitrarily deep trees do not hit the interpreter's
    recursion limit.

    The type constants are read from each child itself (``child.TEXT_NODE``
    and so on), so any object exposing ``nodeType``, ``nodeValue``,
    ``childNodes`` and those constants can be walked.
    """
    # Each stack entry is a partially consumed iterator over one node's
    # children; the top of the stack is the node currently being walked.
    pending = [iter(node.childNodes)]
    while pending:
        for child in pending[-1]:
            nodeType = child.nodeType
            if nodeType == child.TEXT_NODE:
                yield child.nodeValue
            elif nodeType in (
                child.COMMENT_NODE,
                child.PROCESSING_INSTRUCTION_NODE,
                child.ATTRIBUTE_NODE,
                child.DOCUMENT_TYPE_NODE,
                child.NOTATION_NODE,
                child.ENTITY_NODE,
            ):
                # Skipped entirely (you might prefer, e.g., to look inside
                # attributes instead).
                continue
            else:
                # Whatever is left is treated as an element: walk its
                # children next, then resume this level where we stopped.
                pending.append(iter(child.childNodes))
                break
        else:
            # This level is exhausted; go back to the parent's iterator.
            pending.pop()


def main():
    """Build a small sample tree and check the extracted text."""
    top = Node(Node.ELEMENT_NODE, 'top')
    p1 = Node(Node.ELEMENT_NODE, 'p')
    p2 = Node(Node.ELEMENT_NODE, 'p')
    top.childNodes.append(p1)
    top.childNodes.append(p2)

    # First paragraph: text, an inline <strong> with its own text, more text.
    p1.childNodes.append(Node(Node.TEXT_NODE, '#text', 'foo'))
    strong = Node(Node.ELEMENT_NODE, 'strong')
    strong.childNodes.append(Node(Node.TEXT_NODE, '#text', 'bar'))
    p1.childNodes.append(strong)
    p1.childNodes.append(Node(Node.TEXT_NODE, '#text', 'baz'))

    # Second paragraph: a single text node.
    p2.childNodes.append(Node(Node.TEXT_NODE, '#text', 'qux'))

    assert list(_get_textContents(top)) == ['foo', 'bar', 'baz', 'qux']
    assert ' '.join(_get_textContents(top)) == 'foo bar baz qux'


if __name__ == '__main__':
    main()