Comment 0 for bug 1833083

Revision history for this message
Alexander Lebedev (woutut) wrote :

Cleaner().clean_html() replaces the root <html> element with <div>.

In [20]: import lxml

In [21]: import lxml.html

In [22]: from lxml.html.clean import Cleaner

In [23]: text = u"""<html>
    ...: <body>
    ...: <h1>Hello, Parsel!</h1>
    ...: <ul>
    ...: <li><a href="http://example.com">Link 1</a></li>
    ...: <li><a href="http://scrapy.org">Link 2</a></li>
    ...: </ul>
    ...: </body>
    ...: </html>"""

In [24]: html_root = lxml.html.document_fromstring(text)

In [25]: html = Cleaner().clean_html(html_root)

In [26]: lxml.html.tostring(html)
Out[26]: '<div>\n <body>\n <h1>Hello, Parsel!</h1>\n <ul>\n <li><a href="http://example.com">Link 1</a></li>\n <li><a href="http://scrapy.org">Link 2</a></li>\n </ul>\n </body>\n </div>'