Beautiful Soup

Bug #1974105
Comment #6

Comment 6 for bug 1974105

Revision history for this message

Luke P (lkp80877984) wrote on 2022-05-24: Re: slow performance within Tag.index(child)

"wasn't designed for"
Well, to your credit, it does that pretty darn well!

Here (btw, I'm learning Python, so you're welcome to give other feedback):

```
class TestMonkeyPatchBs4(unittest.TestCase):
    """Randomized round-trip test of Beautiful Soup tree edits and ``Tag.index``.

    Phase one simulates random edits on a plain string until it is empty,
    recording each edit. Phase two replays the *inverse* of those edits, in
    reverse order, against a ``<body>`` tag that starts out empty, so the
    body must finish with one child per character of the initial string.
    """

    # Starting string for phase one. The trailing "x" is padding: five
    # removals can be drawn before a replace, and with only five characters
    # the string would then be empty and ``randrange(0)`` would raise
    # ValueError. Vanishingly unlikely, but it can happen.
    _INITIAL = "abcdex"

    # Operation pool shuffled per seed: "+c" inserts character c, "!?" swaps
    # the case of a random character, "-?" removes a random character, and
    # "@" is a checkpoint at which Tag.index is verified during the replay.
    _PROPOSED_OPS = (
        ["+f", "+g", "+h", "+i", "+j"] + ["!?"] * 5 + ["-?"] * 5 + ["@"] * 2
    )

    def test_bs4(self):
        """Replay 1000 seeded edit sequences and check the resulting markup."""
        # Every node that survives the replay was created as a <p> wrapper
        # around one original character, so the serialized body is the
        # initial string with each character wrapped in <p>...</p>.
        wrapped = "".join("<p>" + char + "</p>" for char in self._INITIAL)
        expected = ("<html><body>" + wrapped + "</body></html>").encode("utf-8")

        for seed in range(1000):
            # subTest reports the failing seed so a failure can be reproduced.
            with self.subTest(seed=seed):
                ops = self._build_ops(seed)
                soup = self._replay(ops)
                self.assertEqual(expected, soup.encode(encoding="utf-8"))

    @classmethod
    def _build_ops(cls, seed):
        """Phase one: return the edit log that reduces ``_INITIAL`` to "".

        Each entry is a tuple: ``("+", i)`` for an insertion at index ``i``,
        ``("-", i, char)`` for removal of ``char`` at ``i``, ``("!", i, char)``
        for a case swap of ``char`` at ``i`` (``char`` is the value *before*
        the swap), or ``("@",)`` for an index-check marker.
        """
        # A private generator gives the same sequence as random.seed(seed)
        # without disturbing the global RNG state used by other tests.
        rng = random.Random(seed)
        current = cls._INITIAL
        pending = list(cls._PROPOSED_OPS)
        rng.shuffle(pending)

        ops = []
        while pending:
            proposed = pending.pop()
            # An index is drawn for every operation, including "@", so the
            # random sequence does not depend on which operation came up.
            i = rng.randrange(len(current))
            char = current[i]
            kind = proposed[0]
            if kind == "+":
                ops.append(("+", i))
                current = current[:i] + proposed[1] + current[i:]
            elif kind == "-":
                ops.append(("-", i, char))
                current = current[:i] + current[i + 1:]
            elif kind == "!":
                ops.append(("!", i, char))
                current = current[:i] + char.swapcase() + current[i + 1:]
            else:
                ops.append(("@",))

        # Drain whatever is left so that the replay starts from an empty body.
        while current:
            i = rng.randrange(len(current))
            ops.append(("-", i, current[i]))
            current = current[:i] + current[i + 1:]
        return ops

    def _replay(self, ops):
        """Phase two: apply the inverse of ``ops`` in reverse; return the soup.

        The body starts empty and should finish holding one ``<p>`` per
        character of ``_INITIAL``, which gives a reliable final assertion.
        """
        soup = bs4.BeautifulSoup("<html></html>", "html.parser")
        body = soup.new_tag("body")
        soup.html.append(body)

        for rop in reversed(ops):
            kind = rop[0]
            if kind == "+":
                # Inverse of an insertion: extract the node at that index.
                body.contents[rop[1]].extract()
            elif kind == "-":
                # Inverse of a removal: insert the removed character back.
                p_tag = soup.new_tag("p")
                p_tag.append(rop[2])
                body.insert(rop[1], p_tag)
            elif kind == "!":
                # Inverse of a case swap: replace_with the original character.
                p_tag = soup.new_tag("p")
                p_tag.append(rop[2])
                body.contents[rop[1]].replace_with(p_tag)
            else:
                # Checkpoint: index() must agree with each child's position.
                for i, child in enumerate(body.contents):
                    self.assertEqual(i, body.index(child))
        return soup
```

Cheers,
Luke

"wasn't designed for"
Well, to your credit, it does that pretty darn well!

Here (btw, I'm learning Python, so you're welcome to give other feedback):

```
        self.assertEqual(
            b"<html><body>abcdex</body></html>",
            soup.encode(encoding="utf-8"),
        )
```

Cheers,
Luke