Tags: python, web-scraping, web-crawler, playwright, playwright-python

How to extract Google's button elements via Playwright?


I have a code snippet that extracts the inputable and clickable node elements (i.e. interactive elements) from the DOM tree of web pages via Playwright in Python.

This code almost works properly, but in some cases it misses elements such as Google's buttons: the "Next" button on the Google sign-in page is not marked as clickable. Can someone identify the issue with this code?

Here's the code:

from playwright.sync_api import sync_playwright

# Tags that DOMNode.__repr__ renders without a closing tag when they have no children.
VOID_ELEMENTS = {
    "area", "base", "br", "col", "embed", "hr", "img",
    "input", "link", "meta", "param", "source", "track", "wbr",
}
# Attribute names copied into DOMNode.readable_attributes (and therefore printed).
READABLE_ATTRIBUTES = {
    "title", "alt", "href", "placeholder", "label",
    "value", "caption", "summary",
    "aria-label", "aria-describedby",
    "datetime", "download", "selected", "checked", "type",
}
# Not referenced by the code shown in this file.
UNCLICKABLE_ELEMENTS = {"html", "head", "body"}
# Tag names that are allowed to be reported as clickable.
CLICKABLE_ELEMENTS = {"a", "button", "img", "details", "summary"}
# Tag names that are always reported as inputable.
INPUT_ELEMENTS = {"input", "textarea", "select", "option"}


class DOMNode:
    """One node of a captured DOM snapshot, linkable into a parent/children tree.

    The constructor only reads the per-node columns of the snapshot; ``parent``,
    ``children``, ``bounds``, ``center`` and the input/clickable flags are filled
    in afterwards by the code that builds the tree.
    """

    def __init__(self, i, nodes, strings):
        """Populate the node at index ``i`` from the snapshot's parallel arrays.

        Args:
            i: Index of this node inside every per-node array of ``nodes``.
            nodes: Mapping of column name to per-node array (``parentIndex``,
                ``nodeType``, ``nodeName``, ``nodeValue``, ``backendNodeId``,
                ``attributes``).
            strings: String table that the index values in ``nodes`` point into.
        """
        self._on_screen = None
        self.parent = None
        self.children = []
        self.llm_id = None
        ### Only some nodes have these, default None to differentiate between None and False
        self.bounds = None
        self.center = None
        self.inputValue = None
        self.inputChecked = None
        self.isClickable = None
        self.optionSelected = None
        # A negative parent index marks a node without a parent.
        self.parentId = (
            nodes["parentIndex"][i] if nodes["parentIndex"][i] >= 0 else None
        )
        # NOTE(review): the CDP DOMSnapshot reference lists ``nodeType`` as an
        # array of integers rather than string-table indexes, so this lookup may
        # not yield a meaningful value -- confirm before relying on nodeType.
        self.nodeType = strings[nodes["nodeType"][i]]
        self.nodeName = strings[nodes["nodeName"][i]].lower()
        self.nodeValue = (
            strings[nodes["nodeValue"][i]].strip()
            if nodes["nodeValue"][i] >= 0
            else None
        )
        self.backendNodeId = nodes["backendNodeId"][i]

        # Attributes arrive as a flat [name, value, name, value, ...] index list.
        self.attributes = {}
        attrs = nodes["attributes"][i]
        for att1, att2 in zip(attrs[::2], attrs[1::2]):
            self.attributes[strings[att1]] = strings[att2][:100]  # cut off long URLs

        self.readable_attributes = {
            k: v for k, v in self.attributes.items() if k in READABLE_ATTRIBUTES
        }

    def __repr__(self, indent=0) -> str:
        """Render the subtree as indented pseudo-HTML.

        Args:
            indent: Number of spaces to put in front of this node's own lines;
                children are rendered two spaces deeper.

        Returns:
            The text of a ``#text`` node, or an opening tag (readable attributes
            only), the rendered children and a closing tag.
        """
        pad = " " * indent
        if self.nodeName == "#text":
            return pad + (self.nodeValue or "")

        attr_str = " ".join(f'{k}="{v}"' for k, v in self.readable_attributes.items())
        attr_str = " " + attr_str if attr_str else ""
        open_tag = f"<{self.nodeName}{attr_str}>"
        close_tag = f"</{self.nodeName}>"

        if not self.children:
            # Childless void elements are printed without a closing tag.
            return pad + open_tag + (
                close_tag if self.nodeName not in VOID_ELEMENTS else ""
            )

        # special case for elements with only one text child -> one-line element
        if len(self.children) == 1 and self.children[0].nodeName == "#text":
            return pad + open_tag + self.children[0].__repr__() + close_tag

        children_repr = "\n".join(
            child.__repr__(indent + 2) for child in self.children
        )
        return f"{pad}{open_tag}\n{children_repr}\n{pad}{close_tag}"

    def on_screen(self, screen_bounds):
        """Tell whether this node, or any descendant leaf, overlaps the screen.

        Args:
            screen_bounds: ``[x, y, width, height]`` of the visible area, in the
                same layout as ``self.bounds``.

        Returns:
            For a node with children: True if any child is on screen. For a
            leaf: True if it has a non-empty 4-element ``bounds`` rectangle that
            intersects ``screen_bounds``.
        """
        if self.children:
            return any(child.on_screen(screen_bounds) for child in self.children)

        if (
            self.bounds is None
            or len(self.bounds) != 4
            or self.bounds[2] * self.bounds[3] == 0
        ):
            return False

        x, y, w, h = self.bounds
        # Bounds are [x, y, w, h]: the first entry is the LEFT edge and the
        # second the TOP edge. The previous code unpacked them the other way
        # round, which only worked while the screen origin was (0, 0).
        win_left_bound, win_upper_bound, win_width, win_height = screen_bounds
        win_right_bound = win_left_bound + win_width
        win_lower_bound = win_upper_bound + win_height
        return (
            x < win_right_bound
            and x + w > win_left_bound
            and y < win_lower_bound
            and y + h > win_upper_bound
        )


class Globot:
    """Opens a Chromium page through Playwright and extracts its interactive elements."""

    def __init__(self, headless=False):
        """Start Playwright, launch Chromium and open one page in a new context.

        Args:
            headless: Forwarded to ``chromium.launch``.
        """
        # NOTE(review): the handle returned by start() is only kept in this
        # local, so nothing in the class can stop the driver later.
        playwright = sync_playwright().start()
        self.browser = playwright.chromium.launch(headless=headless)
        self.context = self.browser.new_context()
        self.page = self.context.new_page()

    def go_to_page(self, url):
        """Navigate to ``url`` and open a CDP session for the page.

        Args:
            url: Target address; ``https://`` is prepended when it contains no
                ``://``.
        """
        self.page.goto(url=url if "://" in url else "https://" + url)
        # A new CDP session is created on every call; crawl() sends its
        # snapshot request through it.
        self.client = self.page.context.new_cdp_session(self.page)
        self.page.wait_for_load_state("domcontentloaded")

    def crawl(self) -> tuple[dict[int, DOMNode], dict[int, DOMNode]]:
        """Snapshot the current page and collect its interactive elements.

        Returns:
            ``(input_elements, clickable_elements)``: two dicts keyed by a
            shared running id (also stored on the node as ``llm_id``). A node
            that is both inputable and clickable appears in both dicts under
            the same id.
        """
        dom = self.client.send(
            "DOMSnapshot.captureSnapshot",
            {"computedStyles": [], "includeDOMRects": True, "includePaintOrder": True},
        )

        # Only the first document of the snapshot is examined.
        dom_strings = dom["strings"]
        document = dom["documents"][0]
        dom_layout = document["layout"]
        dom_nodes = document["nodes"]

        # The first layout rectangle is used as the full-screen reference.
        screen_bounds = dom_layout["bounds"][0]
        # For some reason `window.devicePixelRatio` this gives the wrong answer sometimes
        device_pixel_ratio = screen_bounds[2] / self.page.evaluate(
            "window.screen.width"
        )

        nodes = []
        root = None

        # Takes much longer naively
        # Both dicts map a node index to its row in the corresponding sparse table.
        nodeIndex_flipped = {v: k for k, v in enumerate(dom_layout["nodeIndex"])}
        inputValue_flipped = {
            v: k for k, v in enumerate(dom_nodes["inputValue"]["index"])
        }
        for i in range(len(dom_nodes["parentIndex"])):
            node = DOMNode(i, dom_nodes, dom_strings)
            if i == 0:
                root = node

            if i in nodeIndex_flipped:
                # Scale the layout rectangle by the computed pixel ratio and
                # derive the centre point from it.
                bounds = dom_layout["bounds"][nodeIndex_flipped[i]]
                bounds = [int(b / device_pixel_ratio) for b in bounds]
                node.bounds = bounds
                node.center = (
                    int(bounds[0] + bounds[2] / 2),
                    int(bounds[1] + bounds[3] / 2),
                )

            # NOTE(review): these three membership tests run against the raw
            # "index" sequences for every node -- presumably lists, which makes
            # each test linear; confirm and consider sets.
            # Nodes missing from the sequence keep isClickable = None (not False).
            if i in dom_nodes["isClickable"]["index"]:
                node.isClickable = True

            if i in inputValue_flipped:
                v = dom_nodes["inputValue"]["value"][inputValue_flipped[i]]
                node.inputValue = dom_strings[v] if v >= 0 else ""
                # node.string_attributes['value'] = node.inputValue

            if i in dom_nodes["inputChecked"]["index"]:
                node.inputChecked = True

            if i in dom_nodes["optionSelected"]["index"]:
                node.optionSelected = True

            nodes.append(node)

        # Switch node ids to node pointers
        for node in nodes:
            if node.parentId is not None:
                node.parent = nodes[node.parentId]
                node.parent.children.append(node)

        count = 0
        input_elements = {}
        clickable_elements = {}

        def find_interactive_elements(node):
            """Depth-first walk that records interactive nodes under ``node``."""
            nonlocal count
            # This `and` chain yields None (not False) when the tag is in
            # CLICKABLE_ELEMENTS but node.isClickable was never set -- which is
            # what the "Clickable: None" line in the debug output shows.
            clickable = (
                node.nodeName in CLICKABLE_ELEMENTS
                and node.isClickable
                and node.center is not None
            )
            inputable = node.nodeName in INPUT_ELEMENTS or node.inputValue is not None

            # Special case for select and option elements
            select_or_option = node.nodeName == "select" or node.nodeName == "option"
            # Visible = overlaps the root rectangle and has no inline
            # "visibility: hidden" style.
            visible = node.on_screen(
                root.bounds
            ) and "visibility: hidden" not in node.attributes.get("style", "")

            # Debug output for every <button> node.
            if node.nodeName == "button":
                print(f"Node: {node.nodeName}")
                print(f"  Attributes: {node.attributes}")
                print(f"  Bounds: {node.bounds}")
                print(f"  Clickable: {clickable}")
                print(f"  Inputable: {inputable}")
                print(f"  Visible: {visible}")
                print(f"  Center: {node.center}")

            # `and` binds tighter than `or`: select/option nodes are recorded
            # even when they are not visible.
            if visible and (clickable or inputable) or select_or_option:
                if clickable:
                    clickable_elements[count] = node
                if inputable or select_or_option:
                    input_elements[count] = node
                node.llm_id = count
                count += 1

            for child in node.children:
                find_interactive_elements(child)

        find_interactive_elements(root)

        return input_elements, clickable_elements

Code snippet for reproducing the issue (here the Next button is not recognized as clickable):

from pprint import pprint

# Reproduction script: crawl the Google sign-in page and print every
# interactive element that crawl() reported.
bot = Globot()
bot.go_to_page(
    "https://accounts.google.com/v3/signin/identifier?authuser=0&continue=https%3A%2F%2Fwww.google.com%2F&ec=GAlAmgQ&hl=en&flowName=GlifWebSignIn&flowEntry=AddSession&dsh=S1040273122%3A1718390580872851&ddm=0"
)
inputs, clickables = bot.crawl()

# Both dicts share one id space. The union of their keys is a set, whose
# iteration order is not guaranteed, so sort it to list the nodes in the order
# crawl() numbered them.
parts = []
for i in sorted(inputs.keys() | clickables.keys()):
    inputable = i in inputs
    clickable = i in clickables
    # The same node object is stored under this id in whichever dict holds it.
    node = clickables[i] if clickable else inputs[i]

    parts.append(f"<node id={i} clickable={clickable} inputable={inputable}>\n")
    parts.append(node.__repr__(indent=2))
    parts.append("\n</node>\n")
html_description = "".join(parts)
pprint(html_description)

Here's the part of the log for the Next button; as you can see, Clickable is None:

Node: button
  Attributes: {'class': 'VfPpkd-LgbsSe VfPpkd-LgbsSe-OWXEXe-k8QpJ VfPpkd-LgbsSe-OWXEXe-dgl2Hf nCP5yc AjY5Oe DuMIQc LQeN7 BqKG', 'jscontroller': 'soHxf', 'jsaction': 'click:cOuCgd; mousedown:UX7yZ; mouseup:lbsD7e; mouseenter:tfO1Yc; mouseleave:JywGue; touchstart:p6p2', 'data-idom-class': 'nCP5yc AjY5Oe DuMIQc LQeN7 BqKGqe Jskylb TrZEUc lw1w4b', 'jsname': 'LgbsSe', 'type': 'button'}
  Bounds: [965, 453, 78, 40]
  Clickable: None
  Inputable: False
  Visible: True
  Center: (1004, 473)

Here's the RAW HTML of the Next button:

<button class="VfPpkd-LgbsSe VfPpkd-LgbsSe-OWXEXe-k8QpJ VfPpkd-LgbsSe-OWXEXe-dgl2Hf nCP5yc AjY5Oe DuMIQc LQeN7 BqKGqe Jskylb TrZEUc lw1w4b" jscontroller="soHxf" jsaction="click:cOuCgd; mousedown:UX7yZ; mouseup:lbsD7e; mouseenter:tfO1Yc; mouseleave:JywGue; touchstart:p6p2H; touchmove:FwuNnf; touchend:yfqBxc; touchcancel:JMtRjd; focus:AHmuwe; blur:O22p3e; contextmenu:mg9Pef;mlnRJb:fLiPzd;" data-idom-class="nCP5yc AjY5Oe DuMIQc LQeN7 BqKGqe Jskylb TrZEUc lw1w4b" jsname="LgbsSe" type="button"><div class="VfPpkd-Jh9lGc"></div><div class="VfPpkd-J1Ukfc-LhBDec"></div><div class="VfPpkd-RLmnJb"></div><span jsname="V67aGc" class="VfPpkd-vQzf8d">Next</span></button>

Here's the screenshot of the respective page:

enter image description here

I apologize if the code is too long and I appreciate any help in advance.


Solution

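  • I ended up with the following code. It uses sets for fast membership lookups and adds a new condition for clickability: a node is now also treated as clickable when it is a button or has an onclick attribute, even if the snapshot's isClickable flag is not set for it. That was the case for the Next button in the log above, whose click handling is declared through a jsaction attribute rather than onclick.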

    Here's the refined version:

    from playwright.sync_api import sync_playwright
    from pprint import pprint
    
    # Tags that DOMNode.__repr__ renders without a closing tag when they have no children.
    VOID_ELEMENTS = {
        "area", "base", "br", "col", "embed", "hr", "img",
        "input", "link", "meta", "param", "source", "track", "wbr",
    }
    # Attribute names copied into DOMNode.readable_attributes (and therefore printed).
    READABLE_ATTRIBUTES = {
        "title", "alt", "href", "placeholder", "label",
        "value", "caption", "summary",
        "aria-label", "aria-describedby",
        "datetime", "download", "selected", "checked", "type",
    }
    # Not referenced by the code shown in this file.
    UNCLICKABLE_ELEMENTS = {"html", "head", "body"}
    # Tag names that are allowed to be reported as clickable.
    CLICKABLE_ELEMENTS = {"a", "button", "img", "details", "summary", "ul", "li"}
    # Tag names that are always reported as inputable.
    INPUT_ELEMENTS = {"input", "textarea", "select", "option"}
    
    
    class DOMNode:
        """One node of a captured DOM snapshot, linkable into a parent/children tree.

        The constructor only reads the per-node columns of the snapshot; ``parent``,
        ``children``, ``bounds``, ``center`` and the input/clickable flags are filled
        in afterwards by the code that builds the tree.
        """

        def __init__(self, i, nodes, strings):
            """Populate the node at index ``i`` from the snapshot's parallel arrays.

            Args:
                i: Index of this node inside every per-node array of ``nodes``.
                nodes: Mapping of column name to per-node array (``parentIndex``,
                    ``nodeType``, ``nodeName``, ``nodeValue``, ``backendNodeId``,
                    ``attributes``).
                strings: String table that the index values in ``nodes`` point into.
            """
            self._on_screen = None
            self.parent = None
            self.children = []
            self.llm_id = None
            ### Only some nodes have these, default None to differentiate between None and False
            self.bounds = None
            self.center = None
            self.inputValue = None
            self.inputChecked = None
            self.isClickable = None
            self.optionSelected = None
            # A negative parent index marks a node without a parent.
            self.parentId = (
                nodes["parentIndex"][i] if nodes["parentIndex"][i] >= 0 else None
            )
            # NOTE(review): the CDP DOMSnapshot reference lists ``nodeType`` as an
            # array of integers rather than string-table indexes, so this lookup may
            # not yield a meaningful value -- confirm before relying on nodeType.
            self.nodeType = strings[nodes["nodeType"][i]]
            self.nodeName = strings[nodes["nodeName"][i]].lower()
            self.nodeValue = (
                strings[nodes["nodeValue"][i]].strip()
                if nodes["nodeValue"][i] >= 0
                else None
            )
            self.backendNodeId = nodes["backendNodeId"][i]

            # Attributes arrive as a flat [name, value, name, value, ...] index list.
            self.attributes = {}
            attrs = nodes["attributes"][i]
            for att1, att2 in zip(attrs[::2], attrs[1::2]):
                self.attributes[strings[att1]] = strings[att2][:100]  # cut off long URLs

            self.readable_attributes = {
                k: v for k, v in self.attributes.items() if k in READABLE_ATTRIBUTES
            }

        def __repr__(self, indent=0) -> str:
            """Render the subtree as indented pseudo-HTML.

            Args:
                indent: Number of spaces to put in front of this node's own lines;
                    children are rendered two spaces deeper.

            Returns:
                The text of a ``#text`` node, or an opening tag (readable attributes
                only), the rendered children and a closing tag.
            """
            pad = " " * indent
            if self.nodeName == "#text":
                return pad + (self.nodeValue or "")

            attr_str = " ".join(f'{k}="{v}"' for k, v in self.readable_attributes.items())
            attr_str = " " + attr_str if attr_str else ""
            open_tag = f"<{self.nodeName}{attr_str}>"
            close_tag = f"</{self.nodeName}>"

            if not self.children:
                # Childless void elements are printed without a closing tag.
                return pad + open_tag + (
                    close_tag if self.nodeName not in VOID_ELEMENTS else ""
                )

            # special case for elements with only one text child -> one-line element
            if len(self.children) == 1 and self.children[0].nodeName == "#text":
                return pad + open_tag + self.children[0].__repr__() + close_tag

            children_repr = "\n".join(
                child.__repr__(indent + 2) for child in self.children
            )
            return f"{pad}{open_tag}\n{children_repr}\n{pad}{close_tag}"

        def on_screen(self, screen_bounds):
            """Tell whether this node, or any descendant leaf, overlaps the screen.

            Args:
                screen_bounds: ``[x, y, width, height]`` of the visible area, in the
                    same layout as ``self.bounds``.

            Returns:
                For a node with children: True if any child is on screen. For a
                leaf: True if it has a non-empty 4-element ``bounds`` rectangle that
                intersects ``screen_bounds``.
            """
            if self.children:
                return any(child.on_screen(screen_bounds) for child in self.children)

            if (
                self.bounds is None
                or len(self.bounds) != 4
                or self.bounds[2] * self.bounds[3] == 0
            ):
                return False

            x, y, w, h = self.bounds
            # Bounds are [x, y, w, h]: the first entry is the LEFT edge and the
            # second the TOP edge. The previous code unpacked them the other way
            # round, which only worked while the screen origin was (0, 0).
            win_left_bound, win_upper_bound, win_width, win_height = screen_bounds
            win_right_bound = win_left_bound + win_width
            win_lower_bound = win_upper_bound + win_height
            return (
                x < win_right_bound
                and x + w > win_left_bound
                and y < win_lower_bound
                and y + h > win_upper_bound
            )
    
    
    class Globot:
        """Opens a Chromium page through Playwright and extracts its interactive elements.

        Can be used as a context manager; leaving the ``with`` block calls
        ``close()``.
        """

        def __init__(self, headless=False):
            """Start Playwright, launch Chromium and open one page in a new context.

            Args:
                headless: Forwarded to ``chromium.launch``.
            """
            # Keep the driver handle on the instance so close() can stop it;
            # holding it only in a local made a clean shutdown impossible.
            self.playwright = sync_playwright().start()
            self.browser = self.playwright.chromium.launch(headless=headless)
            self.context = self.browser.new_context()
            self.page = self.context.new_page()

        def __enter__(self):
            """Return the instance itself for use in a ``with`` statement."""
            return self

        def __exit__(self, exc_type, exc, tb):
            """Release the browser and the Playwright driver on leaving ``with``."""
            self.close()

        def close(self):
            """Close the browser and stop the Playwright driver started in ``__init__``."""
            self.browser.close()
            self.playwright.stop()

        def go_to_page(self, url):
            """Navigate to ``url`` and open a CDP session for the page.

            Args:
                url: Target address; ``https://`` is prepended when it contains no
                    ``://``.
            """
            self.page.goto(url=url if "://" in url else "https://" + url)
            # crawl() sends its snapshot request through this session.
            self.client = self.page.context.new_cdp_session(self.page)
            self.page.wait_for_load_state("domcontentloaded")

        def crawl(self) -> tuple[dict[int, DOMNode], dict[int, DOMNode]]:
            """Snapshot the current page and collect its interactive elements.

            Returns:
                ``(input_elements, clickable_elements)``: two dicts keyed by a
                shared running id (also stored on the node as ``llm_id``). A node
                that is both inputable and clickable appears in both dicts under
                the same id.
            """
            dom = self.client.send(
                "DOMSnapshot.captureSnapshot",
                {"computedStyles": [], "includeDOMRects": True, "includePaintOrder": True},
            )

            # Only the first document of the snapshot is examined.
            dom_strings = dom["strings"]
            document = dom["documents"][0]
            dom_layout = document["layout"]
            dom_nodes = document["nodes"]

            # The first layout rectangle is used as the full-screen reference.
            screen_bounds = dom_layout["bounds"][0]
            # For some reason `window.devicePixelRatio` this gives the wrong answer sometimes
            device_pixel_ratio = screen_bounds[2] / self.page.evaluate(
                "window.screen.width"
            )

            nodes = []
            root = None

            # Sparse per-node tables, turned into sets/dicts once so that the
            # per-node checks below are constant-time lookups.
            isClickable_set = set(dom_nodes.get("isClickable", {}).get("index", []))
            inputChecked_set = set(dom_nodes.get("inputChecked", {}).get("index", []))
            optionSelected_set = set(dom_nodes.get("optionSelected", {}).get("index", []))

            inputValue_table = dom_nodes.get("inputValue", {})
            inputValue_map = dict(
                zip(inputValue_table.get("index", []), inputValue_table.get("value", []))
            )

            # Maps a node index to its row in the layout table.
            nodeIndex_flipped = {v: k for k, v in enumerate(dom_layout["nodeIndex"])}
            for i in range(len(dom_nodes["parentIndex"])):
                node = DOMNode(i, dom_nodes, dom_strings)
                if i == 0:
                    root = node

                if i in nodeIndex_flipped:
                    # Scale the layout rectangle by the computed pixel ratio and
                    # derive the centre point from it.
                    bounds = dom_layout["bounds"][nodeIndex_flipped[i]]
                    bounds = [int(b / device_pixel_ratio) for b in bounds]
                    node.bounds = bounds
                    node.center = (
                        int(bounds[0] + bounds[2] / 2),
                        int(bounds[1] + bounds[3] / 2),
                    )

                node.isClickable = i in isClickable_set
                node.inputChecked = i in inputChecked_set
                node.optionSelected = i in optionSelected_set

                if i in inputValue_map:
                    v = inputValue_map[i]
                    node.inputValue = dom_strings[v] if v >= 0 else ""

                nodes.append(node)

            # Switch node ids to node pointers
            for node in nodes:
                if node.parentId is not None:
                    node.parent = nodes[node.parentId]
                    node.parent.children.append(node)

            count = 0
            input_elements = {}
            clickable_elements = {}

            def find_interactive_elements(node):
                """Depth-first walk that records interactive nodes under ``node``."""
                nonlocal count
                # The snapshot's isClickable flag alone misses some elements, so
                # a <button> tag or an onclick attribute also counts.
                clickable = (
                    node.nodeName in CLICKABLE_ELEMENTS
                    and node.center is not None
                    and (
                        node.isClickable
                        or node.nodeName == "button"
                        or "onclick" in node.attributes
                    )
                )
                inputable = node.nodeName in INPUT_ELEMENTS or node.inputValue is not None

                # Special case for select and option elements
                select_or_option = node.nodeName in {"select", "option"}
                # Visible = overlaps the root rectangle and has no inline
                # "visibility: hidden" style.
                visible = node.on_screen(
                    root.bounds
                ) and "visibility: hidden" not in node.attributes.get("style", "")

                # select/option nodes are recorded even when they are not
                # visible; the parentheses spell out the and/or precedence.
                if (visible and (clickable or inputable)) or select_or_option:
                    if clickable:
                        clickable_elements[count] = node
                    if inputable or select_or_option:
                        input_elements[count] = node
                    node.llm_id = count
                    count += 1

                for child in node.children:
                    find_interactive_elements(child)

            find_interactive_elements(root)

            return input_elements, clickable_elements
    
    
    # Driver script: crawl the Google sign-in page and print every interactive
    # element that crawl() reported.
    bot = Globot()
    bot.go_to_page(
        "https://accounts.google.com/v3/signin/identifier?authuser=0&continue=https%3A%2F%2Fwww.google.com%2F&ec=GAlAmgQ&hl=en&flowName=GlifWebSignIn&flowEntry=AddSession&dsh=S1040273122%3A1718390580872851&ddm=0"
    )
    inputs, clickables = bot.crawl()

    # Both dicts share one id space. The union of their keys is a set, whose
    # iteration order is not guaranteed, so sort it to list the nodes in the
    # order crawl() numbered them.
    parts = []
    for i in sorted(inputs.keys() | clickables.keys()):
        inputable = i in inputs
        clickable = i in clickables
        # The same node object is stored under this id in whichever dict holds it.
        node = clickables[i] if clickable else inputs[i]

        parts.append(f"<node id={i} clickable={clickable} inputable={inputable}>\n")
        parts.append(node.__repr__(indent=2))
        parts.append("\n</node>\n")
    html_description = "".join(parts)
    pprint(html_description)
    

    Output:

    ('<node id=0 clickable=False inputable=True>\n'
     '  <input type="email" aria-label="Email or phone" '
     'value="2D7AB92D588040EBA91955F62E1BEE47">\n'
     '</node>\n'
     '<node id=1 clickable=True inputable=False>\n'
     '  <button type="button">\n'
     '    <::before></::before>\n'
     '    Forgot email?\n'
     '  </button>\n'
     '</node>\n'
     '<node id=2 clickable=True inputable=False>\n'
     '  <a href="https://support.google.com/chrome/answer/6130773?hl=en">\n'
     '    <::before></::before>\n'
     '    Learn more about using Guest mode\n'
     '  </a>\n'
     '</node>\n'
     '<node id=3 clickable=True inputable=False>\n'
     '  <button type="button">\n'
     '    <div>\n'
     '      <::before></::before>\n'
     '      <::after></::after>\n'
     '    </div>\n'
     '    <div></div>\n'
     '    <div></div>\n'
     '    <span>Next</span>\n'
     '  </button>\n'
     '</node>\n'
     '<node id=4 clickable=True inputable=False>\n'
     '  <button type="button">\n'
     '    <div>\n'
     '      <::before></::before>\n'
     '      <::after></::after>\n'
     '    </div>\n'
     '    <div></div>\n'
     '    <div></div>\n'
     '    <span>Create account</span>\n'
     '  </button>\n'
     '</node>\n'
     '<node id=5 clickable=True inputable=False>\n'
     '  <a href="https://support.google.com/accounts?hl=en&p=account_iph">\n'
     '    <::before></::before>\n'
     '    Help\n'
     '  </a>\n'
     '</node>\n'
     '<node id=6 clickable=True inputable=False>\n'
     '  <a href="https://accounts.google.com/TOS?loc=GB&hl=en&privacy=true">\n'
     '    <::before></::before>\n'
     '    Privacy\n'
     '  </a>\n'
     '</node>\n'
     '<node id=7 clickable=True inputable=False>\n'
     '  <a href="https://accounts.google.com/TOS?loc=GB&hl=en">\n'
     '    <::before></::before>\n'
     '    Terms\n'
     '  </a>\n'
     '</node>\n')
    

    As the output above shows, the Next button and the other clickable elements are now detected properly.