python web-scraping web-crawler playwright playwright-python

How to extract Google's button elements via Playwright?

I have a code snippet to extract the inputable and clickable node elements (i.e. interactive elements) from the DOM tree of the web pages via Playwright in python.

The code mostly works, but in some cases it misses elements such as Google's buttons: the "Next" button on the Google sign-in page is not reported as clickable. Can someone identify the issue with this code?

Here's the code:

from playwright.sync_api import sync_playwright

# Tags that DOMNode.__repr__ prints without a closing tag when they have no
# children.
VOID_ELEMENTS = {
    "area", "base", "br", "col", "embed", "hr", "img",
    "input", "link", "meta", "param", "source", "track", "wbr",
}

# Attribute names copied into DOMNode.readable_attributes and therefore shown
# in the rendered tag; every other attribute is omitted from the output.
READABLE_ATTRIBUTES = {
    "title", "alt", "href", "placeholder", "label",
    "value", "caption", "summary",
    "aria-label", "aria-describedby",
    "datetime", "download", "selected", "checked", "type",
}

# Not referenced by the code in this snippet.
UNCLICKABLE_ELEMENTS = {"html", "head", "body"}

# Tag names that Globot.crawl may report as clickable.
CLICKABLE_ELEMENTS = {"a", "button", "img", "details", "summary"}

# Tag names that Globot.crawl always treats as inputable.
INPUT_ELEMENTS = {"input", "textarea", "select", "option"}


class DOMNode:
    """A single node of a ``DOMSnapshot.captureSnapshot`` document.

    The snapshot stores the tree as parallel arrays; ``i`` selects this node's
    entry in each of them.  Layout and form state (``bounds``, ``center``,
    ``inputValue``, ``inputChecked``, ``isClickable``, ``optionSelected``) and
    the ``parent``/``children`` links are only initialised to defaults here;
    the code that builds the tree fills them in afterwards.
    """

    def __init__(self, i, nodes, strings):
        """Read node ``i`` out of the snapshot arrays.

        Args:
            i: Index of the node in the parallel arrays of ``nodes``.
            nodes: The ``nodes`` object of one snapshot document.
            strings: The snapshot's shared string table.
        """
        self._on_screen = None
        self.parent = None
        self.children = []
        self.llm_id = None
        # Only some nodes have these; default to None so that "absent" can be
        # told apart from False.
        self.bounds = None
        self.center = None
        self.inputValue = None
        self.inputChecked = None
        self.isClickable = None
        self.optionSelected = None

        parent_index = nodes["parentIndex"][i]
        self.parentId = parent_index if parent_index >= 0 else None
        # The snapshot's ``nodeType`` array holds the DOM node-type integers
        # themselves, not string-table indexes, so it must not be looked up in
        # ``strings`` (doing so yields an unrelated string).
        self.nodeType = nodes["nodeType"][i]
        self.nodeName = strings[nodes["nodeName"][i]].lower()
        value_index = nodes["nodeValue"][i]
        self.nodeValue = strings[value_index].strip() if value_index >= 0 else None
        self.backendNodeId = nodes["backendNodeId"][i]

        # Attributes arrive flattened as [name, value, name, value, ...],
        # each entry being a string-table index.
        attrs = nodes["attributes"][i]
        self.attributes = {
            strings[name]: strings[value][:100]  # cut off long URLs
            for name, value in zip(attrs[::2], attrs[1::2])
        }

        self.readable_attributes = {
            k: v for k, v in self.attributes.items() if k in READABLE_ATTRIBUTES
        }

    def __repr__(self, indent=0) -> str:
        """Render this node and its subtree as indented pseudo-HTML.

        Args:
            indent: Number of spaces to put in front of this node's lines;
                children are indented two further spaces.
        """
        pad = " " * indent
        if self.nodeName == "#text":
            return pad + (self.nodeValue or "")

        attr_str = " ".join(f'{k}="{v}"' for k, v in self.readable_attributes.items())
        attr_str = " " + attr_str if attr_str else ""
        open_tag = f"<{self.nodeName}{attr_str}>"
        close_tag = f"</{self.nodeName}>"

        if not self.children:
            # Void elements are printed without a closing tag.
            return pad + open_tag + (
                close_tag if self.nodeName not in VOID_ELEMENTS else ""
            )

        # special case for elements with only one text child -> one-line element
        if len(self.children) == 1 and self.children[0].nodeName == "#text":
            return pad + open_tag + self.children[0].__repr__() + close_tag

        children_repr = "\n".join(child.__repr__(indent + 2) for child in self.children)
        return pad + open_tag + "\n" + children_repr + "\n" + pad + close_tag

    def on_screen(self, screen_bounds):
        """Return whether this node overlaps the given rectangle.

        A node with children is considered on screen when any of its
        descendants' leaves is; its own bounds are not consulted.  A leaf must
        have a four-element ``bounds`` with non-zero area.

        Args:
            screen_bounds: ``[x, y, width, height]`` of the visible area, in
                the same coordinate system as ``self.bounds``.
        """
        if self.children:
            return any(child.on_screen(screen_bounds) for child in self.children)

        if (
            self.bounds is None
            or len(self.bounds) != 4
            or self.bounds[2] * self.bounds[3] == 0
        ):
            return False

        x, y, w, h = self.bounds
        # Same [x, y, width, height] layout as ``self.bounds``: x is the left
        # edge and y the top edge.  (Unpacking these in the opposite order
        # only happens to work when the rectangle starts at (0, 0).)
        win_left_bound, win_upper_bound, win_width, win_height = screen_bounds
        win_right_bound = win_left_bound + win_width
        win_lower_bound = win_upper_bound + win_height
        return (
            x < win_right_bound
            and x + w > win_left_bound
            and y < win_lower_bound
            and y + h > win_upper_bound
        )


class Globot:
    """Open a Chromium page with Playwright and list its interactive elements.

    This is the version that reproduces the reported problem: a ``<button>``
    that is missing from the snapshot's ``isClickable`` index is not reported
    as clickable (see the note inside ``find_interactive_elements``).
    """

    def __init__(self, headless=False):
        """Start Playwright, launch Chromium and open one blank page.

        Args:
            headless: Passed through to ``chromium.launch``.
        """
        # NOTE(review): the Playwright handle is kept only in a local, so
        # nothing in this class can stop it later.
        playwright = sync_playwright().start()
        self.browser = playwright.chromium.launch(headless=headless)
        self.context = self.browser.new_context()
        self.page = self.context.new_page()

    def go_to_page(self, url):
        """Navigate to ``url`` and open a CDP session for the page.

        ``https://`` is prepended when ``url`` contains no ``://``.
        """
        self.page.goto(url=url if "://" in url else "https://" + url)
        # crawl() sends its DOMSnapshot command through this session.
        self.client = self.page.context.new_cdp_session(self.page)
        self.page.wait_for_load_state("domcontentloaded")

    def crawl(self) -> tuple[dict[int, DOMNode], dict[int, DOMNode]]:
        """Snapshot the current page and collect its interactive nodes.

        Requires ``go_to_page`` to have been called first, because that is
        where ``self.client`` is created.

        Returns:
            ``(input_elements, clickable_elements)``: two dicts keyed by a
            running id that is shared between them (also stored on the node
            as ``llm_id``), mapping to the matching ``DOMNode``.
        """
        dom = self.client.send(
            "DOMSnapshot.captureSnapshot",
            {"computedStyles": [], "includeDOMRects": True, "includePaintOrder": True},
        )

        dom_strings = dom["strings"]
        # Only the first document of the snapshot is processed.
        document = dom["documents"][0]
        dom_layout = document["layout"]
        dom_nodes = document["nodes"]

        screen_bounds = dom_layout["bounds"][0]
        # `window.devicePixelRatio` sometimes gives the wrong answer, so the
        # ratio is derived from the first layout box's width and
        # `window.screen.width` instead.
        device_pixel_ratio = screen_bounds[2] / self.page.evaluate(
            "window.screen.width"
        )

        nodes = []
        root = None

        # Invert the index lists into dicts once (node index -> position);
        # searching the lists for every node takes much longer.
        nodeIndex_flipped = {v: k for k, v in enumerate(dom_layout["nodeIndex"])}
        inputValue_flipped = {
            v: k for k, v in enumerate(dom_nodes["inputValue"]["index"])
        }
        for i in range(len(dom_nodes["parentIndex"])):
            node = DOMNode(i, dom_nodes, dom_strings)
            if i == 0:
                root = node

            # Nodes with a layout entry get scaled bounds and a centre point.
            if i in nodeIndex_flipped:
                bounds = dom_layout["bounds"][nodeIndex_flipped[i]]
                bounds = [int(b / device_pixel_ratio) for b in bounds]
                node.bounds = bounds
                node.center = (
                    int(bounds[0] + bounds[2] / 2),
                    int(bounds[1] + bounds[3] / 2),
                )

            # isClickable is only ever set to True here; nodes that are not
            # in the index keep the default None.
            if i in dom_nodes["isClickable"]["index"]:
                node.isClickable = True

            if i in inputValue_flipped:
                v = dom_nodes["inputValue"]["value"][inputValue_flipped[i]]
                node.inputValue = dom_strings[v] if v >= 0 else ""
                # node.string_attributes['value'] = node.inputValue

            if i in dom_nodes["inputChecked"]["index"]:
                node.inputChecked = True

            if i in dom_nodes["optionSelected"]["index"]:
                node.optionSelected = True

            nodes.append(node)

        # Switch node ids to node pointers
        for node in nodes:
            if node.parentId is not None:
                node.parent = nodes[node.parentId]
                node.parent.children.append(node)

        count = 0
        input_elements = {}
        clickable_elements = {}

        def find_interactive_elements(node):
            """Walk the subtree depth-first and record interactive nodes."""
            nonlocal count
            # NOTE: when the tag is in CLICKABLE_ELEMENTS but node.isClickable
            # is still None, this expression evaluates to None rather than
            # True, so the node is not treated as clickable.  This is what
            # the "Clickable: None" line in the debug output shows.
            clickable = (
                node.nodeName in CLICKABLE_ELEMENTS
                and node.isClickable
                and node.center is not None
            )
            inputable = node.nodeName in INPUT_ELEMENTS or node.inputValue is not None

            # Special case for select and option elements
            select_or_option = node.nodeName == "select" or node.nodeName == "option"
            visible = node.on_screen(
                root.bounds
            ) and "visibility: hidden" not in node.attributes.get("style", "")

            # Debug output for every <button> encountered.
            if node.nodeName == "button":
                print(f"Node: {node.nodeName}")
                print(f"  Attributes: {node.attributes}")
                print(f"  Bounds: {node.bounds}")
                print(f"  Clickable: {clickable}")
                print(f"  Inputable: {inputable}")
                print(f"  Visible: {visible}")
                print(f"  Center: {node.center}")

            # `and` binds tighter than `or`: select/option nodes are recorded
            # even when they are not visible.
            if visible and (clickable or inputable) or select_or_option:
                if clickable:
                    clickable_elements[count] = node
                if inputable or select_or_option:
                    input_elements[count] = node
                node.llm_id = count
                count += 1

            for child in node.children:
                find_interactive_elements(child)

        find_interactive_elements(root)

        return input_elements, clickable_elements

Code snippet for reproducing the issue (here the Next button is not recognized as clickable):

from pprint import pprint

# Open the Google sign-in page and collect its interactive elements.
bot = Globot()
bot.go_to_page(
    "https://accounts.google.com/v3/signin/identifier?authuser=0&continue=https%3A%2F%2Fwww.google.com%2F&ec=GAlAmgQ&hl=en&flowName=GlifWebSignIn&flowEntry=AddSession&dsh=S1040273122%3A1718390580872851&ddm=0"
)
inputs, clickables = bot.crawl()

# Build one <node> entry per element id.  The union of the two key views is a
# set, which has no guaranteed iteration order, so sort the ids to get a
# stable, ascending listing.
parts = []
for i in sorted(inputs.keys() | clickables.keys()):
    inputable = i in inputs
    clickable = i in clickables
    # An id present in both dicts refers to the same node in each.
    node = clickables[i] if clickable else inputs[i]

    parts.append(f"<node id={i} clickable={clickable} inputable={inputable}>\n")
    parts.append(node.__repr__(indent=2))
    parts.append("\n</node>\n")
html_description = "".join(parts)
pprint(html_description)

Here's the part of the log regarding the Next element - as you can see the Clickable is set to None:

Node: button
  Attributes: {'class': 'VfPpkd-LgbsSe VfPpkd-LgbsSe-OWXEXe-k8QpJ VfPpkd-LgbsSe-OWXEXe-dgl2Hf nCP5yc AjY5Oe DuMIQc LQeN7 BqKG', 'jscontroller': 'soHxf', 'jsaction': 'click:cOuCgd; mousedown:UX7yZ; mouseup:lbsD7e; mouseenter:tfO1Yc; mouseleave:JywGue; touchstart:p6p2', 'data-idom-class': 'nCP5yc AjY5Oe DuMIQc LQeN7 BqKGqe Jskylb TrZEUc lw1w4b', 'jsname': 'LgbsSe', 'type': 'button'}
  Bounds: [965, 453, 78, 40]
  Clickable: None
  Inputable: False
  Visible: True
  Center: (1004, 473)

Here's the RAW HTML of the Next button:

<button class="VfPpkd-LgbsSe VfPpkd-LgbsSe-OWXEXe-k8QpJ VfPpkd-LgbsSe-OWXEXe-dgl2Hf nCP5yc AjY5Oe DuMIQc LQeN7 BqKGqe Jskylb TrZEUc lw1w4b" jscontroller="soHxf" jsaction="click:cOuCgd; mousedown:UX7yZ; mouseup:lbsD7e; mouseenter:tfO1Yc; mouseleave:JywGue; touchstart:p6p2H; touchmove:FwuNnf; touchend:yfqBxc; touchcancel:JMtRjd; focus:AHmuwe; blur:O22p3e; contextmenu:mg9Pef;mlnRJb:fLiPzd;" data-idom-class="nCP5yc AjY5Oe DuMIQc LQeN7 BqKGqe Jskylb TrZEUc lw1w4b" jsname="LgbsSe" type="button"><div class="VfPpkd-Jh9lGc"></div><div class="VfPpkd-J1Ukfc-LhBDec"></div><div class="VfPpkd-RLmnJb"></div><span jsname="V67aGc" class="VfPpkd-vQzf8d">Next</span></button>

Here's the screenshot of the respective page:

I apologize if the code is too long and I appreciate any help in advance.

Solution

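I ended up with the following code. It makes two changes. First, the index lists from the snapshot are converted to sets (and a dict for the input values) for fast lookup. Second, a node whose tag is in CLICKABLE_ELEMENTS is now treated as clickable if it is a button or has an onclick attribute, even when the snapshot does not flag it as clickable. The second change is what fixes the Next button: the snapshot's isClickable flag only covers nodes that have click listeners attached via JavaScript (plus anchors that navigate), and Google's button apparently handles clicks through its delegated jsaction handlers, so it was never flagged.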

Here's the refined version:

from playwright.sync_api import sync_playwright
from pprint import pprint

# Tags that DOMNode.__repr__ prints without a closing tag when they have no
# children.
VOID_ELEMENTS = {
    "area", "base", "br", "col", "embed", "hr", "img",
    "input", "link", "meta", "param", "source", "track", "wbr",
}

# Attribute names copied into DOMNode.readable_attributes and therefore shown
# in the rendered tag; every other attribute is omitted from the output.
READABLE_ATTRIBUTES = {
    "title", "alt", "href", "placeholder", "label",
    "value", "caption", "summary",
    "aria-label", "aria-describedby",
    "datetime", "download", "selected", "checked", "type",
}

# Not referenced by the code in this snippet.
UNCLICKABLE_ELEMENTS = {"html", "head", "body"}

# Tag names that Globot.crawl may report as clickable.
CLICKABLE_ELEMENTS = {"a", "button", "img", "details", "summary", "ul", "li"}

# Tag names that Globot.crawl always treats as inputable.
INPUT_ELEMENTS = {"input", "textarea", "select", "option"}


class DOMNode:
    """A single node of a ``DOMSnapshot.captureSnapshot`` document.

    The snapshot stores the tree as parallel arrays; ``i`` selects this node's
    entry in each of them.  Layout and form state (``bounds``, ``center``,
    ``inputValue``, ``inputChecked``, ``isClickable``, ``optionSelected``) and
    the ``parent``/``children`` links are only initialised to defaults here;
    the code that builds the tree fills them in afterwards.
    """

    def __init__(self, i, nodes, strings):
        """Read node ``i`` out of the snapshot arrays.

        Args:
            i: Index of the node in the parallel arrays of ``nodes``.
            nodes: The ``nodes`` object of one snapshot document.
            strings: The snapshot's shared string table.
        """
        self._on_screen = None
        self.parent = None
        self.children = []
        self.llm_id = None
        # Only some nodes have these; default to None so that "absent" can be
        # told apart from False.
        self.bounds = None
        self.center = None
        self.inputValue = None
        self.inputChecked = None
        self.isClickable = None
        self.optionSelected = None

        parent_index = nodes["parentIndex"][i]
        self.parentId = parent_index if parent_index >= 0 else None
        # The snapshot's ``nodeType`` array holds the DOM node-type integers
        # themselves, not string-table indexes, so it must not be looked up in
        # ``strings`` (doing so yields an unrelated string).
        self.nodeType = nodes["nodeType"][i]
        self.nodeName = strings[nodes["nodeName"][i]].lower()
        value_index = nodes["nodeValue"][i]
        self.nodeValue = strings[value_index].strip() if value_index >= 0 else None
        self.backendNodeId = nodes["backendNodeId"][i]

        # Attributes arrive flattened as [name, value, name, value, ...],
        # each entry being a string-table index.
        attrs = nodes["attributes"][i]
        self.attributes = {
            strings[name]: strings[value][:100]  # cut off long URLs
            for name, value in zip(attrs[::2], attrs[1::2])
        }

        self.readable_attributes = {
            k: v for k, v in self.attributes.items() if k in READABLE_ATTRIBUTES
        }

    def __repr__(self, indent=0) -> str:
        """Render this node and its subtree as indented pseudo-HTML.

        Args:
            indent: Number of spaces to put in front of this node's lines;
                children are indented two further spaces.
        """
        pad = " " * indent
        if self.nodeName == "#text":
            return pad + (self.nodeValue or "")

        attr_str = " ".join(f'{k}="{v}"' for k, v in self.readable_attributes.items())
        attr_str = " " + attr_str if attr_str else ""
        open_tag = f"<{self.nodeName}{attr_str}>"
        close_tag = f"</{self.nodeName}>"

        if not self.children:
            # Void elements are printed without a closing tag.
            return pad + open_tag + (
                close_tag if self.nodeName not in VOID_ELEMENTS else ""
            )

        # special case for elements with only one text child -> one-line element
        if len(self.children) == 1 and self.children[0].nodeName == "#text":
            return pad + open_tag + self.children[0].__repr__() + close_tag

        children_repr = "\n".join(child.__repr__(indent + 2) for child in self.children)
        return pad + open_tag + "\n" + children_repr + "\n" + pad + close_tag

    def on_screen(self, screen_bounds):
        """Return whether this node overlaps the given rectangle.

        A node with children is considered on screen when any of its
        descendants' leaves is; its own bounds are not consulted.  A leaf must
        have a four-element ``bounds`` with non-zero area.

        Args:
            screen_bounds: ``[x, y, width, height]`` of the visible area, in
                the same coordinate system as ``self.bounds``.
        """
        if self.children:
            return any(child.on_screen(screen_bounds) for child in self.children)

        if (
            self.bounds is None
            or len(self.bounds) != 4
            or self.bounds[2] * self.bounds[3] == 0
        ):
            return False

        x, y, w, h = self.bounds
        # Same [x, y, width, height] layout as ``self.bounds``: x is the left
        # edge and y the top edge.  (Unpacking these in the opposite order
        # only happens to work when the rectangle starts at (0, 0).)
        win_left_bound, win_upper_bound, win_width, win_height = screen_bounds
        win_right_bound = win_left_bound + win_width
        win_lower_bound = win_upper_bound + win_height
        return (
            x < win_right_bound
            and x + w > win_left_bound
            and y < win_lower_bound
            and y + h > win_upper_bound
        )


class Globot:
    """Open a Chromium page with Playwright and list its interactive elements.

    Typical use is ``go_to_page(url)`` followed by ``crawl()``.  Call
    ``close()`` (or use the instance as a context manager) when done so the
    browser and the Playwright driver are shut down.
    """

    def __init__(self, headless=False):
        """Start Playwright, launch Chromium and open one blank page.

        Args:
            headless: Passed through to ``chromium.launch``.
        """
        # Keep the driver handle: without it the driver can never be stopped.
        self.playwright = sync_playwright().start()
        self.browser = self.playwright.chromium.launch(headless=headless)
        self.context = self.browser.new_context()
        self.page = self.context.new_page()
        # CDP session; created by go_to_page().
        self.client = None

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        self.close()

    def close(self):
        """Close the browser and stop the Playwright driver."""
        try:
            self.browser.close()
        finally:
            # Stop the driver even if closing the browser failed.
            self.playwright.stop()

    def go_to_page(self, url):
        """Navigate to ``url`` and open a CDP session for the page.

        ``https://`` is prepended when ``url`` contains no ``://``.
        """
        self.page.goto(url=url if "://" in url else "https://" + url)
        self.client = self.page.context.new_cdp_session(self.page)
        self.page.wait_for_load_state("domcontentloaded")

    def crawl(self) -> tuple[dict[int, DOMNode], dict[int, DOMNode]]:
        """Snapshot the current page and collect its interactive nodes.

        Returns:
            ``(input_elements, clickable_elements)``: two dicts keyed by a
            running id that is shared between them (also stored on the node
            as ``llm_id``), mapping to the matching ``DOMNode``.  Ids are
            assigned in depth-first document order.

        Raises:
            RuntimeError: If ``go_to_page`` has not been called yet.
        """
        if self.client is None:
            raise RuntimeError("go_to_page() must be called before crawl()")

        dom = self.client.send(
            "DOMSnapshot.captureSnapshot",
            {"computedStyles": [], "includeDOMRects": True, "includePaintOrder": True},
        )

        dom_strings = dom["strings"]
        # Only the first document of the snapshot is processed.
        document = dom["documents"][0]
        dom_layout = document["layout"]
        dom_nodes = document["nodes"]

        screen_bounds = dom_layout["bounds"][0]
        # `window.devicePixelRatio` sometimes gives the wrong answer, so the
        # ratio is derived from the first layout box's width and
        # `window.screen.width` instead.
        device_pixel_ratio = screen_bounds[2] / self.page.evaluate(
            "window.screen.width"
        )

        def index_set(name):
            # Sparse boolean data: the node indexes for which the flag is set.
            return set(dom_nodes.get(name, {}).get("index", []))

        clickable_ids = index_set("isClickable")
        checked_ids = index_set("inputChecked")
        selected_ids = index_set("optionSelected")

        # Sparse string data: node index -> string-table index of the value.
        input_values = dom_nodes.get("inputValue", {})
        input_value_by_node = dict(
            zip(input_values.get("index", []), input_values.get("value", []))
        )

        # node index -> row in the layout arrays
        layout_row_by_node = {
            node_index: row for row, node_index in enumerate(dom_layout["nodeIndex"])
        }

        nodes = []
        for i in range(len(dom_nodes["parentIndex"])):
            node = DOMNode(i, dom_nodes, dom_strings)

            row = layout_row_by_node.get(i)
            if row is not None:
                bounds = [int(b / device_pixel_ratio) for b in dom_layout["bounds"][row]]
                node.bounds = bounds
                node.center = (
                    int(bounds[0] + bounds[2] / 2),
                    int(bounds[1] + bounds[3] / 2),
                )

            node.isClickable = i in clickable_ids
            node.inputChecked = i in checked_ids
            node.optionSelected = i in selected_ids

            if i in input_value_by_node:
                v = input_value_by_node[i]
                node.inputValue = dom_strings[v] if v >= 0 else ""

            nodes.append(node)

        input_elements = {}
        clickable_elements = {}
        if not nodes:
            # Empty snapshot: nothing to walk.
            return input_elements, clickable_elements
        root = nodes[0]

        # Switch node ids to node pointers
        for node in nodes:
            if node.parentId is not None:
                node.parent = nodes[node.parentId]
                node.parent.children.append(node)

        count = 0

        def find_interactive_elements(node):
            """Walk the subtree depth-first and record interactive nodes."""
            nonlocal count
            # A <button> counts as clickable even when the snapshot does not
            # flag it, and so does any eligible tag with an onclick attribute.
            clickable = (
                node.nodeName in CLICKABLE_ELEMENTS
                and node.center is not None
                and (
                    node.isClickable
                    or node.nodeName == "button"
                    or "onclick" in node.attributes
                )
            )
            inputable = node.nodeName in INPUT_ELEMENTS or node.inputValue is not None

            # Special case: select and option elements are recorded even when
            # they are not visible.
            select_or_option = node.nodeName in {"select", "option"}

            if select_or_option:
                interactive = True
            elif clickable or inputable:
                # on_screen() walks the node's whole subtree, so it is only
                # evaluated for nodes that are candidates in the first place.
                interactive = node.on_screen(
                    root.bounds
                ) and "visibility: hidden" not in node.attributes.get("style", "")
            else:
                interactive = False

            if interactive:
                if clickable:
                    clickable_elements[count] = node
                if inputable or select_or_option:
                    input_elements[count] = node
                node.llm_id = count
                count += 1

            for child in node.children:
                find_interactive_elements(child)

        find_interactive_elements(root)

        return input_elements, clickable_elements


# Open the Google sign-in page and collect its interactive elements.
bot = Globot()
bot.go_to_page(
    "https://accounts.google.com/v3/signin/identifier?authuser=0&continue=https%3A%2F%2Fwww.google.com%2F&ec=GAlAmgQ&hl=en&flowName=GlifWebSignIn&flowEntry=AddSession&dsh=S1040273122%3A1718390580872851&ddm=0"
)
inputs, clickables = bot.crawl()

# Build one <node> entry per element id.  The union of the two key views is a
# set, which has no guaranteed iteration order, so sort the ids to get a
# stable, ascending listing.
parts = []
for i in sorted(inputs.keys() | clickables.keys()):
    inputable = i in inputs
    clickable = i in clickables
    # An id present in both dicts refers to the same node in each.
    node = clickables[i] if clickable else inputs[i]

    parts.append(f"<node id={i} clickable={clickable} inputable={inputable}>\n")
    parts.append(node.__repr__(indent=2))
    parts.append("\n</node>\n")
html_description = "".join(parts)
pprint(html_description)

Output:

('<node id=0 clickable=False inputable=True>\n'
 '  <input type="email" aria-label="Email or phone" '
 'value="2D7AB92D588040EBA91955F62E1BEE47">\n'
 '</node>\n'
 '<node id=1 clickable=True inputable=False>\n'
 '  <button type="button">\n'
 '    <::before></::before>\n'
 '    Forgot email?\n'
 '  </button>\n'
 '</node>\n'
 '<node id=2 clickable=True inputable=False>\n'
 '  <a href="https://support.google.com/chrome/answer/6130773?hl=en">\n'
 '    <::before></::before>\n'
 '    Learn more about using Guest mode\n'
 '  </a>\n'
 '</node>\n'
 '<node id=3 clickable=True inputable=False>\n'
 '  <button type="button">\n'
 '    <div>\n'
 '      <::before></::before>\n'
 '      <::after></::after>\n'
 '    </div>\n'
 '    <div></div>\n'
 '    <div></div>\n'
 '    <span>Next</span>\n'
 '  </button>\n'
 '</node>\n'
 '<node id=4 clickable=True inputable=False>\n'
 '  <button type="button">\n'
 '    <div>\n'
 '      <::before></::before>\n'
 '      <::after></::after>\n'
 '    </div>\n'
 '    <div></div>\n'
 '    <div></div>\n'
 '    <span>Create account</span>\n'
 '  </button>\n'
 '</node>\n'
 '<node id=5 clickable=True inputable=False>\n'
 '  <a href="https://support.google.com/accounts?hl=en&p=account_iph">\n'
 '    <::before></::before>\n'
 '    Help\n'
 '  </a>\n'
 '</node>\n'
 '<node id=6 clickable=True inputable=False>\n'
 '  <a href="https://accounts.google.com/TOS?loc=GB&hl=en&privacy=true">\n'
 '    <::before></::before>\n'
 '    Privacy\n'
 '  </a>\n'
 '</node>\n'
 '<node id=7 clickable=True inputable=False>\n'
 '  <a href="https://accounts.google.com/TOS?loc=GB&hl=en">\n'
 '    <::before></::before>\n'
 '    Terms\n'
 '  </a>\n'
 '</node>\n')

As the output above shows, the Next button and the other clickable elements are now detected properly.