I have a code snippet that extracts the inputable and clickable nodes (i.e. the interactive elements) from the DOM tree of a web page via Playwright in Python.
The code works in most cases, but it misses some elements, such as the buttons on Google's sign-in page. For example, the Next button there is reported as not clickable. Can someone identify the issue with this code?
Here's the code:
from playwright.sync_api import sync_playwright
# Tags that DOMNode.__repr__ renders without a closing tag when they have
# no children.
VOID_ELEMENTS = {
    "area", "base", "br", "col", "embed", "hr", "img",
    "input", "link", "meta", "param", "source", "track", "wbr",
}

# Attribute names that DOMNode keeps in `readable_attributes` and therefore
# shows in its textual rendering.
READABLE_ATTRIBUTES = {
    "title", "alt", "href", "placeholder", "label",
    "value", "caption", "summary",
    "aria-label", "aria-describedby",
    "datetime", "download", "selected", "checked", "type",
}

# NOTE(review): not referenced anywhere in this listing.
UNCLICKABLE_ELEMENTS = {"html", "head", "body"}

# Tag names that crawl() is willing to report as clickable.
CLICKABLE_ELEMENTS = {"a", "button", "img", "details", "summary"}

# Tag names that crawl() always treats as inputs.
INPUT_ELEMENTS = {"input", "textarea", "select", "option"}
class DOMNode:
    """A single node decoded from a ``DOMSnapshot.captureSnapshot`` result.

    The constructor only decodes the per-node columns of the snapshot.
    ``parent``/``children`` and the layout and form-state fields
    (``bounds``, ``center``, ``inputValue``, ``inputChecked``,
    ``isClickable``, ``optionSelected``) start out empty and are expected
    to be filled in by whoever builds the tree.
    """

    def __init__(self, i, nodes, strings):
        """Decode node ``i``.

        Args:
            i: Position of the node in the snapshot's parallel node arrays.
            nodes: The document's ``nodes`` table (field name -> array).
            strings: The snapshot's shared string table.
        """
        self._on_screen = None
        self.parent = None
        self.children = []
        self.llm_id = None

        # Only some nodes have these; they default to None so that
        # "not reported" can be told apart from False.
        self.bounds = None
        self.center = None
        self.inputValue = None
        self.inputChecked = None
        self.isClickable = None
        self.optionSelected = None

        parent_index = nodes["parentIndex"][i]
        self.parentId = parent_index if parent_index >= 0 else None

        # The snapshot stores nodeType as the numeric DOM node type itself,
        # not as an index into the string table, so it is used directly.
        self.nodeType = nodes["nodeType"][i]
        self.nodeName = strings[nodes["nodeName"][i]].lower()

        value_index = nodes["nodeValue"][i]
        self.nodeValue = strings[value_index].strip() if value_index >= 0 else None

        self.backendNodeId = nodes["backendNodeId"][i]

        # Attributes arrive as a flat [name, value, name, value, ...] list
        # of string-table indices.
        attrs = nodes["attributes"][i]
        self.attributes = {
            strings[name]: strings[value][:100]  # cut off long URLs
            for name, value in zip(attrs[::2], attrs[1::2])
        }
        self.readable_attributes = {
            k: v for k, v in self.attributes.items() if k in READABLE_ATTRIBUTES
        }

    def __repr__(self, indent=0) -> str:
        """Render the subtree as pseudo-HTML, indented by ``indent`` spaces.

        Only the attributes in ``readable_attributes`` are printed.
        """
        pad = " " * indent
        if self.nodeName == "#text":
            return pad + (self.nodeValue or "")

        attr_str = " ".join(f'{k}="{v}"' for k, v in self.readable_attributes.items())
        if attr_str:
            attr_str = " " + attr_str
        open_tag = f"<{self.nodeName}{attr_str}>"
        close_tag = f"</{self.nodeName}>"

        if not self.children:
            return pad + open_tag + ("" if self.nodeName in VOID_ELEMENTS else close_tag)

        # Special case: an element with a single text child fits on one line.
        if len(self.children) == 1 and self.children[0].nodeName == "#text":
            return pad + open_tag + self.children[0].__repr__() + close_tag

        children_repr = "\n".join(child.__repr__(indent + 2) for child in self.children)
        return f"{pad}{open_tag}\n{children_repr}\n{pad}{close_tag}"

    def on_screen(self, screen_bounds):
        """Return True if this node, or any descendant, overlaps the viewport.

        Args:
            screen_bounds: Viewport rectangle as ``[x, y, width, height]``,
                in the same layout as ``self.bounds``.

        A node with children is judged solely by its children; a leaf needs
        a four-element ``bounds`` with non-zero area.
        """
        if self.children:
            return any(child.on_screen(screen_bounds) for child in self.children)

        if (
            self.bounds is None
            or len(self.bounds) != 4
            or self.bounds[2] * self.bounds[3] == 0
        ):
            return False

        x, y, w, h = self.bounds
        # Unpack in the same [x, y, w, h] order as self.bounds. The previous
        # (top, left, w, h) order swapped the axes, which only went unnoticed
        # because the viewport origin is normally (0, 0).
        win_left, win_top, win_width, win_height = screen_bounds
        win_right = win_left + win_width
        win_bottom = win_top + win_height
        return (
            x < win_right
            and x + w > win_left
            and y < win_bottom
            and y + h > win_top
        )
class Globot:
    """Drives a Chromium page through Playwright and lists its interactive nodes."""

    def __init__(self, headless=False):
        """Start Playwright, launch Chromium and open a page in a new context."""
        pw = sync_playwright().start()
        self.browser = pw.chromium.launch(headless=headless)
        self.context = self.browser.new_context()
        self.page = self.context.new_page()

    def go_to_page(self, url):
        """Navigate to ``url`` and attach a CDP session to the page.

        ``https://`` is prepended when ``url`` carries no scheme.
        """
        target = url if "://" in url else "https://" + url
        self.page.goto(url=target)
        self.client = self.page.context.new_cdp_session(self.page)
        self.page.wait_for_load_state("domcontentloaded")

    def crawl(self) -> tuple[dict[int, DOMNode], dict[int, DOMNode]]:
        """Snapshot the DOM and collect its inputable and clickable nodes.

        Returns:
            ``(input_elements, clickable_elements)``. Both map a running id
            (also stored on the node as ``llm_id``) to the node.

        NOTE(review): a node is only reported as clickable when the
        snapshot lists it under ``isClickable``; a ``<button>`` without that
        flag is therefore skipped.
        """
        snapshot = self.client.send(
            "DOMSnapshot.captureSnapshot",
            {"computedStyles": [], "includeDOMRects": True, "includePaintOrder": True},
        )
        strings = snapshot["strings"]
        doc = snapshot["documents"][0]
        layout = doc["layout"]
        table = doc["nodes"]

        screen_bounds = layout["bounds"][0]
        # For some reason `window.devicePixelRatio` gives the wrong answer
        # sometimes, so the ratio is derived from the first layout box instead.
        device_pixel_ratio = screen_bounds[2] / self.page.evaluate(
            "window.screen.width"
        )

        # Reverse maps (node index -> position); scanning the lists per node
        # takes much longer.
        layout_pos_by_node = {
            node_idx: pos for pos, node_idx in enumerate(layout["nodeIndex"])
        }
        value_pos_by_node = {
            node_idx: pos
            for pos, node_idx in enumerate(table["inputValue"]["index"])
        }

        nodes = []
        for idx in range(len(table["parentIndex"])):
            current = DOMNode(idx, table, strings)

            if idx in layout_pos_by_node:
                raw = layout["bounds"][layout_pos_by_node[idx]]
                scaled = [int(b / device_pixel_ratio) for b in raw]
                current.bounds = scaled
                current.center = (
                    int(scaled[0] + scaled[2] / 2),
                    int(scaled[1] + scaled[3] / 2),
                )

            if idx in table["isClickable"]["index"]:
                current.isClickable = True

            if idx in value_pos_by_node:
                string_idx = table["inputValue"]["value"][value_pos_by_node[idx]]
                current.inputValue = strings[string_idx] if string_idx >= 0 else ""

            if idx in table["inputChecked"]["index"]:
                current.inputChecked = True
            if idx in table["optionSelected"]["index"]:
                current.optionSelected = True

            nodes.append(current)

        root = nodes[0] if nodes else None

        # Switch parent ids to parent/child pointers.
        for child in nodes:
            if child.parentId is None:
                continue
            child.parent = nodes[child.parentId]
            child.parent.children.append(child)

        count = 0
        input_elements = {}
        clickable_elements = {}

        def visit(node):
            """Classify ``node``, record it if interactive, then recurse."""
            nonlocal count
            tag = node.nodeName

            # Kept as an and-chain: the result may be None (flag not
            # reported), and the debug output below prints it verbatim.
            clickable = (
                tag in CLICKABLE_ELEMENTS
                and node.isClickable
                and node.center is not None
            )
            inputable = node.inputValue is not None or tag in INPUT_ELEMENTS
            # select/option are recorded even when they are not visible.
            select_or_option = tag in ("select", "option")
            visible = node.on_screen(
                root.bounds
            ) and "visibility: hidden" not in node.attributes.get("style", "")

            # Debug output for every <button>.
            if tag == "button":
                print(f"Node: {node.nodeName}")
                print(f" Attributes: {node.attributes}")
                print(f" Bounds: {node.bounds}")
                print(f" Clickable: {clickable}")
                print(f" Inputable: {inputable}")
                print(f" Visible: {visible}")
                print(f" Center: {node.center}")

            if select_or_option or (visible and (clickable or inputable)):
                if clickable:
                    clickable_elements[count] = node
                if inputable or select_or_option:
                    input_elements[count] = node
                node.llm_id = count
                count += 1

            for descendant in node.children:
                visit(descendant)

        visit(root)
        return input_elements, clickable_elements
Code snippet for reproducing the issue (here the Next button is not recognized as clickable):
from pprint import pprint
# Reproduction script: crawl the Google sign-in page and print every
# interactive node that was found.
bot = Globot()
try:
    bot.go_to_page(
        "https://accounts.google.com/v3/signin/identifier?authuser=0&continue=https%3A%2F%2Fwww.google.com%2F&ec=GAlAmgQ&hl=en&flowName=GlifWebSignIn&flowEntry=AddSession&dsh=S1040273122%3A1718390580872851&ddm=0"
    )
    inputs, clickables = bot.crawl()
finally:
    # Do not leave the browser running once the snapshot has been taken.
    bot.browser.close()

parts = []
# A set union has no guaranteed order; sort so nodes are listed by id.
for i in sorted(inputs.keys() | clickables.keys()):
    inputable = i in inputs
    clickable = i in clickables
    # Same precedence as before: the clickable entry wins when both exist.
    node = clickables[i] if clickable else inputs[i]
    parts.append(f"<node id={i} clickable={clickable} inputable={inputable}>\n")
    parts.append(node.__repr__(indent=2))
    parts.append("\n</node>\n")

html_description = "".join(parts)
pprint(html_description)
Here's the part of the log for the Next element. As you can see, `Clickable` is `None`:
Node: button
Attributes: {'class': 'VfPpkd-LgbsSe VfPpkd-LgbsSe-OWXEXe-k8QpJ VfPpkd-LgbsSe-OWXEXe-dgl2Hf nCP5yc AjY5Oe DuMIQc LQeN7 BqKG', 'jscontroller': 'soHxf', 'jsaction': 'click:cOuCgd; mousedown:UX7yZ; mouseup:lbsD7e; mouseenter:tfO1Yc; mouseleave:JywGue; touchstart:p6p2', 'data-idom-class': 'nCP5yc AjY5Oe DuMIQc LQeN7 BqKGqe Jskylb TrZEUc lw1w4b', 'jsname': 'LgbsSe', 'type': 'button'}
Bounds: [965, 453, 78, 40]
Clickable: None
Inputable: False
Visible: True
Center: (1004, 473)
Here's the RAW HTML of the Next button:
<button class="VfPpkd-LgbsSe VfPpkd-LgbsSe-OWXEXe-k8QpJ VfPpkd-LgbsSe-OWXEXe-dgl2Hf nCP5yc AjY5Oe DuMIQc LQeN7 BqKGqe Jskylb TrZEUc lw1w4b" jscontroller="soHxf" jsaction="click:cOuCgd; mousedown:UX7yZ; mouseup:lbsD7e; mouseenter:tfO1Yc; mouseleave:JywGue; touchstart:p6p2H; touchmove:FwuNnf; touchend:yfqBxc; touchcancel:JMtRjd; focus:AHmuwe; blur:O22p3e; contextmenu:mg9Pef;mlnRJb:fLiPzd;" data-idom-class="nCP5yc AjY5Oe DuMIQc LQeN7 BqKGqe Jskylb TrZEUc lw1w4b" jsname="LgbsSe" type="button"><div class="VfPpkd-Jh9lGc"></div><div class="VfPpkd-J1Ukfc-LhBDec"></div><div class="VfPpkd-RLmnJb"></div><span jsname="V67aGc" class="VfPpkd-vQzf8d">Next</span></button>
Here's the screenshot of the respective page:
I apologize if the code is too long and I appreciate any help in advance.
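I ended up with the following code. It uses sets for quick index lookups and adds a new condition for deciding whether a node is clickable: a node whose tag is in `CLICKABLE_ELEMENTS` now counts as clickable if the snapshot flags it as clickable, if it is a `<button>`, or if it has an `onclick` attribute. (The Next button was missed before because the snapshot did not report `isClickable` for it, presumably because Google dispatches clicks through the `jsaction` attribute rather than through a listener attached to the button itself.)
Here's the refined version: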
from playwright.sync_api import sync_playwright
from pprint import pprint
# Tags that DOMNode.__repr__ renders without a closing tag when they have
# no children.
VOID_ELEMENTS = {
    "area", "base", "br", "col", "embed", "hr", "img",
    "input", "link", "meta", "param", "source", "track", "wbr",
}

# Attribute names that DOMNode keeps in `readable_attributes` and therefore
# shows in its textual rendering.
READABLE_ATTRIBUTES = {
    "title", "alt", "href", "placeholder", "label",
    "value", "caption", "summary",
    "aria-label", "aria-describedby",
    "datetime", "download", "selected", "checked", "type",
}

# NOTE(review): not referenced anywhere in this listing.
UNCLICKABLE_ELEMENTS = {"html", "head", "body"}

# Tag names that crawl() is willing to report as clickable.
CLICKABLE_ELEMENTS = {"a", "button", "img", "details", "summary", "ul", "li"}

# Tag names that crawl() always treats as inputs.
INPUT_ELEMENTS = {"input", "textarea", "select", "option"}
class DOMNode:
    """A single node decoded from a ``DOMSnapshot.captureSnapshot`` result.

    The constructor only decodes the per-node columns of the snapshot.
    ``parent``/``children`` and the layout and form-state fields
    (``bounds``, ``center``, ``inputValue``, ``inputChecked``,
    ``isClickable``, ``optionSelected``) start out empty and are expected
    to be filled in by whoever builds the tree.
    """

    def __init__(self, i, nodes, strings):
        """Decode node ``i``.

        Args:
            i: Position of the node in the snapshot's parallel node arrays.
            nodes: The document's ``nodes`` table (field name -> array).
            strings: The snapshot's shared string table.
        """
        self._on_screen = None
        self.parent = None
        self.children = []
        self.llm_id = None

        # Only some nodes have these; they default to None so that
        # "not reported" can be told apart from False.
        self.bounds = None
        self.center = None
        self.inputValue = None
        self.inputChecked = None
        self.isClickable = None
        self.optionSelected = None

        parent_index = nodes["parentIndex"][i]
        self.parentId = parent_index if parent_index >= 0 else None

        # The snapshot stores nodeType as the numeric DOM node type itself,
        # not as an index into the string table, so it is used directly.
        self.nodeType = nodes["nodeType"][i]
        self.nodeName = strings[nodes["nodeName"][i]].lower()

        value_index = nodes["nodeValue"][i]
        self.nodeValue = strings[value_index].strip() if value_index >= 0 else None

        self.backendNodeId = nodes["backendNodeId"][i]

        # Attributes arrive as a flat [name, value, name, value, ...] list
        # of string-table indices.
        attrs = nodes["attributes"][i]
        self.attributes = {
            strings[name]: strings[value][:100]  # cut off long URLs
            for name, value in zip(attrs[::2], attrs[1::2])
        }
        self.readable_attributes = {
            k: v for k, v in self.attributes.items() if k in READABLE_ATTRIBUTES
        }

    def __repr__(self, indent=0) -> str:
        """Render the subtree as pseudo-HTML, indented by ``indent`` spaces.

        Only the attributes in ``readable_attributes`` are printed.
        """
        pad = " " * indent
        if self.nodeName == "#text":
            return pad + (self.nodeValue or "")

        attr_str = " ".join(f'{k}="{v}"' for k, v in self.readable_attributes.items())
        if attr_str:
            attr_str = " " + attr_str
        open_tag = f"<{self.nodeName}{attr_str}>"
        close_tag = f"</{self.nodeName}>"

        if not self.children:
            return pad + open_tag + ("" if self.nodeName in VOID_ELEMENTS else close_tag)

        # Special case: an element with a single text child fits on one line.
        if len(self.children) == 1 and self.children[0].nodeName == "#text":
            return pad + open_tag + self.children[0].__repr__() + close_tag

        children_repr = "\n".join(child.__repr__(indent + 2) for child in self.children)
        return f"{pad}{open_tag}\n{children_repr}\n{pad}{close_tag}"

    def on_screen(self, screen_bounds):
        """Return True if this node, or any descendant, overlaps the viewport.

        Args:
            screen_bounds: Viewport rectangle as ``[x, y, width, height]``,
                in the same layout as ``self.bounds``.

        A node with children is judged solely by its children; a leaf needs
        a four-element ``bounds`` with non-zero area.
        """
        if self.children:
            return any(child.on_screen(screen_bounds) for child in self.children)

        if (
            self.bounds is None
            or len(self.bounds) != 4
            or self.bounds[2] * self.bounds[3] == 0
        ):
            return False

        x, y, w, h = self.bounds
        # Unpack in the same [x, y, w, h] order as self.bounds. The previous
        # (top, left, w, h) order swapped the axes, which only went unnoticed
        # because the viewport origin is normally (0, 0).
        win_left, win_top, win_width, win_height = screen_bounds
        win_right = win_left + win_width
        win_bottom = win_top + win_height
        return (
            x < win_right
            and x + w > win_left
            and y < win_bottom
            and y + h > win_top
        )
class Globot:
    """Drive a Chromium page with Playwright and extract its interactive nodes.

    Typical use: ``go_to_page(url)``, then ``crawl()``, then ``close()``.
    """

    def __init__(self, headless=False):
        """Start Playwright, launch Chromium and open a page in a new context.

        Args:
            headless: Passed through to ``chromium.launch``.
        """
        # Keep the driver handle so close() can stop it; it used to be a
        # throwaway local, which made a clean shutdown impossible.
        self.playwright = sync_playwright().start()
        self.browser = self.playwright.chromium.launch(headless=headless)
        self.context = self.browser.new_context()
        self.page = self.context.new_page()
        self.client = None  # CDP session; created by go_to_page()

    def close(self):
        """Close the browser and stop the Playwright driver."""
        try:
            self.browser.close()
        finally:
            self.playwright.stop()

    def go_to_page(self, url):
        """Navigate to ``url`` and attach a CDP session to the page.

        ``https://`` is prepended when ``url`` carries no scheme.
        """
        self.page.goto(url=url if "://" in url else "https://" + url)
        self.client = self.page.context.new_cdp_session(self.page)
        self.page.wait_for_load_state("domcontentloaded")

    @staticmethod
    def _is_visible(node, viewport):
        """Return True if ``node`` overlaps ``viewport`` and is not hidden inline."""
        if not node.on_screen(viewport):
            return False
        # Normalise so `visibility:hidden`, `visibility: hidden` and
        # upper-case variants are all recognised.
        style = node.attributes.get("style", "").replace(" ", "").lower()
        return "visibility:hidden" not in style

    def crawl(self) -> tuple[dict[int, DOMNode], dict[int, DOMNode]]:
        """Snapshot the DOM and collect its inputable and clickable nodes.

        Returns:
            ``(input_elements, clickable_elements)``. Both map a running id
            (also stored on the node as ``llm_id``) to the node. Ids are
            assigned in document (pre-order) order.

        Raises:
            RuntimeError: If ``go_to_page`` has not been called yet.
        """
        if self.client is None:
            raise RuntimeError("go_to_page() must be called before crawl()")

        dom = self.client.send(
            "DOMSnapshot.captureSnapshot",
            {"computedStyles": [], "includeDOMRects": True, "includePaintOrder": True},
        )
        dom_strings = dom["strings"]
        document = dom["documents"][0]
        dom_layout = document["layout"]
        dom_nodes = document["nodes"]

        screen_bounds = dom_layout["bounds"][0]
        # For some reason `window.devicePixelRatio` gives the wrong answer
        # sometimes, so the ratio is derived from the first layout box.
        # NOTE(review): this assumes that box spans the full screen width;
        # confirm for windows narrower than the screen.
        device_pixel_ratio = screen_bounds[2] / self.page.evaluate(
            "window.screen.width"
        )

        # Sets and dicts give O(1) membership tests inside the node loop.
        is_clickable = set(dom_nodes.get("isClickable", {}).get("index", []))
        input_checked = set(dom_nodes.get("inputChecked", {}).get("index", []))
        option_selected = set(dom_nodes.get("optionSelected", {}).get("index", []))
        input_value = dict(
            zip(
                dom_nodes.get("inputValue", {}).get("index", []),
                dom_nodes.get("inputValue", {}).get("value", []),
            )
        )
        layout_pos = {v: k for k, v in enumerate(dom_layout["nodeIndex"])}

        nodes = []
        for i in range(len(dom_nodes["parentIndex"])):
            node = DOMNode(i, dom_nodes, dom_strings)

            pos = layout_pos.get(i)
            if pos is not None:
                bounds = [int(b / device_pixel_ratio) for b in dom_layout["bounds"][pos]]
                node.bounds = bounds
                node.center = (
                    int(bounds[0] + bounds[2] / 2),
                    int(bounds[1] + bounds[3] / 2),
                )

            node.isClickable = i in is_clickable
            node.inputChecked = i in input_checked
            node.optionSelected = i in option_selected

            value_index = input_value.get(i)
            if value_index is not None:
                node.inputValue = dom_strings[value_index] if value_index >= 0 else ""

            nodes.append(node)

        input_elements = {}
        clickable_elements = {}
        if not nodes:
            return input_elements, clickable_elements

        # Switch parent ids to parent/child pointers.
        for node in nodes:
            if node.parentId is not None:
                node.parent = nodes[node.parentId]
                node.parent.children.append(node)

        root = nodes[0]
        viewport = root.bounds
        if viewport is None:
            # The root has no layout box of its own; fall back to the first
            # layout box, scaled the same way as every node's bounds.
            viewport = [int(b / device_pixel_ratio) for b in screen_bounds]

        count = 0
        # Explicit stack instead of a recursive closure: same pre-order
        # traversal, no `nonlocal` counter.
        stack = [root]
        while stack:
            node = stack.pop()
            stack.extend(reversed(node.children))

            # The snapshot's isClickable flag alone is not enough: it was
            # not set for some <button> elements, so buttons and nodes with
            # an inline onclick handler are accepted as well.
            clickable = (
                node.nodeName in CLICKABLE_ELEMENTS
                and node.center is not None
                and (
                    node.isClickable
                    or node.nodeName == "button"
                    or "onclick" in node.attributes
                )
            )
            inputable = node.nodeName in INPUT_ELEMENTS or node.inputValue is not None
            # select/option are recorded even when they are not visible.
            select_or_option = node.nodeName in {"select", "option"}

            # The visibility test walks the node's subtree, so it is only
            # evaluated for nodes that are candidates in the first place.
            if not (
                select_or_option
                or ((clickable or inputable) and self._is_visible(node, viewport))
            ):
                continue

            if clickable:
                clickable_elements[count] = node
            if inputable or select_or_option:
                input_elements[count] = node
            node.llm_id = count
            count += 1

        return input_elements, clickable_elements
# Demo: crawl the Google sign-in page and print every interactive node
# that was found.
bot = Globot()
try:
    bot.go_to_page(
        "https://accounts.google.com/v3/signin/identifier?authuser=0&continue=https%3A%2F%2Fwww.google.com%2F&ec=GAlAmgQ&hl=en&flowName=GlifWebSignIn&flowEntry=AddSession&dsh=S1040273122%3A1718390580872851&ddm=0"
    )
    inputs, clickables = bot.crawl()
finally:
    # Do not leave the browser running once the snapshot has been taken.
    bot.browser.close()

parts = []
# A set union has no guaranteed order; sort so nodes are listed by id.
for i in sorted(inputs.keys() | clickables.keys()):
    inputable = i in inputs
    clickable = i in clickables
    # Same precedence as before: the clickable entry wins when both exist.
    node = clickables[i] if clickable else inputs[i]
    parts.append(f"<node id={i} clickable={clickable} inputable={inputable}>\n")
    parts.append(node.__repr__(indent=2))
    parts.append("\n</node>\n")

html_description = "".join(parts)
pprint(html_description)
Output:
('<node id=0 clickable=False inputable=True>\n'
' <input type="email" aria-label="Email or phone" '
'value="2D7AB92D588040EBA91955F62E1BEE47">\n'
'</node>\n'
'<node id=1 clickable=True inputable=False>\n'
' <button type="button">\n'
' <::before></::before>\n'
' Forgot email?\n'
' </button>\n'
'</node>\n'
'<node id=2 clickable=True inputable=False>\n'
' <a href="https://support.google.com/chrome/answer/6130773?hl=en">\n'
' <::before></::before>\n'
' Learn more about using Guest mode\n'
' </a>\n'
'</node>\n'
'<node id=3 clickable=True inputable=False>\n'
' <button type="button">\n'
' <div>\n'
' <::before></::before>\n'
' <::after></::after>\n'
' </div>\n'
' <div></div>\n'
' <div></div>\n'
' <span>Next</span>\n'
' </button>\n'
'</node>\n'
'<node id=4 clickable=True inputable=False>\n'
' <button type="button">\n'
' <div>\n'
' <::before></::before>\n'
' <::after></::after>\n'
' </div>\n'
' <div></div>\n'
' <div></div>\n'
' <span>Create account</span>\n'
' </button>\n'
'</node>\n'
'<node id=5 clickable=True inputable=False>\n'
' <a href="https://support.google.com/accounts?hl=en&p=account_iph">\n'
' <::before></::before>\n'
' Help\n'
' </a>\n'
'</node>\n'
'<node id=6 clickable=True inputable=False>\n'
' <a href="https://accounts.google.com/TOS?loc=GB&hl=en&privacy=true">\n'
' <::before></::before>\n'
' Privacy\n'
' </a>\n'
'</node>\n'
'<node id=7 clickable=True inputable=False>\n'
' <a href="https://accounts.google.com/TOS?loc=GB&hl=en">\n'
' <::before></::before>\n'
' Terms\n'
' </a>\n'
'</node>\n')
As the output above shows, the Next button and the other clickable elements are now detected correctly.