I'm trying to capture the "src" attribute of every script element on a page, but I never get back relative URLs like "/cdn/script.js" -- only full URLs like "site.com/cdn/script.js". How can I get the URLs exactly as they are written in the HTML?
def GetScriptArray():
    """Collect the ``src`` of every <script> element on the current page.

    Sources starting with "http" are appended to ``ScriptArray`` (defined
    outside this function). Protocol-relative ("//...") and root-relative
    ("/...") sources are printed with a "SPECIAL" prefix, and scripts with
    no (or an empty) ``src`` are reported by their 1-based position.

    Uses ``Driver`` and ``By`` from the enclosing scope.
    """
    script_elements = Driver.find_elements(By.TAG_NAME, 'script')
    for index, script in enumerate(script_elements, start=1):
        # get_attribute("src") returns the browser-resolved *absolute* URL,
        # so "/cdn/script.js" would always come back as "https://site/cdn/...".
        # get_dom_attribute() returns the value exactly as written in the
        # markup, which is what the "//" and "/" branches below need.
        src = script.get_dom_attribute("src")
        if not src:
            print("SCRIPT NUM " + str(index) + " HAS NO SRC")
            continue
        if src.startswith("http"):
            ScriptArray.append(src)
        elif src.startswith("//"):
            print("SPECIAL 1 : " + src)
        elif src.startswith("/"):
            print("SPECIAL 2 : " + src)
        # NOTE(review): other relative forms ("js/app.js", "./app.js") match
        # none of the branches and are silently skipped, as in the original.
The above script outputs the following (I'm testing on hugedomains.com/domain_profile.cfm?d=myecommercewebsite.com):
DevTools listening on ws://127.0.0.1:60068/devtools/browser/a7437c3c-2acf-484f-9ec8-92c7fb9acca4
SCRIPT NUM 4 HAS NO SRC
SCRIPT NUM 5 HAS NO SRC
SCRIPT NUM 6 HAS NO SRC
SCRIPT NUM 7 HAS NO SRC
SCRIPT NUM 8 HAS NO SRC
SCRIPT NUM 9 HAS NO SRC
SCRIPT NUM 16 HAS NO SRC
SCRIPT NUM 17 HAS NO SRC
SCRIPT NUM 18 HAS NO SRC
[array containing no relative URLs (I can't share it because you can't post https:// links)]
It doesn't come up with any URLs like "/cdn/script.js", only full URLs.
My assumption is that you aren't letting the page fully load. I changed the approach: I changed the <script> locator to script[src] so that it only pulls tags that have a src attribute, and I added a wait. It's working fine for me.
from selenium import webdriver
from selenium.webdriver.support.wait import WebDriverWait
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
#url = f'file:///C:/Users/jbcro/Desktop/sample.html'
url = 'https://www.hugedomains.com/domain_profile.cfm?d=myecommercewebsite.com'


def _print_script_sources(driver, label):
    """Print *label*, then the ``src`` of each <script src=...> in the current frame."""
    tags = driver.find_elements(By.CSS_SELECTOR, "script[src]")
    print(label)
    for tag in tags:
        # get_attribute("src") reports the browser-resolved absolute URL;
        # get_dom_attribute("src") would give the raw value from the markup.
        print(tag.get_attribute("src"))


driver = webdriver.Chrome()
try:
    driver.maximize_window()
    driver.get(url)

    # Wait for the page header to be visible so late-injected scripts exist
    # before we query for them.
    wait = WebDriverWait(driver, 10)
    wait.until(EC.visibility_of_element_located((By.ID, "header")))

    # Scripts in the top-level document.
    _print_script_sources(driver, "DEFAULT")
    print("")

    # Scripts inside each iframe: switch in, list, then switch back out so the
    # next iframe reference is resolved against the top-level document again.
    iframes = driver.find_elements(By.CSS_SELECTOR, "iframe")
    for iframe in iframes:
        driver.switch_to.frame(iframe)
        _print_script_sources(driver, "IFRAME")
        driver.switch_to.default_content()
        print("")
finally:
    # Always close the browser, even if the wait times out or a lookup fails.
    driver.quit()
It prints
DEFAULT
https://www.gstatic.com/recaptcha/releases/pPK749sccDmVW_9DSeTMVvh2/recaptcha__en.js
https://cdn-cookieyes.com/client_data/e71bc53f1cb88666d160c1e2/script.js
https://cdn-cookieyes.com/client_data/e71bc53f1cb88666d160c1e2/banner.js
https://www.google.com/recaptcha/enterprise.js?render=6LdRB9UiAAAAABaf3jRLyU_gwaGIp-3OvR51myRx
https://static.hugedomains.com/js/hdv3-js/jquery.min.js
https://static.hugedomains.com/js/hdv3-js/script.js?aa=2022-10-32
https://static.hugedomains.com/js/hdv3-js/common.js
https://static.hugedomains.com/js/hdv3-js/hd-js.js?a=20220124b
https://www.hugedomains.com/rjs/hdv3-rjs/hd-js.cfm?aa=2022-10-32
IFRAME
https://www.hugedomains.com/cdn-cgi/challenge-platform/scripts/jsd/main.js
IFRAME
https://www.gstatic.com/recaptcha/releases/pPK749sccDmVW_9DSeTMVvh2/recaptcha__en.js
https://www.google.com/js/bg/1WJ41Y3FKQ963wNVVah2aO2i1At8ivplN5CN6DwMHdo.js
IFRAME