python selenium-webdriver selenium-chromedriver

Selenium only captures full urls for script src


I'm trying to capture the "src" attribute of every script element on a page, but Selenium never gives me back relative URLs like "/cdn/script.js" — only full URLs like "site.com/cdn/script.js". How can I get the relative URLs as they are written in the HTML?

def GetScriptArray():
    """Collect the script URLs of the current page into ``ScriptArray``.

    Relies on the module-level ``Driver`` (an active WebDriver session), ``By``
    and ``ScriptArray`` (a list that is appended to in place). Returns nothing.

    ``get_attribute("src")`` reports the URL after the browser has resolved it
    against the page, so it never yields values such as ``/cdn/script.js``.
    The value exactly as written in the markup is read with
    ``get_dom_attribute("src")`` and used to classify the script; the resolved
    URL is still what gets stored, so ``ScriptArray`` keeps holding full URLs.
    """
    for x, Script in enumerate(Driver.find_elements(By.TAG_NAME, 'script'), start=1):
        # Raw attribute value as declared in the HTML (None when absent).
        RawSource = Script.get_dom_attribute("src")
        if not RawSource:
            print("SCRIPT NUM " + str(x) + " HAS NO SRC")
            continue

        # Check "//" before "/" because protocol-relative URLs also start with "/".
        if RawSource.startswith("//"):
            print("SPECIAL 1 : " + RawSource)
        elif RawSource.startswith("/"):
            print("SPECIAL 2 : " + RawSource)

        # Browser-resolved absolute URL; only http(s) sources are collected.
        ResolvedSource = Script.get_attribute("src")
        if ResolvedSource and ResolvedSource.startswith("http"):
            ScriptArray.append(ResolvedSource)

The above script outputs the below (I'm testing on hugedomains.com/domain_profile.cfm?d=myecommercewebsite.com):

DevTools listening on ws://127.0.0.1:60068/devtools/browser/a7437c3c-2acf-484f-9ec8-92c7fb9acca4
SCRIPT NUM 4 HAS NO SRC
SCRIPT NUM 5 HAS NO SRC
SCRIPT NUM 6 HAS NO SRC
SCRIPT NUM 7 HAS NO SRC
SCRIPT NUM 8 HAS NO SRC
SCRIPT NUM 9 HAS NO SRC
SCRIPT NUM 16 HAS NO SRC
SCRIPT NUM 17 HAS NO SRC
SCRIPT NUM 18 HAS NO SRC

[array containing only full URLs, none of them relative (I can't share it because I can't post https:// links)]

It never comes up with URLs like "/cdn/script.js", only full URLs.


Solution

  • My assumption is that you aren't letting the page fully load. I changed the approach, changed the <script> locator to script[src] to only pull tags that have an src attribute, added a wait, and it's working fine for me.

    from selenium import webdriver
    from selenium.webdriver.support.wait import WebDriverWait
    from selenium.webdriver.common.by import By
    from selenium.webdriver.support import expected_conditions as EC
    
    # Page whose <script src> values are listed, first for the top-level
    # document and then for each iframe on the page.
    url = 'https://www.hugedomains.com/domain_profile.cfm?d=myecommercewebsite.com'
    driver = webdriver.Chrome()
    try:
        driver.maximize_window()
        driver.get(url)

        # Block (up to 10 s) until the header is visible, so the page has had
        # time to render before the script tags are collected.
        wait = WebDriverWait(driver, 10)
        wait.until(EC.visibility_of_element_located((By.ID, "header")))

        # "script[src]" matches only <script> tags that carry a src attribute,
        # so inline scripts are skipped by the locator itself.
        print("DEFAULT")
        for tag in driver.find_elements(By.CSS_SELECTOR, "script[src]"):
            print(tag.get_attribute("src"))
        print("")

        # Elements inside an iframe are only reachable after switching into it.
        for iframe in driver.find_elements(By.CSS_SELECTOR, "iframe"):
            driver.switch_to.frame(iframe)
            try:
                print("IFRAME")
                for tag in driver.find_elements(By.CSS_SELECTOR, "script[src]"):
                    print(tag.get_attribute("src"))
            finally:
                # Always return to the top-level document, even if a lookup fails,
                # so the next iframe is switched to from the right context.
                driver.switch_to.default_content()
            print("")
    finally:
        # Release the browser and driver process even when the wait times out.
        driver.quit()
    

    It prints

    DEFAULT
    https://www.gstatic.com/recaptcha/releases/pPK749sccDmVW_9DSeTMVvh2/recaptcha__en.js
    https://cdn-cookieyes.com/client_data/e71bc53f1cb88666d160c1e2/script.js
    https://cdn-cookieyes.com/client_data/e71bc53f1cb88666d160c1e2/banner.js
    https://www.google.com/recaptcha/enterprise.js?render=6LdRB9UiAAAAABaf3jRLyU_gwaGIp-3OvR51myRx
    https://static.hugedomains.com/js/hdv3-js/jquery.min.js
    https://static.hugedomains.com/js/hdv3-js/script.js?aa=2022-10-32
    https://static.hugedomains.com/js/hdv3-js/common.js
    https://static.hugedomains.com/js/hdv3-js/hd-js.js?a=20220124b  
    https://www.hugedomains.com/rjs/hdv3-rjs/hd-js.cfm?aa=2022-10-32
    
    IFRAME
    https://www.hugedomains.com/cdn-cgi/challenge-platform/scripts/jsd/main.js
    
    IFRAME
    https://www.gstatic.com/recaptcha/releases/pPK749sccDmVW_9DSeTMVvh2/recaptcha__en.js
    https://www.google.com/js/bg/1WJ41Y3FKQ963wNVVah2aO2i1At8ivplN5CN6DwMHdo.js
    
    IFRAME