Tags: python, selenium-webdriver, beautifulsoup, biopython

Extracting NCBI RefSeq and Submitted GenBank assembly accession numbers using Selenium and BeautifulSoup


Following the earlier post titled "Difficulty Extracting GenBank Accession Number Using Species and Strain Name, using webscraping (Using BeautifulSoup or Selenium)", I'm attempting to extract the NCBI RefSeq and Submitted GenBank assembly accession numbers from a webpage using Selenium and BeautifulSoup in Python. However, I'm encountering an issue: the previous code doesn't work for genomes with a single assembly, because the search opens a different page in that case.

To address this, I've tried a different approach:

This is the code:

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException, NoSuchElementException

# Define the search term
search_term = "Streptomyces anthocyanicus NBC 01687"

# Labels that precede the accession numbers on the assembly page
REFSEQ_LABEL = "NCBI RefSeq assembly"
GENBANK_LABEL = "Submitted GenBank assembly"

# Defaults, so the text-parsing step below still runs (and simply finds
# nothing) when the page content never becomes visible.
assembly_info = ""
genbank_assembly = None
refseq_assembly = None

# Open a Chrome browser
driver = webdriver.Chrome()

try:
    # Construct the search URL for assembly and navigate to it
    search_url = f"https://www.ncbi.nlm.nih.gov/assembly/?term={search_term.replace(' ', '+')}"
    driver.get(search_url)

    try:
        # Wait for the main content to be visible
        main_content = WebDriverWait(driver, 10).until(
            EC.visibility_of_element_located((By.ID, "maincontent"))
        )
    except TimeoutException:
        print("Elements not found or timed out waiting for them to appear.")
    else:
        # Keep the rendered text; it is parsed after the browser is closed
        assembly_info = main_content.text

        # Extract GenBank and RefSeq assembly IDs if the assembly widget is present
        try:
            assembly_table = driver.find_element(By.CLASS_NAME, "assembly-widget")
            for row in assembly_table.find_elements(By.TAG_NAME, "tr"):
                cells = row.find_elements(By.TAG_NAME, "td")
                if len(cells) != 3:
                    continue
                label = cells[1].text.strip()
                assembly_id = cells[2].text.strip()
                if label in (REFSEQ_LABEL, GENBANK_LABEL):
                    print(f"{label}:", assembly_id)
        except NoSuchElementException:
            print("Assembly information widget not found.")
finally:
    # Close the browser even if something unexpected was raised above
    driver.quit()

# Each accession number is on the line right after its label. Pairing every
# line with its successor means a label on the very last line is skipped
# instead of indexing past the end of the list.
lines = assembly_info.split("\n")
for label_line, value_line in zip(lines, lines[1:]):
    if REFSEQ_LABEL in label_line:
        refseq_assembly = value_line.strip()
    elif GENBANK_LABEL in label_line:
        genbank_assembly = value_line.strip()

# Print the assembly IDs if found
if refseq_assembly:
    print("NCBI RefSeq assembly:", refseq_assembly)
if genbank_assembly:
    print("Submitted GenBank assembly:", genbank_assembly)

The output is:

Assembly information widget not found.
NCBI RefSeq assembly: GCF_036226945.1
Submitted GenBank assembly: GCA_036226945.1

However, this code reads the text of the entire page, which I then process to extract the RefSeq and GenBank accession numbers.

But I think this is not a good way to do it; there must be a more correct way to achieve this.

Another way I found is:

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.common.exceptions import NoSuchElementException

# Define the search term
search_term = "Streptomyces anthocyanicus NBC 01687"

# Label text that is looked up on the result page
label_text = "NCBI RefSeq assembly"

# Open a Chrome browser
driver = webdriver.Chrome()

try:
    # Let element lookups poll for up to 10 seconds, so content that is
    # rendered after the initial page load is not missed.
    driver.implicitly_wait(10)

    # Construct the search URL for assembly
    search_url = f"https://www.ncbi.nlm.nih.gov/assembly/?term={search_term.replace(' ', '+')}"

    # Navigate to the search URL
    driver.get(search_url)

    # Find elements whose own text contains the label
    elements = driver.find_elements(By.XPATH, f"//*[contains(text(), '{label_text}')]")

    if elements:
        # Report the text that was actually searched for (the label)
        print(f"Text '{label_text}' found on the webpage.")
        for element in elements:
            # ".." selects the parent of the matched element; use
            # "following-sibling::*[1]" for the next sibling or "../.." for
            # the grandparent instead.
            parent_element = element.find_element(By.XPATH, "..")
            print("Parent element:")
            print(parent_element.text)
    else:
        print(f"Text '{label_text}' not found on the webpage.")

except Exception as e:
    # Top-level boundary of the script: report the failure and fall through
    # to the cleanup below.
    print("An error occurred:", e)

finally:
    # Quit the browser
    driver.quit()

But I want to do this the same way as in my previous post ("Difficulty Extracting GenBank Accession Number Using Species and Strain Name, using webscraping (Using BeautifulSoup or Selenium)") for this page as well, so that I can collect all the information in one script. Could someone please suggest proper code to achieve this? Kindly help.

Thank you in advance!


Solution

  • I took your code, simplified it, and came up with this working code.

    from selenium import webdriver
    from selenium.webdriver.common.by import By
    from selenium.webdriver.support.ui import WebDriverWait
    from selenium.webdriver.support import expected_conditions as EC
    from selenium.common.exceptions import TimeoutException
    
    # Define the search term
    search_term = "Streptomyces anthocyanicus NBC 01687"


    def _accession_for_label(wait, label):
        """Return the text of the first visible <dd>/<span> after the <dt> named *label*.

        Returns None when no such element becomes visible before *wait* times out.
        """
        locator = (By.XPATH, f"//dt[text()='{label}']//following::dd/span")
        try:
            return wait.until(EC.visibility_of_element_located(locator)).text
        except TimeoutException:
            return None


    # Open a Chrome browser
    driver = webdriver.Chrome()

    try:
        # Construct the search URL for assembly
        search_url = f"https://www.ncbi.nlm.nih.gov/assembly/?term={search_term.replace(' ', '+')}"

        # Navigate to the search URL
        driver.get(search_url)

        wait = WebDriverWait(driver, 10)

        # Look the two labels up independently, so a page that lacks one of
        # them still yields the other.
        refseq_assembly = _accession_for_label(wait, "NCBI RefSeq assembly")
        genbank_assembly = _accession_for_label(wait, "Submitted GenBank assembly")
    finally:
        # Close the browser even if navigation or a lookup raised
        driver.quit()

    for label, accession in (
        ("NCBI RefSeq assembly", refseq_assembly),
        ("Submitted GenBank assembly", genbank_assembly),
    ):
        if accession is None:
            print(f"{label}: element not found or timed out waiting for it to appear.")
        else:
            print(f"{label}:", accession)
    

    and it prints

    NCBI RefSeq assembly: GCF_036226945.1      
    Submitted GenBank assembly: GCA_036226945.1