Tags: python, html, web-scraping, playwright, playwright-python

I am having a problem scraping a website: my script extracts data up to aria-rowindex 29, while I need to extract up to aria-rowindex 2509


This is my code; you can see I am using Playwright and selectolax to scrape the website. Whenever I execute the script, it extracts data from the table on the website up to aria-rowindex 29 and then stops successfully, showing no error, but I want it to keep going up to aria-rowindex 2509.

from playwright.sync_api import sync_playwright
from selectolax.parser import HTMLParser
import time
import pandas as pd


def extract_full_body_html(url):
    TIMEOUT = 30000  # Reduced timeout to prevent long waits

    with sync_playwright() as p:
        browser = p.chromium.launch(headless=True)
        page = browser.new_page()

        # Maximize the window
        page.set_viewport_size({'width': 1920, 'height': 1080})

        page.goto(url, wait_until='networkidle')

        # Wait for the initial dynamic content to load
        page.wait_for_selector('div[role="gridcell"]', timeout=TIMEOUT)  # Adjusted selector

        # Scroll down and periodically check for new content
        def load_more_content():
            last_row_index = 0
            while True:
                page.evaluate("window.scrollTo(0, document.body.scrollHeight)")
                time.sleep(10)  # Wait for the page to load more content

                # Check for new elements based on the aria-rowindex attribute
                new_last_row_index = int(page.evaluate('''() => {
                    const rows = document.querySelectorAll('div[role="gridcell"][aria-rowindex]');
                    return rows[rows.length - 1].getAttribute("aria-rowindex");
                }'''))

                if new_last_row_index <= last_row_index:
                    break  # No new data loaded, stop the process
                last_row_index = new_last_row_index

                # Small delay to ensure all data is loaded for the new rows
                time.sleep(2)

        load_more_content()

        return page.inner_html('body')

def extraction(html):
    tree = HTMLParser(html)
    data = []

    # Adjust the range if you expect more or fewer rows
    for i in range(1, 2510):  # Extract data up to aria row index 2509
        row_selector = f'div[role="gridcell"][aria-rowindex="{i}"]'
        company_div = tree.css_first(f'{row_selector}[aria-colindex="1"]')
        if company_div is None:
            break  # Exit if no more rows are found

        # Extracting data for each column in the row
        row_data = {
            'Company': company_div.text(deep=True, separator=' '),
            'Emails': tree.css_first(f'{row_selector}[aria-colindex="2"]').text(deep=True, separator=' '),
            'Addresses': tree.css_first(f'{row_selector}[aria-colindex="3"]').text(deep=True, separator=' '),
            'Urls': tree.css_first(f'{row_selector}[aria-colindex="4"]').text(deep=True, separator=' '),
            'Description': tree.css_first(f'{row_selector}[aria-colindex="5"]').text(deep=True, separator=' '),
            'Stage': tree.css_first(f'{row_selector}[aria-colindex="6"]').text(deep=True, separator=' '),
            'Number of Portfolio Organizations': tree.css_first(f'{row_selector}[aria-colindex="7"]').text(deep=True, separator=' '),
            'Number of Investments': tree.css_first(f'{row_selector}[aria-colindex="8"]').text(deep=True, separator=' '),
            'Accelerator Duration (in weeks)': tree.css_first(f'{row_selector}[aria-colindex="9"]').text(deep=True, separator=' '),
            'Number of Exits': tree.css_first(f'{row_selector}[aria-colindex="10"]').text(deep=True, separator=' '),
            'Linkedin': tree.css_first(f'{row_selector}[aria-colindex="11"]').text(deep=True, separator=' '),
            'Founders': tree.css_first(f'{row_selector}[aria-colindex="12"]').text(deep=True, separator=' '),
            'Twitter': tree.css_first(f'{row_selector}[aria-colindex="13"]').text(deep=True, separator=' ')

        }
        data.append(row_data)

    return data

if __name__ == '__main__':
    url = 'https://app.folk.app/shared/All-accelerators-rw0kuUNqtzl6j6dDQquoZTYF6MFKIQHo'
    html = extract_full_body_html(url)
    data = extraction(html)
    df = pd.DataFrame(data)
    df.to_excel('output.xlsx', index=False)

In my script, I think the HTML content of the page is not fully available to be scraped, or that as the script scrolls further, the HTML for earlier rows is no longer loaded or visible to be scraped.
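
If that is what is happening, a quick check like this should show it (a minimal sketch; it reuses the page object, the time import, and the gridcell selector from extract_full_body_html above). If the first aria-rowindex present in the DOM increases after scrolling, earlier rows are being removed as new ones render, so page.inner_html('body') can never contain all 2509 rows at once:

def first_rendered_rowindex(page):
    # aria-rowindex of the first gridcell currently in the DOM, or None
    return page.evaluate('''() => {
        const rows = document.querySelectorAll('div[role="gridcell"][aria-rowindex]');
        return rows.length ? rows[0].getAttribute("aria-rowindex") : null;
    }''')

before = first_rendered_rowindex(page)
page.evaluate("window.scrollTo(0, document.body.scrollHeight)")
time.sleep(2)  # give the grid time to re-render
after = first_rendered_rowindex(page)
print(before, after)  # if `after` is larger, the table is virtualized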


Solution

  • I think this is more or less what you want:

    import time
    from playwright.sync_api import sync_playwright
    
    
    with sync_playwright() as p:
        browser = p.chromium.launch(headless=False)
        context = browser.new_context()
        page = context.new_page()
        page.goto('https://app.folk.app/shared/All-accelerators-rw0kuUNqtzl6j6dDQquoZTYF6MFKIQHo')
    
    # We click on the table (otherwise we cannot scroll it)
        page.locator("//div[@data-testid='contact-table']").click()
    
        # We make scroll till the end of the page
        for i in range(5):  # make the range as long as needed
            page.mouse.wheel(0, 150000)
            time.sleep(1)
    
    # We get the aria-rowindex of the last row of the table
    num_rows = page.locator("//div[@role='row'][last()]").get_attribute('aria-rowindex')
    print(num_rows)
    
    # We scroll back up to the top of the page
        for i in range(5):  # make the range as long as needed
            page.mouse.wheel(0, -150000)
            time.sleep(1)
    
    # We iterate over all the rows, using the row count we got earlier
        for i in range(1, int(num_rows)+1):
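        # NOTE: the c-klyBnI... classes are generated styles copied from the live page; they may change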
            page.locator(f"//div[@class='c-klyBnI c-klyBnI-inIPuL-css']/div[@aria-rowindex='{i}']").scroll_into_view_if_needed()
            company = page.locator(f"//div[@class='c-klyBnI c-klyBnI-inIPuL-css']/div[@aria-rowindex='{i}']//span[2]").inner_text()
        email = page.locator(f"//div[@role='row' and @aria-rowindex='{i}']//div[@aria-colindex='2']/span").inner_text()
            print(f"{i} - {company} - {email}")
    

    I left some comments in the code to explain what it is doing.

    Basically, as you said, the page is rendered by JavaScript, so I think the key is to find the index of the last row first and then scroll row by row until we get all the data.

    I just extracted a couple of columns, but it should be easy for you to take the rest of them.
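
    For example, a sketch along these lines should get the whole table into Excel, the way your extraction() function does. I am assuming every cell is a div with an aria-colindex inside its row, as in the email locator above, and I copied the column names from your script; the exact selectors may need adjusting to the live page:

    import time
    import pandas as pd
    from playwright.sync_api import sync_playwright

    # Column names in aria-colindex order (1..13), copied from the
    # extraction() function in the question
    COLUMNS = ['Company', 'Emails', 'Addresses', 'Urls', 'Description', 'Stage',
               'Number of Portfolio Organizations', 'Number of Investments',
               'Accelerator Duration (in weeks)', 'Number of Exits',
               'Linkedin', 'Founders', 'Twitter']

    with sync_playwright() as p:
        browser = p.chromium.launch(headless=False)
        page = browser.new_context().new_page()
        page.goto('https://app.folk.app/shared/All-accelerators-rw0kuUNqtzl6j6dDQquoZTYF6MFKIQHo')
        page.locator("//div[@data-testid='contact-table']").click()

        # Scroll to the bottom once to learn the last aria-rowindex, then back up
        for _ in range(5):
            page.mouse.wheel(0, 150000)
            time.sleep(1)
        num_rows = int(page.locator("//div[@role='row'][last()]").get_attribute('aria-rowindex'))
        for _ in range(5):
            page.mouse.wheel(0, -150000)
            time.sleep(1)

        data = []
        for i in range(1, num_rows + 1):
            row = page.locator(f"//div[@role='row' and @aria-rowindex='{i}']")
            row.scroll_into_view_if_needed()  # forces the virtualized row to render
            record = {}
            for col, name in enumerate(COLUMNS, start=1):
                # .first guards against nested elements sharing the attribute
                cell = row.locator(f"xpath=.//div[@aria-colindex='{col}']").first
                record[name] = cell.inner_text() if cell.count() else ''
            data.append(record)

        browser.close()

    pd.DataFrame(data).to_excel('output.xlsx', index=False)

    Scrolling each row into view before reading it is what keeps the virtualized grid rendering the row you are about to read.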

    Good luck!