Tags: python, loops, web-scraping, beautifulsoup, python-beautifultable

Web-scrape Multiple Pages with Python - output issue


Happy new year, Python community,

I am trying to extract a table from a website using Python and BeautifulSoup4.

I am struggling to see any results in my output file. The code runs without errors, but nothing is written to the file.

My code is below:

from bs4 import BeautifulSoup as bsoup
import requests as rq
import re

base_url = 'http://www.creationdentreprise.sn/rechercher-une-societe?field_rc_societe_value=&field_ninea_societe_value=&denomination=&field_localite_nid=All&field_siege_societe_value=&field_forme_juriduqe_nid=All&field_secteur_nid=All&field_date_crea_societe_value='
r = rq.get(base_url)

soup = bsoup(r.text)
# Use regex to isolate only the links of the page numbers, the one you click on.
page_count_links = soup.find_all("a",href=re.compile(r".http://www.creationdentreprise.sn/rechercher-une-societe?field_rc_societe_value=&field_ninea_societe_value=&denomination=&field_localite_nid=All&field_siege_societe_value=&field_forme_juriduqe_nid=All&field_secteur_nid=All&field_date_crea_societe_value=&page=.*"))
try: # Make sure there are more than one page, otherwise, set to 1.
    num_pages = int(page_count_links[-1].get_text())
except IndexError:
    num_pages = 1

# Add 1 because Python range.
url_list = ["{}&page={}".format(base_url, str(page)) for page in range(1, 3)]

# Open the text file. Use with to save self from grief.
with open("results.txt","wb") as acct:
    for url_ in url_list:
        print("Processing {}...".format(url_))
        r_new = rq.get(url_)
        soup_new = bsoup(r_new.text)
        for tr in soup_new.find_all('tr', align='center'):
            stack = []
            for td in tr.findAll('td'):
                stack.append(td.text.replace('\n', '').replace('\t', '').strip())
            acct.write(", ".join(stack) + '\n')

Solution

  • soup_new.find_all('tr', align='center') returns an empty list, because none of the page's <tr> tags have align="center". Since the inner loop never runs, nothing is ever written, which is also why the script finishes without raising an error.

    Try switching that to for tr in soup_new.find_all('tr'):

    Secondly, since you're writing strings, change the file mode from open("results.txt","wb") to open("results.txt","w"). Writing a str to a file opened in binary mode raises a TypeError as soon as the loop actually produces a row. Standalone demos of both points follow the full listing below.

    from bs4 import BeautifulSoup as bsoup
    import requests as rq
    import re
    
    # Note: no '&page=...' here; the page number is appended once, below.
    base_url = 'http://www.creationdentreprise.sn/rechercher-une-societe?field_rc_societe_value=&field_ninea_societe_value=&denomination=&field_localite_nid=All&field_siege_societe_value=&field_forme_juriduqe_nid=All&field_secteur_nid=All&field_date_crea_societe_value='
    r = rq.get(base_url)
    
    soup = bsoup(r.text, "html.parser")  # name the parser explicitly to avoid bs4's warning
    # Use regex to isolate only the links of the page numbers, the ones you click on.
    # re.escape is needed because the URL is full of regex metacharacters ('?', '=', etc.).
    page_count_links = soup.find_all("a", href=re.compile(re.escape(base_url) + r"&page=\d+"))
    try: # Make sure there are more than one page, otherwise, set to 1.
        num_pages = int(page_count_links[-1].get_text())
    except IndexError:
        num_pages = 1
    
    # Hard-coded to the first 2 pages for testing; use range(1, num_pages + 1) to crawl them all.
    url_list = ["{}&page={}".format(base_url, page) for page in range(1, 3)]
    
    # Open the text file. Use with to save self from grief.
    with open("results.txt","w") as acct:
        for url_ in url_list:
            print("Processing {}...".format(url_))
            r_new = rq.get(url_)
            soup_new = bsoup(r_new.text, "html.parser")

            for tr in soup_new.find_all('tr'):
                stack = []
                for td in tr.find_all('td'):
                    stack.append(td.text.replace('\n', '').replace('\t', '').strip())
                acct.write(", ".join(stack) + '\n')
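
    To see the first point in isolation, here is a standalone demo (no network access needed; the HTML string is made up for illustration):

    from bs4 import BeautifulSoup as bsoup

    html = "<table><tr><td>RC 123</td><td>Dakar</td></tr></table>"
    soup = bsoup(html, "html.parser")

    # The <tr> above has no align attribute, so the filtered search matches nothing.
    print(soup.find_all('tr', align='center'))  # []
    print(soup.find_all('tr'))                  # [<tr><td>RC 123</td><td>Dakar</td></tr>]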
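
    And the second point: once rows are actually produced, binary mode fails outright, because a file opened with "wb" only accepts bytes, not str:

    row = "RC 123, Dakar"  # made-up row for illustration

    with open("demo.txt", "w") as f:       # text mode: str is fine
        f.write(row + "\n")

    try:
        with open("demo.txt", "wb") as f:  # binary mode: str is rejected
            f.write(row + "\n")
    except TypeError as exc:
        print(exc)  # a bytes-like object is required, not 'str'

    Since you are joining the cells with commas yourself, the standard-library csv module is also worth a look: csv.writer(acct).writerow(stack) handles commas and quotes that appear inside cell text, which a plain ", ".join(stack) does not.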