python pandas web-scraping beautifulsoup

Python: BeautifulSoup scraping yield data


I am trying to scrape Yield tables for several countries and several maturities from a website. So far I only get empty tables:

enter image description here

while it should rather look like:

enter image description here

So far I have been doing the following:

import time 
import datetime as dt
import pandas as pd
from bs4 import BeautifulSoup
from dateutil.relativedelta import relativedelta
import requests
import re 
import os

# Current working directory; the output workbook's file name is built from it below.
path = os.getcwd()

def ZCCWord(Date, country):
    """Scrape the yield table of one country and return it as a DataFrame.

    Downloads ``http://www.worldgovernmentbonds.com/country/<country>``,
    takes the first ``<table>`` of the page, skips its first row and reads
    the maturity from the second cell and the yield from the third cell of
    every remaining row.

    Args:
        Date: Reference date; every maturity date is ``Date`` plus the tenor.
        country: Country slug appended to the site URL (e.g. ``'spain'``).

    Returns:
        A DataFrame with the columns ``TENOR`` (int), ``PERIOD`` (the unit
        text that follows the number), ``DATES`` (``Date`` shifted by the
        tenor) and ``RATES`` (the scraped number divided by 100).

    Raises:
        IndexError: If the page contains no ``<table>`` element.
        KeyError: If the parsed rows have fewer than three cells.
    """
    # Site URL
    url = "http://www.worldgovernmentbonds.com/country/" + country

    html_content = requests.get(url).text
    soup = BeautifulSoup(html_content, "lxml")
    table1 = soup.find_all("table")[0]
    body_rows = table1.find_all("tr")[1:]  # the first row is skipped

    # One list per table row, one cleaned string per <td> cell.
    all_rows = [
        [re.sub("(\xa0)|(\n)|,", "", cell.text) for cell in table_row.find_all("td")]
        for table_row in body_rows
    ]
    AAA = pd.DataFrame(all_rows)

    # Split e.g. "10 years" into the number and the unit.
    ZCC = AAA[1].str.extract('([^a-zA-Z]+)([a-zA-Z]+)', expand=True)
    ZCC.columns = ['TENOR', 'PERIOD']
    # Only strip here: assigning the result of .str.isdigit() would replace
    # the tenor with booleans, which astype(int) then turns into 0/1.
    ZCC['TENOR'] = ZCC['TENOR'].str.strip()

    # Attach the rate while both columns still share the table's row index,
    # so a row without a parsable tenor cannot shift the rates of the others.
    # NOTE(review): the pattern has no sign, so a negative yield would lose
    # its minus - confirm how the site renders negative values.
    rates = AAA[2].str.extract(r'([0-9.]+)', expand=False).astype(float)
    ZCC['RATES'] = rates / 100

    # Keep only rows whose tenor is a plain integer.
    ZCC = ZCC.dropna(subset=['TENOR', 'PERIOD'])
    ZCC = ZCC[ZCC['TENOR'].str.isdigit().astype(bool)].reset_index(drop=True)
    ZCC['TENOR'] = ZCC['TENOR'].astype(int)

    # Maturity date: months for 'month'/'months', years for everything else.
    dates = []
    for tenor, period in zip(ZCC['TENOR'], ZCC['PERIOD']):
        tenor = int(tenor)  # plain Python int for relativedelta
        if period in ('month', 'months'):
            dates.append(Date + relativedelta(months=tenor))
        else:
            dates.append(Date + relativedelta(years=tenor))
    ZCC['DATES'] = dates

    return ZCC.reindex(['TENOR', 'PERIOD', 'DATES', 'RATES'], axis=1)



# Country slugs appended to the site URL; each one becomes a sheet in the workbook.
LitsCountries = [
    'spain', 'portugal', 'latvia', 'ireland', 'united-kingdom',
    'germany', 'france', 'italy', 'sweden', 'finland', 'greece',
    'poland', 'romania', 'hungary', 'netherlands',
]

# Timestamped workbook inside `path`; os.path.join picks the right separator
# instead of a hard-coded backslash.
todays_date = os.path.join(
    path,
    'WorldYields' + dt.datetime.now().strftime("%Y-%m-%d-%H-%M") + '.xlsx',
)
dictYield = {}

# The context manager closes the workbook even when one of the scrapes raises.
with pd.ExcelWriter(
    todays_date,
    engine='xlsxwriter',
    engine_kwargs={'options': {'strings_to_urls': False}},
) as writer:
    # Computed once so every country uses the same reference date.
    Date = pd.to_datetime('today').date()
    for i, country in enumerate(LitsCountries):
        ZCC = ZCCWord(Date, country)
        dictYield[i] = ZCC
        ZCC.to_excel(writer, sheet_name=country)

time.sleep(60) # wait one minute

I would also be fine with other websites, solutions, or methods that provide similar output. Any ideas?

thanks in advance!


Solution

  • To get the table data you need to use the wp-json endpoint combined with the country id. You can retrieve the country id from the website you are already requesting by finding it in the raw text response.

    Next you need to request the wp-json endpoint. There you will receive a JSON object that includes the table HTML.

    def request_table(country_id: str):
        """POST to the site's wp-json endpoint and return the main table.

        Args:
            country_id: Value sent as ``COUNTRY1.SYMBOL`` in the request body.

        Returns:
            The value stored under ``"mainTable"`` in the JSON response, or
            ``None`` when the response has no such key.

        Raises:
            requests.HTTPError: If the server answers with an error status.
        """
        url = "https://www.worldgovernmentbonds.com/wp-json/country/v1/main"

        payload = {
            "GLOBALVAR": {
                "JS_VARIABLE": "jsGlobalVars",
                "FUNCTION": "Country",
                "DOMESTIC": True,
                "ENDPOINT": "https://www.worldgovernmentbonds.com/wp-json/country/v1/historical",
                "DATE_RIF": "2099-12-31",
                "OBJ": None,
                "COUNTRY1": {
                    "SYMBOL": country_id,
                },
                "COUNTRY2": None,
                "OBJ1": None,
                "OBJ2": None,
            },
        }
        headers = {
            'accept': '*/*',
            'content-type': 'application/json; charset=UTF-8',
            'origin': 'https://www.worldgovernmentbonds.com',
        }
        # json= lets requests serialize the payload itself, so the snippet no
        # longer depends on a `json` module that is never imported.
        # The timeout keeps a stalled server from blocking the run forever.
        response = requests.post(url, headers=headers, json=payload, timeout=30)
        response.raise_for_status()

        data = response.json()
        return data.get("mainTable")
    
    
    
    def ZCCWord(Date, country):
        """Fetch a country's yield table through the wp-json endpoint.

        Downloads the country page, reads the country id that follows the
        ``"SYMBOL":"`` marker in the raw response text, requests the table
        HTML with ``request_table`` and parses it with BeautifulSoup. The
        remaining table processing is left as a placeholder at the end.

        Args:
            Date: Reference date, for use by the remaining table processing.
            country: Country slug appended to the site URL.

        Raises:
            ValueError: If the country id cannot be found in the page, or if
                the endpoint response contains no table HTML.
            IndexError: If the returned HTML contains no ``<table>`` element.
        """
        # Site URL
        url = "http://www.worldgovernmentbonds.com/country/" + country
        html_content = requests.get(url).text

        # extract country id: the text between '"SYMBOL":"' and the next '",'
        _, marker, tail = html_content.partition("\"SYMBOL\":\"")
        country_id, terminator, _ = tail.partition("\",")
        if not marker or not terminator:
            # Without this check a missing marker silently yields a garbage id.
            raise ValueError("could not find the country id in " + url)

        # request table
        table_html = request_table(country_id)
        if table_html is None:
            raise ValueError("no table returned for country id " + country_id)

        soup = BeautifulSoup(table_html, "lxml")
        gdp = soup.find_all("table")
        table1 = gdp[0]

        # ... Rest of your code ...