python, web-scraping, beautifulsoup

Real estate website scraping with links using Python


I tried to use Python and BeautifulSoup to scrape a commercial real estate website, with the corresponding href also shown in the final CSV list. But the link column is always empty. How could I extract the href, and schedule this task to run through the whole website weekly? Thank you in advance!

from bs4 import BeautifulSoup
import requests
from csv import writer
from urllib.parse import urljoin


# Scrape the Stockholm/City commercial-listings page and dump
# title / location / area / absolute link into a CSV file.
url = "https://objektvision.se/lediga_lokaler/stockholm/city"
page = requests.get(url)

soup = BeautifulSoup(page.content, 'html.parser')
# Each result card is itself an <a> tag, so the href lives directly on it.
listings = soup.find_all('a', class_="ov--list-item d-flex")

with open('lokal_stockholm_city_v11.csv', 'w', encoding='utf8', newline='') as f:
    thewriter = writer(f)
    header = ['title', 'location', 'area', 'link']
    thewriter.writerow(header)

    # NOTE: don't shadow the builtin `list` with the loop variable.
    for item in listings:
        title = item.find('div', class_="font-weight-bold text-ov street-address").text.replace('\r\n', '')
        location = item.find('div', class_="text-ov-dark-grey area-address").text.replace('\r\n', '')
        area = item.find('div', class_="font-weight-bold size").text.replace('\r\n', '')
        # The original `item.find('a', attrs_=...)` always returned None:
        # the keyword is `attrs`, and the card IS the <a> tag anyway.
        # The href is relative, so resolve it against the site root;
        # urljoin avoids the double slash a plain '+' would produce.
        link = urljoin('https://objektvision.se/', item['href'])

        info = [title, location, area, link]
        thewriter.writerow(info)

The final csv looks like this


Solution

  • Focused on getting the href, there are two points you should be aware of: the hrefs in your soup do not start with the domain — they are relative — and you do not need to `find()` the <a> again, because you are already processing it via your ResultSet.

    So to get your href, call `.get('href')` or `['href']` directly and join it with the base url:

        link = 'https://objektvision.se/'+e['href']
    

    Example

    Note: Do not use `list` as a variable name — it shadows the builtin; changed it to `e` for element.

    from bs4 import BeautifulSoup
    import requests
    from csv import writer
    from urllib.parse import urljoin

    # Scrape the Stockholm/City listings page and write one CSV row per card.
    url = "https://objektvision.se/lediga_lokaler/stockholm/city"
    page = requests.get(url)

    soup = BeautifulSoup(page.content, 'html.parser')
    # Each result card is an <a> tag carrying the (relative) href itself.
    lists = soup.find_all('a', class_="ov--list-item d-flex")

    with open('lokal_stockholm_city_v11.csv', 'w', encoding='utf8', newline='') as f:
        thewriter = writer(f)
        header = ['title', 'location', 'area', 'link']
        thewriter.writerow(header)

        for e in lists:
            title = e.find('div', class_="font-weight-bold text-ov street-address").text.replace('\r\n', '')
            location = e.find('div', class_="text-ov-dark-grey area-address").text.replace('\r\n', '')
            area = e.find('div', class_="font-weight-bold size").text.replace('\r\n', '')
            # urljoin avoids the double slash that plain concatenation produced
            # ('https://objektvision.se/' + '/Beskriv/...' -> '...se//Beskriv/...').
            link = urljoin('https://objektvision.se/', e['href'])

            info = [title, location, area, link]
            thewriter.writerow(info)
    

    Output

    title location area link
    Kungsgatan 49 City , Stockholm 923 m² https://objektvision.se//Beskriv/218003079?IsPremium=True
    Sveavägen 20 City , Stockholm 1 000 - 2 200 m² https://objektvision.se//Beskriv/218017049?IsPremium=True
    Sergelgatan 8-14/Sveavägen 5-9 /Mäste... City , Stockholm 1 373 m² https://objektvision.se//Beskriv/218030745?IsPremium=True
    Adolf Fredriks Kyrkogata 13 Stockholm 191 m² https://objektvision.se//Beskriv/218031939
    Arena Sergel - Malmskillnadsgatan 36 City , Stockholm 1 - 3 000 m² https://objektvision.se//Beskriv/218006788