python selenium web-scraping beautifulsoup urllib

Parse the HTML of a whole web page after scrolling down to the end


from bs4 import BeautifulSoup
import urllib,sys
# Python 2 only: re-expose sys.setdefaultencoding and switch the
# process-wide default encoding to UTF-8.
reload(sys)
sys.setdefaultencoding("utf-8")

# Open the page with a plain HTTP request, then read the whole response body.
r = urllib.urlopen('https://twitter.com/ndtv')
r = r.read()

# Parse the downloaded markup.
soup = BeautifulSoup(r)

This gives me only part of the page, not the whole page as it appears after scrolling down to the end, which is what I want.

EDIT:

from selenium import webdriver
from selenium.common.exceptions import StaleElementReferenceException, TimeoutException
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from bs4 import BeautifulSoup
import urllib,sys,requests
# Python 2 only: reload(sys) brings back sys.setdefaultencoding, which is
# removed from the sys module at interpreter start-up; the call below then
# makes UTF-8 the process-wide default string encoding.
reload(sys)
sys.setdefaultencoding("utf-8")

class wait_for_more_than_n_elements_to_be_present(object):
    """Wait condition that holds once more than ``count`` elements match.

    Instances are callables that take a driver, so they can be handed to
    ``WebDriverWait.until``.

    Args:
        locator: ``(by, value)`` pair, e.g. ``(By.CSS_SELECTOR, "li")``.
        count: Number of matching elements that has to be exceeded.
    """

    def __init__(self, locator, count):
        self.locator = locator
        self.count = count

    def __call__(self, driver):
        """Return True if ``driver`` finds more than ``count`` matches.

        A ``StaleElementReferenceException`` raised during the lookup is
        reported as False so the surrounding wait simply polls again.
        """
        try:
            # Use the driver's public lookup method instead of the private
            # ``expected_conditions._find_elements`` helper, which is not part
            # of Selenium's documented interface.
            elements = driver.find_elements(*self.locator)
            return len(elements) > self.count
        except StaleElementReferenceException:
            return False

def return_html_code(url):
    """Return the page source of *url* after scrolling until no more tweets load.

    Opens a Firefox window, waits for the first tweet to become visible and
    then keeps scrolling the last tweet into view until a 10 second wait
    passes without the number of tweets growing.

    Args:
        url: Address of the page to open.

    Returns:
        The value of ``driver.page_source`` once scrolling has finished.

    Raises:
        TimeoutException: If no tweet becomes visible within 10 seconds of
            opening the page.
    """
    tweet_locator = (By.CSS_SELECTOR, "li[data-item-id]")
    driver = webdriver.Firefox()
    # try/finally guarantees the browser is shut down even when a wait or a
    # script call raises; quit() also ends the WebDriver session, whereas
    # close() only closes the current window.
    try:
        driver.maximize_window()
        driver.get(url)
        # initial wait for the tweets to load
        wait = WebDriverWait(driver, 10)
        wait.until(EC.visibility_of_element_located(tweet_locator))
        # scroll down to the last tweet until no more tweets are loaded
        while True:
            tweets = driver.find_elements(*tweet_locator)
            if not tweets:
                # Nothing left to scroll to; avoids an IndexError on tweets[-1].
                break
            number_of_tweets = len(tweets)
            # Progress output; the parenthesised form prints the same on
            # Python 2 and also parses on Python 3.
            print(number_of_tweets)
            driver.execute_script("arguments[0].scrollIntoView();", tweets[-1])
            try:
                wait.until(wait_for_more_than_n_elements_to_be_present(tweet_locator, number_of_tweets))
            except TimeoutException:
                # No additional tweets appeared within the timeout: done.
                break
        return driver.page_source
    finally:
        driver.quit()


url = 'https://twitter.com/thecoolstacks'
# using selenium browser
html_source = return_html_code(url)
# Name the parser explicitly so the result does not depend on which optional
# parser libraries happen to be installed.
soup_selenium = BeautifulSoup(html_source, "html.parser")
print(soup_selenium)
text_tweet = []
alltweets_selenium = soup_selenium.find_all(attrs={'data-item-type': 'tweet'})
for tweet in alltweets_selenium:
    # Text of tweet
    html_tweet = tweet.find_all("p", class_="TweetTextSize TweetTextSize--16px js-tweet-text tweet-text")
    if not html_tweet:
        # Items without a matching text paragraph would otherwise raise an
        # IndexError on html_tweet[0]; skip them.
        continue
    text_tweet.append(''.join(html_tweet[0].find_all(text=True)))
print(text_tweet)

Intended Output:

import requests
from bs4 import BeautifulSoup

# Fetch the statically served page and list the tweet items it contains.
url = 'https://twitter.com/thecoolstacks'
req = requests.get(url)
soup = BeautifulSoup(req.content)
alltweets = soup.find_all(attrs={'data-item-type': 'tweet'})
# Only show the first tweet when at least one was found.
if alltweets:
    print(alltweets[0])

Solution

  • I would still insist on using the Twitter API.

    Alternatively, here is how you can approach the problem with selenium:

    Implementation:

    from selenium import webdriver
    from selenium.common.exceptions import StaleElementReferenceException, TimeoutException
    from selenium.webdriver.common.by import By
    from selenium.webdriver.support.ui import WebDriverWait
    from selenium.webdriver.support import expected_conditions as EC
    
    
    class wait_for_more_than_n_elements_to_be_present(object):
        """Wait condition that holds once more than ``count`` elements match.

        Instances are callables that take a driver, so they can be handed to
        ``WebDriverWait.until``.

        Args:
            locator: ``(by, value)`` pair, e.g. ``(By.CSS_SELECTOR, "li")``.
            count: Number of matching elements that has to be exceeded.
        """

        def __init__(self, locator, count):
            self.locator = locator
            self.count = count

        def __call__(self, driver):
            """Return True if ``driver`` finds more than ``count`` matches.

            A ``StaleElementReferenceException`` raised during the lookup is
            reported as False so the surrounding wait simply polls again.
            """
            try:
                # Use the driver's public lookup method instead of the private
                # ``expected_conditions._find_elements`` helper, which is not
                # part of Selenium's documented interface.
                elements = driver.find_elements(*self.locator)
                return len(elements) > self.count
            except StaleElementReferenceException:
                return False
    
    
    url = "https://twitter.com/ndtv"
    # One locator shared by the initial wait, the lookup and the wait condition.
    tweet_locator = (By.CSS_SELECTOR, "li[data-item-id]")

    # The driver is left open here so that its page_source can be read
    # once scrolling has finished.
    driver = webdriver.Firefox()
    driver.maximize_window()
    driver.get(url)

    # initial wait for the tweets to load
    wait = WebDriverWait(driver, 10)
    wait.until(EC.visibility_of_element_located(tweet_locator))

    # scroll down to the last tweet until no more tweets are loaded
    while True:
        tweets = driver.find_elements(*tweet_locator)
        if not tweets:
            # Nothing left to scroll to; avoids an IndexError on tweets[-1].
            break
        number_of_tweets = len(tweets)

        driver.execute_script("arguments[0].scrollIntoView();", tweets[-1])

        try:
            wait.until(wait_for_more_than_n_elements_to_be_present(tweet_locator, number_of_tweets))
        except TimeoutException:
            # No additional tweets appeared within the timeout: done.
            break
    

    This scrolls down as far as needed to load all of the existing tweets in this channel.


    Here is the HTML-parsing snippet, extracting tweets:

    # BeautifulSoup is not among the imports of the implementation above.
    from bs4 import BeautifulSoup

    page_source = driver.page_source
    # quit() ends the whole WebDriver session; close() would only close the
    # current window.
    driver.quit()

    # Name the parser explicitly so the result does not depend on which
    # optional parser libraries happen to be installed.
    soup = BeautifulSoup(page_source, "html.parser")
    for tweet in soup.select("div.tweet div.content"):
        # Content blocks without a <p> would raise AttributeError on .text.
        if tweet.p is not None:
            print(tweet.p.text)
    

    It prints:

    Father's Day Facebook post by arrested cop Suhas Gokhale's son got nearly 10,000 likes http://www.ndtv.com/india-news/fathers-day-post-by-arrested-cop-suhas-gokhales-son-got-nearly-10-000-likes-775634  pic.twitter.com/JUqmdWNQ3c
    #HWL2015 End of third quarter! Breathtaking stuff. India 2-2 Pakistan - http://sports.ndtv.com/hockey/news/244463-hockey-world-league-semifinal-india-vs-pakistan-antwerp …
    Why these Kashmiri boys may miss their IIT dream https://www.ndtv.com/india-news/why-these-kashmiri-boys-may-miss-their-iit-dream-775677  pic.twitter.com/gohX21Gibi
    ...