java web-scraping css-selectors jsoup data-extraction

Why is the HTML code different when parsing a site using Jsoup than when using a browser?


I am on the website http://www.flashscore.com/nhl/ and I am trying to extract the links of the 'Today's Matches' table.

I am trying it with the following code, but it does not work. Can you point out where the mistake is?

  // Request the feed URL with a browser-like cookie, referrer, user agent and
  // XHR-style headers, and parse the response body into a Jsoup Document.
  final Document page = Jsoup
    .connect("http://d.flashscore.com/x/feed/t_4_200_G2Op923t_1_en_1")
    .cookie("_ga","GA1.2.47011772.1485726144")
    .referrer("http://d.flashscore.com/x/feed/proxy-local")
    .userAgent("Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.87 Safari/537.36")
    .header("X-Fsign", "SW9D1eZo")
    .header("X-GeoIP", "1")
    .header("X-Requested-With", "XMLHttpRequest")
    .header("Accept" , "*/*")
    .get();

// Visit every <tr> found inside a <table class="hockey"> in the parsed document.
for (Element game : page.select("table.hockey tr")) {
// NOTE(review): getElementsByClass is given two space-separated class names here;
// it presumably expects a single class name, so this may match nothing - confirm
// against the Jsoup documentation.
Elements links = game.getElementsByClass("tr-first stage-finished");
for (Element link : links) {
    // Read the element's href attribute and its text; neither value is used afterwards.
    String linkHref = link.attr("href");
    String linkText = link.text();
}
 }

To try to fix it, I started debugging. The debugger shows that we do get the page (although the HTML we receive looks rather strange), but the for loop never even starts. I tried changing the page.select("") call to other methods (such as getElementsByAttribute), but I have only just started learning web scraping, so I still need to get familiar with the methods for navigating a document. How am I supposed to extract this data?


Solution

  • As said in the comments, this website needs to execute some JavaScript in order to build those linkable elements. Jsoup only parses HTML; it doesn't run any JS, so the HTML you get from Jsoup will not be the same as the HTML source you see in a browser.

    You need to load the website as if you were running it in a real browser. You can do that programmatically using Selenium WebDriver and Firefox.

    I've tried it with your example site and it works:

    pom.xml

    <!-- Minimal Maven build for the scraping example: a jar project compiled
         for Java 1.8 with the Selenium Firefox driver as its only dependency. -->
    <project>
    
    <modelVersion>4.0.0</modelVersion>
    <groupId>com.test</groupId>
    <artifactId>test</artifactId>
    <version>1.0-SNAPSHOT</version>
    <build>
      <plugins>
        <plugin>
        <groupId>org.apache.maven.plugins</groupId>
        <artifactId>maven-compiler-plugin</artifactId>
        <configuration>
          <!-- Java 1.8 source/target: App.java uses lambdas, method references and streams. -->
          <source>1.8</source>
          <target>1.8</target>
          </configuration>
        </plugin>
      </plugins>
    </build>
    <packaging>jar</packaging>
    
    <name>test</name>
    <url>http://maven.apache.org</url>
    
    <properties>
      <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
    </properties>
    
    <dependencies>
      <!-- Provides FirefoxDriver; the postscript of this answer reports
           version 2.43.0 working together with Firefox 32.0. -->
      <dependency>
        <groupId>org.seleniumhq.selenium</groupId>
        <artifactId>selenium-firefox-driver</artifactId>
        <version>2.43.0</version>
      </dependency>
    </dependencies>
    
    </project>
    

    App.java

    package com.test;
    
    import org.openqa.selenium.By;
    import org.openqa.selenium.WebDriver;
    import org.openqa.selenium.firefox.FirefoxDriver;
    import java.util.Collections;
    import java.util.List;
    import java.util.stream.Collectors;
    
    /**
     * Collects match-summary links from the FlashScore NHL page.
     *
     * <p>The page is loaded in a real Firefox instance through Selenium WebDriver,
     * and the links are derived from the {@code id} attributes of the rows of the
     * element with class {@code hockey}.
     */
    public class App {

        /** Page whose rendered table rows are inspected. */
        private static final String BASE_URL = "http://www.flashscore.com/nhl/";

        /** Template for a match-summary link; {@code %s} receives the bare match id. */
        private static final String MATCH_URL_FORMAT = "http://www.flashscore.com/match/%s/#match-summary";

        /** System property pointing WebDriver at the geckodriver binary. */
        private static final String DRIVER_PROPERTY = "webdriver.firefox.marionette";

        /**
         * Local geckodriver location. Download geckodriver from
         * https://github.com/mozilla/geckodriver/releases and adjust this path to
         * match your machine.
         */
        private static final String DRIVER_PATH = "C:\\apps\\geckodriver.exe";

        /**
         * Prints every match link found on the page, one per line.
         *
         * @param args ignored
         */
        public static void main(String[] args) {
            App app = new App();
            List<String> links = app.parseLinks();
            links.forEach(System.out::println);
        }

        /**
         * Opens the NHL page in Firefox and builds one match-summary link for every
         * table row that carries a non-empty {@code id} attribute.
         *
         * <p>The browser is always shut down before this method returns, whether
         * the scrape succeeded or failed.
         *
         * @return the match links in page order, or an empty list if anything goes
         *     wrong (the stack trace is printed in that case)
         */
        public List<String> parseLinks() {
            WebDriver driver = null;
            try {
                System.setProperty(DRIVER_PROPERTY, DRIVER_PATH);
                driver = new FirefoxDriver();
                driver.get(BASE_URL);

                // Read each row's id once (every getAttribute call is a round trip
                // to the browser) and defensively skip rows whose id is null or empty.
                return driver.findElement(By.className("hockey"))
                        .findElements(By.tagName("tr"))
                        .stream()
                        .distinct()
                        .map(row -> row.getAttribute("id"))
                        .filter(id -> id != null && !id.isEmpty())
                        .map(this::createLink)
                        .collect(Collectors.toList());

            } catch (Exception e) {
                // Top-level boundary of the example: report the failure and fall back to "no links".
                e.printStackTrace();
                return Collections.emptyList();
            } finally {
                quitQuietly(driver);
            }
        }

        /**
         * Closes the browser session, if one was started. A failure while closing is
         * reported but not propagated, so it cannot replace the result or the
         * original error of the scrape.
         */
        private static void quitQuietly(WebDriver driver) {
            if (driver == null) {
                return;
            }
            try {
                driver.quit();
            } catch (RuntimeException e) {
                e.printStackTrace();
            }
        }

        /** Builds the match-summary URL for the given row id. */
        private String createLink(String id) {
            return String.format(MATCH_URL_FORMAT, extractId(id));
        }

        /**
         * Removes the {@code x_4_} marker from the row id or, only when that marker
         * is absent, the {@code g_4_} marker; any other id is returned unchanged.
         */
        private String extractId(String id) {
            if (id.contains("x_4_")) {
                return id.replace("x_4_", "");
            }
            if (id.contains("g_4_")) {
                return id.replace("g_4_", "");
            }
            return id;
        }
    }
    

    Output:

    http://www.flashscore.com/match/f9MJJI69/#match-summary
    http://www.flashscore.com/match/zZCyd0dC/#match-summary
    http://www.flashscore.com/match/drEXdts6/#match-summary
    http://www.flashscore.com/match/EJOScMRa/#match-summary
    http://www.flashscore.com/match/0GKOb2Cg/#match-summary
    http://www.flashscore.com/match/6gLKarcm/#match-summary
    ...
    ...
    

    PS: This works with Firefox version 32.0 and Selenium 2.43.0. A common error is to use versions of Selenium and Firefox that are not compatible with each other.