Tags: java, multithreading, jsoup, executorservice, executors

ExecutorService never stops when new tasks are executed inside another executing task


Good day.

I have a blocker issue with my web crawler project. The logic is simple. First, one Runnable is created; it downloads an HTML document, scans it for all links, and then creates a new Runnable object for each found link. Each newly created Runnable in turn creates and executes new Runnable objects for each link it finds.

The problem is that the ExecutorService never stops.

CrawlerTest.java

public class CrawlerTest {

    public static void main(String[] args) throws InterruptedException {
        new CrawlerService().crawlInternetResource("https://jsoup.org/");
    }
}

CrawlerService.java

import java.io.IOException;
import java.util.Collections;
import java.util.Set;
import java.util.concurrent.ConcurrentHashMap;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import java.util.concurrent.TimeUnit;

import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;

public class CrawlerService {

    private Set<String> uniqueUrls = Collections.newSetFromMap(new ConcurrentHashMap<String, Boolean>(10000));
    private ExecutorService executorService = Executors.newFixedThreadPool(8);
    private String baseDomainUrl;

    public void crawlInternetResource(String baseDomainUrl) throws InterruptedException {
        this.baseDomainUrl = baseDomainUrl;
        System.out.println("Start");
        executorService.execute(new Crawler(baseDomainUrl)); // Run the first task to scan the main domain page; this task produces new tasks.
        executorService.awaitTermination(10, TimeUnit.MINUTES);
        System.out.println("End");
    }

    private class Crawler implements Runnable { // Inner class that encapsulates a crawling task and scans for links

        private String urlToCrawl;

        public Crawler(String urlToCrawl) {
            this.urlToCrawl = urlToCrawl;
        }

        public void run() {
            try {
                findAllLinks();
            } catch (InterruptedException e) {
                e.printStackTrace();
            }
        }

        private void findAllLinks() throws InterruptedException {
            /* Try to add the URL to the collection; if the URL is unique, add it,
             * scan the document, and submit a new task for each found link */
            if (uniqueUrls.add(urlToCrawl)) { 
                System.out.println(urlToCrawl);

                Document htmlDocument = loadHtmlDocument(urlToCrawl);
                if (htmlDocument == null) { // page failed to load; nothing to scan
                    return;
                }
                Elements foundLinks = htmlDocument.select("a[href]");

                for (Element link : foundLinks) {
                    String absLink = link.attr("abs:href");
                    if (absLink.contains(baseDomainUrl) && !absLink.contains("#")) { // make sure we don't leave the base domain
                        executorService.execute(new Crawler(absLink)); // submit a new task for each found link
                    }
                }
            }
        }

        private Document loadHtmlDocument(String internetResourceUrl) {
            Document document = null;
            try {
                document = Jsoup.connect(internetResourceUrl).ignoreHttpErrors(true).ignoreContentType(true)
                        .userAgent("Mozilla/5.0 (Windows NT 6.1; WOW64; rv:48.0) Gecko/20100101 Firefox/48.0")
                        .timeout(10000).get();
            } catch (IOException e) {
                System.out.println("Page load error");
                e.printStackTrace();
            }
            return document;
        }
    }
}

The app needs about 20 seconds to scan jsoup.org for all unique links. But it just waits the full 10 minutes at executorService.awaitTermination(10, TimeUnit.MINUTES);, and then I see a dead main thread and a still-working executor.

[screenshot: the executor's pool threads are still running after the main thread has died]

How do I force the ExecutorService to stop correctly?

I think the problem is that executorService.execute is invoked inside another task instead of in the main thread.
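
The endless wait can be reproduced with a tiny standalone sketch (the class name AwaitTerminationDemo is mine, just for illustration). Without a shutdown() call, awaitTermination simply blocks for the whole timeout and then returns false:

import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import java.util.concurrent.TimeUnit;

public class AwaitTerminationDemo {

    public static void main(String[] args) throws InterruptedException {
        ExecutorService pool = Executors.newFixedThreadPool(1);
        pool.execute(() -> System.out.println("task done"));

        // awaitTermination does not stop the pool; it only waits for a shutdown
        // that was never requested, so this call blocks for the full 5 seconds.
        boolean terminated = pool.awaitTermination(5, TimeUnit.SECONDS);
        System.out.println("terminated = " + terminated); // prints "terminated = false"

        pool.shutdown(); // without this, the non-daemon worker thread keeps the JVM alive
    }
}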


Solution

  • I see your comment from earlier:

    I can't use a CountDownLatch because I don't know beforehand how many unique links I will collect from the resource.

    First off, vsminkov is spot on with the answer as to why awaitTermination will sit and wait for 10 minutes. I will offer an alternate solution.

    Instead of using a CountDownLatch, use a Phaser. For each new task, you can register a party and await completion.

    Create a single Phaser, register each time executorService.execute is invoked, and arrive each time a Runnable completes.

    public void crawlInternetResource(String baseDomainUrl) {
        this.baseDomainUrl = baseDomainUrl;
    
        Phaser phaser = new Phaser();
        executorService.execute(new Crawler(phaser, baseDomainUrl)); 
        int phase = phaser.getPhase();
        phaser.awaitAdvance(phase); // block until the phase advances, i.e. every registered Crawler has arrived
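        // Addition beyond the original answer: once awaitAdvance returns, no task
        // will submit new work, so the pool can be shut down and the JVM exits cleanly.
        executorService.shutdown();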
    }
    
    private class Crawler implements Runnable { 
    
        private final Phaser phaser;
        private String urlToCrawl;
    
        public Crawler(Phaser phaser, String urlToCrawl) {
            this.urlToCrawl = urlToCrawl;
            this.phaser = phaser;
            phaser.register(); // register new task
        }
    
        public void run() {
            try {
                findAllLinks(); // crawling logic unchanged from the question
            } finally {
                phaser.arrive(); // arrive in finally so a failed task cannot block awaitAdvance
            }
        }