Tags: java, web-crawler, httpurlconnection, boilerpipe

java web crawler downloads too many GB data


I have coded a web crawler. But when crawling it downloads too many GBs of data.

I want to download only the text of each page (avoiding images, etc.).

I use Boilerpipe to extract the content from the HTML.

Here is how I find the final redirected url

/**
 * Manually follows HTTP 3xx redirects and returns the final URL.
 *
 * Uses HEAD requests: we only need the status code and the Location
 * header, so there is no reason to download the response body at every
 * hop (the original GET did exactly that, which is one source of the
 * excessive data volume).
 *
 * @param url the starting URL
 * @return the last URL in the redirect chain (the input URL if no redirect)
 * @throws IOException on a connection failure
 * @throws java.net.ProtocolException when more than
 *         Config.MAX_REDIRECT_COUNT redirects are encountered
 */
public String getFinalRedirectedUrl(String url) throws IOException{
    String finalUrl = url;
    int redirectCount = 0;
    while (true) {
        HttpURLConnection connection = (HttpURLConnection) new URL(finalUrl)
                .openConnection();
        try {
            connection.setConnectTimeout(Config.HTTP_CONNECTION_TIMEOUT_TIME);
            connection.setReadTimeout(Config.HTTP_READ_TIMEOUT_TIME);
            connection.setInstanceFollowRedirects(false);
            connection.setUseCaches(false);
            // HEAD instead of GET: status + headers only, no body download.
            connection.setRequestMethod("HEAD");
            connection.connect();
            int responseCode = connection.getResponseCode();
            if (responseCode < 300 || responseCode >= 400) {
                break; // not a redirect: finalUrl is the answer
            }
            String redirectedUrl = connection.getHeaderField("Location");
            if (null == redirectedUrl) {
                break; // malformed redirect without a Location header
            }
            finalUrl = redirectedUrl;
            redirectCount++;
            if (redirectCount > Config.MAX_REDIRECT_COUNT) {
                throw new java.net.ProtocolException("Server redirected too many  times ("+Config.MAX_REDIRECT_COUNT+")");
            }
        } finally {
            // Bug fix: the original only disconnected the *last* connection,
            // leaking one connection per redirect hop.
            connection.disconnect();
        }
    }
    return finalUrl;
}

This is how I fetch the url

/**
 * Downloads {@code url} and wraps the raw bytes in a Boilerpipe
 * {@link HTMLDocument}. Responses whose Content-Type is not text/html are
 * skipped (an empty document is returned) so the crawler does not pull
 * down images or other binary payloads.
 *
 * @param url the page to fetch
 * @return the page bytes plus detected charset, or an empty document for
 *         non-HTML content
 * @throws IOException on connection or read failure
 */
private HTMLDocument fetch(URL url) throws IOException{
    final HttpURLConnection httpcon = (HttpURLConnection) url.openConnection();
    // Bug fix: setFollowRedirects(...) is a *static* method; calling it
    // through an instance silently changed the JVM-wide default. The
    // per-connection setter is what was intended here.
    httpcon.setInstanceFollowRedirects(true);
    httpcon.setConnectTimeout(Config.HTTP_CONNECTION_TIMEOUT_TIME);
    httpcon.setReadTimeout(Config.HTTP_READ_TIMEOUT_TIME);
    httpcon.addRequestProperty("User-Agent", "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:10.0.2) Gecko/20100101 Firefox/10.0.2");
    try {
        final String ct = httpcon.getContentType();

        // Default charset when the server declares none (or an invalid one).
        Charset cs = Charset.forName("Cp1252");
        if (ct != null) {
            // Non-HTML content (images, PDFs, ...) is skipped entirely:
            // this check is what keeps binary data out of the crawl.
            if (!ct.contains("text/html")) {
                System.err.println("Content type is:" + ct);
                return new HTMLDocument("");
            }

            Matcher m = PAT_CHARSET.matcher(ct);
            if (m.find()) {
                final String charset = m.group(1);
                try {
                    cs = Charset.forName(charset);
                } catch (UnsupportedCharsetException | IllegalCharsetNameException e) {
                    // Declared charset is bogus; keep the Cp1252 default.
                }
            }
        }

        InputStream stream = httpcon.getInputStream();

        final String encoding = httpcon.getContentEncoding();
        if (encoding != null) {
            if ("gzip".equalsIgnoreCase(encoding)) {
                stream = new GZIPInputStream(stream);
            } else {
                System.err.println("WARN: unsupported Content-Encoding: " + encoding);
            }
        }

        // try-with-resources: the original leaked the stream whenever
        // read() threw (e.g. on a read timeout).
        final ByteArrayOutputStream bos = new ByteArrayOutputStream();
        try (InputStream in = stream) {
            final byte[] buf = new byte[4096];
            int r;
            while ((r = in.read(buf)) != -1) {
                bos.write(buf, 0, r);
            }
        }

        return new HTMLDocument(bos.toByteArray(), cs);
    } finally {
        // Release the connection on every exit path, including the early
        // return for non-HTML content and any thrown IOException.
        httpcon.disconnect();
    }
}

And to get the body using Boilerpipe

HTMLDocument htmlDoc = fetch(new URL(url));
String body = ArticleExtractor.INSTANCE.getText(htmlDoc.toInputSource());

How to reduce the amount of data downloaded?


Solution

  • Reduced the gigabytes downloaded and increased efficiency by using jsoup

    /**
     * Fetches {@code url} with jsoup, follows {@code <meta http-equiv="refresh">}
     * redirects manually (jsoup handles HTTP 3xx itself), and extracts the
     * article body with Boilerpipe.
     *
     * @param url        the URL to fetch on this hop
     * @param iniUrl     the URL the crawl originally started from (used for logging)
     * @param redirCount number of meta-refresh redirects already followed
     * @return map with RETURN_FINAL_URL mapped to the resolved URL and
     *         RETURN_BODY mapped to the extracted text (empty on failure)
     * @throws IOException when the HTTP fetch fails
     */
    public HashMap<String, String> fetchWithJsoup(String url, String iniUrl, int redirCount)
                                            throws IOException
    {
        HashMap<String, String> returnObj = new HashMap<>();

        Connection con;
        try{
            con = Jsoup.connect(url);
        }catch(IllegalArgumentException ex){
            // Bug fix: getMessage() may be null, which made the original
            // contains() call throw a NullPointerException inside the catch.
            String msg = ex.getMessage();
            if(msg != null && msg.contains("Malformed URL")){
                System.err.println("Malformed URL:: "
                    +ex.getClass().getName()+": "+msg+" > "+iniUrl);
            }else{
                Logger.getLogger(ContentGetter.class.getName()).log(Level.SEVERE, null, ex);
            }
            returnObj.put(RETURN_FINAL_URL, url);
            returnObj.put(RETURN_BODY, "");
            return returnObj;
        }

        con.userAgent("Mozilla/5.0 (Windows NT 6.1; WOW64; rv:10.0.2) Gecko/20100101 Firefox/10.0.2");

        con.timeout(Config.HTTP_CONNECTION_TIMEOUT_TIME);
        Document doc = con.get();

        String uri = doc.baseUri();
        returnObj.put(RETURN_FINAL_URL, uri);

        // HTTP redirects were already followed by jsoup; only an HTML
        // <meta http-equiv="refresh"> redirect needs manual handling.
        Elements redirEle = doc.head().select("meta[http-equiv=refresh]");
        if(redirEle.size() > 0){
            String content = redirEle.get(0).attr("content");
            // NOTE(review): compiled per call; could be hoisted to a
            // static final field on the enclosing class.
            Pattern pattern = Pattern.compile("^.*URL=(.+)$", Pattern.CASE_INSENSITIVE);
            Matcher matcher = pattern.matcher(content);
            if (matcher.matches() && matcher.groupCount() > 0) {
                String redirectUrl = matcher.group(1);
                if(redirectUrl.startsWith("'")){
                    /*removes single quotes of urls within single quotes*/
                    redirectUrl = redirectUrl.replaceAll("(^')|('$)","");
                }
                if(redirectUrl.startsWith("/")){
                    // Relative redirect: rebuild an absolute URL from the
                    // scheme ([0]) and host ([2]) of the current URL.
                    String[] splitedUrl = url.split("/");
                    redirectUrl = splitedUrl[0]+"//"+splitedUrl[2]+redirectUrl;
                }
                if(!redirectUrl.equals(url)){
                    redirCount++;
                    // Stop recursing once the redirect budget is exhausted;
                    // fall through and extract whatever page we have.
                    if(redirCount < Config.MAX_REDIRECT_COUNT){
                        return fetchWithJsoup(redirectUrl, iniUrl, redirCount);
                    }
                }
            }
        }

        // htmlDoc is freshly constructed and can never be null here;
        // the original's null check was dead code.
        HTMLDocument htmlDoc = new HTMLDocument(doc.html());
        String body = "";
        try{
            body = ArticleExtractor.INSTANCE.getText(htmlDoc.toInputSource());
        }catch(OutOfMemoryError ex){
            // NOTE(review): catching OutOfMemoryError is best-effort and
            // generally unsafe; kept to preserve the crawler's
            // keep-going-at-all-costs behavior.
            System.err.println("OutOfMemoryError while extracting text !!!!!!!!");
            System.gc();
        } catch (BoilerpipeProcessingException ex) {
            Logger.getLogger(ContentGetter.class.getName()).log(Level.SEVERE, null, ex);
        }
        returnObj.put(RETURN_BODY, body);

        return returnObj;
    }