I have written a web crawler, but while crawling it downloads many gigabytes of data.
I want to fetch only the text and skip images and other non-text resources.
I use Boilerpipe to extract the content from the HTML.
Here is how I find the final redirected URL:
/**
 * Follows HTTP 3xx redirects by hand and returns the last URL reached.
 *
 * <p>Every hop is requested with automatic redirect handling switched off so
 * that the {@code Location} header can be inspected. The response body is
 * never read, and each connection is disconnected before the next hop is
 * opened.
 *
 * @param url the URL to start from
 * @return the URL of the first response that is not a 3xx redirect, or the
 *         URL of a 3xx response that carries no {@code Location} header
 * @throws java.net.ProtocolException if more than
 *         {@code Config.MAX_REDIRECT_COUNT} redirects are encountered
 * @throws IOException if a URL cannot be parsed or a connection fails
 */
public String getFinalRedirectedUrl(String url) throws IOException {
    String finalUrl = url;
    int redirectCount = 0;
    while (true) {
        HttpURLConnection connection = (HttpURLConnection) new URL(finalUrl).openConnection();
        try {
            connection.setConnectTimeout(Config.HTTP_CONNECTION_TIMEOUT_TIME);
            connection.setReadTimeout(Config.HTTP_READ_TIMEOUT_TIME);
            connection.setInstanceFollowRedirects(false);
            connection.setUseCaches(false);
            connection.setRequestMethod("GET");
            connection.connect();

            int responseCode = connection.getResponseCode();
            if (responseCode < 300 || responseCode >= 400) {
                // Not a redirect: this is where the chain ends.
                return finalUrl;
            }
            String redirectedUrl = connection.getHeaderField("Location");
            if (redirectedUrl == null) {
                // A 3xx without a target cannot be followed any further.
                return finalUrl;
            }
            // Location may be relative; resolve it against the URL just requested.
            finalUrl = new URL(new URL(finalUrl), redirectedUrl).toString();
            redirectCount++;
            if (redirectCount > Config.MAX_REDIRECT_COUNT) {
                throw new java.net.ProtocolException("Server redirected too many times ("+Config.MAX_REDIRECT_COUNT+")");
            }
        } finally {
            // Release this hop's connection on every path, including exceptions.
            connection.disconnect();
        }
    }
}
This is how I fetch the URL:
/**
 * Downloads the resource at {@code url} and wraps its bytes in an
 * {@link HTMLDocument}.
 *
 * <p>Responses whose {@code Content-Type} does not contain {@code text/html}
 * are not read at all: the connection is dropped and an empty document is
 * returned. The charset is taken from the {@code Content-Type} header when it
 * names a supported one, otherwise Cp1252 is used. Gzip-compressed bodies are
 * requested and decompressed.
 *
 * @param url the page to download
 * @return the downloaded document, or an empty document for non-HTML content
 * @throws IOException if the connection or the read fails
 */
private HTMLDocument fetch(URL url) throws IOException {
    final HttpURLConnection httpcon = (HttpURLConnection) url.openConnection();
    // setFollowRedirects is static and would change the setting JVM-wide;
    // the per-instance setter affects only this connection.
    httpcon.setInstanceFollowRedirects(true);
    httpcon.setConnectTimeout(Config.HTTP_CONNECTION_TIMEOUT_TIME);
    httpcon.setReadTimeout(Config.HTTP_READ_TIMEOUT_TIME);
    httpcon.addRequestProperty("User-Agent", "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:10.0.2) Gecko/20100101 Firefox/10.0.2");
    // Ask for compressed bodies; the gzip branch below decodes them.
    httpcon.addRequestProperty("Accept-Encoding", "gzip");

    final String ct = httpcon.getContentType();
    Charset cs = Charset.forName("Cp1252");
    if (ct != null) {
        if (!ct.contains("text/html")) {
            System.err.println("Content type is:"+ct);
            // Drop the connection so the unwanted body is not left open.
            httpcon.disconnect();
            return new HTMLDocument("");
        }
        Matcher m = PAT_CHARSET.matcher(ct);
        if (m.find()) {
            final String charset = m.group(1);
            try {
                cs = Charset.forName(charset);
            } catch (UnsupportedCharsetException | IllegalCharsetNameException e) {
                // keep default
            }
        }
    }

    final ByteArrayOutputStream bos = new ByteArrayOutputStream();
    InputStream in = httpcon.getInputStream();
    try {
        final String encoding = httpcon.getContentEncoding();
        if (encoding != null) {
            if ("gzip".equalsIgnoreCase(encoding) || "x-gzip".equalsIgnoreCase(encoding)) {
                in = new GZIPInputStream(in);
            } else {
                System.err.println("WARN: unsupported Content-Encoding: "+encoding);
            }
        }
        final byte[] buf = new byte[4096];
        int r;
        while ((r = in.read(buf)) != -1) {
            bos.write(buf, 0, r);
        }
    } finally {
        // Closes the gzip wrapper if one was created, otherwise the raw stream.
        in.close();
    }
    return new HTMLDocument(bos.toByteArray(), cs);
}
And this is how I extract the body text using Boilerpipe:
// Download the page first, then hand the document to Boilerpipe's
// ArticleExtractor to obtain the extracted text.
final URL pageUrl = new URL(url);
HTMLDocument htmlDoc = fetch(pageUrl);
String body = ArticleExtractor.INSTANCE
        .getText(htmlDoc.toInputSource());
How can I reduce the amount of data downloaded?
I reduced the amount of data downloaded and improved efficiency by using jsoup:
/**
 * Fetches a page with jsoup, follows {@code <meta http-equiv="refresh">}
 * redirects, and extracts the article text with Boilerpipe.
 *
 * <p>The returned map holds two entries: {@code RETURN_FINAL_URL} (the base
 * URI of the document that was finally parsed) and {@code RETURN_BODY} (the
 * extracted text, or an empty string when extraction fails or the URL is
 * rejected by jsoup).
 *
 * @param url        the URL to fetch on this call
 * @param iniUrl     the URL the crawl originally started from; used only in
 *                   the malformed-URL message
 * @param redirCount number of meta-refresh redirects followed so far
 * @return a map with the final URL and the extracted body text
 * @throws IOException if jsoup fails to download the page
 */
public HashMap<String, String> fetchWithJsoup(String url, String iniUrl, int redirCount)
        throws IOException
{
    HashMap<String, String> returnObj = new HashMap<>();
    Connection con;
    try {
        con = Jsoup.connect(url);
    } catch (IllegalArgumentException ex) {
        // getMessage() may be null, so guard before inspecting it.
        String message = ex.getMessage();
        if (message != null && message.contains("Malformed URL")) {
            System.err.println("Malformed URL:: "
                    +ex.getClass().getName()+": "+message+" > "+iniUrl);
        } else {
            Logger.getLogger(ContentGetter.class.getName()).log(Level.SEVERE, null, ex);
        }
        returnObj.put(RETURN_FINAL_URL, url);
        returnObj.put(RETURN_BODY, "");
        return returnObj;
    }
    con.userAgent("Mozilla/5.0 (Windows NT 6.1; WOW64; rv:10.0.2) Gecko/20100101 Firefox/10.0.2");
    con.timeout(Config.HTTP_CONNECTION_TIMEOUT_TIME);
    Document doc = con.get();
    String uri = doc.baseUri();
    returnObj.put(RETURN_FINAL_URL, uri);

    String redirectUrl = findMetaRefreshTarget(doc, uri);
    // A refresh that points back at the requested URL would loop; ignore it.
    if (redirectUrl != null && !redirectUrl.equals(url)) {
        redirCount++;
        if (redirCount < Config.MAX_REDIRECT_COUNT) {
            return fetchWithJsoup(redirectUrl, iniUrl, redirCount);
        }
        // Redirect budget exhausted: fall through and extract from this page.
    }

    HTMLDocument htmlDoc = new HTMLDocument(doc.html());
    String body = "";
    try {
        body = ArticleExtractor.INSTANCE.getText(htmlDoc.toInputSource());
    } catch (OutOfMemoryError ex) {
        // Best effort: give up on this page's text and keep going.
        System.err.println("OutOfMemoryError while extracting text !!!!!!!!");
        System.gc();
    } catch (BoilerpipeProcessingException ex) {
        Logger.getLogger(ContentGetter.class.getName()).log(Level.SEVERE, null, ex);
    }
    returnObj.put(RETURN_BODY, body);
    return returnObj;
}

/**
 * Returns the target of the first {@code <meta http-equiv="refresh">} element
 * in the document head, resolved against {@code baseUri}, or {@code null}
 * when the document declares no usable refresh target.
 */
private static String findMetaRefreshTarget(Document doc, String baseUri) {
    Elements redirEle = doc.head().select("meta[http-equiv=refresh]");
    if (redirEle.isEmpty()) {
        return null;
    }
    String content = redirEle.get(0).attr("content");
    // Reluctant prefix: stop at the FIRST "URL=" so that a target whose query
    // string itself contains "url=" is captured whole.
    Pattern pattern = Pattern.compile("^.*?URL\\s*=\\s*(.+)$", Pattern.CASE_INSENSITIVE);
    Matcher matcher = pattern.matcher(content);
    if (!matcher.matches()) {
        return null;
    }
    String target = matcher.group(1).trim();
    // Remove quotes around the URL (single or double).
    if (target.startsWith("'") || target.startsWith("\"")) {
        char quote = target.charAt(0);
        target = target.substring(1);
        if (!target.isEmpty() && target.charAt(target.length() - 1) == quote) {
            target = target.substring(0, target.length() - 1);
        }
        target = target.trim();
    }
    if (target.isEmpty()) {
        return null;
    }
    try {
        // Handles absolute, host-relative ("/x") and path-relative ("x") targets.
        return new URL(new URL(baseUri), target).toString();
    } catch (java.net.MalformedURLException ex) {
        // Leave it unresolved; the next Jsoup.connect call reports it.
        return target;
    }
}