java selenium-webdriver web-scraping htmlunit-driver

How to do web scraping using HtmlUnitDriver?


I am getting something like this: (enter image description here)
Hi, I am scraping a web page using Selenium WebDriver and I am able to get my data, but the problem is that this interacts directly with a browser. I don't want to open a web browser; I want to scrape all the data as it is.

How can I achieve my goal?

Here is my code

    import org.openqa.selenium.By;
    import org.openqa.selenium.WebDriver;
    import org.openqa.selenium.WebElement;
    import org.openqa.selenium.firefox.FirefoxDriver;
    import org.openqa.selenium.support.ui.Select;
    
    /**
     * Loads the commodity page in Firefox, selects the "Jo" commodity, enters a
     * date, submits the form and prints the text of the result grid to standard
     * output.
     */
    public class GetData {

        /**
         * Runs the scrape once.
         *
         * @param args unused
         * @throws InterruptedException if the thread is interrupted during one of the fixed pauses
         */
        public static void main(String args[]) throws InterruptedException {
            String sDate = "27/03/2014";
            String url = "http://www.upmandiparishad.in/commodityWiseAll.aspx";

            WebDriver driver = new FirefoxDriver();
            try {
                driver.get(url);
                Thread.sleep(5000);

                // select the commodity by its visible text
                new Select(driver.findElement(By.id("ctl00_ContentPlaceHolder1_ddl_commodity"))).selectByVisibleText("Jo");
                // type the date into the rate-date field
                driver.findElement(By.id("ctl00_ContentPlaceHolder1_txt_rate")).sendKeys(sDate);
                Thread.sleep(3000);

                // click the "show" button to submit the form
                driver.findElement(By.id("ctl00_ContentPlaceHolder1_btn_show")).click();
                Thread.sleep(5000);

                // get only the table text
                WebElement findElement = driver.findElement(By.id("ctl00_ContentPlaceHolder1_GridView1"));
                String htmlTableText = findElement.getText();
                // do whatever you want now; these are the raw table values
                System.out.println(htmlTableText);
            } finally {
                // quit() closes every window and ends the session, so a separate
                // close() is not needed; running it in finally keeps the browser
                // from being left open when a lookup above throws.
                driver.quit();
            }
        }
    }


My updated new code:



import com.gargoylesoftware.htmlunit.BrowserVersion;
import org.openqa.selenium.By;
import org.openqa.selenium.WebDriver;
import org.openqa.selenium.WebElement;
import org.openqa.selenium.firefox.FirefoxDriver;
import org.openqa.selenium.htmlunit.HtmlUnitDriver;
import org.openqa.selenium.support.ui.Select;

    /**
     * Same scrape as the Firefox version, but driven through HtmlUnitDriver:
     * loads the commodity page, selects "Jo", enters a date, submits the form
     * and prints the text of the result grid to standard output.
     */
    public class Getdata1 {

        /**
         * Runs the scrape once.
         *
         * @param args unused
         * @throws InterruptedException if the thread is interrupted during one of the fixed pauses
         */
        public static void main(String args[]) throws InterruptedException {
            WebDriver driver = new HtmlUnitDriver(BrowserVersion.FIREFOX_3_6);
            try {
                driver.get("http://www.upmandiparishad.in/commodityWiseAll.aspx");
                // dump the fetched page so the loaded markup can be inspected
                System.out.println(driver.getPageSource());
                Thread.sleep(5000);

                // select the commodity by its visible text
                new Select(driver.findElement(By.id("ctl00_ContentPlaceHolder1_ddl_commodity"))).selectByVisibleText("Jo");

                String sDate = "12/04/2014"; // the date you want
                driver.findElement(By.id("ctl00_ContentPlaceHolder1_txt_rate")).sendKeys(sDate);

                // click the "show" button to submit the form
                driver.findElement(By.id("ctl00_ContentPlaceHolder1_btn_show")).click();
                Thread.sleep(3000);

                // get only the table text
                WebElement findElement = driver.findElement(By.id("ctl00_ContentPlaceHolder1_GridView1"));
                String htmlTableText = findElement.getText();
                // do whatever you want now; these are the raw table values
                System.out.println(htmlTableText);
            } finally {
                // quit() closes every window and ends the session, so a separate
                // close() is not needed; running it in finally releases the driver
                // even when a lookup above throws.
                driver.quit();
            }
        }
    }

Thanks in advance


Solution

  • Use HtmlUnit or HtmlUnitDriver by Selenium

        // Drive the page through HtmlUnitDriver instead of a real browser.
        WebDriver driver = new HtmlUnitDriver(BrowserVersion.FIREFOX_17);
        String htmlTableText;
        try {
            driver.get("http://www.upmandiparishad.in/commodityWiseAll.aspx");
            // dump the fetched page so the loaded markup can be inspected
            System.out.println(driver.getPageSource());
            Thread.sleep(5000);

            // select the commodity by its visible text
            new Select(driver.findElement(By.id("ctl00_ContentPlaceHolder1_ddl_commodity"))).selectByVisibleText("Jo");

            String sDate = "12/04/2014"; // the date you want
            driver.findElement(By.id("ctl00_ContentPlaceHolder1_txt_rate")).sendKeys(sDate);

            // click the "show" button to submit the form
            driver.findElement(By.id("ctl00_ContentPlaceHolder1_btn_show")).click();
            Thread.sleep(3000);

            // get only the table text
            WebElement findElement = driver.findElement(By.id("ctl00_ContentPlaceHolder1_GridView1"));
            htmlTableText = findElement.getText();
            // do whatever you want now; these are the raw table values
            System.out.println(htmlTableText);
        } finally {
            // quit() closes every window and ends the session, so a separate
            // close() is not needed; running it in finally releases the driver
            // even when a lookup above throws.
            driver.quit();
        }
    

    To get tabular output, you can try something like this:

        // Split the raw table text on single spaces; every token is printed as one cell.
        String arrCells[] = htmlTableText.split(" ");
        boolean bIsANumber = false;
        for (String cell : arrCells) {
            // A token that parses as an integer is treated as the start of a new
            // row, so it gets a line break in front of it.
            try {
                Integer.parseInt(cell);
                bIsANumber = true;
            } catch (NumberFormatException ex) {
                // not an integer: an ordinary cell that continues the current row
                bIsANumber = false;
            }

            System.out.print((bIsANumber ? "\n" : "") + cell + "\t");
        }