java html validation jsoup

How to validate HTML using Java? Getting issues with the jsoup library


I need to validate HTML using Java, so I tried the jsoup library. But some of my test cases are failing with it.

For example, this is my HTML content. I don't have any control over this content; I am getting it from an external source provider.

// HTML received from an external provider; it cannot be corrected at the source.
String invalidHtml = "<div id=\"myDivId\" ' class = claasnamee value='undaa' > <<p> p tagil vanne <br> <span> span close cheythillee!!  </p> </div>";

// Parse the string as a body fragment with jsoup.
doc = Jsoup.parseBodyFragment(invalidHtml);

For the above HTML I am getting this output:

<html>
 <head></head>
 <body>
  <div id="myDivId"  '=""  class="claasnamee" value="undaa">
    &lt;
   <p> p tagil vanne <br /> <span> span close cheythillee!! </span></p> 
  </div>
 </body>
</html>

The single quote in my string above comes out as '="" in the output. How can I fix this issue? Can anyone help me, please?


Solution

  • The best place to validate your HTML is http://validator.w3.org/, but using it by hand is a manual process. Don't worry, though: jsoup can do this for you as well, by submitting the HTML to the validator. The program below is a workaround, but it serves the purpose.

    import org.jsoup.Jsoup;
    import org.jsoup.nodes.Document;
    import org.jsoup.nodes.Element;
    
    public class JsoupValidate {
    
        public static void main(String[] args) throws Exception {
    
            String invalidHtml = "<div id=\"myDivId\" ' class = claasnamee value='undaa' > <<p> p tagil vanne <br> <span> span close cheythillee!!  </p> </div>";
    
            Document initialDoc = Jsoup.parseBodyFragment(invalidHtml);
    
            Document validatedDoc = Jsoup.connect("http://validator.w3.org/check")
                    .data("fragment", initialDoc.html())
                    .data("st", "1")
                    .post();
    
            System.out.println("******");
            System.out.println("Errors");
            System.out.println("******");
            for(Element error : validatedDoc.select("li.msg_err")){
                System.out.println(error.select("em").text() + " : " + error.select("span.msg").text());
            }
    
            System.out.println();
            System.out.println("**************");
            System.out.println("Cleaned output");
            System.out.println("**************");
            Document cleanedOuput = Jsoup.parse(validatedDoc.select("pre.source").text());
            cleanedOuput.select("meta[name=generator]").first().remove();
            cleanedOuput.outputSettings().indentAmount(4);
            cleanedOuput.outputSettings().prettyPrint(true);
            System.out.println(cleanedOuput.html());
        }
    
    }