java xml xml-parsing stax vtd-xml

Splitting of a large XML file into small Chunks based on repeated elements


Consider the following XML with 500 MB data

<?xml version="1.0" encoding="UTF-8"?>
<Parents>
  <process  Child ="A">...</process>
  <process  Child="B">...</process>
  <process  Child="A">...</process>
  <process  Child="C">..</process>
  <process Child=...
  </process>
</Parents>

This XML has many <process> elements whose Child attribute is "A", "B", or some other value. I want to create a separate XML file for each value ("A", "B", "C", and so on), for example example_A.xml, example_B.xml, etc. The code below instead creates a separate XML file for every element, so if there are 500 <process> elements it creates 500 XML files.

/**
 * Splits the input document into one output file per matched element.
 *
 * <p>Parses the source file with VTD-XML, selects every {@code process}
 * element directly under {@code Parents}, and writes the raw bytes of each
 * element fragment to its own numbered {@code result<N>.xml} file.
 *
 * <p>Any exception raised while parsing or writing is caught and its stack
 * trace is printed; nothing is rethrown.
 *
 * @param args command-line arguments (unused)
 */
public static void main(String args[]) {
        try {
            // Use one consistent name for the parser instance.
            VTDGen vg = new VTDGen();
            if (vg.parseFile("C:\\..\\example.xml", true)) {
                VTDNav vn = vg.getNav();
                AutoPilot ap = new AutoPilot(vn);
                // The repeated element in the sample document is <process>.
                ap.selectXPath("/Parents/process");
                // Fetch the document bytes once instead of on every iteration.
                byte[] xml = vn.getXML().getBytes();
                int chunk = 0;
                while (ap.evalXPath() != -1) {
                    // Low 32 bits are used as the offset, high 32 bits as the length.
                    long frag = vn.getElementFragment();
                    int offset = (int) frag;
                    int length = (int) (frag >> 32);
                    // try-with-resources closes each output file as soon as it is written.
                    try (FileOutputStream out = new FileOutputStream("C:\\....\\result" + chunk + ".xml")) {
                        out.write(xml, offset, length);
                    }
                    chunk++;
                }
            }
        } catch (Exception ex) {
            ex.printStackTrace();
        }
}

Now the thing is, I want to split the file by the value of the Child attribute, grouping elements that share the same value: for instance, all elements with Child="A" should go into example_A.xml, and likewise for B, C, and the others.


Solution

  • It is a very simple modification to your existing code. There are actually multiple ways to do this; I am just going to show you one of them: explicitly comparing the attribute value using VTDNav's getAttrVal() method.

    public static void main1(String args[]) {
        try {
            VTDGen vg = new VTDGen();
            if (vg.parseFile("C:\\..\\example.xml", true)) {
                VTDNav vn = vg.getNav();
                AutoPilot ap = new AutoPilot(vn);
                ap.selectXPath("/Parents/process");
                int  chunk = 0;
                FileOutputStream fopsA=(new FileOutputStream("C:\\....\\resultA" + chunk + ".xml"));
                fopsA.write("<Parent>\n".getBytes());
                FileOutputStream fopsB=(new FileOutputStream("C:\\....\\resultB" + chunk + ".xml"));
                while (( ap.evalXPath()) != -1) {
                    long frag = vn.getElementFragment();
                    int i=vn.getAttrVal("Child");
                    if (i==-1) throw new NavException("unexpected result");
                    if  (vn.compareTokenString(i,"A")==0){
    
                        fopsA.write(vn.getXML().getBytes(), (int) frag,
                            (int) (frag >> 32));
    
                    }else if  (vn.compareTokenString(i,"B")==0){
    
                        fopsB.write(vn.getXML().getBytes(), (int) frag,
                                (int) (frag >> 32));
                    }
                    chunk++;
                }
    
                fopsA.write("</Parent>\n".getBytes());
                fopsB.write("</Parent>\n".getBytes());
            }
        } catch (Exception ex) {
            ex.printStackTrace();
        }