javaparsingxlm

How to get specific information from an XML file


I have a large XML file and below is an extract from it:

...
<LexicalEntry id="Ait~ifAq_1">
  <Lemma partOfSpeech="n" writtenForm="اِتِّفاق"/>
  <Sense id="Ait~ifAq_1_tawaAfuq_n1AR" synset="tawaAfuq_n1AR"/>
  <WordForm formType="root" writtenForm="وفق"/>
</LexicalEntry>
<LexicalEntry id="tawaA&amp;um__1">
  <Lemma partOfSpeech="n" writtenForm="تَوَاؤُم"/>
  <Sense id="tawaA&amp;um__1_AinosijaAm_n1AR" synset="AinosijaAm_n1AR"/>
  <WordForm formType="root" writtenForm="وأم"/>
</LexicalEntry>    
<LexicalEntry id="tanaAgum_2">
  <Lemma partOfSpeech="n" writtenForm="تناغُم"/>
  <Sense id="tanaAgum_2_AinosijaAm_n1AR" synset="AinosijaAm_n1AR"/>
  <WordForm formType="root" writtenForm="نغم"/>
</LexicalEntry>


<Synset baseConcept="3" id="tawaAfuq_n1AR">
  <SynsetRelations>
    <SynsetRelation relType="hyponym" targets="AinosijaAm_n1AR"/>
    <SynsetRelation relType="hyponym" targets="AinosijaAm_n1AR"/>
    <SynsetRelation relType="hypernym" targets="ext_noun_NP_420"/>
  </SynsetRelations>
  <MonolingualExternalRefs>
    <MonolingualExternalRef externalReference="13971065-n" externalSystem="PWN30"/>
  </MonolingualExternalRefs>
</Synset>
...

I want to extract specific information from it. For a given writtenForm, whether from <Lemma> or <WordForm>, the programme takes the value of synset from the <Sense> of that writtenForm (same <LexicalEntry>) and searches for every <Synset> whose id has the same value as that synset. After that, the programme gives us all the relations of that Synset, i.e. it displays the value of relType, then goes back to the <LexicalEntry> elements, looks for each <Sense> whose synset has the same value as targets, and displays its writtenForm.

I think it's a little bit complicated but the result should be like this:

اِتِّفاق hyponym تَوَاؤُم, اِنْسِجام

One possible solution is to use a stream reader because of the memory consumption, but I don't know how I should proceed to get what I want. Please help me.


Solution

  • The SAX parser is different from the DOM parser. It looks only at the current item; it cannot see future items until they become the current item. It is one of the many parsers you can use when the XML file is extremely big. Besides it, there are many others out there. To name a few:

    You can find tutorials for all of them here.

    In my opinion, after learning it, go straight to using DOM4J or JDOM for a commercial product.

    The logic of the SAX parser is that you have a handler class (UserHandler below) which extends DefaultHandler and overrides some of its methods:

    XML FILE:

    <?xml version="1.0"?>
    <class>
       <student rollno="393">
          <firstname>dinkar</firstname>
          <lastname>kad</lastname>
          <nickname>dinkar</nickname>
          <marks>85</marks>
       </student>
       <student rollno="493">
          <firstname>Vaneet</firstname>
          <lastname>Gupta</lastname>
          <nickname>vinni</nickname>
          <marks>95</marks>
       </student>
       <student rollno="593">
          <firstname>jasvir</firstname>
          <lastname>singn</lastname>
          <nickname>jazz</nickname>
          <marks>90</marks>
       </student>
    </class>
    

    Handler class:

    import org.xml.sax.Attributes;
    import org.xml.sax.SAXException;
    import org.xml.sax.helpers.DefaultHandler;
    
    /**
     * SAX handler that prints, for every {@code student} element, its
     * {@code rollno} attribute and the text of its {@code firstname},
     * {@code lastname}, {@code nickname} and {@code marks} child elements.
     *
     * <p>Element text is buffered and printed when the element ends, because a
     * SAX parser is allowed to deliver the text of one element through several
     * {@link #characters(char[], int, int)} calls. Printing from
     * {@code characters} directly would truncate the value to its first chunk.
     */
    public class UserHandler extends DefaultHandler {

       // Each flag is true while the parser is inside the matching element.
       boolean bFirstName = false;
       boolean bLastName = false;
       boolean bNickName = false;
       boolean bMarks = false;

       // Text collected so far for the tracked element that is currently open.
       private final StringBuilder text = new StringBuilder();

       /**
        * Prints the roll number when a {@code student} element starts, or marks
        * one of the tracked child elements as open and clears the text buffer.
        *
        * @param uri        namespace URI (unused)
        * @param localName  local name (unused)
        * @param qName      qualified element name, compared case-insensitively
        * @param attributes attributes of the element; only read for {@code student}
        * @throws SAXException never thrown here; declared by the overridden method
        */
       @Override
       public void startElement(String uri, 
       String localName, String qName, Attributes attributes)
          throws SAXException {
          if (qName.equalsIgnoreCase("student")) {
             String rollNo = attributes.getValue("rollno");
             System.out.println("Roll No : " + rollNo);
          } else if (qName.equalsIgnoreCase("firstname")) {
             bFirstName = true;
             text.setLength(0);
          } else if (qName.equalsIgnoreCase("lastname")) {
             bLastName = true;
             text.setLength(0);
          } else if (qName.equalsIgnoreCase("nickname")) {
             bNickName = true;
             text.setLength(0);
          } else if (qName.equalsIgnoreCase("marks")) {
             bMarks = true;
             text.setLength(0);
          }
       }

       /**
        * Prints the end marker for {@code student}, or the buffered text of the
        * tracked element that is closing, and clears that element's flag.
        *
        * @param uri       namespace URI (unused)
        * @param localName local name (unused)
        * @param qName     qualified element name, compared case-insensitively
        * @throws SAXException never thrown here; declared by the overridden method
        */
       @Override
       public void endElement(String uri, 
       String localName, String qName) throws SAXException {
          if (qName.equalsIgnoreCase("student")) {
             System.out.println("End Element :" + qName);
          } else if (bFirstName && qName.equalsIgnoreCase("firstname")) {
             System.out.println("First Name: " + text);
             bFirstName = false;
          } else if (bLastName && qName.equalsIgnoreCase("lastname")) {
             System.out.println("Last Name: " + text);
             bLastName = false;
          } else if (bNickName && qName.equalsIgnoreCase("nickname")) {
             System.out.println("Nick Name: " + text);
             bNickName = false;
          } else if (bMarks && qName.equalsIgnoreCase("marks")) {
             System.out.println("Marks: " + text);
             bMarks = false;
          }
       }

       /**
        * Appends a chunk of character data to the buffer while a tracked element
        * is open; text outside those elements (e.g. indentation) is ignored.
        *
        * @param ch     character array supplied by the parser
        * @param start  index of the first character of this chunk
        * @param length number of characters in this chunk
        * @throws SAXException never thrown here; declared by the overridden method
        */
       @Override
       public void characters(char ch[], 
          int start, int length) throws SAXException {
          if (bFirstName || bLastName || bNickName || bMarks) {
             // May be called more than once per element, so accumulate.
             text.append(ch, start, length);
          }
       }
    }
    

    Main Class :

    import java.io.File;
    import javax.xml.parsers.SAXParser;
    import javax.xml.parsers.SAXParserFactory;
    
    import org.xml.sax.Attributes;
    import org.xml.sax.SAXException;
    import org.xml.sax.helpers.DefaultHandler;
    
    /**
     * Command-line entry point that parses an XML file with a SAX parser and
     * hands every parsing event to a {@code UserHandler}.
     */
    public class SAXParserDemo {
       /**
        * Parses the XML file and lets the handler print what it finds.
        *
        * @param args optional; {@code args[0]} is the path of the XML file to
        *             parse. When absent, {@code "input.txt"} is used.
        */
       public static void main(String[] args){

          // Take the path from the command line, keeping the old default.
          String path = (args != null && args.length > 0) ? args[0] : "input.txt";

          try { 
             File inputFile = new File(path);
             SAXParserFactory factory = SAXParserFactory.newInstance();
             SAXParser saxParser = factory.newSAXParser();
             UserHandler userhandler = new UserHandler();
             saxParser.parse(inputFile, userhandler);     
          } catch (Exception e) {
             // Top-level boundary of the demo: report any failure and return.
             e.printStackTrace();
          }
       }   
    }