xmlcsvxquerybasexxquery-3.1

XML to CSV with BaseX/XQuery


I'm trying to transform a big number of xmls to a single csv file. A simplified structure of the xml would look like this:

<Receipts>
    <Receipt>
        <Field1 attribute1="a"/>
        <Fields2>
            <Field2 attribute2="1"/>
            <Field2 attribute2="2"/>
        </Fields2>
        <Field4 attribute4="4a"/>
    </Receipt>
    <Receipt>
        <Field1 attribute1="b"/>
        <Field4 attribute4="4b"/>
    </Receipt>
    <Receipt>
        <Field1 attribute1="c"/>
        <Fields2>
            <Field2 attribute2="3"/>
        </Fields2>
        <Field3 attribute3="c3"/>
        <Field4 attribute4="4c"/>
    </Receipt>
</Receipts>

And the csv result that I would like to obtain is

Attribute1,Attribute2,Attribute3,Attribute4
a,1,,4a
a,2,,4a
b,,,4b
c,3,c3,4c

I've based my code on this answer, but I'only able to either have a line on the csv for each Receipt with all of the attribute2 concatenated, or only return the Receipts that have the Fields2 element with a Field2, that is: Either this:

Attribute1,Attribute2,Attribute3,Attribute4
a,1 2,,4a
b,,,4b
c,3,c3,4c

Or this:

Attribute1,Attribute2,Attribute3,Attribute4
a,1,,4a
a,2,,4a
c,3,c3,4c

My code for the first case would be:

declare option output:method "csv";
declare option output:csv "header=yes, separator=comma";

    declare context item := document {<Receipts>
    <Receipt>
        <Field1 attribute1="a"/>
        <Fields2>
            <Field2 attribute2="1"/>
            <Field2 attribute2="2"/>
        </Fields2>
        <Field4 attribute4="4a"/>
    </Receipt>
    <Receipt>
        <Field1 attribute1="b"/>
        <Field4 attribute4="4b"/>
    </Receipt>
    <Receipt>
        <Field1 attribute1="c"/>
        <Fields2>
            <Field2 attribute2="3"/>
        </Fields2>
        <Field3 attribute3="c3"/>
        <Field4 attribute4="4c"/>
    </Receipt>
</Receipts>};



for $x in //Receipt
return 
<csv>
  <record>
    <Attribute1>{$x/Field1/@attribute1/data()}</Attribute1>
    <Attribute2>{$x/Fields2/Field2/@attribute2/data()}</Attribute2>
    <Attribute3>{$x/Field3/@attribute3/data()}</Attribute3>
    <Attribute4>{$x/Field4/@attribute4/data()}</Attribute4>
  </record>
</csv>

And for the second case it would be:

declare option output:method "csv";
declare option output:csv "header=yes, separator=comma";

    declare context item := document {<Receipts>
    <Receipt>
        <Field1 attribute1="a"/>
        <Fields2>
            <Field2 attribute2="1"/>
            <Field2 attribute2="2"/>
        </Fields2>
        <Field4 attribute4="4a"/>
    </Receipt>
    <Receipt>
        <Field1 attribute1="b"/>
        <Field4 attribute4="4b"/>
    </Receipt>
    <Receipt>
        <Field1 attribute1="c"/>
        <Fields2>
            <Field2 attribute2="3"/>
        </Fields2>
        <Field3 attribute3="c3"/>
        <Field4 attribute4="4c"/>
    </Receipt>
</Receipts>};



for $x in //Receipt for $y in $x/Fields2/Field2
return 
<csv>
  <record>
    <Attribute1>{$x/Field1/@attribute1/data()}</Attribute1>
    <Attribute2>{$y/@attribute2/data()}</Attribute2>
    <Attribute3>{$x/Field3/@attribute3/data()}</Attribute3>
    <Attribute4>{$x/Field4/@attribute4/data()}</Attribute4>
  </record>
</csv>

Solution

  • After a deeper search, I found the solution. On the second option on the second for loop, you should add the allowing empty function, so that the code ends up looking like this:

    declare option output:method "csv";
    declare option output:csv "header=yes, separator=comma";
    
        declare context item := document {<Receipts>
        <Receipt>
            <Field1 attribute1="a"/>
            <Fields2>
                <Field2 attribute2="1"/>
                <Field2 attribute2="2"/>
            </Fields2>
            <Field4 attribute4="4a"/>
        </Receipt>
        <Receipt>
            <Field1 attribute1="b"/>
            <Field4 attribute4="4b"/>
        </Receipt>
        <Receipt>
            <Field1 attribute1="c"/>
            <Fields2>
                <Field2 attribute2="3"/>
            </Fields2>
            <Field3 attribute3="c3"/>
            <Field4 attribute4="4c"/>
        </Receipt>
    </Receipts>};
    
    
    
    for $x in //Receipt for $y allowing empty in $x/Fields2/Field2
    return 
    <csv>
      <record>
        <Attribute1>{$x/Field1/@attribute1/data()}</Attribute1>
        <Attribute2>{$y/@attribute2/data()}</Attribute2>
        <Attribute3>{$x/Field3/@attribute3/data()}</Attribute3>
        <Attribute4>{$x/Field4/@attribute4/data()}</Attribute4>
      </record>
    </csv>
    

    Which returns the desider CSV:

    Attribute1,Attribute2,Attribute3,Attribute4
    a,1,,4a
    a,2,,4a
    b,,,4b
    c,3,c3,4c