Tags: xsd, saxon, xmlschema, xpath-3.1

How can I read the schema (XSD) from Saxon after loading an XML & XSD file?


Our program displays a tree control showing the metadata structure of the XML file the user has chosen as a datasource. It displays all elements & attributes in use in the XML file, like this:

Employees
  Employee
    FirstName
    LastName
Orders
  Order
    OrderId

For the case where the user does not pass us an XSD file, we need to walk the XML file and build up the metadata structure.

The full code for this is at SaxonQuestions.zip, TestBuildTreeWithSchema.java and is also listed below.

The below code works but it has a problem. Suppose under Employee there's an element for SpouseName. This is only populated if the employee is married. What if the sample data file I have is all unmarried employees? Then the below code does not know there's a SpouseName element.

So my question is: how can I read the schema directly, instead of using the code below? If I read the schema, then I get every node & attribute, including the optional ones. I also get the type. And the schema optionally has a description for each node, and I get that too.

Therefore, I need to read the schema itself. How can I do that?

And secondary question - why is the type for an int BigInteger instead of Integer or Long? I see this with Employee/@EmployeeID in Southwind.xml & Southwind.xsd.

TestBuildTreeWithSchema.java

import net.sf.saxon.s9api.*;

import java.io.File;
import java.io.FileInputStream;
import java.util.ArrayList;
import java.util.List;

/**
 * Builds a metadata tree (element/attribute names plus the Java class of each typed value)
 * by walking a sample XML document that was loaded through {@link XmlDatasource}.
 *
 * <p>Only nodes that actually occur in the sample document are discovered: an optional
 * element or attribute that is absent from the data will not appear in the resulting tree.
 *
 * <p>The walker keeps a single cursor, {@link #currentNode}; {@code addNode()} always leaves
 * the cursor on the node it started on.
 */
public class TestBuildTreeWithSchema {

    /**
     * Loads {@code files/SouthWind.xml} together with {@code files/SouthWind.xsd}, builds the
     * metadata tree starting at the document's root element and prints it to standard output.
     *
     * @param args unused
     * @throws Exception if the files cannot be read or Saxon fails to load/validate them
     */
    public static void main(String[] args) throws Exception {

        XmlDatasource datasource;
        // The XmlDatasource constructor hands both streams to Saxon (schema load and document
        // build) before it returns, so the streams are closed as soon as construction finishes.
        try (FileInputStream xmlStream = new FileInputStream(new File("files", "SouthWind.xml").getCanonicalPath());
             FileInputStream xsdStream = new FileInputStream(new File("files", "SouthWind.xsd").getCanonicalPath())) {
            datasource = new XmlDatasource(xmlStream, xsdStream);
        }

        // get the root element (the document node may also have comment / PI children)
        XdmNode rootNode = null;
        for (XdmNode node : datasource.getXmlRootNode().children()) {
            if (node.getNodeKind() == XdmNodeKind.ELEMENT) {
                rootNode = node;
                break;
            }
        }
        if (rootNode == null) {
            throw new IllegalStateException("The XML document has no root element");
        }

        TestBuildTreeWithSchema buildTree = new TestBuildTreeWithSchema(rootNode);
        Element root = buildTree.addNode();

        System.out.println("Schema:");
        printElement("", root);
    }

    /**
     * Prints {@code element}, its attributes and (recursively) its children, indenting each
     * nesting level by two spaces.
     *
     * @param indent  prefix written before the element line
     * @param element the element to print
     */
    private static void printElement(String indent, Element element) {
        System.out.println(indent + "<" + element.name + "> (" + simpleNameOf(element.type) + ")");
        String childIndent = indent + "  ";
        for (Attribute attr : element.attributes) {
            System.out.println(childIndent + "=" + attr.name + " (" + simpleNameOf(attr.type) + ")");
        }
        for (Element child : element.children) {
            printElement(childIndent, child);
        }
    }

    /** Returns the simple class name, or the text {@code "null"} when no type is known. */
    private static String simpleNameOf(Class<?> type) {
        return type == null ? "null" : type.getSimpleName();
    }

    /** Cursor of the walk; always an {@link XdmNode} while the tree is being built. */
    protected XdmItem currentNode;

    /**
     * @param currentNode the node the walk starts from (expected to be an element node)
     */
    public TestBuildTreeWithSchema(XdmItem currentNode) {
        this.currentNode = currentNode;
    }

    /**
     * Builds the metadata for the element under the cursor, including its attributes and,
     * recursively, its child elements. Repeated children with the same local name are merged
     * into a single entry.
     *
     * @return the metadata element for the node under the cursor
     * @throws SaxonApiException if the typed value of an attribute cannot be obtained
     */
    private Element addNode() throws SaxonApiException {

        XdmNode elementNode = (XdmNode) currentNode;
        String name = elementNode.getNodeName().getLocalName();

        // getTypedValue() fails for an element with element-only content; such an element
        // carries no data of its own, so it gets no value type.
        Class<?> valueClass;
        try {
            valueClass = atomicValueClass(elementNode.getTypedValue());
        } catch (SaxonApiException ex) {
            valueClass = null;
        }
        Element element = new Element(name, valueClass, null);

        // add in attributes
        XdmSequenceIterator<XdmNode> currentSequence;
        if ((currentSequence = moveTo(Axis.ATTRIBUTE)) != null) {
            do {
                XdmNode attrNode = (XdmNode) currentNode;
                String attrName = attrNode.getNodeName().getLocalName();
                Class<?> attrClass = atomicValueClass(attrNode.getTypedValue());
                element.attributes.add(new Attribute(attrName, attrClass, null));
            } while (moveToNextInCurrentSequence(currentSequence));
            // step back from the last attribute to the owning element
            moveTo(Axis.PARENT);
        }

        // add in children elements
        if ((currentSequence = moveTo(Axis.CHILD)) != null) {
            do {
                Element child = addNode();
                Element existing = element.getChildByName(child.name);
                if (existing == null) {
                    // first occurrence of this child name
                    element.children.add(child);
                } else {
                    // already known: only add attributes / children it does not have yet
                    existing.addNewItems(child);
                }
            } while (moveToNextInCurrentSequence(currentSequence));
            // step back from the last child element to this element
            moveTo(Axis.PARENT);
        }

        return element;
    }

    /**
     * Maps a typed value to the Java class Saxon uses to represent it.
     *
     * <p>Per the s9api documentation, {@code XdmAtomicValue.getValue()} returns
     * {@code java.math.BigInteger} for {@code xs:integer} and the types derived from it,
     * which is why an {@code xs:int} attribute is reported as {@code BigInteger}.
     *
     * @param typedValue the typed value of a node
     * @return the class of the single atomic value, or {@code null} when the typed value is
     *         not exactly one atomic value (an empty sequence or a list-typed value)
     */
    private static Class<?> atomicValueClass(XdmValue typedValue) {
        if (typedValue instanceof XdmAtomicValue) {
            return ((XdmAtomicValue) typedValue).getValue().getClass();
        }
        return null;
    }

    /** Tells whether {@code node} is an element or an attribute node. */
    private static boolean isElementOrAttribute(XdmNode node) {
        XdmNodeKind kind = node.getNodeKind();
        return kind == XdmNodeKind.ELEMENT || kind == XdmNodeKind.ATTRIBUTE;
    }

    /**
     * Moves the cursor to the first element or attribute found on {@code axis}. When no such
     * node exists the cursor is left untouched.
     *
     * @param axis the axis to search, relative to the cursor
     * @return the partially consumed iterator when a node was found on the ATTRIBUTE, CHILD or
     *         NAMESPACE axis (so the caller can continue with
     *         {@link #moveToNextInCurrentSequence}); {@code null} when nothing was found or the
     *         axis is any other one (for example PARENT)
     */
    private XdmSequenceIterator<XdmNode> moveTo(Axis axis) {

        XdmSequenceIterator<XdmNode> en = ((XdmNode) currentNode).axisIterator(axis);

        while (en.hasNext()) {
            XdmNode candidate = en.next();
            if (isElementOrAttribute(candidate)) {
                // commit the cursor only once a matching node is found
                currentNode = candidate;
                boolean resumable = axis == Axis.ATTRIBUTE || axis == Axis.CHILD || axis == Axis.NAMESPACE;
                return resumable ? en : null;
            }
        }
        return null;
    }

    /**
     * Moves the cursor to the next element or attribute delivered by {@code currentSequence}.
     * When the sequence holds no further such node the cursor is left untouched.
     *
     * @param currentSequence iterator previously returned by {@link #moveTo}; may be {@code null}
     * @return {@code true} if the cursor was moved, {@code false} if the sequence is exhausted
     */
    private boolean moveToNextInCurrentSequence(XdmSequenceIterator<XdmNode> currentSequence)
    {
        if (currentSequence == null) {
            return false;
        }
        while (currentSequence.hasNext())
        {
            XdmNode candidate = currentSequence.next();
            if (isElementOrAttribute(candidate)) {
                currentNode = candidate;
                return true;
            }
        }
        return false;
    }

    /** Common metadata of an element or attribute: name, value type and optional description. */
    static class Node {
        /** Local name of the XML node. */
        String name;
        /** Java class of the node's typed value, or {@code null} when it has none. */
        Class<?> type;
        /** Free-text description; the sample walker always passes {@code null}. */
        String description;

        public Node(String name, Class<?> type, String description) {
            this.name = name;
            this.type = type;
            this.description = description;
        }
    }

    /** Metadata of an element, with its distinct child elements and attributes. */
    static class Element extends Node {
        List<Element> children;
        List<Attribute> attributes;

        public Element(String name, Class<?> type, String description) {
            super(name, type, description);
            children = new ArrayList<>();
            attributes = new ArrayList<>();
        }

        /**
         * @param name local name to look for
         * @return the direct child with that name, or {@code null} if there is none
         */
        public Element getChildByName(String name) {
            for (Element child : children) {
                if (child.name.equals(name)) {
                    return child;
                }
            }
            return null;
        }

        /**
         * Merges another occurrence of the same element into this one: attributes and children
         * of {@code child} that this element does not yet have are added, and children present
         * in both are merged recursively.
         *
         * @param child another occurrence of this element
         */
        public void addNewItems(Element child) {
            for (Attribute attrAdd : child.attributes) {
                if (!hasAttribute(attrAdd.name)) {
                    attributes.add(attrAdd);
                }
            }

            for (Element elemAdd : child.children) {
                Element exist = getChildByName(elemAdd.name);
                if (exist == null) {
                    children.add(elemAdd);
                } else {
                    exist.addNewItems(elemAdd);
                }
            }
        }

        /** Tells whether an attribute with the given name is already recorded. */
        private boolean hasAttribute(String attrName) {
            for (Attribute attrExist : attributes) {
                if (attrExist.name.equals(attrName)) {
                    return true;
                }
            }
            return false;
        }
    }

    /** Metadata of an attribute. */
    static class Attribute extends Node {
        public Attribute(String name, Class<?> type, String description) {
            super(name, type, description);
        }
    }
}

XmlDatasource.java

import com.saxonica.config.EnterpriseConfiguration;
import com.saxonica.ee.s9api.SchemaValidatorImpl;
import net.sf.saxon.Configuration;
import net.sf.saxon.lib.FeatureKeys;
import net.sf.saxon.s9api.*;
import net.sf.saxon.type.SchemaException;
import org.xml.sax.InputSource;
import org.xml.sax.SAXException;
import org.xml.sax.XMLReader;
import org.xml.sax.helpers.XMLReaderFactory;

import javax.xml.transform.Source;
import javax.xml.transform.sax.SAXSource;
import javax.xml.transform.stream.StreamSource;
import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStream;
import java.util.HashMap;

/**
 * Loads an XML document into a Saxon tree, optionally validating it against an XSD, and
 * prepares an {@link XPathCompiler} with the namespaces found in the document declared.
 */
public class XmlDatasource {

    /** the DOM all searches are against */
    private final XdmNode xmlRootNode;

    private final XPathCompiler xPathCompiler;

    /** key == the prefix; value == the uri mapped to that prefix */
    private final HashMap<String, String> prefixToUriMap = new HashMap<>();

    /** key == the uri mapped to that prefix; value == the prefix */
    private final HashMap<String, String> uriToPrefixMap = new HashMap<>();


    /**
     * Builds the document tree from {@code xmlData}. When {@code schemaFile} is given, the
     * schema is added to the configuration, the document is built through a schema validator
     * and the XPath compiler is made schema-aware.
     *
     * <p>Both streams are handed to Saxon inside this constructor; they are not closed here.
     *
     * @param xmlData    the XML document to load
     * @param schemaFile the XSD to validate against, or {@code null} for no validation
     */
    public XmlDatasource (InputStream xmlData, InputStream schemaFile) throws SAXException, SchemaException, SaxonApiException, IOException {

        boolean haveSchema = schemaFile != null;

        // call this before any instantiation of Saxon classes.
        Configuration config = createEnterpriseConfiguration();

        if (haveSchema) {
            // NOTE(review): the schema is read through a plain StreamSource, so the XXE
            // restrictions applied by createXMLReader() do not cover it - confirm whether
            // schemas can come from untrusted users.
            Source schemaSource = new StreamSource(schemaFile);
            config.addSchemaSource(schemaSource);
        }

        Processor processor = new Processor(config);

        DocumentBuilder doc_builder = processor.newDocumentBuilder();

        // the XML data is parsed through the hardened reader
        XMLReader reader = createXMLReader();

        InputSource xmlSource = new InputSource(xmlData);
        SAXSource saxSource = new SAXSource(reader, xmlSource);

        if (haveSchema) {
            // NOTE(review): SchemaValidatorImpl is an implementation class; the s9api
            // SchemaManager (processor.getSchemaManager()) looks like the public route to a
            // validator - confirm before switching.
            SchemaValidator validator = new SchemaValidatorImpl(processor);
            doc_builder.setSchemaValidator(validator);
        }
        xmlRootNode = doc_builder.build(saxSource);

        xPathCompiler = processor.newXPathCompiler();
        if (haveSchema) {
            xPathCompiler.setSchemaAware(true);
        }

        declareNameSpaces();
    }

    /** @return the document node of the loaded XML */
    public XdmNode getXmlRootNode() {
        return xmlRootNode;
    }

    /** @return the XPath compiler with the document's namespaces declared */
    public XPathCompiler getxPathCompiler() {
        return xPathCompiler;
    }

    /**
     * Create a XMLReader set to disallow XXE attacks.
     * @return a safe XMLReader.
     */
    public static XMLReader createXMLReader() throws SAXException {

        XMLReader reader = XMLReaderFactory.createXMLReader();

        // stop XXE https://www.owasp.org/index.php/XML_External_Entity_(XXE)_Prevention_Cheat_Sheet#JAXP_DocumentBuilderFactory.2C_SAXParserFactory_and_DOM4J
        reader.setFeature("http://apache.org/xml/features/disallow-doctype-decl", true);
        reader.setFeature("http://xml.org/sax/features/external-general-entities", false);
        reader.setFeature("http://xml.org/sax/features/external-parameter-entities", false);

        return reader;
    }

    /**
     * Declares every namespace found in the document on the XPath compiler and records the
     * prefix/uri pairs in both lookup maps. The default namespace is registered under the
     * prefix {@code def}; the {@code xml}, {@code xsd} and {@code xsi} prefixes are skipped.
     */
    private void declareNameSpaces() throws SaxonApiException {

        // saxon has some of their functions set up with this.
        prefixToUriMap.put("saxon", "http://saxon.sf.net");
        uriToPrefixMap.put("http://saxon.sf.net", "saxon");

        XdmValue list = xPathCompiler.evaluate("//namespace::*", xmlRootNode);
        if (list == null || list.size() == 0) {
            return;
        }

        for (XdmItem item : list) {
            XdmNode node = (XdmNode) item;
            // a namespace node without a name is the default namespace
            String prefix = node.getNodeName() == null ? "" : node.getNodeName().getLocalName();

            // xml, xsd, & xsi are XML structure ones, not ones used in the XML
            if (prefix.equals("xml") || prefix.equals("xsd") || prefix.equals("xsi")) {
                continue;
            }

            // use default prefix if prefix is empty.
            if (prefix.isEmpty()) {
                prefix = "def";
            }

            // this returns repeats, so if a repeat, go on to next.
            if (prefixToUriMap.containsKey(prefix)) {
                continue;
            }

            String uri = node.getStringValue();
            if (uri != null && !uri.isEmpty()) {
                xPathCompiler.declareNamespace(prefix, uri);
                prefixToUriMap.put(prefix, uri);
                uriToPrefixMap.put(uri, prefix);
            }
        }
    }

    /**
     * Creates a Saxon EE configuration with the license key supplied and XPath warnings
     * suppressed.
     *
     * @return the configured EnterpriseConfiguration
     */
    public static EnterpriseConfiguration createEnterpriseConfiguration()
    {
        EnterpriseConfiguration configuration = new EnterpriseConfiguration();
        // NOTE(review): deobfuscate(...) and key are not defined in this listing; they must
        // be provided elsewhere for this class to compile.
        configuration.supplyLicenseKey(new BufferedReader(new java.io.StringReader(deobfuscate(key))));
        configuration.setConfigurationProperty(FeatureKeys.SUPPRESS_XPATH_WARNINGS, Boolean.TRUE);

        return configuration;
    }
}

Solution

  • Thanks for the clarifications. I think your real goal is to find a way to parse and process an XML Schema in Java without having to treat the XSD as an ordinary XML document (it is an ordinary XML document, but processing it using the standard facilities is not easy).

    On that basis, I think this thread should help: In Java, how do I parse an xml schema (xsd) to learn what's valid at a given element?

    Personally, I've never found any library that does a better job than the EMF XSD model. It's complex, but comprehensive.