marklogicmarklogic-8query-tuning

Expanded tree cache full error need to tune the query


Description:

All three of my attempts below are giving me expanded tree cache full errors. There are ~470000 assets i.e XML I'm querying.

How can I tune these queries to avoid expanded tree cache errors?

approach 1:

let $query-name := "get-asset-sub-group-values"
let $output-dir := "D:\output\"

let $report-uri := concat($output-dir, $query-name, "_report0.txt")

let $enumValues := (:all sequence of strings goes here :)

let $map1 := map:new($enumValues ! map:entry(fn:string(.), fn:true()))

let $result1 := concat('asset-id' , "|",  'upi',  "|", 'assetSubGroup',  "|",  'asset-type',  "|", 'asset-sub-type', "|", 'originator', "|", 'originator-identifier',  "|",  'mm-project-id',  "|" , 'sap-project-id' , "
")
let $result2 :=
  for $each-search-copy in cts:search(collection("metadata-search"), cts:element-value-query(xs:QName("AssetID"), "*"))/metadata
    let $asset-id := $each-search-copy/assetIdentifiers/assetIdentifier/AssetID[1]/text()
    let $upi := $each-search-copy/assetIdentifiers/assetIdentifier/SAPID[1]/text()
    let $asset-type := $each-search-copy/biblioCore/assetType[1]/text()
    let $asset-sub-type := $each-search-copy/biblioCore/assetSubType[1]/text()
    let $originator := $each-search-copy/biblioCore/originator[1]/text()
    let $originator-identifier := $each-search-copy/assetIdentifiers/assetIdentifier/OriginatorIdentifier[1]/text()
    let $mm-project-id := $each-search-copy/biblioCore/MMProjectID[1]/text()
    let $sap-project-id := $each-search-copy/biblioCore/SAPProjectID[1]/text()
    let $assetSubGroup := $each-search-copy/biblioCore/assetSubGroup[1]/text()
    let $map2 := map:new($assetSubGroup ! map:entry(fn:string(.), fn:true()))
    let $flag := map:keys($map2 - $map1)
    return
        if ($flag)
            then(

                     concat($asset-id , "|",  $upi,  "|", $assetSubGroup,  "|",  $asset-type,  "|", $asset-sub-type, "|", $originator, "|", $originator-identifier,  "|",  $mm-project-id,  "|" , $sap-project-id , "
")
                 )
            else (
                  if($assetSubGroup) then() 
                  else (     

                        concat($asset-id , "|",  $upi,  "|", $assetSubGroup,  "|",  $asset-type,  "|", $asset-sub-type, "|", $originator, "|", $originator-identifier,  "|",  $mm-project-id,  "|" , $sap-project-id , "
")

            ))
let $result3 := ($result1, $result2)    
return xdmp:save($report-uri, text{$result3}), xdmp:elapsed-time()

approach 2:

let $query-name := "get-asset-sub-group-values"
let $output-dir := "D:\output\"
let $report-uri := concat($output-dir, $query-name, "_report1.txt")

let $enumValues := (:all string value sequence goes here that has to match:)

let $map1 := map:new($enumValues ! map:entry(fn:string(.), fn:true()))

let $result1 :=( concat('asset-id' , "|",  'upi',  "|", 'assetSubGroup',  "|",  'asset-type',  "|", 'asset-sub-type', "|", 'originator', "|", 'originator-identifier',  "|",  'mm-project-id',  "|" , 'sap-project-id' , "
"),

  for $each-search-copy in cts:search(collection("metadata-search"), cts:element-value-query(xs:QName("AssetID"), "*"))/metadata
    let $asset-id := $each-search-copy/assetIdentifiers/assetIdentifier/AssetID[1]/text()
    let $upi := $each-search-copy/assetIdentifiers/assetIdentifier/SAPID[1]/text()
    let $asset-type := $each-search-copy/biblioCore/assetType[1]/text()
    let $asset-sub-type := $each-search-copy/biblioCore/assetSubType[1]/text()
    let $originator := $each-search-copy/biblioCore/originator[1]/text()
    let $originator-identifier := $each-search-copy/assetIdentifiers/assetIdentifier/OriginatorIdentifier[1]/text()
    let $mm-project-id := $each-search-copy/biblioCore/MMProjectID[1]/text()
    let $sap-project-id := $each-search-copy/biblioCore/SAPProjectID[1]/text()
    let $assetSubGroup := $each-search-copy/biblioCore/assetSubGroup[1]/text()
    let $map2 := map:new($assetSubGroup ! map:entry(fn:string(.), fn:true()))
    let $flag := map:keys($map2 - $map1)
    return
        if ($flag)
            then(

                     concat($asset-id , "|",  $upi,  "|", $assetSubGroup,  "|",  $asset-type,  "|", $asset-sub-type, "|", $originator, "|", $originator-identifier,  "|",  $mm-project-id,  "|" , $sap-project-id , "
")
                 )
            else (
                  if($assetSubGroup) then() 
                  else (     

                        concat($asset-id , "|",  $upi,  "|", $assetSubGroup,  "|",  $asset-type,  "|", $asset-sub-type, "|", $originator, "|", $originator-identifier,  "|",  $mm-project-id,  "|" , $sap-project-id , "
")

            ))
            )

return xdmp:save($report-uri, text{$result1}), xdmp:elapsed-time()

approach 3:

let $query-name := "get-asset-sub-group-values"
let $output-dir :=  "D:\output\"
let $report-uri := concat($output-dir, $query-name, "_report2.txt")


let $enumValues := (:sequence of strings are passed:)

let $result1 :=( concat('asset-id' , "|",  'upi',  "|", 'assetSubGroup',  "|",  'asset-type',  "|", 'asset-sub-type', "|", 'originator', "|", 'originator-identifier',  "|",  'mm-project-id',  "|" , 'sap-project-id' , "
"),

  for $each-search-copy in cts:search(collection("metadata-search"), cts:element-value-query(xs:QName("AssetID"), "*"))/metadata
    let $asset-id := $each-search-copy/assetIdentifiers/assetIdentifier/AssetID[1]/text()
    let $upi := $each-search-copy/assetIdentifiers/assetIdentifier/SAPID[1]/text()
    let $asset-type := $each-search-copy/biblioCore/assetType[1]/text()
    let $asset-sub-type := $each-search-copy/biblioCore/assetSubType[1]/text()
    let $originator := $each-search-copy/biblioCore/originator[1]/text()
    let $originator-identifier := $each-search-copy/assetIdentifiers/assetIdentifier/OriginatorIdentifier[1]/text()
    let $mm-project-id := $each-search-copy/biblioCore/MMProjectID[1]/text()
    let $sap-project-id := $each-search-copy/biblioCore/SAPProjectID[1]/text()
    let $assetSubGroup := $each-search-copy/biblioCore/assetSubGroup[1]/text()
    let $flag := ($assetSubGroup eq $enumValues)
    return
        if ($flag)
            then()
            else (
                  concat($asset-id , "|",  $upi,  "|", $assetSubGroup,  "|",  $asset-type,  "|", $asset-sub-type, "|", $originator, "|", $originator-identifier,  "|",  $mm-project-id,  "|" , $sap-project-id , "
")
                 ))

return xdmp:save($report-uri, text{$result1}), xdmp:elapsed-time()

Solution

  • If you find that you are having trouble writing a "streamable" query, another alternative would be to use a batch tool, such as CORB2 that uses multiple threads to execute a module for each document to produce it's output in a separate transaction and collects the results into a final output file. By breaking out the work into separate transactions, you don't have to worry about expanded tree cache errors or timeouts, and you can adjust the thread count to perform more work concurrently and get it done faster than a single query execution.

    An example CORB2 options file to produce your | delimited text file would look something like this (you would need to adjust the XCC-CONNECTION-URI value for your environment):

    XCC-CONNECTION-URI=xcc://user:password@host:port
    THREAD-COUNT=8
    URIS-MODULE=selector.xqy|ADHOC
    PROCESS-MODULE=process.xqy|ADHOC
    PROCESS-TASK=com.marklogic.developer.corb.ExportBatchToFileTask
    EXPORT-FILE-DIR=D:\output\
    EXPORT-FILE-NAME=get-asset-sub-group-values_report.txt
    PRE-BATCH-TASK=com.marklogic.developer.corb.PreBatchUpdateFileTask
    EXPORT-FILE-TOP-CONTENT=asset-id|upi|assetSubGroup|asset-type|asset-sub-type|originator|originator-identifier|mm-project-id|sap-project-id
    BATCH-URI-DELIM=|
    

    Create the URIs selector module selector.xqy that will find all of the URIs to process:

    xquery version "1.0-ml";
    let $uris := cts:uris("", (), cts:and-query((
      cts:collection-query("metadata-search"), 
      cts:element-value-query(xs:QName("AssetID"), "*")
    )) )
    return (fn:count($uris), $uris)
    

    Create the process module process.xqy that will be called for each URI:

    xquery version "1.0-ml";
    
    declare variable $URI external;
    
    let $each-search-copy := fn:doc($URI)/metadata
    
    let $asset-id := $each-search-copy/assetIdentifiers/assetIdentifier/AssetID[1]/text()
    let $upi := $each-search-copy/assetIdentifiers/assetIdentifier/SAPID[1]/text()
    let $asset-type := $each-search-copy/biblioCore/assetType[1]/text()
    let $asset-sub-type := $each-search-copy/biblioCore/assetSubType[1]/text()
    let $originator := $each-search-copy/biblioCore/originator[1]/text()
    let $originator-identifier := $each-search-copy/assetIdentifiers/assetIdentifier/OriginatorIdentifier[1]/text()
    let $mm-project-id := $each-search-copy/biblioCore/MMProjectID[1]/text()
    let $sap-project-id := $each-search-copy/biblioCore/SAPProjectID[1]/text()
    let $assetSubGroup := $each-search-copy/biblioCore/assetSubGroup[1]/text()
    let $flag := ($assetSubGroup eq $enumValues)
    return
        if ($flag)
            then()
            else (
                  (: NOTE you could use string-join() instead of concat()
    
                     string-join(($asset-id, $upi, $assetSubGroup, $asset-type, $asset-sub-type, $originator, $originator-identifier, $mm-project-id, $sap-project-id), "|")  
    
                  :)
                  concat($asset-id , "|",  $upi,  "|", $assetSubGroup,  "|",  $asset-type,  "|", $asset-sub-type, "|", $originator, "|", $originator-identifier,  "|",  $mm-project-id,  "|" , $sap-project-id)
                 ))
    

    Invoking the CORB job like this (adjust paths and filenames for the XCC and CORB jars and your properties file):

    java -server -cp .:marklogic-xcc-8.0.8.jar:marklogic-corb-2.4.1.jar
        -DOPTIONS-FILE=myjob.properties com.marklogic.developer.corb.Manager
    

    Or, if you are using ml-gradle, use the corb task