I am currently doing development work for the harmonization of our data.
I noticed that the harmonization is slow to complete.
I'm relatively new to NoSQL / MarkLogic development and am not sure of the best practices to follow for a smoother, faster harmonization.
Here are some facts:
Data Load:
Post harmonization:
Harmonization Code snippets:
/**
 * Computes the end of the schedule window for a work item: the first date
 * strictly after the preferred date (1 to 7 days later) that falls on the
 * business unit's start day of week.
 *
 * Side effect: also stores the result in `tempScheduleWindowEnd`, which is
 * not declared inside this function.
 * NOTE(review): confirm that variable is declared at module level; otherwise
 * this assignment creates an implicit global.
 *
 * @param {*} businessUnit - Passed through to getBUStartDayOfWeek().
 * @param {string} targetDateString - Target date; "/" separators are
 *   converted to "-" before casting with xs.date().
 * @param {string} schEndDateString - Scheduled end date, same format.
 * @returns {string} The window end with "/" separators (first 10 characters
 *   of the computed date's string form), or "" when either input is
 *   empty, null or blank.
 */
function getScheduleWindowEnd(businessUnit, targetDateString, schEndDateString)
{
  // Loose `== ""` is kept on purpose so non-primitive string values that
  // coerce to "" are still treated as blank, exactly as before.
  const isBlank = (value) => fn.empty(value) || value == null || value == "";

  // Guard first: blank inputs no longer pay for the start-day lookup.
  if (isBlank(targetDateString) || isBlank(schEndDateString))
  {
    tempScheduleWindowEnd = "";
    return "";
  }

  const startDayOfWeek = getBUStartDayOfWeek(businessUnit);

  // Work on local copies instead of reassigning the parameters.
  const targetDate = xs.date(fn.replace(targetDateString, "/", "-"));
  const schEndDate = xs.date(fn.replace(schEndDateString, "/", "-"));

  // NOTE(review): the guard above already rejects a blank scheduled end date,
  // so this fallback to the target date looks unreachable. Kept as-is;
  // confirm whether a missing end date was meant to fall back instead of
  // returning "".
  const preferredDate = fn.empty(schEndDate) ? targetDate : schEndDate;

  // Move forward to the next start-of-week day. When the preferred date is
  // already on or past that weekday, roll over into the following week.
  const scheduledDayOfWeek = xdmp.weekdayFromDate(preferredDate);
  const dayOffset = startDayOfWeek - scheduledDayOfWeek;
  const daysToAdd = scheduledDayOfWeek < startDayOfWeek ? dayOffset : dayOffset + 7;

  const shiftedDate = fn.string(addDays(preferredDate, daysToAdd));
  // Keep only the first 10 characters and switch back to "/" separators.
  const scheduleWindowEnd = fn.replace(fn.substring(shiftedDate, 1, 10), "-", "/");

  tempScheduleWindowEnd = scheduleWindowEnd;
  return scheduleWindowEnd;
}
<StatusDescription>${fn.normalizeSpace(getUDCDescription("00", "SS", fn.normalizeSpace(hl.elementText(source, "WASRST", true))))}</StatusDescription>
/**
 * Looks up the description text for a UDC entry.
 *
 * Searches for documents that are in both the "ERPSystemSource" and "Table2"
 * collections and whose DRSY, DRRT and DRKY elements match the given words,
 * then concatenates the whitespace-normalized, XML-encoded DRDL01 text of
 * every matching document.
 *
 * NOTE(review): this runs one search per call. If it is invoked once per
 * harmonized document, consider whether the lookups can be cached.
 *
 * @param {string} drsy - Word to match in the DRSY element.
 * @param {string} drrt - Word to match in the DRRT element.
 * @param {string} drky - Word to match in the DRKY element.
 * @returns {string} The concatenated descriptions, or "" (a primitive
 *   string) when nothing matches.
 */
function getUDCDescription(drsy, drrt, drky) {
  const udcQuery = cts.andQuery([
    cts.collectionQuery("ERPSystemSource"),
    cts.collectionQuery("Table2"),
    cts.elementWordQuery(xs.QName("DRSY"), drsy),
    cts.elementWordQuery(xs.QName("DRRT"), drrt),
    cts.elementWordQuery(xs.QName("DRKY"), drky)
  ]);

  // Start from a primitive "" rather than `new String()`, so the no-match
  // case returns a real string instead of a String wrapper object.
  let description = "";
  for (const udcDoc of cts.search(udcQuery)) {
    description += hl.encodeXml(fn.normalizeSpace(hl.elementText(udcDoc, "DRDL01", true)));
  }
  return description;
}
<Element1>${hl.elementText(source, "WADOCO", true)}</Element1>
<Element2>${fn.normalizeSpace(hl.elementText(source, "WAMCU", true))}</Element2>
I would strongly recommend if possible that you consider working with a MarkLogic representative on this problem. Improving software performance can be complex and it's best to have a working relationship with someone who can go back and forth with you.
The first question I always ask is: what is your expected SLA? Until you have set a clear expectation of what performance should look like, I can't tell you whether this is slow or fast, or whether your expectation is realistic or unrealistic.
In my experience, performance issues tend to fall into one of two categories: software bottlenecks or infrastructure bottlenecks. Since the extrapolation in time from 200k to 1m records seems linear, I would expect that your bottleneck is not a severe software issue.
The first thing I would do is check the MarkLogic monitoring history and determine whether you are fully utilizing your infrastructure. If not, try increasing the threadcounts and batch sizes of your harmonization workloads so that your infrastructure is fully utilized.
If you are fully utilizing your infrastructure, you can either upgrade your infrastructure or you can start to look at improving your software.
Based on your code, here are several suggestions you can look into to improve your software: