I have a collection where there are some duplicate documents. In example:
First document:
{
"_id" : ObjectId("56f3d7cc1de31cb20c08ae6b"),
"AddedDate" : ISODate("2016-05-01T00:00:00.000Z"),
"Place": "THISPLACE",
"PresentInDB" : [
{
"InDB" : ISODate("2016-05-01T00:00:00.000Z")
}
],
"Checked" : [],
"Link": "http://www.mylink.com/first/84358"
}
Second document:
{
"_id" : ObjectId("577740526c1e542904725238"),
"AddedDate" : ISODate("2016-05-02T00:00:00.000Z"),
"Place": "THISPLACE",
"PresentInDB" : [
{
"InDB" : ISODate("2016-05-02T00:00:00.000Z")
},
{
"InDB" : ISODate("2016-05-03T00:00:00.000Z")
}
],
"Checked" : [
{
"Done" : ISODate("2016-05-02T00:00:00.000Z")
},
],
"Link": "http://www.mylink.com/second/84358"
}
Link
field contains same sequense of numbers in both documents, 84358
.
So I would like to achieve those steps:
Link
field (i.e. 84358
above) and if there are several documents in
collection that have that sequence in the Link
field. And also if Place
field match in both documents:PresentInDB
and Checked
fields - > merge PresentInDB
and Checked
fields by adding array values from the newest document (by date in AddedDate
field) to the oldest document. How could I achieve such a query?
In MongoDB 3.3.6 release is introduced a $split
operator for dealing with strings in aggregation framework (Jira). Before this release you only could solve this with a map/reduce solution.
After MongoDB 3.3.6 release: Aggregation framework solution
db.duplicatedCollection.aggregate(
[
{
$project: {
_id : 1,
AddedDate : 1,
Place : 1,
PresentInDB : 1,
Checked : 1,
Link : 1,
sequenceNumber: { $arrayElemAt: [ {$split: ["$Link", "/"]}, -1 ]},
}
},
{
$sort: { AddedDate: 1 }
},
{
$group: {
_id : {
sequenceNumber : "$sequenceNumber",
Place : "$Place"
},
id : { $first: "$_id"},
AddedDate: { $first: "$AddedDate" },
Place : { $first: "$Place" },
PresentInDB: {
$push: '$PresentInDB'
},
Checked: {
$push: '$Checked'
},
Link: { $first: "$Link"}
}
},
{
$unwind: "$PresentInDB"
},
{
$unwind: {
path : "$PresentInDB",
preserveNullAndEmptyArrays: true
}
},
{
$unwind: "$Checked"
},
{
$unwind: {
path : "$Checked",
preserveNullAndEmptyArrays: true
}
},
{
$group: {
_id : "$id",
AddedDate: { $first: "$AddedDate" },
Place : { $first: "$Place" },
PresentInDB : {
$addToSet: '$PresentInDB'
},
Checked : {
$addToSet: '$Checked'
},
Link: { $first: "$Link"}
}
},
{
$out: "duplicatedCollection"
}
]
);
Before MongoDB 3.3.6 release: Map/Reduce solution
Map Function:
var mapFunction = function() {
var linkArray = this.Link.split("/");
var sequenceNumber = linkArray[linkArray.length - 1];
var keyDoc = {
place : this.Place,
sequenceNumber: sequenceNumber,
};
emit(keyDoc, this);
};
Reduce Function:
var reduceFunction = function(key, values) {
var reducedDoc = {};
reducedDoc._id = values[0]._id;
reducedDoc.AddedDate = values[0].AddedDate;
reducedDoc.Link = values[0].Link;
reducedDoc.PresentInDB = [];
reducedDoc.Checked = [];
var presentInDbMillisArray = [];
var checkedMillisArray = [];
values.forEach(function(doc) {
if (reducedDoc.AddedDate < doc.AddedDate) {
reducedDoc._id = doc._id;
reducedDoc.AddedDate = doc.AddedDate;
reducedDoc.Link = doc.Link;
}
// PresentInDB field merge
doc.PresentInDB.forEach(function(presentInDBElem) {
var millis = presentInDBElem.InDB.getTime();
if (!Array.contains(presentInDbMillisArray, millis)) {
reducedDoc.PresentInDB.push(presentInDBElem);
presentInDbMillisArray.push(millis);
}
});
// same here with Checked field
doc.Checked.forEach(function(checkedElem) {
var millis = checkedElem.Done.getTime();
if (!Array.contains(checkedMillisArray, millis)) {
reducedDoc.Checked.push(checkedElem);
checkedMillisArray.push(millis);
}
});
});
return reducedDoc;
};
Map/Reduce:
db.duplicatedCollection.mapReduce(
mapFunction,
reduceFunction,
{
"out": "duplicatedCollection"
}
);
Unwrap the value from the map/reduce returned documents:
db.duplicatedCollection.find(
{
value : {
$exists: true
}
}
).forEach(function(doc) {
db.duplicatedCollection.insert(doc.value);
db.duplicatedCollection.remove({_id : doc._id});
});