Our IP address ranges table has ~2.8 million records, of the following structure:
{
start_ip: integer, // IPv4 converted to int32
end_ip: integer, // IPv4 converted to int32
country_code: string,
country: string,
region: string,
city: string
}
There is a compound index on start_ip
and end_ip
:
{ start_ip: 1, end_ip: 1 }
To get records from the table (read: to know what range the given IP belongs to) we use this query:
{"start_ip"=>{"$lte"=>3284004939}, "end_ip"=>{"$gte"=>3284004939}}
However we observe an unacceptably low performance of the query. Here is the explain
example:
{
"explainVersion": "2",
"queryPlanner": {
"namespace": "restclass_development.ip_ranges",
"indexFilterSet": false,
"parsedQuery": {
"$and": [
{
"start_ip": {
"$lte": 3284004939
}
},
{
"end_ip": {
"$gte": 3284004939
}
}
]
},
"queryHash": "379E557D",
"planCacheKey": "E1838ADA",
"maxIndexedOrSolutionsReached": false,
"maxIndexedAndSolutionsReached": false,
"maxScansToExplodeReached": false,
"winningPlan": {
"queryPlan": {
"stage": "FETCH",
"planNodeId": 2,
"inputStage": {
"stage": "IXSCAN",
"planNodeId": 1,
"keyPattern": {
"start_ip": 1,
"end_ip": 1
},
"indexName": "start_ip_1_end_ip_1",
"isMultiKey": false,
"multiKeyPaths": {
"start_ip": [],
"end_ip": []
},
"isUnique": false,
"isSparse": false,
"isPartial": false,
"indexVersion": 2,
"direction": "forward",
"indexBounds": {
"start_ip": [
"[-inf.0, 3284004939]"
],
"end_ip": [
"[3284004939, inf.0]"
]
}
}
},
"slotBasedPlan": {
"slots": "$$RESULT=s25 env: { s9 = {\"start_ip\" : 1, \"end_ip\" : 1}, s5 = IndexBounds(\"field #0['start_ip']: [-inf.0, 3284004939], field #1['end_ip']: [3284004939, inf.0]\"), s3 = 1699191077268 (NOW), s24 = true, s1 = TimeZoneDatabase(Africa/Ceuta...Etc/GMT-12) (timeZoneDB), s2 = Nothing (SEARCH_META), s13 = Nothing }",
"stages": "[2] nlj inner [] [s19, s20, s21, s22, s23] \n left \n [1] branch {s24} [s19, s20, s21, s22, s23] \n [s4, s6, s7, s8, s9] [1] ixscan_generic s5 s8 s4 s6 s7 lowPriority [] @\"99a3fa22-8459-4875-bbc9-3cf3ee9afc55\" @\"start_ip_1_end_ip_1\" true \n [s10, s16, s17, s18, s9] [1] nlj inner [] [s11, s12] \n left \n [1] project [s11 = getField(s14, \"l\"), s12 = getField(s14, \"h\")] \n [1] unwind s14 s15 s13 false \n [1] limit 1 \n [1] coscan \n right \n [1] ixseek s11 s12 s18 s10 s16 s17 [] @\"99a3fa22-8459-4875-bbc9-3cf3ee9afc55\" @\"start_ip_1_end_ip_1\" true \n right \n [2] limit 1 \n [2] seek s19 s25 s26 s20 s21 s22 s23 [] @\"99a3fa22-8459-4875-bbc9-3cf3ee9afc55\" true false \n"
}
},
"rejectedPlans": []
},
"executionStats": {
"executionSuccess": true,
"nReturned": 1,
"executionTimeMillis": 1276,
"totalKeysExamined": 2422998,
"totalDocsExamined": 1,
"executionStages": {
"stage": "nlj",
"planNodeId": 2,
"nReturned": 1,
"executionTimeMillisEstimate": 1276,
"opens": 1,
"closes": 1,
"saveState": 1,
"restoreState": 1,
"isEOF": 1,
"totalDocsExamined": 1,
"totalKeysExamined": 2422998,
"collectionScans": 0,
"collectionSeeks": 1,
"indexScans": 0,
"indexSeeks": 1,
"indexesUsed": [
"start_ip_1_end_ip_1",
"start_ip_1_end_ip_1"
],
"innerOpens": 1,
"innerCloses": 1,
"outerProjects": [],
"outerCorrelated": [
19,
20,
21,
22,
23
],
"outerStage": {
"stage": "branch",
"planNodeId": 1,
"nReturned": 1,
"executionTimeMillisEstimate": 1276,
"opens": 1,
"closes": 1,
"saveState": 1,
"restoreState": 1,
"isEOF": 1,
"numTested": 1,
"thenBranchOpens": 1,
"thenBranchCloses": 1,
"elseBranchOpens": 0,
"elseBranchCloses": 0,
"filter": "s24 ",
"thenSlots": [
4,
6,
7,
8,
9
],
"elseSlots": [
10,
16,
17,
18,
9
],
"outputSlots": [
19,
20,
21,
22,
23
],
"thenStage": {
"stage": "ixscan_generic",
"planNodeId": 1,
"nReturned": 1,
"executionTimeMillisEstimate": 1276,
"opens": 1,
"closes": 1,
"saveState": 1,
"restoreState": 1,
"isEOF": 1,
"indexName": "start_ip_1_end_ip_1",
"keysExamined": 2422998,
"seeks": 2422997,
"numReads": 2422998,
"indexKeySlot": 8,
"recordIdSlot": 4,
"snapshotIdSlot": 6,
"indexIdentSlot": 7,
"outputSlots": [],
"indexKeysToInclude": "00000000000000000000000000000000"
},
"elseStage": {
"stage": "nlj",
"planNodeId": 1,
"nReturned": 0,
"executionTimeMillisEstimate": 0,
"opens": 0,
"closes": 0,
"saveState": 1,
"restoreState": 1,
"isEOF": 0,
"totalDocsExamined": 0,
"totalKeysExamined": 0,
"collectionScans": 0,
"collectionSeeks": 0,
"indexScans": 0,
"indexSeeks": 0,
"indexesUsed": [
"start_ip_1_end_ip_1"
],
"innerOpens": 0,
"innerCloses": 0,
"outerProjects": [],
"outerCorrelated": [
11,
12
],
"outerStage": {
"stage": "project",
"planNodeId": 1,
"nReturned": 0,
"executionTimeMillisEstimate": 0,
"opens": 0,
"closes": 0,
"saveState": 1,
"restoreState": 1,
"isEOF": 0,
"projections": {
"11": "getField(s14, \"l\") ",
"12": "getField(s14, \"h\") "
},
"inputStage": {
"stage": "unwind",
"planNodeId": 1,
"nReturned": 0,
"executionTimeMillisEstimate": 0,
"opens": 0,
"closes": 0,
"saveState": 1,
"restoreState": 1,
"isEOF": 0,
"inputSlot": 13,
"outSlot": 14,
"outIndexSlot": 15,
"preserveNullAndEmptyArrays": 0,
"inputStage": {
"stage": "limit",
"planNodeId": 1,
"nReturned": 0,
"executionTimeMillisEstimate": 0,
"opens": 0,
"closes": 0,
"saveState": 1,
"restoreState": 1,
"isEOF": 0,
"limit": 1,
"inputStage": {
"stage": "coscan",
"planNodeId": 1,
"nReturned": 0,
"executionTimeMillisEstimate": 0,
"opens": 0,
"closes": 0,
"saveState": 1,
"restoreState": 1,
"isEOF": 0
}
}
}
},
"innerStage": {
"stage": "ixseek",
"planNodeId": 1,
"nReturned": 0,
"executionTimeMillisEstimate": 0,
"opens": 0,
"closes": 0,
"saveState": 1,
"restoreState": 1,
"isEOF": 0,
"indexName": "start_ip_1_end_ip_1",
"keysExamined": 0,
"seeks": 0,
"numReads": 0,
"indexKeySlot": 18,
"recordIdSlot": 10,
"snapshotIdSlot": 16,
"indexIdentSlot": 17,
"outputSlots": [],
"indexKeysToInclude": "00000000000000000000000000000000",
"seekKeyLow": "s11 ",
"seekKeyHigh": "s12 "
}
}
},
"innerStage": {
"stage": "limit",
"planNodeId": 2,
"nReturned": 1,
"executionTimeMillisEstimate": 0,
"opens": 1,
"closes": 1,
"saveState": 1,
"restoreState": 1,
"isEOF": 1,
"limit": 1,
"inputStage": {
"stage": "seek",
"planNodeId": 2,
"nReturned": 1,
"executionTimeMillisEstimate": 0,
"opens": 1,
"closes": 1,
"saveState": 1,
"restoreState": 1,
"isEOF": 0,
"numReads": 1,
"recordSlot": 25,
"recordIdSlot": 26,
"seekKeySlot": 19,
"snapshotIdSlot": 20,
"indexIdentSlot": 21,
"indexKeySlot": 22,
"indexKeyPatternSlot": 23,
"fields": [],
"outputSlots": []
}
}
},
"allPlansExecution": []
},
"command": {
"find": "ip_ranges",
"filter": {
"start_ip": {
"$lte": 3284004939
},
"end_ip": {
"$gte": 3284004939
}
},
"$db": "restclass_development"
},
"serverInfo": {
"host": "sun.arg.network",
"port": 27017,
"version": "7.0.2",
"gitVersion": "02b3c655e1302209ef046da6ba3ef6749dd0b62a"
},
"serverParameters": {
"internalQueryFacetBufferSizeBytes": 104857600,
"internalQueryFacetMaxOutputDocSizeBytes": 104857600,
"internalLookupStageIntermediateDocumentMaxSizeBytes": 104857600,
"internalDocumentSourceGroupMaxMemoryBytes": 104857600,
"internalQueryMaxBlockingSortMemoryUsageBytes": 104857600,
"internalQueryProhibitBlockingMergeOnMongoS": 0,
"internalQueryMaxAddToSetBytes": 104857600,
"internalDocumentSourceSetWindowFieldsMaxMemoryBytes": 104857600,
"internalQueryFrameworkControl": "trySbeEngine"
},
"ok": 1
}
This query is executed in ~1.5 seconds. What would be a more time efficient query and/or indexing strategy for our use case?
First, acknowledgements to John Page (https://www.mongodb.com/developer/author/john-page), one of the all-time MongoDB greats, for first exploring this approach.
Often we encounter data range challenges in our data design and querying. The OP has approached this in the traditional way: create two fields, start_THING and end_THING, set up a compound key on the fields, and use start_THING $lte target
and end_THING $gte target
to quickly find one doc. Or, to find things in a range, start_THING $gte lowerBound
and end_THING $lt upperBound
.
Another way to do this is by using a single geospatial field with a 2dsphere
index on it. Here's how this approach would work in the context of the OP data:
0.0.0.0
and go to 255.255.255.255
. The OP notes that a conversion to integer is clearly possible. The range of IPs in integer terms is 0
to 4294967295
(256 ** 4).10.3.232.0
to 10.3.232.5
. This is 168028160
and 168028165
in integer terms, respectively. geo_start = ((168028160 / 4294967295) * (180 - (-180))) + (-180);
geo_end = ((168028165 / 4294967295) * (180 - (-180))) + (-180);
geo: { type: "LineString", coordinates: [ [ geo_start, 0.0 ], [geo_end, 0.0 ] ] }
Here is an example of a data record fully populated:
{
_id: ObjectId("6549055e795d8927e4779193"),
N: 1000,
start_ip: '10.3.232.0',
end_ip: '10.3.232.5',
int_start_ip: 168028160,
int_end_ip: 168028165,
geo: {
type: 'LineString',
coordinates: [ [ -165.91604232460168, 0 ], [ -165.91604190550652, 0 ] ]
},
country_code: 'XXX',
country: 'Xland',
region: 'Mars',
city: 'Foo'
}
db.foo.createIndex({geo:"2dsphere"})
$geoIntersects
: function convertToGeo(sip) {
q = sip.split('.');
var iip = 0;
for(var n = 0; n < 4; n++) {
iip += (parseInt(q[n]) * (256 ** (3 - n)));
}
return ((iip / new NumberLong("4294967295")) * 360 ) - 180 ;
}
var gsip = convertToGeo('10.232.0.3'); // .3 is in between .0 and .5; see data record above
var xx = db.foo.findOne({
geo: {
$geoIntersects: {
$geometry: { type: "LineString",
// Must create a second discrete point for a line, so make it 0.1
coordinates: [ [ gsip, 0.0 ], gsip, 0.1 ] ] }
}
}
};
// The doc above is found!
Sampling 100 random IPs in a population of 3m items show the single geo index approach working much faster than the compound index approach, typically taking no more than 1 millisecond:
100 samples of GEO approach:
{
count: 100,
execMillis_min: 0,
execMillis_max: 1,
execMillis_tot: 16,
execMillis_avg: 0,
totKeysExamined_min: 296,
totKeysExamined_max: 648,
totKeysExamined_tot: 48327,
totKeysExamined_avg: 3.744628643254086,
totDocsExamined_min: 222,
totDocsExamined_max: 556,
totDocsExamined_tot: 39977,
totDocsExamined_avg: 2.806450990888347
}
One of the explains:
{
explainVersion: '1',
queryPlanner: {
...
inputStage: {
stage: 'IXSCAN',
keyPattern: { geo: '2dsphere' },
indexName: 'geo_2dsphere',
executionStats: {
executionSuccess: true,
nReturned: 1,
executionTimeMillis: 1,
totalKeysExamined: 353,
totalDocsExamined: 266,
keysExamined: 353,
seeks: 21,
dupsTested: 332,
dupsDropped: 66
}
},
...
}
100 samples of COMPOUND start/end approach:
{
count: 100,
execMillis_min: 48,
execMillis_max: 1360,
execMillis_tot: 63096,
execMillis_avg: 630.96,
totKeysExamined_min: 10000,
totKeysExamined_max: 2859339,
totKeysExamined_tot: 137167079,
totKeysExamined_avg: 19980.70323149996,
totDocsExamined_min: 1,
totDocsExamined_max: 1,
totDocsExamined_tot: 100,
totDocsExamined_avg: 0.010102051554122065
}
One of the explains:
{
explainVersion: '1',
queryPlanner: {
inputStage: {
stage: 'IXSCAN',
keyPattern: {
int_start_ip: 1,
int_end_ip: 1
},
indexName: 'int_start_ip_1_int_end_ip_1',
...
executionStats: {
executionSuccess: true,
nReturned: 1,
executionTimeMillis: 1051,
totalKeysExamined: 2311153,
totalDocsExamined: 1,
keysExamined: 2311153,
seeks: 2311152,
dupsTested: 0,
dupsDropped: 0
}
Some more tests now in tabular form. The 2dsphere
index strategy clearly benefits when looking up material "further down" the range of values:
db.foo.find({int_start_ip:{$lte:int_target}, int_end_ip:{$gte:int_target}}).explain(true);
N totMS keysExam docsExam seeks
2 0 4 1 3
500000 235 500002 1 500001
3000000 1322 3000002 1 3000001
5000000 2301 5000002 1 5000001
9999999 4710 10000000 1 10000000
db.foo.find({geo: {'$geoIntersects': {'$geometry': {
type: 'LineString',
coordinates: [ [ geo_target,0 ], [ geo_target,0.1 ] ]
}}}}).explain(true);
N totMS keysExam docsExam seeks
2 1 200 130 8
500000 1 256 276 11
3000000 3 621 530 23
5000000 2 621 533 13
9999999 0 39 15 9