I'm trying to query results from a large dataset called 'tasks' containing 187297 documents which are nested into another dataset called 'workers', that's in its turn nested into a collection called 'production_units'.
production_units -> workers -> tasks
(BTW this is a simplified version of production_units):
[{
"_id": ObjectId("5aca27b926974863ed9f01ab"),
"name": "Z",
"workers": [{
"name": "X Y",
"worker_number": 655,
"employed": false,
"_id": ObjectId("5aca27bd26974863ed9f0425"),
"tasks": [{
"_id": ObjectId("5ac9f6c2e1a668d6d39c1fd1"),
"inbound_order_number": 3296,
"task_number": 90,
"minutes_elapsed": 120,
"date": "2004-11-30",
"start": 1101823200,
"pieces_actual": 160,
"pause_from": 1101812400,
"pause_to": 1101814200
}]
}]
}]
In order to accomplish this I have used the following aggregation command:
db.production_units.aggregate([{
'$project': {
'workers': '$workers'
}
}, {
'$unwind': '$workers'
}, {
'$project': {
'tasks': '$workers.tasks',
'worker_number': '$workers.worker_number'
}
}, {
'$unwind': '$tasks'
}, {
'$project': {
'task_number': '$tasks.task_number',
'pieces_actual': '$tasks.pieces_actual',
'minutes_elapsed': '$tasks.minutes_elapsed',
'worker_number': 1,
'start': '$tasks.start',
'inbound_order_number': '$tasks.inbound_order_number',
'pause_from': '$tasks.pause_from',
'date': '$tasks.date',
'_id': '$tasks._id',
'pause_to': '$tasks.pause_to'
}
}, {
'$match': {
'start': {
'$exists': true
}
}
}, {
'$group': {
'entries_count': {
'$sum': 1
},
'_id': null,
'entries': {
'$push': '$$ROOT'
}
}
}, {
'$project': {
'entries_count': 1,
'_id': 0,
'entries': 1
}
}, {
'$unwind': '$entries'
}, {
'$project': {
'task_number': '$entries.task_number',
'pieces_actual': '$entries.pieces_actual',
'minutes_elapsed': '$entries.minutes_elapsed',
'worker_number': '$entries.worker_number',
'start': '$entries.start',
'inbound_order_number': '$entries.inbound_order_number',
'pause_from': '$entries.pause_from',
'date': '$entries.date',
'entries_count': 1,
'_id': '$entries._id',
'pause_to': '$entries.pause_to'
}
}, {
'$sort': {
'start': 1
}
}, {
'$skip': 187290
}, {
'$limit': 10
}], {
allowDiskUse: true
})
And the returned documents are:
{ "entries_count" : 187297, "task_number" : 100, "pieces_actual" : 68, "minutes_elapsed" : 102, "worker_number" : 411, "start" : 1594118400, "inbound_order_number" : 8569, "pause_from" : 1594119600, "date" : "2020-07-07", "_id" : ObjectId("5ac9f6d3e1a668d6d3a06351"), "pause_to" : 1594119600 } { "entries_count" : 187297, "task_number" : 130, "pieces_actual" : 20, "minutes_elapsed" : 30, "worker_number" : 549, "start" : 1596531600, "inbound_order_number" : 7683, "pause_from" : 1596538800, "date" : "2020-08-04", "_id" : ObjectId("5ac9f6cde1a668d6d39f1b26"), "pause_to" : 1596538800 } { "entries_count" : 187297, "task_number" : 210, "pieces_actual" : 84, "minutes_elapsed" : 180, "worker_number" : 734, "start" : 1601276400, "inbound_order_number" : 8330, "pause_from" : 1601290800, "date" : "2020-09-28", "_id" : ObjectId("5ac9f6d0e1a668d6d39fd677"), "pause_to" : 1601290800 } { "entries_count" : 187297, "task_number" : 20, "pieces_actual" : 64, "minutes_elapsed" : 90, "worker_number" : 114, "start" : 1601800200, "inbound_order_number" : 7690, "pause_from" : 1601809200, "date" : "2020-10-04", "_id" : ObjectId("5ac9f6cee1a668d6d39f3032"), "pause_to" : 1601811900 } { "entries_count" : 187297, "task_number" : 140, "pieces_actual" : 70, "minutes_elapsed" : 84, "worker_number" : 49, "start" : 1603721640, "inbound_order_number" : 4592, "pause_from" : 1603710000, "date" : "2020-10-26", "_id" : ObjectId("5ac9f6c8e1a668d6d39df664"), "pause_to" : 1603712700 } { "entries_count" : 187297, "task_number" : 80, "pieces_actual" : 20, "minutes_elapsed" : 30, "worker_number" : 277, "start" : 1796628600, "inbound_order_number" : 4655, "pause_from" : 1796641200, "date" : "2026-12-07", "_id" : ObjectId("5ac9f6c8e1a668d6d39e1fc0"), "pause_to" : 1796643900 } { "entries_count" : 187297, "task_number" : 40, "pieces_actual" : 79, "minutes_elapsed" : 123, "worker_number" : 96, "start" : 3802247580, "inbound_order_number" : 4592, "pause_from" : 3802244400, "date" : "2090-06-27", "_id" : ObjectId("5ac9f6c8e1a668d6d39de218"), "pause_to" : 3802244400 }
However, the query takes seconds in order to show the results, instead of few milliseconds. This is the result returned by the profiler:
db.system.profile.findOne().millis 3216
(UPDATE)
Even the following simplified count query gets executed in 312 ms instead of few time:
db.production_units.aggregate([{
"$unwind": "$workers"
}, {
"$unwind": "$workers.tasks"
},
{
"$count": "entries_count"
}
])
This is what explain()
returns for the query above:
{
"stages" : [
{
"$cursor" : {
"query" : {
},
"fields" : {
"workers" : 1,
"_id" : 0
},
"queryPlanner" : {
"plannerVersion" : 1,
"namespace" : "my_db.production_units",
"indexFilterSet" : false,
"parsedQuery" : {
},
"winningPlan" : {
"stage" : "COLLSCAN",
"direction" : "forward"
},
"rejectedPlans" : [ ]
},
"executionStats" : {
"executionSuccess" : true,
"nReturned" : 28,
"executionTimeMillis" : 13,
"totalKeysExamined" : 0,
"totalDocsExamined" : 28,
"executionStages" : {
"stage" : "COLLSCAN",
"nReturned" : 28,
"executionTimeMillisEstimate" : 0,
"works" : 30,
"advanced" : 28,
"needTime" : 1,
"needYield" : 0,
"saveState" : 1,
"restoreState" : 1,
"isEOF" : 1,
"invalidates" : 0,
"direction" : "forward",
"docsExamined" : 28
},
"allPlansExecution" : [ ]
}
}
},
{
"$unwind" : {
"path" : "$workers"
}
},
{
"$unwind" : {
"path" : "$workers.tasks"
}
},
{
"$group" : {
"_id" : {
"$const" : null
},
"entries_count" : {
"$sum" : {
"$const" : 1
}
}
}
},
{
"$project" : {
"_id" : false,
"entries_count" : true
}
}
],
"ok" : 1
}
I'm not an experienced DBA, so I don't know what I'm missing exactly in my aggregation pipeline, for solving the performance issue I'm facing. I have also investigated the problem and made research, but without finding any solution.
What I am missing?
See Question&Answers more detail:
os