27-Sep-2018 |
Like this? Dislike this? Let me know |

A Microsoft Excel Pivot table is as much -- if not more -- of a GUI control interface as it is a dimensional data aggregation engine. It's very easy to move and format rows and columns to present the rolled up data just the way you want.

MongoDB, of course, is just a database -- no specialized "pivoting GUI." But
very often we want to extract the same kind of data from the database, and
to do so efficiently. This article demonstrates the use of the powerful
`$facet` operator and less than 60 lines of javascript to programmatically
create an aggregation framework pipeline that does the heavy lifting to yield
pivot-table-esque output.

{region: "A", state: "MA", year: 2001, q1:1, q2:2, q3:3, q4:4} ,{region: "A", state: "NH", year: 2001, q1:6, q2:2, q3:3, q4:4} ,{region: "A", state: "CT", year: 2001, q1:1, q2:2, q3:3, q4:4} ,{region: "B", state: "NY", year: 2001, q1:8, q2:2, q3:3, q4:4} ,{region: "B", state: "NJ", year: 2001, q1:1, q2:2, q3:3, q4:4} ,{region: "A", state: "MA", year: 2002, q1:52, q2:32, q3:43, q4:54} ,{region: "A", state: "NH", year: 2002, q1:21, q2:32, q3:43, q4:54} ,{region: "A", state: "CT", year: 2002, q1:67, q2:32, q3:43, q4:54} ,{region: "B", state: "NY", year: 2002, q1:8, q2:32, q3:43, q4:54} ,{region: "B", state: "NJ", year: 2002, q1:9, q2:32, q3:43, q4:54}

db.collection.aggregate([ {$group: {_id: "$region", tot_q1: {$sum:"$q1"}, tot_q2: {$sum:"$q2"}} } ]} { "_id" : "B", "tot_q1" : 26, "tot_q2" : 68 } { "_id" : "A", "tot_q1" : 148, "tot_q2" : 102 } db.collection.aggregate([ {$group: {_id: "$state", tot_q1: {$sum:"$q1"}, tot_q2: {$sum:"$q2"}} } ]} { "_id" : "NJ", "tot_q1" : 10, "tot_q2" : 34 } { "_id" : "CT", "tot_q1" : 68, "tot_q2" : 34 } { "_id" : "NH", "tot_q1" : 27, "tot_q2" : 34 } { "_id" : "NY", "tot_q1" : 16, "tot_q2" : 34 } { "_id" : "MA", "tot_q1" : 53, "tot_q2" : 34 }

db.collection.aggregate([ {$facet: { "byRegion": [ {$group: {_id: "$region", tot_q1: {$sum:"$q1"}, tot_q2: {$sum:"$q2"}} } ], "byState": [ {$group: {_id: "$state", tot_q1: {$sum:"$q1"}, tot_q2: {$sum:"$q2"}} } ] }} ]); { "byRegion" : [ {"_id" : "B","tot_q1" : 26,"tot_q2" : 68}, {"_id" : "A","tot_q1" : 148,"tot_q2" : 102} ], "byState" : [ {"_id" : "NJ","tot_q1" : 10,"tot_q2" : 34}, {"_id" : "CT","tot_q1" : 68,"tot_q2" : 34}, {"_id" : "NH","tot_q1" : 27,"tot_q2" : 34}, {"_id" : "NY","tot_q1" : 16,"tot_q2" : 34}, {"_id" : "MA","tot_q1" : 53,"tot_q2" : 34} ] }

db.collection.aggregate([{$match: {"year": 2002}}, // only get 2002{$facet: { "byRegion": [ {$group: {_id: "$region", tot_q1: {$sum:"$q1"}, tot_q2: {$sum:"$q2"}} } ], "byState": [ {$group: {_id: "$state", tot_q1: {$sum:"$q1"}, tot_q2: {$sum:"$q2"}} } ] }} ]); { "byRegion" : [ {"_id" : "B","tot_q1" : 17,"tot_q2" : 64}, {"_id" : "A","tot_q1" : 140,"tot_q2" : 96} ], "byState" : [ {"_id" : "NJ","tot_q1" : 9,"tot_q2" : 32}, {"_id" : "CT","tot_q1" : 67,"tot_q2" : 32}, {"_id" : "NH","tot_q1" : 21,"tot_q2" : 32}, {"_id" : "NY","tot_q1" : 8,"tot_q2" : 32}, {"_id" : "MA","tot_q1" : 52,"tot_q2" : 32} ] }

db.collection.aggregate([ {$facet: { "byRegion": [ {$group: {_id: "$region", tot_q1: {$sum:"$q1"}, tot_q2: {$sum:"$q2"}} } ], "byRegionYear": [ {$group: {_id: {region:"$region",year:"$year"}, tot_q1: {$sum:"$q1"}, tot_q2: {$sum:"$q2"}} } ], "byRegionState": [ {$group: {_id: {region:"$region",year:"$state"}, tot_q1: {$sum:"$q1"}, tot_q2: {$sum:"$q2"}} } ], //... }} ]);

db.collection.aggregate([ {$facet: { "byRegion": [ {$group: {_id: "$region",x: {$push: {q1:"$q1", q2:"$q2", state:"$state",year:"$year"}}, tot_q1: {$sum:"$q1"}, tot_q2: {$sum:"$q2"}} } ], "byRegionYear": [ {$group: {_id: {region:"$region",year:"$year"},x: {$push: {q1:"$q1", q2:"$q2", state:"$state"}}, tot_q1: {$sum:"$q1"}, tot_q2: {$sum:"$q2"}} } ], "byRegionState": [ {$group: {_id: {region:"$region",year:"$state"},x: {$push: {q1:"$q1", q2:"$q2", year:"$year"}}, tot_q1: {$sum:"$q1"}, tot_q2: {$sum:"$q2"}} } ], //... }} ]); "byRegion" : [ { "_id" : "B", "x" : [ {"state" : "NY","year" : 2001,"q1" : 8,"q2" : 2 }, {"state" : "NJ","year" : 2001,"q1" : 1,"q2" : 2 }, {"state" : "NY","year" : 2002,"q1" : 8,"q2" : 32}, {"state" : "NJ","year" : 2002,"q1" : 9,"q2" : 32} ], "tot_q1" : 26, "tot_q2" : 68 }, // ...

- Each discrete combination of dimensions (e.g. region, region+year, state+year, etc.) becomes the special
`_id`field. - The "diff" of each combination against the full set of dimension becomes those dimension fields that are pushed in addition to the static set of measures.
- Each aggregation gets the same static set of measures.

function getCombinations(items) { var result = []; var f = function(keyarr, items, lvl) { for (var i = 0; i < items.length; i++) { var nka = keyarr.concat(items[i]); result.push(nka); f(nka, items.slice(i + 1), lvl+1); } } f([], items, 0); return result; } function createPivotFacets(dims, measures) { var fcts = {}; // This is where we build the $project stage to turn // sum_q1: nnn // avg_q1: nnn // max_q1: nnn // into // q1: { sum:nnn, avg:nnn, ... } // Only need to do this once; the $project is the same for all // variations of $group. var prj = {_id:1, _x:1, _n:1}; measures.forEach(function(m) { var z = {}; ["sum","avg","max","min"].forEach(function(f) { z[f] = "$"+f+"_"+m; prj[m] = z; }) }); var combinations = getCombinations(dims); for (var i = 0; i < combinations.length; i++) { var karr = combinations[i]; var parr = dims.filter(function(i) {return karr.indexOf(i) < 0;}); var grp = {}; var idd = {}; karr.forEach(function(k) { idd[k] = "$" + k; }); grp['_id'] = idd; if(parr.length > 0) { var pdd = {}; measures.forEach(function(m) { pdd[m] = "$"+m }); parr.forEach(function(k) { pdd[k] = "$" + k; }); grp['_x'] = {"$push": pdd}; } grp['_n'] = {"$sum":1}; // handy but basically {$size: "$_x"} measures.forEach(function(m) { ["sum","avg","max","min"].forEach(function(f) { var z = {}; z["$"+f] = "$"+m; grp[f+"_"+m] = z; }); }); fcts[karr.join(':')] = [ {$group:grp}, {$project:prj} ]; } return {"$facet":fcts}; } pipe = [ createPivotFacets(["region","year","state"], ["q1","q2","q3"]) ]; db.collection.aggregate(pipe); { "region" : [ { "_id" : { "region" : "B" }, "_x" : [ {"q1" : 8,"q2" : 2,"q3" : 3,"year" : 2001,"state" : "NY"}, {"q1" : 1,"q2" : 2,"q3" : 3,"year" : 2001,"state" : "NJ"}, {"q1" : 8,"q2" : 32,"q3" : 43,"year" : 2002,"state" : "NY"}, {"q1" : 9,"q2" : 32,"q3" : 43,"year" : 2002,"state" : "NJ"} ], "_n" : 4, "q1" : {"sum" : 26,"avg" : 6.5,"max" : 9,"min" : 1}, "q2" : {"sum" : 68,"avg" : 17,"max" : 32,"min" : 2}, "q3" : {"sum" : 92,"avg" : 23,"max" : 43,"min" : 3} }, { "_id" : { "region" : "A" }, "_x" : [ {"q1" : 1,"q2" : 2,"q3" : 3,"year" : 2001,"state" : "MA"}, {"q1" : 6,"q2" : 2,"q3" : 3,"year" : 2001,"state" : "NH"}, {"q1" : 1,"q2" : 2,"q3" : 3,"year" : 2001,"state" : "CT"}, {"q1" : 52,"q2" : 32,"q3" : 43,"year" : 2002,"state" : "MA"}, {"q1" : 21,"q2" : 32,"q3" : 43,"year" : 2002,"state" : "NH"}, {"q1" : 67,"q2" : 32,"q3" : 43,"year" : 2002,"state" : "CT"} ], "_n" : 6, "q1" : {"sum" : 148,"avg" : 24.666666666666668,"max" : 67,"min" : 1}, "q2" : {"sum" : 102,"avg" : 17,"max" : 32,"min" : 2}, "q3" : {"sum" : 138,"avg" : 23,"max" : 43,"min" : 3} } ], "region:year" : [ { "_id" : { "region" : "B", "year" : 2002 }, "_x" : [ {"q1" : 8,"q2" : 32,"q3" : 43,"state" : "NY"}, {"q1" : 9,"q2" : 32,"q3" : 43,"state" : "NJ"} ], "_n" : 2, "q1" : {"sum" : 17,"avg" : 8.5,"max" : 9,"min" : 8}, "q2" : {"sum" : 64,"avg" : 32,"max" : 32,"min" : 32}, "q3" : {"sum" : 86,"avg" : 43,"max" : 43,"min" : 43} }, { "_id" : { "region" : "A", "year" : 2002 }, "_x" : [ {"q1" : 52,"q2" : 32,"q3" : 43,"state" : "MA"}, {"q1" : 21,"q2" : 32,"q3" : 43,"state" : "NH"}, {"q1" : 67,"q2" : 32,"q3" : 43,"state" : "CT"} ], "_n" : 3, "q1" : {"sum" : 140,"avg" : 46.666666666666664,"max" : 67,"min" : 21}, "q2" : {"sum" : 96,"avg" : 32,"max" : 32,"min" : 32}, "q3" : {"sum" : 129,"avg" : 43,"max" : 43,"min" : 43} }, // ...

- Obviously, this is
*much*easier and more reliable than handcoding the full`$facet`expression. The number of combinations grows exponentially (`2`) and this makes manual editing of the facet groups impossible on a practical basis.^{n}-1 - Each subgroup ("pivot") is named by using a colon-separated join of the dimension names.
- Each measure has a substructure containing the aggregations. You can always
use dot notation to access them e.g.
`{$match: {"q2.sum": {$gt: 100}}}` - The
`_x`field carries the individual items for each aggregation. - As an optimization, the combination which is
*all*the dimensions does not create an`_x`field with the individual items because:- The
`_id`field contains all relevant dimension values. - By definition there can only be one item with a unique set of dimension values so the
`_x`is redundant - All the measures aggregations are captured anyway

- The

There is room for improvement here but it is left as an exercise to the reader to experiment. Suggestions:

- Option to suppress capturing detail in the
`_x`array because that array might become very very large. - Passing in a set of aggregation functions to use instead of hardcoding sum, avg, min, and max. As a stretch beyond this, the ability to pass in an aggregation fragment.
- Config options for naming of
`$facet`name output instead of colon- separated, agg measures names, etc.

Like this? Dislike this? Let me know