An Approach for Creating Pivot Tables with MongoDB

27-Sep-2018 Like this? Dislike this? Let me know

A Microsoft Excel Pivot table is as much -- if not more -- of a GUI control interface as it is a dimensional data aggregation engine. It's very easy to move and format rows and columns to present the rolled up data just the way you want.

MongoDB, of course, is just a database -- no specialized "pivoting GUI." But very often we want to extract the same kind of data from the database, and to do so efficiently. This article demonstrates the use of the powerful $facet operator and less than 60 lines of javascript to programmatically create an aggregation framework pipeline that does the heavy lifting to yield pivot-table-esque output.

A Brief Overview of $facet

The $facet function, introduced in version 3.4, can basically be considered a "multigrouping" operator. It allows you to pass one or more named "subpipelines" that are optimized and executed at once on the server side. Consider this set of data:
 {region: "A", state: "MA", year: 2001, q1:1, q2:2, q3:3, q4:4}
 ,{region: "A", state: "NH", year: 2001, q1:6, q2:2, q3:3, q4:4}
 ,{region: "A", state: "CT", year: 2001, q1:1, q2:2, q3:3, q4:4}
 ,{region: "B", state: "NY", year: 2001, q1:8, q2:2, q3:3, q4:4}
 ,{region: "B", state: "NJ", year: 2001, q1:1, q2:2, q3:3, q4:4}

 ,{region: "A", state: "MA", year: 2002, q1:52, q2:32, q3:43, q4:54}
 ,{region: "A", state: "NH", year: 2002, q1:21, q2:32, q3:43, q4:54}
 ,{region: "A", state: "CT", year: 2002, q1:67, q2:32, q3:43, q4:54}
 ,{region: "B", state: "NY", year: 2002, q1:8, q2:32, q3:43, q4:54}
 ,{region: "B", state: "NJ", year: 2002, q1:9, q2:32, q3:43, q4:54}
We wish to aggregate (sum) measures q1 and q2 by dimensions region and then separately by state (for example). Before $facet, it was most straightforward simply to issue two separate agg pipelines:
db.collection.aggregate([
    {$group: {_id: "$region", tot_q1: {$sum:"$q1"}, tot_q2: {$sum:"$q2"}} }
]}

{ "_id" : "B", "tot_q1" : 26, "tot_q2" : 68 }
{ "_id" : "A", "tot_q1" : 148, "tot_q2" : 102 }

db.collection.aggregate([
    {$group: {_id: "$state", tot_q1: {$sum:"$q1"}, tot_q2: {$sum:"$q2"}} }
]}

{ "_id" : "NJ", "tot_q1" : 10, "tot_q2" : 34 }
{ "_id" : "CT", "tot_q1" : 68, "tot_q2" : 34 }
{ "_id" : "NH", "tot_q1" : 27, "tot_q2" : 34 }
{ "_id" : "NY", "tot_q1" : 16, "tot_q2" : 34 }
{ "_id" : "MA", "tot_q1" : 53, "tot_q2" : 34 }
$facet allows us to do this in one shot:
db.collection.aggregate([
{$facet: {
	"byRegion": [
{$group: {_id: "$region", tot_q1: {$sum:"$q1"}, tot_q2: {$sum:"$q2"}} }
		     ],
	"byState": [
{$group: {_id: "$state", tot_q1: {$sum:"$q1"}, tot_q2: {$sum:"$q2"}} }
		     ]
    }}
		      ]);

{
	"byRegion" : [
		{"_id" : "B","tot_q1" : 26,"tot_q2" : 68},
		{"_id" : "A","tot_q1" : 148,"tot_q2" : 102}
	],
	"byState" : [
		{"_id" : "NJ","tot_q1" : 10,"tot_q2" : 34},
		{"_id" : "CT","tot_q1" : 68,"tot_q2" : 34},
		{"_id" : "NH","tot_q1" : 27,"tot_q2" : 34},
		{"_id" : "NY","tot_q1" : 16,"tot_q2" : 34},
		{"_id" : "MA","tot_q1" : 53,"tot_q2" : 34}
	]
}
As you can see, each named element of the $facet object has its own "subpipeline." With certain restrictions, you may construct any arbitrary subpipeline including operators like $lookup, $addFields, and $bucket. Of course, you can combine $facet with other operators as appropriate. The example above perfoms aggregations across both years 2001 and 2002. If just year 2002 was required, then this pipeline could do the job:
db.collection.aggregate([
{$match: {"year": 2002}},  // only get 2002 
{$facet: {
	"byRegion": [
{$group: {_id: "$region", tot_q1: {$sum:"$q1"}, tot_q2: {$sum:"$q2"}} }
		     ],
	"byState": [
{$group: {_id: "$state", tot_q1: {$sum:"$q1"}, tot_q2: {$sum:"$q2"}} }
		     ]
    }}
		      ]);

{
	"byRegion" : [
		{"_id" : "B","tot_q1" : 17,"tot_q2" : 64},
		{"_id" : "A","tot_q1" : 140,"tot_q2" : 96}
	],
	"byState" : [
		{"_id" : "NJ","tot_q1" : 9,"tot_q2" : 32},
		{"_id" : "CT","tot_q1" : 67,"tot_q2" : 32},
		{"_id" : "NH","tot_q1" : 21,"tot_q2" : 32},
		{"_id" : "NY","tot_q1" : 8,"tot_q2" : 32},
		{"_id" : "MA","tot_q1" : 52,"tot_q2" : 32}
	]
}

$facet: Simple but ... tedious

The challenge arises when you want to perform multilevel aggregations. It is not a question of performance but rather clarity and simplicity. Our example above has two single dimension aggregations, region and state. If we really wanted to "pivot" on, say, region, state, and year, then seven multidimensional aggs appear: [region], [region,year], [region,year,state], [region,state], [year], [year,state], and [state]. This is partially demonstrated here:
db.collection.aggregate([
{$facet: {
	"byRegion": [
{$group: {_id: "$region", tot_q1: {$sum:"$q1"}, tot_q2: {$sum:"$q2"}} }
		     ],
	"byRegionYear": [
{$group: {_id: {region:"$region",year:"$year"}, tot_q1: {$sum:"$q1"}, tot_q2: {$sum:"$q2"}} }
		     ],
	"byRegionState": [
{$group: {_id: {region:"$region",year:"$state"}, tot_q1: {$sum:"$q1"}, tot_q2: {$sum:"$q2"}} }
		     ],
//...

    }}
		      ]);
Clearly, this becomes tedious to set up. But we've only touched upon the end result of each aggregation. What if we want to know the detail behind each? This can be done by using $push to capture the items going into each agg:
db.collection.aggregate([
{$facet: {
	"byRegion": [
{$group: {_id: "$region", x: {$push: {q1:"$q1", q2:"$q2", state:"$state",year:"$year"}}, tot_q1: {$sum:"$q1"}, tot_q2: {$sum:"$q2"}} }
		     ],
	"byRegionYear": [
{$group: {_id: {region:"$region",year:"$year"}, x: {$push: {q1:"$q1", q2:"$q2", state:"$state"}}, tot_q1: {$sum:"$q1"}, tot_q2: {$sum:"$q2"}} }
		     ],
	"byRegionState": [
{$group: {_id: {region:"$region",year:"$state"}, x: {$push: {q1:"$q1", q2:"$q2", year:"$year"}}, tot_q1: {$sum:"$q1"}, tot_q2: {$sum:"$q2"}} }
		     ],
//...

    }}
		      ]);

"byRegion" : [
	{
		"_id" : "B",
		"x" : [
			{"state" : "NY","year" : 2001,"q1" : 8,"q2" : 2	},
			{"state" : "NJ","year" : 2001,"q1" : 1,"q2" : 2	},
			{"state" : "NY","year" : 2002,"q1" : 8,"q2" : 32},
			{"state" : "NJ","year" : 2002,"q1" : 9,"q2" : 32}
		],
		"tot_q1" : 26,
		"tot_q2" : 68
	},
        // ...
But this is now becoming a headache. And we only did $sum. What if wanted $avg, $max, and $min?

Javascript to the rescue

Fortunately, a strong pattern emerges here in the setup of the $group operator: It should therefore be straightforward to programmatically generate the $facet inputs and here is a solution:
function getCombinations(items) {
    var result = [];
    var f = function(keyarr, items, lvl) {
	for (var i = 0; i < items.length; i++) {
	    var nka = keyarr.concat(items[i]);
	    result.push(nka);
	    f(nka, items.slice(i + 1), lvl+1);
	}
    }
    f([], items, 0);
    return result;
}

function createPivotFacets(dims, measures) {
    var fcts = {};

    // This is where we build the $project stage to turn
    //   sum_q1: nnn
    //   avg_q1: nnn
    //   max_q1: nnn
    // into
    //   q1: { sum:nnn, avg:nnn, ... }
    // Only need to do this once; the $project is the same for all
    // variations of $group.			    
    var prj = {_id:1, _x:1, _n:1};
    measures.forEach(function(m) {
	    var z = {};
	    ["sum","avg","max","min"].forEach(function(f) { 	    
		    z[f] = "$"+f+"_"+m;
		    prj[m] = z;
		})
	});

    var combinations = getCombinations(dims);

    for (var i = 0; i < combinations.length; i++) {
	var karr = combinations[i];
	var parr = dims.filter(function(i) {return karr.indexOf(i) < 0;});

	var grp = {};

	var idd = {};
	karr.forEach(function(k) { idd[k] = "$" + k; });
	grp['_id'] = idd;

	if(parr.length > 0) {
	    var pdd = {};
	    measures.forEach(function(m) { pdd[m] = "$"+m });

	    parr.forEach(function(k) { pdd[k] = "$" + k; });
	    grp['_x'] = {"$push": pdd};
	}

	grp['_n'] = {"$sum":1}; // handy but basically {$size: "$_x"}

	measures.forEach(function(m) {
		["sum","avg","max","min"].forEach(function(f) { 
			var z = {};
			z["$"+f] = "$"+m;
			grp[f+"_"+m] = z;
		    });
	    });

	fcts[karr.join(':')] = [ {$group:grp}, {$project:prj} ];
    }

    return {"$facet":fcts};
}

pipe = [
	createPivotFacets(["region","year","state"], ["q1","q2","q3"])
	];
db.collection.aggregate(pipe);

{
  "region" : [
    {
      "_id" : { "region" : "B" },
      "_x" : [
        {"q1" : 8,"q2" : 2,"q3" : 3,"year" : 2001,"state" : "NY"},
        {"q1" : 1,"q2" : 2,"q3" : 3,"year" : 2001,"state" : "NJ"},
        {"q1" : 8,"q2" : 32,"q3" : 43,"year" : 2002,"state" : "NY"},
        {"q1" : 9,"q2" : 32,"q3" : 43,"year" : 2002,"state" : "NJ"}
      ],
      "_n" : 4,
      "q1" : {"sum" : 26,"avg" : 6.5,"max" : 9,"min" : 1},
      "q2" : {"sum" : 68,"avg" : 17,"max" : 32,"min" : 2},
      "q3" : {"sum" : 92,"avg" : 23,"max" : 43,"min" : 3}
    },
    {
      "_id" : { "region" : "A" },
      "_x" : [
        {"q1" : 1,"q2" : 2,"q3" : 3,"year" : 2001,"state" : "MA"},
        {"q1" : 6,"q2" : 2,"q3" : 3,"year" : 2001,"state" : "NH"},
        {"q1" : 1,"q2" : 2,"q3" : 3,"year" : 2001,"state" : "CT"},
        {"q1" : 52,"q2" : 32,"q3" : 43,"year" : 2002,"state" : "MA"},
        {"q1" : 21,"q2" : 32,"q3" : 43,"year" : 2002,"state" : "NH"},
        {"q1" : 67,"q2" : 32,"q3" : 43,"year" : 2002,"state" : "CT"}
      ],
      "_n" : 6,
      "q1" : {"sum" : 148,"avg" : 24.666666666666668,"max" : 67,"min" : 1},
      "q2" : {"sum" : 102,"avg" : 17,"max" : 32,"min" : 2},
      "q3" : {"sum" : 138,"avg" : 23,"max" : 43,"min" : 3}
    }
  ],
  "region:year" : [
    {
      "_id" : { "region" : "B", "year" : 2002 },
      "_x" : [
        {"q1" : 8,"q2" : 32,"q3" : 43,"state" : "NY"},
        {"q1" : 9,"q2" : 32,"q3" : 43,"state" : "NJ"}
      ],
      "_n" : 2,
      "q1" : {"sum" : 17,"avg" : 8.5,"max" : 9,"min" : 8},
      "q2" : {"sum" : 64,"avg" : 32,"max" : 32,"min" : 32},
      "q3" : {"sum" : 86,"avg" : 43,"max" : 43,"min" : 43}
    },
    {
      "_id" : { "region" : "A", "year" : 2002 },
      "_x" : [
        {"q1" : 52,"q2" : 32,"q3" : 43,"state" : "MA"},
        {"q1" : 21,"q2" : 32,"q3" : 43,"state" : "NH"},
        {"q1" : 67,"q2" : 32,"q3" : 43,"state" : "CT"}
      ],
      "_n" : 3,
      "q1" : {"sum" : 140,"avg" : 46.666666666666664,"max" : 67,"min" : 21},
      "q2" : {"sum" : 96,"avg" : 32,"max" : 32,"min" : 32},
      "q3" : {"sum" : 129,"avg" : 43,"max" : 43,"min" : 43}
    },
    // ... 

Things to note:

There is room for improvement here but it is left as an exercise to the reader to experiment. Suggestions:

  1. Option to suppress capturing detail in the _x array because that array might become very very large.
  2. Passing in a set of aggregation functions to use instead of hardcoding sum, avg, min, and max. As a stretch beyond this, the ability to pass in an aggregation fragment.
  3. Config options for naming of $facet name output instead of colon- separated, agg measures names, etc.

Like this? Dislike this? Let me know


Site copyright © 2014-2018 Buzz Moschetti. All rights reserved