17-Oct-2016, revised 7-Jan-2017
This article covers two topics at a high level:
For Starters: What is pandas?
import pandas as pd
from pymongo import MongoClient
client = MongoClient()
db = client.sourceDatabase
df = pd.DataFrame(list(db.myCollection.find()))
db.myCollection.insert({"a":1, "b":2, "c":3})
db.myCollection.insert({"a":4, "b":5, "c":6})
_id a b c
0 5804e627119c0d2c13c0a7e1 1.0 2.0 3.0
1 5804e627119c0d2c13c0a7e2 4.0 5.0 6.0
db.myCollection.insert({"a":"hello", "b": datetime.datetime.now(), "c": {"d":"AA","e":"BB"}, "f": [2,3,4]})
db.myCollection.insert({"a":"goodbye", "c": {"e":"WW"}, "f": [0, "grimble", datetime.datetime.now(), 7.7 ]})
_id a b \
0 5804e758119c0d2c13c0a7e5 hello 2016-10-17 14:59:36.029
1 5804e758119c0d2c13c0a7e6 goodbye
c f
0 {u'e': u'BB', u'd': u'AA'} [2.0, 3.0, 4.0]
1 {u'e': u'WW'} [0.0, grimble, 2016-10-17 14:59:36.107000, 7.7]
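The nested sub-document in c arrives in the dataframe as a plain Python dict. If column-per-field access is preferred, pandas can flatten such structures; a minimal sketch, assuming pandas 1.0 or later where pd.json_normalize is available (older versions expose it as pandas.io.json.json_normalize):
# Flatten nested sub-documents into dotted column names such as c.d and c.e.
# Arrays like f are left alone in a single column.
docs = list(db.myCollection.find())
flat = pd.json_normalize(docs)
print(flat.columns.tolist())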
Although the 5-Liner highlights the ease of integration, it sidesteps an important issue: how best to deal with arrays in MongoDB.
values = [
[ 4, 5, 6, 7],
[ 7, 8, 9, 10],
[ 10, 11, 12]
]
print pd.DataFrame(values)
0 1 2 3
0 4 5 6 7.0
1 7 8 9 10.0
2 10 11 12 NaN
values = [
{ "a":4, "b":5, "c":6, "d":7},
{ "a":7, "b":8, "c":9, "d":10},
{ "b":11, "c":12 }
]
print pd.DataFrame(values)
a b c d
0 4.0 5 6 7.0
1 7.0 8 9 10.0
2 NaN 11 12 NaN
values = [
{ "a": [ 4,5,6,7]},
{ "a": [ 8,9,10] },
{ "a": [ 11, 12] }
]
print pd.DataFrame(values)
a
0 [4, 5, 6, 7]
1 [8, 9, 10]
2 [11, 12]
In MongoDB it is often very advantageous to store arrays of values. But the 5-Liner approach suffers from the same problem:
data = [
{ "a": [ 4,5,6,7]},
{ "a": [ 8,9,10] },
{ "a": [ 11, 12] }
]
db.myCollection.insert(data)
print pd.DataFrame(list(db.myCollection.find()))
_id a
0 58056115119c0d2c13c0a7e9 [4.0, 5.0, 6.0, 7.0]
1 58056115119c0d2c13c0a7ea [8.0, 9.0, 10.0]
2 58056115119c0d2c13c0a7eb [11.0, 12.0]
values = []
for cc in db.myCollection.find():
values.append(cc['a'])
print pd.DataFrame(values)
0 1 2 3
0 4.0 5.0 6.0 7.0
1 8.0 9.0 10.0 NaN
2 11.0 12.0 NaN NaN
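The same expansion can also be done without the explicit loop by handing the list column back to the DataFrame constructor; a minimal sketch, assuming every document carries an a array:
# Fetch once, then let pandas expand the list column 'a' into NaN-padded columns.
df0 = pd.DataFrame(list(db.myCollection.find()))
print(pd.DataFrame(df0['a'].tolist()))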
Each document will typically have some kind of label associated with the value array. Let's add that as field n:
data = [
{ "a": [ 4,5,6,7], "n": "foo"},
{ "a": [ 8,9,10] , "n": "bar"},
{ "a": [ 11, 12] , "n": "baz"}
]
db.myCollection.insert(data)
values = []
seriesLbls = []
for cc in db.myCollection.find():
values.append(cc['a'])
seriesLbls.append(cc['n'])
print pd.DataFrame(values, index=seriesLbls)
0 1 2 3
foo 4.0 5.0 6.0 7.0
bar 8.0 9.0 10.0 NaN
baz 11.0 12.0 NaN NaN
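With the labels installed as the index, rows can be selected by name instead of position; a small usage sketch using the dataframe just built:
df = pd.DataFrame(values, index=seriesLbls)
print(df.loc['bar'])           # the single series labeled "bar"
print(df.loc[['foo','baz']])   # a sub-frame of selected labels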
Datetimes also flow well from MongoDB into pandas for use as an index. Let's add a date to each record and use that instead of the name:
data = [
{ "a": [ 4,5,6,7], "n": "foo", "d": datetime.datetime(2016,7,1) },
{ "a": [ 8,9,10] , "n": "bar", "d": datetime.datetime(2016,8,1) },
{ "a": [ 11, 12] , "n": "baz", "d": datetime.datetime(2016,9,1) }
]
db.myCollection.insert(data)
values = []
dates = []
for cc in db.myCollection.find():
values.append(cc['a'])
dates.append(cc['d'])
didx = pd.DatetimeIndex(dates) # not strictly necessary
print pd.DataFrame(values, index=didx)
0 1 2 3
2016-07-01 4.0 5.0 6.0 7.0
2016-08-01 8.0 9.0 10.0 NaN
2016-09-01 11.0 12.0 NaN NaN
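A DatetimeIndex also enables label-based slicing by date; a small sketch using the dataframe just built:
df = pd.DataFrame(values, index=didx)
print(df.loc['2016-08-01':])         # rows on or after 1-Aug-2016
print(df.loc['2016-07':'2016-08'])   # partial-string slicing: July and August rows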
Finally, there are a few ways to craft column labels to make the dataframe nice and complete. Here we simply look for the longest array to drive the generation of column names C0 through Cn (via a list comprehension):
values = []
seriesLbls = []
max = 0
for cc in db.myCollection.find():
if len(cc['a']) > max:
max = len(cc['a'])
values.append(cc['a'])
seriesLbls.append(cc['n'])
df = pd.DataFrame(values, index=seriesLbls, columns=[ "C"+str(n) for n in range(max)])
print df
print df.T
C0 C1 C2 C3
foo 4.0 5.0 6.0 7.0
bar 8.0 9.0 10.0 NaN
baz 11.0 12.0 NaN NaN
foo bar baz
C0 4.0 8.0 11.0
C1 5.0 9.0 12.0
C2 6.0 10.0 NaN
C3 7.0 NaN NaN
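An alternative that avoids tracking the maximum length by hand is to let pandas size the frame first and then relabel the columns it created; a minimal sketch:
# pandas already pads to the longest array, so the column count is df.shape[1].
df = pd.DataFrame(values, index=seriesLbls)
df.columns = [ "C"+str(n) for n in range(df.shape[1]) ]
print(df)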
Arrays of arrays (e.g. a small matrix per document) round-trip just as easily:
values = [
{ "name":"A", "v": [ [1,2,3], [4,5,6], [7,8,9] ] }
,{ "name":"B", "v": [ [2,3,4], [5,6,4], [8,9,10] ] }
]
db.myCollection.drop()
db.myCollection.insert(values)
print pd.DataFrame(list(db.myCollection.find()))
_id name v
0 587115a8ed58db9467d94d34 A [[1, 2, 3], [4, 5, 6], [7, 8, 9]]
1 587115a8ed58db9467d94d35 B [[2, 3, 4], [5, 6, 4], [8, 9, 10]]
values = []
item = db.myCollection.find_one({"name":"B"})
for row in item['v']: # item['v'] is array of array, so row is array
values.append(row)
print pd.DataFrame(values)
0 1 2
0 2 3 4
1 5 6 4
2 8 9 10
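Since item['v'] is already a list of rows, the loop is optional; the matrix can go straight into the constructor:
item = db.myCollection.find_one({"name":"B"})
print(pd.DataFrame(item['v']))   # same 3x3 frame as above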
The examples above focused on efficiently moving arrays from MongoDB into dataframes but did not explore filtering and aggregation. Below is an example of how the powerful MongoDB aggregation framework can be brought into the picture. These filtering commands are executed in an index-optimized fashion on the server side, dramatically reducing the amount of material populating the dataframe and improving performance all around:
values = []
seriesLbls = []
max = 0
for cc in db.myCollection.aggregate([
# First, only get things after 2016-07-15. Typically this would be on an
# indexed field and will rapidly cut down the material to a fraction
# of what might be stored in the DB. "First cut" filtering on dates,
# portfolios, owners, compute run ids, etc. is a very important and useful
# capability.
#
{"$match": {"d": {"$gt": datetime.datetime(2016,7,15)}}}
# Next, compute stdDevPop of the array. MongoDB offers powerful array
# handling functions:
,{"$addFields": {"sdev": {"$stdDevPop": "$a"}}}
# Next, only permit those items where stdDevPop is <.75 to come through:
,{"$match": {"sdev": {"$lt": 0.75}}}
]):
if len(cc['a']) > max:
max = len(cc['a'])
values.append(cc['a'])
seriesLbls.append(cc['n'])
df = pd.DataFrame(values, index=seriesLbls, columns=[ "C"+str(n) for n in range(max)])
print df
C0 C1
baz 11 12
If that final $match on sdev is relaxed (commented out below), bar now flows through as well:
# Next, only permit those items where stdDevPop is <.75 to come through:
# ,{"$match": {"sdev": {"$lt": 0.75}}} # commented out
C0 C1 C2
bar 8 9 10.0
baz 11 12 NaN
The values do not even have to be stored as arrays to begin with; scalars captured one per document can be grouped into arrays on the server side with $group and $push. Consider this data:
data = [
{ "v": 4, "n": "foo", "d": datetime.datetime(2016,7,1) },
{ "v": 5, "n": "foo", "d": datetime.datetime(2016,7,1) },
{ "v": 6, "n": "foo", "d": datetime.datetime(2016,7,1) },
{ "v": 7, "n": "foo", "d": datetime.datetime(2016,7,1) },
{ "v": 8, "n": "bar", "d": datetime.datetime(2016,8,1) },
{ "v": 9, "n": "bar", "d": datetime.datetime(2016,8,1) },
{ "v": 10, "n": "bar", "d": datetime.datetime(2016,8,1) },
{ "v": 11, "n": "baz", "d": datetime.datetime(2016,9,1) },
{ "v": 12, "n": "baz", "d": datetime.datetime(2016,9,1) }
]
db.myCollection.insert(data)
values = []
seriesLbls = []
max = 0
for cc in db.myCollection.aggregate([
{"$group": {"_id": "$n", "a": {"$push": "$v"}}}
]):
print cc
if len(cc['a']) > max:
max = len(cc['a'])
values.append(cc['a'])
seriesLbls.append(cc['_id'])
print pd.DataFrame(values, index=seriesLbls, columns=[ "C"+str(n) for n in range(max)])
C0 C1 C2 C3
baz 11 12 NaN NaN
bar 8 9 10.0 NaN
foo 4 5 6.0 7.0
Of course, we can add in the filtering from the previous example before the $group operator to reduce the amount of material being passed to $group and later, the filter for $stdDevPop:
for cc in db.myCollection.aggregate([
{"$match": {"d": {"$gt": datetime.datetime(2016,7,15)}}}
,{"$group": {"_id": "$n", "a": {"$push": "$v"}}}
,{"$addFields": {"sdev": {"$stdDevPop": "$a"}}}
,{"$match": {"sdev": {"$lt": 0.75}}}
]):
...
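Putting the pieces together, a sketch of the complete loop (assuming the same collection and fields as above, and MongoDB 3.4+ for $addFields) might look like this:
values = []
seriesLbls = []
maxlen = 0
for cc in db.myCollection.aggregate([
     # cut down the material server-side before grouping
     {"$match": {"d": {"$gt": datetime.datetime(2016,7,15)}}}
     # build one array per label from the scalar v values
    ,{"$group": {"_id": "$n", "a": {"$push": "$v"}}}
     # compute the population standard deviation of each array
    ,{"$addFields": {"sdev": {"$stdDevPop": "$a"}}}
     # keep only the low-variance groups
    ,{"$match": {"sdev": {"$lt": 0.75}}}
    ]):
    if len(cc['a']) > maxlen:
        maxlen = len(cc['a'])
    values.append(cc['a'])
    seriesLbls.append(cc['_id'])
df = pd.DataFrame(values, index=seriesLbls, columns=[ "C"+str(n) for n in range(maxlen) ])
print(df)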
Finally, a note on memory. The one-shot approach
df = pd.DataFrame(list(db.myCollection.find()))
materializes every record and the full dataframe in memory at the same time. For very large result sets, the cursor can instead be consumed in chunks and the resulting small frames concatenated:
def iterator2dataframes(iterator, chunk_size: int):
"""Turn an iterator into multiple small pandas.DataFrame
This is a balance between memory and efficiency
"""
records = []
frames = []
for i, record in enumerate(iterator):
records.append(record)
if i % chunk_size == chunk_size - 1:
frames.append(pd.DataFrame(records))
records = []
if records:
frames.append(pd.DataFrame(records))
return pd.concat(frames)
df = iterator2dataframes(db.myCollection.find(), 10000)
Like this? Dislike this? Let me know