import pandas as pd
import numpy as np
# Demo frame: two label columns (key1, key2) plus two columns of random
# integers drawn from [1, 10).
df = pd.DataFrame(
    {
        "key1": ["a", "a", "b", "b", "a"],
        "key2": ["one", "two", "one", "two", "one"],
        "data1": np.random.randint(1, 10, 5),
        "data2": np.random.randint(1, 10, 5),
    }
)
df
|   | data1 | data2 | key1 | key2 |
|---|---|---|---|---|
| 0 | 3 | 3 | a | one |
| 1 | 1 | 4 | a | two |
| 2 | 9 | 4 | b | one |
| 3 | 4 | 4 | b | two |
| 4 | 7 | 4 | a | one |
# Mean of data1 within each key1 group; the grouping key is passed as a Series.
df["data1"].groupby(by=df["key1"]).mean()
key1
a 3.666667
b 6.500000
Name: data1, dtype: float64
# A plain list with one label per row can also serve as the grouping key.
key = [1, 2, 1, 2, 3]
df["data1"].groupby(by=key).mean()
1 6.0
2 2.5
3 7.0
Name: data1, dtype: float64
# Total of data1 for every (key1, key2) combination.
df["data1"].groupby(by=[df["key1"], df["key2"]]).sum()
key1 key2
a one 10
two 1
b one 9
two 4
Name: data1, dtype: int32
# Number of rows falling into every (key1, key2) combination.
df["data1"].groupby(by=[df["key1"], df["key2"]]).size()
key1 key2
a one 2
two 1
b one 1
two 1
dtype: int64
# Per-key1 totals of the numeric columns. numeric_only=True keeps the string
# column key2 out of the aggregation, so the result does not depend on how the
# installed pandas version treats non-numeric columns in sum().
df.groupby("key1").sum(numeric_only=True)
# Totals of data1 per (key1, key2) pair. The column is selected before sum()
# so that columns which would be discarded anyway are never aggregated.
# NOTE: despite its name, `mean` holds group sums; the name is kept because
# the following cell reads it.
mean = df.groupby(["key1", "key2"])["data1"].sum()
mean
key1 key2
a one 10
two 1
b one 9
two 4
Name: data1, dtype: int32
# Pivot the innermost index level (key2) of the grouped result into columns.
mean.unstack(level=-1)
# Iterating over the grouped frame yields (group label, sub-frame) pairs;
# print each label followed by the rows that belong to it.
for name, group in df.groupby("key1"):
    print(name)
    print(group)
a
data1 data2 key1 key2
0 3 3 a one
1 1 4 a two
4 7 4 a one
b
data1 data2 key1 key2
2 9 4 b one
3 4 4 b two
# Group the columns (axis=1) by their dtype and sum across each dtype group.
# NOTE(review): axis=1 grouping appears to be deprecated in newer pandas
# releases — confirm against the installed version before relying on it.
df.groupby(by=df.dtypes, axis=1).sum()
|   | int32 | object |
|---|---|---|
| 0 | 6 | aone |
| 1 | 5 | atwo |
| 2 | 13 | bone |
| 3 | 8 | btwo |
| 4 | 11 | aone |
# 5x5 frame of random integers in [1, 10), rows labelled by name and columns
# labelled "a".."e".
df = pd.DataFrame(
    np.random.randint(1, 10, (5, 5)),
    columns=["a", "b", "c", "d", "e"],
    index=["Alice", "Bob", "Candy", "Dark", "Emily"],
)
# NaN cannot be stored in an integer column, so make the two target columns
# float explicitly instead of relying on an implicit upcast during assignment.
df = df.astype({"b": float, "c": float})
# Blank out row 1 ("Bob"), columns 1 and 2 ("b", "c"). Positional .iloc
# replaces the removed .ix indexer, and np.nan replaces the removed np.NaN alias.
df.iloc[1, 1:3] = np.nan
df
|   | a | b | c | d | e |
|---|---|---|---|---|---|
| Alice | 4 | 4.0 | 5.0 | 4 | 6 |
| Bob | 6 | NaN | NaN | 8 | 1 |
| Candy | 7 | 9.0 | 6.0 | 8 | 8 |
| Dark | 3 | 6.0 | 1.0 | 3 | 9 |
| Emily | 2 | 3.0 | 6.0 | 9 | 1 |
# Column label -> colour. Columns that map to the same colour are aggregated
# together when the mapping is used as a column-wise grouping key.
mapping = {
    "a": "red",
    "b": "red",
    "c": "blue",
    "d": "orange",
    "e": "blue",
}
# NOTE(review): axis=1 grouping appears to be deprecated in newer pandas
# releases — confirm against the installed version before relying on it.
grouped = df.groupby(by=mapping, axis=1)
grouped.sum()
|   | blue | orange | red |
|---|---|---|---|
| Alice | 11.0 | 4.0 | 8.0 |
| Bob | 1.0 | 8.0 | 6.0 |
| Candy | 14.0 | 8.0 | 16.0 |
| Dark | 10.0 | 3.0 | 9.0 |
| Emily | 7.0 | 9.0 | 5.0 |
# Number of non-missing cells per colour group in each row (Bob's NaN cells
# are not counted, as the output below shows).
grouped.count()
|   | blue | orange | red |
|---|---|---|---|
| Alice | 2 | 1 | 2 |
| Bob | 1 | 1 | 1 |
| Candy | 2 | 1 | 2 |
| Dark | 2 | 1 | 2 |
| Emily | 2 | 1 | 2 |
# Fresh 5x5 frame of random integers in [1, 10); rows are labelled by name
# and columns by the letters "a".."e".
df = pd.DataFrame(
    np.random.randint(1, 10, (5, 5)),
    columns=["a", "b", "c", "d", "e"],
    index=["Alice", "Bob", "Candy", "Dark", "Emily"],
)
df
|   | a | b | c | d | e |
|---|---|---|---|---|---|
| Alice | 7 | 9 | 8 | 9 | 8 |
| Bob | 8 | 4 | 2 | 9 | 1 |
| Candy | 9 | 6 | 1 | 1 | 1 |
| Dark | 6 | 6 | 8 | 1 | 1 |
| Emily | 3 | 5 | 8 | 8 | 6 |
def _group_key(idx):
print(idx)
return(idx)
# A callable key is applied to each index label; its return value is the group.
df.groupby(by=_group_key).size()
Alice
Bob
Candy
Dark
Emily
Alice 1
Bob 1
Candy 1
Dark 1
Emily 1
dtype: int64
def _group_key(idx):
print(idx)
return len(idx)
# Group rows by the value _group_key returns for each index label, then count.
df.groupby(by=_group_key).size()
Alice
Bob
Candy
Dark
Emily
3 1
4 1
5 3
dtype: int64
# The builtin len works directly as the key function: rows are grouped by the
# length of their index label and counted.
df.groupby(by=len).size()
3 1
4 1
5 3
dtype: int64
# Same length-of-label grouping, this time summing every column per group.
df.groupby(by=len).sum()
|   | a | b | c | d | e |
|---|---|---|---|---|---|
| 3 | 8 | 4 | 2 | 9 | 1 |
| 4 | 6 | 6 | 8 | 1 | 1 |
| 5 | 19 | 20 | 17 | 18 | 15 |
# Two-level column index: an outer "country" level and an inner "index" level.
columns = pd.MultiIndex.from_arrays(
    [
        ["china", "usa", "china", "usa", "china"],
        ["A", "A", "B", "C", "B"],
    ],
    names=["country", "index"],
)
# 5x5 frame of random integers in [1, 10) carrying those hierarchical columns.
df = pd.DataFrame(np.random.randint(1, 10, (5, 5)), columns=columns)
df
| country | china | usa | china | usa | china |
|---|---|---|---|---|---|
| index | A | A | B | C | B |
| 0 | 6 | 5 | 5 | 5 | 9 |
| 1 | 9 | 5 | 2 | 9 | 2 |
| 2 | 1 | 6 | 5 | 9 | 2 |
| 3 | 8 | 9 | 1 | 3 | 3 |
| 4 | 2 | 2 | 7 | 9 | 6 |
# Sum the columns that share the same "country" level value.
# groupby(axis=1) is deprecated, so transpose, group the rows by that level,
# aggregate, and transpose back; for this all-integer frame the result is the
# same table.
df.T.groupby(level="country").sum().T
| country | china | usa |
|---|---|---|
| 0 | 20 | 10 |
| 1 | 13 | 14 |
| 2 | 8 | 15 |
| 3 | 12 | 12 |
| 4 | 15 | 11 |
# Sum the columns that share the same "index" level value.
# groupby(axis=1) is deprecated, so transpose, group the rows by that level,
# aggregate, and transpose back; for this all-integer frame the result is the
# same table.
df.T.groupby(level="index").sum().T
| index | A | B | C |
|---|---|---|---|
| 0 | 11 | 14 | 5 |
| 1 | 14 | 4 | 9 |
| 2 | 7 | 7 | 9 |
| 3 | 17 | 4 | 3 |
| 4 | 4 | 13 | 9 |