Pandas数据规整
数据分析和建模方面的大量编程工作都是用在数据准备上的,有时候存放在文件或数据库中的数据并不能满足数据处理应用的要求
Pandas提供了一组高级的、灵活的、高效的核心函数和算法,它们能够轻松地将数据规整化为你需要的形式
合并
连接
Pandas提供了大量方法,能轻松地对Series和DataFrame(以及旧版本中的Panel,该类型已在pandas 0.25中移除)执行合并操作
连接pandas对象 .concat()
import numpy as np
import pandas as pd
# Build a 10-row, 4-column frame of standard-normal samples (default integer
# row and column labels), then preview its first five rows.
df = pd.DataFrame(data=np.random.randn(10, 4))
df.head(n=5)
|   | 0 | 1 | 2 | 3 |
|---|---|---|---|---|
| 0 | 0.231308 | 1.193636 | -0.033288 | 0.826399 |
| 1 | -0.421474 | -0.618510 | -1.266325 | -0.439435 |
| 2 | -0.279457 | 0.578144 | 1.131353 | -0.639720 |
| 3 | -1.197750 | -0.446579 | 0.495728 | 0.900704 |
| 4 | -0.638926 | -0.233019 | -1.106248 | -0.762133 |
# Cut the frame into three row windows: rows 0-1, rows 3-4, and row 7 onwards.
# Rows 2, 5 and 6 are deliberately left out.
pieces = [df[window] for window in (slice(None, 2), slice(3, 5), slice(7, None))]
pieces
[ 0 1 2 3
0 0.231308 1.193636 -0.033288 0.826399
1 -0.421474 -0.618510 -1.266325 -0.439435,
0 1 2 3
3 -1.197750 -0.446579 0.495728 0.900704
4 -0.638926 -0.233019 -1.106248 -0.762133,
0 1 2 3
7 -0.265515 -0.705797 0.695531 -0.257374
8 0.552615 -0.137180 0.859215 -0.853752
9 -1.014105 0.392409 -1.832748 0.612679]
# Stack the row windows back into a single frame along the row axis; the
# original row labels of each piece are kept.
df2 = pd.concat(objs=pieces, axis=0)
df2
|   | 0 | 1 | 2 | 3 |
|---|---|---|---|---|
| 0 | 0.231308 | 1.193636 | -0.033288 | 0.826399 |
| 1 | -0.421474 | -0.618510 | -1.266325 | -0.439435 |
| 3 | -1.197750 | -0.446579 | 0.495728 | 0.900704 |
| 4 | -0.638926 | -0.233019 | -1.106248 | -0.762133 |
| 7 | -0.265515 | -0.705797 | 0.695531 | -0.257374 |
| 8 | 0.552615 | -0.137180 | 0.859215 | -0.853752 |
| 9 | -1.014105 | 0.392409 | -1.832748 | 0.612679 |
追加 .append()(注:DataFrame.append 已在 pandas 1.4 中弃用,并在 pandas 2.0 中移除;新版本请使用 pd.concat 实现相同的追加效果)
# A 4x4 frame of standard-normal samples with columns labelled A through D.
df = pd.DataFrame(
    data=np.random.randn(4, 4),
    columns=['A', 'B', 'C', 'D'],
)
df
|   | A | B | C | D |
|---|---|---|---|---|
| 0 | 1.295901 | -0.742636 | 0.873728 | -0.810075 |
| 1 | 1.073456 | 0.344627 | 0.156597 | 1.460616 |
| 2 | 1.696282 | -1.272457 | 1.226460 | -1.944458 |
| 3 | -0.473047 | 0.147528 | -0.538231 | 0.125467 |
# Pull out the third row (position 2) with all of its columns as a Series.
s = df.iloc[2, :]
s
A 1.696282
B -1.272457
C 1.226460
D -1.944458
Name: 2, dtype: float64
# DataFrame.append was deprecated in pandas 1.4 and removed in pandas 2.0, so
# pd.concat is used instead. The Series is first turned into a one-row frame
# (to_frame().T) so that its labels A-D line up with the columns and it is
# added as a new row; ignore_index=True renumbers the rows 0..4.
pd.concat([df, s.to_frame().T], ignore_index=True)
|   | A | B | C | D |
|---|---|---|---|---|
| 0 | 1.295901 | -0.742636 | 0.873728 | -0.810075 |
| 1 | 1.073456 | 0.344627 | 0.156597 | 1.460616 |
| 2 | 1.696282 | -1.272457 | 1.226460 | -1.944458 |
| 3 | -0.473047 | 0.147528 | -0.538231 | 0.125467 |
| 4 | 1.696282 | -1.272457 | 1.226460 | -1.944458 |
分组
groupby()
:一般指以下一个或多个操作步骤
- Splitting 将数据分组
- Applying 对每个分组独立地应用function
- Combining 使用某种数据结构展示结果
# Demo frame for the group-by examples: two string key columns (A, B) and two
# columns of standard-normal samples (C, D), eight rows in total.
df = pd.DataFrame(dict(
    A=['foo', 'bar', 'foo', 'bar', 'foo', 'bar', 'foo', 'foo'],
    B=['one', 'one', 'two', 'three', 'two', 'two', 'one', 'three'],
    C=np.random.randn(8),
    D=np.random.randn(8),
))
df
|   | A | B | C | D |
|---|---|---|---|---|
| 0 | foo | one | 0.556699 | 1.543716 |
| 1 | bar | one | -0.905349 | -0.054870 |
| 2 | foo | two | 1.220397 | -0.589706 |
| 3 | bar | three | 0.637305 | -0.046351 |
| 4 | foo | two | -0.150553 | -0.889157 |
| 5 | bar | two | -0.771132 | 0.196547 |
| 6 | foo | one | 0.008275 | -0.571672 |
| 7 | foo | three | 0.228275 | -1.164593 |
# Sum the numeric columns (C, D) within each group of A; the group key becomes
# the index. numeric_only=True is passed explicitly: since pandas 2.0 the
# default is False, which would also "sum" (concatenate) the strings in B.
a = df.groupby('A').sum(numeric_only=True)
a
| A | C | D |
|---|---|---|
| bar | -1.039176 | 0.095325 |
| foo | 1.863094 | -1.671411 |
# Same per-group sum, but as_index=False keeps A as an ordinary column and
# gives the result a default integer index. numeric_only=True is passed
# explicitly: since pandas 2.0 the default is False, which would also
# concatenate the strings in B into the result.
a = df.groupby('A', as_index=False).sum(numeric_only=True)
a
|   | A | C | D |
|---|---|---|---|
| 0 | bar | -1.039176 | 0.095325 |
| 1 | foo | 1.863094 | -1.671411 |
# Group on the (A, B) pair and sum the remaining columns; the two keys form a
# hierarchical index on the result.
b = df.groupby(by=['A', 'B']).sum()
b
| A | B | C | D |
|---|---|---|---|
| bar | one | -0.905349 | -0.054870 |
|     | three | 0.637305 | -0.046351 |
|     | two | -0.771132 | 0.196547 |
| foo | one | 0.564975 | 0.972044 |
|     | three | 0.228275 | -1.164593 |
|     | two | 1.069844 | -1.478862 |
# Group on the (A, B) pair and sum the remaining columns, keeping both keys as
# ordinary columns (as_index=False) under a default integer index.
b = df.groupby(by=['A', 'B'], as_index=False).sum()
b
|   | A | B | C | D |
|---|---|---|---|---|
| 0 | bar | one | -0.905349 | -0.054870 |
| 1 | bar | three | 0.637305 | -0.046351 |
| 2 | bar | two | -0.771132 | 0.196547 |
| 3 | foo | one | 0.564975 | 0.972044 |
| 4 | foo | three | 0.228275 | -1.164593 |
| 5 | foo | two | 1.069844 | -1.478862 |