pandas
- pandas是在numpy基础上做的,用来分析数据的python模块
pandas具有两个重要的数据结构:
- series:序列,用来存储一维数据
- dataframe:数据框,用来存储2维数据
举例
In [4]:
import pandas as pd import numpy as np import matplotlib.pyplot as plt %matplotlib inline
- 创建对象
In [9]:
dates = pd.date_range('20181203', periods=6) # 设置一个表格序列范围,此处为6个元素 dates
Out[9]:
DatetimeIndex(['2018-12-03', '2018-12-04', '2018-12-05', '2018-12-06', '2018-12-07', '2018-12-08'], dtype='datetime64[ns]', freq='D')
In [5]:
s = pd.Series([1, 3, 5, np.nan, 6, 8]) s
Out[5]:
0 1.0 1 3.0 2 5.0 3 NaN 4 6.0 5 8.0 dtype: float64
In [10]:
df = pd.DataFrame(np.random.randn(6,4), index=dates, columns=tuple('ABCD')) df
Out[10]:
A | B | C | D | |
---|---|---|---|---|
2018-12-03 | -0.726504 | -1.006359 | 0.683608 | 0.706655 |
2018-12-04 | 1.689128 | 0.317478 | -0.319723 | -1.224090 |
2018-12-05 | -0.160188 | -0.601648 | 0.216990 | 0.564879 |
2018-12-06 | -0.662323 | 1.021000 | 0.647567 | -1.551296 |
2018-12-07 | -0.520531 | 0.747183 | -1.093671 | 1.963961 |
2018-12-08 | -2.298572 | 1.143361 | -0.050722 | -0.327890 |
In [11]:
df2 = pd.DataFrame({'A': 1., 'B': pd.Timestamp('20181203'), 'C': pd.Series(1, index=list(range(4)),dtype='float32'), 'D': np.array([3] * 4,dtype='int32'), 'E' : pd.Categorical(["test","train","test","train"]), 'F' : 'foo' }) df2
Out[11]:
A | B | C | D | E | F | |
---|---|---|---|---|---|---|
0 | 1.0 | 2018-12-03 | 1.0 | 3 | test | foo |
1 | 1.0 | 2018-12-03 | 1.0 | 3 | train | foo |
2 | 1.0 | 2018-12-03 | 1.0 | 3 | test | foo |
3 | 1.0 | 2018-12-03 | 1.0 | 3 | train | foo |
In [12]:
df2.dtypes
Out[12]:
A float64 B datetime64[ns] C float32 D int32 E category F object dtype: object
In [13]:
df2.C
Out[13]:
0 1.0 1 1.0 2 1.0 3 1.0 Name: C, dtype: float32
查看数据
In [14]:
df.head()
Out[14]:
A | B | C | D | |
---|---|---|---|---|
2018-12-03 | -0.726504 | -1.006359 | 0.683608 | 0.706655 |
2018-12-04 | 1.689128 | 0.317478 | -0.319723 | -1.224090 |
2018-12-05 | -0.160188 | -0.601648 | 0.216990 | 0.564879 |
2018-12-06 | -0.662323 | 1.021000 | 0.647567 | -1.551296 |
2018-12-07 | -0.520531 | 0.747183 | -1.093671 | 1.963961 |
In [15]:
df.tail(3)
Out[15]:
A | B | C | D | |
---|---|---|---|---|
2018-12-06 | -0.662323 | 1.021000 | 0.647567 | -1.551296 |
2018-12-07 | -0.520531 | 0.747183 | -1.093671 | 1.963961 |
2018-12-08 | -2.298572 | 1.143361 | -0.050722 | -0.327890 |
In [16]:
df.index
Out[16]:
DatetimeIndex(['2018-12-03', '2018-12-04', '2018-12-05', '2018-12-06', '2018-12-07', '2018-12-08'], dtype='datetime64[ns]', freq='D')
In [17]:
df.columns
Out[17]:
Index(['A', 'B', 'C', 'D'], dtype='object')
In [18]:
df.values
Out[18]:
array([[-0.72650443, -1.00635897, 0.68360775, 0.7066548 ], [ 1.68912831, 0.3174782 , -0.31972257, -1.22409028], [-0.16018784, -0.60164781, 0.21699028, 0.56487859], [-0.66232342, 1.02099996, 0.64756704, -1.55129606], [-0.52053137, 0.74718252, -1.09367075, 1.96396074], [-2.29857171, 1.14336104, -0.0507219 , -0.32789047]])
In [19]:
df.describe
Out[19]:
<bound method NDFrame.describe of A B C D 2018-12-03 -0.726504 -1.006359 0.683608 0.706655 2018-12-04 1.689128 0.317478 -0.319723 -1.224090 2018-12-05 -0.160188 -0.601648 0.216990 0.564879 2018-12-06 -0.662323 1.021000 0.647567 -1.551296 2018-12-07 -0.520531 0.747183 -1.093671 1.963961 2018-12-08 -2.298572 1.143361 -0.050722 -0.327890>
In [20]:
df.T
Out[20]:
2018-12-03 00:00:00 | 2018-12-04 00:00:00 | 2018-12-05 00:00:00 | 2018-12-06 00:00:00 | 2018-12-07 00:00:00 | 2018-12-08 00:00:00 | |
---|---|---|---|---|---|---|
A | -0.726504 | 1.689128 | -0.160188 | -0.662323 | -0.520531 | -2.298572 |
B | -1.006359 | 0.317478 | -0.601648 | 1.021000 | 0.747183 | 1.143361 |
C | 0.683608 | -0.319723 | 0.216990 | 0.647567 | -1.093671 | -0.050722 |
D | 0.706655 | -1.224090 | 0.564879 | -1.551296 | 1.963961 | -0.327890 |
In [21]:
df.sort_index(axis = 1, ascending=False)
Out[21]:
D | C | B | A | |
---|---|---|---|---|
2018-12-03 | 0.706655 | 0.683608 | -1.006359 | -0.726504 |
2018-12-04 | -1.224090 | -0.319723 | 0.317478 | 1.689128 |
2018-12-05 | 0.564879 | 0.216990 | -0.601648 | -0.160188 |
2018-12-06 | -1.551296 | 0.647567 | 1.021000 | -0.662323 |
2018-12-07 | 1.963961 | -1.093671 | 0.747183 | -0.520531 |
2018-12-08 | -0.327890 | -0.050722 | 1.143361 | -2.298572 |
In [22]:
df.sort_values(by='B')
Out[22]:
A | B | C | D | |
---|---|---|---|---|
2018-12-03 | -0.726504 | -1.006359 | 0.683608 | 0.706655 |
2018-12-05 | -0.160188 | -0.601648 | 0.216990 | 0.564879 |
2018-12-04 | 1.689128 | 0.317478 | -0.319723 | -1.224090 |
2018-12-07 | -0.520531 | 0.747183 | -1.093671 | 1.963961 |
2018-12-06 | -0.662323 | 1.021000 | 0.647567 | -1.551296 |
2018-12-08 | -2.298572 | 1.143361 | -0.050722 | -0.327890 |
选择数据
In [23]:
df['A']
Out[23]:
2018-12-03 -0.726504 2018-12-04 1.689128 2018-12-05 -0.160188 2018-12-06 -0.662323 2018-12-07 -0.520531 2018-12-08 -2.298572 Freq: D, Name: A, dtype: float64
In [24]:
df.A
Out[24]:
2018-12-03 -0.726504 2018-12-04 1.689128 2018-12-05 -0.160188 2018-12-06 -0.662323 2018-12-07 -0.520531 2018-12-08 -2.298572 Freq: D, Name: A, dtype: float64
In [25]:
df[0:3]
Out[25]:
A | B | C | D | |
---|---|---|---|---|
2018-12-03 | -0.726504 | -1.006359 | 0.683608 | 0.706655 |
2018-12-04 | 1.689128 | 0.317478 | -0.319723 | -1.224090 |
2018-12-05 | -0.160188 | -0.601648 | 0.216990 | 0.564879 |
In [28]:
df['20181203': '20181205'] # 结尾也会被返回
Out[28]:
A | B | C | D | |
---|---|---|---|---|
2018-12-03 | -0.726504 | -1.006359 | 0.683608 | 0.706655 |
2018-12-04 | 1.689128 | 0.317478 | -0.319723 | -1.224090 |
2018-12-05 | -0.160188 | -0.601648 | 0.216990 | 0.564879 |
使用标签选择数据
In [29]:
df.loc[dates[0]]
Out[29]:
A -0.726504 B -1.006359 C 0.683608 D 0.706655 Name: 2018-12-03 00:00:00, dtype: float64
In [30]:
df.loc[:,['A', 'B']]
Out[30]:
A | B | |
---|---|---|
2018-12-03 | -0.726504 | -1.006359 |
2018-12-04 | 1.689128 | 0.317478 |
2018-12-05 | -0.160188 | -0.601648 |
2018-12-06 | -0.662323 | 1.021000 |
2018-12-07 | -0.520531 | 0.747183 |
2018-12-08 | -2.298572 | 1.143361 |
In [31]:
df.loc['20181203': '20181205', ['A', 'B']]
Out[31]:
A | B | |
---|---|---|
2018-12-03 | -0.726504 | -1.006359 |
2018-12-04 | 1.689128 | 0.317478 |
2018-12-05 | -0.160188 | -0.601648 |
In [32]:
df.loc['20181203': '20181205', 'A': 'B']
Out[32]:
A | B | |
---|---|---|
2018-12-03 | -0.726504 | -1.006359 |
2018-12-04 | 1.689128 | 0.317478 |
2018-12-05 | -0.160188 | -0.601648 |
In [33]:
df.loc[dates[0], 'A']
Out[33]:
-0.7265044266335897
In [34]:
df.at[dates[0], 'A']
Out[34]:
-0.7265044266335897
- 使用位置选择
In [35]:
df.iloc[3]
Out[35]:
A -0.662323 B 1.021000 C 0.647567 D -1.551296 Name: 2018-12-06 00:00:00, dtype: float64
In [36]:
df.iloc[3: 5, 0: 2]
Out[36]:
A | B | |
---|---|---|
2018-12-06 | -0.662323 | 1.021000 |
2018-12-07 | -0.520531 | 0.747183 |
- 布尔索引
In [38]:
df[df.A > 0]
Out[38]:
A | B | C | D | |
---|---|---|---|---|
2018-12-04 | 1.689128 | 0.317478 | -0.319723 | -1.22409 |
In [40]:
df[df > 0]
Out[40]:
A | B | C | D | |
---|---|---|---|---|
2018-12-03 | NaN | NaN | 0.683608 | 0.706655 |
2018-12-04 | 1.689128 | 0.317478 | NaN | NaN |
2018-12-05 | NaN | NaN | 0.216990 | 0.564879 |
2018-12-06 | NaN | 1.021000 | 0.647567 | NaN |
2018-12-07 | NaN | 0.747183 | NaN | 1.963961 |
2018-12-08 | NaN | 1.143361 | NaN | NaN |
- 计数
In [41]:
s = pd.Series(np.random.randint(0, 7, size = 10)) # 第一个参数下限,第二个上线,size是个数 s
Out[41]:
0 2 1 1 2 1 3 2 4 6 5 4 6 5 7 6 8 2 9 4 dtype: int32
In [42]:
s.value_counts()
Out[42]:
2 3 6 2 4 2 1 2 5 1 dtype: int64
- 合并
In [43]:
df = pd.DataFrame(np.random.rand(10, 4)) df
Out[43]:
0 | 1 | 2 | 3 | |
---|---|---|---|---|
0 | 0.260510 | 0.712819 | 0.990425 | 0.744928 |
1 | 0.251795 | 0.926093 | 0.749391 | 0.909539 |
2 | 0.423172 | 0.574187 | 0.829044 | 0.820088 |
3 | 0.522513 | 0.838938 | 0.032058 | 0.007587 |
4 | 0.148221 | 0.144721 | 0.759867 | 0.991285 |
5 | 0.847081 | 0.383048 | 0.485961 | 0.746508 |
6 | 0.886374 | 0.347589 | 0.447043 | 0.629631 |
7 | 0.431346 | 0.129931 | 0.022428 | 0.260882 |
8 | 0.701627 | 0.919099 | 0.386347 | 0.002468 |
9 | 0.924300 | 0.679334 | 0.226608 | 0.765498 |
In [44]:
pieces = [df[: 3], df[3: 7], df[7:]] pieces
Out[44]:
[ 0 1 2 3 0 0.260510 0.712819 0.990425 0.744928 1 0.251795 0.926093 0.749391 0.909539 2 0.423172 0.574187 0.829044 0.820088, 0 1 2 3 3 0.522513 0.838938 0.032058 0.007587 4 0.148221 0.144721 0.759867 0.991285 5 0.847081 0.383048 0.485961 0.746508 6 0.886374 0.347589 0.447043 0.629631, 0 1 2 3 7 0.431346 0.129931 0.022428 0.260882 8 0.701627 0.919099 0.386347 0.002468 9 0.924300 0.679334 0.226608 0.765498]
In [45]:
pd.concat(pieces)
Out[45]:
0 | 1 | 2 | 3 | |
---|---|---|---|---|
0 | 0.260510 | 0.712819 | 0.990425 | 0.744928 |
1 | 0.251795 | 0.926093 | 0.749391 | 0.909539 |
2 | 0.423172 | 0.574187 | 0.829044 | 0.820088 |
3 | 0.522513 | 0.838938 | 0.032058 | 0.007587 |
4 | 0.148221 | 0.144721 | 0.759867 | 0.991285 |
5 | 0.847081 | 0.383048 | 0.485961 | 0.746508 |
6 | 0.886374 | 0.347589 | 0.447043 | 0.629631 |
7 | 0.431346 | 0.129931 | 0.022428 | 0.260882 |
8 | 0.701627 | 0.919099 | 0.386347 | 0.002468 |
9 | 0.924300 | 0.679334 | 0.226608 | 0.765498 |
- 还可以用join(),append()
数据表格
In [46]:
np.random.seed(1) df = pd.DataFrame({'A' : ['one', 'one', 'two', 'three'] * 3, 'B' : ['A', 'B', 'C'] * 4, 'C' : ['foo', 'foo', 'foo', 'bar', 'bar', 'bar'] * 2, 'D' : np.random.randn(12), 'E' : np.random.randn(12)}) df
Out[46]:
A | B | C | D | E | |
---|---|---|---|---|---|
0 | one | A | foo | 1.624345 | -0.322417 |
1 | one | B | foo | -0.611756 | -0.384054 |
2 | two | C | foo | -0.528172 | 1.133769 |
3 | three | A | bar | -1.072969 | -1.099891 |
4 | one | B | bar | 0.865408 | -0.172428 |
5 | one | C | bar | -2.301539 | -0.877858 |
6 | two | A | foo | 1.744812 | 0.042214 |
7 | three | B | foo | -0.761207 | 0.582815 |
8 | one | C | foo | 0.319039 | -1.100619 |
9 | one | A | bar | -0.249370 | 1.144724 |
10 | two | B | bar | 1.462108 | 0.901591 |
11 | three | C | bar | -2.060141 | 0.502494 |
In [49]:
pd.pivot_table(df, values='D', index=['A', 'B'], columns=['C']) # 对应于Excel,以A,B为行维度,以C为列维度,计算D列之和
Out[49]:
C | bar | foo | |
---|---|---|---|
A | B | ||
one | A | -0.249370 | 1.624345 |
B | 0.865408 | -0.611756 | |
C | -2.301539 | 0.319039 | |
three | A | -1.072969 | NaN |
B | NaN | -0.761207 | |
C | -2.060141 | NaN | |
two | A | NaN | 1.744812 |
B | 1.462108 | NaN | |
C | NaN | -0.528172 |
绘图
In [52]:
ts = pd.Series(np.random.rand(1000), index=pd.date_range('1/1/2017', periods=1000)) ts = ts.cumsum() %matplotlib inline ts.plot() plt.show()
In [55]:
df = pd.DataFrame(np.random.randn(1000, 4), index=ts.index, columns=['A', 'B', 'C', 'D']) df = df.cumsum() df.plot() plt.legend(loc='best')
Out[55]:
<matplotlib.legend.Legend at 0xc7ed410>