python数据分析之pandas
pandas主要提供了3种数据结构:
- Series,带标签的一维数组;
- DataFrame,带标签且大小可变的二维表格结构;
- Panel,带标签且大小可变的三维数组(注意:Panel已在pandas 0.25版本中移除,新版本中可使用MultiIndex的DataFrame或xarray库替代)。
(1)生成一维数组
>>> import numpy as np
>>> import pandas as pd
>>> x = pd.Series([1, 3, 5, np.nan])
>>> x
0 1.0
1 3.0
2 5.0
3 NaN
dtype: float64
>>> pd.date_range(start='20130101', end='20131231', freq='H')
DatetimeIndex(['2013-01-01 00:00:00', '2013-01-01 01:00:00',
'2013-01-01 02:00:00', '2013-01-01 03:00:00',
'2013-01-01 04:00:00', '2013-01-01 05:00:00',
'2013-01-01 06:00:00', '2013-01-01 07:00:00',
'2013-01-01 08:00:00', '2013-01-01 09:00:00',
...
'2013-12-30 15:00:00', '2013-12-30 16:00:00',
'2013-12-30 17:00:00', '2013-12-30 18:00:00',
'2013-12-30 19:00:00', '2013-12-30 20:00:00',
'2013-12-30 21:00:00', '2013-12-30 22:00:00',
'2013-12-30 23:00:00', '2013-12-31 00:00:00'],
dtype='datetime64[ns]', length=8737, freq='H')
>>> dates = pd.date_range(start='20130101', end='20131231', freq='D') # 间隔为天
>>> dates
DatetimeIndex(['2013-01-01', '2013-01-02', '2013-01-03', '2013-01-04',
'2013-01-05', '2013-01-06', '2013-01-07', '2013-01-08',
'2013-01-09', '2013-01-10',
...
'2013-12-22', '2013-12-23', '2013-12-24', '2013-12-25',
'2013-12-26', '2013-12-27', '2013-12-28', '2013-12-29',
'2013-12-30', '2013-12-31'],
dtype='datetime64[ns]', length=365, freq='D')
>>> dates = pd.date_range(start='20130101', end='20131231', freq='M') # 间隔为月
>>> dates
DatetimeIndex(['2013-01-31', '2013-02-28', '2013-03-31', '2013-04-30',
'2013-05-31', '2013-06-30', '2013-07-31', '2013-08-31',
'2013-09-30', '2013-10-31', '2013-11-30', '2013-12-31'],
dtype='datetime64[ns]', freq='M')
>>> pd.period_range('20170601', '20170630', freq='W')
PeriodIndex(['2017-05-29/2017-06-04', '2017-06-05/2017-06-11',
'2017-06-12/2017-06-18', '2017-06-19/2017-06-25',
'2017-06-26/2017-07-02'],
dtype='period[W-SUN]', freq='W-SUN')
>>> pd.period_range('20170601', '20170630', freq='D')
PeriodIndex(['2017-06-01', '2017-06-02', '2017-06-03', '2017-06-04',
'2017-06-05', '2017-06-06', '2017-06-07', '2017-06-08',
'2017-06-09', '2017-06-10', '2017-06-11', '2017-06-12',
'2017-06-13', '2017-06-14', '2017-06-15', '2017-06-16',
'2017-06-17', '2017-06-18', '2017-06-19', '2017-06-20',
'2017-06-21', '2017-06-22', '2017-06-23', '2017-06-24',
'2017-06-25', '2017-06-26', '2017-06-27', '2017-06-28',
'2017-06-29', '2017-06-30'],
dtype='period[D]', freq='D')
>>> pd.period_range('20170601', '20170630', freq='H')
PeriodIndex(['2017-06-01 00:00', '2017-06-01 01:00', '2017-06-01 02:00',
'2017-06-01 03:00', '2017-06-01 04:00', '2017-06-01 05:00',
'2017-06-01 06:00', '2017-06-01 07:00', '2017-06-01 08:00',
'2017-06-01 09:00',
...
'2017-06-29 15:00', '2017-06-29 16:00', '2017-06-29 17:00',
'2017-06-29 18:00', '2017-06-29 19:00', '2017-06-29 20:00',
'2017-06-29 21:00', '2017-06-29 22:00', '2017-06-29 23:00',
'2017-06-30 00:00'],
dtype='period[H]', length=697, freq='H')
(2)生成DataFrame
>>> pd.DataFrame(np.random.randn(12,4), index=dates, columns=list('ABCD'))
A B C D
2013-01-31 1.628310 -0.281223 0.247675 -1.604243
2013-02-28 0.071069 1.310116 -0.945838 -0.613267
2013-03-31 0.956887 -1.691863 0.170843 -0.387298
2013-04-30 0.869391 -1.939210 2.220454 1.654112
2013-05-31 -0.802416 0.558953 1.086787 -0.870317
2013-06-30 0.463761 2.451659 0.165985 0.913551
2013-07-31 1.755720 1.246089 -0.237590 -0.892358
2013-08-31 0.191604 -1.481263 -0.142491 -2.672721
2013-09-30 -0.146444 0.493261 -1.719681 0.676592
2013-10-31 1.153289 0.179862 -1.879004 -0.616305
2013-11-30 -0.500726 1.057525 0.140623 -0.113951
2013-12-31 0.229572 -0.778378 -0.682233 0.009218
>>> pd.DataFrame([np.random.randint(1, 100, 4) for i in range(12)],
index=dates, columns=list('ABCD')) # 4列随机数
A B C D
2013-01-31 17 72 26 13
2013-02-28 61 42 88 3
2013-03-31 14 61 97 95
2013-04-30 73 87 55 1
2013-05-31 58 80 20 2
2013-06-30 41 6 40 70
2013-07-31 51 48 81 77
2013-08-31 56 54 76 61
2013-09-30 32 27 82 76
2013-10-31 21 78 91 15
2013-11-30 75 77 17 50
2013-12-31 54 12 75 53
>>> pd.DataFrame({'A':np.random.randint(1, 100, 4),
'B':pd.date_range(start='20130101', periods=4, freq='D'),
'C':pd.Series([1, 2, 3, 4],index=list(range(4)),dtype='float32'),
'D':np.array([3] * 4,dtype='int32'),
'E':pd.Categorical(["test","train","test","train"]),
'F':'foo'})
A B C D E F
0 65 2013-01-01 1.0 3 test foo
1 18 2013-01-02 2.0 3 train foo
2 24 2013-01-03 3.0 3 test foo
3 32 2013-01-04 4.0 3 train foo
>>> df = pd.DataFrame({'A':np.random.randint(1, 100, 4),
'B':pd.date_range(start='20130101', periods=4, freq='D'),
'C':pd.Series([1, 2, 3, 4],\
index=['zhang', 'li', 'zhou', 'wang'],dtype='float32'),
'D':np.array([3] * 4,dtype='int32'),
'E':pd.Categorical(["test","train","test","train"]),
'F':'foo'})
>>> df
A B C D E F
zhang 20 2013-01-01 1.0 3 test foo
li 26 2013-01-02 2.0 3 train foo
zhou 63 2013-01-03 3.0 3 test foo
wang 69 2013-01-04 4.0 3 train foo
(3)二维数据查看
>>> df.head() # 默认显示前5行
A B C D E F
zhang 20 2013-01-01 1.0 3 test foo
li 26 2013-01-02 2.0 3 train foo
zhou 63 2013-01-03 3.0 3 test foo
wang 69 2013-01-04 4.0 3 train foo
>>> df.head(3) # 查看前3行
A B C D E F
zhang 20 2013-01-01 1.0 3 test foo
li 26 2013-01-02 2.0 3 train foo
zhou 63 2013-01-03 3.0 3 test foo
>>> df.tail(2) # 查看最后2行
A B C D E F
zhou 63 2013-01-03 3.0 3 test foo
wang 69 2013-01-04 4.0 3 train foo
(4)查看二维数据的索引、列名和数据
>>> df.index
Index(['zhang', 'li', 'zhou', 'wang'], dtype='object')
>>> df.columns
Index(['A', 'B', 'C', 'D', 'E', 'F'], dtype='object')
>>> df.values
array([[20, Timestamp('2013-01-01 00:00:00'), 1.0, 3, 'test', 'foo'],
[26, Timestamp('2013-01-02 00:00:00'), 2.0, 3, 'train', 'foo'],
[63, Timestamp('2013-01-03 00:00:00'), 3.0, 3, 'test', 'foo'],
[69, Timestamp('2013-01-04 00:00:00'), 4.0, 3, 'train', 'foo']], dtype=object)
(5)查看数据的统计信息
>>> df.describe() # 平均值、标准差、最小值、最大值等信息
A C D
count 4.000000 4.000000 4.0
mean 44.500000 2.500000 3.0
std 25.066578 1.290994 0.0
min 20.000000 1.000000 3.0
25% 24.500000 1.750000 3.0
50% 44.500000 2.500000 3.0
75% 64.500000 3.250000 3.0
max 69.000000 4.000000 3.0
(6)二维数据转置
>>> df.T
zhang li zhou \
A 20 26 63
B 2013-01-01 00:00:00 2013-01-02 00:00:00 2013-01-03 00:00:00
C 1 2 3
D 3 3 3
E test train test
F foo foo foo
wang
A 69
B 2013-01-04 00:00:00
C 4
D 3
E train
F foo
(7)排序
>>> df.sort_index(axis=0, ascending=False) # 对轴进行排序
A B C D E F
zhou 63 2013-01-03 3.0 3 test foo
zhang 20 2013-01-01 1.0 3 test foo
wang 69 2013-01-04 4.0 3 train foo
li 26 2013-01-02 2.0 3 train foo
>>> df.sort_index(axis=0, ascending=True)
A B C D E F
li 26 2013-01-02 2.0 3 train foo
wang 69 2013-01-04 4.0 3 train foo
zhang 20 2013-01-01 1.0 3 test foo
zhou 63 2013-01-03 3.0 3 test foo
>>> df.sort_index(axis=1, ascending=False)
F E D C B A
zhang foo test 3 1.0 2013-01-01 20
li foo train 3 2.0 2013-01-02 26
zhou foo test 3 3.0 2013-01-03 63
wang foo train 3 4.0 2013-01-04 69
>>> df.sort_values(by='A') # 对数据进行排序
# 也可以使用by=['A','B']按多列进行排序
A B C D E F
zhang 20 2013-01-01 1.0 3 test foo
li 26 2013-01-02 2.0 3 train foo
zhou 63 2013-01-03 3.0 3 test foo
wang 69 2013-01-04 4.0 3 train foo
(8)数据选择
>>> df['A'] # 选择列
zhang 20
li 26
zhou 63
wang 69
Name: A, dtype: int32
>>> 69 in df['A']
False
>>> 69 in df['A'].values
True
>>> df.iloc[0,1] # 查询第0行第1列位置的数据值
Timestamp('2013-01-01 00:00:00')
>>> df.iloc[2,2] # 查询第2行第2列位置的数据值
3.0
>>> df[df.A>50] # 按给定条件进行查询
A B C D E F
zhou 63 2013-01-03 3.0 3 test foo
wang 69 2013-01-04 4.0 3 train foo
>>> df[df['E']=='test'] # 按给定条件进行查询
A B C D E F
zhang 20 2013-01-01 1.0 3 test foo
zhou 63 2013-01-03 3.0 3 test foo
>>> df[df['A'].isin([20,69])]
A B C D E F
zhang 20 2013-01-01 1.0 3 test foo
wang 69 2013-01-04 4.0 3 train foo
>>> df.nlargest(3, ['C']) # 返回指定列最大的前3行
A B C D E F
wang 69 2013-01-04 4.0 3 train foo
zhou 63 2013-01-03 3.0 3 test foo
li 26 2013-01-02 2.0 3 train foo
>>> df.nlargest(3, ['A'])
A B C D E F
wang 69 2013-01-04 4.0 3 train foo
zhou 63 2013-01-03 3.0 3 test foo
li 26 2013-01-02 2.0 3 train foo
(9)数据修改
>>> df.iat[0, 2] = 3 # 修改指定行、列位置的数据值
>>> df.loc[:, 'D'] = np.random.randint(50, 60, 4)
# 修改某列的值
>>> df['C'] = -df['C'] # 对指定列数据取反
>>> df # 查看修改结果
A B C D E F
zhang 20 2013-01-01 -3.0 53 test foo
li 26 2013-01-02 -2.0 59 train foo
zhou 63 2013-01-03 -3.0 59 test foo
wang 69 2013-01-04 -4.0 50 train foo
>>> dff = df[:] # 切片
>>> dff
A B C D E F
zhang 20 2013-01-01 -3.0 53 test foo
li 26 2013-01-02 -2.0 59 train foo
zhou 63 2013-01-03 -3.0 59 test foo
wang 69 2013-01-04 -4.0 50 train foo
>>> dff['C'] = dff['C'] ** 2 # 替换列数据
>>> dff
A B C D E F
zhang 20 2013-01-01 9.0 53 test foo
li 26 2013-01-02 4.0 59 train foo
zhou 63 2013-01-03 9.0 59 test foo
wang 69 2013-01-04 16.0 50 train foo
>>> dff = df[:]
>>> dff
A B C D E F
zhang 20 2013-01-01 -3.0 53 test foo
li 26 2013-01-02 -2.0 59 train foo
zhou 63 2013-01-03 -3.0 59 test foo
wang 69 2013-01-04 -4.0 50 train foo
>>> dff.loc[dff['C']==-3.0, 'D'] = 100 # 修改特定行的指定列
>>> dff
A B C D E F
zhang 20 2013-01-01 -3.0 100 test foo
li 26 2013-01-02 -2.0 59 train foo
zhou 63 2013-01-03 -3.0 100 test foo
wang 69 2013-01-04 -4.0 50 train foo
>>> data = pd.DataFrame({'k1':['one'] * 3 + ['two'] * 4,
'k2':[1, 1, 2, 3, 3, 4, 4]})
>>> data.replace(1, 5) # 把所有1替换为5
k1 k2
0 one 5
1 one 5
2 one 2
3 two 3
4 two 3
5 two 4
6 two 4
>>> data.replace([1,2],[5,6]) # 1->5,2->6
k1 k2
0 one 5
1 one 5
2 one 6
3 two 3
4 two 3
5 two 4
6 two 4
>>> data.replace({1:5, 'one':'ONE'}) # 使用字典指定替换关系
k1 k2
0 ONE 5
1 ONE 5
2 ONE 2
3 two 3
4 two 3
5 two 4
6 two 4
>>> data = pd.DataFrame({'k1':['one'] * 3 + ['two'] * 4,
'k2':[1, 1, 2, 3, 3, 4, 4]})
>>> data
k1 k2
0 one 1
1 one 1
2 one 2
3 two 3
4 two 3
5 two 4
6 two 4
>>> data.drop(5, axis=0) # 删除指定行
k1 k2
0 one 1
1 one 1
2 one 2
3 two 3
4 two 3
6 two 4
>>> data.drop(3, inplace=True) # 原地删除
>>> data
k1 k2
0 one 1
1 one 1
2 one 2
4 two 3
5 two 4
6 two 4
>>> data.drop('k1', axis=1) # 删除指定列
k2
0 1
1 1
2 2
4 3
5 4
6 4
(10)缺失值处理
>>> df
A B C D E F
zhang 20 2013-01-01 9.0 53 test foo
li 26 2013-01-02 4.0 59 train foo
zhou 63 2013-01-03 9.0 59 test foo
wang 69 2013-01-04 16.0 50 train foo
>>> df1 = df.reindex(columns=list(df.columns) + ['G'])
>>> df1
A B C D E F G
zhang 20 2013-01-01 9.0 53 test foo NaN
li 26 2013-01-02 4.0 59 train foo NaN
zhou 63 2013-01-03 9.0 59 test foo NaN
wang 69 2013-01-04 16.0 50 train foo NaN
>>> df1.iat[0, 6] = 3 # 修改指定位置元素值,该列其他元素为缺失值NaN
>>> df1
A B C D E F G
zhang 20 2013-01-01 9.0 53 test foo 3.0
li 26 2013-01-02 4.0 59 train foo NaN
zhou 63 2013-01-03 9.0 59 test foo NaN
wang 69 2013-01-04 16.0 50 train foo NaN
>>> pd.isnull(df1) # 测试缺失值,返回值为True/False阵列
A B C D E F G
zhang False False False False False False False
li False False False False False False True
zhou False False False False False False True
wang False False False False False False True
>>> df1.dropna() # 返回不包含缺失值的行
A B C D E F G
zhang 20 2013-01-01 9.0 53 test foo 3.0
>>> df1['G'].fillna(5, inplace=True) # 使用指定值填充缺失值
>>> df1
A B C D E F G
zhang 20 2013-01-01 9.0 53 test foo 3.0
li 26 2013-01-02 4.0 59 train foo 5.0
zhou 63 2013-01-03 9.0 59 test foo 5.0
wang 69 2013-01-04 16.0 50 train foo 5.0
(11)重复值处理
>>> data = pd.DataFrame({'k1':['one'] * 3 + ['two'] * 4,
'k2':[1, 1, 2, 3, 3, 4, 4]})
>>> data
k1 k2
0 one 1
1 one 1
2 one 2
3 two 3
4 two 3
5 two 4
6 two 4
>>> data.duplicated() # 检查重复行
0 False
1 True
2 False
3 False
4 True
5 False
6 True
dtype: bool
>>> data.drop_duplicates() # 返回新数组,删除重复行
k1 k2
0 one 1
2 one 2
3 two 3
5 two 4
>>> data.drop_duplicates(['k1']) # 删除k1列的重复数据
k1 k2
0 one 1
3 two 3
>>> data.drop_duplicates(['k1'], keep='last')
k1 k2
2 one 2
6 two 4
(12)异常值处理
>>> import numpy as np
>>> import pandas as pd
>>> data = pd.DataFrame(np.random.randn(500, 4))
>>> data.describe() # 查看数据的统计信息
0 1 2 3
count 500.000000 500.000000 500.000000 500.000000
mean -0.077138 0.052644 -0.045360 0.024275
std 0.983532 1.027400 1.009228 1.000710
min -2.810694 -2.974330 -2.640951 -2.762731
25% -0.746102 -0.695053 -0.808262 -0.620448
50% -0.096517 -0.008122 -0.113366 -0.074785
75% 0.590671 0.793665 0.634192 0.711785
max 2.763723 3.762775 3.986027 3.539378
>>> col2 = data[2] # 第2列
>>> col2[col2>3.5] # 该列中大于3.5的数值
12 3.986027
Name: 2, dtype: float64
>>> col2[col2>3.0]
12 3.986027
Name: 2, dtype: float64
>>> col2[col2>2.5]
11 2.528325
12 3.986027
41 2.775205
157 2.707940
365 2.558892
483 2.990861
Name: 2, dtype: float64
>>> data[(data>3).any(1)] # 任意一列中有大于3的数值的行
0 1 2 3
4 1.008617 3.104177 0.522157 0.148458
12 -0.099386 0.218586 3.986027 0.997698
58 -1.553998 3.489834 0.438321 -0.276171
121 -2.101393 3.762775 1.124320 -0.210449
312 -0.945021 3.408861 1.143247 -0.005104
410 -0.279519 1.232496 -0.190450 3.539378
>>> data[np.abs(data)>2.5] = np.sign(data) * 2.5
# 把所有数据都限定到[-2.5, 2.5]之间
>>> data.describe()
0 1 2 3
count 500.000000 500.000000 500.000000 500.000000
mean -0.076439 0.046131 -0.049867 0.021888
std 0.978170 0.998113 0.992184 0.990873
min -2.500000 -2.500000 -2.500000 -2.500000
25% -0.746102 -0.695053 -0.808262 -0.620448
50% -0.096517 -0.008122 -0.113366 -0.074785
75% 0.590671 0.793665 0.634192 0.711785
max 2.500000 2.500000 2.500000 2.500000
(13)映射
>>> data = pd.DataFrame({'k1':['one'] * 3 + ['two'] * 4,
                         'k2':[1, 1, 2, 3, 3, 4, 4]}) # 重新创建data,上一节中data已被覆盖
>>> data['k1'] = data['k1'].map(str.upper) # 使用函数进行映射
>>> data
k1 k2
0 ONE 1
1 ONE 1
2 ONE 2
3 TWO 3
4 TWO 3
5 TWO 4
6 TWO 4
>>> data['k1'] = data['k1'].map({'ONE':'one', 'TWO':'two'}) # 使用字典表示映射关系
>>> data
k1 k2
0 one 1
1 one 1
2 one 2
3 two 3
4 two 3
5 two 4
6 two 4
>>> data['k2'] = data['k2'].map(lambda x:x+5) # lambda表达式
>>> data
k1 k2
0 one 6
1 one 6
2 one 7
3 two 8
4 two 8
5 two 9
6 two 9
>>> data.index = data.index.map(lambda x:x+5) # 修改索引
>>> data
k1 k2
5 one 6
6 one 6
7 one 7
8 two 8
9 two 8
10 two 9
11 two 9
>>> data.columns = data.columns.map(str.upper) # 修改列名
>>> data
K1 K2
5 one 6
6 one 6
7 one 7
8 two 8
9 two 8
10 two 9
11 two 9
(14)数据离散化
>>> from random import randrange
>>> data = [randrange(100) for _ in range(10)]
>>> category = [0,25,50,100]
>>> pd.cut(data, category)
[(50, 100], (0, 25], (50, 100], (0, 25], (50, 100], (50, 100], (50, 100], (0, 25], (0, 25], (50, 100]]
Categories (3, interval[int64]): [(0, 25] < (25, 50] < (50, 100]]
>>> pd.cut(data, category, right=False) # 左闭右开区间
[[50, 100), [0, 25), [50, 100), [25, 50), [50, 100), [50, 100), [50, 100), [0, 25), [0, 25), [50, 100)]
Categories (3, interval[int64]): [[0, 25) < [25, 50) < [50, 100)]
>>> labels = ['low', 'middle', 'high']
>>> pd.cut(data, category, right=False, labels=labels)
# 指定标签
[high, low, high, middle, high, high, high, low, low, high]
Categories (3, object): [low < middle < high]
>>> data
[74, 19, 59, 25, 53, 60, 54, 22, 24, 55]
>>> pd.cut(data,4) # 等宽切分为4个区间(注意:并非四分位数,按分位数切分应使用pd.qcut)
[(60.25, 74.0], (18.945, 32.75], (46.5, 60.25], (18.945, 32.75], (46.5, 60.25], (46.5, 60.25], (46.5, 60.25], (18.945, 32.75], (18.945, 32.75], (46.5, 60.25]]
Categories (4, interval[float64]): [(18.945, 32.75] < (32.75, 46.5] < (46.5, 60.25] < (60.25, 74.0]]
(15)频次统计与移位
>>> df1.shift(1) # 数据下移一行,负数表示上移
A B C D E F G
zhang NaN NaT NaN NaN NaN NaN NaN
li 20.0 2013-01-01 9.0 53.0 test foo 3.0
zhou 26.0 2013-01-02 4.0 59.0 train foo 5.0
wang 63.0 2013-01-03 9.0 59.0 test foo 5.0
>>> df1['D'].value_counts() # 直方图统计
59 2
50 1
53 1
Name: D, dtype: int64
(16)拆分与合并/连接
>>> df2 = pd.DataFrame(np.random.randn(10, 4))
>>> df2
0 1 2 3
0 2.064867 -0.888018 0.586441 -0.660901
1 -0.465664 -0.496101 0.249952 0.627771
2 1.974986 1.304449 -0.168889 -0.334622
3 0.715677 2.017427 1.750627 -0.787901
4 -0.370020 -0.878282 0.499584 0.269102
5 0.184308 0.653620 0.117899 -1.186588
6 -0.364170 1.652270 0.234833 0.362925
7 -0.329063 0.356276 1.158202 -1.063800
8 -0.778828 -0.156918 -0.760394 -0.040323
9 -0.391045 -0.374825 -1.016456 0.767481
>>> p1 = df2[:3] # 数据行拆分
>>> p1
0 1 2 3
0 2.064867 -0.888018 0.586441 -0.660901
1 -0.465664 -0.496101 0.249952 0.627771
2 1.974986 1.304449 -0.168889 -0.334622
>>> p2 = df2[3:7]
>>> p3 = df2[7:]
>>> df3 = pd.concat([p1, p2, p3]) # 数据行合并
>>> df2 == df3 # 测试两个二维数据是否相等,返回True/False阵列
0 1 2 3
0 True True True True
1 True True True True
2 True True True True
3 True True True True
4 True True True True
5 True True True True
6 True True True True
7 True True True True
8 True True True True
9 True True True True
>>> df1 = pd.DataFrame({'a':range(5), 'b':range(50,55), 'c':range(60,65)})
>>> df3 = pd.DataFrame({'a':range(3,8), 'd':range(30,35)})
>>> df1
a b c
0 0 50 60
1 1 51 61
2 2 52 62
3 3 53 63
4 4 54 64
>>> df3
a d
0 3 30
1 4 31
2 5 32
3 6 33
4 7 34
>>> pd.merge(df1, df3) # 内连接
a b c d
0 3 53 63 30
1 4 54 64 31
>>> pd.merge(df1, df3, how='right') # 右连接
a b c d
0 3 53.0 63.0 30
1 4 54.0 64.0 31
2 5 NaN NaN 32
3 6 NaN NaN 33
4 7 NaN NaN 34
>>> pd.merge(df1, df3, how='left') # 左连接
a b c d
0 0 50 60 NaN
1 1 51 61 NaN
2 2 52 62 NaN
3 3 53 63 30.0
4 4 54 64 31.0
>>> pd.merge(df1, df3, how='outer') # 外连接
a b c d
0 0 50.0 60.0 NaN
1 1 51.0 61.0 NaN
2 2 52.0 62.0 NaN
3 3 53.0 63.0 30.0
4 4 54.0 64.0 31.0
5 5 NaN NaN 32.0
6 6 NaN NaN 33.0
7 7 NaN NaN 34.0
(17)分组计算
>>> df4 = pd.DataFrame({'A':np.random.randint(1,5,8),
'B':np.random.randint(10,15,8),
'C':np.random.randint(20,30,8),
'D':np.random.randint(80,100,8)})
>>> df4
A B C D
0 1 13 26 81
1 3 14 29 88
2 1 13 28 88
3 2 10 21 90
4 4 14 28 83
5 4 11 24 81
6 2 11 26 99
7 3 13 25 91
>>> df4.groupby('A').sum() # 数据分组计算
B C D
A
1 26 54 169
2 21 47 189
3 27 54 179
4 25 52 164
>>> df4.groupby(['A','B']).mean()
C D
A B
1 13 27.0 84.5
2 10 21.0 90.0
11 26.0 99.0
3 13 25.0 91.0
14 29.0 88.0
4 11 24.0 81.0
14 28.0 83.0
(18)透视转换
>>> df = pd.DataFrame({'a':[1,2,3,4],
'b':[2,3,4,5],
'c':[3,4,5,6],
'd':[3,3,3,3]})
>>> df
a b c d
0 1 2 3 3
1 2 3 4 3
2 3 4 5 3
3 4 5 6 3
>>> df.pivot(index='a', columns='b', values='c')
b 2 3 4 5
a
1 3.0 NaN NaN NaN
2 NaN 4.0 NaN NaN
3 NaN NaN 5.0 NaN
4 NaN NaN NaN 6.0
>>> df.pivot(index='a', columns='b', values='d')
b 2 3 4 5
a
1 3.0 NaN NaN NaN
2 NaN 3.0 NaN NaN
3 NaN NaN 3.0 NaN
4 NaN NaN NaN 3.0
(19)数据差分
>>> df = pd.DataFrame({'a':np.random.randint(1, 100, 10),
'b':np.random.randint(1, 100, 10)},
index=map(str, range(10)))
>>> df
a b
0 21 54
1 53 28
2 18 87
3 56 40
4 62 34
5 74 10
6 7 78
7 58 79
8 66 80
9 30 21
>>> df.diff() # 纵向一阶差分
a b
0 NaN NaN
1 32.0 -26.0
2 -35.0 59.0
3 38.0 -47.0
4 6.0 -6.0
5 12.0 -24.0
6 -67.0 68.0
7 51.0 1.0
8 8.0 1.0
9 -36.0 -59.0
>>> df.diff(axis=1) # 横向一阶差分
a b
0 NaN 33.0
1 NaN -25.0
2 NaN 69.0
3 NaN -16.0
4 NaN -28.0
5 NaN -64.0
6 NaN 71.0
7 NaN 21.0
8 NaN 14.0
9 NaN -9.0
>>> df.diff(periods=2) # 纵向二阶差分
a b
0 NaN NaN
1 NaN NaN
2 -3.0 33.0
3 3.0 12.0
4 44.0 -53.0
5 18.0 -30.0
6 -55.0 44.0
7 -16.0 69.0
8 59.0 2.0
9 -28.0 -58.0
(20)计算相关系数
>>> df = pd.DataFrame({'A':np.random.randint(1, 100, 10),
'B':np.random.randint(1, 100, 10),
'C':np.random.randint(1, 100, 10)})
>>> df
A B C
0 5 91 3
1 90 15 66
2 93 27 3
3 70 44 66
4 27 14 10
5 35 46 20
6 33 14 69
7 12 41 15
8 28 62 47
9 15 92 77
>>> df.corr() # pearson相关系数
A B C
A 1.000000 -0.560009 0.162105
B -0.560009 1.000000 0.014687
C 0.162105 0.014687 1.000000
>>> df.corr('kendall') # Kendall Tau相关系数
A B C
A 1.000000 -0.314627 0.113666
B -0.314627 1.000000 0.045980
C 0.113666 0.045980 1.000000
>>> df.corr('spearman') # spearman秩相关
A B C
A 1.000000 -0.419455 0.128051
B -0.419455 1.000000 0.067279
C 0.128051 0.067279 1.000000
(21)结合matplotlib绘图
>>> import pandas as pd
>>> import numpy as np
>>> import matplotlib.pyplot as plt
>>> df = pd.DataFrame(np.random.randn(1000, 2), columns=['B', 'C']).cumsum()
>>> df['A'] = pd.Series(list(range(len(df))))
>>> plt.figure()
>>> df.plot(x='A')
>>> plt.show()
>>> df = pd.DataFrame(np.random.rand(10, 4), columns=['a', 'b', 'c', 'd'])
>>> df.plot(kind='bar')
>>> plt.show()
>>> df = pd.DataFrame(np.random.rand(10, 4), columns=['a', 'b', 'c', 'd'])
>>> df.plot(kind='barh', stacked=True)
>>> plt.show()