本文主要介绍pandas数据清洗和数据操作
1. 数据统计
In [31]: df = DataFrame({'A':np.random.randn(5),'B':np.random.randn(5)})
In [32]: df
Out[32]:
A B
0 -0.635732 0.738902
1 -1.100320 0.910203
2 1.503987 -2.030411
3 0.548760 0.228552
4 -2.201917 1.676173
In [33]: df.count() #计算个数
Out[33]:
A 5
B 5
dtype: int64
In [34]: df.min() #最小值
Out[34]:
A -2.201917
B -2.030411
dtype: float64
In [35]: df.max() #最大值
Out[35]:
A 1.503987
B 1.676173
dtype: float64
In [36]: df.idxmin() #最小值的位置
Out[36]:
A 4
B 2
dtype: int64
In [37]: df.idxmax() #最大值的位置
Out[37]:
A 2
B 4
dtype: int64
In [38]: df.sum() #求和
Out[38]:
A -1.885221
B 1.523419
dtype: float64
In [39]: df.mean() #平均数
Out[39]:
A -0.377044
B 0.304684
dtype: float64
In [40]: df.median() #中位数
Out[40]:
A -0.635732
B 0.738902
dtype: float64
In [41]: df.mode() #众数
Out[41]:
Empty DataFrame
Columns: [A, B]
Index: []
In [42]: df.var() #方差
Out[42]:
A 2.078900
B 1.973661
dtype: float64
In [43]: df.std() #标准差
Out[43]:
A 1.441839
B 1.404871
dtype: float64
In [44]: df.mad() #平均绝对偏差
Out[44]:
A 1.122734
B 0.964491
dtype: float64
In [45]: df.skew() #偏度
Out[45]:
A 0.135719
B -1.480080
dtype: float64
In [46]: df.kurt() #峰度
Out[46]:
A -0.878539
B 2.730675
dtype: float64
In [48]: df.quantile(0.25) #25%分位数
Out[48]:
A -1.100320
B 0.228552
dtype: float64
In [49]: df.describe() #描述性统计指标
Out[49]:
A B
count 5.000000 5.000000
mean -0.377044 0.304684
std 1.441839 1.404871
min -2.201917 -2.030411
25% -1.100320 0.228552
50% -0.635732 0.738902
75% 0.548760 0.910203
max 1.503987 1.676173
pandas
有个非常简便的方法一次得到这些统计分析,那就是agg()
函数,如:
# 自定义函数
def first(columns):
return columns[0]
df.agg(mean,cout,max,first).reset_index()
2. 处理缺失值
In [50]: string = Series(['apple','banana','pear',np.nan,'grape'])
In [51]: string
Out[51]:
0 apple
1 banana
2 pear
3 NaN
4 grape
dtype: object
In [52]: string.isnull() #判断是否为缺失值
Out[52]:
0 False
1 False
2 False
3 True
4 False
dtype: bool
In [53]: string.dropna() #过滤缺失值,默认过滤任何含NaN的行
Out[53]:
0 apple
1 banana
2 pear
4 grape
dtype: object
In [54]: string.fillna(0) #填充缺失值
Out[54]:
0 apple
1 banana
2 pear
3 0
4 grape
dtype: object
In [55]: string.ffill() #向前填充
Out[55]:
0 apple
1 banana
2 pear
3 pear
4 grape
dtype: object
In [56]: data = DataFrame([[1. ,6.5,3],[1. ,np.nan,np.nan],[np.nan,np.nan,np.nan],[np.nan,7,9]]) #DataFrame操作同理
In [57]: data
Out[57]:
0 1 2
0 1.0 6.5 3.0
1 1.0 NaN NaN
2 NaN NaN NaN
3 NaN 7.0 9.0
3. 层次化索引
In [6]: data = Series(np.random.randn(10),index=[['a','a','a','b','b','b','c','c','d','d'],[1,2,3,1,2,3,1,2,2,3]])
In [7]: data
Out[7]:
a 1 0.386697
2 0.822063
3 0.338441
b 1 0.017249
2 0.880122
3 0.296465
c 1 0.376104
2 -1.309419
d 2 0.512754
3 0.223535
dtype: float64
In [8]: data.index
Out[8]:
MultiIndex(levels=[[u'a', u'b', u'c', u'd'], [1, 2, 3]],
labels=[[0, 0, 0, 1, 1, 1, 2, 2, 3, 3], [0, 1, 2, 0, 1, 2, 0, 1, 1, 2]])
In [10]: data['b':'c']
Out[10]:
b 1 0.017249
2 0.880122
3 0.296465
c 1 0.376104
2 -1.309419
dtype: float64
In [11]: data[:,2]
Out[11]:
a 0.822063
b 0.880122
c -1.309419
d 0.512754
dtype: float64
In [12]: data.unstack()
Out[12]:
1 2 3
a 0.386697 0.822063 0.338441
b 0.017249 0.880122 0.296465
c 0.376104 -1.309419 NaN
d NaN 0.512754 0.223535
In [13]: data.unstack().stack()
Out[13]:
a 1 0.386697
2 0.822063
3 0.338441
b 1 0.017249
2 0.880122
3 0.296465
c 1 0.376104
2 -1.309419
d 2 0.512754
3 0.223535
dtype: float64
In [14]: df = DataFrame(np.arange(12).reshape(4,3),index=[['a','a','b','b'],[1,2,1,2]],columns=[['Ohio','Ohio','Colorad
...: o'],['Green','Red','Green']])
In [15]: df
Out[15]:
Ohio Colorado
Green Red Green
a 1 0 1 2
2 3 4 5
b 1 6 7 8
2 9 10 11
In [16]: df.index.names = ['key1','key2']
In [17]: df.columns.names = ['state','color']
In [18]: df
Out[18]:
state Ohio Colorado
color Green Red Green
key1 key2
a 1 0 1 2
2 3 4 5
b 1 6 7 8
2 9 10 11
In [19]: df['Ohio'] #降维
Out[19]:
color Green Red
key1 key2
a 1 0 1
2 3 4
b 1 6 7
2 9 10
In [20]: df.swaplevel('key1','key2')
Out[20]:
state Ohio Colorado
color Green Red Green
key2 key1
1 a 0 1 2
2 a 3 4 5
1 b 6 7 8
2 b 9 10 11
In [21]: df.sortlevel(1) #key2
Out[21]:
state Ohio Colorado
color Green Red Green
key1 key2
a 1 0 1 2
b 1 6 7 8
a 2 3 4 5
b 2 9 10 11
In [22]: df.sortlevel(0) #key1
Out[22]:
state Ohio Colorado
color Green Red Green
key1 key2
a 1 0 1 2
2 3 4 5
b 1 6 7 8
2 9 10 11
4. 类sql操作
In [5]: dic = {'Name':['LiuShunxiang','Zhangshan','ryan'],
...: 'Sex':['M','F','F'],
...: 'Age':[27,23,24],
...: 'Height':[165.7,167.2,154],
...: 'Weight':[61,63,41]}
In [6]: student = pd.DataFrame(dic)
In [7]: student
Out[7]:
Age Height Name Sex Weight
0 27 165.7 LiuShunxiang M 61
1 23 167.2 Zhangshan F 63
2 24 154.0 ryan F 41
In [8]: dic1 = {'Name':['Ann','Joe'],
...: 'Sex':['M','F'],
...: 'Age':[27,33],
...: 'Height':[168,177.2],
...: 'Weight':[51,65]}
In [9]: student1 = pd.DataFrame(dic1)
In [10]: Student = pd.concat([student,student1]) #插入行
In [11]: Student
Out[11]:
Age Height Name Sex Weight
0 27 165.7 LiuShunxiang M 61
1 23 167.2 Zhangshan F 63
2 24 154.0 ryan F 41
0 27 168.0 Ann M 51
1 33 177.2 Joe F 65
In [14]: pd.DataFrame(Student,columns = ['Age','Height','Name','Sex','Weight','Score']) #新增列
Out[14]:
Age Height Name Sex Weight Score
0 27 165.7 LiuShunxiang M 61 NaN
1 23 167.2 Zhangshan F 63 NaN
2 24 154.0 ryan F 41 NaN
0 27 168.0 Ann M 51 NaN
1 33 177.2 Joe F 65 NaN
In [16]: Student.ix[Student['Name']=='ryan','Height'] = 160 #修改某个数据
In [17]: Student
Out[17]:
Age Height Name Sex Weight
0 27 165.7 LiuShunxiang M 61
1 23 167.2 Zhangshan F 63
2 24 160.0 ryan F 41
0 27 168.0 Ann M 51
1 33 177.2 Joe F 65
In [18]: Student[Student['Height']>160] #删选
Out[18]:
Age Height Name Sex Weight
0 27 165.7 LiuShunxiang M 61
1 23 167.2 Zhangshan F 63
0 27 168.0 Ann M 51
1 33 177.2 Joe F 65
In [21]: Student.drop(['Weight'],axis = 1).head() #删除列
Out[21]:
Age Height Name Sex
0 27 165.7 LiuShunxiang M
1 23 167.2 Zhangshan F
2 24 160.0 ryan F
0 27 168.0 Ann M
1 33 177.2 Joe F
In [22]: Student.drop([1,2]) #删除行索引为1和2的行
Out[22]:
Age Height Name Sex Weight
0 27 165.7 LiuShunxiang M 61
0 27 168.0 Ann M 51
In [24]: Student.drop(['Age'],axis = 1) #删除列索引为Age的列
Out[24]:
Height Name Sex Weight
0 165.7 LiuShunxiang M 61
1 167.2 Zhangshan F 63
2 154.0 ryan F 41
0 168.0 Ann M 51
1 177.2 Joe F 65
In [26]: Student.groupby('Sex').agg([np.mean,np.median]) #等价于SELECT…FROM…GROUP BY…功能
Out[26]:
Age Height Weight
mean median mean median mean median
Sex
F 26.666667 24 168.133333 167.20 56.333333 63
M 27.000000 27 166.850000 166.85 56.000000 56
In [27]: series = pd.Series(np.random.randint(1,20,5)) #排序
In [28]: series
Out[28]:
0 9
1 17
2 17
3 13
4 15
dtype: int32
In [29]: series.order() #默认升序
C:/Anaconda2/Scripts/ipython-script.py:1: FutureWarning: order is deprecated, use sort_values(...)
if __name__ == '__main__':
Out[29]:
0 9
3 13
4 15
1 17
2 17
dtype: int32
In [30]: series.order(ascending = False) #降序
C:/Anaconda2/Scripts/ipython-script.py:1: FutureWarning: order is deprecated, use sort_values(...)
if __name__ == '__main__':
Out[30]:
2 17
1 17
4 15
3 13
0 9
dtype: int32
In [31]: Student.sort_values(by = ['Height']) #按值排序
Out[31]:
Age Height Name Sex Weight
2 24 160.0 ryan F 41
0 27 165.7 LiuShunxiang M 61
1 23 167.2 Zhangshan F 63
0 27 168.0 Ann M 51
1 33 177.2 Joe F 65
In [32]: dict2 = {'Name':['ryan','LiuShunxiang','Zhangshan','Ann','Joe'],
...: 'Score':['89','90','78','60','53']}
In [33]: Score = pd.DataFrame(dict2)
In [34]: Score
Out[34]:
Name Score
0 ryan 89
1 LiuShunxiang 90
2 Zhangshan 78
3 Ann 60
4 Joe 53
In [35]: stu_score = pd.merge(Student,Score,on = 'Name') #表连接
In [36]: stu_score
Out[36]:
Age Height Name Sex Weight Score
0 27 165.7 LiuShunxiang M 61 90
1 23 167.2 Zhangshan F 63 78
2 24 160.0 ryan F 41 89
3 27 168.0 Ann M 51 60
4 33 177.2 Joe F 65 53
注:student1以dic形式转DataFrame对象和直接新建DataFrame对象,连接结果不同
In [71]:student1 = DataFrame({'name';['Ann','Joe'],'Sex':['M','F'],'Age':[27,33],'Height':[168,177.2],'Weight':[51,65
...: ]})
In [72]: student
Out[72]:
Age Height Name Sex Weight
0 27 165.7 LiuShunxiang M 61
1 23 167.2 Zhangshan F 63
2 24 154.0 ryan F 41
In [73]: student1
Out[73]:
Age Height Sex Weight name
0 27 168.0 M 51 Ann
1 33 177.2 F 65 Joe
In [74]: Student = pd.concat([student,student1])
In [75]: Student
Out[75]:
Age Height Name Sex Weight name
0 27 165.7 LiuShunxiang M 61 NaN
1 23 167.2 Zhangshan F 63 NaN
2 24 154.0 ryan F 41 NaN
0 27 168.0 NaN M 51 Ann
1 33 177.2 NaN F 65 Joe
延伸表连接,merge函数参数说明如下:
参数 | 说明 |
---|---|
left | 参与合并的左侧DataFrame |
right | 参与合并的右侧DataFrame |
how | “inner”、”outer”、”left”、”right”其中之一,默认为inner |
on | 用于连接的列名 |
left_on | 左侧DataFrame中用作连接键的列 |
right_on | 右侧DataFrame中用作连接键的列 |
left_index | 将左侧DataFrame中的行索引作为连接的键 |
right_index | 将右侧DataFrame中的行索引作为连接的键 |
sort | 根据连接键对合并后的数据进行排序 |
举例如下
In [5]: df1 = DataFrame({'key':['b','b','a','c','a','a','b'],'data1':range(7)})
In [6]: df1
Out[6]:
data1 key
0 0 b
1 1 b
2 2 a
3 3 c
4 4 a
5 5 a
6 6 b
In [7]: df2 = DataFrame({'key':['a','b','d'],'data2':range(3)})
In [8]: df2
Out[8]:
data2 key
0 0 a
1 1 b
2 2 d
In [9]: pd.merge(df1,df2) #默认内链接,合并相同的key即a,b
Out[9]:
data1 key data2
0 0 b 1
1 1 b 1
2 6 b 1
3 2 a 0
4 4 a 0
5 5 a 0
In [10]: df3 = DataFrame({'lkey':['b','b','a','c','a','a','b'],'data1':range(7)})
In [11]: df4 = DataFrame({'rkey':['a','b','d'],'data2':range(3)})
In [12]: df3
Out[12]:
data1 lkey
0 0 b
1 1 b
2 2 a
3 3 c
4 4 a
5 5 a
6 6 b
In [13]: df4
Out[13]:
data2 rkey
0 0 a
1 1 b
2 2 d
In [14]: print pd.merge(df3,df4,left_on = 'lkey',right_on = 'rkey')
data1 lkey data2 rkey
0 0 b 1 b
1 1 b 1 b
2 6 b 1 b
3 2 a 0 a
4 4 a 0 a
5 5 a 0 a
In [15]: print pd.merge(df3,df4,left_on = 'lkey',right_on = 'data2')
Empty DataFrame
Columns: [data1, lkey, data2, rkey]
Index: []
In [16]: print pd.merge(df1,df2,how = 'outer')
data1 key data2
0 0.0 b 1.0
1 1.0 b 1.0
2 6.0 b 1.0
3 2.0 a 0.0
4 4.0 a 0.0
5 5.0 a 0.0
6 3.0 c NaN
7 NaN d 2.0
In [17]: df5 = DataFrame({'key':list('bbacab'),'data1':range(6)})
In [18]: df6 = DataFrame({'key':list('ababd'),'data2':range(5)})
In [19]: df5
Out[19]:
data1 key
0 0 b
1 1 b
2 2 a
3 3 c
4 4 a
5 5 b
In [20]: df6
Out[20]:
data2 key
0 0 a
1 1 b
2 2 a
3 3 b
4 4 d
In [21]: print pd.merge(df5,df6,on = 'key',how = 'left')
data1 key data2
0 0 b 1.0
1 0 b 3.0
2 1 b 1.0
3 1 b 3.0
4 2 a 0.0
5 2 a 2.0
6 3 c NaN
7 4 a 0.0
8 4 a 2.0
9 5 b 1.0
10 5 b 3.0
In [22]: left = DataFrame({'key1':['foo','foo','bar'],'key2':['one','two','one'],'lval':[1,2,3]})
In [23]: right = DataFrame({'key1':['foo','foo','bar','bar'],'key2':['one','one','one','two'],'rval':[4,5,6,7]})
In [24]: left
Out[24]:
key1 key2 lval
0 foo one 1
1 foo two 2
2 bar one 3
In [25]: right
Out[25]:
key1 key2 rval
0 foo one 4
1 foo one 5
2 bar one 6
3 bar two 7
In [26]: print pd.merge(left,right,on = ['key1','key2'],how = 'outer')
key1 key2 lval rval
0 foo one 1.0 4.0
1 foo one 1.0 5.0
2 foo two 2.0 NaN
3 bar one 3.0 6.0
4 bar two NaN 7.0
In [27]: print pd.merge(left,right,on = 'key1')
key1 key2_x lval key2_y rval
0 foo one 1 one 4
1 foo one 1 one 5
2 foo two 2 one 4
3 foo two 2 one 5
4 bar one 3 one 6
5 bar one 3 two 7
In [28]: print pd.merge(left,right,on = 'key1',suffixes = ('_left','_right'))
key1 key2_left lval key2_right rval
0 foo one 1 one 4
1 foo one 1 one 5
2 foo two 2 one 4
3 foo two 2 one 5
4 bar one 3 one 6
5 bar one 3 two 7
5. 离散化面元划分
In [17]: age = [20,22,25,27,21,23,37,31,61,45,41,32]
In [18]: bins = [18,25,35,60,100]
In [19]: cats = pd.cut(age,bins)
In [20]: cats
Out[20]:
[(18, 25], (18, 25], (18, 25], (25, 35], (18, 25], ..., (25, 35], (60, 100], (35, 60], (35, 60], (25, 35]]
Length: 12
Categories (4, object): [(18, 25] < (25, 35] < (35, 60] < (60, 100]]
In [26]: group_names = ['YoungAdult','Adult','MiddleAged','Senior']
In [27]: pd.cut(age,bins,labels = group_names) #设置面元名称
Out[27]:
[YoungAdult, YoungAdult, YoungAdult, Adult, YoungAdult, ..., Adult, Senior, MiddleAged, MiddleAged, Adult]
Length: 12
Categories (4, object): [YoungAdult < Adult < MiddleAged < Senior]
In [28]: data = np.random.randn(10)
In [29]: cats = pd.qcut(data,4) #qcut提供根据样本分位数对数据进行面元划分
In [30]: cats
Out[30]:
[(0.268, 0.834], (-0.115, 0.268], (0.268, 0.834], [-1.218, -0.562], (-0.562, -0.115], [-1.218, -0.562], (-0.115, 0.268], [-1.218, -0.562], (0.268, 0.834], (-0.562, -0.115]]
Categories (4, object): [[-1.218, -0.562] < (-0.562, -0.115] < (-0.115, 0.268] < (0.268, 0.834]]
In [33]: pd.value_counts(cats)
Out[33]:
(0.268, 0.834] 3
[-1.218, -0.562] 3
(-0.115, 0.268] 2
(-0.562, -0.115] 2
dtype: int64
In [35]: pd.qcut(data,[0.1,0.5,0.9,1.]) #自定义分位数,[0-1]的数值
Out[35]:
[(-0.115, 0.432], (-0.115, 0.432], (0.432, 0.834], NaN, [-0.787, -0.115], [-0.787, -0.115], (-0.115, 0.432], [-0.787, -0.115], (-0.115, 0.432], [-0.787, -0.115]]
Categories (3, object): [[-0.787, -0.115] < (-0.115, 0.432] < (0.432, 0.834]]
6. 重命名轴索引
In [36]: data = DataFrame(np.arange(12).reshape(3,4),index = ['Ohio','Colorado','New York'],columns = ['one','two','thr
...: ee','four'])
In [37]: data
Out[37]:
one two three four
Ohio 0 1 2 3
Colorado 4 5 6 7
New York 8 9 10 11
In [38]: data.index = data.index.map(str.upper)
In [39]: data
Out[39]:
one two three four
OHIO 0 1 2 3
COLORADO 4 5 6 7
NEW YORK 8 9 10 11
In [40]: data.rename(index = str.title,columns=str.upper)
Out[40]:
ONE TWO THREE FOUR
Ohio 0 1 2 3
Colorado 4 5 6 7
New York 8 9 10 11
In [41]: data.rename(index={'OHIO':'INDIANA'},columns={'three':'ryana'}) #对部分轴标签更新
Out[41]:
one two ryana four
INDIANA 0 1 2 3
COLORADO 4 5 6 7
NEW YORK 8 9 10 11