pandas
文件读写
文件读取
# !/user/bin/python
# -- coding: UTF-8 --
import numpy as np
import pandas as pd
uid=pd.read_csv(r'C:\Users\Administrator\Desktop\uid_countrycode.csv',header=0)
#读取txt
uid=pd.read_table(r'C:\Users\Administrator\Desktop\uid_countrycode.csv',names=['uid','date'])
- 1
- 2
- 3
- 4
- 5
- 6
- 7
- 8
存入文件
#存为csv文件
other.to_csv(r'C:\Users\Administrator\Desktop\new.csv',sep=',')
#存为txt文件
other.to_csv(r'C:\Users\Administrator\Desktop\new.txt',sep='\t')
- 1
- 2
- 3
- 4
- 5
预处理
创建dataframe
import pandas as pd
a=[1,2,3]
b=[5,6,7]
c={"a" : a, "b" : b}#将列表a,b转换成字典
data=pd.DataFrame(c)#将字典转换成为数据框
print(data)
- 1
- 2
- 3
- 4
- 5
- 6
简单处理
查看数据类型
data.dtypes
- 1
拼接
uid=[vmoney,paytime,viewtime]
uid=pd.concat(uid)
#注意合并以后重置index
uid = uid.reset_index(drop=True)
- 1
- 2
- 3
- 4
匹配合并
df_inner=pd.merge(df,df1,how='inner')
df_left=pd.merge(df,df1,how='left')
df_right=pd.merge(df,df1,how='right')
df_outer=pd.merge(df,df1,how='outer')
- 1
- 2
- 3
- 4
命名列名
other.columns=['a','b','c']
- 1
替换列值
#前提只r1列存在
data.r1=list(range(len(data.index)))
print data
fecha r1 r2 r3
0 2017-05-01 0 8 6
1 2017-05-02 1 17 4
2 2017-05-03 2 16 11
3 2017-05-04 3 1 2
4 2017-05-05 4 19 9
- 1
- 2
- 3
- 4
- 5
- 6
- 7
- 8
- 9
创建新列
data['r4']=list(range(len(data.index)))
print data
fecha r1 r2 r3 r4
0 2017-05-01 5 14 7 0
1 2017-05-02 4 5 4 1
2 2017-05-03 16 6 11 2
3 2017-05-04 3 8 8 3
#加标签列/分类列
game.loc[game.onehour>0.5,'new']='A'
game['new']=game['new'].fillna('B')
print game
type num onehour fake_pcu200 day new
0 DOTA II 237 0.49 0.21 2017/7/1 B
1 DOTA II 206 0.52 0.20 2017/7/2 A
2 DOTA II 231 0.52 0.24 2017/7/3 A
3 DOTA II 233 0.53 0.26 2017/7/4 A
4 DOTA II 273 0.56 0.26 2017/7/5 A
5 DOTA II 275 0.54 0.26 2017/7/6 A
6 DOTA II 251 0.53 0.27 2017/7/7 A
- 1
- 2
- 3
- 4
- 5
- 6
- 7
- 8
- 9
- 10
- 11
- 12
- 13
- 14
- 15
- 16
- 17
- 18
- 19
- 20
分组及排序
#按'type'和'new'列分组,对指定列'onehour'求和
#如果不指定求和列,默认对剩余所有列求和
game=game.groupby(['type','new'])['onehour'].sum().reset_index()
print game
type new onehour
0 DOTA II A 4.28
1 DOTA II B 1.48
2 Gerena RoV:Mobile MOBA A 4.95
3 Gerena RoV:Mobile MOBA B 1.24
4 Grand Theft Auto V A 4.81
5 Grand Theft Auto V B 1.38
#按指定列排序
game.sort_values(by=['onehour'],ascending=False)
#按索引排序
game.sort_index()
- 1
- 2
- 3
- 4
- 5
- 6
- 7
- 8
- 9
- 10
- 11
- 12
- 13
- 14
- 15
- 16
- 17
全组排序、组内排序及标号
data['sort_num']=data['comment_num'].rank(ascending=0,method='dense')
第一个参数 ascending,就是选择是升序还是降序排列。
第二个参数
method=’dense’
method=’first’
method=’min’
method=’max’
组内排序 data['group_sort']=data['comment_num'].groupby(data['cate']).rank(ascending=0,method='dense')
筛选
简单操作
筛选出指定行
ID=uid.loc[uid['countrycode']=='ID']
other=uid.loc[uid['countrycode'].isin(['VN','TH','ID'])==False]
- 1
- 2
筛选掉带空值的行
other=other.dropna()
- 1
对缺失值填充
other=other.fillna()
- 1
分组
other=other.groupby('countrycode')
a=[x for x in other]
- 1
- 2
lambda函数筛选
lambda 参数:表达式
In [85]: df1
Out[85]:
A B C D
a -0.023688 2.410179 1.450520 0.206053
b -0.251905 -2.213588 1.063327 1.266143
c 0.299368 -0.863838 0.408204 -1.048089
d -0.025747 -0.988387 0.094055 1.262731
e 1.289997 0.082423 -0.055758 0.536580
f -0.489682 0.369374 -0.034571 -2.484478
In [86]: df1.loc[lambda df: df.A > 0, :]
Out[86]:
A B C D
c 0.299368 -0.863838 0.408204 -1.048089
e 1.289997 0.082423 -0.055758 0.536580
In [90]: df1.A.loc[lambda s: s > 0]
Out[90]:
c 0.299368
e 1.289997
Name: A, dtype: float64
- 1
- 2
- 3
- 4
- 5
- 6
- 7
- 8
- 9
- 10
- 11
- 12
- 13
- 14
- 15
- 16
- 17
- 18
- 19
- 20
- 21
where筛选
where语句保留了原始series/dataframe的长度和大小,不满足条件的值默认被替换为空值NaN
In [166]: df[df < 0]
Out[166]:
A B C D
2000-01-01 -2.104139 -1.309525 NaN NaN
2000-01-02 -0.352480 NaN -1.192319 NaN
2000-01-03 -0.864883 NaN -0.227870 NaN
2000-01-04 NaN -1.222082 NaN -1.233203
#where有other参数,用于替换不满足条件的值
In [167]: df.where(df < 0, -df)
Out[167]:
A B C D
2000-01-01 -2.104139 -1.309525 -0.485855 -0.245166
2000-01-02 -0.352480 -0.390389 -1.192319 -1.655824
2000-01-03 -0.864883 -0.299674 -0.227870 -0.281059
2000-01-04 -0.846958 -1.222082 -0.600705 -1.233203
#参数还可以为函数
In [185]: df3 = pd.DataFrame({'A': [1, 2, 3],
.....: 'B': [4, 5, 6],
.....: 'C': [7, 8, 9]})
.....:
In [186]: df3.where(lambda x: x > 4, lambda x: x + 10)
Out[186]:
A B C
0 11 14 7
1 12 5 8
2 13 6 9
- 1
- 2
- 3
- 4
- 5
- 6
- 7
- 8
- 9
- 10
- 11
- 12
- 13
- 14
- 15
- 16
- 17
- 18
- 19
- 20
- 21
- 22
- 23
- 24
- 25
- 26
- 27
- 28
- 29
DataFrame.where()不同于numpy.where()
df1.where(m, df2)=np.where(m, df1, df2)
In [177]: df.where(df < 0, -df) == np.where(df < 0, df, -df)
Out[177]:
A B C D
2000-01-01 True True True True
2000-01-02 True True True True
2000-01-03 True True True True
#在np里还可以根据where分组贴标签
#如果price列的值>3000,group列显示high,否则显示low
df_inner['group'] = np.where(df_inner['price'] > 3000,'high','low')
#对复合多个条件的数据进行分组标记
df_inner.loc[(df_inner['city'] == 'beijing') & (df_inner['price'] >= 4000), 'sign']=1
- 1
- 2
- 3
- 4
- 5
- 6
- 7
- 8
- 9
- 10
- 11
- 12
- 13
.query()筛选
query括号内允许使用表达式
In [190]: df = pd.DataFrame(np.random.rand(6, 3), columns=list('abc'))
In [191]: df
Out[191]:
a b c
0 0.438921 0.118680 0.863670
1 0.138138 0.577363 0.686602
2 0.595307 0.564592 0.520630
3 0.913052 0.926075 0.616184
4 0.078718 0.854477 0.898725
5 0.076404 0.523211 0.591538
# pure python
In [192]: df[(df.a < df.b) & (df.b < df.c)]
Out[192]:
a b c
1 0.138138 0.577363 0.686602
4 0.078718 0.854477 0.898725
5 0.076404 0.523211 0.591538
# query
In [193]: df.query('(a < b) & (b < c)')
Out[193]:
a b c
1 0.138138 0.577363 0.686602
4 0.078718 0.854477 0.898725
5 0.076404 0.523211 0.591538
# 括号内也可以是比较复杂的表达式
# short query syntax
In [250]: shorter = df.query('a < b < c and (not bools) or bools > 2')
# equivalent in pure Python
In [251]: longer = df[(df.a < df.b) & (df.b < df.c) & (~df.bools) | (df.bools > 2)]
- 1
- 2
- 3
- 4
- 5
- 6
- 7
- 8
- 9
- 10
- 11
- 12
- 13
- 14
- 15
- 16
- 17
- 18
- 19
- 20
- 21
- 22
- 23
- 24
- 25
- 26
- 27
- 28
- 29
- 30
- 31
- 32
- 33
- 34
- 35
重复值处理
切片
#生成随机dataframe
import random as rd
import pandas as pd
r1=[rd.randrange(1,20) for x in xrange(62)]
r2=[rd.randrange(1,20) for x in xrange(62)]
r3=[rd.randrange(1,20) for x in xrange(62)]
fecha=pd.date_range('2017-05-01','2017-07-01')
data=pd.DataFrame({'fecha':fecha,'r1':r1,'r2':r2,'r3':r3})
print data
- 1
- 2
- 3
- 4
- 5
- 6
- 7
- 8
- 9
注意此处xrange作用类似range, 但range结果为list, xrange结果是一个生成器。要生成较大的数字序列时,xrange比range节省内存空间。
[ ]切片方法
In [31]: s[:5]
Out[31]:
2000-01-01 0.469112
2000-01-02 1.212112
2000-01-03 -0.861849
2000-01-04 0.721555
2000-01-05 -0.424972
In [32]: s[::2] #跳行取
Out[32]:
2000-01-01 0.469112
2000-01-03 -0.861849
2000-01-05 -0.424972
In [33]: s[::-1] #逆序取
Out[33]:
2000-01-05 -0.424972
2000-01-04 0.721555
2000-01-03 -0.861849
2000-01-02 1.212112
2000-01-01 0.469112
- 1
- 2
- 3
- 4
- 5
- 6
- 7
- 8
- 9
- 10
- 11
- 12
- 13
- 14
- 15
- 16
- 17
- 18
- 19
- 20
- 21
按照索引实现列切片或区域切片
# 行选择
data1=data[1:5]
print data1
fecha r1 r2 r3
1 2017-05-02 3 6 15
2 2017-05-03 14 14 11
3 2017-05-04 6 8 12
4 2017-05-05 13 14 2
# 列选择
data2=data[['r1','r2']]
print data2
r1 r2
0 17 19
1 3 6
2 14 14
3 6 8
4 13 14
5 3 14
6 18 12
7 15 18
8 3 11
9 1 14
10 7 16
11 13 2
12 2 12
13 16 10
14 14 6
15 10 12
16 13 13
17 6 15
18 13 4
19 16 13
20 6 17
21 2 3
22 10 3
23 17 6
24 19 5
25 15 16
26 12 15
27 13 9
28 11 7
29 8 1
30 4 18
31 13 4
# 区块选择
data3=data[:10][['r1','r2']]
print data3
r1 r2
0 17 19
1 3 6
2 14 14
3 6 8
4 13 14
5 3 14
6 18 12
7 15 18
8 3 11
9 1 14
- 1
- 2
- 3
- 4
- 5
- 6
- 7
- 8
- 9
- 10
- 11
- 12
- 13
- 14
- 15
- 16
- 17
- 18
- 19
- 20
- 21
- 22
- 23
- 24
- 25
- 26
- 27
- 28
- 29
- 30
- 31
- 32
- 33
- 34
- 35
- 36
- 37
- 38
- 39
- 40
- 41
- 42
- 43
- 44
- 45
- 46
- 47
- 48
- 49
- 50
- 51
- 52
- 53
- 54
- 55
- 56
- 57
- 58
- 59
- 60
- 61
- 62
- 63
.loc
按照标签进行行列选择
注意.loc的切片结果包括了第五行,而[ ]不包括。
data1=data.loc[1:5]
print data1
fecha r1 r2 r3
1 2017-05-02 5 16 14
2 2017-05-03 9 5 19
3 2017-05-04 9 7 18
4 2017-05-05 2 4 12
5 2017-05-06 16 18 4
- 1
- 2
- 3
- 4
- 5
- 6
- 7
- 8
- 9
注意下列写法:
In [15]: df.loc[:,['B', 'A']] = df[['A', 'B']].values
In [16]: df[['A', 'B']]
Out[16]:
A B
2000-01-01 0.469112 -0.282863
2000-01-02 1.212112 -0.173215
2000-01-03 -0.861849 -2.104569
- 1
- 2
- 3
- 4
- 5
- 6
- 7
补充
#取特定列
data=data.loc[:,['r1','r3']]
print data
r1 r3
0 8 3
1 2 18
2 18 6
3 1 9
4 13 8
#取连续列报错
data=data.loc[:,['r1':'r3']]
#取连续列正确方式
data=data.loc[:,'r1':'r3']
print data
r1 r2 r3
0 8 18 15
1 9 13 12
2 10 18 2
3 14 17 2
4 1 2 16
- 1
- 2
- 3
- 4
- 5
- 6
- 7
- 8
- 9
- 10
- 11
- 12
- 13
- 14
- 15
- 16
- 17
- 18
- 19
- 20
- 21
- 22
.loc还能选择特定日期内的数据。但要求日期在索引内。
data_fecha=data.set_index('fecha')
print data_fecha
r1 r2 r3
fecha
2017-05-01 15 4 13
2017-05-02 8 4 11
2017-05-03 6 8 10
2017-05-04 9 19 1
2017-05-05 12 9 7
2017-05-06 18 6 8
2017-05-07 17 14 17
2017-05-08 1 13 16
2017-05-09 13 2 7
2017-05-10 8 16 5
2017-05-11 13 7 16
2017-05-12 6 11 18
2017-05-13 11 7 13
2017-05-14 1 13 12
2017-05-15 19 11 4
2017-05-16 4 8 14
2017-05-17 2 14 15
2017-05-18 6 15 6
2017-05-19 13 9 4
2017-05-20 3 10 7
2017-05-21 11 12 10
2017-05-22 5 17 11
2017-05-23 7 18 17
2017-05-24 3 17 2
2017-05-25 10 8 17
2017-05-26 3 16 9
2017-05-27 9 1 6
2017-05-28 8 8 17
2017-05-29 9 9 16
2017-05-30 4 10 6
2017-05-31 8 18 14
2017-06-01 14 15 14
#生成两个特定日期
fecha1=dt.datetime(2017,5,5)
fecha2=dt.datetime(2017,5,10)
#生成切片数据
data1=data_fecha.loc[fecha1:fecha2]
print data1
r1 r2 r3
fecha
2017-05-05 18 6 10
2017-05-06 19 12 14
2017-05-07 16 15 8
2017-05-08 8 17 10
2017-05-09 11 14 8
2017-05-10 7 19 16
- 1
- 2
- 3
- 4
- 5
- 6
- 7
- 8
- 9
- 10
- 11
- 12
- 13
- 14
- 15
- 16
- 17
- 18
- 19
- 20
- 21
- 22
- 23
- 24
- 25
- 26
- 27
- 28
- 29
- 30
- 31
- 32
- 33
- 34
- 35
- 36
- 37
- 38
- 39
- 40
- 41
- 42
- 43
- 44
- 45
- 46
- 47
- 48
- 49
- 50
- 51
- 52
- 53
.iloc
按照索引选取,其括号内只能为数值。
#行选择
data1=data[1:5]
print data1
fecha r1 r2 r3
1 2017-05-02 1 6 7
2 2017-05-03 15 9 15
3 2017-05-04 15 10 13
4 2017-05-05 15 15 14
#列选择
data2=data.iloc[:,[1,3]]
print data2
r1 r3
0 2 18
1 1 7
2 15 15
3 15 13
4 15 14
5 16 7
6 16 9
7 1 5
8 3 9
9 2 1
10 1 17
11 19 3
12 2 2
13 6 14
14 9 10
15 13 6
16 14 14
17 9 8
18 16 13
19 11 14
20 7 10
21 1 12
22 13 1
23 12 19
24 15 17
25 18 8
26 6 17
27 15 3
28 5 12
29 12 1
30 14 19
31 17 13
#切片选择
data3=data.iloc[[1,5],[1,3]]
print data3
r1 r3
1 1 7
5 16 7
- 1
- 2
- 3
- 4
- 5
- 6
- 7
- 8
- 9
- 10
- 11
- 12
- 13
- 14
- 15
- 16
- 17
- 18
- 19
- 20
- 21
- 22
- 23
- 24
- 25
- 26
- 27
- 28
- 29
- 30
- 31
- 32
- 33
- 34
- 35
- 36
- 37
- 38
- 39
- 40
- 41
- 42
- 43
- 44
- 45
- 46
- 47
- 48
- 49
- 50
- 51
- 52
- 53
- 54
- 55
.at
通过标签访问单个元素,速度比.loc快
a=data.iat[4,'r1']
print a
12
- 1
- 2
- 3
.iat
类似.iloc,访问索引提取元素
a=data.iat[4,1]
print a
12
- 1
- 2
- 3
索引
设置索引
#将原有某列设置为索引
data_fecha=data.set_index('fecha')
r1 r2 r3
fecha
2017-05-01 15 4 13
2017-05-02 8 4 11
2017-05-03 6 8 10
2017-05-04 9 19 1
2017-05-05 12 9 7
#添加新列为索引
data.index=data['fecha']
fecha r1 r2 r3
fecha
2017-05-01 2017-05-01 6 2 17
2017-05-02 2017-05-02 17 18 5
2017-05-03 2017-05-03 1 9 18
- 1
- 2
- 3
- 4
- 5
- 6
- 7
- 8
- 9
- 10
- 11
- 12
- 13
- 14
- 15
- 16
- 17
- 18
布尔索引
In [140]: df2 = pd.DataFrame({'a' : ['one', 'one', 'two', 'three', 'two', 'one', 'six'],
.....: 'b' : ['x', 'y', 'y', 'x', 'y', 'x', 'x'],
.....: 'c' : np.random.randn(7)})
.....:
# only want 'two' or 'three'
In [141]: criterion = df2['a'].map(lambda x: x.startswith('t'))
In [142]: df2[criterion]
Out[142]:
a b c
2 two y 0.041290
3 three x 0.361719
4 two y -0.238075
# equivalent but slower
In [143]: df2[[x.startswith('t') for x in df2['a']]]
Out[143]:
a b c
2 two y 0.041290
3 three x 0.361719
4 two y -0.238075
# Multiple criteria
In [144]: df2[criterion & (df2['b'] == 'x')]
Out[144]:
a b c
3 three x 0.361719
- 1
- 2
- 3
- 4
- 5
- 6
- 7
- 8
- 9
- 10
- 11
- 12
- 13
- 14
- 15
- 16
- 17
- 18
- 19
- 20
- 21
- 22
- 23
- 24
- 25
- 26
- 27
- 28
多重索引
In [152]: s_mi = pd.Series(np.arange(6),index=pd.MultiIndex.from_product([[0, 1], ['a', 'b', 'c']]))
In [153]: s_mi
Out[153]:
0 a 0
b 1
c 2
1 a 3
b 4
c 5
In [154]: s_mi.iloc[s_mi.index.isin([(1, 'a'), (2, 'b'), (0, 'c')])]
Out[154]:
0 c 2
1 a 3
In [155]: s_mi.iloc[s_mi.index.isin(['a', 'c', 'e'], level=1)]
Out[155]:
0 a 0
c 2
1 a 3
c 5
- 1
- 2
- 3
- 4
- 5
- 6
- 7
- 8
- 9
- 10
- 11
- 12
- 13
- 14
- 15
- 16
- 17
- 18
- 19
- 20
- 21
- 22
详细见官方文档:http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy