In [45]: data = pd.DataFrame({'k1': ['one', 'two'] * 3 + ['two'],
   ....:                      'k2': [1, 1, 2, 3, 3, 4, 4]})
In [46]: data
Out[46]:
    k1  k2
0  one   1
1  two   1
2  one   2
3  two   3
4  one   3
5  two   4
6  two   4
# DataFrame的duplicated方法返回一个布尔型Series，表示各行是否是重复行
In [47]: data.duplicated()
Out[47]:
0    False
1    False
2    False
3    False
4    False
5    False
6     True
dtype: bool
# drop_duplicates方法会返回一个DataFrame，重复的行会被移除
In [48]: data.drop_duplicates()
Out[48]:
    k1  k2
0  one   1
1  two   1
2  one   2
3  two   3
4  one   3
5  two   4
# 指定部分列进行重复项判断
In [49]: data['v1']=range(7)
In [50]: data.drop_duplicates(['k1'])
Out[50]:
    k1  k2  v1
0  one   1   0
1  two   1   1
# duplicated和drop_duplicates默认保留的是第一个出现的值组合。传入keep='last'则保留最后一个
In [51]: data.drop_duplicates(['k1','k2'], keep='last')
Out[51]:
    k1  k2  v1
0  one   1   0
1  two   1   1
2  one   2   2
3  two   3   3
4  one   3   4
6  two   4   6
7.2.2 利用函数或映射进行数据转换
In [52]: data = pd.DataFrame({'food': ['bacon', 'pulled pork', 'bacon',
   ....:                               'Pastrami', 'corned beef', 'Bacon',
   ....:                               'pastrami', 'honey ham', 'nova lox'],
   ....:                      'ounces': [4, 3, 12, 6, 7.5, 8, 3, 5, 6]})
In [53]: data
Out[53]:
          food  ounces
0        bacon     4.0
1  pulled pork     3.0
2        bacon    12.0
3     Pastrami     6.0
4  corned beef     7.5
5        Bacon     8.0
6     pastrami     3.0
7    honey ham     5.0
8     nova lox     6.0
# 编写一个不同肉类到动物的映射
In [54]: meat_to_animal = {
   ....:     'bacon': 'pig',
   ....:     'pulled pork': 'pig',
   ....:     'pastrami': 'cow',
   ....:     'corned beef': 'cow',
   ....:     'honey ham': 'pig',
   ....:     'nova lox': 'salmon'
   ....: }
# Series的map方法可以接受一个函数或含有映射关系的字典型对象
# 使用Series的str.lower方法，将各个值转换为小写
In [55]: lowercased = data['food'].str.lower()
In [56]: lowercased
Out[56]:
0          bacon
1    pulled pork
2          bacon
3       pastrami
4    corned beef
5          bacon
6       pastrami
7      honey ham
8       nova lox
Name: food, dtype: object
In [57]: data['animal']= lowercased.map(meat_to_animal)
In [58]: data
Out[58]:
food ounces animal
0 bacon 4.0 pig
1 pulled pork 3.0 pig
2 bacon 12.0 pig
3 Pastrami 6.0 cow
4 corned beef 7.5 cow
5 Bacon 8.0 pig
6 pastrami 3.0 cow
7 honey ham 5.0 pig
8 nova lox 6.0 salmon
# 我们也可以传入一个能够完成全部这些工作的函数
In [59]: data['food'].map(lambda x: meat_to_animal[x.lower()])
Out[59]:
0       pig
1       pig
2       pig
3       cow
4       cow
5       pig
6       cow
7       pig
8    salmon
Name: food, dtype: object
7.2.3 替换值
# 利用fillna方法填充缺失数据可以看做值替换的一种特殊情况
In [60]: data = pd.Series([1.,-999.,2.,-999.,-1000.,3.])
In [61]: data
Out[61]:
0       1.0
1    -999.0
2       2.0
3    -999.0
4   -1000.0
5       3.0
dtype: float64
# -999这个值可能是一个表示缺失数据的标记值。要将其替换为pandas能够理解的NA值，我们可以利用replace来产生一个新的Series
In [62]: data.replace(-999, np.nan)
Out[62]:
0       1.0
1       NaN
2       2.0
3       NaN
4   -1000.0
5       3.0
dtype: float64
# 如果希望一次性替换多个值，可以传入一个由待替换值组成的列表以及一个替换值
In [63]: data.replace([-999,-1000], np.nan)
Out[63]:
0    1.0
1    NaN
2    2.0
3    NaN
4    NaN
5    3.0
dtype: float64
# 要让每个值有不同的替换值，可以传递一个替换列表
In [64]: data.replace([-999,-1000],[np.nan,0])
Out[64]:
0    1.0
1    NaN
2    2.0
3    NaN
4    0.0
5    3.0
dtype: float64
# 传入的参数也可以是字典
In [65]: data.replace({-999: np.nan,-1000:0})
Out[65]:
0    1.0
1    NaN
2    2.0
3    NaN
4    0.0
5    3.0
dtype: float64
7.2.4 重命名轴索引
# 跟Series中的值一样，轴标签也可以通过函数或映射进行转换，从而得到一个新的不同标签的对象。轴还可以被就地修改，而无需新建一个数据结构。
In [66]: data = pd.DataFrame(np.arange(12).reshape((3, 4)),
   ....:                     index=['Ohio', 'Colorado', 'New York'],
   ....:                     columns=['one', 'two', 'three', 'four'])
In [67]: transform =lambda x: x[:4].upper()
In [68]: data.index.map(transform)
Out[68]: Index(['OHIO', 'COLO', 'NEW '], dtype='object')
# 可以将其赋值给index，这样就可以对DataFrame进行就地修改
In [69]: data.index = data.index.map(transform)
In [70]: data
Out[70]:
      one  two  three  four
OHIO    0    1      2     3
COLO    4    5      6     7
NEW     8    9     10    11
# 如果想要创建数据集的转换版(而不是修改原始数据)，比较实用的方法是rename
In [71]: data.rename(index=str.title, columns=str.upper)
Out[71]:
      ONE  TWO  THREE  FOUR
Ohio    0    1      2     3
Colo    4    5      6     7
New     8    9     10    11
# rename可以实现复制DataFrame并对其索引和列标签进行赋值。如果希望就地修改某个数据集，传入inplace=True即可
In [73]: data.rename(index={'OHIO':'INDIANA'}, inplace=True)
In [74]: data
Out[74]:
         one  two  three  four
INDIANA    0    1      2     3
COLO       4    5      6     7
NEW        8    9     10    11
7.2.5 离散化和面元划分
# 为了便于分析，连续数据常常被离散化或拆分为“面元”(bin)
In [75]: ages = [20, 22, 25, 27, 21, 23, 37, 31, 61, 45, 41, 32]
# 使用pandas的cut函数将这些数据划分为“18到25”、“26到35”、“35到60”以及“60以上”几个面元
In [76]: bins =[18,25,35,60,100]
In [77]: cats = pd.cut(ages, bins)
In [78]: cats
Out[78]:
[(18, 25], (18, 25], (18, 25], (25, 35], (18, 25], ..., (25, 35]]
Length: 12
Categories (4, interval[int64]): [(18, 25] < (25, 35] < (35, 60] < (60, 100]]
# pandas返回的是一个特殊的Categorical对象。结果展示了pandas.cut划分的面元。可以将其看做一组表示面元名称的字符串。它的底层含有一个表示不同分类名称的类型数组，以及一个codes属性中的年龄数据的标签
In [79]: cats.codes
Out[79]: array([0,0,0,1,0,0,2,1,3,2,2,1], dtype=int8)
In [80]: cats.categories
Out[80]:
IntervalIndex([(18,25],(25,35],(35,60],(60,100]]
closed='right',
dtype='interval[int64]')
In [81]: pd.value_counts(cats)
Out[81]:
(18, 25]     5
(35, 60]     3
(25, 35]     3
(60, 100]    1
dtype: int64
# 哪边是闭端可以通过right=False进行修改
In [82]: pd.cut(ages,[18,26,36,61,100], right=False)
Out[82]:
[[18, 26), [18, 26), [18, 26), [26, 36), [18, 26), ..., [26, 36), [61, 100), [36, 61), [36, 61), [26, 36)]
Length: 12
Categories (4, interval[int64]): [[18, 26) < [26, 36) < [36, 61) < [61, 100)]
# 可以通过传递一个列表或数组到labels，设置自己的面元名称
In [83]: group_names =['Youth','YoungAdult','MiddleAged','Senior']
In [84]: pd.cut(ages, bins, labels=group_names)
Out[84]:
[Youth, Youth, Youth, YoungAdult, Youth, ..., YoungAdult, Senior, MiddleAged, MiddleAged, YoungAdult]
Length: 12
Categories (4, object): [Youth < YoungAdult < MiddleAged < Senior]
# 如果向cut传入的是面元的数量而不是确切的面元边界，则它会根据数据的最小值和最大值计算等长面元。
In [85]: data = np.random.rand(20)
In [86]: pd.cut(data, 4, precision=2)  # 选项precision=2，限定小数只有两位
Out[86]:
[(0.34, 0.55], (0.34, 0.55], (0.76, 0.97], (0.76, 0.97], (0.34, 0.55], ..., (0.34, 0.55], (0.55, 0.76], (0.34, 0.55], (0.12, 0.34]]
Length: 20
Categories (4, interval[float64]): [(0.12, 0.34] < (0.34, 0.55] < (0.55, 0.76] < (0.76, 0.97]]
# qcut是一个非常类似于cut的函数，它可以根据样本分位数对数据进行面元划分。
In [87]: data = np.random.randn(1000)# Normally distributed
In [88]: cats = pd.qcut(data,4)# Cut into quartiles
In [89]: cats
Out[89]:
[(-0.0265, 0.62], (0.62, 3.928], (-0.68, -0.0265], (0.62, 3.928], ..., (-0.68, -0.0265], (-0.68, -0.0265], (-2.95, -0.68], (-0.0265, 0.62]]
Length: 1000
Categories (4, interval[float64]): [(-2.95, -0.68] < (-0.68, -0.0265] < (-0.0265, 0.62] < (0.62, 3.928]]
In [90]: pd.value_counts(cats)
Out[90]:
(0.62, 3.928]       250
(-0.0265, 0.62]     250
(-0.68, -0.0265]    250
(-2.95, -0.68]      250
dtype: int64
# 与cut类似，你也可以传递自定义的分位数
In [91]: pd.qcut(data,[0,0.1,0.5,0.9,1.])
Out[91]:
[(-0.0265, 1.286], (-0.0265, 1.286], (-1.187, -0.0265], (-0.0265, 1.286], ..., (-1.187, -0.0265], (-1.187, -0.0265], (-2.95, -1.187], (-1.187, -0.0265]]
Length: 1000
Categories (4, interval[float64]): [(-2.95, -1.187] < (-1.187, -0.0265] < (-0.0265, 1.286] < (1.286, 3.928]]
7.2.6 检测和过滤异常值
# 过滤或变换异常值(outlier)在很大程度上就是运用数组运算
In [92]: data = pd.DataFrame(np.random.randn(1000,4))
In [93]: data.describe()
Out[93]:
                 0            1            2            3
count  1000.000000  1000.000000  1000.000000  1000.000000
mean      0.049091     0.026112    -0.002544    -0.051827
std       0.996947     1.007458     0.995232     0.998311
min      -3.645860    -3.184377    -3.745356    -3.428254
25%      -0.599807    -0.612162    -0.687373    -0.747478
50%       0.047101    -0.013609    -0.022158    -0.088274
75%       0.756646     0.695298     0.699046     0.623331
max       2.653656     3.525865     2.735527     3.366626
# 假设你想要找出某列中绝对值大小超过3的值
In [94]: col = data[2]
In [95]: col[np.abs(col)>3]
Out[95]:
41    -3.399312
136   -3.745356
Name: 2, dtype: float64
# 要选出全部含有“超过3或-3的值”的行，你可以在布尔型DataFrame中使用any方法
In [96]: data[(np.abs(data)>3).any(1)]
Out[96]:
            0         1         2         3
41   0.457246 -0.025907 -3.399312 -0.974657
60   1.951312  3.260383  0.963301  1.201206
136  0.508391 -0.196713 -3.745356 -1.520113
235 -0.242459 -3.056990  1.918403 -0.578828
258  0.682841  0.326045  0.425384 -3.428254
322  1.179227 -3.184377  1.369891 -1.074833
544 -3.548824  1.553205 -2.186301  1.277104
635 -0.578093  0.193299  1.397822  3.366626
782 -0.207434  3.525865  0.283070  0.544635
803 -3.645860  0.255475 -0.549574 -1.907459
# 根据这些条件，就可以对值进行设置。下面的代码可以将值限制在区间-3到3以内
In [97]: data[np.abs(data)>3]= np.sign(data)*3
In [98]: data.describe()
Out[98]:
                 0            1            2            3
count  1000.000000  1000.000000  1000.000000  1000.000000
mean      0.050286     0.025567    -0.001399    -0.051765
std       0.992920     1.004214     0.991414     0.995761
min      -3.000000    -3.000000    -3.000000    -3.000000
25%      -0.599807    -0.612162    -0.687373    -0.747478
50%       0.047101    -0.013609    -0.022158    -0.088274
75%       0.756646     0.695298     0.699046     0.623331
max       2.653656     3.000000     2.735527     3.000000
# 根据数据的值是正还是负，np.sign(data)可以生成1和-1
In [99]: np.sign(data).head()
Out[99]:
     0    1    2    3
0 -1.0  1.0 -1.0  1.0
1  1.0 -1.0  1.0 -1.0
2  1.0  1.0  1.0 -1.0
3 -1.0 -1.0  1.0 -1.0
4 -1.0  1.0 -1.0 -1.0
7.2.7 排列和随机采样
# 利用numpy.random.permutation函数可以轻松实现对Series或DataFrame的列的排列工作(permuting，随机重排序)
# 通过需要排列的轴的长度调用permutation，可产生一个表示新顺序的整数数组
In [100]: df = pd.DataFrame(np.arange(5*4).reshape((5,4)))
In [101]: sampler = np.random.permutation(5)
In [102]: sampler
Out[102]: array([3, 1, 4, 2, 0])
# 可以在基于iloc的索引操作或take函数中使用该数组
In [103]: df
Out[103]:
    0   1   2   3
0   0   1   2   3
1   4   5   6   7
2   8   9  10  11
3  12  13  14  15
4  16  17  18  19
In [104]: df.take(sampler)
Out[104]:
    0   1   2   3
3  12  13  14  15
1   4   5   6   7
4  16  17  18  19
2   8   9  10  11
0   0   1   2   3
# 如果不想用替换的方式选取随机子集，可以在Series和DataFrame上使用sample方法
In [105]: df.sample(n=3)
Out[105]:
    0   1   2   3
3  12  13  14  15
4  16  17  18  19
2   8   9  10  11
# 要通过替换的方式产生样本(允许重复选择)，可以传递replace=True到sample
In [106]: choices = pd.Series([5,7,-1,6,4])
In [107]: draws = choices.sample(n=10, replace=True)
In [108]: draws
Out[108]:
4    4
1    7
4    4
2   -1
0    5
3    6
1    7
4    4
0    5
4    4
dtype: int64
7.2.8 计算指标、哑变量
另一种常用于统计建模或机器学习的转换方式是：将分类变量转换为“哑变量”或“指标矩阵”。
# 如果DataFrame的某一列中含有k个不同的值，pandas的get_dummies函数可以派生出一个k列矩阵或DataFrame(其值全为1和0)
In [109]: df = pd.DataFrame({'key': ['b', 'b', 'a', 'c', 'a', 'b'],
   .....:                    'data1': range(6)})
In [110]: pd.get_dummies(df['key'])
Out[110]:
   a  b  c
0  0  1  0
1  0  1  0
2  1  0  0
3  0  0  1
4  1  0  0
5  0  1  0
# get_dummies的prefix参数可以给指标DataFrame的列加上一个前缀，以便能够跟其他数据进行合并。
In [111]: dummies = pd.get_dummies(df['key'], prefix='key')
In [112]: df_with_dummy = df[['data1']].join(dummies)
In [113]: df_with_dummy
Out[113]:
   data1  key_a  key_b  key_c
0      0      0      1      0
1      1      0      1      0
2      2      1      0      0
3      3      0      0      1
4      4      1      0      0
5      5      0      1      0