4.2数据转换
import numpy as np
import pandas as pd
移除重复数据
data = pd. DataFrame( { 'k1' : [ 'one' , 'two' ] * 3 + [ 'two' ] ,
'k2' : [ 1 , 1 , 2 , 3 , 3 , 4 , 4 ] } )
data
k1 k2 0 one 1 1 two 1 2 one 2 3 two 3 4 one 3 5 two 4 6 two 4
duplicated方法
data. duplicated( )
0 False
1 False
2 False
3 False
4 False
5 False
6 True
dtype: bool
duplicated方法会返回布尔型的Series表示每一行是否有重复。例子中,第6行和第7行(索引5和索引6)显然是相同的,所以第7行(索引6)的值是True
drop_duplicates()方法
顾名思义,该方法用来删除重复的行:默认保留第一次出现的行,之后重复出现的行会被删除
data. drop_duplicates( )
k1 k2 0 one 1 1 two 1 2 one 2 3 two 3 4 one 3 5 two 4
参数一:传入列
以上两个方法都可以设置根据哪一列来进行删除,直接传入包含列的list即可,例如
data[ 'v1' ] = np. arange( 7 )
data
k1 k2 v1 0 one 1 0 1 two 1 1 2 one 2 2 3 two 3 3 4 one 3 4 5 two 4 5 6 two 4 6
例一:只传入一列
data. drop_duplicates( [ 'k1' ] )
data. duplicated( [ 'k1' ] )
0 False
1 False
2 True
3 True
4 True
5 True
6 True
dtype: bool
例二:传入两列
data. drop_duplicates( [ 'k1' , 'v1' ] )
k1 k2 v1 0 one 1 0 1 two 1 1 2 one 2 2 3 two 3 3 4 one 3 4 5 two 4 5 6 two 4 6
data. duplicated( [ 'k1' , 'k2' ] )
0 False
1 False
2 False
3 False
4 False
5 False
6 True
dtype: bool
参数二:keep
以上两个方法默认第一个出现的不算duplicate,设置**keep='last'**则可以保留最后一个:
data. drop_duplicates( [ 'k1' , 'k2' ] , keep= 'last' )
k1 k2 v1 0 one 1 0 1 two 1 1 2 one 2 2 3 two 3 3 4 one 3 4 6 two 4 6
利用函数或映射进行数据转换
例如:
data = pd. DataFrame( { 'food' : [ 'bacon' , 'pulled pork' , 'bacon' , 'Pastrami' , 'corned beef' , 'Bacon' , 'pastrami' , 'honey ham' , 'nova lox' ] ,
'ounces' : [ 4 , 3 , 12 , 6 , 7.5 , 8 , 3 , 5 , 6 ] } )
data
food ounces 0 bacon 4.0 1 pulled pork 3.0 2 bacon 12.0 3 Pastrami 6.0 4 corned beef 7.5 5 Bacon 8.0 6 pastrami 3.0 7 honey ham 5.0 8 nova lox 6.0
任务:添加一列表示该肉类食物来源的动物类型
meat_to_animal = { 'bacon' : 'pig' ,
'pulled pork' : 'pig' ,
'pastrami' : 'cow' ,
'corned beef' : 'cow' ,
'honey ham' : 'pig' ,
'nova lox' : 'salmon' }
使用映射
lowercased = data[ 'food' ] . str . lower( )
lowercased
0 bacon
1 pulled pork
2 bacon
3 pastrami
4 corned beef
5 bacon
6 pastrami
7 honey ham
8 nova lox
Name: food, dtype: object
data[ 'animal' ] = lowercased. map ( meat_to_animal)
data
food ounces animal 0 bacon 4.0 pig 1 pulled pork 3.0 pig 2 bacon 12.0 pig 3 Pastrami 6.0 cow 4 corned beef 7.5 cow 5 Bacon 8.0 pig 6 pastrami 3.0 cow 7 honey ham 5.0 pig 8 nova lox 6.0 salmon
data = data. drop( 'animal' , axis= 1 )
使用函数(lambda函数)
data[ 'animal' ] = data[ 'food' ] . map ( lambda x: meat_to_animal[ x. lower( ) ] )
data
food ounces animal 0 bacon 4.0 pig 1 pulled pork 3.0 pig 2 bacon 12.0 pig 3 Pastrami 6.0 cow 4 corned beef 7.5 cow 5 Bacon 8.0 pig 6 pastrami 3.0 cow 7 honey ham 5.0 pig 8 nova lox 6.0 salmon
替换值
fillna方法可以看做用于替换值的特殊情况(NA是特殊值),replace方法 更简单、灵活
data = pd. Series( [ 1. , -999. , 2. , -999. , -1000 , 3 ] )
data
0 1.0
1 -999.0
2 2.0
3 -999.0
4 -1000.0
5 3.0
dtype: float64
data. replace( - 999 , np. nan)
0 1.0
1 NaN
2 2.0
3 NaN
4 -1000.0
5 3.0
dtype: float64
希望将-1000也替换掉?那么就传入一个list!
data. replace( [ - 999 , - 1000 ] , np. nan)
0 1.0
1 NaN
2 2.0
3 NaN
4 NaN
5 3.0
dtype: float64
希望将-999和-1000替换成不同的值?同样,传入一个list!
data. replace( [ - 999 , - 1000 ] , [ np. nan, 0 ] )
0 1.0
1 NaN
2 2.0
3 NaN
4 0.0
5 3.0
dtype: float64
传入dict也可
data. replace( { - 999 : np. nan, - 1000 : 0 } )
0 1.0
1 NaN
2 2.0
3 NaN
4 0.0
5 3.0
dtype: float64
重命名轴索引
轴索引也可以通过map方法(与之前介绍的Series的map方法同名)重新命名,也可以使用rename方法
df.index.map
data = pd. DataFrame( np. arange( 12 ) . reshape( 3 , 4 ) ,
index= [ 'Ohio' , 'Colorado' , 'New York' ] ,
columns= [ 'one' , 'two' , 'three' , 'four' ] )
data
one two three four Ohio 0 1 2 3 Colorado 4 5 6 7 New York 8 9 10 11
transform = lambda x: x[ : 4 ] . upper( )
data. index = data. index. map ( transform)
data
one two three four OHIO 0 1 2 3 COLO 4 5 6 7 NEW 8 9 10 11
rename
直接修改index参数和columns参数
data. rename( index= str . title, columns= str . upper)
ONE TWO THREE FOUR Ohio 0 1 2 3 Colo 4 5 6 7 New 8 9 10 11
使用dict部分修改index和columns
data. rename( index= { 'OHIO' : 'INDIANA' } , columns= { 'three' : 'peekaboo' } )
one two peekaboo four INDIANA 0 1 2 3 COLO 4 5 6 7 NEW 8 9 10 11
inplace参数
直接修改数据集,而非返回副本
data. rename( index= str . upper, columns= str . title, inplace= True )
data
One Two Three Four OHIO 0 1 2 3 COLO 4 5 6 7 NEW 8 9 10 11
离散化和面元分割
pd.cut方法
根据面元(bins)划分 返回的是Categorical对象
ages = np. random. randint( 120 , size= 100 )
ages
array([ 46, 21, 116, 119, 118, 119, 90, 88, 16, 83, 103, 8, 17,
92, 75, 6, 19, 63, 19, 92, 91, 74, 10, 87, 25, 17,
116, 118, 60, 98, 52, 54, 101, 26, 66, 58, 37, 68, 19,
91, 65, 98, 57, 84, 85, 100, 32, 87, 105, 21, 29, 104,
109, 15, 101, 81, 10, 34, 27, 117, 85, 42, 54, 16, 112,
102, 51, 13, 93, 43, 8, 55, 33, 34, 72, 103, 79, 61,
41, 36, 47, 72, 116, 100, 0, 38, 41, 84, 54, 64, 46,
56, 10, 116, 78, 76, 116, 98, 20, 48])
bins= [ 18 , 25 , 35 , 60 , 100 ]
cats = pd. cut( ages, bins)
cats
[(35.0, 60.0], (18.0, 25.0], NaN, NaN, NaN, ..., (60.0, 100.0], NaN, (60.0, 100.0], (18.0, 25.0], (35.0, 60.0]]
Length: 100
Categories (4, interval[int64]): [(18, 25] < (25, 35] < (35, 60] < (60, 100]]
Categorical对象的codes属性
数字代表了属于的面元的index -1代表不属于任何一个面元
cats. codes
array([ 2, 0, -1, -1, -1, -1, 3, 3, -1, 3, -1, -1, -1, 3, 3, -1, 0,
3, 0, 3, 3, 3, -1, 3, 0, -1, -1, -1, 2, 3, 2, 2, -1, 1,
3, 2, 2, 3, 0, 3, 3, 3, 2, 3, 3, 3, 1, 3, -1, 0, 1,
-1, -1, -1, -1, 3, -1, 1, 1, -1, 3, 2, 2, -1, -1, -1, 2, -1,
3, 2, -1, 2, 1, 1, 3, -1, 3, 3, 2, 2, 2, 3, -1, 3, -1,
2, 2, 3, 2, 3, 2, 2, -1, -1, 3, 3, -1, 3, 0, 2],
dtype=int8)
Categorical对象的categories属性
cats. categories
IntervalIndex([(18, 25], (25, 35], (35, 60], (60, 100]],
closed='right',
dtype='interval[int64]')
pd.value_counts()方法
这边可以用来统计Categorical各类别分别有几个值
pd. value_counts( cats)
(60, 100] 33
(35, 60] 21
(25, 35] 7
(18, 25] 7
dtype: int64
pd.cut的right参数(设为False时区间改为左闭右开)
pd. cut( ages, bins, right= False )
[[35.0, 60.0), [18.0, 25.0), NaN, NaN, NaN, ..., [60.0, 100.0), NaN, [60.0, 100.0), [18.0, 25.0), [35.0, 60.0)]
Length: 100
Categories (4, interval[int64]): [[18, 25) < [25, 35) < [35, 60) < [60, 100)]
pd.cut的labels参数
pd. cut( ages, bins, labels= [ 'Youth' , 'Young Adult' , 'Middle Age' , 'Senior' ] )
[Middle Age, Youth, NaN, NaN, NaN, ..., Senior, NaN, Senior, Youth, Middle Age]
Length: 100
Categories (4, object): [Youth < Young Adult < Middle Age < Senior]
传入的是面元数量而非边界?
会根据最大值和最小值来计算等长的面元,并以此作为边界来划分 可选参数precision来确定小数的位数
data = np. random. rand( 20 )
pd. cut( data, 4 )
[(0.338, 0.524], (0.152, 0.338], (0.524, 0.71], (0.524, 0.71], (0.524, 0.71], ..., (0.71, 0.895], (0.338, 0.524], (0.338, 0.524], (0.524, 0.71], (0.524, 0.71]]
Length: 20
Categories (4, interval[float64]): [(0.152, 0.338] < (0.338, 0.524] < (0.524, 0.71] < (0.71, 0.895]]
pd. cut( data, 4 , precision= 2 )
[(0.34, 0.52], (0.15, 0.34], (0.52, 0.71], (0.52, 0.71], (0.52, 0.71], ..., (0.71, 0.9], (0.34, 0.52], (0.34, 0.52], (0.52, 0.71], (0.52, 0.71]]
Length: 20
Categories (4, interval[float64]): [(0.15, 0.34] < (0.34, 0.52] < (0.52, 0.71] < (0.71, 0.9]]
qcut方法
根据样本分位数对数据进行面元划分->能得到大小基本相等的面元
data = np. random. rand( 1000 )
cats = pd. qcut( data, 4 )
cats
[(0.481, 0.745], (0.256, 0.481], (-0.00040400000000000006, 0.256], (-0.00040400000000000006, 0.256], (0.481, 0.745], ..., (0.256, 0.481], (-0.00040400000000000006, 0.256], (0.256, 0.481], (0.745, 0.999], (-0.00040400000000000006, 0.256]]
Length: 1000
Categories (4, interval[float64]): [(-0.00040400000000000006, 0.256] < (0.256, 0.481] < (0.481, 0.745] < (0.745, 0.999]]
cats. value_counts( )
(-0.00040400000000000006, 0.256] 250
(0.256, 0.481] 250
(0.481, 0.745] 250
(0.745, 0.999] 250
dtype: int64
可以发现每个区间包含的样本数量都是一样的(在样本总数可以被类别数整除时)
qcut:自定义分位数
cats = pd. qcut( data, [ 0 , 0.1 , 0.5 , 0.9 , 1 . ] )
cats
[(0.481, 0.882], (0.0954, 0.481], (0.0954, 0.481], (-0.00040400000000000006, 0.0954], (0.481, 0.882], ..., (0.0954, 0.481], (0.0954, 0.481], (0.0954, 0.481], (0.882, 0.999], (0.0954, 0.481]]
Length: 1000
Categories (4, interval[float64]): [(-0.00040400000000000006, 0.0954] < (0.0954, 0.481] < (0.481, 0.882] < (0.882, 0.999]]
cats. value_counts( )
(-0.00040400000000000006, 0.0954] 100
(0.0954, 0.481] 400
(0.481, 0.882] 400
(0.882, 0.999] 100
dtype: int64
可以发现是按照自定义的分位数划分的数量 [(0.1-0)*1000 (0.5-0.1)*1000 (0.9-0.5)*1000 (1-0.9)*1000]
检测和过滤异常值
data = pd. DataFrame( np. random. randn( 1000 , 4 ) )
data. describe( )
0 1 2 3 count 1000.000000 1000.000000 1000.000000 1000.000000 mean 0.072513 -0.035990 0.000788 -0.057542 std 0.998585 1.011963 1.022648 0.985153 min -2.869199 -3.294376 -3.174468 -3.809258 25% -0.576538 -0.749310 -0.680436 -0.694334 50% 0.107212 -0.080716 0.013427 -0.045903 75% 0.717562 0.673483 0.699950 0.570491 max 3.525382 3.343722 3.196883 2.934261
例一:找出某列中绝对值大小超过3的值
col = data[ 2 ]
col[ np. abs ( col) > 3 ]
183 3.015125
437 3.196883
509 -3.174468
Name: 2, dtype: float64
例二:找出全部绝对值大小超过3的行 顺便复习一下DataFrame.any方法: DataFrame.any(self, axis=0, bool_only=None, skipna=True, level=None, **kwargs)
np.abs(data)>3返回的是布尔类型的DataFrame,然后.any(1)返回True值代表该行有绝对值大小超过3的数字,外面data[]代表返回这些行的原始值
data[ ( np. abs ( data) > 3 ) . any ( 1 ) ]
0 1 2 3 19 3.525382 0.857763 -0.464948 -1.067385 76 0.520871 -3.229111 0.392370 0.652889 183 -0.132046 1.198331 3.015125 -0.259849 368 3.289928 0.797813 -1.672622 -0.315966 437 -0.877511 -0.240920 3.196883 -0.595124 438 1.555436 3.343722 -0.838151 -0.353502 501 0.898879 -3.294376 -0.521216 0.363553 509 -1.798896 2.085908 -3.174468 -0.632310 679 -1.043170 -2.108286 0.940602 -3.809258 731 0.680952 3.000503 0.885260 -1.007260 881 -0.103924 -3.025645 -0.708571 -1.093002
设置值
data[ np. abs ( data) > 3 ] = np. sign( data) * 3
复习一下np.sign() $\mathrm{sign}(x) = \begin{cases} 1, & \text{if } x > 0 \\ 0, & \text{if } x = 0 \\ -1, & \text{if } x < 0 \end{cases}$
data. describe( )
0 1 2 3 count 1000.000000 1000.000000 1000.000000 1000.000000 mean 0.071697 -0.035785 0.000750 -0.056733 std 0.996008 1.009188 1.021479 0.982397 min -2.869199 -3.000000 -3.000000 -3.000000 25% -0.576538 -0.749310 -0.680436 -0.694334 50% 0.107212 -0.080716 0.013427 -0.045903 75% 0.717562 0.673483 0.699950 0.570491 max 3.000000 3.000000 3.000000 2.934261
根据值的正负,np.sign(data)还能生成1和-1
np. sign( data)
0 1 2 3 0 1.0 1.0 1.0 1.0 1 -1.0 -1.0 -1.0 1.0 2 1.0 -1.0 -1.0 1.0 3 1.0 1.0 -1.0 -1.0 4 1.0 -1.0 1.0 1.0 ... ... ... ... ... 995 1.0 1.0 1.0 -1.0 996 1.0 1.0 1.0 1.0 997 -1.0 -1.0 1.0 -1.0 998 -1.0 1.0 -1.0 1.0 999 -1.0 1.0 -1.0 -1.0
1000 rows × 4 columns
排列和随机采样
numpy.random.permutation
排列函数->可以用来对Series或DataFrame的行进行随机重排序
df = pd. DataFrame( np. arange( 5 * 4 ) . reshape( 5 , 4 ) )
df
0 1 2 3 0 0 1 2 3 1 4 5 6 7 2 8 9 10 11 3 12 13 14 15 4 16 17 18 19
sampler = np. random. permutation( 5 )
sampler
array([3, 2, 1, 0, 4])
对行排列
iloc take(DataFrame.take(indices, axis=0, convert=True, is_copy=True, **kwargs) 参数:convert : translate neg to pos indices (default))
df. iloc[ sampler]
0 1 2 3 3 12 13 14 15 2 8 9 10 11 1 4 5 6 7 0 0 1 2 3 4 16 17 18 19
df. take( sampler)
0 1 2 3 3 12 13 14 15 2 8 9 10 11 1 4 5 6 7 0 0 1 2 3 4 16 17 18 19
sample方法
df. sample( n= 3 )
0 1 2 3 1 4 5 6 7 3 12 13 14 15 2 8 9 10 11
replace参数
choices = pd. Series( [ 5 , 7 , - 1 , 6 , 4 ] )
choices. sample( n= 10 , replace= True )
0 5
0 5
3 6
0 5
1 7
3 6
2 -1
1 7
0 5
2 -1
dtype: int64
计算指标/哑变量
df = pd. DataFrame( { 'key' : [ 'b' , 'b' , 'a' , 'c' , 'a' , 'b' ] , 'data1' : np. arange( 6 ) } )
df
key data1 0 b 0 1 b 1 2 a 2 3 c 3 4 a 4 5 b 5
pd. get_dummies( df[ 'key' ] )
a b c 0 0 1 0 1 0 1 0 2 1 0 0 3 0 0 1 4 1 0 0 5 0 1 0
prefix参数
dummies = pd. get_dummies( df[ 'key' ] , prefix= 'key' )
dummies
key_a key_b key_c 0 0 1 0 1 0 1 0 2 1 0 0 3 0 0 1 4 1 0 0 5 0 1 0
df_with_dummy = df[ [ 'data1' ] ] . join( dummies)
df_with_dummy
data1 key_a key_b key_c 0 0 0 1 0 1 1 0 1 0 2 2 1 0 0 3 3 0 0 1 4 4 1 0 0 5 5 0 1 0
某行同时属于多个分类
mnames= [ 'movie_id' , 'title' , 'genres' ]
movies = pd. read_table( 'PythonForDataAnalysis-master/ch07/ml-1m/movies.dat' , sep= '::' , header= None , names= mnames, engine= 'python' )
movies. head( )
movie_id title genres 0 1 Toy Story (1995) Animation|Children's|Comedy 1 2 Jumanji (1995) Adventure|Children's|Fantasy 2 3 Grumpier Old Men (1995) Comedy|Romance 3 4 Waiting to Exhale (1995) Comedy|Drama 4 5 Father of the Bride Part II (1995) Comedy
抽取出不同的genre值
all_genres = [ ]
for x in movies. genres:
all_genres. extend( x. split( '|' ) )
genres = pd. unique( all_genres)
genres
array(['Animation', "Children's", 'Comedy', 'Adventure', 'Fantasy',
'Romance', 'Drama', 'Action', 'Crime', 'Thriller', 'Horror',
'Sci-Fi', 'Documentary', 'War', 'Musical', 'Mystery', 'Film-Noir',
'Western'], dtype=object)
从一个全是zero的DataFrame开始构建指标DataFrame
zero_matrix = np. zeros( ( len ( movies) , len ( genres) ) )
dummies = pd. DataFrame( zero_matrix, columns= genres)
for i, gen in enumerate ( movies. genres) :
indices = dummies. columns. get_indexer( gen. split( '|' ) )
dummies. iloc[ i, indices] = 1
movies_windic = movies. join( dummies. add_prefix( 'Genre_' ) )
movies_windic
movie_id title genres Genre_Animation Genre_Children's Genre_Comedy Genre_Adventure Genre_Fantasy Genre_Romance Genre_Drama ... Genre_Crime Genre_Thriller Genre_Horror Genre_Sci-Fi Genre_Documentary Genre_War Genre_Musical Genre_Mystery Genre_Film-Noir Genre_Western 0 1 Toy Story (1995) Animation|Children's|Comedy 1.0 1.0 1.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 1 2 Jumanji (1995) Adventure|Children's|Fantasy 0.0 1.0 0.0 1.0 1.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 2 3 Grumpier Old Men (1995) Comedy|Romance 0.0 0.0 1.0 0.0 0.0 1.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 3 4 Waiting to Exhale (1995) Comedy|Drama 0.0 0.0 1.0 0.0 0.0 0.0 1.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 4 5 Father of the Bride Part II (1995) Comedy 0.0 0.0 1.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
5 rows × 21 columns
注意:对于很大的数据,用这种方式构建多成员指标会很慢。最好使用更低级的函数,将其写⼊NumPy数组,然后用DataFrame包装。
values = np. random. rand( 10 )
values
array([0.72757738, 0.11463144, 0.98715221, 0.67817538, 0.76816537,
0.34697118, 0.5721833 , 0.24276292, 0.88097025, 0.30371988])
bins = np. arange( 0 , 1 , 0.2 )
bins
array([0. , 0.2, 0.4, 0.6, 0.8])
pd. get_dummies( pd. cut( values, bins) )
(0.0, 0.2] (0.2, 0.4] (0.4, 0.6] (0.6, 0.8] 0 0 0 0 1 1 1 0 0 0 2 0 0 0 0 3 0 0 0 1 4 0 0 0 1 5 0 1 0 0 6 0 0 1 0 7 0 1 0 0 8 0 0 0 0 9 0 1 0 0