# Data cleaning and preparation (数据清洗和准备)
import numpy as np
import pandas as pd

# Remember the current pandas display setting so it can be restored later.
PREVIOUS_MAX_ROWS = pd.options.display.max_rows
pd.options.display.max_rows = 20

# Fixed seed so the random examples below are reproducible.
np.random.seed(12345)

import matplotlib.pyplot as plt

plt.rc('figure', figsize=(10, 6))
np.set_printoptions(precision=4, suppress=True)
# --- Handling missing data ---
string_data = pd.Series(['aardvark', 'artichoke', np.nan, 'avocado'])
string_data
# 0     aardvark
# 1    artichoke
# 2          NaN
# 3      avocado
# dtype: object

# isnull() flags the positions holding NaN/None.
string_data.isnull()
# 0    False
# 1    False
# 2     True
# 3    False
# dtype: bool
# 1. Filtering out missing data from a Series.
from numpy import nan as NA

data = pd.Series([1, NA, 3.5, NA, 7])
data.dropna()
# 0    1.0
# 2    3.5
# 4    7.0
# dtype: float64

# Equivalent: boolean indexing with notnull().
data[data.notnull()]
# same result as dropna() above (the original paste showed an unrelated
# DataFrame dump here by mistake)
# 2. Drop any row that contains at least one NaN (DataFrame default).
data = pd.DataFrame([[1.0, 6.5, 3.0],
                     [1.0, NA, NA],
                     [NA, NA, NA],
                     [NA, 6.5, 3.0]])
data
#      0    1    2
# 0  1.0  6.5  3.0
# 1  1.0  NaN  NaN
# 2  NaN  NaN  NaN
# 3  NaN  6.5  3.0

cleaned = data.dropna()   # keeps only row 0, the fully populated one
cleaned

# 3. Drop only rows in which EVERY value is NaN.
data.dropna(how='all')
# row 2 is removed; rows 0, 1 and 3 survive

# 4. Drop only columns in which every value is NaN.
data[4] = NA              # add an all-NaN column labelled 4
data
data.dropna(axis=1, how='all')
# column 4 is removed; columns 0, 1, 2 survive
# Example: random frame with NaNs injected into two columns.
df = pd.DataFrame(np.random.randn(7, 3))
df.iloc[:4, 1] = NA   # first four rows of column 1
df.iloc[:2, 2] = NA   # first two rows of column 2
df

df.dropna()           # only rows 4-6 are complete
df.dropna(thresh=2)   # keep rows with at least 2 non-NaN values (rows 2-6)

# --- Filling in missing data ---
# 1. Replace every NaN with 0.
df.fillna(0)

# 2. Fill with a different value per column (dict keyed by column label).
df.fillna({1: 0.5, 2: 0})
# --- Data transformation ---
# 1. De-duplication where ALL columns must match.
data = pd.DataFrame({'k1': ['one', 'two'] * 3 + ['two'],
                     'k2': [1, 1, 2, 3, 3, 4, 4]})
data
#     k1  k2
# 0  one   1 ... 5  two   4  6  two   4

data.duplicated()        # True only for row 6 (duplicate of row 5)
data.drop_duplicates()   # keeps rows 0-5

data['v1'] = range(7)    # add a column that makes every row unique
data

# 2. De-duplication on selected column(s) only.
data.drop_duplicates(['k1'])                      # first 'one' and first 'two'
data.drop_duplicates(['k1', 'k2'], keep='last')   # keep the LAST of each pair
#     k1  k2  v1
# 0  one   1   0 ... 6  two   4   6   (row 5 dropped in favour of row 6)
# --- Replacing values in a Series ---
data = pd.Series([1.0, -999.0, 2.0, -999.0, -1000.0, 3.0])
data

# Replace a single sentinel value with NaN.
data.replace(-999, np.nan)
# 1.0, NaN, 2.0, NaN, -1000.0, 3.0

# Replace several values with the same replacement.
data.replace([-999, -1000], np.nan)
# 1.0, NaN, 2.0, NaN, NaN, 3.0

# Parallel lists: each value gets its own replacement.
data.replace([-999, -1000], [np.nan, 0])
# 1.0, NaN, 2.0, NaN, 0.0, 3.0

# A dict expresses the same mapping more readably.
data.replace({-999: np.nan, -1000: 0})
# 1.0, NaN, 2.0, NaN, 0.0, 3.0
# --- Renaming axis indexes ---
data = pd.DataFrame(np.arange(12).reshape((3, 4)),
                    index=['Ohio', 'Colorado', 'New York'],
                    columns=['one', 'two', 'three', 'four'])
data

# rename() accepts functions that are applied to every label.
data.rename(index=str.title, columns=str.upper)
#           ONE  TWO  THREE  FOUR
# Ohio        0    1      2     3
# Colorado    4    5      6     7   (the original paste truncated these labels)
# New York    8    9     10    11
# 1. Bin a 1-D sample into intervals with pd.cut (frequency discretisation).
ages = [20, 22, 25, 27, 21, 23, 37, 31, 61, 45, 41, 32]
bins = [18, 25, 35, 60, 100]
cats = pd.cut(ages, bins)
cats
# Categories (4, interval[int64]): [(18, 25] < (25, 35] < (35, 60] < (60, 100]]

cats.codes        # bin index per element: array([0, 0, 0, 1, 0, 0, 2, 1, 3, 2, 2, 1])
cats.categories   # IntervalIndex([(18, 25], (25, 35], (35, 60], (60, 100]])
pd.value_counts(cats)
# (18, 25] 5,  (35, 60] 3,  (25, 35] 3,  (60, 100] 1

# Bins are right-closed by default; right=False makes them left-closed.
pd.cut(ages, [18, 26, 36, 61, 100], right=False)
# [[18, 26) < [26, 36) < [36, 61) < [61, 100)]

# 2. Give the bins meaningful names via labels=.
group_names = ['Youth', 'YoungAdult', 'MiddleAged', 'Senior']
cats1 = pd.cut(ages, bins, labels=group_names)
cats1
# [Youth, Youth, Youth, YoungAdult, ..., MiddleAged, YoungAdult]

# qcut bins by sample quantiles, producing equal-sized groups.
data = np.random.randn(1000)
cats = pd.qcut(data, 4)     # quartiles: exactly 250 values per bin
cats
pd.value_counts(cats)

# Custom quantile edges are also accepted.
pd.qcut(data, [0, 0.1, 0.5, 0.9, 1.0])
# --- Detecting and filtering outliers ---
data = pd.DataFrame(np.random.randn(1000, 4))
data.describe()

# 1. Inspect a single column for values beyond +/-3.
col = data[2]
col
col[np.abs(col) > 3]

# 2. Select every row containing at least one |value| > 3.
# NOTE: pandas >= 2.0 requires the keyword form; positional .any(1) was
# deprecated (1.x) and removed.
data[(np.abs(data) > 3).any(axis=1)]

# 3. Sign (-1.0 / +1.0) of the first five rows.
np.sign(data).head()
# --- Permutation and random sampling ---
df = pd.DataFrame(np.arange(5 * 4).reshape((5, 4)))
sampler = np.random.permutation(5)   # a random ordering of the row positions
sampler
df

# 1. Reorder the rows according to the permutation.
df.take(sampler)

# 2. Sample rows without replacement.
df.sample(n=3)

# Sampling WITH replacement allows repeats, and n may exceed len(choices).
choices = pd.Series([5, 7, -1, 6, 4])
draws = choices.sample(n=10, replace=True)
draws
# --- String manipulation: built-in string object methods ---
val = 'a,b, guido'
val.split(',')
# ['a', 'b', ' guido']

# strip() removes the stray whitespace around each piece.
pieces = [x.strip() for x in val.split(',')]
pieces
# ['a', 'b', 'guido']

first, second, third = pieces
first + '::' + second + '::' + third
# 'a::b::guido'

# join() is the idiomatic (and faster) way to concatenate with a delimiter.
'::'.join(pieces)
# 'a::b::guido'