Study Notes on Python for Data Analysis — Chapter 7

Data Wrangling: Clean, Transform, Merge, Reshape

Combining and Merging Datasets

pandas.merge connects rows of different DataFrames based on one or more keys.
pandas.concat stacks multiple objects together along an axis.
The instance method combine_first splices overlapping data together, using values from one object to fill in missing values in another.
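A minimal sketch of the three entry points, using throwaway frames df_a and df_b (hypothetical names, not from the book):
import pandas as pd
df_a = pd.DataFrame({'key': ['a', 'b'], 'x': [1, 2]})
df_b = pd.DataFrame({'key': ['b', 'c'], 'y': [3, 4]})
pd.merge(df_a, df_b, on='key')                               # join rows on the 'key' column
pd.concat([df_a, df_b])                                      # stack the two frames along axis=0
df_a.set_index('key').combine_first(df_b.set_index('key'))   # fill missing values from the other object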

Database-Style DataFrame Merges

import numpy as np
import pandas as pd
from pandas import DataFrame,Series
df1 = DataFrame({'key':['b','b','a','c','a','a','b'],'data1':range(7)})
df2 = DataFrame({'key':['a','b','d'],
                'data2':range(3)})
df1
  key  data1
0   b      0
1   b      1
2   a      2
3   c      3
4   a      4
5   a      5
6   b      6
df2
  key  data2
0   a      0
1   b      1
2   d      2
pd.merge(df1,df2)  # if no key is specified, the overlapping column name is used as the merge key
  key  data1  data2
0   b      0      1
1   b      1      1
2   b      6      1
3   a      2      0
4   a      4      0
5   a      5      0
pd.merge(df1,df2,on='key')
  key  data1  data2
0   b      0      1
1   b      1      1
2   b      6      1
3   a      2      0
4   a      4      0
5   a      5      0
# if the column names differ between the two objects, specify them separately
df3 = DataFrame({'lkey':['b','b','a','c','a','a','b'],
                'data1':range(7)})
df4 = DataFrame({'rkey':['a','b','d'],
                'data2':range(3)})
# merge does an inner join by default, so the result is the intersection of the keys. The other options are 'left', 'right', and 'outer'. An outer join takes the union of the keys, combining the effect of the left and right joins.
pd.merge(df3,df4,left_on='lkey',right_on='rkey')
  lkey  data1 rkey  data2
0    b      0    b      1
1    b      1    b      1
2    b      6    b      1
3    a      2    a      0
4    a      4    a      0
5    a      5    a      0
pd.merge(df1,df2,how='outer')
  key  data1  data2
0   b    0.0    1.0
1   b    1.0    1.0
2   b    6.0    1.0
3   a    2.0    0.0
4   a    4.0    0.0
5   a    5.0    0.0
6   c    3.0    NaN
7   d    NaN    2.0
# many-to-many merges
df1 = DataFrame({'key':['b','b','a','c','a','b'],
                'data1':range(6)})
df2 = DataFrame({'key':['a','b','a','b','d'],
                'data2':range(5)})
df1
  key  data1
0   b      0
1   b      1
2   a      2
3   c      3
4   a      4
5   b      5
df2
  key  data2
0   a      0
1   b      1
2   a      2
3   b      3
4   d      4
pd.merge(df1,df2,on='key',how='left')  # a many-to-many join produces the Cartesian product of the matching rows
   key  data1  data2
0    b      0    1.0
1    b      0    3.0
2    b      1    1.0
3    b      1    3.0
4    a      2    0.0
5    a      2    2.0
6    c      3    NaN
7    a      4    0.0
8    a      4    2.0
9    b      5    1.0
10   b      5    3.0
pd.merge(df1,df2,how='inner')
  key  data1  data2
0   b      0      1
1   b      0      3
2   b      1      1
3   b      1      3
4   b      5      1
5   b      5      3
6   a      2      0
7   a      2      2
8   a      4      0
9   a      4      2
# merging on multiple keys
left = DataFrame({'key1':['foo','foo','bar'],
                 'key2':['one','two','one'],
                 'lval':[1,2,3]})
right = DataFrame({'key1':['foo','foo','bar','bar'],
                  'key2':['one','one','one','two'],
                  'rval':[4,5,6,7]})
pd.merge(left,right,on=['key1','key2'],how='outer')
  key1 key2  lval  rval
0  foo  one   1.0   4.0
1  foo  one   1.0   5.0
2  foo  two   2.0   NaN
3  bar  one   3.0   6.0
4  bar  two   NaN   7.0
# handling overlapping column names
pd.merge(left,right,on='key1')
  key1 key2_x  lval key2_y  rval
0  foo    one     1    one     4
1  foo    one     1    one     5
2  foo    two     2    one     4
3  foo    two     2    one     5
4  bar    one     3    one     6
5  bar    one     3    two     7
pd.merge(left,right,on='key1',suffixes=('_left','_right'))
  key1 key2_left  lval key2_right  rval
0  foo       one     1        one     4
1  foo       one     1        one     5
2  foo       two     2        one     4
3  foo       two     2        one     5
4  bar       one     3        one     6
5  bar       one     3        two     7


Merging on Index

# using the index as the join key
left1 = DataFrame({'key':['a','b','a','a','b','c'],
                  'value':range(6)})
right1 = DataFrame({'group_val':[3.5,7]},index=['a','b'])
left1
  key  value
0   a      0
1   b      1
2   a      2
3   a      3
4   b      4
5   c      5
right1
   group_val
a        3.5
b        7.0
pd.merge(left1,right1,left_on='key',right_index=True)
  key  value  group_val
0   a      0        3.5
2   a      2        3.5
3   a      3        3.5
1   b      1        7.0
4   b      4        7.0
# an outer join yields the union of the keys
pd.merge(left1,right1,left_on='key',right_index=True,how='outer')
  key  value  group_val
0   a      0        3.5
2   a      2        3.5
3   a      3        3.5
1   b      1        7.0
4   b      4        7.0
5   c      5        NaN
# hierarchically indexed data
lefth = DataFrame({'key1':['Ohio','Ohio','Ohio','Nevada','Nevada'],
                  'key2':[2000,2001,2002,2001,2002],
                  'data':np.arange(5.)})
righth = DataFrame(np.arange(12).reshape((6,2)),
                  index=[['Nevada','Nevada','Ohio','Ohio','Ohio','Ohio'],
                        [2001,2000,2000,2000,2001,2002]],
                  columns=['event1','event2'])
lefth
     key1  key2  data
0    Ohio  2000   0.0
1    Ohio  2001   1.0
2    Ohio  2002   2.0
3  Nevada  2001   3.0
4  Nevada  2002   4.0
righth
             event1  event2
Nevada 2001       0       1
       2000       2       3
Ohio   2000       4       5
       2000       6       7
       2001       8       9
       2002      10      11
# pass a list to indicate the multiple columns to merge on
pd.merge(lefth,righth,left_on=['key1','key2'],right_index=True)
     key1  key2  data  event1  event2
0    Ohio  2000   0.0       4       5
0    Ohio  2000   0.0       6       7
1    Ohio  2001   1.0       8       9
2    Ohio  2002   2.0      10      11
3  Nevada  2001   3.0       0       1
pd.merge(lefth,righth,left_on=['key1','key2'],
        right_index=True,how='outer')
     key1  key2  data  event1  event2
0    Ohio  2000   0.0     4.0     5.0
0    Ohio  2000   0.0     6.0     7.0
1    Ohio  2001   1.0     8.0     9.0
2    Ohio  2002   2.0    10.0    11.0
3  Nevada  2001   3.0     0.0     1.0
4  Nevada  2002   4.0     NaN     NaN
4  Nevada  2000   NaN     2.0     3.0
# using the indexes of both sides of the merge
left2 = DataFrame([[1.,2.],[3.,4.],[5.,6.]],index=['a','c','e'],
                 columns=['Ohio','Nevada'])
right2 = DataFrame([[7.,8.],[9.,10.],[11.,12.],[13.,14.]],
                  index=['b','c','d','e'],columns=['Missouri','Alabama'])
left2
   Ohio  Nevada
a   1.0     2.0
c   3.0     4.0
e   5.0     6.0
right2
   Missouri  Alabama
b       7.0      8.0
c       9.0     10.0
d      11.0     12.0
e      13.0     14.0
pd.merge(left2,right2,how='outer',left_index=True,right_index=True)
   Ohio  Nevada  Missouri  Alabama
a   1.0     2.0       NaN      NaN
b   NaN     NaN       7.0      8.0
c   3.0     4.0       9.0     10.0
d   NaN     NaN      11.0     12.0
e   5.0     6.0      13.0     14.0
# the join method is a more convenient way to merge by index
left2.join(right2,how='outer')
   Ohio  Nevada  Missouri  Alabama
a   1.0     2.0       NaN      NaN
b   NaN     NaN       7.0      8.0
c   3.0     4.0       9.0     10.0
d   NaN     NaN      11.0     12.0
e   5.0     6.0      13.0     14.0
left1.join(right1,on='key')
  key  value  group_val
0   a      0        3.5
1   b      1        7.0
2   a      2        3.5
3   a      3        3.5
4   b      4        7.0
5   c      5        NaN
# you can also pass a list of DataFrames to join
another = DataFrame([[7.,8.],[9.,10.],[11.,12.],[16.,17]],
                   index=['a','c','e','f'],columns=['New York','Oregon'])
left2.join([right2,another])
   Ohio  Nevada  Missouri  Alabama  New York  Oregon
a   1.0     2.0       NaN      NaN       7.0     8.0
c   3.0     4.0       9.0     10.0       9.0    10.0
e   5.0     6.0      13.0     14.0      11.0    12.0
left2.join([right2,another],how='outer')
   Ohio  Nevada  Missouri  Alabama  New York  Oregon
a   1.0     2.0       NaN      NaN       7.0     8.0
c   3.0     4.0       9.0     10.0       9.0    10.0
e   5.0     6.0      13.0     14.0      11.0    12.0
b   NaN     NaN       7.0      8.0       NaN     NaN
d   NaN     NaN      11.0     12.0       NaN     NaN
f   NaN     NaN       NaN      NaN      16.0    17.0

Concatenating Along an Axis

# NumPy's concatenate function
arr = np.arange(12).reshape((3,4))
arr
array([[ 0,  1,  2,  3],
       [ 4,  5,  6,  7],
       [ 8,  9, 10, 11]])
np.concatenate([arr,arr],axis=1)
array([[ 0,  1,  2,  3,  0,  1,  2,  3],
       [ 4,  5,  6,  7,  4,  5,  6,  7],
       [ 8,  9, 10, 11,  8,  9, 10, 11]])
# pandas's concat function works along axis=0 by default
s1 = Series([0,1],index=['a','b'])
s2 = Series([2,3,4],index=['c','d','e'])
s3 = Series([5,6],index=['f','g'])
pd.concat([s1,s2,s3])
a    0
b    1
c    2
d    3
e    4
f    5
g    6
dtype: int64
# passing axis=1 (the columns) turns the result into a DataFrame
pd.concat([s1,s2,s3],axis=1)
     0    1    2
a  0.0  NaN  NaN
b  1.0  NaN  NaN
c  NaN  2.0  NaN
d  NaN  3.0  NaN
e  NaN  4.0  NaN
f  NaN  NaN  5.0
g  NaN  NaN  6.0
s4 = pd.concat([s1*5,s3])
s4
a    0
b    5
f    5
g    6
dtype: int64
pd.concat([s1,s4],axis=1)
     0  1
a  0.0  0
b  1.0  5
f  NaN  5
g  NaN  6
pd.concat([s1,s4],axis=1,join='inner')
   0  1
a  0  0
b  1  5
pd.concat([s1,s4],axis=1,join_axes=[['a','c','b','e']])
# This raises "concat() got an unexpected keyword argument 'join_axes'": the
# join_axes argument was deprecated in pandas 0.25 and removed in 1.0, so the
# book's example no longer runs on a current pandas install.
---------------------------------------------------------------------------
TypeError: concat() got an unexpected keyword argument 'join_axes'
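Since join_axes is gone, the documented replacement is to reindex the concatenated result, which should reproduce the book's intended output:
pd.concat([s1,s4],axis=1).reindex(['a','c','b','e'])  # same effect as join_axes=[['a','c','b','e']]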
# create a hierarchical index on the concatenation axis
result = pd.concat([s1,s1,s3],keys=['one','two','three'])
result
one    a    0
       b    1
two    a    0
       b    1
three  f    5
       g    6
dtype: int64
result.unstack()
         a    b    f    g
one    0.0  1.0  NaN  NaN
two    0.0  1.0  NaN  NaN
three  NaN  NaN  5.0  6.0
# when combining Series along axis=1, the keys become the DataFrame's column headers
pd.concat([s1,s2,s3],axis=1,keys=['one','two','three'])
   one  two  three
a  0.0  NaN    NaN
b  1.0  NaN    NaN
c  NaN  2.0    NaN
d  NaN  3.0    NaN
e  NaN  4.0    NaN
f  NaN  NaN    5.0
g  NaN  NaN    6.0
# the same logic applies to DataFrames
df1 = DataFrame(np.arange(6).reshape(3,2),index=['a','b','c'],
               columns=['one','two'])
df2 = DataFrame(5+np.arange(4).reshape(2,2),index=['a','c'],
               columns=['three','four'])
pd.concat([df1,df2],axis=1,keys=['level1','level2'])
  level1     level2     
     one two  three four
a      0   1    5.0  6.0
b      2   3    NaN  NaN
c      4   5    7.0  8.0
# if you pass a dict of objects instead of a list, the dict's keys are used for the keys option
pd.concat({'level1':df1,'level2':df2},axis=1)
  level1     level2     
     one two  three four
a      0   1    5.0  6.0
b      2   3    NaN  NaN
c      4   5    7.0  8.0
pd.concat([df1,df2],axis=1,keys=['level1','level2'],
         names=['upper','lower'])
upper level1     level2     
lower    one two  three four
a          0   1    5.0  6.0
b          2   3    NaN  NaN
c          4   5    7.0  8.0
df1 = DataFrame(np.random.randn(3,4),columns=['a','b','c','d'])
df2 = DataFrame(np.random.randn(2,3),columns=['b','d','a'])
df1
          a         b         c         d
0  0.909729 -1.166310 -0.988668 -0.398099
1  1.226715  1.429447 -0.353615  1.096639
2  0.420715  1.319891 -2.096798 -0.125654
df2
          b         d         a
0 -0.077881  0.031641 -0.839706
1 -0.231105  1.227890  0.422392
pd.concat([df1,df2],ignore_index=True)
          a         b         c         d
0  0.909729 -1.166310 -0.988668 -0.398099
1  1.226715  1.429447 -0.353615  1.096639
2  0.420715  1.319891 -2.096798 -0.125654
3 -0.839706 -0.077881       NaN  0.031641
4  0.422392 -0.231105       NaN  1.227890


Combining Data with Overlap

# NumPy's where function expresses a vectorized if-else
a = Series([np.nan,2.5,np.nan,3.5,4.5,np.nan],
          index=['f','e','d','c','b','a'])
b = Series(np.arange(len(a),dtype=np.float64),index=['f','e','d','c','b','a'])
b[-1] = np.nan
a
f    NaN
e    2.5
d    NaN
c    3.5
b    4.5
a    NaN
dtype: float64
b
f    0.0
e    1.0
d    2.0
c    3.0
b    4.0
a    NaN
dtype: float64
np.where(pd.isnull(a),b,a)
array([0. , 2.5, 2. , 3.5, 4.5, nan])
# another way
b[:-2].combine_first(a[2:])
a    NaN
b    4.5
c    3.0
d    2.0
e    1.0
f    0.0
dtype: float64
# combine_first uses data from the object you pass to "patch" missing data in the calling object
df1 = DataFrame({'a':[1.,np.nan,5.,np.nan],
                'b':[np.nan,2.,np.nan,6.],
                'c':range(2,18,4)})
df2 = DataFrame({'a':[5.,4.,np.nan,3.,7.],
                'b':[np.nan,3.,4.,6.,8.]})
df1.combine_first(df2)
     a    b     c
0  1.0  NaN   2.0
1  4.0  2.0   6.0
2  5.0  4.0  10.0
3  3.0  6.0  14.0
4  7.0  8.0   NaN
df1
     a    b   c
0  1.0  NaN   2
1  NaN  2.0   6
2  5.0  NaN  10
3  NaN  6.0  14
df2
     a    b
0  5.0  NaN
1  4.0  3.0
2  NaN  4.0
3  3.0  6.0
4  7.0  8.0

Reshaping and Pivoting

Reshaping with Hierarchical Indexing

stack: rotates the columns of the data into rows
unstack: rotates the rows into columns

data = DataFrame(np.arange(6).reshape((2,3)),index=pd.Index(['Ohio','Colorado'],name='state'),
                 columns=pd.Index(['one','two','three'],name='number'))
data
number    one  two  three
state
Ohio        0    1      2
Colorado    3    4      5
result = data.stack()
result
state     number
Ohio      one       0
          two       1
          three     2
Colorado  one       3
          two       4
          three     5
dtype: int32
result.unstack()
number    one  two  three
state
Ohio        0    1      2
Colorado    3    4      5
# pass a level number or name to unstack a different level
result.unstack(0)
state   Ohio  Colorado
number
one        0         3
two        1         4
three      2         5
result.unstack('state')
state   Ohio  Colorado
number
one        0         3
two        1         4
three      2         5
s1 = Series([0,1,2,3],index=['a','b','c','d'])
s2 = Series([4,5,6],index=['c','d','e'])
data2 = pd.concat([s1,s2],keys=['one','two'])
data2.unstack()
       a    b    c    d    e
one  0.0  1.0  2.0  3.0  NaN
two  NaN  NaN  4.0  5.0  6.0
data2.unstack().stack()
one  a    0.0
     b    1.0
     c    2.0
     d    3.0
two  c    4.0
     d    5.0
     e    6.0
dtype: float64
data2.unstack().stack(dropna=False)
one  a    0.0
     b    1.0
     c    2.0
     d    3.0
     e    NaN
two  a    NaN
     b    NaN
     c    4.0
     d    5.0
     e    6.0
dtype: float64
# when you unstack a DataFrame, the level unstacked becomes the lowest level in the result
df = DataFrame({'left':result,'right':result+5},columns=pd.Index(['left','right'],name='side'))
df
side             left  right
state    number
Ohio     one        0      5
         two        1      6
         three      2      7
Colorado one        3      8
         two        4      9
         three      5     10
df.unstack('state')
side    left          right
state   Ohio Colorado  Ohio Colorado
number
one        0        3     5        8
two        1        4     6        9
three      2        5     7       10
df.unstack('state').stack('side')
state         Colorado  Ohio
number side
one    left          3     0
       right         8     5
two    left          4     1
       right         9     6
three  left          5     2
       right        10     7

Pivoting "Long" to "Wide" Format

# ldata here is generated by hand and differs from the book's data; it is only meant for exercising the method
ldata = DataFrame({'date':['1959-03-31','1959-03-31','1959-03-31',
                                '1959-06-30','1959-06-30','1959-06-30',
                                '1959-09-30', '1959-09-30','1959-09-30'],
                'item':['realgdp', 'infl', 'unemp']*3,
                'value':[2710.349, 0.0, 5.8]*3})
print(ldata)
         date     item     value
0  1959-03-31  realgdp  2710.349
1  1959-03-31     infl     0.000
2  1959-03-31    unemp     5.800
3  1959-06-30  realgdp  2710.349
4  1959-06-30     infl     0.000
5  1959-06-30    unemp     5.800
6  1959-09-30  realgdp  2710.349
7  1959-09-30     infl     0.000
8  1959-09-30    unemp     5.800
pivoted = ldata.pivot('date','item','value')  # the first two arguments are the columns to use as the row and column index; the last names the column that fills in the values
pivoted.head()
item        infl   realgdp  unemp
date
1959-03-31   0.0  2710.349    5.8
1959-06-30   0.0  2710.349    5.8
1959-09-30   0.0  2710.349    5.8
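Note: positional arguments to pivot were later deprecated and are rejected by pandas 2.0, so on a current install the call above should be written with keywords:
pivoted = ldata.pivot(index='date', columns='item', values='value')  # equivalent keyword form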
ldata['value2'] = np.random.randn(len(ldata))
ldata
         date     item     value    value2
0  1959-03-31  realgdp  2710.349  0.576713
1  1959-03-31     infl     0.000  0.164654
2  1959-03-31    unemp     5.800 -0.309467
3  1959-06-30  realgdp  2710.349  1.321802
4  1959-06-30     infl     0.000  0.964655
5  1959-06-30    unemp     5.800 -0.355550
6  1959-09-30  realgdp  2710.349 -0.303973
7  1959-09-30     infl     0.000  0.791143
8  1959-09-30    unemp     5.800  0.985764
pivoted = ldata.pivot('date','item')
pivoted
           value                    value2
item        infl   realgdp unemp      infl   realgdp     unemp
date
1959-03-31   0.0  2710.349   5.8  0.164654  0.576713 -0.309467
1959-06-30   0.0  2710.349   5.8  0.964655  1.321802 -0.355550
1959-09-30   0.0  2710.349   5.8  0.791143 -0.303973  0.985764
pivoted['value']
item        infl   realgdp  unemp
date
1959-03-31   0.0  2710.349    5.8
1959-06-30   0.0  2710.349    5.8
1959-09-30   0.0  2710.349    5.8
# use set_index to create a hierarchical index, then reshape with unstack
unstacked = ldata.set_index(['date','item']).unstack('item')
unstacked
           value                    value2
item        infl   realgdp unemp      infl   realgdp     unemp
date
1959-03-31   0.0  2710.349   5.8  0.164654  0.576713 -0.309467
1959-06-30   0.0  2710.349   5.8  0.964655  1.321802 -0.355550
1959-09-30   0.0  2710.349   5.8  0.791143 -0.303973  0.985764

Data Transformation

Removing Duplicates

data = DataFrame({'k1':['one']*3+['two']*4,
                 'k2':[1,1,2,3,3,4,4]})
data
    k1  k2
0  one   1
1  one   1
2  one   2
3  two   3
4  two   3
5  two   4
6  two   4
# DataFrame's duplicated method returns a boolean Series indicating whether each row is a duplicate
data.duplicated()
0    False
1     True
2    False
3    False
4     True
5    False
6     True
dtype: bool
data.drop_duplicates()  # returns a DataFrame with the duplicate rows removed
    k1  k2
0  one   1
2  one   2
3  two   3
5  two   4
data['v1'] = range(7)
data.drop_duplicates(['k1'])
    k1  k2  v1
0  one   1   0
3  two   3   3
# duplicated and drop_duplicates keep the first observed value combination by default
data.drop_duplicates(['k1','k2'],keep='last')  # the book's take_last=True is now spelled keep='last', which keeps the last occurrence instead
    k1  k2  v1
1  one   1   1
2  one   2   2
4  two   3   4
6  two   4   6

Transforming Data Using a Function or Mapping

data = DataFrame({'food':['bacon','pulled pork','bacon','Pastrami','corned beef','Bacon','pastrami','honey ham','nova lox'],
                 'ounces':[4,3,12,6,7.5,8,3,5,6]})
data
          food  ounces
0        bacon     4.0
1  pulled pork     3.0
2        bacon    12.0
3     Pastrami     6.0
4  corned beef     7.5
5        Bacon     8.0
6     pastrami     3.0
7    honey ham     5.0
8     nova lox     6.0
meat_to_animal = {
    'bacon':'pig',
    'pulled pork':'pig',
    'pastrami':'cow',
    'corned beef':'cow',
    'honey ham':'pig',
    'nova lox':'salmon'
}
data['animal'] = data['food'].map(str.lower).map(meat_to_animal)
data
          food  ounces  animal
0        bacon     4.0     pig
1  pulled pork     3.0     pig
2        bacon    12.0     pig
3     Pastrami     6.0     cow
4  corned beef     7.5     cow
5        Bacon     8.0     pig
6     pastrami     3.0     cow
7    honey ham     5.0     pig
8     nova lox     6.0  salmon
data['food'].map(lambda x: meat_to_animal[x.lower()])
# map is a convenient way to perform element-wise transformations and other cleaning work
0       pig
1       pig
2       pig
3       cow
4       cow
5       pig
6       cow
7       pig
8    salmon
Name: food, dtype: object
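One caveat with map(str.lower): it raises a TypeError if the column contains missing values, because the plain Python function gets applied to NaN as well. The vectorized string method avoids this; a sketch:
data['food'].str.lower().map(meat_to_animal)  # Series.str.lower() propagates NaN instead of failing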

Replacing Values

data = Series([1.,-999.,2.,-999.,-1000.,3.])
data
0       1.0
1    -999.0
2       2.0
3    -999.0
4   -1000.0
5       3.0
dtype: float64
data.replace(-999,np.nan)
0       1.0
1       NaN
2       2.0
3       NaN
4   -1000.0
5       3.0
dtype: float64
# to replace several values at once, pass a list of values to replace and a single replacement
data.replace([-999,-1000],np.nan)
0    1.0
1    NaN
2    2.0
3    NaN
4    NaN
5    3.0
dtype: float64
# to use a different replacement for each value
data.replace([-999,-1000],[np.nan,0])
0    1.0
1    NaN
2    2.0
3    NaN
4    0.0
5    3.0
dtype: float64
# the argument can also be a dict
data.replace({-999:np.nan,-1000:0})
0    1.0
1    NaN
2    2.0
3    NaN
4    0.0
5    3.0
dtype: float64

Renaming Axis Indexes

data = DataFrame(np.arange(12).reshape((3,4)),
                index=['Ohio','Colorado','New York'],
                columns=['one','two','three','four'])
data.index.map(str.upper)
Index(['OHIO', 'COLORADO', 'NEW YORK'], dtype='object')
data.index = data.index.map(str.upper)
data
          one  two  three  four
OHIO        0    1      2     3
COLORADO    4    5      6     7
NEW YORK    8    9     10    11
# to create a transformed version of the dataset (without modifying the original), the handy method is rename
data.rename(index=str.title,columns=str.upper)
          ONE  TWO  THREE  FOUR
Ohio        0    1      2     3
Colorado    4    5      6     7
New York    8    9     10    11
data.rename(index={'OHIO':'INDIANA'},
           columns={'three':'peekaboo'})
          one  two  peekaboo  four
INDIANA     0    1         2     3
COLORADO    4    5         6     7
NEW YORK    8    9        10    11
_ = data.rename(index={'OHIO':'INDIANA'},inplace=True)
data
          one  two  three  four
INDIANA     0    1      2     3
COLORADO    4    5      6     7
NEW YORK    8    9     10    11

Discretization and Binning

ages = [20,22,25,27,21,23,37,31,61,45,41,32]
bins = [18,25,35,60,100]
cats = pd.cut(ages,bins)
cats
[(18, 25], (18, 25], (18, 25], (25, 35], (18, 25], ..., (25, 35], (60, 100], (35, 60], (35, 60], (25, 35]]
Length: 12
Categories (4, interval[int64, right]): [(18, 25] < (25, 35] < (35, 60] < (60, 100]]
# the labels attribute (cats.labels) is gone; it is now Categorical.codes
cats.codes
array([0, 0, 0, 1, 0, 0, 2, 1, 3, 2, 2, 1], dtype=int8)
# the levels attribute is gone; it is now Categorical.categories
cats.categories
IntervalIndex([(18, 25], (25, 35], (35, 60], (60, 100]], dtype='interval[int64, right]')
pd.value_counts(cats)
(18, 25]     5
(25, 35]     3
(35, 60]     3
(60, 100]    1
dtype: int64
# which side is closed can be changed with right=False
pd.cut(ages,[18,26,36,61,100],right=False)
[[18, 26), [18, 26), [18, 26), [26, 36), [18, 26), ..., [26, 36), [61, 100), [36, 61), [36, 61), [26, 36)]
Length: 12
Categories (4, interval[int64, left]): [[18, 26) < [26, 36) < [36, 61) < [61, 100)]
# you can set your own bin names
group_names = ['Youth','YoungAdult','MiddleAged','Senior']
pd.cut(ages,bins,labels=group_names)
['Youth', 'Youth', 'Youth', 'YoungAdult', 'Youth', ..., 'YoungAdult', 'Senior', 'MiddleAged', 'MiddleAged', 'YoungAdult']
Length: 12
Categories (4, object): ['Youth' < 'YoungAdult' < 'MiddleAged' < 'Senior']
# if you pass cut an integer number of bins instead of explicit bin edges, it computes equal-length bins from the data's min and max
data = np.random.rand(20)  # uniformly distributed data
pd.cut(data,4,precision=2)
[(0.031, 0.27], (0.75, 1.0], (0.27, 0.51], (0.031, 0.27], (0.031, 0.27], ..., (0.27, 0.51], (0.031, 0.27], (0.75, 1.0], (0.75, 1.0], (0.031, 0.27]]
Length: 20
Categories (4, interval[float64, right]): [(0.031, 0.27] < (0.27, 0.51] < (0.51, 0.75] < (0.75, 1.0]]
# qcut bins the data based on sample quantiles
data = np.random.randn(1000)  # normally distributed
cats = pd.qcut(data,4)  # cut into quartiles
cats
[(0.699, 2.801], (0.699, 2.801], (0.699, 2.801], (0.0346, 0.699], (-0.703, 0.0346], ..., (-2.582, -0.703], (-0.703, 0.0346], (-2.582, -0.703], (0.699, 2.801], (0.699, 2.801]]
Length: 1000
Categories (4, interval[float64, right]): [(-2.582, -0.703] < (-0.703, 0.0346] < (0.0346, 0.699] < (0.699, 2.801]]
pd.value_counts(cats)
(-2.582, -0.703]    250
(-0.703, 0.0346]    250
(0.0346, 0.699]     250
(0.699, 2.801]      250
dtype: int64
# pass your own quantiles
pd.qcut(data,[0,0.1,0.5,0.9,1.])
[(1.263, 2.801], (0.0346, 1.263], (1.263, 2.801], (0.0346, 1.263], (-1.274, 0.0346], ..., (-2.582, -1.274], (-1.274, 0.0346], (-1.274, 0.0346], (0.0346, 1.263], (0.0346, 1.263]]
Length: 1000
Categories (4, interval[float64, right]): [(-2.582, -1.274] < (-1.274, 0.0346] < (0.0346, 1.263] < (1.263, 2.801]]
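If you only want the quantile bin numbers rather than interval labels, qcut (like cut) accepts labels=False:
pd.qcut(data,4,labels=False)  # integer quartile indices 0-3 instead of Categorical intervals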

Detecting and Filtering Outliers

np.random.seed(12345)
data = DataFrame(np.random.randn(1000,4))
data.describe()
                 0            1            2            3
count  1000.000000  1000.000000  1000.000000  1000.000000
mean     -0.067684     0.067924     0.025598    -0.002298
std       0.998035     0.992106     1.006835     0.996794
min      -3.428254    -3.548824    -3.184377    -3.745356
25%      -0.774890    -0.591841    -0.641675    -0.644144
50%      -0.116401     0.101143     0.002073    -0.013611
75%       0.616366     0.780282     0.680391     0.654328
max       3.366626     2.653656     3.260383     3.927528
col = data[3]
col[np.abs(col)>3]
97     3.927528
305   -3.399312
400   -3.745356
Name: 3, dtype: float64
# select all rows containing a value exceeding 3 or -3
data[(np.abs(data)>3).any(axis=1)]
            0         1         2         3
5   -0.539741  0.476985  3.248944 -1.021228
97  -0.774363  0.552936  0.106061  3.927528
102 -0.655054 -0.565230  3.176873  0.959533
305 -2.315555  0.457246 -0.025907 -3.399312
324  0.050188  1.951312  3.260383  0.963301
400  0.146326  0.508391 -0.196713 -3.745356
499 -0.293333 -0.242459 -3.056990  1.918403
523 -3.428254 -0.296336 -0.439938 -0.867165
586  0.275144  1.179227 -3.184377  1.369891
808 -0.362528 -3.548824  1.553205 -2.186301
900  3.366626 -2.372214  0.851010  1.332846
# cap the values to the interval [-3, 3]
data[np.abs(data)>3] = np.sign(data)*3
data.describe()
                 0            1            2            3
count  1000.000000  1000.000000  1000.000000  1000.000000
mean     -0.067623     0.068473     0.025153    -0.002081
std       0.995485     0.990253     1.003977     0.989736
min      -3.000000    -3.000000    -3.000000    -3.000000
25%      -0.774890    -0.591841    -0.641675    -0.644144
50%      -0.116401     0.101143     0.002073    -0.013611
75%       0.616366     0.780282     0.680391     0.654328
max       3.000000     2.653656     3.000000     3.000000
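The same capping could also have been written with the clip method instead of the np.sign trick; a sketch:
data.clip(lower=-3,upper=3)  # returns a copy with values bounded to [-3, 3]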

Permutation and Random Sampling

df = DataFrame(np.arange(5*4).reshape(5,4))
# permuting (randomly reordering) a Series or the rows of a DataFrame
sampler = np.random.permutation(5)
sampler
array([1, 0, 2, 3, 4])
df
    0   1   2   3
0   0   1   2   3
1   4   5   6   7
2   8   9  10  11
3  12  13  14  15
4  16  17  18  19
df.take(sampler)
    0   1   2   3
1   4   5   6   7
0   0   1   2   3
2   8   9  10  11
3  12  13  14  15
4  16  17  18  19
# one way to select a random subset without replacement
df.take(np.random.permutation(len(df))[:3])
    0   1   2   3
1   4   5   6   7
3  12  13  14  15
4  16  17  18  19
# to generate a sample with replacement, one fast way is to draw random integers with np.random.randint
bag = np.array([5,7,-1,6,4])
sampler = np.random.randint(0,len(bag),size=10)
sampler
array([4, 4, 2, 2, 2, 0, 3, 0, 4, 1])
draws = bag.take(sampler)  # take picks out the elements of bag at the sampled positions
draws
array([ 4,  4, -1, -1, -1,  5,  6,  5,  4,  7])
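Newer pandas wraps both cases in the sample method, which may read more clearly:
df.sample(n=3)                           # without replacement
Series(bag).sample(n=10,replace=True)    # with replacement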

Computing Indicator/Dummy Variables

If a column in a DataFrame has k distinct values, you can derive a matrix or DataFrame with k columns of 1s and 0s.
pandas's get_dummies function does exactly that.

df = DataFrame({'key':['b','b','a','c','a','b'],
               'data1':range(6)})
pd.get_dummies(df['key'])
   a  b  c
0  0  1  0
1  0  1  0
2  1  0  0
3  0  0  1
4  1  0  0
5  0  1  0
# add a prefix to the columns of the indicator DataFrame
dummies = pd.get_dummies(df['key'],prefix='key')
df_with_dummy = df[['data1']].join(dummies)
df_with_dummy
   data1  key_a  key_b  key_c
0      0      0      1      0
1      1      0      1      0
2      2      1      0      0
3      3      0      0      1
4      4      1      0      0
5      5      0      1      0
# when a row in the DataFrame can belong to multiple categories
mnames = ['movie_id','title','genres']
movies = pd.read_table("E:\python_study_files\python\pydata-book-2nd-edition\datasets\movielens\movies.dat",sep='::',header=None,names=mnames)
movies[:10]
C:\windows\Temp/ipykernel_6352/824848908.py:3: ParserWarning: Falling back to the 'python' engine because the 'c' engine does not support regex separators (separators > 1 char and different from '\s+' are interpreted as regex); you can avoid this warning by specifying engine='python'.
  movies = pd.read_table("E:\python_study_files\python\pydata-book-2nd-edition\datasets\movielens\movies.dat",sep='::',header=None,names=mnames)
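As the warning itself suggests, passing engine='python' makes the multi-character separator explicit and silences the message:
movies = pd.read_table("E:\python_study_files\python\pydata-book-2nd-edition\datasets\movielens\movies.dat",sep='::',header=None,names=mnames,engine='python')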
   movie_id                               title                        genres
0         1                    Toy Story (1995)   Animation|Children's|Comedy
1         2                      Jumanji (1995)  Adventure|Children's|Fantasy
2         3             Grumpier Old Men (1995)                Comedy|Romance
3         4            Waiting to Exhale (1995)                  Comedy|Drama
4         5  Father of the Bride Part II (1995)                        Comedy
5         6                         Heat (1995)         Action|Crime|Thriller
6         7                      Sabrina (1995)                Comedy|Romance
7         8                 Tom and Huck (1995)          Adventure|Children's
8         9                 Sudden Death (1995)                        Action
9        10                    GoldenEye (1995)     Action|Adventure|Thriller
# to add an indicator variable for each genre, first collect the set of distinct genres
genre_iter = (set(x.split('|')) for x in movies.genres)
genres = sorted(set.union(*genre_iter))
dummies = DataFrame(np.zeros((len(movies),len(genres))),columns=genres)
# iterate over each movie and set the appropriate entries in each row of dummies to 1
for i, gen in enumerate(movies.genres):
    dummies.loc[i,gen.split('|')] = 1
# then join it with movies
movies_windic = movies.join(dummies.add_prefix('Genre_'))
movies_windic.loc[0]
movie_id                                       1
title                           Toy Story (1995)
genres               Animation|Children's|Comedy
Genre_Action                                 0.0
Genre_Adventure                              0.0
Genre_Animation                              1.0
Genre_Children's                             1.0
Genre_Comedy                                 1.0
Genre_Crime                                  0.0
Genre_Documentary                            0.0
Genre_Drama                                  0.0
Genre_Fantasy                                0.0
Genre_Film-Noir                              0.0
Genre_Horror                                 0.0
Genre_Musical                                0.0
Genre_Mystery                                0.0
Genre_Romance                                0.0
Genre_Sci-Fi                                 0.0
Genre_Thriller                               0.0
Genre_War                                    0.0
Genre_Western                                0.0
Name: 0, dtype: object
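As an aside, pandas can build the same membership matrix in one vectorized call with Series.str.get_dummies, which splits on a separator; this sketch should match the loop above (up to dtype, since it produces integer 0/1 columns):
dummies = movies.genres.str.get_dummies('|')
movies_windic = movies.join(dummies.add_prefix('Genre_'))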
# combining get_dummies with a discretization function like cut
values = np.random.rand(10)
values
array([0.75603383, 0.90830844, 0.96588737, 0.17373658, 0.87592824,
       0.75415641, 0.163486  , 0.23784062, 0.85564381, 0.58743194])
bins = [0,0.2,0.4,0.6,0.8,1]
pd.get_dummies(pd.cut(values,bins))
   (0.0, 0.2]  (0.2, 0.4]  (0.4, 0.6]  (0.6, 0.8]  (0.8, 1.0]
0           0           0           0           1           0
1           0           0           0           0           1
2           0           0           0           0           1
3           1           0           0           0           0
4           0           0           0           0           1
5           0           0           0           1           0
6           1           0           0           0           0
7           0           1           0           0           0
8           0           0           0           0           1
9           0           0           1           0           0

String Manipulation

String Object Methods

val = 'a,b, guido'
val.split(',')
['a', 'b', ' guido']
# strip trims whitespace (including newlines)
pieces = [x.strip() for x in val.split(',')]
pieces
['a', 'b', 'guido']
first, second, third = pieces
first + '::' + second + '::' + third
'a::b::guido'
'::'.join(pieces)
'a::b::guido'
'guido' in val
True
val.index(',')
1
val.find(':')
-1
# the difference between find and index: index raises an exception when the substring isn't found (instead of returning -1)
val.index(':')
---------------------------------------------------------------------------
ValueError                                Traceback (most recent call last)
----> 1 val.index(':')

ValueError: substring not found
# count returns the number of occurrences of a substring
val.count(',')
2
val.replace(',','::')
'a::b:: guido'
val.replace(',','')
'ab guido'


Regular Expressions

Regular expressions provide a flexible way to search for or match string patterns in text.
The functions in the re module fall into three categories: pattern matching, substitution, and splitting.

import re
text = "foo   bar\t baz   \tqux"
# the regex describing one or more whitespace characters is \s+
re.split('\s+',text)
['foo', 'bar', 'baz', 'qux']
# if you intend to apply the same regex to many strings, compile it first
regex = re.compile('\s+')
regex.split(text)
['foo', 'bar', 'baz', 'qux']
regex.findall(text)
['   ', '\t ', '   \t']

findall returns all matches found in the string, search returns only the first match, and match only matches at the beginning of the string.

text = """Dave dave@google.com
Steve steve@gmail.com
Rob rob@gmail.com
"""
pattern = r'[A-Z0-9._%+-]+@[A-Z0-9.-]+\.[A-Z]{2,4}'
# re.IGNORECASE makes the regex case-insensitive
regex = re.compile(pattern,flags=re.IGNORECASE)
regex.findall(text)
['dave@google.com', 'steve@gmail.com', 'rob@gmail.com']
m = regex.search(text)
m
<re.Match object; span=(5, 20), match='dave@google.com'>
text[m.start():m.end()]
'dave@google.com'
print(regex.match(text))
None
# sub replaces occurrences of the pattern with the given string and returns the new string
print(regex.sub('REDACTED',text))
Dave REDACTED
Steve REDACTED
Rob REDACTED
# to split an address into three parts (username, domain, and domain suffix), wrap each part of the pattern in parentheses
pattern = r'([A-Z0-9._%+-]+)@([A-Z0-9.-]+)\.([A-Z]{2,4})'
regex = re.compile(pattern,flags=re.IGNORECASE)
m = regex.match('wesm@bright.net')
m.groups()
('wesm', 'bright', 'net')
regex.findall(text)
[('dave', 'google', 'com'), ('steve', 'gmail', 'com'), ('rob', 'gmail', 'com')]
# sub can also refer to the groups of each match with special symbols like \1 and \2
print(regex.sub(r'Username: \1, Domain: \2, Suffix: \3',text))
Dave Username: dave, Domain: google, Suffix: com
Steve Username: steve, Domain: gmail, Suffix: com
Rob Username: rob, Domain: gmail, Suffix: com
regex = re.compile(r"""
(?P<username>[A-Z0-9._%+-]+)
@
(?P<domain>[A-Z0-9.-]+)
\.
(?P<suffix>[A-Z]{2,4})""",flags=re.IGNORECASE|re.VERBOSE)
m = regex.match('wesm@bright.net')
m.groupdict()
{'username': 'wesm', 'domain': 'bright', 'suffix': 'net'}
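sub can also refer to named groups with the \g<name> syntax, which pairs naturally with the verbose pattern above:
print(regex.sub(r'Username: \g<username>, Domain: \g<domain>, Suffix: \g<suffix>',text))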


Vectorized String Functions in pandas

data = {'Dave':'dave@google.com','Steve':'steve@gmail.com',
       'Rob':'rob@gmail.com','Wes':np.nan}
data = Series(data)
data
Dave     dave@google.com
Steve    steve@gmail.com
Rob        rob@gmail.com
Wes                  NaN
dtype: object
data.isnull()
Dave     False
Steve    False
Rob      False
Wes       True
dtype: bool
data.str.contains('gmail')
Dave     False
Steve     True
Rob       True
Wes        NaN
dtype: object
pattern
'([A-Z0-9._%+-]+)@([A-Z0-9.-]+)\\.([A-Z]{2,4})'
x = data.str.findall(pattern,flags=re.IGNORECASE)
x
Dave     [(dave, google, com)]
Steve    [(steve, gmail, com)]
Rob        [(rob, gmail, com)]
Wes                        NaN
dtype: object
matches = data.str.match(pattern,flags=re.IGNORECASE)
matches
Dave     True
Steve    True
Rob      True
Wes       NaN
dtype: object
x.str.get(1)  # each element of x is a one-item list, so index 1 is out of range and yields NaN
Dave    NaN
Steve   NaN
Rob     NaN
Wes     NaN
dtype: float64
x.str[0]
Dave     (dave, google, com)
Steve    (steve, gmail, com)
Rob        (rob, gmail, com)
Wes                      NaN
dtype: object
# slice substrings out of the strings
data.str[:5]
Dave     dave@
Steve    steve
Rob      rob@g
Wes        NaN
dtype: object
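To get the capture groups as columns of a DataFrame rather than lists of tuples, newer pandas offers str.extract; a sketch:
data.str.extract(pattern,flags=re.IGNORECASE)  # one column per group, NaN where there is no match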


Example: USDA Food Database

import json
db = json.load(open("E:\\python_study_files\\python\\pydata-book-2nd-edition\\datasets\\usda_food\\database.json"))
len(db)
6636
# each entry in db is a dict containing all the data for a single food
db[0].keys()
dict_keys(['id', 'description', 'tags', 'manufacturer', 'group', 'portions', 'nutrients'])
db[0]['nutrients'][0]
{'value': 25.18,
 'units': 'g',
 'description': 'Protein',
 'group': 'Composition'}
nutrients = DataFrame(db[0]['nutrients'])
nutrients[:7]
     value units                  description        group
0    25.18     g                      Protein  Composition
1    29.20     g            Total lipid (fat)  Composition
2     3.06     g  Carbohydrate, by difference  Composition
3     3.28     g                          Ash        Other
4   376.00  kcal                       Energy       Energy
5    39.28     g                        Water  Composition
6  1573.00    kJ                       Energy       Energy
info_keys = ['description','group','id','manufacturer']
info = DataFrame(db,columns=info_keys)
info[:5]
                          description                   group    id manufacturer
0                     Cheese, caraway  Dairy and Egg Products  1008             
1                     Cheese, cheddar  Dairy and Egg Products  1009             
2                        Cheese, edam  Dairy and Egg Products  1018             
3                        Cheese, feta  Dairy and Egg Products  1019             
4  Cheese, mozzarella, part skim milk  Dairy and Egg Products  1028             
info
                                            description                        group     id manufacturer
0                                       Cheese, caraway       Dairy and Egg Products   1008             
1                                       Cheese, cheddar       Dairy and Egg Products   1009             
2                                          Cheese, edam       Dairy and Egg Products   1018             
3                                          Cheese, feta       Dairy and Egg Products   1019             
4                    Cheese, mozzarella, part skim milk       Dairy and Egg Products   1028             
...                                                 ...                          ...    ...          ...
6631                             Bologna, beef, low fat  Sausages and Luncheon Meats  42161             
6632  Turkey and pork sausage, fresh, bulk, patty or...  Sausages and Luncheon Meats  42173             
6633                              Babyfood, juice, pear                   Baby Foods  43408         None
6634         Babyfood, dessert, banana yogurt, strained                   Baby Foods  43539         None
6635              Babyfood, banana no tapioca, strained                   Baby Foods  43546         None

6636 rows × 4 columns

pd.value_counts(info.group)[:10]
Vegetables and Vegetable Products    812
Beef Products                        618
Baked Products                       496
Breakfast Cereals                    403
Legumes and Legume Products          365
Fast Foods                           365
Lamb, Veal, and Game Products        345
Sweets                               341
Fruits and Fruit Juices              328
Pork Products                        328
Name: group, dtype: int64
# to run some analysis on all of the nutrient data, assemble each food's nutrients into one large table
nutrients = []
for rec in db:
    fnuts = DataFrame(rec['nutrients'])
    fnuts['id'] = rec['id']
    nutrients.append(fnuts)
nutrients = pd.concat(nutrients,ignore_index=True)
nutrients
         value units                         description        group     id
0       25.180     g                             Protein  Composition   1008
1       29.200     g                   Total lipid (fat)  Composition   1008
2        3.060     g         Carbohydrate, by difference  Composition   1008
3        3.280     g                                 Ash        Other   1008
4      376.000  kcal                              Energy       Energy   1008
...        ...   ...                                 ...          ...    ...
389350   0.000   mcg                 Vitamin B-12, added     Vitamins  43546
389351   0.000    mg                         Cholesterol        Other  43546
389352   0.072     g        Fatty acids, total saturated        Other  43546
389353   0.028     g  Fatty acids, total monounsaturated        Other  43546
389354   0.041     g  Fatty acids, total polyunsaturated        Other  43546

389355 rows × 5 columns

# drop the duplicate rows
nutrients.duplicated().sum()
14179
nutrients = nutrients.drop_duplicates()
col_mapping = {'description':'food',
              'group':'fgroup'}
info = info.rename(columns=col_mapping,copy=False)
info
                                                   food                       fgroup     id manufacturer
0                                       Cheese, caraway       Dairy and Egg Products   1008             
1                                       Cheese, cheddar       Dairy and Egg Products   1009             
2                                          Cheese, edam       Dairy and Egg Products   1018             
3                                          Cheese, feta       Dairy and Egg Products   1019             
4                    Cheese, mozzarella, part skim milk       Dairy and Egg Products   1028             
...                                                 ...                          ...    ...          ...
6631                             Bologna, beef, low fat  Sausages and Luncheon Meats  42161             
6632  Turkey and pork sausage, fresh, bulk, patty or...  Sausages and Luncheon Meats  42173             
6633                              Babyfood, juice, pear                   Baby Foods  43408         None
6634         Babyfood, dessert, banana yogurt, strained                   Baby Foods  43539         None
6635              Babyfood, banana no tapioca, strained                   Baby Foods  43546         None

6636 rows × 4 columns

col_mapping = {'description':'nutrient',
              'group':'nutgroup'}
nutrients = nutrients.rename(columns=col_mapping,copy=False)
nutrients
         value units                            nutrient     nutgroup     id
0       25.180     g                             Protein  Composition   1008
1       29.200     g                   Total lipid (fat)  Composition   1008
2        3.060     g         Carbohydrate, by difference  Composition   1008
3        3.280     g                                 Ash        Other   1008
4      376.000  kcal                              Energy       Energy   1008
...        ...   ...                                 ...          ...    ...
375171   0.000   mcg                 Vitamin B-12, added     Vitamins  43546
375172   0.000    mg                         Cholesterol        Other  43546
375173   0.072     g        Fatty acids, total saturated        Other  43546
375174   0.028     g  Fatty acids, total monounsaturated        Other  43546
375175   0.041     g  Fatty acids, total polyunsaturated        Other  43546

375176 rows × 5 columns

# merge info with nutrients
ndata = pd.merge(nutrients,info,on='id',how='outer')
ndata
         value units                            nutrient     nutgroup     id                                   food                       fgroup manufacturer
0       25.180     g                             Protein  Composition   1008                        Cheese, caraway       Dairy and Egg Products             
1       29.200     g                   Total lipid (fat)  Composition   1008                        Cheese, caraway       Dairy and Egg Products             
2        3.060     g         Carbohydrate, by difference  Composition   1008                        Cheese, caraway       Dairy and Egg Products             
3        3.280     g                                 Ash        Other   1008                        Cheese, caraway       Dairy and Egg Products             
4      376.000  kcal                              Energy       Energy   1008                        Cheese, caraway       Dairy and Egg Products             
...        ...   ...                                 ...          ...    ...                                    ...                          ...          ...
375171   0.000   mcg                 Vitamin B-12, added     Vitamins  43546  Babyfood, banana no tapioca, strained                   Baby Foods         None
375172   0.000    mg                         Cholesterol        Other  43546  Babyfood, banana no tapioca, strained                   Baby Foods         None
375173   0.072     g        Fatty acids, total saturated        Other  43546  Babyfood, banana no tapioca, strained                   Baby Foods         None
375174   0.028     g  Fatty acids, total monounsaturated        Other  43546  Babyfood, banana no tapioca, strained                   Baby Foods         None
375175   0.041     g  Fatty acids, total polyunsaturated        Other  43546  Babyfood, banana no tapioca, strained                   Baby Foods         None

375176 rows × 8 columns

result = ndata.groupby(['nutrient','fgroup'])['value'].quantile(0.5)
result['Zinc, Zn'].sort_values().plot(kind='barh')
by_nutrient = ndata.groupby(['nutgroup','nutrient'])
get_maximum = lambda x: x.xs(x.value.idxmax())
get_minimum = lambda x: x.xs(x.value.idxmin())
max_foods = by_nutrient.apply(get_maximum)[['value','food']]
# truncate the food names to keep the display smaller
max_foods.food = max_foods.food.str[:50]
max_foods.loc['Amino Acids']['food']
nutrient
Alanine                           Gelatins, dry powder, unsweetened
Arginine                               Seeds, sesame flour, low-fat
Aspartic acid                                   Soy protein isolate
Cystine                Seeds, cottonseed flour, low fat (glandless)
Glutamic acid                                   Soy protein isolate
Glycine                           Gelatins, dry powder, unsweetened
Histidine                Whale, beluga, meat, dried (Alaska Native)
Hydroxyproline    KENTUCKY FRIED CHICKEN, Fried Chicken, ORIGINA...
Isoleucine        Soy protein isolate, PROTEIN TECHNOLOGIES INTE...
Leucine           Soy protein isolate, PROTEIN TECHNOLOGIES INTE...
Lysine            Seal, bearded (Oogruk), meat, dried (Alaska Na...
Methionine                    Fish, cod, Atlantic, dried and salted
Phenylalanine     Soy protein isolate, PROTEIN TECHNOLOGIES INTE...
Proline                           Gelatins, dry powder, unsweetened
Serine            Soy protein isolate, PROTEIN TECHNOLOGIES INTE...
Threonine         Soy protein isolate, PROTEIN TECHNOLOGIES INTE...
Tryptophan         Sea lion, Steller, meat with fat (Alaska Native)
Tyrosine          Soy protein isolate, PROTEIN TECHNOLOGIES INTE...
Valine            Soy protein isolate, PROTEIN TECHNOLOGIES INTE...
Name: food, dtype: object