Study Notes on Python for Data Analysis — Chapter 7

Data Wrangling: Clean, Transform, Merge, Reshape

Combining and Merging Datasets

pandas.merge connects rows of different DataFrames based on one or more keys.
pandas.concat stacks multiple objects together along an axis.
The instance method combine_first splices overlapping data together, using values from one object to fill in missing values in another.
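A minimal sketch of the three entry points, using throwaway frames df_a and df_b (hypothetical names, not from the book):
import pandas as pd
df_a = pd.DataFrame({'key': ['a', 'b'], 'x': [1, 2]})
df_b = pd.DataFrame({'key': ['b', 'c'], 'y': [3, 4]})
pd.merge(df_a, df_b, on='key')                               # join rows on the 'key' column
pd.concat([df_a, df_b])                                      # stack the two frames along axis=0
df_a.set_index('key').combine_first(df_b.set_index('key'))   # fill missing values from the other object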

Database-Style DataFrame Merges

import numpy as np
import pandas as pd
from pandas import DataFrame,Series
df1 = DataFrame({'key':['b','b','a','c','a','a','b'],'data1':range(7)})
df2 = DataFrame({'key':['a','b','d'],
                'data2':range(3)})
df1
  key  data1
0   b      0
1   b      1
2   a      2
3   c      3
4   a      4
5   a      5
6   b      6
df2
  key  data2
0   a      0
1   b      1
2   d      2
pd.merge(df1,df2)  # if no key is specified, the overlapping column name is used as the merge key
  key  data1  data2
0   b      0      1
1   b      1      1
2   b      6      1
3   a      2      0
4   a      4      0
5   a      5      0
pd.merge(df1,df2,on='key')
  key  data1  data2
0   b      0      1
1   b      1      1
2   b      6      1
3   a      2      0
4   a      4      0
5   a      5      0
# if the column names differ between the two objects, specify them separately
df3 = DataFrame({'lkey':['b','b','a','c','a','a','b'],
                'data1':range(7)})
df4 = DataFrame({'rkey':['a','b','d'],
                'data2':range(3)})
# merge does an inner join by default, so the result is the intersection of the keys. The other options are 'left', 'right', and 'outer'. An outer join takes the union of the keys, combining the effect of the left and right joins.
pd.merge(df3,df4,left_on='lkey',right_on='rkey')
  lkey  data1 rkey  data2
0    b      0    b      1
1    b      1    b      1
2    b      6    b      1
3    a      2    a      0
4    a      4    a      0
5    a      5    a      0
pd.merge(df1,df2,how='outer')
  key  data1  data2
0   b    0.0    1.0
1   b    1.0    1.0
2   b    6.0    1.0
3   a    2.0    0.0
4   a    4.0    0.0
5   a    5.0    0.0
6   c    3.0    NaN
7   d    NaN    2.0
# many-to-many merges
df1 = DataFrame({'key':['b','b','a','c','a','b'],
                'data1':range(6)})
df2 = DataFrame({'key':['a','b','a','b','d'],
                'data2':range(5)})
df1
  key  data1
0   b      0
1   b      1
2   a      2
3   c      3
4   a      4
5   b      5
df2
  key  data2
0   a      0
1   b      1
2   a      2
3   b      3
4   d      4
pd.merge(df1,df2,on='key',how='left')  # a many-to-many join produces the Cartesian product of the matching rows
   key  data1  data2
0    b      0    1.0
1    b      0    3.0
2    b      1    1.0
3    b      1    3.0
4    a      2    0.0
5    a      2    2.0
6    c      3    NaN
7    a      4    0.0
8    a      4    2.0
9    b      5    1.0
10   b      5    3.0
pd.merge(df1,df2,how='inner')
  key  data1  data2
0   b      0      1
1   b      0      3
2   b      1      1
3   b      1      3
4   b      5      1
5   b      5      3
6   a      2      0
7   a      2      2
8   a      4      0
9   a      4      2
# merging on multiple keys
left = DataFrame({'key1':['foo','foo','bar'],
                 'key2':['one','two','one'],
                 'lval':[1,2,3]})
right = DataFrame({'key1':['foo','foo','bar','bar'],
                  'key2':['one','one','one','two'],
                  'rval':[4,5,6,7]})
pd.merge(left,right,on=['key1','key2'],how='outer')
  key1 key2  lval  rval
0  foo  one   1.0   4.0
1  foo  one   1.0   5.0
2  foo  two   2.0   NaN
3  bar  one   3.0   6.0
4  bar  two   NaN   7.0
# handling overlapping column names
pd.merge(left,right,on='key1')
  key1 key2_x  lval key2_y  rval
0  foo    one     1    one     4
1  foo    one     1    one     5
2  foo    two     2    one     4
3  foo    two     2    one     5
4  bar    one     3    one     6
5  bar    one     3    two     7
pd.merge(left,right,on='key1',suffixes=('_left','_right'))
  key1 key2_left  lval key2_right  rval
0  foo       one     1        one     4
1  foo       one     1        one     5
2  foo       two     2        one     4
3  foo       two     2        one     5
4  bar       one     3        one     6
5  bar       one     3        two     7


Merging on Index

# using the index as the join key
left1 = DataFrame({'key':['a','b','a','a','b','c'],
                  'value':range(6)})
right1 = DataFrame({'group_val':[3.5,7]},index=['a','b'])
left1
  key  value
0   a      0
1   b      1
2   a      2
3   a      3
4   b      4
5   c      5
right1
   group_val
a        3.5
b        7.0
pd.merge(left1,right1,left_on='key',right_index=True)
  key  value  group_val
0   a      0        3.5
2   a      2        3.5
3   a      3        3.5
1   b      1        7.0
4   b      4        7.0
# an outer join yields the union of the keys
pd.merge(left1,right1,left_on='key',right_index=True,how='outer')
  key  value  group_val
0   a      0        3.5
2   a      2        3.5
3   a      3        3.5
1   b      1        7.0
4   b      4        7.0
5   c      5        NaN
# hierarchically indexed data
lefth = DataFrame({'key1':['Ohio','Ohio','Ohio','Nevada','Nevada'],
                  'key2':[2000,2001,2002,2001,2002],
                  'data':np.arange(5.)})
righth = DataFrame(np.arange(12).reshape((6,2)),
                  index=[['Nevada','Nevada','Ohio','Ohio','Ohio','Ohio'],
                        [2001,2000,2000,2000,2001,2002]],
                  columns=['event1','event2'])
lefth
     key1  key2  data
0    Ohio  2000   0.0
1    Ohio  2001   1.0
2    Ohio  2002   2.0
3  Nevada  2001   3.0
4  Nevada  2002   4.0
righth
             event1  event2
Nevada 2001       0       1
       2000       2       3
Ohio   2000       4       5
       2000       6       7
       2001       8       9
       2002      10      11
# pass a list to indicate the multiple columns to merge on
pd.merge(lefth,righth,left_on=['key1','key2'],right_index=True)
     key1  key2  data  event1  event2
0    Ohio  2000   0.0       4       5
0    Ohio  2000   0.0       6       7
1    Ohio  2001   1.0       8       9
2    Ohio  2002   2.0      10      11
3  Nevada  2001   3.0       0       1
pd.merge(lefth,righth,left_on=['key1','key2'],
        right_index=True,how='outer')
     key1  key2  data  event1  event2
0    Ohio  2000   0.0     4.0     5.0
0    Ohio  2000   0.0     6.0     7.0
1    Ohio  2001   1.0     8.0     9.0
2    Ohio  2002   2.0    10.0    11.0
3  Nevada  2001   3.0     0.0     1.0
4  Nevada  2002   4.0     NaN     NaN
4  Nevada  2000   NaN     2.0     3.0
# using the indexes of both sides of the merge
left2 = DataFrame([[1.,2.],[3.,4.],[5.,6.]],index=['a','c','e'],
                 columns=['Ohio','Nevada'])
right2 = DataFrame([[7.,8.],[9.,10.],[11.,12.],[13.,14.]],
                  index=['b','c','d','e'],columns=['Missouri','Alabama'])
left2
   Ohio  Nevada
a   1.0     2.0
c   3.0     4.0
e   5.0     6.0
right2
   Missouri  Alabama
b       7.0      8.0
c       9.0     10.0
d      11.0     12.0
e      13.0     14.0
pd.merge(left2,right2,how='outer',left_index=True,right_index=True)
   Ohio  Nevada  Missouri  Alabama
a   1.0     2.0       NaN      NaN
b   NaN     NaN       7.0      8.0
c   3.0     4.0       9.0     10.0
d   NaN     NaN      11.0     12.0
e   5.0     6.0      13.0     14.0
# the join method is a more convenient way to merge by index
left2.join(right2,how='outer')
   Ohio  Nevada  Missouri  Alabama
a   1.0     2.0       NaN      NaN
b   NaN     NaN       7.0      8.0
c   3.0     4.0       9.0     10.0
d   NaN     NaN      11.0     12.0
e   5.0     6.0      13.0     14.0
left1.join(right1,on='key')
  key  value  group_val
0   a      0        3.5
1   b      1        7.0
2   a      2        3.5
3   a      3        3.5
4   b      4        7.0
5   c      5        NaN
# you can also pass a list of DataFrames to join
another = DataFrame([[7.,8.],[9.,10.],[11.,12.],[16.,17]],
                   index=['a','c','e','f'],columns=['New York','Oregon'])
left2.join([right2,another])
   Ohio  Nevada  Missouri  Alabama  New York  Oregon
a   1.0     2.0       NaN      NaN       7.0     8.0
c   3.0     4.0       9.0     10.0       9.0    10.0
e   5.0     6.0      13.0     14.0      11.0    12.0
left2.join([right2,another],how='outer')
   Ohio  Nevada  Missouri  Alabama  New York  Oregon
a   1.0     2.0       NaN      NaN       7.0     8.0
c   3.0     4.0       9.0     10.0       9.0    10.0
e   5.0     6.0      13.0     14.0      11.0    12.0
b   NaN     NaN       7.0      8.0       NaN     NaN
d   NaN     NaN      11.0     12.0       NaN     NaN
f   NaN     NaN       NaN      NaN      16.0    17.0

Concatenating Along an Axis

# NumPy's concatenate function
arr = np.arange(12).reshape((3,4))
arr
array([[ 0,  1,  2,  3],
       [ 4,  5,  6,  7],
       [ 8,  9, 10, 11]])
np.concatenate([arr,arr],axis=1)
array([[ 0,  1,  2,  3,  0,  1,  2,  3],
       [ 4,  5,  6,  7,  4,  5,  6,  7],
       [ 8,  9, 10, 11,  8,  9, 10, 11]])
# pandas's concat function works along axis=0 by default
s1 = Series([0,1],index=['a','b'])
s2 = Series([2,3,4],index=['c','d','e'])
s3 = Series([5,6],index=['f','g'])
pd.concat([s1,s2,s3])
a    0
b    1
c    2
d    3
e    4
f    5
g    6
dtype: int64
# passing axis=1 (the columns) turns the result into a DataFrame
pd.concat([s1,s2,s3],axis=1)
     0    1    2
a  0.0  NaN  NaN
b  1.0  NaN  NaN
c  NaN  2.0  NaN
d  NaN  3.0  NaN
e  NaN  4.0  NaN
f  NaN  NaN  5.0
g  NaN  NaN  6.0
s4 = pd.concat([s1*5,s3])
s4
a    0
b    5
f    5
g    6
dtype: int64
pd.concat([s1,s4],axis=1)
     0  1
a  0.0  0
b  1.0  5
f  NaN  5
g  NaN  6
pd.concat([s1,s4],axis=1,join='inner')
   0  1
a  0  0
b  1  5
pd.concat([s1,s4],axis=1,join_axes=[['a','c','b','e']])
# This raises "concat() got an unexpected keyword argument 'join_axes'": the
# join_axes argument was deprecated in pandas 0.25 and removed in 1.0, so the
# book's example no longer runs on a current pandas install.
---------------------------------------------------------------------------
TypeError: concat() got an unexpected keyword argument 'join_axes'
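Since join_axes is gone, the documented replacement is to reindex the concatenated result, which should reproduce the book's intended output:
pd.concat([s1,s4],axis=1).reindex(['a','c','b','e'])  # same effect as join_axes=[['a','c','b','e']]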
# create a hierarchical index on the concatenation axis
result = pd.concat([s1,s1,s3],keys=['one','two','three'])
result
one    a    0
       b    1
two    a    0
       b    1
three  f    5
       g    6
dtype: int64
result.unstack()
         a    b    f    g
one    0.0  1.0  NaN  NaN
two    0.0  1.0  NaN  NaN
three  NaN  NaN  5.0  6.0
# when combining Series along axis=1, the keys become the DataFrame's column headers
pd.concat([s1,s2,s3],axis=1,keys=['one','two','three'])
   one  two  three
a  0.0  NaN    NaN
b  1.0  NaN    NaN
c  NaN  2.0    NaN
d  NaN  3.0    NaN
e  NaN  4.0    NaN
f  NaN  NaN    5.0
g  NaN  NaN    6.0
# the same logic applies to DataFrames
df1 = DataFrame(np.arange(6).reshape(3,2),index=['a','b','c'],
               columns=['one','two'])
df2 = DataFrame(5+np.arange(4).reshape(2,2),index=['a','c'],
               columns=['three','four'])
pd.concat([df1,df2],axis=1,keys=['level1','level2'])
  level1     level2     
     one two  three four
a      0   1    5.0  6.0
b      2   3    NaN  NaN
c      4   5    7.0  8.0
# if you pass a dict of objects instead of a list, the dict's keys are used for the keys option
pd.concat({'level1':df1,'level2':df2},axis=1)
  level1     level2     
     one two  three four
a      0   1    5.0  6.0
b      2   3    NaN  NaN
c      4   5    7.0  8.0
pd.concat([df1,df2],axis=1,keys=['level1','level2'],
         names=['upper','lower'])
upper level1     level2     
lower    one two  three four
a          0   1    5.0  6.0
b          2   3    NaN  NaN
c          4   5    7.0  8.0
df1 = DataFrame(np.random.randn(3,4),columns=['a','b','c','d'])
df2 = DataFrame(np.random.randn(2,3),columns=['b','d','a'])
df1
          a         b         c         d
0  0.909729 -1.166310 -0.988668 -0.398099
1  1.226715  1.429447 -0.353615  1.096639
2  0.420715  1.319891 -2.096798 -0.125654
df2
          b         d         a
0 -0.077881  0.031641 -0.839706
1 -0.231105  1.227890  0.422392
pd.concat([df1,df2],ignore_index=True)
          a         b         c         d
0  0.909729 -1.166310 -0.988668 -0.398099
1  1.226715  1.429447 -0.353615  1.096639
2  0.420715  1.319891 -2.096798 -0.125654
3 -0.839706 -0.077881       NaN  0.031641
4  0.422392 -0.231105       NaN  1.227890


Combining Data with Overlap

# NumPy's where function expresses a vectorized if-else
a = Series([np.nan,2.5,np.nan,3.5,4.5,np.nan],
          index=['f','e','d','c','b','a'])
b = Series(np.arange(len(a),dtype=np.float64),index=['f','e','d','c','b','a'])
b[-1] = np.nan
a
f    NaN
e    2.5
d    NaN
c    3.5
b    4.5
a    NaN
dtype: float64
b
f    0.0
e    1.0
d    2.0
c    3.0
b    4.0
a    NaN
dtype: float64
np.where(pd.isnull(a),b,a)
array([0. , 2.5, 2. , 3.5, 4.5, nan])
# another way
b[:-2].combine_first(a[2:])
a    NaN
b    4.5
c    3.0
d    2.0
e    1.0
f    0.0
dtype: float64
# combine_first uses data from the object you pass to "patch" missing data in the calling object
df1 = DataFrame({'a':[1.,np.nan,5.,np.nan],
                'b':[np.nan,2.,np.nan,6.],
                'c':range(2,18,4)})
df2 = DataFrame({'a':[5.,4.,np.nan,3.,7.],
                'b':[np.nan,3.,4.,6.,8.]})
df1.combine_first(df2)
     a    b     c
0  1.0  NaN   2.0
1  4.0  2.0   6.0
2  5.0  4.0  10.0
3  3.0  6.0  14.0
4  7.0  8.0   NaN
df1
     a    b   c
0  1.0  NaN   2
1  NaN  2.0   6
2  5.0  NaN  10
3  NaN  6.0  14
df2
     a    b
0  5.0  NaN
1  4.0  3.0
2  NaN  4.0
3  3.0  6.0
4  7.0  8.0

Reshaping and Pivoting

Reshaping with Hierarchical Indexing

stack: rotates the columns of the data into rows
unstack: rotates the rows into columns

data = DataFrame(np.arange(6).reshape((2,3)),index=pd.Index(['Ohio','Colorado'],name='state'),
                 columns=pd.Index(['one','two','three'],name='number'))
data
number    one  two  three
state
Ohio        0    1      2
Colorado    3    4      5
result = data.stack()
result
state     number
Ohio      one       0
          two       1
          three     2
Colorado  one       3
          two       4
          three     5
dtype: int32
result.unstack()
number    one  two  three
state
Ohio        0    1      2
Colorado    3    4      5
# pass a level number or name to unstack a different level
result.unstack(0)
state   Ohio  Colorado
number
one        0         3
two        1         4
three      2         5
result.unstack('state')
state   Ohio  Colorado
number
one        0         3
two        1         4
three      2         5
s1 = Series([0,1,2,3],index=['a','b','c','d'])
s2 = Series([4,5,6],index=['c','d','e'])
data2 = pd.concat([s1,s2],keys=['one','two'])
data2.unstack()
       a    b    c    d    e
one  0.0  1.0  2.0  3.0  NaN
two  NaN  NaN  4.0  5.0  6.0
data2.unstack().stack()
one  a    0.0
     b    1.0
     c    2.0
     d    3.0
two  c    4.0
     d    5.0
     e    6.0
dtype: float64
data2.unstack().stack(dropna=False)
one  a    0.0
     b    1.0
     c    2.0
     d    3.0
     e    NaN
two  a    NaN
     b    NaN
     c    4.0
     d    5.0
     e    6.0
dtype: float64
# when you unstack a DataFrame, the level unstacked becomes the lowest level in the result
df = DataFrame({'left':result,'right':result+5},columns=pd.Index(['left','right'],name='side'))
df
side             left  right
state    number
Ohio     one        0      5
         two        1      6
         three      2      7
Colorado one        3      8
         two        4      9
         three      5     10
df.unstack('state')
side    left          right
state   Ohio Colorado  Ohio Colorado
number
one        0        3     5        8
two        1        4     6        9
three      2        5     7       10
df.unstack('state').stack('side')
state         Colorado  Ohio
number side
one    left          3     0
       right         8     5
two    left          4     1
       right         9     6
three  left          5     2
       right        10     7

Pivoting "Long" to "Wide" Format

# ldata here is generated by hand and differs from the book's data; it is only meant for exercising the method
ldata = DataFrame({'date':['1959-03-31','1959-03-31','1959-03-31',
                                '1959-06-30','1959-06-30','1959-06-30',
                                '1959-09-30', '1959-09-30','1959-09-30'],
                'item':['realgdp', 'infl', 'unemp']*3,
                'value':[2710.349, 0.0, 5.8]*3})
print(ldata)
         date     item     value
0  1959-03-31  realgdp  2710.349
1  1959-03-31     infl     0.000
2  1959-03-31    unemp     5.800
3  1959-06-30  realgdp  2710.349
4  1959-06-30     infl     0.000
5  1959-06-30    unemp     5.800
6  1959-09-30  realgdp  2710.349
7  1959-09-30     infl     0.000
8  1959-09-30    unemp     5.800
pivoted = ldata.pivot('date','item','value')  # the first two arguments are the columns to use as the row and column index; the last names the column that fills in the values
pivoted.head()
item        infl   realgdp  unemp
date
1959-03-31   0.0  2710.349    5.8
1959-06-30   0.0  2710.349    5.8
1959-09-30   0.0  2710.349    5.8
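Note: positional arguments to pivot were later deprecated and are rejected by pandas 2.0, so on a current install the call above should be written with keywords:
pivoted = ldata.pivot(index='date', columns='item', values='value')  # equivalent keyword form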
ldata['value2'] = np.random.randn(len(ldata))
ldata
         date     item     value    value2
0  1959-03-31  realgdp  2710.349  0.576713
1  1959-03-31     infl     0.000  0.164654
2  1959-03-31    unemp     5.800 -0.309467
3  1959-06-30  realgdp  2710.349  1.321802
4  1959-06-30     infl     0.000  0.964655
5  1959-06-30    unemp     5.800 -0.355550
6  1959-09-30  realgdp  2710.349 -0.303973
7  1959-09-30     infl     0.000  0.791143
8  1959-09-30    unemp     5.800  0.985764
pivoted = ldata.pivot('date','item')
pivoted
           value                    value2
item        infl   realgdp unemp      infl   realgdp     unemp
date
1959-03-31   0.0  2710.349   5.8  0.164654  0.576713 -0.309467
1959-06-30   0.0  2710.349   5.8  0.964655  1.321802 -0.355550
1959-09-30   0.0  2710.349   5.8  0.791143 -0.303973  0.985764
pivoted['value']
item        infl   realgdp  unemp
date
1959-03-31   0.0  2710.349    5.8
1959-06-30   0.0  2710.349    5.8
1959-09-30   0.0  2710.349    5.8
# use set_index to create a hierarchical index, then reshape with unstack
unstacked = ldata.set_index(['date','item']).unstack('item')
unstacked
           value                    value2
item        infl   realgdp unemp      infl   realgdp     unemp
date
1959-03-31   0.0  2710.349   5.8  0.164654  0.576713 -0.309467
1959-06-30   0.0  2710.349   5.8  0.964655  1.321802 -0.355550
1959-09-30   0.0  2710.349   5.8  0.791143 -0.303973  0.985764

Data Transformation

Removing Duplicates

data = DataFrame({'k1':['one']*3+['two']*4,
                 'k2':[1,1,2,3,3,4,4]})
data
    k1  k2
0  one   1
1  one   1
2  one   2
3  two   3
4  two   3
5  two   4
6  two   4
# DataFrame's duplicated method returns a boolean Series indicating whether each row is a duplicate
data.duplicated()
0    False
1     True
2    False
3    False
4     True
5    False
6     True
dtype: bool
data.drop_duplicates()  # returns a DataFrame with the duplicate rows removed
    k1  k2
0  one   1
2  one   2
3  two   3
5  two   4
data['v1'] = range(7)
data.drop_duplicates(['k1'])
    k1  k2  v1
0  one   1   0
3  two   3   3
# duplicated and drop_duplicates keep the first observed value combination by default
data.drop_duplicates(['k1','k2'],keep='last')  # the book's take_last=True is now spelled keep='last', which keeps the last occurrence instead
    k1  k2  v1
1  one   1   1
2  one   2   2
4  two   3   4
6  two   4   6

Transforming Data Using a Function or Mapping

data = DataFrame({'food':['bacon','pulled pork','bacon','Pastrami','corned beef','Bacon','pastrami','honey ham','nova lox'],
                 'ounces':[4,3,12,6,7.5,8,3,5,6]})
data
          food  ounces
0        bacon     4.0
1  pulled pork     3.0
2        bacon    12.0
3     Pastrami     6.0
4  corned beef     7.5
5        Bacon     8.0
6     pastrami     3.0
7    honey ham     5.0
8     nova lox     6.0
meat_to_animal = {
    'bacon':'pig',
    'pulled pork':'pig',
    'pastrami':'cow',
    'corned beef':'cow',
    'honey ham':'pig',
    'nova lox':'salmon'
}
data['animal'] = data['food'].map(str.lower).map(meat_to_animal)
data
          food  ounces  animal
0        bacon     4.0     pig
1  pulled pork     3.0     pig
2        bacon    12.0     pig
3     Pastrami     6.0     cow
4  corned beef     7.5     cow
5        Bacon     8.0     pig
6     pastrami     3.0     cow
7    honey ham     5.0     pig
8     nova lox     6.0  salmon
data['food'].map(lambda x: meat_to_animal[x.lower()])
# map is a convenient way to perform element-wise transformations and other cleaning work
0       pig
1       pig
2       pig
3       cow
4       cow
5       pig
6       cow
7       pig
8    salmon
Name: food, dtype: object
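One caveat with map(str.lower): it raises a TypeError if the column contains missing values, because the plain Python function gets applied to NaN as well. The vectorized string method avoids this; a sketch:
data['food'].str.lower().map(meat_to_animal)  # Series.str.lower() propagates NaN instead of failing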

Replacing Values

data = Series([1.,-999.,2.,-999.,-1000.,3.])
data
0       1.0
1    -999.0
2       2.0
3    -999.0
4   -1000.0
5       3.0
dtype: float64
data.replace(-999,np.nan)
0       1.0
1       NaN
2       2.0
3       NaN
4   -1000.0
5       3.0
dtype: float64
# to replace several values at once, pass a list of values to replace and a single replacement
data.replace([-999,-1000],np.nan)
0    1.0
1    NaN
2    2.0
3    NaN
4    NaN
5    3.0
dtype: float64
# to use a different replacement for each value
data.replace([-999,-1000],[np.nan,0])
0    1.0
1    NaN
2    2.0
3    NaN
4    0.0
5    3.0
dtype: float64
# the argument can also be a dict
data.replace({-999:np.nan,-1000:0})
0    1.0
1    NaN
2    2.0
3    NaN
4    0.0
5    3.0
dtype: float64

Renaming Axis Indexes

data = DataFrame(np.arange(12).reshape((3,4)),
                index=['Ohio','Colorado','New York'],
                columns=['one','two','three','four'])
data.index.map(str.upper)
Index(['OHIO', 'COLORADO', 'NEW YORK'], dtype='object')
data.index = data.index.map(str.upper)
data
          one  two  three  four
OHIO        0    1      2     3
COLORADO    4    5      6     7
NEW YORK    8    9     10    11
# to create a transformed version of the dataset (without modifying the original), the handy method is rename
data.rename(index=str.title,columns=str.upper)
          ONE  TWO  THREE  FOUR
Ohio        0    1      2     3
Colorado    4    5      6     7
New York    8    9     10    11
data.rename(index={'OHIO':'INDIANA'},
           columns={'three':'peekaboo'})
          one  two  peekaboo  four
INDIANA     0    1         2     3
COLORADO    4    5         6     7
NEW YORK    8    9        10    11
_ = data.rename(index={'OHIO':'INDIANA'},inplace=True)
data
          one  two  three  four
INDIANA     0    1      2     3
COLORADO    4    5      6     7
NEW YORK    8    9     10    11

Discretization and Binning

ages = [20,22,25,27,21,23,37,31,61,45,41,32]
bins = [18,25,35,60,100]
cats = pd.cut(ages,bins)
cats
[(18, 25], (18, 25], (18, 25], (25, 35], (18, 25], ..., (25, 35], (60, 100], (35, 60], (35, 60], (25, 35]]
Length: 12
Categories (4, interval[int64, right]): [(18, 25] < (25, 35] < (35, 60] < (60, 100]]
# the labels attribute (cats.labels) is gone; it is now Categorical.codes
cats.codes
array([0, 0, 0, 1, 0, 0, 2, 1, 3, 2, 2, 1], dtype=int8)
# the levels attribute is gone; it is now Categorical.categories
cats.categories
IntervalIndex([(18, 25], (25, 35], (35, 60], (60, 100]], dtype='interval[int64, right]')
pd.value_counts(cats)
(18, 25]     5
(25, 35]     3
(35, 60]     3
(60, 100]    1
dtype: int64
# which side is closed can be changed with right=False
pd.cut(ages,[18,26,36,61,100],right=False)
[[18, 26), [18, 26), [18, 26), [26, 36), [18, 26), ..., [26, 36), [61, 100), [36, 61), [36, 61), [26, 36)]
Length: 12
Categories (4, interval[int64, left]): [[18, 26) < [26, 36) < [36, 61) < [61, 100)]
# you can set your own bin names
group_names = ['Youth','YoungAdult','MiddleAged','Senior']
pd.cut(ages,bins,labels=group_names)
['Youth', 'Youth', 'Youth', 'YoungAdult', 'Youth', ..., 'YoungAdult', 'Senior', 'MiddleAged', 'MiddleAged', 'YoungAdult']
Length: 12
Categories (4, object): ['Youth' < 'YoungAdult' < 'MiddleAged' < 'Senior']
# if you pass cut an integer number of bins instead of explicit bin edges, it computes equal-length bins from the data's min and max
data = np.random.rand(20)  # uniformly distributed data
pd.cut(data,4,precision=2)
[(0.031, 0.27], (0.75, 1.0], (0.27, 0.51], (0.031, 0.27], (0.031, 0.27], ..., (0.27, 0.51], (0.031, 0.27], (0.75, 1.0], (0.75, 1.0], (0.031, 0.27]]
Length: 20
Categories (4, interval[float64, right]): [(0.031, 0.27] < (0.27, 0.51] < (0.51, 0.75] < (0.75, 1.0]]
# qcut bins the data based on sample quantiles
data = np.random.randn(1000)  # normally distributed
cats = pd.qcut(data,4)  # cut into quartiles
cats
[(0.699, 2.801], (0.699, 2.801], (0.699, 2.801], (0.0346, 0.699], (-0.703, 0.0346], ..., (-2.582, -0.703], (-0.703, 0.0346], (-2.582, -0.703], (0.699, 2.801], (0.699, 2.801]]
Length: 1000
Categories (4, interval[float64, right]): [(-2.582, -0.703] < (-0.703, 0.0346] < (0.0346, 0.699] < (0.699, 2.801]]
pd.value_counts(cats)
(-2.582, -0.703]    250
(-0.703, 0.0346]    250
(0.0346, 0.699]     250
(0.699, 2.801]      250
dtype: int64
# pass your own quantiles
pd.qcut(data,[0,0.1,0.5,0.9,1.])
[(1.263, 2.801], (0.0346, 1.263], (1.263, 2.801], (0.0346, 1.263], (-1.274, 0.0346], ..., (-2.582, -1.274], (-1.274, 0.0346], (-1.274, 0.0346], (0.0346, 1.263], (0.0346, 1.263]]
Length: 1000
Categories (4, interval[float64, right]): [(-2.582, -1.274] < (-1.274, 0.0346] < (0.0346, 1.263] < (1.263, 2.801]]
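If you only want the quantile bin numbers rather than interval labels, qcut (like cut) accepts labels=False:
pd.qcut(data,4,labels=False)  # integer quartile indices 0-3 instead of Categorical intervals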

Detecting and Filtering Outliers

np.random.seed(12345)
data = DataFrame(np.random.randn(1000,4))
data.describe()
                 0            1            2            3
count  1000.000000  1000.000000  1000.000000  1000.000000
mean     -0.067684     0.067924     0.025598    -0.002298
std       0.998035     0.992106     1.006835     0.996794
min      -3.428254    -3.548824    -3.184377    -3.745356
25%      -0.774890    -0.591841    -0.641675    -0.644144
50%      -0.116401     0.101143     0.002073    -0.013611
75%       0.616366     0.780282     0.680391     0.654328
max       3.366626     2.653656     3.260383     3.927528
col = data[3]
col[np.abs(col)>3]
97     3.927528
305   -3.399312
400   -3.745356
Name: 3, dtype: float64
# select all rows containing a value exceeding 3 or -3
data[(np.abs(data)>3).any(axis=1)]
            0         1         2         3
5   -0.539741  0.476985  3.248944 -1.021228
97  -0.774363  0.552936  0.106061  3.927528
102 -0.655054 -0.565230  3.176873  0.959533
305 -2.315555  0.457246 -0.025907 -3.399312
324  0.050188  1.951312  3.260383  0.963301
400  0.146326  0.508391 -0.196713 -3.745356
499 -0.293333 -0.242459 -3.056990  1.918403
523 -3.428254 -0.296336 -0.439938 -0.867165
586  0.275144  1.179227 -3.184377  1.369891
808 -0.362528 -3.548824  1.553205 -2.186301
900  3.366626 -2.372214  0.851010  1.332846
# cap the values to the interval [-3, 3]
data[np.abs(data)>3] = np.sign(data)*3
data.describe()
                 0            1            2            3
count  1000.000000  1000.000000  1000.000000  1000.000000
mean     -0.067623     0.068473     0.025153    -0.002081
std       0.995485     0.990253     1.003977     0.989736
min      -3.000000    -3.000000    -3.000000    -3.000000
25%      -0.774890    -0.591841    -0.641675    -0.644144
50%      -0.116401     0.101143     0.002073    -0.013611
75%       0.616366     0.780282     0.680391     0.654328
max       3.000000     2.653656     3.000000     3.000000
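The same capping could also have been written with the clip method instead of the np.sign trick; a sketch:
data.clip(lower=-3,upper=3)  # returns a copy with values bounded to [-3, 3]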

Permutation and Random Sampling

df = DataFrame(np.arange(5*4).reshape(5,4))
# permuting (randomly reordering) a Series or the rows of a DataFrame
sampler = np.random.permutation(5)
sampler
array([1, 0, 2, 3, 4])
df
    0   1   2   3
0   0   1   2   3
1   4   5   6   7
2   8   9  10  11
3  12  13  14  15
4  16  17  18  19
df.take(sampler)
    0   1   2   3
1   4   5   6   7
0   0   1   2   3
2   8   9  10  11
3  12  13  14  15
4  16  17  18  19
# one way to select a random subset without replacement
df.take(np.random.permutation(len(df))[:3])
    0   1   2   3
1   4   5   6   7
3  12  13  14  15
4  16  17  18  19
# to generate a sample with replacement, one fast way is to draw random integers with np.random.randint
bag = np.array([5,7,-1,6,4])
sampler = np.random.randint(0,len(bag),size=10)
sampler
array([4, 4, 2, 2, 2, 0, 3, 0, 4, 1])
draws = bag.take(sampler)  # take picks out the elements of bag at the sampled positions
draws
array([ 4,  4, -1, -1, -1,  5,  6,  5,  4,  7])
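Newer pandas wraps both cases in the sample method, which may read more clearly:
df.sample(n=3)                           # without replacement
Series(bag).sample(n=10,replace=True)    # with replacement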

Computing Indicator/Dummy Variables

If a column in a DataFrame has k distinct values, you can derive a matrix or DataFrame with k columns of 1s and 0s.
pandas's get_dummies function does exactly that.

df = DataFrame({'key':['b','b','a','c','a','b'],
               'data1':range(6)})
pd.get_dummies(df['key'])
   a  b  c
0  0  1  0
1  0  1  0
2  1  0  0
3  0  0  1
4  1  0  0
5  0  1  0
# add a prefix to the columns of the indicator DataFrame
dummies = pd.get_dummies(df['key'],prefix='key')
df_with_dummy = df[['data1']].join(dummies)
df_with_dummy
   data1  key_a  key_b  key_c
0      0      0      1      0
1      1      0      1      0
2      2      1      0      0
3      3      0      0      1
4      4      1      0      0
5      5      0      1      0
# when a row in the DataFrame can belong to multiple categories
mnames = ['movie_id','title','genres']
movies = pd.read_table("E:\python_study_files\python\pydata-book-2nd-edition\datasets\movielens\movies.dat",sep='::',header=None,names=mnames)
movies[:10]
C:\windows\Temp/ipykernel_6352/824848908.py:3: ParserWarning: Falling back to the 'python' engine because the 'c' engine does not support regex separators (separators > 1 char and different from '\s+' are interpreted as regex); you can avoid this warning by specifying engine='python'.
  movies = pd.read_table("E:\python_study_files\python\pydata-book-2nd-edition\datasets\movielens\movies.dat",sep='::',header=None,names=mnames)
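As the warning itself suggests, passing engine='python' makes the multi-character separator explicit and silences the message:
movies = pd.read_table("E:\python_study_files\python\pydata-book-2nd-edition\datasets\movielens\movies.dat",sep='::',header=None,names=mnames,engine='python')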
   movie_id                               title                        genres
0         1                    Toy Story (1995)   Animation|Children's|Comedy
1         2                      Jumanji (1995)  Adventure|Children's|Fantasy
2         3             Grumpier Old Men (1995)                Comedy|Romance
3         4            Waiting to Exhale (1995)                  Comedy|Drama
4         5  Father of the Bride Part II (1995)                        Comedy
5         6                         Heat (1995)         Action|Crime|Thriller
6         7                      Sabrina (1995)                Comedy|Romance
7         8                 Tom and Huck (1995)          Adventure|Children's
8         9                 Sudden Death (1995)                        Action
9        10                    GoldenEye (1995)     Action|Adventure|Thriller
# to add an indicator variable for each genre, first collect the set of distinct genres
genre_iter = (set(x.split('|')) for x in movies.genres)
genres = sorted(set.union(*genre_iter))
dummies = DataFrame(np.zeros((len(movies),len(genres))),columns=genres)
# iterate over each movie and set the appropriate entries in each row of dummies to 1
for i, gen in enumerate(movies.genres):
    dummies.loc[i,gen.split('|')] = 1
# then join it with movies
movies_windic = movies.join(dummies.add_prefix('Genre_'))
movies_windic.loc[0]
movie_id                                       1
title                           Toy Story (1995)
genres               Animation|Children's|Comedy
Genre_Action                                 0.0
Genre_Adventure                              0.0
Genre_Animation                              1.0
Genre_Children's                             1.0
Genre_Comedy                                 1.0
Genre_Crime                                  0.0
Genre_Documentary                            0.0
Genre_Drama                                  0.0
Genre_Fantasy                                0.0
Genre_Film-Noir                              0.0
Genre_Horror                                 0.0
Genre_Musical                                0.0
Genre_Mystery                                0.0
Genre_Romance                                0.0
Genre_Sci-Fi                                 0.0
Genre_Thriller                               0.0
Genre_War                                    0.0
Genre_Western                                0.0
Name: 0, dtype: object
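As an aside, pandas can build the same membership matrix in one vectorized call with Series.str.get_dummies, which splits on a separator; this sketch should match the loop above (up to dtype, since it produces integer 0/1 columns):
dummies = movies.genres.str.get_dummies('|')
movies_windic = movies.join(dummies.add_prefix('Genre_'))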
# combining get_dummies with a discretization function like cut
values = np.random.rand(10)
values
array([0.75603383, 0.90830844, 0.96588737, 0.17373658, 0.87592824,
       0.75415641, 0.163486  , 0.23784062, 0.85564381, 0.58743194])
bins = [0,0.2,0.4,0.6,0.8,1]
pd.get_dummies(pd.cut(values,bins))
   (0.0, 0.2]  (0.2, 0.4]  (0.4, 0.6]  (0.6, 0.8]  (0.8, 1.0]
0           0           0           0           1           0
1           0           0           0           0           1
2           0           0           0           0           1
3           1           0           0           0           0
4           0           0           0           0           1
5           0           0           0           1           0
6           1           0           0           0           0
7           0           1           0           0           0
8           0           0           0           0           1
9           0           0           1           0           0

String Manipulation

String Object Methods

val = 'a,b, guido'
val.split(',')
['a', 'b', ' guido']
# strip trims whitespace (including newlines)
pieces = [x.strip() for x in val.split(',')]
pieces
['a', 'b', 'guido']
first, second, third = pieces
first + '::' + second + '::' + third
'a::b::guido'
'::'.join(pieces)
'a::b::guido'
'guido' in val
True
val.index(',')
1
val.find(':')
-1
# the difference between find and index: index raises an exception when the substring isn't found (instead of returning -1)
val.index(':')
---------------------------------------------------------------------------
ValueError                                Traceback (most recent call last)
----> 1 val.index(':')

ValueError: substring not found
# count returns the number of occurrences of a substring
val.count(',')
2
val.replace(',','::')
'a::b:: guido'
val.replace(',','')
'ab guido'


Regular Expressions

Regular expressions provide a flexible way to search for or match string patterns in text.
The functions in the re module fall into three categories: pattern matching, substitution, and splitting.

import re
text = "foo   bar\t baz   \tqux"
# the regex describing one or more whitespace characters is \s+
re.split('\s+',text)
['foo', 'bar', 'baz', 'qux']
# if you intend to apply the same regex to many strings, compile it first
regex = re.compile('\s+')
regex.split(text)
['foo', 'bar', 'baz', 'qux']
regex.findall(text)
['   ', '\t ', '   \t']

findall returns all matches found in the string, search returns only the first match, and match only matches at the beginning of the string.

text = """Dave dave@google.com
Steve steve@gmail.com
Rob rob@gmail.com
"""
pattern = r'[A-Z0-9._%+-]+@[A-Z0-9.-]+\.[A-Z]{2,4}'
# re.IGNORECASE makes the regex case-insensitive
regex = re.compile(pattern,flags=re.IGNORECASE)
regex.findall(text)
['dave@google.com', 'steve@gmail.com', 'rob@gmail.com']
m = regex.search(text)
m
<re.Match object; span=(5, 20), match='dave@google.com'>
text[m.start():m.end()]
'dave@google.com'
print(regex.match(text))
None
# sub replaces occurrences of the pattern with the given string and returns the new string
print(regex.sub('REDACTED',text))
Dave REDACTED
Steve REDACTED
Rob REDACTED
# to split an address into three parts (username, domain, and domain suffix), wrap each part of the pattern in parentheses
pattern = r'([A-Z0-9._%+-]+)@([A-Z0-9.-]+)\.([A-Z]{2,4})'
regex = re.compile(pattern,flags=re.IGNORECASE)
m = regex.match('wesm@bright.net')
m.groups()
('wesm', 'bright', 'net')
regex.findall(text)
[('dave', 'google', 'com'), ('steve', 'gmail', 'com'), ('rob', 'gmail', 'com')]
# sub can also refer to the groups of each match with special symbols like \1 and \2
print(regex.sub(r'Username: \1, Domain: \2, Suffix: \3',text))
Dave Username: dave, Domain: google, Suffix: com
Steve Username: steve, Domain: gmail, Suffix: com
Rob Username: rob, Domain: gmail, Suffix: com
regex = re.compile(r"""
(?P<username>[A-Z0-9._%+-]+)
@
(?P<domain>[A-Z0-9.-]+)
\.
(?P<suffix>[A-Z]{2,4})""",flags=re.IGNORECASE|re.VERBOSE)
m = regex.match('wesm@bright.net')
m.groupdict()
{'username': 'wesm', 'domain': 'bright', 'suffix': 'net'}
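sub can also refer to named groups with the \g<name> syntax, which pairs naturally with the verbose pattern above:
print(regex.sub(r'Username: \g<username>, Domain: \g<domain>, Suffix: \g<suffix>',text))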


Vectorized String Functions in pandas

data = {'Dave':'dave@google.com','Steve':'steve@gmail.com',
       'Rob':'rob@gmail.com','Wes':np.nan}
data = Series(data)
data
Dave     dave@google.com
Steve    steve@gmail.com
Rob        rob@gmail.com
Wes                  NaN
dtype: object
data.isnull()
Dave     False
Steve    False
Rob      False
Wes       True
dtype: bool
data.str.contains('gmail')
Dave     False
Steve     True
Rob       True
Wes        NaN
dtype: object
pattern
'([A-Z0-9._%+-]+)@([A-Z0-9.-]+)\\.([A-Z]{2,4})'
x = data.str.findall(pattern,flags=re.IGNORECASE)
x
Dave     [(dave, google, com)]
Steve    [(steve, gmail, com)]
Rob        [(rob, gmail, com)]
Wes                        NaN
dtype: object
matches = data.str.match(pattern,flags=re.IGNORECASE)
matches
Dave     True
Steve    True
Rob      True
Wes       NaN
dtype: object
x.str.get(1)  # each element of x is a one-item list, so index 1 is out of range and yields NaN
Dave    NaN
Steve   NaN
Rob     NaN
Wes     NaN
dtype: float64
x.str[0]
Dave     (dave, google, com)
Steve    (steve, gmail, com)
Rob        (rob, gmail, com)
Wes                      NaN
dtype: object
# slice substrings out of the strings
data.str[:5]
Dave     dave@
Steve    steve
Rob      rob@g
Wes        NaN
dtype: object
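To get the capture groups as columns of a DataFrame rather than lists of tuples, newer pandas offers str.extract; a sketch:
data.str.extract(pattern,flags=re.IGNORECASE)  # one column per group, NaN where there is no match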


Example: USDA Food Database

import json
db = json.load(open("E:\\python_study_files\\python\\pydata-book-2nd-edition\\datasets\\usda_food\\database.json"))
len(db)
6636
# each entry in db is a dict containing all the data for a single food
db[0].keys()
dict_keys(['id', 'description', 'tags', 'manufacturer', 'group', 'portions', 'nutrients'])
db[0]['nutrients'][0]
{'value': 25.18,
 'units': 'g',
 'description': 'Protein',
 'group': 'Composition'}
nutrients = DataFrame(db[0]['nutrients'])
nutrients[:7]
     value units                  description        group
0    25.18     g                      Protein  Composition
1    29.20     g            Total lipid (fat)  Composition
2     3.06     g  Carbohydrate, by difference  Composition
3     3.28     g                          Ash        Other
4   376.00  kcal                       Energy       Energy
5    39.28     g                        Water  Composition
6  1573.00    kJ                       Energy       Energy
info_keys = ['description','group','id','manufacturer']
info = DataFrame(db,columns=info_keys)
info[:5]
                          description                   group    id manufacturer
0                     Cheese, caraway  Dairy and Egg Products  1008             
1                     Cheese, cheddar  Dairy and Egg Products  1009             
2                        Cheese, edam  Dairy and Egg Products  1018             
3                        Cheese, feta  Dairy and Egg Products  1019             
4  Cheese, mozzarella, part skim milk  Dairy and Egg Products  1028             
info
                                            description                        group     id manufacturer
0                                       Cheese, caraway       Dairy and Egg Products   1008             
1                                       Cheese, cheddar       Dairy and Egg Products   1009             
2                                          Cheese, edam       Dairy and Egg Products   1018             
3                                          Cheese, feta       Dairy and Egg Products   1019             
4                    Cheese, mozzarella, part skim milk       Dairy and Egg Products   1028             
...                                                 ...                          ...    ...          ...
6631                             Bologna, beef, low fat  Sausages and Luncheon Meats  42161             
6632  Turkey and pork sausage, fresh, bulk, patty or...  Sausages and Luncheon Meats  42173             
6633                              Babyfood, juice, pear                   Baby Foods  43408         None
6634         Babyfood, dessert, banana yogurt, strained                   Baby Foods  43539         None
6635              Babyfood, banana no tapioca, strained                   Baby Foods  43546         None

6636 rows × 4 columns

pd.value_counts(info.group)[:10]
Vegetables and Vegetable Products    812
Beef Products                        618
Baked Products                       496
Breakfast Cereals                    403
Legumes and Legume Products          365
Fast Foods                           365
Lamb, Veal, and Game Products        345
Sweets                               341
Fruits and Fruit Juices              328
Pork Products                        328
Name: group, dtype: int64
# to run some analysis on all of the nutrient data, assemble each food's nutrients into one large table
nutrients = []
for rec in db:
    fnuts = DataFrame(rec['nutrients'])
    fnuts['id'] = rec['id']
    nutrients.append(fnuts)
nutrients = pd.concat(nutrients,ignore_index=True)
nutrients
         value units                         description        group     id
0       25.180     g                             Protein  Composition   1008
1       29.200     g                   Total lipid (fat)  Composition   1008
2        3.060     g         Carbohydrate, by difference  Composition   1008
3        3.280     g                                 Ash        Other   1008
4      376.000  kcal                              Energy       Energy   1008
...        ...   ...                                 ...          ...    ...
389350   0.000   mcg                 Vitamin B-12, added     Vitamins  43546
389351   0.000    mg                         Cholesterol        Other  43546
389352   0.072     g        Fatty acids, total saturated        Other  43546
389353   0.028     g  Fatty acids, total monounsaturated        Other  43546
389354   0.041     g  Fatty acids, total polyunsaturated        Other  43546

389355 rows × 5 columns

# drop the duplicate rows
nutrients.duplicated().sum()
14179
nutrients = nutrients.drop_duplicates()
col_mapping = {'description':'food',
              'group':'fgroup'}
info = info.rename(columns=col_mapping,copy=False)
info
                                                   food                       fgroup     id manufacturer
0                                       Cheese, caraway       Dairy and Egg Products   1008             
1                                       Cheese, cheddar       Dairy and Egg Products   1009             
2                                          Cheese, edam       Dairy and Egg Products   1018             
3                                          Cheese, feta       Dairy and Egg Products   1019             
4                    Cheese, mozzarella, part skim milk       Dairy and Egg Products   1028             
...                                                 ...                          ...    ...          ...
6631                             Bologna, beef, low fat  Sausages and Luncheon Meats  42161             
6632  Turkey and pork sausage, fresh, bulk, patty or...  Sausages and Luncheon Meats  42173             
6633                              Babyfood, juice, pear                   Baby Foods  43408         None
6634         Babyfood, dessert, banana yogurt, strained                   Baby Foods  43539         None
6635              Babyfood, banana no tapioca, strained                   Baby Foods  43546         None

6636 rows × 4 columns

col_mapping = {'description':'nutrient',
              'group':'nutgroup'}
nutrients = nutrients.rename(columns=col_mapping,copy=False)
nutrients
         value units                            nutrient     nutgroup     id
0       25.180     g                             Protein  Composition   1008
1       29.200     g                   Total lipid (fat)  Composition   1008
2        3.060     g         Carbohydrate, by difference  Composition   1008
3        3.280     g                                 Ash        Other   1008
4      376.000  kcal                              Energy       Energy   1008
...        ...   ...                                 ...          ...    ...
375171   0.000   mcg                 Vitamin B-12, added     Vitamins  43546
375172   0.000    mg                         Cholesterol        Other  43546
375173   0.072     g        Fatty acids, total saturated        Other  43546
375174   0.028     g  Fatty acids, total monounsaturated        Other  43546
375175   0.041     g  Fatty acids, total polyunsaturated        Other  43546

375176 rows × 5 columns

# merge info with nutrients
ndata = pd.merge(nutrients,info,on='id',how='outer')
ndata
         value units                            nutrient     nutgroup     id                                   food                       fgroup manufacturer
0       25.180     g                             Protein  Composition   1008                        Cheese, caraway       Dairy and Egg Products             
1       29.200     g                   Total lipid (fat)  Composition   1008                        Cheese, caraway       Dairy and Egg Products             
2        3.060     g         Carbohydrate, by difference  Composition   1008                        Cheese, caraway       Dairy and Egg Products             
3        3.280     g                                 Ash        Other   1008                        Cheese, caraway       Dairy and Egg Products             
4      376.000  kcal                              Energy       Energy   1008                        Cheese, caraway       Dairy and Egg Products             
...        ...   ...                                 ...          ...    ...                                    ...                          ...          ...
375171   0.000   mcg                 Vitamin B-12, added     Vitamins  43546  Babyfood, banana no tapioca, strained                   Baby Foods         None
375172   0.000    mg                         Cholesterol        Other  43546  Babyfood, banana no tapioca, strained                   Baby Foods         None
375173   0.072     g        Fatty acids, total saturated        Other  43546  Babyfood, banana no tapioca, strained                   Baby Foods         None
375174   0.028     g  Fatty acids, total monounsaturated        Other  43546  Babyfood, banana no tapioca, strained                   Baby Foods         None
375175   0.041     g  Fatty acids, total polyunsaturated        Other  43546  Babyfood, banana no tapioca, strained                   Baby Foods         None

375176 rows × 8 columns

result = ndata.groupby(['nutrient','fgroup'])['value'].quantile(0.5)
result['Zinc, Zn'].sort_values().plot(kind='barh')
by_nutrient = ndata.groupby(['nutgroup','nutrient'])
get_maximum = lambda x: x.xs(x.value.idxmax())
get_minimum = lambda x: x.xs(x.value.idxmin())
max_foods = by_nutrient.apply(get_maximum)[['value','food']]
# truncate the food names to keep the display smaller
max_foods.food = max_foods.food.str[:50]
max_foods.loc['Amino Acids']['food']
nutrient
Alanine                           Gelatins, dry powder, unsweetened
Arginine                               Seeds, sesame flour, low-fat
Aspartic acid                                   Soy protein isolate
Cystine                Seeds, cottonseed flour, low fat (glandless)
Glutamic acid                                   Soy protein isolate
Glycine                           Gelatins, dry powder, unsweetened
Histidine                Whale, beluga, meat, dried (Alaska Native)
Hydroxyproline    KENTUCKY FRIED CHICKEN, Fried Chicken, ORIGINA...
Isoleucine        Soy protein isolate, PROTEIN TECHNOLOGIES INTE...
Leucine           Soy protein isolate, PROTEIN TECHNOLOGIES INTE...
Lysine            Seal, bearded (Oogruk), meat, dried (Alaska Na...
Methionine                    Fish, cod, Atlantic, dried and salted
Phenylalanine     Soy protein isolate, PROTEIN TECHNOLOGIES INTE...
Proline                           Gelatins, dry powder, unsweetened
Serine            Soy protein isolate, PROTEIN TECHNOLOGIES INTE...
Threonine         Soy protein isolate, PROTEIN TECHNOLOGIES INTE...
Tryptophan         Sea lion, Steller, meat with fat (Alaska Native)
Tyrosine          Soy protein isolate, PROTEIN TECHNOLOGIES INTE...
Valine            Soy protein isolate, PROTEIN TECHNOLOGIES INTE...
Name: food, dtype: object