import pandas as pd
import numpy as np
import warnings
path = r"C:\Users\yongx\Desktop\data"
df = pd. read_csv( path + "\\learn_pandas.csv" , \
usecols = [ 'School' , 'Grade' , 'Name' , 'Gender' , 'Weight' , 'Transfer' ] )
df[ 'Name' ] . head( 2 )
0 Gaopeng Yang
1 Changqiang You
Name: Name, dtype: object
索引器
列索引
df[ [ 'Gender' , 'Name' ] ] . head( 2 )
Gender Name 0 Female Gaopeng Yang 1 Male Changqiang You
df. Name. head( 2 )
0 Gaopeng Yang
1 Changqiang You
Name: Name, dtype: object
字符行索引
s = pd. Series( [ 1 , 2 , 3 , 4 , 5 , 6 ] , \
index = [ 'a' , 'b' , 'a' , 'a' , 'a' , 'c' ] )
s[ 'b' ]
2
s[ [ 'b' , 'c' ] ]
b 2
c 6
dtype: int64
s[ 'c' : 'b' : - 2 ]
c 6
a 4
b 2
dtype: int64
整数行索引
s = pd. Series( [ 'a' , 'b' , 'c' , 'd' , 'e' , 'f' ] , \
index = [ 1 , 3 , 1 , 2 , 5 , 4 ] )
s[ 1 ]
1 a
1 c
dtype: object
s[ [ 2 , 3 ] ]
2 d
3 b
dtype: object
s[ 1 : - 1 : 2 ]
3 b
2 d
dtype: object
loc索引器
对于表有两种索引器,基于元素
的loc
和基于位置
的iloc
的索引器. loc
索引器形式为loc[*,*]
,分别代表行
,列
.当省略第二个位置时表示对行进行筛选.*
的位置一共有五类合法对象,分别为单个元素,元素列表,元素切片,布尔列表及函数.
df_demo = df. set_index( 'Name' )
df_demo. head( 3 )
School Grade Gender Weight Transfer Name Gaopeng Yang Shanghai Jiao Tong University Freshman Female 46.0 N Changqiang You Peking University Freshman Male 70.0 N Mei Sun Shanghai Jiao Tong University Senior Male 89.0 N
df_demo. loc[ 'Qiang Sun' ]
School Grade Gender Weight Transfer Name Qiang Sun Tsinghua University Junior Female 53.0 N Qiang Sun Tsinghua University Sophomore Female 40.0 N Qiang Sun Shanghai Jiao Tong University Junior Female NaN N
print ( df_demo. loc[ 'Quan Zhao' ] )
type ( df_demo. loc[ 'Quan Zhao' ] )
School Shanghai Jiao Tong University
Grade Junior
Gender Female
Weight 53
Transfer N
Name: Quan Zhao, dtype: object
pandas.core.series.Series
df_demo. loc[ 'Qiang Sun' , 'School' ]
Name
Qiang Sun Tsinghua University
Qiang Sun Tsinghua University
Qiang Sun Shanghai Jiao Tong University
Name: School, dtype: object
df_demo. loc[ [ 'Qiang Sun' , 'Quan Zhao' ] , [ 'School' ,
'Gender' ] ]
School Gender Name Qiang Sun Tsinghua University Female Qiang Sun Tsinghua University Female Qiang Sun Shanghai Jiao Tong University Female Quan Zhao Shanghai Jiao Tong University Female
df_demo. loc[ 'Gaojuan You' : 'Gaoqiang Qian' ,
'School' : 'Gender' ]
School Grade Gender Name Gaojuan You Fudan University Sophomore Male Xiaoli Qian Tsinghua University Freshman Female Qiang Chu Shanghai Jiao Tong University Freshman Female Gaoqiang Qian Tsinghua University Junior Female
df_loc_slice_demo = df_demo. copy( )
df_loc_slice_demo. index = range ( df_demo. shape[ 0 ] , 0 , - 1 )
df_loc_slice_demo. loc[ 5 : 3 ]
School Grade Gender Weight Transfer 5 Fudan University Junior Female 46.0 N 4 Tsinghua University Senior Female 50.0 N 3 Shanghai Jiao Tong University Senior Female 45.0 N
df_loc_slice_demo. loc[ 3 : 5 ]
School Grade Gender Weight Transfer
df_demo. loc[ df_demo. Weight > 70 ] . head( 3 )
School Grade Gender Weight Transfer Name Mei Sun Shanghai Jiao Tong University Senior Male 89.0 N Gaojuan You Fudan University Sophomore Male 74.0 N Xiaopeng Zhou Shanghai Jiao Tong University Freshman Male 74.0 N
df_demo. loc[ df_demo. Grade. isin( [ 'Freshman' , 'Senior' ] ) ] . head( 3 )
School Grade Gender Weight Transfer Name Gaopeng Yang Shanghai Jiao Tong University Freshman Female 46.0 N Changqiang You Peking University Freshman Male 70.0 N Mei Sun Shanghai Jiao Tong University Senior Male 89.0 N
condition_1_1 = df_demo. School == 'Fudan University'
condition_1_2 = df_demo. Grade == 'Senior'
condition_1_3 = df_demo. Weight > 70
condition_1 = condition_1_1 & condition_1_2 & condition_1_3
condition_2_1 = df_demo. School == 'Peking University'
condition_2_2 = df_demo. Grade == 'Senior'
condition_2_3 = df_demo. Weight > 80
condition_2 = condition_2_1 & ( ~ condition_2_2) & condition_2_3
df_demo. loc[ condition_1 | condition_2]
School Grade Gender Weight Transfer Name Qiang Han Peking University Freshman Male 87.0 N Chengpeng Zhou Fudan University Senior Male 81.0 N Changpeng Zhao Peking University Freshman Male 83.0 N Chengpeng Qian Fudan University Senior Male 73.0 Y
def condition ( x) :
condition_1_1 = x. School == 'Fudan University'
condition_1_2 = x. Grade == 'Senior'
condition_1_3 = x. Weight > 70
condition_1 = condition_1_1 & condition_1_2 & condition_1_3
condition_2_1 = x. School == 'Peking University'
condition_2_2 = x. Grade == 'Senior'
condition_2_3 = x. Weight > 80
condition_2 = condition_2_1 & ( ~ condition_2_2) & condition_2_3
result = condition_1 | condition_2
return result
df_demo. loc[ condition]
School Grade Gender Weight Transfer Name Qiang Han Peking University Freshman Male 87.0 N Chengpeng Zhou Fudan University Senior Male 81.0 N Changpeng Zhao Peking University Freshman Male 83.0 N Chengpeng Qian Fudan University Senior Male 73.0 Y
df_demo. loc[ lambda x: 'Quan Zhao' , lambda x: 'Gender' ]
'Female'
df_demo. loc[ lambda x : slice ( 'Gaojuan You' , 'Gaoqiang Qian' ) ]
df_demo. loc[ 'Gaojuan You' : 'Gaoqiang Qian' ]
School Grade Gender Weight Transfer Name Gaojuan You Fudan University Sophomore Male 74.0 N Xiaoli Qian Tsinghua University Freshman Female 51.0 N Qiang Chu Shanghai Jiao Tong University Freshman Female 52.0 N Gaoqiang Qian Tsinghua University Junior Female 50.0 N
df_chain= pd. DataFrame( [ [ 0 , 0 ] , [ 1 , 0 ] , [ 1 , 0 ] ] , columns = list ( 'AB' ) )
df_chain
import warnings
with warnings. catch_warnings( ) :
warnings. filterwarnings( 'error' )
try :
df_chain[ df_chain. A!= 0 ] . B= 1
except Warning as w:
Warning_Msg = w
print ( Warning_Msg)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead
See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
df_chain
df_chain. loc[ df_chain. A!= 0 , 'B' ] = 1
df_chain
iloc索引器
iloc类似于loc, 不同之处在于iloc针对位置进行筛选,在相同的*
位置处也有五类合法对象有:整数,整数列表,整数切片,布尔列表及函数.范数的返回值必须是前面四类中的任意一类,输入为DataFrame
df_demo. iloc[ 1 , 1 ]
df_demo. iloc[ [ 0 , 1 ] , [ 0 , 1 ] ]
df_demo. iloc[ 1 : 4 , 2 : 4 ]
df_demo. iloc[ lambda x: slice ( 1 , 4 ) ]
df_demo. iloc[ ( df_demo. Weight > 80 ) . values] . head( 3 )
School Grade Gender Weight Transfer Name Mei Sun Shanghai Jiao Tong University Senior Male 89.0 N Qiang Zheng Shanghai Jiao Tong University Senior Male 87.0 N Qiang Han Peking University Freshman Male 87.0 N
df_demo. School. iloc[ 1 ]
df_demo. School. iloc[ 1 : 5 : 2 ]
Name
Changqiang You Peking University
Xiaojuan Sun Fudan University
Name: School, dtype: object
query方法
通过字符串形式的查询表达式传入query方式来查询数据,表达式的执行结果必须返回布尔列表, 进行复杂索引时无需重复使用DataFrame的名字来引用列明,一般而言可以降低代码长度.
df. query( '((School == "Fudan University") &(Grade == "Senior")&(Weight > 70))|((School == "Peking University")&(Grade != "Senior") &(Weight > 80))' )
df. query( '((School == "Fudan University") & '
'(Grade == "Senior")&'
'(Weight > 70))|'
'((School == "Peking University")&'
'(Grade != "Senior") &'
'(Weight > 80))' )
School Grade Name Gender Weight Transfer 38 Peking University Freshman Qiang Han Male 87.0 N 66 Fudan University Senior Chengpeng Zhou Male 81.0 N 99 Peking University Freshman Changpeng Zhao Male 83.0 N 131 Fudan University Senior Chengpeng Qian Male 73.0 Y
df_demo. query( 'Weight > Weight.mean()' ) . head( 3 )
School Grade Gender Weight Transfer Name Changqiang You Peking University Freshman Male 70.0 N Mei Sun Shanghai Jiao Tong University Senior Male 89.0 N Gaojuan You Fudan University Sophomore Male 74.0 N
df_demo_col = df_demo. copy( )
df_demo_col[ 'Student Weight' ] = df_demo_col[ 'Weight' ] . copy( )
df_demo_col. query( '`Student Weight` > `Student Weight`.mean()' ) . head( 3 )
School Grade Gender Weight Transfer Student Weight Name Changqiang You Peking University Freshman Male 70.0 N 70.0 Mei Sun Shanghai Jiao Tong University Senior Male 89.0 N 89.0 Gaojuan You Fudan University Sophomore Male 74.0 N 74.0
df. query( '(Grade not in ["Freshman", "Sophomore"]) and (Gender == "Male")' ) . head( 3 )
df. query( '(Grade != ["Frashmax", "Sophomore"]) and (Gender == "Male")' ) . head( 3 )
School Grade Name Gender Weight Transfer 1 Peking University Freshman Changqiang You Male 70.0 N 2 Shanghai Jiao Tong University Senior Mei Sun Male 89.0 N 10 Shanghai Jiao Tong University Freshman Xiaopeng Zhou Male 74.0 N
df. query( 'School.isin(["Tsinghua University", "Shanghai Jiao Tong University"])' , engine = 'python' ) . head( 3 )
School Grade Name Gender Weight Transfer 0 Shanghai Jiao Tong University Freshman Gaopeng Yang Female 46.0 N 2 Shanghai Jiao Tong University Senior Mei Sun Male 89.0 N 5 Tsinghua University Freshman Xiaoli Qian Female 51.0 N
low, high = 70 , 80
df. query( 'Weight.between(@low, @high)' ) . head( 3 )
School Grade Name Gender Weight Transfer 1 Peking University Freshman Changqiang You Male 70.0 N 4 Fudan University Sophomore Gaojuan You Male 74.0 N 10 Shanghai Jiao Tong University Freshman Xiaopeng Zhou Male 74.0 N
随机抽样
将DataFrame每行当作一个样本,或每列当作一个特征,再将DataFrame当作一个整体,可以通过sample来对样本或特征进行抽样,用途是用总体统计特征的无偏估计小样本代替整体提升工作效率. sample
函数主要参数n
,axis
,frac
,replace
,weight
前三个分别为抽样数量,抽样方向(0为行,1为列)和抽样比例(小于1),replace和weights分别指是否放回和每个样本的抽样相对概率,replace=True表示又放回抽样
df_sample = pd. DataFrame( { 'id' : list ( 'abcde' ) , 'value' : [ 1 , 2 , 3 , 4 , 90 ] } )
df_sample
df_sample. sample( 3 , replace = True , weights = df_sample. value)
多级索引
多级索引及表结构
np. random. seed( 0 )
multi_index = pd. MultiIndex. from_product( [ list ( 'ABCD' ) ,
df. Gender. unique( ) ] , names = ( 'School' , 'Gender' ) )
multi_column = pd. MultiIndex. from_product( [ [ 'Height' , 'Weight' ] ,
df. Grade. unique( ) ] , names = ( 'indicator' , 'Grade' ) )
df_multi = pd. DataFrame( np. c_[ ( np. random. randn( 8 , 4 ) * 5 + 163 ) . tolist( ) ,
( np. random. randn( 8 , 4 ) * 5 + 65 ) . tolist( ) ] ,
index = multi_index, columns = multi_column) . round ( 1 )
df_multi
indicator Height Weight Grade Freshman Senior Sophomore Junior Freshman Senior Sophomore Junior School Gender A Female 171.8 165.0 167.9 174.2 60.6 55.1 63.3 65.8 Male 172.3 158.1 167.8 162.2 71.2 71.0 63.1 63.5 B Female 162.5 165.1 163.7 170.3 59.8 57.9 56.5 74.8 Male 166.8 163.6 165.2 164.7 62.5 62.8 58.7 68.9 C Female 170.5 162.0 164.6 158.7 56.9 63.9 60.5 66.9 Male 150.2 166.3 167.3 159.3 62.4 59.1 64.9 67.1 D Female 174.3 155.7 163.2 162.1 65.3 66.5 61.8 63.2 Male 170.7 170.3 163.8 164.9 61.6 63.2 60.9 56.4
df_multi. index. names
FrozenList(['School', 'Gender'])
df_multi. columns. names
FrozenList(['indicator', 'Grade'])
df_multi. index. values
array([('A', 'Female'), ('A', 'Male'), ('B', 'Female'), ('B', 'Male'),
('C', 'Female'), ('C', 'Male'), ('D', 'Female'), ('D', 'Male')],
dtype=object)
df_multi. columns. values
array([('Height', 'Freshman'), ('Height', 'Senior'),
('Height', 'Sophomore'), ('Height', 'Junior'),
('Weight', 'Freshman'), ('Weight', 'Senior'),
('Weight', 'Sophomore'), ('Weight', 'Junior')], dtype=object)
df_multi. index. get_level_values( 0 )
Index(['A', 'A', 'B', 'B', 'C', 'C', 'D', 'D'], dtype='object', name='School')
多级索引器中的loc索引器
将原表中学校和年级设为索引,此时行为多级索引,列为单级索引,并且由于默认状态列索引不含名字,则Indicator
和Grade
索引位置为空
df_multi = df. set_index( [ 'School' , 'Grade' ] )
df_multi. head( 5 )
Name Gender Weight Transfer School Grade Shanghai Jiao Tong University Freshman Gaopeng Yang Female 46.0 N Peking University Freshman Changqiang You Male 70.0 N Shanghai Jiao Tong University Senior Mei Sun Male 89.0 N Fudan University Sophomore Xiaojuan Sun Female 41.0 N Sophomore Gaojuan You Male 74.0 N
with warnings. catch_warnings( ) :
warnings. filterwarnings( 'error' )
try :
df_multi. loc[ ( 'Fudan University' , 'Junior' ) ] . head( )
except Warning as w:
Warning_Msg = w
print ( Warning_Msg)
indexing past lexsort depth may impact performance.
df_sorted = df_multi. sort_index( )
df_sorted. loc[ ( 'Fudan University' , 'Junior' ) ] . head( )
Name Gender Weight Transfer School Grade Fudan University Junior Yanli You Female 48.0 N Junior Chunqiang Chu Male 72.0 N Junior Changfeng Lv Male 76.0 N Junior Yanjuan Lv Female 49.0 NaN Junior Gaoqiang Zhou Female 43.0 N
df_sorted. loc[ [ ( 'Fudan University' , 'Senior' ) ,
( 'Shanghai Jiao Tong University' , 'Freshman' ) ] ] . head( )
Name Gender Weight Transfer School Grade Fudan University Senior Chengpeng Zheng Female 38.0 N Senior Feng Zhou Female 47.0 N Senior Gaomei Lv Female 34.0 N Senior Chunli Lv Female 56.0 N Senior Chengpeng Zhou Male 81.0 N
df_sorted. loc[ df_sorted. Weight > 70 ] . head( )
Name Gender Weight Transfer School Grade Fudan University Freshman Feng Wang Male 74.0 N Junior Chunqiang Chu Male 72.0 N Junior Changfeng Lv Male 76.0 N Senior Chengpeng Zhou Male 81.0 N Senior Chengpeng Qian Male 73.0 Y
df_sorted. loc[ lambda x : ( 'Fudan University' , 'Junior' ) ] . head( )
Name Gender Weight Transfer School Grade Fudan University Junior Yanli You Female 48.0 N Junior Chunqiang Chu Male 72.0 N Junior Changfeng Lv Male 76.0 N Junior Yanjuan Lv Female 49.0 NaN Junior Gaoqiang Zhou Female 43.0 N
df_sorted. head( )
Name Gender Weight Transfer School Grade Fudan University Freshman Changqiang Yang Female 49.0 N Freshman Gaoqiang Qin Female 63.0 N Freshman Gaofeng Zhao Female 43.0 N Freshman Yanquan Wang Female 55.0 N Freshman Feng Wang Male 74.0 N
try :
df_multi. loc[ ( 'Fudan University' , 'Senior' ) : ] . head( )
except Exception as e:
Err_Msg = e
print ( Err_Msg)
'Key length (2) was greater than MultiIndex lexsort depth (0)'
df_sorted. loc[ ( 'Fudan University' , 'Senior' ) : ] . head( )
Name Gender Weight Transfer School Grade Fudan University Senior Chengpeng Zheng Female 38.0 N Senior Feng Zhou Female 47.0 N Senior Gaomei Lv Female 34.0 N Senior Chunli Lv Female 56.0 N Senior Chengpeng Zhou Male 81.0 N
df_unique = df. drop_duplicates( subset= [ 'School' , 'Grade' ] ) . set_index( [ 'School' , 'Grade' ] )
df_unique. head( )
Name Gender Weight Transfer School Grade Shanghai Jiao Tong University Freshman Gaopeng Yang Female 46.0 N Peking University Freshman Changqiang You Male 70.0 N Shanghai Jiao Tong University Senior Mei Sun Male 89.0 N Fudan University Sophomore Xiaojuan Sun Female 41.0 N Tsinghua University Freshman Xiaoli Qian Female 51.0 N
df_unique. sort_index( ) . loc[ ( 'Fudan University' , 'Senior' ) : ] . head( )
Name Gender Weight Transfer School Grade Fudan University Senior Chengpeng Zheng Female 38.0 N Sophomore Xiaojuan Sun Female 41.0 N Peking University Freshman Changqiang You Male 70.0 N Junior Juan Xu Female NaN N Senior Changli Lv Female 41.0 N
res = df_multi. loc[ ( [ 'Peking University' , 'Fudan Unversity' ] , [ 'Sophomort' , 'Junior' ] ) , : ]
res. head( )
Name Gender Weight Transfer School Grade Peking University Junior Juan Xu Female NaN N Junior Changjuan You Female 47.0 N Junior Gaoli Xu Female 48.0 N Junior Gaoquan Zhou Male 70.0 N Junior Qiang You Female 56.0 N
res. shape
(8, 4)
res = df_multi. loc[ [ ( 'Peking University' , 'Senior' ) , ( 'Fudan University' , 'Sophomore' ) ] ]
res. head( )
Name Gender Weight Transfer School Grade Peking University Senior Changli Lv Female 41.0 N Senior Feng Zheng Female 49.0 N Senior Feng Zhao Male 66.0 N Senior Changquan Han Male 77.0 N Senior Mei Feng Female 51.0 N
res. shape
(16, 4)
IndexSlice对象
前面方法不可以在索引不重复的时候对每层进行切片,也不允许将切片和布尔列表进行混合使用,只能对元组整体进行切片,可以通过引入IndexSlice对象来解决这个问题.Slice对象有loc[idx[, ]]和loc[idx[, ],idx[, ]]两种
pd. MultiIndex. from_product?
np. random. seed( 0 )
L1, L2 = [ 'A' , 'B' , 'C' ] , [ 'a' , 'b' , 'c' ]
mul_index1 = pd. MultiIndex. from_product( [ L1, L2] , names = ( 'Upper' , 'lower' ) )
L3, L4 = [ 'D' , 'E' , 'F' ] , [ 'd' , 'e' , 'f' ]
mul_index2 = pd. MultiIndex. from_product( [ L3, L4] , names= ( 'Big' , 'Small' ) )
df_ex = pd. DataFrame( np. random. randint( - 9 , 10 , ( 9 , 9 ) ) ,
index = mul_index1,
columns = mul_index2)
df_ex
Big D E F Small d e f d e f d e f Upper lower A a 3 6 -9 -6 -6 -2 0 9 -5 b -3 3 -8 -3 -2 5 8 -4 4 c -1 0 7 -4 6 6 -9 9 -6 B a 8 5 -2 -9 -8 0 -9 1 -6 b 2 9 -7 -9 -9 -5 -4 -3 -1 c 8 6 -5 0 1 -8 -8 -2 0 C a -6 -3 2 5 9 -9 5 -6 3 b 1 2 -5 -3 -5 6 -6 3 -5 c -1 5 6 -6 6 4 7 8 -4
idx = pd. IndexSlice
df_ex. loc[ idx[ 'C' : , ( 'D' , 'E' ) : ] ]
df_ex. loc[ idx[ : 'A' , lambda x: x. sum ( ) > 0 ] ]
df_ex. loc[ idx[ : 'A' , 'b' : ] , idx[ 'E' : , 'e' : ] ]
Big E F Small e f e f Upper lower A b -2 5 -4 4 c 6 6 9 -6
多级索引的构造
可通过from_tuples,from_arrays, from_product三种方法构造多级索引,以上三种方法均为pd.MultiIndex对象下的函数
df_ex. describe( )
Big D E F Small d e f d e f d e f count 9.000000 9.000000 9.000000 9.000000 9.000000 9.000000 9.000000 9.000000 9.000000 mean 1.222222 3.666667 -2.333333 -3.888889 -0.888889 -0.333333 -1.777778 1.666667 -2.222222 std 4.684490 3.605551 6.000000 4.428443 6.679155 5.979130 6.960204 5.873670 3.865805 min -6.000000 -3.000000 -9.000000 -9.000000 -9.000000 -9.000000 -9.000000 -6.000000 -6.000000 25% -1.000000 2.000000 -7.000000 -6.000000 -6.000000 -5.000000 -8.000000 -3.000000 -5.000000 50% 1.000000 5.000000 -5.000000 -4.000000 -2.000000 0.000000 -4.000000 1.000000 -4.000000 75% 3.000000 6.000000 2.000000 -3.000000 6.000000 5.000000 5.000000 8.000000 0.000000 max 8.000000 9.000000 7.000000 5.000000 9.000000 6.000000 8.000000 9.000000 4.000000
my_tuple= [ ( 'a' , 'cat' ) , ( 'a' , 'dog' ) , ( 'b' , 'cat' ) , ( 'b' , 'dog' ) ]
pd. MultiIndex. from_tuples( my_tuple, names= [ 'First' , 'Second' ] )
MultiIndex([('a', 'cat'),
('a', 'dog'),
('b', 'cat'),
('b', 'dog')],
names=['First', 'Second'])
my_array = [ list ( 'aabb' ) , [ 'cat' , 'dog' ] * 2 ]
pd. MultiIndex. from_arrays( my_array, names = [ 'First' , 'Second' ] )
MultiIndex([('a', 'cat'),
('a', 'dog'),
('b', 'cat'),
('b', 'dog')],
names=['First', 'Second'])
my_list1 = [ 'a' , 'b' ]
my_list2 = [ 'cat' , 'dog' ]
pd. MultiIndex. from_product( [ my_list1, my_list2] , names = [ 'First' , 'Second' ] )
MultiIndex([('a', 'cat'),
('a', 'dog'),
('b', 'cat'),
('b', 'dog')],
names=['First', 'Second'])
索引的常用方法
索引层的交换和删除
np. random. seed( 0 )
L1, L2, L3 = [ 'A' , 'B' ] , [ 'a' , 'b' ] , [ 'alpha' , 'beta' ]
mul_index1 = pd. MultiIndex. from_product( [ L1, L2, L3] ,
names= ( 'Upper' , 'Lower' , 'Extra' ) )
L4, L5, L6 = [ 'C' , 'D' ] , [ 'c' , 'd' ] , [ 'cat' , 'dog' ]
mul_index2 = pd. MultiIndex. from_product( [ L4, L5, L6] ,
names= ( 'Big' , 'Small' , 'Other' ) )
df_ex = pd. DataFrame( np. random. randint( - 9 , 10 , ( 8 , 8 ) ) ,
index= mul_index1,
columns= mul_index2)
df_ex
Big C D Small c d c d Other cat dog cat dog cat dog cat dog Upper Lower Extra A a alpha 3 6 -9 -6 -6 -2 0 9 beta -5 -3 3 -8 -3 -2 5 8 b alpha -4 4 -1 0 7 -4 6 6 beta -9 9 -6 8 5 -2 -9 -8 B a alpha 0 -9 1 -6 2 9 -7 -9 beta -9 -5 -4 -3 -1 8 6 -5 b alpha 0 1 -8 -8 -2 0 -6 -3 beta 2 5 9 -9 5 -6 3 1
df_ex. swaplevel( 0 , 2 , axis = 1 ) . head( 3 )
Other cat dog cat dog cat dog cat dog Small c c d d c c d d Big C C C C D D D D Upper Lower Extra A a alpha 3 6 -9 -6 -6 -2 0 9 beta -5 -3 3 -8 -3 -2 5 8 b alpha -4 4 -1 0 7 -4 6 6
df_ex. reorder_levels( [ 2 , 0 , 1 ] , axis = 0 ) . head( 3 )
Big C D Small c d c d Other cat dog cat dog cat dog cat dog Extra Upper Lower alpha A a 3 6 -9 -6 -6 -2 0 9 beta A a -5 -3 3 -8 -3 -2 5 8 alpha A b -4 4 -1 0 7 -4 6 6
df_ex. droplevel( 1 , axis = 1 ) . head( 3 )
Big C D Other cat dog cat dog cat dog cat dog Upper Lower Extra A a alpha 3 6 -9 -6 -6 -2 0 9 beta -5 -3 3 -8 -3 -2 5 8 b alpha -4 4 -1 0 7 -4 6 6
df_ex. droplevel( [ 0 , 1 ] , axis = 0 ) . head( 3 )
Big C D Small c d c d Other cat dog cat dog cat dog cat dog Extra alpha 3 6 -9 -6 -6 -2 0 9 beta -5 -3 3 -8 -3 -2 5 8 alpha -4 4 -1 0 7 -4 6 6
索引属性的修改
通过rename_axis对索引层
的名字进行修改, 常用修改方式有传入字典的映射
df_ex. rename_axis( index = { 'Upper' : 'Changed_row' } ,
columns = { 'Other' : 'Changed_col' } ) . head( )
Big C D Small c d c d Changed_col cat dog cat dog cat dog cat dog Changed_row Lower Extra A a alpha 3 6 -9 -6 -6 -2 0 9 beta -5 -3 3 -8 -3 -2 5 8 b alpha -4 4 -1 0 7 -4 6 6 beta -9 9 -6 8 5 -2 -9 -8 B a alpha 0 -9 1 -6 2 9 -7 -9
df_ex. rename( columns = { 'cat' : 'not_cat' } ,
level = 2 ) . head( )
Big C D Small c d c d Other not_cat dog not_cat dog not_cat dog not_cat dog Upper Lower Extra A a alpha 3 6 -9 -6 -6 -2 0 9 beta -5 -3 3 -8 -3 -2 5 8 b alpha -4 4 -1 0 7 -4 6 6 beta -9 9 -6 8 5 -2 -9 -8 B a alpha 0 -9 1 -6 2 9 -7 -9
df_ex. rename( index = lambda x: str . upper( x) ,
level = 2 ) . head( )
Big C D Small c d c d Other cat dog cat dog cat dog cat dog Upper Lower Extra A a ALPHA 3 6 -9 -6 -6 -2 0 9 BETA -5 -3 3 -8 -3 -2 5 8 b ALPHA -4 4 -1 0 7 -4 6 6 BETA -9 9 -6 8 5 -2 -9 -8 B a ALPHA 0 -9 1 -6 2 9 -7 -9
new_values = iter ( list ( 'abcdefgh' ) )
df_ex. rename( index = lambda x: next ( new_values) , level = 2 )
Big C D Small c d c d Other cat dog cat dog cat dog cat dog Upper Lower Extra A a a 3 6 -9 -6 -6 -2 0 9 b -5 -3 3 -8 -3 -2 5 8 b c -4 4 -1 0 7 -4 6 6 d -9 9 -6 8 5 -2 -9 -8 B a e 0 -9 1 -6 2 9 -7 -9 f -9 -5 -4 -3 -1 8 6 -5 b g 0 1 -8 -8 -2 0 -6 -3 h 2 5 9 -9 5 -6 3 1
df_temp = df_ex. copy( )
new_idx = df_temp. index. map ( lambda x : ( str . upper( x[ 0 ] ) , x[ 1 ] , str . upper( x[ 2 ] ) ) )
df_temp. index = new_idx
df_temp. head( )
Big C D Small c d c d Other cat dog cat dog cat dog cat dog Upper Lower Extra A a ALPHA 3 6 -9 -6 -6 -2 0 9 BETA -5 -3 3 -8 -3 -2 5 8 b ALPHA -4 4 -1 0 7 -4 6 6 BETA -9 9 -6 8 5 -2 -9 -8 B a ALPHA 0 -9 1 -6 2 9 -7 -9
df_temp = df_ex. copy( )
new_idx = df_temp. index. map ( lambda x : ( x[ 0 ] + '-' + x[ 1 ] + '-' + x[ 2 ] ) )
df_temp. index = new_idx
df_temp. head( 3 )
Big C D Small c d c d Other cat dog cat dog cat dog cat dog A-a-alpha 3 6 -9 -6 -6 -2 0 9 A-a-beta -5 -3 3 -8 -3 -2 5 8 A-b-alpha -4 4 -1 0 7 -4 6 6
new_idx = df_temp. index. map ( lambda x: tuple ( x. split( '-' ) ) )
df_temp. index = new_idx
df_temp. head( 3 )
Big C D Small c d c d Other cat dog cat dog cat dog cat dog A a alpha 3 6 -9 -6 -6 -2 0 9 beta -5 -3 3 -8 -3 -2 5 8 b alpha -4 4 -1 0 7 -4 6 6
索引的设置与重置
df_new = pd. DataFrame( { 'A' : list ( 'aacd' ) ,
'B' : list ( 'PQRT' ) ,
'C' : [ 1 , 2 , 3 , 4 ] } )
df_new
df_new. set_index( 'A' )
df_new. set_index( 'A' , append = True )
df_new. set_index( [ 'A' , 'B' ] )
my_index = pd. Series( list ( 'WXYZ' ) , name = 'D' )
df_new = df_new. set_index( [ 'A' , my_index] )
df_new
df_new. reset_index( [ 'D' ] )
索引的变形
对索引做一些扩充或剔除,也或者给定一个新的索引,将原表中相应的索引对应元素填充到新索引构成的表中
df_reindex = pd. DataFrame( { "Weight" : [ 60 , 70 , 80 ] ,
"Height" : [ 176 , 180 , 179 ] } ,
index= [ '1001' , '1003' , '1002' ] )
df_reindex
Weight Height 1001 60 176 1003 70 180 1002 80 179
df_reindex. reindex( index = [ '1001' , '1002' , '1003' , '1004' ] ,
columns = [ 'Weight' , 'Gender' ] )
Weight Gender 1001 60.0 NaN 1002 80.0 NaN 1003 70.0 NaN 1004 NaN NaN
df_existed = pd. DataFrame( index = [ '1001' , '1002' , '1003' , '1004' ] ,
columns = [ 'Weight' , 'Gender' ] )
df_existed
Weight Gender 1001 NaN NaN 1002 NaN NaN 1003 NaN NaN 1004 NaN NaN
索引运算
集合运算法则
利用集合算取出符合条件行的需求,如取出两表中index索引交集的数据,此时通过Index上的运算操作较为容易实现, 有intersection
(交集),union
(并集),difference
(差集),symmetric_difference
(并集减去交集)
一般索引运算
集合的元素是互异的,但是索引中可能有相同的元素,应该首先使用unique
方法去重后再进行运算
df_set_1 = pd. DataFrame( [ [ 0 , 1 ] , [ 1 , 2 ] , [ 3 , 4 ] ] ,
index= pd. Index( [ 'a' , 'b' , 'a' ] , name= 'id1' ) )
df_set_2 = pd. DataFrame( [ [ 4 , 5 ] , [ 2 , 6 ] , [ 7 , 1 ] ] ,
index= pd. Index( [ 'b' , 'b' , 'c' ] , name= 'id2' ) )
id1, id2 = df_set_1. index. unique( ) , df_set_2. index. unique( )
print ( id1. intersection( id2) )
print ( id1. union( id2) )
print ( id1. difference( id2) )
print ( id1. symmetric_difference( id2) )
print ( '--' * 30 )
print ( id1 & id2)
print ( id1 | id2)
print ( ( id1 ^ id2) & id1)
print ( id1 ^ id2)
Index(['b'], dtype='object')
Index(['a', 'b', 'c'], dtype='object')
Index(['a'], dtype='object')
Index(['a', 'c'], dtype='object')
------------------------------------------------------------
Index(['b'], dtype='object')
Index(['a', 'b', 'c'], dtype='object')
Index(['a'], dtype='object')
Index(['a', 'c'], dtype='object')
df_set_in_col_1 = df_set_1. reset_index( )
df_set_in_col_2 = df_set_2. reset_index( )
df_set_in_col_1
df_set_in_col_2
df_set_in_col_1[ df_set_in_col_1. id1. isin( df_set_in_col_2. id2) ]
练习
number = [ float , int ]
df_number = df_demo. loc[ : ,
df_demo. dtypes. apply ( lambda x : x in ( number) ) ]
df_number
Weight Name Gaopeng Yang 46.0 Changqiang You 70.0 Mei Sun 89.0 Xiaojuan Sun 41.0 Gaojuan You 74.0 ... ... Xiaojuan Sun 46.0 Li Zhao 50.0 Chengqiang Chu 45.0 Chengmei Shen 71.0 Chunpeng Lv 51.0
200 rows × 1 columns
df_ex. rename_axis( index = lambda x : str . upper( x) )
Big C D Small c d c d Other cat dog cat dog cat dog cat dog UPPER LOWER EXTRA A a alpha 3 6 -9 -6 -6 -2 0 9 beta -5 -3 3 -8 -3 -2 5 8 b alpha -4 4 -1 0 7 -4 6 6 beta -9 9 -6 8 5 -2 -9 -8 B a alpha 0 -9 1 -6 2 9 -7 -9 beta -9 -5 -4 -3 -1 8 6 -5 b alpha 0 1 -8 -8 -2 0 -6 -3 beta 2 5 9 -9 5 -6 3 1
df = pd. read_csv( path + '\company.csv' )
df. head( 3 )
EmployeeID birthdate_key age city_name department job_title gender 0 1318 1/3/1954 61 Vancouver Executive CEO M 1 1319 1/3/1957 58 Vancouver Executive VP Stores F 2 1320 1/2/1955 60 Vancouver Executive Legal Counsel F
df_query = df. query( '(age <= 40) & (department in ["Dairy", "Bakery"]) & (gender=="M")' )
dpt = [ 'Dairy' , 'Bakery' ]
df_query_true = df. query( '(age <= 40) & (department == &dpt) & (gender == "M")' )
df_loc = df. loc[ ( df. age <= 40 ) & ( df. department. isin( [ "Dairy" , 'Bakery' ] ) ) ]
dpt = [ 'Dairy' , 'Bakery' ]
df_loc_true = df. loc[ ( df. age <= 40 ) & ( df. department. isin( dpt) ) & ( df. gender== 'M' ) ] . head( 3 )
df. iloc[ ( df. EmployeeID% 2 == 1 ) . values, [ 0 , 2 , - 2 ] ] . head( 3 )
df_test = df. copy( )
df_test = df_test. set_index( list ( df. columns[ - 3 : ] ) ) . reorder_levels( [ 2 , 1 , 0 ] ,
axis= 0 )
df_test. reset_index( level = 1 )
df_test. rename_axis( index = { 'gender' : 'Gender' } )
df_op = df. copy( )
df_op = df_op. set_index( df_op. columns[ - 3 : ] . tolist( ) ) . swaplevel( 0 , 2 , axis = 0 )
df_op = df_op. reset_index( level = 1 )
df_op = df_op. rename( index = { 'gender' : 'Gender' } )
df_op. index = df_op. index. map ( lambda x : '_' . join( x) )
df_op. index = df_op. index. map ( lambda x : tuple ( x. split( '_' ) ) )
df_op = df_op. rename_axis( index = [ 'gender' , 'department' ] )
df_op = df_op. reset_index( ) . reindex( df. columns, axis = 1 )
df = pd. read_csv( path + '\\chocolate.csv' )
df. head( 3 )
Company Review\nDate Cocoa\nPercent Company\nLocation Rating 0 A. Morin 2016 63% France 3.75 1 A. Morin 2015 70% France 2.75 2 A. Morin 2015 70% France 3.00
df_op1 = df. copy( )
df_op1. columns = df_op1. columns. map ( lambda x : x. replace( '\n' , ' ' ) )
df_op2 = df. copy( )
df_op2. columns = [ ' ' . join( i. split( '\n' ) ) for i in df_op2. columns]
df_op = df_op1. copy( )
df_op[ 'Cocoa Percent' ] = df_op[ 'Cocoa Percent' ] . apply ( lambda x : float ( x[ : - 1 ] ) / 100 )
df_op. query( '(Rating <= 2.75) & (`Cocoa Percent` > `Cocoa Percent`.median())' ) . head( 3 )
idx = pd. IndexSlice
df_op = df_op. set_index( [ 'Review Date' , 'Company Location' ] ) . sort_index( level = 0 )
exclude = [ 'France' , 'Canada' , 'Amsterdam' , 'Belgium' ]
df_op. loc[ idx[ 2012 : , ~ df_op. index. get_level_values( 1 ) . isin( exclude) ] , : ] . head( 3 )
Company Cocoa Percent Rating Review Date Company Location 2012 Australia Bahen & Co. 0.7 3.0 Australia Bahen & Co. 0.7 2.5 Australia Bahen & Co. 0.7 2.5