import numpy as np
import pandas as pd
Ex1:公司员工数据集
1. 分别只使用query
和loc
选出年龄不超过四十岁且工作部门为Dairy
或Bakery
的男性。
df. query( '(age<=40)&((department == "Dairy")|(department == "Bakery"))' )
df. loc[ ( df. age<= 40 ) & ( ( df. department == 'Dairy' ) | ( df. department == 'Bakery' ) ) ]
df. loc[ df. age<= 40 & df. department. isin( [ 'Dairy' , 'Bakery' ] ) ]
EmployeeID birthdate_key age city_name department job_title gender
dpt = [ 'Dairy' , 'Bakery' ]
df. query( "(age <= 40)&(department == @dpt)&(gender=='M')" ) . head( 3 )
df. loc[ ( df. age<= 40 ) & df. department. isin( dpt) & ( df. gender== 'M' ) ] . head( 3 )
EmployeeID birthdate_key age city_name department job_title gender
== 注意@的使用 ==
对于query中的字符串,如果要引用外部变量,只需在变量名前加@符号。
2. 选出员工ID
号 为奇数所在行的第1、第3和倒数第2列。
df. iloc[ ( df. EmployeeID% 2 == 1 ) . values, [ 1 , 3 , - 2 ] ]
birthdate_key city_name job_title 1 1/3/1957 Vancouver VP Stores
3. 按照以下步骤进行索引操作:
* 把后三列设为索引后交换内外两层
df. set_index( [ 'department' , 'job_title' , 'gender' ] )
df. swaplevel( 0 , 2 , axis= 0 )
---------------------------------------------------------------------------
TypeError: Can only swap levels on a hierarchical axis.
== 注意 这里报错的原因是因为设定index后没有保存数据 == 正确的:
dfsw = df. set_index( [ 'department' , 'job_title' , 'gender' ] )
dfsw. swaplevel( 0 , 2 , axis= 0 )
EmployeeID birthdate_key age city_name gender job_title department M CEO Executive 1318 1/3/1954 61 Vancouver F VP Stores Executive 1319 1/3/1957 58 Vancouver Legal Counsel Executive 1320 1/2/1955 60 Vancouver
df_op = df. copy( )
df_op = df_op. set_index( df_op. columns[ - 3 : ] . tolist( ) ) . swaplevel( 0 , 2 , axis= 0 )
df_op
EmployeeID birthdate_key age city_name gender job_title department M CEO Executive 1318 1/3/1954 61 Vancouver F VP Stores Executive 1319 1/3/1957 58 Vancouver Legal Counsel Executive 1320 1/2/1955 60 Vancouver
* 恢复中间层索引
df_op. reset_index( )
gender job_title department EmployeeID birthdate_key age city_name 0 M CEO Executive 1318 1/3/1954 61 Vancouver 1 F VP Stores Executive 1319 1/3/1957 58 Vancouver 2 F Legal Counsel Executive 1320 1/2/1955 60 Vancouver
df_op = df_op. reset_index( level= 2 )
* 修改外层索引名为Gender
df_op = df_op. rename_axis( index= { 'gender' : 'Gender' } )
df_op
department EmployeeID birthdate_key age city_name Gender job_title M CEO Executive 1318 1/3/1954 61 Vancouver F VP Stores Executive 1319 1/3/1957 58 Vancouver Legal Counsel Executive 1320 1/2/1955 60 Vancouver
* 用下划线合并两层行索引
df_temp = df. set_index( df. columns[ - 3 : ] . tolist( ) ) . swaplevel( 0 , 2 , axis= 0 )
df_temp
df_index = df_temp. index. map ( lambda x: ( x[ 0 ] + '_' + x[ 1 ] + '_' + x[ 2 ] ) )
df_temp. index = df_index
df_temp
EmployeeID birthdate_key age city_name M_CEO_Executive 1318 1/3/1954 61 Vancouver F_VP Stores_Executive 1319 1/3/1957 58 Vancouver F_Legal Counsel_Executive 1320 1/2/1955 60 Vancouver
df_op. index = df_op. index. map ( lambda x: '_' . join( x) )
df_op
department EmployeeID birthdate_key age city_name M_CEO Executive 1318 1/3/1954 61 Vancouver F_VP Stores Executive 1319 1/3/1957 58 Vancouver F_Legal Counsel Executive 1320 1/2/1955 60 Vancouver
== 注意 str.join()
的用法
* 把行索引拆分为原状态
df_temp = df_op. copy( )
df_temp. reset_index( )
index department EmployeeID birthdate_key age city_name 0 M_CEO Executive 1318 1/3/1954 61 Vancouver 1 F_VP Stores Executive 1319 1/3/1957 58 Vancouver 2 F_Legal Counsel Executive 1320 1/2/1955 60 Vancouver
df_temp = df_op. copy( )
df_temp. index = df_temp. index. map ( lambda x: tuple ( x. split( '_' ) ) )
df_op = df_temp
== 注意 == str.split()
的用法
* 修改索引名为原表名称
df_temp = df_op. copy( )
df_temp = df_temp. rename_axis( index = [ 'Gender' , 'job_title' ] )
df_op = df_op. rename_axis( index= [ 'gender' , 'job_title' ] )
df_op. shape
df_temp = df_op. copy( )
df_temp
department EmployeeID birthdate_key age city_name gender job_title M CEO Executive 1318 1/3/1954 61 Vancouver F VP Stores Executive 1319 1/3/1957 58 Vancouver Legal Counsel Executive 1320 1/2/1955 60 Vancouver
* 恢复默认索引并将列保持为原表的相对位置
df_existed = pd. DataFrame( index = np. arange( df_op. shape[ 0 ] ) , columns= [ 'EmployeeID' , 'birthdate_key' , 'age' , 'city_name' , 'department' , 'job_title' , 'gender' ] )
df_temp. reindex_like ( df_existed)
---------------------------------------------------------------------------
ValueError Traceback (most recent call last)
ValueError: Buffer dtype mismatch, expected 'Python object' but got 'long'
== 注意 这里报错的原因 ==
使用 reindex_like
必须保证index
的列不能改变,但是可以改变列的行数。
函数名 作用 set_index 设置某一列为index reset_index 与set_index
作用相反 reindex 增加index
的行数,去除某列 reindex_like 与reindex
作用相同
df_existed = pd. DataFrame( index = np. arange( df_op. shape[ 0 ] ) , columns= [ 'EmployeeID' , 'birthdate_key' , 'age' , 'city_name' , 'department' , 'job_title' , 'gender' ] )
df_temp. reset_index( ) . reindex_like ( df_existed)
EmployeeID birthdate_key age city_name department job_title gender 0 1318 1/3/1954 61 Vancouver Executive CEO M 1 1319 1/3/1957 58 Vancouver Executive VP Stores F 2 1320 1/2/1955 60 Vancouver Executive Legal Counsel F
df. columns
df_op = df_op. reset_index( ) . reindex( df. columns, axis= 1 )
df_op
df_op. equals( df)
True
注意 .equals 的用法
Ex2:巧克力数据集
现有一份关于巧克力评价的数据集:
data = [ [ 'A. Morin' , 2016 , '63%' , 'France' , 3.75 ] , [ 'A. Morin' , 2015 , '70%' , 'France' , 2.75 ] , [ 'A. Morin' , 2015 , '70%' , 'France' , 3.00 ] ]
df = pd. DataFrame( data = data, index = np. arange( 3 ) , columns= [ 'Company' , 'Review Date' , 'Cocoa Percent' , 'Company Location' , 'Rating' ] )
df
Company Review Date Cocoa Percent Company Location Rating 0 A. Morin 2016 63% France 3.75 1 A. Morin 2015 70% France 2.75 2 A. Morin 2015 70% France 3.00
1. 把列索引名中的\n
替换为空格。
df. columns
new_columns = [ ' ' . join( i. split( '\n' ) ) for i in df. columns]
df. columns = new_columns
2. 巧克力Rating
评分为1至5,每0.25分一档,请选出2.75分及以下且可可含量Cocoa Percent
高于中位数的样本。
df[ 'Cocoa Percent' ] = df[ 'Cocoa Percent' ] . apply ( lambda x: float ( x[ : , - 1 ] ) / 100 )
---------------------------------------------------------------------------
TypeError Traceback (most recent call last)
TypeError: string indices must be integers
== 注意 错误的原因 == float(x[:,-1]
这里错了 ,应该是 float(x[:-1]
或者`float(x[0:-1]’
df[ 'Cocoa Percent' ] = df[ 'Cocoa Percent' ] . apply ( lambda x: float ( x[ 0 : - 1 ] ) / 100 )
df. query( ' Rating<= 2.75 & Cocoa Percent >= Cocoa Percent.median' )
SyntaxError: Python keyword not valid identifier in numexpr query
== 注意 == 再query中对于有 空格 的属性,不能直接用,要加上`Cocoa Percent`
来使用
不是单引号 !!!
df. query( '( Rating<= 2.75 )& (`Cocoa Percent`>=`Cocoa Percent`.median())' )
Company Review Date Cocoa Percent Company Location Rating 1 A. Morin 2015 0.7 France 2.75
3. 将Review Date
和Company Location
设为索引后,选出Review Date
在2012年之后且Company Location
不属于France, Canada, Amsterdam, Belgium
的样本。
df. set_index( [ 'Review Date' , 'Company Location' ] )
Company Cocoa Percent Rating Review Date Company Location 2016 France A. Morin 0.63 3.75 2015 France A. Morin 0.70 2.75 France A. Morin 0.70 3.00
Location = [ 'France' , 'Canada' , 'Amsterdam' , 'Belgium' ]
df. query( '(`Review Date`>2012) & (!`Company Location`==@Location)' )
SyntaxError: invalid syntax
idx = pd. IndexSlice
exclude = [ 'France' , 'Canada' , 'Amsterdam' , 'Belgium' ]
res = df. set_index( [ 'Review Date' , 'Company Location' ] ) . sort_index( level= 0 )
res. loc[ idx[ 2012 : , ~ res. index. get_level_values( 1 ) . isin( exclude) ] , : ] . head( 3 )
== 注意 == 索引是不能用 query的方法查找的。 get_level_values
sort_index
了解这些方法