pandas_day03

import numpy as np
import pandas as pd
df = pd.read_csv('learn_pandas.csv',usecols = ['School','Grade', 'Name','Gender','Weight','Transfer'])
# 把Name列设为索引
df_demo = df.set_index('Name')
df_demo
SchoolGradeGenderWeightTransfer
Name
Gaopeng YangShanghai Jiao Tong UniversityFreshmanFemale46.0N
Changqiang YouPeking UniversityFreshmanMale70.0N
Mei SunShanghai Jiao Tong UniversitySeniorMale89.0N
Xiaojuan SunFudan UniversitySophomoreFemale41.0N
Gaojuan YouFudan UniversitySophomoreMale74.0N
..................
Xiaojuan SunFudan UniversityJuniorFemale46.0N
Li ZhaoTsinghua UniversitySeniorFemale50.0N
Chengqiang ChuShanghai Jiao Tong UniversitySeniorFemale45.0N
Chengmei ShenShanghai Jiao Tong UniversitySeniorMale71.0N
Chunpeng LvTsinghua UniversitySophomoreMale51.0N

200 rows × 5 columns

df_loc_slice_demo = df_demo.copy()
df_loc_slice_demo.index = range(df_demo.shape[0],0,-1)
# 倒叙排列
df_loc_slice_demo.index
RangeIndex(start=200, stop=0, step=-1)
df_loc_slice_demo.loc[5:3]
SchoolGradeGenderWeightTransfer
5Fudan UniversityJuniorFemale46.0N
4Tsinghua UniversitySeniorFemale50.0N
3Shanghai Jiao Tong UniversitySeniorFemale45.0N
np.random.seed(0)
multi_index = pd.MultiIndex.from_product([list('ABCD'),df.Gender.unique()],names=('School', 'Gender'))
multi_index
MultiIndex([('A', 'Female'),
            ('A',   'Male'),
            ('B', 'Female'),
            ('B',   'Male'),
            ('C', 'Female'),
            ('C',   'Male'),
            ('D', 'Female'),
            ('D',   'Male')],
           names=['School', 'Gender'])
multi_column=pd.MultiIndex.from_product([['Height','Weight'],df.Grade.unique()],names=('Indicator','Grade'))
multi_column
MultiIndex([('Height',  'Freshman'),
            ('Height',    'Senior'),
            ('Height', 'Sophomore'),
            ('Height',    'Junior'),
            ('Weight',  'Freshman'),
            ('Weight',    'Senior'),
            ('Weight', 'Sophomore'),
            ('Weight',    'Junior')],
           names=['Indicator', 'Grade'])
df_multi = pd.DataFrame(np.c_[(np.random.rand(8,4)*5 + 163).tolist(), (np.random.randn(8,4)*5 + 65).tolist()],
                       index = multi_index,
                       columns = multi_column).round(1)
df_multi
IndicatorHeightWeight
GradeFreshmanSeniorSophomoreJuniorFreshmanSeniorSophomoreJunior
SchoolGender
AFemale165.7166.6166.0165.776.357.765.264.1
Male165.1166.2165.2167.572.772.365.866.9
BFemale167.8164.9167.0165.660.655.163.365.8
Male165.8167.6163.4163.471.271.063.163.5
CFemale163.1167.2166.9167.459.857.956.574.8
Male167.9167.0165.3166.962.562.858.768.9
DFemale163.6166.2163.7167.756.963.960.566.9
Male165.6165.1164.3166.962.459.164.967.1

3.2 索引的常用方法
3.3.1 索引层的交换和删除

import numpy as np
import pandas as pd
np.random.seed(0)
L1,L2,L3 = ['A','B'],['a','b'],['alpha','beta']
# 相当于做笛卡尔积
mul_index1 = pd.MultiIndex.from_product([L1,L2,L3], names=('Upper', 'Lower','Extra'))
mul_index1
MultiIndex([('A', 'a', 'alpha'),
            ('A', 'a',  'beta'),
            ('A', 'b', 'alpha'),
            ('A', 'b',  'beta'),
            ('B', 'a', 'alpha'),
            ('B', 'a',  'beta'),
            ('B', 'b', 'alpha'),
            ('B', 'b',  'beta')],
           names=['Upper', 'Lower', 'Extra'])
L4,L5,L6 = ['C','D'],['c','d'],['cat','dog']
mul_index2 = pd.MultiIndex.from_product([L4,L5,L6], names=('Big', 'Small', 'Other')) 
mul_index2
MultiIndex([('C', 'c', 'cat'),
            ('C', 'c', 'dog'),
            ('C', 'd', 'cat'),
            ('C', 'd', 'dog'),
            ('D', 'c', 'cat'),
            ('D', 'c', 'dog'),
            ('D', 'd', 'cat'),
            ('D', 'd', 'dog')],
           names=['Big', 'Small', 'Other'])
df_ex = pd.DataFrame(np.random.randint(-9,10,(8,8)),index=mul_index1,columns=mul_index2) 
df_ex
BigCD
Smallcdcd
Othercatdogcatdogcatdogcatdog
UpperLowerExtra
Aaalpha36-9-6-6-209
beta-5-33-8-3-258
balpha-44-107-466
beta-99-685-2-9-8
Baalpha0-91-629-7-9
beta-9-5-4-3-186-5
balpha01-8-8-20-6-3
beta259-95-631

3.4 索引运算
由于集合的元素是互异的,但是索引中可能有相同的元素,先用 unique 去重后再进行运算

df_set_1 = pd.DataFrame([[0,1],[1,2],[3,4]], index = pd.Index(['a','b','a'],name='id1'))
df_set_1
01
id1
a01
b12
a34
df_set_2 = pd.DataFrame([[4,5],[2,6],[7,1]],index = pd.Index(['b','b','c'],name='id2')) 
df_set_2
01
id2
b45
b26
c71
# 去重
id1, id2 = df_set_1.index.unique(), df_set_2.index.unique()
id1, id2
(Index(['a', 'b'], dtype='object', name='id1'),
 Index(['b', 'c'], dtype='object', name='id2'))
id1.intersection(id2) # id1 & id2 
Index(['b'], dtype='object')
id1.union(id2)  # id1 | id2 
Index(['a', 'b', 'c'], dtype='object')
id1.difference(id2) # (id1 ^ id2) & id1
Index(['a'], dtype='object')
id1.symmetric_difference(id2) #  id1 ^ id2 
Index(['a', 'c'], dtype='object')

若两张表需要做集合运算的列并没有被设置索引,一种办法是先转成索引,运算后再恢复,另一种方法是利用isin函数,例如在重置索引的第一张表中选出 id 列交集的所在行:

df_set_in_col_1 = df_set_1.reset_index()
df_set_in_col_1
id101
0a01
1b12
2a34
df_set_in_col_2 = df_set_2.reset_index()
df_set_in_col_2
id201
0b45
1b26
2c71
df_set_in_col_1[df_set_in_col_1.id1.isin(df_set_in_col_2.id2)] 
id101
1b12

3.5.1 公司员工数据集

  1. 分别只使用 query 和 loc 选出年龄不超过四十岁且工作部门为 Dairy 或 Bakery 的男性。
  2. 选出员工 ID 号为奇数所在行的第 1、第 3 和倒数第 2 列。
  3. 按照以下步骤进行索引操作:
    • 把后三列设为索引后交换内外两层
    • 恢复中间一层
    • 修改外层索引名为 Gender
    • 用下划线合并两层行索引
    • 把行索引拆分为原状态
    • 修改索引名为原表名称
    • 恢复默认索引并将列保持为原表的相对位置
df = pd.read_csv('company.csv')
df.head(3)
EmployeeIDbirthdate_keyagecity_namedepartmentjob_titlegender
013181/3/195461VancouverExecutiveCEOM
113191/3/195758VancouverExecutiveVP StoresF
213201/2/195560VancouverExecutiveLegal CounselF
# 工作部门同属于一个列,所以应当用list表示
df.query("(age<=40)&(department == ['Dairy', 'Bakery'])&(gender == 'M')")
EmployeeIDbirthdate_keyagecity_namedepartmentjob_titlegender
361157911/14/197540KelownaDairyDairy PersonM
361357931/22/197540RichmondBakeryBakerM
361557951/30/197540NanaimoDairyDairy PersonM
361757972/3/197540NanaimoDairyDairy PersonM
361857982/4/197540SurreyDairyDairy PersonM
........................
6108830710/20/199421BurnabyDairyDairy PersonM
6113831211/12/199421BurnabyDairyDairy PersonM
6137833612/31/199421VancouverDairyDairy PersonM
627063125/14/197936Grand ForksDairyDairy PersonM
627165402/14/198134VictoriaBakeryBakerM

441 rows × 7 columns

# 每一个& 前后连接的最好都用括号括起来
df.loc[(df.age<=40)& (df.department.isin(['Dairy', 'Bakery']))&(df.gender == 'M')]
EmployeeIDbirthdate_keyagecity_namedepartmentjob_titlegender
361157911/14/197540KelownaDairyDairy PersonM
361357931/22/197540RichmondBakeryBakerM
361557951/30/197540NanaimoDairyDairy PersonM
361757972/3/197540NanaimoDairyDairy PersonM
361857982/4/197540SurreyDairyDairy PersonM
........................
6108830710/20/199421BurnabyDairyDairy PersonM
6113831211/12/199421BurnabyDairyDairy PersonM
6137833612/31/199421VancouverDairyDairy PersonM
627063125/14/197936Grand ForksDairyDairy PersonM
627165402/14/198134VictoriaBakeryBakerM

441 rows × 7 columns

(df.EmployeeID%2 == 1).values
array([False,  True, False, ...,  True, False, False])
df.iloc[(df.EmployeeID%2 == 1).values, [0,2,-2]]
EmployeeIDagejob_title
1131958VP Stores
3132156VP Human Resources
5132353Exec Assistant, VP Stores
6132551Exec Assistant, Legal Counsel
8132948Store Manager
............
6276765926Cashier
6277774125Cashier
6278780125Dairy Person
6280818122Cashier
6281822321Cashier

3126 rows × 3 columns

df_new = df.copy()
# 把后三列设为索引后交换内外两层
df_new = df_new.set_index(df_new.columns[-3:].tolist()).swaplevel(0,2,axis = 0)
df_new
EmployeeIDbirthdate_keyagecity_name
genderjob_titledepartment
MCEOExecutive13181/3/195461Vancouver
FVP StoresExecutive13191/3/195758Vancouver
Legal CounselExecutive13201/2/195560Vancouver
MVP Human ResourcesExecutive13211/2/195956Vancouver
VP FinanceExecutive13221/9/195857Vancouver
.....................
FCashierCustomer Service80368/9/199223New Westminister
MCashierCustomer Service81819/26/199322Prince George
Customer Service82232/11/199421Trail
FCashierCustomer Service82262/16/199421Victoria
Customer Service82646/13/199421Vancouver

6284 rows × 4 columns

# 恢复中间一层
df_new = df_new.reset_index(1)
df_new
job_titleEmployeeIDbirthdate_keyagecity_name
genderdepartment
MExecutiveCEO13181/3/195461Vancouver
FExecutiveVP Stores13191/3/195758Vancouver
ExecutiveLegal Counsel13201/2/195560Vancouver
MExecutiveVP Human Resources13211/2/195956Vancouver
ExecutiveVP Finance13221/9/195857Vancouver
.....................
FCustomer ServiceCashier80368/9/199223New Westminister
MCustomer ServiceCashier81819/26/199322Prince George
Customer ServiceCashier82232/11/199421Trail
FCustomer ServiceCashier82262/16/199421Victoria
Customer ServiceCashier82646/13/199421Vancouver

6284 rows × 5 columns

# 修改外层索引名为Gender
df_new.rename_axis(index={'gender':'Gender'})
job_titleEmployeeIDbirthdate_keyagecity_name
Genderdepartment
MExecutiveCEO13181/3/195461Vancouver
FExecutiveVP Stores13191/3/195758Vancouver
ExecutiveLegal Counsel13201/2/195560Vancouver
MExecutiveVP Human Resources13211/2/195956Vancouver
ExecutiveVP Finance13221/9/195857Vancouver
.....................
FCustomer ServiceCashier80368/9/199223New Westminister
MCustomer ServiceCashier81819/26/199322Prince George
Customer ServiceCashier82232/11/199421Trail
FCustomer ServiceCashier82262/16/199421Victoria
Customer ServiceCashier82646/13/199421Vancouver

6284 rows × 5 columns

# 用下划线合并两层行索引,转为index
df_new.index = df_new.index.map(lambda x: '_'.join(x))
df_new
job_titleEmployeeIDbirthdate_keyagecity_name
M_ExecutiveCEO13181/3/195461Vancouver
F_ExecutiveVP Stores13191/3/195758Vancouver
F_ExecutiveLegal Counsel13201/2/195560Vancouver
M_ExecutiveVP Human Resources13211/2/195956Vancouver
M_ExecutiveVP Finance13221/9/195857Vancouver
..................
F_Customer ServiceCashier80368/9/199223New Westminister
M_Customer ServiceCashier81819/26/199322Prince George
M_Customer ServiceCashier82232/11/199421Trail
F_Customer ServiceCashier82262/16/199421Victoria
F_Customer ServiceCashier82646/13/199421Vancouver

6284 rows × 5 columns

# 把行索引拆分为原状态
df_new.index = df_new.index.map(lambda x:tuple(x.split('_')))
df_new
job_titleEmployeeIDbirthdate_keyagecity_name
MExecutiveCEO13181/3/195461Vancouver
FExecutiveVP Stores13191/3/195758Vancouver
ExecutiveLegal Counsel13201/2/195560Vancouver
MExecutiveVP Human Resources13211/2/195956Vancouver
ExecutiveVP Finance13221/9/195857Vancouver
.....................
FCustomer ServiceCashier80368/9/199223New Westminister
MCustomer ServiceCashier81819/26/199322Prince George
Customer ServiceCashier82232/11/199421Trail
FCustomer ServiceCashier82262/16/199421Victoria
Customer ServiceCashier82646/13/199421Vancouver

6284 rows × 5 columns

# 修改索引名为原表名称,需要指定参数axis
df_new.rename_axis(index = ['gender', 'department'])
job_titleEmployeeIDbirthdate_keyagecity_name
genderdepartment
MExecutiveCEO13181/3/195461Vancouver
FExecutiveVP Stores13191/3/195758Vancouver
ExecutiveLegal Counsel13201/2/195560Vancouver
MExecutiveVP Human Resources13211/2/195956Vancouver
ExecutiveVP Finance13221/9/195857Vancouver
.....................
FCustomer ServiceCashier80368/9/199223New Westminister
MCustomer ServiceCashier81819/26/199322Prince George
Customer ServiceCashier82232/11/199421Trail
FCustomer ServiceCashier82262/16/199421Victoria
Customer ServiceCashier82646/13/199421Vancouver

6284 rows × 5 columns

# 恢复默认索引
df_new.reset_index().reindex(df.columns, axis=1)
EmployeeIDbirthdate_keyagecity_namedepartmentjob_titlegender
013181/3/195461VancouverNaNCEONaN
113191/3/195758VancouverNaNVP StoresNaN
213201/2/195560VancouverNaNLegal CounselNaN
313211/2/195956VancouverNaNVP Human ResourcesNaN
413221/9/195857VancouverNaNVP FinanceNaN
........................
627980368/9/199223New WestministerNaNCashierNaN
628081819/26/199322Prince GeorgeNaNCashierNaN
628182232/11/199421TrailNaNCashierNaN
628282262/16/199421VictoriaNaNCashierNaN
628382646/13/199421VancouverNaNCashierNaN

6284 rows × 7 columns

df_new.equals(df)
False
  1. 把列索引名中的\n替换为空格。
  2. 巧克力Rating评分为1至5,每0.25分一档,请选出2.75分及以下且可可含量 Cocoa Percent高于中位数的样本。
  3. 将Review Date和Company Location设为索引后,选出Review Date在2012年之后且Company Location不属于France, Canada, Amsterdam, Belgium的样本。
dfc = pd.read_csv('chocolate.csv')
dfc.columns = [' '.join(i.split('\n')) for i in dfc.columns]
dfc
CompanyReview DateCocoa PercentCompany LocationRating
0A. Morin201663%France3.75
1A. Morin201570%France2.75
2A. Morin201570%France3.00
3A. Morin201570%France3.50
4A. Morin201570%France3.50
..................
1790Zotter201170%Austria3.75
1791Zotter201165%Austria3.00
1792Zotter201165%Austria3.50
1793Zotter201162%Austria3.25
1794Zotter201065%Austria3.00

1795 rows × 5 columns

dfc['Cocoa Percent'] = dfc['Cocoa Percent'].apply(lambda x: float(x[:-1])/100)
dfc.query('Rating <=2.75 & `Cocoa Percent` > `Cocoa Percent`.median()').head()
CompanyReview DateCocoa PercentCompany LocationRating
33Akesson's (Pralus)20100.75Switzerland2.75
34Akesson's (Pralus)20100.75Switzerland2.75
36Alain Ducasse20140.75France2.75
38Alain Ducasse20130.75France2.50
39Alain Ducasse20130.75France2.50
dfc[(dfc['Rating'] <=2.75) & (dfc['Cocoa Percent'] > dfc['Cocoa Percent'].median())]
CompanyReview DateCocoa PercentCompany LocationRating
33Akesson's (Pralus)20100.75Switzerland2.75
34Akesson's (Pralus)20100.75Switzerland2.75
36Alain Ducasse20140.75France2.75
38Alain Ducasse20130.75France2.50
39Alain Ducasse20130.75France2.50
..................
1736Wilkie's Organic20130.89Ireland2.75
1738Wilkie's Organic20130.75Ireland2.75
1741Willie's Cacao20131.00U.K.2.25
1769Zart Pralinen20160.85Austria2.75
1778Zotter20140.80Austria2.75

239 rows × 5 columns

idx = pd.IndexSlice
# 设置Review Date和Company Location为索引
dfc= dfc.set_index(['Review Date', 'Company Location']).sort_index(level=0)
dfc.head()
CompanyCocoa PercentRating
Review DateCompany Location
2006BelgiumCote d' Or (Kraft)0.701.00
BelgiumDolfin (Belcolade)0.701.50
BelgiumNeuhaus (Callebaut)0.732.00
BelgiumNeuhaus (Callebaut)0.752.75
BelgiumNeuhaus (Callebaut)0.713.00
# 选出Review Date在2012年之后且Company Location不属于France, Canada, Amsterdam, Belgium的样本
dfc.loc[idx[2012:, dfc.index.get_level_values(1).difference(['France', 'Canada', 'Amsterdam', 'Belgium'])], :]
CompanyCocoa PercentRating
Review DateCompany Location
2012AustraliaBahen & Co.0.703.00
AustraliaBahen & Co.0.702.50
AustraliaBahen & Co.0.702.50
AustraliaCravve0.753.25
AustraliaCravve0.653.25
...............
2017U.S.A.Spencer0.703.75
U.S.A.Spencer0.703.50
U.S.A.Spencer0.702.75
U.S.A.Xocolla0.702.75
U.S.A.Xocolla0.702.50

972 rows × 3 columns

  • 1
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 2
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论 2
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值