Pandas 学习-task03

DataWhale-Pandas组队学习

import pandas as pd
import numpy as np
import warnings
path = r"C:\Users\yongx\Desktop\data"
df = pd.read_csv(path + "\\learn_pandas.csv", \
                 usecols = ['School', 'Grade', 'Name', 'Gender', 'Weight', 'Transfer'])
df['Name'].head(2)
0      Gaopeng Yang
1    Changqiang You
Name: Name, dtype: object

索引器

列索引

df[['Gender', 'Name']].head(2)
GenderName
0FemaleGaopeng Yang
1MaleChangqiang You
df.Name.head(2)
0      Gaopeng Yang
1    Changqiang You
Name: Name, dtype: object

字符行索引

s = pd.Series([1,2,3,4,5,6],\
             index = [     'a','b',   'a','a',   'a','c'])
s['b']
2
s[['b', 'c']]
b    2
c    6
dtype: int64
s['c':'b':-2]
c    6
a    4
b    2
dtype: int64

整数行索引

s = pd.Series(['a','b','c','d','e','f'],\
             index = [1, 3, 1, 2, 5, 4])
s[1]
1    a
1    c
dtype: object
s[[2, 3]]
2    d
3    b
dtype: object
s[1:-1:2]
3    b
2    d
dtype: object

loc索引器

对于表有两种索引器,基于元素loc和基于位置iloc的索引器.
loc索引器形式为loc[*,*],分别代表,.当省略第二个位置时表示对行进行筛选.*的位置一共有五类合法对象,分别为单个元素,元素列表,元素切片,布尔列表及函数.

df_demo = df.set_index('Name')
df_demo.head(3)
SchoolGradeGenderWeightTransfer
Name
Gaopeng YangShanghai Jiao Tong UniversityFreshmanFemale46.0N
Changqiang YouPeking UniversityFreshmanMale70.0N
Mei SunShanghai Jiao Tong UniversitySeniorMale89.0N
df_demo.loc['Qiang Sun']  # 名字不唯一,返回DataFrame
SchoolGradeGenderWeightTransfer
Name
Qiang SunTsinghua UniversityJuniorFemale53.0N
Qiang SunTsinghua UniversitySophomoreFemale40.0N
Qiang SunShanghai Jiao Tong UniversityJuniorFemaleNaNN
print(df_demo.loc['Quan Zhao'])
type(df_demo.loc['Quan Zhao'])  # 名字唯一,返回Series
School      Shanghai Jiao Tong University
Grade                              Junior
Gender                             Female
Weight                                 53
Transfer                                N
Name: Quan Zhao, dtype: object





pandas.core.series.Series
df_demo.loc['Qiang Sun', 'School']  # 同时选择行和列, 返回Series
Name
Qiang Sun              Tsinghua University
Qiang Sun              Tsinghua University
Qiang Sun    Shanghai Jiao Tong University
Name: School, dtype: object
df_demo.loc[['Qiang Sun', 'Quan Zhao'], ['School',
                                         'Gender']]  ##使用元素列表形式,返回DataFrame
SchoolGender
Name
Qiang SunTsinghua UniversityFemale
Qiang SunTsinghua UniversityFemale
Qiang SunShanghai Jiao Tong UniversityFemale
Quan ZhaoShanghai Jiao Tong UniversityFemale
df_demo.loc['Gaojuan You':'Gaoqiang Qian',
            'School':'Gender']  #使用字符串索引时,若起点和终点字符不是唯一值,则无法使用,并报错
SchoolGradeGender
Name
Gaojuan YouFudan UniversitySophomoreMale
Xiaoli QianTsinghua UniversityFreshmanFemale
Qiang ChuShanghai Jiao Tong UniversityFreshmanFemale
Gaoqiang QianTsinghua UniversityJuniorFemale
df_loc_slice_demo = df_demo.copy()
df_loc_slice_demo.index = range(df_demo.shape[0], 0, -1)
df_loc_slice_demo.loc[5:3]  ## 使用整数索引时要求同字符索引相同.
SchoolGradeGenderWeightTransfer
5Fudan UniversityJuniorFemale46.0N
4Tsinghua UniversitySeniorFemale50.0N
3Shanghai Jiao Tong UniversitySeniorFemale45.0N
df_loc_slice_demo.loc[3:5]  #无返回值即不为整数位置切片
SchoolGradeGenderWeightTransfer
df_demo.loc[df_demo.Weight > 70].head(3)  #使用布尔列表进行索引
SchoolGradeGenderWeightTransfer
Name
Mei SunShanghai Jiao Tong UniversitySeniorMale89.0N
Gaojuan YouFudan UniversitySophomoreMale74.0N
Xiaopeng ZhouShanghai Jiao Tong UniversityFreshmanMale74.0N
df_demo.loc[df_demo.Grade.isin(['Freshman', 'Senior'])].head(3)  #同样为通过布尔列表进行索引
SchoolGradeGenderWeightTransfer
Name
Gaopeng YangShanghai Jiao Tong UniversityFreshmanFemale46.0N
Changqiang YouPeking UniversityFreshmanMale70.0N
Mei SunShanghai Jiao Tong UniversitySeniorMale89.0N
condition_1_1 = df_demo.School == 'Fudan University'
condition_1_2 = df_demo.Grade == 'Senior'
condition_1_3 = df_demo.Weight > 70
condition_1 = condition_1_1 & condition_1_2 & condition_1_3
condition_2_1 = df_demo.School == 'Peking University'
condition_2_2 = df_demo.Grade == 'Senior'
condition_2_3 = df_demo.Weight > 80
condition_2 = condition_2_1 & (~condition_2_2) & condition_2_3

df_demo.loc[condition_1 | condition_2]  ##同时使用多个条件时可以通过定义布尔值变量进行多条件筛选
SchoolGradeGenderWeightTransfer
Name
Qiang HanPeking UniversityFreshmanMale87.0N
Chengpeng ZhouFudan UniversitySeniorMale81.0N
Changpeng ZhaoPeking UniversityFreshmanMale83.0N
Chengpeng QianFudan UniversitySeniorMale73.0Y
def condition(x):
    condition_1_1 = x.School == 'Fudan University'
    condition_1_2 = x.Grade == 'Senior'
    condition_1_3 = x.Weight > 70
    condition_1 = condition_1_1 & condition_1_2 & condition_1_3
    condition_2_1 = x.School == 'Peking University'
    condition_2_2 = x.Grade == 'Senior'
    condition_2_3 = x.Weight > 80
    condition_2 = condition_2_1 & (~condition_2_2) & condition_2_3
    result = condition_1 | condition_2
    return result

df_demo.loc[condition]   ##这里的函数返回值必须是前面要求的布尔值列表
SchoolGradeGenderWeightTransfer
Name
Qiang HanPeking UniversityFreshmanMale87.0N
Chengpeng ZhouFudan UniversitySeniorMale81.0N
Changpeng ZhaoPeking UniversityFreshmanMale83.0N
Chengpeng QianFudan UniversitySeniorMale73.0Y
df_demo.loc[lambda x:'Quan Zhao', lambda x:'Gender'] ##匿名函数返回值同函数要求相同
'Female'
#通过slice对象进行包装返回start:end:step的切片形式
df_demo.loc[lambda x : slice('Gaojuan You', 'Gaoqiang Qian')]
#相同与
df_demo.loc['Gaojuan You' : 'Gaoqiang Qian']
SchoolGradeGenderWeightTransfer
Name
Gaojuan YouFudan UniversitySophomoreMale74.0N
Xiaoli QianTsinghua UniversityFreshmanFemale51.0N
Qiang ChuShanghai Jiao Tong UniversityFreshmanFemale52.0N
Gaoqiang QianTsinghua UniversityJuniorFemale50.0N
#不可以使用链式赋值,应当在使用一层索引器后直接进行赋值操作,原因是多次索引后赋值对象是临时返回的copy脚本,并没有真正的修改元素将导致SettingWithCopyWarning警告/
df_chain= pd.DataFrame([[0,0],[1,0],[1,0]], columns = list('AB'))
df_chain
AB
000
110
210
import warnings
with warnings.catch_warnings():
    warnings.filterwarnings('error')
    try:
        df_chain[df_chain.A!=0].B=1
    except Warning as w:
        Warning_Msg = w

print(Warning_Msg)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
df_chain
AB
000
110
210
df_chain.loc[df_chain.A!=0, 'B'] = 1
df_chain
AB
000
111
211

iloc索引器

iloc类似于loc, 不同之处在于iloc针对位置进行筛选,在相同的*位置处也有五类合法对象有:整数,整数列表,整数切片,布尔列表及函数.范数的返回值必须是前面四类中的任意一类,输入为DataFrame

df_demo.iloc[1,1]    #整数
df_demo.iloc[[0,1],[0,1]] ##整数列表
df_demo.iloc[1:4, 2:4]   #整数切片
df_demo.iloc[lambda x:slice(1,4)]  ##函数
df_demo.iloc[(df_demo.Weight > 80).values].head(3) ## 使用布尔列表时需要注意不可以传入Series而必须传入序列的values,否则会报错,使用布尔列表时优先使用loc函数方法

SchoolGradeGenderWeightTransfer
Name
Mei SunShanghai Jiao Tong UniversitySeniorMale89.0N
Qiang ZhengShanghai Jiao Tong UniversitySeniorMale87.0N
Qiang HanPeking UniversityFreshmanMale87.0N
df_demo.School.iloc[1]
df_demo.School.iloc[1:5:2]
Name
Changqiang You    Peking University
Xiaojuan Sun       Fudan University
Name: School, dtype: object

query方法

通过字符串形式的查询表达式传入query方式来查询数据,表达式的执行结果必须返回布尔列表, 进行复杂索引时无需重复使用DataFrame的名字来引用列明,一般而言可以降低代码长度.

# 相同作用,不同点在与使用了python 字符串输入换行符号 '(上段),'(下段)
df.query('((School == "Fudan University") &(Grade == "Senior")&(Weight > 70))|((School == "Peking University")&(Grade != "Senior") &(Weight > 80))')

df.query('((School == "Fudan University") & '
        '(Grade == "Senior")&'
        '(Weight > 70))|'
        '((School == "Peking University")&'
        '(Grade != "Senior") &'
        '(Weight > 80))')
SchoolGradeNameGenderWeightTransfer
38Peking UniversityFreshmanQiang HanMale87.0N
66Fudan UniversitySeniorChengpeng ZhouMale81.0N
99Peking UniversityFreshmanChangpeng ZhaoMale83.0N
131Fudan UniversitySeniorChengpeng QianMale73.0Y
#query 表达式中已经注册了所有来自DataFrame的列名,因此所有属于Series的方法都可以被调用和正常函数调用无区别
df_demo.query('Weight > Weight.mean()').head(3)
SchoolGradeGenderWeightTransfer
Name
Changqiang YouPeking UniversityFreshmanMale70.0N
Mei SunShanghai Jiao Tong UniversitySeniorMale89.0N
Gaojuan YouFudan UniversitySophomoreMale74.0N
df_demo_col = df_demo.copy()
df_demo_col['Student Weight'] = df_demo_col['Weight'].copy()

#当列名含有空格时需要使用`col name`的方法进行引用,即使用一组反单引号``将列名包括起来进行使用

df_demo_col.query('`Student Weight` > `Student Weight`.mean()').head(3)

SchoolGradeGenderWeightTransferStudent Weight
Name
Changqiang YouPeking UniversityFreshmanMale70.0N70.0
Mei SunShanghai Jiao Tong UniversitySeniorMale89.0N89.0
Gaojuan YouFudan UniversitySophomoreMale74.0N74.0
# query 中已经注册了若干英语的字面用法如or,and, is in , not in等
df.query('(Grade not in ["Freshman", "Sophomore"]) and (Gender == "Male")').head(3)

##等价
df.query('(Grade != ["Frashmax", "Sophomore"]) and (Gender == "Male")').head(3)
SchoolGradeNameGenderWeightTransfer
1Peking UniversityFreshmanChangqiang YouMale70.0N
2Shanghai Jiao Tong UniversitySeniorMei SunMale89.0N
10Shanghai Jiao Tong UniversityFreshmanXiaopeng ZhouMale74.0N
df.query('School.isin(["Tsinghua University", "Shanghai Jiao Tong University"])', engine = 'python').head(3)
SchoolGradeNameGenderWeightTransfer
0Shanghai Jiao Tong UniversityFreshmanGaopeng YangFemale46.0N
2Shanghai Jiao Tong UniversitySeniorMei SunMale89.0N
5Tsinghua UniversityFreshmanXiaoli QianFemale51.0N
#query使用外部变量时需要使用@符号
low, high = 70, 80
df.query('Weight.between(@low, @high)').head(3)
SchoolGradeNameGenderWeightTransfer
1Peking UniversityFreshmanChangqiang YouMale70.0N
4Fudan UniversitySophomoreGaojuan YouMale74.0N
10Shanghai Jiao Tong UniversityFreshmanXiaopeng ZhouMale74.0N

随机抽样

将DataFrame每行当作一个样本,或每列当作一个特征,再将DataFrame当作一个整体,可以通过sample来对样本或特征进行抽样,用途是用总体统计特征的无偏估计小样本代替整体提升工作效率. sample函数主要参数n,axis,frac,replace,weight前三个分别为抽样数量,抽样方向(0为行,1为列)和抽样比例(小于1),replace和weights分别指是否放回和每个样本的抽样相对概率,replace=True表示又放回抽样

df_sample = pd.DataFrame({'id':list('abcde'), 'value':[1,2,3,4,90]})
df_sample
idvalue
0a1
1b2
2c3
3d4
4e90
df_sample.sample(3, replace = True, weights = df_sample.value)
idvalue
4e90
4e90
4e90

多级索引

多级索引及表结构

np.random.seed(0)
multi_index = pd.MultiIndex.from_product([list('ABCD'),
                                         df.Gender.unique()], names = ('School', 'Gender'))
multi_column = pd.MultiIndex.from_product([['Height', 'Weight'],
                                          df.Grade.unique()], names = ('indicator', 'Grade'))
df_multi = pd.DataFrame(np.c_[(np.random.randn(8,4)*5 + 163).tolist(),
                             (np.random.randn(8,4)*5 + 65).tolist()],
                       index = multi_index, columns = multi_column).round(1)
df_multi
indicatorHeightWeight
GradeFreshmanSeniorSophomoreJuniorFreshmanSeniorSophomoreJunior
SchoolGender
AFemale171.8165.0167.9174.260.655.163.365.8
Male172.3158.1167.8162.271.271.063.163.5
BFemale162.5165.1163.7170.359.857.956.574.8
Male166.8163.6165.2164.762.562.858.768.9
CFemale170.5162.0164.6158.756.963.960.566.9
Male150.2166.3167.3159.362.459.164.967.1
DFemale174.3155.7163.2162.165.366.561.863.2
Male170.7170.3163.8164.961.663.260.956.4
df_multi.index.names
FrozenList(['School', 'Gender'])
df_multi.columns.names
FrozenList(['indicator', 'Grade'])
df_multi.index.values
array([('A', 'Female'), ('A', 'Male'), ('B', 'Female'), ('B', 'Male'),
       ('C', 'Female'), ('C', 'Male'), ('D', 'Female'), ('D', 'Male')],
      dtype=object)
df_multi.columns.values
array([('Height', 'Freshman'), ('Height', 'Senior'),
       ('Height', 'Sophomore'), ('Height', 'Junior'),
       ('Weight', 'Freshman'), ('Weight', 'Senior'),
       ('Weight', 'Sophomore'), ('Weight', 'Junior')], dtype=object)
df_multi.index.get_level_values(0) ##获得某一层索引

#注意到此时索引无论单层还是多层均无法直接通过索引的方法进行修改
Index(['A', 'A', 'B', 'B', 'C', 'C', 'D', 'D'], dtype='object', name='School')

多级索引器中的loc索引器

将原表中学校和年级设为索引,此时行为多级索引,列为单级索引,并且由于默认状态列索引不含名字,则IndicatorGrade索引位置为空

df_multi = df.set_index(['School', 'Grade'])
df_multi.head(5)
NameGenderWeightTransfer
SchoolGrade
Shanghai Jiao Tong UniversityFreshmanGaopeng YangFemale46.0N
Peking UniversityFreshmanChangqiang YouMale70.0N
Shanghai Jiao Tong UniversitySeniorMei SunMale89.0N
Fudan UniversitySophomoreXiaojuan SunFemale41.0N
SophomoreGaojuan YouMale74.0N
#多级索引中单个元素以元组为单位,此种情况下可以使用loc和iloc方法,不过需要将原来的标量位置替换为元组当传入元组列表或单个元组或返回前二者的函数时需要先进行索引排序来避免性能警告
with warnings.catch_warnings():
    warnings.filterwarnings('error')
    try:
        df_multi.loc[('Fudan University', 'Junior')].head()
    except Warning as w:
        Warning_Msg = w
        
print(Warning_Msg)## 产生性能警告

indexing past lexsort depth may impact performance.
df_sorted = df_multi.sort_index()
df_sorted.loc[('Fudan University', 'Junior')].head()
NameGenderWeightTransfer
SchoolGrade
Fudan UniversityJuniorYanli YouFemale48.0N
JuniorChunqiang ChuMale72.0N
JuniorChangfeng LvMale76.0N
JuniorYanjuan LvFemale49.0NaN
JuniorGaoqiang ZhouFemale43.0N
df_sorted.loc[[('Fudan University', 'Senior'),
              ('Shanghai Jiao Tong University', 'Freshman')]].head()
NameGenderWeightTransfer
SchoolGrade
Fudan UniversitySeniorChengpeng ZhengFemale38.0N
SeniorFeng ZhouFemale47.0N
SeniorGaomei LvFemale34.0N
SeniorChunli LvFemale56.0N
SeniorChengpeng ZhouMale81.0N
df_sorted.loc[df_sorted.Weight > 70].head()
NameGenderWeightTransfer
SchoolGrade
Fudan UniversityFreshmanFeng WangMale74.0N
JuniorChunqiang ChuMale72.0N
JuniorChangfeng LvMale76.0N
SeniorChengpeng ZhouMale81.0N
SeniorChengpeng QianMale73.0Y
df_sorted.loc[lambda x : ('Fudan University', 'Junior')].head()
NameGenderWeightTransfer
SchoolGrade
Fudan UniversityJuniorYanli YouFemale48.0N
JuniorChunqiang ChuMale72.0N
JuniorChangfeng LvMale76.0N
JuniorYanjuan LvFemale49.0NaN
JuniorGaoqiang ZhouFemale43.0N
df_sorted.head()
NameGenderWeightTransfer
SchoolGrade
Fudan UniversityFreshmanChangqiang YangFemale49.0N
FreshmanGaoqiang QinFemale63.0N
FreshmanGaofeng ZhaoFemale43.0N
FreshmanYanquan WangFemale55.0N
FreshmanFeng WangMale74.0N
#单级别索引使用切片时要求切片端点元素是唯一的,多级索引中无论元素是否在索引中是否重复出现,都必须经过排序才能使用切片,否则会报错
try:
    df_multi.loc[('Fudan University', 'Senior'):].head()
except Exception as e:
    Err_Msg = e
print(Err_Msg)
'Key length (2) was greater than MultiIndex lexsort depth (0)'
df_sorted.loc[('Fudan University', 'Senior'):].head()
NameGenderWeightTransfer
SchoolGrade
Fudan UniversitySeniorChengpeng ZhengFemale38.0N
SeniorFeng ZhouFemale47.0N
SeniorGaomei LvFemale34.0N
SeniorChunli LvFemale56.0N
SeniorChengpeng ZhouMale81.0N
df_unique = df.drop_duplicates(subset=['School', 'Grade']).set_index(['School', 'Grade'])
df_unique.head()
NameGenderWeightTransfer
SchoolGrade
Shanghai Jiao Tong UniversityFreshmanGaopeng YangFemale46.0N
Peking UniversityFreshmanChangqiang YouMale70.0N
Shanghai Jiao Tong UniversitySeniorMei SunMale89.0N
Fudan UniversitySophomoreXiaojuan SunFemale41.0N
Tsinghua UniversityFreshmanXiaoli QianFemale51.0N
df_unique.sort_index().loc[('Fudan University', 'Senior'):].head()
NameGenderWeightTransfer
SchoolGrade
Fudan UniversitySeniorChengpeng ZhengFemale38.0N
SophomoreXiaojuan SunFemale41.0N
Peking UniversityFreshmanChangqiang YouMale70.0N
JuniorJuan XuFemaleNaNN
SeniorChangli LvFemale41.0N
#多级索引的元组中特殊用法,对多层的元素进行交叉组合后索引,同时需要指定loc的列,全选时:表示.
#每一层需要选中的元素用列表存放,传入loc的形式[(level_0_list,level_1_list), cols]
res = df_multi.loc[(['Peking University', 'Fudan Unversity'], ['Sophomort', 'Junior']), :]
res.head()
NameGenderWeightTransfer
SchoolGrade
Peking UniversityJuniorJuan XuFemaleNaNN
JuniorChangjuan YouFemale47.0N
JuniorGaoli XuFemale48.0N
JuniorGaoquan ZhouMale70.0N
JuniorQiang YouFemale56.0N
res.shape
(8, 4)
res = df_multi.loc[[('Peking University', 'Senior'), ('Fudan University', 'Sophomore')]]
res.head()
NameGenderWeightTransfer
SchoolGrade
Peking UniversitySeniorChangli LvFemale41.0N
SeniorFeng ZhengFemale49.0N
SeniorFeng ZhaoMale66.0N
SeniorChangquan HanMale77.0N
SeniorMei FengFemale51.0N
res.shape
(16, 4)

IndexSlice对象

前面方法不可以在索引不重复的时候对每层进行切片,也不允许将切片和布尔列表进行混合使用,只能对元组整体进行切片,可以通过引入IndexSlice对象来解决这个问题.Slice对象有loc[idx[,]]和loc[idx[,],idx[,]]两种

pd.MultiIndex.from_product?
np.random.seed(0)
L1, L2 = ['A', 'B','C'], ['a','b','c']
mul_index1 = pd.MultiIndex.from_product([L1, L2], names = ('Upper', 'lower'))
L3, L4 = ['D','E','F'], ['d','e','f']
mul_index2 = pd.MultiIndex.from_product([L3,L4],names=('Big','Small'))

df_ex = pd.DataFrame(np.random.randint(-9,10,(9,9)),
                    index = mul_index1,
                    columns = mul_index2)
df_ex
BigDEF
Smalldefdefdef
Upperlower
Aa36-9-6-6-209-5
b-33-8-3-258-44
c-107-466-99-6
Ba85-2-9-80-91-6
b29-7-9-9-5-4-3-1
c86-501-8-8-20
Ca-6-3259-95-63
b12-5-3-56-63-5
c-156-66478-4
#使用slice对象直线需要先定义

idx = pd.IndexSlice
#loc[idx[*,*]]
df_ex.loc[idx['C':, ('D', 'E'):]]
df_ex.loc[idx[:'A', lambda x: x.sum()>0]]

#loc[idx[*,*], idx[*,*]]
df_ex.loc[idx[:'A', 'b':], idx['E':, 'e':]]
BigEF
Smallefef
Upperlower
Ab-25-44
c669-6

多级索引的构造

可通过from_tuples,from_arrays, from_product三种方法构造多级索引,以上三种方法均为pd.MultiIndex对象下的函数

df_ex.describe()
BigDEF
Smalldefdefdef
count9.0000009.0000009.0000009.0000009.0000009.0000009.0000009.0000009.000000
mean1.2222223.666667-2.333333-3.888889-0.888889-0.333333-1.7777781.666667-2.222222
std4.6844903.6055516.0000004.4284436.6791555.9791306.9602045.8736703.865805
min-6.000000-3.000000-9.000000-9.000000-9.000000-9.000000-9.000000-6.000000-6.000000
25%-1.0000002.000000-7.000000-6.000000-6.000000-5.000000-8.000000-3.000000-5.000000
50%1.0000005.000000-5.000000-4.000000-2.0000000.000000-4.0000001.000000-4.000000
75%3.0000006.0000002.000000-3.0000006.0000005.0000005.0000008.0000000.000000
max8.0000009.0000007.0000005.0000009.0000006.0000008.0000009.0000004.000000
#from_tuples 由元组组成的列表进行构造
my_tuple= [('a', 'cat'), ('a', 'dog'), ('b', 'cat'), ('b', 'dog')]

pd.MultiIndex.from_tuples(my_tuple, names=['First', 'Second'])
MultiIndex([('a', 'cat'),
            ('a', 'dog'),
            ('b', 'cat'),
            ('b', 'dog')],
           names=['First', 'Second'])
#from_arrays 通过对应层的列表进行构造
my_array = [list('aabb'), ['cat','dog']*2]
pd.MultiIndex.from_arrays(my_array, names = ['First', 'Second'])
MultiIndex([('a', 'cat'),
            ('a', 'dog'),
            ('b', 'cat'),
            ('b', 'dog')],
           names=['First', 'Second'])
#from_product 通过给定多个列表的笛卡尔积进行构造
my_list1 = ['a','b']
my_list2 = ['cat', 'dog']
pd.MultiIndex.from_product([my_list1, my_list2], names = ['First', 'Second'])

MultiIndex([('a', 'cat'),
            ('a', 'dog'),
            ('b', 'cat'),
            ('b', 'dog')],
           names=['First', 'Second'])

索引的常用方法

索引层的交换和删除

np.random.seed(0)
L1,L2,L3 = ['A','B'],['a','b'],['alpha','beta']
mul_index1 = pd.MultiIndex.from_product([L1,L2,L3],
          names=('Upper', 'Lower','Extra'))

L4,L5,L6 = ['C','D'],['c','d'],['cat','dog']
mul_index2 = pd.MultiIndex.from_product([L4,L5,L6],
          names=('Big', 'Small', 'Other'))

df_ex = pd.DataFrame(np.random.randint(-9,10,(8,8)),
                        index=mul_index1,
                        columns=mul_index2)
df_ex
BigCD
Smallcdcd
Othercatdogcatdogcatdogcatdog
UpperLowerExtra
Aaalpha36-9-6-6-209
beta-5-33-8-3-258
balpha-44-107-466
beta-99-685-2-9-8
Baalpha0-91-629-7-9
beta-9-5-4-3-186-5
balpha01-8-8-20-6-3
beta259-95-631
#索引层的交换有swaplevel和reorder_levels完成,前者交换两个层,后可可以交换任意层,两者均可指定交换的是轴是哪一个.
df_ex.swaplevel(0, 2, axis = 1).head(3)
Othercatdogcatdogcatdogcatdog
Smallccddccdd
BigCCCCDDDD
UpperLowerExtra
Aaalpha36-9-6-6-209
beta-5-33-8-3-258
balpha-44-107-466
df_ex.reorder_levels([2,0,1], axis = 0).head(3)
BigCD
Smallcdcd
Othercatdogcatdogcatdogcatdog
ExtraUpperLower
alphaAa36-9-6-6-209
betaAa-5-33-8-3-258
alphaAb-44-107-466
#使用droplevel删除某层索引
df_ex.droplevel(1, axis = 1).head(3)

BigCD
Othercatdogcatdogcatdogcatdog
UpperLowerExtra
Aaalpha36-9-6-6-209
beta-5-33-8-3-258
balpha-44-107-466
df_ex.droplevel([0,1],axis = 0).head(3)
BigCD
Smallcdcd
Othercatdogcatdogcatdogcatdog
Extra
alpha36-9-6-6-209
beta-5-33-8-3-258
alpha-44-107-466

索引属性的修改

通过rename_axis对索引层的名字进行修改, 常用修改方式有传入字典的映射

df_ex.rename_axis(index = {'Upper':'Changed_row'},
                  columns = {'Other':'Changed_col'}).head()
BigCD
Smallcdcd
Changed_colcatdogcatdogcatdogcatdog
Changed_rowLowerExtra
Aaalpha36-9-6-6-209
beta-5-33-8-3-258
balpha-44-107-466
beta-99-685-2-9-8
Baalpha0-91-629-7-9
#通过rename对索引的值进行修改,多级索引是需要指定修改的层号level
df_ex.rename(columns = {'cat':'not_cat'},
            level = 2).head()
BigCD
Smallcdcd
Othernot_catdognot_catdognot_catdognot_catdog
UpperLowerExtra
Aaalpha36-9-6-6-209
beta-5-33-8-3-258
balpha-44-107-466
beta-99-685-2-9-8
Baalpha0-91-629-7-9
df_ex.rename(index = lambda x:str.upper(x),
            level = 2).head()
BigCD
Smallcdcd
Othercatdogcatdogcatdogcatdog
UpperLowerExtra
AaALPHA36-9-6-6-209
BETA-5-33-8-3-258
bALPHA-44-107-466
BETA-99-685-2-9-8
BaALPHA0-91-629-7-9
#通过迭代器实现整个索引的元素替换
#通过iter方法构建迭代器,之后使用next方法对迭代器内容进行遍历
new_values = iter(list('abcdefgh'))
df_ex.rename(index = lambda x:next(new_values), level = 2)
BigCD
Smallcdcd
Othercatdogcatdogcatdogcatdog
UpperLowerExtra
Aaa36-9-6-6-209
b-5-33-8-3-258
bc-44-107-466
d-99-685-2-9-8
Bae0-91-629-7-9
f-9-5-4-3-186-5
bg01-8-8-20-6-3
h259-95-631
#对多级索引对象某个位置的元素进行修改,可通过先将一层索引临时转为表的元素,然后在进行修改,最后重新设定为索引
#主要使用方法可以通过map函数, map函数定义在Index方法上与前面rename方法中层的函数式用法类似,不过传入的不是层的标量值,而是直接传入索引的元组,
df_temp = df_ex.copy()
new_idx = df_temp.index.map(lambda x :(str.upper(x[0]),x[1], str.upper(x[2])))
df_temp.index = new_idx
df_temp.head()
BigCD
Smallcdcd
Othercatdogcatdogcatdogcatdog
UpperLowerExtra
AaALPHA36-9-6-6-209
BETA-5-33-8-3-258
bALPHA-44-107-466
BETA-99-685-2-9-8
BaALPHA0-91-629-7-9
#map也可以被用来多级索引进行压缩,
df_temp = df_ex.copy()
new_idx = df_temp.index.map(lambda x : (x[0]+'-'+x[1]+'-'+x[2]))
df_temp.index = new_idx
df_temp.head(3)
BigCD
Smallcdcd
Othercatdogcatdogcatdogcatdog
A-a-alpha36-9-6-6-209
A-a-beta-5-33-8-3-258
A-b-alpha-44-107-466
#map用来解压索引
new_idx = df_temp.index.map(lambda x:tuple(x.split('-')))
df_temp.index = new_idx
df_temp.head(3)
BigCD
Smallcdcd
Othercatdogcatdogcatdogcatdog
Aaalpha36-9-6-6-209
beta-5-33-8-3-258
balpha-44-107-466

索引的设置与重置

df_new = pd.DataFrame({'A':list('aacd'),
                      'B':list('PQRT'),
                      'C':[1,2,3,4]})
df_new
ABC
0aP1
1aQ2
2cR3
3dT4
#使用set_index函数方法实现设置索引,主要参数为append,表示是否保留原来的索引,直接吧新的设定添加到原索引的内层
df_new.set_index('A')
BC
A
aP1
aQ2
cR3
dT4
df_new.set_index('A', append = True)
BC
A
0aP1
1aQ2
2cR3
3dT4
df_new.set_index(['A','B'])
C
AB
aP1
Q2
cR3
dT4
#添加自己想要添加的索引列
my_index = pd.Series(list('WXYZ'), name = 'D')
df_new = df_new.set_index(['A', my_index])
df_new
BC
AD
aWP1
XQ2
cYR3
dZT4
#set_index函数方法的逆函数, 主要参数为drop表示否需要将去掉的索引层抛弃
df_new.reset_index(['D'])
#如果重置所有的索引,则pandas会重新生成一个默认索引

DBC
A
aWP1
aXQ2
cYR3
dZT4

索引的变形

对索引做一些扩充或剔除,也或者给定一个新的索引,将原表中相应的索引对应元素填充到新索引构成的表中

df_reindex = pd.DataFrame({"Weight":[60,70,80],
                           "Height":[176,180,179]},
                          index=['1001','1003','1002'])
df_reindex
WeightHeight
100160176
100370180
100280179
## reindex的作用即为使用原表中的数据创建待使用的索引,常用在时间点填充
##及Id编号的扩充.并且原表中的数据和新表中会根据索引自动对齐.
df_reindex.reindex(index = ['1001','1002','1003','1004'],
                  columns = ['Weight', 'Gender'])
WeightGender
100160.0NaN
100280.0NaN
100370.0NaN
1004NaNNaN
#与reindex功能类似的函数方法reindex_like,功能是仿照传入的表的索引来进行被
##调用表索引的变形, 即不同点即为使用使用原表中的数据
df_existed = pd.DataFrame(index = ['1001','1002','1003','1004'],
                         columns = ['Weight', 'Gender'])
df_existed
WeightGender
1001NaNNaN
1002NaNNaN
1003NaNNaN
1004NaNNaN

索引运算

集合运算法则

利用集合算取出符合条件行的需求,如取出两表中index索引交集的数据,此时通过Index上的运算操作较为容易实现, 有intersection(交集),union(并集),difference(差集),symmetric_difference(并集减去交集)

一般索引运算

集合的元素是互异的,但是索引中可能有相同的元素,应该首先使用unique方法去重后再进行运算

df_set_1 = pd.DataFrame([[0, 1], [1, 2], [3, 4]],
                        index=pd.Index(['a', 'b', 'a'], name='id1'))

df_set_2 = pd.DataFrame([[4, 5], [2, 6], [7, 1]],
                        index=pd.Index(['b', 'b', 'c'], name='id2'))

id1, id2 = df_set_1.index.unique(), df_set_2.index.unique()

print(id1.intersection(id2))
print(id1.union(id2))
print(id1.difference(id2))
print(id1.symmetric_difference(id2))

##等价符号表示有
print('--'*30)
print(id1 & id2)
print(id1 | id2)
print((id1 ^ id2) & id1)
print(id1 ^ id2)
Index(['b'], dtype='object')
Index(['a', 'b', 'c'], dtype='object')
Index(['a'], dtype='object')
Index(['a', 'c'], dtype='object')
------------------------------------------------------------
Index(['b'], dtype='object')
Index(['a', 'b', 'c'], dtype='object')
Index(['a'], dtype='object')
Index(['a', 'c'], dtype='object')
#当两张表需要做集合运算的列未被设置索引,可以先将其转成索引,运算后再恢复,另一种方法是使用isin函数方法
df_set_in_col_1 = df_set_1.reset_index()
df_set_in_col_2 = df_set_2.reset_index()

df_set_in_col_1
id101
0a01
1b12
2a34
df_set_in_col_2
id201
0b45
1b26
2c71
df_set_in_col_1[df_set_in_col_1.id1.isin(df_set_in_col_2.id2)]
id101
1b12

练习


##练一练 1.
number = [float,int]
df_number = df_demo.loc[:,
                        df_demo.dtypes.apply(lambda x : x in(number) )]
df_number
Weight
Name
Gaopeng Yang46.0
Changqiang You70.0
Mei Sun89.0
Xiaojuan Sun41.0
Gaojuan You74.0
......
Xiaojuan Sun46.0
Li Zhao50.0
Chengqiang Chu45.0
Chengmei Shen71.0
Chunpeng Lv51.0

200 rows × 1 columns

# 练一练 2.
df_ex.rename_axis(index = lambda x : str.upper(x))
BigCD
Smallcdcd
Othercatdogcatdogcatdogcatdog
UPPERLOWEREXTRA
Aaalpha36-9-6-6-209
beta-5-33-8-3-258
balpha-44-107-466
beta-99-685-2-9-8
Baalpha0-91-629-7-9
beta-9-5-4-3-186-5
balpha01-8-8-20-6-3
beta259-95-631

在这里插入图片描述

df = pd.read_csv(path + '\company.csv')
df.head(3)

EmployeeIDbirthdate_keyagecity_namedepartmentjob_titlegender
013181/3/195461VancouverExecutiveCEOM
113191/3/195758VancouverExecutiveVP StoresF
213201/2/195560VancouverExecutiveLegal CounselF
# 1.1
df_query = df.query('(age <= 40) & (department in ["Dairy", "Bakery"]) & (gender=="M")')

#正确答案
dpt = ['Dairy', 'Bakery']
df_query_true = df.query('(age <= 40) & (department == &dpt) & (gender == "M")')


# 1.2
df_loc = df.loc[(df.age <= 40) & (df.department.isin(["Dairy", 'Bakery']))]

#正确答案
dpt = ['Dairy', 'Bakery']
df_loc_true = df.loc[(df.age <= 40)&(df.department.isin(dpt))&(df.gender=='M')].head(3)


# 2
#正确答案
df.iloc[(df.EmployeeID%2==1).values, [0, 2, -2]].head(3)

# 3.1
df_test = df.copy()
df_test = df_test.set_index(list(df.columns[-3:])).reorder_levels([2, 1, 0],
                                                                  axis=0)
df_test.reset_index(level = 1)
df_test.rename_axis(index = {'gender':'Gender'})

#正确答案
df_op = df.copy()
df_op = df_op.set_index(df_op.columns[-3:].tolist()).swaplevel(0,2, axis = 0)
df_op = df_op.reset_index(level = 1)
df_op = df_op.rename(index = {'gender':'Gender'})
df_op.index = df_op.index.map(lambda x : '_'.join(x))
df_op.index = df_op.index.map(lambda x : tuple(x.split('_')))
df_op = df_op.rename_axis(index = ['gender', 'department'])
df_op = df_op.reset_index().reindex(df.columns, axis = 1)

在这里插入图片描述

df = pd.read_csv(path + '\\chocolate.csv')
df.head(3)
CompanyReview\nDateCocoa\nPercentCompany\nLocationRating
0A. Morin201663%France3.75
1A. Morin201570%France2.75
2A. Morin201570%France3.00
# 1.
df_op1 = df.copy()
df_op1.columns = df_op1.columns.map(lambda x : x.replace('\n', ' '))

#正确答案
df_op2 = df.copy()
df_op2.columns = [' '.join(i.split('\n')) for i in df_op2.columns]

# 2.
#正确答案
df_op = df_op1.copy()
df_op['Cocoa Percent'] = df_op['Cocoa Percent'].apply(lambda x : float(x[:-1])/100)
df_op.query('(Rating <= 2.75) & (`Cocoa Percent` > `Cocoa Percent`.median())').head(3)

# 3.
#正确答案
idx = pd.IndexSlice
df_op = df_op.set_index(['Review Date', 'Company Location']).sort_index(level = 0)
exclude = ['France', 'Canada', 'Amsterdam', 'Belgium']
df_op.loc[idx[2012:, ~df_op.index.get_level_values(1).isin(exclude)], :].head(3)
CompanyCocoa PercentRating
Review DateCompany Location
2012AustraliaBahen & Co.0.73.0
AustraliaBahen & Co.0.72.5
AustraliaBahen & Co.0.72.5
  • 2
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 3
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论 3
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值