03 pandas索引

最新推荐文章于 2023-07-18 10:17:29 发布

宁宁的快乐小朋友

最新推荐文章于 2023-07-18 10:17:29 发布

阅读量426

点赞数

文章标签： python pandas 数据分析

本文链接：https://blog.csdn.net/qq_42101003/article/details/126585067

版权

# 1 索引器

# 01 表的列索引

# 02 序列的行索引

# 以字符串为索引的Series

# 以整数为索引的Series

# 03 loc索引器

# loc['单个元素'] 用索引筛选行

# loc[元素列表]

# loc[切片] 左闭右闭

# loc[布尔列表]

# loc[函数]

# 不要使用链式赋值

# 04 iloc索引器

# 05 query方法

# 对于含有空格的列名，用'列名'

# 06 随机抽样

# 2 多级索引

# 01 构造多级索引 from_tuples,from_arrays,from_product

# 02 多级索引及表结构

# 03 多级索引中loc索引器

# 04 IndexSlice对象

# 3 索引的常用方法

# 01 交换

# swaplevel只能交换两个层，reorder_levels能交换任意层

# 02 删除

# 03 修改属性

# rename_axis 修改索引层名字，传入字典映射

# rename 修改索引的值多级索引需要指定level

# 用迭代器更改整个索引的元素

# map 修改某个位置的索引

# map 压缩多级索引

# 04 索引的设置与重置

# set_index 设置索引，append=True保留原索引

# reset_index 去掉索引，drop=True把去掉的索引层丢弃

# 05 索引的变形

# 4 索引运算

# 索引
# 1 索引器

# 01 表的列索引
import pandas as pd
path = r'C:\Users\lenovo\Desktop\最近要用\pandas\joyful-pandas\data'
df = pd.read_csv('{}/learn_pandas.csv'.format(path),
                 usecols = ['School', 'Grade', 'Name', 'Gender','Weight', 'Transfer'])
df['Name'].head()
df.Name.head() # 列名中没有空格时可以这样用
df[['Gender','Name']].head()

# 02 序列的行索引
# 以字符串为索引的Series
s = pd.Series([1,2,3,4,5,6],
              index=['a','b','a','a','a','c'])
s['a']
s['b']
s[['c','b']]
s['c':'b':-2]
# 以整数为索引的Series
s = pd.Series(['a','b','c','d','e','f'], index=[1,3,1,2,5,4])
s[1]
s[[2,3]]
s[1:-1:2] # 右开

# 03 loc索引器
# loc['单个元素'] 用索引筛选行
df_demo = df.set_index('Name')
df_demo.loc['Qiang Sun']
# loc[元素列表]
df_demo.loc[['Qiang Sun','Quan Zhao'],['School','Gender']]
# loc[切片] 左闭右闭
df_demo.loc['Gaojuan You':'Gaoqiang Qian','School':'Gender']

df_loc_slice_demo = df_demo.copy()
df_loc_slice_demo.index = range(df_demo.shape[0],0,-1)
df_loc_slice_demo.loc[5:3] # 左闭右闭 终点不允许有重复值
# loc[布尔列表]
df_demo.loc[df_demo.Weight>70].head()
df_demo.loc[df_demo.Grade.isin(['Freshaman','Senior'])].head()

condition11 = df_demo.School == 'Fudan University'
condition12 = df_demo.Grade == 'Senior'
condition13 = df_demo.Weight > 70
condition1 = condition11 & condition12 & condition13
condition21 = df_demo.School == 'Peking University'
condition22 = df_demo.Grade == 'Senior'
condition23 = df_demo.Weight > 80
condition2 = condition21 & (~condition22) & condition23
df_demo.loc[condition1|condition2]
# loc[函数]
def condition(x):
    condition11 = x.School == 'Fudan University'
    condition12 = x.Grade == 'Senior'
    condition13 = x.Weight > 70
    condition1 = condition11 & condition12 & condition13
    condition21 = x.School == 'Peking University'
    condition22 = x.Grade == 'Senior'
    condition23 = x.Weight > 80
    condition2 = condition21 & (~condition22) & condition23
    result = condition1|condition2
    return result
df_demo.loc[condition]

df_demo.loc[lambda x:'Quan Zhao', lambda x:'Gender']
df_demo.loc[lambda x:slice('Gaojuan You','Gaoqiang Qian')]

# 不要使用链式赋值
df_chain = pd.DataFrame([[0,0],[1,0],[-1,0]], columns=list('AB'))
df_chain.loc[df_chain.A!=0].B = 1 # 不对
df_chain.loc[df_chain.A!=0,'B'] = 1 # 对

# 04 iloc索引器
df_demo.iloc[1,1]
df_demo.iloc[[0,1],[0,1]]
df_demo.iloc[1:4,2:4] # 右开
df_demo.iloc[lambda x:slice(1,4)]
df_demo.iloc[(df_demo.Weight>80).values].head()

# 05 query方法 
# 对于含有空格的列名，用'列名'
df.query('((School == "Fudan University")&'
         '(Grade == "Senior")&'
         '(Weight > 70))|'
         '((School == "Peking University")&'
         '(Grade != "Senior")&'
         '(Weight > 80))'
    )
df.query('Weight > Weight.mean()').head()
df.query('(Grade not in ["Freshman","Sophomore"]) and'
         '(Gender == "Male")').head()
df.query('Grade == ["Junior","Senior"]').head()
low, high = 70, 80
df.query('Weight.between(@low,@high)').head()

# 06 随机抽样
df_sample = pd.DataFrame({'id':list('abcde'),'value':[1,2,3,4,90]})
# n抽样数量，axis抽样方向 0行1列，frac抽样比例，replace=True放回抽样，weights每个样本的抽样相对概率
df_sample.sample(3, replace=True, weights=df_sample.value)


# 2 多级索引
# 01 构造多级索引 from_tuples,from_arrays,from_product
my_tuple = [('a','dog'),('a','cat'),('b','dog'),('b','cat')]
pd.MultiIndex.from_tuples(my_tuple, names = ['first','second'])

my_array = [list('aabb'),['dog','cat']*2]
pd.MultiIndex.from_arrays(my_array, names = ['first','second'])

my_list1 = ['a','b']
my_list2 = ['dog','cat']
pd.MultiIndex.from_product([my_list1, my_list2],names = ['first', 'second'])

# 02 多级索引及表结构
import numpy as np
np.random.seed(0)
multi_index = pd.MultiIndex.from_product([list('ABCD'),df.Gender.unique()], names = ('School','Gender'))
multi_column = pd.MultiIndex.from_product([['Height','Weight'],df.Grade.unique()], names = ('Indicator', 'Grade'))
df_multi = pd.DataFrame(np.c_[(np.random.randn(8,4)*5+163).tolist(),(np.random.randn(8,4)*5+65).tolist()],
                        index = multi_index, columns = multi_column).round(1)
df_multi

df_multi.index.names
df_multi.columns.names
df_multi.index.values
df_multi.columns.values
df_multi.index.get_level_values(0)

# 03 多级索引中loc索引器
df_multi = df.set_index(['School','Grade']) # 将学校和年级作为多级索引
df_multi = df_multi.sort_index() # 在索引前先排序
df_multi.loc[('Fudan University', 'Junior')].head()
df_multi.loc[[('Fudan University', 'Junior'),('Shanghai Jiao Tong University', 'Freshman')]].head()
df_multi.loc[df_multi.Weight > 70].head()
df_multi.loc[lambda x:('Fudan University', 'Junior')].head()

df_multi.loc[(['Peking University','Fudan University'],['Sophomore','Junior']),:] # 交叉索引
df_multi.loc[[('Peking University','Junior'),('Fudan University','Sophomore')]] # 跟上面的不一样

# 04 IndexSlice对象
np.random.seed(0)
l1, l2 = ['A', 'B', 'C'], ['a' , 'b', 'c']
mul_index1 = pd.MultiIndex.from_product([l1, l2], names = ('Upper', 'Lower'))
l3, l4 = ['D', 'E', 'F'], ['d', 'e', 'f']
mul_index2 = pd.MultiIndex.from_product([l3, l4], names = ('Big', 'Small'))
df_ex = pd.DataFrame(np.random.randint(-9,10,(9,9)), index = mul_index1, columns = mul_index2)
idx = pd.IndexSlice
df_ex.loc[idx['C':,('D','F'):]]
df_ex.loc[idx[:'A', lambda x:x.sum()>0]]


# 3 索引的常用方法
np.random.seed(0)
l1,l2,l3 = ['A','B'],['a','b'],['alpha','beta']
mul_index1 = pd.MultiIndex.from_product([l1,l2,l3],names=('Upper','Lower','Extra'))
l4,l5,l6 = ['C','D'],['c','d'],['cat','dog']
mul_index2 = pd.MultiIndex.from_product([l4,l5,l6],names=('Big','Small','Other'))
df_ex = pd.DataFrame(np.random.randint(-9,10,(8,8)), index=mul_index1, columns=mul_index2)
df_ex
# 01 交换
# swaplevel只能交换两个层，reorder_levels能交换任意层
df_ex.swaplevel(0,2,axis=1).head() # 列索引交换1，3层
df_ex.reorder_levels([2,0,1], axis=0).head()
# 02 删除
df_ex.droplevel(1,axis=1)
# 03 修改属性
# rename_axis 修改索引层名字，传入字典映射
df_ex.rename_axis(index={'Upper':'YY'},columns={'Other':'LL'}).head()
# rename 修改索引的值 多级索引需要指定level
df_ex.rename(columns={'cat':'not_cat'},level=2).head()
df_ex.rename(index=lambda x:str.upper(x),level=2).head()
# 用迭代器更改整个索引的元素
new_values = iter(list('abcdefgh'))
df_ex.rename(index=lambda x:next(new_values), level=2)
# map 修改某个位置的索引
df_temp = df_ex.copy()
new_idx = df_temp.index.map(lambda x:(x[0],x[1],str.upper(x[2])))
df_temp.index = new_idx
df_temp.head()
# map 压缩多级索引
df_temp = df_ex.copy()
new_idx = df_temp.index.map(lambda x:(x[0]+'-'+x[1]+'-'+x[2]))
df_temp.index = new_idx
df_temp.head()

new_idx = df_temp.index.map(lambda x:tuple(x.split('-')))
df_temp.index = new_idx
df_temp.head()
# 04 索引的设置与重置
df_new = pd.DataFrame({'A':list('aacd'),'B':list('PQRT'),'C':[1,2,3,4]})
df_new
# set_index 设置索引，append=True保留原索引
df_new.set_index('A')
df_new.set_index('A',append=True)
df_new.set_index(['A','B'])

my_index = pd.Series(list('YAOE'),name='D')
df_new = df_new.set_index(['A',my_index])
df_new
# reset_index 去掉索引，drop=True把去掉的索引层丢弃
df_new.reset_index(['D'])
df_new.reset_index(['D'],drop=True)
df_new.reset_index()
# 05 索引的变形
df_reindex = pd.DataFrame({'Weight':[60,70,80],'Height':[176,180,179]},index=['1001','1003','1002'])
df_reindex 
df_reindex.reindex(index=['1001','1002','1003','1004'], columns=['Weight','Gender'])

df_existed = pd.DataFrame(index=['1001','1002','1003','1004'],
                          columns=['Weight','Gender'])
df_reindex.reindex_like(df_existed)


# 4 索引运算
ds1 = pd.DataFrame([[0,1],[1,2],[3,4]],index=pd.Index(['a','b','c'],name='id1'))
ds2 = pd.DataFrame([[4,5],[2,6],[7,9]],index=pd.Index(['b','b','c'],name='id2'))
id1,id2 = ds1.index.unique(), ds2.index.unique()

id1.intersection(id2)
id1 & id2

id1.union(id2)
id1 | id2

id1.difference(id2)
(id1 ^ id2) & id1

id1.symmetric_difference(id2) # 交集外边的
id1 ^ id2

dsi1 = ds1.reset_index()
dsi2 = ds2.reset_index()
dsi1[dsi1.id1.isin(dsi2.id2)]


# Ex1 员工数据集
path = r'C:\Users\lenovo\Desktop\最近要用\pandas\joyful-pandas\data'
df = pd.read_csv('{}/company.csv'.format(path))
df.head(3)
# 1 分别只使用query()和loc选出年龄不超过四十岁且工作部门为Dairy或Bakery的男性
df.loc[(df.age <= 40) & (df.department.isin(['Dairy','Bakery'])) & (df.gender == 'M')]
df.query('(Grade not in ["Freshman","Sophomore"]) and'
         '(Gender == "Male")').head()
df.query('(age <= 40) and'
         '(department.isin(["Dairy","Bakery"])) and'
         '(gender == "M")')

dpt = ['Dairy', 'Bakery']
df.query("(age <= 40)&(department == @dpt)&(gender=='M')").head(3)
# 2 选出员工ID号为奇数所在行的第1、第3和倒数第2列
df.loc[df.EmployeeID%2 == 1].iloc[:,[0,2,-2]].head()
df.iloc[(df.EmployeeID%2==1).values,[0,2,-2]].head()
# 把后三列设为索引后交换内外两层
df0 = df.copy()
df0 = df0.set_index(df.columns[-3:].tolist()).swaplevel(0,2,axis=0)
df0.head(2)
# 恢复中间层索引
df0 = df0.reset_index(level=1)
# 修改外层索引名为Gender
df0 = df0.rename_axis(index={'gender':'Gender'})
# 用下划线合并两层行索引
newindex = df0.index.map(lambda x:(x[0]+'_'+x[1]))
# df_op.index = df_op.index.map(lambda x:'_'.join(x))
df0.index = newindex
# 把行索引拆分为原状态
newindex1 = df0.index.map(lambda x:tuple(x.split('_')))
df0.index = newindex1
# 修改索引名为原表名称
df0 = df0.rename_axis(index=['gender', 'department'])
# 恢复默认索引并将列保持为原表的相对位置
df0 = df0.reset_index().reindex(df.columns, axis=1)
df0.equals(df)


# Ex2 巧克力数据集
df = pd.read_csv('{}/chocolate.csv'.format(path))
df.head(2)
# 把列索引名中的换行符替换为空格。
df.columns = df.columns.map(lambda x: ' '.join(x.split('\r\n')))
# 巧克力Rating评分为1至5，每0.25分一档，请选出2.75分及以下且可可含量Cocoa Percent高于中位数的样本。
df['new Cocoa Percent'] = df['Cocoa Percent'].str[:2].astype(int)
df0 = df[(df.Rating <= 2.75) & (df['new Cocoa Percent'] > df['new Cocoa Percent'].median())]

# df['new Cocoa Percent'] = df['Cocoa Percent'].apply(lambda x:float(x[:-1])/100)
# df.query('(Rating<3)&(`Cocoa Percent`>`Cocoa Percent`.median())')

# 选出Review Date在2012年之后且Company Location不属于France、Canada、Amsterdam或Belgium的样本。
df.loc[(df["Review Date"]>2012) & 
       ~df["Company Location"].isin(["France", "Canada", "Amsterdam", "Belgium"])]