import pandas as pd
import numpy as np
Series和DataFrame提供了丰富的下标存取方法:既可以直接使用[],也可以使用如下存取器:
.loc[]、.iloc[]、.at[]、.iat[]以及.ix[](注意:.ix[]自pandas 0.20起已弃用,并在pandas 1.0中被移除,请改用.loc[]或.iloc[])
下标存取
# Fix the seed so the random demo table is identical on every run.
np.random.seed(42)

# 5 x 3 table of random integers drawn from [0, 10), with string labels on
# both axes so that label-based and position-based indexing can be compared.
row_labels = ["r1", "r2", "r3", "r4", "r5"]
column_labels = ["c1", "c2", "c3"]
random_digits = np.random.randint(0, 10, size=(5, 3))
df = pd.DataFrame(random_digits, index=row_labels, columns=column_labels)
[]操作符
# Demonstrate plain [] slicing on a DataFrame: an integer slice picks rows by
# position, a label slice picks rows by index label; both give a DataFrame.
separator = "*" * 50

print(df)
print("df的类型为;{}".format(type(df)))
print(separator)

rows_by_position = df[2:4]
print(rows_by_position)
print("df[2:4]的类型为;{}".format(type(rows_by_position)))
print(separator)

rows_by_label = df["r2":"r4"]
print(rows_by_label)
print('df["r2":"r4"]的类型为;{}'.format(type(rows_by_label)))
c1 c2 c3
r1 6 3 7
r2 4 6 9
r3 2 6 7
r4 4 3 7
r5 7 2 5
df的类型为;<class 'pandas.core.frame.DataFrame'>
**************************************************
c1 c2 c3
r3 2 6 7
r4 4 3 7
df[2:4]的类型为;<class 'pandas.core.frame.DataFrame'>
**************************************************
c1 c2 c3
r2 4 6 9
r3 2 6 7
r4 4 3 7
df["r2":"r4"]的类型为;<class 'pandas.core.frame.DataFrame'>
# Boolean-mask row filter: df.c1 > 4 gives one True/False per row, and
# indexing with it keeps only the rows whose c1 value exceeds 4.
df[ df. c1 > 4 ]
# The filtered result is still a DataFrame (see the printed type).
print ( "df[df.c1>4]的类型:{}" . format ( type ( df[ df. c1> 4 ] ) ) )
df[df.c1>4]的类型:<class 'pandas.core.frame.DataFrame'>
# Element-wise mask built from the whole frame: the shape is preserved and
# every cell that fails the test (value <= 2) shows up as NaN in the result.
df[ df > 2 ]
c1 c2 c3 r1 6 3 7 r2 4 6 9 r3 nan 6 7 r4 4 3 7 r5 7 nan 5
# Same element-wise masking with >= 2; every value in this frame passes the
# test, so the result contains no NaN.
df[ df >= 2 ]
c1 c2 c3 r1 6 3 7 r2 4 6 9 r3 2 6 7 r4 4 3 7 r5 7 2 5
.loc[]和.iloc[]存取器
# Label-based access with a single row label: returns that row as a Series
# indexed by the column names.
df. loc[ "r2" ]
c1 4
c2 6
c3 9
Name: r2, dtype: int32
# A row label plus a column label returns the single value stored in that cell.
df. loc[ "r2" , "c2" ]
6
# A list of row labels selects those rows (all columns).
df. loc[ [ "r2" , "r3" ] ]
# Lists on both axes select the listed rows restricted to the listed columns.
df. loc[ [ "r2" , "r3" ] , [ "c1" , "c2" ] ]
# Label slice on the rows ("r2" through "r4"; pandas label slices include the
# end label) combined with a list of column labels.
df. loc[ "r2" : "r4" , [ "c2" , "c3" ] ]
# Boolean Series as the row selector: rows where c1 > 2, columns c1 and c2.
df. loc[ df. c1> 2 , [ "c1" , "c2" ] ]
# Position-based access: integer 2 selects the third row ("r3") as a Series.
df. iloc[ 2 ]
c1 2
c2 6
c3 7
Name: r3, dtype: int32
# A list of integer positions selects those rows (all columns).
df. iloc[ [ 2 , 4 ] ]
# Integer position lists on both axes: rows 1 and 3, columns 0 and 2.
df. iloc[ [ 1 , 3 ] , [ 0 , 2 ] ]
# Positional slice on the rows (2 up to, but not including, 4) with a list of
# column positions.
df. iloc[ 2 : 4 , [ 0 , 2 ] ]
# Boolean row selection by position: .values turns the comparison into a plain
# NumPy boolean array before it is handed to .iloc.
df. iloc[ df. c1. values> 2 , [ 0 , 1 ] ]
D:\installation\anaconda3\lib\site-packages\ipykernel_launcher.py:1: FutureWarning: .ix is deprecated. Please use .loc for label based indexing or .iloc for positional indexing
See the documentation here: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#ix-indexer-is-deprecated “”"Entry point for launching an IPython kernel. D:\installation\anaconda3\lib\site-packages\pandas\core\indexing.py:822: FutureWarning: .ix is deprecated. Please use .loc for label based indexing or .iloc for positional indexing
See the documentation here: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#ix-indexer-is-deprecated retval = getattr(retval, self.name)._getitem_axis(key, axis=i)
获取单个值
.at[]和.iat[]分别使用标签和整数下标获取单个值。此外,get_value()和.at[]类似,但其执行速度会更快;需要注意的是,get_value()自pandas 0.21起已弃用,并在pandas 1.0中被移除,新代码请使用.at[]或.iat[]。
# Display the full frame again as a reference for the single-value lookups.
df
c1 c2 c3 r1 6 3 7 r2 4 6 9 r3 2 6 7 r4 4 3 7 r5 7 2 5
# .at[] takes one row label and one column label and returns a single value.
df. at[ "r2" , "c2" ]
6
# .iat[] is the positional counterpart of .at[]: row 1, column 1 is the same
# cell as ("r2", "c2").
df. iat[ 1 , 1 ]
6
如果希望获取两个列表中每一对标签所对应的元素,可以使用.lookup()(该方法自pandas 1.2起弃用,并在pandas 2.0中被移除,新版本需改用基于位置的NumPy索引),结果返回一个由指定元素组成的数组。
# Pairwise lookup: fetch the cell for each (row label, column label) pair.
# DataFrame.lookup() was deprecated in pandas 1.2 and removed in pandas 2.0,
# so the labels are translated to integer positions and the values are picked
# with NumPy fancy indexing. get_loc() raises KeyError for an unknown label.
wanted_rows = ["r2", "r4", "r3"]
wanted_cols = ["c1", "c2", "c1"]
row_positions = [df.index.get_loc(label) for label in wanted_rows]
col_positions = [df.columns.get_loc(label) for label in wanted_cols]
df.to_numpy()[row_positions, col_positions]
array([4, 3, 2])
# Two-level index made from every (class1, class2) combination: 3 x 2 = 6 labels.
level_values = [["A", "B", "C"], ["x", "y"]]
midx = pd.MultiIndex.from_product(level_values, names=["class1", "class2"])

# Square frame of random integers in [0, 10) that uses the same MultiIndex
# for both its rows and its columns.
side = len(midx)
random_cells = np.random.randint(0, 10, size=(side, side))
df12 = pd.DataFrame(random_cells, index=midx, columns=midx)
df12
class1 A B C class2 x y x y x y class1 class2 A x 0 3 1 7 3 1 y 5 5 9 3 5 1 B x 9 1 9 3 7 6 y 8 7 4 1 4 7 C x 9 8 8 0 8 6 y 8 7 0 7 7 2
# A frame with MultiIndex rows and columns is still an ordinary DataFrame.
print ( type ( df12) )
<class 'pandas.core.frame.DataFrame'>
多级标签的存取
# Load the soil samples. The first two CSV columns become a two-level row
# index (shown as Depth / Contour in the output) and the "Date" column is
# parsed into datetime values.
soil_df = pd.read_csv(
    "./data/Soils-simple.csv",
    index_col=[0, 1],
    parse_dates=["Date"],
)
soil_df
pH Dens Ca Conduc Date Name Depth Contour 0-10 Depression 5.4 0.98 11 1.5 2015-05-26 Lois Slope 5.5 1.1 12 2 2015-04-30 Roy Top 5.3 1 13 1.4 2015-05-21 Roy 10-30 Depression 4.9 1.4 7.5 5.5 2015-03-21 Lois Slope 5.3 1.3 9.5 4.9 2015-02-06 Diana Top 4.8 1.3 10 3.6 2015-04-11 Diana
# A single first-level label ("10-30") selects that Depth group; the matched
# level is dropped, so the result is indexed by Contour only.
soil_df. loc[ "10-30" , [ "pH" , "Ca" ] ]
pH Ca Contour Depression 4.9 7.5 Slope 5.3 9.5 Top 4.8 10
# np.s_[:, "Top"] builds the tuple (slice(None), "Top"): every Depth, but only
# the rows whose Contour is "Top". Both index levels are kept in the result.
soil_df. loc[ np. s_[ : , "Top" ] , [ "pH" , "Ca" ] ]
pH Ca Depth Contour 0-10 Top 5.3 13 10-30 Top 4.8 10
query()方法
有时候需要根据一定的条件对行进行过滤,通常需要先创建一个布尔数组,再使用这个数组获取True值所对应的行。示例如下:
soil_df[(soil_df.pH > 5) & (soil_df.Ca < 11)] 这是原始形式;使用query()可以写得更简洁:
# query() takes the row filter as an expression string; bare names such as
# pH and Ca refer to the columns of soil_df.
print ( soil_df. query( "pH > 5 and Ca < 11" ) )
pH Dens Ca Conduc Date Name
Depth Contour
0-10 Depression 5.4 0.98 11 1.5 2015-05-26 Lois
10-30 Slope 5.3 1.3 9.5 4.9 2015-02-06 Diana
query()的参数是一个运算表达式字符串。其中可以使用not、and、or等关键字进行向量布尔运算,表达式中的变量名代表与其对应的列。下面示例中的@符号用于在表达式中引用全局变量或局部变量的值。
# Thresholds kept in ordinary Python variables so the filter can be changed
# without editing the query string.
pH_low = 5
Ca_hi = 11
# The @ prefix makes query() read pH_low / Ca_hi from the surrounding Python
# scope instead of treating them as column names.
print ( soil_df. query( "pH > @pH_low and Ca < @Ca_hi" ) )
pH Dens Ca Conduc Date Name
Depth Contour
0-10 Depression 5.4 0.98 11 1.5 2015-05-26 Lois
10-30 Slope 5.3 1.3 9.5 4.9 2015-02-06 Diana