import numpy as np
import pandas as pd
df = pd. read_csv( r'D:\study\pandas\data\table.csv' , index_col= 'ID' )
df. head( )
School Class Gender Address Height Weight Math Physics ID 1101 S_1 C_1 M street_1 173 63 34.0 A+ 1102 S_1 C_1 F street_2 192 73 32.5 B+ 1103 S_1 C_1 M street_2 186 82 87.2 B+ 1104 S_1 C_1 F street_2 167 81 80.4 B- 1105 S_1 C_1 F street_4 159 64 84.8 B+
一、单级索引
df. loc[ 1103 ]
School S_1
Class C_1
Gender M
Address street_2
Height 186
Weight 82
Math 87.2
Physics B+
Name: 1103, dtype: object
df. loc[ [ 1102 , 2304 ] ]
School Class Gender Address Height Weight Math Physics ID 1102 S_1 C_1 F street_2 192 73 32.5 B+ 2304 S_2 C_3 F street_6 164 81 95.5 A-
df. loc[ 1304 : ] . head( )
School Class Gender Address Height Weight Math Physics ID 1304 S_1 C_3 M street_2 195 70 85.2 A 1305 S_1 C_3 F street_5 187 69 61.7 B- 2101 S_2 C_1 M street_7 174 84 83.3 C 2102 S_2 C_1 F street_6 161 61 50.6 B+ 2103 S_2 C_1 M street_4 157 61 52.5 B-
df. loc[ 2402 : : - 1 ] . head( )
School Class Gender Address Height Weight Math Physics ID 2402 S_2 C_4 M street_7 166 82 48.7 B 2401 S_2 C_4 F street_2 192 62 45.3 A 2305 S_2 C_3 M street_4 187 73 48.9 B 2304 S_2 C_3 F street_6 164 81 95.5 A- 2303 S_2 C_3 F street_7 190 99 65.9 C
df. loc[ : , 'Height' ] . head( )
ID
1101 173
1102 192
1103 186
1104 167
1105 159
Name: Height, dtype: int64
df. loc[ : , [ 'Height' , 'Math' ] ] . head( )
Height Math ID 1101 173 34.0 1102 192 32.5 1103 186 87.2 1104 167 80.4 1105 159 84.8
df. loc[ : , 'Height' : 'Math' ] . head( )
Height Weight Math ID 1101 173 63 34.0 1102 192 73 32.5 1103 186 82 87.2 1104 167 81 80.4 1105 159 64 84.8
df. loc[ 1102 : 2401 : 3 , 'Height' : 'Math' ] . head( )
Height Weight Math ID 1102 192 73 32.5 1105 159 64 84.8 1203 160 53 58.8 1301 161 68 31.5 1304 195 70 85.2
df. loc[ lambda x: x[ 'Gender' ] == 'M' ] . head( )
School Class Gender Address Height Weight Math Physics ID 1101 S_1 C_1 M street_1 173 63 34.0 A+ 1103 S_1 C_1 M street_2 186 82 87.2 B+ 1201 S_1 C_2 M street_5 188 68 97.0 A- 1203 S_1 C_2 M street_6 160 53 58.8 A+ 1301 S_1 C_3 M street_4 161 68 31.5 B+
def f ( x) :
return [ 1101 , 1103 ]
df. loc[ f]
School Class Gender Address Height Weight Math Physics ID 1101 S_1 C_1 M street_1 173 63 34.0 A+ 1103 S_1 C_1 M street_2 186 82 87.2 B+
df. loc[ df[ 'Address' ] . isin( [ 'street_7' , 'street_4' ] ) ] . head( )
School Class Gender Address Height Weight Math Physics ID 1105 S_1 C_1 F street_4 159 64 84.8 B+ 1202 S_1 C_2 F street_4 176 94 63.5 B- 1301 S_1 C_3 M street_4 161 68 31.5 B+ 1303 S_1 C_3 M street_7 188 82 49.7 B 2101 S_2 C_1 M street_7 174 84 83.3 C
df. loc[ [ True if i[ - 1 ] == '4' or i[ - 1 ] == '7' else False for i in df[ 'Address' ] . values] ] . head( )
School Class Gender Address Height Weight Math Physics ID 1105 S_1 C_1 F street_4 159 64 84.8 B+ 1202 S_1 C_2 F street_4 176 94 63.5 B- 1301 S_1 C_3 M street_4 161 68 31.5 B+ 1303 S_1 C_3 M street_7 188 82 49.7 B 2101 S_2 C_1 M street_7 174 84 83.3 C
df. iloc[ 3 ]
School S_1
Class C_1
Gender F
Address street_2
Height 167
Weight 81
Math 80.4
Physics B-
Name: 1104, dtype: object
df. iloc[ 3 : 5 ]
School Class Gender Address Height Weight Math Physics ID 1104 S_1 C_1 F street_2 167 81 80.4 B- 1105 S_1 C_1 F street_4 159 64 84.8 B+
df. iloc[ : , 3 ] . head( )
ID
1101 street_1
1102 street_2
1103 street_2
1104 street_2
1105 street_4
Name: Address, dtype: object
df. iloc[ : , 7 : : - 2 ] . head( )
Physics Weight Address Class ID 1101 A+ 63 street_1 C_1 1102 B+ 73 street_2 C_1 1103 B+ 82 street_2 C_1 1104 B- 81 street_2 C_1 1105 B+ 64 street_4 C_1
df. iloc[ 3 : : 4 , 7 : : - 2 ] . head( )
Physics Weight Address Class ID 1104 B- 81 street_2 C_1 1203 A+ 53 street_6 C_2 1302 A- 57 street_1 C_3 2101 C 84 street_7 C_1 2105 A 81 street_4 C_1
df. iloc[ lambda x: [ 3 ] ] . head( )
School Class Gender Address Height Weight Math Physics ID 1104 S_1 C_1 F street_2 167 81 80.4 B-
s = pd. Series( df[ 'Math' ] , index= df. index)
s[ 1101 ]
34.0
s[ 0 : 4 ]
ID
1101 34.0
1102 32.5
1103 87.2
1104 80.4
Name: Math, dtype: float64
s[ lambda x: x. index[ 16 : : - 6 ] ]
ID
2102 50.6
1301 31.5
1105 84.8
Name: Math, dtype: float64
s[ s> 80 ]
ID
1103 87.2
1104 80.4
1105 84.8
1201 97.0
1302 87.7
1304 85.2
2101 83.3
2205 85.4
2304 95.5
Name: Math, dtype: float64
df[ 1 : 2 ]
School Class Gender Address Height Weight Math Physics ID 1102 S_1 C_1 F street_2 192 73 32.5 B+
row = df. index. get_loc( 1102 )
df[ row: row+ 1 ]
School Class Gender Address Height Weight Math Physics ID 1102 S_1 C_1 F street_2 192 73 32.5 B+
df[ 3 : 5 ]
School Class Gender Address Height Weight Math Physics ID 1104 S_1 C_1 F street_2 167 81 80.4 B- 1105 S_1 C_1 F street_4 159 64 84.8 B+
df[ 'School' ] . head( )
ID
1101 S_1
1102 S_1
1103 S_1
1104 S_1
1105 S_1
Name: School, dtype: object
df[ [ 'School' , 'Math' ] ] . head( )
School Math ID 1101 S_1 34.0 1102 S_1 32.5 1103 S_1 87.2 1104 S_1 80.4 1105 S_1 84.8
df[ lambda x: [ 'Math' , 'Physics' ] ] . head( )
Math Physics ID 1101 34.0 A+ 1102 32.5 B+ 1103 87.2 B+ 1104 80.4 B- 1105 84.8 B+
df[ df[ 'Gender' ] == 'F' ] . head( )
School Class Gender Address Height Weight Math Physics ID 1102 S_1 C_1 F street_2 192 73 32.5 B+ 1104 S_1 C_1 F street_2 167 81 80.4 B- 1105 S_1 C_1 F street_4 159 64 84.8 B+ 1202 S_1 C_2 F street_4 176 94 63.5 B- 1204 S_1 C_2 F street_5 162 63 33.8 B
df[ ( df[ 'Gender' ] == 'F' ) & ( df[ 'Address' ] == 'street_2' ) ] . head( )
School Class Gender Address Height Weight Math Physics ID 1102 S_1 C_1 F street_2 192 73 32.5 B+ 1104 S_1 C_1 F street_2 167 81 80.4 B- 2401 S_2 C_4 F street_2 192 62 45.3 A 2404 S_2 C_4 F street_2 160 84 67.7 B
df[ ( df[ 'Math' ] > 85 ) | ( df[ 'Address' ] == 'street_7' ) ] . head( )
School Class Gender Address Height Weight Math Physics ID 1103 S_1 C_1 M street_2 186 82 87.2 B+ 1201 S_1 C_2 M street_5 188 68 97.0 A- 1302 S_1 C_3 F street_1 175 57 87.7 A- 1303 S_1 C_3 M street_7 188 82 49.7 B 1304 S_1 C_3 M street_2 195 70 85.2 A
df[ ~ ( ( df[ 'Math' ] > 75 ) | ( df[ 'Address' ] == 'street_1' ) ) ] . head( )
School Class Gender Address Height Weight Math Physics ID 1102 S_1 C_1 F street_2 192 73 32.5 B+ 1202 S_1 C_2 F street_4 176 94 63.5 B- 1203 S_1 C_2 M street_6 160 53 58.8 A+ 1204 S_1 C_2 F street_5 162 63 33.8 B 1205 S_1 C_2 F street_6 167 63 68.4 B-
df. loc[ df[ 'Math' ] > 60 , ( df[ : 8 ] [ 'Address' ] == 'street_6' ) . values] . head( )
Physics ID 1103 B+ 1104 B- 1105 B+ 1201 A- 1202 B-
df[ df[ 'Address' ] . isin( [ 'street_1' , 'street_4' ] )
& df[ 'Physics' ] . isin( [ 'A' , 'A+' ] ) ]
School Class Gender Address Height Weight Math Physics ID 1101 S_1 C_1 M street_1 173 63 34.0 A+ 2105 S_2 C_1 M street_4 170 81 34.2 A 2203 S_2 C_2 M street_4 155 91 73.8 A+
df[ df[ [ 'Address' , 'Physics' ] ] . isin(
{ 'Address' : [ 'street_1' , 'street_4' ] , 'Physics' : [ 'A' , 'A+' ] } ) . all ( 1 ) ]
School Class Gender Address Height Weight Math Physics ID 1101 S_1 C_1 M street_1 173 63 34.0 A+ 2105 S_2 C_1 M street_4 170 81 34.2 A 2203 S_2 C_2 M street_4 155 91 73.8 A+
display( df. at[ 1101 , 'School' ] )
display( df. loc[ 1101 , 'School' ] )
display( df. iat[ 0 , 0 ] )
display( df. iloc[ 0 , 0 ] )
% timeit df. at[ 1101 , 'School' ]
% timeit df. loc[ 1101 , 'School' ]
% timeit df. iat[ 0 , 0 ]
% timeit df. iloc[ 0 , 0 ]
'S_1'
'S_1'
'S_1'
'S_1'
5.62 µs ± 501 ns per loop (mean ± std. dev. of 7 runs, 100000 loops each)
11 µs ± 1.21 µs per loop (mean ± std. dev. of 7 runs, 100000 loops each)
6.85 µs ± 503 ns per loop (mean ± std. dev. of 7 runs, 100000 loops each)
12.5 µs ± 1.91 µs per loop (mean ± std. dev. of 7 runs, 100000 loops each)
pd. interval_range( start= 0 , end= 5 )
IntervalIndex([(0, 1], (1, 2], (2, 3], (3, 4], (4, 5]],
closed='right',
dtype='interval[int64]')
pd. interval_range( start= 0 , periods= 8 , freq= 5 )
IntervalIndex([(0, 5], (5, 10], (10, 15], (15, 20], (20, 25], (25, 30], (30, 35], (35, 40]],
closed='right',
dtype='interval[int64]')
math_interval= pd. cut( df[ 'Math' ] , bins= [ 0 , 40 , 60 , 80 , 100 ] )
math_interval. head( )
ID
1101 (0, 40]
1102 (0, 40]
1103 (80, 100]
1104 (80, 100]
1105 (80, 100]
Name: Math, dtype: category
Categories (4, interval[int64]): [(0, 40] < (40, 60] < (60, 80] < (80, 100]]
df_i = df. join( math_interval, rsuffix= '_interval' ) [ [ 'Math' , 'Math_interval' ] ] \
. reset_index( ) . set_index( 'Math_interval' )
df_i. head( )
ID Math Math_interval (0, 40] 1101 34.0 (0, 40] 1102 32.5 (80, 100] 1103 87.2 (80, 100] 1104 80.4 (80, 100] 1105 84.8
df_i. loc[ 65 ] . head( )
ID Math Math_interval (60, 80] 1202 63.5 (60, 80] 1205 68.4 (60, 80] 1305 61.7 (60, 80] 2104 72.2 (60, 80] 2202 68.5
df_i. loc[ [ 65 , 90 ] ] . head( )
ID Math Math_interval (60, 80] 1202 63.5 (60, 80] 1205 68.4 (60, 80] 1305 61.7 (60, 80] 2104 72.2 (60, 80] 2202 68.5
df_i[ df_i. index. astype( 'interval' ) . overlaps( pd. Interval( 70 , 85 ) ) ] . head( )
ID Math Math_interval (80, 100] 1103 87.2 (80, 100] 1104 80.4 (80, 100] 1105 84.8 (80, 100] 1201 97.0 (60, 80] 1202 63.5
二、多级索引
tuples = [ ( 'A' , 'a' ) , ( 'A' , 'b' ) , ( 'B' , 'a' ) , ( 'B' , 'b' ) ]
mul_index = pd. MultiIndex. from_tuples( tuples, names= ( 'Upper' , 'Lower' ) )
mul_index
MultiIndex([('A', 'a'),
('A', 'b'),
('B', 'a'),
('B', 'b')],
names=['Upper', 'Lower'])
pd. DataFrame( { 'score' : [ 'perfect' , 'good' , 'fair' , 'bad' ] } , index= mul_index)
score Upper Lower A a perfect b good B a fair b bad
L1 = list ( 'AABB' )
L2 = list ( 'abab' )
tuples = list ( zip ( L1, L2) )
mul_index = pd. MultiIndex. from_tuples( tuples, names= ( 'Upper' , 'Lower' ) )
pd. DataFrame( { 'Score' : [ 'perfect' , 'good' , 'fair' , 'bad' ] } , index= mul_index)
Score Upper Lower A a perfect b good B a fair b bad
arrays = [ [ 'A' , 'a' ] , [ 'A' , 'b' ] , [ 'B' , 'a' ] , [ 'B' , 'b' ] ]
mul_index = pd. MultiIndex. from_tuples( arrays, names= ( 'Upper' , 'Lower' ) )
pd. DataFrame( { 'Score' : [ 'perfect' , 'good' , 'fair' , 'bad' ] } , index= mul_index)
Score Upper Lower A a perfect b good B a fair b bad
mul_index
MultiIndex([('A', 'a'),
('A', 'b'),
('B', 'a'),
('B', 'b')],
names=['Upper', 'Lower'])
L1 = [ 'A' , 'B' ]
L2 = [ 'a' , 'b' ]
pd. MultiIndex. from_product( [ L1, L2] , names= ( 'Upper' , 'Lower' ) )
MultiIndex([('A', 'a'),
('A', 'b'),
('B', 'a'),
('B', 'b')],
names=['Upper', 'Lower'])
df_using_mul = df. set_index( [ 'Class' , 'Address' ] )
df_using_mul. head( )
School Gender Height Weight Math Physics Class Address C_1 street_1 S_1 M 173 63 34.0 A+ street_2 S_1 F 192 73 32.5 B+ street_2 S_1 M 186 82 87.2 B+ street_2 S_1 F 167 81 80.4 B- street_4 S_1 F 159 64 84.8 B+
df_using_mul. head( )
School Gender Height Weight Math Physics Class Address C_1 street_1 S_1 M 173 63 34.0 A+ street_2 S_1 F 192 73 32.5 B+ street_2 S_1 M 186 82 87.2 B+ street_2 S_1 F 167 81 80.4 B- street_4 S_1 F 159 64 84.8 B+
df_using_mul. sort_index( ) . loc[ 'C_2' , 'street_5' ]
School Gender Height Weight Math Physics Class Address C_2 street_5 S_1 M 188 68 97.0 A- street_5 S_1 F 162 63 33.8 B street_5 S_2 M 193 100 39.1 B
df_using_mul. sort_index( ) . loc[ ( 'C_2' , 'street_6' ) : ( 'C_3' , 'street_4' ) ]
School Gender Height Weight Math Physics Class Address C_2 street_6 S_1 M 160 53 58.8 A+ street_6 S_1 F 167 63 68.4 B- street_7 S_2 F 194 77 68.5 B+ street_7 S_2 F 183 76 85.4 B C_3 street_1 S_1 F 175 57 87.7 A- street_2 S_1 M 195 70 85.2 A street_4 S_1 M 161 68 31.5 B+ street_4 S_2 F 157 78 72.3 B+ street_4 S_2 M 187 73 48.9 B
df_using_mul. sort_index( ) . loc[ ( 'C_2' , 'street_7' ) : 'C_3' ] . head( )
School Gender Height Weight Math Physics Class Address C_2 street_7 S_2 F 194 77 68.5 B+ street_7 S_2 F 183 76 85.4 B C_3 street_1 S_1 F 175 57 87.7 A- street_2 S_1 M 195 70 85.2 A street_4 S_1 M 161 68 31.5 B+
df_using_mul. sort_index( ) . loc[ [ ( 'C_2' , 'street_7' ) , ( 'C_3' , 'street_2' ) ] ]
School Gender Height Weight Math Physics Class Address C_2 street_7 S_2 F 194 77 68.5 B+ street_7 S_2 F 183 76 85.4 B C_3 street_2 S_1 M 195 70 85.2 A
df_using_mul. sort_index( ) . loc[ ( [ 'C_2' , 'C_3' ] , [ 'street_4' , 'street_7' ] ) , : ]
School Gender Height Weight Math Physics Class Address C_2 street_4 S_1 F 176 94 63.5 B- street_4 S_2 M 155 91 73.8 A+ street_7 S_2 F 194 77 68.5 B+ street_7 S_2 F 183 76 85.4 B C_3 street_4 S_1 M 161 68 31.5 B+ street_4 S_2 F 157 78 72.3 B+ street_4 S_2 M 187 73 48.9 B street_7 S_1 M 188 82 49.7 B street_7 S_2 F 190 99 65.9 C
L1, L2= [ 'A' , 'B' , 'C' ] , [ 'a' , 'b' , 'c' ]
mul_index1 = pd. MultiIndex. from_product( [ L1, L2] , names= ( 'Upper' , 'Lower' ) )
L3, L4 = [ 'D' , 'E' , 'F' ] , [ 'd' , 'e' , 'f' ]
mul_index2 = pd. MultiIndex. from_product( [ L3, L4] , names= ( 'Big' , 'Small' ) )
df_s = pd. DataFrame( np. random. rand( 9 , 9 ) , index= mul_index1, columns= mul_index2)
df_s
Big D E F Small d e f d e f d e f Upper Lower A a 0.934746 0.827636 0.303160 0.456595 0.101167 0.448518 0.926865 0.246947 0.007101 b 0.784300 0.743920 0.914071 0.615905 0.465386 0.814935 0.820146 0.978239 0.075463 c 0.234725 0.530927 0.334103 0.784667 0.423466 0.715657 0.001406 0.106858 0.858155 B a 0.479080 0.170341 0.894153 0.727380 0.617945 0.224188 0.036767 0.502997 0.761644 b 0.036678 0.109176 0.771535 0.810377 0.475953 0.794977 0.019968 0.181414 0.917155 c 0.261726 0.681957 0.894073 0.985347 0.325102 0.128026 0.252363 0.501242 0.907674 C a 0.608485 0.850945 0.486963 0.750401 0.384161 0.677355 0.923022 0.535271 0.918066 b 0.737564 0.385485 0.020664 0.295569 0.787461 0.952917 0.746152 0.180481 0.802424 c 0.025335 0.531937 0.259362 0.505687 0.775790 0.174319 0.522157 0.011474 0.001091
idx= pd. IndexSlice
df_s. loc[ idx[ 'B' : , df_s[ 'D' ] [ 'd' ] > 0.3 ] , idx[ df_s. sum ( ) > 4 ] ]
Big D E F Small d e f d e f d f Upper Lower B a 0.479080 0.170341 0.894153 0.727380 0.617945 0.224188 0.036767 0.761644 C a 0.608485 0.850945 0.486963 0.750401 0.384161 0.677355 0.923022 0.918066 b 0.737564 0.385485 0.020664 0.295569 0.787461 0.952917 0.746152 0.802424
df_using_mul. head( )
School Gender Height Weight Math Physics Class Address C_1 street_1 S_1 M 173 63 34.0 A+ street_2 S_1 F 192 73 32.5 B+ street_2 S_1 M 186 82 87.2 B+ street_2 S_1 F 167 81 80.4 B- street_4 S_1 F 159 64 84.8 B+
df_using_mul. swaplevel( i= 1 , j= 0 , axis= 0 ) . sort_index( ) . head( )
School Gender Height Weight Math Physics Address Class street_1 C_1 S_1 M 173 63 34.0 A+ C_2 S_2 M 175 74 47.2 B- C_3 S_1 F 175 57 87.7 A- street_2 C_1 S_1 F 192 73 32.5 B+ C_1 S_1 M 186 82 87.2 B+
df_muls = df. set_index( [ 'School' , 'Class' , 'Address' ] )
df_muls. head( )
Gender Height Weight Math Physics School Class Address S_1 C_1 street_1 M 173 63 34.0 A+ street_2 F 192 73 32.5 B+ street_2 M 186 82 87.2 B+ street_2 F 167 81 80.4 B- street_4 F 159 64 84.8 B+
df_muls. reorder_levels( [ 2 , 0 , 1 ] , axis= 0 ) . sort_index( ) . head( )
Gender Height Weight Math Physics Address School Class street_1 S_1 C_1 M 173 63 34.0 A+ C_3 F 175 57 87.7 A- S_2 C_2 M 175 74 47.2 B- street_2 S_1 C_1 F 192 73 32.5 B+ C_1 M 186 82 87.2 B+
df_muls. reorder_levels( [ 'Address' , 'School' , 'Class' ] , axis= 0 ) . sort_index( ) . head( )
Gender Height Weight Math Physics Address School Class street_1 S_1 C_1 M 173 63 34.0 A+ C_3 F 175 57 87.7 A- S_2 C_2 M 175 74 47.2 B- street_2 S_1 C_1 F 192 73 32.5 B+ C_1 M 186 82 87.2 B+
三、索引设定
pd. read_csv( r'D:\study\pandas\data\table.csv' , index_col= [ 'Address' , 'School' ] ) . head( )
Class ID Gender Height Weight Math Physics Address School street_1 S_1 C_1 1101 M 173 63 34.0 A+ street_2 S_1 C_1 1102 F 192 73 32.5 B+ S_1 C_1 1103 M 186 82 87.2 B+ S_1 C_1 1104 F 167 81 80.4 B- street_4 S_1 C_1 1105 F 159 64 84.8 B+
df. head( )
School Class Gender Address Height Weight Math Physics ID 1101 S_1 C_1 M street_1 173 63 34.0 A+ 1102 S_1 C_1 F street_2 192 73 32.5 B+ 1103 S_1 C_1 M street_2 186 82 87.2 B+ 1104 S_1 C_1 F street_2 167 81 80.4 B- 1105 S_1 C_1 F street_4 159 64 84.8 B+
df. reindex( index= [ 1101 , 1203 , 1206 , 2402 ] )
School Class Gender Address Height Weight Math Physics ID 1101 S_1 C_1 M street_1 173.0 63.0 34.0 A+ 1203 S_1 C_2 M street_6 160.0 53.0 58.8 A+ 1206 NaN NaN NaN NaN NaN NaN NaN NaN 2402 S_2 C_4 M street_7 166.0 82.0 48.7 B
df. reindex( columns= [ 'Height' , 'Gender' , 'Average' ] ) . head( )
Height Gender Average ID 1101 173 M NaN 1102 192 F NaN 1103 186 M NaN 1104 167 F NaN 1105 159 F NaN
df. reindex( index= [ 1101 , 1203 , 1206 , 2402 ] , method= 'bfill' )
School Class Gender Address Height Weight Math Physics ID 1101 S_1 C_1 M street_1 173 63 34.0 A+ 1203 S_1 C_2 M street_6 160 53 58.8 A+ 1206 S_1 C_3 M street_4 161 68 31.5 B+ 2402 S_2 C_4 M street_7 166 82 48.7 B
df. reindex( index= [ 1101 , 1203 , 1206 , 2402 ] , method= 'nearest' )
School Class Gender Address Height Weight Math Physics ID 1101 S_1 C_1 M street_1 173 63 34.0 A+ 1203 S_1 C_2 M street_6 160 53 58.8 A+ 1206 S_1 C_2 F street_6 167 63 68.4 B- 2402 S_2 C_4 M street_7 166 82 48.7 B
df_temp = pd. DataFrame( { 'Weight' : np. zeros( 5 ) ,
'Height' : np. zeros( 5 ) ,
'ID' : [ 1101 , 1104 , 1103 , 1106 , 1102 ] } ) . set_index( 'ID' )
df_temp. reindex_like( df[ 0 : 5 ] [ [ 'Weight' , 'Height' ] ] )
Weight Height ID 1101 0.0 0.0 1102 0.0 0.0 1103 0.0 0.0 1104 0.0 0.0 1105 NaN NaN
df_temp = pd. DataFrame( { 'Weight' : range ( 5 ) ,
'Height' : range ( 5 ) ,
'ID' : [ 1101 , 1104 , 1103 , 1106 , 1102 ] } ) . set_index( 'ID' ) . sort_index( )
df_temp. reindex_like( df[ 0 : 5 ] [ [ 'Weight' , 'Height' ] ] , method= 'bfill' )
Weight Height ID 1101 0 0 1102 4 4 1103 2 2 1104 1 1 1105 3 3
df. head( )
School Class Gender Address Height Weight Math Physics ID 1101 S_1 C_1 M street_1 173 63 34.0 A+ 1102 S_1 C_1 F street_2 192 73 32.5 B+ 1103 S_1 C_1 M street_2 186 82 87.2 B+ 1104 S_1 C_1 F street_2 167 81 80.4 B- 1105 S_1 C_1 F street_4 159 64 84.8 B+
df. set_index( 'Class' ) . head( )
School Gender Address Height Weight Math Physics Class C_1 S_1 M street_1 173 63 34.0 A+ C_1 S_1 F street_2 192 73 32.5 B+ C_1 S_1 M street_2 186 82 87.2 B+ C_1 S_1 F street_2 167 81 80.4 B- C_1 S_1 F street_4 159 64 84.8 B+
df. set_index( 'Class' , append= True ) . head( )
School Gender Address Height Weight Math Physics ID Class 1101 C_1 S_1 M street_1 173 63 34.0 A+ 1102 C_1 S_1 F street_2 192 73 32.5 B+ 1103 C_1 S_1 M street_2 186 82 87.2 B+ 1104 C_1 S_1 F street_2 167 81 80.4 B- 1105 C_1 S_1 F street_4 159 64 84.8 B+
df. set_index( pd. Series( range ( df. shape[ 0 ] ) ) ) . head( )
School Class Gender Address Height Weight Math Physics 0 S_1 C_1 M street_1 173 63 34.0 A+ 1 S_1 C_1 F street_2 192 73 32.5 B+ 2 S_1 C_1 M street_2 186 82 87.2 B+ 3 S_1 C_1 F street_2 167 81 80.4 B- 4 S_1 C_1 F street_4 159 64 84.8 B+
df. set_index( [ pd. Series( range ( df. shape[ 0 ] ) ) , pd. Series( np. ones( df. shape[ 0 ] ) ) ] ) . head( )
School Class Gender Address Height Weight Math Physics 0 1.0 S_1 C_1 M street_1 173 63 34.0 A+ 1 1.0 S_1 C_1 F street_2 192 73 32.5 B+ 2 1.0 S_1 C_1 M street_2 186 82 87.2 B+ 3 1.0 S_1 C_1 F street_2 167 81 80.4 B- 4 1.0 S_1 C_1 F street_4 159 64 84.8 B+
df. reset_index( ) . head( )
ID School Class Gender Address Height Weight Math Physics 0 1101 S_1 C_1 M street_1 173 63 34.0 A+ 1 1102 S_1 C_1 F street_2 192 73 32.5 B+ 2 1103 S_1 C_1 M street_2 186 82 87.2 B+ 3 1104 S_1 C_1 F street_2 167 81 80.4 B- 4 1105 S_1 C_1 F street_4 159 64 84.8 B+
L1, L2 = [ 'A' , 'B' , 'C' ] , [ 'a' , 'b' , 'c' ]
mul_index1 = pd. MultiIndex. from_product( [ L1, L2] , names= ( 'Upper' , 'Lower' ) )
L3, L4 = [ 'D' , 'E' , 'F' ] , [ 'd' , 'e' , 'f' ]
mul_index2 = pd. MultiIndex. from_product( [ L3, L4] , names= ( 'Big' , 'Small' ) )
df_temp = pd. DataFrame( np. random. rand( 9 , 9 ) , index= mul_index1, columns= mul_index2)
df_temp. head( )
Big D E F Small d e f d e f d e f Upper Lower A a 0.726447 0.793006 0.506717 0.899841 0.213879 0.452684 0.890727 0.696849 0.474411 b 0.139219 0.615655 0.589970 0.463351 0.554888 0.075318 0.690641 0.043552 0.783326 c 0.028288 0.091246 0.045964 0.223654 0.344145 0.173138 0.345186 0.214551 0.404022 B a 0.016625 0.801064 0.002058 0.042780 0.484272 0.736578 0.800087 0.391336 0.918262 b 0.954563 0.232570 0.170594 0.440004 0.957831 0.223091 0.990364 0.453247 0.097994
df_temp1 = df_temp. reset_index( level= 1 , col_level= 1 )
df_temp1. head( )
Big D E F Small Lower d e f d e f d e f Upper A a 0.726447 0.793006 0.506717 0.899841 0.213879 0.452684 0.890727 0.696849 0.474411 A b 0.139219 0.615655 0.589970 0.463351 0.554888 0.075318 0.690641 0.043552 0.783326 A c 0.028288 0.091246 0.045964 0.223654 0.344145 0.173138 0.345186 0.214551 0.404022 B a 0.016625 0.801064 0.002058 0.042780 0.484272 0.736578 0.800087 0.391336 0.918262 B b 0.954563 0.232570 0.170594 0.440004 0.957831 0.223091 0.990364 0.453247 0.097994
df_temp1. index
Index(['A', 'A', 'A', 'B', 'B', 'B', 'C', 'C', 'C'], dtype='object', name='Upper')
df_temp. rename_axis( index= { 'Lower' : 'LowerLower' } , columns= { 'Big' , 'BigBig' } )
Big D E F BigBig d e f d e f d e f Upper LowerLower A a 0.726447 0.793006 0.506717 0.899841 0.213879 0.452684 0.890727 0.696849 0.474411 b 0.139219 0.615655 0.589970 0.463351 0.554888 0.075318 0.690641 0.043552 0.783326 c 0.028288 0.091246 0.045964 0.223654 0.344145 0.173138 0.345186 0.214551 0.404022 B a 0.016625 0.801064 0.002058 0.042780 0.484272 0.736578 0.800087 0.391336 0.918262 b 0.954563 0.232570 0.170594 0.440004 0.957831 0.223091 0.990364 0.453247 0.097994 c 0.579913 0.233098 0.982724 0.684670 0.940787 0.330079 0.043015 0.568728 0.284846 C a 0.955809 0.318091 0.370619 0.214261 0.548656 0.338555 0.861426 0.256495 0.601147 b 0.081233 0.000421 0.989728 0.927117 0.286745 0.458387 0.381208 0.342766 0.384358 c 0.989351 0.935747 0.003250 0.115069 0.601064 0.488368 0.890740 0.712704 0.730693
df_temp. rename( index= { 'A' : 'T' } , columns= { 'e' : 'changed_e' } ) . head( )
Big D E F Small d changed_e f d changed_e f d changed_e f Upper Lower T a 0.726447 0.793006 0.506717 0.899841 0.213879 0.452684 0.890727 0.696849 0.474411 b 0.139219 0.615655 0.589970 0.463351 0.554888 0.075318 0.690641 0.043552 0.783326 c 0.028288 0.091246 0.045964 0.223654 0.344145 0.173138 0.345186 0.214551 0.404022 B a 0.016625 0.801064 0.002058 0.042780 0.484272 0.736578 0.800087 0.391336 0.918262 b 0.954563 0.232570 0.170594 0.440004 0.957831 0.223091 0.990364 0.453247 0.097994
四、常用索引型函数
df. head( )
School Class Gender Address Height Weight Math Physics ID 1101 S_1 C_1 M street_1 173 63 34.0 A+ 1102 S_1 C_1 F street_2 192 73 32.5 B+ 1103 S_1 C_1 M street_2 186 82 87.2 B+ 1104 S_1 C_1 F street_2 167 81 80.4 B- 1105 S_1 C_1 F street_4 159 64 84.8 B+
df. where( df[ 'Gender' ] == 'M' ) . head( )
School Class Gender Address Height Weight Math Physics ID 1101 S_1 C_1 M street_1 173.0 63.0 34.0 A+ 1102 NaN NaN NaN NaN NaN NaN NaN NaN 1103 S_1 C_1 M street_2 186.0 82.0 87.2 B+ 1104 NaN NaN NaN NaN NaN NaN NaN NaN 1105 NaN NaN NaN NaN NaN NaN NaN NaN
df. where( df[ 'Gender' ] == 'M' ) . dropna( ) . head( )
School Class Gender Address Height Weight Math Physics ID 1101 S_1 C_1 M street_1 173.0 63.0 34.0 A+ 1103 S_1 C_1 M street_2 186.0 82.0 87.2 B+ 1201 S_1 C_2 M street_5 188.0 68.0 97.0 A- 1203 S_1 C_2 M street_6 160.0 53.0 58.8 A+ 1301 S_1 C_3 M street_4 161.0 68.0 31.5 B+
df. where( df[ 'Gender' ] == 'M' , np. random. rand( df. shape[ 0 ] , df. shape[ 1 ] ) ) . head( )
School Class Gender Address Height Weight Math Physics ID 1101 S_1 C_1 M street_1 173.000000 63.000000 34.000000 A+ 1102 0.0088742 0.68751 0.17843 0.538873 0.795946 0.860084 0.437721 0.929262 1103 S_1 C_1 M street_2 186.000000 82.000000 87.200000 B+ 1104 0.0508425 0.213038 0.734898 0.1413 0.441188 0.516610 0.896787 0.110277 1105 0.849749 0.775833 0.435329 0.947705 0.050361 0.619292 0.829872 0.963952
df. mask( df[ 'Gender' ] == 'M' ) . dropna( ) . head( )
School Class Gender Address Height Weight Math Physics ID 1102 S_1 C_1 F street_2 192.0 73.0 32.5 B+ 1104 S_1 C_1 F street_2 167.0 81.0 80.4 B- 1105 S_1 C_1 F street_4 159.0 64.0 84.8 B+ 1202 S_1 C_2 F street_4 176.0 94.0 63.5 B- 1204 S_1 C_2 F street_5 162.0 63.0 33.8 B
df. mask( df[ 'Gender' ] == 'M' , np. random. rand( df. shape[ 0 ] , df. shape[ 1 ] ) ) . head( )
School Class Gender Address Height Weight Math Physics ID 1101 0.645595 0.141115 0.0113081 0.346616 0.399427 0.367468 0.260131 0.32254 1102 S_1 C_1 F street_2 192.000000 73.000000 32.500000 B+ 1103 0.0284472 0.513403 0.0249496 0.405156 0.640764 0.468473 0.128766 0.553058 1104 S_1 C_1 F street_2 167.000000 81.000000 80.400000 B- 1105 S_1 C_1 F street_4 159.000000 64.000000 84.800000 B+
df. head( )
School Class Gender Address Height Weight Math Physics ID 1101 S_1 C_1 M street_1 173 63 34.0 A+ 1102 S_1 C_1 F street_2 192 73 32.5 B+ 1103 S_1 C_1 M street_2 186 82 87.2 B+ 1104 S_1 C_1 F street_2 167 81 80.4 B- 1105 S_1 C_1 F street_4 159 64 84.8 B+
df. query( '(Address in ["street_6","street_7"])&(Weight>(70+10))&(ID in [1303,2304,2402])' )
School Class Gender Address Height Weight Math Physics ID 1303 S_1 C_3 M street_7 188 82 49.7 B 2304 S_2 C_3 F street_6 164 81 95.5 A- 2402 S_2 C_4 M street_7 166 82 48.7 B
五、重复元素处理
df. duplicated( 'Class' ) . head( )
ID
1101 False
1102 True
1103 True
1104 True
1105 True
dtype: bool
df. duplicated( 'Class' , keep= 'last' ) . tail( )
ID
2401 True
2402 True
2403 True
2404 True
2405 False
dtype: bool
df. duplicated( 'Class' , keep= False ) . head( )
ID
1101 True
1102 True
1103 True
1104 True
1105 True
dtype: bool
df. drop_duplicates( 'Class' )
School Class Gender Address Height Weight Math Physics ID 1101 S_1 C_1 M street_1 173 63 34.0 A+ 1201 S_1 C_2 M street_5 188 68 97.0 A- 1301 S_1 C_3 M street_4 161 68 31.5 B+ 2401 S_2 C_4 F street_2 192 62 45.3 A
df. drop_duplicates( 'Class' , keep= 'last' )
School Class Gender Address Height Weight Math Physics ID 2105 S_2 C_1 M street_4 170 81 34.2 A 2205 S_2 C_2 F street_7 183 76 85.4 B 2305 S_2 C_3 M street_4 187 73 48.9 B 2405 S_2 C_4 F street_6 193 54 47.6 B
df. drop_duplicates( [ 'School' , 'Class' ] )
School Class Gender Address Height Weight Math Physics ID 1101 S_1 C_1 M street_1 173 63 34.0 A+ 1201 S_1 C_2 M street_5 188 68 97.0 A- 1301 S_1 C_3 M street_4 161 68 31.5 B+ 2101 S_2 C_1 M street_7 174 84 83.3 C 2201 S_2 C_2 M street_5 193 100 39.1 B 2301 S_2 C_3 F street_4 157 78 72.3 B+ 2401 S_2 C_4 F street_2 192 62 45.3 A
六、抽样函数(sample)
df. sample( n= 5 )
School Class Gender Address Height Weight Math Physics ID 2301 S_2 C_3 F street_4 157 78 72.3 B+ 1202 S_1 C_2 F street_4 176 94 63.5 B- 2104 S_2 C_1 F street_5 159 97 72.2 B+ 1101 S_1 C_1 M street_1 173 63 34.0 A+ 2401 S_2 C_4 F street_2 192 62 45.3 A
df. sample( frac= 0.05 )
School Class Gender Address Height Weight Math Physics ID 1301 S_1 C_3 M street_4 161 68 31.5 B+ 2301 S_2 C_3 F street_4 157 78 72.3 B+
df. sample( n= df. shape[ 0 ] , replace= True ) . head( )
School Class Gender Address Height Weight Math Physics ID 1103 S_1 C_1 M street_2 186 82 87.2 B+ 2402 S_2 C_4 M street_7 166 82 48.7 B 2402 S_2 C_4 M street_7 166 82 48.7 B 1203 S_1 C_2 M street_6 160 53 58.8 A+ 1103 S_1 C_1 M street_2 186 82 87.2 B+
df. sample( n= 35 , replace= True ) . index. is_unique
False
df. sample( n= 3 , axis= 1 ) . head( )
Math Weight Physics ID 1101 34.0 63 A+ 1102 32.5 73 B+ 1103 87.2 82 B+ 1104 80.4 81 B- 1105 84.8 64 B+
df. sample( n= 3 , weights= np. random. rand( df. shape[ 0 ] ) ) . head( )
School Class Gender Address Height Weight Math Physics ID 1101 S_1 C_1 M street_1 173 63 34.0 A+ 2205 S_2 C_2 F street_7 183 76 85.4 B 2102 S_2 C_1 F street_6 161 61 50.6 B+
df. sample( n= 3 , weights= df[ 'Math' ] ) . head( )
School Class Gender Address Height Weight Math Physics ID 1201 S_1 C_2 M street_5 188 68 97.0 A- 2304 S_2 C_3 F street_6 164 81 95.5 A- 2301 S_2 C_3 F street_4 157 78 72.3 B+
七、问题与练习
#【问题一】 如何更改列或行的顺序?如何交换奇偶行(列)的顺序? #【问题二】 如果要选出DataFrame的某个子集,请给出尽可能多的方法实现。
#【问题三】 query函数比其他索引方法的速度更慢吗?在什么场合使用什么索引最高效? 1 #【问题四】 单级索引能使用Slice对象吗?能的话怎么使用,请给出一个例子。
#【问题五】 如何快速找出某一列的缺失值所在索引? #【问题六】 索引设定中的所有方法分别适用于哪些场合?怎么直接把某个DataFrame的索引换成任意给定同长度的索引? #【问题七】 多级索引有什么适用场合? #【问题八】 什么时候需要重复元素处理?
df= pd . read_csv( r'D:\study\pandas\data\UFO.csv' )
df. rename( columns= { 'duration (seconds)' : 'duration' } , inplace= True )
df. head( )
datetime shape duration latitude longitude 0 10/10/1949 20:30 cylinder 2700.0 29.883056 -97.941111 1 10/10/1949 21:00 light 7200.0 29.384210 -98.581082 2 10/10/1955 17:00 circle 20.0 53.200000 -2.916667 3 10/10/1956 21:00 circle 20.0 28.978333 -96.645833 4 10/10/1960 20:00 light 900.0 21.418056 -157.803611
df. query( 'duration > 60' ) [ 'shape' ] . value_counts( ) . index[ 0 ]
'light'
bins_long = np. linspace( - 180 , 180 , 13 ) . tolist( )
bins_la = np. linspace( - 90 , 90 , 11 ) . tolist( )
cuts_long = pd. cut( df[ 'longitude' ] , bins= bins_long)
df[ 'cuts_long' ] = cuts_long
cuts_la = pd. cut( df[ 'latitude' ] , bins= bins_la)
df[ 'cuts_la' ] = cuts_la
df. head( )
datetime shape duration latitude longitude cuts_long cuts_la 0 10/10/1949 20:30 cylinder 2700.0 29.883056 -97.941111 (-120.0, -90.0] (18.0, 36.0] 1 10/10/1949 21:00 light 7200.0 29.384210 -98.581082 (-120.0, -90.0] (18.0, 36.0] 2 10/10/1955 17:00 circle 20.0 53.200000 -2.916667 (-30.0, 0.0] (36.0, 54.0] 3 10/10/1956 21:00 circle 20.0 28.978333 -96.645833 (-120.0, -90.0] (18.0, 36.0] 4 10/10/1960 20:00 light 900.0 21.418056 -157.803611 (-180.0, -150.0] (18.0, 36.0]
df. set_index( [ 'cuts_long' , 'cuts_la' ] ) . index. value_counts( ) . head( )
((-90.0, -60.0], (36.0, 54.0]) 27891
((-120.0, -90.0], (18.0, 36.0]) 14280
((-120.0, -90.0], (36.0, 54.0]) 11960
((-90.0, -60.0], (18.0, 36.0]) 9923
((-150.0, -120.0], (36.0, 54.0]) 9658
dtype: int64
df= pd. read_csv( r'D:\study\pandas\data\Pokemon.csv' )
df. head( )
# Name Type 1 Type 2 Total HP Attack Defense Sp. Atk Sp. Def Speed Generation Legendary 0 1 Bulbasaur Grass Poison 318 45 49 49 65 65 45 1 False 1 2 Ivysaur Grass Poison 405 60 62 63 80 80 60 1 False 2 3 Venusaur Grass Poison 525 80 82 83 100 100 80 1 False 3 3 VenusaurMega Venusaur Grass Poison 625 80 100 123 122 120 80 1 False 4 4 Charmander Fire NaN 309 39 52 43 60 50 65 1 False
df[ 'Type 2' ] . count( ) / df. shape[ 0 ]
0.5175
df. query( 'Total>=580' ) [ 'Legendary' ] . value_counts( normalize= True )
True 0.575221
False 0.424779
Name: Legendary, dtype: float64
df[ df[ 'Type 1' ] == 'Fighting' ] . sort_values( by= 'Attack' , ascending= False ) . iloc[ : 3 ]
# Name Type 1 Type 2 Total HP Attack Defense Sp. Atk Sp. Def Speed Generation Legendary 498 448 LucarioMega Lucario Fighting Steel 625 70 145 88 140 70 112 4 False 594 534 Conkeldurr Fighting NaN 505 105 140 95 55 65 45 5 False 74 68 Machamp Fighting NaN 505 90 130 80 65 85 55 1 False
df[ 'range' ] = df. iloc[ : , 5 : 11 ] . max ( axis= 1 ) - df. iloc[ : , 5 : 11 ] . min ( axis= 1 )
attribute= df[ [ 'Type 1' , 'range' ] ] . set_index( 'Type 1' )
max_range = 0
result = ' '
for i in attribute. index. unique( ) :
temp = attribute. loc[ i, : ] . mean( )
if temp. values[ 0 ] > max_range:
max_range = temp. values[ 0 ]
result = i
result
'Steel'
df. query( 'Legendary ==True' ) [ 'Type 1' ] . value_counts( normalize= True ) . index[ 0 ]
'Psychic'
attribute = df. query( 'Legendary == True' ) [ [ 'Type 1' , 'Total' ] ] . set_index( 'Type 1' )
max_value = 0
result = ''
for i in attribute. index. unique( ) :
temp = float ( attribute. loc[ i, : ] . mean( ) )
if temp > max_value:
max_value = temp
result = i
result
'Normal'