import numpy as np
import pandas as pd
df= pd. read_csv( 'E:\jupyter Notebook\天池比赛\joyful-pandas-master\data\\table.csv' , index_col= 'ID' )
df. head( 20 )
Unnamed: 0 School Class Gender Address Height Weight Math Physics ID 1101 0 S_1 C_1 M street_1 173 63 34.0 A+ 1102 1 S_1 C_1 F street_2 192 73 32.5 B+ 1103 2 S_1 C_1 M street_2 186 82 87.2 B+ 1104 3 S_1 C_1 F street_2 167 81 80.4 B- 1105 4 S_1 C_1 F street_4 159 64 84.8 B+ 1201 5 S_1 C_2 M street_5 188 68 97.0 A- 1202 6 S_1 C_2 F street_4 176 94 63.5 B- 1203 7 S_1 C_2 M street_6 160 53 58.8 A+ 1204 8 S_1 C_2 F street_5 162 63 33.8 B 1205 9 S_1 C_2 F street_6 167 63 68.4 B- 1301 10 S_1 C_3 M street_4 161 68 31.5 B+ 1302 11 S_1 C_3 F street_1 175 57 87.7 A- 1303 12 S_1 C_3 M street_7 188 82 49.7 B 1304 13 S_1 C_3 M street_2 195 70 85.2 A 1305 14 S_1 C_3 F street_5 187 69 61.7 B- 2101 15 S_2 C_1 M street_7 174 84 83.3 C 2102 16 S_2 C_1 F street_6 161 61 50.6 B+ 2103 17 S_2 C_1 M street_4 157 61 52.5 B- 2104 18 S_2 C_1 F street_5 159 97 72.2 B+ 2105 19 S_2 C_1 M street_4 170 81 34.2 A
df. columns
df= df. drop( columns= 'Unnamed: 0' )
df. head( )
School Class Gender Address Height Weight Math Physics ID 1101 S_1 C_1 M street_1 173 63 34.0 A+ 1102 S_1 C_1 F street_2 192 73 32.5 B+ 1103 S_1 C_1 M street_2 186 82 87.2 B+ 1104 S_1 C_1 F street_2 167 81 80.4 B- 1105 S_1 C_1 F street_4 159 64 84.8 B+
单级索引
loc方法,iloc方法,[]操作符
最常用的三种索引方法,其中**iloc表示位置索引,loc表示标签索引,[]具有良好的便利性
loc方法(注意:所有在loc中使用的切片是包含右端点的!!!)
1.单行索引
df. loc[ 1103 ]
School S_1
Class C_1
Gender M
Address street_2
Height 186
Weight 82
Math 87.2
Physics B+
Name: 1103, dtype: object
2.多行索引
df. loc[ [ 1102 , 1104 , 2304 ] ]
School Class Gender Address Height Weight Math Physics ID 1102 S_1 C_1 F street_2 192 73 32.5 B+ 1104 S_1 C_1 F street_2 167 81 80.4 B- 2304 S_2 C_3 F street_6 164 81 95.5 A-
df. loc[ 1103 : ]
School Class Gender Address Height Weight Math Physics ID 1103 S_1 C_1 M street_2 186 82 87.2 B+ 1104 S_1 C_1 F street_2 167 81 80.4 B- 1105 S_1 C_1 F street_4 159 64 84.8 B+ 1201 S_1 C_2 M street_5 188 68 97.0 A- 1202 S_1 C_2 F street_4 176 94 63.5 B- 1203 S_1 C_2 M street_6 160 53 58.8 A+ 1204 S_1 C_2 F street_5 162 63 33.8 B 1205 S_1 C_2 F street_6 167 63 68.4 B- 1301 S_1 C_3 M street_4 161 68 31.5 B+ 1302 S_1 C_3 F street_1 175 57 87.7 A- 1303 S_1 C_3 M street_7 188 82 49.7 B 1304 S_1 C_3 M street_2 195 70 85.2 A 1305 S_1 C_3 F street_5 187 69 61.7 B- 2101 S_2 C_1 M street_7 174 84 83.3 C 2102 S_2 C_1 F street_6 161 61 50.6 B+ 2103 S_2 C_1 M street_4 157 61 52.5 B- 2104 S_2 C_1 F street_5 159 97 72.2 B+ 2105 S_2 C_1 M street_4 170 81 34.2 A 2201 S_2 C_2 M street_5 193 100 39.1 B 2202 S_2 C_2 F street_7 194 77 68.5 B+ 2203 S_2 C_2 M street_4 155 91 73.8 A+ 2204 S_2 C_2 M street_1 175 74 47.2 B- 2205 S_2 C_2 F street_7 183 76 85.4 B 2301 S_2 C_3 F street_4 157 78 72.3 B+ 2302 S_2 C_3 M street_5 171 88 32.7 A 2303 S_2 C_3 F street_7 190 99 65.9 C 2304 S_2 C_3 F street_6 164 81 95.5 A- 2305 S_2 C_3 M street_4 187 73 48.9 B 2401 S_2 C_4 F street_2 192 62 45.3 A 2402 S_2 C_4 M street_7 166 82 48.7 B 2403 S_2 C_4 F street_6 158 60 59.7 B+ 2404 S_2 C_4 F street_2 160 84 67.7 B 2405 S_2 C_4 F street_6 193 54 47.6 B
df. loc[ 2402 : : - 1 ]
School Class Gender Address Height Weight Math Physics ID 2402 S_2 C_4 M street_7 166 82 48.7 B 2401 S_2 C_4 F street_2 192 62 45.3 A 2305 S_2 C_3 M street_4 187 73 48.9 B 2304 S_2 C_3 F street_6 164 81 95.5 A- 2303 S_2 C_3 F street_7 190 99 65.9 C 2302 S_2 C_3 M street_5 171 88 32.7 A 2301 S_2 C_3 F street_4 157 78 72.3 B+ 2205 S_2 C_2 F street_7 183 76 85.4 B 2204 S_2 C_2 M street_1 175 74 47.2 B- 2203 S_2 C_2 M street_4 155 91 73.8 A+ 2202 S_2 C_2 F street_7 194 77 68.5 B+ 2201 S_2 C_2 M street_5 193 100 39.1 B 2105 S_2 C_1 M street_4 170 81 34.2 A 2104 S_2 C_1 F street_5 159 97 72.2 B+ 2103 S_2 C_1 M street_4 157 61 52.5 B- 2102 S_2 C_1 F street_6 161 61 50.6 B+ 2101 S_2 C_1 M street_7 174 84 83.3 C 1305 S_1 C_3 F street_5 187 69 61.7 B- 1304 S_1 C_3 M street_2 195 70 85.2 A 1303 S_1 C_3 M street_7 188 82 49.7 B 1302 S_1 C_3 F street_1 175 57 87.7 A- 1301 S_1 C_3 M street_4 161 68 31.5 B+ 1205 S_1 C_2 F street_6 167 63 68.4 B- 1204 S_1 C_2 F street_5 162 63 33.8 B 1203 S_1 C_2 M street_6 160 53 58.8 A+ 1202 S_1 C_2 F street_4 176 94 63.5 B- 1201 S_1 C_2 M street_5 188 68 97.0 A- 1105 S_1 C_1 F street_4 159 64 84.8 B+ 1104 S_1 C_1 F street_2 167 81 80.4 B- 1103 S_1 C_1 M street_2 186 82 87.2 B+ 1102 S_1 C_1 F street_2 192 73 32.5 B+ 1101 S_1 C_1 M street_1 173 63 34.0 A+
3.单列索引
df. loc[ : , 'Height' ]
ID
1101 173
1102 192
1103 186
1104 167
1105 159
1201 188
1202 176
1203 160
1204 162
1205 167
1301 161
1302 175
1303 188
1304 195
1305 187
2101 174
2102 161
2103 157
2104 159
2105 170
2201 193
2202 194
2203 155
2204 175
2205 183
2301 157
2302 171
2303 190
2304 164
2305 187
2401 192
2402 166
2403 158
2404 160
2405 193
Name: Height, dtype: int64
4.多列索引
df. loc[ : , [ 'Height' , 'Math' ] ]
Height Math ID 1101 173 34.0 1102 192 32.5 1103 186 87.2 1104 167 80.4 1105 159 84.8 1201 188 97.0 1202 176 63.5 1203 160 58.8 1204 162 33.8 1205 167 68.4 1301 161 31.5 1302 175 87.7 1303 188 49.7 1304 195 85.2 1305 187 61.7 2101 174 83.3 2102 161 50.6 2103 157 52.5 2104 159 72.2 2105 170 34.2 2201 193 39.1 2202 194 68.5 2203 155 73.8 2204 175 47.2 2205 183 85.4 2301 157 72.3 2302 171 32.7 2303 190 65.9 2304 164 95.5 2305 187 48.9 2401 192 45.3 2402 166 48.7 2403 158 59.7 2404 160 67.7 2405 193 47.6
df. loc[ : , 'Height' : 'Math' ]
Height Weight Math ID 1101 173 63 34.0 1102 192 73 32.5 1103 186 82 87.2 1104 167 81 80.4 1105 159 64 84.8 1201 188 68 97.0 1202 176 94 63.5 1203 160 53 58.8 1204 162 63 33.8 1205 167 63 68.4 1301 161 68 31.5 1302 175 57 87.7 1303 188 82 49.7 1304 195 70 85.2 1305 187 69 61.7 2101 174 84 83.3 2102 161 61 50.6 2103 157 61 52.5 2104 159 97 72.2 2105 170 81 34.2 2201 193 100 39.1 2202 194 77 68.5 2203 155 91 73.8 2204 175 74 47.2 2205 183 76 85.4 2301 157 78 72.3 2302 171 88 32.7 2303 190 99 65.9 2304 164 81 95.5 2305 187 73 48.9 2401 192 62 45.3 2402 166 82 48.7 2403 158 60 59.7 2404 160 84 67.7 2405 193 54 47.6
5.联合索引
df. loc[ 1102 : 2401 , 'Height' : 'Math' ]
Height Weight Math ID 1102 192 73 32.5 1103 186 82 87.2 1104 167 81 80.4 1105 159 64 84.8 1201 188 68 97.0 1202 176 94 63.5 1203 160 53 58.8 1204 162 63 33.8 1205 167 63 68.4 1301 161 68 31.5 1302 175 57 87.7 1303 188 82 49.7 1304 195 70 85.2 1305 187 69 61.7 2101 174 84 83.3 2102 161 61 50.6 2103 157 61 52.5 2104 159 97 72.2 2105 170 81 34.2 2201 193 100 39.1 2202 194 77 68.5 2203 155 91 73.8 2204 175 74 47.2 2205 183 76 85.4 2301 157 78 72.3 2302 171 88 32.7 2303 190 99 65.9 2304 164 81 95.5 2305 187 73 48.9 2401 192 62 45.3
df. loc[ 1102 : 2401 : 2 , 'Height' : 'Math' ]
Height Weight Math ID 1102 192 73 32.5 1104 167 81 80.4 1201 188 68 97.0 1203 160 53 58.8 1205 167 63 68.4 1302 175 57 87.7 1304 195 70 85.2 2101 174 84 83.3 2103 157 61 52.5 2105 170 81 34.2 2202 194 77 68.5 2204 175 74 47.2 2301 157 78 72.3 2303 190 99 65.9 2305 187 73 48.9
6.函数式索引
df. loc[ lambda x: x[ 'Gender' ] == 'M' ]
School Class Gender Address Height Weight Math Physics ID 1101 S_1 C_1 M street_1 173 63 34.0 A+ 1103 S_1 C_1 M street_2 186 82 87.2 B+ 1201 S_1 C_2 M street_5 188 68 97.0 A- 1203 S_1 C_2 M street_6 160 53 58.8 A+ 1301 S_1 C_3 M street_4 161 68 31.5 B+ 1303 S_1 C_3 M street_7 188 82 49.7 B 1304 S_1 C_3 M street_2 195 70 85.2 A 2101 S_2 C_1 M street_7 174 84 83.3 C 2103 S_2 C_1 M street_4 157 61 52.5 B- 2105 S_2 C_1 M street_4 170 81 34.2 A 2201 S_2 C_2 M street_5 193 100 39.1 B 2203 S_2 C_2 M street_4 155 91 73.8 A+ 2204 S_2 C_2 M street_1 175 74 47.2 B- 2302 S_2 C_3 M street_5 171 88 32.7 A 2305 S_2 C_3 M street_4 187 73 48.9 B 2402 S_2 C_4 M street_7 166 82 48.7 B
def f ( x) :
return [ 1101 , 1103 ]
df. loc[ f]
School Class Gender Address Height Weight Math Physics ID 1101 S_1 C_1 M street_1 173 63 34.0 A+ 1103 S_1 C_1 M street_2 186 82 87.2 B+
7.布尔索引
df. loc[ df[ 'Address' ] . isin( [ 'street_7' , 'street_4' ] ) ] . head( )
School Class Gender Address Height Weight Math Physics ID 1105 S_1 C_1 F street_4 159 64 84.8 B+ 1202 S_1 C_2 F street_4 176 94 63.5 B- 1301 S_1 C_3 M street_4 161 68 31.5 B+ 1303 S_1 C_3 M street_7 188 82 49.7 B 2101 S_2 C_1 M street_7 174 84 83.3 C
df. loc[ [ True if i[ - 1 ] == '4' or i[ - 1 ] == '7' else False for i in df[ 'Address' ] . values] ] . head( )
School Class Gender Address Height Weight Math Physics ID 1105 S_1 C_1 F street_4 159 64 84.8 B+ 1202 S_1 C_2 F street_4 176 94 63.5 B- 1301 S_1 C_3 M street_4 161 68 31.5 B+ 1303 S_1 C_3 M street_7 188 82 49.7 B 2101 S_2 C_1 M street_7 174 84 83.3 C
总结:loc中能传入只有布尔列表和索引子集构成的列表
iloc方法-切片法,不含右端
1.单行索引
df. head( )
School Class Gender Address Height Weight Math Physics ID 1101 S_1 C_1 M street_1 173 63 34.0 A+ 1102 S_1 C_1 F street_2 192 73 32.5 B+ 1103 S_1 C_1 M street_2 186 82 87.2 B+ 1104 S_1 C_1 F street_2 167 81 80.4 B- 1105 S_1 C_1 F street_4 159 64 84.8 B+
df. iloc[ 3 ]
School S_1
Class C_1
Gender F
Address street_2
Height 167
Weight 81
Math 80.4
Physics B-
Name: 1104, dtype: object
2.多行索引
df. iloc[ 3 : 5 ]
School Class Gender Address Height Weight Math Physics ID 1104 S_1 C_1 F street_2 167 81 80.4 B- 1105 S_1 C_1 F street_4 159 64 84.8 B+
3.混合索引
df. iloc[ 3 : : 2 , 7 : : - 2 ] . head( )
Physics Weight Address Class ID 1104 B- 81 street_2 C_1 1201 A- 68 street_5 C_2 1203 A+ 53 street_6 C_2 1205 B- 63 street_6 C_2 1302 A- 57 street_1 C_3
4.函数式索引
df. iloc[ lambda x: [ 3 ] ]
School Class Gender Address Height Weight Math Physics ID 1104 S_1 C_1 F street_2 167 81 80.4 B-
[]操作符
不能再行索引为浮点时使用,因为在Series中的浮点[]并不是进行位置比较,而是值比较
1.单元素索引
s= pd. Series( df[ 'Math' ] , index= df. index)
s[ 1101 ]
34.0
2.多行索引
s[ 0 : 4 ]
ID
1101 34.0
1102 32.5
1103 87.2
1104 80.4
Name: Math, dtype: float64
3.函数式索引
s[ lambda x: x. index[ 16 : : - 6 ] ]
ID
2102 50.6
1301 31.5
1105 84.8
Name: Math, dtype: float64
4.布尔式索引
s[ s> 80 ]
ID
1103 87.2
1104 80.4
1105 84.8
1201 97.0
1302 87.7
1304 85.2
2101 83.3
2205 85.4
2304 95.5
Name: Math, dtype: float64
data与[]操作
df[ 1 : 2 ]
School Class Gender Address Height Weight Math Physics ID 1102 S_1 C_1 F street_2 192 73 32.5 B+
row= df. index. get_loc( 1102 )
df[ row: row+ 1 ]
School Class Gender Address Height Weight Math Physics ID 1102 S_1 C_1 F street_2 192 73 32.5 B+
2.多行索引
df[ 3 : 5 ]
School Class Gender Address Height Weight Math Physics ID 1104 S_1 C_1 F street_2 167 81 80.4 B- 1105 S_1 C_1 F street_4 159 64 84.8 B+
3.单列索引
df[ 'School' ] . head( )
ID
1101 S_1
1102 S_1
1103 S_1
1104 S_1
1105 S_1
Name: School, dtype: object
4.多列索引
df[ [ 'School' , 'Math' ] ]
School Math ID 1101 S_1 34.0 1102 S_1 32.5 1103 S_1 87.2 1104 S_1 80.4 1105 S_1 84.8 1201 S_1 97.0 1202 S_1 63.5 1203 S_1 58.8 1204 S_1 33.8 1205 S_1 68.4 1301 S_1 31.5 1302 S_1 87.7 1303 S_1 49.7 1304 S_1 85.2 1305 S_1 61.7 2101 S_2 83.3 2102 S_2 50.6 2103 S_2 52.5 2104 S_2 72.2 2105 S_2 34.2 2201 S_2 39.1 2202 S_2 68.5 2203 S_2 73.8 2204 S_2 47.2 2205 S_2 85.4 2301 S_2 72.3 2302 S_2 32.7 2303 S_2 65.9 2304 S_2 95.5 2305 S_2 48.9 2401 S_2 45.3 2402 S_2 48.7 2403 S_2 59.7 2404 S_2 67.7 2405 S_2 47.6
5.布尔索引
df[ df[ 'Gender' ] == 'F' ] . head( )
School Class Gender Address Height Weight Math Physics ID 1102 S_1 C_1 F street_2 192 73 32.5 B+ 1104 S_1 C_1 F street_2 167 81 80.4 B- 1105 S_1 C_1 F street_4 159 64 84.8 B+ 1202 S_1 C_2 F street_4 176 94 63.5 B- 1204 S_1 C_2 F street_5 162 63 33.8 B
总结,[]操作常用于列选择或布尔选择,尽量避免行选择!
布尔索引
布尔符号:‘&’,‘\’,’~’:分别代表 and,or 反not
df[ ( df[ 'Gender' ] == 'F' ) & ( df[ 'Math' ] > 80 ) ]
School Class Gender Address Height Weight Math Physics ID 1104 S_1 C_1 F street_2 167 81 80.4 B- 1105 S_1 C_1 F street_4 159 64 84.8 B+ 1302 S_1 C_3 F street_1 175 57 87.7 A- 2205 S_2 C_2 F street_7 183 76 85.4 B 2304 S_2 C_3 F street_6 164 81 95.5 A-
df[ ( df[ 'Gender' ] == 'F' ) | ( df[ 'Math' ] > 80 ) ]
School Class Gender Address Height Weight Math Physics ID 1102 S_1 C_1 F street_2 192 73 32.5 B+ 1103 S_1 C_1 M street_2 186 82 87.2 B+ 1104 S_1 C_1 F street_2 167 81 80.4 B- 1105 S_1 C_1 F street_4 159 64 84.8 B+ 1201 S_1 C_2 M street_5 188 68 97.0 A- 1202 S_1 C_2 F street_4 176 94 63.5 B- 1204 S_1 C_2 F street_5 162 63 33.8 B 1205 S_1 C_2 F street_6 167 63 68.4 B- 1302 S_1 C_3 F street_1 175 57 87.7 A- 1304 S_1 C_3 M street_2 195 70 85.2 A 1305 S_1 C_3 F street_5 187 69 61.7 B- 2101 S_2 C_1 M street_7 174 84 83.3 C 2102 S_2 C_1 F street_6 161 61 50.6 B+ 2104 S_2 C_1 F street_5 159 97 72.2 B+ 2202 S_2 C_2 F street_7 194 77 68.5 B+ 2205 S_2 C_2 F street_7 183 76 85.4 B 2301 S_2 C_3 F street_4 157 78 72.3 B+ 2303 S_2 C_3 F street_7 190 99 65.9 C 2304 S_2 C_3 F street_6 164 81 95.5 A- 2401 S_2 C_4 F street_2 192 62 45.3 A 2403 S_2 C_4 F street_6 158 60 59.7 B+ 2404 S_2 C_4 F street_2 160 84 67.7 B 2405 S_2 C_4 F street_6 193 54 47.6 B
df[ ~ ( ( df[ 'Gender' ] == 'F' ) | ( df[ 'Math' ] > 80 ) ) ]
School Class Gender Address Height Weight Math Physics ID 1101 S_1 C_1 M street_1 173 63 34.0 A+ 1203 S_1 C_2 M street_6 160 53 58.8 A+ 1301 S_1 C_3 M street_4 161 68 31.5 B+ 1303 S_1 C_3 M street_7 188 82 49.7 B 2103 S_2 C_1 M street_4 157 61 52.5 B- 2105 S_2 C_1 M street_4 170 81 34.2 A 2201 S_2 C_2 M street_5 193 100 39.1 B 2203 S_2 C_2 M street_4 155 91 73.8 A+ 2204 S_2 C_2 M street_1 175 74 47.2 B- 2302 S_2 C_3 M street_5 171 88 32.7 A 2305 S_2 C_3 M street_4 187 73 48.9 B 2402 S_2 C_4 M street_7 166 82 48.7 B
loc和[]中相应位置都能使用布尔列表选择:
df. loc[ df[ 'Math' ] > 60 , ( df[ : 8 ] [ 'Address' ] == 'street_6' ) . values] . head( 30 )
Physics ID 1103 B+ 1104 B- 1105 B+ 1201 A- 1202 B- 1205 B- 1302 A- 1304 A 1305 B- 2101 C 2104 B+ 2202 B+ 2203 A+ 2205 B 2301 B+ 2303 C 2304 A- 2404 B
df
School Class Gender Address Height Weight Math Physics ID 1101 S_1 C_1 M street_1 173 63 34.0 A+ 1102 S_1 C_1 F street_2 192 73 32.5 B+ 1103 S_1 C_1 M street_2 186 82 87.2 B+ 1104 S_1 C_1 F street_2 167 81 80.4 B- 1105 S_1 C_1 F street_4 159 64 84.8 B+ 1201 S_1 C_2 M street_5 188 68 97.0 A- 1202 S_1 C_2 F street_4 176 94 63.5 B- 1203 S_1 C_2 M street_6 160 53 58.8 A+ 1204 S_1 C_2 F street_5 162 63 33.8 B 1205 S_1 C_2 F street_6 167 63 68.4 B- 1301 S_1 C_3 M street_4 161 68 31.5 B+ 1302 S_1 C_3 F street_1 175 57 87.7 A- 1303 S_1 C_3 M street_7 188 82 49.7 B 1304 S_1 C_3 M street_2 195 70 85.2 A 1305 S_1 C_3 F street_5 187 69 61.7 B- 2101 S_2 C_1 M street_7 174 84 83.3 C 2102 S_2 C_1 F street_6 161 61 50.6 B+ 2103 S_2 C_1 M street_4 157 61 52.5 B- 2104 S_2 C_1 F street_5 159 97 72.2 B+ 2105 S_2 C_1 M street_4 170 81 34.2 A 2201 S_2 C_2 M street_5 193 100 39.1 B 2202 S_2 C_2 F street_7 194 77 68.5 B+ 2203 S_2 C_2 M street_4 155 91 73.8 A+ 2204 S_2 C_2 M street_1 175 74 47.2 B- 2205 S_2 C_2 F street_7 183 76 85.4 B 2301 S_2 C_3 F street_4 157 78 72.3 B+ 2302 S_2 C_3 M street_5 171 88 32.7 A 2303 S_2 C_3 F street_7 190 99 65.9 C 2304 S_2 C_3 F street_6 164 81 95.5 A- 2305 S_2 C_3 M street_4 187 73 48.9 B 2401 S_2 C_4 F street_2 192 62 45.3 A 2402 S_2 C_4 M street_7 166 82 48.7 B 2403 S_2 C_4 F street_6 158 60 59.7 B+ 2404 S_2 C_4 F street_2 160 84 67.7 B 2405 S_2 C_4 F street_6 193 54 47.6 B
isin方法
df[ df[ 'Address' ] . isin( [ 'street_1' , 'street_4' ] ) & df[ 'Physics' ] . isin( [ 'A' , 'A+' ] ) ]
School Class Gender Address Height Weight Math Physics ID 1101 S_1 C_1 M street_1 173 63 34.0 A+ 2105 S_2 C_1 M street_4 170 81 34.2 A 2203 S_2 C_2 M street_4 155 91 73.8 A+
df[ df[ [ 'Address' , 'Physics' ] ] . isin( { 'Address' : [ 'street_1' , 'street_4' ] , 'Physics' : [ 'A' , 'A+' ] } ) . all ( 1 ) ]
School Class Gender Address Height Weight Math Physics ID 1101 S_1 C_1 M street_1 173 63 34.0 A+ 2105 S_2 C_1 M street_4 170 81 34.2 A 2203 S_2 C_2 M street_4 155 91 73.8 A+
快速标量索引
display( df. at[ 1101 , 'School' ] )
'S_1'
区间索引
利用interbal_range方法
pd. interval_range( start= 0 , end= 5 )
IntervalIndex([(0, 1], (1, 2], (2, 3], (3, 4], (4, 5]],
closed='right',
dtype='interval[int64]')
pd. interval_range( start= 0 , periods= 8 , freq= 5 )
IntervalIndex([(0, 5], (5, 10], (10, 15], (15, 20], (20, 25], (25, 30], (30, 35], (35, 40]],
closed='right',
dtype='interval[int64]')
利用cut将数值转换为区间元素的分类变量,例如统计数学成绩的区间情况
math_interval= pd. cut( df[ 'Math' ] , bins= [ 0 , 40 , 60 , 80 , 100 ] )
math_interval
ID
1101 (0, 40]
1102 (0, 40]
1103 (80, 100]
1104 (80, 100]
1105 (80, 100]
1201 (80, 100]
1202 (60, 80]
1203 (40, 60]
1204 (0, 40]
1205 (60, 80]
1301 (0, 40]
1302 (80, 100]
1303 (40, 60]
1304 (80, 100]
1305 (60, 80]
2101 (80, 100]
2102 (40, 60]
2103 (40, 60]
2104 (60, 80]
2105 (0, 40]
2201 (0, 40]
2202 (60, 80]
2203 (60, 80]
2204 (40, 60]
2205 (80, 100]
2301 (60, 80]
2302 (0, 40]
2303 (60, 80]
2304 (80, 100]
2305 (40, 60]
2401 (40, 60]
2402 (40, 60]
2403 (40, 60]
2404 (60, 80]
2405 (40, 60]
Name: Math, dtype: category
Categories (4, interval[int64]): [(0, 40] < (40, 60] < (60, 80] < (80, 100]]
区间索引的选取
df_i = df. join( math_interval, rsuffix= '_interval' ) [ [ 'Math' , 'Math_interval' ] ] \
. reset_index( ) . set_index( 'Math_interval' )
df_i. head( )
ID Math Math_interval (0, 40] 1101 34.0 (0, 40] 1102 32.5 (80, 100] 1103 87.2 (80, 100] 1104 80.4 (80, 100] 1105 84.8
df_i. loc[ 65 ]
ID Math Math_interval (60, 80] 1202 63.5 (60, 80] 1205 68.4 (60, 80] 1305 61.7 (60, 80] 2104 72.2 (60, 80] 2202 68.5 (60, 80] 2203 73.8 (60, 80] 2301 72.3 (60, 80] 2303 65.9 (60, 80] 2404 67.7
df_i. loc[ [ 65 , 90 ] ]
ID Math Math_interval (60, 80] 1202 63.5 (60, 80] 1205 68.4 (60, 80] 1305 61.7 (60, 80] 2104 72.2 (60, 80] 2202 68.5 (60, 80] 2203 73.8 (60, 80] 2301 72.3 (60, 80] 2303 65.9 (60, 80] 2404 67.7 (80, 100] 1103 87.2 (80, 100] 1104 80.4 (80, 100] 1105 84.8 (80, 100] 1201 97.0 (80, 100] 1302 87.7 (80, 100] 1304 85.2 (80, 100] 2101 83.3 (80, 100] 2205 85.4 (80, 100] 2304 95.5
df_i[ df_i. index. astype( 'interval' ) . overlaps( pd. Interval( 70 , 85 ) ) ]
ID Math Math_interval (80, 100] 1103 87.2 (80, 100] 1104 80.4 (80, 100] 1105 84.8 (80, 100] 1201 97.0 (60, 80] 1202 63.5 (60, 80] 1205 68.4 (80, 100] 1302 87.7 (80, 100] 1304 85.2 (60, 80] 1305 61.7 (80, 100] 2101 83.3 (60, 80] 2104 72.2 (60, 80] 2202 68.5 (60, 80] 2203 73.8 (80, 100] 2205 85.4 (60, 80] 2301 72.3 (60, 80] 2303 65.9 (80, 100] 2304 95.5 (60, 80] 2404 67.7
多级索引
创建多级索引
通过from_tuple或from_arrays
1.直接创立元组
tuples= [ ( 'A' , 'a' ) , ( 'A' , 'b' ) , ( 'B' , 'a' ) , ( 'B' , 'b' ) ]
mul_index= pd. MultiIndex. from_tuples( tuples, names= ( 'Upper' , 'Lower' ) )
mul_index
MultiIndex([('A', 'a'),
('A', 'b'),
('B', 'a'),
('B', 'b')],
names=['Upper', 'Lower'])
pd. DataFrame( { 'Score' : [ 'Perfect' , 'Good' , 'Fair' , 'Bad' ] } , index= mul_index)
Score Upper Lower A a Perfect b Good B a Fair b Bad
2.利用zip创建元组
L1= list ( 'AABB' )
L2= list ( 'abab' )
tuples= list ( zip ( L1, L2) )
mul_index= pd. MultiIndex. from_tuples( tuples, names= ( 'Upper' , 'Lower' ) )
pd. DataFrame( { 'Score' : [ 'Perfect' , 'Good' , 'Fair' , 'Bad' ] } , index= mul_index)
Score Upper Lower A a Perfect b Good B a Fair b Bad
3.通过Array创建
通过 from_prodect
L1= [ 'A' , 'B' ]
L2= [ 'a' , 'b' ]
pd. MultiIndex. from_product( [ L1, L2] , names= ( 'Upper' , 'Lower' ) )
MultiIndex([('A', 'a'),
('A', 'b'),
('B', 'a'),
('B', 'b')],
names=['Upper', 'Lower'])
指定df中的列进行创建(set_index)
df_using_mul= df. set_index( [ 'Class' , 'Address' ] )
df_using_mul
School Gender Height Weight Math Physics Class Address C_1 street_1 S_1 M 173 63 34.0 A+ street_2 S_1 F 192 73 32.5 B+ street_2 S_1 M 186 82 87.2 B+ street_2 S_1 F 167 81 80.4 B- street_4 S_1 F 159 64 84.8 B+ C_2 street_5 S_1 M 188 68 97.0 A- street_4 S_1 F 176 94 63.5 B- street_6 S_1 M 160 53 58.8 A+ street_5 S_1 F 162 63 33.8 B street_6 S_1 F 167 63 68.4 B- C_3 street_4 S_1 M 161 68 31.5 B+ street_1 S_1 F 175 57 87.7 A- street_7 S_1 M 188 82 49.7 B street_2 S_1 M 195 70 85.2 A street_5 S_1 F 187 69 61.7 B- C_1 street_7 S_2 M 174 84 83.3 C street_6 S_2 F 161 61 50.6 B+ street_4 S_2 M 157 61 52.5 B- street_5 S_2 F 159 97 72.2 B+ street_4 S_2 M 170 81 34.2 A C_2 street_5 S_2 M 193 100 39.1 B street_7 S_2 F 194 77 68.5 B+ street_4 S_2 M 155 91 73.8 A+ street_1 S_2 M 175 74 47.2 B- street_7 S_2 F 183 76 85.4 B C_3 street_4 S_2 F 157 78 72.3 B+ street_5 S_2 M 171 88 32.7 A street_7 S_2 F 190 99 65.9 C street_6 S_2 F 164 81 95.5 A- street_4 S_2 M 187 73 48.9 B C_4 street_2 S_2 F 192 62 45.3 A street_7 S_2 M 166 82 48.7 B street_6 S_2 F 158 60 59.7 B+ street_2 S_2 F 160 84 67.7 B street_6 S_2 F 193 54 47.6 B
多层索引切片
df_using_mul
School Gender Height Weight Math Physics Class Address C_1 street_1 S_1 M 173 63 34.0 A+ street_2 S_1 F 192 73 32.5 B+ street_2 S_1 M 186 82 87.2 B+ street_2 S_1 F 167 81 80.4 B- street_4 S_1 F 159 64 84.8 B+ C_2 street_5 S_1 M 188 68 97.0 A- street_4 S_1 F 176 94 63.5 B- street_6 S_1 M 160 53 58.8 A+ street_5 S_1 F 162 63 33.8 B street_6 S_1 F 167 63 68.4 B- C_3 street_4 S_1 M 161 68 31.5 B+ street_1 S_1 F 175 57 87.7 A- street_7 S_1 M 188 82 49.7 B street_2 S_1 M 195 70 85.2 A street_5 S_1 F 187 69 61.7 B- C_1 street_7 S_2 M 174 84 83.3 C street_6 S_2 F 161 61 50.6 B+ street_4 S_2 M 157 61 52.5 B- street_5 S_2 F 159 97 72.2 B+ street_4 S_2 M 170 81 34.2 A C_2 street_5 S_2 M 193 100 39.1 B street_7 S_2 F 194 77 68.5 B+ street_4 S_2 M 155 91 73.8 A+ street_1 S_2 M 175 74 47.2 B- street_7 S_2 F 183 76 85.4 B C_3 street_4 S_2 F 157 78 72.3 B+ street_5 S_2 M 171 88 32.7 A street_7 S_2 F 190 99 65.9 C street_6 S_2 F 164 81 95.5 A- street_4 S_2 M 187 73 48.9 B C_4 street_2 S_2 F 192 62 45.3 A street_7 S_2 M 166 82 48.7 B street_6 S_2 F 158 60 59.7 B+ street_2 S_2 F 160 84 67.7 B street_6 S_2 F 193 54 47.6 B
一般切片
df_using_mul. index. is_lexsorted( )
df_using_mul. sort_index( ) . loc[ 'C_2' , 'street_5' ]
School Gender Height Weight Math Physics Class Address C_2 street_5 S_1 M 188 68 97.0 A- street_5 S_1 F 162 63 33.8 B street_5 S_2 M 193 100 39.1 B
df_using_mul. sort_index( ) . loc[ ( 'C_2' , 'street_6' ) : ( 'C_3' , 'street_4' ) ]
School Gender Height Weight Math Physics Class Address C_2 street_6 S_1 M 160 53 58.8 A+ street_6 S_1 F 167 63 68.4 B- street_7 S_2 F 194 77 68.5 B+ street_7 S_2 F 183 76 85.4 B C_3 street_1 S_1 F 175 57 87.7 A- street_2 S_1 M 195 70 85.2 A street_4 S_1 M 161 68 31.5 B+ street_4 S_2 F 157 78 72.3 B+ street_4 S_2 M 187 73 48.9 B
df_using_mul. sort_index( ) . loc[ ( 'C_2' , 'street_7' ) : 'C_3' ]
School Gender Height Weight Math Physics Class Address C_2 street_7 S_2 F 194 77 68.5 B+ street_7 S_2 F 183 76 85.4 B C_3 street_1 S_1 F 175 57 87.7 A- street_2 S_1 M 195 70 85.2 A street_4 S_1 M 161 68 31.5 B+ street_4 S_2 F 157 78 72.3 B+ street_4 S_2 M 187 73 48.9 B street_5 S_1 F 187 69 61.7 B- street_5 S_2 M 171 88 32.7 A street_6 S_2 F 164 81 95.5 A- street_7 S_1 M 188 82 49.7 B street_7 S_2 F 190 99 65.9 C
第一类特殊情况:又元组构成的列表
df_using_mul. sort_index( ) . loc[ [ ( 'C_2' , 'street_7' ) , ( 'C_3' , 'street_2' ) ] ]
School Gender Height Weight Math Physics Class Address C_2 street_7 S_2 F 194 77 68.5 B+ street_7 S_2 F 183 76 85.4 B C_3 street_2 S_1 M 195 70 85.2 A
第二类特殊情况:由列表构成元组
df_using_mul. sort_index( ) . loc[ ( [ 'C_2' , 'C_3' ] , [ 'street_4' , 'street_7' ] ) , : ]
School Gender Height Weight Math Physics Class Address C_2 street_4 S_1 F 176 94 63.5 B- street_4 S_2 M 155 91 73.8 A+ street_7 S_2 F 194 77 68.5 B+ street_7 S_2 F 183 76 85.4 B C_3 street_4 S_1 M 161 68 31.5 B+ street_4 S_2 F 157 78 72.3 B+ street_4 S_2 M 187 73 48.9 B street_7 S_1 M 188 82 49.7 B street_7 S_2 F 190 99 65.9 C
多层索引中的slice对象
L1, L2= [ 'A' , 'B' , 'C' ] , [ 'a' , 'b' , 'c' ]
mul_index1= pd. MultiIndex. from_product( [ L1, L2] , names= ( 'Upper' , 'Lower' ) )
L3, L4= [ 'D' , 'E' , 'F' ] , [ 'd' , 'e' , 'f' ]
mul_index2= pd. MultiIndex. from_product( [ L3, L4] , names= ( 'Big' , 'Small' ) )
df_s= pd. DataFrame( np. random. rand( 9 , 9 ) , index= mul_index1, columns= mul_index2)
df_s
Big D E F Small d e f d e f d e f Upper Lower A a 0.086290 0.016434 0.770953 0.905786 0.774793 0.431968 0.058817 0.639299 0.270593 b 0.469427 0.223689 0.028010 0.665754 0.663030 0.546166 0.593834 0.422424 0.261654 c 0.216910 0.588830 0.226957 0.894320 0.282674 0.874350 0.458656 0.865967 0.617002 B a 0.429723 0.177135 0.967281 0.573328 0.681209 0.563489 0.905860 0.320977 0.992234 b 0.063323 0.118590 0.239837 0.766058 0.823149 0.528363 0.521557 0.534079 0.338802 c 0.926639 0.759275 0.396388 0.851646 0.117874 0.940012 0.045774 0.678996 0.723390 C a 0.939771 0.180097 0.143923 0.409855 0.846589 0.066732 0.664438 0.855066 0.376750 b 0.941022 0.232878 0.234820 0.403061 0.763359 0.019918 0.392424 0.439221 0.545601 c 0.480966 0.730475 0.423153 0.194107 0.756316 0.567378 0.983534 0.328770 0.850334
idx= pd. IndexSlice
df_s. loc[ idx[ 'B' : , df_s[ 'D' ] [ 'd' ] > 0.12 ] , idx[ 'E' , df_s. sum ( ) > 2 ] ]
Big E Small d e f Upper Lower B a 0.573328 0.681209 0.563489 c 0.851646 0.117874 0.940012 C a 0.409855 0.846589 0.066732 b 0.403061 0.763359 0.019918 c 0.194107 0.756316 0.567378
索引层的交换
swaplevel方法(两层交换)
df_using_mul
School Gender Height Weight Math Physics Class Address C_1 street_1 S_1 M 173 63 34.0 A+ street_2 S_1 F 192 73 32.5 B+ street_2 S_1 M 186 82 87.2 B+ street_2 S_1 F 167 81 80.4 B- street_4 S_1 F 159 64 84.8 B+ C_2 street_5 S_1 M 188 68 97.0 A- street_4 S_1 F 176 94 63.5 B- street_6 S_1 M 160 53 58.8 A+ street_5 S_1 F 162 63 33.8 B street_6 S_1 F 167 63 68.4 B- C_3 street_4 S_1 M 161 68 31.5 B+ street_1 S_1 F 175 57 87.7 A- street_7 S_1 M 188 82 49.7 B street_2 S_1 M 195 70 85.2 A street_5 S_1 F 187 69 61.7 B- C_1 street_7 S_2 M 174 84 83.3 C street_6 S_2 F 161 61 50.6 B+ street_4 S_2 M 157 61 52.5 B- street_5 S_2 F 159 97 72.2 B+ street_4 S_2 M 170 81 34.2 A C_2 street_5 S_2 M 193 100 39.1 B street_7 S_2 F 194 77 68.5 B+ street_4 S_2 M 155 91 73.8 A+ street_1 S_2 M 175 74 47.2 B- street_7 S_2 F 183 76 85.4 B C_3 street_4 S_2 F 157 78 72.3 B+ street_5 S_2 M 171 88 32.7 A street_7 S_2 F 190 99 65.9 C street_6 S_2 F 164 81 95.5 A- street_4 S_2 M 187 73 48.9 B C_4 street_2 S_2 F 192 62 45.3 A street_7 S_2 M 166 82 48.7 B street_6 S_2 F 158 60 59.7 B+ street_2 S_2 F 160 84 67.7 B street_6 S_2 F 193 54 47.6 B
df_using_mul. swaplevel( i= 1 , j= 0 , axis= 0 ) . sort_index( )
School Gender Height Weight Math Physics Address Class street_1 C_1 S_1 M 173 63 34.0 A+ C_2 S_2 M 175 74 47.2 B- C_3 S_1 F 175 57 87.7 A- street_2 C_1 S_1 F 192 73 32.5 B+ C_1 S_1 M 186 82 87.2 B+ C_1 S_1 F 167 81 80.4 B- C_3 S_1 M 195 70 85.2 A C_4 S_2 F 192 62 45.3 A C_4 S_2 F 160 84 67.7 B street_4 C_1 S_1 F 159 64 84.8 B+ C_1 S_2 M 157 61 52.5 B- C_1 S_2 M 170 81 34.2 A C_2 S_1 F 176 94 63.5 B- C_2 S_2 M 155 91 73.8 A+ C_3 S_1 M 161 68 31.5 B+ C_3 S_2 F 157 78 72.3 B+ C_3 S_2 M 187 73 48.9 B street_5 C_1 S_2 F 159 97 72.2 B+ C_2 S_1 M 188 68 97.0 A- C_2 S_1 F 162 63 33.8 B C_2 S_2 M 193 100 39.1 B C_3 S_1 F 187 69 61.7 B- C_3 S_2 M 171 88 32.7 A street_6 C_1 S_2 F 161 61 50.6 B+ C_2 S_1 M 160 53 58.8 A+ C_2 S_1 F 167 63 68.4 B- C_3 S_2 F 164 81 95.5 A- C_4 S_2 F 158 60 59.7 B+ C_4 S_2 F 193 54 47.6 B street_7 C_1 S_2 M 174 84 83.3 C C_2 S_2 F 194 77 68.5 B+ C_2 S_2 F 183 76 85.4 B C_3 S_1 M 188 82 49.7 B C_3 S_2 F 190 99 65.9 C C_4 S_2 M 166 82 48.7 B
reorder_levels方法(多层交换)
df_muls= df. set_index( [ 'School' , 'Class' , 'Address' ] )
df_muls
Gender Height Weight Math Physics School Class Address S_1 C_1 street_1 M 173 63 34.0 A+ street_2 F 192 73 32.5 B+ street_2 M 186 82 87.2 B+ street_2 F 167 81 80.4 B- street_4 F 159 64 84.8 B+ C_2 street_5 M 188 68 97.0 A- street_4 F 176 94 63.5 B- street_6 M 160 53 58.8 A+ street_5 F 162 63 33.8 B street_6 F 167 63 68.4 B- C_3 street_4 M 161 68 31.5 B+ street_1 F 175 57 87.7 A- street_7 M 188 82 49.7 B street_2 M 195 70 85.2 A street_5 F 187 69 61.7 B- S_2 C_1 street_7 M 174 84 83.3 C street_6 F 161 61 50.6 B+ street_4 M 157 61 52.5 B- street_5 F 159 97 72.2 B+ street_4 M 170 81 34.2 A C_2 street_5 M 193 100 39.1 B street_7 F 194 77 68.5 B+ street_4 M 155 91 73.8 A+ street_1 M 175 74 47.2 B- street_7 F 183 76 85.4 B C_3 street_4 F 157 78 72.3 B+ street_5 M 171 88 32.7 A street_7 F 190 99 65.9 C street_6 F 164 81 95.5 A- street_4 M 187 73 48.9 B C_4 street_2 F 192 62 45.3 A street_7 M 166 82 48.7 B street_6 F 158 60 59.7 B+ street_2 F 160 84 67.7 B street_6 F 193 54 47.6 B
df_muls. reorder_levels( [ 2 , 0 , 1 ] , axis= 0 ) . sort_index( )
Gender Height Weight Math Physics Address School Class street_1 S_1 C_1 M 173 63 34.0 A+ C_3 F 175 57 87.7 A- S_2 C_2 M 175 74 47.2 B- street_2 S_1 C_1 F 192 73 32.5 B+ C_1 M 186 82 87.2 B+ C_1 F 167 81 80.4 B- C_3 M 195 70 85.2 A S_2 C_4 F 192 62 45.3 A C_4 F 160 84 67.7 B street_4 S_1 C_1 F 159 64 84.8 B+ C_2 F 176 94 63.5 B- C_3 M 161 68 31.5 B+ S_2 C_1 M 157 61 52.5 B- C_1 M 170 81 34.2 A C_2 M 155 91 73.8 A+ C_3 F 157 78 72.3 B+ C_3 M 187 73 48.9 B street_5 S_1 C_2 M 188 68 97.0 A- C_2 F 162 63 33.8 B C_3 F 187 69 61.7 B- S_2 C_1 F 159 97 72.2 B+ C_2 M 193 100 39.1 B C_3 M 171 88 32.7 A street_6 S_1 C_2 M 160 53 58.8 A+ C_2 F 167 63 68.4 B- S_2 C_1 F 161 61 50.6 B+ C_3 F 164 81 95.5 A- C_4 F 158 60 59.7 B+ C_4 F 193 54 47.6 B street_7 S_1 C_3 M 188 82 49.7 B S_2 C_1 M 174 84 83.3 C C_2 F 194 77 68.5 B+ C_2 F 183 76 85.4 B C_3 F 190 99 65.9 C C_4 M 166 82 48.7 B
索引设定
index_col参数
index_col是read_csv中的一个参数,而不是一个方法
pd. read_csv( 'E:\jupyter Notebook\天池比赛\joyful-pandas-master\data\\table.csv' , index_col= [ 'Address' , 'School' ] )
Unnamed: 0 Class ID Gender Height Weight Math Physics Address School street_1 S_1 0 C_1 1101 M 173 63 34.0 A+ street_2 S_1 1 C_1 1102 F 192 73 32.5 B+ S_1 2 C_1 1103 M 186 82 87.2 B+ S_1 3 C_1 1104 F 167 81 80.4 B- street_4 S_1 4 C_1 1105 F 159 64 84.8 B+ street_5 S_1 5 C_2 1201 M 188 68 97.0 A- street_4 S_1 6 C_2 1202 F 176 94 63.5 B- street_6 S_1 7 C_2 1203 M 160 53 58.8 A+ street_5 S_1 8 C_2 1204 F 162 63 33.8 B street_6 S_1 9 C_2 1205 F 167 63 68.4 B- street_4 S_1 10 C_3 1301 M 161 68 31.5 B+ street_1 S_1 11 C_3 1302 F 175 57 87.7 A- street_7 S_1 12 C_3 1303 M 188 82 49.7 B street_2 S_1 13 C_3 1304 M 195 70 85.2 A street_5 S_1 14 C_3 1305 F 187 69 61.7 B- street_7 S_2 15 C_1 2101 M 174 84 83.3 C street_6 S_2 16 C_1 2102 F 161 61 50.6 B+ street_4 S_2 17 C_1 2103 M 157 61 52.5 B- street_5 S_2 18 C_1 2104 F 159 97 72.2 B+ street_4 S_2 19 C_1 2105 M 170 81 34.2 A street_5 S_2 20 C_2 2201 M 193 100 39.1 B street_7 S_2 21 C_2 2202 F 194 77 68.5 B+ street_4 S_2 22 C_2 2203 M 155 91 73.8 A+ street_1 S_2 23 C_2 2204 M 175 74 47.2 B- street_7 S_2 24 C_2 2205 F 183 76 85.4 B street_4 S_2 25 C_3 2301 F 157 78 72.3 B+ street_5 S_2 26 C_3 2302 M 171 88 32.7 A street_7 S_2 27 C_3 2303 F 190 99 65.9 C street_6 S_2 28 C_3 2304 F 164 81 95.5 A- street_4 S_2 29 C_3 2305 M 187 73 48.9 B street_2 S_2 30 C_4 2401 F 192 62 45.3 A street_7 S_2 31 C_4 2402 M 166 82 48.7 B street_6 S_2 32 C_4 2403 F 158 60 59.7 B+ street_2 S_2 33 C_4 2404 F 160 84 67.7 B street_6 S_2 34 C_4 2405 F 193 54 47.6 B
reindex和reindex_like
reindex是指重新索引,重要性与索引对齐类似,用于重新排序
df. head( )
School Class Gender Address Height Weight Math Physics ID 1101 S_1 C_1 M street_1 173 63 34.0 A+ 1102 S_1 C_1 F street_2 192 73 32.5 B+ 1103 S_1 C_1 M street_2 186 82 87.2 B+ 1104 S_1 C_1 F street_2 167 81 80.4 B- 1105 S_1 C_1 F street_4 159 64 84.8 B+
df. reindex( columns= [ 'Height' , 'Gender' , 'Average' ] )
Height Gender Average ID 1101 173 M NaN 1102 192 F NaN 1103 186 M NaN 1104 167 F NaN 1105 159 F NaN 1201 188 M NaN 1202 176 F NaN 1203 160 M NaN 1204 162 F NaN 1205 167 F NaN 1301 161 M NaN 1302 175 F NaN 1303 188 M NaN 1304 195 M NaN 1305 187 F NaN 2101 174 M NaN 2102 161 F NaN 2103 157 M NaN 2104 159 F NaN 2105 170 M NaN 2201 193 M NaN 2202 194 F NaN 2203 155 M NaN 2204 175 M NaN 2205 183 F NaN 2301 157 F NaN 2302 171 M NaN 2303 190 F NaN 2304 164 F NaN 2305 187 M NaN 2401 192 F NaN 2402 166 M NaN 2403 158 F NaN 2404 160 F NaN 2405 193 F NaN
可以选择缺失值的填充方法:fill_value and method(bfill/ffill/nearest),其中method参数必须索引单调
df. reindex( index= [ 1101 , 1203 , 1206 , 2402 ] )
School Class Gender Address Height Weight Math Physics ID 1101 S_1 C_1 M street_1 173.0 63.0 34.0 A+ 1203 S_1 C_2 M street_6 160.0 53.0 58.8 A+ 1206 NaN NaN NaN NaN NaN NaN NaN NaN 2402 S_2 C_4 M street_7 166.0 82.0 48.7 B
df. reindex( index= [ 1101 , 1203 , 1206 , 2402 ] , method= 'bfill' )
School Class Gender Address Height Weight Math Physics ID 1101 S_1 C_1 M street_1 173 63 34.0 A+ 1203 S_1 C_2 M street_6 160 53 58.8 A+ 1206 S_1 C_3 M street_4 161 68 31.5 B+ 2402 S_2 C_4 M street_7 166 82 48.7 B
df. reindex( index= [ 1101 , 1203 , 1206 , 2402 ] , method= 'ffill' )
School Class Gender Address Height Weight Math Physics ID 1101 S_1 C_1 M street_1 173 63 34.0 A+ 1203 S_1 C_2 M street_6 160 53 58.8 A+ 1206 S_1 C_2 F street_6 167 63 68.4 B- 2402 S_2 C_4 M street_7 166 82 48.7 B
df. reindex( index= [ 1101 , 1203 , 1206 , 2402 ] , method= 'nearest' )
School Class Gender Address Height Weight Math Physics ID 1101 S_1 C_1 M street_1 173 63 34.0 A+ 1203 S_1 C_2 M street_6 160 53 58.8 A+ 1206 S_1 C_2 F street_6 167 63 68.4 B- 2402 S_2 C_4 M street_7 166 82 48.7 B
reindex_like的作用为生成一个横纵索引完全与参数列表一致的DataFrame ,数据使用被调用的表
df_temp= pd. DataFrame( { 'Weight' : np. zeros( 5 ) , 'Height' : np. zeros( 5 ) , 'ID' : [ 1101 , 1104 , 1103 , 1106 , 1102 ] } ) . set_index( 'ID' )
df_temp. reindex_like( df[ 0 : 5 ] [ [ 'Weight' , 'Height' ] ] )
Weight Height ID 1101 0.0 0.0 1102 0.0 0.0 1103 0.0 0.0 1104 0.0 0.0 1105 NaN NaN
如果df_temp单调还可以使用method参数
df_temp= pd. DataFrame( { 'Weight' : range ( 5 ) , 'Height' : range ( 5 ) , 'ID' : [ 1101 , 1104 , 1103 , 1106 , 1102 ] } ) . set_index( 'ID' ) . sort_index( )
df_temp. reindex_like( df[ 0 : 5 ] [ [ 'Weight' , 'Height' ] ] , method= 'bfill' )
Weight Height ID 1101 0 0 1102 4 4 1103 2 2 1104 1 1 1105 3 3
set_index和reset_index
先介绍set_index:从字面意思看,就是将某些列作为索引 使用表内列作为索引:
df. head( )
School Class Gender Address Height Weight Math Physics ID 1101 S_1 C_1 M street_1 173 63 34.0 A+ 1102 S_1 C_1 F street_2 192 73 32.5 B+ 1103 S_1 C_1 M street_2 186 82 87.2 B+ 1104 S_1 C_1 F street_2 167 81 80.4 B- 1105 S_1 C_1 F street_4 159 64 84.8 B+
df. set_index( 'Class' )
School Gender Address Height Weight Math Physics Class C_1 S_1 M street_1 173 63 34.0 A+ C_1 S_1 F street_2 192 73 32.5 B+ C_1 S_1 M street_2 186 82 87.2 B+ C_1 S_1 F street_2 167 81 80.4 B- C_1 S_1 F street_4 159 64 84.8 B+ C_2 S_1 M street_5 188 68 97.0 A- C_2 S_1 F street_4 176 94 63.5 B- C_2 S_1 M street_6 160 53 58.8 A+ C_2 S_1 F street_5 162 63 33.8 B C_2 S_1 F street_6 167 63 68.4 B- C_3 S_1 M street_4 161 68 31.5 B+ C_3 S_1 F street_1 175 57 87.7 A- C_3 S_1 M street_7 188 82 49.7 B C_3 S_1 M street_2 195 70 85.2 A C_3 S_1 F street_5 187 69 61.7 B- C_1 S_2 M street_7 174 84 83.3 C C_1 S_2 F street_6 161 61 50.6 B+ C_1 S_2 M street_4 157 61 52.5 B- C_1 S_2 F street_5 159 97 72.2 B+ C_1 S_2 M street_4 170 81 34.2 A C_2 S_2 M street_5 193 100 39.1 B C_2 S_2 F street_7 194 77 68.5 B+ C_2 S_2 M street_4 155 91 73.8 A+ C_2 S_2 M street_1 175 74 47.2 B- C_2 S_2 F street_7 183 76 85.4 B C_3 S_2 F street_4 157 78 72.3 B+ C_3 S_2 M street_5 171 88 32.7 A C_3 S_2 F street_7 190 99 65.9 C C_3 S_2 F street_6 164 81 95.5 A- C_3 S_2 M street_4 187 73 48.9 B C_4 S_2 F street_2 192 62 45.3 A C_4 S_2 M street_7 166 82 48.7 B C_4 S_2 F street_6 158 60 59.7 B+ C_4 S_2 F street_2 160 84 67.7 B C_4 S_2 F street_6 193 54 47.6 B
利用append参数可以将当前索引维持不变
df. set_index( 'Class' , append= True )
School Gender Address Height Weight Math Physics ID Class 1101 C_1 S_1 M street_1 173 63 34.0 A+ 1102 C_1 S_1 F street_2 192 73 32.5 B+ 1103 C_1 S_1 M street_2 186 82 87.2 B+ 1104 C_1 S_1 F street_2 167 81 80.4 B- 1105 C_1 S_1 F street_4 159 64 84.8 B+ 1201 C_2 S_1 M street_5 188 68 97.0 A- 1202 C_2 S_1 F street_4 176 94 63.5 B- 1203 C_2 S_1 M street_6 160 53 58.8 A+ 1204 C_2 S_1 F street_5 162 63 33.8 B 1205 C_2 S_1 F street_6 167 63 68.4 B- 1301 C_3 S_1 M street_4 161 68 31.5 B+ 1302 C_3 S_1 F street_1 175 57 87.7 A- 1303 C_3 S_1 M street_7 188 82 49.7 B 1304 C_3 S_1 M street_2 195 70 85.2 A 1305 C_3 S_1 F street_5 187 69 61.7 B- 2101 C_1 S_2 M street_7 174 84 83.3 C 2102 C_1 S_2 F street_6 161 61 50.6 B+ 2103 C_1 S_2 M street_4 157 61 52.5 B- 2104 C_1 S_2 F street_5 159 97 72.2 B+ 2105 C_1 S_2 M street_4 170 81 34.2 A 2201 C_2 S_2 M street_5 193 100 39.1 B 2202 C_2 S_2 F street_7 194 77 68.5 B+ 2203 C_2 S_2 M street_4 155 91 73.8 A+ 2204 C_2 S_2 M street_1 175 74 47.2 B- 2205 C_2 S_2 F street_7 183 76 85.4 B 2301 C_3 S_2 F street_4 157 78 72.3 B+ 2302 C_3 S_2 M street_5 171 88 32.7 A 2303 C_3 S_2 F street_7 190 99 65.9 C 2304 C_3 S_2 F street_6 164 81 95.5 A- 2305 C_3 S_2 M street_4 187 73 48.9 B 2401 C_4 S_2 F street_2 192 62 45.3 A 2402 C_4 S_2 M street_7 166 82 48.7 B 2403 C_4 S_2 F street_6 158 60 59.7 B+ 2404 C_4 S_2 F street_2 160 84 67.7 B 2405 C_4 S_2 F street_6 193 54 47.6 B
可以直接添加多级索引
df. set_index( [ pd. Series( range ( df. shape[ 0 ] ) ) , pd. Series( np. ones( df. shape[ 0 ] ) ) ] )
School Class Gender Address Height Weight Math Physics 0 1.0 S_1 C_1 M street_1 173 63 34.0 A+ 1 1.0 S_1 C_1 F street_2 192 73 32.5 B+ 2 1.0 S_1 C_1 M street_2 186 82 87.2 B+ 3 1.0 S_1 C_1 F street_2 167 81 80.4 B- 4 1.0 S_1 C_1 F street_4 159 64 84.8 B+ 5 1.0 S_1 C_2 M street_5 188 68 97.0 A- 6 1.0 S_1 C_2 F street_4 176 94 63.5 B- 7 1.0 S_1 C_2 M street_6 160 53 58.8 A+ 8 1.0 S_1 C_2 F street_5 162 63 33.8 B 9 1.0 S_1 C_2 F street_6 167 63 68.4 B- 10 1.0 S_1 C_3 M street_4 161 68 31.5 B+ 11 1.0 S_1 C_3 F street_1 175 57 87.7 A- 12 1.0 S_1 C_3 M street_7 188 82 49.7 B 13 1.0 S_1 C_3 M street_2 195 70 85.2 A 14 1.0 S_1 C_3 F street_5 187 69 61.7 B- 15 1.0 S_2 C_1 M street_7 174 84 83.3 C 16 1.0 S_2 C_1 F street_6 161 61 50.6 B+ 17 1.0 S_2 C_1 M street_4 157 61 52.5 B- 18 1.0 S_2 C_1 F street_5 159 97 72.2 B+ 19 1.0 S_2 C_1 M street_4 170 81 34.2 A 20 1.0 S_2 C_2 M street_5 193 100 39.1 B 21 1.0 S_2 C_2 F street_7 194 77 68.5 B+ 22 1.0 S_2 C_2 M street_4 155 91 73.8 A+ 23 1.0 S_2 C_2 M street_1 175 74 47.2 B- 24 1.0 S_2 C_2 F street_7 183 76 85.4 B 25 1.0 S_2 C_3 F street_4 157 78 72.3 B+ 26 1.0 S_2 C_3 M street_5 171 88 32.7 A 27 1.0 S_2 C_3 F street_7 190 99 65.9 C 28 1.0 S_2 C_3 F street_6 164 81 95.5 A- 29 1.0 S_2 C_3 M street_4 187 73 48.9 B 30 1.0 S_2 C_4 F street_2 192 62 45.3 A 31 1.0 S_2 C_4 M street_7 166 82 48.7 B 32 1.0 S_2 C_4 F street_6 158 60 59.7 B+ 33 1.0 S_2 C_4 F street_2 160 84 67.7 B 34 1.0 S_2 C_4 F street_6 193 54 47.6 B
reset_index方法,主要功能是索引重置
df. reset_index( )
ID School Class Gender Address Height Weight Math Physics 0 1101 S_1 C_1 M street_1 173 63 34.0 A+ 1 1102 S_1 C_1 F street_2 192 73 32.5 B+ 2 1103 S_1 C_1 M street_2 186 82 87.2 B+ 3 1104 S_1 C_1 F street_2 167 81 80.4 B- 4 1105 S_1 C_1 F street_4 159 64 84.8 B+ 5 1201 S_1 C_2 M street_5 188 68 97.0 A- 6 1202 S_1 C_2 F street_4 176 94 63.5 B- 7 1203 S_1 C_2 M street_6 160 53 58.8 A+ 8 1204 S_1 C_2 F street_5 162 63 33.8 B 9 1205 S_1 C_2 F street_6 167 63 68.4 B- 10 1301 S_1 C_3 M street_4 161 68 31.5 B+ 11 1302 S_1 C_3 F street_1 175 57 87.7 A- 12 1303 S_1 C_3 M street_7 188 82 49.7 B 13 1304 S_1 C_3 M street_2 195 70 85.2 A 14 1305 S_1 C_3 F street_5 187 69 61.7 B- 15 2101 S_2 C_1 M street_7 174 84 83.3 C 16 2102 S_2 C_1 F street_6 161 61 50.6 B+ 17 2103 S_2 C_1 M street_4 157 61 52.5 B- 18 2104 S_2 C_1 F street_5 159 97 72.2 B+ 19 2105 S_2 C_1 M street_4 170 81 34.2 A 20 2201 S_2 C_2 M street_5 193 100 39.1 B 21 2202 S_2 C_2 F street_7 194 77 68.5 B+ 22 2203 S_2 C_2 M street_4 155 91 73.8 A+ 23 2204 S_2 C_2 M street_1 175 74 47.2 B- 24 2205 S_2 C_2 F street_7 183 76 85.4 B 25 2301 S_2 C_3 F street_4 157 78 72.3 B+ 26 2302 S_2 C_3 M street_5 171 88 32.7 A 27 2303 S_2 C_3 F street_7 190 99 65.9 C 28 2304 S_2 C_3 F street_6 164 81 95.5 A- 29 2305 S_2 C_3 M street_4 187 73 48.9 B 30 2401 S_2 C_4 F street_2 192 62 45.3 A 31 2402 S_2 C_4 M street_7 166 82 48.7 B 32 2403 S_2 C_4 F street_6 158 60 59.7 B+ 33 2404 S_2 C_4 F street_2 160 84 67.7 B 34 2405 S_2 C_4 F street_6 193 54 47.6 B
用level参数指定哪一层被reset,用col_level参数指定set到哪一层
L1, L2= [ 'A' , 'B' , 'C' ] , [ 'a' , 'b' , 'c' ]
mul_index1= pd. MultiIndex. from_product( [ L1, L2] , names= ( 'Upper' , 'Lower' ) )
L3, L4= [ 'D' , 'E' , 'F' ] , [ 'd' , 'e' , 'f' ]
mul_index2= pd. MultiIndex. from_product( [ L3, L4] , names= ( 'Big' , 'Small' ) )
df_temp= pd. DataFrame( np. random. rand( 9 , 9 ) , index= mul_index1, columns= mul_index2)
df_temp
Big D E F Small d e f d e f d e f Upper Lower A a 0.078439 0.606473 0.027336 0.820540 0.724865 0.013541 0.812242 0.059455 0.411638 b 0.924012 0.655697 0.288471 0.056228 0.201486 0.832335 0.521111 0.207224 0.846760 c 0.135154 0.558603 0.157117 0.148768 0.059379 0.156931 0.903250 0.800708 0.427384 B a 0.608138 0.679760 0.079642 0.876910 0.733128 0.544612 0.619875 0.895887 0.232243 b 0.855668 0.701133 0.367470 0.772994 0.936175 0.487915 0.814533 0.153307 0.711710 c 0.555524 0.645412 0.092394 0.637409 0.848008 0.526416 0.515075 0.175714 0.480151 C a 0.703797 0.408054 0.734173 0.425674 0.807381 0.164609 0.742690 0.049613 0.115544 b 0.425531 0.788919 0.759588 0.486930 0.102313 0.518317 0.197488 0.424122 0.339473 c 0.130966 0.868225 0.840692 0.494551 0.942597 0.109135 0.475903 0.164280 0.973699
df_temp1= df_temp. reset_index( level= 1 , col_level= 1 )
df_temp1
Big D E F Small Lower d e f d e f d e f Upper A a 0.078439 0.606473 0.027336 0.820540 0.724865 0.013541 0.812242 0.059455 0.411638 A b 0.924012 0.655697 0.288471 0.056228 0.201486 0.832335 0.521111 0.207224 0.846760 A c 0.135154 0.558603 0.157117 0.148768 0.059379 0.156931 0.903250 0.800708 0.427384 B a 0.608138 0.679760 0.079642 0.876910 0.733128 0.544612 0.619875 0.895887 0.232243 B b 0.855668 0.701133 0.367470 0.772994 0.936175 0.487915 0.814533 0.153307 0.711710 B c 0.555524 0.645412 0.092394 0.637409 0.848008 0.526416 0.515075 0.175714 0.480151 C a 0.703797 0.408054 0.734173 0.425674 0.807381 0.164609 0.742690 0.049613 0.115544 C b 0.425531 0.788919 0.759588 0.486930 0.102313 0.518317 0.197488 0.424122 0.339473 C c 0.130966 0.868225 0.840692 0.494551 0.942597 0.109135 0.475903 0.164280 0.973699
rename_axis和rename
rename_axis是针对多级索引的方法,作用是修改某一层的索引名,而不是索引标签!
df_temp. rename_axis( index= { 'Lower' : 'LowerLower' } , columns= { 'Big' : 'BigBig' } )
BigBig D E F Small d e f d e f d e f Upper LowerLower A a 0.078439 0.606473 0.027336 0.820540 0.724865 0.013541 0.812242 0.059455 0.411638 b 0.924012 0.655697 0.288471 0.056228 0.201486 0.832335 0.521111 0.207224 0.846760 c 0.135154 0.558603 0.157117 0.148768 0.059379 0.156931 0.903250 0.800708 0.427384 B a 0.608138 0.679760 0.079642 0.876910 0.733128 0.544612 0.619875 0.895887 0.232243 b 0.855668 0.701133 0.367470 0.772994 0.936175 0.487915 0.814533 0.153307 0.711710 c 0.555524 0.645412 0.092394 0.637409 0.848008 0.526416 0.515075 0.175714 0.480151 C a 0.703797 0.408054 0.734173 0.425674 0.807381 0.164609 0.742690 0.049613 0.115544 b 0.425531 0.788919 0.759588 0.486930 0.102313 0.518317 0.197488 0.424122 0.339473 c 0.130966 0.868225 0.840692 0.494551 0.942597 0.109135 0.475903 0.164280 0.973699
rename方法用于修改列或者行索引标签,而不是索引名
df_temp. rename( index= { 'A' : "AS" } , columns= { 'F' : 'FF' } )
Big D E FF Small d e f d e f d e f Upper Lower AS a 0.078439 0.606473 0.027336 0.820540 0.724865 0.013541 0.812242 0.059455 0.411638 b 0.924012 0.655697 0.288471 0.056228 0.201486 0.832335 0.521111 0.207224 0.846760 c 0.135154 0.558603 0.157117 0.148768 0.059379 0.156931 0.903250 0.800708 0.427384 B a 0.608138 0.679760 0.079642 0.876910 0.733128 0.544612 0.619875 0.895887 0.232243 b 0.855668 0.701133 0.367470 0.772994 0.936175 0.487915 0.814533 0.153307 0.711710 c 0.555524 0.645412 0.092394 0.637409 0.848008 0.526416 0.515075 0.175714 0.480151 C a 0.703797 0.408054 0.734173 0.425674 0.807381 0.164609 0.742690 0.049613 0.115544 b 0.425531 0.788919 0.759588 0.486930 0.102313 0.518317 0.197488 0.424122 0.339473 c 0.130966 0.868225 0.840692 0.494551 0.942597 0.109135 0.475903 0.164280 0.973699
常用索引型函数
where函数
df. head( )
School Class Gender Address Height Weight Math Physics ID 1101 S_1 C_1 M street_1 173 63 34.0 A+ 1102 S_1 C_1 F street_2 192 73 32.5 B+ 1103 S_1 C_1 M street_2 186 82 87.2 B+ 1104 S_1 C_1 F street_2 167 81 80.4 B- 1105 S_1 C_1 F street_4 159 64 84.8 B+
df. where( df[ 'Gender' ] == 'M' )
School Class Gender Address Height Weight Math Physics ID 1101 S_1 C_1 M street_1 173.0 63.0 34.0 A+ 1102 NaN NaN NaN NaN NaN NaN NaN NaN 1103 S_1 C_1 M street_2 186.0 82.0 87.2 B+ 1104 NaN NaN NaN NaN NaN NaN NaN NaN 1105 NaN NaN NaN NaN NaN NaN NaN NaN 1201 S_1 C_2 M street_5 188.0 68.0 97.0 A- 1202 NaN NaN NaN NaN NaN NaN NaN NaN 1203 S_1 C_2 M street_6 160.0 53.0 58.8 A+ 1204 NaN NaN NaN NaN NaN NaN NaN NaN 1205 NaN NaN NaN NaN NaN NaN NaN NaN 1301 S_1 C_3 M street_4 161.0 68.0 31.5 B+ 1302 NaN NaN NaN NaN NaN NaN NaN NaN 1303 S_1 C_3 M street_7 188.0 82.0 49.7 B 1304 S_1 C_3 M street_2 195.0 70.0 85.2 A 1305 NaN NaN NaN NaN NaN NaN NaN NaN 2101 S_2 C_1 M street_7 174.0 84.0 83.3 C 2102 NaN NaN NaN NaN NaN NaN NaN NaN 2103 S_2 C_1 M street_4 157.0 61.0 52.5 B- 2104 NaN NaN NaN NaN NaN NaN NaN NaN 2105 S_2 C_1 M street_4 170.0 81.0 34.2 A 2201 S_2 C_2 M street_5 193.0 100.0 39.1 B 2202 NaN NaN NaN NaN NaN NaN NaN NaN 2203 S_2 C_2 M street_4 155.0 91.0 73.8 A+ 2204 S_2 C_2 M street_1 175.0 74.0 47.2 B- 2205 NaN NaN NaN NaN NaN NaN NaN NaN 2301 NaN NaN NaN NaN NaN NaN NaN NaN 2302 S_2 C_3 M street_5 171.0 88.0 32.7 A 2303 NaN NaN NaN NaN NaN NaN NaN NaN 2304 NaN NaN NaN NaN NaN NaN NaN NaN 2305 S_2 C_3 M street_4 187.0 73.0 48.9 B 2401 NaN NaN NaN NaN NaN NaN NaN NaN 2402 S_2 C_4 M street_7 166.0 82.0 48.7 B 2403 NaN NaN NaN NaN NaN NaN NaN NaN 2404 NaN NaN NaN NaN NaN NaN NaN NaN 2405 NaN NaN NaN NaN NaN NaN NaN NaN
df. where( df[ 'Gender' ] == 'M' ) . dropna( )
School Class Gender Address Height Weight Math Physics ID 1101 S_1 C_1 M street_1 173.0 63.0 34.0 A+ 1103 S_1 C_1 M street_2 186.0 82.0 87.2 B+ 1201 S_1 C_2 M street_5 188.0 68.0 97.0 A- 1203 S_1 C_2 M street_6 160.0 53.0 58.8 A+ 1301 S_1 C_3 M street_4 161.0 68.0 31.5 B+ 1303 S_1 C_3 M street_7 188.0 82.0 49.7 B 1304 S_1 C_3 M street_2 195.0 70.0 85.2 A 2101 S_2 C_1 M street_7 174.0 84.0 83.3 C 2103 S_2 C_1 M street_4 157.0 61.0 52.5 B- 2105 S_2 C_1 M street_4 170.0 81.0 34.2 A 2201 S_2 C_2 M street_5 193.0 100.0 39.1 B 2203 S_2 C_2 M street_4 155.0 91.0 73.8 A+ 2204 S_2 C_2 M street_1 175.0 74.0 47.2 B- 2302 S_2 C_3 M street_5 171.0 88.0 32.7 A 2305 S_2 C_3 M street_4 187.0 73.0 48.9 B 2402 S_2 C_4 M street_7 166.0 82.0 48.7 B
df[ df[ 'Gender' ] == 'M' ]
School Class Gender Address Height Weight Math Physics ID 1101 S_1 C_1 M street_1 173 63 34.0 A+ 1103 S_1 C_1 M street_2 186 82 87.2 B+ 1201 S_1 C_2 M street_5 188 68 97.0 A- 1203 S_1 C_2 M street_6 160 53 58.8 A+ 1301 S_1 C_3 M street_4 161 68 31.5 B+ 1303 S_1 C_3 M street_7 188 82 49.7 B 1304 S_1 C_3 M street_2 195 70 85.2 A 2101 S_2 C_1 M street_7 174 84 83.3 C 2103 S_2 C_1 M street_4 157 61 52.5 B- 2105 S_2 C_1 M street_4 170 81 34.2 A 2201 S_2 C_2 M street_5 193 100 39.1 B 2203 S_2 C_2 M street_4 155 91 73.8 A+ 2204 S_2 C_2 M street_1 175 74 47.2 B- 2302 S_2 C_3 M street_5 171 88 32.7 A 2305 S_2 C_3 M street_4 187 73 48.9 B 2402 S_2 C_4 M street_7 166 82 48.7 B
df. where( df[ 'Gender' ] == 'M' , np. random. rand( df. shape[ 0 ] , df. shape[ 1 ] ) )
School Class Gender Address Height Weight Math Physics ID 1101 S_1 C_1 M street_1 173.000000 63.000000 34.000000 A+ 1102 0.161738 0.390247 0.657045 0.0881898 0.111276 0.179249 0.671646 0.0845629 1103 S_1 C_1 M street_2 186.000000 82.000000 87.200000 B+ 1104 0.691323 0.0594906 0.580446 0.108647 0.487408 0.891030 0.323322 0.462188 1105 0.249755 0.924071 0.941096 0.136537 0.220908 0.132754 0.590290 0.227241 1201 S_1 C_2 M street_5 188.000000 68.000000 97.000000 A- 1202 0.375964 0.0225906 0.591233 0.738239 0.488456 0.850155 0.133923 0.196627 1203 S_1 C_2 M street_6 160.000000 53.000000 58.800000 A+ 1204 0.98637 0.12643 0.3801 0.457792 0.427230 0.086500 0.665344 0.0635772 1205 0.260435 0.547476 0.427186 0.432637 0.401769 0.285926 0.816576 0.543128 1301 S_1 C_3 M street_4 161.000000 68.000000 31.500000 B+ 1302 0.0582459 0.447284 0.467435 0.781373 0.409279 0.601420 0.259086 0.694789 1303 S_1 C_3 M street_7 188.000000 82.000000 49.700000 B 1304 S_1 C_3 M street_2 195.000000 70.000000 85.200000 A 1305 0.0215951 0.576386 0.39919 0.417488 0.703072 0.132706 0.449112 0.946308 2101 S_2 C_1 M street_7 174.000000 84.000000 83.300000 C 2102 0.956707 0.0782564 0.145532 0.30358 0.607291 0.296191 0.430293 0.0231763 2103 S_2 C_1 M street_4 157.000000 61.000000 52.500000 B- 2104 0.562654 0.54194 0.593674 0.624079 0.872437 0.392348 0.333359 0.459291 2105 S_2 C_1 M street_4 170.000000 81.000000 34.200000 A 2201 S_2 C_2 M street_5 193.000000 100.000000 39.100000 B 2202 0.713538 0.262785 0.106574 0.830532 0.843710 0.928449 0.577797 0.372659 2203 S_2 C_2 M street_4 155.000000 91.000000 73.800000 A+ 2204 S_2 C_2 M street_1 175.000000 74.000000 47.200000 B- 2205 0.255774 0.785794 0.774971 0.147758 0.421119 0.036038 0.360223 0.657362 2301 0.844661 0.15034 0.643687 0.323073 0.879450 0.573411 0.221043 0.123979 2302 S_2 C_3 M street_5 171.000000 88.000000 32.700000 A 2303 0.169579 0.101833 0.310207 0.477993 0.143484 0.575178 0.776476 0.941298 2304 0.349115 0.847212 0.0358821 0.897638 0.252217 0.359201 0.392941 0.813281 2305 S_2 C_3 M street_4 187.000000 73.000000 48.900000 B 2401 0.0898209 0.156882 0.302667 0.543748 0.885245 0.816117 0.132235 0.944388 2402 S_2 C_4 M street_7 166.000000 82.000000 48.700000 B 2403 0.860598 0.872737 0.619551 0.980374 0.939624 0.875574 0.156061 0.840507 2404 0.796595 0.845591 0.967 0.889682 0.122028 0.604279 0.336356 0.663724 2405 0.519106 0.784599 0.984037 0.929875 0.953832 0.398572 0.652583 0.135617
df. shape[ 0 ]
35
df. shape[ 1 ]
8
np.random? 的解释
random 在[0,1)上取均匀浮点值 bytes np.random 的解释 permutation 生成随机序列 choice 生成随机样本 rand值均匀分布。 randn正态分布的值。 ranf均匀分布的浮点数。
mask函数
mask函数与where功能上相反,其余完全一致,即对条件为True的单元进行填充
query函数
df. head( )
School Class Gender Address Height Weight Math Physics ID 1101 S_1 C_1 M street_1 173 63 34.0 A+ 1102 S_1 C_1 F street_2 192 73 32.5 B+ 1103 S_1 C_1 M street_2 186 82 87.2 B+ 1104 S_1 C_1 F street_2 167 81 80.4 B- 1105 S_1 C_1 F street_4 159 64 84.8 B+
query函数中的布尔表达式中,下面的符号都是合法的:行列索引名,字符串,and/not/or/&/|/~/not in /in/==/!=、四则运算
df. query( '(Address in ["street_4","street_7"])&(Weight>=(20+40))&(ID in [1101,1102,1103,2402])' )
School Class Gender Address Height Weight Math Physics ID 2402 S_2 C_4 M street_7 166 82 48.7 B
df. query( '(Class in ["C_1","C_2"])&(Math>45) and (Physics in ["A+","A"]) ' )
School Class Gender Address Height Weight Math Physics ID 1203 S_1 C_2 M street_6 160 53 58.8 A+ 2203 S_2 C_2 M street_4 155 91 73.8 A+
df. info( )
<class 'pandas.core.frame.DataFrame'>
Int64Index: 35 entries, 1101 to 2405
Data columns (total 8 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 School 35 non-null object
1 Class 35 non-null object
2 Gender 35 non-null object
3 Address 35 non-null object
4 Height 35 non-null int64
5 Weight 35 non-null int64
6 Math 35 non-null float64
7 Physics 35 non-null object
dtypes: float64(1), int64(2), object(5)
memory usage: 3.7+ KB
重复元素处理
duplicater方法
df. duplicated( 'Class' )
ID
1101 False
1102 True
1103 True
1104 True
1105 True
1201 False
1202 True
1203 True
1204 True
1205 True
1301 False
1302 True
1303 True
1304 True
1305 True
2101 True
2102 True
2103 True
2104 True
2105 True
2201 True
2202 True
2203 True
2204 True
2205 True
2301 True
2302 True
2303 True
2304 True
2305 True
2401 False
2402 True
2403 True
2404 True
2405 True
dtype: bool
可选keep默认为first,即首次出现设为不重复,若为last,则最后一次设为不重复,若为False,则所有重复项为True
drop_duplicates方法
剔除重复项,例如保留每组的第一个值(可以通过keep参数控制)、最后一个值
df. drop_duplicates( 'Class' )
School Class Gender Address Height Weight Math Physics ID 1101 S_1 C_1 M street_1 173 63 34.0 A+ 1201 S_1 C_2 M street_5 188 68 97.0 A- 1301 S_1 C_3 M street_4 161 68 31.5 B+ 2401 S_2 C_4 F street_2 192 62 45.3 A
df. drop_duplicates( 'Class' , keep= 'last' )
School Class Gender Address Height Weight Math Physics ID 2105 S_2 C_1 M street_4 170 81 34.2 A 2205 S_2 C_2 F street_7 183 76 85.4 B 2305 S_2 C_3 M street_4 187 73 48.9 B 2405 S_2 C_4 F street_6 193