import numpy as np
import pandas as pd
# Load the student table; the CSV's 'ID' column becomes the row index.
df = pd.read_csv(
    './data/table.csv',
    index_col='ID',
)
# Preview the first rows of the loaded frame.
df.head()
|
Unnamed: 0 |
School |
Class |
Gender |
Address |
Height |
Weight |
Math |
Physics |
ID |
|
|
|
|
|
|
|
|
|
1101 |
0 |
S_1 |
C_1 |
M |
street_1 |
173 |
63 |
34.0 |
A+ |
1102 |
1 |
S_1 |
C_1 |
F |
street_2 |
192 |
73 |
32.5 |
B+ |
1103 |
2 |
S_1 |
C_1 |
M |
street_2 |
186 |
82 |
87.2 |
B+ |
1104 |
3 |
S_1 |
C_1 |
F |
street_2 |
167 |
81 |
80.4 |
B- |
1105 |
4 |
S_1 |
C_1 |
F |
street_4 |
159 |
64 |
84.8 |
B+ |
区间索引
RMK:区间索引并非只能用于单级索引,它只是一种特殊类型的索引方式
利用interval_range方法
# Build five unit-width intervals covering 0 through 5; closed='both' makes
# every interval include both of its endpoints.
pd.interval_range(start=0,end=5,closed = 'both')
IntervalIndex([[0, 1], [1, 2], [2, 3], [3, 4], [4, 5]],
closed='both',
dtype='interval[int64]')
# Build eight consecutive intervals of width 5 starting at 0. No `closed`
# argument is given, so the intervals are right-closed (the default).
pd.interval_range(start = 0,periods = 8, freq = 5)
IntervalIndex([(0, 5], (5, 10], (10, 15], (15, 20], (20, 25], (25, 30], (30, 35], (35, 40]],
closed='right',
dtype='interval[int64]')
利用cut将数值列转为区间为元素的分类变量,例如统计数学成绩的区间情况
# Bin every Math score into one of four right-closed bands:
# (0, 40], (40, 60], (60, 80], (80, 100].
math_interval = pd.cut(
    x=df['Math'],
    bins=[0, 40, 60, 80, 100],
)
math_interval.head()
ID
1101 (0, 40]
1102 (0, 40]
1103 (80, 100]
1104 (80, 100]
1105 (80, 100]
Name: Math, dtype: category
Categories (4, interval[int64]): [(0, 40] < (40, 60] < (60, 80] < (80, 100]]
区间索引的选取
# Attach the binned scores to the original frame. The binned Series is also
# named 'Math', so rsuffix renames the joined column to 'Math_interval'.
df_i_1 = df.join(
    math_interval,
    rsuffix='_interval',
)
df_i_1.head()
|
Unnamed: 0 |
School |
Class |
Gender |
Address |
Height |
Weight |
Math |
Physics |
Math_interval |
ID |
|
|
|
|
|
|
|
|
|
|
1101 |
0 |
S_1 |
C_1 |
M |
street_1 |
173 |
63 |
34.0 |
A+ |
(0, 40] |
1102 |
1 |
S_1 |
C_1 |
F |
street_2 |
192 |
73 |
32.5 |
B+ |
(0, 40] |
1103 |
2 |
S_1 |
C_1 |
M |
street_2 |
186 |
82 |
87.2 |
B+ |
(80, 100] |
1104 |
3 |
S_1 |
C_1 |
F |
street_2 |
167 |
81 |
80.4 |
B- |
(80, 100] |
1105 |
4 |
S_1 |
C_1 |
F |
street_4 |
159 |
64 |
84.8 |
B+ |
(80, 100] |
# Keep only the raw Math score and its interval label.
df_i_2 = df_i_1[['Math', 'Math_interval']]
df_i_2.head()