import numpy as np
import matplotlib. pyplot as plt
from sklearn. neighbors import KNeighborsRegressor
# Noise-free reference curve: 40 evenly spaced points on [0, 5] and their sine.
x = np.linspace(start=0, stop=5, num=40)
y = np.sin(x)
plt.scatter(x=x, y=y)
<matplotlib.collections.PathCollection at 0x5a83d30>
# Same sine curve, but every second sample is pushed upward by one-sided noise.
x = np.linspace(start=0, stop=5, num=40)
y = np.sin(x)
noise = np.random.random(size=20)  # one draw per even-indexed sample (40 / 2)
y[0::2] = y[0::2] + noise
plt.scatter(x=x, y=y)
<matplotlib.collections.PathCollection at 0x6525350>
# Sine curve with symmetric noise in [-0.5, 0.5) added to every second sample.
x = np.linspace(start=0, stop=5, num=40)
y = np.sin(x)
noise = np.random.uniform(-0.5, 0.5, size=20)  # one draw per even-indexed sample
y[0::2] = y[0::2] + noise
plt.scatter(x=x, y=y)
<matplotlib.collections.PathCollection at 0x695c9f0>
# Regenerate the noisy sine sample that the regressor below is trained on.
noise_low, noise_high = -0.5, 0.5
x = np.linspace(start=0, stop=5, num=40)
y = np.sin(x)
# Only the even-indexed half of the points (20 of 40) is perturbed.
y[::2] += np.random.uniform(noise_low, noise_high, size=20)
plt.scatter(x=x, y=y)
<matplotlib.collections.PathCollection at 0x6a79cb0>
# Fit a 5-nearest-neighbour regressor on the noisy sine sample.
knn_r = KNeighborsRegressor(n_neighbors=5)
x_column = x[:, np.newaxis]  # scikit-learn expects a 2-D (n_samples, n_features) input
knn_r.fit(x_column, y)
KNeighborsRegressor(algorithm='auto', leaf_size=30, metric='minkowski',
metric_params=None, n_jobs=None, n_neighbors=5, p=2,
weights='uniform')
# Evaluate the fitted regressor on a denser grid and overlay it on the training points.
x2 = np.linspace(start=0, stop=5, num=100)
grid_column = x2[:, np.newaxis]  # one feature per row, as predict() requires
y2 = knn_r.predict(grid_column)
plt.scatter(x=x, y=y)
plt.plot(x2, y2, color='red')
plt.show()
# Compare the fitted curve for several neighbour counts, one subplot per value of k.
ks = [3, 4, 7, 9, 11, 13]

# The 2-D views of the training and evaluation inputs do not change between
# iterations, so build them once.
train_column = x.reshape(-1, 1)
grid_column = x2.reshape(-1, 1)

plt.figure(figsize=(20, 12))
for position, k in enumerate(ks, start=1):
    plt.subplot(2, 3, position)  # 2 x 3 grid holds the six values in `ks`
    plt.title(f'Neighbors: {k}', size=20)
    knn_r.set_params(n_neighbors=k)
    knn_r.fit(train_column, y)
    y2 = knn_r.predict(grid_column)
    plt.scatter(x, y)
    plt.plot(x2, y2, c='red')
plt.show()
KNN之年收入预测
import numpy as np
import pandas as pd
import matplotlib. pyplot as plt
加载数据
# Read the census-style table from disk and preview its first five rows.
adults = pd.read_csv(filepath_or_buffer='data/adults.txt')
adults.head(n=5)
age workclass final_weight education education_num marital_status occupation relationship race sex capital_gain capital_loss hours_per_week native_country salary 0 39 State-gov 77516 Bachelors 13 Never-married Adm-clerical Not-in-family White Male 2174 0 40 United-States <=50K 1 50 Self-emp-not-inc 83311 Bachelors 13 Married-civ-spouse Exec-managerial Husband White Male 0 0 13 United-States <=50K 2 38 Private 215646 HS-grad 9 Divorced Handlers-cleaners Not-in-family White Male 0 0 40 United-States <=50K 3 53 Private 234721 11th 7 Married-civ-spouse Handlers-cleaners Husband Black Male 0 0 40 United-States <=50K 4 28 Private 338409 Bachelors 13 Married-civ-spouse Prof-specialty Wife Black Female 0 0 40 Cuba <=50K
一次性将非数值的列转化为数值列
# Per-column dtypes of `adults`; the `object` entries are the string-valued
# columns that get encoded as integers further down.
columns = adults. dtypes
columns
age int64
workclass object
final_weight int64
education object
education_num int64
marital_status object
occupation object
relationship object
race object
sex object
capital_gain int64
capital_loss int64
hours_per_week int64
native_country object
salary object
dtype: object
# Show the distinct values of every string-valued (object dtype) column.
# The builtin `object` is used for the dtype comparison because the `np.object`
# alias no longer exists in current NumPy releases.
for column in adults.columns[adults.dtypes == object]:
    display(adults[column].unique())
array(['State-gov', 'Self-emp-not-inc', 'Private', 'Federal-gov',
'Local-gov', '?', 'Self-emp-inc', 'Without-pay', 'Never-worked'],
dtype=object)
array(['Bachelors', 'HS-grad', '11th', 'Masters', '9th', 'Some-college',
'Assoc-acdm', 'Assoc-voc', '7th-8th', 'Doctorate', 'Prof-school',
'5th-6th', '10th', '1st-4th', 'Preschool', '12th'], dtype=object)
array(['Never-married', 'Married-civ-spouse', 'Divorced',
'Married-spouse-absent', 'Separated', 'Married-AF-spouse',
'Widowed'], dtype=object)
array(['Adm-clerical', 'Exec-managerial', 'Handlers-cleaners',
'Prof-specialty', 'Other-service', 'Sales', 'Craft-repair',
'Transport-moving', 'Farming-fishing', 'Machine-op-inspct',
'Tech-support', '?', 'Protective-serv', 'Armed-Forces',
'Priv-house-serv'], dtype=object)
array(['Not-in-family', 'Husband', 'Wife', 'Own-child', 'Unmarried',
'Other-relative'], dtype=object)
array(['White', 'Black', 'Asian-Pac-Islander', 'Amer-Indian-Eskimo',
'Other'], dtype=object)
array(['Male', 'Female'], dtype=object)
array(['United-States', 'Cuba', 'Jamaica', 'India', '?', 'Mexico',
'South', 'Puerto-Rico', 'Honduras', 'England', 'Canada', 'Germany',
'Iran', 'Philippines', 'Italy', 'Poland', 'Columbia', 'Cambodia',
'Thailand', 'Ecuador', 'Laos', 'Taiwan', 'Haiti', 'Portugal',
'Dominican-Republic', 'El-Salvador', 'France', 'Guatemala',
'China', 'Japan', 'Yugoslavia', 'Peru',
'Outlying-US(Guam-USVI-etc)', 'Scotland', 'Trinadad&Tobago',
'Greece', 'Nicaragua', 'Vietnam', 'Hong', 'Ireland', 'Hungary',
'Holand-Netherlands'], dtype=object)
array(['<=50K', '>50K'], dtype=object)
# Preview, for every string-valued column, the value -> integer code mapping
# obtained by numbering the distinct values in order of first appearance.
# The builtin `object` replaces the removed `np.object` alias.
for column in adults.columns[adults.dtypes == object]:
    display({v: i for i, v in enumerate(adults[column].unique())})
{'State-gov': 0,
'Self-emp-not-inc': 1,
'Private': 2,
'Federal-gov': 3,
'Local-gov': 4,
'?': 5,
'Self-emp-inc': 6,
'Without-pay': 7,
'Never-worked': 8}
{'Bachelors': 0,
'HS-grad': 1,
'11th': 2,
'Masters': 3,
'9th': 4,
'Some-college': 5,
'Assoc-acdm': 6,
'Assoc-voc': 7,
'7th-8th': 8,
'Doctorate': 9,
'Prof-school': 10,
'5th-6th': 11,
'10th': 12,
'1st-4th': 13,
'Preschool': 14,
'12th': 15}
{'Never-married': 0,
'Married-civ-spouse': 1,
'Divorced': 2,
'Married-spouse-absent': 3,
'Separated': 4,
'Married-AF-spouse': 5,
'Widowed': 6}
{'Adm-clerical': 0,
'Exec-managerial': 1,
'Handlers-cleaners': 2,
'Prof-specialty': 3,
'Other-service': 4,
'Sales': 5,
'Craft-repair': 6,
'Transport-moving': 7,
'Farming-fishing': 8,
'Machine-op-inspct': 9,
'Tech-support': 10,
'?': 11,
'Protective-serv': 12,
'Armed-Forces': 13,
'Priv-house-serv': 14}
{'Not-in-family': 0,
'Husband': 1,
'Wife': 2,
'Own-child': 3,
'Unmarried': 4,
'Other-relative': 5}
{'White': 0,
'Black': 1,
'Asian-Pac-Islander': 2,
'Amer-Indian-Eskimo': 3,
'Other': 4}
{'Male': 0, 'Female': 1}
{'United-States': 0,
'Cuba': 1,
'Jamaica': 2,
'India': 3,
'?': 4,
'Mexico': 5,
'South': 6,
'Puerto-Rico': 7,
'Honduras': 8,
'England': 9,
'Canada': 10,
'Germany': 11,
'Iran': 12,
'Philippines': 13,
'Italy': 14,
'Poland': 15,
'Columbia': 16,
'Cambodia': 17,
'Thailand': 18,
'Ecuador': 19,
'Laos': 20,
'Taiwan': 21,
'Haiti': 22,
'Portugal': 23,
'Dominican-Republic': 24,
'El-Salvador': 25,
'France': 26,
'Guatemala': 27,
'China': 28,
'Japan': 29,
'Yugoslavia': 30,
'Peru': 31,
'Outlying-US(Guam-USVI-etc)': 32,
'Scotland': 33,
'Trinadad&Tobago': 34,
'Greece': 35,
'Nicaragua': 36,
'Vietnam': 37,
'Hong': 38,
'Ireland': 39,
'Hungary': 40,
'Holand-Netherlands': 41}
{'<=50K': 0, '>50K': 1}
# Encode every string-valued column as integer codes and remember each
# column's value -> code mapping so the codes can be decoded later.
all_column_map = {}
# The column selection is computed once, before any column is rewritten.
# The builtin `object` replaces the removed `np.object` alias.
for column in adults.columns[adults.dtypes == object]:
    # Codes follow the order in which values first appear in the column.
    column_map = {v: i for i, v in enumerate(adults[column].unique())}
    all_column_map[column] = column_map
    # Plain column assignment replaces the column, so it takes the integer
    # dtype of the mapped values. `.loc[:, column] = ...` writes into the
    # existing object-dtype column on recent pandas and would keep that dtype.
    adults[column] = adults[column].map(column_map)
adults.head()
age workclass final_weight education education_num marital_status occupation relationship race sex capital_gain capital_loss hours_per_week native_country salary 0 39 0 77516 0 13 0 0 0 0 0 2174 0 40 0 0 1 50 1 83311 0 13 1 1 1 0 0 0 0 13 0 0 2 38 2 215646 1 9 2 2 0 0 0 0 0 40 0 0 3 53 2 234721 2 7 1 2 1 1 0 0 0 40 0 0 4 28 2 338409 0 13 1 3 2 1 1 0 0 40 1 0
adults. dtypes
age int64
workclass int64
final_weight int64
education int64
education_num int64
marital_status int64
occupation int64
relationship int64
race int64
sex int64
capital_gain int64
capital_loss int64
hours_per_week int64
native_country int64
salary int64
dtype: object
删除相关的列
final_weight capital_gain education_num
# Remove three columns from the table in place, then preview what is left.
# NOTE: re-running this cell raises KeyError because the columns are already gone.
columns_to_drop = ['final_weight', 'capital_gain', 'education_num']
adults.drop(columns=columns_to_drop, inplace=True)
adults.head()
age workclass education marital_status occupation relationship race sex capital_loss hours_per_week native_country salary 0 39 0 0 0 0 0 0 0 0 40 0 0 1 50 1 0 1 1 1 0 0 0 13 0 0 2 38 2 1 2 2 0 0 0 0 40 0 0 3 53 2 2 1 2 1 1 0 0 40 0 0 4 28 2 0 1 3 2 1 1 0 40 1 0
将 salary 作为目标
all_column_map[ 'salary' ]
{'<=50K': 0, '>50K': 1}
# Turn the integer salary codes back into their original string labels so the
# classifier is trained on (and predicts) readable class names.
salary_labels = {code: label for label, code in all_column_map['salary'].items()}
# Replace the column outright: the existing column is integer-typed, and
# writing strings into it through `.loc[:, 'salary']` depends on implicit
# upcasting, which recent pandas versions warn about.
adults['salary'] = adults['salary'].map(salary_labels)
adults.head()
age workclass education marital_status occupation relationship race sex capital_loss hours_per_week native_country salary 0 39 0 0 0 0 0 0 0 0 40 0 <=50K 1 50 1 0 1 1 1 0 0 0 13 0 <=50K 2 38 2 1 2 2 0 0 0 0 40 0 <=50K 3 53 2 2 1 2 1 1 0 0 40 0 <=50K 4 28 2 0 1 3 2 1 1 0 40 1 <=50K
提取样本集
# Split the table into a feature matrix (all columns but the last) and a
# target vector (the last column), both as NumPy arrays.
*feature_columns, target_column = adults.columns
data = adults[feature_columns].values
target = adults[target_column].values
display(data.shape, target.shape)
(32561, 11)
(32561,)
from sklearn. neighbors import KNeighborsClassifier
from sklearn. model_selection import train_test_split as split
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = split(
    data,
    target,
    test_size=0.2,
    random_state=100,
)
# Baseline: classifier with default settings, scored on the held-out rows.
knn = KNeighborsClassifier().fit(X_train, y_train)
knn.score(X_test, y_test)
0.8002456625211116
# Refit the same classifier for several odd neighbour counts and print the
# held-out score for each.
for k in range(3, 12, 2):  # 3, 5, 7, 9, 11
    # set_params() and fit() both return the estimator, so the calls chain.
    score = knn.set_params(n_neighbors=k).fit(X_train, y_train).score(X_test, y_test)
    print(k, score)
3 0.7925687087363734
5 0.8002456625211116
7 0.8036235221863964
9 0.8036235221863964
11 0.7984031936127745
# Sweep neighbour count and vote weighting together; weighting varies fastest,
# so the two results for a given k are printed next to each other.
settings = [
    (k, weight)
    for k in [3, 5, 7, 9, 11]
    for weight in ['uniform', 'distance']
]
for k, weight in settings:
    knn.set_params(n_neighbors=k, weights=weight)
    knn.fit(X_train, y_train)
    score = knn.score(X_test, y_test)
    print(k, weight, score)
3 uniform 0.7925687087363734
3 distance 0.7893443881467833
5 uniform 0.8002456625211116
5 distance 0.796714263780132
7 uniform 0.8036235221863964
7 distance 0.799324428066943
9 uniform 0.8036235221863964
9 distance 0.7999385843697221
11 uniform 0.7984031936127745
11 distance 0.7974819591586059
# Second sweep over a different set of neighbour counts, again trying both
# vote weightings for each k.
neighbor_counts = [6, 7, 8, 9, 13, 15]
weightings = ['uniform', 'distance']
for k in neighbor_counts:
    for weight in weightings:
        params = {'n_neighbors': k, 'weights': weight}
        knn.set_params(**params)
        knn.fit(X_train, y_train)
        score = knn.score(X_test, y_test)
        print(k, weight, score)
6 uniform 0.8070013818516812
6 distance 0.7984031936127745
7 uniform 0.8036235221863964
7 distance 0.799324428066943
8 uniform 0.8031629049593122
8 distance 0.8017810532780593
9 uniform 0.8036235221863964
9 distance 0.7999385843697221
13 uniform 0.8016275142023646
13 distance 0.7985567326884692
15 uniform 0.7977890373099954
15 distance 0.7997850452940273
# Train one fresh classifier per (k, weighting) pair and keep the one with the
# highest held-out score in `model` / `max_score`.
# NOTE(review): the winner is chosen on X_test itself, so `max_score` is an
# optimistic estimate of how the chosen model generalises.
model, max_score = None, 0
candidates = [
    (k, weight)
    for k in [6, 7, 8, 9, 13, 15]
    for weight in ['uniform', 'distance']
]
for k, weight in candidates:
    knn = KNeighborsClassifier(n_neighbors=k, weights=weight)
    knn.fit(X_train, y_train)
    score = knn.score(X_test, y_test)
    print(k, weight, score)
    if score <= max_score:
        continue  # strict improvement only: on a tie the earlier candidate stays
    model, max_score = knn, score
6 uniform 0.8070013818516812
6 distance 0.7984031936127745
7 uniform 0.8036235221863964
7 distance 0.799324428066943
8 uniform 0.8031629049593122
8 distance 0.8017810532780593
9 uniform 0.8036235221863964
9 distance 0.7999385843697221
13 uniform 0.8016275142023646
13 distance 0.7985567326884692
15 uniform 0.7977890373099954
15 distance 0.7997850452940273
max_score
0.8070013818516812
model
KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
metric_params=None, n_jobs=None, n_neighbors=6, p=2,
weights='uniform')
查看 '>50K' 的平均年龄
# Mean age over the rows whose salary label is ">50K".
high_earners = adults.query('salary == ">50K"')
high_earners['age'].mean()
44.24984058155847
查看 '>50K' 的最小年龄
# Youngest age among the rows whose salary label is ">50K".
high_earner_ages = adults.query('salary == ">50K"')['age']
high_earner_ages.min()
19
查看 '>50K' 的最大年龄
# Oldest age among the rows whose salary label is ">50K".
high_earner_ages = adults.query('salary == ">50K"')['age']
high_earner_ages.max()
90
查看 '>50K' 的年龄标准差
# Standard deviation of age over the ">50K" rows (pandas' default estimator).
high_earner_ages = adults.query('salary == ">50K"')['age']
high_earner_ages.std()
10.51902771985177
# How many 19-year-olds carry the ">50K" label (row count is the first shape entry).
youngest_high_earners = adults.query('age ==19 & salary == ">50K"')
youngest_high_earners.shape
(2, 12)
# Inspect the 19-year-old ">50K" rows themselves (categorical fields are still coded).
youngest_high_earners = adults.query('age ==19 & salary == ">50K"')
youngest_high_earners
age workclass education marital_status occupation relationship race sex capital_loss hours_per_week native_country salary 7741 19 2 8 0 4 0 0 0 0 60 0 >50K 22910 19 5 15 1 11 5 0 1 0 40 0 >50K
all_column_map[ 'workclass' ]
{'State-gov': 0,
'Self-emp-not-inc': 1,
'Private': 2,
'Federal-gov': 3,
'Local-gov': 4,
'?': 5,
'Self-emp-inc': 6,
'Without-pay': 7,
'Never-worked': 8}
all_column_map[ 'education' ]
{'Bachelors': 0,
'HS-grad': 1,
'11th': 2,
'Masters': 3,
'9th': 4,
'Some-college': 5,
'Assoc-acdm': 6,
'Assoc-voc': 7,
'7th-8th': 8,
'Doctorate': 9,
'Prof-school': 10,
'5th-6th': 11,
'10th': 12,
'1st-4th': 13,
'Preschool': 14,
'12th': 15}
all_column_map[ 'occupation' ]
{'Adm-clerical': 0,
'Exec-managerial': 1,
'Handlers-cleaners': 2,
'Prof-specialty': 3,
'Other-service': 4,
'Sales': 5,
'Craft-repair': 6,
'Transport-moving': 7,
'Farming-fishing': 8,
'Machine-op-inspct': 9,
'Tech-support': 10,
'?': 11,
'Protective-serv': 12,
'Armed-Forces': 13,
'Priv-house-serv': 14}
all_column_map[ 'sex' ]
{'Male': 0, 'Female': 1}
all_column_map[ 'race' ]
{'White': 0,
'Black': 1,
'Asian-Pac-Islander': 2,
'Amer-Indian-Eskimo': 3,
'Other': 4}
查看’>50K’的工作时长(最大,最小,平均)
# Minimum, maximum and mean weekly working hours among the ">50K" rows.
# The filtered column is the same for every statistic, so select it once.
high_earner_hours = adults.query('salary == ">50K"')['hours_per_week']
for f in (np.min, np.max, np.mean):
    print(f.__name__, f(high_earner_hours))
amin 1
amax 99
mean 45.473026399693914
# Rows labelled ">50K" that report a single working hour per week.
one_hour_high_earners = adults.query('salary == ">50K" & hours_per_week == 1')
one_hour_high_earners
age workclass education marital_status occupation relationship race sex capital_loss hours_per_week native_country salary 189 58 0 9 1 3 1 0 0 0 1 0 >50K 20072 65 5 1 1 11 1 0 0 0 1 0 >50K
查看’>50K’的男女的比例
# Contingency table: salary label (rows) against coded sex (columns).
pd.crosstab(index=adults['salary'], columns=adults['sex'])
sex 0 1 salary <=50K 15128 9592 >50K 6662 1179
# Male-to-female ratio among the ">50K" rows, derived from the data instead of
# retyping the counts shown in the crosstab output.
sex_codes = all_column_map['sex']
high_earner_sex_counts = adults.loc[adults['salary'] == '>50K', 'sex'].value_counts()
# Convert to plain ints so the result is an ordinary Python float.
male_count = int(high_earner_sex_counts.loc[sex_codes['Male']])
female_count = int(high_earner_sex_counts.loc[sex_codes['Female']])
male_count / female_count
5.650551314673452
# Salary label against education level, with the integer education codes in
# the column header swapped back to their original names.
education_names = {code: name for name, code in all_column_map['education'].items()}
salary_by_education = pd.crosstab(adults.salary, adults.education)
salary_by_education.rename(columns=education_names)
education Bachelors HS-grad 11th Masters 9th Some-college Assoc-acdm Assoc-voc 7th-8th Doctorate Prof-school 5th-6th 10th 1st-4th Preschool 12th salary <=50K 3134 8826 1115 764 487 5904 802 1021 606 107 153 317 871 162 51 400 >50K 2221 1675 60 959 27 1387 265 361 40 306 423 16 62 6 0 33
生成预测数据
data. shape
(32561, 11)
# One hand-built sample, listed in the same column order as the feature table.
# Categorical entries use the integer codes stored in `all_column_map`.
sample_fields = [
    ('age', 33),
    ('workclass', 2),        # Private
    ('education', 1),        # HS-grad
    ('marital_status', 4),   # Separated
    ('occupation', 1),       # Exec-managerial
    ('relationship', 1),     # Husband
    ('race', 0),             # White
    ('sex', 0),              # Male
    ('capital_loss', 0),
    ('hours_per_week', 45),
    ('native_country', 0),   # United-States
]
test1 = np.array([value for _, value in sample_fields])
test1.shape
(11,)
# predict() expects a 2-D batch, so present the single sample as one row.
single_row = test1[np.newaxis, :]
model.predict(single_row)
array(['<=50K'], dtype=object)