机器学习之KNN算法(三)

import numpy as np
import matplotlib.pyplot as plt


from sklearn.neighbors import KNeighborsRegressor
# Sample 40 evenly spaced points on [0, 5] and evaluate the sine curve there.
x = np.linspace(0, 5, 40)
y = np.sin(x)

# Show the clean (noise-free) samples.
plt.scatter(x, y)
<matplotlib.collections.PathCollection at 0x5a83d30>

![png](output_1_1.png)

# Same sine samples, but every second point is pushed upward by noise
# drawn uniformly from [0, 1).
x = np.linspace(0, 5, 40)
y = np.sin(x)
upward_noise = np.random.random(size=20)
y[::2] = y[::2] + upward_noise

plt.scatter(x, y)
<matplotlib.collections.PathCollection at 0x6525350>

![png](output_2_1.png)

# Same sine samples; every second point now gets noise drawn uniformly from
# [-0.5, 0.5), so the perturbation is centred on the curve.
x = np.linspace(0, 5, 40)
y = np.sin(x)
centred_noise = np.random.uniform(-0.5, 0.5, size=20)
y[::2] = y[::2] + centred_noise

plt.scatter(x, y)
<matplotlib.collections.PathCollection at 0x695c9f0>

![png](output_3_1.png)

# Noisy sine samples used as the regression training set.
x = np.linspace(0, 5, num=40)
y = np.sin(x)
# Seed the global NumPy RNG so the noise (and therefore every later fit and
# plot) is the same on each run. The original comment promised a seed but
# never set one.
np.random.seed(0)
# Perturb every second sample with noise drawn uniformly from [-0.5, 0.5).
y[::2] += np.random.uniform(-0.5, 0.5, size=20)

plt.scatter(x,y)
<matplotlib.collections.PathCollection at 0x6a79cb0>

![png](output_4_1.png)

# 1. Classification models predict discrete values.
# 2. Regression models predict continuous values.
# The target here is continuous, so a KNN regressor is used.
knn_r = KNeighborsRegressor(n_neighbors=5)
# The estimator expects a 2-D feature array, so the 1-D x becomes one column.
knn_r.fit(x[:, np.newaxis], y)
KNeighborsRegressor(algorithm='auto', leaf_size=30, metric='minkowski',
                    metric_params=None, n_jobs=None, n_neighbors=5, p=2,
                    weights='uniform')
# Denser grid over the same interval, used to draw the fitted curve.
x2 = np.linspace(0, 5, 100)
# The features passed to predict() (everything except the target) must have
# the same number of columns as the training features.
y2 = knn_r.predict(x2[:, np.newaxis])
# Training samples as points, model prediction as a red line.
plt.scatter(x, y)
plt.plot(x2, y2, c='red')
plt.show()

![png](output_7_0.png)

ks = [3, 4, 7, 9, 11, 13]    # six neighbour counts to compare

plt.figure(figsize=(20, 12))
# Both feature arrays are loop-invariant, so reshape them once.
train_features = x.reshape(-1, 1)
grid_features = x2.reshape(-1, 1)
for position, k in enumerate(ks, start=1):
    # One panel per neighbour count in a 2 x 3 grid.
    plt.subplot(2, 3, position)
    plt.title(f'Neighbors:{k}', size=20)

    # Reuse the same estimator, refitted with the new neighbour count.
    knn_r.set_params(n_neighbors=k)
    knn_r.fit(train_features, y)
    y2 = knn_r.predict(grid_features)

    plt.scatter(x, y)
    plt.plot(x2, y2, c='red')
plt.show()

![png](output_8_0.png)

KNN之年收入预测

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
加载数据
  • data/adults.txt
# Read the data set at data/adults.txt into a DataFrame (parsed as CSV).
adults = pd.read_csv('data/adults.txt')
# Preview the first rows to check the columns and value formats.
adults.head()
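|   | age | workclass | final_weight | education | education_num | marital_status | occupation | relationship | race | sex | capital_gain | capital_loss | hours_per_week | native_country | salary |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 39 | State-gov | 77516 | Bachelors | 13 | Never-married | Adm-clerical | Not-in-family | White | Male | 2174 | 0 | 40 | United-States | <=50K |
| 1 | 50 | Self-emp-not-inc | 83311 | Bachelors | 13 | Married-civ-spouse | Exec-managerial | Husband | White | Male | 0 | 0 | 13 | United-States | <=50K |
| 2 | 38 | Private | 215646 | HS-grad | 9 | Divorced | Handlers-cleaners | Not-in-family | White | Male | 0 | 0 | 40 | United-States | <=50K |
| 3 | 53 | Private | 234721 | 11th | 7 | Married-civ-spouse | Handlers-cleaners | Husband | Black | Male | 0 | 0 | 40 | United-States | <=50K |
| 4 | 28 | Private | 338409 | Bachelors | 13 | Married-civ-spouse | Prof-specialty | Wife | Black | Female | 0 | 0 | 40 | Cuba | <=50K |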

一次性将非数值的列转化为数值列

columns = adults.dtypes
columns
age                int64
workclass         object
final_weight       int64
education         object
education_num      int64
marital_status    object
occupation        object
relationship      object
race              object
sex               object
capital_gain       int64
capital_loss       int64
hours_per_week     int64
native_country    object
salary            object
dtype: object
# Show the distinct values of every text (object-dtype) column.
# The builtin `object` is used because the `np.object` alias was removed in
# NumPy 1.24 and now raises AttributeError.
for column in adults.columns[adults.dtypes == object]:
    display(adults[column].unique())
array(['State-gov', 'Self-emp-not-inc', 'Private', 'Federal-gov',
       'Local-gov', '?', 'Self-emp-inc', 'Without-pay', 'Never-worked'],
      dtype=object)



array(['Bachelors', 'HS-grad', '11th', 'Masters', '9th', 'Some-college',
       'Assoc-acdm', 'Assoc-voc', '7th-8th', 'Doctorate', 'Prof-school',
       '5th-6th', '10th', '1st-4th', 'Preschool', '12th'], dtype=object)



array(['Never-married', 'Married-civ-spouse', 'Divorced',
       'Married-spouse-absent', 'Separated', 'Married-AF-spouse',
       'Widowed'], dtype=object)



array(['Adm-clerical', 'Exec-managerial', 'Handlers-cleaners',
       'Prof-specialty', 'Other-service', 'Sales', 'Craft-repair',
       'Transport-moving', 'Farming-fishing', 'Machine-op-inspct',
       'Tech-support', '?', 'Protective-serv', 'Armed-Forces',
       'Priv-house-serv'], dtype=object)



array(['Not-in-family', 'Husband', 'Wife', 'Own-child', 'Unmarried',
       'Other-relative'], dtype=object)



array(['White', 'Black', 'Asian-Pac-Islander', 'Amer-Indian-Eskimo',
       'Other'], dtype=object)



array(['Male', 'Female'], dtype=object)



array(['United-States', 'Cuba', 'Jamaica', 'India', '?', 'Mexico',
       'South', 'Puerto-Rico', 'Honduras', 'England', 'Canada', 'Germany',
       'Iran', 'Philippines', 'Italy', 'Poland', 'Columbia', 'Cambodia',
       'Thailand', 'Ecuador', 'Laos', 'Taiwan', 'Haiti', 'Portugal',
       'Dominican-Republic', 'El-Salvador', 'France', 'Guatemala',
       'China', 'Japan', 'Yugoslavia', 'Peru',
       'Outlying-US(Guam-USVI-etc)', 'Scotland', 'Trinadad&Tobago',
       'Greece', 'Nicaragua', 'Vietnam', 'Hong', 'Ireland', 'Hungary',
       'Holand-Netherlands'], dtype=object)



array(['<=50K', '>50K'], dtype=object)
# For every text (object-dtype) column, show the label -> integer code mapping
# obtained by numbering the distinct values in order of first appearance.
# The builtin `object` replaces the `np.object` alias removed in NumPy 1.24.
for column in adults.columns[adults.dtypes == object]:
    display({v: i for i, v in enumerate(adults[column].unique())})
{'State-gov': 0,
 'Self-emp-not-inc': 1,
 'Private': 2,
 'Federal-gov': 3,
 'Local-gov': 4,
 '?': 5,
 'Self-emp-inc': 6,
 'Without-pay': 7,
 'Never-worked': 8}



{'Bachelors': 0,
 'HS-grad': 1,
 '11th': 2,
 'Masters': 3,
 '9th': 4,
 'Some-college': 5,
 'Assoc-acdm': 6,
 'Assoc-voc': 7,
 '7th-8th': 8,
 'Doctorate': 9,
 'Prof-school': 10,
 '5th-6th': 11,
 '10th': 12,
 '1st-4th': 13,
 'Preschool': 14,
 '12th': 15}



{'Never-married': 0,
 'Married-civ-spouse': 1,
 'Divorced': 2,
 'Married-spouse-absent': 3,
 'Separated': 4,
 'Married-AF-spouse': 5,
 'Widowed': 6}



{'Adm-clerical': 0,
 'Exec-managerial': 1,
 'Handlers-cleaners': 2,
 'Prof-specialty': 3,
 'Other-service': 4,
 'Sales': 5,
 'Craft-repair': 6,
 'Transport-moving': 7,
 'Farming-fishing': 8,
 'Machine-op-inspct': 9,
 'Tech-support': 10,
 '?': 11,
 'Protective-serv': 12,
 'Armed-Forces': 13,
 'Priv-house-serv': 14}



{'Not-in-family': 0,
 'Husband': 1,
 'Wife': 2,
 'Own-child': 3,
 'Unmarried': 4,
 'Other-relative': 5}



{'White': 0,
 'Black': 1,
 'Asian-Pac-Islander': 2,
 'Amer-Indian-Eskimo': 3,
 'Other': 4}



{'Male': 0, 'Female': 1}



{'United-States': 0,
 'Cuba': 1,
 'Jamaica': 2,
 'India': 3,
 '?': 4,
 'Mexico': 5,
 'South': 6,
 'Puerto-Rico': 7,
 'Honduras': 8,
 'England': 9,
 'Canada': 10,
 'Germany': 11,
 'Iran': 12,
 'Philippines': 13,
 'Italy': 14,
 'Poland': 15,
 'Columbia': 16,
 'Cambodia': 17,
 'Thailand': 18,
 'Ecuador': 19,
 'Laos': 20,
 'Taiwan': 21,
 'Haiti': 22,
 'Portugal': 23,
 'Dominican-Republic': 24,
 'El-Salvador': 25,
 'France': 26,
 'Guatemala': 27,
 'China': 28,
 'Japan': 29,
 'Yugoslavia': 30,
 'Peru': 31,
 'Outlying-US(Guam-USVI-etc)': 32,
 'Scotland': 33,
 'Trinadad&Tobago': 34,
 'Greece': 35,
 'Nicaragua': 36,
 'Vietnam': 37,
 'Hong': 38,
 'Ireland': 39,
 'Hungary': 40,
 'Holand-Netherlands': 41}



{'<=50K': 0, '>50K': 1}
# column name -> {label: integer code}; kept so codes can be decoded later.
all_column_map = {}
# The builtin `object` replaces the `np.object` alias removed in NumPy 1.24.
for column in adults.columns[adults.dtypes == object]:
    # Codes are assigned in order of first appearance in the column.
    column_map = {v: i for i, v in enumerate(adults[column].unique())}
    all_column_map[column] = column_map
    # Replace the whole column so its dtype becomes integer. Writing through
    # `.loc[:, column]` sets values in place on pandas >= 2.0, which would
    # leave the column as object dtype.
    adults[column] = adults[column].map(column_map)

adults.head()
ageworkclassfinal_weighteducationeducation_nummarital_statusoccupationrelationshipracesexcapital_gaincapital_losshours_per_weeknative_countrysalary
03907751601300000217404000
15018331101311100001300
23822156461922000004000
35322347212712110004000
428233840901313211004010
adults.dtypes
age               int64
workclass         int64
final_weight      int64
education         int64
education_num     int64
marital_status    int64
occupation        int64
relationship      int64
race              int64
sex               int64
capital_gain      int64
capital_loss      int64
hours_per_week    int64
native_country    int64
salary            int64
dtype: object

删除相关的列

  • final_weight
  • capital_gain
  • education_num
# Remove the three columns that are not used as features.
unused_columns = ['final_weight', 'capital_gain', 'education_num']
adults.drop(columns=unused_columns, inplace=True)
adults.head()
ageworkclasseducationmarital_statusoccupationrelationshipracesexcapital_losshours_per_weeknative_countrysalary
039000000004000
150101110001300
238212200004000
353221211004000
428201321104010

将 salary 作为目标

all_column_map['salary']
{'<=50K': 0, '>50K': 1}
# Turn the target back into its original labels so predictions are readable.
# Invert the stored label -> code mapping into code -> label.
salary_labels = {code: label for label, code in all_column_map['salary'].items()}
# Assign the column directly: writing strings into the integer-coded column
# through `.loc[:, 'salary']` is an in-place set with an incompatible dtype
# on recent pandas versions.
adults['salary'] = adults['salary'].map(salary_labels)
adults.head()
ageworkclasseducationmarital_statusoccupationrelationshipracesexcapital_losshours_per_weeknative_countrysalary
03900000000400<=50K
15010111000130<=50K
23821220000400<=50K
35322121100400<=50K
42820132110401<=50K

提取样本集

  • 特征集数据 data
  • 目标集数据 target
# Features: every column except the last. Target: the last column (salary).
feature_frame = adults.iloc[:, :-1]
target_column = adults.iloc[:, -1]
data, target = feature_frame.values, target_column.values
display(data.shape, target.shape)
(32561, 11)



(32561,)
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split as split
# Hold out 20% of the samples for evaluation; the fixed random_state keeps
# the split identical between runs.
X_train, X_test, y_train, y_test = split(data, target, test_size=0.2, random_state=100)
# Create a classifier with default settings and train it (fit returns the
# estimator itself).
knn = KNeighborsClassifier().fit(X_train, y_train)
# Accuracy on the held-out samples.
knn.score(X_test, y_test)
0.8002456625211116
# Retrain with several neighbour counts and print held-out accuracy for each.
for k in (3, 5, 7, 9, 11):
    knn.set_params(n_neighbors=k)
    score = knn.fit(X_train, y_train).score(X_test, y_test)
    print(k, score)
3 0.7925687087363734
5 0.8002456625211116
7 0.8036235221863964
9 0.8036235221863964
11 0.7984031936127745
# Also vary the weighting scheme: 'uniform' or 'distance' neighbour weights.
settings = [(k, weight) for k in (3, 5, 7, 9, 11) for weight in ('uniform', 'distance')]
for k, weight in settings:
    knn.set_params(n_neighbors=k, weights=weight)
    score = knn.fit(X_train, y_train).score(X_test, y_test)
    print(k, weight, score)
3 uniform 0.7925687087363734
3 distance 0.7893443881467833
5 uniform 0.8002456625211116
5 distance 0.796714263780132
7 uniform 0.8036235221863964
7 distance 0.799324428066943
9 uniform 0.8036235221863964
9 distance 0.7999385843697221
11 uniform 0.7984031936127745
11 distance 0.7974819591586059
# Second sweep over a different set of neighbour counts, again with both
# weighting schemes.
settings = [(k, weight) for k in (6, 7, 8, 9, 13, 15) for weight in ('uniform', 'distance')]
for k, weight in settings:
    knn.set_params(n_neighbors=k, weights=weight)
    score = knn.fit(X_train, y_train).score(X_test, y_test)
    print(k, weight, score)
6 uniform 0.8070013818516812
6 distance 0.7984031936127745
7 uniform 0.8036235221863964
7 distance 0.799324428066943
8 uniform 0.8031629049593122
8 distance 0.8017810532780593
9 uniform 0.8036235221863964
9 distance 0.7999385843697221
13 uniform 0.8016275142023646
13 distance 0.7985567326884692
15 uniform 0.7977890373099954
15 distance 0.7997850452940273
# Keep the fitted classifier with the highest held-out accuracy.
# NOTE(review): the winner is chosen on the test split itself, so max_score
# is an optimistic estimate; a separate validation split would avoid that.
model = None
max_score = 0
candidates = [(k, weight) for k in [6, 7, 8, 9, 13, 15] for weight in ['uniform', 'distance']]
for k, weight in candidates:
    # A fresh estimator per candidate, so the stored winner is not refitted later.
    knn = KNeighborsClassifier(n_neighbors=k, weights=weight)
    knn.fit(X_train, y_train)
    score = knn.score(X_test, y_test)
    print(k, weight, score)

    # Strict '>' means the earliest candidate wins a tie.
    if score > max_score:
        model, max_score = knn, score
6 uniform 0.8070013818516812
6 distance 0.7984031936127745
7 uniform 0.8036235221863964
7 distance 0.799324428066943
8 uniform 0.8031629049593122
8 distance 0.8017810532780593
9 uniform 0.8036235221863964
9 distance 0.7999385843697221
13 uniform 0.8016275142023646
13 distance 0.7985567326884692
15 uniform 0.7977890373099954
15 distance 0.7997850452940273
max_score
0.8070013818516812
model
KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
                     metric_params=None, n_jobs=None, n_neighbors=6, p=2,
                     weights='uniform')

查看 '>50K' 的平均年龄

# Mean age of the rows labelled '>50K'.
adults.loc[adults['salary'] == '>50K', 'age'].mean()
44.24984058155847

查看 '>50K' 的最小年龄

# Youngest age among the rows labelled '>50K'.
adults.loc[adults['salary'] == '>50K', 'age'].min()
19

查看 '>50K' 的最大年龄

# Oldest age among the rows labelled '>50K'.
adults.loc[adults['salary'] == '>50K', 'age'].max()
90

查看 '>50K' 的年龄标准差

# Standard deviation of age among the rows labelled '>50K'.
adults.loc[adults['salary'] == '>50K', 'age'].std()
10.51902771985177
# Look at the 19-year-olds labelled '>50K': how many rows match?
is_age_19 = adults['age'] == 19
is_high_earner = adults['salary'] == '>50K'
adults[is_age_19 & is_high_earner].shape
(2, 12)
adults.query('age ==19 & salary == ">50K"')
ageworkclasseducationmarital_statusoccupationrelationshipracesexcapital_losshours_per_weeknative_countrysalary
77411928040000600>50K
22910195151115010400>50K
# 查看工作类型
all_column_map['workclass']
{'State-gov': 0,
 'Self-emp-not-inc': 1,
 'Private': 2,
 'Federal-gov': 3,
 'Local-gov': 4,
 '?': 5,
 'Self-emp-inc': 6,
 'Without-pay': 7,
 'Never-worked': 8}
# 查看学历
all_column_map['education']
{'Bachelors': 0,
 'HS-grad': 1,
 '11th': 2,
 'Masters': 3,
 '9th': 4,
 'Some-college': 5,
 'Assoc-acdm': 6,
 'Assoc-voc': 7,
 '7th-8th': 8,
 'Doctorate': 9,
 'Prof-school': 10,
 '5th-6th': 11,
 '10th': 12,
 '1st-4th': 13,
 'Preschool': 14,
 '12th': 15}
# 查看岗位
all_column_map['occupation']
{'Adm-clerical': 0,
 'Exec-managerial': 1,
 'Handlers-cleaners': 2,
 'Prof-specialty': 3,
 'Other-service': 4,
 'Sales': 5,
 'Craft-repair': 6,
 'Transport-moving': 7,
 'Farming-fishing': 8,
 'Machine-op-inspct': 9,
 'Tech-support': 10,
 '?': 11,
 'Protective-serv': 12,
 'Armed-Forces': 13,
 'Priv-house-serv': 14}
# 查看性别
all_column_map['sex']
{'Male': 0, 'Female': 1}
# 查看种族
all_column_map['race']
{'White': 0,
 'Black': 1,
 'Asian-Pac-Islander': 2,
 'Amer-Indian-Eskimo': 3,
 'Other': 4}

查看 '>50K' 的工作时长(最大、最小、平均)

# Weekly working hours of the rows labelled '>50K'; selected once because the
# filter does not change between iterations.
high_earner_hours = adults.query('salary == ">50K"')['hours_per_week']
# Print each statistic next to the name of the NumPy function that produced it.
for f in (np.min, np.max, np.mean):
    print(f.__name__, f(high_earner_hours))
amin 1
amax 99
mean 45.473026399693914
adults.query('salary == ">50K" & hours_per_week == 1')
ageworkclasseducationmarital_statusoccupationrelationshipracesexcapital_losshours_per_weeknative_countrysalary
189580913100010>50K
200726551111100010>50K

查看 '>50K' 的男女比例

pd.crosstab(adults.salary,adults.sex)
| salary \ sex | 0 | 1 |
|---|---|---|
| <=50K | 15128 | 9592 |
| >50K | 6662 | 1179 |
# Male-to-female ratio among '>50K' rows, computed from the data instead of
# retyping the counts from the crosstab output by hand.
# Sex codes come from all_column_map['sex']: 0 = Male, 1 = Female.
high_earner_sex_counts = pd.crosstab(adults.salary, adults.sex).loc['>50K']
high_earner_sex_counts[0] / high_earner_sex_counts[1]
5.650551314673452
# Salary-by-education counts, with the integer education codes in the column
# header translated back to their original labels.
education_labels = {code: label for label, code in all_column_map['education'].items()}
pd.crosstab(adults.salary, adults.education).rename(columns=education_labels)
educationBachelorsHS-grad11thMasters9thSome-collegeAssoc-acdmAssoc-voc7th-8thDoctorateProf-school5th-6th10th1st-4thPreschool12th
salary
<=50K3134882611157644875904802102160610715331787116251400
>50K22211675609592713872653614030642316626033

生成预测数据

data.shape
(32561, 11)
# One hand-built sample, listed in the same column order as the training
# features. Values are the integer codes produced by the label encoding.
sample = {
    'age': 33,
    'workclass': 2,
    'education': 1,
    'marital_status': 4,
    'occupation': 1,
    'relationship': 1,
    'race': 0,
    'sex': 0,
    'capital_loss': 0,
    'hours_per_week': 45,
    'native_country': 0,
}
# Dicts keep insertion order, so the values come out in feature order.
test1 = np.array(list(sample.values()))
test1.shape
(11,)
# Predict the salary class of the sample. predict() expects a 2-D array, so
# the 1-D sample is turned into a single row.
model.predict(test1[np.newaxis, :])
array(['<=50K'], dtype=object)
  • 0
    点赞
  • 1
    收藏
    觉得还不错? 一键收藏
  • 打赏
    打赏
  • 0
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包

打赏作者

今晚务必早点睡

你的鼓励将是我创作的最大动力

¥1 ¥2 ¥4 ¥6 ¥10 ¥20
扫码支付:¥1
获取中
扫码支付

您的余额不足,请更换扫码支付或充值

打赏作者

实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值