机器学习基本套路——从数据分析处理到模型训练预测

import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import  pymysql
from sklearn.neighbors import KNeighborsClassifier

#依赖 pip install numpy pandas matplotlib scikit-learn  (注意pip包名是scikit-learn,不是sklearn)

#1.训练数据集与测试数据集
#https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.data
#https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.test


# 2. Load the data and print basic statistics to get a first feel for it.
# The training file has no header row.  The test file starts with a banner
# line (presumably "|1x3 Cross validator" — standard for UCI adult.test);
# it must be SKIPPED, not used as a header: the original `header=1` silently
# consumed the first real data record as column names, losing one row.
train_set = pd.read_csv("E:/soft/adult.data", header=None)
test_set = pd.read_csv("E:/soft/adult.test", header=None, skiprows=1)
col_labels = ['age', 'workclass', 'fnlwgt', 'education', 'education_num', 'marital_status',
              'occupation', 'relationship', 'race', 'sex', 'capital_gain', 'capital_loss',
              'hours_per_week', 'native_country', 'wage_class']
train_set.columns = col_labels
test_set.columns = col_labels
print(train_set.head(n=20))     # first 20 rows
print(train_set.describe())     # per-column summary statistics
print(train_set.std())          # standard deviations only

# 3. Visualise and inspect anomalous data (age == 0 rows are invalid).
plt.boxplot(test_set['age'])
plt.show()
plt.hist(test_set['age'])
plt.show()
# BUG FIX: the original line
#     err_data = train_set[train_set['age'] == 0] = np.nan
# was a chained assignment: it set the matching rows of train_set to NaN
# (mutating the frame during a read-only inspection step) and bound
# err_data to np.nan, which the next line immediately overwrote anyway.
# The cleanup itself is (re)done properly in step 4, so here we only look.
err_data = train_set.query('age==0')

# 4. Remove the anomalous data from the training set.
print(train_set.shape)
# Blank out entire rows whose age is 0 (in place — the replace() below
# operates on this mutated frame).
invalid_age = train_set['age'] == 0
train_set[invalid_age] = np.nan
temp_train_set = train_set.dropna()
print(temp_train_set.shape)
# ' ?' and ' ' are this dataset's missing-value markers; map them to NaN
# and drop those rows as well.
missing_markers = {' ?': np.nan, ' ': np.nan}
final_train_set = train_set.replace(missing_markers).dropna()
print(final_train_set.shape)

# 5. Normalise wage_class in the test set so its labels match the train set
# (the test file suffixes every label with a trailing '.').
test_set['wage_class'] = test_set.wage_class.replace({' <=50K.': ' <=50K', ' >50K.': ' >50K'})
print(test_set.wage_class.unique())
print(final_train_set.wage_class.unique())
# BUG FIX: the original cleaned `train_set` again here (copy/paste error),
# so `final_test_set` actually contained TRAINING data and the later
# "predictions" were evaluated on samples the models had seen.
# Clean the TEST set, mirroring the train-set cleanup from step 4.
temp_test_set = test_set.dropna()
final_test_set = temp_test_set.replace({' ?': np.nan, ' ': np.nan}).dropna()
print(final_train_set.wage_class.unique())
print(final_test_set.wage_class.unique())

# KNN training on three numeric features.
x_train = final_train_set[['age', 'education_num', 'hours_per_week']]
y_train = final_train_set['wage_class']
# Euclidean distance (p=2); each sample is classified by majority vote
# of its 5 nearest neighbours.
knn = KNeighborsClassifier(n_neighbors=5, p=2)
knn.fit(x_train, y_train)  # supervised learning: y_train supplies the labels
# BUG FIX: removed a stray lone `p` statement that raised NameError here.
print(knn.score(x_train, y_train))

# KNN prediction: compare predicted vs. actual labels on the first 10 test rows.
feature_cols = ['age', 'education_num', 'hours_per_week']
x_test = final_test_set[feature_cols].head(n=10)
print("预测值", list(knn.predict(x_test)))
print("实际值", list(final_test_set['wage_class'].head(n=10)))

# Random-forest training and prediction with the same three features.
from sklearn.ensemble import RandomForestClassifier

rf = RandomForestClassifier()
rf.fit(x_train, y_train)
print(rf.score(x_train, y_train))
rf_predictions = rf.predict(x_test)
print("预测值", list(rf_predictions))
print("实际值", list(final_test_set['wage_class'].head(n=10)))

# Random forest again, with two extra feature dimensions.
# BUG FIX: the original feature list contained 'capital_gain' TWICE; given
# the conclusion about "adding dimensions", the intent was clearly
# capital_gain + capital_loss.  Also fixed the `x_text2` variable typo.
rf2 = RandomForestClassifier()
feature_cols2 = ['age', 'education_num', 'hours_per_week', 'capital_gain', 'capital_loss']
x_train2 = final_train_set[feature_cols2]
x_test2 = final_test_set[feature_cols2].head(n=10)
rf2.fit(x_train2, y_train)
print(rf2.score(x_train2, y_train))
print("预测值", list(rf2.predict(x_test2)))
print("实际值", list(final_test_set['wage_class'].head(n=10)))


runfile(‘E:/test/t1/t6.py’, wdir=‘E:/test/t1’)
age workclass fnlwgt … hours_per_week native_country wage_class
0 0 State-gov 1231231 … 40 United-States <=50K
1 50 Self-emp-not-inc 83311 … 13 United-States <=50K
2 38 Private … 40 United-States <=50K
3 53 Private 234721 … 40 United-States <=50K
4 28 Private 338409 … 40 Cuba <=50K
5 37 Private 284582 … 40 United-States <=50K
6 49 Private 160187 … 16 Jamaica <=50K
7 52 Self-emp-not-inc 209642 … 45 United-States >50K
8 31 Private 45781 … 50 United-States >50K
9 42 Private 159449 … 40 United-States >50K
10 37 Private 280464 … 80 United-States >50K
11 30 State-gov 141297 … 40 India >50K
12 23 Private 122272 … 30 United-States <=50K
13 32 Private 205019 … 50 United-States <=50K
14 40 Private 121772 … 40 ? >50K
15 34 Private 245487 … 45 Mexico <=50K
16 25 Self-emp-not-inc 176756 … 35 United-States <=50K
17 32 Private 186824 … 40 United-States <=50K
18 38 Private 28887 … 50 United-States <=50K
19 43 Self-emp-not-inc 292175 … 45 United-States >50K
[20 rows x 15 columns]
age education_num capital_gain capital_loss hours_per_week
count 32561.000000 32561.000000 32561.000000 32561.000000 32561.000000
mean 38.580449 10.080679 1077.648844 87.303830 40.437456
std 13.642108 2.572720 7385.292085 402.960219 12.347429
min 0.000000 1.000000 0.000000 0.000000 1.000000
25% 28.000000 9.000000 0.000000 0.000000 40.000000
50% 37.000000 10.000000 0.000000 0.000000 40.000000
75% 48.000000 12.000000 0.000000 0.000000 45.000000
max 90.000000 16.000000 99999.000000 4356.000000 99.000000
age 13.642108
education_num 2.572720
capital_gain 7385.292085
capital_loss 402.960219
hours_per_week 12.347429
dtype: float64
(32561, 15)
(32560, 15)
(30160, 15)
[’ <=50K’ ’ >50K’]
[’ <=50K’ ’ >50K’]
[’ <=50K’ ’ >50K’]
[’ <=50K’ ’ >50K’]
0.7995026525198939
预测值 [’ <=50K’, ’ <=50K’, ’ <=50K’, ’ <=50K’, ’ <=50K’, ’ <=50K’, ’ <=50K’, ’ >50K’, ’ >50K’, ’ <=50K’]
实际值 [’ <=50K’, ’ <=50K’, ’ <=50K’, ’ <=50K’, ’ <=50K’, ’ >50K’, ’ >50K’, ’ >50K’, ’ >50K’, ’ >50K’]
0.8387599469496021
预测值 [’ <=50K’, ’ <=50K’, ’ <=50K’, ’ <=50K’, ’ <=50K’, ’ <=50K’, ’ <=50K’, ’ >50K’, ’ >50K’, ’ <=50K’]
实际值 [’ <=50K’, ’ <=50K’, ’ <=50K’, ’ <=50K’, ’ <=50K’, ’ >50K’, ’ >50K’, ’ >50K’, ’ >50K’, ’ >50K’]
0.8661472148541114
预测值 [’ <=50K’, ’ <=50K’, ’ <=50K’, ’ <=50K’, ’ <=50K’, ’ <=50K’, ’ >50K’, ’ >50K’, ’ >50K’, ’ <=50K’]
实际值 [’ <=50K’, ’ <=50K’, ’ <=50K’, ’ <=50K’, ’ <=50K’, ’ >50K’, ’ >50K’, ’ >50K’, ’ >50K’, ’ >50K’]

结论:
1.数据处理:需要根据需要去除或填充异常数据,这里训练数据比较多,其实可以通过sklearn的train_test_split按比例切割一部分数据作为测试数据集
2.相同条件,knn->随机森林score稍有提升,随机森林更适合
3.随机森林增加数据维度后,Score提升,预测更加准确
4.非整型数据需要转换为整型才能应用于模型训练
5.在sklearn中,除了knn,RandomForest,还有决策树等算法使用套路是一样的

  • 0
    点赞
  • 5
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值