上课笔记-机器学习(5)-美国人口普查数据进行收入预测分类

实训六 回顾与学习

使用美国人口普查数据进行收入预测分类(可参考:https://www.jianshu.com/p/a6d615f272f6)

# Load the data. header=None means no row is used as a header, so the columns
# get integer labels (0-14 in the output below).
import pandas as pd
df = pd.read_csv("data/adult.data", header=None)
df.head()
    0                 1       2          3   4                   5                  6              7      8       9    10  11  12             13     14
0  39         State-gov   77516  Bachelors  13       Never-married       Adm-clerical  Not-in-family  White    Male  2174   0  40  United-States  <=50K
1  50  Self-emp-not-inc   83311  Bachelors  13  Married-civ-spouse    Exec-managerial        Husband  White    Male     0   0  13  United-States  <=50K
2  38           Private  215646    HS-grad   9            Divorced  Handlers-cleaners  Not-in-family  White    Male     0   0  40  United-States  <=50K
3  53           Private  234721       11th   7  Married-civ-spouse  Handlers-cleaners        Husband  Black    Male     0   0  40  United-States  <=50K
4  28           Private  338409  Bachelors  13  Married-civ-spouse     Prof-specialty           Wife  Black  Female     0   0  40           Cuba  <=50K
# Summary of the frame: row count, per-column non-null counts and dtypes.
df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32561 entries, 0 to 32560
Data columns (total 15 columns):
0     32561 non-null int64
1     32561 non-null object
2     32561 non-null int64
3     32561 non-null object
4     32561 non-null int64
5     32561 non-null object
6     32561 non-null object
7     32561 non-null object
8     32561 non-null object
9     32561 non-null object
10    32561 non-null int64
11    32561 non-null int64
12    32561 non-null int64
13    32561 non-null object
14    32561 non-null object
dtypes: int64(6), object(9)
memory usage: 3.7+ MB
# Data processing step 1: strip the surrounding whitespace from string values.
# str_cols lists the positions of the text columns to clean.
str_cols = [1, 3, 5, 6, 7, 8, 9, 13, 14]
for col in str_cols:
    raw_text = df.iloc[:, col]
    df.iloc[:, col] = raw_text.map(str.strip)
# Data processing step 2: remove samples that contain missing values.
# First turn every "?" string into the NaN missing-value marker.
import numpy as np
df.replace(to_replace="?", value=np.nan, inplace=True)
# Then drop, in place, every row that holds at least one NaN.
df.dropna(axis=0, how="any", inplace=True)
# Data processing step 3: encode the text columns as numbers.
from sklearn.preprocessing import LabelEncoder
n_rows, n_cols = df.shape
encoded_set = np.empty((n_rows, n_cols))
# One entry per column: the fitted LabelEncoder for text columns, None for
# numeric columns.
label_encoder = []
for col in range(n_cols):
    column = df.iloc[:, col]
    is_text = column.dtype == object
    encoder = LabelEncoder() if is_text else None
    # Text columns are replaced by their label codes; numeric columns are
    # copied through unchanged.
    encoded_set[:, col] = encoder.fit_transform(column) if is_text else column
    label_encoder.append(encoder)
# Remove the columns at positions 2, 10 and 11 (the instructor's approach).
dropped_cols = [2, 10, 11]
data = np.delete(encoded_set, dropped_cols, axis=1)
# Split into training and test sets: the last column is the target, all the
# columns before it are the features; 30% of the samples are held out.
from sklearn.model_selection import train_test_split
X = data[:, :-1]
y = data[:, -1]
train_X, test_X, train_y, test_y = train_test_split(
    X, y, test_size=0.3, random_state=42)
# Build the Gaussian naive Bayes classifier and fit it on the training set.
from sklearn.naive_bayes import GaussianNB
gaussianNB = GaussianNB()
gaussianNB.fit(train_X, train_y)

# 2. Check the model with cross-validation. Note that only the test split
# (test_X / test_y) is passed to cross_val_score, so the folds are drawn from
# the test set alone.
#
# cross_val_score is imported from sklearn.model_selection, the same module
# that provides train_test_split; the legacy sklearn.cross_validation module
# was removed from scikit-learn and importing it fails on current releases.
from sklearn.model_selection import cross_val_score
num_validations = 5


def _cross_val_on_test(scoring):
    """Return the per-fold scores of gaussianNB on the test split.

    scoring: name of the sklearn scorer to evaluate (e.g. 'accuracy').
    """
    return cross_val_score(gaussianNB, test_X, test_y,
                           scoring=scoring, cv=num_validations)


accuracy = _cross_val_on_test('accuracy')
print('准确率:{:.2f}%'.format(accuracy.mean()*100))
precision = _cross_val_on_test('precision_weighted')
print('精确度:{:.2f}%'.format(precision.mean()*100))
recall = _cross_val_on_test('recall_weighted')
print('召回率:{:.2f}%'.format(recall.mean()*100))
f1 = _cross_val_on_test('f1_weighted')
print('F1  值:{:.2f}%'.format(f1.mean()*100))
                   
# 3. Print the performance report.
from sklearn.metrics import classification_report, confusion_matrix
y_pred = gaussianNB.predict(test_X)
confusion_mat = confusion_matrix(test_y, y_pred)
print(confusion_mat)  # take a look at the confusion matrix

# Let sklearn print precision, recall and F1 directly.
target_names = ['<=50K', '>50K']
report = classification_report(test_y, y_pred, target_names=target_names)
print(report)

准确率:76.41%
精确度:79.53%
召回率:76.41%
F1  值:77.40%
[[5359 1408]
 [ 672 1610]]
             precision    recall  f1-score   support

      <=50K       0.89      0.79      0.84      6767
       >50K       0.53      0.71      0.61      2282

avg / total       0.80      0.77      0.78      9049
# Show the first 100 values of column 1 of the DataFrame.
df[1].values[:100]
array(['State-gov', 'Self-emp-not-inc', 'Private', 'Private', 'Private',
       'Private', 'Private', 'Self-emp-not-inc', 'Private', 'Private',
       'Private', 'State-gov', 'Private', 'Private', 'Private',
       'Self-emp-not-inc', 'Private', 'Private', 'Self-emp-not-inc',
       'Private', 'Private', 'Federal-gov', 'Private', 'Private',
       'Local-gov', 'Private', 'Private', 'Private', 'Local-gov',
       'Private', 'Private', 'Federal-gov', 'State-gov', 'Private',
       'Private', 'Private', 'Self-emp-not-inc', 'Private',
       'Self-emp-not-inc', 'Private', 'Private', 'Private', 'Federal-gov',
       'Private', 'Private', 'State-gov', 'Private', 'Private', 'Private',
       'Federal-gov', 'Self-emp-inc', 'Private', 'Private', 'Private',
       'Private', 'Private', 'Private', 'Private', 'Private', 'Private',
       'Private', 'Private', 'Private', 'Self-emp-inc', 'Private',
       'Private', 'Self-emp-not-inc', 'Private', 'Private', 'Private',
       'Private', 'Private', 'Local-gov', 'Private', 'Private', 'Private',
       'Private', 'Private', 'Private', 'Local-gov', 'Private', 'Private',
       'Federal-gov', 'Private', 'Private', 'Private', 'Local-gov',
       'Local-gov', 'Self-emp-not-inc', 'Private', 'Private',
       'Federal-gov', 'Private', 'Private', 'Self-emp-not-inc', 'Private',
       'Private', 'Self-emp-inc', 'Private', 'Local-gov'], dtype=object)
# Show the first 100 values of column 10. This must read from the DataFrame:
# `data` is a NumPy array (it has no `.values` attribute) and column 10 was
# deleted from it by np.delete.
df[10].values[:100]
array([ 2174,     0,     0,     0,     0,     0,     0,     0, 14084,
        5178,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,  5013,  2407,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0, 14344,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0], dtype=int64)
  • 3
    点赞
  • 38
    收藏
    觉得还不错? 一键收藏
  • 4
    评论
评论 4
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值