上课笔记-机器学习（5）-美国人口普查数据进行收入预测分类_来源于人口普查的收入预测机器学习-CSDN博客

实训六回顾与学习

使用美国人口普查数据进行收入预测分类（可以参考：https://www.jianshu.com/p/a6d615f272f6）

# Load the data
import pandas as pd
# header=None: the file has no header row, so the columns get integer labels
df = pd.read_csv("data/adult.data", header=None)
# Preview the first rows of the DataFrame
df.head()

	0	1	2	3	4	5	6	7	8	9	10	12	13	14
0	39	State-gov	77516	Bachelors	13	Never-married	Adm-clerical	Not-in-family	White	Male	2174	40	United-States	<=50K
1	50	Self-emp-not-inc	83311	Bachelors	13	Married-civ-spouse	Exec-managerial	Husband	White	Male	0	13	United-States	<=50K
2	38	Private	215646	HS-grad	9	Divorced	Handlers-cleaners	Not-in-family	White	Male	0	40	United-States	<=50K
3	53	Private	234721	11th	7	Married-civ-spouse	Handlers-cleaners	Husband	Black	Male	0	40	United-States	<=50K
4	28	Private	338409	Bachelors	13	Married-civ-spouse	Prof-specialty	Wife	Black	Female	0	40	Cuba	<=50K

# Summarize the DataFrame: per-column non-null counts, dtypes and memory usage
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32561 entries, 0 to 32560
Data columns (total 15 columns):
0     32561 non-null int64
1     32561 non-null object
2     32561 non-null int64
3     32561 non-null object
4     32561 non-null int64
5     32561 non-null object
6     32561 non-null object
7     32561 non-null object
8     32561 non-null object
9     32561 non-null object
10    32561 non-null int64
11    32561 non-null int64
12    32561 non-null int64
13    32561 non-null object
14    32561 non-null object
dtypes: int64(6), object(9)
memory usage: 3.7+ MB

# Preprocessing step 1: strip the leading whitespace from the string values
# Positions of the string-valued columns
str_cols = [1, 3, 5, 6, 7, 8, 9, 13, 14]
for col in str_cols:
    # Apply str.strip to every value of the column and write the result back
    df.iloc[:, col] = df.iloc[:, col].map(str.strip)

# Preprocessing step 2: drop the samples that have missing values
import numpy as np

# Turn the "?" placeholder strings into NaN so pandas treats them as missing
df.replace(to_replace="?", value=np.nan, inplace=True)
# Drop missing data outright: any row containing at least one NaN is removed
df.dropna(axis=0, how="any", inplace=True)

# Preprocessing step 3: encode the string columns as integers
from sklearn.preprocessing import LabelEncoder

label_encoder = []  # one entry per column: its LabelEncoder, or None if numeric
encoded_set = np.empty(df.shape)  # numeric array with the same shape as df
for col in range(df.shape[1]):
    column = df.iloc[:, col]
    if column.dtype != object:
        # Numeric column: copy the values unchanged, no encoder needed
        encoder = None
        encoded_set[:, col] = column
    else:
        # String column: fit a dedicated encoder and store the integer codes
        encoder = LabelEncoder()
        encoded_set[:, col] = encoder.fit_transform(column)
    label_encoder.append(encoder)

# Drop the columns at positions 2, 10 and 11 (the instructor's approach);
# axis=1 makes np.delete remove columns rather than rows
data = np.delete(encoded_set, [2,10,11], axis=1)

# Split the data into a training set and a test set
from sklearn.model_selection import train_test_split

# Every column except the last is a feature; the last column is the label
X = data[:, :-1]
y = data[:, -1]
# Hold out 30% of the samples for testing; the fixed seed keeps the split stable
train_X, test_X, train_y, test_y = train_test_split(
    X, y, test_size=0.3, random_state=42
)

# Build the naive Bayes classifier model
from sklearn.naive_bayes import GaussianNB
gaussianNB=GaussianNB()
# Fit the classifier on the training split
gaussianNB.fit(train_X,train_y)

# 2. Check the model with cross-validation, run on the test set only.
# NOTE: cross_val_score clones the estimator and refits it on every fold, so
# these scores come from models trained on folds of the test set, not from the
# model fitted on the training set above.
# `sklearn.cross_validation` was removed in scikit-learn 0.20; the function now
# lives in `sklearn.model_selection`.
from sklearn.model_selection import cross_val_score
num_validations = 5  # number of cross-validation folds
accuracy = cross_val_score(gaussianNB, test_X, test_y,
                           scoring='accuracy', cv=num_validations)
print('准确率：{:.2f}%'.format(accuracy.mean()*100))
# The *_weighted scorers average the per-class metric weighted by class support
precision = cross_val_score(gaussianNB, test_X, test_y,
                            scoring='precision_weighted', cv=num_validations)
print('精确度：{:.2f}%'.format(precision.mean()*100))
recall = cross_val_score(gaussianNB, test_X, test_y,
                         scoring='recall_weighted', cv=num_validations)
print('召回率：{:.2f}%'.format(recall.mean()*100))
f1 = cross_val_score(gaussianNB, test_X, test_y,
                     scoring='f1_weighted', cv=num_validations)
print('F1  值：{:.2f}%'.format(f1.mean()*100))
                   
# 3. Print the performance report
from sklearn.metrics import classification_report, confusion_matrix

# Predict the labels of the test set with the fitted classifier
y_pred = gaussianNB.predict(test_X)

# Take a look at what the confusion matrix looks like
confusion_mat = confusion_matrix(test_y, y_pred)
print(confusion_mat)

# Let scikit-learn print precision, recall and F1 for each class directly
target_names = ['<=50K', '>50K']
report = classification_report(test_y, y_pred, target_names=target_names)
print(report)

准确率：76.41%
精确度：79.53%
召回率：76.41%
F1  值：77.40%
[[5359 1408]
 [ 672 1610]]
             precision    recall  f1-score   support

      <=50K       0.89      0.79      0.84      6767
       >50K       0.53      0.71      0.61      2282

avg / total       0.80      0.77      0.78      9049

# Inspect the first 100 values of column 1 of df
df[1].values[:100]

array(['State-gov', 'Self-emp-not-inc', 'Private', 'Private', 'Private',
       'Private', 'Private', 'Self-emp-not-inc', 'Private', 'Private',
       'Private', 'State-gov', 'Private', 'Private', 'Private',
       'Self-emp-not-inc', 'Private', 'Private', 'Self-emp-not-inc',
       'Private', 'Private', 'Federal-gov', 'Private', 'Private',
       'Local-gov', 'Private', 'Private', 'Private', 'Local-gov',
       'Private', 'Private', 'Federal-gov', 'State-gov', 'Private',
       'Private', 'Private', 'Self-emp-not-inc', 'Private',
       'Self-emp-not-inc', 'Private', 'Private', 'Private', 'Federal-gov',
       'Private', 'Private', 'State-gov', 'Private', 'Private', 'Private',
       'Federal-gov', 'Self-emp-inc', 'Private', 'Private', 'Private',
       'Private', 'Private', 'Private', 'Private', 'Private', 'Private',
       'Private', 'Private', 'Private', 'Self-emp-inc', 'Private',
       'Private', 'Self-emp-not-inc', 'Private', 'Private', 'Private',
       'Private', 'Private', 'Local-gov', 'Private', 'Private', 'Private',
       'Private', 'Private', 'Private', 'Local-gov', 'Private', 'Private',
       'Federal-gov', 'Private', 'Private', 'Private', 'Local-gov',
       'Local-gov', 'Self-emp-not-inc', 'Private', 'Private',
       'Federal-gov', 'Private', 'Private', 'Self-emp-not-inc', 'Private',
       'Private', 'Self-emp-inc', 'Private', 'Local-gov'], dtype=object)

# Inspect the first 100 values of column 10 of df.
# `data` is the NumPy array returned by np.delete: it has no `.values`
# attribute, `data[10]` would select a row, and column 10 was removed from it.
# The column therefore has to be read from the DataFrame instead.
df[10].values[:100]

array([ 2174,     0,     0,     0,     0,     0,     0,     0, 14084,
        5178,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,  5013,  2407,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0, 14344,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0], dtype=int64)