adult数据集的数据挖掘(决策树)
1.数据简介
项目 | Value |
---|---|
income(标签,二分类) | >50K, <=50K |
age | continuous |
workclass | Private, Self-emp-not-inc, Self-emp-inc, Federal-gov, Local-gov, State-gov, Without-pay, Never-worked |
fnlwgt | continuous |
education | Bachelors, Some-college, 11th, HS-grad, Prof-school, Assoc-acdm, Assoc-voc, 9th, 7th-8th, 12th, Masters… |
education-num | continuous |
marital-status | Married-civ-spouse, Divorced, Never-married, Separated, Widowed, Married-spouse-absent, Married-AF-spouse. |
occupation | Tech-support, Craft-repair, Other-service, Sales, Exec-managerial, Prof-specialty, Handlers-cleaners, Machine-op-inspct… |
relationship | Wife, Own-child, Husband, Not-in-family, Other-relative, Unmarried. |
race | White, Asian-Pac-Islander, Amer-Indian-Eskimo, Other, Black. |
sex | Female, Male |
capital-gain | continuous |
capital-loss | continuous |
hours-per-week | continuous |
native-country | United-States, Cambodia, England, Germany, India,Japan, Greece, South, China… |
2.准备工作
# Setup: imports and loading of the raw dataset.
from sklearn.datasets import load_wine
from sklearn import metrics
import matplotlib.pyplot as plt
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn import tree
import pandas as pd
# Read 'adult.data' from the current working directory. header=None tells pandas
# not to treat the first row as column names, so columns are labelled 0, 1, 2, ...
# and the rest of the script addresses them by those integer positions.
data=pd.read_csv('adult.data',header=None)# load the data
3.数据处理
# Integer encoding of the categorical columns, keyed by column position.
# Within a column, a label's code is its index in the tuple (first label -> 0).
# Labels are written with a leading space, as the raw values are expected to
# appear in the loaded frame (fields presumably separated by ", ") -- confirm
# against the data file.
CATEGORY_LABELS = {
    # workclass
    1: (
        ' Private', ' Self-emp-not-inc', ' Self-emp-inc', ' Federal-gov',
        ' Local-gov', ' State-gov', ' Without-pay', ' Never-worked',
    ),
    # education
    3: (
        ' Bachelors', ' Some-college', ' 11th', ' HS-grad', ' Prof-school',
        ' Assoc-acdm', ' Assoc-voc', ' 9th', ' 7th-8th', ' 12th', ' Masters',
        ' 1st-4th', ' 10th', ' Doctorate', ' 5th-6th', ' Preschool',
    ),
    # marital-status
    5: (
        ' Married-civ-spouse', ' Divorced', ' Never-married', ' Separated',
        ' Widowed', ' Married-spouse-absent', ' Married-AF-spouse',
    ),
    # occupation
    6: (
        ' Tech-support', ' Craft-repair', ' Other-service', ' Sales',
        ' Exec-managerial', ' Prof-specialty', ' Handlers-cleaners',
        ' Machine-op-inspct', ' Adm-clerical', ' Farming-fishing',
        ' Transport-moving', ' Priv-house-serv', ' Protective-serv',
        ' Armed-Forces',
    ),
    # relationship
    7: (
        ' Wife', ' Own-child', ' Husband', ' Not-in-family',
        ' Other-relative', ' Unmarried',
    ),
    # race
    8: (
        ' White', ' Asian-Pac-Islander', ' Amer-Indian-Eskimo', ' Other',
        ' Black',
    ),
    # sex
    9: (' Male', ' Female'),
    # income label: 0 means ">50K", 1 means "<=50K"
    14: (' >50K', ' <=50K'),
}

# Encode each column in one pass and assign the result back to the frame.
# The previous form, data[col][data[col] == label] = code, is chained indexing:
# it writes through an intermediate Series, which pandas does not guarantee to
# propagate to the frame and which has no effect under copy-on-write.
# Values that have no code in the table are left exactly as they were.
for column, labels in CATEGORY_LABELS.items():
    codes = {label: code for code, label in enumerate(labels)}
    # Bind codes as a default argument so each lambda uses its own column's table.
    data[column] = data[column].map(
        lambda value, codes=codes: codes.get(value, value)
    )
4.建模(决策树)
# Feature matrix: the columns at positions 3, 5, 8 and 9, cast to integers.
X = data.iloc[:, [3, 5, 8, 9]].astype('int')
# Target vector: the column at position 14, cast to integers.
Y = data.iloc[:, 14].astype('int')

# Hold out 20% of the rows as the test set; the remainder is used for training.
x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size=0.2)

# Build a decision tree with default settings and fit it on the training rows.
# fit() returns the estimator itself, so dt_clf is the fitted classifier.
dt_clf = tree.DecisionTreeClassifier().fit(x_train, y_train)

# Predict the held-out rows and measure agreement with the true labels.
tree_y_pred = dt_clf.predict(x_test)
metrics.accuracy_score(y_test, tree_y_pred)
5.决策树展示
import graphviz

# Quick preview of the fitted tree drawn with matplotlib.
tree.plot_tree(dt_clf)

# Human-readable names passed to the exporter for the model's features.
feature_name = ['education', 'marital-status', 'race', 'sex']
# Export the fitted tree as DOT source with filled, rounded nodes.
dot_data = tree.export_graphviz(
    dt_clf,
    filled=True,
    rounded=True,
    feature_names=feature_name,
    class_names=['>50k', '<=50k'],
)
graph = graphviz.Source(dot_data)
graph  # Option 1: display inline (shown when this is the last expression of a notebook cell)

# Option 2: render to a file.
# Swap the default font for one that has CJK glyphs so Chinese text renders.
# Depending on the scikit-learn version, the DOT source contains either
# fontname=helvetica or fontname="helvetica". Strip any existing quotes first so
# the substitution never yields doubled quotes, which Graphviz rejects.
dot_data = dot_data.replace('"helvetica"', 'helvetica')
dot_data = dot_data.replace('helvetica', '"Microsoft Yahei"')
graph = graphviz.Source(dot_data)
graph.render(r'adult')  # write the rendered output using the base name 'adult'
6.结论
“>50k人群”:婚姻状况不是已婚,教育程度为学士
“<=50k人群”:婚姻状况是已婚,教育程度为教授以上级别