import pandas

# Column names for the UCI "Adult" (census income) dataset, which ships
# without a header row.
columns = ["age", "workclass", "fnlwgt", "education", "education_num",
           "marital_status", "occupation", "relationship", "race", "sex",
           "capital_gain", "capital_loss", "hours_per_week",
           "native_country", "high_income"]
# income = pandas.read_csv("income.csv", index_col=False, names=columns)
# Use a raw string for the Windows path: the original
# "D:\\test\machineLearning\income.csv" relied on "\m" and "\i" not being
# recognized escape sequences, which raises a SyntaxWarning (and will be an
# error) in recent Python versions.
income = pandas.read_csv(r"D:\test\machineLearning\income.csv", names=columns)
print(income.head(2))
# Pasted interpreter output (commented out so the file remains valid Python):
#    age         workclass  fnlwgt  education  education_num \
# 0   39         State-gov   77516  Bachelors             13
# 1   50  Self-emp-not-inc   83311  Bachelors             13
#        marital_status       occupation   relationship   race   sex \
# 0       Never-married     Adm-clerical  Not-in-family  White  Male
# 1  Married-civ-spouse  Exec-managerial        Husband  White  Male
#    capital_gain  capital_loss  hours_per_week native_country high_income
# 0          2174             0              40  United-States       <=50K
# 1             0             0              13  United-States       <=50K
# Categorical feature columns (plus the label) that must be converted from
# strings to integer codes before they can be fed to scikit-learn.
columns = ["education", "marital_status", "occupation", "relationship",
           "race", "sex", "native_country", "high_income"]
for name in columns:
    # Map each distinct category to an integer code.
    # pandas.Categorical.from_array was deprecated and removed from pandas;
    # the Categorical constructor is the supported equivalent.
    income[name] = pandas.Categorical(income[name]).codes
import numpy
import math

# Shuffle the rows so the train/test split is random but reproducible.
numpy.random.seed(10)
income = income.reindex(numpy.random.permutation(income.index))

# First 80% of rows -> training set, remaining 20% -> test set.
# math.floor already returns an int in Python 3, so the extra int() casts
# from the original are redundant.
train_max_row = math.floor(income.shape[0] * 0.8)
train = income.iloc[:train_max_row]
test = income.iloc[train_max_row:]
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import roc_auc_score

# Fit an unconstrained decision tree on the training set.
clf = DecisionTreeClassifier(random_state=10)
clf.fit(train[columns], train["high_income"])

# AUC on the held-out test set. NOTE: despite the name, `error` holds an
# AUC score (higher is better), not an error rate.
prediction = clf.predict(test[columns])
error = roc_auc_score(test["high_income"], prediction)
# The original used the Python 2 statement `print error`, which is a
# SyntaxError in Python 3.
print(error)
# Pasted interpreter output: 1.0
# AUC on the training set. Comparing this with the test AUC reveals
# overfitting: a large gap means the tree memorized the training data.
prediction = clf.predict(train[columns])
error = roc_auc_score(train["high_income"], prediction)
print(error)  # Python 3 print() -- the original `print error` is Py2-only
# Pasted interpreter output: 1.0
# If the training and test scores differ a lot, the model is very likely
# overfitting.  Remedies: 1. pruning  2. training an ensemble of many trees
# 3. adding constraints so a single tree cannot grow too deep/complex.
#
# min_samples_split=5: stop splitting a node once it holds fewer than 5
# samples.  (The parameterless DecisionTreeClassifier that preceded this in
# the original was dead code -- it was immediately overwritten -- so it has
# been dropped.)
clf = DecisionTreeClassifier(min_samples_split=5, random_state=10)
clf.fit(train[columns], train["high_income"])
prediction = clf.predict(test[columns])
error = roc_auc_score(test["high_income"], prediction)
print(error)
# Pasted interpreter output: 1.0
# Constrain both the minimum split size and the maximum tree depth to curb
# overfitting.  (The parameterless DecisionTreeClassifier that preceded this
# in the original was dead code -- immediately overwritten -- and is dropped.)
clf = DecisionTreeClassifier(min_samples_split=13, max_depth=7, random_state=1)
clf.fit(train[columns], train["high_income"])
prediction = clf.predict(test[columns])
error = roc_auc_score(test["high_income"], prediction)
print(error)
# Pasted interpreter output: 1.0