import pandas
columns = ["age", "workclass", "fnlwgt", "education", "education_num", "marital_status", "occupation", "relationship", "race", "sex",
"capital_gain", "capital_loss", "hours_per_week", "native_country", "high_income"]
income = pandas.read_csv("D:\\test\machineLearning\income.csv", names=columns)
print income.head(2)
columns = ["education","marital_status","occupation","relationship","race", "sex", "native_country", "high_income"]
for name in columns:
col = pandas.Categorical.from_array(income[name])
income[name]=col.codes
print income.head(2)
age workclass fnlwgt education education_num \
0 39 State-gov 77516 Bachelors 13
1 50 Self-emp-not-inc 83311 Bachelors 13
marital_status occupation relationship race sex \
0 Never-married Adm-clerical Not-in-family White Male
1 Married-civ-spouse Exec-managerial Husband White Male
capital_gain capital_loss hours_per_week native_country high_income
0 2174 0 40 United-States <=50K
1 0 0 13 United-States <=50K
age workclass fnlwgt education education_num marital_status \
0 39 State-gov 77516 9 13 4
1 50 Self-emp-not-inc 83311 9 13 2
occupation relationship race sex capital_gain capital_loss \
0 1 1 4 1 2174 0
1 4 0 4 1 0 0
hours_per_week native_country high_income
0 40 26 0
1 13 26 0
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import roc_auc_score
import numpy as np
import math
columns = ["age","capital_gain","occupation","relationship","race", "sex", "native_country", "high_income"]
numpy.random.seed(10)
income=income.reindex(numpy.random.permutation(income.index))
train_max_row = math.floor(income.shape[0]*.8)
train = income.iloc[:int(train_max_row)]
test = income.iloc[int(train_max_row):]
clf = DecisionTreeClassifier(random_state=1,min_samples_leaf=2)
clf.fit(train[columns],train["high_income"])
clf2 = DecisionTreeClassifier(random_state=1,max_depth=5)
clf2.fit(train[columns],train["high_income"])
predictions = clf.predict(test[columns])
print (roc_auc_score(test["high_income"],predictions))
predictions = clf2.predict(test[columns])
print (roc_auc_score(test["high_income"],predictions))
1.0
1.0
predict = clf.predict_proba(test[columns])[:,1]
predict2 = clf2.predict_proba(test[columns])[:,1]
combined = (predict+predict2)/2
rounded = np.round(combined)
print(roc_auc_score(test["high_income"],rounded))
1.0
tree_count = 10
bag_proportion = .6
predictions = []
for i in range(tree_count):
bag = train.sample(frac=bag_proportion,replace=True,random_state=i)
clf = DecisionTreeClassifier(random_state=1,min_samples_leaf=2)
clf.fit(bag[columns],bag["high_income"])
predictions.append(clf.predict_proba(test[columns])[:,1])
combined = numpy.sum(predictions,axis=0)/10
rounded = numpy.round(combined)
print(roc_auc_score(test["high_income"],rounded))
1.0
tree_count = 10
bag_proportion = .6
predictions = []
for i in range(tree_count):
bag = train.sample(frac=bag_proportion,replace=True,random_state=i)
clf = DecisionTreeClassifier(random_state=1,min_samples_leaf=2,splitter="random",max_features="auto")
clf.fit(bag[columns],bag["high_income"])
predictions.append(clf.predict_proba(test[columns])[:,1])
combined = numpy.sum(predictions,axis=0)/10
rounded = numpy.round(combined)
print(roc_auc_score(test["high_income"],rounded))
1.0
from sklearn.ensemble import RandomForestClassifier
clf = RandomForestClassifier(n_estimators=5,random_state=1,min_samples_leaf=2)
clf.fit(train[columns],train["high_income"])
predict = clf.predict(test[columns])
print(roc_auc_score(test["high_income"],predict))
1.0