# -*- coding: utf-8 -*-
from sklearn import tree
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.externals import joblib
from timeit import time
# ---------------------------------------------------------------------------
# Train a decision-tree colour classifier on ./color_detect2.csv.
# The CSV has no header row: columns 0-5 are the features, column 6 is the
# integer class label.  A grid search over (min_samples_split,
# min_samples_leaf, max_depth) keeps the tree that generalises best
# (train/test accuracy gap < 3 points and test accuracy > 80%), then saves it.
# ---------------------------------------------------------------------------
# Pass the path straight to pandas instead of open() — no file handle to leak.
data_temp = pd.read_csv('./color_detect2.csv', header=None)
# Stack the seven columns into one (n_samples, 7) array and shuffle the rows.
data = np.vstack([data_temp[col].values for col in range(7)]).T
np.random.shuffle(data)
X_origin = data[:, 0:6]
# np.int was removed in NumPy 1.24 — use the builtin int instead.
Y_origin = data[:, 6].astype(int)
x_train, x_test, y_train, y_test = train_test_split(X_origin, Y_origin,
                                                    test_size=0.3)

# min_samples_split / min_samples_leaf / max_depth are tuned to control
# overfitting (original note: "min_samples_split max_leaf_nodes 调节过拟合").
max_acc = 0
a, b, c = 0, 0, 0
best_clf = None  # best generalising model found so far
for i in range(120, 140):        # candidate min_samples_split values
    for j in range(20, 60):      # candidate min_samples_leaf values
        for k in range(1, 7):    # candidate max_depth values
            # BUG FIX: the original hard-coded the hyper-parameters
            # (min_samples_split=33, min_samples_leaf=64, max_depth=6), so
            # the loop indices i/j/k recorded as "best" never influenced the
            # model at all.  Feed them into the classifier as intended.
            clf = tree.DecisionTreeClassifier(splitter='random',
                                              class_weight='balanced',
                                              min_samples_split=i,
                                              min_samples_leaf=j,
                                              max_depth=k,
                                              max_leaf_nodes=50)
            clf.fit(x_train, y_train)
            acc_train = round(clf.score(x_train, y_train) * 100, 2)
            t1 = time.time()
            acc_test = round(clf.score(x_test, y_test) * 100, 2)
            t2 = time.time()
            print('time: ', (t2 - t1))
            # Accept only trees that generalise: small train/test gap and a
            # usable test accuracy.
            if abs(acc_test - acc_train) < 3 and acc_test > 80:
                # BUG FIX: the original printed the stale best (a, b, c)
                # here instead of the current candidate (i, j, k).
                print('acc_train, acc_test, min_samples_split, '
                      'min_samples_leaf, max_depth: ',
                      acc_train, acc_test, i, j, k)
                if acc_test > max_acc:
                    max_acc = acc_test
                    a, b, c = i, j, k
                    best_clf = clf  # keep the best model, not the last one

print('acc_test, min_samples_split, min_samples_leaf, max_depth: ',
      max_acc, a, b, c)

# Save the trained model.  Fall back to the last fitted tree if no candidate
# passed the acceptance criteria (mirrors the original "save whatever clf
# holds" behaviour).
joblib.dump(best_clf if best_clf is not None else clf, "colordetect_model.m")
'''
load the model:
model = joblib.load("./colordetect_model.m")
acc_test2 = round(model.score(x_test, y_test) * 100, 2)
print('acc_test: ', acc_test2)
'''
'''
1.决策树在应对高维数据时很容易过拟合,因此保持自变量个数和样本个数间的比例非常重要,其实不管是对什么预测算法,当样本个数接近自变量个数时都容易发生过拟合;
2.可以考虑对自变量进行维数约简,或进行特征选择,以最大程度保留较少更有说服力的自变量,以尽量减少过拟合的风险;
3.多使用决策树结构的可视化方法,可以帮助你理解你当前树的生长效果,也可以更好的与现实业务联系起来进行解释;
4.树的深度(即距离最远的叶结点对应距离根结点的距离)初始化设置为3较好,再逐步增长该参数,观察训练效果的变化情况,以作出最好的选择,以及控制过拟合情况;
5.使用min_samples_split或与其等价的参数来控制生成叶结点时的样本个数下限,因为通常该参数越小,树过拟合的风险都越大,因此尽早生成叶结点可以缓解对样本数据独特性的放大,进而减少过拟合风险;
6.在训练之前尽量平衡类别间的比例,以避免训练结果因为类别的严重不平衡而产生虚假结果(比如样本中9个正例1个反例,训练出的模型全部归类为正例也能取得90%的正确率,但这不可靠),或者调节sample_weight来对所有类别进行再缩放;
'''