from sklearn import tree
# Baseline: a decision tree with default hyper-parameters.
# fit() hands back the estimator, so construction and training are chained.
clf = tree.DecisionTreeClassifier().fit(features_train, labels_train)
# Accuracy on the held-out test split.
accuracy = clf.score(features_test, labels_test)
# Per-sample predictions on the same test split.
pred = clf.predict(features_test)
`min_samples_split`:
The minimum number of samples required to split an internal node.
当 `min_samples_split` 设为 50 时,样本数少于 50 的节点不再继续划分,因此可以在一定程度上减少过拟合。
## 决策树编码
def classify(features_train, labels_train):
    """Train a decision tree classifier on the given training data.

    Args:
        features_train: Training features, passed unchanged to ``fit``.
        labels_train: Training labels, passed unchanged to ``fit``.

    Returns:
        The value returned by ``DecisionTreeClassifier.fit`` (the fitted
        classifier).
    """
    # Local import kept from the original snippet so the function can be
    # pasted and used on its own.
    from sklearn import tree

    clf = tree.DecisionTreeClassifier()
    # The original rebinds clf to fit()'s return value and returns it;
    # returning the call result directly is the same thing.
    return clf.fit(features_train, labels_train)
## 决策树准确性
from sklearn.ensemble import RandomForestClassifier
# This section measures *decision tree* accuracy, so train a single
# DecisionTreeClassifier here instead of a random forest ensemble.
from sklearn import tree
clf = tree.DecisionTreeClassifier()
clf.fit(features_train, labels_train)
pred = clf.predict(features_test)
# Accuracy of the fitted tree on the held-out test split.
acc = clf.score(features_test, labels_test)
## 决策树准确性:min_samples_split 取 2 与 50 的对比
from sklearn.ensemble import RandomForestClassifier
# Compare test accuracy of a decision tree for two min_samples_split values.
# A single DecisionTreeClassifier is used (not a random forest) so the
# comparison matches the decision-tree topic of this section.
from sklearn import tree

# min_samples_split=2: a node may be split as soon as it holds two samples.
clf = tree.DecisionTreeClassifier(min_samples_split=2)
clf.fit(features_train, labels_train)
pred = clf.predict(features_test)
acc_min_samples_split_2 = clf.score(features_test, labels_test)

# min_samples_split=50: nodes with fewer than 50 samples are never split.
clf = tree.DecisionTreeClassifier(min_samples_split=50)
clf.fit(features_train, labels_train)
pred = clf.predict(features_test)
acc_min_samples_split_50 = clf.score(features_test, labels_test)
## 熵公式

$$\text{entropy} = -\sum_i p_i \log_2 p_i$$
![](https://i-blog.csdnimg.cn/blog_migrate/302fa941ddb303c7d1936c25ca2e3bf5.jpeg)
![](https://i-blog.csdnimg.cn/blog_migrate/725e965148309c200a6fffc7e34ae12d.jpeg)
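## 信息增益

$$\text{information gain} = \text{entropy(parent)} - [\text{weighted average}]\ \text{entropy(children)}$$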
## 第一个邮件 DT:准确率
from sklearn.ensemble import RandomForestClassifier
# Decision tree (DT) for the e-mail data with min_samples_split=40.
# The section is about a decision tree, so use DecisionTreeClassifier
# rather than a random forest.
from sklearn import tree
clf = tree.DecisionTreeClassifier(min_samples_split=40)
clf.fit(features_train, labels_train)
pred = clf.predict(features_test)
# Accuracy of the fitted tree on the held-out test split.
accuracy = clf.score(features_test, labels_test)
print(accuracy)
## 通过特征选择加速
# Print the length of the first training row; presumably this is the number
# of features per sample (assumes features_train is row-per-sample — confirm).
print(len(features_train[0]))
## 更改特征数量
# email_preprocess.py — change made in that file:
selector = SelectPercentile(f_classif, percentile=1) # percentile=1 means only 1% of the features are kept
# dt_author_id.py — change made in that file:
# print the length of the first training row to see how many features remain
# (assumes features_train is row-per-sample — confirm).
print(len(features_train[0]))