决策树算法
- 检查是否存在缺失数据
def queshi(datafile = 'Data/Decision_tree/HR.csv'):
    """Load a CSV file and report which columns contain missing values.

    Args:
        datafile: Path of the CSV file to inspect.

    Returns:
        A list of booleans, one per column in file order; ``True`` means
        the column holds at least one missing value.
    """
    import pandas as pd

    frame = pd.read_csv(datafile, index_col=None)
    missing_mask = frame.isnull()
    # Collapse the cell-level mask to one flag per column.
    return missing_mask.any().tolist()
- 模型训练
def xunlian(modelpath='pythonmodel/clf.pkl',
            datafile='Data/Decision_tree/HR.csv',
            catclo=["sales", "salary"],
            outclo='left'):
    """Train a decision tree on a CSV file, save it, and report its test AUC.

    The columns named in ``catclo`` are replaced by their integer category
    codes, the data is split into a stratified 85/15 train/test partition
    (``random_state=123``), and an entropy-based ``DecisionTreeClassifier``
    is fitted on the training part and dumped to ``modelpath`` with joblib.

    Args:
        modelpath: File the fitted classifier is written to. Its parent
            directory is created when it does not exist yet.
        datafile: CSV file holding both the features and the target column.
        catclo: Names of the categorical columns to convert to integer
            codes. The default list is only iterated, never mutated.
        outclo: Name of the target column.
            NOTE(review): the AUC computation assumes a binary target --
            confirm against the data set.

    Returns:
        The string ``"决策树 AUC = <value>"`` with the ROC AUC measured on
        the held-out test split.
    """
    import os

    import pandas as pd
    from sklearn.metrics import roc_auc_score
    from sklearn.model_selection import train_test_split
    from sklearn.tree import DecisionTreeClassifier
    try:
        import joblib
    except ImportError:
        # Only very old scikit-learn releases still bundle joblib;
        # sklearn.externals.joblib was removed in scikit-learn 0.23.
        from sklearn.externals import joblib

    df = pd.read_csv(datafile, index_col=None)
    for column in catclo:
        df[column] = df[column].astype('category').cat.codes

    X = df.drop(outclo, axis=1)
    y = df[outclo]
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.15, random_state=123, stratify=y)

    clf = DecisionTreeClassifier(
        criterion='entropy',
        min_weight_fraction_leaf=0.01,
    )
    clf.fit(X_train, y_train)

    # ROC AUC must be computed from scores, not thresholded labels: use the
    # predicted probability of the class with the greater label
    # (clf.classes_[1]), as roc_auc_score documents for the binary case.
    positive_scores = clf.predict_proba(X_test)[:, 1]
    clf_roc_auc = roc_auc_score(y_test, positive_scores)
    zhunquelv = "决策树 AUC = %2.2f" % clf_roc_auc

    # joblib.dump does not create missing directories.
    model_dir = os.path.dirname(modelpath)
    if model_dir:
        os.makedirs(model_dir, exist_ok=True)
    joblib.dump(clf, modelpath)
    return zhunquelv
- 图像展示决策树各数据特征
def tezheng(modelpath='pythonmodel/clf.pkl',
            datafile='Data/Decision_tree/HR.csv',
            col='left',
            jpgname='TREE.png'):
    """Plot the saved model's feature importances, save the figure, show it.

    The importances of the classifier stored at ``modelpath`` are drawn as
    bars in descending order, together with a step line of their cumulative
    sum, which helps judge which features drive the tree's decisions.

    Args:
        modelpath: Path of the joblib-pickled classifier. joblib uses
            pickle underneath, so only load model files you trust.
        datafile: CSV file whose columns (minus ``col``) supply the feature
            names for the x-axis labels.
            NOTE(review): assumes these columns are in the same order as
            the features the model was trained on -- confirm.
        col: Name of the target column to exclude from the feature names.
        jpgname: File name the figure is saved to.

    Returns:
        None. The figure is written to ``jpgname`` and then displayed.
    """
    import matplotlib.pyplot as plt
    import numpy as np
    import pandas as pd
    try:
        import joblib
    except ImportError:
        # Only very old scikit-learn releases still bundle joblib;
        # sklearn.externals.joblib was removed in scikit-learn 0.23.
        from sklearn.externals import joblib

    df = pd.read_csv(datafile, index_col=None)
    clfload = joblib.load(modelpath)

    importances = clfload.feature_importances_
    feat_names = df.drop([col], axis=1).columns
    # Feature positions sorted from most to least important.
    indices = np.argsort(importances)[::-1]
    positions = range(len(indices))

    plt.figure(figsize=(12, 6))
    plt.title("Feature importances by Decision Tree")
    plt.bar(positions, importances[indices], color='lightblue', align="center")
    plt.step(positions, np.cumsum(importances[indices]), where='mid',
             label='Cumulative')
    plt.xticks(positions, feat_names[indices], rotation='vertical',
               fontsize=14)
    plt.xlim([-1, len(indices)])
    # Without a legend the 'Cumulative' label above is never displayed.
    plt.legend()
    plt.savefig(jpgname)
    plt.show()
- 载入模型,取指定数据文件预测值
def mainfromfile(modelpath='pythonmodel/clf.pkl',
                 datafile='Data/Decision_tree/HR2.csv',
                 col='left',
                 begin=3242,
                 end=3244):
    """Load the saved model and predict for a slice of rows from a CSV file.

    Args:
        modelpath: Path of the joblib-pickled classifier. joblib uses
            pickle underneath, so only load model files you trust.
        datafile: CSV file to take the rows from.
        col: Name of the target column, dropped before predicting.
        begin: Positional index of the first row to predict (inclusive).
        end: Positional index one past the last row to predict.

    Returns:
        The array returned by the model's ``predict`` for rows
        ``begin:end``.
    """
    import pandas as pd
    try:
        import joblib
    except ImportError:
        # Only very old scikit-learn releases still bundle joblib;
        # sklearn.externals.joblib was removed in scikit-learn 0.23.
        from sklearn.externals import joblib

    clfload = joblib.load(modelpath)
    df = pd.read_csv(datafile, index_col=None)
    features = df.drop(col, axis=1)
    # The first remaining column is skipped.
    # NOTE(review): presumably an exported row-index column in this file --
    # confirm against the CSV layout.
    selected_rows = features.iloc[begin:end, 1:]
    y_feature = clfload.predict(selected_rows)
    return y_feature
- 载入模型,传入数值后返回预测值
def mainfromdata(modelpath='pythonmodel/clf.pkl',
                 data=[[0.38, 0.53, 2, 157, 3, 0, 0, 7, 1]]):
    """Load the saved model and predict for feature rows passed in directly.

    Args:
        modelpath: Path of the joblib-pickled classifier. joblib uses
            pickle underneath, so only load model files you trust.
        data: Sequence of feature rows, one inner sequence per sample.
            The default is only read (copied into a DataFrame), never
            mutated.
            NOTE(review): values are assumed to be in the feature order
            used at training time -- confirm.

    Returns:
        The array returned by the model's ``predict``, one entry per row
        of ``data``.
    """
    import pandas as pd
    try:
        import joblib
    except ImportError:
        # Only very old scikit-learn releases still bundle joblib;
        # sklearn.externals.joblib was removed in scikit-learn 0.23.
        from sklearn.externals import joblib

    clfload = joblib.load(modelpath)
    samples = pd.DataFrame(data)
    y_feature = clfload.predict(samples)
    return y_feature