1、基本架构
# Loadlibraries
import pandas as pd
from pandas.plotting import scatter_matrix
import matplotlib.pyplot as plt
from sklearn import model_selection
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
import warnings
warnings.filterwarnings('ignore')
# Load dataset (classic iris data: four numeric features plus a class label).
names = ['sepal-length', 'sepal-width', 'petal-length', 'petal-width', 'class']
dataset = pd.read_csv("iris.data", names=names)
# shape
print(dataset.shape)
# head
print(dataset.head(20))
# descriptions
print(dataset.describe())
# class distribution
print(dataset.groupby('class').size())
# box and whisker plots
dataset.plot(kind='box', subplots=True, layout=(2, 2), sharex=False, sharey=False)
plt.show()
# histograms
dataset.hist()
plt.show()
# scatter plot matrix
scatter_matrix(dataset)
plt.show()
# Split-out validation dataset
array = dataset.values
print(array)
# BUG FIX: the original used array[:, :3], which keeps only the first three
# feature columns; the iris data has four features in columns 0..3 (the
# original inline comment even said it should be array[:, 0:4]).
X = array[:, 0:4]  # feature columns 0..3
print(X)
Y = array[:, 4]  # class label (column index 4)
print(Y)
validation_size = 0.20  # fraction of samples held out for validation
seed = 7
X_train, X_validation, Y_train, Y_validation = model_selection.train_test_split(
    X, Y, test_size=validation_size, random_state=seed)
# training set: X_train, Y_train
# validation set: X_validation, Y_validation
# Test options and evaluation metric
seed = 7
scoring = 'accuracy'
# Spot-check algorithms
models = []
models.append(('LR', LogisticRegression()))
models.append(('LDA', LinearDiscriminantAnalysis()))
models.append(('KNN', KNeighborsClassifier()))
models.append(('CART', DecisionTreeClassifier()))
models.append(('NB', GaussianNB()))
models.append(('SVM', SVC()))
# evaluate each model in turn with 10-fold cross-validation
results = []
names = []
for name, model in models:
    # BUG FIX: passing random_state without shuffle=True is an error in
    # modern scikit-learn (KFold ignores/rejects the seed when not shuffling).
    kfold = model_selection.KFold(n_splits=10, shuffle=True, random_state=seed)
    cv_results = model_selection.cross_val_score(
        model, X_train, Y_train, cv=kfold, scoring=scoring)
    results.append(cv_results)
    names.append(name)
    msg = "%s: %f (%f)" % (name, cv_results.mean(), cv_results.std())
    print(msg)
# Compare algorithms with side-by-side box plots of the CV scores
fig = plt.figure()
fig.suptitle('Algorithm Comparison')
ax = fig.add_subplot(111)
plt.boxplot(results)
ax.set_xticklabels(names)
plt.show()
# Make predictions on validation dataset with KNN
knn = KNeighborsClassifier()
knn.fit(X_train, Y_train)
predictions = knn.predict(X_validation)
print(accuracy_score(Y_validation, predictions))
print(confusion_matrix(Y_validation, predictions))
print(classification_report(Y_validation, predictions))
2、绘制混淆矩阵(基于决策树算法)
# Feature columns used as model inputs and the click label.
# NOTE: "new_user_class_level " keeps its trailing space to match the CSV
# header exactly — do not "fix" it.
arrays = [
    "price",
    "age_level",
    "pid",
    "final_gender_code",
    "cate_id",
    "brand",
    "campaign_id",
    "new_user_class_level ",
    "shopping_level",
    "pvalue_level",
    "occupation",
]
x = file_3.loc[:, arrays]
y = file_3.loc[:, "clk"]
def cm_plot(original_label, predict_label, pic=None):
    """Print a classification report and draw the confusion matrix.

    Parameters
    ----------
    original_label : array-like of true labels.
    predict_label : array-like of predicted labels.
    pic : optional; when given, the figure is also saved as "<pic>.jpg".
    """
    ans = metrics.classification_report(original_label, predict_label, digits=5)
    print(ans)
    # Confusion matrix from true and predicted labels:
    # rows = true labels, columns = predicted labels.
    cm = confusion_matrix(original_label, predict_label)
    plt.figure()
    plt.matshow(cm, cmap=plt.cm.Blues)  # draw the matrix with the Blues colormap
    plt.colorbar()  # color scale bar
    for i in range(len(cm)):
        for j in range(len(cm)):
            # BUG FIX: annotate() takes xy as (x, y) = (column, row), so the
            # count cm[i, j] belongs at xy=(j, i).  The original placed
            # cm[x, y] at xy=(x, y), transposing every off-diagonal count.
            # (Loop variables also renamed i/j so they no longer shadow the
            # module-level x and y.)
            plt.annotate(cm[i, j], xy=(j, i),
                         horizontalalignment='center',
                         verticalalignment='center')
    # BUG FIX: with sklearn's confusion_matrix the y axis (rows) is the true
    # label and the x axis (columns) the prediction; the original labels were
    # swapped.
    plt.ylabel('True label')
    plt.xlabel('Predicted label')
    plt.title('confusion matrix')
    if pic is not None:
        plt.savefig(str(pic) + '.jpg')
    plt.show()
# Train a decision tree and evaluate it on a stratified 80/20 hold-out split.
clf = tree.DecisionTreeClassifier(splitter='random', max_depth=100, min_samples_leaf=12)
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, stratify=y)
clf = clf.fit(x_train, y_train)
score = clf.score(x_train, y_train)   # accuracy on the training split
score1 = clf.score(x_test, y_test)    # accuracy on the hold-out split
y_true = y_test
y_pred = clf.predict(x_test)
cm_plot(y_true, y_pred)
c = confusion_matrix(y_true, y_pred)
ans = metrics.classification_report(y_true, y_pred, digits=5)
# BUG FIX: the trailing Chinese notes on the next three lines were bare text
# fused onto the print statements (a SyntaxError); they are now comments.
print(ans)  # print the classification report
print(c)    # print the confusion matrix in text form
print(clf.classes_)  # print the class labels the tree learned
3、绘制roc曲线(基于决策树算法)
# Train a fresh decision tree and plot its ROC curve on a hold-out split.
clf = tree.DecisionTreeClassifier(splitter='random', max_depth=100, min_samples_leaf=12)
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, stratify=y)
clf = clf.fit(x_train, y_train)
y_true = y_test
# Probability of the positive class (column 1) for every test sample.
y_score = clf.predict_proba(x_test)[:, 1]
def rocs():
    """Compute FPR/TPR/AUC from module-level y_test/y_score and plot the ROC curve."""
    # BUG FIX: the original printed y_pred — a stale name left over from the
    # previous section (NameError when this section runs on its own) — and
    # called roc_curve twice; compute it once and reuse the result.
    fpr, tpr, thresholds = roc_curve(y_test, y_score)
    print((fpr, tpr, thresholds))
    print(fpr)
    print(tpr)
    print("#" * 10)
    print("auc")
    roc_auc = auc(fpr, tpr)
    print(roc_auc)
    print("#" * 10)
    plt.figure()
    lw = 2
    # BUG FIX: give the curve a label so the plt.legend() call below has an
    # entry to show (a legend on unlabeled artists displays nothing).
    plt.plot(fpr, tpr, color='darkorange', lw=lw, linestyle='--',
             label='ROC curve (AUC = %0.3f)' % roc_auc)
    plt.plot([0, 1], [0, 1], color='navy', lw=lw, linestyle='--')  # chance line
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('The roc curve')
    plt.legend(loc="lower right")
    plt.show()
rocs()
4、绘制决策树
import graphviz
import pandas as pd
from sklearn import metrics, tree
from sklearn.metrics import confusion_matrix, roc_curve, auc
from sklearn.model_selection import train_test_split

pd.set_option('display.max_rows', None)     # show all rows
pd.set_option('display.max_columns', None)  # show all columns
# BUG FIX: the bare 'max_colwidth' alias is deprecated and removed in modern
# pandas; the fully qualified option name must be used.
pd.set_option('display.max_colwidth', 50)   # max default column width
file_3 = pd.read_csv("improve_1.csv")
# Feature columns ("new_user_class_level " keeps its trailing space to match
# the CSV header exactly) and the click label.
arrays = ["price", "age_level", "pid", "final_gender_code", "cate_id", "brand",
          "campaign_id", "new_user_class_level ", "shopping_level",
          "pvalue_level", "occupation"]
x = file_3[arrays]
y = file_3["clk"]
# Fit a tree on a small training slice (99.2% held out) so the rendered tree
# stays readable, then export it via graphviz.
clf = tree.DecisionTreeClassifier(splitter='random', max_depth=100, min_samples_leaf=12)
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.992, stratify=y)
clf = clf.fit(x_train, y_train)
dot_data = tree.export_graphviz(clf, class_names=["noclk", "clk"], feature_names=arrays)
graph = graphviz.Source(dot_data)
graph.render("abcd")  # writes the dot source "abcd" plus the rendered PDF
中文网站:
http://www.scikitlearn.com.cn/