When analysing data, we often need to visualise the models we build and tune some of their parameters.
In Python such figures are usually drawn with Matplotlib. That workflow is static, however: every time a parameter changes, the plotting code has to be rerun to see the new figure. Our goal here is to combine Matplotlib with GUI widgets so that model parameters can be plotted interactively, i.e. adjusted directly in the GUI window while the figure is redrawn on the spot.
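The core technique is to embed a Matplotlib Figure in a Tkinter window through FigureCanvasTkAgg and to redraw it from a button callback. The following toy sketch (a hypothetical example, assuming the same Python 2-era Tkinter/Matplotlib stack used in the full program) illustrates just that: pressing the button changes a parameter and redraws the curve without restarting the script.

# Minimal sketch: embed a Figure in Tk and redraw it from a button callback (toy example).
import numpy as np
from Tkinter import Tk, Button            # Python 2; on Python 3 the module is "tkinter"
from matplotlib.figure import Figure
from matplotlib.backends.backend_tkagg import FigureCanvasTkAgg

root = Tk()
fig = Figure(figsize=(5, 4), dpi=100)
canvas = FigureCanvasTkAgg(fig, master=root)
canvas.get_tk_widget().pack()

state = {'freq': 1.0}                     # the "model parameter" controlled from the GUI

def redraw():
    state['freq'] += 1.0                  # adjust the parameter ...
    fig.clf()
    ax = fig.add_subplot(111)
    x = np.linspace(0, 2 * np.pi, 200)
    ax.plot(x, np.sin(state['freq'] * x))
    canvas.draw()                         # ... and redraw without re-running the script

Button(root, text="redraw", command=redraw).pack()
redraw()
root.mainloop()

The full listing applies the same pattern: the retrain button calls drawPic(), which retrains all classifiers and redraws their ROC curves on the embedded canvas.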
The complete program is listed below:
# coding=gbk
'''
Created on 2016-10-28
@author: ldk
'''
import time
from sklearn import metrics
import pickle
from sklearn.externals import joblib  ## for persisting models (imported here, although pickle is what is actually used below)
from matplotlib.widgets import Button  ## matplotlib's Button widget; note it is shadowed by Tkinter's Button imported further down
import pandas as pd
import random
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
import numpy as np
from sklearn.preprocessing import normalize
from sklearn.metrics import roc_curve, auc
from sklearn.cross_validation import KFold, StratifiedKFold  # old (pre-0.18) API; newer scikit-learn uses sklearn.model_selection
from scipy import interp
import matplotlib
matplotlib.use('TkAgg')  # select the Tk backend before pyplot is imported
import matplotlib.pyplot as plt
from matplotlib.backends.backend_tkagg import FigureCanvasTkAgg
from matplotlib.figure import Figure
from Tkinter import *  # Python 2 module name; on Python 3 this is "tkinter"
###############################################################################
# Multinomial Naive Bayes Classifier
def naive_bayes_classifier(train_x, train_y):
from sklearn.naive_bayes import MultinomialNB
model = MultinomialNB(alpha=0.01)
model.fit(train_x, train_y)
return model
###############################################################################
###############################################################################
# KNN Classifier
def knn_classifier(train_x, train_y):
from sklearn.neighbors import KNeighborsClassifier
model = KNeighborsClassifier()
model.fit(train_x, train_y)
return model
###############################################################################
###############################################################################
# Logistic Regression Classifier
def logistic_regression_classifier(train_x, train_y):
from sklearn.linear_model import LogisticRegression
model = LogisticRegression(penalty='l2')
model.fit(train_x, train_y)
return model
###############################################################################
###############################################################################
# Random Forest Classifier
def random_forest_classifier(train_x, train_y):
from sklearn.ensemble import RandomForestClassifier
model = RandomForestClassifier(n_estimators=20)
model.fit(train_x, train_y)
return model
###############################################################################
###############################################################################
# Decision Tree Classifier
def decision_tree_classifier(train_x, train_y):
from sklearn import tree
model = tree.DecisionTreeClassifier()
model.fit(train_x, train_y)
return model
###############################################################################
###############################################################################
# GBDT(Gradient Boosting Decision Tree) Classifier
def gradient_boosting_classifier(train_x, train_y):
from sklearn.ensemble import GradientBoostingClassifier
model = GradientBoostingClassifier(n_estimators=200)
model.fit(train_x, train_y)
return model
###############################################################################
###############################################################################
# SVM Classifier
def svm_classifier(train_x, train_y):
from sklearn.svm import SVC
model = SVC(kernel='rbf', probability=True)
model.fit(train_x, train_y)
return model
###############################################################################
###############################################################################
# SVM Classifier using cross validation
def svm_cross_validation(train_x, train_y):
from sklearn.grid_search import GridSearchCV
from sklearn.svm import SVC
model = SVC(kernel='rbf', probability=True)
param_grid = {'C': [1e-3, 1e-2, 1e-1, 1, 10, 100, 1000], 'gamma': [0.001, 0.0001]}
grid_search = GridSearchCV(model, param_grid, n_jobs = 1, verbose=1)
grid_search.fit(train_x, train_y)
best_parameters = grid_search.best_estimator_.get_params()
for para, val in list(best_parameters.items()):
print(para, val)
model = SVC(kernel='rbf', C=best_parameters['C'], gamma=best_parameters['gamma'], probability=True)
model.fit(train_x, train_y)
return model
###############################################################################
def shuffle_data(inputfile):
csv_input = pd.read_csv(inputfile,error_bad_lines = False)
## print "csv_input.shape[0]",csv_input.shape[0]
randArray = np.random.rand(csv_input.shape[0])
## print "randArray.shape",randArray.shape
csv_input['rand'] = randArray
    csv_input.sort_values(by='rand', axis=0, inplace=True)  # sorting on the random column shuffles the rows; sorting by values is the safer approach
del csv_input['rand']
csv_input.to_csv(inputfile, index=False,index_label = False ,mode = 'wb+')
###############################################################################
def read_data(data_file,droplabel):
data = pd.read_csv(data_file)
train = data[:int(len(data)*0.7)]
test = data[int(len(data)*0.7):]
train_y = train.label
train_x = train.drop('label', axis=1)
    ## keep only two classes (binary classification): drop rows whose label equals droplabel
train_x , train_y = train_x[train_y != droplabel ],train_y[train_y != droplabel]
###############################################################################
    # Data IO and generation: load the iris data as a reference example
# import some data to play with
## iris = datasets.load_iris()
## X = iris.data
## y = iris.target
    ## X, y = X[y != 2], y[y != 2]  # drop label 2 so only two classes remain; boolean indexing keeps X and y aligned row by row, no manual CSV editing needed
## n_samples, n_features = X.shape
##
## # Add noisy features
## random_state = np.random.RandomState(0)
## X = np.c_[X, random_state.randn(n_samples, 200 * n_features)]
    ## sklearn's built-in train/test split (with an optional random_state) provides the same functionality as shuffle_data above
# Split the dataset in two equal parts
## X_train, X_test, y_train, y_test = train_test_split(
## X, y, test_size=0.5, random_state=0)
###############################################################################
    ## drop the non-feature columns
train_x = train_x.drop('SampleDate',axis=1)
train_x = train_x.drop('picktime',axis=1)
train_x = train_x.drop('classes',axis=1)
train_x = train_x.drop('temp_label',axis=1)
test_x = test.drop('label', axis=1)
test_y = test.label
    ## keep only two classes (binary classification)
test_x, test_y = test_x[test_y != droplabel],test_y[test_y != droplabel]
    ## drop the non-feature columns
test_x = test_x.drop('SampleDate', axis=1)
test_x = test_x.drop('picktime', axis=1)
test_x = test_x.drop('classes',axis=1)
test_x = test_x.drop('temp_label', axis=1)
## train_x = normalize(train_x, norm='l2')
## test_x = normalize(test_x, norm='l2')
return train_x, train_y, test_x, test_y
###############################################################################
###############################################################################
def read_data_selectfeature(data_file,dimention,droplabel):
data = pd.read_csv(data_file)
    # build the new dataset after feature selection
X = data.drop('label',axis = 1)
X = X.drop('SampleDate',axis = 1)
X = X.drop('picktime',axis = 1)
X = X.drop('classes',axis = 1)
X = X.drop('temp_label',axis = 1)
y = data.label
    ## keep only two classes (drop rows whose label equals droplabel) #########################
X,y = X[y != droplabel],y[y != droplabel]
## print "X :",X
X_new = SelectKBest(chi2, k=dimention).fit_transform(X, y)
## print "X_new ***********",X_new
    Data_new = np.c_[X_new, y]  ## note the column order of the combined array: the selected features come first and the label is appended as the last column, so it can be split off again below
## print "Data_new ***********",Data_new
## X_new['label'] = np.asarray(y.T)
#
train = Data_new[:int(len(Data_new)*0.7)]
test = Data_new[int(len(Data_new)*0.7):]
## print "train,test .shape" , train
train_y = train[:,-1:]
train_x = train[:,:-1]
test_y = test[:,-1:]
test_x = test[:,:-1]
## print "train_x,train_y shape" ,train_x.shape,train_y.shape
## train_x,train_y = train_x[train_y != droplabel],train_y[train_y != droplabel]
##
## test_x, test_y = test_x[test_y != droplabel ],test_y[test_y != droplabel]
## print "test_x after",test_x
    ## normalizing the data improves accuracy on some datasets but not on others
## train_x = normalize(train_x, norm='l2')
## test_x = normalize(test_x, norm='l2')
return train_x, train_y, test_x, test_y
###############################################################################
def read_Kf(data_file, droplabel):
    data = pd.read_csv(data_file)
    X = data.drop('label', axis=1)
    y = data.label
    kf = KFold(len(y), n_folds=3, shuffle=True)  ## kf yields index arrays, not the data itself
    for train_index, test_index in kf:
        print "train, test indices:", train_index, test_index
###############################################################################
# Clean the data: filter out abnormal sample points. The key is .iloc[], which addresses a row by
# position so the DataFrame can be manipulated like a 2-D array.
def DataClean(inputfile):
    csv_input = pd.read_csv(inputfile, error_bad_lines=False)
    # print csv_input[:2]
    for wave in csv_input.index:
        # print csv_input.iloc[wave]['Wavelength']  # value at row `wave`, column 'Wavelength'
        if csv_input.iloc[wave]['Wavelength'] > 500 and csv_input.iloc[wave]['Wavelength'] < 1050:
            continue
        else:
            csv_input.iloc[wave] = 0  # zero out the whole row if it falls outside the valid range
    # csv_input.to_csv(inputfile, index=False, index_label=False, mode='wb+')
    # to_csv's `columns` parameter selects which feature columns are written to the output file
    csv_input.to_csv(inputfile, index=False, columns=['Wavelength', 'average', 'label'], index_label=False, mode='wb+')
###############################################################################
###############################################################################
def PlotROC(classifier, test_y, predict, label, subfigue):
    global mean_tpr, mean_fpr
    # compute the ROC curve and its AUC
    fpr, tpr, thresholds = roc_curve(test_y, predict, pos_label=label)  ## pos_label specifies which label counts as the positive class
    mean_tpr += interp(mean_fpr, fpr, tpr)  # interpolate tpr onto the fixed mean_fpr grid with scipy's interp()
    mean_tpr[0] = 0.0  # the curve starts at 0
    roc_auc = auc(fpr, tpr)
    # plotting only needs plot(fpr, tpr); roc_auc just records the AUC value computed by auc()
    subfigue.plot(fpr, tpr, lw=1, label='ROC %s (area = %0.3f)' % (classifier, roc_auc))
###############################################################################
###############################################################################
def DispROC(subfigue):
    global cv, mean_tpr, mean_fpr
    # draw the diagonal (chance) line
    subfigue.plot([0, 1], [0, 1], '--', color=(0.6, 0.6, 0.6), label='Luck')
    mean_tpr /= len(cv)  # average the interpolated tpr values accumulated at the 100 mean_fpr points
    mean_tpr[-1] = 1.0  # force the last point to (1, 1)
    mean_auc = auc(mean_fpr, mean_tpr)  # average AUC
    # plot the mean ROC curve
    # print mean_fpr, len(mean_fpr)
    # print mean_tpr
    subfigue.plot(mean_fpr, mean_tpr, 'k--', label='Mean ROC (area = %0.2f)' % mean_auc, lw=2)
    subfigue.set_xlim([-0.05, 1.15])
    subfigue.set_ylim([-0.05, 1.15])
    subfigue.set_xlabel('False Positive Rate')
    subfigue.set_ylabel('True Positive Rate')
    subfigue.set_title('Receiver operating characteristic example')
    subfigue.legend(loc="lower right")
###############################################################################
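# The two callbacks below are wired to the Tkinter buttons created in draw_button_saveall() and
# draw_button_savesingle(): "save all" pickles the whole model_save dict to model_save_file, while
# "save singlemodel" pickles only the classifier whose short name (e.g. 'RF') is typed into the entry.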
def button_all_press():
    global model_save, model_save_file
    pickle.dump(model_save, open(model_save_file, 'wb'))
    print 'model is saved!'
def button_smodel_press():
    global model_save, model_save_file
    ## read the classifier name typed into the text entry
    save_modelname = inputEntry.get()
    if save_modelname not in model_save:
        print "please enter a valid classifier name"
        inputEntry.delete(0, END)
        inputEntry.insert(0, 'RF')
        return
    pickle.dump(model_save[save_modelname], open(model_save_file, 'wb'))
    print 'model is saved!'
def draw_button_saveall():
    global button, root  # must be global so the Button widget keeps a live reference
    ## point = plt.axes([0.3,0.03,0.1,0.03])
    button = Button(root, text="save all", command=button_all_press)
    button.grid(row=2, column=1, columnspan=3)
def draw_button_savesingle():
    global button, root  # must be global
    button = Button(root, text="save singlemodel", command=button_smodel_press)
    button.grid(row=2, column=2, columnspan=3)
def trainmodel(subfigue):
    print "all model train start"
    global data_file, dimention, droplabel, mean_tpr, mean_fpr, positive_label
    ## shuffle the data
    shuffle_data(data_file)
    ## apply feature selection before classification
    dimention = 1000
    train_x_s, train_y_s, test_x_s, test_y_s = read_data_selectfeature(data_file, dimention, droplabel)
    for classifier in test_classifiers:
        print('******************* %s ********************' % classifier)
        start_time = time.time()
        model = classifiers[classifier](train_x_s, train_y_s)
        print('training took %fs!' % (time.time() - start_time))
        predict = model.predict(test_x_s)
        if model_save_file != None:
            model_save[classifier] = model
        precision = metrics.precision_score(test_y_s, predict)
        recall = metrics.recall_score(test_y_s, predict)
        print('precision: %.2f%%, recall: %.2f%%' % (100 * precision, 100 * recall))
        accuracy = metrics.accuracy_score(test_y_s, predict)
        print('accuracy: %.2f%%' % (100 * accuracy))
        PlotROC(classifier, test_y_s, predict, positive_label, subfigue)
    print "all model train end"
def drawPic():
    drawPic.f.clf()
    drawPic.a = drawPic.f.add_subplot(111)
    trainmodel(drawPic.a)
    drawPic.a.set_title('Model Classifier')
    DispROC(drawPic.a)
    drawPic.canvas.show()  # canvas.show() on this old matplotlib; newer versions use canvas.draw()
if __name__ == '__main__':
## datafilename = 'outputhard10_25.csv'
datafilename = 'multi_apple_union.csv'
## data_file = "L:\\Python\\testdownscale\\"+datafilename
data_file = "L:\\Python\\creditcard\\" + datafilename
    ## shuffle the data
shuffle_data(data_file)
## read_Kf(data_file,'temp_label')
thresh = 0.5
model_save_file = "L:\\Python\\creditcard\\model\\savemodel.txt"
model_save = {}
test_classifiers = ['NB', 'KNN', 'LR', 'RF', 'DT', 'SVM'] #'SVMCV', , 'GBDT'
classifiers = {'NB':naive_bayes_classifier,
'KNN':knn_classifier,
'LR':logistic_regression_classifier,
'RF':random_forest_classifier,
'DT':decision_tree_classifier,
'SVM':svm_classifier,
## 'SVMCV':svm_cross_validation,
## 'GBDT':gradient_boosting_classifier
}
    ### set the positive-class label ##########################################################
positive_label = 2
    ### set the class label to drop (to reduce the problem to two classes) ####################
droplabel = 3
train_x, train_y, test_x, test_y = read_data(data_file,droplabel)
## print "train_y_s",train_y_s
mean_tpr = 0.0
mean_fpr = np.linspace(0, 1, 100)
all_tpr = []
y_target = np.r_[train_y,test_y]
cv = StratifiedKFold(y_target, n_folds=6)
    root = Tk()  # (the TkAgg backend is already selected at import time above)
    # put a drawing canvas on the Tk GUI and lay it out with .grid()
drawPic.f = Figure(figsize = [10,7],dpi=100)
drawPic.canvas = FigureCanvasTkAgg(drawPic.f, master=root)
drawPic.canvas.show()
drawPic.canvas.get_tk_widget().grid(row=0, columnspan=3)
    # add the label, text entry and buttons, set the entry's default value, and wire up the button callbacks
    Label(root, text=u"请输入分类器:").grid(row=1, column=0)  ## note the u prefix: Chinese label strings should be Unicode so they are not garbled
inputEntry=Entry(root)
inputEntry.grid(row=1,column=1)
inputEntry.insert(0,'RF')
drawPic()
draw_button_saveall()
draw_button_savesingle()
Button(root,text=u"重新训练",command=drawPic).grid(row=1,column=2,columnspan=3)
    # start the Tk event loop
root.mainloop()
    # print predicted labels for an informal comparison against the true labels
## import numpy as np
## model = classifiers['LR'](train_x, train_y)
## predict = model.predict(test_x)
## print "LR :"
## print "Predict:",test_x,predict.T
## if model_save_file != None:
## pickle.dump(model_save, open(model_save_file , 'wb'))