1、基本架构
# Loadlibraries
import pandas as pd
from pandas.plotting import scatter_matrix
import matplotlib.pyplot as plt
from sklearn import model_selection
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
import warnings
warnings.filterwarnings('ignore')
# Load dataset (classic iris data: four numeric features plus a class label).
names = ['sepal-length', 'sepal-width', 'petal-length', 'petal-width', 'class']
dataset = pd.read_csv("iris.data", names=names)
# shape
print(dataset.shape)
# head
print(dataset.head(20))
# descriptions
print(dataset.describe())
# class distribution
print(dataset.groupby('class').size())
# box and whisker plots
dataset.plot(kind='box', subplots=True, layout=(2, 2), sharex=False, sharey=False)
plt.show()
# histograms
dataset.hist()
plt.show()
# scatter plot matrix
scatter_matrix(dataset)
plt.show()
# Split-out validation dataset
array = dataset.values
print(array)
# BUG FIX: the original used array[:, :3], which keeps only the first three
# feature columns; the iris data has four features in columns 0..3 (the
# original inline comment even said it should be array[:, 0:4]).
X = array[:, 0:4]  # feature columns 0..3
print(X)
Y = array[:, 4]  # class label (column index 4)
print(Y)
validation_size = 0.20  # fraction of samples held out for validation
seed = 7
X_train, X_validation, Y_train, Y_validation = model_selection.train_test_split(
    X, Y, test_size=validation_size, random_state=seed)
# training set: X_train, Y_train
# validation set: X_validation, Y_validation
# Test options and evaluation metric
seed = 7
scoring = 'accuracy'
# Spot-check algorithms
models = []
models.append(('LR', LogisticRegression()))
models.append(('LDA', LinearDiscriminantAnalysis()))
models.append(('KNN', KNeighborsClassifier()))
models.append(('CART', DecisionTreeClassifier()))
models.append(('NB', GaussianNB()))
models.append(('SVM', SVC()))
# evaluate each model in turn with 10-fold cross-validation
results = []
names = []
for name, model in models:
    # BUG FIX: passing random_state without shuffle=True is an error in
    # modern scikit-learn (KFold ignores/rejects the seed when not shuffling).
    kfold = model_selection.KFold(n_splits=10, shuffle=True, random_state=seed)
    cv_results = model_selection.cross_val_score(
        model, X_train, Y_train, cv=kfold, scoring=scoring)
    results.append(cv_results)
    names.append(name)
    msg = "%s: %f (%f)" % (name, cv_results.mean(), cv_results.std())
    print(msg)
# Compare algorithms with side-by-side box plots of the CV scores
fig = plt.figure()
fig.suptitle('Algorithm Comparison')
ax = fig.add_subplot(111)
plt.boxplot(results)
ax.set_xticklabels(names)
plt.show()
# Make predictions on validation dataset with KNN
knn = KNeighborsClassifier()
knn.fit(X_train, Y_train)
predictions = knn.predict(X_validation)
print(accuracy_score(Y_validation, predictions))
print(confusion_matrix(Y_validation, predictions))
print(classification_report(Y_validation, predictions))
2、绘制混淆矩阵(基于决策树算法)
# Feature columns used as model inputs and the click label.
# NOTE: "new_user_class_level " keeps its trailing space to match the CSV
# header exactly — do not "fix" it.
arrays = [
    "price",
    "age_level",
    "pid",
    "final_gender_code",
    "cate_id",
    "brand",
    "campaign_id",
    "new_user_class_level ",
    "shopping_level",
    "pvalue_level",
    "occupation",
]
x = file_3.loc[:, arrays]
y = file_3.loc[:, "clk"]
def cm_plot(original_label, predict_label, pic=None):
    """Print a classification report and draw the confusion matrix.

    Parameters
    ----------
    original_label : array-like of true labels.
    predict_label : array-like of predicted labels.
    pic : optional; when given, the figure is also saved as "<pic>.jpg".
    """
    ans = metrics.classification_report(original_label, predict_label, digits=5)
    print(ans)
    # Confusion matrix from true and predicted labels:
    # rows = true labels, columns = predicted labels.
    cm = confusion_matrix(original_label, predict_label)
    plt.figure()
    plt.matshow(cm, cmap=plt.cm.Blues)  # draw the matrix with the Blues colormap
    plt.colorbar()  # color scale bar
    for i in range(len(cm)):
        for j in range(len(cm)):
            # BUG FIX: annotate() takes xy as (x, y) = (column, row), so the
            # count cm[i, j] belongs at xy=(j, i).  The original placed
            # cm[x, y] at xy=(x, y), transposing every off-diagonal count.
            # (Loop variables also renamed i/j so they no longer shadow the
            # module-level x and y.)
            plt.annotate(cm[i, j], xy=(j, i),
                         horizontalalignment='center',
                         verticalalignment='center')
    # BUG FIX: with sklearn's confusion_matrix the y axis (rows) is the true
    # label and the x axis (columns) the prediction; the original labels were
    # swapped.
    plt.ylabel('True label')
    plt.xlabel('Predicted label')
    plt.title('confusion matrix')
    if pic is not None:
        plt.savefig(str(pic) + '.jpg')
    plt.show()
# Train a decision tree and evaluate it on a stratified 80/20 hold-out split.
clf = tree.DecisionTreeClassifier(splitter='random', max_depth=100, min_samples_leaf=12)
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, stratify=y)
clf = clf.fit(x_train, y_train)
score = clf.score(x_train, y_train)   # accuracy on the training split
score1 = clf.score(x_test, y_test)    # accuracy on the hold-out split
y_true = y_test
y_pred = clf.predict(x_test)
cm_plot(y_true, y_pred)
c = confusion_matrix(y_true, y_pred)
ans = metrics.classification_report(y_true, y_pred, digits=5)
# BUG FIX: the trailing Chinese notes on the next three lines were bare text
# fused onto the print statements (a SyntaxError); they are now comments.
print(ans)  # print the classification report
print(c)    # print the confusion matrix in text form
print(clf.classes_)  # print the class labels the tree learned
3、绘制roc曲线(基于决策树算法)
# Train a fresh decision tree and plot its ROC curve on a hold-out split.
clf = tree.DecisionTreeClassifier(splitter='random', max_depth=100, min_samples_leaf=12)
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, stratify=y)
clf = clf.fit(x_train, y_train)
y_true = y_test
# Probability of the positive class (column 1) for every test sample.
y_score = clf.predict_proba(x_test)[:, 1]
def rocs():
    """Compute FPR/TPR/AUC from module-level y_test/y_score and plot the ROC curve."""
    # BUG FIX: the original printed y_pred — a stale name left over from the
    # previous section (NameError when this section runs on its own) — and
    # called roc_curve twice; compute it once and reuse the result.
    fpr, tpr, thresholds = roc_curve(y_test, y_score)
    print((fpr, tpr, thresholds))
    print(fpr)
    print(tpr)
    print("#" * 10)
    print("auc")
    roc_auc = auc(fpr, tpr)
    print(roc_auc)
    print("#" * 10)
    plt.figure()
    lw = 2
    # BUG FIX: give the curve a label so the plt.legend() call below has an
    # entry to show (a legend on unlabeled artists displays nothing).
    plt.plot(fpr, tpr, color='darkorange', lw=lw, linestyle='--',
             label='ROC curve (AUC = %0.3f)' % roc_auc)
    plt.plot([0, 1], [0, 1], color='navy', lw=lw, linestyle='--')  # chance line
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('The roc curve')
    plt.legend(loc="lower right")
    plt.show()
rocs()
4、绘制决策树
import graphviz
import pandas as pd
from sklearn import metrics, tree
from sklearn.metrics import confusion_matrix, roc_curve, auc
from sklearn.model_selection import train_test_split

pd.set_option('display.max_rows', None)     # show all rows
pd.set_option('display.max_columns', None)  # show all columns
# BUG FIX: the bare 'max_colwidth' alias is deprecated and removed in modern
# pandas; the fully qualified option name must be used.
pd.set_option('display.max_colwidth', 50)   # max default column width
file_3 = pd.read_csv("improve_1.csv")
# Feature columns ("new_user_class_level " keeps its trailing space to match
# the CSV header exactly) and the click label.
arrays = ["price", "age_level", "pid", "final_gender_code", "cate_id", "brand",
          "campaign_id", "new_user_class_level ", "shopping_level",
          "pvalue_level", "occupation"]
x = file_3[arrays]
y = file_3["clk"]
# Fit a tree on a small training slice (99.2% held out) so the rendered tree
# stays readable, then export it via graphviz.
clf = tree.DecisionTreeClassifier(splitter='random', max_depth=100, min_samples_leaf=12)
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.992, stratify=y)
clf = clf.fit(x_train, y_train)
dot_data = tree.export_graphviz(clf, class_names=["noclk", "clk"], feature_names=arrays)
graph = graphviz.Source(dot_data)
graph.render("abcd")  # writes the dot source "abcd" plus the rendered PDF
中文网站:
http://www.scikitlearn.com.cn/