【数据科学与机器学习导论】【笔记】2.4 训练数据、过拟合(再训练)和交叉验证

课程链接:https://stepik.org/course/4852/syllabus  语言:俄语

文中使用数据:链接: https://pan.baidu.com/s/1PYqtZWdxr8k-_lTGvhgqKQ 提取码: j6im


一、数据训练

from sklearn import tree
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
from IPython.display import display
from IPython.display import SVG
from sklearn.model_selection import train_test_split
from graphviz import Source
from sklearn.tree import export_graphviz    # 导入的是一个函数
from IPython.display import HTML
# CSS rule that forces every <svg> element to 20% width and height.
style = "<style>svg{width:20% !important;height:20% !important;}</style>"
HTML(style)

# Load the training set and the test set.
# index_col must be set here; otherwise the data is read incorrectly and the
# final plot comes out wrong.
train_csv = "E:/Training/Python/Trying/data/train_iris.csv"
data_train_iris = pd.read_csv(train_csv, index_col=0)
data_train_iris.head(10)

test_csv = "E:/Training/Python/Trying/data/test_iris.csv"
data_test_iris = pd.read_csv(test_csv, index_col=0)
data_test_iris.head()

# Rename the feature columns: replace the space in each name with an underscore.
# The same mapping is applied to both the training and the test frame.
column_renames = {
    "sepal length": "sepal_length",
    "sepal width": "sepal_width",
    "petal length": "petal_length",
    "petal width": "petal_width",
}
data_train_iris = data_train_iris.rename(columns=column_renames)
data_train_iris.head(10)
data_test_iris = data_test_iris.rename(columns=column_renames)
data_test_iris.head(10)

# Split each frame into the feature matrix X (everything except "species")
# and the target vector y (the "species" column).
X_train = data_train_iris.drop(columns="species")
y_train = data_train_iris["species"]
X_test = data_test_iris.drop(columns="species")
y_test = data_test_iris["species"]

# Sweep the tree depth and record train/test accuracy for each depth.
#
# np.random.seed() returns None, so its result must not be used as
# random_state (that would mean "unseeded"). Seed the global generator
# and keep an explicit integer seed for scikit-learn.
np.random.seed(0)
rs = 0

# Depths 1 through 99 (the end of range() is exclusive).
max_depth_values = range(1, 100)

# Collect one record per depth and build the frame once at the end:
# DataFrame.append was removed in pandas 2.0, and appending inside a loop
# copies the whole frame on every iteration.
score_rows = []
for max_depth in max_depth_values:
    clf = tree.DecisionTreeClassifier(
        criterion='entropy',
        max_depth=max_depth,
        random_state=rs,
    )
    clf.fit(X_train, y_train)
    score_rows.append({
        "max_depth": max_depth,
        "train_score": clf.score(X_train, y_train),  # accuracy on seen data
        "test_score": clf.score(X_test, y_test),     # accuracy on unseen data
    })

# Explicit columns keep the frame well-formed even if no depth was evaluated.
scores_data = pd.DataFrame(
    score_rows,
    columns=["max_depth", "train_score", "test_score"],
)

# Reshape to long format for plotting: the two score columns become rows
# labelled by "set_type", with the value in "score", so that a single
# lineplot call can draw one line per set type via `hue`.
scores_data_long = scores_data.melt(
    id_vars=["max_depth"],
    value_vars=["train_score", "test_score"],
    var_name="set_type",
    value_name="score",
)

sns.lineplot(data=scores_data_long, x="max_depth", y="score", hue="set_type")

 关于用到的pd.melt函数,可查看【Python专题】pandas.melt函数

二、预测

本程序根据物种未知的动物的各项特征,预测该动物是猫还是狗,即预测其物种。

from sklearn import tree
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
from IPython.display import display
from IPython.display import SVG
from sklearn.model_selection import train_test_split
from graphviz import Source
from sklearn.tree import export_graphviz    # 导入的是一个函数
from IPython.display import HTML
# CSS rule that forces every <svg> element to 20% width and height.
style = "<style>svg{width:20% !important;height:20% !important;}</style>"
HTML(style)

# Load the labelled data used for training.
data_ori = pd.read_csv("E:/Training/Python/Trying/data/dogs_n_cats.csv")
# Replace the spaces in this column name with underscores.
data_ori = data_ori.rename(columns={"Лазает по деревьям": "Лазает_по_деревьям"})
data_ori.head(10)

# The "Вид" column is the target; every other column is a feature.
target_column = "Вид"
X_ori = data_ori.drop(columns=[target_column])
y_ori = data_ori[target_column]

# np.random.seed() returns None, so its result must not be used as
# random_state (that would mean "unseeded"). Seed the global generator
# and keep an explicit integer seed for scikit-learn.
np.random.seed(0)
rs = 0

# NOTE(review): max_depth was never defined in this script; it only worked
# when a value happened to be left in the notebook namespace. 99 is the
# value the depth loop in section one leaves behind -- tune as needed.
max_depth = 99

# Hold out 33% of the labelled data to check the trained model.
X_train, X_test, y_train, y_test = train_test_split(
    X_ori,
    y_ori,
    test_size=0.33,
    random_state=rs,
)

clf = tree.DecisionTreeClassifier(
    criterion='entropy',
    max_depth=max_depth,
    random_state=rs,
)
clf.fit(X_train, y_train)

# Predict on the held-out rows, then count the 'собачка' labels among the
# predictions and among the true labels so the two counts can be compared.
result = clf.predict(X_test)
predicted_labels = pd.Series(result)
predicted_labels[predicted_labels == 'собачка'].count()

true_labels = pd.Series(y_test)
true_labels[true_labels == 'собачка'].count()

# Load the unlabelled data that needs a prediction.
data_pre = pd.read_json("E:/Training/Python/Trying/data/dataset_209691_15.txt")
# Apply the same column rename that was applied to the training data.
data_pre = data_pre.rename(columns={"Лазает по деревьям": "Лазает_по_деревьям"})
data_pre.head(10)
# NOTE(review): presumably the JSON has the same feature columns, in the same
# order, as the training data -- confirm before trusting the prediction.
X_pre = data_pre

# Predict the species and count how many rows were classified as 'собачка'.
result = clf.predict(X_pre)
is_dog = result == 'собачка'
pd.Series(result)[is_dog].count()

 

评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值