20221007 课堂笔记-CSDN博客

本文链接：https://blog.csdn.net/m0_65523585/article/details/127312235

数据拆分的sklearn实现

训练集：用来训练模型
测试集：用来对学习方法进行最终评估

# Ignore warnings (do not display warning messages in the output)
import warnings
warnings.filterwarnings("ignore")

拆分为训练集和测试集

# Load the Boston house-price dataset through scikit-learn
# NOTE(review): load_boston was deprecated in scikit-learn 1.0 and removed in
# 1.2, so this cell presumably requires scikit-learn < 1.2 -- confirm the
# installed version before running.
from sklearn import datasets
boston = datasets.load_boston()
# Bare expression: shows the loaded object when run as a notebook cell
boston

{'data': array([[6.3200e-03, 1.8000e+01, 2.3100e+00, ..., 1.5300e+01, 3.9690e+02,
         4.9800e+00],
        [2.7310e-02, 0.0000e+00, 7.0700e+00, ..., 1.7800e+01, 3.9690e+02,
         9.1400e+00],
        [2.7290e-02, 0.0000e+00, 7.0700e+00, ..., 1.7800e+01, 3.9283e+02,
         4.0300e+00],
        ...,
        [6.0760e-02, 0.0000e+00, 1.1930e+01, ..., 2.1000e+01, 3.9690e+02,
         5.6400e+00],
        [1.0959e-01, 0.0000e+00, 1.1930e+01, ..., 2.1000e+01, 3.9345e+02,
         6.4800e+00],
        [4.7410e-02, 0.0000e+00, 1.1930e+01, ..., 2.1000e+01, 3.9690e+02,
         7.8800e+00]]),
 'target': array([24. , 21.6, 34.7, 33.4, 36.2, 28.7, 22.9, 27.1, 16.5, 18.9, 15. ,
        18.9, 21.7, 20.4, 18.2, 19.9, 23.1, 17.5, 20.2, 18.2, 13.6, 19.6,
        15.2, 14.5, 15.6, 13.9, 16.6, 14.8, 18.4, 21. , 12.7, 14.5, 13.2,
        13.1, 13.5, 18.9, 20. , 21. , 24.7, 30.8, 34.9, 26.6, 25.3, 24.7,
        21.2, 19.3, 20. , 16.6, 14.4, 19.4, 19.7, 20.5, 25. , 23.4, 18.9,
        35.4, 24.7, 31.6, 23.3, 19.6, 18.7, 16. , 22.2, 25. , 33. , 23.5,
        19.4, 22. , 17.4, 20.9, 24.2, 21.7, 22.8, 23.4, 24.1, 21.4, 20. ,
        20.8, 21.2, 20.3, 28. , 23.9, 24.8, 22.9, 23.9, 26.6, 22.5, 22.2,
        23.6, 28.7, 22.6, 22. , 22.9, 25. , 20.6, 28.4, 21.4, 38.7, 43.8,
        33.2, 27.5, 26.5, 18.6, 19.3, 20.1, 19.5, 19.5, 20.4, 19.8, 19.4,
        21.7, 22.8, 18.8, 18.7, 18.5, 18.3, 21.2, 19.2, 20.4, 19.3, 22. ,
        20.3, 20.5, 17.3, 18.8, 21.4, 15.7, 16.2, 18. , 14.3, 19.2, 19.6,
        23. , 18.4, 15.6, 18.1, 17.4, 17.1, 13.3, 17.8, 14. , 14.4, 13.4,
        15.6, 11.8, 13.8, 15.6, 14.6, 17.8, 15.4, 21.5, 19.6, 15.3, 19.4,
        17. , 15.6, 13.1, 41.3, 24.3, 23.3, 27. , 50. , 50. , 50. , 22.7,
        25. , 50. , 23.8, 23.8, 22.3, 17.4, 19.1, 23.1, 23.6, 22.6, 29.4,
        23.2, 24.6, 29.9, 37.2, 39.8, 36.2, 37.9, 32.5, 26.4, 29.6, 50. ,
        32. , 29.8, 34.9, 37. , 30.5, 36.4, 31.1, 29.1, 50. , 33.3, 30.3,
        34.6, 34.9, 32.9, 24.1, 42.3, 48.5, 50. , 22.6, 24.4, 22.5, 24.4,
        20. , 21.7, 19.3, 22.4, 28.1, 23.7, 25. , 23.3, 28.7, 21.5, 23. ,
        26.7, 21.7, 27.5, 30.1, 44.8, 50. , 37.6, 31.6, 46.7, 31.5, 24.3,
        31.7, 41.7, 48.3, 29. , 24. , 25.1, 31.5, 23.7, 23.3, 22. , 20.1,
        22.2, 23.7, 17.6, 18.5, 24.3, 20.5, 24.5, 26.2, 24.4, 24.8, 29.6,
        42.8, 21.9, 20.9, 44. , 50. , 36. , 30.1, 33.8, 43.1, 48.8, 31. ,
        36.5, 22.8, 30.7, 50. , 43.5, 20.7, 21.1, 25.2, 24.4, 35.2, 32.4,
        32. , 33.2, 33.1, 29.1, 35.1, 45.4, 35.4, 46. , 50. , 32.2, 22. ,
        20.1, 23.2, 22.3, 24.8, 28.5, 37.3, 27.9, 23.9, 21.7, 28.6, 27.1,
        20.3, 22.5, 29. , 24.8, 22. , 26.4, 33.1, 36.1, 28.4, 33.4, 28.2,
        22.8, 20.3, 16.1, 22.1, 19.4, 21.6, 23.8, 16.2, 17.8, 19.8, 23.1,
        21. , 23.8, 23.1, 20.4, 18.5, 25. , 24.6, 23. , 22.2, 19.3, 22.6,
        19.8, 17.1, 19.4, 22.2, 20.7, 21.1, 19.5, 18.5, 20.6, 19. , 18.7,
        32.7, 16.5, 23.9, 31.2, 17.5, 17.2, 23.1, 24.5, 26.6, 22.9, 24.1,
        18.6, 30.1, 18.2, 20.6, 17.8, 21.7, 22.7, 22.6, 25. , 19.9, 20.8,
        16.8, 21.9, 27.5, 21.9, 23.1, 50. , 50. , 50. , 50. , 50. , 13.8,
        13.8, 15. , 13.9, 13.3, 13.1, 10.2, 10.4, 10.9, 11.3, 12.3,  8.8,
         7.2, 10.5,  7.4, 10.2, 11.5, 15.1, 23.2,  9.7, 13.8, 12.7, 13.1,
        12.5,  8.5,  5. ,  6.3,  5.6,  7.2, 12.1,  8.3,  8.5,  5. , 11.9,
        27.9, 17.2, 27.5, 15. , 17.2, 17.9, 16.3,  7. ,  7.2,  7.5, 10.4,
         8.8,  8.4, 16.7, 14.2, 20.8, 13.4, 11.7,  8.3, 10.2, 10.9, 11. ,
         9.5, 14.5, 14.1, 16.1, 14.3, 11.7, 13.4,  9.6,  8.7,  8.4, 12.8,
        10.5, 17.1, 18.4, 15.4, 10.8, 11.8, 14.9, 12.6, 14.1, 13. , 13.4,
        15.2, 16.1, 17.8, 14.9, 14.1, 12.7, 13.5, 14.9, 20. , 16.4, 17.7,
        19.5, 20.2, 21.4, 19.9, 19. , 19.1, 19.1, 20.1, 19.9, 19.6, 23.2,
        29.8, 13.8, 13.3, 16.7, 12. , 14.6, 21.4, 23. , 23.7, 25. , 21.8,
        20.6, 21.2, 19.1, 20.6, 15.2,  7. ,  8.1, 13.6, 20.1, 21.8, 24.5,
        23.1, 19.7, 18.3, 21.2, 17.5, 16.8, 22.4, 20.6, 23.9, 22. , 11.9]),
 'feature_names': array(['CRIM', 'ZN', 'INDUS', 'CHAS', 'NOX', 'RM', 'AGE', 'DIS', 'RAD',
        'TAX', 'PTRATIO', 'B', 'LSTAT'], dtype='<U7'),
 'DESCR': ".. _boston_dataset:\n\nBoston house prices dataset\n---------------------------\n\n**Data Set Characteristics:**  \n\n    :Number of Instances: 506 \n\n    :Number of Attributes: 13 numeric/categorical predictive. Median Value (attribute 14) is usually the target.\n\n    :Attribute Information (in order):\n        - CRIM     per capita crime rate by town\n        - ZN       proportion of residential land zoned for lots over 25,000 sq.ft.\n        - INDUS    proportion of non-retail business acres per town\n        - CHAS     Charles River dummy variable (= 1 if tract bounds river; 0 otherwise)\n        - NOX      nitric oxides concentration (parts per 10 million)\n        - RM       average number of rooms per dwelling\n        - AGE      proportion of owner-occupied units built prior to 1940\n        - DIS      weighted distances to five Boston employment centres\n        - RAD      index of accessibility to radial highways\n        - TAX      full-value property-tax rate per $10,000\n        - PTRATIO  pupil-teacher ratio by town\n        - B        1000(Bk - 0.63)^2 where Bk is the proportion of blacks by town\n        - LSTAT    % lower status of the population\n        - MEDV     Median value of owner-occupied homes in $1000's\n\n    :Missing Attribute Values: None\n\n    :Creator: Harrison, D. and Rubinfeld, D.L.\n\nThis is a copy of UCI ML housing dataset.\nhttps://archive.ics.uci.edu/ml/machine-learning-databases/housing/\n\n\nThis dataset was taken from the StatLib library which is maintained at Carnegie Mellon University.\n\nThe Boston house-price data of Harrison, D. and Rubinfeld, D.L. 'Hedonic\nprices and the demand for clean air', J. Environ. Economics & Management,\nvol.5, 81-102, 1978.   Used in Belsley, Kuh & Welsch, 'Regression diagnostics\n...', Wiley, 1980.   N.B. 
Various transformations are used in the table on\npages 244-261 of the latter.\n\nThe Boston house-price data has been used in many machine learning papers that address regression\nproblems.   \n     \n.. topic:: References\n\n   - Belsley, Kuh & Welsch, 'Regression diagnostics: Identifying Influential Data and Sources of Collinearity', Wiley, 1980. 244-261.\n   - Quinlan,R. (1993). Combining Instance-Based and Model-Based Learning. In Proceedings on the Tenth International Conference of Machine Learning, 236-243, University of Massachusetts, Amherst. Morgan Kaufmann.\n",
 'filename': 'D:\\Anaconda3\\lib\\site-packages\\sklearn\\datasets\\data\\boston_house_prices.csv'}

#拆分为训练集与测试集

#拆分为训练集与测试集 sklearn.model_selection.train_test_split(

*arrays : 等长度的需要拆分的数据对象
格式可以是lists, numpy arrays, scipy稀疏矩阵或者pandas数据框。显然，对于有监督类模型，x和y需要按相同标准同时进行拆分
test_size = 0.25 : float, int, None, 用于验证模型的样本比例，范围在0到1；为None时所有样本都将用于训练
train_size = None : float, int, or None, 用于训练模型的样本比例，范围在0到1；为None时自动基于test_size计算
random_state = None : 随机种子(随便设)
shuffle = True : 是否在拆分前对样本做随机排列
stratify = None : array-like or None, 是否按指定类别标签对数据做分层拆分
)返回: 对输入对象进行拆分后的list, length = 2 * len(arrays)

# Import the train/test splitting helper
from sklearn.model_selection import train_test_split
# Split features and targets together: 30% of the samples go to the test set,
# and random_state fixes the seed used for the split.
x_train, x_test, y_train, y_test = train_test_split(boston.data, 
                                           boston.target,
                                           test_size= 0.3, 
                                           random_state=349
                                          )
# Sizes of the four resulting pieces
len(x_train),len(x_test),len(y_train),len(y_test)

(354, 152, 354, 152)

s折交叉验证：首先随机地将已给数据切分为s个互不相交的大小相同的子集，然后利用s-1个子集的数据训练模型，利用余下的子集测试模型；将这一过程对可能的s种选择重复进行。最后选择s次评测中平均测试误差最小的模型

留一交叉验证：s=n。往往在数据缺乏的情况下使用。

Sklearn交叉验证常用命令
sklearn.model_selection包括cross_val_score （将拆分与评价合并执行）
cross_validate （同时使用多个评价指标）
cross_val_predict （使用交叉验证后的模型进行预测）

将拆分与评价合并执行

#sklearn交叉验证常用命令
将拆分与评估合并执行

sklearn.model_selection.cross_val_score(

estimator : 用于拟合数据的估计器对象名称
X : array-like, 用于拟合模型的数据阵
y = None : array-like, 有监督模型使用的因变量
groups = None : array-like, 形如(n_samples,), 样本拆分时使用的分组标签
cv = None : int, 设定交叉验证时的样本拆分策略
None, 使用默认的3组拆分
integer, 设定具体的拆分组数
object / iterable 用于设定拆分
n_jobs = 1, verbose = 0, fit_params = None
pre_dispatch = '2*n_jobs'

)返回:每轮模型对应评分的数组

# Import cross_val_score
from sklearn.model_selection import cross_val_score 
# Import the linear regression estimator
from sklearn.linear_model import LinearRegression
reg = LinearRegression()
# 10-fold cross-validation on the data in its original (unshuffled) order.
# No scoring argument is passed, so the estimator's default score is used.
scores = cross_val_score(reg, boston.data, boston.target,cv = 10)
scores

array([ 0.73376082,  0.4730725 , -1.00631454,  0.64113984,  0.54766046,
        0.73640292,  0.37828386, -0.12922703, -0.76843243,  0.4189435 ])

# Mean and standard deviation of the per-fold scores
# (std() is the standard deviation, not the variance)
scores.mean(), scores.std()

(0.20252899006055367, 0.5952960169512383)

# Same 10-fold cross-validation, scored with explained variance instead of
# the estimator's default score
scores = cross_val_score(reg, boston.data, boston.target, scoring = 'explained_variance', cv = 10)
scores

array([ 0.74784412,  0.5381936 , -0.80757662,  0.66844779,  0.5586898 ,
        0.74128804,  0.41981565, -0.11666214, -0.44561819,  0.42197365])

# Mean explained-variance score across the 10 folds
scores.mean()

0.27263956979413695

保证案例顺序的随机性

样本中案例顺序如果非随机，将会对模型验证带来严重的影响。

KFold等函数有一个内置的参数shuffle，可以要求在拆分数据前将数据索引随机排序(但该参数默认为False)
cross_val_score等函数无此参数，因此必要时应当先对数据进行随机排序。

# Randomly reorder the samples with numpy so that the fold assignment no
# longer follows the original row order.
import numpy as np
X, y = boston.data, boston.target
# A random permutation of the row indices 0..n-1, applied to X and y alike
# so that each feature row stays paired with its target.
indices = np.random.permutation(y.shape[0])
X, y = X[indices], y[indices]

# After the random reordering: repeat the 10-fold cross-validation on the
# shuffled X and y
reg = LinearRegression()
scores = cross_val_score(reg, X, y, cv = 10)
scores

array([0.68525792, 0.86167017, 0.43360162, 0.77029655, 0.82813619,
       0.67159777, 0.43783055, 0.68626901, 0.66072544, 0.84392753])

# Mean and standard deviation of the fold scores on the shuffled data
scores.mean(), scores.std()

(0.6879312757654554, 0.14464172569122236)

同时使用多个评价指标

cross_validate函数使用的参数基本和cross_val_score相同，但是功能上有以下扩展:

可以指定多个指标对模型进行评估。
除测试集得分之外，还会返回一个包含训练得分、拟合时间、评分时间的字典。
sklearn.model_selection.cross_validate(

estimator : 用于拟合数据的估计器对象名称
X : array-like, 用于拟合模型的数据阵
y = None : array-like, 有监督模型使用的因变量
groups = None : array-like, 形如(n_samples,), 样本拆分时使用的分组标签
scoring = None : string, callable, list/tuple, dict or None
模型评分的计算方法，多评估指标时使用list/dict等方式提供
cv = None : int, 设定交叉验证时的样本拆分策略
None, 使用默认的3组拆分
integer, 设定具体的拆分组数
object / iterable 用于设定拆分
n_jobs = 1, verbose = 0, fit_params = None
pre_dispatch = '2*n_jobs'
return_train_score = True : boolean, 是否返回训练集评分
)返回:每轮模型对应评分的字典，shape = (n_splits,)

from sklearn.model_selection import cross_validate
# Evaluate two metrics in a single cross-validation run
scoring = ['r2','explained_variance']
# return_train_score = False: the result holds timings and test-fold scores
# only, with no training scores
scores = cross_validate(reg,X, y,cv = 10, scoring = scoring,return_train_score = False)
scores

{'fit_time': array([0.00144815, 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.00400066, 0.        , 0.00400066]),
 'score_time': array([0.        , 0.        , 0.00400949, 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ]),
 'test_r2': array([0.68525792, 0.86167017, 0.43360162, 0.77029655, 0.82813619,
        0.67159777, 0.43783055, 0.68626901, 0.66072544, 0.84392753]),
 'test_explained_variance': array([0.68627925, 0.86996345, 0.43698251, 0.77496606, 0.83089913,
        0.67990516, 0.47111897, 0.7236839 , 0.66077957, 0.8462367 ])}

# Mean R^2 over the test folds
scores['test_r2'].mean()

0.6879312757654554

from sklearn.model_selection import cross_val_predict
# Cross-validated predictions for the samples, using 10 folds
pred = cross_val_predict(reg, X, y, cv = 10)
# First ten predictions
pred[:10]

array([16.64007022, 34.46615482, 19.90505228, 36.18559111, 26.33976946,
       33.22900938, 16.98796546, 37.06187884, 23.11126718, 25.48051526])

from sklearn.metrics import r2_score
# R^2 of the cross-validated predictions against the true targets
r2_score(y, pred)

0.7125529294284265

sklearn实现生成决策树

sklearn实现决策树

class sklearn.tree.DecisionTreeClassifier(

criterion = 'gini' : 衡量节点拆分质量的指标，{'gini', 'entropy'}
splitter = 'best' : 节点拆分时的策略
'best'代表最佳拆分，'random'为最佳随机拆分
max_depth = None : 树生长的最大深度(高度)
min_samples_split = 2 : 节点允许进一步分枝时的最低样本数
min_samples_leaf = 1 : 叶节点的最低样本量
min_weight_fraction_leaf = 0.0 : 有权重时叶节点的最低权重分值
max_features = 'auto' : int/float/string/None, 搜索分支时考虑的特征数
'auto'/'sqrt', max_features = sqrt(n_features)
'log2', max_features = log2(n_features)
None, max_features = n_features
random_state = None
max_leaf_nodes = None : 最高叶节点数量
min_impurity_decrease = 0.0 : 分枝时需要的最低信息量下降量
class_weight = None, presort = False
)

DecisionTreeClassifier类的属性:

classes_ : array of shape = [n_classes] or a list of such arrays
feature_importances_ : array of shape = [n_features]，特征重要性评价，总和为1，也被称为gini重要性
max_features_ : int
n_classes_ : int or list
n_features_ : int
n_outputs_ : int
tree_ : Tree object
注意:树模型也可以用于数值变量预测，对应的方法为sklearn.tree.DecisionTreeRegressor

# Import the decision tree classifier
from sklearn.tree import DecisionTreeClassifier
# Import the iris dataset loader
from sklearn.datasets import load_iris
iris = load_iris()
# Instantiate the decision tree with default settings
ct = DecisionTreeClassifier()
# Train the model on the full iris data
ct.fit(iris.data,iris.target)

DecisionTreeClassifier()

# Inspect the fitted max_features_ attribute
ct.max_features_

# Feature importances of the fitted tree, one value per input feature
ct.feature_importances_

array([0.01333333, 0.        , 0.06405596, 0.92261071])

# Predict class labels for the same data the tree was trained on
ct.predict(iris.data)

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2])

from sklearn.metrics import classification_report
# The report compares the true labels with predictions made on the training
# data itself, so it reflects fit on that data, not held-out performance.
print(classification_report(iris.target,ct.predict(iris.data)))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00        50
           1       1.00      1.00      1.00        50
           2       1.00      1.00      1.00        50

    accuracy                           1.00       150
   macro avg       1.00      1.00      1.00       150
weighted avg       1.00      1.00      1.00       150

sklearn.tree模块中提供了将模型导出为graphviz格式文件的功能，从而可以对模型做图形观察。

从 http://www.graphviz.org 下载graphviz的安装包(可选择msi格式)

安装pydot并进行所需配置，就可以在python环境中直接调用graphviz。

sklearn.tree.export_graphviz(

decision_tree, out_file = "tree.dot"
max_depth = None, feature_names = None, class_names = None
label = 'all' : {'all', 'root', 'none'}, 是否显示杂质测量指标
filled = False : 是否对节点填色加强表示
leaves_parallel = False : 是否在树底部绘制所有叶节点
impurity = True, node_ids = False
proportion = False : 是否给出节点样本占比而不是样本量
rotate = False : 是否从左至右绘图
rounded = False : 是否绘制圆角框而不是直角长方框
special_characters = False : 是否忽略PS兼容的特殊字符
precision = 3
)

from sklearn.tree import export_graphviz
# Export the fitted tree to the Graphviz file tree.dot, labelled with the
# iris feature names and class names
export_graphviz(ct, out_file = 'tree.dot',
               feature_names= iris.feature_names,
               class_names=iris.target_names)

# Second export of the same tree, this time with rounded, colour-filled nodes
# NOTE(review): 'tree.dot1' looks like a typo for a name such as 'tree1.dot'
# -- confirm the intended output filename.
from sklearn.tree import export_graphviz
export_graphviz(ct, out_file = 'tree.dot1',
               feature_names= iris.feature_names,
               class_names=iris.target_names,
               rounded = True,
               filled = True)