sklearn 初探 2

最新推荐文章于 2021-12-20 18:48:00 发布

__susie__

最新推荐文章于 2021-12-20 18:48:00 发布

阅读量274

点赞数

分类专栏： sklearn 文章标签： sklearn basic

本文链接：https://blog.csdn.net/weixin_38070397/article/details/82713004

版权

sklearn 专栏收录该内容

1 篇文章 0 订阅

订阅专栏

# !/usr/bin/python
# -*- coding: utf-8 -*-

# # =================================================
from sklearn import datasets
from sklearn import linear_model
from sklearn.model_selection import cross_val_predict

import matplotlib.pyplot as plt

# Plain linear regression, evaluated below with 10-fold cross-validation.
lr = linear_model.LinearRegression()

# ``datasets.load_boston`` was removed in scikit-learn 1.2, so the bundled
# diabetes regression dataset is used instead (it needs no download and is
# available in old and new scikit-learn releases alike).
diabetes = datasets.load_diabetes()
y = diabetes.target

# cross_val_predict returns an array of the same size as `y` where each entry
# is a prediction obtained by cross validation:
predicted = cross_val_predict(lr, diabetes.data, y, cv=10)

# Measured vs. predicted targets; the dashed diagonal spans the range of `y`
# and marks where a perfect prediction would fall.
plt.scatter(y, predicted, color='red',
            edgecolors='black', label='Scatter', lw=2)
plt.plot([y.min(), y.max()], [y.min(), y.max()],
         color='green', label='Linear', ls='--', lw=2)
plt.xlabel('Measured')
plt.ylabel('Predicted')
# The label= arguments above are only drawn once a legend is requested.
plt.legend()
plt.show()

# # ==========================================================

import numpy as np
import matplotlib.pyplot as plt
# decomposition 分解 pipeline 管道，GridSerachCV
from sklearn import linear_model, decomposition, datasets
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV

# Two-step pipeline: PCA (principal component analysis) followed by
# logistic regression.
pca = decomposition.PCA()
logistic = linear_model.LogisticRegression()
pipe = Pipeline(steps=[('pca', pca), ('logistic', logistic)])

digits = datasets.load_digits()
X_digits, y_digits = digits.data, digits.target

# Plot the PCA spectrum
pca.fit(X_digits)
plt.plot(pca.explained_variance_, lw=2)
plt.axis('tight')
plt.xlabel('n_components')
plt.ylabel('explained_variance_')

# Prediction: candidate values explored by the grid search.
n_components = [20, 40, 64]
Cs = np.logspace(-4, 4, 3)

# Parameters of pipeline steps are addressed as '<step name>__<parameter>'.
param_grid = {
    'pca__n_components': n_components,
    'logistic__C': Cs,
}
estimator = GridSearchCV(pipe, param_grid)
estimator.fit(X_digits, y_digits)

# Mark the n_components value of the best estimator found by the search.
best_pca = estimator.best_estimator_.named_steps['pca']
plt.axvline(best_pca.n_components, ls='--', label='n_components chosen')
plt.legend(prop=dict(size=12))
plt.show()

# ====================================================================

from sklearn import datasets
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from itertools import product
from sklearn.ensemble import VotingClassifier
import matplotlib.pyplot as plt

iris = datasets.load_iris()
# Keep only the feature columns at positions 0 and 2.
x = iris.data[:, [0, 2]]
y = iris.target

# Three base classifiers plus a soft-voting ensemble built from them; the
# ensemble weights the decision tree twice as much as the other two.
clf1 = DecisionTreeClassifier(max_depth=4)
clf2 = KNeighborsClassifier(n_neighbors=7)
clf3 = SVC(kernel='rbf', probability=True)
named_estimators = [('dt', clf1), ('knn', clf2), ('svc', clf3)]
eclf = VotingClassifier(
    estimators=named_estimators, voting='soft', weights=[2, 1, 1])

# Fit every model on the same data, in order, and rebind each name to the
# value returned by its fit() call.
clf1, clf2, clf3, eclf = [
    model.fit(x, y) for model in (clf1, clf2, clf3, eclf)]

# ===============================================================================

from sklearn.preprocessing import MultiLabelBinarizer # 数据预处理二值化

# One inner list of label ids per sample.
y = [[2, 3, 4], [2], [0, 1, 3], [0, 1, 2, 3, 4], [0, 1, 2]]

# Fit the binarizer on the label lists and transform them in one step.
label_binarizer = MultiLabelBinarizer()
y1 = label_binarizer.fit_transform(y)
print(y1)
# ========================================================================
# 数据分离
import numpy as np
from sklearn.model_selection import train_test_split
# Toy data: a 5x2 feature matrix and one integer target per row.
X = np.arange(10).reshape((5, 2))
# list(range(...)) replaces a comprehension that merely copied the range
# while reusing the target name `y` as its loop variable.
y = list(range(5))
print(X, '=====', y, '=====')

# Split with test_size=0.33; the fixed random_state makes the split
# reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.33, random_state=42)
print(X_train, '\n++++++++++', X_test, '=====')
print(y_train, '\n++++++++++', y_test)

# =============================================================================
# 数据预处理
from sklearn import preprocessing
import numpy as np

# 创建特征数组每一行表示一个样本每一列表示一个特征
# Feature matrix: each row is one sample, each column is one feature.
x = np.array([[1., -1., 2.],
              [2., 0., 0.],
              [0., 1., -1.]])

# Standardize every column (feature) to zero mean and unit variance; note
# that standardization is applied per feature.
# When the data contains many outliers, mean/variance scaling is unreliable;
# robust_scale / RobustScaler centre and scale using the median and
# quartiles instead, e.g.:
#   x_scale = preprocessing.RobustScaler().fit_transform(x)
x_scale = preprocessing.scale(x)
print(x_scale)

# REPL-style checks: the results are discarded when this runs as a script.
x_scale.mean(axis=0)  # axis=0: per-column mean, expected to be 0
x_scale.std(axis=0)  # per-column standard deviation, expected to be 1

# Alternatively, call fit() to build a reusable scaler from training data:
# StandardScaler(copy=True, with_mean=True, with_std=True)
# with_mean / with_std are booleans that default to True; set either one to
# False to skip mean-centring or scaling to unit variance respectively.
scaler = preprocessing.StandardScaler().fit(x)
scaler.transform(x)
# A new sample arriving later can be given the same transformation.
new_x = [[-1., 1., 0.]]
scaler.transform(new_x)

# MinMaxScaler: each feature's minimum becomes 0 and its maximum becomes 1.
min_max_scaler = preprocessing.MinMaxScaler()
x_minmax = min_max_scaler.fit_transform(x)

# MaxAbsScaler: every feature is divided by its maximum absolute value, so
# the data ends up in [-1, 1]. Meaningful for data that is already centred
# at zero or is sparse.
max_abs_scaler = preprocessing.MaxAbsScaler()
x_train_maxabs = max_abs_scaler.fit_transform(x)
print(x_train_maxabs)

# Normalization rescales each sample (row) to unit norm; it is often used in
# classification and clustering. The function is ``preprocessing.normalize``
# -- there is no ``preprocessing.Normalization``.
x_normal = preprocessing.normalize(x, norm='l1')
print(x_normal)

# The Normalizer utility class offers the same operation through transform(),
# so new data can be normalized the same way:
# Normalizer(copy=True, norm='l2')
# fit() must be called on an instance, not on the class itself.
x_normalizer = preprocessing.Normalizer().fit(x)
x_normalizer.transform(x)
# Normalize new test data.
x_normalizer.transform([[-1., 1., 0.]])

# Binarization converts numeric features into boolean-like 0/1 values.
# By default the threshold is 0: values greater than 0 become 1, values less
# than or equal to 0 become 0. As above, fit() needs an instance.
x_banirizer = preprocessing.Binarizer().fit(x)
x_banirizer.transform(x)

# A custom threshold can be supplied through the ``threshold`` argument.
binarizer = preprocessing.Binarizer(threshold=1).fit(x)
x_threshold = binarizer.transform(x)
print(x_threshold)

# Missing values encoded as NaN cannot be used in computations; they can be
# filled in with the mean, median, most frequent value, and so on.
# ``preprocessing.Imputer`` was removed from scikit-learn; its replacement
# ``SimpleImputer`` takes ``np.nan`` (not the string 'NaN') as the missing
# marker and has no ``axis`` argument because it always imputes per column.
from sklearn.impute import SimpleImputer

model = SimpleImputer(missing_values=np.nan, strategy='mean')
model.fit(x)
model.transform(x)

# =============================================================================
# 决策树的案例分析决策树分类就是要求回答一系列的“Yes/No”,这样逐步划分出所有字段的分类
# 数据清理：错误的数据导致错误的模型
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier

# Load the cleaned iris data: columns col1..col4 are used as the inputs and
# col0 as the class to predict.
iris_data_clean = pd.read_csv('iris-data-clean.csv')
all_inputs = iris_data_clean[['col1', 'col2', 'col3', 'col4']].values
all_class = iris_data_clean['col0'].values

# Repeat the train/test evaluation many times and collect the test scores.
model_accuracies = []
for i in range(1000):
    # Seed each split with the repetition index. A constant seed would
    # reproduce the very same 75/25 split on all 1000 iterations, which
    # defeats the purpose of repeating; this stays reproducible but varies.
    train_input, test_input, train_class, test_class = train_test_split(
        all_inputs, all_class, train_size=0.75, random_state=i)

    decision_tree_classifier = DecisionTreeClassifier()
    decision_tree_classifier.fit(train_input, train_class)
    classifier_accuracy = decision_tree_classifier.score(
        test_input, test_class)
    # NOTE(review): each score is stored as a one-element list, as in the
    # original; flatten if a consumer expects plain floats.
    model_accuracies.append([classifier_accuracy])

# ==================================================================================

# 导入模块
import sklearn
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler
import numpy as np
import matplotlib as mpl
import matplotlib.pyplot as plt
import pandas as pd
from pandas import DataFrame
import time

# 二、线性回归——家庭用电预测

# 1. 时间与功率之间的关系

# 导入数据
# Load the semicolon-separated household power consumption sample.
path = "datas/household_power_consumption_1000.txt"
data = pd.read_csv(path, sep=";")

# Show the header and the first rows (head() returns 5 rows by default).
print(data.head())

# iloc slices rows/columns by integer position only. Take every row of the
# first two columns, which hold the raw time information.
xdata = data.iloc[:, :2]

# Target values (power): the column at position 2, i.e. the third column.
# The original note gives data["Global_active_power"] as an equivalent.
y = data.iloc[:, 2]

def time_format(x):
    """Split a (date, time) pair of strings into numeric components.

    Args:
        x: Iterable of strings, e.g. a row holding ``"16/12/2006"`` and
            ``"17:24:00"``. The items are joined with a single space and the
            result must match ``"%d/%m/%Y %H:%M:%S"``.

    Returns:
        Tuple ``(year, month, day, hour, minute, second)`` of ints.

    Raises:
        ValueError: If the joined text does not match the expected format.
    """
    # Merge the columns into one "date time" string, then parse it into a
    # struct_time (day/month/year hour:minute:second).
    t = time.strptime(" ".join(x), "%d/%m/%Y %H:%M:%S")
    # Return the individual fields packed into a tuple.
    return (t.tm_year, t.tm_mon, t.tm_mday, t.tm_hour, t.tm_min, t.tm_sec)

# Apply time_format to every row of xdata, expanding each (date, time) pair
# into six numeric columns.
x = xdata.apply(lambda row: pd.Series(time_format(row)), axis=1)
print("======处理后的时间格式=======")
print(x.head())

# Split into training and test sets; random_state is the seed used by the
# random number generator.
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.3, random_state=1)

# Standardize both splits: fit() computes the mean and variance from the
# training split only, transform() then applies them to each split.
ss = StandardScaler()
ss.fit(x_train)
x_train = ss.transform(x_train)
x_test = ss.transform(x_test)

# Build and train the linear model, then print its coefficient of
# determination on the training data (the closer to 1 the better).
lr = LinearRegression()
lr.fit(x_train, y_train)
train_score = lr.score(x_train, y_train)
print("准确率:", train_score)

# Predict the test split with the trained model.
y_predict = lr.predict(x_test)

# Model quality: mean squared error and its square root.
residuals = y_predict - np.array(y_test)
mse = np.average(residuals ** 2)
rmse = np.sqrt(mse)
print("均方误差平方和：", mse)
print("均方误差平方和的平方根：", rmse)

# Saving and persisting the models.
# ``sklearn.externals.joblib`` was removed in scikit-learn 0.23. Prefer the
# standalone ``joblib`` package (a scikit-learn dependency) and fall back to
# the vendored copy only on old installations.
try:
    import joblib
except ImportError:
    from sklearn.externals import joblib

joblib.dump(ss, "data_ss.model")  # save the fitted standardization model
joblib.dump(lr, "data_lr.model")  # save the trained linear model

# Load both objects back from the files just written. The results are bound
# to names so the restored objects can be used (previously discarded).
loaded_ss = joblib.load("data_ss.model")
loaded_lr = joblib.load("data_lr.model")

# Plot the predicted values against the actual values.

# Work around Chinese text rendering: select the SimHei font for sans-serif
# text and disable the Unicode minus sign.
mpl.rcParams["font.sans-serif"] = [u"SimHei"]
mpl.rcParams["axes.unicode_minus"] = False

t = np.arange(len(x_test))
plt.figure(facecolor="w")  # new figure; facecolor "w" is a white background
plt.plot(t, y_test, "r-", lw=2, label="真实值")
plt.plot(t, y_predict, "g-", lw=2, label="预测值")
plt.legend(loc="upper right")  # show the legend and set its position
plt.title("线性回归预测时间和功率之间的关系", fontsize=20)
# Pass the flag positionally: the ``b`` keyword was renamed to ``visible`` in
# Matplotlib 3.5 and later removed, whereas the positional form works with
# both old and new releases.
plt.grid(True)
plt.savefig("线性回归预测时间和功率之间的关系.png")  # save the figure