Scikit-learn之线性模型（一）

最新推荐文章于 2022-11-10 22:38:33 发布

Handsome coder

最新推荐文章于 2022-11-10 22:38:33 发布

阅读量258

点赞数

分类专栏：机器学习文章标签：机器学习 python

本文链接：https://blog.csdn.net/liaozhaocong/article/details/116400025

版权

机器学习专栏收录该内容

23 篇文章 2 订阅

订阅专栏

一、Fitting a line through data

# Load the Boston housing dataset that ships with scikit-learn.
# NOTE(review): load_boston was deprecated in scikit-learn 1.0 and removed in 1.2 —
# confirm the installed version still provides it before running this cell.
from sklearn import datasets
boston = datasets.load_boston()

from sklearn.linear_model import LinearRegression
lr = LinearRegression()

# Fit on every row, then predict on those same rows (in-sample predictions).
lr.fit(boston.data, boston.target)
predictions = lr.predict(boston.data)

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# IPython/Jupyter magic (not valid in a plain .py script): draw figures inline.
%matplotlib inline    

# Histogram (50 bins) of the residuals: observed target minus in-sample prediction.
pd.Series(boston.target - predictions).hist(bins=50)

二、Fitting a line through data with Machine Learning

# Load the Boston housing dataset that ships with scikit-learn.
# NOTE(review): load_boston was deprecated in scikit-learn 1.0 and removed in 1.2 —
# confirm the installed version still provides it before running this cell.
from sklearn import datasets
boston = datasets.load_boston()

from sklearn.linear_model import LinearRegression
lr = LinearRegression()

lr.fit(boston.data, boston.target)

from sklearn.model_selection import cross_val_predict

# Predictions produced by cross-validation with cv=10 instead of in-sample predictions.
predictions_cv = cross_val_predict(lr, boston.data, boston.target, cv=10)
# First five cross-validated predictions (only displayed when this line ends a notebook cell).
predictions_cv[:5]

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# IPython/Jupyter magic (not valid in a plain .py script): draw figures inline.
%matplotlib inline 
 
# Histogram (50 bins) of the residuals: observed target minus cross-validated prediction.
pd.Series(boston.target - predictions_cv).hist(bins=50)

三、Evaluating the linear regression model

# Load the Boston housing dataset that ships with scikit-learn.
# NOTE(review): load_boston was deprecated in scikit-learn 1.0 and removed in 1.2 —
# confirm the installed version still provides it before running this cell.
from sklearn import datasets
boston = datasets.load_boston()

from sklearn.linear_model import LinearRegression
lr = LinearRegression()

lr.fit(boston.data, boston.target)

from sklearn.model_selection import cross_val_predict

predictions_cv = cross_val_predict(lr, boston.data, boston.target, cv=10)  # 10-fold

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# IPython/Jupyter magic (not valid in a plain .py script): draw figures inline.
%matplotlib inline

from scipy.stats import probplot
f = plt.figure(figsize=(7, 5))
ax = f.add_subplot(111)  # 111 = 1x1 grid, first subplot; likewise 221 is the first subplot of a 2x2 grid
# Probability plot of the cross-validated residuals, drawn onto ``ax``.
tuple_out = probplot(boston.target - predictions_cv, plot=ax)  # probability plot
# Second element of probplot's return value; per the SciPy docs this is the
# least-squares fit (slope, intercept, r) — confirm against the installed SciPy version.
tuple_out[1]

def MSE(target, predictions):
    """Return the mean squared error (L2 loss) between targets and predictions.

    Parameters
    ----------
    target : array-like
        Observed values.
    predictions : array-like
        Predicted values; must broadcast against ``target``.

    Returns
    -------
    float
        Mean of the element-wise squared differences.
    """
    # Coerce up front so plain lists and tuples work, not only ndarrays
    # (list - list would otherwise raise TypeError).
    residuals = np.asarray(target) - np.asarray(predictions)
    return np.mean(np.square(residuals))

# Mean squared error of the cross-validated predictions (displayed when it ends a notebook cell).
MSE(boston.target, predictions_cv)

def MAD(target, predictions):
    """Return the mean absolute deviation (L1 loss) between targets and predictions.

    Parameters
    ----------
    target : array-like
        Observed values.
    predictions : array-like
        Predicted values; must broadcast against ``target``.

    Returns
    -------
    float
        Mean of the element-wise absolute differences.
    """
    # Coerce up front so plain lists and tuples work, not only ndarrays
    # (list - list would otherwise raise TypeError).
    residuals = np.asarray(target) - np.asarray(predictions)
    return np.mean(np.abs(residuals))

# Mean absolute deviation of the cross-validated predictions (displayed when it ends a notebook cell).
MAD(boston.target, predictions_cv)

在这里插入图片描述

from sklearn.metrics import mean_absolute_error, mean_squared_error

# Cross-check the hand-written metrics against scikit-learn's own implementations:
# compute both scores first, then report them.
mae_score = mean_absolute_error(boston.target, predictions_cv)
mse_score = mean_squared_error(boston.target, predictions_cv)
print('MAE: ', mae_score)
print('MSE: ', mse_score)

# Bootstrapping (also called the "self-help" method) estimates a population
# quantity by repeatedly resampling the observed sample.
n_bootstraps = 1000
len_boston = len(boston.target)
# np.int was deprecated in NumPy 1.20 and removed in 1.24; integer floor division
# yields the same half-size subsample without relying on the removed alias.
subsample_size = len_boston // 2


def subsample():
    """Draw ``subsample_size`` row indices at random from ``range(len_boston)``.

    Uses ``np.random.choice`` with its default settings, so indices may repeat.
    """
    return np.random.choice(np.arange(0, len_boston), size=subsample_size)


coefs = np.ones(n_bootstraps)  # pre-allocate space for the coefficients
for i in range(n_bootstraps):
    subsample_idx = subsample()
    subsample_X = boston.data[subsample_idx]
    subsample_y = boston.target[subsample_idx]
    # Refit on the resampled rows and keep only the first feature's coefficient.
    lr.fit(subsample_X, subsample_y)
    coefs[i] = lr.coef_[0]

在这里插入图片描述

import matplotlib.pyplot as plt
# Plot the distribution of the values stored in ``coefs`` as a 50-bin histogram.
f = plt.figure(figsize=(7, 5))
ax = f.add_subplot(111)
ax.hist(coefs, bins=50)
ax.set_title("Histogram of the lr.coef_[0].")
# 2.5th and 97.5th percentiles of ``coefs``, i.e. the bounds enclosing the central 95% of the values
# (displayed when this line ends a notebook cell).
np.percentile(coefs, [2.5, 97.5])

在这里插入图片描述

四、例子

#本地有个文件C:/Users/lzc/Desktop/income.data_/income.data.csv，下载地址添加链接描述
如图
在这里插入图片描述

import pandas as pd
# Alternative: read only the first 5 rows of a header-less file; with header=None pandas generates the column labels itself.
# df = pd.read_csv('C:/Users/lzc/Desktop/test1.csv',header=None,nrows=5)
df = pd.read_csv('C:/Users/lzc/Desktop/income.data_/income.data.csv')  
# NOTE: the path above is machine-specific; point it at wherever the downloaded income.data.csv lives.
# print(df)
#df.head()

# (leftover from an iris example, unused here) df['label'] = iris.target
import numpy as np
# Whole table as a NumPy array, printed for inspection only.
data = np.array(df)
print(data) 
# Feature: the column at position 1.
# NOTE(review): presumably the income column — confirm against the CSV header.
X1 = df.iloc[:, 1].values
# X1 = df.iloc[:, :-1].values
# Append an axis so the single feature becomes a 2-D column of shape (n_samples, 1).
X = X1[:,np.newaxis]
# print(X1.shape)
print(X.shape)
# #x = df.iloc[:, :-1].values
# #The first colon selects every row; ":-1" selects every column except the last; ".values" extracts the underlying values.

# Target: the column at position 2.
y = df.iloc[:, 2].values
print('X=',X)  # print the feature values
print('y=',y)  # print the target values

在这里插入图片描述

from sklearn import datasets
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression
lr = LinearRegression()

# Fit a linear model on the single-feature X and target y built above.
lr.fit(X, y)
print(lr.coef_)
print(lr.intercept_)
# Scatter the raw data, then overlay the fitted line in red.
plt.scatter(X,y)
# The line is rebuilt by hand from the fitted coefficient and intercept; because X is a
# single column this broadcasts to an array with the same shape as X.
y_pred = lr.coef_*X+lr.intercept_
plt.plot(X,y_pred,'r')

在这里插入图片描述

采样知识

import numpy as np

# Every draw below picks three values from the integers 0..4.
population = 5
n_draws = 3
weights = [0.1,0.1,0.1,0.3,0.4]

# Equal probabilities, values may repeat.
print(np.random.choice(population, n_draws))

# Equal probabilities, values may not repeat.
print(np.random.choice(population, n_draws, replace=False))

# When sampling without replacement the requested size must not exceed the
# population size, so np.random.choice(5, 6, replace=False) cannot run.

# Given probabilities, values may repeat.
print(np.random.choice(population, n_draws, p=weights))

# Given probabilities, values may not repeat.
print(np.random.choice(population, n_draws, p=weights, replace=False))

参考文献
1、python杂谈
2、scikit cookbook

Handsome coder

关注

0
点赞
踩
0

收藏

觉得还不错? 一键收藏
0
评论
Scikit-learn之线性模型（一）

一、Fitting a line through datafrom sklearn import datasetsboston = datasets.load_boston()from sklearn.linear_model import LinearRegressionlr = LinearRegression()lr.fit(boston.data, boston.target)predictions = lr.predict(boston.data)import numpy as
复制链接

扫一扫