写在前面:本文章为清华大学出版社出版的数据挖掘课本实验第六章回归分析部分,如有为了方便做实验的同学需要可自取
【例6-1】分析预测房子的大小(平方英尺)和房价(美元)之间的对应关系。
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
plt.rcParams['font.sans-serif']=['SimHei']
plt.rcParams['axes.unicode_minus']=False# 用来正常显示负号
plt.rcParams['font.size']=13
y=[6450, 7450, 8450, 9450, 11450, 15450, 18450]
x=[150,200, 250,300, 350, 400, 600]
plt.scatter(x,y)
plt.xlabel('面积(平方英尺)')
plt.ylabel('售价(美元)')
plt.show()
对鸢尾花数据集中的'petal-length'和'petal-width'两列数据进行回归分析。
(1)导入包和数据
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.datasets import load_iris
from sklearn.linear_model import LinearRegression
%matplotlib inline
iris = load_iris() #导入数据集iris
data=pd.DataFrame(iris.data)
data.columns=['sepal-length', 'sepal-width', 'petal-length', 'petal-width']
data.head() #显示前5行
(2)对数据集中的'petal-length'和'petal-width'两列数据进行回归分析
from sklearn.model_selection import train_test_split
# Simple (one-feature) linear regression with scikit-learn:
# predict petal width from petal length.
x = data['petal-length'].values.reshape(-1, 1)  # feature as a column vector
y = data['petal-width'].values.reshape(-1, 1)   # target as a column vector
# Hold out 20% of the samples for testing.
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.2, random_state=666)
clf = LinearRegression()
clf.fit(x_train, y_train)
pre = clf.predict(x_test)
# Scatter the test points and overlay the fitted line.
plt.scatter(x_test, y_test, s=50)
plt.plot(x_test, pre, 'r-', linewidth=2)
plt.xlabel('petal-length', fontsize=15)
plt.ylabel('petal-width', fontsize=15)
# Draw a green segment from each test point down to its prediction
# (the residual).
for xi, yi, pi in zip(x_test, y_test, pre):
    plt.plot([xi, xi], [yi, pi], 'g-')
plt.show()
(3)显示回归线的参数
# Show the fitted line's parameters and standard error metrics.
print(u"系数", clf.coef_ )
print(u"截距", clf.intercept_ )
from sklearn.metrics import mean_squared_error as s_mean_squared_error
from sklearn.metrics import mean_absolute_error as s_mean_absolute_error
from sklearn.metrics import r2_score as s_r2_score
# BUG FIX: the original printed the mean squared error under the
# label 'MAE' and the mean absolute error under 'MSE' (swapped).
print('MSE:', s_mean_squared_error(y_test, pre))
print('MAE:', s_mean_absolute_error(y_test, pre))
print('R^2:', s_r2_score(y_test, pre))
(4)对花瓣长度为3.9的花,预测其花瓣宽度。
print(clf.predict([[3.9]]) )
多元线性回归模型的Python实现
(1)导入数据集
# Load the Boston housing data set into a DataFrame.
# NOTE(review): load_boston was deprecated in scikit-learn 1.0 and
# removed in 1.2 — this only runs on older scikit-learn versions.
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn import datasets
d=datasets.load_boston()
data=pd.DataFrame(d.data)
data['price']=d.target  # append the target (house price) as a column
data.sample(5)  # show 5 random rows
(2)多元线性回归建模
from sklearn.linear_model import LinearRegression
# Multiple linear regression on the Boston housing data: all 13
# features as inputs, house price as the target.
simple2 = LinearRegression()
from sklearn.model_selection import train_test_split
# BUG FIX: the original never defined x and y in this section, so the
# split reused the iris petal arrays left over from the previous
# example. Define them here from the Boston data.
x = d.data
y = d.target
x_train, x_test, y_train, y_test = train_test_split(x, y, random_state=666)
simple2.fit(x_train, y_train)
print('多元线性回归模型系数:\n', simple2.coef_)
print('多元线性回归模型常数项:', simple2.intercept_)
y_predict = simple2.predict(x_test)
(3)模型分析
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score
# Evaluate the fitted model: MSE, then R^2 computed two equivalent ways.
mse = mean_squared_error(y_test, y_predict)
print('预测值的均方误差:',
      mse)
print(r2_score(y_test, y_predict))    # R^2 from the metrics module
print(simple2.score(x_test, y_test))  # R^2 via the model's own score()
print('各特征间的系数矩阵:\n', simple2.coef_)
# Rank the features by their (signed) regression coefficients.
order = np.argsort(simple2.coef_)
print('影响房价的特征排序:\n', order)
print('影响房价的特征排序:\n',
      d.feature_names[order])
逻辑回归
Logistic回归Python实现
(1)导入相关包,打开数据
from sklearn.datasets import load_iris
# Load the iris features and class labels (loaded once, unlike the
# original's two separate load_iris() calls — same data either way).
iris_bunch = load_iris()
X = iris_bunch.data
y = iris_bunch.target
print('前8条数据:\n', X[:8])
print('前8条数据对应的类型:', y[:8])
(2)划分训练集和测试集并进行归一化
from sklearn.model_selection import train_test_split
# Reserve a quarter of the samples for testing, then standardize the
# features — the scaler is fitted on the training split only.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=0)
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)  # reuse the training statistics
print(X_train[:5])
(3)训练逻辑回归模型并对测试集进行预测
from sklearn.linear_model import LogisticRegression
# Train a logistic-regression classifier and predict the test labels.
classifier = LogisticRegression(random_state = 0)
classifier.fit(X_train, y_train)
y_pred = classifier.predict(X_test)
# score() reports mean accuracy on the held-out data.
accuracy = classifier.score(X_test, y_test)
print('Accuracy of LR Classifier:%.3f' % accuracy)
多项式回归Python实现
(1)准备数据
import numpy as np
import matplotlib.pyplot as plt
# Build a noisy quadratic data set: 100 x values drawn uniformly from
# [-3, 3]; y = 0.5*x^2 + x + 2 plus standard-normal noise.
x = np.random.uniform(-3, 3, size=100)
X = x.reshape(-1, 1)  # column-vector form: 100 rows, 1 feature
y = 0.5 * x ** 2 + x + 2 + np.random.normal(0, 1, size=100)
plt.scatter(x, y)
plt.show()
(2)线性回归
from sklearn.linear_model import LinearRegression
# Fit a plain (degree-1) linear model to the quadratic data — it will
# visibly underfit the curve.
lin_reg = LinearRegression().fit(X, y)
y_predict = lin_reg.predict(X)
# Font settings so the Chinese title and minus signs display correctly.
plt.rcParams['font.family'] = ['SimHei']
plt.rcParams['axes.unicode_minus'] = False
plt.title('线性回归')
plt.scatter(x, y)
plt.plot(x, y_predict, color='r')
plt.show()
(3)多项式回归
from sklearn.preprocessing import PolynomialFeatures
# Expand the single feature with polynomial terms up to degree 2,
# then fit an ordinary linear model on the expanded features.
quad = PolynomialFeatures(degree=2)
X_quad = quad.fit_transform(X)  # fit + transform in one call
from sklearn.linear_model import LinearRegression
# From here on the code is identical to plain linear regression.
model_quad = LinearRegression()
model_quad.fit(X_quad, y)
pred_quad = model_quad.predict(X_quad)
plt.scatter(x, y)
# Sort by x so the fitted curve is drawn left to right.
order = np.argsort(x)
plt.plot(x[order], pred_quad[order], color='r')
plt.title('多项式回归')
from sklearn.preprocessing import PolynomialFeatures
# Repeat the fit with a much higher degree (12) to show how a large
# degree starts to overfit the noisy samples.
poly = PolynomialFeatures(degree=12)
x2 = poly.fit_transform(X)
from sklearn.linear_model import LinearRegression
# From here on the code is identical to plain linear regression.
lin_reg2 = LinearRegression()
lin_reg2.fit(x2, y)
y_predict2 = lin_reg2.predict(x2)
plt.scatter(x, y)
idx = np.argsort(x)
plt.plot(x[idx], y_predict2[idx], color='r')
plt.title('多项式回归')
岭回归的Python实现
# Ridge regression demo on a small synthetic (x, y) data set.
import numpy as np
import matplotlib.pyplot as plt
from sklearn.linear_model import Ridge,RidgeCV
# Ridge: plain ridge regression; RidgeCV: ridge with built-in
# (generalized) cross-validation over candidate alpha values.
data=[
    [0.07,3.12],[0.41,3.82],[0.99,4.55],[0.73,4.25],[0.98,4.56],
    [0.55,3.92],[0.34,3.53],[0.03,3.15],[0.13,3.11],[0.13,3.15],
    [0.31,3.47],[0.65,4.12],[0.73,4.28],[0.23,3.48],[0.96,4.65],
    [0.62,3.95],[0.36,3.51],[0.15,3.12],[0.63,4.09],[0.23,3.46],
    [0.08,3.22],[0.06,3.19],[0.92,4.63],[0.71,4.29],[0.01,3.08],
    [0.34,3.45],[0.04,3.16],[0.21,3.36],[0.61,3.99],[0.54,3.89] ]
# Build the X matrix (single feature column) and y vector.
dataMat = np.array(data)
X = dataMat[:,0:1]  # feature x
y = dataMat[:,1]    # target y
# BUG FIX: the original created Ridge(alpha=0.5) and immediately
# overwrote it with RidgeCV, leaving the Ridge model as dead code.
# RidgeCV picks the best alpha from the candidates via cross-validation.
model = RidgeCV(alphas=[0.1, 1.0, 10.0])
model.fit(X, y)  # fit the ridge model
print('系数矩阵:',model.coef_)
print('线性回归模型:\n',model)
# alpha_ is available because the model is a RidgeCV instance.
print('交叉验证最佳alpha值',model.alpha_)
# Predict on the training inputs for plotting.
predicted = model.predict(X)
# Scatter the raw points and overlay the fitted line.
plt.scatter(X, y, marker='o')
plt.plot(X, predicted,c='r')
plt.xlabel('x')
plt.ylabel('y')
plt.show()
Lasso回归Python实现
import numpy as np
import matplotlib.pyplot as plt
from sklearn.metrics import r2_score
# Lasso regression on synthetic sparse data.
np.random.seed(42)
n_samples, n_features = 50, 100
# Standard-normal design matrix (randn draws from N(0, 1)).
X = np.random.randn(n_samples, n_features)
# True coefficient vector: one coefficient per feature, then all but
# 10 randomly chosen entries are zeroed out to make the signal sparse.
coef = 3 * np.random.randn(n_features)
inds = np.arange(n_features)
np.random.shuffle(inds)
coef[inds[10:]] = 0
# Targets, plus zero-mean Gaussian noise with standard deviation 0.01.
y = np.dot(X, coef)
y += 0.01 * np.random.normal(size=n_samples)
# First half of the rows for training, second half for testing.
n_samples = X.shape[0]
half = n_samples // 2
X_train, y_train = X[:half], y[:half]
X_test, y_test = X[half:], y[half:]
# Fit a Lasso model and score it on the held-out half.
from sklearn.linear_model import Lasso
alpha = 0.1
lasso = Lasso(alpha=alpha)
y_pred_lasso = lasso.fit(X_train, y_train).predict(X_test)
r2_score_lasso = r2_score(y_test, y_pred_lasso)
print("r^2 on test data : %f" % r2_score_lasso)
# Plot the recovered (sparse) coefficient vector.
plt.plot(lasso.coef_, color='gold', linewidth=2,label='Lasso coefficients')
plt.title("Lasso R^2: %f" % r2_score_lasso)
plt.show()