#!/usr/bin/env python
# coding: utf-8
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn import datasets
from sklearn import metrics
from sklearn.linear_model import LinearRegression,SGDRegressor,Ridge
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
# Location of the housing data file (whitespace-delimited, no header row).
fn = r'C:/Users/Administrator/Downloads/housing.data'
# Load the data. The file has no header line, so column names are supplied
# explicitly. The separator is a regular expression and is therefore written
# as a raw string: a plain '\s+' contains an invalid escape sequence and
# triggers a warning on modern Python versions.
df = pd.read_csv(
    fn,
    header=None,
    names=[
        'CRIM', 'ZN', 'INDUS', 'CHAS', 'NOX', 'RM', 'AGE', 'DIS', 'RAD',
        'TAX', 'PTRATIO', 'B', 'LSTAT', 'PRICE',
    ],
    sep=r'\s+',
)
# Separate predictors from the target: the last column (PRICE) is the label,
# every column before it is a feature.
label = df.iloc[:, -1]
features = df.iloc[:, :-1]
# Hold out 20% of the rows as a test set; the fixed random_state makes the
# split reproducible across runs.
x_train, x_test, y_train, y_test = train_test_split(
    features,
    label,
    random_state=100,
    test_size=0.2,
)
# Build a two-step pipeline: feature standardization (StandardScaler)
# followed by ordinary linear regression.
pipeline_steps = [
    ('sc', StandardScaler()),
    ('linearregression', LinearRegression()),
]
pipe = Pipeline(steps=pipeline_steps)
# Fit the scaler and the regressor on the training split
pipe.fit(x_train, y_train)
# Print the fitted coefficients and the intercept of the regression step
fitted_regressor = pipe.named_steps['linearregression']
print(fitted_regressor.coef_)
print(fitted_regressor.intercept_)
# Predict the target for the test split and for the training split
y_pred_test = pipe.predict(x_test)
y_pred_train = pipe.predict(x_train)
# Plot actual vs. predicted prices for the test split.
# Discard the original (shuffled) row labels so the true prices line up
# positionally with the prediction array, which is indexed 0..n-1.
df_test = y_test.to_frame().reset_index(drop=True)
df_test_pred = pd.DataFrame(y_pred_test, columns=['PRICE_pred'])
# Side-by-side frame with the columns PRICE and PRICE_pred
df_test_all = df_test.join(df_test_pred)
fig, ax = plt.subplots(figsize=(10, 6))
# One scatter series per column, plotted against the sample position
for column in df_test_all.columns:
    ax.scatter(df_test_all.index, df_test_all[column], label=column)
ax.legend()
plt.show()
# Report the R² score of the fitted pipeline on the held-out test set
test_r2 = pipe.score(x_test, y_test)
print(test_r2)
# Dataset:
# Link: https://pan.baidu.com/s/1_H_42vzoW7Ar0JFSX5INKg?pwd=11xb
# Extraction code: 11xb