机器学习——线性回归
一元线性回归
- 加载数据;
- 数据切分;
- 模型训练;
- 预测;
- 结果分析;
# -*- coding: utf-8 -*-
import pandas as pd
import seaborn as sns
import numpy as np
from sklearn import linear_model,metrics
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
# Read the data file; it has no header row, so the two columns are named here.
df = pd.read_csv('ex1data1.txt', names=['population', 'profit'])

# Optional quick look at the raw scatter (left disabled):
# sns.lmplot('population', 'profit', df, height=6, fit_reg=False)
# plt.show()

# First column as a 2-D feature matrix, second column as the 1-D target.
X = df.iloc[:, :1].values
Y = df.iloc[:, 1].values

# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable.
X_train, X_test, Y_train, Y_test = train_test_split(
    X, Y, test_size=0.2, random_state=0
)

# Fit ordinary least squares on the training rows only.
linreg = linear_model.LinearRegression()
linreg.fit(X_train, Y_train)

# Predict on the held-out test rows.
y_pred = linreg.predict(X_test)

print("截距", linreg.intercept_)
print("系数:", linreg.coef_)

# Figure number 12 holds two stacked panels.
fig = plt.figure(12)

# Top panel: every raw point plus the fitted line evaluated at each point.
ax = fig.add_subplot(2, 1, 1)
population = df['population']
fitted_profit = population * linreg.coef_ + linreg.intercept_
ax.scatter(population, df['profit'], label="RAW")
ax.plot(population, fitted_profit, 'r', label="FITTING")
ax.set_title(u'原始数据和拟合直线', fontproperties='SimHei')
ax.legend(loc=2)

# Bottom panel: actual vs. predicted targets, indexed by test-sample position.
ax2 = fig.add_subplot(2, 1, 2)
X_index = np.arange(Y_test.size)
for series, tag in ((Y_test, 'REAL'), (y_pred, 'PREDICTION')):
    ax2.plot(X_index, series, label=tag)
ax2.set_title(u'实际值和预测值', fontproperties='SimHei')
ax2.legend(loc=2)

plt.show()

# Mean squared error (MSE) on the test set.
mse = metrics.mean_squared_error(Y_test, y_pred)
print("MSE: ", mse)
运行结果:
多元线性回归
- 加载数据;
- 数据切分;
- 特征缩放(StandardScaler);
- 模型训练;
- 预测;
- 结果分析;
# -*- coding: utf-8 -*-
import pandas as pd
import seaborn as sns
import numpy as np
from sklearn import linear_model,metrics,preprocessing
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
# Read the data file; it has no header row, so the three columns are named here.
df = pd.read_csv('ex1data2.txt', names=['square', 'bedrooms', 'price'])
print(df.head())

# Every column except the last is a feature; the third column is the target.
X = df.iloc[:, :-1].values
Y = df.iloc[:, 2].values

# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable.
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=0)

# Feature scaling: learn the mean/std from the training set ONLY, then apply
# those same statistics to the test set.  Re-fitting the scaler on the test
# set (fit_transform) would standardise it with different parameters than the
# model was trained on, making the predictions and the reported MSE invalid.
standard_scaler = preprocessing.StandardScaler()
X_train = standard_scaler.fit_transform(X_train)
X_test = standard_scaler.transform(X_test)

# Fit ordinary least squares on the scaled training features.
linreg = linear_model.LinearRegression()
linreg.fit(X_train, Y_train)

# Predict on the (consistently scaled) held-out test rows.
y_pred = linreg.predict(X_test)
print("截距", linreg.intercept_)
print("系数:", linreg.coef_)

# Mean squared error (MSE) on the test set.
print("MSE: ", metrics.mean_squared_error(Y_test, y_pred))

# Plot actual vs. predicted targets, indexed by test-sample position.
X_index = np.arange(0, Y_test.size, 1)
plt.plot(X_index, Y_test, label='REAL')
plt.plot(X_index, y_pred, label='PREDICTION')
plt.title(u'实际值和预测值', fontproperties='SimHei')
plt.legend(loc=2)
plt.show()
运行结果: