# 1.利用最小二乘法预测油耗
# ---- Part 1: predict fuel consumption (MPG) with closed-form least squares ----
# pandas was never imported in this file; bring it in so the script runs.
import pandas as pd

# Read the file, whitespace-separated; the raw data has no header row.
# Use a raw string for the regex separator to avoid an invalid-escape warning.
data = pd.read_csv('auto-mpg.csv', sep=r'\s+', header=None)
# Name every column.
data.columns = ['MPG', 'cyl', 'disp', 'hp', 'wt', 'acc', 'year', 'orig', 'name']
data.head()
df = data[data['hp'] != '?']       # drop rows whose horsepower is unknown ('?')
df = df[['disp', 'MPG']].copy()    # keep only the two columns used below
df.head()
df['x_xbar'] = df.disp - df.disp.mean()  # X - mean(X)
df['y_ybar'] = df.MPG - df.MPG.mean()    # Y - mean(Y)
df.head()
# Slope c and intercept b from the closed-form OLS formulas.
c = sum(df.x_xbar * df.y_ybar) / sum(df.x_xbar ** 2)  # formula 6-11
b = df.MPG.mean() - c * df.disp.mean()                # formula 6-12
print(c, b)  # print the fitted coefficients
# Output of the manual computation above.
# Same least-squares fit, this time via scikit-learn's built-in estimator.
from sklearn.linear_model import LinearRegression

features = df[['disp']]
reg = LinearRegression()
reg.fit(features, df.MPG)
print(reg.coef_, reg.intercept_)    # slope and intercept
print(reg.score(features, df.MPG))  # coefficient of determination R^2
#输出结果
# 2.梯度下降法
# 使用一个简单线性模型作为示例: f(x) = x * W + b, 有2个变量 W 和 b。另外, 我们会生成数据让训练好的模型满足 W = 3.0 和 b = 2.0
import tensorflow as tf

# 1. Define the model.
# A minimal linear model f(x) = W * x + b wrapped in a small class.
# (The pasted original lost all indentation; structure restored here.)
class Model(object):
    def __init__(self):
        # Start from a deliberately wrong point (5.0, 0.0) so training
        # has work to do; in practice these would be random values.
        self.W = tf.Variable(5.0)
        self.b = tf.Variable(0.0)

    def __call__(self, x):
        # Forward pass: f(x) = W * x + b.
        return self.W * x + self.b

model = Model()
# Sanity check: 5.0 * 3.0 + 0.0 == 15.0.
assert model(3.0).numpy() == 15.0
# Define the loss function.
# It measures how far the model's predictions are from the actual outputs
# for given inputs; we use the standard L2 loss (mean squared error).
# (Indentation of the function body restored.)
def loss(predicted_y, desired_y):
    return tf.reduce_mean(tf.square(predicted_y - desired_y))
# Get training data: synthesize noisy samples around the true parameters.
TRUE_W = 3.0
TRUE_b = 2.0
NUM_EXAMPLES = 1000
inputs = tf.random.normal(shape=[NUM_EXAMPLES])
noise = tf.random.normal(shape=[NUM_EXAMPLES])
outputs = inputs * TRUE_W + TRUE_b + noise

# Before training, visualize the current model: training data in blue,
# model predictions in red.
import matplotlib.pyplot as plt
plt.scatter(inputs, outputs, c='b')
plt.scatter(inputs, model(inputs), c='r')
plt.show()
# The original `print('...'),` was a Python 2 trailing-comma idiom; in
# Python 3 it printed the label and the value on separate lines.
print('Current loss: ', loss(model(inputs), outputs).numpy())
def train(model, inputs, outputs, learning_rate):
    """One step of plain gradient descent on the model's W and b.

    Records the loss under a GradientTape, then moves each variable
    against its gradient scaled by learning_rate (in place).
    (Indentation of the function body restored.)
    """
    with tf.GradientTape() as t:
        current_loss = loss(model(inputs), outputs)
    dW, db = t.gradient(current_loss, [model.W, model.b])
    model.W.assign_sub(learning_rate * dW)
    model.b.assign_sub(learning_rate * db)
model = Model()
# Collect the history of W and b for plotting.
# (Indentation of the loop body restored.)
Ws, bs = [], []
epochs = range(10)
for epoch in epochs:
    Ws.append(model.W.numpy())
    bs.append(model.b.numpy())
    current_loss = loss(model(inputs), outputs)
    train(model, inputs, outputs, learning_rate=0.1)
    print('Epoch %2d: W=%1.2f b=%1.2f, loss=%2.5f' %
          (epoch, Ws[-1], bs[-1], current_loss))

# Plot the learned parameters (solid) against the true ones (dashed).
plt.plot(epochs, Ws, 'r',
         epochs, bs, 'b')
plt.plot([TRUE_W] * len(epochs), 'r--',
         [TRUE_b] * len(epochs), 'b--')
plt.legend(['W', 'b', 'true W', 'true b'])  # label made consistent with 'true W'
plt.show()
# 3.多元变量线性回归
# ---- Part 3: multivariate linear regression ----
# Re-read the file: whitespace-separated, no header row.
import pandas as pd

data = pd.read_csv('auto-mpg.csv', sep=r'\s+', header=None)
data.columns = ['MPG', 'cyl', 'disp', 'hp', 'wt', 'acc', 'year', 'orig', 'name']
data.head()
df = data[data['hp'] != '?'].copy()  # drop unknown-hp rows; copy so we can assign
# Because of the '?' markers, hp was parsed as strings; convert it to float
# so it can be scaled and fed to the regressor below.
df['hp'] = df['hp'].astype(float)
df = df.drop(['name'], axis=1)       # the car name is not a numeric feature
df.head()
X = df.drop(['MPG'], axis=1)         # feature matrix
y = df[['MPG']]                      # target

# Visualize each feature against the target.
# (Indentation of the loop body restored.)
plt.figure(figsize=(12, 12))
plt.title('dataset visualization')
for i in range(7):
    plt.subplot(4, 2, i + 1)
    plt.scatter(X.iloc[:, i], y, color='red')
    plt.xlabel(X.columns[i])
    plt.ylabel('MPG')
from sklearn.model_selection import train_test_split
import numpy as np  # np.ravel below needs numpy, which was never imported

# Split into training and test sets; only the training part is used here.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, train_size=0.8, random_state=0)
X_train.shape, X_test.shape, y_train.shape, y_test.shape

# Feature scaling: SGD is sensitive to feature magnitudes, so standardize
# both the features and the target.
from sklearn.preprocessing import StandardScaler
sc_X = StandardScaler()
sc_y = StandardScaler()
X_train_std = sc_X.fit_transform(X_train)
y_train_std = np.ravel(sc_y.fit_transform(y_train))  # flatten to 1-D for the fit

# Fit a linear regression by stochastic gradient descent.
from sklearn.linear_model import SGDRegressor
regressor = SGDRegressor()
regressor.fit(X_train_std, y_train_std)
# R^2 score on the (standardized) training data.
print(regressor.score(X_train_std, y_train_std))