李宏毅PM2.5
只使用了PM2.5这一项特征,没有考虑其他因素来预测。对数据的处理:因为测试数据是用9天来预测第10天的PM2.5,所以我对数据进行切割,9天为特征,第10天为标记,并且循环后移,例如1-9为特征、10为标记,2-10为特征、11为标记,根据这样的切割使数据变成3600*9的训练数据。
import numpy as np
import pandas as pd
# Load the raw train/test CSV files from the local dataset directory.
dir_path = r"G:\python3code\DeepLearn\机器学习\回归\week1"
train = pd.read_csv(dir_path + r'\train.csv', engine='python', encoding='utf-8')
test = pd.read_csv(dir_path + r'\test.csv', engine='python', encoding='utf-8')

# Keep only the PM2.5 rows; no other observation is used as a feature.
train = train[train['observation'] == 'PM2.5']
# NOTE(review): the test file is filtered through a column literally named
# 'AMB_TEMP' -- presumably test.csv has no header row, so its first record
# became the column names. Verify against the data file.
test = test[test['AMB_TEMP'] == 'PM2.5']
train = train.drop(['Date', 'stations', 'observation'], axis=1)
# Drop the two leading non-value columns of the test frame.
x_test = test.iloc[:, 2:]

# Slide a 9-column window across the remaining train columns: columns
# i..i+8 are the features and column i+9 is the label, for i = 0..14.
# (assumes at least 24 value columns remain after the drop -- TODO confirm)
train_x = []
train_y = []
for i in range(15):
    # Copy the slice so relabelling its columns never touches `train`.
    x = train.iloc[:, i:i + 9].copy()
    # Give every window the same column labels (0..8) so that the vertical
    # concatenation below aligns them instead of creating new columns.
    x.columns = np.array(range(9))
    y = train.iloc[:, i + 9]
    train_x.append(x)
    train_y.append(y)

# Concatenate the windows; pd.concat stacks along rows (vertically) by default.
x_train = pd.concat(train_x)
# `np.float` was removed from NumPy; the builtin `float` is the same dtype.
y_train = np.array(pd.concat(train_y), dtype=float)
print(x_train.shape, y_train.shape)
class LinearRegression(object):
    """Small linear-regression helper.

    Provides min-max feature scaling, a closed-form (normal equation)
    solver, a batch gradient-descent solver, prediction and a few
    goodness-of-fit metrics.
    """

    def __init__(self, epochs=200, lr=0.01, b=0, w=0):
        """
        :param epochs: number of gradient-descent iterations
        :param lr: learning rate used by gradient descent
        :param b: initial bias (stored only; the solvers do not read it)
        :param w: initial weight (stored only; the solvers do not read it)
        """
        self.epochs = epochs
        self.lr = lr
        self.b = b
        self.w = w

    def SST(self, y_data):
        """Total sum of squares: sum((y - mean(y)) ** 2)."""
        y_data = np.asarray(y_data, dtype=float)
        deviation = y_data - np.mean(y_data)
        # The deviations must be squared; their plain sum is always ~0.
        return np.sum(deviation ** 2)

    def MSE(self, y_data, y_predict):
        """Mean squared error: SSE divided by the number of samples."""
        return self.SSE(y_data, y_predict) / len(y_data)

    def SSE(self, y_data, y_predict):
        """Residual sum of squares: sum((y - y_predict) ** 2)."""
        residual = np.asarray(y_data, dtype=float) - np.asarray(y_predict, dtype=float)
        return np.sum(residual ** 2)

    def R(self, y_data, y_predict):
        """Coefficient of determination: 1 - SSE / SST.

        Also prints the SSE / SST ratio as a diagnostic.
        """
        sse = self.SSE(y_data=y_data, y_predict=y_predict)
        sst = self.SST(y_data=y_data)
        print(sse / sst)
        return 1 - sse / sst

    def normalized(self, x_data, x_test):
        """Min-max scale both sets with the statistics of ``x_data``.

        Each column is mapped with (x - min) / (max - min), where min and
        max are taken column-wise from ``x_data`` only, so the test set is
        scaled consistently with the training set.

        :param x_data: training features (anything convertible to a float array)
        :param x_test: test features with the same number of columns
        :return: tuple ``(scaled_x_data, scaled_x_test)`` of float arrays
        """
        # Convert first so min/max are numeric even if the input holds strings.
        x_data = np.array(x_data, dtype=float)
        x_test = np.array(x_test, dtype=float)
        col_min = x_data.min(axis=0)
        span = x_data.max(axis=0) - col_min
        # A constant column has zero range; divide by 1 to avoid NaN/inf.
        span = np.where(span == 0, 1.0, span)
        return (x_data - col_min) / span, (x_test - col_min) / span

    # Matrix-based normal-equation solver.
    def standard_equation(self, x_data, y_data):
        """Solve the least-squares weights directly, without gradient descent.

        Requires X^T X (X being ``x_data`` with a leading bias column of
        ones) to be invertible.

        :param x_data: feature matrix of shape (n_samples, n_features)
        :param y_data: targets with n_samples rows
        :return: weights with the bias first, or ``None`` (after printing a
            message) when X^T X is not invertible
        """
        # Prepend the bias column of ones, e.g. 3600*9 becomes 3600*10.
        x_data = np.hstack((np.ones((len(x_data), 1)), x_data))
        # X^T X is invertible exactly when X has full column rank. A rank
        # test is robust to floating-point noise, unlike `det(...) == 0`.
        if np.linalg.matrix_rank(x_data) < x_data.shape[1]:
            print('矩阵不可逆,不能使用标准方程法')
            return
        x_Tx = x_data.T.dot(x_data)
        # Solve (X^T X) w = X^T y instead of forming the explicit inverse.
        weight = np.linalg.solve(x_Tx, x_data.T.dot(y_data))
        return weight

    def predict(self, x_data, weight):
        """Return predictions for ``x_data`` using bias-first ``weight``."""
        x_data = np.hstack((np.ones((len(x_data), 1)), x_data))
        y_predict = x_data.dot(weight)
        return y_predict

    def gradient_descent(self, x_data, y_data):
        """Fit the weights with batch gradient descent in matrix form.

        Runs ``self.epochs`` iterations with step size ``self.lr`` starting
        from weights drawn uniformly from [-1, 1).

        :param x_data: feature matrix of shape (n_samples, n_features)
        :param y_data: targets with n_samples entries
        :return: weight column vector of shape (n_features + 1, 1), bias first
        """
        # Prepend the bias column of ones.
        x_data = np.hstack((np.ones((len(x_data), 1)), x_data))
        n_samples, n_columns = x_data.shape
        # Random weight initialisation in [-1, 1).
        w = (np.random.random([n_columns, 1]) - 0.5) * 2
        y_data = np.asarray(y_data, dtype=float).reshape((n_samples, 1))
        for _ in range(self.epochs):
            residual = y_data - x_data.dot(w)
            # X^T (y - Xw) / n is the negative gradient of the halved MSE,
            # so it is added to the weights to descend the loss.
            w = w + self.lr * x_data.T.dot(residual) / n_samples
        return w
# Build the model and scale train/test features with the train statistics.
line = LinearRegression()
x_data, x_test = line.normalized(x_data=x_train, x_test=x_test)

# Closed-form fit. standard_equation returns None when the normal equation
# cannot be solved, so only evaluate the fit when weights were produced.
weight = line.standard_equation(x_data=x_data, y_data=y_train)
if weight is not None:
    predict = line.predict(x_data=x_data, weight=weight)
    R = line.R(y_data=y_train, y_predict=predict)
    print(R)

# Gradient-descent fit on the same data; prints the learned weights.
weight = line.gradient_descent(x_data=x_data, y_data=y_train)
print(weight)