Linear Regression Case Study

First, examine the data. In train.csv the columns are the 24 hours of a day, and the rows are the dates plus each day's measurements (18 of them, i.e. 18 features). The structure is 12 months × 20 days × (18, 24), and it needs to be reshaped into 12 × (471, 18 × 9).
In test.csv the columns are 9 consecutive hours and the rows are 240 blocks of the 18 features, so the structure is (240 × 18, 9) and it needs to be reshaped into (240, 18 × 9).
The model's input is the 18 feature values of each hour over a 9-hour window, giving 18 × 9 = 162 features per sample; the output is the PM2.5 value of the 10th hour.
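To make the layout concrete, here is a minimal sketch with dummy data (illustrative only; dummy_month is a made-up name) showing how one month's 18 × 480 matrix yields a flattened 9-hour window of 162 features:

import numpy as np

dummy_month = np.zeros((18, 480))      # 18 features x (20 days * 24 hours)
t = 0                                  # window starting hour
window = dummy_month[:, t : t + 9]     # shape (18, 9): 9 consecutive hours
sample = window.reshape(1, -1)         # shape (1, 162): one training sample
print(sample.shape)                    # (1, 162)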

import pandas as pd
import numpy as np

#Load the training data
df = pd.read_csv('work/hw1_data/train.csv', encoding='big5')
print(df.shape)#(4320, 27) 12*20*18

#Data processing: take the useful columns and fill the missing values ('NR', i.e. no rainfall, becomes 0)
data = df.iloc[:, 3:]
data[data == 'NR'] = 0
raw_data = data.to_numpy().astype('float32')#convert the DataFrame to a float array
raw_data.shape   #(20*18*12, 24)

#Build the features
month_data = {}
for month in range(12):
    sample = np.empty([18, 480])
    for day in range(20):
        sample[:, day * 24 : (day + 1) * 24] = raw_data[18 * (20 * month + day) : 18 * (20 * month + day + 1), :]
        #each day contributes 24 columns; the rows are the 18 features, so each month becomes an 18 x 480 (24 x 20) matrix
    month_data[month] = sample

# Each month has 480 hours, every 9 consecutive hours form one data point, and each month yields 471
# data points, so the total is 471 * 12 samples; each sample has 9 * 18 features (18 features per hour * 9 hours).
# Why 471 per month? The 9-hour window slides forward one hour at a time and the target (the 10th hour)
# must stay inside the month, which leaves 480 - 9 = 471 starting positions.
# The corresponding targets also number 471 * 12 (the PM2.5 of the 10th hour).
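As a quick sanity check of the 471 count (a throwaway one-liner): a window occupies hours t .. t+8 and its target is hour t+9, so t can run from 0 to 470.

print(sum(1 for t in range(480) if t + 9 <= 479))  # 471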

x = np.empty([12 * 471, 18 * 9], dtype = float)
y = np.empty([12 * 471, 1], dtype = float)
for month in range(12):
    for day in range(20):
        for hour in range(24):
            if day == 19 and hour > 14:
                continue
            x[month * 471 + day * 24 + hour, :] = month_data[month][:,day * 24 + hour : day * 24 + hour + 9].reshape(1, -1) # flatten the 18 x 9 window into a 162-dim vector
            y[month * 471 + day * 24 + hour, 0] = month_data[month][9, day * 24 + hour + 9] # target: PM2.5 (row 9) of the 10th hour
print(x.shape)    # (471*12, 18*9)
print(y.shape)

###Feature engineering -- standardization
mean_x = np.mean(x, axis=0) # 18 * 9
std_x = np.std(x, axis=0) # 18 * 9

# Normalization helper
def _normalize(X, train = True, specified_column = None, X_mean = None, X_std = None):
    # This function normalizes specific columns of X.
    # The mean and standard deviation of the training data will be reused when processing the testing data.
    #
    # Arguments:
    #     X: data to be processed
    #     train: 'True' when processing training data, 'False' for testing data
    #     specified_column: indexes of the columns that will be normalized. If 'None', all columns
    #         will be normalized.
    #     X_mean: mean value of training data, used when train = 'False'
    #     X_std: standard deviation of training data, used when train = 'False'
    # Outputs:
    #     X: normalized data
    #     X_mean: computed mean value of training data
    #     X_std: computed standard deviation of training data

    if specified_column is None:
        specified_column = np.arange(X.shape[1])#an index array covering all columns
    if train:
        X_mean = np.mean(X[:, specified_column], 0).reshape(1, -1)#the 0 here means axis=0
        X_std  = np.std(X[:, specified_column], 0).reshape(1, -1)

    X[:,specified_column] = (X[:, specified_column] - X_mean) / (X_std + 1e-8)

    return X, X_mean, X_std
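A usage note: because _normalize returns the training statistics, the same function could standardize the test features later. A sketch only (the script below instead normalizes the test set with an explicit loop):

# test_x, _, _ = _normalize(test_x, train=False, X_mean=X_mean, X_std=X_std)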

# Normalize the training data (the test data is normalized later with the same statistics)
X_train, X_mean, X_std = _normalize(x, train = True)

#Split the data into a training set and a development (validation) set -- a simple hold-out split, not cross-validation
def _train_dev_split(X, Y, dev_ratio = 0.25):
    # This function splits data into a training set and a development set.
    train_size = int(len(X) * (1 - dev_ratio))
    return X[:train_size], Y[:train_size], X[train_size:], Y[train_size:]

def _shuffle(X, Y):#shuffle the two arrays in unison
    # This function shuffles two equal-length list/array, X and Y, together.
    randomize = np.arange(len(X))
    np.random.shuffle(randomize)
    return (X[randomize], Y[randomize])
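A quick toy check (illustrative arrays only) that _shuffle keeps the rows of X and Y paired:

xs = np.arange(10).reshape(5, 2)   # row i is [2i, 2i+1]
ys = np.arange(5).reshape(5, 1)    # row i is [i]
xs2, ys2 = _shuffle(xs, ys)
print(all(xs2[i, 0] == 2 * ys2[i, 0] for i in range(5)))  # True: rows stay paired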

# Split data into training set and development set
dev_ratio = 0.1
# 9:1
X_train, y = _shuffle(X_train, y)
x_train, y_train, x_eval, y_eval = _train_dev_split(X_train, y, dev_ratio = dev_ratio)

print("训练集数据:", x_train.shape)
print("训练集标签:", y_train.shape)
print("验证集数据:", x_eval.shape)
print("验证集标签:", y_eval.shape)

#Training: mini-batch gradient descent with Adagrad
n = 471*12   # total number of samples (before the train/dev split)
dim = 18 * 9 + 1  # 162 weights + 1 bias
w = np.zeros([dim, 1])


# Some parameters for training
EPOCH = 1000
batch_size = 512
learning_rate = 100

adagrad = np.zeros([dim, 1])#accumulated squared gradients, used in the Adagrad optimizer update
eps = 1e-9
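For intuition, Adagrad divides the learning rate by the square root of the accumulated squared gradients, so each dimension's step size shrinks as that dimension keeps seeing large gradients; this is why the seemingly huge learning_rate = 100 is workable here. A minimal one-dimensional sketch (toy quadratic, illustrative only):

# Minimize f(v) = v^2 with Adagrad; the gradient is 2v
v, acc, lr = 10.0, 0.0, 1.0
for _ in range(100):
    g = 2 * v
    acc += g ** 2
    v -= lr * g / (np.sqrt(acc) + 1e-9)
print(v)  # steadily approaches 0 as the accumulated gradient grows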

# Keep the loss at every iteration for plotting
train_loss = []
eval_loss = []

x_train = np.concatenate((np.ones((x_train.shape[0], 1)), x_train), axis=1).astype('float32')   # prepend a column of ones
#np.concatenate joins arrays; the all-ones column folds the bias b into the weight vector
x_eval = np.concatenate((np.ones((x_eval.shape[0], 1)), x_eval), axis=1).astype('float32')   # same for the dev set
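The leading column of ones folds the bias into the weight vector: [1, x] · [b; w] equals x · w + b. A toy check (made-up numbers):

Xt = np.array([[2.0, 3.0]])                          # one sample, two features
wt = np.array([[0.5], [1.0]])                        # weights
b = 4.0                                              # bias
aug = np.concatenate((np.ones((1, 1)), Xt), axis=1)  # [[1., 2., 3.]]
w_aug = np.concatenate(([[b]], wt), axis=0)          # [[4.], [0.5], [1.]]
print(aug @ w_aug, Xt @ wt + b)                      # both [[8.]]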
for epoch in range(EPOCH):
    x_train, y_train = _shuffle(x_train, y_train)
    x_eval, y_eval = _shuffle(x_eval, y_eval)
    #reshuffle every epoch, but do not re-split the data

    # Mini-batch training
    step = 0
    steps = int(np.floor(x_train.shape[0] / batch_size))#np.floor rounds down: the number of full batches
    for idx in range(steps):
        X = x_train[idx*batch_size:(idx+1)*batch_size]
        Y = y_train[idx*batch_size:(idx+1)*batch_size]

        loss_train = np.sqrt(np.sum(np.power(np.dot(X, w) - Y, 2)) / batch_size) # rmse
        #X holds all features of the batch_size = 512 samples; the squared errors are summed, divided by 512, then rooted -- the mini-batch RMSE
        # cal grad: gradient of the sum of squared errors, 2 X^T (Xw - y)
        gradient = 2 * np.dot(X.transpose(), np.dot(X, w) - Y) # dim*1
        # optim update
        adagrad += gradient ** 2   # accumulate squared gradients
        # parameter update
        w = w - learning_rate * gradient / np.sqrt(adagrad + eps)
        #the Adagrad update: each dimension's effective step shrinks as its gradients accumulate

        step += 1
        train_loss.append(loss_train)

    loss_eval = np.sqrt(np.sum(np.power(np.dot(x_eval, w) - y_eval, 2)) / x_eval.shape[0]) # rmse
    eval_loss.append(loss_eval)
    if epoch % 50 == 0 or epoch == EPOCH - 1:
        print(f'Epoch {epoch}/{EPOCH}: train_loss = {loss_train}, eval_loss = {loss_eval}')
        np.save(f'work/checkpoint/weight_epoch{epoch}.npy', w)#checkpoint the weights

print('Training loss: {}'.format(train_loss[-1]))
print('Eval loss: {}'.format(eval_loss[-1]))
np.save('work/weight.npy', w)
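One subtlety worth flagging: the printed loss is RMSE, but the gradient used above, 2·Xᵀ(Xw − y), is the gradient of the plain sum of squared errors; the constant factor largely cancels in the Adagrad update, so training still behaves. A finite-difference check on toy data (illustrative only) confirms the formula:

rng = np.random.default_rng(0)
Xc, yc = rng.normal(size=(5, 3)), rng.normal(size=(5, 1))
wc = rng.normal(size=(3, 1))
sse = lambda w_: np.sum((Xc @ w_ - yc) ** 2)     # sum of squared errors
analytic = 2 * Xc.T @ (Xc @ wc - yc)             # closed-form gradient
h = 1e-6
for k in range(3):
    e = np.zeros((3, 1)); e[k] = h
    numeric = (sse(wc + e) - sse(wc - e)) / (2 * h)
    print(np.isclose(analytic[k, 0], numeric))   # True for each coordinate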


#Process the test file
testdata = pd.read_csv('work/hw1_data/test.csv', header = None, encoding = 'big5')
test_data = testdata.iloc[:, 2:]
test_data[test_data == 'NR'] = 0
test_data = test_data.to_numpy().astype(float)
test_x = np.empty([240, 18*9], dtype = float)

for i in range(240):
    test_x[i, :] = test_data[18 * i: 18* (i + 1), :].reshape(1, -1)

for i in range(len(test_x)):
    for j in range(len(test_x[0])):
        if std_x[j] != 0:
            test_x[i][j] = (test_x[i][j] - mean_x[j]) / std_x[j]
            #standardize test_x with the training set's mean and std

test_x = np.concatenate((np.ones([240, 1]), test_x), axis = 1).astype(float)
test_x.shape

#Load the saved weights and predict
w = np.load('work/weight.npy')
predict_y = np.dot(test_x, w)
predict_y.shape


#Write the submission csv
import csv
with open('work/submit1.csv', mode='w', newline='') as submit_file:
    csv_writer = csv.writer(submit_file)
    header = ['id', 'value']
    # print(header)
    csv_writer.writerow(header)
    for i in range(240):
        row = ['id_' + str(i), predict_y[i][0]]
        csv_writer.writerow(row)
        # print(row)
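The same file could equally be written with pandas, which is already imported; a minimal equivalent sketch (the output name submit1_pandas.csv is made up to avoid overwriting the original):

sub = pd.DataFrame({'id': ['id_' + str(i) for i in range(240)],
                    'value': predict_y[:, 0]})
sub.to_csv('work/submit1_pandas.csv', index=False)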
