First, look at the data. In train.csv, the columns are the 24 hours of each day and the rows are the dates together with each day's measured values (18 of them, i.e. 18 features). Its structure is (12 × 20 × 18, 24), and it must be reshaped to (12 × 471, 18 × 9).
In test.csv, the columns are 9 consecutive hours and the rows are 240 samples × 18 features. Its structure is (240 × 18, 9), and it must be reshaped to (240, 18 × 9).
The model's input is the 18 feature values for each of 9 consecutive hours, i.e. 18 × 9 = 162 features in total; the output is the PM2.5 value of the 10th hour.
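To make the windowing concrete, here is a minimal sketch (toy data, not the real CSV) of how one month's 18 × 480 matrix yields 471 input windows of 18 × 9 features, each paired with the 10th hour's PM2.5 as its label:

import numpy as np
# Toy month: 18 features over 480 hours (20 days * 24 hours)
toy_month = np.arange(18 * 480, dtype=float).reshape(18, 480)
windows, labels = [], []
for start in range(480 - 9):  # starts 0..470, i.e. 471 windows
    windows.append(toy_month[:, start:start + 9].reshape(-1))  # 18 * 9 = 162 features
    labels.append(toy_month[9, start + 9])  # row 9 (PM2.5) at the 10th hour
print(len(windows), windows[0].shape)  # 471 (162,)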
import pandas as pd
import numpy as np
import math
# Load the data
df = pd.read_csv('work/hw1_data/train.csv', encoding='big5')
print(df.shape)  # (4320, 27): 12 months * 20 days * 18 features rows; 3 metadata + 24 hour columns
# Preprocessing: keep only the 24 hourly columns and fill in missing values
data = df.iloc[:, 3:]
data[data == 'NR'] = 0  # 'NR' (no rain) entries in the RAINFALL rows become 0
raw_data = data.to_numpy().astype('float32')  # convert to a float array
raw_data.shape  # (12*20*18, 24)
# Build the per-month feature matrices
month_data = {}
for month in range(12):
    sample = np.empty([18, 480])
    for day in range(20):
        sample[:, day * 24 : (day + 1) * 24] = raw_data[18 * (20 * month + day) : 18 * (20 * month + day + 1), :]
        # each day's 18 rows * 24 hours slot into 24 columns; rows are the 18
        # features, so each month becomes an 18 * 480 matrix (24 hours * 20 days)
    month_data[month] = sample
# Each month has 480 hours, and every 9 consecutive hours form one sample, so each
# month yields 471 samples and the total is 471 * 12; each sample has 9 * 18
# features (18 features per hour * 9 hours).
# Why 471? A window needs 9 input hours plus a 10th hour for the label, so the
# last valid start within a month is hour 470, giving 480 - 10 + 1 = 471 windows.
# The corresponding targets also number 471 * 12 (the 10th hour's PM2.5).
x = np.empty([12 * 471, 18 * 9], dtype = float)
y = np.empty([12 * 471, 1], dtype = float)
for month in range(12):
for day in range(20):
for hour in range(24):
            if day == 19 and hour > 14:  # the last valid window of a month starts at day 19, hour 14 (hour 470)
                continue
            x[month * 471 + day * 24 + hour, :] = month_data[month][:, day * 24 + hour : day * 24 + hour + 9].reshape(1, -1)  # flatten the 18 * 9 window into a 162-dim vector
            y[month * 471 + day * 24 + hour, 0] = month_data[month][9, day * 24 + hour + 9]  # row 9 is PM2.5; the 10th hour's value is the label
print(x.shape) # (471*12, 18*9)
print(y.shape)
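# Optional sanity check (a sketch based on the layout above): the first sample
# should be month 0, hours 0-8 flattened, and its label hour 9's PM2.5
assert np.allclose(x[0], month_data[0][:, 0:9].reshape(-1))
assert y[0, 0] == month_data[0][9, 9]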
### Feature engineering: standardization
mean_x = np.mean(x, axis=0)  # shape (18 * 9,)
std_x = np.std(x, axis=0)    # shape (18 * 9,); reused later to normalize the test data
# Normalization
def _normalize(X, train = True, specified_column = None, X_mean = None, X_std = None):
    # This function normalizes specified columns of X.
    # The mean and standard deviation of the training data are reused when
    # processing the testing data.
    #
    # Arguments:
    #     X: data to be processed
    #     train: 'True' when processing training data, 'False' for testing data
    #     specified_column: indexes of the columns to be normalized. If 'None',
    #         all columns will be normalized.
    #     X_mean: mean value of training data, used when train = 'False'
    #     X_std: standard deviation of training data, used when train = 'False'
    # Outputs:
    #     X: normalized data
    #     X_mean: computed mean value of training data
    #     X_std: computed standard deviation of training data
    if specified_column is None:  # 'is None', since '== None' is ambiguous for arrays
        specified_column = np.arange(X.shape[1])  # index array covering all columns
    if train:
        X_mean = np.mean(X[:, specified_column], 0).reshape(1, -1)  # the 0 means axis=0
        X_std = np.std(X[:, specified_column], 0).reshape(1, -1)
    X[:, specified_column] = (X[:, specified_column] - X_mean) / (X_std + 1e-8)
    return X, X_mean, X_std
# Normalize the training data (the test data is normalized later with the same statistics)
X_train, X_mean, X_std = _normalize(x, train = True)
# Split into a training set and a validation set (a simple hold-out split, not cross-validation)
def _train_dev_split(X, Y, dev_ratio = 0.25):
    # This function splits the data into a training set and a development set.
train_size = int(len(X) * (1 - dev_ratio))
return X[:train_size], Y[:train_size], X[train_size:], Y[train_size:]
def _shuffle(X, Y):  # shuffle the data order
    # This function shuffles two equal-length arrays, X and Y, together.
randomize = np.arange(len(X))
np.random.shuffle(randomize)
return (X[randomize], Y[randomize])
# Split data into training set and development set
dev_ratio = 0.1
# 90% train / 10% validation
X_train, y = _shuffle(X_train, y)
x_train, y_train, x_eval, y_eval = _train_dev_split(X_train, y, dev_ratio = dev_ratio)
print("训练集数据:", x_train.shape)
print("训练集标签:", y_train.shape)
print("验证集数据:", x_eval.shape)
print("验证集标签:", y_eval.shape)
# Training: linear regression fitted by mini-batch gradient descent with Adagrad
n = 471 * 12  # total number of samples
dim = 18 * 9 + 1  # 162 weights + 1 bias
w = np.zeros([dim, 1])
# Some parameters for training
EPOCH = 1000
batch_size = 512
learning_rate = 100
adagrad = np.zeros([dim, 1])  # running sum of squared gradients, used in the Adagrad update
eps = 1e-9  # guards against division by zero in the Adagrad denominator
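# For reference, the update rule used in the loop below, written as a standalone
# sketch (adagrad_step is a hypothetical helper, not part of the original script):
def adagrad_step(w, grad, accum, lr=learning_rate, epsilon=eps):
    # Adagrad scales each coordinate by the root of its accumulated squared gradients
    accum += grad ** 2
    return w - lr * grad / np.sqrt(accum + epsilon), accum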
# Keep the loss at every iteration for plotting
train_loss = []
eval_loss = []
x_train = np.concatenate((np.ones((x_train.shape[0], 1)), x_train), axis=1).astype('float32')
# np.concatenate joins arrays; axis=1 prepends a column of ones so the bias b is folded into w
x_eval = np.concatenate((np.ones((x_eval.shape[0], 1)), x_eval), axis=1).astype('float32')
for epoch in range(EPOCH):
x_train, y_train = _shuffle(x_train, y_train)
x_eval, y_eval = _shuffle(x_eval, y_eval)
    # reshuffle every epoch, but do not re-split; shuffling the validation set is harmless
# Mini-batch training
step = 0
    steps = int(np.floor(x_train.shape[0] / batch_size))  # np.floor rounds down, so the last incomplete batch is dropped
for idx in range(steps):
X = x_train[idx*batch_size:(idx+1)*batch_size]
Y = y_train[idx*batch_size:(idx+1)*batch_size]
        loss_train = np.sqrt(np.sum(np.power(np.dot(X, w) - Y, 2)) / batch_size)  # RMSE over the mini-batch
        # X holds all features of the batch_size = 512 samples; summing the squared
        # errors and dividing by 512 before the square root gives the batch RMSE
        # gradient of the sum-of-squared-errors loss: dL/dw = 2 X^T (Xw - Y)
        gradient = 2 * np.dot(X.transpose(), np.dot(X, w) - Y)  # shape: dim * 1
        # Adagrad: accumulate the squared gradients
        adagrad += gradient ** 2
        # parameter update: each coordinate's step size shrinks as its squared gradients accumulate
        w = w - learning_rate * gradient / np.sqrt(adagrad + eps)
step += 1
train_loss.append(loss_train)
loss_eval = np.sqrt(np.sum(np.power(np.dot(x_eval, w) - y_eval, 2)) / x_eval.shape[0]) # rmse
eval_loss.append(loss_eval)
    if epoch % 50 == 0 or epoch == EPOCH - 1:  # note: epoch only ever reaches EPOCH - 1
        print(f'Epoch {epoch}/{EPOCH}: train_loss = {loss_train}, eval_loss = {loss_eval}')
        np.save(f'work/checkpoint/weight_epoch{epoch}.npy', w)  # save a checkpoint
print('Training loss: {}'.format(train_loss[-1]))
print('Eval loss: {}'.format(eval_loss[-1]))
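# train_loss and eval_loss were recorded above for plotting; a minimal sketch,
# assuming matplotlib is installed (the output path is illustrative):
import matplotlib.pyplot as plt
plt.plot(train_loss, label='train RMSE (per mini-batch)')
plt.plot(np.arange(len(eval_loss)) * steps, eval_loss, label='eval RMSE (per epoch)')
plt.xlabel('update step')
plt.ylabel('RMSE')
plt.legend()
plt.savefig('work/loss_curve.png')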
np.save('work/weight.npy', w)
# Process the test file
testdata = pd.read_csv('work/hw1_data/test.csv', header = None, encoding = 'big5')
test_data = testdata.iloc[:, 2:]
test_data[test_data == 'NR'] = 0
test_data = test_data.to_numpy().astype(float)  # cast to float, matching the training pipeline
test_x = np.empty([240, 18*9], dtype = float)
for i in range(240):
test_x[i, :] = test_data[18 * i: 18* (i + 1), :].reshape(1, -1)
for i in range(len(test_x)):
for j in range(len(test_x[0])):
if std_x[j] != 0:
test_x[i][j] = (test_x[i][j] - mean_x[j]) / std_x[j]
# normalize test_x with the training set's mean and std
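# Equivalent vectorized form (a sketch; mean_x and std_x are 1-D of length 162,
# and the 1e-8 guard stands in for the std_x != 0 check):
#     test_x = (test_x - mean_x) / (std_x + 1e-8)
# Alternatively, _normalize can be reused with the stored training statistics:
#     test_x, _, _ = _normalize(test_x, train=False,
#                               X_mean=mean_x.reshape(1, -1), X_std=std_x.reshape(1, -1))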
test_x = np.concatenate((np.ones([240, 1]), test_x), axis = 1).astype(float)
test_x.shape
# Load the saved weights and predict
w = np.load('work/weight.npy')
predict_y = np.dot(test_x, w)
predict_y.shape
# Write the submission CSV
import csv
with open('work/submit1.csv', mode='w', newline='') as submit_file:
csv_writer = csv.writer(submit_file)
header = ['id', 'value']
csv_writer.writerow(header)
for i in range(240):
row = ['id_' + str(i), predict_y[i][0]]
csv_writer.writerow(row)