["AI Earth" AI Innovation Challenge — AI for Precise Weather and Ocean Forecasting] Time series: ideas for improving on the baseline — add network layers and neurons, change the activation function, fix the random seed [2]

Data acquisition



import pandas as pd
import numpy  as np
import tensorflow as tf
from tensorflow.keras.optimizers import Adam
import matplotlib.pyplot as plt
import scipy
from netCDF4 import Dataset
import netCDF4 as nc
import gc

# Convert the labels into a familiar pandas DataFrame
label_path = './data/SODA_label.nc'
label_trans_path = './data/'
nc_label = Dataset(label_path, 'r')

years = np.array(nc_label['year'][:])
months = np.array(nc_label['month'][:])

year_month_index = []
vs = []
for i, year in enumerate(years):
    for j, month in enumerate(months):
        year_month_index.append('year_{}_month_{}'.format(year, month))
        vs.append(np.array(nc_label['nino'][i, j]))

df_SODA_label = pd.DataFrame({'year_month': year_month_index})
df_SODA_label['label'] = vs

df_SODA_label.to_csv(label_trans_path + 'df_SODA_label.csv', index=None)
# df_SODA_label.head()

# In SODA_train.nc, [0, 0:36, :, :] holds the monthly observations for
# years 1-3; [1, 0:36, :, :] holds years 2-4; ...; [99, 0:36, :, :] holds
# years 100-102. Each sample is a 36-month window sliding forward one year.
SODA_path = './data/SODA_train.nc'
nc_SODA = Dataset(SODA_path, 'r')
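
# Quick sanity check of the sliding-window layout described above (my own
# addition, not in the original post): months 12:24 of sample i should
# match months 0:12 of sample i+1 if each sample starts one year later.
sst_check = np.array(nc_SODA['sst'][:])
print(np.allclose(sst_check[0, 12:24], sst_check[1, 0:12]))  # expect True
del sst_check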


def trans_df(df, vals, lats, lons, years, months):
    '''
        (100, 36, 24, 72) -- year, month,lat,lon
    '''
    for j, lat_ in enumerate(lats):
        for i, lon_ in enumerate(lons):
            c = 'lat_lon_{}_{}'.format(int(lat_), int(lon_))
            v = []
            for y in range(len(years)):
                for m in range(len(months)):
                    v.append(vals[y, m, j, i])
            df[c] = v
    return df
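

# The nested loops above rebuild each column value by value. A vectorized
# sketch of the same transform (my addition; pure NumPy reshaping, with
# row/column ordering chosen to match trans_df):
def trans_df_fast(df, vals, lats, lons):
    n_year, n_month, n_lat, n_lon = vals.shape
    # rows ordered by (year, month), columns by (lat, lon)
    flat = vals.reshape(n_year * n_month, n_lat * n_lon)
    cols = ['lat_lon_{}_{}'.format(int(la), int(lo)) for la in lats for lo in lons]
    return pd.concat([df, pd.DataFrame(flat, columns=cols)], axis=1)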


year_month_index = []

years = np.array(nc_SODA['year'][:])
months = np.array(nc_SODA['month'][:])
lats = np.array(nc_SODA['lat'][:])
lons = np.array(nc_SODA['lon'][:])

for year in years:
    for month in months:
        year_month_index.append('year_{}_month_{}'.format(year, month))

df_sst = pd.DataFrame({'year_month': year_month_index})
df_t300 = pd.DataFrame({'year_month': year_month_index})
df_ua = pd.DataFrame({'year_month': year_month_index})
df_va = pd.DataFrame({'year_month': year_month_index})

df_sst = trans_df(df=df_sst, vals=np.array(nc_SODA['sst'][:]), lats=lats, lons=lons, years=years, months=months)
df_t300 = trans_df(df=df_t300, vals=np.array(nc_SODA['t300'][:]), lats=lats, lons=lons, years=years, months=months)
df_ua = trans_df(df=df_ua, vals=np.array(nc_SODA['ua'][:]), lats=lats, lons=lons, years=years, months=months)
df_va = trans_df(df=df_va, vals=np.array(nc_SODA['va'][:]), lats=lats, lons=lons, years=years, months=months)
label_trans_path = './data/'
df_sst.to_csv(label_trans_path + 'df_sst_SODA.csv', index=None)
df_t300.to_csv(label_trans_path + 'df_t300_SODA.csv', index=None)
df_ua.to_csv(label_trans_path + 'df_ua_SODA.csv', index=None)
df_va.to_csv(label_trans_path + 'df_va_SODA.csv', index=None)

label_path = './data/CMIP_label.nc'
label_trans_path = './data/'
nc_label = Dataset(label_path, 'r')

years = np.array(nc_label['year'][:])
months = np.array(nc_label['month'][:])

year_month_index = []
vs = []
for i, year in enumerate(years):
    for j, month in enumerate(months):
        year_month_index.append('year_{}_month_{}'.format(year, month))
        vs.append(np.array(nc_label['nino'][i, j]))

df_CMIP_label = pd.DataFrame({'year_month': year_month_index})
df_CMIP_label['year_month'] = year_month_index
df_CMIP_label['label'] = vs

df_CMIP_label.to_csv(label_trans_path + 'df_CMIP_label.csv', index=None)

# In CMIP_train.nc:
# [0, 0:36, :, :] holds the monthly simulations for years 1-3 from the 1st
#   CMIP6 model; ...; [150, 0:36, :, :] holds years 151-153 from the 1st
#   CMIP6 model;
# [151, 0:36, :, :] holds years 1-3 from the 2nd CMIP6 model; ...;
# [2265, 0:36, :, :] holds years 1-3 from the 1st CMIP5 model; ...;
# [2405, 0:36, :, :] holds years 1-3 from the 2nd CMIP5 model; ...;
# [4644, 0:36, :, :] holds years 140-142 from the 17th CMIP5 model.
#
# The third and fourth dimensions of every sample are latitude (55°S-60°N)
# and longitude (0°-360°E); the lat/lon range is identical across all data.
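
# Illustrative helper (my addition; the 15 x 151 CMIP6 and 17 x 140 CMIP5
# block sizes are inferred from the index comments above): map a sample
# index to (source, model number, first year of its 3-year window).
def cmip_sample_info(idx):
    if idx < 2265:   # CMIP6 block: 15 models x 151 windows each
        return 'CMIP6', idx // 151 + 1, idx % 151 + 1
    idx -= 2265      # CMIP5 block: 17 models x 140 windows each
    return 'CMIP5', idx // 140 + 1, idx % 140 + 1

# cmip_sample_info(0)    -> ('CMIP6', 1, 1)
# cmip_sample_info(2405) -> ('CMIP5', 2, 1)
# cmip_sample_info(4644) -> ('CMIP5', 17, 140)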
CMIP_path = './data/CMIP_train.nc'
CMIP_trans_path = './data/'
nc_CMIP = Dataset(CMIP_path, 'r')
nc_CMIP.variables.keys()

# dict_keys(['sst', 't300', 'ua', 'va', 'year', 'month', 'lat', 'lon'])
nc_CMIP['t300'][:].shape

# (4645, 36, 24, 72)
year_month_index = []

years = np.array(nc_CMIP['year'][:])
months = np.array(nc_CMIP['month'][:])
lats = np.array(nc_CMIP['lat'][:])
lons = np.array(nc_CMIP['lon'][:])

last_thre_years = 1000
for year in years:
    '''
        Due to memory limits we only keep the last 1000 years of data;
        if you have enough memory, you can remove this if.
    '''
    if year >= 4645 - last_thre_years:
        for month in months:
            year_month_index.append('year_{}_month_{}'.format(year, month))

df_CMIP_sst = pd.DataFrame({'year_month': year_month_index})
df_CMIP_t300 = pd.DataFrame({'year_month': year_month_index})
df_CMIP_ua = pd.DataFrame({'year_month': year_month_index})
df_CMIP_va = pd.DataFrame({'year_month': year_month_index})


def trans_thre_df(df, vals, lats, lons, years, months, last_thre_years=1000):
    '''
        (4645, 36, 24, 72) -- year, month, lat, lon
    '''
    for j, lat_ in enumerate(lats):
        for i, lon_ in enumerate(lons):
            c = 'lat_lon_{}_{}'.format(int(lat_), int(lon_))
            v = []
            for y_, y in enumerate(years):
                # Due to memory limits we only keep the last 1000 years;
                # remove this if you have enough memory.
                if y >= 4645 - last_thre_years:
                    for m_, m in enumerate(months):
                        v.append(vals[y_, m_, j, i])
            df[c] = v
    return df

df_CMIP_sst = trans_thre_df(df=df_CMIP_sst, vals=np.array(nc_CMIP['sst'][:]), lats=lats, lons=lons, years=years, months=months)
df_CMIP_sst.to_csv(CMIP_trans_path + 'df_CMIP_sst.csv', index=None)
del df_CMIP_sst
gc.collect()

df_CMIP_t300 = trans_thre_df(df=df_CMIP_t300, vals=np.array(nc_CMIP['t300'][:]), lats=lats, lons=lons, years=years, months=months)
df_CMIP_t300.to_csv(CMIP_trans_path + 'df_CMIP_t300.csv', index=None)
del df_CMIP_t300
gc.collect()

df_CMIP_ua = trans_thre_df(df=df_CMIP_ua, vals=np.array(nc_CMIP['ua'][:]), lats=lats, lons=lons, years=years, months=months)
df_CMIP_ua.to_csv(CMIP_trans_path + 'df_CMIP_ua.csv', index=None)
del df_CMIP_ua
gc.collect()

df_CMIP_va = trans_thre_df(df=df_CMIP_va, vals=np.array(nc_CMIP['va'][:]), lats=lats, lons=lons, years=years, months=months)
df_CMIP_va.to_csv(CMIP_trans_path + 'df_CMIP_va.csv', index=None)
del df_CMIP_va
gc.collect()

 

If you hit an error while running the steps above, install an older netCDF4:

pip install netCDF4==1.4.0

 

Data modeling


import pandas as pd
import numpy  as np
import tensorflow as tf
from tensorflow.keras.optimizers import Adam
import matplotlib.pyplot as plt
import scipy
import joblib
from netCDF4 import Dataset
import netCDF4 as nc
from tensorflow.keras.callbacks import LearningRateScheduler, Callback
import tensorflow.keras.backend as K
from tensorflow.keras.layers import *
from tensorflow.keras.models import *
from tensorflow.keras.optimizers import *
from tensorflow.keras.callbacks import *
from tensorflow.keras.layers import Input
import gc

label_path       = './data/SODA_label.nc'
nc_label         = Dataset(label_path,'r')
tr_nc_labels     = nc_label['nino'][:]

SODA_path = './data/SODA_train.nc'
nc_SODA = Dataset(SODA_path, 'r')

nc_sst = np.array(nc_SODA['sst'][:])
nc_t300 = np.array(nc_SODA['t300'][:])
nc_ua = np.array(nc_SODA['ua'][:])
nc_va = np.array(nc_SODA['va'][:])


def RMSE(y_true, y_pred):
    return tf.sqrt(tf.reduce_mean(tf.square(y_true - y_pred)))


def RMSE_fn(y_true, y_pred):
    return np.sqrt(np.mean(np.power(np.array(y_true, float).reshape(-1, 1) - np.array(y_pred, float).reshape(-1, 1), 2)))
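
# Quick agreement check (my addition): the TF and NumPy versions should
# return the same value, e.g. 0.5 for constant errors of 0.5:
print(RMSE(tf.constant([1.0, 2.0]), tf.constant([1.5, 2.5])).numpy())  # 0.5
print(RMSE_fn([1.0, 2.0], [1.5, 2.5]))                                 # 0.5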


def build_model():
    inp = Input(shape=(12, 24, 72, 4))

    # Collapse the variable, longitude, latitude and month axes in turn
    # with Dense(1) layers, leaving a single scalar feature per sample
    x_4 = Dense(1, activation='relu')(inp)
    x_3 = Dense(1, activation='relu')(tf.reshape(x_4, [-1, 12, 24, 72]))
    x_2 = Dense(1, activation='relu')(tf.reshape(x_3, [-1, 12, 24]))
    x_1 = Dense(1, activation='relu')(tf.reshape(x_2, [-1, 12]))

    x = Dense(64, activation='relu')(x_1)
    x = Dropout(0.25)(x)
    x = Dense(32, activation='relu')(x)
    x = Dropout(0.25)(x)
    output = Dense(24, activation='linear')(x)
    model = Model(inputs=inp, outputs=output)

    adam = tf.optimizers.Adam(learning_rate=1e-3, beta_1=0.99, beta_2=0.99)
    model.compile(optimizer=adam, loss=RMSE)

    return model
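

# The post's title lists "fix the random seed" as an improvement idea, but
# the code never does it. A minimal sketch (standard Python/NumPy/TF
# seeding; full GPU determinism may need extra flags such as
# TF_DETERMINISTIC_OPS, which I have not verified here):
import os
import random

def seed_everything(seed=2021):
    os.environ['PYTHONHASHSEED'] = str(seed)
    random.seed(seed)
    np.random.seed(seed)
    tf.random.set_seed(seed)

seed_everything(2021)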


### Training features: the first 12 months of each 36-month sample,
### stacking sst/t300/ua/va as 4 channels
tr_features = np.concatenate([nc_sst[:, :12, :, :].reshape(-1, 12, 24, 72, 1), nc_t300[:, :12, :, :].reshape(-1, 12, 24, 72, 1), \
                              nc_ua[:, :12, :, :].reshape(-1, 12, 24, 72, 1), nc_va[:, :12, :, :].reshape(-1, 12, 24, 72, 1)], axis=-1)

### Training labels: the last 24 months of the Nino index
tr_labels = tr_nc_labels[:, 12:]
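
# Shape check (my addition): with the 100 SODA samples this should print
# (100, 12, 24, 72, 4) for the features and (100, 24) for the labels
print(tr_features.shape, tr_labels.shape)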

### Train/validation split (80/20)
tr_len = int(tr_features.shape[0] * 0.8)
tr_fea = tr_features[:tr_len, :].copy()
tr_label = tr_labels[:tr_len, :].copy()
val_len = tr_features.shape[0] - tr_len
val_fea = tr_features[tr_len:, :].copy()
val_label = tr_labels[tr_len:, :].copy()

#### Build the model
model_mlp = build_model()
#### Where the model weights are saved
model_weights = './model_mlp_baseline.h5'

checkpoint = ModelCheckpoint(model_weights, monitor='val_loss', verbose=0, save_best_only=True, mode='min',
                             save_weights_only=True)

plateau = ReduceLROnPlateau(monitor='val_loss', factor=0.5, patience=5, verbose=1, min_delta=1e-4, mode='min')
early_stopping = EarlyStopping(monitor="val_loss", patience=20)
history = model_mlp.fit(tr_fea, tr_label,
                        validation_data=(val_fea, val_label),
                        batch_size=4096, epochs=200,
                        callbacks=[plateau, checkpoint, early_stopping],
                        verbose=2)
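
# matplotlib is imported above but never used; a small addition that plots
# the loss curves Keras recorded during fit():
plt.plot(history.history['loss'], label='train')
plt.plot(history.history['val_loss'], label='val')
plt.xlabel('epoch')
plt.ylabel('RMSE loss')
plt.legend()
plt.show()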

# Note: model_mlp holds the final-epoch weights here; to evaluate the
# checkpointed best model, call model_mlp.load_weights(model_weights) first
prediction = model_mlp.predict(val_fea)

from sklearn.metrics import mean_squared_error


def rmse(y_true, y_preds):
    return np.sqrt(mean_squared_error(y_pred=y_preds, y_true=y_true))


def score(y_true, y_preds):
    # Competition metric: 2/3 * sum_i a[i] * ln(i+1) * cor_i - sum_i rmse_i,
    # where cor_i is the Pearson correlation at lead month i and the
    # weights a grow with lead time
    accskill_score = 0
    rmse_scores = 0
    a = [1.5] * 4 + [2] * 7 + [3] * 7 + [4] * 6
    y_true_mean = np.mean(y_true, axis=0)
    y_pred_mean = np.mean(y_preds, axis=0)

    for i in range(24):
        # fenzi/fenmu: numerator/denominator of the correlation at month i
        fenzi = np.sum((y_true[:, i] - y_true_mean[i]) * (y_preds[:, i] - y_pred_mean[i]))
        fenmu = np.sqrt(np.sum((y_true[:, i] - y_true_mean[i]) ** 2) * np.sum((y_preds[:, i] - y_pred_mean[i]) ** 2))
        cor_i = fenzi / fenmu

        accskill_score += a[i] * np.log(i + 1) * cor_i
        rmse_scores += rmse(y_true[:, i], y_preds[:, i])

    return 2 / 3.0 * accskill_score - rmse_scores


print('score', score(y_true=val_label, y_preds=prediction))

Run the two scripts above to produce the model file model_mlp_baseline.h5 locally, then place it where the script below expects to find it. Note that load_weights only works if the architecture built here matches the one that was trained, so to use the deeper swish network below, apply the same changes to build_model in the training script first.

Model prediction & submitting results

import tensorflow as tf
import tensorflow.keras.backend as K
from tensorflow.keras.layers import *
from tensorflow.keras.models import *
from tensorflow.keras.optimizers import *
from tensorflow.keras.callbacks import *
from tensorflow.keras.layers import Input 
import numpy as np
import os
import zipfile

def RMSE(y_true, y_pred):
    return tf.sqrt(tf.reduce_mean(tf.square(y_true - y_pred)))

def build_model():
    inp = Input(shape=(12, 24, 72, 4))

    # Same axis-collapsing front end as the baseline, but with swish
    # activations instead of relu
    x_4 = Dense(1, activation='swish')(inp)
    x_3 = Dense(1, activation='swish')(tf.reshape(x_4, [-1, 12, 24, 72]))
    x_2 = Dense(1, activation='swish')(tf.reshape(x_3, [-1, 12, 24]))
    x_1 = Dense(1, activation='swish')(tf.reshape(x_2, [-1, 12]))

    # Deeper and wider head: four 512-unit layers in place of the
    # baseline's 64/32
    x = Dense(512, activation='swish')(x_1)
    x = Dropout(0.25)(x)
    x = Dense(512, activation='swish')(x)
    x = Dropout(0.25)(x)
    x = Dense(512, activation='swish')(x)
    x = Dropout(0.25)(x)
    x = Dense(512, activation='swish')(x)
    x = Dropout(0.25)(x)
    output = Dense(24, activation='linear')(x)
    model = Model(inputs=inp, outputs=output)

    adam = tf.optimizers.Adam(learning_rate=1e-3, beta_1=0.99, beta_2=0.99)
    model.compile(optimizer=adam, loss=RMSE)

    return model

model = build_model()
model.load_weights('./user_data/model_data/model_mlp_baseline.h5')

test_path = './tcdata/enso_round1_test_20210201/'

### 1. Load the test data
files = os.listdir(test_path)
test_feas_dict = {}
for file in files:
    test_feas_dict[file] = np.load(test_path + file)
    
### 2. Predict
test_predicts_dict = {}
for file_name, val in test_feas_dict.items():
    # assumes each .npy file already carries a batch axis; if a file is a
    # single (12, 24, 72, 4) sample, add one first: val[np.newaxis, ...]
    test_predicts_dict[file_name] = model.predict(val).reshape(-1, )

### 3. Save the predictions
for file_name,val in test_predicts_dict.items(): 
    np.save('./result/' + file_name,val)

# Zip up a directory ("stored", i.e. uncompressed -- ZipFile's default;
# pass compression=zipfile.ZIP_DEFLATED if you want actual compression)
def make_zip(source_dir='./result/', output_filename='result.zip'):
    zipf = zipfile.ZipFile(output_filename, 'w')
    pre_len = len(os.path.dirname(source_dir))
    for parent, dirnames, filenames in os.walk(source_dir):
        for filename in filenames:
            if '.npy' not in filename:
                continue
            pathfile = os.path.join(parent, filename)
            arcname = pathfile[pre_len:].strip(os.path.sep)  # path relative to source_dir
            zipf.write(pathfile, arcname)
    zipf.close()

make_zip()

Packaging

Build the image from the Dockerfile in the current directory, then tag and push it to the Aliyun registry (replace *** with your own namespace):

sudo docker build -t registry.cn-hangzhou.aliyuncs.com/***/weatheroceanforecasts:weatheroceanforecasts-v1.2 .

sudo docker tag aeac34af7e9d registry.cn-hangzhou.aliyuncs.com/***/weatheroceanforecasts:weatheroceanforecasts-v1.2

sudo docker push registry.cn-hangzhou.aliyuncs.com/***/weatheroceanforecasts:weatheroceanforecasts-v1.2

Reference links

https://blog.csdn.net/yichao_ding/article/details/114096881

A from-scratch walkthrough of a time-series competition!

 

 

 

 

 
