Preface: a record of the first competition I did seriously this year, to set a baseline for myself. I had been slacking off ever since last December, idle for far too long! The final result is not great either, but at least this serves as a record (A leaderboard: 0.8793).
The task is regression: predicting the output power of distributed photovoltaic generation. Have a look at the competition if you are interested!
Full code
import os
import random
import time
import warnings
import tqdm

import numpy as np
import pandas as pd
import polars as pl
import lightgbm as lgb
import matplotlib.pyplot as plt
from matplotlib.pyplot import plot, show, title
from scipy.signal import savgol_filter
from sklearn.metrics import mean_squared_error as mse
from sklearn.model_selection import KFold, StratifiedKFold

pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)
warnings.filterwarnings('ignore')
def seed_everything(seed=42):  # bert4torch already ships a helper like this
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    # torch.manual_seed(seed)
    # torch.cuda.manual_seed(seed)
    # torch.backends.cudnn.deterministic = True

seed_everything()
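Note that seed_everything() only pins Python/NumPy randomness; LightGBM keeps its own seeds. If bit-for-bit reproducibility matters, its seed parameters would also need fixing. The dictionary below is a hedged sketch (the parameter names come from LightGBM, the values are illustrative and not from the original run):

lgb_seed_params = {
    'seed': 42,                   # master random seed
    'bagging_seed': 42,           # row subsampling
    'feature_fraction_seed': 42,  # column subsampling
    'deterministic': True,        # trades some speed for reproducible output
}
# would be merged into the training params when calling lgb.train(...)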
Data preprocessing
train_climate = pd.read_csv('../raw_data/A榜-训练集_分布式光伏发电预测_气象变量数据.csv', encoding='gbk')
test_climate = pd.read_csv('../raw_data/A榜-测试集_分布式光伏发电预测_气象变量数据.csv', encoding='gbk')
train_info = pd.read_csv("../raw_data/A榜-训练集_分布式光伏发电预测_基本信息.csv", encoding='gbk')
test_info = pd.read_csv("../raw_data/A榜-测试集_分布式光伏发电预测_基本信息.csv", encoding='gbk')
train_target = pd.read_csv("../raw_data/A榜-训练集_分布式光伏发电预测_实际功率数据.csv", encoding='gbk')
test_target = pd.read_csv("../raw_data/A榜-测试集_分布式光伏发电预测_实际功率数据.csv", encoding='gbk')
print(train_climate.shape, train_info.shape, train_target.shape, "\n",
      test_climate.shape, test_info.shape, test_target.shape)
# original log (the second value came from train_target.shape being printed twice by mistake):
# (417312, 12) (4337, 99) (4337, 99)
# (79488, 12) (9, 5) (825, 3)
'''
Tidy the files: give every 15-minute slot (96 per day) its own timestamped row, and rename the columns.
'''
train_target.columns = ['ID', 'magnifying_power', 'date_time'] + [f'p{i}' for i in range(1, 97)]
train_target['date_time'] = pd.to_datetime(train_target['date_time'])
train_data = []
for _, row in tqdm.tqdm(train_target.iterrows(), total=len(train_target)):
    for i in range(1, 97):
        dt = row['date_time'] + pd.DateOffset(minutes=15 * (i - 1))
        train_data.append({
            'ID': row['ID'],
            'magnifying_power': row['magnifying_power'],
            'date_time': dt,
            'target': row[f'p{i}'],
        })
train_data = pd.DataFrame(train_data)
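Side note: the nested iterrows() loop is easy to follow but slow once train_target expands to roughly 417k rows. A melt-based version of the same reshape is sketched below; it is untested against the loop output (the row order will differ, and long_df/offset_min are new names introduced here):

long_df = train_target.melt(
    id_vars=['ID', 'magnifying_power', 'date_time'],
    value_vars=[f'p{i}' for i in range(1, 97)],
    var_name='slot', value_name='target')
# 'p1' -> +0 min, 'p2' -> +15 min, ..., 'p96' -> +1425 min
offset_min = (long_df['slot'].str[1:].astype(int) - 1) * 15
long_df['date_time'] = long_df['date_time'] + pd.to_timedelta(offset_min, unit='min')
long_df = long_df.drop(columns='slot').reset_index(drop=True)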
train_info.drop(['光伏用户名称', '经度', '纬度'], axis=1, inplace=True)
train_info.columns = ['ID', 'install_capacity']
train_data = train_data.merge(train_info, on='ID', how='left')
rename_map = {
'光伏用户编号': 'ID',
'时间': 'date_time',
'气压(Pa)': 'atmospheric_pressure_Pa',
'相对湿度(%)': 'rela_moisture_percent',
'云量': 'cloud_cover',
'10米风速(10m/s)': 'wind_speed_10m/s',
'10米风向(°)': 'wind_dir_10m_degree',
'温度(K)': 'temperature_K',
'辐照强度(J/m2)': 'irradiation_intensity_J/m2',
'降水(m)': 'rainfall_m',
'100m风速(100m/s)': 'wind_speed_100m/s',
'100m风向(°)': 'wind_dir_100m_degree',
}
train_climate = train_climate.rename(columns=rename_map)
train_climate['date_time'] = pd.to_datetime(train_climate['date_time'])
train_data = train_data.merge(train_climate, on=['ID', 'date_time'], how='left')
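A quick sanity check after the two left-joins helps confirm that no 15-minute slot was dropped or duplicated and shows how many slots have no matching weather record; a minimal sketch using the column names defined in rename_map above:

assert len(train_data) == len(train_target) * 96, 'duplicate (ID, date_time) keys in the climate table'
missing_climate = train_data['temperature_K'].isna().mean()
print(f'share of slots without climate data: {missing_climate:.4%}')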
'''
Apply the same processing to the test set.
'''
test_target.columns = ['ID', 'magnifying_power', 'date_time']
test_target['date_time'] = pd.to_datetime(test_target['date_time'])
for col in [f'p{i}' for i in range(1, 97)]:
    test_target[col] = -100  # placeholder: the actual power is unknown for the test set
test_data = []
for _, row in tqdm.tqdm(test_target.iterrows(), total=len(test_target)):
    for i in range(1, 97):
        dt = row['date_time'] + pd.DateOffset(minutes=15 * (i - 1))
        test_data.append({
            'ID': row['ID'],
            'magnifying_power': row['magnifying_power'],
            'date_time': dt,
            'target': row[f'p{i}'],
        })
test_data = pd.DataFrame(test_data)
test_info.drop(['光伏用户名称', '经度', '纬度'], axis=1, inplace=True)
test_info.columns = ['ID', 'install_capacity']