5 时间特征处理

汀沿河

已于 2024-07-11 19:34:40 修改

阅读量380

点赞数 5

分类专栏： # 2比赛常用的代码　文章标签： python 数据分析 数据挖掘

于 2024-07-11 14:53:32 首次发布

本文链接：https://blog.csdn.net/qq_28611929/article/details/140352169

版权

2比赛常用的代码专栏收录该内容

5 篇文章 0 订阅

订阅专栏

https://www.kaggle.com/code/vanpatangan/orders-forecasting-challenge

1 时间特征周期性转换、one-hot编码

# Expand every date column of `all_df` into calendar parts, one-hot flags and
# cyclical (sin/cos) encodings, then drop the raw date column.
# Relies on `pd`, `np` and `all_df` (with a 'holiday_name' column) being
# defined before this snippet runs.
date_col = ['date']
for _col in date_col:
    # Parse once; unparseable values become NaT and are replaced by the
    # sentinel -1 below. A separate name is used so the list being iterated
    # (`date_col`) is not rebound to a Series.
    parsed = pd.to_datetime(all_df[_col], errors='coerce')
    all_df[_col + "_year"] = parsed.dt.year.fillna(-1)
    all_df[_col + "_month"] = parsed.dt.month.fillna(-1)
    all_df[_col + "_day"] = parsed.dt.day.fillna(-1)
    # isocalendar().week is an unsigned nullable column (UInt32), which cannot
    # represent the -1 sentinel; widen to signed Int64 before filling.
    # NOTE: the column name has no underscore (e.g. "dateweek"); it is kept
    # as-is so downstream code that references it keeps working.
    all_df[_col + "week"] = parsed.dt.isocalendar().week.astype("Int64").fillna(-1)
    all_df[_col + "_day_of_week"] = parsed.dt.dayofweek.fillna(-1)
    all_df[_col + "_day_of_year"] = parsed.dt.dayofyear.fillna(-1)

    # One-hot flags for month (1..12) and day of week (0 = Monday .. 6 = Sunday).
    # NOTE: these names are not prefixed with the column name, so a second
    # entry in `date_col` would overwrite them.
    for m in range(1, 13):
        all_df[f'month_{m}'] = (all_df[_col + "_month"] == m)

    for d in range(7):
        all_df[f'dayofweek_{d}'] = (all_df[_col + "_day_of_week"] == d)

    # Cyclical encoding: map each value onto the unit circle so that the end
    # of a period sits next to its start (December next to January, etc.).
    # The raw value must NOT be multiplied in - that would re-introduce the
    # jump at the period boundary that the encoding is meant to remove.
    # Periods use the maximum possible value (31 days, 366 days) so that the
    # last value of a period never lands on the same angle as the first.
    # NOTE(review): a year is not periodic; with period 1 the year sin/cos are
    # effectively constant (0 and 1). They are kept only so the set of output
    # columns is unchanged - consider dropping them.
    # NOTE(review): rows with the -1 sentinel are encoded like "one step
    # before the start of the period"; filter on the *_year == -1 rows if
    # missing dates need distinct treatment.
    cyclical_periods = (
        ("_year", 1),
        ("_month", 12),
        ("_day", 31),
        ("_day_of_week", 7),
        ("_day_of_year", 366),
    )
    for suffix, period in cyclical_periods:
        angle = 2 * np.pi * all_df[_col + suffix] / period
        all_df[_col + suffix + '_sin'] = np.sin(angle)
        all_df[_col + suffix + '_cos'] = np.cos(angle)

    # Label Saturdays/Sundays that are not already a named holiday as 'weekend'.
    is_weekend = all_df[_col + "_day_of_week"].isin([5, 6])
    is_plain_day = all_df['holiday_name'] == 'no_holiday'
    all_df.loc[is_weekend & is_plain_day, 'holiday_name'] = 'weekend'

    # The raw date column is no longer needed once the features exist.
    all_df.drop(_col, axis=1, inplace=True)

2 EDA

对数据有一个整体的认识：

def check(df):
    """Return a per-column summary of a DataFrame.

    Args:
        df: The DataFrame to summarise.

    Returns:
        A DataFrame with one row per column of ``df`` and the columns
        ``column`` (name), ``dtype``, ``instances`` (non-null count),
        ``unique`` (number of distinct non-null values), ``sum_null``
        (number of nulls) and ``duplicates``. ``duplicates`` is the number of
        fully duplicated *rows* in ``df`` and is therefore the same value in
        every row of the summary.
    """
    # The duplicate-row count does not depend on the column, so compute it
    # once instead of rescanning the whole frame for every column.
    duplicate_rows = df.duplicated().sum()

    summary = [
        [col, df[col].dtype, df[col].count(), df[col].nunique(),
         df[col].isnull().sum(), duplicate_rows]
        for col in df.columns]

    # Create a DataFrame from the list of per-column rows
    df_check = pd.DataFrame(summary, columns=["column", "dtype", "instances", "unique", "sum_null", "duplicates"])

    return df_check

# Print a title followed by the column summary for each dataset.
# `display` is presumably the notebook rich-output helper (not defined here).
for title, frame in (("Training Data Summary", train_df),
                     ("Test Data Summary", test_df)):
    print(title)
    display(check(frame))

3 参数选择

from sklearn.model_selection import RandomizedSearchCV, cross_val_score

# Search space for the randomized search.
# `uniform` / `randint` are presumably the scipy.stats distributions (their
# import is not shown here): uniform(loc, scale) samples from
# [loc, loc + scale] and randint(low, high) excludes `high`.
param_dist = {
    'eta': uniform(0.01, 0.1),               # learning rate in [0.01, 0.11]
    'max_depth': randint(3, 10),             # 3..9
    'subsample': uniform(0.7, 0.3),          # [0.7, 1.0]
    'colsample_bytree': uniform(0.7, 0.3),   # [0.7, 1.0]
    'n_estimators': randint(100, 500)        # 100..499
}

# scikit-learn scorers follow "greater is better", so MAPE is reported negated.
scoring = 'neg_mean_absolute_percentage_error'

# Initialize RandomizedSearchCV.
# Parallelism is given to the search (n_jobs=-1) and each XGBoost fit is kept
# single-threaded: letting both layers use every core makes the worker
# processes fight over the same CPUs and slows the whole search down.
random_search = RandomizedSearchCV(
    xgb.XGBRegressor(objective='reg:squarederror', n_jobs=1),
    param_distributions=param_dist,
    n_iter=50,
    cv=5,
    scoring=scoring,
    n_jobs=-1,
    random_state=42,
    verbose=0
)

# Fit RandomizedSearchCV
random_search.fit(X_train_scaled, y_train)

# Get the best model (already refit on the full training data by the search)
best_model = random_search.best_estimator_
# Outside the search there is no outer parallelism, so let the model use all
# cores again for the cross-validation below and for any later fit/predict.
best_model.set_params(n_jobs=-1)

# Perform cross-validation of the selected configuration on shuffled folds.
# NOTE(review): this re-uses the data the hyper-parameters were tuned on, so
# cv_mape is an optimistic estimate; use a held-out set for an unbiased one.
kf = KFold(n_splits=5, shuffle=True, random_state=42)
cv_scores = cross_val_score(best_model, X_train_scaled, y_train,
                            cv=kf, scoring=scoring)
# Flip the sign back to obtain a positive MAPE.
cv_mape = -cv_scores.mean()