基于XGBOOST模型预测货物运输耗时 - Part 2 通过方差分析了解文本型变量与数值型目标变量的关系

  • 在分析数据之前,我们需要剔除异常值的影响,也就是在某个分组情况下,标准差过大(标准差越大,证明情况越不稳定),如果标准差比较小,就算是最小值和最大值差的比较大,我也认为他是一个比较平稳的波动。
  • 方差分析这个老师讲的很好:[https://www.bilibili.com/video/BV1jB4y1676T/?spm_id_from=333.788&vd_source=642d9a85cff4a726a7de10f2383987df]
    -

Step 6:Reduce Std.

  • assure data stability
# Summarise lt_pu_pod spread per (mot, country, priority, cc) group:
# std/count plus the 90th and 10th percentiles used later as outlier bounds.
agg_spec = {'lt_pu_pod': ['std', 'count',
                          lambda x: x.quantile(0.9),
                          lambda x: x.quantile(0.1)]}
grouped_data = (df_replen
                .groupby(['mot', 'country', 'priority', 'cc'])
                .agg(agg_spec)
                .reset_index())
# Flatten the MultiIndex columns that .agg produces into plain names.
grouped_data.columns = ['mot', 'country', 'priority', 'cc',
                        'std', 'count', 'quantile_90', 'quantile_10']
grouped_data
motcountrypriorityccstdcountquantile_90quantile_10
0SEAAE40ADNaN110.36010.360

3150 rows × 8 columns

# Flag outliers per (mot, country, priority, cc) group: in any "unstable" group
# (count >= 5 and std >= 8) the rows outside the 10%-90% quantile band get
# Type_ = 'Y' and are excluded from the modelling set below.
for group_i in grouped_data.index:
    group_mot = grouped_data.loc[group_i, 'mot']
    group_country = grouped_data.loc[group_i, 'country']
    group_priority = grouped_data.loc[group_i, 'priority']
    group_cc = grouped_data.loc[group_i, 'cc']

    group_std = grouped_data.loc[group_i, 'std']
    group_count = grouped_data.loc[group_i, 'count']
    group_quantile_90 = grouped_data.loc[group_i, 'quantile_90']
    group_quantile_10 = grouped_data.loc[group_i, 'quantile_10']

    if group_count >= 5 and group_std >= 8:
        # BUG FIX: the original filter matched only mot/country/priority although
        # the quantiles were computed per (mot, country, priority, cc) group, so
        # whole slices were flagged with another cc group's bounds. Match on all
        # four group keys.
        index_replen = df_replen[(df_replen['mot'] == group_mot)
                                 & (df_replen['country'] == group_country)
                                 & (df_replen['priority'] == group_priority)
                                 & (df_replen['cc'] == group_cc)
                                 & ((df_replen['lt_pu_pod'] >= group_quantile_90)
                                    | (df_replen['lt_pu_pod'] <= group_quantile_10))
                                 ].index
        # Vectorized assignment replaces the original per-row loop (same result).
        df_replen.loc[index_replen, 'Type_'] = 'Y'

# Keep only unflagged rows whose key fields are populated.
df_replen_1 = df_replen[df_replen['Type_'].isnull()]
df_replen_1 = df_replen_1[~df_replen_1['hawb'].isnull()
                          & ~df_replen_1['ts_atd'].isnull()
                          & ~df_replen_1['cc'].isnull()
                          & ~df_replen_1['weight'].isnull()].reset_index(drop=True)
df_replen_1.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 103166 entries, 0 to 103165
Data columns (total 24 columns):
 #   Column             Non-Null Count   Dtype  
---  ------             --------------   -----  
 0   hawb               103233 non-null  object 
 1   mot                103233 non-null  object 
 2   geo                103166 non-null  object 
 3   country            103166 non-null  object 
 4   shippingfrequency  103166 non-null  object 
 5   isgreen            103166 non-null  object 
 6   priority           103166 non-null  object 
 7   forwarder          103166 non-null  object 
 8   cc                 93166 non-null  object 
 9   dn_gr_status       103166 non-null  object 
 10  volume             103166 non-null  float64
 11  weight             103166 non-null  float64
 12  dn_seq_qty         103166 non-null  float64
 13  ts_pu              103166 non-null  object 
 14  ts_atd             103166 non-null  object 
 15  ts_ata             103038 non-null  object 
 16  ts_cc              102855 non-null  object 
 17  ts_pod             103166 non-null  object 
 18  ts_atd_pre_2_date  103166 non-null  object 
 19  lt_pu_pod          103166 non-null  float64
 20  lt_pu_atd          103166 non-null  float64
 21  lt_atd_ata         103166 non-null  float64
 22  data_source        103166 non-null  object 
 23  Type_              0 non-null       object 
dtypes: float64(6), object(18)
memory usage: 18.9+ MB
# Re-check group stability after outlier removal: same grouped spread summary,
# now on the cleaned frame df_replen_1.
grouped_data_1 = df_replen_1.groupby(['mot', 'country', 'priority', 'cc']).agg(
    {'lt_pu_pod': ['std', 'count',
                   lambda x: x.quantile(0.9),
                   lambda x: x.quantile(0.1)]})
grouped_data_1 = grouped_data_1.reset_index()
grouped_data_1.columns = ['mot', 'country', 'priority', 'cc',
                          'std', 'count', 'quantile_90', 'quantile_10']
grouped_data_1
motcountrypriorityccstdcountquantile_90quantile_10
0AIRER30ADNaN116.33316.333

3017 rows × 8 columns

print('Reduce std','As is:【',grouped_data['std'].mean(),'-->','】To be:【',grouped_data_1['std'].mean(),'】')
Reduce std As is:【 4.281785006069334 --> 】To be:【 2.748443864082784 】

Step 7:Research Relationship

Research the relationship between weight and lead time

# One correlation heatmap of the numeric columns per geo region.
for region in df_replen_1['geo'].unique().tolist():
    subset = df_replen_1[df_replen_1['geo'] == region]
    plt.figure(figsize=(10, 6))
    sns.heatmap(data=subset.corr(), cmap='coolwarm', annot=True, fmt=".2f", cbar=True)
    plt.title("Correlation between volume,qty,weight and lt_pu_pod mot")
    plt.xlabel("{0}".format(region))
    plt.ylabel("country")
    plt.xticks(rotation=45)
    plt.show()

在这里插入图片描述
在这里插入图片描述
在这里插入图片描述在这里插入图片描述

# Same correlation heatmap, but sliced per transport mode instead of geo.
for mode in df_replen_1['mot'].unique().tolist():
    subset = df_replen_1[df_replen_1['mot'] == mode]
    plt.figure(figsize=(10, 6))
    sns.heatmap(data=subset.corr(), cmap='coolwarm', annot=True, fmt=".2f", cbar=True)
    plt.title("Correlation between volume,qty,weight and lt_pu_pod mot")
    plt.xlabel("{0}".format(mode))
    plt.ylabel("country")
    plt.xticks(rotation=45)
    plt.show()

在这里插入图片描述在这里插入图片描述在这里插入图片描述

  • Consequently:weight&dn_seq_qty is Strong correlation relationship

  • but weight&leadtime is weak correlation relationship

# Scatter of volume vs. lt_pu_atd per transport mode, annotated with the
# correlation of the two plotted columns.
for t, group in df_replen_1.groupby('mot'):
    plt.scatter(group['volume'], group['lt_pu_atd'], label=t)
    # BUG FIX: the original used group.corr().iloc[0, 2], a positional cell that
    # does not correspond to the plotted pair; compute the correlation explicitly.
    corr = group['volume'].corr(group['lt_pu_atd'])
    # BUG FIX: the plot shows volume, not weight — label the title accordingly.
    plt.title(f"Volume vs Time ({t.title()}) (Correlation: {corr:.2f})")
    plt.xlabel("volume")
    plt.ylabel("Time")

    plt.legend()

    plt.show()

在这里插入图片描述
在这里插入图片描述
在这里插入图片描述

# Scatter of weight vs. lt_pu_pod per transport mode, annotated with the
# correlation of the two plotted columns.
for t, group in df_replen_1.groupby('mot'):
    plt.scatter(group['weight'], group['lt_pu_pod'], label=t)
    # BUG FIX: the original used group.corr().iloc[0, 2], a positional cell that
    # does not correspond to the plotted pair; compute the correlation explicitly.
    corr = group['weight'].corr(group['lt_pu_pod'])
    plt.title(f"Weight vs Time ({t.title()}) (Correlation: {corr:.2f})")
    plt.xlabel("Weight")
    plt.ylabel("Time")

    plt.legend()

    plt.show()

在这里插入图片描述
在这里插入图片描述
在这里插入图片描述

方差1、GEO与LT_PU_POD的关联性

在这里插入图片描述
在这里插入图片描述
在这里插入图片描述
在这里插入图片描述
在这里插入图片描述
在这里插入图片描述
在这里插入图片描述

在这里插入图片描述
在这里插入图片描述
在这里插入图片描述

方差分析

  • 原假设:每个GEO直接与LT_PU_POD没有关联性

在独立样本T检验中,自由度的计算方法与样本数量有关。如果你有4个geo,假设每个geo对应一个独立的样本组别,那么自由度的计算如下:

自由度 = (样本1的观测数量 - 1) + (样本2的观测数量 - 1) + … + (样本4的观测数量 - 1)

具体计算时,需要知道每个geo对应的样本数量。假设分别为n1、n2、n3、n4,则自由度为 (n1 - 1) + (n2 - 1) + (n3 - 1) + (n4 - 1)。

df_replen_1.groupby(['geo']).agg({'lt_pu_pod':['std','count','mean',lambda x:x.quantile(0.5),lambda x:x.quantile(0.9),lambda x:x.quantile(0.1)]}).reset_index()
geolt_pu_pod
stdcountmean<lambda_0><lambda_1><lambda_2>
0EA3.444474248.9408076.3517.533.98
1BrazilNaN17.8300007.837.837.83
2MX8.325913224767.9956706.349.433.87
3NE7.462291754215.75251014.3226.357.39
4PJ5.33368619.6332068.1217.435.43
from scipy.stats import f_oneway

# One-way ANOVA: does mean lt_pu_pod differ across the four geo regions?
# (Null hypothesis: geo has no relationship with lt_pu_pod.)
geo_samples = [df_replen_1.loc[df_replen_1['geo'] == g, 'lt_pu_pod']
               for g in ('AP', 'EMEA', 'LAS', 'NA')]

f_stat, p_value = f_oneway(*geo_samples)

print("单因素方差分析结果:")
print("F统计量:", f_stat)
print("p值:", p_value)
单因素方差分析结果:
F统计量: 2676.3050920291266
p值: 0.0
  • 当p值等于0时,通常表示在检验中观察到的差异极其显著。这意味着在假设检验中,得到的样本数据非常不可能出现,或者可以说观察到的差异非常显著,远远超过了我们预期的随机差异。

方差2、MOT与LT_PU_POD的关联性

from scipy.stats import f_oneway

# One-way ANOVA: does mean lt_pu_pod differ across transport modes?
mot_samples = [df_replen_1.loc[df_replen_1['mot'] == m, 'lt_pu_pod']
               for m in ('AIR', 'SEA', 'TRUCK')]

f_stat, p_value = f_oneway(*mot_samples)

print("单因素方差分析结果:")
print("F统计量:", f_stat)
print("p值:", p_value)
单因素方差分析结果:
F统计量: 42951.20078674416
p值: 0.0
df_replen_1.groupby(['mot']).agg({'lt_pu_pod':['std','count','mean',lambda x:x.quantile(0.5),lambda x:x.quantile(0.9),lambda x:x.quantile(0.1)]}).reset_index()
motlt_pu_pod
stdcountmean<lambda_0><lambda_1><lambda_2>
0SEA3.3333900397.9132566.8113.344.13
1SEA10.3070481075245.99920.5941.5213.28
2TRUCK0.90333623754.2156513.995.013.40

方差3、priority与LT_PU_POD的关联性

from scipy.stats import f_oneway

# One-way ANOVA: does mean lt_pu_pod differ across priority classes?
priority_samples = [df_replen_1.loc[df_replen_1['priority'] == p, 'lt_pu_pod']
                    for p in ('20', '40', '60', 'PPF')]

f_stat, p_value = f_oneway(*priority_samples)

print("单因素方差分析结果:")
print("F统计量:", f_stat)
print("p值:", p_value)
单因素方差分析结果:
F统计量: 6366.387676680081
p值: 0.0
df_replen_1.groupby(['priority']).agg({'lt_pu_pod':['std','count','mean',lambda x:x.quantile(0.5),lambda x:x.quantile(0.9),lambda x:x.quantile(0.1)]}).reset_index()
prioritylt_pu_pod
stdcountmean<lambda_0><lambda_1><lambda_2>
0335.207475160008.5010196.8814.324.11
1214.303070668187.9838866.8413.384.08
28111.2947881986515.00900613.2429.424.45
3TUU2.2369964835.5698144.877.803.77

方差4、cc与LT_PU_POD的关联性

df_replen_1.groupby(['cc']).agg({'lt_pu_pod':['std','count','mean',lambda x:x.quantile(0.5),lambda x:x.quantile(0.9),lambda x:x.quantile(0.1)]}).reset_index()
cclt_pu_pod
stdcountmean<lambda_0><lambda_1><lambda_2>
0AD7.705418219010.1731467.11520.4604.219
1AN6.97200419949.1716506.85017.3574.080
2BA14.73170513026.28707721.14048.35713.240
3BC7.25939329369.8190537.12019.8104.140
4BI11.435282108920.74596018.56037.4308.948
from scipy.stats import f_oneway

# Collect lt_pu_pod per cc category, draw a boxplot, then run one-way ANOVA
# across all categories at once.
cc_categories = df_replen_1['cc'].unique().tolist()
data_list = [df_replen_1.loc[df_replen_1['cc'] == c, 'lt_pu_pod']
             for c in cc_categories]

plt.figure(figsize=(25, 6))
sns.boxplot(data=data_list)
plt.xlabel('Category')
plt.ylabel('lt_pu_pod')
plt.title('Boxplot of lt_pu_pod for different categories')
plt.xticks(range(len(cc_categories)), cc_categories, rotation=45)
plt.show()

f_stat, p_value = f_oneway(*data_list)

print("F统计量:", f_stat)
print("p值:", p_value)

在这里插入图片描述

F统计量: 165.67483191021196
p值: 0.0

方差5、isgreen与LT_PU_POD的关联性

df_replen_1.groupby(['isgreen']).agg({'lt_pu_pod':['std','count','mean',lambda x:x.quantile(0.5),lambda x:x.quantile(0.9),lambda x:x.quantile(0.1)]}).reset_index()
isgreenlt_pu_pod
stdcountmean<lambda_0><lambda_1><lambda_2>
0GL16.818674581010.3401277.5520.315.05
1GL20.83661264.2100004.744.763.13
# Collect lt_pu_pod per isgreen category, draw a boxplot, then run one-way ANOVA.
# (f_oneway is already imported earlier in the notebook.)
isgreen_categories = df_replen_1['isgreen'].unique().tolist()
data_list = [df_replen_1.loc[df_replen_1['isgreen'] == c, 'lt_pu_pod']
             for c in isgreen_categories]

plt.figure(figsize=(25, 6))
sns.boxplot(data=data_list)
plt.xlabel('Category')
plt.ylabel('lt_pu_pod')
plt.title('Boxplot of lt_pu_pod for different categories')
plt.xticks(range(len(isgreen_categories)), isgreen_categories, rotation=45)
plt.show()

f_stat, p_value = f_oneway(*data_list)

print("F统计量:", f_stat)
print("p值:", p_value)

在这里插入图片描述

F统计量: 49.45782385934109
p值: 1.2062238051192215e-41

方差6、shippingfrequency与LT_PU_POD的关联性

df_replen_1.groupby(['shippingfrequency']).agg({'lt_pu_pod':['std','count','mean',lambda x:x.quantile(0.5),lambda x:x.quantile(0.9),lambda x:x.quantile(0.1)]}).reset_index()
shippingfrequencylt_pu_pod
stdcountmean<lambda_0><lambda_1><lambda_2>
0Bi-Weekly(Thu)17.66472713024.45323115.94054.189.731
1Bi-Weekly(Tue)9.3063142936.93827635.59049.5723.954
2Bi-Weekly(Wed)2.271108164.7343753.8405.483.820
3Daily6.448783353227.8900036.76010.384.400
from scipy.stats import f_oneway

# Collect lt_pu_pod per shipping frequency, draw a boxplot, then run one-way ANOVA.
freq_categories = df_replen_1['shippingfrequency'].unique().tolist()
data_list = [df_replen_1.loc[df_replen_1['shippingfrequency'] == c, 'lt_pu_pod']
             for c in freq_categories]

plt.figure(figsize=(25, 6))
sns.boxplot(data=data_list)
plt.xlabel('Category')
plt.ylabel('lt_pu_pod')
plt.title('Boxplot of lt_pu_pod for different categories')
plt.xticks(range(len(freq_categories)), freq_categories, rotation=45)
plt.show()

f_stat, p_value = f_oneway(*data_list)

print("F统计量:", f_stat)
print("p值:", p_value)

在这里插入图片描述

F统计量: 2693.5635083145007
p值: 0.0

方差7、forwarder与LT_PU_POD的关联性

# Collect lt_pu_pod per forwarder, draw a boxplot, then run one-way ANOVA.
# (f_oneway is already imported earlier in the notebook.)
forwarder_categories = df_replen_1['forwarder'].unique().tolist()
data_list = [df_replen_1.loc[df_replen_1['forwarder'] == c, 'lt_pu_pod']
             for c in forwarder_categories]

plt.figure(figsize=(25, 6))
sns.boxplot(data=data_list)
plt.xlabel('Category')
plt.ylabel('lt_pu_pod')
plt.title('Boxplot of lt_pu_pod for different categories')
plt.xticks(range(len(forwarder_categories)), forwarder_categories, rotation=45)
plt.show()

f_stat, p_value = f_oneway(*data_list)

print("F统计量:", f_stat)
print("p值:", p_value)
F统计量: 12060.165772842343
p值: 0.0
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

# Feature columns: the three grouping keys plus the two numeric columns to correlate.
cols = ['geo', 'country', 'mot', 'volume', 'lt_pu_pod']

# Work on a narrow subset so groupby().corr() only sees the relevant columns.
data_subset = df_replen_1[cols]

# Per (geo, country, mot) group, compute the pairwise correlation matrix, keep the
# 'volume' column of each little matrix, and unstack so mot becomes the columns.
# The resulting cells include volume-vs-lt_pu_pod (and the trivial volume-vs-volume).
correlation_matrix = data_subset.groupby(['geo', 'country', 'mot']).corr().loc[:, 'volume'].unstack()

# Heatmap of the per-group correlations; rows are (geo, country, metric) combinations.
plt.figure(figsize=(24, 16))
sns.heatmap(data=correlation_matrix, cmap='coolwarm', annot=True, fmt=".2f", cbar=True)
plt.title("Correlation between volume and lt_pu_pod by geo, country, and mot")
plt.xlabel("mot")
plt.ylabel("country")
plt.xticks(rotation=45)
plt.show()

在这里插入图片描述

Step 8:Research climate & lt_pu_atd

# For each closed shipment, count the holidays and abnormal-climate days between
# the window start (ts_atd_pre_2_date) and actual departure, then bucket those
# counts into the categorical features holiday_count / climate_count.
def _bin_day_count(n, unit):
    """Bucket a day count into the coarse labels used by the downstream groupbys.

    `unit` is 'hldys' or 'abnr'; labels must match the originals byte-for-byte.
    """
    if n == 0:
        return '^== 0 ' + unit
    if n == 1:
        return '^== 1 ' + unit
    if n == 2:
        return '^== 2 ' + unit
    if n <= 7:
        return '^<= 7 ' + unit
    if n <= 14:
        return '^<= 14 ' + unit
    return '^> 14 ' + unit


df_replen_2 = df_replen_1.copy()
df_replen_2['ts_pu_date'] = df_replen_2['ts_pu'].map(lambda x: pd.to_datetime(x).strftime('%Y-%m-%d'))
df_replen_open['ts_pu_date'] = df_replen_open['ts_pu'].map(lambda x: pd.to_datetime(x).strftime('%Y-%m-%d'))

for rep_index in df_replen_2.index:
    # NOTE(review): the window start is ts_atd_pre_2_date, not ts_pu itself,
    # despite the variable name — kept as in the original.
    ts_pu = pd.to_datetime(df_replen_2.loc[rep_index, 'ts_atd_pre_2_date'])
    ts_atd = pd.to_datetime(df_replen_2.loc[rep_index, 'ts_atd'])
    # Missing or inverted departure stamp: fall back to pickup + 1.5 days
    # (combines the two identical fallbacks of the original).
    if pd.isna(ts_atd) or ts_pu > ts_atd:
        ts_atd = pd.to_datetime(df_replen_2.loc[rep_index, 'ts_pu']) + datetime.timedelta(1.5)
    # Calendar rows falling inside the [ts_pu, ts_atd] window.
    df_climate_tmp = df_climate_workday[(df_climate_workday['Date'] <= ts_atd)
                                        & (df_climate_workday['Date'] >= ts_pu)]
    holiday_count = len(df_climate_tmp[df_climate_tmp['weekday_cat'] == 'holiday'])
    climate_count = len(df_climate_tmp[df_climate_tmp['Alarm_info_cat'] == 'Abnormal climate'])
    df_replen_2.loc[rep_index, 'holiday_count'] = _bin_day_count(holiday_count, 'hldys')
    df_replen_2.loc[rep_index, 'climate_count'] = _bin_day_count(climate_count, 'abnr')
# Same holiday / abnormal-climate feature engineering for the open (in-transit)
# shipments in df_replen_open.
def _bin_day_count(n, unit):
    """Bucket a day count into the coarse labels used by the downstream groupbys.

    `unit` is 'hldys' or 'abnr'; labels must match the originals byte-for-byte.
    """
    if n == 0:
        return '^== 0 ' + unit
    if n == 1:
        return '^== 1 ' + unit
    if n == 2:
        return '^== 2 ' + unit
    if n <= 7:
        return '^<= 7 ' + unit
    if n <= 14:
        return '^<= 14 ' + unit
    return '^> 14 ' + unit


for rep_index in df_replen_open.index:
    # Window start is ts_atd_pre_2_date (as in the closed-set loop above).
    ts_pu = pd.to_datetime(df_replen_open.loc[rep_index, 'ts_atd_pre_2_date'])
    ts_atd = pd.to_datetime(df_replen_open.loc[rep_index, 'ts_atd'])
    # Missing or inverted departure stamp: fall back to pickup + 1.5 days.
    if pd.isna(ts_atd) or ts_pu > ts_atd:
        ts_atd = pd.to_datetime(df_replen_open.loc[rep_index, 'ts_pu']) + datetime.timedelta(1.5)
    df_climate_tmp = df_climate_workday[(df_climate_workday['Date'] <= ts_atd)
                                        & (df_climate_workday['Date'] >= ts_pu)]
    holiday_count = len(df_climate_tmp[df_climate_tmp['weekday_cat'] == 'holiday'])
    climate_count = len(df_climate_tmp[df_climate_tmp['Alarm_info_cat'] == 'Abnormal climate'])
    df_replen_open.loc[rep_index, 'holiday_count'] = _bin_day_count(holiday_count, 'hldys')
    df_replen_open.loc[rep_index, 'climate_count'] = _bin_day_count(climate_count, 'abnr')
df_replen_2.groupby(['holiday_count']).agg({'lt_pu_atd':['std','count','mean',lambda x:x.quantile(0.5),lambda x:x.quantile(0.9),lambda x:x.quantile(0.1)]}).reset_index()
holiday_countlt_pu_atd
stdcountmean<lambda_0><lambda_1><lambda_2>
0^<= 7 hldys2.05380922044.7935934.407.542.82
1^== 0 hldys2.184254528962.6496062.125.440.44
2^== 1 hldys2.411114282552.4618882.114.610.47
3^== 2 hldys1.749583198113.2935602.905.131.75
# Per transport mode: boxplot + one-way ANOVA of lt_pu_pod across climate buckets.
for mot in df_replen_2['mot'].unique().tolist():
    mot_mask = df_replen_2['mot'] == mot
    categories = df_replen_2[mot_mask]['climate_count'].unique().tolist()
    # BUG FIX: the original selected rows by climate_count only and ignored the
    # mot filter, so every mode's boxplot/ANOVA ran on the full dataset — which
    # is why the printed F/p values were identical for all three modes.
    data_list = [df_replen_2[mot_mask & (df_replen_2['climate_count'] == category)]['lt_pu_pod']
                 for category in categories]

    plt.figure(figsize=(25, 6))
    sns.boxplot(data=data_list)
    plt.xlabel('Category')
    plt.ylabel('lt_pu_pod')
    plt.title('{0} Boxplot of lt_pu_pod for different categories'.format(mot))
    plt.xticks(range(len(categories)), categories, rotation=45)
    plt.show()

    f_stat, p_value = f_oneway(*data_list)

    print("F统计量:", f_stat)
    print("p值:", p_value)

在这里插入图片描述

F统计量: 168.19664286910245
p值: 8.91244672555718e-109

在这里插入图片描述

F统计量: 168.19664286910245
p值: 8.91244672555718e-109

在这里插入图片描述

F统计量: 168.19664286910245
p值: 8.91244672555718e-109
import matplotlib.pyplot as plt
import seaborn as sns

# Per transport mode: boxplot + one-way ANOVA of lt_pu_pod across holiday buckets.
for mot in df_replen_2['mot'].unique().tolist():
    mot_mask = df_replen_2['mot'] == mot
    categories = df_replen_2[mot_mask]['holiday_count'].unique().tolist()
    # BUG FIX: the original selected rows by holiday_count only and ignored the
    # mot filter, so each mode's boxplot/ANOVA ran on the full dataset.
    data_list = [df_replen_2[mot_mask & (df_replen_2['holiday_count'] == category)]['lt_pu_pod']
                 for category in categories]

    plt.figure(figsize=(25, 6))
    sns.boxplot(data=data_list)
    plt.xlabel('Category')
    plt.ylabel('lt_pu_pod')
    plt.title('{0} Boxplot of lt_pu_pod for different categories'.format(mot))
    plt.xticks(range(len(categories)), categories, rotation=45)
    plt.show()

    f_stat, p_value = f_oneway(*data_list)

    print("F统计量:", f_stat)
    print("p值:", p_value)

在这里插入图片描述

F统计量: 134.89417962378948
p值: 3.1763774706854116e-87

在这里插入图片描述

F统计量: 134.89417962378948
p值: 3.1763774706854116e-87

在这里插入图片描述

F统计量: 82.45295180623998
p值: 1.6609295478352026e-36
df_replen_2['ts_pu_date'] = pd.to_datetime(df_replen_2['ts_pu_date'])
df_replen_open['ts_pu_date'] = pd.to_datetime(df_replen_open['ts_pu_date'])
# Attach pickup-day calendar features (alarm category, weekday, week) to both frames.
df_replen_2 = pd.merge(df_replen_2,
                       df_climate_workday[['Date', 'Alarm_info_cat', 'weekday_cat', 'Week', 'weekday']],
                       left_on='ts_pu_date', right_on='Date', how='left'
                       ).drop('Date', axis=1).rename(columns={'天气': 'climate', '分类': 'climate_category'})
df_replen_open = pd.merge(df_replen_open,
                          df_climate_workday[['Date', 'Alarm_info_cat', 'weekday_cat', 'Week', 'weekday']],
                          left_on='ts_pu_date', right_on='Date', how='left'
                          ).drop('Date', axis=1).rename(columns={'天气': 'climate', '分类': 'climate_category'})
# BUG FIX: the lead times were computed from df_replen_1's columns and relied on
# index alignment with the freshly merged df_replen_2; derive them from
# df_replen_2's own columns so a reordering/duplicating merge cannot mis-align rows.
# NOTE(review): .astype('timedelta64[D]') is removed in pandas 2.x — kept here to
# preserve the original truncation semantics on the pandas version in use.
df_replen_2['lt_pu_ata'] = (pd.to_datetime(df_replen_2['ts_ata'])
                            - pd.to_datetime(df_replen_2['ts_pu'])).astype('timedelta64[D]').astype(float)
df_replen_2['lt_pu_atd'] = (pd.to_datetime(df_replen_2['ts_atd'])
                            - pd.to_datetime(df_replen_2['ts_pu'])).astype('timedelta64[D]').astype(float)
df_climate_workday
DateWeekmaximum_temperatureminimum_temperatureclimatewind_directioncityis_workdayis_holidayholiday_nameweekdayweekday_catdate_alarmAlarm_infoAlarm_info_catunic_versionrank_1
02022-01-01星期六0℃-9℃多云西风 2级上海01New year5holidayNaTNoneNone2023-05-18 16:25:251
12022-01-02星期日8℃-4℃多云西风 3级上海006holidayNaTNoneNone2023-05-18 16:25:251
22022-01-03星期一13℃6℃东南风 2级上海01New Year shift0holidayNaTNoneNone2023-05-18 16:25:251
32022-01-04星期二13℃9℃多云东北风 1级上海101workdayNaTNoneNone2023-05-18 16:25:251

502 rows × 17 columns


# Per transport mode: boxplot + one-way ANOVA of lt_pu_pod across pickup weekdays.
for mot in df_replen_2['mot'].unique().tolist():
    # Restrict to rows of this mode with a known weekday.
    mot_rows = df_replen_2[(df_replen_2['mot'] == mot) & (~df_replen_2['weekday'].isnull())]
    categories = mot_rows['weekday'].unique().tolist()
    data_list = [mot_rows[mot_rows['weekday'] == category]['lt_pu_pod']
                 for category in categories]

    plt.figure(figsize=(25, 6))
    sns.boxplot(data=data_list)
    plt.xlabel('Category')
    plt.ylabel('lt_pu_pod')
    plt.title('{0} Boxplot of lt_pu_pod for different categories'.format(mot))
    # BUG FIX: the original built tick labels WITHOUT the not-null weekday filter
    # used for the boxes, so labels could outnumber and mis-align with the boxes
    # whenever weekday contained NaNs; use the same category list for both.
    plt.xticks(range(len(categories)), categories, rotation=45)
    plt.show()

    f_stat, p_value = f_oneway(*data_list)

    print("F统计量:", f_stat)
    print("p值:", p_value)

在这里插入图片描述

F统计量: 127.26212162797701
p值: 4.906400437665232e-156

在这里插入图片描述

F统计量: 326.9319114756572
p值: 0.0

在这里插入图片描述

F统计量: 52.61132203829418
p值: 1.6153144700352586e-42
# Pickup→departure lead-time summary per (mot, Week).
# NOTE(review): the second column is the 'Week' key but is labelled
# 'holiday_count' — kept as-is to preserve the original output.
week_spec = {'lt_pu_atd': ['std', 'count', 'mean',
                           lambda x: x.quantile(0.5),
                           lambda x: x.quantile(0.9),
                           lambda x: x.quantile(0.1)]}
group_mot_climate_pu_atd = df_replen_2.groupby(['mot', 'Week']).agg(week_spec).reset_index()
group_mot_climate_pu_atd.columns = ['mot', 'holiday_count', 'std', 'count', 'mean',
                                    'quantile_5', 'quantile_9', 'quantile_1']
group_mot_climate_pu_atd
motholiday_countstdcountmeanquantile_5quantile_9quantile_1
0AIR星期一1.547858165441.8962162.04.00.0
1AIR星期三1.566505151212.0171952.04.00.0
2AIR星期二1.751875176812.0236982.04.00.0
3AIR星期五2.986730123852.4269682.06.00.0
# Pickup→departure lead-time summary per (mot, holiday bucket).
holiday_spec = {'lt_pu_atd': ['std', 'count', 'mean',
                              lambda x: x.quantile(0.5),
                              lambda x: x.quantile(0.9),
                              lambda x: x.quantile(0.1)]}
group_mot_climate_pu_atd = df_replen_2.groupby(['mot', 'holiday_count']).agg(holiday_spec).reset_index()
group_mot_climate_pu_atd.columns = ['mot', 'holiday_count', 'std', 'count', 'mean',
                                    'quantile_5', 'quantile_9', 'quantile_1']
group_mot_climate_pu_atd
motholiday_countstdcountmeanquantile_5quantile_9quantile_1
0AIR^<= 7 hldys1.94587817614.0403184.07.02.0
1AIR^== 0 hldys1.994225466452.0121991.05.00.0
2AIR^== 1 hldys2.298573243031.8701392.04.00.0
3AIR^== 2 hldys1.453881173302.5364112.04.01.0
4SEA^<= 7 hldys2.1249204435.3160276.07.02.0
# Pickup→departure lead-time summary per (mot, abnormal-climate bucket).
climate_spec = {'lt_pu_atd': ['std', 'count', 'mean',
                              lambda x: x.quantile(0.5),
                              lambda x: x.quantile(0.9),
                              lambda x: x.quantile(0.1)]}
group_mot_climate_pu_atd = df_replen_2.groupby(['mot', 'climate_count']).agg(climate_spec).reset_index()
group_mot_climate_pu_atd.columns = ['mot', 'climate_count', 'std', 'count', 'mean',
                                    'quantile_5', 'quantile_9', 'quantile_1']
group_mot_climate_pu_atd
motclimate_countstdcountmeanquantile_5quantile_9quantile_1
0AIR^<= 7 abnr1.18756534951.6148781.03.00.0
1AIR^== 0 abnr2.271866621672.2805512.05.00.0
2AIR^== 1 abnr1.276714121951.7785982.03.00.0
3AIR^== 2 abnr1.197433121821.7461012.03.00.0
# Combined view: lead-time summary per (mot, climate bucket, holiday bucket),
# sorted so the slowest combinations come first.
combo_spec = {'lt_pu_atd': ['std', 'count', 'mean',
                            lambda x: x.quantile(0.5),
                            lambda x: x.quantile(0.9),
                            lambda x: x.quantile(0.1)]}
group_mot_climate_pu_atd = df_replen_2.groupby(['mot', 'climate_count', 'holiday_count']).agg(combo_spec).reset_index()
group_mot_climate_pu_atd.columns = ['mot', 'climate_count', 'holiday_count', 'std', 'count', 'mean',
                                    'quantile_5', 'quantile_9', 'quantile_1']
group_mot_climate_pu_atd.sort_values('mean', ascending=False)
motclimate_countholiday_countstdcountmeanquantile_5quantile_9quantile_1
17SEA^== 0 abnr^<= 7 hldys1.8304522726.0000007.07.02.2
# Derive pickup-hour and day-of-week features on both the closed and open sets.
df_replen_2['ts_pu'] = pd.to_datetime(df_replen_2['ts_pu'])
df_replen_open['ts_pu'] = pd.to_datetime(df_replen_open['ts_pu'])
for _frame in (df_replen_2, df_replen_open):
    _frame['ts_pu_hour'] = _frame['ts_pu'].dt.hour
    _frame['dayofweek'] = _frame['ts_pu'].dt.dayofweek
# Correlation of the time-of-pickup features with both lead times, per geo region.
for region in df_replen_2['geo'].unique().tolist():
    feature_cols = ['lt_pu_pod', 'lt_pu_atd', 'ts_pu_hour', 'dayofweek']
    subset = df_replen_2.loc[df_replen_2['geo'] == region, feature_cols]
    plt.figure(figsize=(10, 6))
    sns.heatmap(data=subset.corr(), cmap='coolwarm', annot=True, fmt=".2f", cbar=True)
    plt.title("Correlation between dayofweek,ts_pu_hour and lt_pu_pod lt_pu_atd")
    plt.xlabel("{0}".format(region))
    plt.ylabel("country")
    plt.xticks(rotation=45)
    plt.show()

在这里插入图片描述

在这里插入图片描述

在这里插入图片描述

在这里插入图片描述

  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 1
    评论
评论 1
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值