- 提取年月日
def DateSplit(df, col):
"""
split the object of '2010-01-02' into year(2010), month(1) and day(2).
:param df: to operate data (type:DataFrame)
:param col: column label of date object (type:str)
:return: converted date (type: DataFrame)
"""
year, month, day = [], [], []
data = df.loc[:, col].values
df = df.drop([col], axis=1)
for i in range(data.shape[0]):
year.append(int(data[i][:4]))
month.append(int(data[i][5:7]))
day.append(int(data[i][8:]))
date = pd.DataFrame({'year': year, 'month': month, 'day': day})
result = pd.concat([date, df], axis=1)
return result
- 按年/月/日汇总取平均
首先要保证日期解析并将日期列设置为索引
select = pd.read_csv('sales_select.csv', parse_dates=['order_date'], index_col=['order_date'], usecols=['order_date', 'csl'])
然后取平均,‘D’是按天,‘M’是按月,‘Y’是按年
csl_avg_month = select.resample('Y', how='mean')
print(csl_avg_month)
# csl
#order_date
#2018-12-31 0.951209
#2019-12-31 0.958564
#2020-12-31 0.854004
- 对不同商品先分类,再按照类别进行按年/月/日平均,要用到groupby()函数
select = pd.read_csv('sales_select.csv', parse_dates=['order_date'], index_col=['order_date'], usecols=['order_date', 'replace_sku_id','csl'])
csl_avg_month = select.groupby('replace_sku_id').resample('Y', how='mean')
print(csl_avg_month)
# replace_sku_id csl
#replace_sku_id order_date
#10136343 2018-12-31 10136343 0.925223
# 2019-12-31 10136343 0.941729
# 2020-12-31 10136343 0.811877
#10136360 2018-12-31 10136360 0.932824
# 2019-12-31 10136360 0.963128
# 2020-12-31 10136360 0.844330
- 写to_excel时追加sheet
直接写to_excel会覆盖,是writer每次写完会自动关闭的原因,要改成:
writer = pd.ExcelWriter('avg_csl.xlsx')
csl_avg_y.to_excel(excel_writer=writer, sheet_name='csl_avg_year')
csl_avg_m.to_excel(excel_writer=writer, sheet_name='csl_avg_month')
writer.save()
writer.close()