- 问题描述:
- 1、大批量数据表,字段为:shop_name、site、sku、date、quantity
- 2、日期不连贯,且有有多sku对应的日期是缺失的。
方法一:pandas、切片
test1=order_data.reset_index()
#近7天销量和
for i in test1.index:
test1[‘purchase_date’]=pd.to_datetime(test1[‘purchase_date’],format=’%Y-%m-%d’)
qiepian = test1.loc[(((test1[‘shop_name’]==test1.loc[i][‘shop_name’]))&
((test1[‘site’]==test1.loc[i][‘site’]))&
((test1[‘sku’]==test1.loc[i][‘sku’]))&
((test1.loc[i,‘purchase_date’]-test1[‘purchase_date’]).apply(lambda x : x.days)<7)&
((test1.loc[i,‘purchase_date’]-test1[‘purchase_date’]).apply(lambda x : x.days)>-1))]
day7_sum=qiepian[‘quantity’].sum()
#print(qiepian)
#print(day7_sum)
test1.loc[i,‘day7_sum’]=day7_sum
#近15天销量和
for i in test1.index:
test1[‘purchase_date’]=pd.to_datetime(test1[‘purchase_date’],format=’%Y-%m-%d’)
qiepian = test1.loc[(((test1[‘shop_name’]==test1.loc[i][‘shop_name’]))&
((test1[‘site’]==test1.loc[i][‘site’]))&
((test1[‘sku’]==test1.loc[i][‘sku’]))&
((test1.loc[i,‘purchase_date’]-test1[‘purchase_date’]).apply(lambda x : x.days)<15)&
((test1.loc[i,‘purchase_date’]-test1[‘purchase_date’]).apply(lambda x : x.days)>-1))]
day15_sum=qiepian[‘quantity’].sum()
#print(qiepian)
#print(day7_sum)
test1.loc[i,‘day15_sum’]=day15_sum
#近30天销量和
for i in test1.index:
test1[‘purchase_date’]=pd.to_datetime(test1[‘purchase_date’],format=’%Y-%m-%d’)
qiepian = test1.loc[(((test1[‘shop_name’]==test1.loc[i][‘shop_name’]))&
((test1[‘site’]==test1.loc[i][‘site’]))&
((test1[‘sku’]==test1.loc[i][‘sku’]))&
((test1.loc[i,‘purchase_date’]-test1[‘purchase_date’]).apply(lambda x : x.days)<30)&
((test1.loc[i,‘purchase_date’]-test1[‘purchase_date’]).apply(lambda x : x.days)>-1))]
day30_sum=qiepian[‘quantity’].sum()
#print(qiepian)
#print(day7_sum)
test1.loc[i,‘day30_sum’]=day30_sum
使用该方法即可得出想要的数据
*****该方法逻辑简单,但是运行超过5万条的数据就会比较慢;由于我的数据量比较大,30多万条,上千个sku,所以该方法太慢。
- 方法二:pandas、切片、for循环
在这里插入代码片
order_data_new = pd.DataFrame(columns=[‘purchase_date’,‘shop_name’,‘site’,‘sku’,‘quantity’,‘day7_sum’,‘day15_sum’,‘day30_sum’])
order_data[‘purchase_date’]=pd.to_datetime(order_data[‘purchase_date’],format=’%Y-%m-%d’)
shop=list(order_data[‘shop_name’].unique())
for a in shop:
site = list(order_data.loc[order_data[‘shop_name’]==a,‘site’].unique())
for b in site:
sku =list(order_data.loc[(order_data[‘shop_name’]==a)&(order_data[‘site’]==b),‘sku’].unique())
for c in sku:
sku_qp = order_data.loc[((order_data[‘shop_name’]==a)&(order_data[‘site’]==b)&(order_data[‘sku’]==c))].reset_index()
for i in sku_qp.index:
#对近7天数据进行切片
day7_qiepian = sku_qp.loc[(((sku_qp.loc[i,‘purchase_date’]-sku_qp[‘purchase_date’]).apply(lambda x : x.days)<7)&
((sku_qp.loc[i,‘purchase_date’]-sku_qp[‘purchase_date’]).apply(lambda x : x.days)>-1))]
day7_sum=day7_qiepian[‘quantity’].sum()
#对15天数据进行切片
day15_qiepian = sku_qp.loc[(((sku_qp.loc[i,‘purchase_date’]-sku_qp[‘purchase_date’]).apply(lambda x : x.days)<15)&
((sku_qp.loc[i,‘purchase_date’]-sku_qp[‘purchase_date’]).apply(lambda x : x.days)>-1))]
day15_sum=day15_qiepian[‘quantity’].sum()
#对30天数据进行切片
day30_qiepian = sku_qp.loc[(((sku_qp.loc[i,‘purchase_date’]-sku_qp[‘purchase_date’]).apply(lambda x : x.days)<15)&
((sku_qp.loc[i,‘purchase_date’]-sku_qp[‘purchase_date’]).apply(lambda x : x.days)>-1))]
day30_sum=day30_qiepian[‘quantity’].sum()
sku_qp.loc[i,‘day7_sum’]=day7_sum
sku_qp.loc[i,‘day15_sum’]=day15_sum
sku_qp.loc[i,‘day30_sum’]=day30_sum
order_data_new = order_data_new.append(sku_qp,ignore_index = True)
使用该方法,用每次的切片代替整个dataframe全量索引,稍微快一点。
方法三:把想要的店铺作为列表,每次循环取数,减低每次运行的时间
shop_name=list([‘飞跃美国一店’,‘开拓者美国一店’])
for i in shop_name:
#读取亚马逊销售订单明细数据
conn = pymssql.connect(serverName , userName , passWord, database=“ADS”,charset = ‘UTF-8’)
cursor = conn.cursor()
sql=“SELECT [purchase_date],convert(nvarchar(50),shop_name)[shop_name],convert(nvarchar(50),site)site,sku,sum(quantity)quantity
FROM [ADS].[dbo].[Amazon_order_data] where purchase_date>‘2021-01-01’ and shop_name =’”+str(i)+"’
group by [purchase_date],[shop_name],site,sku
order by purchase_date"
cursor.execute(sql)
amazon_order_list = cursor.fetchall()
order_data = pd.DataFrame(list(amazon_order_list),columns = [‘purchase_date’,‘shop_name’,‘site’,‘sku’,‘quantity’])
cursor.close()#关闭游标
conn.close()#关闭连接
#制作近7天、近15天、近30天销量和
order_data_new = pd.DataFrame(columns=['purchase_date','shop_name','site','sku','quantity','day7_sum','day15_sum','day30_sum'])
order_data['purchase_date']=pd.to_datetime(order_data['purchase_date'],format='%Y-%m-%d')
shop=list(order_data['shop_name'].unique())
for a in shop:
site = list(order_data.loc[order_data['shop_name']==a,'site'].unique())
for b in site:
sku =list(order_data.loc[(order_data['shop_name']==a)&(order_data['site']==b),'sku'].unique())
for c in sku:
sku_qp = order_data.loc[((order_data['shop_name']==a)&(order_data['site']==b)&(order_data['sku']==c))].reset_index()
for i in sku_qp.index:
#对近7天数据进行切片
day7_qiepian = sku_qp.loc[(((sku_qp.loc[i,'purchase_date']-sku_qp['purchase_date']).apply(lambda x : x.days)<7)&\
((sku_qp.loc[i,'purchase_date']-sku_qp['purchase_date']).apply(lambda x : x.days)>-1))]
day7_sum=day7_qiepian['quantity'].sum()
#对15天数据进行切片
day15_qiepian = sku_qp.loc[(((sku_qp.loc[i,'purchase_date']-sku_qp['purchase_date']).apply(lambda x : x.days)<15)&\
((sku_qp.loc[i,'purchase_date']-sku_qp['purchase_date']).apply(lambda x : x.days)>-1))]
day15_sum=day15_qiepian['quantity'].sum()
#对30天数据进行切片
day30_qiepian = sku_qp.loc[(((sku_qp.loc[i,'purchase_date']-sku_qp['purchase_date']).apply(lambda x : x.days)<30)&\
((sku_qp.loc[i,'purchase_date']-sku_qp['purchase_date']).apply(lambda x : x.days)>-1))]
day30_sum=day30_qiepian['quantity'].sum()
sku_qp.loc[i,'day7_sum']=day7_sum
sku_qp.loc[i,'day15_sum']=day15_sum
sku_qp.loc[i,'day30_sum']=day30_sum
order_data_new = order_data_new.append(sku_qp,ignore_index = True)
order_data_new['predict30_quantity'] = (order_data_new['day30_sum']*1/6+order_data_new['day15_sum']*2/3+order_data_new['day7_sum']*2).astype(int)
#把结果存入导DWS层数据库中
engine = create_engine("mssql+pyodbc://user_name:password@服务器IP:端口/DWS?Trusted_Connection=No&driver=ODBC+Driver+17+for+SQL+Server",fast_executemany=True)
order_data_new.to_sql('amazon_order_predict_test',engine,if_exists='append',index=False)