由于兴趣爱好参加CCF大数据与计算智能大赛(2020),遇到题目为“基于买方意向的货物撮合交易”,本人用python中的pandas库处理此问题,最后结果还可以,但是运行时间太长了,所以请教如何对代码进行优化?
比赛的网址为:https://www.datafountain.cn/competitions/468
import pandas as pd
buyer = pd.read_csv(r'\buyer.csv', header=0, encoding='ANSI', low_memory=False)
seller = pd.read_csv(r'\seller.csv', header=0, encoding='ANSI', low_memory=False)
output = pd.read_excel(r'\output.xlsx', header=0)
# 把表分成两类:SR和CF:
buyer_SR = buyer[buyer['品种'] == 'SR']
buyer_CF = buyer[buyer['品种'] == 'CF']
seller_SR = seller[seller['品种'] == 'SR']
seller_CF = seller[seller['品种'] == 'CF']
# 统计买方第一意向
x = pd.DataFrame(buyer_SR)
x['第一志愿和值'] = buyer_SR['第一意向'] + buyer_SR['值']
buyer_first_unique_SR = x.loc[x.duplicated(subset='第一志愿和值') == False, '第一志愿和值'].values
re = filter(lambda x: x == x, buyer_first_unique_SR)
buyer_first_unique_SR = list(re)
buyer_first_unique_SR.sort()
y = pd.DataFrame(buyer_CF)
y['第一志愿和值'] = buyer_CF['第一意向'] + buyer_CF['值']
buyer_first_unique_CF = y.loc[y.duplicated(subset='第一志愿和值') == False, '第一志愿和值'].values
re = filter(lambda x: x == x, buyer_first_unique_CF)
buyer_first_unique_CF = list(re)
buyer_first_unique_CF.sort()
re2 = filter(lambda x: x[:2] == '产地', buyer_first_unique_CF)
buyer_first_unique_1 = list(re2)
re3 = filter(lambda x: x[:2] != '产地', buyer_first_unique_CF)
buyer_first_unique_2 = list(re3)
buyer_first_unique_CF = buyer_first_unique_2 + buyer_first_unique_1
# 建立一个一个的第一意向单元
names = locals()
for i in range(len(buyer_first_unique_SR)):
names['buyer_unique_SR%s' % i] = buyer_SR[
(buyer_SR['第一意向'] == buyer_first_unique_SR[i][:2]) & (buyer_SR['值'] == str(buyer_first_unique_SR[i])[2:])]
names = locals()
for i in range(len(buyer_first_unique_CF)):
names['buyer_unique_CF%s' % i] = buyer_CF[
(buyer_CF['第一意向'] == buyer_first_unique_CF[i][:2]) & (buyer_CF['值'] == str(buyer_first_unique_CF[i])[2:])]
# 建立判断SR的每个意向单元格对应的seller的货物总数
seller_SR_warehouse_sum = seller_SR.groupby('仓库')['货物数量(张)'].sum()
seller_SR_brand_sum = seller_SR.groupby('品牌')['货物数量(张)'].sum()
seller_SR_year_sum = seller_SR.groupby('年度')['货物数量(张)'].sum()
seller_SR_class_sum = seller_SR.groupby('等级')['货物数量(张)'].sum()
def search_seller_SR_num(column_index_str, index_value_str):
sum_buyers = 0
if column_index_str == '仓库':
sum_buyers = seller_SR_warehouse_sum[seller_SR_warehouse_sum.index == int(index_value_str)].values
elif column_index_str == '品牌':
sum_buyers = seller_SR_brand_sum[seller_SR_brand_sum.index == index_value_str].values
elif column_index_str == '年度':
sum_buyers = seller_SR_year_sum[seller_SR_year_sum.index == int(index_value_str)].values
elif column_index_str == '等级':
sum_buyers = seller_SR_class_sum[seller_SR_class_sum.index == int(index_value_str)].values
return sum_buyers
# 建立判断CF的每个意向单元格对应的seller的货物总数
seller_CF_warehouse_sum = seller_CF.groupby('仓库')['货物数量(张)'].sum()
seller_CF_area_sum = seller_CF.groupby('产地')['货物数量(张)'].sum()
seller_CF_year_sum = seller_CF.groupby('年度')['货物数量(张)'].sum()
seller_CF_class_sum = seller_CF.groupby('等级')['货物数量(张)'].sum()
seller_CF_category_sum = seller_CF.groupby('类别')['货物数量(张)'].sum()
def search_seller_CF_num(column_index_str, index_value_str):
sum_buyers = 0
if column_index_str == '仓库':
sum_buyers = seller_CF_warehouse_sum[seller_CF_warehouse_sum.index == int(index_value_str)].values
elif column_index_str == '产地':
sum_buyers = seller_CF_area_sum[seller_CF_area_sum.index == index_value_str].values
elif column_index_str == '年度':
sum_buyers = seller_CF_year_sum[seller_CF_year_sum.index == int(index_value_str)].values
elif column_index_str == '等级':
sum_buyers = seller_CF_class_sum[seller_CF_class_sum.index == index_value_str].values
elif column_index_str == '类别':
sum_buyers = seller_CF_category_sum[seller_CF_category_sum.index == index_value_str].values
return sum_buyers
# 函数是为了修改取货后的seller表:
def cut_test(seller_goods_cache):
return 0
# 函数为了统计当前交易数据所满足的意向:
def check_satisfy(x, check_information2):
satisfy_str = ''
for i in range(0, len(check_information2), 2):
if x[check_information2[i]] == check_information2[i + 1] and satisfy_str == '':
satisfy_str = satisfy_str + str(int((i + 2) / 2))
elif x[check_information2[i]] == check_information2[i + 1] and satisfy_str != '':
satisfy_str = '-'.join([satisfy_str, str(int((i + 2) / 2))])
if satisfy_str == '':
satisfy_str = '0'
return satisfy_str
# ________________________________________________________对SR进行操作_________________________________________________
# 当SR意向单元格的所有货物都能被第一意向seller_SR满足:
def pick_SR_1(buyer_information):
global output
check_information = [x for x in buyer_information[4:14] if x == x]
for i in range(0, len(check_information), 2):
if check_information[i] != '品牌' and check_information[i] != '等级':
check_information[i + 1] = int(check_information[i + 1])
seller_pick_goods = seller[(seller['品种'] == 'SR') & (seller['货物数量(张)'] != 0)]
for i in range(0, len(check_information), 2):
# 这里要加个判断!每个意向加进来,筛选后的数量必须>=需求量,否则停止筛选!
if seller_pick_goods[(seller_pick_goods[check_information[i]] == check_information[i + 1])]['货物数量(张)'].sum() >= \
buyer_information[2]:
seller_pick_goods = seller_pick_goods[(seller_pick_goods[check_information[i]] == check_information[i + 1])]
# 得到的表,按照仓库排序,使货物尽量从一个仓库出货
start_pick_seller = seller_pick_goods.sort_values(by=['仓库', '货物数量(张)'], ascending=False)
start_pick_seller['累计货物数量(张)'] = start_pick_seller['货物数量(张)'].cumsum()
start_pick_seller['是否拣选'] = 1
start_pick_seller['是否拣选'].where(start_pick_seller['累计货物数量(张)'] <= buyer_information[2], 0, inplace=True)
final_pick_goods_pd = start_pick_seller[start_pick_seller['是否拣选'] == 1]
# 还得判断下一个是否加进来
if len(final_pick_goods_pd) != 0 and final_pick_goods_pd['累计货物数量(张)'].values[-1] < buyer_information[2]:
final_pick_goods_pd = final_pick_goods_pd.append(start_pick_seller.iloc[len(final_pick_goods_pd), :])
elif len(final_pick_goods_pd) == 0:
final_pick_goods_pd = final_pick_goods_pd.append(start_pick_seller.iloc[0, :])
# 取走的卖方的Index,这个index 还是最大的seller的index,不是seller_SR的index。
final_pick_goods_index = final_pick_goods_pd.index
# 每个index所拿走的货物数量:
final_pick_goods = final_pick_goods_pd['货物数量(张)'].values
if len(final_pick_goods) == 1:
final_pick_goods[0] = buyer_information[2]
else:
final_pick_goods[-1] = buyer_information[2] - final_pick_goods_pd['累计货物数量(张)'].values[-2]
# 拿走对应的seller的货:
judge_binary_list = seller.index.isin(final_pick_goods_index)
last_pick_goods_number = seller.loc[final_pick_goods_index[-1], '货物数量(张)']
seller.loc[judge_binary_list, '货物数量(张)'] = seller.loc[judge_binary_list, '货物数量(张)'].apply(cut_test)
seller.loc[final_pick_goods_index[-1], '货物数量(张)'] = last_pick_goods_number - final_pick_goods[-1]
# 记录满足意向的情况,并且转化成字符串的形式:
final_pick_goods_satisfy = final_pick_goods_pd.apply(check_satisfy, args=(check_information,), axis=1).tolist()
# 每次跑完一条买方数据,都要保存在Output里面:
len_of_final_index = len(final_pick_goods_index)
output_append = pd.DataFrame(
{'买方客户': [buyer_information[0]] * len_of_final_index, '卖方客户': seller.iloc[final_pick_goods_index, 0].tolist(),
'品种': ['SR'] * len_of_final_index, '货物编号': seller.iloc[final_pick_goods_index, 2].tolist(),
'仓库': seller.iloc[final_pick_goods_index, 4].tolist(), '分配货物数量': final_pick_goods,
'对应意向顺序': final_pick_goods_satisfy})
output = pd.concat([output, output_append], ignore_index=True)
return final_pick_goods_index, final_pick_goods, final_pick_goods_satisfy
# 当SR意向单元格的所有货物不能被第一意向seller_SR满足:
def pick_SR_2(buyer_information):
global output
check_information = [x for x in buyer_information[4:14] if x == x]
for i in range(0, len(check_information), 2):
if check_information[i] != '品牌' and check_information[i] != '等级':
check_information[i + 1] = int(check_information[i + 1])
# 需求大于总卖方:
if buyer_information[2] > search_seller_SR_num(check_information[0], check_information[1]):
seller_pick_goods = seller[(seller['品种'] == 'SR') &
(seller[check_information[0]] == check_information[1]) & (seller['货物数量(张)'] != 0)]
# 取走的卖方所有的Index,这个index 还是最大的seller的index,不是seller_SR的index。
final_pick_goods_index_1 = seller_pick_goods.index
# 每个index所拿走的货物数量:
final_pick_goods_1 = seller_pick_goods['货物数量(张)'].values
# 拿走对应的seller的货:
judge_binary_list = seller.index.isin(final_pick_goods_index_1)
seller.loc[judge_binary_list, '货物数量(张)'] = seller.loc[judge_binary_list, '货物数量(张)'].apply(cut_test)
# 记录满足意向的情况,并且转化成字符串的形式:
final_pick_goods_satisfy_1 = seller_pick_goods.apply(check_satisfy, args=(check_information,), axis=1).tolist()
# 每次跑完一条买方数据,都要保存在Output里面:
len_of_final_index = len(final_pick_goods_index_1)
output_append = pd.DataFrame(
{'买方客户': [buyer_information[0]] * len_of_final_index,
'卖方客户': seller.iloc[final_pick_goods_index_1, 0].tolist(),
'品种': ['SR'] * len_of_final_index, '货物编号': seller.iloc[final_pick_goods_index_1, 2].tolist(),
'仓库': seller.iloc[final_pick_goods_index_1, 4].tolist(), '分配货物数量': final_pick_goods_1,
'对应意向顺序': final_pick_goods_satisfy_1})
output = pd.concat([output, output_append], ignore_index=True)
# 更新buyer买方的数量,因为其中一部分已经被seller拿走.
buyer_information_new = buyer_information.copy()
buyer_information_new[2] = buyer_information[2] - seller_pick_goods['货物数量(张)'].sum()
# 剩下未满足的需求,带入对应的SR_1函数,求index\goods\satisfy
final_pick_goods_index_2, final_pick_goods_2, final_pick_goods_satisfy_2 = pick_SR_1(buyer_information_new)
# 该买方的需求<=卖方还剩的供给:
else:
seller_pick_goods = seller[(seller['品种'] == 'SR') & (seller['货物数量(张)'] != 0)]
for i in range(0, len(check_information), 2):
# 这里要加个判断!每个意向加进来,筛选后的数量必须>=需求量,否则停止筛选!
if seller_pick_goods[(seller_pick_goods[check_information[i]] == check_information[i + 1])][
'货物数量(张)'].sum() >= \
buyer_information[2]:
seller_pick_goods = seller_pick_goods[
(seller_pick_goods[check_information[i]] == check_information[i + 1])]
# 得到的表,按照仓库排序,使货物尽量从一个仓库出货
start_pick_seller = seller_pick_goods.sort_values(by=['仓库', '货物数量(张)'], ascending=False)
start_pick_seller['累计货物数量(张)'] = start_pick_seller['货物数量(张)'].cumsum()
start_pick_seller['是否拣选'] = 1
start_pick_seller['是否拣选'].where(start_pick_seller['累计货物数量(张)'] <= buyer_information[2], 0, inplace=True)
final_pick_goods_pd = start_pick_seller[start_pick_seller['是否拣选'] == 1]
# 还得判断下一个是否加进来?
if len(final_pick_goods_pd) != 0 and final_pick_goods_pd['累计货物数量(张)'].values[-1] < buyer_information[2]:
final_pick_goods_pd = final_pick_goods_pd.append(start_pick_seller.iloc[len(final_pick_goods_pd), :])
elif len(final_pick_goods_pd) == 0:
final_pick_goods_pd = final_pick_goods_pd.append(start_pick_seller.iloc[0, :])
# 取走的卖方的Index,这个index 还是最大的seller的index,不是seller_SR的index。
final_pick_goods_index = final_pick_goods_pd.index
# 每个index所拿走的货物数量:
final_pick_goods = final_pick_goods_pd['货物数量(张)'].values
if len(final_pick_goods) == 1:
final_pick_goods[0] = buyer_information[2]
else:
final_pick_goods[-1] = buyer_information[2] - final_pick_goods_pd['累计货物数量(张)'].values[-2]
# 拿走对应的seller的货:
judge_binary_list = seller.index.isin(final_pick_goods_index)
last_pick_goods_number = seller.loc[final_pick_goods_index[-1], '货物数量(张)']
seller.loc[judge_binary_list, '货物数量(张)'] = seller.loc[judge_binary_list, '货物数量(张)'].apply(cut_test)
seller.loc[final_pick_goods_index[-1], '货物数量(张)'] = last_pick_goods_number - final_pick_goods[-1]
# 记录满足意向的情况,并且转化成字符串的形式:
final_pick_goods_satisfy = final_pick_goods_pd.apply(check_satisfy, args=(check_information,), axis=1).tolist()
# 每次跑完一条买方数据,都要保存在Output里面:
len_of_final_index = len(final_pick_goods_index)
output_append = pd.DataFrame(
{'买方客户': [buyer_information[0]] * len_of_final_index,
'卖方客户': seller.iloc[final_pick_goods_index, 0].tolist(),
'品种': ['SR'] * len_of_final_index, '货物编号': seller.iloc[final_pick_goods_index, 2].tolist(),
'仓库': seller.iloc[final_pick_goods_index, 4].tolist(), '分配货物数量': final_pick_goods,
'对应意向顺序': final_pick_goods_satisfy})
output = pd.concat([output, output_append], ignore_index=True)
return True
for p in range(len(buyer_first_unique_SR)):
if names.get('buyer_unique_SR' + str(p))[
'购买货物数量'].sum() <= search_seller_SR_num(buyer_first_unique_SR[p][:2],
buyer_first_unique_SR[p][2:]):
buyer_unique_cache = names.get('buyer_unique_SR' + str(p)).sort_values(by='购买货物数量', ascending=False)
buyer_unique_cache.apply(pick_SR_1, axis=1, result_type="expand")
else:
buyer_unique_cache = names.get('buyer_unique_SR' + str(p)).sort_values(by='平均持仓时间', ascending=False)
buyer_unique_cache.apply(pick_SR_2, axis=1, result_type="expand")
# ________________________________________________________对CF进行操作_________________________________________________
...
...
我处理该问题的思路是:
- 把CF和SR分开处理;
- 把买方数据分成两大类,有第一意向和没有任何意向;
- 有第一意向的买方数据,按照第一意向不同,分成不同的意向单元,对意向单元做标准处理;
- 处理意向单元时,先判断意向单元的所有货物能否被剩下的卖方所满足。如果满足,带入pick_goods_CF_1或pick_goods_SR_1中,如果不满足,就带入pick_goods_CF_2或pick_goods_SR_2中;
- 选货时,选择同一个仓库出货;
- 卖方的数据及时更新,这步比较耗费时间;
另外,还有一些代码没有放上去:
- 第一意向单元格如果只有第一意向的,应该在该单元格处理完其他买方数据后,集中处理;
- 处理没有第一意向的数据;
现在的情况是,笔记本跑完整套代码,需要3+h,非常耗费时间,如何解决此问题???