基于买方意向的货物撮合交易

由于兴趣爱好参加CCF大数据与计算智能大赛(2020),遇到题目为“基于买方意向的货物撮合交易”,本人用python中的pandas库处理此问题,最后结果还可以,但是运行时间太长了,所以请教如何对代码进行优化?

比赛的网址为:https://www.datafountain.cn/competitions/468

import pandas as pd


buyer = pd.read_csv(r'\buyer.csv', header=0, encoding='ANSI', low_memory=False)
seller = pd.read_csv(r'\seller.csv', header=0, encoding='ANSI', low_memory=False)
output = pd.read_excel(r'\output.xlsx', header=0)

# 把表分成两类:SR和CF:
buyer_SR = buyer[buyer['品种'] == 'SR']
buyer_CF = buyer[buyer['品种'] == 'CF']
seller_SR = seller[seller['品种'] == 'SR']
seller_CF = seller[seller['品种'] == 'CF']

# 统计买方第一意向
x = pd.DataFrame(buyer_SR)
x['第一志愿和值'] = buyer_SR['第一意向'] + buyer_SR['值']
buyer_first_unique_SR = x.loc[x.duplicated(subset='第一志愿和值') == False, '第一志愿和值'].values
re = filter(lambda x: x == x, buyer_first_unique_SR)
buyer_first_unique_SR = list(re)
buyer_first_unique_SR.sort()

y = pd.DataFrame(buyer_CF)
y['第一志愿和值'] = buyer_CF['第一意向'] + buyer_CF['值']
buyer_first_unique_CF = y.loc[y.duplicated(subset='第一志愿和值') == False, '第一志愿和值'].values
re = filter(lambda x: x == x, buyer_first_unique_CF)
buyer_first_unique_CF = list(re)
buyer_first_unique_CF.sort()
re2 = filter(lambda x: x[:2] == '产地', buyer_first_unique_CF)
buyer_first_unique_1 = list(re2)
re3 = filter(lambda x: x[:2] != '产地', buyer_first_unique_CF)
buyer_first_unique_2 = list(re3)
buyer_first_unique_CF = buyer_first_unique_2 + buyer_first_unique_1

# 建立一个一个的第一意向单元
names = locals()
for i in range(len(buyer_first_unique_SR)):
    names['buyer_unique_SR%s' % i] = buyer_SR[
        (buyer_SR['第一意向'] == buyer_first_unique_SR[i][:2]) & (buyer_SR['值'] == str(buyer_first_unique_SR[i])[2:])]
names = locals()
for i in range(len(buyer_first_unique_CF)):
    names['buyer_unique_CF%s' % i] = buyer_CF[
        (buyer_CF['第一意向'] == buyer_first_unique_CF[i][:2]) & (buyer_CF['值'] == str(buyer_first_unique_CF[i])[2:])]

# 建立判断SR的每个意向单元格对应的seller的货物总数
seller_SR_warehouse_sum = seller_SR.groupby('仓库')['货物数量(张)'].sum()
seller_SR_brand_sum = seller_SR.groupby('品牌')['货物数量(张)'].sum()
seller_SR_year_sum = seller_SR.groupby('年度')['货物数量(张)'].sum()
seller_SR_class_sum = seller_SR.groupby('等级')['货物数量(张)'].sum()


def search_seller_SR_num(column_index_str, index_value_str):
    sum_buyers = 0
    if column_index_str == '仓库':
        sum_buyers = seller_SR_warehouse_sum[seller_SR_warehouse_sum.index == int(index_value_str)].values
    elif column_index_str == '品牌':
        sum_buyers = seller_SR_brand_sum[seller_SR_brand_sum.index == index_value_str].values
    elif column_index_str == '年度':
        sum_buyers = seller_SR_year_sum[seller_SR_year_sum.index == int(index_value_str)].values
    elif column_index_str == '等级':
        sum_buyers = seller_SR_class_sum[seller_SR_class_sum.index == int(index_value_str)].values
    return sum_buyers


# 建立判断CF的每个意向单元格对应的seller的货物总数
seller_CF_warehouse_sum = seller_CF.groupby('仓库')['货物数量(张)'].sum()
seller_CF_area_sum = seller_CF.groupby('产地')['货物数量(张)'].sum()
seller_CF_year_sum = seller_CF.groupby('年度')['货物数量(张)'].sum()
seller_CF_class_sum = seller_CF.groupby('等级')['货物数量(张)'].sum()
seller_CF_category_sum = seller_CF.groupby('类别')['货物数量(张)'].sum()


def search_seller_CF_num(column_index_str, index_value_str):
    sum_buyers = 0
    if column_index_str == '仓库':
        sum_buyers = seller_CF_warehouse_sum[seller_CF_warehouse_sum.index == int(index_value_str)].values
    elif column_index_str == '产地':
        sum_buyers = seller_CF_area_sum[seller_CF_area_sum.index == index_value_str].values
    elif column_index_str == '年度':
        sum_buyers = seller_CF_year_sum[seller_CF_year_sum.index == int(index_value_str)].values
    elif column_index_str == '等级':
        sum_buyers = seller_CF_class_sum[seller_CF_class_sum.index == index_value_str].values
    elif column_index_str == '类别':
        sum_buyers = seller_CF_category_sum[seller_CF_category_sum.index == index_value_str].values
    return sum_buyers


# 函数是为了修改取货后的seller表:
def cut_test(seller_goods_cache):
    return 0

# 函数为了统计当前交易数据所满足的意向:
def check_satisfy(x, check_information2):
    satisfy_str = ''
    for i in range(0, len(check_information2), 2):
        if x[check_information2[i]] == check_information2[i + 1] and satisfy_str == '':
            satisfy_str = satisfy_str + str(int((i + 2) / 2))
        elif x[check_information2[i]] == check_information2[i + 1] and satisfy_str != '':
            satisfy_str = '-'.join([satisfy_str, str(int((i + 2) / 2))])
    if satisfy_str == '':
        satisfy_str = '0'
    return satisfy_str


# ________________________________________________________对SR进行操作_________________________________________________
# 当SR意向单元格的所有货物都能被第一意向seller_SR满足:
def pick_SR_1(buyer_information):
    global output
    check_information = [x for x in buyer_information[4:14] if x == x]
    for i in range(0, len(check_information), 2):
        if check_information[i] != '品牌' and check_information[i] != '等级':
            check_information[i + 1] = int(check_information[i + 1])
    seller_pick_goods = seller[(seller['品种'] == 'SR') & (seller['货物数量(张)'] != 0)]
    for i in range(0, len(check_information), 2):
        # 这里要加个判断!每个意向加进来,筛选后的数量必须>=需求量,否则停止筛选!
        if seller_pick_goods[(seller_pick_goods[check_information[i]] == check_information[i + 1])]['货物数量(张)'].sum() >= \
                buyer_information[2]:
            seller_pick_goods = seller_pick_goods[(seller_pick_goods[check_information[i]] == check_information[i + 1])]
    # 得到的表,按照仓库排序,使货物尽量从一个仓库出货
    start_pick_seller = seller_pick_goods.sort_values(by=['仓库', '货物数量(张)'], ascending=False)
    start_pick_seller['累计货物数量(张)'] = start_pick_seller['货物数量(张)'].cumsum()
    start_pick_seller['是否拣选'] = 1
    start_pick_seller['是否拣选'].where(start_pick_seller['累计货物数量(张)'] <= buyer_information[2], 0, inplace=True)
    final_pick_goods_pd = start_pick_seller[start_pick_seller['是否拣选'] == 1]
    # 还得判断下一个是否加进来
    if len(final_pick_goods_pd) != 0 and final_pick_goods_pd['累计货物数量(张)'].values[-1] < buyer_information[2]:
        final_pick_goods_pd = final_pick_goods_pd.append(start_pick_seller.iloc[len(final_pick_goods_pd), :])
    elif len(final_pick_goods_pd) == 0:
        final_pick_goods_pd = final_pick_goods_pd.append(start_pick_seller.iloc[0, :])
    # 取走的卖方的Index,这个index 还是最大的seller的index,不是seller_SR的index。
    final_pick_goods_index = final_pick_goods_pd.index
    # 每个index所拿走的货物数量:
    final_pick_goods = final_pick_goods_pd['货物数量(张)'].values
    if len(final_pick_goods) == 1:
        final_pick_goods[0] = buyer_information[2]
    else:
        final_pick_goods[-1] = buyer_information[2] - final_pick_goods_pd['累计货物数量(张)'].values[-2]
    # 拿走对应的seller的货:
    judge_binary_list = seller.index.isin(final_pick_goods_index)
    last_pick_goods_number = seller.loc[final_pick_goods_index[-1], '货物数量(张)']
    seller.loc[judge_binary_list, '货物数量(张)'] = seller.loc[judge_binary_list, '货物数量(张)'].apply(cut_test)
    seller.loc[final_pick_goods_index[-1], '货物数量(张)'] = last_pick_goods_number - final_pick_goods[-1]
    # 记录满足意向的情况,并且转化成字符串的形式:
    final_pick_goods_satisfy = final_pick_goods_pd.apply(check_satisfy, args=(check_information,), axis=1).tolist()
    # 每次跑完一条买方数据,都要保存在Output里面:
    len_of_final_index = len(final_pick_goods_index)
    output_append = pd.DataFrame(
        {'买方客户': [buyer_information[0]] * len_of_final_index, '卖方客户': seller.iloc[final_pick_goods_index, 0].tolist(),
         '品种': ['SR'] * len_of_final_index, '货物编号': seller.iloc[final_pick_goods_index, 2].tolist(),
         '仓库': seller.iloc[final_pick_goods_index, 4].tolist(), '分配货物数量': final_pick_goods,
         '对应意向顺序': final_pick_goods_satisfy})
    output = pd.concat([output, output_append], ignore_index=True)
    return final_pick_goods_index, final_pick_goods, final_pick_goods_satisfy


# 当SR意向单元格的所有货物不能被第一意向seller_SR满足:
def pick_SR_2(buyer_information):
    global output
    check_information = [x for x in buyer_information[4:14] if x == x]
    for i in range(0, len(check_information), 2):
        if check_information[i] != '品牌' and check_information[i] != '等级':
            check_information[i + 1] = int(check_information[i + 1])
    # 需求大于总卖方:
    if buyer_information[2] > search_seller_SR_num(check_information[0], check_information[1]):
        seller_pick_goods = seller[(seller['品种'] == 'SR') &
                                   (seller[check_information[0]] == check_information[1]) & (seller['货物数量(张)'] != 0)]
        # 取走的卖方所有的Index,这个index 还是最大的seller的index,不是seller_SR的index。
        final_pick_goods_index_1 = seller_pick_goods.index
        # 每个index所拿走的货物数量:
        final_pick_goods_1 = seller_pick_goods['货物数量(张)'].values
        # 拿走对应的seller的货:
        judge_binary_list = seller.index.isin(final_pick_goods_index_1)
        seller.loc[judge_binary_list, '货物数量(张)'] = seller.loc[judge_binary_list, '货物数量(张)'].apply(cut_test)
        # 记录满足意向的情况,并且转化成字符串的形式:
        final_pick_goods_satisfy_1 = seller_pick_goods.apply(check_satisfy, args=(check_information,), axis=1).tolist()
        # 每次跑完一条买方数据,都要保存在Output里面:
        len_of_final_index = len(final_pick_goods_index_1)
        output_append = pd.DataFrame(
            {'买方客户': [buyer_information[0]] * len_of_final_index,
             '卖方客户': seller.iloc[final_pick_goods_index_1, 0].tolist(),
             '品种': ['SR'] * len_of_final_index, '货物编号': seller.iloc[final_pick_goods_index_1, 2].tolist(),
             '仓库': seller.iloc[final_pick_goods_index_1, 4].tolist(), '分配货物数量': final_pick_goods_1,
             '对应意向顺序': final_pick_goods_satisfy_1})
        output = pd.concat([output, output_append], ignore_index=True)
        # 更新buyer买方的数量,因为其中一部分已经被seller拿走.
        buyer_information_new = buyer_information.copy()
        buyer_information_new[2] = buyer_information[2] - seller_pick_goods['货物数量(张)'].sum()
        # 剩下未满足的需求,带入对应的SR_1函数,求index\goods\satisfy
        final_pick_goods_index_2, final_pick_goods_2, final_pick_goods_satisfy_2 = pick_SR_1(buyer_information_new)
    # 该买方的需求<=卖方还剩的供给:
    else:
        seller_pick_goods = seller[(seller['品种'] == 'SR') & (seller['货物数量(张)'] != 0)]
        for i in range(0, len(check_information), 2):
            # 这里要加个判断!每个意向加进来,筛选后的数量必须>=需求量,否则停止筛选!
            if seller_pick_goods[(seller_pick_goods[check_information[i]] == check_information[i + 1])][
                '货物数量(张)'].sum() >= \
                    buyer_information[2]:
                seller_pick_goods = seller_pick_goods[
                    (seller_pick_goods[check_information[i]] == check_information[i + 1])]
        # 得到的表,按照仓库排序,使货物尽量从一个仓库出货
        start_pick_seller = seller_pick_goods.sort_values(by=['仓库', '货物数量(张)'], ascending=False)
        start_pick_seller['累计货物数量(张)'] = start_pick_seller['货物数量(张)'].cumsum()
        start_pick_seller['是否拣选'] = 1
        start_pick_seller['是否拣选'].where(start_pick_seller['累计货物数量(张)'] <= buyer_information[2], 0, inplace=True)
        final_pick_goods_pd = start_pick_seller[start_pick_seller['是否拣选'] == 1]
        # 还得判断下一个是否加进来?
        if len(final_pick_goods_pd) != 0 and final_pick_goods_pd['累计货物数量(张)'].values[-1] < buyer_information[2]:
            final_pick_goods_pd = final_pick_goods_pd.append(start_pick_seller.iloc[len(final_pick_goods_pd), :])
        elif len(final_pick_goods_pd) == 0:
            final_pick_goods_pd = final_pick_goods_pd.append(start_pick_seller.iloc[0, :])
        # 取走的卖方的Index,这个index 还是最大的seller的index,不是seller_SR的index。
        final_pick_goods_index = final_pick_goods_pd.index
        # 每个index所拿走的货物数量:
        final_pick_goods = final_pick_goods_pd['货物数量(张)'].values
        if len(final_pick_goods) == 1:
            final_pick_goods[0] = buyer_information[2]
        else:
            final_pick_goods[-1] = buyer_information[2] - final_pick_goods_pd['累计货物数量(张)'].values[-2]
        # 拿走对应的seller的货:
        judge_binary_list = seller.index.isin(final_pick_goods_index)
        last_pick_goods_number = seller.loc[final_pick_goods_index[-1], '货物数量(张)']
        seller.loc[judge_binary_list, '货物数量(张)'] = seller.loc[judge_binary_list, '货物数量(张)'].apply(cut_test)
        seller.loc[final_pick_goods_index[-1], '货物数量(张)'] = last_pick_goods_number - final_pick_goods[-1]
        # 记录满足意向的情况,并且转化成字符串的形式:
        final_pick_goods_satisfy = final_pick_goods_pd.apply(check_satisfy, args=(check_information,), axis=1).tolist()
        # 每次跑完一条买方数据,都要保存在Output里面:
        len_of_final_index = len(final_pick_goods_index)
        output_append = pd.DataFrame(
            {'买方客户': [buyer_information[0]] * len_of_final_index,
             '卖方客户': seller.iloc[final_pick_goods_index, 0].tolist(),
             '品种': ['SR'] * len_of_final_index, '货物编号': seller.iloc[final_pick_goods_index, 2].tolist(),
             '仓库': seller.iloc[final_pick_goods_index, 4].tolist(), '分配货物数量': final_pick_goods,
             '对应意向顺序': final_pick_goods_satisfy})
        output = pd.concat([output, output_append], ignore_index=True)
    return True


for p in range(len(buyer_first_unique_SR)):
    if names.get('buyer_unique_SR' + str(p))[
        '购买货物数量'].sum() <= search_seller_SR_num(buyer_first_unique_SR[p][:2],
                                                buyer_first_unique_SR[p][2:]):
        buyer_unique_cache = names.get('buyer_unique_SR' + str(p)).sort_values(by='购买货物数量', ascending=False)
        buyer_unique_cache.apply(pick_SR_1, axis=1, result_type="expand")
    else:
        buyer_unique_cache = names.get('buyer_unique_SR' + str(p)).sort_values(by='平均持仓时间', ascending=False)
        buyer_unique_cache.apply(pick_SR_2, axis=1, result_type="expand")


# ________________________________________________________对CF进行操作_________________________________________________
...
...

我处理该问题的思路是:

  1. 把CF和SR分开处理;
  2. 把买方数据分成两大类,有第一意向和没有任何意向;
  3. 有第一意向的买方数据,按照第一意向不同,分成不同的意向单元,对意向单元做标准处理;
  4. 处理意向单元时,先判断意向单元的所有货物能否被剩下的卖方所满足。如果满足,带入pick_goods_CF_1或pick_goods_SR_1中,如果不满足,就带入pick_goods_CF_2或pick_goods_SR_2中;
  5. 选货时,选择同一个仓库出货;
  6. 卖方的数据及时更新,这步比较耗费时间; 

另外,还有一些代码没有放上去:

  • 第一意向单元格如果只有第一意向的,应该在该单元格处理完其他买方数据后,集中处理;
  • 处理没有第一意向的数据;

 

现在的情况是,笔记本跑完整套代码,需要3+h,非常耗费时间,如何解决此问题???

 

撮合交易数据库设计和 SQL 优化是一个复杂而庞大的主题,我会尽量给出一些基本的指导和建议。首先,让我们讨论数据库设计方面的考虑。 1. 数据库模式设计: - 确定关键实体和关系:分析交易过程中涉及的实体,如交易订单、买方、卖方等,并确定它们之间的关系。 - 规范化:将数据分解为更小的、无冗余的表,以减少数据冗余和更新异常。 - 考虑性能:根据查询需求和数据访问模式,优化表结构、索引和关系,以提高查询和插入/更新性能。 2. 索引设计: - 选择适当的索引:根据查询需求和数据访问模式,选择合适的索引类型(如 B 树索引、哈希索引等)。 - 考虑复合索引:如果某些查询需要多个列进行过滤或排序,可以考虑创建复合索引,以提高查询性能。 - 避免过多索引:创建过多的索引可能会增加维护成本和插入/更新性能,需要权衡索引的数量和查询性能的权衡。 3. 查询优化: - 编写有效的查询:使用正确的 JOIN、WHERE 和 ORDER BY 子句,避免全表扫描、避免不必要的数据读取。 - 避免大表查询:对于大表查询,可以考虑使用分页、缓存或其他技术进行性能优化。 - 优化查询计划:通过分析和调整查询计划,可以提高查询性能。可以使用数据库的 Explain Plan 功能来分析查询计划,并根据需要进行索引调整或重写查询。 4. 数据库性能监控和调优: - 监控数据库性能:使用数据库性能监控工具监测数据库的性能指标,如响应时间、并发连接数等。 - 调整数据库参数:根据监控数据,调整数据库的配置参数,如缓冲池大小、并发连接数等,以优化数据库性能。 当然,以上只是一些基本的指导,实际情况可能更加复杂。对于具体的需求和环境,可能需要更深入的分析和优化。希望这些指导能给你一些启示。
评论 1
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值