Python 代码:用 pandas 实现递归数据查找,并解决 apply 只能单线程执行的问题

def select_main_object_dg(index, max_rows,dic_rows):
    """Resolve ancestor ids for one page of ``main_object`` rows and store them.

    Reads ``max_rows`` non-deleted rows starting at offset
    ``index * max_rows``, computes an ancestor chain for every row id through
    ``process_chunk`` (fanned out over worker chunks), spreads the first five
    chain entries over the columns ``yjjgid`` .. ``wjjgid`` and appends the
    result to the ``main_object_jigou`` table.

    Args:
        index: Page number; the SQL offset is ``index * max_rows``. Also used
            in the progress messages.
        max_rows: Page size used in the SQL ``limit`` clause.
        dic_rows: Lookup frame forwarded unchanged to ``process_chunk``.

    Returns:
        None. Any exception is caught and printed, so one failing page does
        not propagate to the caller.
    """
    import os  # function-scope import: only needed to size the worker chunks

    # Bound before the ``try`` so the ``finally`` clause never touches an
    # unbound name when ``Memory(...)`` itself fails.
    memory = None
    try:
        memory = Memory(location='your_cache_directory', verbose=0)
        program_start_time = time.time()
        row_jg = pd.DataFrame()
        select_sql = ("select mo.id,mo.pid from school_info_dev.main_object mo where mo.is_deleted=0 limit %s,%s")% (index * max_rows, max_rows)
        with Session() as session:
            rows_main = pd.read_sql_query(select_sql, session.bind)
            row_jg['main_object_id'] = rows_main['id'].to_frame()
            if not rows_main.empty:
                print("第%d次:机构数(机构数)数据处理开始时间:%s" % (index, tool.time_nyr(program_start_time)))
                # Split the page into roughly one chunk per CPU. The previous
                # chunk size was the full frame length, which always produced
                # a single task and therefore no parallelism at all.
                total_rows = len(rows_main)
                worker_count = os.cpu_count() or 1
                chunk_size = max(1, -(-total_rows // worker_count))  # ceiling division
                chunks = [
                    rows_main.iloc[start:start + chunk_size].copy()
                    for start in range(0, total_rows, chunk_size)
                ]
                # Each chunk is passed as both the lookup frame and the frame
                # that receives the ``row_jg`` column, so every task returns
                # only its own rows and the concatenation below keeps the
                # original row order without duplicates.
                processed_rows = Parallel(n_jobs=-1)(
                    delayed(process_chunk)(chunk, dic_rows, chunk) for chunk in chunks
                )
                result_df = pd.concat(processed_rows)
                memory.clear()
                jigou_dic = ['yjjgid', 'ejjgid', 'sjjgid', 'sijjgid', 'wjjgid']
                print("第%d次:机构数(机构数)apply结束:%s,耗时:%.2f" % (
                index, tool.time_nyr(time.time()), (time.time() - program_start_time)))
                for i, value in enumerate(jigou_dic):
                    # A row whose lookup failed carries None instead of a
                    # list; give it an empty column value rather than letting
                    # ``len(None)`` abort the whole page.
                    row_jg[value] = [
                        x[i] if isinstance(x, list) and len(x) > i else None
                        for x in result_df['row_jg']
                    ]
                row_jg.to_sql('main_object_jigou', con=session.bind, index=False, if_exists='append', chunksize=1000)
                print("第%d次:机构数(机构数)处理结束时间:%s,耗时:%.2f" % ( index,tool.time_nyr(time.time()), (time.time() - program_start_time)))
    except Exception as e:
        # NOTE(review): the label names a different function and ``{index}``
        # prints as a set; left unchanged so existing log output stays stable.
        print('select_main_object_all',e,{index})
    finally:
        # The original author calls this to release memory, because memory
        # usage balloons after using Parallel.
        # NOTE(review): if ``Memory`` is joblib's, ``clear()`` erases the
        # on-disk cache rather than process RAM — confirm it has the
        # intended effect.
        if memory is not None:
            memory.clear()
def process_chunk(rows_main1, dic_rows, rows_main):
    """Attach the ``jigou`` result for every id as column ``row_jg``.

    Args:
        rows_main1: Frame whose ``id`` column supplies the ids to look up.
        dic_rows: Lookup frame passed to ``jigou`` as ``extra_param``.
        rows_main: Frame that receives the ``row_jg`` column; it is modified
            in place and returned.

    Returns:
        ``rows_main`` with the added ``row_jg`` column.
    """
    # The same data is deliberately passed in twice (``rows_main1`` and
    # ``rows_main``) so that combining the results later does not raise.
    ancestor_chains = rows_main1['id'].apply(jigou, extra_param=dic_rows)
    rows_main['row_jg'] = ancestor_chains
    return rows_main
# Module-level cache of id chains: jigou() appends entries to it and
# find_children() searches it before scanning the lookup frame.
children_all = []
def jigou(x,extra_param):
    """Return the id chain for ``x`` in reversed ``find_children`` order.

    ``find_children`` starts at ``x`` and follows ``pid`` links; this function
    reverses that list, so ``x`` itself comes last.

    Args:
        x: The id to look up. A missing value (``pd.isna``) yields ``None``.
        extra_param: Lookup frame with an ``id`` column, forwarded to
            ``find_children``.

    Returns:
        The reversed chain as a list, or ``None`` when ``x`` is missing or
        any error occurs (the error is printed, not raised).
    """
    try:
        if pd.isna(x):
            return None
        matching_rows = extra_param[extra_param['id'] == x]
        start_id = matching_rows['id'].values[0]
        reversed_chain = find_children(extra_param, start_id)[::-1]
        # Cache the leading part of the reversed chain, stored back in
        # find_children order so later lookups can slice it directly.
        # NOTE(review): the original comment spoke of 5 entries, but the
        # slice keeps 7 — confirm which is intended.
        cache_entry = reversed_chain[:7][::-1]
        if cache_entry not in children_all:
            children_all.append(cache_entry)
        return reversed_chain
    except Exception as e:
        print('jigou',e)
def find_children(df, parent_id):
    """Collect the chain of ids reached from ``parent_id`` via ``pid`` links.

    The module-level ``children_all`` cache is consulted first: if a cached
    chain contains ``parent_id``, the tail of that chain starting at
    ``parent_id`` is returned. Otherwise every row of ``df`` whose ``id``
    equals ``parent_id`` contributes its ``id`` followed by the chain of its
    ``pid`` (resolved recursively).

    Args:
        df: Lookup frame with ``id`` and ``pid`` columns.
        parent_id: The id at which the chain starts.

    Returns:
        A list of ids beginning with ``parent_id`` (empty when no row
        matches), or ``None`` if an error occurred (the error is printed).
    """
    try:
        # Fast path: reuse a chain that was already resolved.
        for cached_chain in children_all:
            if parent_id in cached_chain:
                start = cached_chain.index(parent_id)
                return cached_chain[start:]
        chain = []
        matching_rows = df[df['id'] == parent_id]
        for _, record in matching_rows.iterrows():
            chain.append(record['id'])
            chain.extend(find_children(df, record['pid']))
        return chain
    except Exception as e:
        print('find_children',e)
  • 0
    点赞
  • 1
    收藏
    觉得还不错? 一键收藏
  • 1
    评论
评论 1
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值