pandas 处理数据【两个DataFrame】要进行for 循环嵌套匹配数据

最新推荐文章于 2024-02-23 20:48:08 发布

Cincinnati_De

最新推荐文章于 2024-02-23 20:48:08 发布

阅读量4.8k

点赞数 1

分类专栏：个人日记

本文链接：https://blog.csdn.net/Cincinnati_De/article/details/108884334

版权

个人日记专栏收录该内容

142 篇文章 4 订阅

订阅专栏

需求描述：1、有两个 DataFrame：A 和 B。遍历 B 的每一行，根据 A 的三个字段（起始时间、结束时间、id）进行判断：若 B 的时间戳落在 A 的起始时间和结束时间之间，并且 a.id = b.id，则将这两条数据拼接后输出。

2、在同一组内，若干条 B 的某个字段累加起来等于 A 的某个字段。

一句话描述：A 是汇总结果，B 是明细数据，目标是搞清楚哪些 B 构成了 A。判断条件有两个：B 的时间属于 A 的时间范围；分组后 B 的累加和等于 A。

import pandas as pd

# Load the summary workbook and normalise its column types.

file_path = '/home/hadoop/liangde/2019_Project/Gaming/four_data/汇总.xlsx'
df = pd.read_excel(file_path, sheet_name=0)
# NOTE(review): the original selected a column named '号' here and then read
# df1['卡号'] a few lines later (and again in the matching loop further down),
# which raises KeyError. '卡号' is the name every reader of df1 uses, so it is
# selected here as well -- confirm against the real workbook header.
# .copy() makes df1 an independent frame, so the column assignments below do
# not write into a slice of df (pandas SettingWithCopyWarning).
df1 = df[['金额', '申请时间', '审核时间', '卡号']].copy()

df1['申请时间'] = pd.to_datetime(df1['申请时间'])
df1['审核时间'] = pd.to_datetime(df1['审核时间'])
# Drop the first character of every card number, in place, so that the later
# comparison against the detail rows sees the stripped value.
# NOTE(review): presumably the first character is a text marker added by the
# spreadsheet -- confirm.
df1['卡号'] = df1['卡号'].apply(lambda x: x[1:])

df1.info()

import os
# Load every transaction-detail workbook in the folder and normalise its types.
file_path2 = '/home/hadoop/liangde/2019_Project/Gaming/zhen_data/'
files = [file_path2 + name for name in os.listdir(file_path2)]

# Read each workbook once and concatenate in a single call. The original
# seeded an empty frame whose column list had no commas between the string
# literals (so Python joined them into ONE bogus column name) and then grew it
# with DataFrame.append, which copies the whole frame on every iteration and
# no longer exists in pandas 2.0.
frames = [pd.read_excel(path, sheet_name=0) for path in files]
if not frames:
    raise ValueError('no detail workbooks found in ' + file_path2)
df_mingxi = pd.concat(frames)

# Keep only the columns the matching step reads. reset_index() turns the
# per-workbook row labels into an 'index' column.
df_mingxi = df_mingxi[['交易卡号', '对手户名', '交易对手账卡号', '交易金额', '交易时间']].reset_index()
# Render the card number as text and cut anything after a decimal point
# (e.g. '123.0' -> '123').
df_mingxi['交易卡号'] = df_mingxi['交易卡号'].map(lambda x: str(x).split('.')[0])

# Unparseable timestamps become NaT instead of raising.
df_mingxi['交易时间'] = pd.to_datetime(df_mingxi['交易时间'], errors='coerce')

df_mingxi.info()

# Match every detail row against the summary rows.
#
# A detail row matches a summary row when the card numbers are equal and the
# transaction time lies strictly between the application time (申请时间) and
# the review time (审核时间). When several summary rows qualify, the last one
# in df1 order is used.


clomun_name = ['交易卡号','对手户名','交易对手账卡号','交易金额','交易时间','到账金额','申请时间','审核时间','卡号']

# Split the summary rows by card number once, instead of scanning all of df1
# for every detail row. sort=False keeps the groups in first-seen order and
# avoids comparing keys with each other; row order inside a group is the
# original df1 order, so .iloc[-1] below picks the same row as a full scan.
summary_by_card = {card: rows for card, rows in df1.groupby('卡号', sort=False)}

list1 = []
for count, (_, row) in enumerate(df_mingxi.iterrows(), start=1):
    print(count)  # progress indicator: number of detail rows visited so far

    candidates = summary_by_card.get(row['交易卡号'])
    if candidates is None:
        continue

    trade_time = row['交易时间']
    # A NaT trade_time makes both comparisons False, so such rows never match.
    in_window = candidates[
        (candidates['审核时间'] > trade_time) & (candidates['申请时间'] < trade_time)
    ]
    if in_window.empty:
        continue

    # One output record = the detail row followed by the chosen summary row.
    list1.append(pd.concat([row, in_window.iloc[-1]], axis=0))

# Build the result in one step (DataFrame.append was removed in pandas 2.0).
# The expected columns come first; any other column carried by the matched
# records follows in the order it appeared.
df_match = pd.DataFrame(list1)
extra_columns = [name for name in df_match.columns if name not in clomun_name]
df_match = df_match.reindex(columns=clomun_name + extra_columns)

df_match.info()

# 'index' is the helper column created by reset_index() on the detail frame.
# errors='ignore' keeps an empty result from raising KeyError.
df_match.drop(columns='index', errors='ignore', inplace=True)
df_match.to_csv('./0930_new.csv', index=False)
##################### filtering done #################################

################ Compute step ####################
# Group the matched rows by two of their time columns and, for each group,
# work out which transaction amounts add up to the group's summary amount.
#
# NOTE(review): judge_sum is defined further down in this file. Run
# top-to-bottom as a single script, the call below raises NameError;
# presumably the original was a notebook whose cells ran in another order.
# NOTE(review): the fields are addressed by position (3, 4, 6, 8), which
# matches the column order of the header written below, and the input path
# differs from the './0930_new.csv' written by the matching step -- confirm
# the input file really has this layout.
from collections import defaultdict

d = defaultdict(list)

file_path = '/home/hadoop/liangde/2019_Project/Gaming/0930_new.csv'


def _amount(line, position):
    """Return field number *position* of a comma-separated *line* as a float.

    Surrounding whitespace and double quotes are removed first, so quoted and
    unquoted amounts are parsed the same way for every field.
    """
    return float(line.split(',')[position].strip().strip('"'))


with open(file_path, 'r', encoding='utf-8') as rf:
    rf.readline()  # skip the header row
    for line in rf:
        if not line.strip():
            continue  # a blank line has no fields to group on
        fields = line.split(',')
        key = fields[6].strip() + '-' + fields[8].strip()
        d[key].append(line)

with open('./match_0930.csv', 'w', encoding='utf-8') as wf:
    title = ','.join( ['交易卡号','交易对手账卡号','交易时间','交易金额','到账金额','卡号','审核时间','对手户名','申请时间'])
    wf.write(title)
    wf.write('\n')
    for lines in d.values():
        # Every row of a group carries the same summary amount; read it once.
        crash_no = _amount(lines[0], 4)

        if len(lines) > 20:
            # The subset search grows exponentially with the group size.
            print("复杂度:",len(lines),'太高了不做了')
            continue

        if len(lines) == 1:
            if _amount(lines[0], 3) == crash_no:
                wf.write(lines[0])
                wf.write('\n')
            continue

        # Transaction amounts of the group, in file order.
        lista = [_amount(x, 3) for x in lines]
        match_index = judge_sum(lista, crash_no)
        if not match_index:
            continue

        if len(match_index) > 1:
            # Several different subsets reach the amount: report them on
            # stdout only, nothing is written for an ambiguous group.
            print('多种--------------》')
            print(match_index)
        else:
            # Exactly one subset: write its rows, then a blank separator line.
            for index in match_index[0]:
                wf.write(lines[index])
            wf.write('\n')

import itertools

def m_sum(list_index_comb,list1,cash_no,match_tuples_list):
    """Collect the index combinations whose amounts add up to ``cash_no``.

    Args:
        list_index_comb: iterable of index tuples into ``list1``.
        list1: sequence of amounts.
        cash_no: the total a combination has to reach.
        match_tuples_list: list the matching tuples are appended to; it is
            modified in place.

    Returns:
        The same ``match_tuples_list`` object, with the matches appended.
    """
    import math

    for tuples in list_index_comb:
        total = sum(list1[position] for position in tuples)
        # The amounts are floats, so an exact ``==`` rejects sums that differ
        # from the target only by rounding error (0.1 + 0.2 != 0.3).
        if math.isclose(total, cash_no, rel_tol=1e-9, abs_tol=1e-9):
            match_tuples_list.append(tuples)
    return match_tuples_list
        
         
def match_list(list1,cash_no):
    """Find every proper subset of ``list1`` whose amounts total ``cash_no``.

    Index combinations of size 1 up to ``len(list1) - 1`` are generated and
    handed to ``m_sum``; the combination that contains every index is not
    tried here.

    Returns:
        A list of index tuples, smallest combinations first. It is empty when
        nothing matches or when ``list1`` has fewer than two elements.
    """
    found = []
    positions = list(range(len(list1)))
    for size in range(1, len(positions)):
        candidates = list(itertools.combinations(positions, size))
        found = m_sum(candidates, list1, cash_no, found)
    return found


   #print(match_tuples_list)
# 输入  交易金额的 列表  和 提现金额 的账目
# match_list(list1,8)




# 分组求和 交易金额 小于 提现金额  直接过滤掉该组


def judge_sum(trade_list, cash_no):
    """Work out which entries of ``trade_list`` add up to ``cash_no``.

    Args:
        trade_list: list of transaction amounts.
        cash_no: the amount the transactions have to add up to.

    Returns:
        A list of index tuples into ``trade_list``: a single tuple holding
        every index when the whole list totals ``cash_no``, otherwise whatever
        ``match_list`` finds. ``None`` when the total is below ``cash_no`` or
        no combination matches.
    """
    import math

    list_sum = sum(trade_list)

    # Compare with a tolerance: the amounts are floats, and with an exact
    # ``==`` a total such as 0.1 + 0.2 counts as "greater than" 0.3, so a
    # group that matches as a whole ends up reported as unmatched.
    if math.isclose(list_sum, cash_no, rel_tol=1e-9, abs_tol=1e-9):
        print(list_sum)
        print(cash_no)
        print("输入的明细总和 %s 等于 交易金额 %s "% (list_sum,cash_no))
        return [tuple(range(len(trade_list)))]

    if list_sum > cash_no:
        # Too much in total: look for a smaller combination that fits.
        # Each element of the result stands for one possible combination.
        print('-----排列组合计算------>')
        match_index_list = match_list(trade_list,cash_no)

        if match_index_list:
            print("等于 交易金额 %s "% (cash_no))
            return match_index_list

        print('没有匹配到')
        return None

    # The total is below the target, so the group is reported and skipped.
    print( '明细总金额为%s' %(list_sum))
    print( '交易金额为%s' %(cash_no))
    print('明细总金额小于交易金额 ， 差 %s  钱 ！'%(cash_no-list_sum))
    return None
    
    

# lista = [10540.0, 4000.0, 930.0, 5500.0]

# lista = [10000.0,400.0, 30.0,20.0,]
# print(judge_sum(lista,10430.0))

Cincinnati_De

关注

1
点赞
踩
22

收藏

觉得还不错? 一键收藏
2
评论
pandas 处理数据【两个DataFrame】要进行for 循环嵌套匹配数据

需求描述: 1、有两个 DataFrame A和 B ，遍历 B DataFrame 通过A 的三个字段起始时间和结束时间， id 进行判断，若B 的时间戳在 A 的起始和结束时间范围内，并且 a.id = b.id 则将两条数据拼接输出。 2、B的某字段和加起来等于 A 的某字段。一句话描述就是 A 是汇总结果，B 是明细数据，搞清楚哪些B 构成了A 。判断条件是属于时间范围、分组后的累加和等于A。import pandas...
复制链接

扫一扫