Python 脚本:读取 txt 文件并解析其中的 JSON 数据,核对两份文件的数据是否一致。

实际工作中用于核对数据报文格式及数据内容的脚本。
之前是直接消费 Kafka 读取数据:一个 topic 存放多个省份的数据,通过 SOURCEID 区分;目前改为直接读取本地文件夹中的文件。如有疑问欢迎私信提问。

#encoding=utf8
import json
import multiprocessing
import sys
import time


starttime=int(time.time())
def opptype(op):
    """Return the opposite operation type.

    'insert' maps to 'update', 'update' maps to 'insert',
    and any other value yields the empty string.
    """
    return {'insert': 'update', 'update': 'insert'}.get(op, '')

def getColumnsDict(infoObj, content, value):
    """Collect the column names listed under infoObj[content][value].

    Each entry is expected to be a dict with a 'col' key; the result maps
    every column name to itself (used as a membership-test lookup).
    """
    return {entry['col']: entry['col'] for entry in infoObj[content][value]}

def checkColumns(demoColumns, checkColumns):
    """Return a '|'-terminated list of columns missing from checkColumns.

    Iterates demoColumns (the reference column set) and reports every column
    not present in checkColumns, joined as 'col1|col2|'.  Returns '' when
    nothing is missing.

    NOTE(review): the second parameter shadows this function's own name;
    kept as-is so positional/keyword callers are unaffected.
    """
    # Inverted the original empty-`pass` branch and replaced the quadratic
    # `+=` string build with a single join; output is byte-identical.
    return ''.join(col + '|' for col in demoColumns if col not in checkColumns)

def checkdiff(comid, demondata, checkdata):
    """Compare each message in checkdata against the reference message demondata.

    For every message, the column set under ['content']['value'] is compared
    with the reference's; rows with missing columns are counted as diffs and
    their 'LogOffset' values are collected.  A one-line summary is printed and
    appended to '<topicinfo>.log'.

    Relies on the module-level global `topicinfo` (set in __main__) for both
    the label text and the log-file name.
    """
    if not checkdata:
        # Guard: the original crashed (UnboundLocalError on the loop variable)
        # when given an empty batch; there is nothing to report in that case.
        return

    diffoffset = ''
    samerowcnt = 0
    diffrowcnt = 0
    # Hoisted: the reference column dict does not change per row.
    demoncols = getColumnsDict(demondata, 'content', 'value')

    for x in checkdata:
        diffcolinfo = checkColumns(demoncols, getColumnsDict(x, 'content', 'value'))
        if diffcolinfo:
            diffoffset += x['LogOffset'] + '|'
            diffrowcnt += 1
        else:
            samerowcnt += 1

    # NOTE(review): `diffcolinfo` and `x` below refer to the LAST row only —
    # preserved from the original; confirm that is the intended summary.
    checkreslt='topic:'+topicinfo+'\t\tcomid:'+comid+'\t\ttype:'+x['op']+'\t\tcheck row count:'+str(samerowcnt+diffrowcnt)+'\t\t same row count:'+str(samerowcnt)+'\t\t diff row count:'+str(diffrowcnt)+'\t\tdiff column:'+diffcolinfo+'\t\tinfer demon offset:'+demondata['LogOffset']+'\n'
    print ('topic:',topicinfo,'\t\tcomid:',comid,'\t\ttype:',x['op'],'\t\tcheck row count:',str(samerowcnt+diffrowcnt),'\t\t same row count:',samerowcnt,'\t\t diff row count:',diffrowcnt,'\t\tdiff column:',diffcolinfo,'\t\tinfer demon offset:',demondata['LogOffset'])

    with open(topicinfo+'.log','a+') as f:
        f.write(checkreslt)

# Data used to be consumed directly from Kafka; it is now read line-by-line
# from local text files instead.

def getkafkadata(topicname, data_dir="C:/Users/LENOVO/Desktop/3702/"):
    """Read newline-delimited JSON messages from <data_dir><topicname>.txt.

    Messages are grouped into buckets keyed by SOURCEID+op, each bucket being
    {'rowcount': n, 'data': [msg, ...]} and capped at 4 rows.  Once both the
    'insert' and 'update' buckets for a non-reference province are full, they
    are compared (via checkdiff) against the reference province '3300' and
    removed; each province is compared at most once.

    data_dir was a hard-coded path in the original; it is now a parameter
    with the same default, so existing callers are unaffected.
    Returns the remaining kafkadata dict.
    """
    # Provinces still pending a comparison. NOTE(review): the guard below
    # excludes '3300', so only '1100' can actually trigger checkdiff here —
    # preserved from the original; confirm the intended province list.
    checkcode = ['3300', '1100']
    kafkadata = {}

    with open(data_dir + topicname + ".txt", "r", encoding="utf-8") as f:
        for ms in f:
            msfmt = json.loads(ms)
            source = msfmt['SOURCEID']
            key = source + msfmt['op']

            if key not in kafkadata:
                # First message for this SOURCEID+op: start a new bucket.
                kafkadata[key] = {'rowcount': 1, 'data': [msfmt]}
                continue

            if kafkadata[key]['rowcount'] < 4:
                # Bucket not full yet: keep accumulating.
                kafkadata[key]['data'].append(msfmt)
                kafkadata[key]['rowcount'] += 1
            else:
                # Bucket full: if the opposite-op bucket is also full, and the
                # reference buckets for '3300' exist, run the comparison once.
                opp_key = source + opptype(msfmt['op'])
                if (opp_key in kafkadata
                        and kafkadata[opp_key]['rowcount'] > 3
                        and source in checkcode and source != '3300'
                        and '3300insert' in kafkadata and '3300update' in kafkadata):
                    checkdiff(source, kafkadata['3300insert']['data'][0],
                              kafkadata[source + 'insert']['data'])
                    kafkadata.pop(source + 'insert')

                    checkdiff(source, kafkadata['3300update']['data'][0],
                              kafkadata[source + 'update']['data'])
                    kafkadata.pop(source + 'update')

                    # Province compared; never compare it again.
                    checkcode.remove(source)

    return kafkadata
	    		     
if __name__=='__main__':
    # The topic name is required: it selects the input file read by
    # getkafkadata and names the log file written by checkdiff.
    if len(sys.argv) < 2:
        # Original raised a bare IndexError here; fail with a usage message.
        sys.exit('usage: python <script>.py <topicname>')
    topicinfo = sys.argv[1]  # module-level global, read by checkdiff()
    datas = getkafkadata(topicinfo)

  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值