# Verification script for the message format and field content of production data feeds.
# Previously this consumed Kafka directly: one topic carried data for several provinces,
# distinguished by sourceID. This version reads from a local folder instead.
# Questions are welcome via direct message.
#encoding=utf8
import json
import multiprocessing
import sys
import time
starttime=int(time.time())
def opptype(op):
if op=='insert':
rst='update'
elif op=='update':
rst='insert'
else:
rst=''
return rst
def getColumnsDict(infoObj, content, value):
columns = {}
columnsList = infoObj[content][value]
for col in columnsList:
columns[col['col']] = col['col']
return columns
def checkColumns(demoColumns, checkColumns):
diffcol = ''
for col in demoColumns:
if col in checkColumns:
pass
else:
diffcol+=col+'|'
return diffcol
def checkdiff(comid,demondata,checkdata):
diffoffset=''
samerowcnt=0
diffrowcnt=0
for x in checkdata:
diffcolinfo=checkColumns(getColumnsDict(demondata,'content', 'value'),getColumnsDict(x,'content', 'value'))
if diffcolinfo:
diffoffset+=x['LogOffset']+'|'
diffrowcnt+=1
else:
samerowcnt+=1
checkreslt='topic:'+topicinfo+'\t\tcomid:'+comid+'\t\ttype:'+x['op']+'\t\tcheck row count:'+str(samerowcnt+diffrowcnt)+'\t\t same row count:'+str(samerowcnt)+'\t\t diff row count:'+str(diffrowcnt)+'\t\tdiff column:'+diffcolinfo+'\t\tinfer demon offset:'+demondata['LogOffset']+'\n'
print ('topic:',topicinfo,'\t\tcomid:',comid,'\t\ttype:',x['op'],'\t\tcheck row count:',str(samerowcnt+diffrowcnt),'\t\t same row count:',samerowcnt,'\t\t diff row count:',diffrowcnt,'\t\tdiff column:',diffcolinfo,'\t\tinfer demon offset:',demondata['LogOffset'])
with open(topicinfo+'.log','a+') as f:
f.write(checkreslt)
# Data used to be consumed from Kafka here; now it is read line by line from a local file.
def getkafkadata(topicname):
checkcode=['3300','1100']
kafkadata={}
with open("C:/Users/LENOVO/Desktop/3702/"+topicname+".txt","r",encoding="utf-8") as f:
for ms in f:
#print(type(ms))
msfmt = json.loads(ms)
#f.seek(0)
#print(msfmt['SOURCEID'])
#装数据到kafkadata
if msfmt['SOURCEID']+msfmt['op'] in kafkadata:
if kafkadata[msfmt['SOURCEID']+msfmt['op']]['rowcount']<4:
kafkadata[msfmt['SOURCEID'] + msfmt['op']]['data'].append(msfmt)
kafkadata[msfmt['SOURCEID'] + msfmt['op']]['rowcount'] += 1
else:
if msfmt['SOURCEID']+opptype(msfmt['op']) in kafkadata:
if kafkadata[msfmt['SOURCEID']+opptype(msfmt['op'])]['rowcount']>3:
#print(kafkadata[msfmt['SOURCEID']+opptype(msfmt['op'])],'>>>',msfmt['SOURCEID']+opptype(msfmt['op']))
if msfmt['SOURCEID'] in checkcode and msfmt['SOURCEID']!='3300' and '3300insert' in kafkadata and '3300update' in kafkadata:
#print(kafkadata,'>>>',kafkadata[msfmt['SOURCEID']+opptype(msfmt['op'])]['rowcount'])
checkdiff(msfmt['SOURCEID'],kafkadata['3300insert']['data'][0],kafkadata[msfmt['SOURCEID']+'insert']['data'])
kafkadata.pop(msfmt['SOURCEID']+'insert')
checkdiff(msfmt['SOURCEID'], kafkadata['3300update']['data'][0],kafkadata[msfmt['SOURCEID'] + 'update']['data'])
kafkadata.pop(msfmt['SOURCEID'] + 'update')
checkcode.remove(msfmt['SOURCEID'])
else:
kafkadata[msfmt['SOURCEID'] + msfmt['op']]={}
kafkadata[msfmt['SOURCEID'] + msfmt['op']]['rowcount'] =1
kafkadata[msfmt['SOURCEID'] + msfmt['op']]['data']=[]
kafkadata[msfmt['SOURCEID'] + msfmt['op']]['data'].append(msfmt)
return kafkadata
if __name__=='__main__':
topicinfo=sys.argv[1]
#print(topicinfo)
datas = getkafkadata(topicinfo)