python重复数据删除_Python中的重复数据删除

# This can run either as a python2 or python3 code
from future.builtins import next

import os
import csv
import re
import logging
import optparse

import dedupe
from unidecode import unidecode

# Input records, the clustered output, and the cached model / training files.
input_file = 'data/csv_example_input_with_true_ids.csv'
output_file = 'data/csv_example_output1.csv'
settings_file = 'data/csv_example_learned_settings'
training_file = 'data/csv_example_training.json'


# Clean or process the data
def preProcess(column):
    """Normalize a single CSV cell so that records can be compared.

    The value is decoded from UTF-8 when it is a byte string, transliterated
    with ``unidecode``, has runs of spaces collapsed and newlines replaced by
    a space, and is then stripped of surrounding whitespace and quote
    characters and lower-cased.

    Returns the cleaned string, or ``None`` when nothing is left after
    cleaning.
    """
    try:
        # Byte strings expose .decode(); text strings on Python 3 do not.
        column = column.decode('utf-8')
    except AttributeError:
        pass
    column = unidecode(column)
    column = re.sub(' +', ' ', column)
    column = re.sub('\n', ' ', column)
    column = column.strip().strip('"').strip("'").lower().strip()
    if not column:
        column = None
    return column


# Read in the data from CSV file:
def readData(filename):
    """Load ``filename`` into a dict of cleaned records keyed by integer id.

    Every cell is passed through ``preProcess``; the key of each record is
    the integer value of its (uncleaned) ``'Id'`` column.
    """
    data_d = {}
    with open(filename) as f:
        reader = csv.DictReader(f)
        for row in reader:
            clean_row = [(k, preProcess(v)) for (k, v) in row.items()]
            row_id = int(row['Id'])
            data_d[row_id] = dict(clean_row)
    return data_d


print('importing data ...')
data_d = readData(input_file)

if os.path.exists(settings_file):
    # A previous run saved its learned settings: reuse them, skip training.
    print('reading from', settings_file)
    with open(settings_file, 'rb') as f:
        deduper = dedupe.StaticDedupe(f)
else:
    fields = [
        {'field': 'Site name', 'type': 'String'},
        {'field': 'Address', 'type': 'String'},
        {'field': 'Zip', 'type': 'Exact', 'has missing': True},
        {'field': 'Phone', 'type': 'String', 'has missing': True},
    ]
    deduper = dedupe.Dedupe(fields)
    deduper.sample(data_d, 15000)

    # Resume from earlier labels when a training file is already present.
    if os.path.exists(training_file):
        print('reading labeled examples from ', training_file)
        with open(training_file, 'rb') as f:
            deduper.readTraining(f)

    print('starting active labeling...')
    dedupe.consoleLabel(deduper)
    deduper.train()

    # Persist both the labels and the learned settings for the next run.
    with open(training_file, 'w') as tf:
        deduper.writeTraining(tf)
    with open(settings_file, 'wb') as sf:
        deduper.writeSettings(sf)

threshold = deduper.threshold(data_d, recall_weight=1)

print('clustering...')
clustered_dupes = deduper.match(data_d, threshold)
print('# duplicate sets', len(clustered_dupes))

# Map every clustered record id to its cluster id, the cluster's canonical
# record, and the record's confidence score.
cluster_membership = {}
cluster_id = 0
# Defined up front so the header code below still works when no duplicate
# clusters were found (the loop body would otherwise never bind it).
canonical_rep = {}
for (cluster_id, cluster) in enumerate(clustered_dupes):
    id_set, scores = cluster
    cluster_d = [data_d[c] for c in id_set]
    canonical_rep = dedupe.canonicalize(cluster_d)
    for record_id, score in zip(id_set, scores):
        cluster_membership[record_id] = {
            "cluster id": cluster_id,
            "canonical representation": canonical_rep,
            "confidence": score,
        }

# Records that belong to no cluster get fresh ids after the last cluster id.
singleton_id = cluster_id + 1

with open(output_file, 'w') as f_output, open(input_file) as f_input:
    writer = csv.writer(f_output)
    reader = csv.reader(f_input)

    heading_row = next(reader)
    # Locate the id column by name, as readData does, before the two extra
    # columns are inserted in front of the original ones.
    id_index = heading_row.index('Id')
    heading_row.insert(0, 'confidence_score')
    heading_row.insert(0, 'Cluster ID')
    # Empty when nothing was clustered, so no canonical_* columns are added.
    canonical_keys = list(canonical_rep)
    for key in canonical_keys:
        heading_row.append('canonical_' + key)

    writer.writerow(heading_row)

    for row in reader:
        row_id = int(row[id_index])
        if row_id in cluster_membership:
            membership = cluster_membership[row_id]
            cluster_id = membership["cluster id"]
            canonical_rep = membership["canonical representation"]
            row.insert(0, membership['confidence'])
            row.insert(0, cluster_id)
            for key in canonical_keys:
                # Write the text value itself: csv.writer stringifies bytes
                # as "b'...'" on Python 3, so it must not be encoded first.
                row.append(canonical_rep[key])
        else:
            row.insert(0, None)
            row.insert(0, singleton_id)
            singleton_id += 1
            for key in canonical_keys:
                row.append(None)
        writer.writerow(row)

  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值