# This can run either as a python2 or python3 code
from future.builtins import next

import os
import csv
import re
import logging
import optparse

import dedupe
from unidecode import unidecode

input_file = 'data/csv_example_input_with_true_ids.csv'
output_file = 'data/csv_example_output1.csv'
settings_file = 'data/csv_example_learned_settings'
training_file = 'data/csv_example_training.json'


# Clean or process the data
def preProcess(column):
    """Normalize a raw CSV cell for comparison.

    Decodes bytes to UTF-8 (py2 compatibility; py3 strings raise
    AttributeError and pass through), transliterates to ASCII,
    collapses whitespace, strips surrounding quotes, and lowercases.
    Returns None for empty values so dedupe treats them as missing.
    """
    try:
        column = column.decode('utf-8')
    except AttributeError:
        # Already a (py3) str — nothing to decode.
        pass
    column = unidecode(column)
    column = re.sub(' +', ' ', column)
    column = re.sub('\n', ' ', column)
    column = column.strip().strip('"').strip("'").lower().strip()
    if not column:
        column = None
    return column


# Read in the data from CSV file:
def readData(filename):
    """Read the input CSV into a dict keyed by the integer 'Id' column.

    Each value is a dict of column name -> preprocessed cell value,
    which is the record format dedupe expects.
    """
    data_d = {}
    with open(filename) as f:
        reader = csv.DictReader(f)
        for row in reader:
            clean_row = [(k, preProcess(v)) for (k, v) in row.items()]
            row_id = int(row['Id'])
            data_d[row_id] = dict(clean_row)
    return data_d


print('importing data ...')
data_d = readData(input_file)

# Reuse a previously learned model if one exists; otherwise define the
# comparison fields and train interactively.
if os.path.exists(settings_file):
    print('reading from', settings_file)
    with open(settings_file, 'rb') as f:
        deduper = dedupe.StaticDedupe(f)
else:
    fields = [
        {'field': 'Site name', 'type': 'String'},
        {'field': 'Address', 'type': 'String'},
        {'field': 'Zip', 'type': 'Exact', 'has missing': True},
        {'field': 'Phone', 'type': 'String', 'has missing': True},
    ]
    deduper = dedupe.Dedupe(fields)
    deduper.sample(data_d, 15000)

    # Seed active learning with any previously labeled examples.
    if os.path.exists(training_file):
        print('reading labeled examples from ', training_file)
        with open(training_file, 'rb') as f:
            deduper.readTraining(f)

    print('starting active labeling...')
    dedupe.consoleLabel(deduper)
    deduper.train()

    # Persist the labeled pairs and the learned settings for next run.
    with open(training_file, 'w') as tf:
        deduper.writeTraining(tf)
    with open(settings_file, 'wb') as sf:
        deduper.writeSettings(sf)

threshold = deduper.threshold(data_d, recall_weight=1)

print('clustering...')
clustered_dupes = deduper.match(data_d, threshold)

print('# duplicate sets', len(clustered_dupes))

# Map each clustered record id to its cluster id, the cluster's
# canonical representation, and the match confidence score.
cluster_membership = {}
cluster_id = 0
for (cluster_id, cluster) in enumerate(clustered_dupes):
    id_set, scores = cluster
    cluster_d = [data_d[c] for c in id_set]
    canonical_rep = dedupe.canonicalize(cluster_d)
    for record_id, score in zip(id_set, scores):
        cluster_membership[record_id] = {
            "cluster id": cluster_id,
            "canonical representation": canonical_rep,
            "confidence": score,
        }

# Records that matched nothing each get their own fresh cluster id.
singleton_id = cluster_id + 1

with open(output_file, 'w') as f_output, open(input_file) as f_input:
    writer = csv.writer(f_output)
    reader = csv.reader(f_input)

    heading_row = next(reader)
    heading_row.insert(0, 'confidence_score')
    heading_row.insert(0, 'Cluster ID')
    # canonical_rep here is the one left over from the last cluster;
    # only its keys (the field names) are used for the header.
    canonical_keys = canonical_rep.keys()
    for key in canonical_keys:
        heading_row.append('canonical_' + key)
    writer.writerow(heading_row)

    for row in reader:
        row_id = int(row[0])
        if row_id in cluster_membership:
            cluster_id = cluster_membership[row_id]["cluster id"]
            canonical_rep = cluster_membership[row_id]["canonical representation"]
            row.insert(0, cluster_membership[row_id]['confidence'])
            row.insert(0, cluster_id)
            for key in canonical_keys:
                # NOTE(review): under python3, csv.writer will render these
                # bytes as "b'...'" in the output — confirm whether the
                # encode is still wanted outside python2.
                row.append(canonical_rep[key].encode('utf8'))
        else:
            row.insert(0, None)
            row.insert(0, singleton_id)
            singleton_id += 1
            for key in canonical_keys:
                row.append(None)
        writer.writerow(row)
# python重复数据删除_Python中的重复数据删除
# (CSDN page metadata: latest recommended article published 2024-04-24 14:45:00)