測試代碼
import numpy as np
# Sample data: the integers 0..1499 as a NumPy array.
lst = np.arange(1500)
# tolist() converts the slice to built-in ints, so the list prints as
# [0, 1, 2, 3, 4] on every NumPy version (NumPy >= 2 renders its own
# scalars as np.int64(...) inside a list).
tmp = lst[:5].tolist()
print(tmp)
[0, 1, 2, 3, 4]
# Iterate over a reversed *copy* of tmp (the slice creates a new list), so
# popping from tmp inside the loop does not disturb the iteration.
for i in tmp[::-1]:
    tmp.pop()
    print(i)
4
3
2
1
0
%time
# Module-level cluster table: representative -> list of absorbed members.
dic = {}


def make_dic(p, center):
    """Attach `center` (and any cluster it currently heads) to `p`.

    If `center` is itself a key of the global `dic`, its member list is
    removed from under `center` and carried along, with `center` appended
    after those members.

    The members are merged into whatever `p` already holds. Plain
    assignment (`dic[p] = ...`) would discard a cluster previously stored
    under `p`.

    Args:
        p: Key that becomes (or stays) the representative.
        center: Element being absorbed under `p`.

    Returns:
        None. The global `dic` is mutated in place.
    """
    members = dic.pop(center, [])
    members.append(center)
    dic.setdefault(p, []).extend(members)
def simi(num1, num2, threshold=100):
    """Return True when `num1` and `num2` differ by at most `threshold`.

    Args:
        num1: First number.
        num2: Second number.
        threshold: Largest absolute difference still counted as similar.
            Defaults to 100.

    Returns:
        bool: True if -threshold <= num1 - num2 <= threshold, else False.
    """
    # bool() keeps the return type a plain Python bool even when the inputs
    # are NumPy scalars.
    return bool(-threshold <= num1 - num2 <= threshold)
def main(lst):
    """Cluster the items of `lst` into the global `dic` and print it.

    Walks `lst` from the last element to the first. For each element
    (`center`), the earlier elements are scanned in order, and the first one
    for which `simi(p, center)` is true is passed, together with `center`,
    to `make_dic`. Elements with no similar predecessor are left alone.

    Args:
        lst: Indexable, sliceable sequence of items accepted by `simi`.

    Returns:
        None. The result is accumulated in the global `dic`, which is
        printed once all elements have been processed.
    """
    for i in range(len(lst) - 1, -1, -1):
        center = lst[i]
        for p in lst[:i]:
            if simi(p, center):
                make_dic(p, center)
                # Only the first similar predecessor is used.
                break
    print(dic)
# Run the clustering demo on the sample array `lst` created earlier in this script.
main(lst)
CPU times: user 3 µs, sys: 1 µs, total: 4 µs
Wall time: 7.87 µs
{0: [1401, 1301, 1201, 1101, 1001, 901, 801, 701, 601, 501, 401, 301, 201, 101, 1]}
實際運行代碼
import json
from tqdm import tqdm
import pandas as pd
from nlutools import tools as nlu
def maintain_dic(p, center):
    """
    Attach `center` (and any cluster it currently heads) to `p`.

    Example:
        dic = {"b": ["c"]}
        p = "a", center = "b"
        dic = {"a": ["b", "c"]}

    The absorbed items are merged into whatever `p` already holds. Plain
    assignment (`dic[p] = ...`) would discard a cluster previously stored
    under `p`.

    Args:
        p: Key that becomes (or stays) the representative.
        center: Element being absorbed under `p`.

    Returns:
        None. The global `dic` is mutated in place.
    """
    absorbed = dic.pop(center, [])
    dic.setdefault(p, []).extend([center] + absorbed)
def simi(a, b, threshold=0.8):
    """Return True when `a` and `b` score at least `threshold` in similarity.

    The score comes from `nlu.simiscore(a, b, type='tencent')`
    (NOTE(review): presumably a similarity in [0, 1] -- confirm against
    nlutools). Every accepted pair is printed and appended to the global
    `fout_tmp` file as one CSV row: a, b, score.

    Args:
        a: First entity string.
        b: Second entity string.
        threshold: Minimum score counted as similar. Defaults to 0.8.

    Returns:
        bool: True if the score reaches `threshold`, else False.
    """
    # Local import keeps this function self-contained.
    import csv

    try:
        score = nlu.simiscore(a, b, type='tencent')
    except Exception:
        # Best effort: a failed lookup counts as "not similar". Unlike a bare
        # `except:`, KeyboardInterrupt and SystemExit still propagate.
        score = 0
    if score >= threshold:
        print(a, b, score)
        # csv.writer quotes fields containing commas or quotes, so an entity
        # like "a,b" no longer corrupts the row. Plain values are written
        # exactly as before.
        csv.writer(fout_tmp, lineterminator='\n').writerow([a, b, str(score)])
        return True
    return False
def my_cluster(lst):
    """Cluster the items of `lst` into the global `dic` and return it.

    Walks `lst` from the last element to the first, with a tqdm progress
    bar. For each element (`center`), the earlier elements are scanned in
    order, and the first one for which `simi(p, center)` is true is passed,
    together with `center`, to `maintain_dic`. Elements with no similar
    predecessor are left alone.

    Args:
        lst: List of items accepted by `simi`.

    Returns:
        The global `dic` after all elements have been processed.
    """
    for i in tqdm(range(len(lst) - 1, -1, -1)):
        center = lst[i]
        for p in lst[:i]:
            if simi(p, center):
                maintain_dic(p, center)
                # Only the first similar predecessor is used.
                break
    return dic
if __name__ == "__main__":
    # `dic` and `fout_tmp` are deliberately module-level names: maintain_dic()
    # mutates `dic` and simi() writes accepted pairs to `fout_tmp`.
    dic = {}
    # The `with` block closes both output files even if clustering raises.
    with open("concat_tmp_all.csv", mode="w", encoding="utf8") as fout_tmp, \
            open("concat_result_all.json", mode="w", encoding="utf8") as fout:
        df = pd.read_csv("to_merge_all.tsv", sep="\t")
        entity_lst = df['entity'].to_list()
        # Named `result` rather than `re` to avoid shadowing the stdlib module.
        result = my_cluster(entity_lst)
        # ensure_ascii=False writes non-ASCII text as real UTF-8 characters
        # instead of \uXXXX escapes. The parsed JSON is identical.
        json.dump(result, fout, ensure_ascii=False)
dic:
{'塑胶模具': ['五金模具'],
'网络-局域网': ['无线局域网', '无线网络'],
'污泥处理': ['污水厂'],
'体育中心': ['文体中心'],
'市场策划': ['文案策划'],
'网络营销': ['微信营销'],
'网页设计': ['网站设计'],
'网站策划': ['网站建设'],
'手机银行': ['网上银行', '网上支付'],
'网购': ['网上购物'],
'网络安全': ['网络信息安全'],
'投行': ['投资银行'],
'投资': ['投资项目'],
'收购并购': ['收购兼并'],
'通信': ['通讯'],
'调度平台': ['调度系统'],
'诉讼': ['诉讼案件'],
'税局业务': ['税务业务'],
'水泵项目': ['水利项目'],
'汽轮机': ['水轮发电机'],
'数据营销': ['数字营销', '整合营销'],
'数据管理系统': ['数字管理系统'],
'数据分析': ['数据统计分析'],
'数据服务': ['数据服务平台'],
'数据采集': ['数据处理', '数据管理'],
'石油管道': ['输油管道'],
'票务系统': ['售票系统'],
'售楼处': ['售楼中心'],
'融资租赁': ['融资租赁业务', '租赁业务'],
'示范项目': ['试点项目'],
'施工': ['施工现场'],
'社保审计业务': ['审计业务'],
'社交平台': ['社交网络'],
'商业地产': ['商业物业'],
'侵权案件': ['商标侵权案件'],
'融资': ['融资计划'],
'品牌策划': ['全案策划']}