plain python两两循环比较算法

測試代碼

import numpy as np

# Test data: the integers 0..1499, plus a five-element list preview of the head.
lst = np.arange(0, 1500)
tmp = [*lst[:5]]
print(tmp)

[0, 1, 2, 3, 4]

# Iterate over a reversed copy of tmp so that popping from tmp itself
# does not disturb the loop.
for i in list(reversed(tmp)):
    tmp.pop()
    print(i)

4
3
2
1
0

%time
# Module-level cluster table: representative -> list of members merged into it.
dic = {}


def make_dic(p, center):
    """Attach ``center`` (and any cluster it already heads) to ``p``.

    Operates on the module-level ``dic``.

    Args:
        p: Key that becomes (or remains) the representative of the cluster.
        center: Element being merged into ``p``. If ``center`` is itself a
            key of ``dic``, its members are moved over and the key removed.

    The members previously held by ``center`` come first, followed by
    ``center`` itself. If ``p`` already heads a cluster, the new members are
    appended to it.
    """
    # Members previously collected under ``center`` move along with it.
    members = dic.pop(center, [])
    # Extend rather than assign: ``p`` may already head a cluster, and a plain
    # assignment would silently discard those members.
    dic.setdefault(p, []).extend(members + [center])
    
        
def simi(num1, num2, threshold=100):
    """Return True when ``num1`` and ``num2`` differ by at most ``threshold``.

    Args:
        num1: First number.
        num2: Second number.
        threshold: Largest absolute difference still counted as similar
            (inclusive). Defaults to 100.

    Returns:
        bool: True if ``-threshold <= num1 - num2 <= threshold``.
    """
    # bool() keeps the return type a plain Python bool even when the inputs
    # are NumPy scalars.
    return bool(-threshold <= num1 - num2 <= threshold)

def main(lst):
    """Cluster ``lst`` by pairwise comparison and print the result.

    Walks the elements from last to first. For each one, the earlier elements
    are scanned from the start, and the first one that ``simi`` accepts is
    passed together with the current element to ``make_dic``. Finally the
    module-level ``dic`` is printed.
    """
    for idx in reversed(range(len(lst))):
        center = lst[idx]
        for candidate in lst[:idx]:
            if not simi(candidate, center):
                continue
            # Only the first matching earlier element is used.
            make_dic(candidate, center)
            break
    print(dic)
    
main(lst)

CPU times: user 3 µs, sys: 1 µs, total: 4 µs
Wall time: 7.87 µs
{0: [1401, 1301, 1201, 1101, 1001, 901, 801, 701, 601, 501, 401, 301, 201, 101, 1]}

實際運行代碼

import json
from tqdm import tqdm
import pandas as pd
from nlutools import tools as nlu


def maintain_dic(p, center):
    """Fold ``center`` and the cluster it heads into the cluster of ``p``.

    Operates on the module-level ``dic``. Example::

        dic = {"b": ["c"]}
        maintain_dic("a", "b")
        # dic is now {"a": ["b", "c"]}

    Args:
        p: Key that becomes (or remains) the representative of the cluster.
        center: Element being merged into ``p``. If ``center`` is itself a
            key of ``dic``, its members follow it and the key is removed.

    If ``p`` already heads a cluster, the new members are appended to it.
    """
    # ``center`` goes first, followed by whatever it had collected so far.
    absorbed = [center] + dic.pop(center, [])
    # Extend rather than assign: ``p`` may already head a cluster, and a plain
    # assignment would silently discard those members.
    dic.setdefault(p, []).extend(absorbed)


def simi(a, b):
    """Return True when ``a`` and ``b`` score at least 0.8 in similarity.

    The score comes from ``nlu.simiscore(a, b, type='tencent')``. Every
    accepted pair is printed and appended as an ``a,b,score`` line to the
    module-level ``fout_tmp`` file handle.

    Args:
        a: First text.
        b: Second text.

    Returns:
        bool: True if the score is >= 0.8, otherwise False. A failed scoring
        call counts as score 0, i.e. False.
    """
    try:
        score = nlu.simiscore(a, b, type='tencent')
    except Exception:
        # Best effort: a failed lookup is treated as "not similar". Catching
        # Exception instead of using a bare except lets KeyboardInterrupt and
        # SystemExit stop the long-running clustering loop.
        score = 0

    if score >= 0.8:
        print(a, b, score)
        line = a + ',' + b + ',' + str(score) + '\n'
        fout_tmp.write(line)
        return True
    return False


def my_cluster(lst):
    """Cluster ``lst`` by pairwise comparison and return the cluster table.

    Walks the elements from last to first (with a tqdm progress bar). For
    each one, the earlier elements are scanned from the start, and the first
    one that ``simi`` accepts is passed together with the current element to
    ``maintain_dic``.

    Returns:
        The module-level ``dic``.
    """
    # A reversed range (not an iterator) so tqdm still knows the total.
    descending = range(len(lst))[::-1]
    for idx in tqdm(descending):
        center = lst[idx]
        for candidate in lst[:idx]:
            if not simi(candidate, center):
                continue
            # Only the first matching earlier element is used.
            maintain_dic(candidate, center)
            break
    return dic


if __name__ == "__main__":
    # Cluster table shared with maintain_dic() and my_cluster(), which read it
    # as a module-level global.
    dic = {}

    # Read the input before opening the outputs, so a missing or malformed
    # input file does not truncate results from an earlier run.
    df = pd.read_csv("to_merge_all.tsv", sep="\t")
    entity_lst = df['entity'].to_list()

    # fout_tmp is a module-level global used by simi() to log accepted pairs.
    # The context managers close both files even if clustering raises.
    with open("concat_tmp_all.csv", mode="w", encoding="utf8") as fout_tmp, \
            open("concat_result_all.json", mode="w", encoding="utf8") as fout:
        result = my_cluster(entity_lst)
        # ensure_ascii=False writes non-ASCII text as-is into the UTF-8 file
        # instead of \uXXXX escapes.
        fout.write(json.dumps(result, ensure_ascii=False))

dic:
{'塑胶模具': ['五金模具'],
'网络-局域网': ['无线局域网', '无线网络'],
'污泥处理': ['污水厂'],
'体育中心': ['文体中心'],
'市场策划': ['文案策划'],
'网络营销': ['微信营销'],
'网页设计': ['网站设计'],
'网站策划': ['网站建设'],
'手机银行': ['网上银行', '网上支付'],
'网购': ['网上购物'],
'网络安全': ['网络信息安全'],
'投行': ['投资银行'],
'投资': ['投资项目'],
'收购并购': ['收购兼并'],
'通信': ['通讯'],
'调度平台': ['调度系统'],
'诉讼': ['诉讼案件'],
'税局业务': ['税务业务'],
'水泵项目': ['水利项目'],
'汽轮机': ['水轮发电机'],
'数据营销': ['数字营销', '整合营销'],
'数据管理系统': ['数字管理系统'],
'数据分析': ['数据统计分析'],
'数据服务': ['数据服务平台'],
'数据采集': ['数据处理', '数据管理'],
'石油管道': ['输油管道'],
'票务系统': ['售票系统'],
'售楼处': ['售楼中心'],
'融资租赁': ['融资租赁业务', '租赁业务'],
'示范项目': ['试点项目'],
'施工': ['施工现场'],
'社保审计业务': ['审计业务'],
'社交平台': ['社交网络'],
'商业地产': ['商业物业'],
'侵权案件': ['商标侵权案件'],
'融资': ['融资计划'],
'品牌策划': ['全案策划']}

评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值