cluster代码


import sys
# address -> cluster id. Seed addresses are inserted with -1 ("no cluster
# assigned yet"); addresses reached from a seed through a record are added
# later with the same -1 marker, then overwritten with a positive cluster id.
seed_addr_dict = {}
# The original seed addresses only (address -> 1); used at the end to report
# the cluster of every seed.
ori_seed_addr_dict = {}

# Field separator used when splitting lines of the record file.
sep = ' '
# cluster id -> dict whose keys are the member addresses of that cluster.
cluster_dict = {}
# Most recently allocated cluster id; incremented before each allocation, so
# the first cluster gets id 1.
cluster_id = 0
# Addresses to filter out (address -> 1); any record touching one is skipped.
hot_addr = {}

def merge_cluster(cluster_dict, seed_addr_dict, addr1_cluster_id, addr2_cluster_id):
    """Fold cluster ``addr2_cluster_id`` into cluster ``addr1_cluster_id`` in place.

    Every member of the absorbed cluster is added to the surviving cluster,
    its entry in ``seed_addr_dict`` is re-pointed at the surviving id, and the
    absorbed cluster is removed from ``cluster_dict``.

    Args:
        cluster_dict: Maps cluster id -> dict whose keys are member addresses.
        seed_addr_dict: Maps address -> id of the cluster it belongs to.
        addr1_cluster_id: Id of the cluster that survives the merge.
        addr2_cluster_id: Id of the cluster that is absorbed and removed.

    Raises:
        KeyError: If either id is missing from ``cluster_dict``.
    """
    survivor = cluster_dict[addr1_cluster_id]
    if addr1_cluster_id == addr2_cluster_id:
        # Merging a cluster into itself must be a no-op. Without this guard
        # the cluster would be deleted while seed_addr_dict still refers to it.
        return
    absorbed = cluster_dict.pop(addr2_cluster_id)
    for addr in absorbed:
        survivor[addr] = 1
        seed_addr_dict[addr] = addr1_cluster_id

# Hot-address filter: one address per line, file path given as argv[1].
# Records that touch any of these addresses are dropped before clustering.
with open(sys.argv[1], 'r') as f:
    for line in f:
        addr = line.strip()
        if not addr:
            # A blank line is not an address; do not register ''.
            continue
        hot_addr[addr] = 1

# Seed addresses: one address per line, file path given as argv[2].
# Each seed starts with cluster id -1 (unassigned) and is also remembered in
# ori_seed_addr_dict so the final report can list the seeds only.
# NOTE(review): addresses are compared as exact strings; if the input files
# mix upper- and lower-case hex, confirm whether they should be normalised.
with open(sys.argv[2], 'r') as f:
    for line in f:
        seed_addr = line.strip()
        if not seed_addr:
            # Skip blank lines so no empty-string seed gets its own cluster.
            continue
        seed_addr_dict[seed_addr] = -1
        ori_seed_addr_dict[seed_addr] = 1


# Address pairs from the record file that do not touch a hot address.
record = []

# Read the record file (path given as argv[3]). The file is closed as soon as
# the pairs are in memory; expansion and clustering below need no file access.
with open(sys.argv[3], 'r') as f:
    for line in f:
        arr = line.strip().split(sep)
        if len(arr) != 3:
            # Only lines with exactly three fields are accepted; the third
            # field is not used by this script.
            continue
        addr1 = arr[0]
        addr2 = arr[1]
        if addr1 in hot_addr or addr2 in hot_addr:
            continue
        record.append([addr1, addr2])

# Expansion: repeatedly sweep the not-yet-used records. A record with at least
# one endpoint already known in seed_addr_dict is moved to use_record, and an
# unknown endpoint of such a record becomes known (marked -1, unassigned).
# Records with no known endpoint are retried in the next sweep. The loop stops
# after a sweep that adds no new address.
record2 = []
use_record = []
while True:
    expand_num = 0
    for pair in record:
        addr1, addr2 = pair
        addr1_known = addr1 in seed_addr_dict
        addr2_known = addr2 in seed_addr_dict
        if not addr1_known and not addr2_known:
            # Not reachable yet; keep it for the next sweep.
            record2.append(pair)
            continue
        if addr1_known != addr2_known:
            # Exactly one endpoint is known: the other one is newly reached.
            seed_addr_dict[addr2 if addr1_known else addr1] = -1
            expand_num += 1
        use_record.append(pair)
    # What is left in `record` after the loop are the pairs never reached.
    record = record2
    record2 = []
    if expand_num == 0:
        break

# Clustering: walk the usable records in order and join the two endpoints.
# A cluster id of -1 means "not assigned yet"; assigned ids are positive.
for addr1, addr2 in use_record:
    addr1_cluster_id = seed_addr_dict[addr1]
    addr2_cluster_id = seed_addr_dict[addr2]
    if addr1_cluster_id == addr2_cluster_id and addr1_cluster_id > 0:
        # Both endpoints already share a cluster.
        continue

    if addr1_cluster_id > 0 and addr2_cluster_id < 0:
        # addr2 joins addr1's existing cluster.
        cluster_dict[addr1_cluster_id][addr2] = 1
        seed_addr_dict[addr2] = addr1_cluster_id
    elif addr2_cluster_id > 0 and addr1_cluster_id < 0:
        # addr1 joins addr2's existing cluster.
        cluster_dict[addr2_cluster_id][addr1] = 1
        seed_addr_dict[addr1] = addr2_cluster_id
    elif addr1_cluster_id < 0 and addr2_cluster_id < 0:
        # Neither endpoint is clustered: open a new cluster for the pair.
        # Members are stored with value 1, like every other insertion into a
        # cluster's member dict.
        cluster_id += 1
        cluster_dict[cluster_id] = {addr1: 1, addr2: 1}
        seed_addr_dict[addr1] = cluster_id
        seed_addr_dict[addr2] = cluster_id
    elif addr1_cluster_id > 0 and addr2_cluster_id > 0 and addr1_cluster_id != addr2_cluster_id:
        # The record bridges two distinct clusters: fold addr2's into addr1's.
        merge_cluster(cluster_dict, seed_addr_dict, addr1_cluster_id, addr2_cluster_id)
# Report the cluster of every original seed address. A seed that was never
# joined to a cluster by any record still holds a non-positive id; it is given
# a fresh id of its own here (it is not added to cluster_dict).
for addr in ori_seed_addr_dict:
    if seed_addr_dict[addr] <= 0:
        cluster_id += 1
        seed_addr_dict[addr] = cluster_id
    print('seed_addr_cluster', seed_addr_dict[addr], addr)

# Report every record that was used for clustering, tagged with the cluster id
# currently recorded for its first address.
for addr1, addr2 in use_record:
    print('record_cluster', seed_addr_dict[addr1], addr1, addr2)

# for id in cluster_dict:
#     for addr in cluster_dict[id]:
#         if addr in ori_seed_addr_dict:
#             print(id, addr)
    # print(id, len(cluster_dict[id]))

0xf081914ab0934b30376d331077689bec67f7ddee
0x1217557185fbe8fca846b90fa91b4bd1bfdef4d5
0xa2dc97fc74e5b62aed662a1cfdd5fa4063c2b5cc
0x377688f982ebd1d5321d5e0a120074cff71e5645
0x317ad2b605144c80b19761cea30a4abcfbd39369
0xcbbdc70c8767c0376bec085633aec12a493b32b8
0x4927b07899fa38fdfd91e40a7c5ff6a1c02880ce
0xea907dd4dc251c34d7742d5d82f0e7296949f744
0xb01cb49fe0d6d6e47edf3a072d15dfe73155331c
0x2fc617e933a52713247ce25730f6695920b3befe
0x9696f59e4d72e237be84ffd425dcad154bf96976
0x56de1961fda5454e6f8e6d0e3124ff648fd69400
0xf89d7b9c864f589bbf53a82105107622b35eaa40
0x7aeb3314e041153c4f6bbea19abecbce20946fd4
0x142d20f8e9c09e12402a415fd98de7b5f504fd76
0xa047a4f13a2be5e696111c571b998182eae9036c
0xd3e2ba43520f60de2eee8c635d33245b9bc850ed
0x151b381058f91cf871e7ea1ee83c45326f61e96d
0x6081258689a75d253d87ce902a8de3887239fe80
0xf70da97812cb96acdf810712aa562db8dfa3dbef
0x0023ed9589921912d7fcaeab096371b7af190e6a
0x3cd751e6b0078be393132286c442345e5dc49699
0xa636393e7b823242ba352ccb3eceda8109d1c281
0x4e5b2e1dc63f6b91cb6cd759936495434c7e972f
0xb9bc33fc4451de5d30467664b5cc863f43471369
0x2c81f91ee8c090ed90a461eb25d5ff335b6e917d
0x5dbf92e7cd3ab9434e495b195ec8fd07e4baf86b
0x0ad13a4badccfd952bdc8534c65687ddd6a0820d
0x24e25e6b31add60b844f38b28be4479de016897d
0xdc07e127cfd37065790f39956b7489aa5f32818b
0x686ad824ed9e3470a3bf88697d68d0a7531da60d
0x2f730920f265ff855e604303b7da713d50943da9
0x9cb312bc4b0b82ed6d067d0bd066c52dee5ee0a7
0x721befb93aad9015859ec6b2a20f3da541c60874
0x0d9f0fef92ee0915865153a0e98286c4e21f90c6
0x023589ad41055403102597e3c3c6a99bf3291baa
0xd6d7147cc3e4f216730e49495344e00d30325957
0x86f4d3a9af802ff9a06317ead9d3a39b4e6a938c
0xf7c0dee967afaa9dfeac5534dfbc974e72381bea
0xce671af32752e2d1e7499cb9ad554e8ff0f76cfa
0x07ae4b1dd08289a549886c991faf890b445644a2
0x0c2718874629d40ad1e68f8ddcf8a9a3cf84e152
0xde40de56cf31ae045ffbf0b3f3845100a6125a21
0xe8e757adbdd8b5caaa7ae788ee4b9bd6b1fd5be2
0x4976a4a02f38326660d17bf34b431dc6e2eb2327
0x803aca9a98735965128da200ef9d55f6dc7e2a66
0xbf94f0ac752c739f623c463b5210a7fb2cbb420b
0x6ff5fb3020c44dc1a8e77c8a8b3eca0291d31e73
0x357c6fd2cee77ba5de49e0bb9d49444781a8f0cc
0xeaf330d8dda9e7a4fd6da37a2328b6cadce3a558
0x5abc98c756ead696afb6d82fcf14194916127e5c
0x6af1cffdf98cf70f883dd39bf4f23f94d39795cb
0x73795d43e365276bb03109e4f8d6f40956b7826c
0xa1b7ee55e316c5e0d10c38d08c9b3d9610daf16b
0xd5b04aa0457cd77dda2087e1abb66b672373d30f
0x8dd8b7340b9ff5e196bcfe26a9028e70b29fb75e
0x205603fede753bbfb0d8d7f56bc8b491d20abe7b
0x2a90606aa6fe305f6ba903772f218c5121099f97
0xd38bfef443bad0b7a40c3c0886325d0779c8d0d5
0x4dd11631683d10801561b174f45b45b8a201f2c3
0x2438a284180e4f32e607b59077b6e39c98de08d2
0x40db97cd472846a0086917de7b1d8784e135a657
0x81e8a04952bebae17f0e7fcfe8d2111d7f5464c4
0x11542ac6d775bd2d9925f40f0ff0c3824928f5a3
0x78020b049e09c6400ae315641b4a6c3d159358aa
0x8dc4bb7dbfca27de1164c75ddeecae480a55b528
0xfb18890070e09a8defedbbd0b219c487a017ae5c
0xa3e6c7d8b72efef9d7153da9bc112e3380d542ab
0x62c83cc399cbfff5968e1cfe97c536b5bd395e05
0xfed11fff39e81a3c8831e20bcaaee72269127421
0x94c50bdfeb3365ebfc0a48b9d6f1c1d4274a4f31
0xa2bb307b2206a7b3422a7a6f26c88026b54b91d6
0xcbedb0f7c8c5785ef32d43686e3d3df373de50e1
0xcd4fa64addff56283f1844aaaab14d149c4351a1
0x344f16da052cd437c776c2af2ca47ed0fe1ca767
0x9bbc5b73b17243ce83f746e66b1f5370ba5e7ded
0x4d69710df767fb7a26dde8f92c06b0c3e04c7f98
0x7db2bbbb4f1fb6a1e08d7bdb430f7f185ca825f9
0x924f6ddb94ff27346b214bbd17a28dec229c32a4
0x52a258ed593c793251a89bfd36cae158ee9fc4f8
0x974caa59e49682cda0ad2bbe82983419a2ecc400
0xae45a8240147e6179ec7c9f92c5a18f9a97b3fca
0x1c727a55ea3c11b0ab7d3a361fe0f3c47ce6de5d
0x46af00d623ff52f42c293ba9a05d865d3d34993c
0x12268c296e3559ea1f906d7c6a6a8de845586184
0xe41d67ac9032e7d89668b3a7a415de9e8dc3e2a4
0xa4838cf9e998abed0897f8f9e2b432c3526cdfbb
0x9fbcc6a460b11e0449dddb766a07b86a3b4d9676
0x596be56467d81b695200342bd78ede8a5b65d577
0x6e78684e47e3cd25d0b08adcf8f041061168154c
0x555aff05653d0b2c6e31c78463ad2af4ef05f616
0xf45afb1992e9c95ea1a57033282739ddffa3002c
0xd6d9743240330089537cd311a760409eee72c550
0x6371924ae1ef80acc84f926c511a2f79b3c29821
0x852003dd81c5ee21305aeeff8240d99fd7aad554
0xfa4d41ae04a9ef67e4e526987eeed60ea97425ab
0xB87eB1095F816D4C1CaB1796b2052843465788B0
0xd173c4faecC446B025AF9a6f7762640c1838C831
0x67365531579b4b55eFB03C246A053f7aa81bbB57
0x3084d4F4E77F0BaE4886D05aC1d9C2BC25576E51

评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值