import sys
# Address -> cluster id for every known address (seeds plus addresses reached
# by expansion). -1 means "known but not yet assigned to a cluster".
seed_addr_dict = {}
# The addresses read from the seed file (value is always 1; used as a set).
ori_seed_addr_dict = {}
# Field separator of the transaction record file.
sep = ' '
# Cluster id -> dict whose keys are the member addresses of that cluster.
cluster_dict = {}
# Last cluster id handed out; incremented before each new cluster is created.
cluster_id = 0
# Addresses read from the hot-wallet file (value is always 1; used as a set).
# Records touching any of these are skipped when loading.
hot_addr = {}
def merge_cluster(cluster_dict, seed_addr_dict, addr1_cluster_id, addr2_cluster_id):
    """Fold cluster ``addr2_cluster_id`` into cluster ``addr1_cluster_id``.

    Every address of the second cluster is added to the first cluster's
    member dict and re-pointed to ``addr1_cluster_id`` in ``seed_addr_dict``;
    the second cluster is then removed from ``cluster_dict``. Both mappings
    are modified in place and nothing is returned.

    Args:
        cluster_dict: cluster id -> dict keyed by member address.
        seed_addr_dict: address -> cluster id.
        addr1_cluster_id: id of the cluster that survives the merge.
        addr2_cluster_id: id of the cluster that is absorbed and deleted.

    Raises:
        KeyError: if either cluster id is missing from ``cluster_dict``.
    """
    if addr1_cluster_id == addr2_cluster_id:
        # Merging a cluster into itself must be a no-op; without this guard
        # the cluster would be deleted while its members still reference it.
        return
    addr1_cluster_dic = cluster_dict[addr1_cluster_id]
    addr2_cluster_dic = cluster_dict[addr2_cluster_id]
    for addr in addr2_cluster_dic:
        addr1_cluster_dic[addr] = 1
        seed_addr_dict[addr] = addr1_cluster_id
    del cluster_dict[addr2_cluster_id]
# --- Load inputs -----------------------------------------------------------
# Paths are hard-coded; sys.argv[1], sys.argv[2] and sys.argv[3] were the
# earlier (commented-out) sources for the three files below, in this order.

# Hot-wallet addresses: one address per line; used later as a filter.
with open(r'D:\work\ocpc_hinet\cluster\hotwallet_addr.csv', 'r') as hot_file:
    hot_addr.update((row.strip(), 1) for row in hot_file)

# Seed addresses: one per line. Each starts unassigned (-1) and is also
# remembered as an original seed.
with open(r'D:\work\ocpc_hinet\cluster\test_seed.csv', 'r') as seed_input:
    for row in seed_input:
        seed_key = row.strip()
        seed_addr_dict[seed_key] = -1
        ori_seed_addr_dict[seed_key] = 1

# Transaction records: the first two `sep`-separated fields are the two
# addresses. Short lines and records touching a hot address are dropped.
record = []
with open(r'D:\work\ocpc_hinet\cluster\test_record.csv', 'r') as record_file:
    for row in record_file:
        fields = row.strip().split(sep)
        if len(fields) >= 2:
            pair = fields[:2]
            if pair[0] not in hot_addr and pair[1] not in hot_addr:
                record.append(pair)
# --- Expansion ---------------------------------------------------------------
# Repeatedly sweep the remaining records. A record with at least one known
# address is moved to `use_record`, and its unknown address (if any) becomes
# known (-1 = unassigned). Records with no known address are kept in
# `record2` for the next sweep. Stop after a sweep that adds no new address.
record2 = []
use_record = []
while True:
    expand_num = 0
    for pair in record:
        left_known = pair[0] in seed_addr_dict
        right_known = pair[1] in seed_addr_dict
        if not left_known and not right_known:
            # Not reachable yet; retry on the next sweep.
            record2.append(pair)
            continue
        if left_known != right_known:
            # Exactly one side is known: pull the other side in.
            newcomer = pair[1] if left_known else pair[0]
            seed_addr_dict[newcomer] = -1
            expand_num += 1
        use_record.append(pair)
    record, record2 = record2, []
    if expand_num == 0:
        break
# --- Clustering --------------------------------------------------------------
# Walk the usable records once and make sure both addresses of each record
# end up with the same positive cluster id.
for pair in use_record:
    left, right = pair[0], pair[1]
    left_cid = seed_addr_dict[left]
    right_cid = seed_addr_dict[right]

    if left_cid > 0 and right_cid > 0:
        # Both already clustered: merge only when the clusters differ.
        if left_cid != right_cid:
            merge_cluster(cluster_dict, seed_addr_dict, left_cid, right_cid)
    elif left_cid > 0 and right_cid < 0:
        # Attach the unassigned right address to the left cluster.
        cluster_dict[left_cid][right] = 1
        seed_addr_dict[right] = left_cid
    elif right_cid > 0 and left_cid < 0:
        # Attach the unassigned left address to the right cluster.
        cluster_dict[right_cid][left] = 1
        seed_addr_dict[left] = right_cid
    elif left_cid < 0 and right_cid < 0:
        # Neither is assigned: open a new cluster holding both.
        cluster_id += 1
        cluster_dict[cluster_id] = {left: cluster_id, right: cluster_id}
        seed_addr_dict[left] = cluster_id
        seed_addr_dict[right] = cluster_id
    # Any remaining combination (an id equal to 0) is deliberately ignored.
# Report the cluster of every original seed. A seed that is still unassigned
# after clustering receives a fresh cluster id of its own first.
for seed in ori_seed_addr_dict:
    if not seed_addr_dict[seed] > 0:
        cluster_id += 1
        seed_addr_dict[seed] = cluster_id
    print('seed_addr_cluster', seed_addr_dict[seed], seed)
# Count the usable records per cluster, keyed by the cluster id of each
# record's first address, then print "<cluster id> <record count>" per cluster.
cluster_record_num = {}
for rec in use_record:
    rec_cid = seed_addr_dict[rec[0]]
    cluster_record_num[rec_cid] = cluster_record_num.get(rec_cid, 0) + 1
# The loop variable is intentionally NOT named `cluster_id`: that name is the
# module-level cluster id counter and must not be overwritten here.
for rec_cid, rec_count in cluster_record_num.items():
    print(rec_cid, rec_count)
# Id of the cluster whose data is exported to the CSV files below.
cluster_num = 33
seed_file = r'D:\work\ocpc_hinet\cluster\data\seed_file_' + str(cluster_num) + '.csv'
# Cluster id -> number of original seeds in that cluster (reported later).
ori_seed_cluster_num = {}
# `with` guarantees the file is closed even if a lookup or write raises.
with open(seed_file, 'w', encoding='utf-8') as f1:
    f1.write('addr,attr\n')
    for addr in ori_seed_addr_dict:
        cid = seed_addr_dict[addr]
        ori_seed_cluster_num[cid] = ori_seed_cluster_num.get(cid, 0) + 1
        # Only seeds of the selected cluster are written out.
        if cid == cluster_num:
            f1.write(addr + ',' + '1\n')
# Export the records of the selected cluster into three edge files:
#   seed_trans_file       - both addresses are original seeds
#   transaction_lay1_file - exactly one address is an original seed
#                           (the seed is always written first)
#   transaction_lay2_file - neither address is an original seed
seed_trans_file = r'D:\work\ocpc_hinet\cluster\data\seed_trans_file_' + str(cluster_num) + '.csv'
transaction_lay1_file = r'D:\work\ocpc_hinet\cluster\data\transaction_lay1_file_' + str(cluster_num) + '.csv'
transaction_lay2_file = r'D:\work\ocpc_hinet\cluster\data\transaction_lay2_file_' + str(cluster_num) + '.csv'
# `with` guarantees all three files are closed even if a write raises.
with open(seed_trans_file, 'w', encoding='utf-8') as f2, \
        open(transaction_lay1_file, 'w', encoding='utf-8') as f4, \
        open(transaction_lay2_file, 'w', encoding='utf-8') as f5:
    f2.write('addr_from,addr_to,attr\n')
    f4.write('addr_from,addr_to,attr\n')
    f5.write('addr_from,addr_to,attr\n')
    for rec in use_record:
        addr1 = rec[0]
        addr2 = rec[1]
        # Original author's note: both addresses of a usable record always
        # share the same cluster id, so checking the first one is enough.
        if seed_addr_dict[addr1] != cluster_num:
            continue
        addr1_is_seed = addr1 in ori_seed_addr_dict
        addr2_is_seed = addr2 in ori_seed_addr_dict
        if addr1_is_seed and addr2_is_seed:
            f2.write(addr1 + ',' + addr2 + ',1\n')
        elif addr1_is_seed:
            f4.write(addr1 + ',' + addr2 + ',1\n')
        elif addr2_is_seed:
            # Swap so that the seed address is in the first column.
            f4.write(addr2 + ',' + addr1 + ',1\n')
        else:
            f5.write(addr1 + ',' + addr2 + ',1\n')
# Export the addresses of the selected cluster that were reached by expansion,
# i.e. every member that is not one of the original seeds.
expand_addr_file = r'D:\work\ocpc_hinet\cluster\data\expand_addr_file_' + str(cluster_num) + '.csv'
# `with` guarantees the file is closed even if a write raises.
with open(expand_addr_file, 'w', encoding='utf-8') as f3:
    f3.write('addr,attr\n')
    for addr in seed_addr_dict:
        if addr in ori_seed_addr_dict:
            continue
        if seed_addr_dict[addr] == cluster_num:
            f3.write(addr + ',' + '1\n')
# --- Summary -----------------------------------------------------------------
# Cluster id -> number of addresses (seeds and expanded) carrying that id.
seed_cluster_num = {}
for member_cid in seed_addr_dict.values():
    seed_cluster_num[member_cid] = seed_cluster_num.get(member_cid, 0) + 1

# Number of distinct clusters that contain at least one original seed.
print('cluster num:' + str(len(ori_seed_cluster_num)))
# One line per such cluster: id, original-seed count, total address count.
for seed_cid, seed_count in ori_seed_cluster_num.items():
    total_count = seed_cluster_num[seed_cid]
    print('seed_cluster_num ' + str(seed_cid) + ' ' + str(seed_count) + ' ' + str(total_count))
# cluster0629
# (Web-page footer: latest recommended article published on 2024-10-15 11:38:32)