def _first_column(line):
    """Return the first tab-separated field of *line*, whitespace-trimmed."""
    return line.rstrip('\r\n').split('\t', 1)[0].strip()


def find_intersection(file1, file2, stop_threshold=15000000,
                      output_path='intersection.txt'):
    """Write the IDs of *file1* that also appear in *file2* to a text file.

    *file2* is read as tab-separated lines whose first column is the ID;
    any further columns (e.g. a tag) are ignored, and a missing tag column
    is tolerated. *file1* is read line by line and the first tab-separated
    column of each line is taken as the ID. Blank lines in either file are
    skipped.

    Matching IDs are written one per line, without duplicates, in the order
    they are first seen in *file1*.

    Args:
        file1: Path of the file whose IDs are tested for membership.
        file2: Path of the tab-separated file providing the ID set.
        stop_threshold: Stop scanning *file1* once this many distinct
            matching IDs have been collected.
        output_path: Path of the result file (overwritten if it exists).

    Returns:
        The number of distinct matching IDs written to *output_path*.
    """
    # Only the ID column is needed for membership tests; keeping the tag in
    # the set key would make a lookup by ID alone impossible.
    known_ids = set()
    with open(file2, 'r') as f:
        for line in f:
            id_value = _first_column(line)
            if id_value:
                known_ids.add(id_value)

    # A dict is used as an insertion-ordered set so the output is
    # deterministic and the threshold counts distinct IDs only.
    matched = {}
    with open(file1, 'r') as f:
        for line in f:
            if len(matched) >= stop_threshold:
                break
            id_value = _first_column(line)
            # known_ids never contains '', so blank lines fall through here.
            if id_value in known_ids:
                matched[id_value] = None

    with open(output_path, 'w') as f:
        f.writelines(id_value + '\n' for id_value in matched)

    return len(matched)
# Compute the intersection only when run as a script, so that importing this
# module does not read or write any files as a side effect.
if __name__ == '__main__':
    find_intersection('file1.txt', 'file2.txt', stop_threshold=15000000)
# Topic: computing an intersection in Python.
# (Scraped page footer: "latest recommended article, published 2024-07-06 12:03:33".)