前言
在做项目的时候,经常会遇到一个大列表中包含多个小列表,而这些小列表之间又存在相同的值,需要把含有相同值的小列表合并成一组并去重。
如题:
# Sample input: sub-lists that share at least one value (directly or through
# a chain of other sub-lists) are expected to end up in the same merged group.
lists = [
    [0, 1, 2],
    [6, 7],
    [3, 4, 5],
    [3, 6, 7],
    [8, 9, 10],
    [23, 24, 25],
    [10, 11, 12],
    [13, 14, 15],
    [15, 16, 17],
    [2, 3],
    [20],
]
解决思路
思路一
先遍历一遍,找出每个元素出现在哪些小列表中(记录这些小列表的索引);再遍历这些索引,把含有相同元素的小列表合并到一起。
# Approach 1: build an element -> sub-list-index lookup once, then walk it
# breadth-first to collect the indices of all sub-lists that are connected
# through shared elements. Reads the module-level `lists`.

# Step 1: map every element to the indices of the sub-lists containing it.
list_dict = {}
for i, item in enumerate(lists):
    for e in item:
        owners = list_dict.setdefault(e, [])
        # Indices are visited in ascending order, so a repeated element inside
        # the same sub-list can only duplicate the last recorded index.
        if not owners or owners[-1] != i:
            owners.append(i)

# Step 2: group sub-list indices that are transitively linked.
merge_idx_list = []
element_set = set()  # elements whose owning sub-lists were already enqueued
index_set = set()  # sub-list indices already assigned to a group
for e, tmp_list in list_dict.items():
    if e in element_set:
        continue
    element_set.add(e)
    # Seed the queue with every not-yet-grouped sub-list that contains `e`.
    queue_list = []
    for idx in tmp_list:
        if idx not in index_set:
            index_set.add(idx)
            queue_list.append(idx)
    # The queue grows while it is being consumed, so walk it by position
    # instead of iterating over it directly.
    cursor = 0
    while cursor < len(queue_list):
        l_idx = queue_list[cursor]
        for ee in lists[l_idx]:
            if ee in element_set:
                continue
            element_set.add(ee)
            for cand_idx in list_dict[ee]:
                if cand_idx not in index_set:
                    index_set.add(cand_idx)
                    queue_list.append(cand_idx)
        cursor += 1
    merge_idx_list.append(queue_list)
print(merge_idx_list)

# Step 3: turn every group of indices into the set of all values it covers.
result_list = []
for idx_list in merge_idx_list:
    union_set = set()
    for idx in idx_list:
        # In-place update avoids copying the accumulated set on each step.
        union_set.update(lists[idx])
    result_list.append(union_set)
print(result_list)
结果:
[[0, 9, 2, 3, 1], [4, 6], [5], [7, 8], [10]]
[{0, 1, 2, 3, 4, 5, 6, 7}, {8, 9, 10, 11, 12}, {24, 25, 23}, {16, 17, 13, 14, 15}, {20}]
思路二
依次遍历每个小列表,找出其中每个元素还出现在哪些小列表中(记录对应的索引和元素),已经遍历过的索引和元素直接过滤掉,再采用递归的方式继续查找并合并。
# Shared state for approach 2. `search` and the driver loop both mutate the
# two sets below, so they must be reset before the approach is run again.
merge_idx_list = []
element_set = set()  # elements that were already queued for lookup
index_set = set()  # sub-list indices that were already assigned to a group


def search(lists, query_list):
    """Collect indices of unvisited sub-lists reachable from ``query_list``.

    Every sub-list of ``lists`` that has not been visited yet and contains one
    of the queried elements is marked as visited; its not-yet-seen elements
    become the queries of the next recursion level.

    Args:
        lists: The list of sub-lists being merged.
        query_list: Elements to look for in the unvisited sub-lists.

    Returns:
        The indices found at this level followed by the indices found by the
        deeper recursion levels.

    Side effects:
        Adds to the module-level ``index_set`` and ``element_set``.
    """
    new_query_list = []
    index_list = []
    for query in query_list:
        # `sub_list` rather than `list`, so the builtin is not shadowed.
        for index, sub_list in enumerate(lists):
            if index in index_set or query not in sub_list:
                continue
            index_set.add(index)
            index_list.append(index)
            for item in sub_list:
                if item not in element_set:
                    element_set.add(item)
                    new_query_list.append(item)
    # Recurse only while new elements keep turning up.
    if new_query_list:
        index_list += search(lists, new_query_list)
    return index_list
# Driver for approach 2: start one group per not-yet-visited sub-list and let
# `search` pull in every sub-list connected to it through shared elements.
# The loop variable is `sub_list` rather than `list`: at module level the
# latter would rebind the builtin `list` for the rest of the script.
for index, sub_list in enumerate(lists):
    if index in index_set:
        continue
    index_set.add(index)
    index_list = [index]
    # Only elements not seen before need to be looked up.
    query_list = []
    for item in sub_list:
        if item not in element_set:
            element_set.add(item)
            query_list.append(item)
    if query_list:
        index_list += search(lists, query_list)
    # index_list always holds at least `index`, so it is never empty here.
    merge_idx_list.append(index_list)
print(merge_idx_list)

# Turn every group of indices into the set of all values it covers.
result_list = []
for idx_list in merge_idx_list:
    union_set = set()
    for idx in idx_list:
        # In-place update avoids copying the accumulated set on each step.
        union_set.update(lists[idx])
    result_list.append(union_set)
print(result_list)
结果:
[[0, 9, 2, 3, 1], [4, 6], [5], [7, 8], [10]]
[{0, 1, 2, 3, 4, 5, 6, 7}, {8, 9, 10, 11, 12}, {24, 25, 23}, {16, 17, 13, 14, 15}, {20}]
完毕!!!
经过大批量数据测试,采用思路二的方式速度会更快。