python 对txt文本的去重处理
前言
日常工作中经常需要对数据进行去重处理,在这里简单记录并分享一下。
1.引入库
import os
2.完整代码(读取单个txt文件)
代码如下:
运行前应事先将待去重的txt文件(a.txt)放在对应路径(./tutu)下
import os


def dedupe_lines(lines):
    """Return the unique entries of *lines*, keeping first-seen order.

    Every line is stripped of surrounding whitespace (including the trailing
    newline) before it is compared, so lines that differ only in padding are
    treated as duplicates. A blank line is kept once, as an empty string.
    """
    # dict keys are unique and remember insertion order, which gives an
    # order-preserving de-duplication in a single pass.
    return list(dict.fromkeys(line.strip() for line in lines))


def main(path='./tutu'):
    """De-duplicate ``a.txt`` inside *path* and save the result as ``b.txt``.

    The folder is created when missing, but ``a.txt`` itself must already be
    there; otherwise ``open`` raises ``FileNotFoundError``.
    """
    os.makedirs(path, exist_ok=True)
    source_file = os.path.join(path, 'a.txt')
    target_file = os.path.join(path, 'b.txt')

    # ``with`` closes the input handle even when reading fails.
    with open(source_file, 'r', encoding='utf-8') as f3:
        document = f3.readlines()
    document_num = len(document)
    print('原条数:' + str(document_num))
    print('================去重中================')

    text_list = dedupe_lines(document)
    unique_num = len(text_list)
    print('现条数:' + str(unique_num))
    print('减少了:' + str(document_num - unique_num))

    # Overwrite instead of append: appending would add the same lines again
    # on every run and re-introduce duplicates into the result file.
    with open(target_file, 'w', encoding='utf-8') as f:
        f.writelines(line + '\n' for line in text_list)
    print('================保存文件成功================')


if __name__ == '__main__':
    main()
3.完整代码(读取多个txt文件)
import os

# Intermediate file holding the concatenation of all inputs; removed at the end.
MERGED_NAME = '全部数据整理.txt'
# Final result file.
DEDUPED_NAME = '全部数据整理(去重后).txt'


def dedupe_lines(lines):
    """Return the unique entries of *lines*, keeping first-seen order.

    Every line is stripped of surrounding whitespace (including the trailing
    newline) before it is compared, so lines that differ only in padding are
    treated as duplicates. A blank line is kept once, as an empty string.
    """
    # dict keys are unique and remember insertion order, which gives an
    # order-preserving de-duplication in a single pass.
    return list(dict.fromkeys(line.strip() for line in lines))


def merge_txt_files(path, merged_file, skip=()):
    """Concatenate every ``.txt`` file found in *path* into *merged_file*.

    Sub-directories, non-txt entries, names listed in *skip* and the merged
    file itself are ignored. Files are processed in sorted name order so the
    result is reproducible. Returns the list of file names that were merged.
    """
    ignored = set(skip)
    ignored.add(os.path.basename(merged_file))
    # List the folder before the merged file is created/truncated below.
    candidates = [
        name for name in sorted(os.listdir(path))
        if name not in ignored
        and name.lower().endswith('.txt')
        and os.path.isfile(os.path.join(path, name))
    ]

    # Open the merged file once rather than once per input line.
    with open(merged_file, 'w', encoding='utf-8') as mom:
        for filename in candidates:
            print(filename)
            with open(os.path.join(path, filename), encoding='utf-8') as f:
                for line in f:
                    # A file whose last line lacks a newline would otherwise
                    # be glued to the first line of the next file.
                    mom.write(line if line.endswith('\n') else line + '\n')
    return candidates


def main(path='./tutu'):
    """Merge all txt files in *path*, de-duplicate the lines and save them.

    The result is written to ``全部数据整理(去重后).txt`` inside *path*; the
    intermediate merged file is deleted afterwards.
    """
    os.makedirs(path, exist_ok=True)
    merged_file = os.path.join(path, MERGED_NAME)
    deduped_file = os.path.join(path, DEDUPED_NAME)

    print('================读取中================')
    # Results of earlier runs live in the same folder; they must not be read
    # back in as input.
    merge_txt_files(path, merged_file, skip=(MERGED_NAME, DEDUPED_NAME))
    print('================已完成TXT读取并写入新TXT================')

    with open(merged_file, 'r', encoding='utf-8') as f3:
        document = f3.readlines()
    document_num = len(document)
    print('原条数:' + str(document_num))
    print('================去重中================')

    text_list = dedupe_lines(document)
    unique_num = len(text_list)
    print('现条数:' + str(unique_num))
    print('减少了:' + str(document_num - unique_num))

    # Overwrite instead of append: appending would add the same lines again
    # on every run and re-introduce duplicates into the result file.
    with open(deduped_file, 'w', encoding='utf-8') as f:
        f.writelines(line + '\n' for line in text_list)
    print('================保存去重文件成功================')
    os.remove(merged_file)  # delete the intermediate merged file


if __name__ == '__main__':
    main()