from tqdm import tqdm
import string
from zhon.hanzi import punctuation as pun
# Every character to strip: ASCII punctuation followed by zhon's CJK punctuation.
allPun = ''.join((string.punctuation, pun))
def delPunctuation(infile, outfile, nums_line=None):
    """Copy ``infile`` to ``outfile`` with every punctuation character removed.

    Both files are handled as UTF-8 text and processed line by line, so the
    input never has to be loaded into memory at once.  A character is dropped
    when it occurs in the module-level ``allPun`` string; every other
    character is written through unchanged.

    Args:
        infile: Path of the text file to read.
        outfile: Path of the file to write; it is created or truncated.
        nums_line: Optional number of lines in ``infile``.  It is only passed
            to tqdm as the progress-bar ``total``; leave it as ``None`` when
            the line count is unknown.
    """
    # Build the deletion table once; str.translate then strips each line in a
    # single pass instead of testing every character against allPun.
    strip_table = str.maketrans('', '', allPun)
    with open(infile, 'r', encoding="utf-8") as readFile, \
            open(outfile, 'w', encoding="utf-8") as writeFile:
        for line in tqdm(readFile, total=nums_line):
            writeFile.write(line.translate(strip_table))
    # No explicit close(): the with-statement closes both files on exit.
if __name__ == '__main__':
    # Script entry point: strip punctuation from test.txt and write ans.txt.
    delPunctuation("test.txt", "ans.txt")
string.punctuation 代表英文标点符号。
zhon.hanzi 中的 punctuation 代表中文标点符号。
查找到文中的所有标点符号，将其删除，并将处理后的文本写入到新的文件中。


被折叠的 条评论
为什么被折叠?



