A simple document duplicate-detection scheme built on jieba segmentation plus gensim: the texts are first segmented with jieba, and gensim then computes their pairwise similarity.
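The listing below only covers the jieba half of that pipeline. For orientation, here is a minimal sketch of the gensim similarity step the segmented output would feed into; it assumes the segmented documents are available as lists of words, and the function name similarity_scores and the TF-IDF pipeline are illustrative, not taken from the original script.

from gensim import corpora, models, similarities

def similarity_scores(tokenized_docs):
    # tokenized_docs: one list of words per document, e.g. [["word1", "word2"], ...]
    dictionary = corpora.Dictionary(tokenized_docs)               # build the vocabulary
    corpus = [dictionary.doc2bow(doc) for doc in tokenized_docs]  # bag-of-words vectors
    tfidf = models.TfidfModel(corpus)                             # TF-IDF weighting
    index = similarities.SparseMatrixSimilarity(
        tfidf[corpus], num_features=len(dictionary))
    # each row holds one document's cosine similarity to every document in the corpus
    return [index[tfidf[doc]] for doc in corpus]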
#!/usr/bin/env python
# -*- coding: utf-8 -*-
import os
import jieba  # jieba word-segmentation module
import sys
import time
import jieba.posseg as pseg
sys.path.append("C:/Users/asus/Desktop/User_Interests/fenci")  # add the directory where the script lives
jieba.load_userdict("database/userdict.txt")  # load the user dictionary; extra words can be added to this txt file

def cutTxtWord(dealpath, savepath, stopwordspath):
    """
    Function: segment a single txt document
    Arguments:
        dealpath: path of the txt file to segment
        savepath: path of the txt file that receives the segmentation result
        stopwordspath: path of the stopword txt file
    """
    # build a lookup table of stopwords, one word per line
    with open(stopwordspath, "r") as f:
        stopwords = {}.fromkeys([line.rstrip() for line in f])
    with open(dealpath, "r") as f:
        txtlist = f.read()
    # the source files are GBK-encoded; re-encode to UTF-8 (Python 2 str methods)
    txtlist = txtlist.decode('gbk', 'ignore').encode('utf-8')
    words = pseg.cut(txtlist)  # segment with part-of-speech tagging
    cutresult = ""
    for word, flag in words:
        if word not in stopwords:
            cutresult += word + "/" + flag + " "  # accumulate "word/POS-tag " pairs
    # print(cutresult)
    getFlag(cutresult, savepath)  # getFlag is defined later in the script; it writes the result to savepath

def cutFileWord(read_folder_path, write_folder_path, stopwordspath):
    """
    Function: segment every txt file in a folder