import jieba
import math
import jieba.analyse
class TF_IDF:
    """Build per-item keyword candidates from product short titles.

    Each title is segmented with jieba (precise mode) and filtered against
    a stop-word list loaded from ``stop_file``.
    """

    def __init__(self, file, stop_file):
        # file:      path to a UTF-8 TSV file of "item_id<TAB>short_title" lines
        # stop_file: path to a UTF-8 file with one stop word per line
        self.file = file
        self.stop_file = stop_file
        self.stop_words = self.getStopWords()

    # Load the stop-word list.
    def getStopWords(self):
        """Return the stop words from ``self.stop_file``, one stripped word per line."""
        # `with` guarantees the file handle is closed (original leaked it).
        with open(self.stop_file, "r", encoding="utf-8") as f:
            swlist = [line.strip() for line in f]
        print("加载停用词完成...")
        return swlist

    # Load items and their short titles; segment with jieba and drop stop words.
    def loadData(self):
        """Return a dict mapping item id -> list of non-stop-word tokens of its title.

        Raises ValueError if a line does not contain exactly one tab separator.
        """
        dMap = dict()
        # Hoist the list into a set once: O(1) membership per token
        # instead of O(len(stop_words)).
        stop_words = set(self.stop_words)
        with open(self.file, "r", encoding="utf-8") as f:
            for line in f:
                # `item_id` instead of `id` to avoid shadowing the builtin.
                item_id, title = line.strip().split("\t")
                dMap.setdefault(item_id, [])
                # Spaces are stripped from the title before segmentation.
                for word in jieba.cut(str(title).replace(" ", ""), cut_all=False):
                    if word not in stop_words:
                        dMap[item_id].append(word)
        # Bug fix: the original built dMap and never returned it,
        # making the whole method a no-op for callers.
        return dMap
# Recommender systems -- keyword extraction from product titles based on the TF-IDF algorithm.
# (Source article; latest related post published 2024-11-22 21:50:02.)