# 运行程序需要下载哈工大停用词表,并且需要手动上传两篇文章进行相似度计算
# -*- coding: utf-8 -*-
import pprint
from collections import Counter
import jieba
import numpy as np
#数据抽取(从文件中读取)
file_path='1.txt'
def readFile(file_path):
content = []
with open(file_path, 'r',encoding="utf-8") as f:
content = f.read()
return content
#数据清理(分词去掉停用词)
def cleanWord(content):
# 分词