""" 构建词频矩阵, 从而得到TF、IDF"""
import csv
import math
import re
import jieba
import numpy as np
import scipy.sparse as ss
# Load the supplementary user dictionary into jieba.
jieba.load_userdict("./words/dict.txt")

# Stop-word list, read one entry per line from the stop-word file.
stopwords = []
# A context manager closes the file handle deterministically; iterating a
# bare open() call left the handle open until garbage collection.
with open("./words/stop_words", 'r') as stop_file:
    for stop in stop_file:
        # Keep only the text before the line terminator.
        stopwords.append(stop.split('\n')[0])
# The empty string and a single space are added to the stop-word list too.
stopwords.append('')
stopwords.append(' ')

# Words/symbols that text_filter replaces with a separator before splitting.
stop_word = [u'的', u'是', u'和', u'丨']
# Pre-split sentences so that later processing does not see long runs of
# meaningless characters.
def text_filter(line, stop_words=None):
    """Yield the non-empty fragments of ``line`` after pre-splitting it.

    Every occurrence of each stop word is first replaced by ``#``. The
    result is then split on any run of characters that are neither CJK
    ideographs in the range U+4E00..U+9FA5 nor ASCII letters. Digits are
    not part of the kept character class, so they act as separators as
    well, exactly like punctuation and whitespace.

    Args:
        line: The text to split.
        stop_words: Optional iterable of strings to replace with the
            separator. Defaults to the module-level ``stop_word`` list.

    Yields:
        Each non-empty fragment, in order of appearance.
    """
    if stop_words is None:
        # Fall back to the module-level list, as the original code did.
        stop_words = stop_word
    separator = u'#'
    for word in stop_words:
        line = line.replace(word, separator)
    # The separator itself is outside the kept character class, so the
    # replaced stop words become split points here.
    for fragment in re.split(u'[^\u4e00-\u9fa5a-zA-Z]+', line):
        # re.split produces empty strings at the edges; skip them.
        if fragment:
            yield fragment
def cut(txt):
word_list = []
for
构建词频矩阵, 从而得到TF、IDF
最新推荐文章于 2022-06-19 21:28:50 发布
![](https://img-home.csdnimg.cn/images/20240711042549.png)