# Static configuration: stop-word file path and corpus CSV for the pipeline below.
stop_word_path = "/InferenceSystem/src/I5_algorithm/NLP数据集合/停词库/stop_word_for_chinese.txt"
corpus = "南方网 3.csv"
# NOTE(review): neither `get_news_paper` nor `url` is defined in this chunk —
# presumably they come from an earlier part of the file; confirm before running
# this module standalone.
get_news_paper(url,corpus)
# Temporarily delete text elements
def del_element(strings, symbles):
    """Delete every symbol in *symbles* from *strings* and return the result.

    Args:
        strings: the input text.
        symbles: an iterable of symbols (each element is deleted literally).

    Returns:
        *strings* with all occurrences of the symbols removed.

    Bug fix: the original built a replacement dict and looked each match up in
    it; with an empty *symbles* the compiled pattern was "" (matches the empty
    string at every position) and the lookup raised KeyError.  Substituting ""
    directly is equivalent for non-empty symbols and is a no-op when *symbles*
    is empty.
    """
    # Escape each symbol so regex metacharacters are matched literally.
    pattern = re.compile("|".join(re.escape(sym) for sym in symbles))
    return pattern.sub("", strings)
# Load the stop-word list: one word per line, plus a literal "\n" token so
# stray newline tokens are also filtered downstream.
# Fixes: the original had a duplicated `stop_words = stop_words =` assignment
# and left the file handle unclosed; encoding is pinned to UTF-8
# (NOTE(review): assumes the stop-word file is UTF-8 — confirm).
with open(stop_word_path, 'r', encoding='utf-8') as f:
    stop_words = f.read().split('\n') + ['\n']
# Filter stop words
def filter_stop_word(paper, stop_words):
    """Segment *paper* with jieba (newlines stripped first) and drop stop words.

    Returns a numpy array of the surviving tokens.
    """
    tokens = jieba.cut(del_element(paper, '\n'))
    kept = [tok for tok in tokens if tok not in stop_words]
    return np.array(kept)
# Read local news articles
def read_txt(corpus):
    """Load the CSV at *corpus* and return its `text` column with newlines removed.

    Returns a numpy array of cleaned article strings.
    """
    cleaned = []
    for article in tqdm(pd.read_csv(corpus).text, desc='加载文章'):
        cleaned.append(re.sub('\n', '', str(article)))
    return np.array(cleaned)
# Keep Chinese characters only
def just_chinese(strings):
    """Concatenate all runs of Chinese characters in *strings*.

    Returns the concatenation, or a single newline when *strings* contains
    no Chinese characters (so downstream tokenization still gets a token).
    """
    runs = re.findall("[\u4E00-\u9FA5]+", strings)
    chinese = ''.join(runs)
    return chinese if chinese else '\n'
# Word segmentation
def split_word(original, temp_del=stop_words):
    """Segment each paper into a filtered token array.

    Args:
        original: iterable of raw paper strings.
        temp_del: stop-word list used for filtering; defaults to the
            module-level `stop_words`.

    Returns:
        A 1-D numpy object array, one entry per paper, each entry a numpy
        array of the paper's filtered tokens.

    Bug fix: the original accepted `temp_del` but always filtered with the
    global `stop_words`, silently ignoring the argument.
    """
    result = []
    for paper in tqdm(original, desc='分词文章'):
        chinese = just_chinese(paper)
        result.append(filter_stop_word(chinese, temp_del))
    # Per-paper token arrays are ragged; a plain np.array(result) raises on
    # modern NumPy, so build the object array explicitly.
    papers = np.empty(len(result), dtype=object)
    papers[:] = result
    return papers
# Sort a dictionary by key, descending
def sort_dict(dict_items):
    """Return a new dict with *dict_items*' entries ordered by key, descending.

    Bug fix: the original round-tripped the sorted items through ``np.array``,
    which coerced every value to a string (a mixed str/float array gets a
    ``<U`` dtype) and raised IndexError on an empty dict.  Sorting the items
    directly preserves value types and handles the empty case.
    """
    return dict(sorted(dict_items.items(), key=lambda kv: kv[0], reverse=True))
'''Data-preprocessing function'''
def data_preprocessing(corpus):
    """Read the corpus, segment it, and build word-frequency structures.

    Args:
        corpus: path to the news CSV file.

    Returns:
        word_dict: {word: relative frequency}, keys sorted descending.
        word_vector: numpy array of the vocabulary in `word_dict` key order.
        read_original: numpy array of the raw article texts.
        init_paper: per-article arrays of filtered tokens.
    """
    # Read the raw articles
    read_original = read_txt(corpus)
    # Segment each article and filter stop words
    init_paper = split_word(read_original, stop_words)
    # Flatten every per-article token array into one 1-D word list
    all_words = np.array([w for paper in tqdm(init_paper, desc='词列表降维') for w in paper])
    # Total token count
    m = all_words.size
    # Perf fix: one np.unique pass yields the vocabulary AND the counts,
    # replacing the original O(n·V) per-word boolean-mask dot product.
    uniques, counts = np.unique(all_words, return_counts=True)
    init_word_dict = {word: count / m for word, count in zip(uniques, counts)}
    # Build the sorted dictionary and the feature vector
    word_dict = sort_dict(init_word_dict)
    word_vector = np.array(list(word_dict))
    return word_dict, word_vector, read_original, init_paper
'''TF-IDF standard bag-of-words'''
def TF(paper_words, word_vector):
    """Raw term-frequency vector of one document over *word_vector*.

    Args:
        paper_words: iterable of tokens for one document.
        word_vector: numpy array of vocabulary words (fixes the output order).

    Returns:
        numpy float array of shape (word_vector.size,) with raw counts;
        tokens absent from the vocabulary are ignored.
    """
    # Perf fix: one dict lookup per token replaces the original O(V)
    # `word in word_vector` test plus `np.argwhere` scan per token.
    index_of = {word: i for i, word in enumerate(word_vector)}
    counts = np.zeros(word_vector.size)
    for word in paper_words:
        idx = index_of.get(word)
        if idx is not None:
            counts[idx] += 1
    return counts
def IDF(paper_words_list, word_vector):
    """Inverse document frequency of each vocabulary word.

    Args:
        paper_words_list: sequence of per-document token collections.
        word_vector: numpy array of vocabulary words.

    Returns:
        numpy array of log(N / (df + 1)) per word, where df is the number of
        documents containing the word and N the total document count.

    Bug fix: the original set ``N = paper_words_list.shape`` — a tuple — which
    only produced correct values by accidental broadcasting for 1-D object
    arrays and raised for regular 2-D arrays; the document count is used
    directly instead.
    """
    N = len(paper_words_list)
    # Perf: O(1) set membership per document instead of scanning each
    # document's token array once per vocabulary word.
    doc_sets = [set(doc) for doc in paper_words_list]
    df = np.zeros(word_vector.size)
    for i, word in enumerate(tqdm(word_vector, desc='IDF词汇')):
        df[i] = sum(1 for doc in doc_sets if word in doc)
    return np.log(N / (df + 1))
def TFIDF(paper_words_list, word_vector):
    """TF-IDF matrix: one row per document, one column per vocabulary word.

    Args:
        paper_words_list: sequence of per-document token collections.
        word_vector: numpy array of vocabulary words.

    Returns:
        numpy array of shape (len(paper_words_list), word_vector.size).

    Bug fix: the original computed IDF over the global `init_paper`, silently
    ignoring the `paper_words_list` argument; it now uses the argument.
    """
    IDF_arr = IDF(paper_words_list, word_vector)
    TF_arr = np.array([TF(paper, word_vector) for paper in tqdm(paper_words_list, desc='TF矩阵')])
    return TF_arr * IDF_arr
# Pure-Chinese TF-IDF standard bag-of-words
# (Blog-scrape artifact — original footer: "最新推荐文章于 2024-01-30 22:54:58 发布";
#  commented out so the file parses as Python.)