# 各类关键词抽取:tf-idf、textrank4zh(中文)、RAKE(英文)
import codecs
import os
import jieba.analyse
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
# Folder containing the source texts whose keywords will be extracted.
base_path = "D:\小周\论文\知乎数据\tfidf-article\\base"
# Folder where the segmented (keyword-extracted) texts are written.
seg_path = "D:\小周\论文\知乎数据\tfidf-article\\segfile"
# Load the stop-word list into a set (one word per line). The original
# kept the whole file as one string, so `w not in stop_words` performed a
# SUBSTRING test and wrongly dropped any keyword that happened to occur
# inside a longer stop word; a set gives exact whole-word membership.
# `with` also guarantees the handle is closed even if reading fails.
with open("D:\小周\论文\学位论文\文本分析类\hit_stopwords.txt", "r", encoding='UTF-8') as f_stop:
    stop_words = set(f_stop.read().split())
def segment():
    """Extract keywords from every file in ``base_path``.

    For each input file, runs jieba's TF-IDF keyword extraction
    (``extract_tags``) line by line, filters the results against the
    module-level ``stop_words``, and writes all surviving keywords of the
    document space-separated to a same-named file in ``seg_path``.
    """
    for txt in os.listdir(base_path):
        whole_base = os.path.join(base_path, txt)
        whole_seg = os.path.join(seg_path, txt)
        # Open both files with `with` so they are closed even on error
        # (the original opened `fw` without a context manager and leaked
        # it if an exception occurred mid-loop).
        with codecs.open(whole_base, 'r', 'utf-8') as fr, \
                codecs.open(whole_seg, 'w', 'utf-8') as fw:
            doc_words = []
            for line in fr:
                # topK=20: up to 20 keywords per line, ranked by weight
                # (highest first); withWeight=False: return keywords only,
                # no weights; allowPOS=(): no part-of-speech filtering.
                tags = jieba.analyse.extract_tags(
                    line.strip(), topK=20, withWeight=False, allowPOS=())
                doc_words.extend(w for w in tags if w not in stop_words)
            # Join once per document: the original wrote each line's
            # keywords with no separator between lines, fusing the last
            # keyword of one line with the first keyword of the next.
            fw.write(" ".join(doc_words))
def read_doc_list():
    """Read every segmented document from ``seg_path``.

    Returns:
        (trade_list, doc_list): ``trade_list`` holds the file names
        truncated at the first dot; ``doc_list`` holds the corresponding
        file contents with newlines flattened to single spaces.
    """
    trade_list = []
    doc_list = []
    for txt in os.listdir(seg_path):
        trade_list.append(txt.split(".")[0])
        with codecs.open(os.path.join(seg_path, txt), "r", "utf-8") as fr:
            # Replace newlines with spaces rather than deleting them:
            # removal would fuse the word before the break with the word
            # after it into one bogus token.
            doc_list.append(fr.read().replace('\n', ' '))
    return trade_list, doc_list
def tfidf_top(trade_list, doc_list, max_df, topn):
    """Return the ``topn`` highest TF-IDF terms for each document.

    Args:
        trade_list: row labels, one per document.
        doc_list: whitespace-tokenized document strings.
        max_df: forwarded to TfidfVectorizer; terms appearing in more
            than this fraction of documents are ignored.
        topn: number of top-weighted terms to keep per document.

    Returns:
        pandas.DataFrame of shape (len(doc_list), topn) whose cells are
        the feature names with the highest TF-IDF weight per document,
        indexed by ``trade_list``.
    """
    vectorizer = TfidfVectorizer(max_df=max_df)
    matrix = vectorizer.fit_transform(doc_list)
    # Map column index -> feature (term) name.
    feature_dict = {v: k for k, v in vectorizer.vocabulary_.items()}
    # toarray() returns a plain ndarray; the original's todense() yields
    # the discouraged/deprecated np.matrix type. Negating the weights
    # makes argsort order each row from highest to lowest TF-IDF.
    weights = matrix.toarray()
    top_n_matrix = np.argsort(-weights)[:, :topn]
    # Translate the index matrix into term names, row-labelled by trade.
    return pd.DataFrame(np.vectorize(feature_dict.get)(top_n_matrix), index=trade_list)
if __name__ == "__main__":
    # Run the full pipeline only when executed as a script, so importing
    # this module does not trigger segmentation and file I/O.
    segment()
    tl, dl = read_doc_list()
    # Ignore terms that appear in more than 50% of the documents and keep
    # the top 10 keywords per document.
    tdf = tfidf_top(tl, dl, max_df=0.5, topn=10)
    tdf.to_csv("D:\小周\论文\知乎数据\tfidf-article\\keywords1.txt", header=False, encoding='utf-8')