# coding=utf-8
"""
author=wanggang
date:Jan,10,2020
"""
import nltk
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer


class DP:
    """Turn a line-per-document corpus file into a dense TF-IDF weight matrix.

    Pipeline: load stopwords -> NLTK-tokenize each line, drop stopwords ->
    CountVectorizer + TfidfTransformer -> dense weight array.
    """

    def __init__(self, stopwords_path=None):
        # Stopword list is loaded once and reused for every corpus.
        self.stopwords = self.load_stopwords(stopwords_path)
        self.vectorizer = CountVectorizer()
        self.transformer = TfidfTransformer()

    def load_stopwords(self, stopwords=None):
        """Load stopwords from a file (one word per line, UTF-8).

        :param stopwords: path to the stopword file, or None
        :return: list of stopwords; empty list when no path is given
        """
        if stopwords:
            with open(stopwords, 'r', encoding='utf-8') as f:
                return [line.strip() for line in f]
        return []

    def preprocess_data(self, corpus_path):
        """Preprocess the corpus: one document per line.

        Each line is tokenized with nltk.word_tokenize, stopwords are
        removed, and the surviving tokens are re-joined with spaces so
        CountVectorizer can split them again.

        :param corpus_path: path to the corpus file (UTF-8)
        :return: list of space-joined token strings, one per input line
        """
        corpus = []
        with open(corpus_path, 'r', encoding='utf-8') as f:
            for line in f:
                tokens = nltk.word_tokenize(line.strip())
                corpus.append(' '.join(w for w in tokens if w not in self.stopwords))
        return corpus

    def get_text_tfidf_matrix(self, corpus):
        """Fit the vectorizer/transformer on the corpus and return its
        dense TF-IDF matrix (one row per document).

        :param corpus: list of space-joined token strings
        :return: 2-D numpy array of TF-IDF weights
        """
        tfidf = self.transformer.fit_transform(self.vectorizer.fit_transform(corpus))
        # words = self.vectorizer.get_feature_names()  # vocabulary, if needed
        return tfidf.toarray()

    def generate_weights(self, corpus_path):
        """Preprocess the corpus file and return its TF-IDF weight matrix.

        :param corpus_path: corpus path (one document per line, ids from 0)
        :return: 2-D numpy array of TF-IDF weights
        """
        corpus = self.preprocess_data(corpus_path)
        return self.get_text_tfidf_matrix(corpus)


if __name__ == '__main__':
    # Raw strings: the original 'data\keyword.txt' only worked because \k is
    # not a recognized escape sequence.
    dp_cluster = DP(
        stopwords_path=r'C:\Users\Administrator\PycharmProjects\OpenReview\Cluster\data\stop_words.txt'
    )
    result = dp_cluster.generate_weights(
        r'C:\Users\Administrator\PycharmProjects\OpenReview\Cluster\data\keyword.txt'
    )
    # Iterate every row directly. The original hard-coded range(2268) while
    # the corpus had 2269 lines, silently dropping the last document; it also
    # shadowed the builtin `str` with a temporary list.
    for row in result:
        print(list(row), end='')
TF-IDF 生成向量的方法,直接上代码。
最新推荐文章于 2024-06-17 21:12:11 发布