"""NLP example: build bag-of-words / TF-IDF / LSI / LDA models over a Douban
review corpus with gensim, then run an LSI similarity query."""
import logging
from logging import NullHandler

import jieba  # was missing: test_lsi_query calls jieba.cut()
from gensim import corpora, models, similarities

from cleaner import StopWordFilter
from corpussrc import DoubanCorpus

log = logging.getLogger(__name__)
log.addHandler(NullHandler())


def test_lsi_query(dictionary, lsi, index):
    """Tokenize a sample review, project it into LSI space, and log all
    corpus documents ranked by similarity to it.

    dictionary: gensim Dictionary used to build the bow vector
    lsi: trained LsiModel
    index: MatrixSimilarity built over the LSI-space corpus
    """
    teststr = u'哈哈有个评论太可爱了,你们知道吴京有多努力吗?不过打一星是看新闻气的。'
    stop_filter = StopWordFilter()  # renamed: don't shadow builtin `filter`
    vec_bow = dictionary.doc2bow(stop_filter.transform(jieba.cut(teststr)))
    vec_lsi = lsi[vec_bow]
    sims = index[vec_lsi]
    # Sort (doc_id, score) pairs by descending similarity.
    sims = sorted(enumerate(sims), key=lambda item: -item[1])
    log.warning(sims)  # Logger.warn is a deprecated alias of warning


def test_deep_learning():
    """Train a Word2Vec model on the review corpus and probe it.

    size: dimensionality of the feature vectors
    window: context window length
    min_count: minimum word frequency to keep a token
    workers: number of worker threads
    """
    model = models.Word2Vec(DoubanCorpus('tbDoubanReview'),
                            size=100, window=5, min_count=5, workers=4)
    model.wv[u'吴京']
    model.wv.most_similar(positive=['woman', 'king'], negative=['man'])


# Pipeline: corpus -> dictionary -> bow -> model -> similarity.
# Core idea: vectorize the documents, then compare the vectors.
def testapi():
    """End-to-end demo: dictionary, BoW corpus, TF-IDF, LSI, LDA, similarity."""
    dictionary = corpora.Dictionary(DoubanCorpus('tbDoubanReview'))
    log.warning(dictionary.token2id)
    docs = DoubanCorpus('tbDoubanReview')
    # BoW format, e.g. [(0, 1), (1, 1), ...]: each tuple is
    # (dictionary id, count in this document); only words that actually
    # occur in the document are stored.
    corpus = []
    for docwords in docs:
        log.warning(docwords)
        corpus.append(dictionary.doc2bow(docwords))
    log.warning(corpus)
    # 1. BoW -> TF-IDF model.
    tfidf = models.TfidfModel(corpus)
    # 2. Transformed corpus, e.g. [(4, 0.447...), (5, 0.447...), ...]:
    # each weight reflects how important the word is — larger means more important.
    corpus_tfidf = tfidf[corpus]
    for doc in corpus_tfidf:
        log.warning(doc)
    # 3. LSI model: compresses the data via SVD, which acts as denoising
    # (mitigates interference from synonyms/antonyms).
    lsi = models.LsiModel(corpus_tfidf, id2word=dictionary, num_topics=10)
    corpus_lsi = lsi[corpus_tfidf]  # transformed corpus (unused in this demo)
    lsi.print_topics(10)
    lda = models.LdaModel(corpus, id2word=dictionary, num_topics=100)  # input is BoW
    lda.print_topics(20)
    # Build the similarity index over the LSI-space vectors first.
    index = similarities.MatrixSimilarity(lsi[corpus_tfidf])
    test_lsi_query(dictionary, lsi, index)


if __name__ == '__main__':
    testapi()
# NLP example code
# (originally published as a blog post; latest recommended revision 2022-10-12 10:53:45)