转载加原创
简单程序代码和注释:
# Toy corpus from the gensim tutorial: nine short document titles.
raw_corpus = ["Human machine interface for lab abc computer applications",
"A survey of user opinion of computer system response time",
"The EPS user interface management system",
"System and human system engineering testing of EPS",
"Relation of user perceived response time to error measurement",
"The generation of random binary unordered trees",
"The intersection graph of paths in trees",
"Graph minors IV Widths of trees and well quasi ordering",
"Graph minors A survey"]
# Common function words to drop before counting.
stoplist = set('for a of the and to in'.split(' '))
# Lower-case and tokenize each document, keeping only non-stop words;
# the result is one token list per document.
texts = [[word for word in document.lower().split() if word not in stoplist]
for document in raw_corpus]
# Counter replaces the original manual defaultdict(int) loop;
# defaultdict is kept imported in case later code relies on it.
from collections import Counter, defaultdict
# Corpus-wide frequency of every token.
frequency = Counter(token for text in texts for token in text)
# Final result: per document, keep only tokens that occur more than
# once across the whole corpus.
precessed_corpus = [[token for token in text if frequency[token] > 1] for text in texts]
# print(precessed_corpus)
from gensim import corpora  # the dictionary gives each token a fixed integer id; vector length equals the dictionary size
dictionary = corpora.Dictionary(precessed_corpus)
print(dictionary)  # summary of the token dictionary
print(dictionary.token2id)  # mapping token -> integer id (a dict)
new_doc = "human computer interaction"  # quick test: vectorize an unseen document with the trained dictionary
new_vec = dictionary.doc2bow(new_doc.lower().split())
# print(new_vec)  # [(0, 1), (2, 1)]: each tuple is (token id, count of that token in the document);
#                 # dictionary tokens absent from the document are implicitly 0 and omitted
bow_corpus = [dictionary.doc2bow(text) for text in precessed_corpus]  # bag-of-words vectors of the training corpus
print(bow_corpus)  # the whole list is held in memory; could be improved with an iterator that yields one document vector at a time
# gensim TF-IDF model
from gensim import models
tfidf = models.TfidfModel(bow_corpus)
string = "system minors"  # acts as a query: compute its tf*idf weights against the corpus
string_bow = dictionary.doc2bow(string.lower().split())
string_tfidf = tfidf[string_bow]
print(string_bow)
print(string_tfidf)  # list of tuples: first element is the token id, second is the tf*idf weight
gensim中 Word2Vec 的使用和针对大语料的改进:
"""Train a word2vec model and print words similar to a query word."""
import logging
import os
from gensim.models import word2vec

# Feeding the whole file at once may exhaust memory on a large corpus:
# sentences = word2vec.LineSentence('./in_the_name_of_people_segment.txt')
# model = word2vec.Word2Vec(sentences, hs=1,min_count=1,window=3,size=100)


class sentences_generator():
    """Re-iterable corpus reader: yields one tokenized sentence per line.

    Unlike a plain generator, an instance can be iterated multiple times,
    which Word2Vec needs for its several passes over the corpus; streaming
    line by line keeps memory use flat for large corpora.
    """

    def __init__(self, filename):
        self.filename = filename

    def __iter__(self):
        # 'with' closes the file handle after each pass; the original
        # called open() without ever closing it (handle leak per pass).
        with open(self.filename, encoding="utf8") as corpus_file:
            for line in corpus_file:
                yield line.rstrip().split(' ')


# Use the streaming reader when the corpus is too large for memory.
sentences = sentences_generator('./in_the_name_of_people_segment.txt')
# NOTE(review): 'size' was renamed 'vector_size' in gensim 4.x — confirm
# the installed gensim version before running.
model = word2vec.Word2Vec(sentences, hs=1, min_count=1, window=3, size=100)

# Print the five most similar words to the query word.
req_count = 5
for word, similarity in model.wv.similar_by_word('沙瑞金', topn=100):
    req_count -= 1
    print(word, similarity)
    if req_count == 0:
        break
在针对大语料进行实验的时候,用类迭代器的方法和用普通方法相比结果有点区别,可能是使用迭代器针对大语料的时候是一行行地去训练模型。
使用迭代器针对大语料:
使用普通方法的时候:
关于模型的储存问题
# Persist the trained model to disk, then load it back.
model.save('model1')
new_model = word2vec.Word2Vec.load('model1')