Training doc2vec

Keeping this as a note in case I need doc2vec again later.



train_doc2vec.py

import gensim
import jieba
import deal_with_data

neg_file = open('D:/Python/project/selected_data/neg.txt', 'r')
pos_file = open('D:/Python/project/selected_data/pos.txt', 'r')

sentences = deal_with_data.get_data()
# Append the segmented neg/pos corpora (one sentence per line) to the news sentences.
for line in neg_file:
	sentences.append([i for i in jieba.cut(line.strip())])
for line in pos_file:
	sentences.append([i for i in jieba.cut(line.strip())])
neg_file.close()
pos_file.close()


LabeledSentence = gensim.models.doc2vec.LabeledSentence

class LabeledLineSentence(object):
	'''Wraps each token list as a LabeledSentence, tagging the n-th document as SENT_n.'''
	def __init__(self, sentences):
		self.sentences = sentences
	def __iter__(self):
		for id, line in enumerate(self.sentences):
			yield LabeledSentence(words=line, tags=['SENT_%s' % id])

it = LabeledLineSentence(sentences)



# Parameter descriptions (from the gensim documentation)
'''sg (int {1, 0}) – Defines the training algorithm. If 1, skip-gram is employed; otherwise, CBOW is used.
size (int) – Dimensionality of the feature vectors.
window (int) – The maximum distance between the current and predicted word within a sentence.
alpha (float) – The initial learning rate.
min_alpha (float) – Learning rate will linearly drop to min_alpha as training progresses.
seed (int) – Seed for the random number generator. Initial vectors for each word are seeded with a hash of the concatenation of word + str(seed). Note that for a fully deterministically-reproducible run, you must also limit the model to a single worker thread (workers=1), to eliminate ordering jitter from OS thread scheduling. (In Python 3, reproducibility between interpreter launches also requires use of the PYTHONHASHSEED environment variable to control hash randomization).
min_count (int) – Ignores all words with total frequency lower than this.
max_vocab_size (int) – Limits the RAM during vocabulary building; if there are more unique words than this, then prune the infrequent ones. Every 10 million word types need about 1GB of RAM. Set to None for no limit.
sample (float) – The threshold for configuring which higher-frequency words are randomly downsampled, useful range is (0, 1e-5).
workers (int) – Use these many worker threads to train the model (=faster training with multicore machines).
hs (int {1,0}) – If 1, hierarchical softmax will be used for model training. If set to 0, and negative is non-zero, negative sampling will be used.
negative (int) – If > 0, negative sampling will be used, the int for negative specifies how many “noise words” should be drawn (usually between 5-20). If set to 0, no negative sampling is used.
cbow_mean (int {1,0}) – If 0, use the sum of the context word vectors. If 1, use the mean, only applies when cbow is used.
hashfxn (function) – Hash function to use to randomly initialize weights, for increased training reproducibility.
iter (int) – Number of iterations (epochs) over the corpus.
trim_rule (function) – Vocabulary trimming rule, specifies whether certain words should remain in the vocabulary, be trimmed away, or handled using the default (discard if word count < min_count). Can be None (min_count will be used, look to keep_vocab_item()), or a callable that accepts parameters (word, count, min_count) and returns either gensim.utils.RULE_DISCARD, gensim.utils.RULE_KEEP or gensim.utils.RULE_DEFAULT. Note: The rule, if given, is only used to prune vocabulary during build_vocab() and is not stored as part of the model.
sorted_vocab (int {1,0}) – If 1, sort the vocabulary by descending frequency before assigning word indexes.
batch_words (int) – Target size (in words) for batches of examples passed to worker threads (and thus cython routines).(Larger batches will be passed if individual texts are longer than 10000 words, but the standard cython code truncates to that maximum.)
compute_loss (bool) – If True, computes and stores loss value which can be retrieved using model.get_latest_training_loss().
callbacks – List of callbacks that need to be executed/run at specific stages during training.'''
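# The parameter list above is the generic Word2Vec-style description. Doc2Vec itself
# also takes `dm` (1 = PV-DM, "distributed memory"; 0 = PV-DBOW, "distributed bag of
# words"), which plays the role that `sg` plays for Word2Vec; in newer gensim releases
# `size` and `iter` were renamed to `vector_size` and `epochs`.
# A hedged sketch of a PV-DBOW setup with the same old-style API (not run here):
#   dbow_model = gensim.models.Doc2Vec(dm=0, size=64, window=5, min_count=5, workers=4)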
#model = gensim.models.Doc2Vec(size=300, window=10, min_count=1, workers=11, alpha=0.025, min_alpha=0.025)
model = gensim.models.Doc2Vec(size=64, window=5, min_count=5, workers=4, alpha=0.025, min_alpha=0.025)
model.build_vocab(it)

print('begin training model')
for epoch in range(20):
	model.train(it, total_examples=model.corpus_count, epochs=model.iter)
	# Manual learning-rate decay; clamp so alpha never goes negative over the 20 epochs.
	# (Newer gensim recommends a single train() call with the desired number of epochs instead.)
	model.alpha = max(model.alpha - 0.002, 0.0001)
	model.min_alpha = model.alpha
print('finish training model')

model.save('doc2vec_sohu.model')
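To reuse the model later, it can be reloaded and applied to unseen text. A minimal sketch, assuming the same old-style gensim API as the script above; the sample sentence is made up:

import gensim
import jieba

model = gensim.models.Doc2Vec.load('doc2vec_sohu.model')
# Segment new text the same way the training data was segmented, then infer a document vector for it.
new_words = [w for w in jieba.cut('今天天气很好,心情也很开心')]
vector = model.infer_vector(new_words)   # 64-dimensional vector, matching size=64 above
print(vector[:5])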


deal_with_data.py

import os
import jieba.posseg as pseg
import re

# Matches the URL prefix and the literal `1`2 markers at the start of each raw line.
regular_expression = re.compile(r'http://[#a-zA-Z\d\-\./]+`1`2')
# jieba POS tags treated as noise (punctuation / symbols / unknown words).
useless = ['w', 'x', 'un']

def get_sentence(line):
	'''
	Input: a raw, unprocessed line of text
	Returns: the denoised, segmented sentence as a list of words
	'''
	# Strip the leading URL/marker prefix, then POS-tag the remaining text with jieba.
	result = re.match(regular_expression, line.strip()).span()
	after_filter = line[result[1]:]
	words = pseg.cut(after_filter)
	sentence = [word for word, flag in words if flag not in useless]
	
	return sentence
			


def get_data():
	'''
	Input: none
	Returns: the list of processed sentences
	'''
	path = 'big_data'
	files = os.listdir(path)
	s = []
	print('begin reading sentences...')
	for file in files:
		print(file)
		if not os.path.isdir(os.path.join(path, file)):
			f = open(path + '/' + file, 'r', encoding='utf-8')
			for line in f:
				# Skip lines that do not match the expected format.
				try:
					s.append(get_sentence(line))
				except Exception:
					continue
			f.close()
	print('finish reading sentences')
	return s

if __name__ == '__main__':
	get_data()
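For reference, a rough sketch of what get_sentence does to a single line. The sample line below is made up, but follows the format the regular expression implies (a URL prefix followed by the literal `1`2 markers, then the article text):

import deal_with_data

line = 'http://news.sohu.com/20120101/n123456789.shtml`1`2今天天气不错,大家都很开心。'
print(deal_with_data.get_sentence(line))
# Expected: a token list with punctuation and other filtered POS tags removed,
# e.g. ['今天', '天气', '不错', '大家', '都', '很', '开心']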


Results of the trained model:

>>> print(model.most_similar('江苏'))
[('浙江', 0.9632761478424072), ('广东', 0.95753413438797), ('广州', 0.9534556865692139), ('山东', 0.9530595541000366), ('天津', 0.9492722749710083), ('深圳', 0.9487243890762329), ('河南', 0.94721519947052), ('上海', 0.9386720657348633), ('美国', 0.9356528520584106), ('南京', 0.9339625239372253)]

>>> print(model.most_similar('开心'))
[('高兴', 0.917695164680481), ('兴奋', 0.8879016041755676), ('轻松', 0.8435652852058411), ('艰难', 0.8346458673477173), ('顺利', 0.8285167813301086), ('幸运', 0.8214830160140991), ('满意', 0.8194190263748169), ('难过', 0.8085115551948547), ('自豪', 0.8020496368408203), ('清楚', 0.7977278232574463)]

>>> print(model.most_similar('伤心'))
[('气愤', 0.7900305986404419), ('沮丧', 0.7884936332702637), ('失望', 0.7773094773292542), ('痛苦', 0.777236819267273), ('激动', 0.777047872543335), ('生气', 0.7749567031860352), ('委屈', 0.7744845151901245), ('兴奋', 0.7734708189964294), ('愤怒', 0.7673838138580322), ('抱歉', 0.765254020690918)]
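Since this is a Doc2Vec model, the per-document vectors can also be queried through the tags assigned during training ('SENT_0', 'SENT_1', ...). A hedged sketch, assuming the older gensim docvecs interface that matches the API used above:

>>> model.docvecs['SENT_0']                # learned vector for the first document
>>> model.docvecs.most_similar('SENT_0')   # documents most similar to it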



The training set was originally more than 4 million Sohu news articles (thanks to Weibo user @__dada__ for providing the dataset). However, the pre-trained model shared by @__dada__ threw errors when I tried to run it, and my own machine could not train on the full dataset, so in the end I trained on a subset I extracted myself. The result is passable: my model is only 165 MB, while the model trained by @__dada__ is 1.5 GB, so if it ran, his model would surely perform better.

