Training doc2vec

Keeping this as a note in case I need doc2vec again later.



train_doc2vec.py

import gensim
import jieba
import deal_with_data

neg_file = open('D:/Python/project/selected_data/neg.txt', 'r')
pos_file = open('D:/Python/project/selected_data/pos.txt', 'r')

sentences = deal_with_data.get_data()
# Append the segmented neg/pos corpora (one sentence per line) to the news sentences.
for line in neg_file:
	sentences.append([i for i in jieba.cut(line.strip())])
for line in pos_file:
	sentences.append([i for i in jieba.cut(line.strip())])
neg_file.close()
pos_file.close()


LabeledSentence = gensim.models.doc2vec.LabeledSentence

class LabeledLineSentence(object):
	'''Wraps each token list as a LabeledSentence, tagging the n-th document as SENT_n.'''
	def __init__(self, sentences):
		self.sentences = sentences
	def __iter__(self):
		for id, line in enumerate(self.sentences):
			yield LabeledSentence(words=line, tags=['SENT_%s' % id])

it = LabeledLineSentence(sentences)



# Parameter descriptions (from the gensim documentation)
'''sg (int {1, 0}) – Defines the training algorithm. If 1, skip-gram is employed; otherwise, CBOW is used.
size (int) – Dimensionality of the feature vectors.
window (int) – The maximum distance between the current and predicted word within a sentence.
alpha (float) – The initial learning rate.
min_alpha (float) – Learning rate will linearly drop to min_alpha as training progresses.
seed (int) – Seed for the random number generator. Initial vectors for each word are seeded with a hash of the concatenation of word + str(seed). Note that for a fully deterministically-reproducible run, you must also limit the model to a single worker thread (workers=1), to eliminate ordering jitter from OS thread scheduling. (In Python 3, reproducibility between interpreter launches also requires use of the PYTHONHASHSEED environment variable to control hash randomization).
min_count (int) – Ignores all words with total frequency lower than this.
max_vocab_size (int) – Limits the RAM during vocabulary building; if there are more unique words than this, then prune the infrequent ones. Every 10 million word types need about 1GB of RAM. Set to None for no limit.
sample (float) – The threshold for configuring which higher-frequency words are randomly downsampled, useful range is (0, 1e-5).
workers (int) – Use these many worker threads to train the model (=faster training with multicore machines).
hs (int {1,0}) – If 1, hierarchical softmax will be used for model training. If set to 0, and negative is non-zero, negative sampling will be used.
negative (int) – If > 0, negative sampling will be used, the int for negative specifies how many “noise words” should be drawn (usually between 5-20). If set to 0, no negative sampling is used.
cbow_mean (int {1,0}) – If 0, use the sum of the context word vectors. If 1, use the mean, only applies when cbow is used.
hashfxn (function) – Hash function to use to randomly initialize weights, for increased training reproducibility.
iter (int) – Number of iterations (epochs) over the corpus.
trim_rule (function) – Vocabulary trimming rule, specifies whether certain words should remain in the vocabulary, be trimmed away, or handled using the default (discard if word count < min_count). Can be None (min_count will be used, look to keep_vocab_item()), or a callable that accepts parameters (word, count, min_count) and returns either gensim.utils.RULE_DISCARD, gensim.utils.RULE_KEEP or gensim.utils.RULE_DEFAULT. Note: The rule, if given, is only used to prune vocabulary during build_vocab() and is not stored as part of the model.
sorted_vocab (int {1,0}) – If 1, sort the vocabulary by descending frequency before assigning word indexes.
batch_words (int) – Target size (in words) for batches of examples passed to worker threads (and thus cython routines).(Larger batches will be passed if individual texts are longer than 10000 words, but the standard cython code truncates to that maximum.)
compute_loss (bool) – If True, computes and stores loss value which can be retrieved using model.get_latest_training_loss().
callbacks – List of callbacks that need to be executed/run at specific stages during training.'''
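# The parameter list above is the generic Word2Vec-style description. Doc2Vec itself
# also takes `dm` (1 = PV-DM, "distributed memory"; 0 = PV-DBOW, "distributed bag of
# words"), which plays the role that `sg` plays for Word2Vec; in newer gensim releases
# `size` and `iter` were renamed to `vector_size` and `epochs`.
# A hedged sketch of a PV-DBOW setup with the same old-style API (not run here):
#   dbow_model = gensim.models.Doc2Vec(dm=0, size=64, window=5, min_count=5, workers=4)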
#model = gensim.models.Doc2Vec(size=300, window=10, min_count=1, workers=11, alpha=0.025, min_alpha=0.025)
model = gensim.models.Doc2Vec(size=64, window=5, min_count=5, workers=4, alpha=0.025, min_alpha=0.025)
model.build_vocab(it)

print('begin training model')
for epoch in range(20):
	model.train(it, total_examples=model.corpus_count, epochs=model.iter)
	# Manual learning-rate decay; clamp so alpha never goes negative over the 20 epochs.
	# (Newer gensim recommends a single train() call with the desired number of epochs instead.)
	model.alpha = max(model.alpha - 0.002, 0.0001)
	model.min_alpha = model.alpha
print('finish training model')

model.save('doc2vec_sohu.model')
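To reuse the model later, it can be reloaded and applied to unseen text. A minimal sketch, assuming the same old-style gensim API as the script above; the sample sentence is made up:

import gensim
import jieba

model = gensim.models.Doc2Vec.load('doc2vec_sohu.model')
# Segment new text the same way the training data was segmented, then infer a document vector for it.
new_words = [w for w in jieba.cut('今天天气很好,心情也很开心')]
vector = model.infer_vector(new_words)   # 64-dimensional vector, matching size=64 above
print(vector[:5])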


deal_with_data.py

import os
import jieba.posseg as pseg
import re

# Matches the URL prefix and the literal `1`2 markers at the start of each raw line.
regular_expression = re.compile(r'http://[#a-zA-Z\d\-\./]+`1`2')
# jieba POS tags treated as noise (punctuation / symbols / unknown words).
useless = ['w', 'x', 'un']

def get_sentence(line):
	'''
	Input: a raw, unprocessed line of text
	Returns: the denoised, segmented sentence as a list of words
	'''
	# Strip the leading URL/marker prefix, then POS-tag the remaining text with jieba.
	result = re.match(regular_expression, line.strip()).span()
	after_filter = line[result[1]:]
	words = pseg.cut(after_filter)
	sentence = [word for word, flag in words if flag not in useless]
	
	return sentence
			


def get_data():
	'''
	Input: none
	Returns: the list of processed sentences
	'''
	path = 'big_data'
	files = os.listdir(path)
	s = []
	print('begin reading sentences...')
	for file in files:
		print(file)
		if not os.path.isdir(os.path.join(path, file)):
			f = open(path + '/' + file, 'r', encoding='utf-8')
			for line in f:
				# Skip lines that do not match the expected format.
				try:
					s.append(get_sentence(line))
				except Exception:
					continue
			f.close()
	print('finish reading sentences')
	return s

if __name__ == '__main__':
	get_data()
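For reference, a rough sketch of what get_sentence does to a single line. The sample line below is made up, but follows the format the regular expression implies (a URL prefix followed by the literal `1`2 markers, then the article text):

import deal_with_data

line = 'http://news.sohu.com/20120101/n123456789.shtml`1`2今天天气不错,大家都很开心。'
print(deal_with_data.get_sentence(line))
# Expected: a token list with punctuation and other filtered POS tags removed,
# e.g. ['今天', '天气', '不错', '大家', '都', '很', '开心']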


Results of the trained model:

>>> print(model.most_similar('江苏'))
[('浙江', 0.9632761478424072), ('广东', 0.95753413438797), ('广州', 0.9534556865692139), ('山东', 0.9530595541000366), ('天津', 0.9492722749710083), ('深圳', 0.9487243890762329), ('河南', 0.94721519947052), ('上海', 0.9386720657348633), ('美国', 0.9356528520584106), ('南京', 0.9339625239372253)]

>>> print(model.most_similar('开心'))
[('高兴', 0.917695164680481), ('兴奋', 0.8879016041755676), ('轻松', 0.8435652852058411), ('艰难', 0.8346458673477173), ('顺利', 0.8285167813301086), ('幸运', 0.8214830160140991), ('满意', 0.8194190263748169), ('难过', 0.8085115551948547), ('自豪', 0.8020496368408203), ('清楚', 0.7977278232574463)]

>>> print(model.most_similar('伤心'))
[('气愤', 0.7900305986404419), ('沮丧', 0.7884936332702637), ('失望', 0.7773094773292542), ('痛苦', 0.777236819267273), ('激动', 0.777047872543335), ('生气', 0.7749567031860352), ('委屈', 0.7744845151901245), ('兴奋', 0.7734708189964294), ('愤怒', 0.7673838138580322), ('抱歉', 0.765254020690918)]
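Since this is a Doc2Vec model, the per-document vectors can also be queried through the tags assigned during training ('SENT_0', 'SENT_1', ...). A hedged sketch, assuming the older gensim docvecs interface that matches the API used above:

>>> model.docvecs['SENT_0']                # learned vector for the first document
>>> model.docvecs.most_similar('SENT_0')   # documents most similar to it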



The training set was originally more than 4 million Sohu news articles (thanks to Weibo user @__dada__ for providing the dataset). However, the pre-trained model shared by @__dada__ threw errors when I tried to run it, and my own machine could not train on the full dataset, so in the end I trained on a subset I extracted myself. The result is passable: my model is only 165 MB, while the model trained by @__dada__ is 1.5 GB, so if it ran, his model would surely perform better.

