算法中最主要的是用到了 gensim.models.doc2vec:将信息存储成词典进行建模,并将信息文件转存到数据库中供其他代码使用。因为注释写得比较清晰,所以逻辑不再赘述,直接上代码看看就知道啦~因为在前面的算法中已经对数据库操作类进行了描述,这里就不再粘贴出来,有需要的亲可以翻看以前的文章参考即可~算法代码如下:
import os
import re
import sys

import gensim
import jieba.posseg as pseg
import tensorflow as tf
from gensim.models.doc2vec import Doc2Vec

from loadData import loadData
# Command-line flags for this script. Each DEFINE_* call registers a flag
# (name, default value, help text) on the shared tf.flags registry.

# Directory locations.
tf.flags.DEFINE_string("base_dir", ".", "files base_dir")
tf.flags.DEFINE_string("train_dir", ".\\train", "trainning files base_dir")
tf.flags.DEFINE_string("test_dir", ".\\test", "test files base_dir")
tf.flags.DEFINE_string("model_dir", "./doc2vecmodel", "Model directory from training run")

# Model hyperparameters.
# NOTE(review): presumably forwarded to gensim's Doc2Vec further down the
# file -- confirm against the training code.
tf.flags.DEFINE_integer('vector_dim', 500, 'dimensionality of characters')
tf.flags.DEFINE_integer('epoch_num', 70, 'the number of epoch')
tf.flags.DEFINE_integer('min_count', 1, 'ignore the words which freq lower than min_count')
tf.flags.DEFINE_integer('window', 3, 'the max distance between relative content')
tf.flags.DEFINE_integer('negative', 5, 'the number of negative that we can accept')
tf.flags.DEFINE_integer('workers', 4, 'the module number of worker')

FLAGS = tf.flags.FLAGS

# is_parsed() only reports whether parsing has happened; calling it and
# discarding the result parses nothing. Parse the command line explicitly
# when it has not been done yet. known_only=True leaves unrecognised
# arguments alone instead of raising on them.
if not FLAGS.is_parsed():
    FLAGS(sys.argv, known_only=True)

# Dump the effective configuration, sorted by flag name.
# flag_values_dict() maps each flag name to its current value; the private
# FLAGS.__flags mapping holds Flag objects, so formatting its values would
# print object reprs rather than the settings themselves.
print("\nParameters:")
for attr, value in sorted(FLAGS.flag_values_dict().items()):
    print("{}={}".format(attr.upper(), value))
print("")
class Singleton