"""Train item2vec embeddings from MovieLens ratings with gensim Word2Vec.

Each user's rated item_ids are concatenated into one "sentence" of item
tokens; Word2Vec then places items that are co-rated by the same users
close together in the embedding space (the item2vec trick).
"""
from gensim.models import Word2Vec
import numpy as np
import pandas as pd
import collections
import logging

logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s',
                    level=logging.INFO)

# Hyper-parameters.
embedding_size = 32          # dimensionality of the embedding vectors
max_vocabulary_size = 50000  # max number of distinct items kept in the vocabulary
min_occurrence = 10          # drop items occurring fewer than this many times
skip_window = 3              # items to consider on each side (not used by gensim call below)
num_skips = 2                # label reuse per input (not used by gensim call below)
num_sampled = 64             # negative-sample count (not used by gensim call below)


def main():
    """Load ratings, build per-user item sequences, train and save the model."""
    data_file = "C:/project/data/movielens-m1/ratings.dat"
    # ratings.dat uses "::" as the field separator; a multi-character
    # separator requires the python parsing engine (the C engine cannot
    # handle it and pandas would warn/fall back).
    orig_data = pd.read_csv(
        data_file,
        sep="::",
        engine="python",
        names=["user_id", "item_id", "score", "timestamp"],
        dtype={"user_id": int, "item_id": str, "score": int, "timestamp": int},
    )

    # Merge item_ids per user_id: join into one comma-separated string,
    # then split back into a token list -- one "sentence" per user.
    grouped_data = orig_data.groupby("user_id")["item_id"].apply(",".join).reset_index()
    grouped_data.columns = ["user_id", "item_ids"]
    grouped_data["item_ids_array"] = grouped_data["item_ids"].apply(lambda s: s.split(","))
    sentences = grouped_data["item_ids_array"]

    # gensim 3.x API: `size=` is the vector dimensionality (renamed to
    # `vector_size=` in gensim 4.0 -- adjust if upgrading).
    model = Word2Vec(size=embedding_size,
                     min_count=min_occurrence,
                     max_vocab_size=max_vocabulary_size)
    model.build_vocab(sentences)
    # Fix: train() needs the real corpus size for correct learning-rate
    # decay and progress reporting. Pass total_examples=model.corpus_count
    # (the sentence count recorded by build_vocab); the original passed the
    # unrelated vocabulary cap as total_words, which skews alpha scheduling.
    model.train(sentences,
                total_examples=model.corpus_count,
                epochs=500,
                start_alpha=0.1,
                end_alpha=0.02,
                compute_loss=True)

    model.save("gensim/item2vec.model")
    model.wv.save_word2vec_format("gensim/item2vec.txt", total_vec=max_vocabulary_size)


if __name__ == "__main__":
    main()
TensorFlow 2.0 word2vec → item2vec (gensim) — gensim trains faster than TensorFlow
Latest recommended article published 2022-08-03 22:41:35