1、Word2Vec()
- from gensim.models import Word2Vec
Word2Vec(sentences=None, size=100, alpha=0.025, window=5, min_count=5, max_vocab_size=None, sample=1e-3, seed=1, workers=3, min_alpha=0.0001, sg=0, hs=0, negative=5, cbow_mean=1, hashfxn=hash, iter=5, null_word=0, trim_rule=None, sorted_vocab=1, batch_words=MAX_WORDS_IN_BATCH)
- ----sentences:可以是一个list,对于大语料集,建议使用BrownCorpus,Text8Corpus或LineSentence构建。
- ----size:是指特征向量的维度,默认为100
- ----alpha: 是初始的学习速率,在训练过程中会线性地递减到min_alpha
- ----window:窗口大小,表示当前词与预测词在一个句子中的最大距离是多少
- ----min_count: 可以对字典做截断. 词频少于min_count次数的单词会被丢弃掉, 默认值为5
- ----max_vocab_size: 设置词向量构建期间的RAM限制,设置成None则没有限制
- ----sample: 高频词汇的随机降采样的配置阈值,默认为1e-3,范围是(0,1e-5)
- ----seed:用于随机数发生器。与初始化词向量有关
- ----workers:用于控制训练的并行数
- ----min_alpha:学习率的最小值
- ----sg: 用于设置训练算法,默认为0,对应CBOW算法;sg=1则采用skip-gram算法
- ----hs: 如果为1则会采用hierarchical softmax技巧。如果设置为0(默认)且negative不为0,则使用negative sampling
- ----negative: 如果>0,则会采用negative sampling,用于设置多少个noise words(一般是5-20)
- ----cbow_mean: 如果为0,则采用上下文词向量的和,如果为1(default)则采用均值,只有使用CBOW的时候才起作用
- ----hashfxn: hash函数来初始化权重,默认使用python的hash函数
- ----iter: 迭代次数,默认为5
- ----trim_rule: 用于设置词汇表的整理规则,指定哪些单词要留下,哪些要被删除。可以设置为None(min_count会被使用)
- ----sorted_vocab: 如果为1(默认),则在分配word index 的时候会先对单词基于频率降序排序
- ----batch_words:每一批的传递给线程的单词的数量,默认为10000
2、说明
参数:
sentences:中文的或者英文的文章都可以,一般一篇文章就是一行,要经过预处理才能使用,将文本语料进行分词,以空格,tab隔开都可以。可以是读入数据以后为list的语料,一般是二维list,每一个子list是一篇文章切词后的形式;要么使用gensim提供的LineSentence(或BrownCorpus、Text8Corpus)从文件中逐行读取。
模型的词向量调用:
model.wv.vocab:训练得到的词汇表(词到Vocab对象的字典),可以直接查看模型中包含哪些词;某个词的词向量用model.wv['词']获取
模型的保存(二进制):
# Build the data stream and load the corpus. The text must already be segmented:
# one segmented document per line, with words separated by spaces.
sentences = LineSentence('./test_w2c_qieci_wenben.txt')
# pdb.set_trace()
# Train the model.
model = Word2Vec(sentences , size=100, window=5, min_count=1, workers=4)
# Save the model.
model.save('./w2cmodel/word2vec_test.model')
# model = Word2Vec(qieci_str, size=100, window=5, min_count=1, workers=4)
print(model.wv.vocab)
模型的保存(txt文本形式):
# Build the data stream and load the corpus. The text must already be segmented:
# one segmented document per line, with words separated by spaces.
sentences = LineSentence('./test_w2c_qieci_wenben.txt')
# Train the model.
model = Word2Vec(sentences , size=100, window=5, min_count=1, workers=4)
# Save the word vectors in word2vec format; binary=False writes plain text.
model.wv.save_word2vec_format('./w2cmodel/word2vec_test.txt',binary=False)
模型的加载(二进制):
# Reload the model file written by model.save() and print its vocabulary mapping.
model = Word2Vec.load('./w2cmodel/word2vec_test.model')
print(model.wv.vocab)
模型的加载(txt文本):
# Load the word vectors from the plain-text word2vec file.
# load_word2vec_format returns a KeyedVectors object rather than a full
# Word2Vec model, so the vocabulary is read from it directly instead of going
# through the deprecated `.wv` self-alias.
model = gensim.models.KeyedVectors.load_word2vec_format('./w2cmodel/word2vec_test.txt')
print(model.vocab)
模型数据传入说明:
这里就是对于参数sentences的传入结果说明:
- 尽量使用gensim提供的LineSentence (建议:对于大语料集,建议使用BrownCorpus,Text8Corpus或LineSentence构建),这一种传入的数据必须是切分后的文本文件,而且一个文本占一行,词之间使用空格进行连接
# Stream the pre-segmented corpus file, train on it, and print the resulting
# vocabulary mapping (sample output is shown in the string that follows).
sentences = LineSentence('./test_w2c_qieci_wenben.txt')
model = Word2Vec(sentences , size=100, window=5, min_count=1, workers=4)
print(model.wv.vocab)
"""
'熟悉': <gensim.models.keyedvectors.Vocab object at 0x000000000E5C15F8>, '汉朝': <gensim.models.keyedvectors.Vocab object at 0x000000000E5C1630>, '历史': <gensim.models.keyedvectors.Vocab object at 0x000000000E5C1668>, '的': <gensim.models.keyedvectors.Vocab object at 0x000000000E5C16A0>, '人': <gensim.models.keyedvectors.Vocab object at 0x000000000E5C16D8>, '都': <gensim.models.keyedvectors.Vocab object at 0x000000000E5C1710>, '知道': <gensim.models.keyedvectors.Vocab object at 0x000000000E5C1748>, ',': <gensim.models.keyedvectors.Vocab object at 0x000000000E5C1780>, '卫青': <gensim.models.keyedvectors.Vocab object at 0x000000000E5C17B8>, '是': <gensim.models.keyedvectors.Vocab object at 0x000000000E5C17F0>, '西汉时期': <gensim.models.keyedvectors.Vocab object at 0x000000000E5C1828>, '名将': <gensim.models.keyedvectors.Vocab object at 0x000000000E5C1860>, '他': <gensim.models.keyedvectors.Vocab object at 0x000000000E5C1898>, '揭开': <gensim.models.keyedvectors.Vocab object at 0x000000000E5C18D0>, '了': <gensim.models.keyedvectors.Vocab object at 0x000000000E5C1908>, '汉': <gensim.models.keyedvectors.Vocab object at 0x000000000E5C1940>, '匈': <gensim.models.keyedvectors.Vocab object at 0x000000000E5C1978>, '战争': <gensim.models.keyedvectors.Vocab object at 0x000000000E5C19B0>, '反败为胜': <gensim.models.keyedvectors.Vocab object at 0x000000000E5C19E8>, '序幕': <gensim.models.keyedvectors.Vocab object at 0x000000000E5C1A20>, '曾': <gensim.models.keyedvectors.Vocab object at 0x000000000E5C1A58>, '七': <gensim.models.keyedvectors.Vocab object at 0x000000000E5C1A90>, '战七捷': <gensim.models.keyedvectors.Vocab object at 0x000000000E5C1AC8>, '收复': <gensim.models.keyedvectors.Vocab object at 0x000000000E5C1B00>, '河朔': <gensim.models.keyedvectors.Vocab object at 0x000000000E5C1B38>, '、': <gensim.models.keyedvectors.Vocab object at 0x000000000E5C1B70>, '河套': <gensim.models.keyedvectors.Vocab object at 0x000000000E5C1BA8>, '地区': <gensim.models.keyedvectors.Vocab object at 0x000000000E5C1BE0>, '击破': 
<gensim.models.keyedvectors.Vocab object at 0x000000000E5C1C18>, '单于': <gensim.models.keyedvectors.Vocab object at 0x000000000E5C1C50>, '为': <gensim.models.keyedvectors.Vocab object at 0x000000000E5C1C88>, '北部': <gensim.models.keyedvectors.Vocab object at 0x000000000E5C1CC0>, '疆域': <gensim.models.keyedvectors.Vocab object at 0x000000000E5C1CF8>, '开拓': <gensim.models.keyedvectors.Vocab object at 0x000000000E5C1D30>, '做出': <gensim.models.keyedvectors.Vocab object at 0x000000000E5C1D68>, '重大贡献': <gensim.models.keyedvectors.Vocab object at 0x000000000E5C1DA0>, '。': <gensim.models.keyedvectors.Vocab object at 0x000000000E5C1DD8>, '一生': <gensim.models.keyedvectors.Vocab object at 0x000000000E5C1E10>, '十分': <gensim.models.keyedvectors.Vocab object at 0x000000000E5C1E48>, '传奇': <gensim.models.keyedvectors.Vocab object at 0x000000000E5C1E80>, '立下': <gensim.models.keyedvectors.Vocab object at 0x000000000E5C1EB8>, '战功': <gensim.models.keyedvectors.Vocab object at 0x000000000E5C1EF0>, '无数': <gensim.models.keyedvectors.Vocab object at 0x000000000E5C1F28>, '汉武帝': <gensim.models.keyedvectors.Vocab object at 0x000000000E5C1F60>, '在位': <gensim.models.keyedvectors.Vocab object at 0x000000000E5C1F98>, '时卫': <gensim.models.keyedvectors.Vocab object at 0x000000000E5C1FD0>, '青官': <gensim.models.keyedvectors.Vocab object at 0x0000000012872048>, '至': <gensim.models.keyedvectors.Vocab object at 0x0000000012872080>, '大司马': <gensim.models.keyedvectors.Vocab object at 0x00000000128720B8>, '大将军': <gensim.models.keyedvectors.Vocab object at 0x00000000128720F0>, '封长平': <gensim.models.keyedvectors.Vocab object at 0x0000000012872128>, '侯': <gensim.models.keyedvectors.Vocab object at 0x0000000012872160>, '家族': <gensim.models.keyedvectors.Vocab object at 0x0000000012872198>, '也': <gensim.models.keyedvectors.Vocab object at 0x00000000128721D0>,
"""
- 也可使用list传入,但必须是二维list(每个子list是一篇文章切词后的词列表)。这里的qieci_str是由空格连接后的字符串组成的一维list,Word2Vec会把每个字符串逐字符遍历,所以所构建的词向量结果是单个字的(连空格也成了一个"词"),没有词汇的词向量,所以最后使用的是LineSentence
# Pass the list directly to Word2Vec.
# NOTE(review): the sample output that follows has single-character keys
# (including ' '), which suggests qieci_str holds whole strings rather than
# lists of tokens - confirm against where qieci_str is built.
model = Word2Vec(qieci_str, size=100, window=5, min_count=1, workers=4)
print(model.wv.vocab)
"""
'熟': <gensim.models.keyedvectors.Vocab object at 0x000000000298E940>, '悉': <gensim.models.keyedvectors.Vocab object at 0x000000000E5C14E0>, ' ': <gensim.models.keyedvectors.Vocab object at 0x000000000E5C1668>, '汉': <gensim.models.keyedvectors.Vocab object at 0x000000000E5C15F8>, '朝': <gensim.models.keyedvectors.Vocab object at 0x000000000E5C16D8>, '历': <gensim.models.keyedvectors.Vocab object at 0x000000000E5C1748>, '史': <gensim.models.keyedvectors.Vocab object at 0x000000000E5C17B8>, '的': <gensim.models.keyedvectors.Vocab object at 0x000000000E5C17F0>, '人': <gensim.models.keyedvectors.Vocab object at 0x000000000E5C1828>, '都': <gensim.models.keyedvectors.Vocab object at 0x000000000E5C1860>, '知': <gensim.models.keyedvectors.Vocab object at 0x000000000E5C1898>, '道': <gensim.models.keyedvectors.Vocab object at 0x000000000E5C18D0>, ',': <gensim.models.keyedvectors.Vocab object at 0x000000000E5C1908>, '卫': <gensim.models.keyedvectors.Vocab object at 0x000000000E5C1940>, '青': <gensim.models.keyedvectors.Vocab object at 0x000000000E5C1978>, '是': <gensim.models.keyedvectors.Vocab object at 0x000000000E5C19B0>, '西': <gensim.models.keyedvectors.Vocab object at 0x000000000E5C19E8>, '时': <gensim.models.keyedvectors.Vocab object at 0x000000000E5C1A20>, '期': <gensim.models.keyedvectors.Vocab object at 0x000000000E5C1A58>, '名': <gensim.models.keyedvectors.Vocab object at 0x000000000E5C1A90>, '将': <gensim.models.keyedvectors.Vocab object at 0x000000000E5C1AC8>, '他': <gensim.models.keyedvectors.Vocab object at 0x000000000E5C1B00>, '揭': <gensim.models.keyedvectors.Vocab object at 0x000000000E5C1B38>, '开': <gensim.models.keyedvectors.Vocab object at 0x000000000E5C1B70>, '了': <gensim.models.keyedvectors.Vocab object at 0x000000000E5C1BA8>, '匈': <gensim.models.keyedvectors.Vocab object at 0x000000000E5C1BE0>, '战': <gensim.models.keyedvectors.Vocab object at 0x000000000E5C1C18>, '争': <gensim.models.keyedvectors.Vocab object at 0x000000000E5C1C50>, '反': <gensim.models.keyedvectors.Vocab 
object at 0x000000000E5C1C88>, '败': <gensim.models.keyedvectors.Vocab object at 0x000000000E5C1CC0>, '为': <gensim.models.keyedvectors.Vocab object at 0x000000000E5C1CF8>, '胜': <gensim.models.keyedvectors.Vocab object at 0x000000000E5C1D30>, '序': <gensim.models.keyedvectors.Vocab object at 0x000000000E5C1D68>, '幕': <gensim.models.keyedvectors.Vocab object at 0x000000000E5C1DA0>, '曾': <gensim.models.keyedvectors.Vocab object at 0x000000000E5C1DD8>, '七': <gensim.models.keyedvectors.Vocab object at 0x000000000E5C1E10>, '捷': <gensim.models.keyedvectors.Vocab object at 0x000000000E5C1E48>, '收': <gensim.models.keyedvectors.Vocab object at 0x000000000E5C1E80>, '复': <gensim.models.keyedvectors.Vocab object at 0x000000000E5C1EB8>, '河': <gensim.models.keyedvectors.Vocab object at 0x000000000E5C1EF0>, '朔': <gensim.models.keyedvectors.Vocab object at 0x000000000E5C1F28>, '、': <gensim.models.keyedvectors.Vocab object at 0x000000000E5C1F60>, '套': <gensim.models.keyedvectors.Vocab object at 0x000000000E5C1F98>, '地': <gensim.models.keyedvectors.Vocab object at 0x000000000E5C1FD0>, '区': <gensim.models.keyedvectors.Vocab object at 0x0000000012874048>, '击': <gensim.models.keyedvectors.Vocab object at 0x0000000012874080>, '破': <gensim.models.keyedvectors.Vocab object at 0x00000000128740B8>, '单': <gensim.models.keyedvectors.Vocab object at 0x00000000128740F0>, '于': <gensim.models.keyedvectors.Vocab object at 0x0000000012874128>, '北': <gensim.models.keyedvectors.Vocab object at 0x0000000012874160>, '部': <gensim.models.keyedvectors.Vocab object at 0x0000000012874198>, '疆': <gensim.models.keyedvectors.Vocab object at 0x00000000128741D0>, '域': <gensim.models.keyedvectors.Vocab object at 0x0000000012874208>, '拓': <gensim.models.keyedvectors.Vocab object at 0x0000000012874240>, '做':
"""
3、word2vec所有测试
from gensim.models.word2vec import Word2Vec,LineSentence
import jieba
import pandas as pd
import pdb
# from gensim.models.keyedvectors.Word2VecKeyedVectors import load_word2vec_format
import gensim
"""
1、将语料库预处理:一行一个文档或句子,将文档或句子分词(以空格分割,英文可以不用分词,英文单词之间已经由空格分割,
中文语料需要使用分词工具进行分词,常见的分词工具有StanfordNLP、ICTCLAS、Ansj、FudanNLP、HanLP、结巴分词等)
2、将原始的训练语料转化成一个sentence的迭代器,每一次迭代返回的sentence是一个word(utf8格式)的列表。
可以使用Gensim中word2vec.py中的LineSentence()方法实现
3、将上面处理的结果输入Gensim内建的word2vec对象进行训练即可
"""
# Reference pipeline, kept commented out: segment the raw corpus with jieba,
# write the space-joined documents to a text file, train Word2Vec on that file,
# and export the vectors in word2vec text format.
# NOTE(review): the indentation of this commented-out code was reconstructed;
# verify it before re-enabling.
# with open('./test_w2c.csv', "r",encoding='utf-8',errors='ignore') as f:
#     # Raw short texts; each text is segmented and stored as one list.
#     qieci_list = []
#     for data in f.readlines():
#         qieci_list.append(jieba.lcut(data))
# # Join the segmented words of each text back together.
# qieci_str = []
# for wenben in qieci_list:
#     qieci_str.append(' '.join(wenben))
# with open('./test_w2c_qieci_wenben.txt', 'w', encoding='utf-8') as f:
#     for i in qieci_str:
#         f.write(i)
# # print(qieci_str)
# # Build the data stream and load the corpus. The text must already be
# # segmented: one segmented document per line, words separated by spaces.
# sentences = LineSentence('./test_w2c_qieci_wenben.txt')
# # pdb.set_trace()
# # Train the model.
# model = Word2Vec(sentences , size=100, window=5, min_count=1, workers=4)
# # Save the model.
# # model.save('./w2cmodel/word2vec_test.model')
# model.wv.save_word2vec_format('./w2cmodel/word2vec_test.txt',binary=False)
# # print(model.wv.vocab)

# Load the word vectors from the plain-text word2vec file.
# load_word2vec_format returns a KeyedVectors object rather than a full
# Word2Vec model, so the vocabulary is read from it directly instead of going
# through the deprecated `.wv` self-alias.
model = gensim.models.KeyedVectors.load_word2vec_format('./w2cmodel/word2vec_test.txt')
print(model.vocab)

# Alternative kept for reference: train directly from the in-memory list.
# model = Word2Vec(qieci_str, size=100, window=5, min_count=1, workers=4)
# Alternative kept for reference: load the model written by model.save().
# model = Word2Vec.load('./w2cmodel/word2vec_test.model')
# print(model.wv.vocab)