gensim中的word2vec的使用

开始数据分析之旅 专栏收录该内容
26 篇文章 0 订阅

本着尊重原著的想法,我们先把一些引用的文章贴上来,供大家参考

word2vec的理论知识,这个真的蛮详细的,我表示没有耐心全部搞透啊!:https://blog.csdn.net/itplus/article/details/37969519

苏剑林苏大神的博客,我很喜欢的一位大神:https://kexue.fm/archives/3863

刘建平Pinard 大神的博客:https://www.cnblogs.com/pinard/p/7278324.html

我是个搬运工,同时也是个调包侠(这个称呼是张美琦小可耐提出来的,我觉得很赞,美琦也帮助了我很多,还跟我聊聊天,遇到她我觉得在公司挺开心的!)

谢谢以上大神们的分享,我把我的一些理解和实践贴出来,希望留下些痕迹!

最近我同事做了一款无监督的推荐算法,就是基于word2vec,让我觉得很有创意和想法,这个同事就是马云龙,云龙同学一直特别爱学,特别上进,就是最近我们沟通不是很多!

基于用户的购买数据,将sku的ID作为词,以用户的购买清单为句子,训练word2vec模型,然后将每个用户的购买清单中各sku的ID的词向量求和平均,得到用户兴趣向量,最后计算推荐候选集中各sku的ID的词向量与用户兴趣向量的相似度!

这使用的是word2vec一个很特色的地方:

利用人工神经网络训练的字词向量非常有趣,因为它可以用来编码许多可以表示为线性平移的模式。比如:利用向量关系表示:Madrid 之于 Spain = Paris 之于 France :
vec(“Madrid”) - vec(“Spain”) = vec(“Paris”) - vec(“France”)。因此对于一个好的训练结果,往往可以通过计算与 vec(“Paris”) + vec(“Spain”) - vec(“Madrid”) 向量最近的词来求出 France。

这叫作类比推理,这也是目前检测一个词向量系统质量的常用方法。

word2vec有两种方式都可以训练,还可以训练一个模型,加入一些语料再接着训练,这个可以看gensim的官网介绍

以下实践用到的数据,若有需要请留邮箱给我,我发给你们,因为不知道怎么添加数据链接(o(╯□╰)o)

1. 直接流式输入,可以使用迭代器

# -*- coding: utf-8 -*-
"""
Created on  2018/8/20 15:08
 利用gensim库的word2vec功能对“人民的名义”进行训练,得到模型,查看效果
@author: sh
"""
import jieba
import jieba.analyse
import logging
import os
import gensim
from gensim.models import word2vec
import importlib,sys 
# Re-import the already loaded ``sys`` module.
# NOTE(review): looks like a leftover of the Python 2 ``reload(sys)`` idiom — confirm it can be dropped.
importlib.reload(sys)

# Emit log records at INFO level and above, formatted as "time : level : message".
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)
def deal_data(path_in, path_out, encoding=None):
    """Segment a raw text file with jieba and write the tokens space-separated.

    Args:
        path_in: Path of the raw text file to read.
        path_out: Path of the segmented file to write (overwritten).
        encoding: Text encoding used for both files. ``None`` (the default)
            keeps the platform default, which is what the function used
            before this parameter existed.
    """
    # Character names that jieba is told to keep as single tokens.
    character_names = (
        '沙瑞金', '田国富', '高育良', '侯亮平', '钟小艾', '陈岩石', '欧阳菁',
        '易学习', '王大路', '蔡成功', '孙连城', '季昌明', '丁义珍', '郑西坡',
        '赵东来', '高小琴', '赵瑞龙', '林华华', '陆亦可', '刘新建', '刘庆祝',
    )
    for name in character_names:
        jieba.suggest_freq(name, True)

    # The ``with`` blocks close both files; no explicit close() is needed.
    with open(path_in, 'r', encoding=encoding) as f:
        document = f.read()

    result = ' '.join(jieba.cut(document))

    with open(path_out, 'w', encoding=encoding) as f2:
        f2.write(result)

def train_model(path_in, path_out):
    """Train a Word2Vec model on a segmented corpus file and save it.

    Args:
        path_in: Path of the corpus file handed to ``word2vec.LineSentence``.
        path_out: Path the trained model is saved to.
    """
    corpus = word2vec.LineSentence(path_in)
    w2v = word2vec.Word2Vec(corpus, hs=1, min_count=1, window=3, size=100)
    w2v.save(path_out)

def predict_model(path_in):
    """Load a saved Word2Vec model and print a few similarity queries.

    Prints up to five three-character words among the 100 nearest neighbours
    of '李达康', two pairwise similarities, and the odd one out of four names.

    Args:
        path_in: Path of the model file written by ``Word2Vec.save``.
    """
    model = gensim.models.Word2Vec.load(path_in)
    # Query through the KeyedVectors object: the same-named methods on the
    # model itself are deprecated.
    vectors = model.wv

    req_count = 5
    for word, score in vectors.most_similar('李达康', topn=100):
        # Only three-character words are printed (the names registered with
        # jieba in this script are all three characters long).
        if len(word) != 3:
            continue
        print(str(word)+"    "+str(score))
        req_count -= 1
        if req_count == 0:
            break

    print(vectors.similarity('沙瑞金', '高育良'))
    print(vectors.similarity('李达康', '王大路'))
    print(vectors.doesnt_match("沙瑞金 高育良 李达康 刘庆祝".split()))

if __name__ == '__main__':
    # Raw text, its segmented counterpart, and the saved model file.
    file_path = 'in_the_name_of_people.txt'
    sege_path = 'in_the_name_of_people_segment.txt'
    model_path = 'word2vec_model_rmmy.model'
    # The segmentation and training steps are disabled; as written, only the
    # query step runs and it loads the model from ``model_path``.
    #deal_data(file_path, sege_path)
    #train_model(sege_path, model_path)
    predict_model(model_path)

2.提前将语料存储进去,后续还可以追加

# -*- coding: utf-8 -*-
"""
Created on  2018/5/17 16:42

@author: sh
"""
from gensim.models import Word2Vec
from sklearn.model_selection import train_test_split
import pandas as pd
import numpy as np
from sklearn.svm import SVC
from sklearn.externals import joblib
import xlrd
import jieba
import re

# Word-vector dimensionality: passed to Word2Vec as `size` and to
# buildWordVector as the length of the averaged vector.
n_dim = 300


# get the set of disused words
def getstopword(stopwordPath):
    """Return the set of whitespace-stripped lines of ``stopwordPath``.

    Args:
        stopwordPath: Iterable of lines, e.g. an open text file.

    Returns:
        A ``set`` with one stripped entry per distinct line.
    """
    return {entry.strip() for entry in stopwordPath}


# participle and removal of discontinuation words
def cutStopword(x, stoplist):
    """Segment ``x`` with jieba and filter the resulting tokens.

    A token is dropped when it is in ``stoplist``, when the numeric pattern
    matches at its start, or when it is empty after stripping whitespace.

    Args:
        x: Text to segment.
        stoplist: Container of stop words to discard.

    Returns:
        List of the tokens that were kept, in order.
    """
    return [
        token
        for token in jieba.cut(x.strip())
        if token not in stoplist
        and re.match(r'-?\d+\.?\d*', token) is None
        and len(token.strip()) > 0
    ]


# read data files,get training data and test data
def loadfile():
    """Load the review spreadsheets, tokenise them and split train/test.

    Reads ``neg.xls`` and ``pos.xls`` (first column = review text) and
    ``stopwords1.txt``, and writes ``y_train.npy`` / ``y_test.npy``.

    Returns:
        ``(x, x_train, x_test, y_train, y_test)`` where ``x`` holds the token
        lists of all reviews (positive first) and the labels are 1 for
        positive and 0 for negative.
    """
    # `index` is not a read_excel parameter and current pandas rejects unknown
    # keywords, so only `header=None` is passed.
    neg = pd.read_excel('neg.xls', header=None)
    pos = pd.read_excel('pos.xls', header=None)

    # Close the stop-word file as soon as it has been read.
    with open('stopwords1.txt', 'r') as stopwordPath:
        stoplist = getstopword(stopwordPath)

    pos['words'] = pos[0].apply(cutStopword, args=(stoplist,))
    neg['words'] = neg[0].apply(cutStopword, args=(stoplist,))
    print(pos['words'][:10])

    # use 1 for positive sentiment, 0 for negative
    y = np.concatenate((np.ones(len(pos)), np.zeros(len(neg))))
    x = np.concatenate((pos['words'], neg['words']))
    x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2)
    np.save('y_train.npy', y_train)
    np.save('y_test.npy', y_test)
    return x, x_train, x_test, y_train, y_test


# get summation of word vectors of all word in a copus,and then get the average,as the input of the model
def buildWordVector(text, size, imdb_w2v):
    """Average the word vectors of the words in ``text``.

    Args:
        text: Iterable of words.
        size: Dimensionality of the word vectors.
        imdb_w2v: Either an object exposing its vectors on a ``wv`` attribute
            (such as a trained Word2Vec model) or any mapping from word to
            vector that raises ``KeyError`` for unknown words.

    Returns:
        Array of shape ``(1, size)``: the mean vector of the known words, or
        all zeros when no word of ``text`` is known.
    """
    # Indexing the model object directly is deprecated; look words up on its
    # ``wv`` attribute when there is one, otherwise index the object itself.
    vectors = getattr(imdb_w2v, 'wv', imdb_w2v)

    vec = np.zeros((1, size))
    count = 0
    for word in text:
        try:
            vec += vectors[word].reshape((1, size))
        except KeyError:
            # Out-of-vocabulary words do not contribute to the average.
            continue
        count += 1
    if count:
        vec /= count

    return vec


# calculating test set and training set
def get_train_vecs(x, x_train, x_test):
    """Train Word2Vec on ``x`` and embed the train and test documents.

    Saves the model to ``w2v_model.pkl`` and the two matrices to
    ``train_vecs.npy`` / ``test_vecs.npy``, printing each matrix shape.

    Args:
        x: Token lists used to build the vocabulary and train the model.
        x_train: Token lists of the training documents.
        x_test: Token lists of the test documents.

    Returns:
        ``(train_vecs, test_vecs)``, one averaged vector per document.
    """
    # Build the vocabulary, then train on the same corpus.
    w2v = Word2Vec(size=n_dim, min_count=10, seed=1)
    w2v.build_vocab(x)
    w2v.train(x, total_examples=w2v.corpus_count, epochs=50)
    w2v.save('w2v_model.pkl')

    def embed(documents):
        # Stack one (1, n_dim) averaged vector per document.
        return np.concatenate([buildWordVector(doc, n_dim, w2v) for doc in documents])

    train_vecs = embed(x_train)
    np.save('train_vecs.npy', train_vecs)
    print(train_vecs.shape)

    test_vecs = embed(x_test)
    np.save('test_vecs.npy', test_vecs)
    print(test_vecs.shape)

    return train_vecs, test_vecs


# train svm model with sklearn
def svm_train(train_vecs, y_train, test_vecs, y_test):
    """Fit an RBF-kernel SVC, save it to ``model.pkl`` and print its test score.

    Args:
        train_vecs: Training feature matrix.
        y_train: Training labels.
        test_vecs: Test feature matrix.
        y_test: Test labels.
    """
    classifier = SVC(kernel='rbf', verbose=True)
    classifier.fit(train_vecs, y_train)
    joblib.dump(classifier, 'model.pkl')
    test_score = classifier.score(test_vecs, y_test)
    print(test_score)


# load word2vec and smv model and use them to predict
def svm_predict(str):
    """Classify one piece of text with the saved Word2Vec and SVM models.

    Loads ``model.pkl``, ``w2v_model.pkl`` and ``stopwords1.txt``, tokenises
    the text, averages its word vectors and prints the predicted label array.

    Args:
        str: Text to classify. The name shadows the built-in inside this
            function; it is kept because it is part of the signature.
    """
    clf = joblib.load('model.pkl')
    model = Word2Vec.load('w2v_model.pkl')

    # Close the stop-word file as soon as it has been read.
    with open('stopwords1.txt', 'r') as stopwordPath:
        stoplist = getstopword(stopwordPath)

    str_sege = cutStopword(str, stoplist)
    # buildWordVector already returns a (1, n_dim) row, which is the shape
    # predict() needs for a single sample.
    str_vecs = buildWordVector(str_sege, n_dim, model)
    pred_result = clf.predict(str_vecs)
    print(pred_result)


if __name__ == '__main__':
    print("loading data ...")
    x, x_train, x_test, y_train, y_test = loadfile()
    print("train word2vec model and get the input of svm model")
    train_vecs, test_vecs = get_train_vecs(x, x_train, x_test)
    print("train svm model...")
    svm_train(train_vecs, y_train, test_vecs, y_test)

    print("use svm model to predict...")
    # Named `text` rather than `str` so the built-in type is not shadowed at
    # module level.
    text = '屏幕较差,拍照也很粗糙。'
    # Alternative sample reviews:
    # text ='质量不错,是正品 ,安装师傅也很好,才要了83元材料费'
    # text ='东西非常不错,安装师傅很负责人,装的也很漂亮,精致,谢谢安装师傅!'
    svm_predict(text)

# -*- coding: utf-8 -*-
"""
Created on  2018/8/21 13:30
用word2vec和lstm对短文本进行情感分析
@author: sh
"""
import imp
import sys
# NOTE(review): reloading ``sys`` looks like a leftover of the Python 2
# default-encoding idiom — confirm it can be dropped.
imp.reload(sys)
import numpy as np
import pandas as pd
import jieba
import re
from gensim.models import Word2Vec
from keras.preprocessing import sequence
from gensim.corpora.dictionary import Dictionary
import multiprocessing
from sklearn.model_selection import train_test_split
import yaml
from keras.models import Sequential
from keras.layers.embeddings import Embedding
from keras.layers.recurrent import LSTM
from keras.layers.core import Dense, Dropout,Activation
from keras.models import model_from_yaml
np.random.seed(1337)  # Fixed NumPy seed, for reproducibility

vocab_dim = 100  # word-vector size: Word2Vec `size` and Embedding `output_dim`
maxlen = 100  # `maxlen` passed to pad_sequences
n_iterations = 1  # ideally more.. (passed to Word2Vec as `iter`)
n_exposures = 10  # passed to Word2Vec as `min_count`
window_size = 7  # passed to Word2Vec as `window`
batch_size = 32  # batch size for fit() and evaluate()
n_epoch = 20  # passed to fit() as `nb_epoch`
input_length = 100  # passed to Embedding as `input_length`

# CPU count, passed to Word2Vec as `workers`.
cpu_count = multiprocessing.cpu_count()
# 加载训练文件
def loadfile():
    """Load the raw reviews and their sentiment labels.

    Reads the first column of ``pos.xls`` and ``neg.xls``.

    Returns:
        ``(combined, y)``: the positive reviews followed by the negative
        ones, and an integer label array with 1 for positive, 0 for negative.
    """
    # `index` is not a read_excel parameter and current pandas rejects unknown
    # keywords, so only `header=None` is passed.
    neg = pd.read_excel('neg.xls', header=None)
    pos = pd.read_excel('pos.xls', header=None)

    combined = np.concatenate((pos[0], neg[0]))
    y = np.concatenate((np.ones(len(pos), dtype=int), np.zeros(len(neg), dtype=int)))

    return combined, y
# 获取停用词
def getstopword(stopwordPath):
    """Build the stop-word set from an iterable of lines.

    Args:
        stopwordPath: Iterable of lines, e.g. an open text file.

    Returns:
        A ``set`` containing every line with surrounding whitespace removed.
    """
    stoplist = set()
    stoplist.update(raw.strip() for raw in stopwordPath)
    return stoplist

# 分词并剔除停用词
def tokenizer(text):
    """Segment every document with jieba and filter the tokens.

    A token is dropped when it is listed in ``stopwords1.txt``, when the
    numeric pattern matches at its start, or when it is empty after
    stripping whitespace.

    Args:
        text: Iterable of document strings.

    Returns:
        List with one list of kept tokens per document.
    """
    # The context manager closes the file even if reading it fails.
    with open('stopwords1.txt', 'r') as stopwordPath:
        stoplist = getstopword(stopwordPath)

    # Compiled once instead of on every token.
    number = re.compile(r'-?\d+\.?\d*')

    text_list = []
    for document in text:
        fenci = [
            item
            for item in jieba.cut(document.strip())
            if item not in stoplist
            and number.match(item) is None
            and len(item.strip()) > 0
        ]
        text_list.append(fenci)
    return text_list
#创建词语字典,并返回每个词语的索引,词向量,以及每个句子所对应的词语索引
def create_dictionaries(model=None,
                        combined=None):
    """Build the word lookups of a trained model and index-encode sentences.

    Args:
        model: Trained Word2Vec model.
        combined: Iterable of sentences, each an iterable of words.

    Returns:
        ``(w2indx, w2vec, combined)`` where ``w2indx`` maps each vocabulary
        word to an index starting at 1, ``w2vec`` maps each vocabulary word
        to its vector, and ``combined`` is the sentences as index sequences
        padded to ``maxlen`` with out-of-vocabulary words encoded as 0.
        Returns ``None`` (after printing a message) when either argument is
        missing.
    """
    if combined is None or model is None:
        print('No data provided...')
        return None

    gensim_dict = Dictionary()
    gensim_dict.doc2bow(model.wv.vocab.keys(),
                        allow_update=True)
    # Index 0 is reserved for words outside the vocabulary, hence k + 1.
    w2indx = {v: k + 1 for k, v in gensim_dict.items()}
    # Vectors are read from `model.wv`, the same object that supplies the
    # vocabulary above; indexing the model itself is deprecated.
    w2vec = {word: model.wv[word] for word in w2indx}

    # Words become integers; only a missing key falls back to 0, so other
    # errors are no longer silently swallowed.
    data = [[w2indx.get(word, 0) for word in sentence] for sentence in combined]
    combined = sequence.pad_sequences(data, maxlen=maxlen)
    return w2indx, w2vec, combined
# Train and save the word2vec model, then return the word indices, the word vectors and the index-encoded sentences
def word2vec_train(combined):
    """Train Word2Vec on the tokenised corpus and build the lookups.

    The model is saved to ``Word2vec_model.pkl``.

    Args:
        combined: List of token lists, one per document.

    Returns:
        ``(index_dict, word_vectors, combined)`` as produced by
        ``create_dictionaries`` for the trained model.
    """
    settings = dict(size=vocab_dim,
                    min_count=n_exposures,
                    window=window_size,
                    workers=cpu_count,
                    iter=n_iterations)
    w2v = Word2Vec(**settings)
    w2v.build_vocab(combined)
    w2v.train(combined, total_examples=w2v.corpus_count, epochs=50)
    w2v.save('Word2vec_model.pkl')

    lookups = create_dictionaries(model=w2v, combined=combined)
    index_dict, word_vectors, combined = lookups
    return index_dict, word_vectors, combined
def get_data(index_dict,word_vectors,combined,y):
    """Build the embedding matrix and split the data into train and test.

    Args:
        index_dict: Mapping from word to its index.
        word_vectors: Mapping from word to its vector.
        combined: Index-encoded, padded sentences.
        y: Labels aligned with ``combined``.

    Returns:
        ``(n_symbols, embedding_weights, x_train, y_train, x_test, y_test)``.
    """
    # One extra row: index 0 keeps an all-zero vector.
    n_symbols = 1 + len(index_dict)
    embedding_weights = np.zeros((n_symbols, vocab_dim))
    for token in index_dict:
        embedding_weights[index_dict[token], :] = word_vectors[token]

    split = train_test_split(combined, y, test_size=0.2)
    x_train, x_test, y_train, y_test = split
    print(x_train.shape,y_train.shape)
    return n_symbols,embedding_weights,x_train,y_train,x_test,y_test
##定义网络结构
def train_lstm(n_symbols,embedding_weights,x_train,y_train,x_test,y_test):
    """Define, train, evaluate and save the LSTM sentiment classifier.

    The architecture is written to ``lstm.yml`` and the weights to
    ``lstm.h5``; the evaluation result is printed.

    Args:
        n_symbols: Number of rows of the embedding matrix.
        embedding_weights: Initial weights of the embedding layer.
        x_train: Training sequences.
        y_train: Training labels.
        x_test: Test sequences.
        y_test: Test labels.
    """
    print('Defining a Simple Keras Model...')
    net = Sequential()
    # Embedding initialised with the given weights; index 0 is masked.
    net.add(Embedding(output_dim=vocab_dim,
                      input_dim=n_symbols,
                      mask_zero=True,
                      weights=[embedding_weights],
                      input_length=input_length))
    net.add(LSTM(output_dim=50, activation='sigmoid', inner_activation='hard_sigmoid'))
    net.add(Dropout(0.5))
    net.add(Dense(1))
    net.add(Activation('sigmoid'))

    print('Compiling the Model...')
    net.compile(loss='binary_crossentropy',
                optimizer='adam', metrics=['accuracy'])

    print("Train...")
    net.fit(x_train, y_train, batch_size=batch_size, nb_epoch=n_epoch, verbose=1)

    print("Evaluate...")
    score = net.evaluate(x_test, y_test, batch_size=batch_size)

    # Persist the architecture as YAML and the weights as HDF5.
    architecture = net.to_yaml()
    with open('lstm.yml', 'w') as outfile:
        outfile.write(yaml.dump(architecture, default_flow_style=True))
    net.save_weights('lstm.h5')
    print('Test score:', score)
#训练模型,并保存
def train():
    """Run the training pipeline: load, tokenise, embed, train the LSTM."""
    print('Loading Data...')
    reviews, labels = loadfile()
    print(len(reviews), len(labels))

    print('Tokenising...')
    tokens = tokenizer(reviews)

    print('Training a Word2vec model...')
    index_dict, word_vectors, encoded = word2vec_train(tokens)

    print('Setting up Arrays for Keras Embedding Layer...')
    dataset = get_data(index_dict, word_vectors, encoded, labels)
    n_symbols, embedding_weights, x_train, y_train, x_test, y_test = dataset
    print(x_train.shape,y_train.shape)

    train_lstm(n_symbols, embedding_weights, x_train, y_train, x_test, y_test)

def input_transform(string):
    """Turn one raw sentence into the padded index sequence the LSTM expects.

    Args:
        string: Text to encode.

    Returns:
        The padded index array produced by ``create_dictionaries`` for this
        single sentence, using the model saved in ``Word2vec_model.pkl``.
    """
    # jieba.cut returns a generator. Wrapping the generator itself in
    # np.array() yields a single object cell instead of a row of tokens, so
    # every word would be encoded as 0; materialise the tokens first.
    words = list(jieba.cut(string))
    words = np.array(words).reshape(1, -1)
    model = Word2Vec.load('Word2vec_model.pkl')
    _, _, combined = create_dictionaries(model, words)
    return combined

def lstm_predict(string):
    """Load the saved LSTM and print the predicted sentiment of ``string``.

    Reads the architecture from ``lstm.yml`` and the weights from
    ``lstm.h5``.

    Args:
        string: Text to classify.
    """
    print('loading model......')
    with open('lstm.yml', 'r') as f:
        # The file holds a plain YAML string scalar, so the safe loader is
        # sufficient; it also avoids calling yaml.load() without a Loader.
        yaml_string = yaml.safe_load(f)
    model = model_from_yaml(yaml_string)

    print('loading weights......')
    model.load_weights('lstm.h5')
    model.compile(loss='binary_crossentropy',
                  optimizer='adam', metrics=['accuracy'])
    data = input_transform(string)
    # reshape() returns a new array, so the result has to be kept.
    data = data.reshape(1, -1)
    result = model.predict_classes(data)
    if result[0][0] == 1:
        print(string,' positive')
    else:
        print(string,' negative')
if __name__=='__main__':
    # Train and save the models first, then classify one sample sentence.
    train()
    # Alternative sample sentences (commented out):
    #string='电池充完了电连手机都打不开.简直烂的要命.真是金玉其外,败絮其中!连5号电池都不如'
    #string='牛逼的手机,从3米高的地方摔下去都没坏,质量非常好'
    #string='酒店的环境非常好,价格也便宜,值得推荐'
    #string='手机质量太差了,傻逼店家,赚黑心钱,以后再也不会买了'
    #string='我是傻逼'
    #string='你是傻逼'
    string='屏幕较差,拍照也很粗糙。'
    #string='质量不错,是正品 ,安装师傅也很好,才要了83元材料费'
    #string='东西非常不错,安装师傅很负责人,装的也很漂亮,精致,谢谢安装师傅!'

    lstm_predict(string)

 

  • 8
    点赞
  • 7
    评论
  • 24
    收藏
  • 一键三连
    一键三连
  • 扫一扫,分享海报

©️2021 CSDN 皮肤主题: 大白 设计师:CSDN官方博客 返回首页
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、C币套餐、付费专栏及课程。

余额充值