【tensorflow2.0】34.word2vec作业代码

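#Install the required packages first if you do not have them (run these in a shell, not in Python).
#NOTE: the code below passes `size=` and `iter=` to Word2Vec, which are gensim 3.x names (gensim 4 renamed them to `vector_size` and `epochs`).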
pip install gensim
pip install jieba

import os

import numpy as np
import pandas as pd
from gensim.models import Word2Vec

# Load the dataset splits: training, validation (dev) and test.
# All three TSV files are read the same way: tab-separated, no header row,
# with the two columns named 'label' and 'content'.
read_options = dict(sep='\t', header=None, names=['label', 'content'])
train = pd.read_csv('./cnews/train.tsv', **read_options)
val = pd.read_csv('./cnews/dev.tsv', **read_options)
test = pd.read_csv('./cnews/test.tsv', **read_options)

# .head(n) shows the first n rows, counting from the top.
train.head(10)

在这里插入图片描述

import jieba
#jieba.lcut returns a list of tokens directly; the join call then concatenates that list into a single string
def content_cut(x):
    """Segment text with jieba and return the tokens joined by single spaces.

    Args:
        x: The text to segment; it is passed straight to ``jieba.lcut``.

    Returns:
        One string made of the tokens returned by ``jieba.lcut``, separated
        by a single space each.
    """
    return " ".join(jieba.lcut(x))

# Segment the text of every split with content_cut.
# The function is passed to map() directly; wrapping it in a lambda adds nothing.
train['content'] = train['content'].map(content_cut)
val['content'] = val['content'].map(content_cut)
test['content'] = test['content'].map(content_cut)
# Console output of the first run. It is kept as comments rather than a
# triple-quoted string because the "\U" in the Windows path below is an invalid
# unicode escape inside a normal string literal and makes the file a SyntaxError.
# out:
# Building prefix dict from the default dictionary ...
# Dumping model to file cache C:\Users\贾昊\AppData\Local\Temp\jieba.cache
# Loading model cost 2.109 seconds.
# Prefix dict has been built successfully.

# Merge the training, validation and test sets into a single DataFrame.
df = pd.concat([train, val, test], axis=0)

# Train Word2Vec.
# Every document is a space-joined token string, so splitting on a single
# space recovers the token list for that document.
# NOTE(review): split(' ') yields empty-string tokens wherever the text has
# consecutive spaces — confirm whether those are wanted in the vocabulary.
sentences = [doc.split(' ') for doc in df['content'].values]

# Hyper-parameters, collected in one place and unpacked into the constructor.
w2v_params = dict(
    size=200,            # embedding dimensionality
    alpha=0.025,         # default
    window=5,            # default
    min_count=2,         # try 2 or 3
    sample=0.001,
    seed=2018,
    workers=11,          # number of threads
    min_alpha=0.0001,
    sg=0,                # CBOW
    hs=0,                # negative sampling
    negative=5,          # number of negative samples
    ns_exponent=0.75,
    cbow_mean=1,         # sum the context vectors, then take the mean
    iter=10,             # 10 to 20
    compute_loss=True,
)
model = Word2Vec(sentences=sentences, **w2v_params)

# Save the trained model to disk, then load it back.
model_path = "./word2vec/word2vec_word_200"
# Create the target directory first so that save() does not fail when
# ./word2vec does not exist yet; exist_ok keeps re-runs harmless.
os.makedirs(os.path.dirname(model_path), exist_ok=True)
model.save(model_path)
model = Word2Vec.load(model_path)

# Look up the vector of a single word.
model.wv['816903'].shape  # (200,)

# The 20 words most similar to the query word.
# Query through model.wv like the other lookups in this section: calling
# most_similar directly on the model is deprecated in gensim 3.x and removed
# in gensim 4.0.
# NOTE(review): the query here is an empty string — confirm this is the
# intended word.
model.wv.most_similar("", topn=20)

'''

[('12875', 0.8677932620048523),
 ('679169', 0.8625671863555908),
 ('90540', 0.841310977935791),
 ('425105', 0.8043540716171265),
 ('866203', 0.7445841431617737),
 ('122513', 0.7241939902305603),
 ('1234861', 0.7100560069084167),
 ('85838', 0.7024739980697632),
 ('1189755', 0.6224364638328552),
 ('426716', 0.5778474807739258),
 ('816903', 0.5615671873092651),
 ('797828', 0.557973325252533),
 ('1254728', 0.5530299544334412),
 ('11177', 0.546566367149353),
 ('850976', 0.5452205538749695),
 ('48896', 0.5422906875610352),
 ('903604', 0.5324429273605347),
 ('1146147', 0.5293028354644775),
 ('1200328', 0.527854859828949),
 ('1104318', 0.5183314085006714)]
'''
# Compute the similarity between two words.
model.wv.similarity("816903", "1226448")  # 0.6617146


  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值