pip install gensim
pip install jieba
import pandas as pd
import numpy as np
from gensim.models import Word2Vec
train = pd.read_csv('./cnews/train.tsv', sep='\t', header=None, names=['label','content'])
val = pd.read_csv('./cnews/dev.tsv', sep='\t', header=None, names=['label','content'])
test = pd.read_csv('./cnews/test.tsv', sep='\t', header=None, names=['label','content'])
train.head(10)

import jieba

def content_cut(x):
    x = jieba.lcut(x)        # segment the text into a list of words
    return " ".join(x)       # rejoin with single spaces so it can be split back later

train['content'] = train['content'].map(content_cut)
val['content'] = val['content'].map(content_cut)
test['content'] = test['content'].map(content_cut)
'''
out:
Building prefix dict from the default dictionary ...
Dumping model to file cache C:\Users\贾昊\AppData\Local\Temp\jieba.cache
Loading model cost 2.109 seconds.
Prefix dict has been built successfully.
'''
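As a quick sanity check, content_cut joins jieba's tokens with single spaces, which is exactly the format the split(' ') below relies on. The exact segmentation depends on jieba's dictionary version, so the output here is indicative:
content_cut('今天天气不错')
# e.g. '今天 天气 不错'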
# train word2vec on the full corpus: train + validation + test
df = pd.concat([train, val, test], axis=0)
sentences = [document.split(' ') for document in df['content'].values]
model = Word2Vec(sentences=sentences,
                 vector_size=200,   # embedding dimension ("size" in gensim < 4.0)
                 alpha=0.025,       # initial learning rate
                 window=5,          # context window size
                 min_count=2,       # drop words that appear fewer than 2 times
                 sample=0.001,      # downsampling threshold for very frequent words
                 seed=2018,
                 workers=11,
                 min_alpha=0.0001,  # learning-rate floor after linear decay
                 sg=0,              # 0 = CBOW, 1 = skip-gram
                 hs=0,              # 0 = use negative sampling, not hierarchical softmax
                 negative=5,        # number of negative samples
                 ns_exponent=0.75,
                 cbow_mean=1,       # average (rather than sum) the context vectors
                 epochs=10,         # "iter" in gensim < 4.0
                 compute_loss=True)
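Because compute_loss=True was passed, the cumulative training loss is available after fitting; a quick sanity check, not part of the original pipeline:
model.get_latest_training_loss()   # cumulative loss across training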
model.save("./word2vec/word2vec_word_200")   # the ./word2vec directory must already exist
model = Word2Vec.load("./word2vec/word2vec_word_200")
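If later steps only need lookups, a lighter alternative (an aside, not in the original code) is to persist just the KeyedVectors without the training state:
from gensim.models import KeyedVectors
model.wv.save('./word2vec/word2vec_word_200.kv')          # vectors only
wv = KeyedVectors.load('./word2vec/word2vec_word_200.kv') # cannot be trained further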
model.wv['816903'].shape   # -> (200,), i.e. vector_size dimensions
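Note that min_count=2 pruned rare tokens, so indexing model.wv with an unseen word raises a KeyError. A minimal guard, with a hypothetical zero-vector fallback as one possible choice:
def get_vector(word, wv=model.wv):
    if word in wv:                    # membership check against the learned vocabulary
        return wv[word]
    return np.zeros(wv.vector_size)   # hypothetical fallback for out-of-vocabulary words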
model.wv.most_similar("1226448", topn=20)   # in gensim >= 4.0, most_similar lives on model.wv
'''
[('12875', 0.8677932620048523),
('679169', 0.8625671863555908),
('90540', 0.841310977935791),
('425105', 0.8043540716171265),
('866203', 0.7445841431617737),
('122513', 0.7241939902305603),
('1234861', 0.7100560069084167),
('85838', 0.7024739980697632),
('1189755', 0.6224364638328552),
('426716', 0.5778474807739258),
('816903', 0.5615671873092651),
('797828', 0.557973325252533),
('1254728', 0.5530299544334412),
('11177', 0.546566367149353),
('850976', 0.5452205538749695),
('48896', 0.5422906875610352),
('903604', 0.5324429273605347),
('1146147', 0.5293028354644775),
('1200328', 0.527854859828949),
('1104318', 0.5183314085006714)]
'''
model.wv.similarity("816903", "1226448")   # ≈ 0.5616, matching the score for '816903' in the list above
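A common next step is to export the learned vectors as an embedding matrix for a downstream classifier. A minimal sketch; reserving row 0 for padding is an assumption here, not something the original code does:
vocab = model.wv.key_to_index                        # word -> row index in model.wv.vectors
embedding_matrix = np.zeros((len(vocab) + 1, model.wv.vector_size))
for word, idx in vocab.items():
    embedding_matrix[idx + 1] = model.wv[word]       # shift by 1 so row 0 stays a padding row
embedding_matrix.shape                               # (vocab_size + 1, 200)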