相关原理可参见:《文本表示(一)—— word2vec(skip-gram、CBOW)、GloVe、Transformer、BERT》
其它相关链接:
二话不说,直接上代码
import inspect
import re

import jieba
import pandas as pd
from gensim.models import Word2Vec
# Training configuration.
embedding_size = 200  # dimensionality of each word vector
iters = 10  # number of training iterations over the corpus
min_count = 5  # forwarded to Word2Vec as its ``min_count`` argument
model_file = 'word2vec_{}'.format(embedding_size)  # output path, e.g. 'word2vec_200'
# Sentence delimiters, in both full-width (Chinese) and ASCII forms, plus
# newline. A raw-string character class is used because a bare ``?`` as a
# regex alternative fails to compile ("nothing to repeat"), and because a
# capturing group would make ``re.split`` return the delimiters themselves
# as separate items.
splitter = r'[。!!??;;\n]'
# Corpus loading
class TextLoader:
    """Re-iterable stream of tokenised sentences read from a CSV corpus.

    The CSV is re-read on every call to ``__iter__``, so one instance can be
    iterated any number of times.

    Args:
        file_name: Path of the UTF-8, comma-separated corpus file.
        column: Name of the column that holds the raw text.
    """

    def __init__(self, file_name: str = '语料.csv', column: str = '文本描述'):
        self.file_name = file_name
        self.column = column

    def __iter__(self):
        """Yield one list of jieba tokens per non-empty sentence.

        Raises:
            KeyError: If ``column`` is not present in the CSV.
        """
        df = pd.read_csv(self.file_name, sep=',', encoding='utf-8')
        # dropna() removes missing cells directly instead of comparing the
        # stringified value against 'nan'.
        for text in df[self.column].dropna():
            text = str(text)  # cells may hold non-string values (e.g. numbers)
            if not text.strip():
                continue
            for sen in re.split(splitter, text):
                # Adjacent or trailing delimiters produce empty fragments;
                # a capturing pattern may also produce None.
                if not sen or not sen.strip():
                    continue
                # Drop whitespace-only tokens so they do not enter the vocabulary.
                segments = [str(tok) for tok in jieba.cut(sen) if str(tok).strip()]
                if segments:
                    yield segments
# Stream the corpus and train a skip-gram (sg=1) model, then persist it.
corpus = TextLoader()
# gensim 4 renamed the ``size`` and ``iter`` keywords to ``vector_size`` and
# ``epochs``. Pick whichever pair the installed Word2Vec accepts, so the
# script runs on both old and new gensim releases.
_w2v_params = inspect.signature(Word2Vec.__init__).parameters
if 'vector_size' in _w2v_params:
    _w2v_kwargs = {'vector_size': embedding_size, 'epochs': iters}
else:
    _w2v_kwargs = {'size': embedding_size, 'iter': iters}
model = Word2Vec(corpus, workers=20, sg=1, min_count=min_count, **_w2v_kwargs)
model.save(model_file)