数据集MSRP、SICK、STS下载地址分享
百度云:https://pan.baidu.com/s/1sqlCc702owp_T6KjyNT6Yw
提取码: 66nb
运行:网盘中msr_train.zip是msr_train.txt处理后可直接训练的数据,结合word2vec.py代码训练,注意文件路径自行修改
预处理过程:将txt文件导入Excel表格,然后去掉多余部分只保留文本,再另存为.csv文件并以utf-8编码保存,最后压缩为.zip文件
word2vec代码(中英文均可训练),代码已更新到网盘中
import collections
import math
import random
import zipfile
import numpy as np
from six.moves import xrange
import tensorflow as tf
def read_data(filename):
    """Read the first file inside a zip archive and return its whitespace-split tokens.

    Args:
        filename: Path to a .zip archive whose first member is a UTF-8 text
            file of space-separated tokens (e.g. a pre-segmented corpus).

    Returns:
        A list of str tokens.
    """
    with zipfile.ZipFile(filename) as f:
        raw = f.read(f.namelist()[0])
    # The original called tf.compat.as_str(raw), which only decodes bytes to
    # str via UTF-8; plain .decode('utf-8') is equivalent and avoids pulling
    # TensorFlow into this simple helper.
    return raw.decode('utf-8').split()
# 1. Path of the training corpus (annotations must be stripped; the file
#    should contain only the whitespace-separated segmentation result).
words = read_data('data/msr_train.zip')
print('Data size', len(words))
# 2. Vocabulary size for the output word vectors; words outside the
#    (vocabulary_size - 1) most frequent ones are mapped to 'UNK' below.
vocabulary_size = 8000
def build_dataset(words, vocabulary_size):
    """Encode a token list as integer ids over a capped-size vocabulary.

    The (vocabulary_size - 1) most frequent tokens each get a unique id
    (in frequency order, starting at 1); every other token is mapped to
    id 0, reserved for the out-of-vocabulary marker 'UNK'.

    Args:
        words: Iterable of str tokens (the corpus).
        vocabulary_size: Total number of ids, including the 'UNK' slot.

    Returns:
        A 4-tuple (data, count, dictionary, reverse_dictionary):
        data               -- list of int ids, one per input token;
        count              -- [['UNK', n_unk]] + (token, freq) pairs;
        dictionary         -- token -> id mapping;
        reverse_dictionary -- id -> token mapping.
    """
    # Slot 0 is reserved for 'UNK'; its count is patched in afterwards.
    count = [['UNK', -1]]
    count.extend(collections.Counter(words).most_common(vocabulary_size - 1))

    # Rank within `count` doubles as the integer id.
    dictionary = {token: rank for rank, (token, _) in enumerate(count)}

    data = []
    unk_total = 0
    for token in words:
        if token not in dictionary:
            unk_total += 1
            data.append(0)  # dictionary['UNK']
            continue
        data.append(dictionary[token])
    count[0][1] = unk_total

    reverse_dictionary = {rank: token for token, rank in dictionary.items()}
    return data, count, dictionary, reverse_dictionary
data, count, dictionary, reverse_dictionary = build_dataset(words, vocabulary_size)
# Drop the raw token list; only the integer-encoded `data` is needed from here on.
del words
#****************************** Training starts ********************************************
# Global cursor into `data`; advanced by generate_batch across successive calls.
data_index = 0
# Step 3: Function to generate a training batch for the skip-gram model.
def generate_batch(batch_size, num_skips, skip_window):
global data_index
assert batch_size % num_skips == 0
assert num_skips <= 2 * skip_window
batch = np.ndarray(shape=(batch_size), dtype=np.int32)
labels = np.ndarray(shape=(batch_size, 1), dtype=np.int32)
span = 2 * skip_window + 1 # [ skip_window target skip_window ]