1. 加载punkt tokenizer用于分句
# Load NLTK's pre-trained Punkt model for English sentence segmentation,
# then split the review text into raw sentences.
# NOTE(review): assumes `nltk` is imported and `review` is a str defined
# earlier in the notebook — confirm against the surrounding context.
tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')
raw_sentences = tokenizer.tokenize(review.strip())
2. 训练word2vec模型
# Train a Word2Vec model over the tokenized sentences.
#   sentences  - input corpus: iterable of tokenized sentences (lists of words)
#   workers    - number of parallel worker threads for training
#   size       - dimensionality of the word vectors
#                (NOTE(review): renamed to `vector_size` in gensim >= 4.0)
#   min_count  - ignore words whose total frequency is below this threshold
#   window     - maximum context-window distance
#   sample     - threshold for downsampling very frequent words
from gensim.models import word2vec
model = word2vec.Word2Vec(sentences, workers=num_workers, \
size=num_features, min_count = min_word_count, \
window = context, sample = downsampling)
其中,sentences是输入数据,workers是并行运行的线程数,size是词向量的维数,min_count是最小的词频,window是上下文窗口大小,sample是对频繁词汇下采样的阈值设置
3. 如果你不打算进一步训练模型,调用init_sims将使得模型的存储更加高效
model.init_sims(replace=True)
4.保存模型
# 4. Save the trained model to disk under a name that encodes its
# hyperparameters (300 features, min_count 40, window 10); it can be
# re-loaded later with Word2Vec.load().
model_name = "300features_40minwords_10context"
model.save(model_name)
5. 加载模型
model = Word2Vec.load("300features_40minwords_10context")
6. model.wv.syn0保存了特征向量
word_vectors = model.wv.syn0
7. 原始Word2Vec模型中的词汇表存储在model.wv.index2word
model.wv.index2word[:10]
Out[103]: ['the', 'and', 'a', 'of', 'to', 'is', 'it', 'in', 'i', 'this']
8. model.wv[word] 保存的是词word的特征向量
model.wv['cute']
Out[107]:
array([ 0.10395749, 0.01340602, -0.05515463, 0.08028193, 0.0831662 ,
0.01887189, -0.04345329, -0.05330021, 0.12171001, -0.04085367,
-0.02210595, -0.02724563, -0.11547018, -0.04441912, -0.11273406,
0.07782781, 0.01631736, -0.07693825, -0.04581402, 0.10695351,
0.05138232, -0.0677945 , -0.07055901, 0.11972319, 0.08994358,
0.0911851 , -0.0570991 , -0.00350709, -0.02417902, 0.10895319,
0.01303014, -0.01725169, -0.14882621, -0.02619713, -0.10519824,
-0.04493091, -0.16000146, 0.00747467, -0.04279637, 0.00875237,
-0.00187243, 0.10082789, -0.02459648, -0.00999216, 0.05895302,
0.00673912, -0.01889222, 0.02808409, 0.01648255, 0.02626997,
0.01269405, -0.03702597, 0.07169444, -0.0352535 , -0.05717425,
-0.04763654, 0.02663715, 0.10974685, 0.06736938, 0.0174698 ,
0.06920658, 0.01708665, -0.07029437, 0.05515318, 0.05991052,
0.05298909, -0.08868574, -0.06191275, -0.03539872, -0.05076154,
-0.02606378, 0.01011586, -0.01805843, -0.02551923, -0.15421656,
0.01473052, -0.03454492, -0.01709277, 0.03901476, -0.02083028,
-0.00472902, 0.05752712, -0.01705575, 0.02167751, -0.01060016,
0.06334381, 0.01365775, -0.02642423, -0.03185539, -0.04229859,
-0.00960711, -0.05159323, -0.0107929 , 0.03905606, -0.01018345,
-0.16724686, -0.0740692 , 0.04011698, 0.00084314, 0.08778731,
0.00043498, -0.06322528, 0.01398241, -0.13803165, 0.04917759,
-0.07478268, -0.04414773, 0.00494875, -0.06643019, -0.01880562,
-0.0085729 , 0.11102808, -0.07788483, -0.02472506, 0.00832106,
0.08655448, -0.03044587, -0.11335427, -0.01019672, -0.00891955,
-0.0046578 , -0.04484824, -0.00480774, 0.00990628, 0.0304127 ,
0.07221536, 0.02026715, 0.02251668, -0.12073576, 0.01114497,
-0.0925098 , 0.08802549, -0.06144339, -0.02550698, -0.01278893,
-0.09792975, 0.00019877, 0.08281778, 0.08710661, -0.05954188,
0.02163956, 0.01532864, 0.00475272, -0.12350821, 0.05543683,
-0.03548338, 0.00060662, 0.00130515, -0.01039108, -0.08204419,
-0.01705559, -0.02216858, 0.00746337, 0.01450412, -0.04393485,
0.00950245, 0.01706349, 0.13297872, 0.07323734, -0.0214741 ,
-0.12967218, -0.10249887, -0.03105918, -0.00937948, 0.00526513,
-0.06067234, 0.03748649, -0.01337284, 0.08072616, -0.05385198,
-0.03888603, -0.04407704, -0.0367489 , 0.04569077, -0.10303388,
-0.05465897, 0.06709852, -0.05746002, 0.01949417, -0.05603344,
0.00171373, -0.00993067, 0.01545683, 0.01067998, -0.0062286 ,
0.09300097, -0.01666072, -0.12535034, 0.09240779, -0.06217002,
-0.04754363, -0.00299655, 0.01855474, 0.06256441, -0.05956341,
0.05185251, -0.0640633 , -0.03402514, 0.02647401, 0.01069647,
-0.02547205, -0.07947176, -0.04808271, -0.04108227, 0.11412466,
-0.00151837, -0.04015661, -0.06796671, -0.03290284, 0.03310379,
-0.01257008, -0.04306978, 0.02287693, -0.05664393, -0.05226184,
0.00175329, -0.02349281, -0.0027167 , 0.08252641, 0.00597219,
0.06197369, -0.06398601, 0.01810714, -0.07447384, -0.03710881,
0.06342086, -0.01379431, -0.07353684, -0.06600461, 0.01718475,
-0.02449206, -0.09625625, 0.00970756, -0.0322529 , -0.01138916,
-0.0621329 , -0.02559469, 0.03321688, -0.08050417, 0.05371548,
0.04054398, 0.00163386, -0.04174059, 0.00092536, -0.05284683,
0.02452098, 0.01993499, -0.00741496, -0.07127699, -0.10550134,
0.01929919, 0.01416306, 0.03589554, -0.02885906, 0.02407024,
0.07062983, -0.13104135, 0.06513657, 0.02129908, 0.0536918 ,
0.00706343, 0.07720044, -0.1053992 , 0.01949217, -0.07627254,
0.03320942, 0.03549356, -0.05003519, -0.0254082 , 0.02865515,
0.02038228, -0.01332848, -0.08172073, -0.03527837, -0.03215587,
-0.04730712, 0.09934537, -0.00610903, -0.0346164 , -0.00188222,
0.0057266 , 0.08050963, -0.05251392, 0.06276695, -0.01655651,
0.02373511, 0.02576121, 0.02054879, 0.01534211, -0.03256356,
-0.07760137, 0.04657681, 0.02044256, 0.09615038, 0.10455099,
0.01405706, -0.01645165, 0.03262366, -0.07067423, 0.03945063], dtype=float32)
model.wv['cute'].shape
Out[108]: (300,)