# Hand-rolled word2vec; the CBOW hierarchical-softmax path has been debugged end-to-end.
# The learning rate is currently fixed (no decay schedule yet).
# Remaining bugs to be fixed in follow-up revisions.
# -*- coding: utf-8 -*-
import numpy as np
from collections import Counter
from math import e as E
embeddingDim = 100
vocabSize = 0
syn0 = None
syn1 = None
syn1Neg = None
c = 4
window = np.array([x for x in range(2*c + 1) if x != c])-c
trainFile = []
L = 0
alpha = .001
neg = 4
M = 1000000
wordNodes = []
table = np.array([-1 for x in range(M)])
MAXEXP = 6
def InitUnigramTable():
trainWordsPow = sum([pow(node.count, .75) for node in wordNodes])
i, d1 = 0, 1.0 * pow(wordNodes[0].count, .75)/trainWordsPow
for a in range(M):
table[a] = i
if 1.0 * a / M >= d1:
i += 1
d1 += pow(wordNodes[i].count, .75) / trainWordsPow
if i > len(wordNodes):
i = len(wordNodes) - 1
def readWord(path):
global trainFile
words = []
with open('trainfile1', 'r') as fp:
for line in fp.readlines():
words.extend(line.strip('\n').split(' '))
trainFile = words[:]
L = len(trainFile)
counter = Counter(words)
#enumerate produce pairs like (index, word)
#so exchange index and word through zip
tmp = dict(enumerate(list(counter)))
wordCount = counter.items()
wordCount.sort(key=lambda x:x[1], reverse=True)
return wordCount, dict(zip(tmp.values(), tmp.keys()))
def huffmanTree(wordsCount):
l = len(wordsCount)
wordSize = l
wordNodes = [node(x) for x in wordsCount]
pos1, pos2 = l-1, l
count = [x[1] for x in wordsCount]+[float('inf') for _ in range(l)]
parent = [-1 for _ in range(2*l)]
binary = [0 for _ in range(2*l)]
for i in range(l-1):
if pos1>=0:
if count[pos1] < count[pos2]:
min1i = pos1
pos1 -= 1
else:
min1i = pos2
pos2 += 1
else:
min1i = pos2
pos2 += 1
if pos1>=0:
if count[pos1] < count[pos2]:
min2i = pos1
pos1 -= 1
else:
min2i = pos2
pos2 += 1
else:
min2i = pos2
pos2 += 1
count[l+i] = count[min1i] + count[min2i]
parent[min2i], parent[min1i] = l + i, l + i
binary[min2i] = 1
for i in range(l):
b = i
while 1:
wordNodes[i].addCodeAttr(binary[b])
b = parent[b]
wordNodes[i].addPoint(b-l)
if b == 2 * l - 2:
break
return wordNodes
def initNet(vocabSize):
global syn0, syn1, syn1Neg
syn0 = np.random.rand(vocabSize*embeddingDim).reshape((vocabSize, -1))/embeddingDim - 0.5/embeddingDim
syn1 = np.zeros((vocabSize, embeddingDim))
syn1Neg = np.zeros((vocabSize, embeddingDim))
def trainModel(hashWord, cbow=0, hs=0, negative=0):
if cbow:
if hs:
for pos in range(len(trainFile)):
trainingWin = window + pos
trainingWin = trainingWin[(trainingWin>0)&(trainingWin<L)]
trainingWords = [hashWord[trainFile[x]] for x in trainingWin]
if len(trainingWords)>0:
print(len(trainingWords))
Xw = syn0[trainingWords].mean(axis=0)
node_ = wordNodes[hashWord[trainFile[pos]]]
e = np.zeros(embeddingDim)
for x in range(node_.codeLen):
f = Xw.dot(syn1[node_.point[x]])
if -MAXEXP<= f <= MAXEXP:
continue
g = alpha * (1-int(node_.code[x])-1.0 /(1 + pow(E, f)))
e += g * syn1[node_.point[x]]
syn1[node_.point[x]] += g * Xw
for x in trainingWin:
syn0[x] += e
if negative > 0:
targets = (np.random.rand(negative) * 1e8).astype(np.int)
targets = table[targets]
f = 1.0/(1+pow(E, -1*Xw.dot(syn1[pos])))
g = (1.0 - f)*alpha
syn1[pos] += g * Xw
e = np.zeros(embeddingDim)
e += g*Xw
for d in targets:
f = 1.0/(1+pow(E, -1*Xw.dot(syn1[d])))
g = -f
syn1Neg[d] += g * Xw
e += g * Xw
for d in targets:
syn0[d] += e
else:
if hs:
for pos in range(len(trainFile)):
trainingWin = window + pos
trainingWin = trainingWin[(trainingWin>0)&(trainingWin<L)]
trainingWords = [hashWord[trainFile[x]] for x in trainingWin]
Xw = syn0[hashWOrd[pos]]
node_ = wordNodes[hashWord[trianFile[pos]]]
for word in trainingWords:
e = np.zeros(embeddingDim)
for x in range(node_.codeLen):
f = 1.0/(1 + pow(E, syn0[word].dot(syn1[node_.point[x]])))
g = alpha * (1-int(node_.code[x])-f)
e += g*syn1[node_.parent[x]]
syn1[node_.point[x]] += g * syn0[word]
syn0[word] += e
if negative > 0:
for pos in range(len(trainFile)):
cen = hashWord[trianFile[pos]]
Xw = syn0[cen]
targets = (np.random.rand(negative) * 1e8).astype(np.int)
targets = table[targets]
f = 1.0/(1+pow(E, Xw.dot(syn1Neg[hashWord[trainFile[pos]]])))
g = (1.0 - f)*alpha
e = np.zeros(embeddingDim)
e += g*sys1Neg[cen]
syn1Neg[cen] += g * Xw
for d in targets:
f = 1.0/(1+pow(E, Xw.dot(syn1Neg[d])))
g = -f * alpha
e += g * syn1Neg[d]
syn1Neg[d] += g * Xw
syn0[cen] += e
class node:
def __init__(self, wordcnttuple):
self.word, self.count = wordcnttuple
self.code, self.codeLen = '', 0,
self.point = []
def addCodeAttr(self, newBinary):
self.code = '{}'.format(newBinary) + self.code
self.codeLen += 1
def addPoint(self, point):
self.point[0:0]=[point]
if __name__ == '__main__':
wordCount, hashWOrd = readWord('./trainFile')
wordNodes = huffmanTree(wordCount)
InitUnigramTable()
initNet(len(wordCount))
for x in syn0:
print(x)
trainModel(hashWOrd, 1, 1)
wordEmbedding = {}
for x, y in zip(wordCount, syn0):
wordEmbedding[x[0]]=y
first = wordEmbedding['phone']
rank = []
for x, y in wordEmbedding.items():
dis = np.sum((first - y)**2)
rank.append((x, dis))
print(rank)
rank.sort(key=lambda x:x[1])
for x in rank[:10]:
print('the distance between {} and {} is: {}'.format('phone', x[0], x[1]))