'''
Created on 2017-09-13

@author: weizhen
'''
import numpy as np


def sigmoid(x):
    return 1 / (1 + np.exp(-x))
First up is the simplest activation function, the sigmoid.
Why choose it as the activation function?
1. It maps values from the domain (-inf, +inf) into the interval (0, 1), which is convenient for computation (it removes the effect of scale).
2. Its curve is not overly steep, and it is differentiable at every point.
3. Its derivative is y(1-y); that is, the derivative can be computed directly from the function's own output value, which is convenient.
If you feel this write-up is decent, please leave a comment; I would be very grateful.
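As a quick check of point 3, here is a small sketch (my own addition, not from the original post) that compares the analytic derivative y(1-y) with a central-difference estimate:

import numpy as np

def sigmoid(x):
    return 1 / (1 + np.exp(-x))

# Analytic derivative: sigma'(x) = y * (1 - y), where y = sigma(x)
x = np.linspace(-5, 5, 11)
y = sigmoid(x)
analytic = y * (1 - y)

# Central-difference estimate for comparison
h = 1e-5
numeric = (sigmoid(x + h) - sigmoid(x - h)) / (2 * h)

print(np.max(np.abs(analytic - numeric)))  # should be on the order of 1e-10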
Error backpropagation: computing gradxi

Let $\theta$ be the vector of scores with components $\theta_{c} = w_{c} \cdot x$, one per output vector $w_{c}$.
Let $z = softmax(\theta)$.
So the loss is the cross entropy $CrossEntropy(y, z) = -\sum_{i} y_{i} \log z_{i}$, where $y$ is the one-hot vector for the correct class.
Let $grad\theta = \frac{\partial CrossEntropy}{\partial \theta}$.
Then $grad\theta = z - y$, and $gradx = \sum_{c} grad\theta_{c}\, w_{c}$ follows by the chain rule through $\theta_{c} = w_{c} \cdot x$.
To summarize:
Tip 3: the value of $\theta$ in the formulas above is obtained by multiplying the output vectors $w_{c}$ with the current value of the variable $x$; the cross-entropy function is used as the loss mainly to measure how close the output distribution is to the correct distribution.
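To make the "how close the distributions are" interpretation concrete, here is a small sketch with made-up numbers of my own: the loss is small when the predicted distribution puts most of its mass on the correct class, and large when it does not.

import numpy as np

def cross_entropy(y, z):
    # y: one-hot correct distribution, z: predicted probabilities
    return -np.sum(y * np.log(z))

y = np.array([0.0, 1.0, 0.0])        # correct class is index 1
good = np.array([0.1, 0.8, 0.1])     # close to the correct distribution
bad = np.array([0.6, 0.2, 0.2])      # far from the correct distribution

print(cross_entropy(y, good))  # ~0.22, small loss
print(cross_entropy(y, bad))   # ~1.61, larger loss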
So, during each round of error backpropagation,
how do we apply gradient descent to the variable x and the variable wc?
First, compute the derivatives of the objective function with respect to x and wc, gradxi and gradwi.
Then choose a step size d (the same step size can be used for both x and wc).
Then apply the gradient descent update to x and wc:
xi+1 = xi - d * gradxi
wi+1 = wi - d * gradwi
Substitute the two updated values into the cross-entropy function; if CrossEntropy(xi+1, wi+1) < the predefined error, stop iterating;
otherwise, continue iterating.
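A minimal sketch of this loop (the loss_and_grads interface, the tolerance, and the default step size below are my own placeholders, not from the original post):

def gradient_descent(x, wc, loss_and_grads, d=0.1, predefined_error=1e-6, max_iters=10000):
    # loss_and_grads(x, wc) is assumed to return (cross_entropy, grad_x, grad_wc)
    for _ in range(max_iters):
        cost, grad_x, grad_wc = loss_and_grads(x, wc)
        if cost < predefined_error:   # CrossEntropy(x_{i+1}, w_{i+1}) < predefined error: stop
            break
        x = x - d * grad_x            # x_{i+1} = x_i - d * gradx_i
        wc = wc - d * grad_wc         # w_{i+1} = w_i - d * gradw_i
    return x, wc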
All of the derivations above are for computing gradxi, and along the way we need the derivative of the cross-entropy with respect to x and wc as implicit functions, i.e. differentiating the cross-entropy through the softmax. This derivative simplifies nicely.
There are two cases:
When fy = fc: $\frac{\partial Entropy}{\partial z_{y}}=z_{y}-1$. That is, when backpropagating the error at the correct (target) node, the derivative with respect to the following node can be computed as $z_{y}-1$, using the value $z_{y}$ already obtained during the forward pass, which gives gradz.
When fy ≠ fc: $\frac{\partial Entropy}{\partial z_{y}}=z_{y}$. That is, for a node other than the correct one, the derivative is simply the forward-pass value $z_{y}$, which gives gradz.
If this is still unclear, refer to the figures below: the first shows the partial derivative of the cross-entropy with respect to the variable x, and the second the partial derivative of the cross-entropy with respect to the variable w.
The figure below shows the partial derivative of the cross-entropy loss with respect to the variable w, where w = (wc, wy).
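These two cases collapse into a single rule, "subtract 1 at the index of the correct class", which is exactly how the softmaxCostAndGradient code later in this post computes its delta. A minimal sketch (the toy vectors below are my own illustration):

import numpy as np

def softmax(x):
    e = np.exp(x - np.max(x))
    return e / np.sum(e)

predicted = np.array([0.1, 0.2, 0.3])        # the input vector x
outputVectors = np.random.randn(5, 3)        # one output vector wc per class
target = 2                                   # index of the correct class

z = softmax(predicted.dot(outputVectors.T))  # forward pass
delta = z.copy()
delta[target] -= 1                           # gradz: z_y - 1 at the target, z_y elsewhere

gradPred = delta.dot(outputVectors)          # gradient with respect to x
gradW = np.outer(delta, predicted)           # gradient with respect to the output vectors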
Next comes the derivative of the sigmoid function.
'''
Created on Sep 11, 2017

@author: p0079482
'''
def sigmoid_graf(f):
    """Compute the gradient of the sigmoid, given f = sigmoid(x)"""
    f = f * (1 - f)
    return f
There is not much to say about this one; it simply computes the derivative of the neuron's activation.
Next is the softmax function, which maps an output such as [3, 1, 3] into the (0, 1) probability range via exp(3)/[exp(3)+exp(1)+exp(3)],
which makes the cross-entropy computation in the next step convenient.
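The softmax implementation itself is not reproduced in this post (it is imported later as softmax from q1_softmax), so here is a minimal, numerically stable sketch of what it might look like; subtracting the row maximum before exponentiating is an assumption of mine, added for stability:

import numpy as np

def softmax(x):
    """Row-wise softmax; subtracting the max keeps exp() from overflowing."""
    if len(x.shape) > 1:
        x = x - np.max(x, axis=1, keepdims=True)
        e = np.exp(x)
        return e / np.sum(e, axis=1, keepdims=True)
    x = x - np.max(x)
    e = np.exp(x)
    return e / np.sum(e)

print(softmax(np.array([3.0, 1.0, 3.0])))  # roughly [0.47, 0.06, 0.47]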
The code below uses numerical differentiation to check whether the gap between the derivative computed in the previous step and the numerical estimate is small enough, i.e. whether the analytically computed derivative is accurate.
'''
Created on Sep 11, 2017

@author: p0079482
'''
import random

import numpy as np


def gradcheck_naive(f, x):
    """Perform a gradient check for a function f
        - f: a function that takes an input x and returns the loss and the gradient
        - x: the input
    """
    rndstate = random.getstate()  # save the current random state so it can be restored below
    random.setstate(rndstate)
    fx, grad = f(x)
    h = 1e-4

    # Iterate over every dimension of x.
    # nditer treats the input array as read-only by default; to modify elements,
    # read-write (or write-only) mode must be requested explicitly.
    it = np.nditer(x, flags=['multi_index'], op_flags=['readwrite'])
    while not it.finished:
        ix = it.multi_index

        old_val = x[ix]
        x[ix] = old_val - h
        random.setstate(rndstate)
        (fxh1, _) = f(x)

        x[ix] = old_val + h
        random.setstate(rndstate)
        (fxh2, _) = f(x)

        numgrad = (fxh2 - fxh1) / (2 * h)
        x[ix] = old_val

        # Compare the analytic and numerical gradients
        reldiff = abs(numgrad - grad[ix]) / max(1, abs(numgrad), abs(grad[ix]))
        if reldiff > 1e-5:
            print("Gradient check failed.")
            print("First gradient error found at index %s" % str(ix))
            print("Your gradient: %f \t Numerical gradient: %f" % (grad[ix], numgrad))
            return

        it.iternext()  # Step to next dimension

    print("Gradient check passed!")
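As a usage sketch, gradcheck_naive can be exercised on a function whose gradient is known in closed form; the quadratic test function below is my own example, assuming gradcheck_naive defined above is in scope:

import numpy as np

# f(x) = sum(x^2) has gradient 2*x, so all three checks should pass
quad = lambda x: (np.sum(x ** 2), 2 * x)

gradcheck_naive(quad, np.array(123.456))      # scalar test
gradcheck_naive(quad, np.random.randn(3,))    # 1-D test
gradcheck_naive(quad, np.random.randn(4, 5))  # 2-D test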
The code below computes a network with one fully connected hidden layer and a softmax output layer, and backpropagates the gradients.
'''
Created on Sep 11, 2017

@author: p0079482
'''
"""Forward and backward propagation for a neural network with a single sigmoid hidden layer"""
import numpy as np

from q1_sigmoid import sigmoid
from q1_softmax import softmax
from q2_sigmoid import sigmoid_graf


def forward_backward_prop(data, labels, params, verbose=False):
    """Forward pass and backpropagation for the one-hidden-layer network"""
    if len(data.shape) >= 2:
        (N, D) = data.shape

    # input dimension, hidden dimension, output dimension
    dimensions = [D, 20, labels.shape[1]]

    # Unpack the parameters of each layer from the flat parameter vector
    t = 0
    W1 = np.reshape(params[t:t + dimensions[0] * dimensions[1]], (dimensions[0], dimensions[1]))
    t += dimensions[0] * dimensions[1]
    b1 = np.reshape(params[t:t + dimensions[1]], (1, dimensions[1]))
    t += dimensions[1]
    W2 = np.reshape(params[t:t + dimensions[1] * dimensions[2]], (dimensions[1], dimensions[2]))
    t += dimensions[1] * dimensions[2]
    b2 = np.reshape(params[t:t + dimensions[2]], (1, dimensions[2]))

    # Forward pass
    # Hidden layer: affine transform followed by sigmoid
    a1 = sigmoid(data.dot(W1) + b1)
    # Output layer: affine transform followed by softmax
    a2 = softmax(a1.dot(W2) + b2)

    cost = -np.sum(np.log(a2[labels == 1])) / N

    # Backpropagation
    # Analytic gradient of the cross-entropy with respect to the softmax input
    grad_a2 = (a2 - labels) / N

    # Backpropagate through the output layer
    gradW2 = np.dot(a1.T, grad_a2)
    gradb2 = np.sum(grad_a2, axis=0, keepdims=True)

    # Backpropagate through the hidden layer
    grad_a1 = np.dot(grad_a2, W2.T) * sigmoid_graf(a1)
    gradW1 = np.dot(data.T, grad_a1)
    gradb1 = np.sum(grad_a1, axis=0, keepdims=True)

    if verbose:  # Verbose mode for logging information
        print("W1 shape: {}".format(str(W1.shape)))
        print("W1 gradient shape: {}".format(str(gradW1.shape)))
        print("b1 shape: {}".format(str(b1.shape)))
        print("b1 gradient shape: {}".format(str(gradb1.shape)))

    # Concatenate all gradients into a single flat vector
    grad = np.concatenate((gradW1.flatten(), gradb1.flatten(),
                           gradW2.flatten(), gradb2.flatten()))

    return cost, grad
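A small usage sketch that packs all the parameters into one flat vector and gradient-checks the whole network; the random data, the one-hot label construction, and the hidden size of 20 (matching the hard-coded value above) are my own assumptions:

import numpy as np

N, D, H, C = 20, 10, 20, 5          # examples, input dim, hidden dim (fixed to 20 above), classes
data = np.random.randn(N, D)
labels = np.zeros((N, C))
labels[np.arange(N), np.random.randint(0, C, N)] = 1   # one-hot labels

# One flat parameter vector holding W1, b1, W2, b2
params = np.random.randn(D * H + H + H * C + C)

gradcheck_naive(lambda p: forward_backward_prop(data, labels, p, verbose=False), params)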
The code below is stochastic gradient descent. SGD really is remarkably fast and effective compared with plain gradient descent.
'''
Created on Sep 12, 2017

@author: p0079482

Stochastic gradient descent.
Every 1000 iterations, the parameters trained so far are saved to disk.
'''
SAVE_PARAMS_EVERY = 1000

import glob
import os.path as op
import pickle as pickle
import random
import sys


def load_saved_params():
    """Load previously saved parameters so training does not have to restart from scratch"""
    st = 0
    for f in glob.glob("saved_params_*.npy"):
        iter = int(op.splitext(op.basename(f))[0].split("_")[2])
        if iter > st:
            st = iter

    if st > 0:
        with open("saved_params_%d.npy" % st, "rb") as f:
            params = pickle.load(f)
            state = pickle.load(f)
        return st, params, state
    else:
        return st, None, None


def save_params(iter, params):
    with open("saved_params_%d.npy" % iter, "wb") as f:
        pickle.dump(params, f)
        pickle.dump(random.getstate(), f)


def sgd(f, x0, step, iterations, postprocessing=None, useSaved=False,
        PRINT_EVERY=10, ANNEAL_EVERY=20000):
    """Stochastic gradient descent

    Inputs:
        f: the function to optimize; given parameters x it returns (cost, gradient)
        x0: the initial value for SGD
        step: the SGD step size
        iterations: the total number of iterations
        postprocessing: parameter post-processing (e.g. word2vec normalizes the word vectors)
        PRINT_EVERY: print the current status every this many iterations
    Output:
        x: the parameters after SGD finishes
    """
    if useSaved:
        start_iter, oldx, state = load_saved_params()
        if start_iter > 0:
            x0 = oldx
            step *= 0.5
        if state:
            random.setstate(state)
    else:
        start_iter = 0

    x = x0
    if not postprocessing:
        postprocessing = lambda x: x

    for iter in range(start_iter + 1, iterations + 1):
        cost, grad = f(x)
        x = x - step * grad
        x = postprocessing(x)

        if iter % PRINT_EVERY == 0:
            print("iter#{},cost={}".format(iter, cost))
            sys.stdout.flush()

        if iter % SAVE_PARAMS_EVERY == 0 and useSaved:
            save_params(iter, x)

        if iter % ANNEAL_EVERY == 0:
            step *= 0.5

    return x
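A quick usage sketch showing the (cost, gradient) interface that sgd expects; the quadratic objective here is my own example, not part of the original post:

import numpy as np

# Minimize f(x) = sum(x^2); f must return (cost, gradient)
quad = lambda x: (np.sum(x ** 2), 2 * x)

x0 = np.array([10.0, -5.0, 3.0])
x_final = sgd(quad, x0, step=0.1, iterations=1000, useSaved=False, PRINT_EVERY=100)
print(x_final)  # should end up close to the zero vector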
The code below implements the error backpropagation for the softmax loss,
together with the implementation of the skip-gram model.
'''
Created on Sep 11, 2017

@author: p0079482
'''
"""Implement the word2vec model and train word vectors with stochastic gradient descent (SGD).
First write a helper function that normalizes every row of a matrix.
This file also implements the softmax cost, the negative-sampling cost,
and the corresponding gradient computations,
and finally the cost/gradient function for the skip-gram model.
"""
import random

import numpy as np

from q1_sigmoid import sigmoid
from q1_softmax import softmax
from q2_gradcheck import gradcheck_naive
from q2_sigmoid import sigmoid_graf


def normalizeRows(x):
    """Row normalization function"""
    N = x.shape[0]
    x /= np.sqrt(np.sum(x ** 2, axis=1)).reshape((N, 1)) + 1e-30
    return x


def test_normalize_rows():
    print("Testing normalizeRows")
    x = normalizeRows(np.array([[3.0, 4.0], [1, 2]]))
    """The result should be [[0.6,0.8],[0.4472136,0.89442719]]"""
    print(x)
    assert(np.amax(np.fabs(x - np.array([[0.6, 0.8], [0.4472136, 0.89442719]]))) <= 1e-6)
    print(" ")


def softmaxCostAndGradient(predicted, target, outputVectors, dataset):
    """Softmax cost function for word2vec

    Inputs:
        predicted: numpy array for the predicted (center) word vector
        target: index of the target word
        outputVectors: "output" vectors for all tokens (as rows)
        dataset: used for negative sampling; unused here
    Outputs:
        cost: the cross-entropy cost
        gradPred: the gradient with respect to the predicted word vector
        grad: the gradient with respect to all the other word vectors
    """
    probabilities = softmax(predicted.dot(outputVectors.T))
    cost = -np.log(probabilities[target])

    delta = probabilities
    delta[target] -= 1          # gradz: subtract 1 at the target index

    N = delta.shape[0]
    D = predicted.shape[0]
    grad = delta.reshape((N, 1)) * predicted.reshape((1, D))
    gradPred = (delta.reshape((1, N)).dot(outputVectors)).flatten()
    return cost, gradPred, grad


def negSamplingCostAndGradient(predicted, target, outputVectors, dataset, K=10):
    """Cost function and gradients for word2vec with negative sampling"""
    grad = np.zeros(outputVectors.shape)
    gradPred = np.zeros(predicted.shape)

    # Sample K negative indices, re-sampling whenever the target itself is drawn
    indices = [target]
    for k in range(K):
        newidx = dataset.sampleTokenIdx()
        while newidx == target:
            newidx = dataset.sampleTokenIdx()
        indices += [newidx]

    # Label +1 for the target word and -1 for every negative sample
    labels = np.array([1] + [-1 for k in range(K)])
    vecs = outputVectors[indices, :]

    t = sigmoid(vecs.dot(predicted) * labels)
    cost = -np.sum(np.log(t))

    delta = labels * (t - 1)
    gradPred = delta.reshape((1, K + 1)).dot(vecs).flatten()
    gradtemp = delta.reshape((K + 1, 1)).dot(predicted.reshape((1, predicted.shape[0])))
    for k in range(K + 1):
        grad[indices[k]] += gradtemp[k, :]

    return cost, gradPred, grad


def skipgram(currentWord, C, contextWords, tokens, inputVectors, outputVectors,
             dataset, word2vecCostAndGradient=softmaxCostAndGradient):
    """Skip-gram model in word2vec

    Inputs:
        currentWord: the current center word (a string)
        C: the context size (window size)
        contextWords: at most 2*C context words
        tokens: a dictionary mapping words to their indices in the word-vector matrices
        inputVectors: "input" word vectors (as rows) for all tokens
        outputVectors: "output" word vectors (as rows) for all tokens
        word2vecCostAndGradient: the cost and gradient function for a prediction vector
            given the target word vectors; either of the two cost functions above
    Outputs:
        cost: the cost of the skip-gram model
        grad: the gradients with respect to the word vectors
    """
    currentI = tokens[currentWord]
    predicted = inputVectors[currentI, :]

    cost = 0.0
    gradIn = np.zeros(inputVectors.shape)
    gradOut = np.zeros(outputVectors.shape)
    for cwd in contextWords:
        idx = tokens[cwd]
        cc, gp, gg = word2vecCostAndGradient(predicted, idx, outputVectors, dataset)
        cost += cc
        gradOut += gg
        gradIn[currentI, :] += gp

    return cost, gradIn, gradOut


def word2vec_sgd_wrapper(word2vecModel, tokens, wordVectors, dataset, C,
                         word2vecCostAndGradient=softmaxCostAndGradient):
    batchsize = 50
    cost = 0.0
    grad = np.zeros(wordVectors.shape)
    N = wordVectors.shape[0]
    splitIndex = int(N / 2)
    inputVectors = wordVectors[:splitIndex, :]
    outputVectors = wordVectors[splitIndex:, :]
    for i in range(batchsize):
        C1 = random.randint(1, C)
        centerword, context = dataset.getRandomContext(C1)

        denom = 1   # the same normalization is used for skip-gram and CBOW here

        c, gin, gout = word2vecModel(centerword, C1, context, tokens,
                                     inputVectors, outputVectors, dataset,
                                     word2vecCostAndGradient)
        cost += c / batchsize / denom
        grad[:splitIndex, :] += gin / batchsize / denom
        grad[splitIndex:, :] += gout / batchsize / denom
    return cost, grad


def test_word2vec():
    """ Interface to the dataset for negative sampling """
    dataset = type('dummy', (), {})()

    def dummySampleTokenIdx():
        return random.randint(0, 4)

    def getRandomContext(C):
        tokens = ["a", "b", "c", "d", "e"]
        return tokens[random.randint(0, 4)], [tokens[random.randint(0, 4)] for i in range(2 * C)]

    dataset.sampleTokenIdx = dummySampleTokenIdx
    dataset.getRandomContext = getRandomContext

    random.seed(31415)
    np.random.seed(9265)
    dummy_vectors = normalizeRows(np.random.randn(10, 3))
    dummy_tokens = dict([("a", 0), ("b", 1), ("c", 2), ("d", 3), ("e", 4)])

    print("===== Gradient check for skip-gram ======")
    # gradcheck_naive(lambda vec: word2vec_sgd_wrapper(skipgram, dummy_tokens, vec, dataset, 5), dummy_vectors)
    gradcheck_naive(lambda vec: word2vec_sgd_wrapper(skipgram, dummy_tokens, vec, dataset, 5, negSamplingCostAndGradient), dummy_vectors)

    print("\n===== Gradient check for CBOW")
    # gradcheck_naive(lambda vec: word2vec_sgd_wrapper(cbow, dummy_tokens, vec, dataset, 5), dummy_vectors)
    # gradcheck_naive(lambda vec: word2vec_sgd_wrapper(cbow, dummy_tokens, vec, dataset, 5, negSamplingCostAndGradient), dummy_vectors)

    print("\n===== Results ===")
    print(skipgram("c", 3, ["a", "b", "e", "d", "b", "c"], dummy_tokens, dummy_vectors[:5, :], dummy_vectors[5:, :], dataset))
    # print(skipgram("c", 1, ["a", "b"], dummy_tokens, dummy_vectors[:5, :], dummy_vectors[5:, :], dataset, negSamplingCostAndGradient))
    # print(cbow("a", 2, ["a", "b", "c", "a"], dummy_tokens, dummy_vectors[:5, :], dummy_vectors[5:, :], dataset))
    # print(cbow("a", 2, ["a", "b", "a", "c"], dummy_tokens, dummy_vectors[:5, :], dummy_vectors[5:, :], dataset, negSamplingCostAndGradient))


if __name__ == "__main__":
    test_normalize_rows()
    test_word2vec()
The following is the training of the word2vec model, genuinely written by hand without relying on any packages.
(It looks like I had better keep quietly laying bricks here in China; I have a feeling that even if I got into Stanford I would never manage to graduate.)
'''
Created on 2017-09-17

@author: weizhen
'''
import random

import numpy as np

from data_utils import StanfordSentiment
from q3_sgd import sgd, load_saved_params
from q3_word2vec import skipgram, word2vec_sgd_wrapper, negSamplingCostAndGradient

# Reset the random seed to make sure that everyone gets the same results
dataset = StanfordSentiment()
tokens = dataset.tokens()
nWords = len(tokens)

# We are going to train 10-dimensional vectors for this assignment
dimVectors = 10

# Context size
C = 5

# Train word vectors (this could take a while!)
# Reset the random seed to make sure that everyone gets the same results
random.seed(31415)
np.random.seed(9265)
wordVectors = np.concatenate(((np.random.rand(nWords, dimVectors) - .5) / dimVectors,
                              np.zeros((nWords, dimVectors))),
                             axis=0)
wordVectors0 = sgd(lambda vec: word2vec_sgd_wrapper(skipgram, tokens, vec, dataset, C, negSamplingCostAndGradient),
                   wordVectors, 0.3, 40000, None, True, PRINT_EVERY=10)

# Sanity check: the cost at convergence should be around or below 10
# Sum the input and output word vectors
# st, wordVectors0, state2 = load_saved_params()
wordVectors = (wordVectors0[:int(nWords), :] + wordVectors0[int(nWords):, :])

print("\n=== For autograder ===")
checkWords = ["the", "a", "an", "movie", "ordinary", "but", "and"]
checkIdx = [tokens[word] for word in checkWords]
checkVecs = wordVectors[checkIdx, :]
print(checkVecs)
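As an aside before the training log: once training finishes, a quick way to sanity-check the vectors is a cosine-similarity nearest-neighbor lookup. This sketch is my own addition and assumes the wordVectors matrix and the tokens word-to-index dictionary from the script above:

import numpy as np

def nearest_neighbors(word, wordVectors, tokens, topk=5):
    """Return the topk words most cosine-similar to `word`."""
    idx2word = {i: w for w, i in tokens.items()}
    v = wordVectors[tokens[word]]
    norms = np.linalg.norm(wordVectors, axis=1) * np.linalg.norm(v) + 1e-30
    sims = wordVectors.dot(v) / norms
    best = np.argsort(-sims)
    return [(idx2word[i], sims[i]) for i in best[1:topk + 1]]  # skip the word itself

print(nearest_neighbors("movie", wordVectors, tokens))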
The output log from the training run is as follows.
Purely hand-written. Truly, there is always a taller mountain and a more skilled person out there.
Since TensorFlow can do this in a single line of code, why write it all yourself?
No particular reason, just to show off a bit. Hahaha.