# 深度学习与自然语言处理(4)_斯坦福cs224d 大作业测验1与解答

http://blog.csdn.net/han_xiaoyang/article/details/51760923
http://blog.csdn.net/longxinchen_ml/article/details/51765418

## 1 Softmax (10 分)

(part a) (5分)

$softmax(x) = softmax(x + c)$

$(softmax(x))_i = \frac{e^{x_i}}{\sum_j e^{x_j}}$

$(softmax(x+c))_i = \frac{\exp(x_i + c)}{\sum_{j=1}^{\dim(x)} \exp(x_j + c)} = \frac{\exp(c)\exp(x_i)}{\exp(c)\sum_{j=1}^{\dim(x)} \exp(x_j)} = \frac{\exp(x_i)}{\sum_{j=1}^{\dim(x)} \exp(x_j)} = (softmax(x))_i$

(part b) (5 分)

import numpy as np

def softmax(x):
    """
    Row-wise softmax of a score matrix.

    Args:
        x: numpy array of shape (n_rows, n_cols); each row is one score
           vector.

    Returns:
        Array of the same shape whose rows are probabilities summing to 1.

    Fix vs. the published version: the original used ``x -= np.max(...)``,
    silently mutating the caller's array in place; we subtract into a new
    array so the input is left untouched.
    """
    assert len(x.shape) > 1, "Softmax的得分向量要求维度高于1"
    # Subtract the per-row max for numerical stability -- softmax is
    # shift-invariant (see part a above).
    x = x - np.max(x, axis=1, keepdims=True)
    return np.exp(x) / np.sum(np.exp(x), axis=1, keepdims=True)

## 2 神经网络基础（30分）

(part a) (3 分)

$\sigma(x) = \frac{1}{1 + e^{-x}}, \qquad \sigma'(x) = \sigma(x)\,(1 - \sigma(x))$

(part b) (3 分)

$CE(y, \hat{y}) = -\sum_i y_i \log(\hat{y}_i)$

$\frac{\partial CE(y,\hat{y})}{\partial \theta_i} = \begin{cases} \hat{y}_i - 1, & i = k \\ \hat{y}_i, & \text{otherwise} \end{cases}$

(part c) (6 分)

$h = \mathrm{sigmoid}(xW_1 + b_1), \qquad \hat{y} = \mathrm{softmax}(hW_2 + b_2)$

$\delta_1 = \frac{\partial CE}{\partial z_2} = \hat{y} - y$
$\delta_2 = \frac{\partial CE}{\partial h} = \delta_1 \frac{\partial z_2}{\partial h} = \delta_1 W_2^T$
$\delta_3 = \frac{\partial CE}{\partial z_1} = \delta_2 \odot \sigma'(z_1)$
$\frac{\partial CE}{\partial x} = \delta_3 \frac{\partial z_1}{\partial x} = \delta_3 W_1^T$

(part d) (2 分)

(part e) (4 分) 在q2_sigmoid.py中补充写出sigmoid激活函数的和求它的梯度的对应代码。并使用python q2_sigmoid.py进行测试，同样的，测试用例有可能不太详尽，因此尽量检查下自己的代码。

def sigmoid_grad(f):
    """
    Gradient of the sigmoid, expressed in terms of its output.

    Given f = sigmoid(x), the derivative is sigma'(x) = f * (1 - f);
    numpy broadcasting makes this work elementwise on arrays as well.
    """
    grad = f * (1 - f)
    return grad

(part f) (4 分)

def gradcheck_naive(f, x):
    """
    Naive (centered-difference) gradient check.

    Args:
        f: function taking a single argument x and returning a tuple
           (loss, gradient-at-x).
        x: numpy array (any shape, including 0-d) -- the point at which
           to check the gradient.  Temporarily perturbed in place, then
           restored.

    Prints a message for the first failing index, or a success message
    when every component matches.

    NOTE: the published snippet was truncated -- the baseline f(x) call,
    the np.nditer setup and the relative-difference computation were
    missing; they are restored here following the standard assignment
    solution.
    """
    rndstate = random.getstate()
    random.setstate(rndstate)
    fx, grad = f(x)  # evaluate at the original point
    h = 1e-4

    # Iterate over every index of x.
    it = np.nditer(x, flags=['multi_index'], op_flags=['readwrite'])
    while not it.finished:
        ix = it.multi_index

        old_val = x[ix]

        # f(x - h): reset the RNG first so a stochastic f is comparable
        # across the three evaluations.
        x[ix] = old_val - h
        random.setstate(rndstate)
        (fxh1, _) = f(x)

        # f(x + h)
        x[ix] = old_val + h
        random.setstate(rndstate)
        (fxh2, _) = f(x)

        x[ix] = old_val  # restore the perturbed entry

        # Centered difference vs. the analytic gradient, with a relative
        # error that is robust to very small magnitudes.
        numgrad = (fxh2 - fxh1) / (2 * h)
        reldiff = abs(numgrad - grad[ix]) / max(1, abs(numgrad), abs(grad[ix]))
        if reldiff > 1e-5:
            print("First gradient error found at index %s" % str(ix))
            return

        it.iternext()  # Step to next dimension

    print("Gradient check passed!")

(part g) (8 分)

def forward_backward_prop(data, labels, params, verbose = False):
    """
    Forward and backward propagation for a 2-layer sigmoid/softmax net.

    Args:
        data: (N, Dx) input matrix, one example per row.
        labels: (N, Dy) one-hot label matrix.
        params: flat parameter vector holding W1, b1, W2, b2; the layout
            is given by the module-level `dimensions` list [Dx, H, Dy]
            (defined elsewhere in the assignment file).
        verbose: when True, print the parameter shapes.

    Returns:
        (cost, grad): mean cross-entropy cost over the batch and the
        flat gradient vector with the same layout as `params`.

    NOTE: the published snippet was truncated -- the actual gradient
    computations and the final concatenation were missing; they are
    restored here following the standard assignment solution.
    """
    if len(data.shape) >= 2:
        (N, _) = data.shape

    ### Unpack the flat parameter vector, layer by layer.
    t = 0
    W1 = np.reshape(params[t:t+dimensions[0]*dimensions[1]],
                    (dimensions[0], dimensions[1]))
    t += dimensions[0]*dimensions[1]
    b1 = np.reshape(params[t:t+dimensions[1]], (1, dimensions[1]))
    t += dimensions[1]
    W2 = np.reshape(params[t:t+dimensions[1]*dimensions[2]],
                    (dimensions[1], dimensions[2]))
    t += dimensions[1]*dimensions[2]
    b2 = np.reshape(params[t:t+dimensions[2]], (1, dimensions[2]))

    ### Forward pass
    a1 = sigmoid(data.dot(W1) + b1)   # hidden activations
    a2 = softmax(a1.dot(W2) + b2)     # output probabilities

    # Mean cross-entropy over the batch (labels are one-hot, so this
    # picks out log-probability of the correct class per row).
    cost = - np.sum(np.log(a2[labels == 1]))/N

    ### Backward pass

    # Gradient of the cross-entropy loss at the softmax input.
    grad_a2 = ( a2 - labels ) / N

    # Backpropagate through the second layer.
    gradW2 = a1.T.dot(grad_a2)
    gradb2 = np.sum(grad_a2, axis=0, keepdims=True)

    # Backpropagate through the first layer (sigmoid non-linearity).
    grad_a1 = grad_a2.dot(W2.T) * sigmoid_grad(a1)
    gradW1 = data.T.dot(grad_a1)
    gradb1 = np.sum(grad_a1, axis=0, keepdims=True)

    if verbose: # Verbose mode for logging information
        print("W1 shape: {}".format( str(W1.shape) ))
        print("b1 shape: {}".format( str(b1.shape) ))

    ### Stack the gradients into one flat vector (same layout as params).
    grad = np.concatenate((gradW1.flatten(), gradb1.flatten(),
                           gradW2.flatten(), gradb2.flatten()))

    return cost, grad

## 3 word2vec(40分+5附加分)

(part a) (3分)

$\hat{y}_o = p(o \mid c) = \frac{\exp(u_o^T v_c)}{\sum_{w=1}^{W} \exp(u_w^T v_c)}$ (4)

$J_{softmax-CE}(o, v_c, U) = CE(y, \hat{y})$ (5)

$\frac{\partial J}{\partial v_c} = U^T(\hat{y} - y) = -u_o + \sum_{w=1}^{W} \hat{y}_w\, u_w$

(part b) (3分)

$\frac{\partial J}{\partial U} = v_c (\hat{y} - y)^T$

$\frac{\partial J}{\partial u_w} = \begin{cases} (\hat{y}_w - 1)\, v_c, & w = o \\ \hat{y}_w\, v_c, & \text{otherwise} \end{cases}$

(part c) (6分)

$J_{neg-sample}(o, v_c, U) = -\log(\sigma(u_o^T v_c)) - \sum_{k=1}^{K} \log(\sigma(-u_k^T v_c))$ (6)

$\frac{\partial J}{\partial v_c} = (\sigma(u_o^T v_c) - 1)\, u_o - \sum_{k=1}^{K} (\sigma(-u_k^T v_c) - 1)\, u_k$
$\frac{\partial J}{\partial u_o} = (\sigma(u_o^T v_c) - 1)\, v_c$
$\frac{\partial J}{\partial u_k} = -(\sigma(-u_k^T v_c) - 1)\, v_c, \quad \text{for all } k = 1, 2, \dots, K$

(part d) (8分)

$J_{skip-gram}(word_{c-m \dots c+m}) = \sum_{-m \le j \le m,\, j \ne 0} F(w_{c+j}, v_c)$ (7)

CBOW略有不同：不同于使用 $v_c$ 作为预测向量，CBOW（一个小小的变体）以 $\hat{v}$ 为预测向量，即对上下文的输入词向量求和：

$\hat{v} = \sum_{-m \le j \le m,\, j \ne 0} v_{c+j}$ (8)

$J_{CBOW}(word_{c-m \dots c+m}) = F(w_c, \hat{v})$ (9)

答案用 $\frac{\partial F(w_i,\hat{v})}{\partial U}$ 和 $\frac{\partial F(w_i,\hat{v})}{\partial \hat{v}}$ 表示：

Skip-gram:
$\frac{\partial J_{skip-gram}}{\partial U} = \sum_{-m \le j \le m,\, j \ne 0} \frac{\partial F(w_{c+j}, v_c)}{\partial U}$
$\frac{\partial J_{skip-gram}}{\partial v_c} = \sum_{-m \le j \le m,\, j \ne 0} \frac{\partial F(w_{c+j}, v_c)}{\partial v_c}$
$\frac{\partial J_{skip-gram}}{\partial v_j} = 0, \quad \text{for all } j \ne c$

CBOW:
$\frac{\partial J_{CBOW}}{\partial U} = \frac{\partial F(w_c, \hat{v})}{\partial U}$ (using the definition of $\hat{v}$ in the problem)
$\frac{\partial J_{CBOW}}{\partial v_j} = \frac{\partial F(w_c, \hat{v})}{\partial \hat{v}}, \quad \text{for all } j \in \{c-m, \dots, c-1, c+1, \dots, c+m\}$
$\frac{\partial J_{CBOW}}{\partial v_j} = 0, \quad \text{for all } j \notin \{c-m, \dots, c-1, c+1, \dots, c+m\}$

(part e) (12分)

import numpy as np
import random

from q1_softmax import softmax

def normalizeRows(x):
    """
    Normalize each row of x to unit (L2) length, in place.

    A tiny epsilon (1e-30) is added to the denominator so that an
    all-zero row does not divide by zero.  The mutated array is also
    returned for convenience.
    """
    row_norms = np.linalg.norm(x, axis=1, keepdims=True)
    x /= row_norms + 1e-30
    return x

def test_normalize_rows():
    # Smoke test for normalizeRows.  (Python 2 print syntax, matching
    # the rest of this article.)
    print "Testing normalizeRows..."
    x = normalizeRows(np.array([[3.0,4.0],[1, 2]]))
    # Expected result: [[0.6, 0.8], [0.4472, 0.8944]]
    print x
    assert (np.amax(np.fabs(x - np.array([[0.6,0.8],[0.4472136,0.89442719]]))) <= 1e-6)
    print ""

"""
word2vec的Softmax损失函数
"""

# 输入:
# - predicted: 预测词向量的numpy数组
# - target: 目标词的下标
# - outputVectors: 所有token的"output"向量(行形式)
# - dataset: 用来做负例采样的，这里其实没用着

# 输出:
# - cost: 输出的互熵损失
#        vector
# - grad: the gradient with respect to all the other word
#        vectors

probabilities = softmax(predicted.dot(outputVectors.T))
cost = -np.log(probabilities[target])
delta = probabilities
delta[target] -= 1
N = delta.shape[0]
D = predicted.shape[0]

# NOTE(review): truncated fragment of negSamplingCostAndGradient.
# The `def` line was lost in extraction -- only the tail of the
# signature ("K=10):") survived -- and several statements below are
# incomplete.
K=10):
    """
    Cost and gradient for word2vec's negative-sampling loss
    """

    # Sample K negative indices, rejecting the target itself.
    indices = [target]
    for k in xrange(K):
        newidx = dataset.sampleTokenIdx()
        while newidx == target:
            newidx = dataset.sampleTokenIdx()
        indices += [newidx]

    # Label +1 for the true target, -1 for each negative sample.
    labels = np.array([1] + [-1 for k in xrange(K)])
    vecs = outputVectors[indices,:]

    t = sigmoid(vecs.dot(predicted) * labels)
    cost = -np.sum(np.log(t))

    delta = labels * (t - 1)
    # NOTE(review): incomplete line -- presumably the tail of the
    # gradPred / grad computation (a reshape to (1, D)); confirm
    # against the original assignment code.
    (1,predicted.shape[0])))
    for k in xrange(K+1):
        # NOTE(review): the loop body was lost in extraction.

    # --- alternative (sample-on-the-fly) implementation below ---
    t = sigmoid(predicted.dot(outputVectors[target,:]))
    cost = -np.log(t)
    delta = t - 1

    gradPred += delta * outputVectors[target, :]
    grad[target, :] += delta * predicted

    for k in xrange(K):
        idx = dataset.sampleTokenIdx()

        # sigma(-x) = 1 - sigma(x): negative-sample term of eq. (6).
        t = sigmoid(-predicted.dot(outputVectors[idx,:]))
        cost += -np.log(t)
        delta = 1 - t

        gradPred += delta * outputVectors[idx, :]
        grad[idx, :] += delta * predicted

def skipgram(currentWord, C, contextWords, tokens, inputVectors, outputVectors,
    # NOTE(review): the signature is cut off here -- the remaining
    # parameters (dataset, word2vecCostAndGradient=...) and the
    # gradient accumulation / return statement were lost in extraction
    # (the callers in test_word2vec pass them).
    """ Skip-gram model in word2vec """

    # Implementation of the skip-gram model

    # Input:
    # - currentWord: the current center word (as a string)
    # - C: context size (window size)
    # - contextWords: at most 2*C context words
    # - tokens: dict mapping a word to its index in the vector lists
    # - inputVectors: "input" word vectors (as rows) for all tokens
    # - outputVectors: "output" word vectors (as rows) for all tokens
    # - word2vecCostAndGradient: the cost and gradient function for a prediction vector given the target word vectors, could be one of the two cost functions you implemented above

    # Output:
    # - cost: cost value computed by the skip-gram model

    currentI = tokens[currentWord]
    predicted = inputVectors[currentI, :]

    # Sum the cost over every context word, using the center word's
    # input vector as the prediction.
    cost = 0.0
    for cwd in contextWords:
        idx = tokens[cwd]
        cc, gp, gg = word2vecCostAndGradient(predicted, idx, outputVectors, dataset)
        cost += cc

# NOTE(review): body fragment of word2vec_sgd_wrapper -- the `def`
# line and the initialisation of the cost/grad accumulators were lost
# in extraction.
batchsize = 50
cost = 0.0
N = wordVectors.shape[0]
# The first half of the stacked matrix holds input vectors, the second
# half output vectors.
inputVectors = wordVectors[:N/2,:]
outputVectors = wordVectors[N/2:,:]
for i in xrange(batchsize):
    C1 = random.randint(1,C)
    centerword, context = dataset.getRandomContext(C1)

    # NOTE(review): both branches set denom = 1 -- apparently a
    # placeholder left by the author.
    if word2vecModel == skipgram:
        denom = 1
    else:
        denom = 1

    c, gin, gout = word2vecModel(centerword, C1, context, tokens, inputVectors, outputVectors, dataset, word2vecCostAndGradient)
    cost += c / batchsize / denom
    grad[:N/2, :] += gin / batchsize / denom
    grad[N/2:, :] += gout / batchsize / denom

def test_word2vec():
    # Interface to the dataset for negative sampling
    dataset = type('dummy', (), {})()
    def dummySampleTokenIdx():
        # Uniform random token from a 5-word dummy vocabulary.
        return random.randint(0, 4)

    def getRandomContext(C):
        tokens = ["a", "b", "c", "d", "e"]
        # NOTE(review): the start of the return statement was lost in
        # extraction -- only the trailing list-comprehension fragment
        # survived.
        for i in xrange(2*C)]
    dataset.sampleTokenIdx = dummySampleTokenIdx
    dataset.getRandomContext = getRandomContext

    # Fixed seeds so the gradient checks are reproducible.
    random.seed(31415)
    np.random.seed(9265)
    dummy_vectors = normalizeRows(np.random.randn(10,3))
    dummy_tokens = dict([("a",0), ("b",1), ("c",2),("d",3),("e",4)])
    print "==== Gradient check for skip-gram ===="
    gradcheck_naive(lambda vec: word2vec_sgd_wrapper(skipgram, dummy_tokens, vec, dataset, 5), dummy_vectors)
    print "\n==== Gradient check for CBOW      ===="
    gradcheck_naive(lambda vec: word2vec_sgd_wrapper(cbow, dummy_tokens, vec, dataset, 5), dummy_vectors)

    print "\n=== Results ==="
    print skipgram("c", 3, ["a", "b", "e", "d", "b", "c"], dummy_tokens, dummy_vectors[:5,:], dummy_vectors[5:,:], dataset)
    print skipgram("c", 1, ["a", "b"], dummy_tokens, dummy_vectors[:5,:], dummy_vectors[5:,:], dataset, negSamplingCostAndGradient)
    print cbow("a", 2, ["a", "b", "c", "a"], dummy_tokens, dummy_vectors[:5,:], dummy_vectors[5:,:], dataset)
    print cbow("a", 2, ["a", "b", "a", "c"], dummy_tokens, dummy_vectors[:5,:], dummy_vectors[5:,:], dataset, negSamplingCostAndGradient)

if __name__ == "__main__":
    test_normalize_rows()
    test_word2vec()

(f) (4分) 在代码q3_sgd.py中完成对随机梯度下降优化函数的实现。并且运行python q3_sgd.py测试你的实现。

# Implement stochastic gradient descent

# Checkpoint: save the parameters every 1000 SGD iterations
SAVE_PARAMS_EVERY = 1000

import glob
import os.path as op
import cPickle as pickle
import sys

"""
载入之前的参数以免从头开始训练
"""
st = 0
for f in glob.glob("saved_params_*.npy"):
iter = int(op.splitext(op.basename(f))[0].split("_")[2])
if (iter > st):
st = iter

if st > 0:
with open("saved_params_%d.npy" % st, "r") as f:
return st, params, state
else:
return st, None, None

def save_params(iter, params):
    """
    Checkpoint the parameters and the RNG state for iteration `iter`.

    Two pickles are written back-to-back: first `params`, then the
    state of the `random` module (so a resumed run replays the same
    sample sequence).

    Fix vs. the published version: the file must be opened in *binary*
    mode -- pickle writes bytes, and text mode ("w") corrupts or
    rejects the stream.
    """
    with open("saved_params_%d.npy" % iter, "wb") as f:
        pickle.dump(params, f)
        pickle.dump(random.getstate(), f)

def sgd(f, x0, step, iterations, postprocessing = None, useSaved = False, PRINT_EVERY=10, ANNEAL_EVERY = 20000):
    """ Stochastic gradient descent """
    ###########################################################
    # Input
    #   - f: the function to optimise
    #   - x0: initial value for SGD
    #   - step: SGD step size
    #   - iterations: total number of iterations
    #   - postprocessing: post-processing applied to the parameters
    #     after each update (e.g. word2vec re-normalises the word
    #     vectors)
    #   - PRINT_EVERY: print status every this many iterations
    # Output:
    #   - x: the parameters when SGD finishes                   #
    ###########################################################

    if useSaved:
        # NOTE(review): the call that defines start_iter / oldx /
        # state (presumably load_saved_params()) was lost in
        # extraction.
        if start_iter > 0:
            x0 = oldx;
            # Replay the step-size annealing that would have happened
            # during the saved iterations.
            step *= 0.5 ** (start_iter / ANNEAL_EVERY)

        if state:
            random.setstate(state)
    else:
        start_iter = 0

    x = x0

    if not postprocessing:
        postprocessing = lambda x: x

    # Exponentially smoothed cost (declared but unused in this
    # excerpt).
    expcost = None

    for iter in xrange(start_iter + 1, iterations + 1):
        # NOTE(review): the line computing `cost, grad = f(x)` was
        # lost in extraction; `grad` and `cost` below depend on it.
        x = x - step * grad
        x = postprocessing(x)

        if iter % PRINT_EVERY == 0:
            print "Iter#{}, cost={}".format(iter, cost)
            sys.stdout.flush()

        if iter % SAVE_PARAMS_EVERY == 0 and useSaved:
            save_params(iter, x)

        # Halve the step size every ANNEAL_EVERY iterations.
        if iter % ANNEAL_EVERY == 0:
            step *= 0.5

    return x

(part g) (4分)

(part h) 附加题（5分）

def cbow(currentWord, C, contextWords, tokens, inputVectors, outputVectors,
    # NOTE(review): the signature is cut off here (presumably dataset,
    # word2vecCostAndGradient=...), and the second loop's body --
    # which accumulates cost / gradIn / gradOut -- was lost in
    # extraction.
    """
    CBOW model for word2vec
    """

    cost = 0

    D = inputVectors.shape[1]
    predicted = np.zeros((D,))

    # The prediction vector v-hat is the sum of the context words'
    # input vectors (eq. 8).
    indices = [tokens[cwd] for cwd in contextWords]
    for idx in indices:
        predicted += inputVectors[idx, :]

    for idx in indices:
        # NOTE(review): loop body missing.

    return cost, gradIn, gradOut

## 4 情感分析（20分）

“超级消极”，“比较消极”，“中立”，“积极”，“非常积极”

(part a)（10分）

import numpy as np
import random

from cs224d.data_utils import *

from q1_softmax import softmax

def getSentenceFeature(tokens, wordVectors, sentence):
    """
    Very simple sentence feature: the average of the word vectors of
    all words in the sentence.

    Args:
        tokens: dict mapping words to their row index in wordVectors.
        wordVectors: word vectors, one token per row.
        sentence: list of words in the sentence of interest.

    Returns:
        sentVector: 1-D numpy feature vector of length
        wordVectors.shape[1].

    (The published version also pre-filled a zeros vector that was
    immediately overwritten -- dead code, removed.)
    """
    indices = [tokens[word] for word in sentence]
    sentVector = np.mean(wordVectors[indices, :], axis=0)

    return sentVector

def softmaxRegression(features, labels, weights, regularization = 0.0, nopredictions = False):
    """ Softmax Regression """
    # Softmax regression with L2 regularisation.

    # Input:
    # - features: feature vectors, each row is a feature vector
    # - labels: labels corresponding to the feature vectors
    # - weights: weights of the regressor
    # - regularization: L2 regularization constant

    # Output:
    # - cost: cost of the regressor
    # - grad: gradient of the regressor cost with respect to its weights
    # - pred: label predictions of the regressor (you might find np.argmax helpful)

    prob = softmax(features.dot(weights))
    # N = number of examples; a single 1-D feature vector counts as 1.
    if len(features.shape) > 1:
        N = features.shape[0]
    else:
        N = 1
    # A vectorized implementation of    1/N * sum(cross_entropy(x_i, y_i)) + 1/2*|w|^2
    cost = np.sum(-np.log(prob[range(N), labels])) / N
    cost += 0.5 * regularization * np.sum(weights ** 2)

    # NOTE(review): the gradient computation (`grad = ...`) was lost
    # in extraction.

    if N > 1:
        pred = np.argmax(prob, axis=1)
    else:
        pred = np.argmax(prob)

    if nopredictions:
        # NOTE(review): return statements lost in extraction --
        # presumably `return cost, grad` here ...
    else:
        # ... and `return cost, grad, pred` here.

def accuracy(y, yhat):
    """Percentage of positions where y and yhat agree (0..100)."""
    assert(y.shape == yhat.shape)
    n_correct = np.sum(y == yhat)
    return 100.0 * n_correct / y.size

def softmax_wrapper(features, labels, weights, regularization = 0.0):
    """
    Adapter for sgd(): run softmaxRegression and return only
    (cost, grad), discarding the predictions.

    Fix vs. the published snippet: the return statement was missing,
    so the wrapper returned None and sgd() could not unpack
    cost/grad.
    """
    cost, grad, _ = softmaxRegression(features, labels, weights,
        regularization)
    return cost, grad

def sanity_check():
    """
    Run python q4_softmaxreg.py.
    """
    # Fixed seeds for reproducibility.
    random.seed(314159)
    np.random.seed(265)

    dataset = StanfordSentiment()
    tokens = dataset.tokens()
    nWords = len(tokens)

    # NOTE(review): the line loading `wordVectors0` (the trained word
    # vectors) was lost in extraction.  Input and output vectors are
    # stacked, so the two halves are summed into one matrix.
    wordVectors = (wordVectors0[:nWords,:] + wordVectors0[nWords:,:])
    dimVectors = wordVectors.shape[1]

    dummy_weights = 0.1 * np.random.randn(dimVectors, 5)
    dummy_features = np.zeros((10, dimVectors))
    dummy_labels = np.zeros((10,), dtype=np.int32)
    for i in xrange(10):
        words, dummy_labels[i] = dataset.getRandomTrainSentence()
        dummy_features[i, :] = getSentenceFeature(tokens, wordVectors, words)
    print "==== Gradient check for softmax regression ===="
    # NOTE(review): the head of the gradcheck_naive(...) call was lost
    # in extraction; only its trailing arguments survive below.
    dummy_labels, weights, 1.0, nopredictions = True), dummy_weights)

    print "\n=== Results ==="
    print softmaxRegression(dummy_features, dummy_labels, dummy_weights, 1.0)

if __name__ == "__main__":
    sanity_check()

(part b)（2分）

(part c)（4分）
在q4_sentiment.py中完成选择超参数的代码，从而获取"最佳"的惩罚因子（正则化系数）。你是如何选择的？报告你的训练、开发(dev)和测试精度，并用最多一句话说明你选择超参数的方法。注：在开发集上应该达到至少30%的准确率。

import numpy as np
import matplotlib.pyplot as plt

from cs224d.data_utils import *

from q4_softmaxreg import softmaxRegression, getSentenceFeature, accuracy, softmax_wrapper

# Try several regularisation constants and keep the best one
REGULARIZATION = [0.0, 0.00001, 0.00003, 0.0001, 0.0003, 0.001, 0.003, 0.01]

# Load the dataset
dataset = StanfordSentiment()
tokens = dataset.tokens()
nWords = len(tokens)

# Load the pre-trained word vectors.
# NOTE(review): the line loading `wordVectors0` from disk was lost in
# extraction; input and output vectors are stacked, so the two halves
# are summed into one matrix.
wordVectors = (wordVectors0[:nWords,:] + wordVectors0[nWords:,:])
dimVectors = wordVectors.shape[1]

# Load the training set
trainset = dataset.getTrainSentences()
nTrain = len(trainset)
trainFeatures = np.zeros((nTrain, dimVectors))
trainLabels = np.zeros((nTrain,), dtype=np.int32)
for i in xrange(nTrain):
    words, trainLabels[i] = trainset[i]
    trainFeatures[i, :] = getSentenceFeature(tokens, wordVectors, words)

# Prepare the dev-set features
devset = dataset.getDevSentences()
nDev = len(devset)
devFeatures = np.zeros((nDev, dimVectors))
devLabels = np.zeros((nDev,), dtype=np.int32)
for i in xrange(nDev):
    words, devLabels[i] = devset[i]
    devFeatures[i, :] = getSentenceFeature(tokens, wordVectors, words)

# Try each regularisation constant
results = []
for regularization in REGULARIZATION:
    # Same seeds for every run so only the regulariser differs.
    random.seed(3141)
    np.random.seed(59265)
    weights = np.random.randn(dimVectors, 5)
    print "Training for reg=%f" % regularization

    # batch optimization
    # NOTE(review): `sgd` is expected to come from q3_sgd.py (the
    # import is not shown in this excerpt).
    weights = sgd(lambda weights: softmax_wrapper(trainFeatures, trainLabels,
        weights, regularization), weights, 3.0, 10000, PRINT_EVERY=100)

    # Accuracy on the training set
    _, _, pred = softmaxRegression(trainFeatures, trainLabels, weights)
    trainAccuracy = accuracy(trainLabels, pred)
    print "Train accuracy (%%): %f" % trainAccuracy

    # Accuracy on the dev set
    _, _, pred = softmaxRegression(devFeatures, devLabels, weights)
    devAccuracy = accuracy(devLabels, pred)
    print "Dev accuracy (%%): %f" % devAccuracy

    # Save the resulting weights and accuracies
    results.append({
        "reg" : regularization,
        "weights" : weights,
        "train" : trainAccuracy,
        "dev" : devAccuracy})

# Print the accuracies
print ""
print "=== Recap ==="
print "Reg\t\tTrain\t\tDev"
for result in results:
    print "%E\t%f\t%f" % (
        result["reg"],
        result["train"],
        result["dev"])
print ""

# Pick the regularisation constant with the best dev accuracy
BEST_REGULARIZATION = None
BEST_WEIGHTS = None

best_dev = 0
for result in results:
    if result["dev"] > best_dev:
        best_dev = result["dev"]
        BEST_REGULARIZATION = result["reg"]
        BEST_WEIGHTS = result["weights"]

# Test your findings on the test set
testset = dataset.getTestSentences()
nTest = len(testset)
testFeatures = np.zeros((nTest, dimVectors))
testLabels = np.zeros((nTest,), dtype=np.int32)
for i in xrange(nTest):
    words, testLabels[i] = testset[i]
    testFeatures[i, :] = getSentenceFeature(tokens, wordVectors, words)

_, _, pred = softmaxRegression(testFeatures, testLabels, BEST_WEIGHTS)
print "Best regularization value: %E" % BEST_REGULARIZATION
print "Test accuracy (%%): %f" % accuracy(testLabels, pred)

# Plot regularisation vs. accuracy (log-scale x-axis)
plt.plot(REGULARIZATION, [x["train"] for x in results])
plt.plot(REGULARIZATION, [x["dev"] for x in results])
plt.xscale('log')
plt.xlabel("regularization")
plt.ylabel("accuracy")
plt.legend(['train', 'dev'], loc='upper left')
plt.savefig("q4_reg_v_acc.png")
plt.show()

(d)（4分）绘出在训练和开发过程中的分类准确率，并在x轴使用对数刻度来对正则化值进行相关设置。这应该自动化的进行。包括在你作业中详细展示的坐标图q4_reg_acc.png简明解释最多三个句子在此坐标图中的显示情况。

©️2019 CSDN 皮肤主题: 猿与汪的秘密 设计师: 上身试试