@hanxiaoyang 2016-06-26T17:13:18.000000Z 字数 24253 阅读 2546

# 深度学习与自然语言处理(4)_斯坦福cs224d 大作业测验1与解答

http://blog.csdn.net/han_xiaoyang/article/details/51760923
http://blog.csdn.net/longxinchen_ml/article/details/51765418

## 1 Softmax (10 分)

(part a) (5分)

(part b) (5 分)

import numpy as np


def softmax(x):
    """Compute the row-wise softmax of a 2-D score matrix.

    Args:
        x: numpy array of shape (n, d); each row is a vector of scores.

    Returns:
        Array of the same shape whose rows are probability distributions
        (non-negative, summing to 1).
    """
    assert len(x.shape) > 1, "Softmax的得分向量要求维度高于1"
    # Subtract the per-row maximum before exponentiating for numerical
    # stability.  Work on a copy: the original used `x -= ...`, which
    # silently mutated the caller's array in place.
    shifted = x - np.max(x, axis=1, keepdims=True)
    exp_scores = np.exp(shifted)
    return exp_scores / np.sum(exp_scores, axis=1, keepdims=True)

## 2 神经网络基础（30分）

(part a) (3 分)

(part b) (3 分)

(part c) (6 分)

(part d) (2 分)

(part e) (4 分) 在q2_sigmoid.py中补充写出sigmoid激活函数的和求它的梯度的对应代码。并使用python q2_sigmoid.py进行测试，同样的，测试用例有可能不太详尽，因此尽量检查下自己的代码。

def sigmoid_grad(f):
    """Gradient of the sigmoid, expressed in terms of f = sigmoid(x).

    Since d/dx sigmoid(x) = sigmoid(x) * (1 - sigmoid(x)), the gradient
    only needs the already-computed activation, not the raw input.
    """
    return f * (1 - f)

(part f) (4 分)

def gradcheck_naive(f, x):
    """Numerically gradient-check the function f at the point x.

    Args:
        f: callable taking x and returning (loss, gradient).
        x: numpy array; elements are perturbed in place during the check
           but restored before each comparison and before returning.

    Prints a diagnostic and returns on the first mismatch; prints a
    success message otherwise.  Returns None either way.
    """
    rndstate = random.getstate()
    random.setstate(rndstate)
    fx, grad = f(x)  # analytic gradient at x
    h = 1e-4

    # Visit every element of x regardless of its rank.
    it = np.nditer(x, flags=['multi_index'], op_flags=['readwrite'])
    while not it.finished:
        ix = it.multi_index
        old_val = x[ix]

        # Central difference (f(x+h) - f(x-h)) / (2h).  The RNG state is
        # reset before each evaluation so a stochastic f (e.g. negative
        # sampling) sees identical randomness on both sides.
        x[ix] = old_val - h
        random.setstate(rndstate)
        (fxh1, _) = f(x)
        x[ix] = old_val + h
        random.setstate(rndstate)
        (fxh2, _) = f(x)
        numgrad = (fxh2 - fxh1) / (2 * h)
        x[ix] = old_val  # restore the perturbed element

        # Relative error, guarded against tiny denominators.
        reldiff = abs(numgrad - grad[ix]) / max(1, abs(numgrad), abs(grad[ix]))
        if reldiff > 1e-5:
            # Single-argument print() calls run identically under
            # Python 2 and 3 (the original used Py2-only statements).
            print("Gradient check failed.")
            print("First gradient error found at index %s" % str(ix))
            print("Your gradient: %f \t Numerical gradient: %f" % (grad[ix], numgrad))
            return
        it.iternext()  # Step to next dimension
    print("Gradient check passed!")

(part g) (8 分)

def forward_backward_prop(data, labels, params, verbose = False):
    """
    Forward pass and backpropagation for a two-layer network
    (sigmoid hidden layer, softmax output layer).

    NOTE(review): relies on a module-level `dimensions` sequence
    ([input, hidden, output] sizes) defined elsewhere in the assignment
    file, and on `sigmoid`, `softmax`, `sigmoid_grad` being in scope —
    confirm before reusing this function standalone.
    """
    # N (batch size) is only bound when data is 2-D; a 1-D input would
    # leave N undefined and fail at the cost computation below.
    if len(data.shape) >= 2:
        (N, _) = data.shape

    ### Unpack each layer's parameters from the flat vector `params`.
    t = 0
    W1 = np.reshape(params[t:t+dimensions[0]*dimensions[1]], (dimensions[0], dimensions[1]))
    t += dimensions[0]*dimensions[1]
    b1 = np.reshape(params[t:t+dimensions[1]], (1, dimensions[1]))
    t += dimensions[1]
    W2 = np.reshape(params[t:t+dimensions[1]*dimensions[2]], (dimensions[1], dimensions[2]))
    t += dimensions[1]*dimensions[2]
    b2 = np.reshape(params[t:t+dimensions[2]], (1, dimensions[2]))

    ### Forward pass
    # Hidden layer: affine transform followed by sigmoid.
    a1 = sigmoid(data.dot(W1) + b1)
    # Output layer: affine transform followed by softmax.
    a2 = softmax(a1.dot(W2) + b2)
    # Mean cross-entropy; `labels == 1` picks the log-probability of the
    # correct class per row, so labels is assumed one-hot encoded.
    cost = - np.sum(np.log(a2[labels == 1]))/N

    ### Backpropagation
    # Gradient of cross-entropy w.r.t. the softmax pre-activation.
    grad_a2 = ( a2 - labels ) / N
    # Backpropagate through the second (output) layer.
    gradW2 = np.dot( a1.T, grad_a2 )
    gradb2 = np.sum( grad_a2, axis=0, keepdims=True )
    # Backpropagate through the first (hidden) layer via sigmoid'.
    grad_a1 = np.dot( grad_a2, W2.T ) * sigmoid_grad(a1)
    gradW1 = np.dot( data.T, grad_a1 )
    gradb1 = np.sum( grad_a1, axis=0, keepdims=True )

    if verbose: # Verbose mode for logging information
        print "W1 shape: {}".format( str(W1.shape) )
        print "W1 gradient shape: {}".format( str(gradW1.shape) )
        print "b1 shape: {}".format( str(b1.shape) )
        print "b1 gradient shape: {}".format( str(gradb1.shape) )

    ### Flatten and concatenate all gradients (same layout as `params`).
    grad = np.concatenate((gradW1.flatten(), gradb1.flatten(), gradW2.flatten(), gradb2.flatten()))

    return cost, grad

## 3 word2vec(40分+5附加分)

(part a) (3分)

(part b) (3分)

(part c) (6分)

(part d) (8分)

CBOW略有不同，不同于使用$\upsilon_c$作为预测向量，我们以$\hat{\upsilon}$为底，在CBOW中（一个小小的变体），我们计算上下文输入词向量的和:

$\frac{\partial F(w_i,\hat{\upsilon})}{\partial U}$ 和 $\frac{\partial F(w_i,\hat{\upsilon})}{\partial \hat{\upsilon}}$

(part e) (12分)

import numpy as np
import random

from q1_softmax import softmax
from q2_gradcheck import gradcheck_naive
from q2_sigmoid import sigmoid, sigmoid_grad


def normalizeRows(x):
    """Normalize each row of x to unit L2 norm (in place; x is returned)."""
    N = x.shape[0]
    # The tiny epsilon guards against division by zero for all-zero rows.
    x /= np.sqrt(np.sum(x**2, axis=1)).reshape((N,1)) + 1e-30
    return x


def test_normalize_rows():
    print("Testing normalizeRows...")
    x = normalizeRows(np.array([[3.0,4.0],[1, 2]]))
    # Expected result: [[0.6, 0.8], [0.4472, 0.8944]]
    print(x)
    assert (np.amax(np.fabs(x - np.array([[0.6,0.8],[0.4472136,0.89442719]]))) <= 1e-6)
    print("")


def softmaxCostAndGradient(predicted, target, outputVectors, dataset):
    """Softmax cost and gradients for one word2vec prediction.

    Args:
        predicted: (D,) predicted (center) word vector.
        target: index of the target word.
        outputVectors: (N, D) "output" vectors, one row per token.
        dataset: unused here; kept so the signature matches the
            negative-sampling variant.

    Returns:
        cost: cross-entropy loss for this prediction.
        gradPred: gradient w.r.t. the predicted vector, shape (D,).
        grad: gradient w.r.t. all output vectors, shape (N, D).

    NOTE(review): `predicted.dot(outputVectors.T)` is 1-D; this assumes
    the imported q1 softmax accepts 1-D input — confirm, since the q1
    version shown earlier asserts ndim > 1.
    """
    probabilities = softmax(predicted.dot(outputVectors.T))
    cost = -np.log(probabilities[target])
    # delta = p - one_hot(target): the usual softmax/cross-entropy gradient.
    delta = probabilities
    delta[target] -= 1
    N = delta.shape[0]
    D = predicted.shape[0]
    # Outer product delta . predicted^T gives the (N, D) output gradient.
    grad = delta.reshape((N,1)) * predicted.reshape((1,D))
    gradPred = (delta.reshape((1,N)).dot(outputVectors)).flatten()
    return cost, gradPred, grad


def negSamplingCostAndGradient(predicted, target, outputVectors, dataset,
    K=10):
    """Negative-sampling cost and gradients for one prediction.

    Draws K negative samples (redrawing any sample that equals the
    target) and scores the target positively and the negatives
    negatively through a sigmoid.

    BUG FIX: the original appended a second, loop-based implementation
    after this vectorized one.  That second pass overwrote `cost` and
    accumulated into `gradPred`/`grad` a second time, so the returned
    gradients were double-counted and inconsistent with the returned
    cost.  Only the vectorized implementation is kept.
    """
    grad = np.zeros(outputVectors.shape)
    gradPred = np.zeros(predicted.shape)

    # indices[0] is the positive (target) sample; the rest are negatives.
    indices = [target]
    for k in range(K):
        newidx = dataset.sampleTokenIdx()
        while newidx == target:
            newidx = dataset.sampleTokenIdx()
        indices += [newidx]

    # Label +1 for the target, -1 for each negative sample.
    labels = np.array([1] + [-1 for k in range(K)])
    vecs = outputVectors[indices,:]

    t = sigmoid(vecs.dot(predicted) * labels)
    cost = -np.sum(np.log(t))

    delta = labels * (t - 1)
    gradPred = delta.reshape((1,K+1)).dot(vecs).flatten()
    gradtemp = delta.reshape((K+1,1)).dot(predicted.reshape(
        (1,predicted.shape[0])))
    # Scatter-add: a negative index can repeat across the K samples.
    for k in range(K+1):
        grad[indices[k]] += gradtemp[k,:]
    return cost, gradPred, grad


def skipgram(currentWord, C, contextWords, tokens, inputVectors, outputVectors,
    dataset, word2vecCostAndGradient = softmaxCostAndGradient):
    """Skip-gram model: predict each context word from the center word.

    Args:
        currentWord: string, the current center word.
        C: context size (window size).
        contextWords: up to 2*C context word strings.
        tokens: dict mapping words to their row index in the vectors.
        inputVectors: "input" word vectors (as rows) for all tokens.
        outputVectors: "output" word vectors (as rows) for all tokens.
        dataset: sampling interface passed through to the cost function.
        word2vecCostAndGradient: per-prediction cost/gradient function
            (softmax or negative sampling).

    Returns:
        cost, gradIn (input-vector gradients), gradOut (output-vector
        gradients).
    """
    currentI = tokens[currentWord]
    predicted = inputVectors[currentI, :]

    cost = 0.0
    gradIn = np.zeros(inputVectors.shape)
    gradOut = np.zeros(outputVectors.shape)
    # One prediction per context word; all input-side gradient flows to
    # the single center word's row.
    for cwd in contextWords:
        idx = tokens[cwd]
        cc, gp, gg = word2vecCostAndGradient(predicted, idx, outputVectors, dataset)
        cost += cc
        gradOut += gg
        gradIn[currentI, :] += gp
    return cost, gradIn, gradOut


def word2vec_sgd_wrapper(word2vecModel, tokens, wordVectors, dataset, C, word2vecCostAndGradient = softmaxCostAndGradient):
    """Average the model's cost/gradient over a random minibatch of contexts."""
    batchsize = 50
    cost = 0.0
    grad = np.zeros(wordVectors.shape)
    N = wordVectors.shape[0]
    # First half of the rows are input vectors, second half output
    # vectors.  `//` keeps the index an int under Python 3 as well
    # (plain `/` would yield a float and break the slicing).
    inputVectors = wordVectors[:N//2,:]
    outputVectors = wordVectors[N//2:,:]
    for i in range(batchsize):
        C1 = random.randint(1,C)
        centerword, context = dataset.getRandomContext(C1)
        # denom is 1 for both models here; the branch is kept for parity
        # with assignment variants that average CBOW context gradients.
        if word2vecModel == skipgram:
            denom = 1
        else:
            denom = 1
        c, gin, gout = word2vecModel(centerword, C1, context, tokens, inputVectors, outputVectors, dataset, word2vecCostAndGradient)
        cost += c / batchsize / denom
        grad[:N//2, :] += gin / batchsize / denom
        grad[N//2:, :] += gout / batchsize / denom
    return cost, grad


def test_word2vec():
    """Gradient-check skip-gram and CBOW against a dummy 5-token dataset."""
    # Minimal stand-in object providing the dataset interface used above.
    dataset = type('dummy', (), {})()
    def dummySampleTokenIdx():
        return random.randint(0, 4)
    def getRandomContext(C):
        tokens = ["a", "b", "c", "d", "e"]
        return tokens[random.randint(0,4)], [tokens[random.randint(0,4)] \
           for i in range(2*C)]
    dataset.sampleTokenIdx = dummySampleTokenIdx
    dataset.getRandomContext = getRandomContext

    random.seed(31415)
    np.random.seed(9265)
    dummy_vectors = normalizeRows(np.random.randn(10,3))
    dummy_tokens = dict([("a",0), ("b",1), ("c",2),("d",3),("e",4)])
    print("==== Gradient check for skip-gram ====")
    gradcheck_naive(lambda vec: word2vec_sgd_wrapper(skipgram, dummy_tokens, vec, dataset, 5), dummy_vectors)
    gradcheck_naive(lambda vec: word2vec_sgd_wrapper(skipgram, dummy_tokens, vec, dataset, 5, negSamplingCostAndGradient), dummy_vectors)
    print("\n==== Gradient check for CBOW      ====")
    gradcheck_naive(lambda vec: word2vec_sgd_wrapper(cbow, dummy_tokens, vec, dataset, 5), dummy_vectors)
    gradcheck_naive(lambda vec: word2vec_sgd_wrapper(cbow, dummy_tokens, vec, dataset, 5, negSamplingCostAndGradient), dummy_vectors)

    print("\n=== Results ===")
    print(skipgram("c", 3, ["a", "b", "e", "d", "b", "c"], dummy_tokens, dummy_vectors[:5,:], dummy_vectors[5:,:], dataset))
    print(skipgram("c", 1, ["a", "b"], dummy_tokens, dummy_vectors[:5,:], dummy_vectors[5:,:], dataset, negSamplingCostAndGradient))
    print(cbow("a", 2, ["a", "b", "c", "a"], dummy_tokens, dummy_vectors[:5,:], dummy_vectors[5:,:], dataset))
    print(cbow("a", 2, ["a", "b", "a", "c"], dummy_tokens, dummy_vectors[:5,:], dummy_vectors[5:,:], dataset, negSamplingCostAndGradient))


if __name__ == "__main__":
    test_normalize_rows()
    test_word2vec()

(part f) (4分) 在代码q3_sgd.py中完成对随机梯度下降优化函数的实现。并且在该代码中运行测试你的实现。

# Stochastic gradient descent.  Every SAVE_PARAMS_EVERY iterations the
# current parameters are checkpointed to disk so a run can resume.
SAVE_PARAMS_EVERY = 1000

import glob
import os.path as op
import random  # BUG FIX: used by save_params/sgd but never imported here
import sys

try:
    import cPickle as pickle  # Python 2
except ImportError:
    import pickle  # Python 3


def load_saved_params():
    """Load the most recent checkpoint so training need not restart.

    Returns:
        (start_iter, params, random_state); (0, None, None) when no
        checkpoint file exists.
    """
    st = 0
    # Find the highest iteration number among saved_params_<n>.npy files.
    for f in glob.glob("saved_params_*.npy"):
        niter = int(op.splitext(op.basename(f))[0].split("_")[2])
        if niter > st:
            st = niter
    if st > 0:
        # BUG FIX: pickle streams are binary; the original opened the
        # file in text mode ("r"), which corrupts the data on some
        # platforms and fails outright under Python 3.
        with open("saved_params_%d.npy" % st, "rb") as f:
            params = pickle.load(f)
            state = pickle.load(f)
        return st, params, state
    else:
        return st, None, None


def save_params(iter, params):
    """Checkpoint params plus the RNG state (so a resumed run reproduces
    the same sampling sequence) under saved_params_<iter>.npy."""
    # "wb" for the same binary-mode reason as in load_saved_params.
    with open("saved_params_%d.npy" % iter, "wb") as f:
        pickle.dump(params, f)
        pickle.dump(random.getstate(), f)


def sgd(f, x0, step, iterations, postprocessing = None, useSaved = False, PRINT_EVERY=10, ANNEAL_EVERY = 20000):
    """Stochastic gradient descent.

    Args:
        f: function to optimise; maps x to (cost, gradient).
        x0: initial parameter value.
        step: learning rate; halved every ANNEAL_EVERY iterations.
        iterations: total number of iterations to run.
        postprocessing: optional hook applied to x after each update
            (e.g. re-normalising word vectors).
        useSaved: resume from the latest on-disk checkpoint if present.
        PRINT_EVERY: how often (in iterations) to print progress.
        ANNEAL_EVERY: learning-rate annealing period.

    Returns:
        The final parameter value x.
    """
    if useSaved:
        start_iter, oldx, state = load_saved_params()
        if start_iter > 0:
            x0 = oldx
            # Re-apply the annealing the interrupted run had performed.
            # `//` preserves the original Python 2 integer-division
            # semantics under Python 3 as well.
            step *= 0.5 ** (start_iter // ANNEAL_EVERY)
        if state:
            random.setstate(state)
    else:
        start_iter = 0

    x = x0
    if not postprocessing:
        postprocessing = lambda x: x

    for niter in range(start_iter + 1, iterations + 1):
        cost, grad = f(x)
        x = x - step * grad
        x = postprocessing(x)

        if niter % PRINT_EVERY == 0:
            print("Iter#{}, cost={}".format(niter, cost))
            sys.stdout.flush()
        if niter % SAVE_PARAMS_EVERY == 0 and useSaved:
            save_params(niter, x)
        if niter % ANNEAL_EVERY == 0:
            step *= 0.5
    return x

(part g) (4分)

(part h) 附加题（5分）

def cbow(currentWord, C, contextWords, tokens, inputVectors, outputVectors,
    dataset, word2vecCostAndGradient = softmaxCostAndGradient):
    """CBOW model for word2vec: predict the center word from the SUM of
    its context words' input vectors.

    Args mirror skipgram(); returns (cost, gradIn, gradOut).

    Cleanup: the original initialised cost/gradIn/gradOut and then
    unconditionally reassigned all three (gradIn was zeroed twice); the
    dead first assignments are removed — behavior is unchanged.
    """
    D = inputVectors.shape[1]

    # Prediction vector = sum of all context input vectors (duplicated
    # context words contribute multiple times, as required).
    predicted = np.zeros((D,))
    indices = [tokens[cwd] for cwd in contextWords]
    for idx in indices:
        predicted += inputVectors[idx, :]

    cost, gp, gradOut = word2vecCostAndGradient(predicted, tokens[currentWord], outputVectors, dataset)

    # The same gradient gp flows back to every context word's row.
    gradIn = np.zeros(inputVectors.shape)
    for idx in indices:
        gradIn[idx, :] += gp

    return cost, gradIn, gradOut

## 4 情感分析（20分）

“超级消极”，“比较消极”，“中立”，“积极”，“非常积极”

(part a)（10分）

import numpy as np
import random
from cs224d.data_utils import *
from q1_softmax import softmax
from q2_gradcheck import gradcheck_naive
from q3_sgd import load_saved_params


def getSentenceFeature(tokens, wordVectors, sentence):
    """
    Crude but effective sentence feature for sentiment analysis: the
    average of the sentence's word vectors.
    """
    # Input:
    # - tokens: dict mapping words to their indices in the word vector list
    # - wordVectors: word vectors (each row) for all tokens
    # - sentence: a list of words in the sentence of interest
    # Output:
    # - sentVector: feature vector for the sentence
    sentVector = np.zeros((wordVectors.shape[1],))
    indices = [tokens[word] for word in sentence]
    # Mean over the rows selected by the sentence's word indices.
    sentVector = np.mean(wordVectors[indices, :], axis=0)
    return sentVector


def softmaxRegression(features, labels, weights, regularization = 0.0, nopredictions = False):
    """ Softmax regression with L2 regularization. """
    # Input:
    # - features: feature vectors, each row is a feature vector
    # - labels: labels corresponding to the feature vectors
    # - weights: weights of the regressor
    # - regularization: L2 regularization constant
    # Output:
    # - cost: cost of the regressor
    # - grad: gradient of the cost with respect to the weights
    # - pred: label predictions (omitted when nopredictions is True)
    prob = softmax(features.dot(weights))
    # N = batch size; a single 1-D feature vector is treated as N == 1.
    if len(features.shape) > 1:
        N = features.shape[0]
    else:
        N = 1
    # A vectorized implementation of
    #   1/N * sum(cross_entropy(x_i, y_i)) + regularization/2 * |w|^2
    cost = np.sum(-np.log(prob[range(N), labels])) / N
    cost += 0.5 * regularization * np.sum(weights ** 2)
    # Softmax/cross-entropy gradient: (p - one_hot(label)) per row,
    # then projected back through the features; plus the L2 term.
    grad = np.array(prob)
    grad[range(N), labels] -= 1.0
    grad = features.T.dot(grad) / N
    grad += regularization * weights
    if N > 1:
        pred = np.argmax(prob, axis=1)
    else:
        pred = np.argmax(prob)
    if nopredictions:
        return cost, grad
    else:
        return cost, grad, pred


def accuracy(y, yhat):
    """ Percentage (0-100) of predictions yhat matching labels y. """
    assert(y.shape == yhat.shape)
    return np.sum(y == yhat) * 100.0 / y.size


def softmax_wrapper(features, labels, weights, regularization = 0.0):
    # Adapter for sgd(): drops the predictions, keeping (cost, grad).
    cost, grad, _ = softmaxRegression(features, labels, weights,
        regularization)
    return cost, grad


def sanity_check():
    """
    Gradient-check the regressor on 10 random training sentences.
    Run python q4_softmaxreg.py.
    """
    random.seed(314159)
    np.random.seed(265)

    dataset = StanfordSentiment()
    tokens = dataset.tokens()
    nWords = len(tokens)

    # Sum the input and output halves of the trained word vectors.
    _, wordVectors0, _ = load_saved_params()
    wordVectors = (wordVectors0[:nWords,:] + wordVectors0[nWords:,:])
    dimVectors = wordVectors.shape[1]

    dummy_weights = 0.1 * np.random.randn(dimVectors, 5)
    dummy_features = np.zeros((10, dimVectors))
    dummy_labels = np.zeros((10,), dtype=np.int32)
    for i in xrange(10):
        words, dummy_labels[i] = dataset.getRandomTrainSentence()
        dummy_features[i, :] = getSentenceFeature(tokens, wordVectors, words)
    print "==== Gradient check for softmax regression ===="
    gradcheck_naive(lambda weights: softmaxRegression(dummy_features,
        dummy_labels, weights, 1.0, nopredictions = True), dummy_weights)

    print "\n=== Results ==="
    print softmaxRegression(dummy_features, dummy_labels, dummy_weights, 1.0)


if __name__ == "__main__":
    sanity_check()

(part b)（2分）

(part c)（4分）
在q4_sentiment.py中完成超参数的实现代码从而获取“最佳”的惩罚因子。你是如何选择的？报告你的训练、开发和测试精度，并用最多一句话说明你选择超参数的方法。注释：在开发集上应该获取至少30%的准确率。

import numpy as np
import matplotlib.pyplot as plt
from cs224d.data_utils import *
from q3_sgd import load_saved_params, sgd
from q4_softmaxreg import softmaxRegression, getSentenceFeature, accuracy, softmax_wrapper

# Try several regularization constants and keep the best one.
# NOTE(review): `random` is used below but not imported here —
# presumably it arrives via `from cs224d.data_utils import *`; verify.
REGULARIZATION = [0.0, 0.00001, 0.00003, 0.0001, 0.0003, 0.001, 0.003, 0.01]

# Load the dataset.
dataset = StanfordSentiment()
tokens = dataset.tokens()
nWords = len(tokens)

# Load the pre-trained word vectors (input + output halves summed).
_, wordVectors0, _ = load_saved_params()
wordVectors = (wordVectors0[:nWords,:] + wordVectors0[nWords:,:])
dimVectors = wordVectors.shape[1]

# Load the training set and build its feature matrix.
trainset = dataset.getTrainSentences()
nTrain = len(trainset)
trainFeatures = np.zeros((nTrain, dimVectors))
trainLabels = np.zeros((nTrain,), dtype=np.int32)
for i in xrange(nTrain):
    words, trainLabels[i] = trainset[i]
    trainFeatures[i, :] = getSentenceFeature(tokens, wordVectors, words)

# Same feature preparation for the dev set.
devset = dataset.getDevSentences()
nDev = len(devset)
devFeatures = np.zeros((nDev, dimVectors))
devLabels = np.zeros((nDev,), dtype=np.int32)
for i in xrange(nDev):
    words, devLabels[i] = devset[i]
    devFeatures[i, :] = getSentenceFeature(tokens, wordVectors, words)

# Train one regressor per regularization constant (fixed seeds so runs
# differ only in the regularization value).
results = []
for regularization in REGULARIZATION:
    random.seed(3141)
    np.random.seed(59265)
    weights = np.random.randn(dimVectors, 5)
    print "Training for reg=%f" % regularization
    # batch optimization
    weights = sgd(lambda weights: softmax_wrapper(trainFeatures, trainLabels,
        weights, regularization), weights, 3.0, 10000, PRINT_EVERY=100)
    # Accuracy on the training set.
    _, _, pred = softmaxRegression(trainFeatures, trainLabels, weights)
    trainAccuracy = accuracy(trainLabels, pred)
    print "Train accuracy (%%): %f" % trainAccuracy
    # Accuracy on the dev set.
    _, _, pred = softmaxRegression(devFeatures, devLabels, weights)
    devAccuracy = accuracy(devLabels, pred)
    print "Dev accuracy (%%): %f" % devAccuracy
    # Record the trained weights and both accuracies.
    results.append({
        "reg" : regularization,
        "weights" : weights,
        "train" : trainAccuracy,
        "dev" : devAccuracy})

# Print an accuracy recap table.
print ""
print "=== Recap ==="
print "Reg\t\tTrain\t\tDev"
for result in results:
    print "%E\t%f\t%f" % (
        result["reg"],
        result["train"],
        result["dev"])
print ""

# Pick the regularization constant with the best dev accuracy.
BEST_REGULARIZATION = None
BEST_WEIGHTS = None
best_dev = 0
for result in results:
    if result["dev"] > best_dev:
        best_dev = result["dev"]
        BEST_REGULARIZATION = result["reg"]
        BEST_WEIGHTS = result["weights"]

# Test your findings on the test set
testset = dataset.getTestSentences()
nTest = len(testset)
testFeatures = np.zeros((nTest, dimVectors))
testLabels = np.zeros((nTest,), dtype=np.int32)
for i in xrange(nTest):
    words, testLabels[i] = testset[i]
    testFeatures[i, :] = getSentenceFeature(tokens, wordVectors, words)
_, _, pred = softmaxRegression(testFeatures, testLabels, BEST_WEIGHTS)
print "Best regularization value: %E" % BEST_REGULARIZATION
print "Test accuracy (%%): %f" % accuracy(testLabels, pred)

# Plot accuracy versus regularization on a log-scale x axis.
plt.plot(REGULARIZATION, [x["train"] for x in results])
plt.plot(REGULARIZATION, [x["dev"] for x in results])
plt.xscale('log')
plt.xlabel("regularization")
plt.ylabel("accuracy")
plt.legend(['train', 'dev'], loc='upper left')
plt.savefig("q4_reg_v_acc.png")
plt.show()

(part d)（4分）绘出在训练和开发过程中的分类准确率，并在x轴使用对数刻度来对正则化值进行相关设置。这应该自动化地进行。把代码生成的坐标图q4_reg_v_acc.png包含在你的作业中，并用最多三句话简明解释此坐标图所显示的情况。

• 私有
• 公开
• 删除