[TensorFlow Practice] 1 - Classifying Movie Reviews

Classifying English-language movie reviews.

This exercise is mainly about classifying simple text: it practices the word2vector (text-to-vector) step and builds a simple feed-forward architecture.

Problem Description

  1. Choose a way to convert text strings into vectors. There are fixed-length and variable-length approaches. If the input text has a fixed length, each word can be mapped directly to its index in the vocabulary, which may work even better (not tried here). For variable-length text, build a vocabulary and, for every word that appears, set the corresponding index to 1, otherwise 0. The downside is that when the vocabulary is large, the input vector becomes very long.
  2. Build the vocabulary from all words that appear in the positive and negative samples — roughly 18,000 words in total (preprocessing uses nltk's word_tokenize to split sentences into words and WordNetLemmatizer for lemmatization). Then keep only the mid-frequency words, following Zipf's law. Reference: http://blog.topspeedsnail.com/archives/9546
  3. Generate the dataset.
  4. Use sklearn to split the dataset into training and test sets (8:2) for later evaluation.
  5. Build the feed-forward network architecture and train it in mini-batches. (The batching code is not very well encapsulated, which bothers me; suggestions are welcome — see the generator sketch after this list.)
  6. After training, evaluate the accuracy.
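
For step 5, one way to make the batching more self-contained (a minimal sketch, not the code actually used below; iterate_batches is an illustrative name) is a small generator:

# Illustrative helper: yields successive (batch_x, batch_y) slices so the
# training loop does not have to track start/end indices itself.
def iterate_batches(features, labels, batch_size):
    for start in range(0, len(features), batch_size):
        yield features[start:start + batch_size], labels[start:start + batch_size]

# The training loop could then read:
# for batch_x, batch_y in iterate_batches(X_train, y_train, BATCH_SIZE):
#     _, loss = sess.run([optimizer, cost_func], feed_dict={X: list(batch_x), y: list(batch_y)})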

Without further ado, here is the code.

# -*- coding: UTF-8 -*-
import tensorflow as tf
import nltk
import numpy as np
from nltk.tokenize import word_tokenize
from collections import Counter
from nltk.stem import WordNetLemmatizer
from sklearn.model_selection import train_test_split

neg_filename = "../data/neg.txt"
pos_filename = "../data/pos.txt"


def preprocess_sentence(filename):
    wordList = []
    with open(filename, 'r') as f:
        lines = f.readlines()
        for line in lines:
            line_text = word_tokenize(line.lower().decode('utf-8'))
            wordList.extend(line_text)
    return wordList

# Build the vocabulary from the positive and negative corpora
def create_vacabularyDict(pos_file,neg_file):
    wordList = []
    pos_list = preprocess_sentence(pos_file)
    neg_list = preprocess_sentence(neg_file)
    wordList.extend(pos_list)
    wordList.extend(neg_list)

    # Lemmatization (cats -> cat)
    lemmatizer = WordNetLemmatizer()
    wordList = [lemmatizer.lemmatize(word) for word in wordList]

    # wordSet = list(set(wordList))

    wordSet = []

    # Count word frequencies and keep only mid-frequency words (per Zipf's law)
    word_count = Counter(wordList)
    for word in word_count:
        if word_count[word] < 2000 and word_count[word] > 20:
            wordSet.append(word)

    char2int = dict((c, i) for i, c in enumerate(wordSet))
    int2char = dict((i, c) for i, c in enumerate(wordSet))

    return wordSet,char2int,int2char

# Convert each line of the file into a binary bag-of-words vector over the vocabulary;
# clf is the one-hot label ([1,0] = positive, [0,1] = negative)
def word2vector(filename,wordList,char2int,int2char,clf):
    dataset = []
    lemmatizer = WordNetLemmatizer()
    with open(filename,'r') as f:
        for line in f.readlines():
            line_text = word_tokenize(line.lower().decode('utf-8'))
            line_text = [lemmatizer.lemmatize(word) for word in line_text]
            line_feature = np.zeros(len(wordList))
            for word_text in line_text:
                if word_text in wordList:
                    line_feature[char2int[word_text]] = 1
            dataset.append([line_feature,clf])

    return dataset

def vectorFile(pos_file,neg_file,wordList,char2int,int2char):
    dataset = []
    pos_dataset = word2vector(pos_file,wordList,char2int,int2char,[1,0])
    neg_dataset = word2vector(neg_file,wordList,char2int,int2char,[0,1])
    dataset.extend(pos_dataset)
    dataset.extend(neg_dataset)

    return dataset

# Define the fully connected (feed-forward) network
def inference(input_tensor):
    with tf.variable_scope('layer1-fc'):
        fc1_W = tf.get_variable("weight",[N_INPUT,NUM_LAYER1]
                                 ,initializer=tf.truncated_normal_initializer(stddev=0.1))
        fc1_b = tf.get_variable("bias",[NUM_LAYER1],
                                initializer=tf.random_normal_initializer(stddev=1.0))

        fc1 = tf.add(tf.matmul(input_tensor,fc1_W),fc1_b)
        relu1 = tf.nn.relu(fc1)
        # if train:
        #     fc1 = tf.nn.dropout(fc1,0.5)
    with tf.variable_scope('layer2-fc'):
        fc2_W = tf.get_variable("weight", [NUM_LAYER1, NUM_LAYER2]
                                , initializer=tf.truncated_normal_initializer(stddev=0.1))
        fc2_b = tf.get_variable("bias", [NUM_LAYER2],
                                initializer=tf.random_normal_initializer(stddev=1.0))

        fc2 = tf.add(tf.matmul(relu1,fc2_W),fc2_b)
        relu2 = tf.nn.relu(fc2)

    with tf.variable_scope('output-fc'):
        output_W = tf.get_variable("weight", [NUM_LAYER2, N_OUTPUT]
                                , initializer=tf.truncated_normal_initializer(stddev=0.1))
        output_b = tf.get_variable("bias", [N_OUTPUT],
                                initializer=tf.random_normal_initializer(stddev=1.0))

        output = tf.add(tf.matmul(relu2,output_W),output_b)

    return output

# Build the graph, train in mini-batches, and evaluate accuracy on the test set
def train():
    X = tf.placeholder(tf.float32,[None,N_INPUT],name='x-input')
    y = tf.placeholder(tf.float32,[None,N_OUTPUT],name='y-output')

    predict = inference(X)
    cost_func = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(labels=y,logits=predict))
    optimizer = tf.train.AdamOptimizer().minimize(cost_func)

    # evaluation metrics
    correct = tf.equal(tf.argmax(predict, 1), tf.argmax(y, 1))
    accuracy = tf.reduce_mean(tf.cast(correct, 'float'))

    with tf.Session() as sess:
        tf.global_variables_initializer().run()

        for i in range(TRAIN_EPOCHS):
            batch_start = 0
            while batch_start < len(X_train):
                start = batch_start
                end = batch_start + BATCH_SIZE

                batch_x = X_train[start:end]
                batch_y = y_train[start:end]

                _, loss = sess.run([optimizer, cost_func], feed_dict={X: list(batch_x), y: list(batch_y)})
                batch_start += BATCH_SIZE

            # if i % 10 == 0:
            print "After %d training step,loss is %g" %(i,loss)


        print 'accuracy:', accuracy.eval({X: list(X_test), y: list(y_test)})


wordList,char2int,int2char = create_vacabularyDict(pos_filename,neg_filename)
dataset = vectorFile(pos_filename,neg_filename,wordList,char2int,int2char)

dataset = np.array(dataset)
X,y = dataset[:,0],dataset[:,1]

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=20)

# Batching via queues and threads (planned; see the notes after the code)
# min_after_dequeue = 1000
# batch_size = 100
# capacity = min_after_dequeue + 3 * batch_size
# word_feature,wordLabel = tf.train.shuffle_batch(
#     [X_train,y_train],batch_size=batch_size,
#     capacity=capacity,min_after_dequeue=min_after_dequeue
# )
BATCH_SIZE = 100
N_INPUT = len(wordList)
N_OUTPUT = 2
NUM_LAYER1 = 1000
NUM_LAYER2 = 1000
TRAIN_EPOCHS = 5

train()

I originally wanted to handle the batching with multithreading and queues, but that requires building a data input queue, which I am not yet familiar with. Once I have worked out how to build a text queue with TFRecord, I will revise this part; a rough sketch of that approach follows.
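
A rough sketch of what that might look like with the TF 1.x queue APIs (untested here; the file path, feature keys, and function names are made up):

import tensorflow as tf

# Sketch only: serialize each (feature, label) pair to a TFRecord file, then
# read it back through a filename queue and shuffle_batch.
def write_tfrecords(dataset, path):
    writer = tf.python_io.TFRecordWriter(path)
    for feature, label in dataset:
        example = tf.train.Example(features=tf.train.Features(feature={
            'feature': tf.train.Feature(float_list=tf.train.FloatList(value=feature)),
            'label': tf.train.Feature(float_list=tf.train.FloatList(value=label)),
        }))
        writer.write(example.SerializeToString())
    writer.close()

def read_batch(path, batch_size, n_input, n_output):
    filename_queue = tf.train.string_input_producer([path])
    reader = tf.TFRecordReader()
    _, serialized = reader.read(filename_queue)
    parsed = tf.parse_single_example(serialized, features={
        'feature': tf.FixedLenFeature([n_input], tf.float32),
        'label': tf.FixedLenFeature([n_output], tf.float32),
    })
    min_after_dequeue = 1000
    capacity = min_after_dequeue + 3 * batch_size
    return tf.train.shuffle_batch([parsed['feature'], parsed['label']],
                                  batch_size=batch_size, capacity=capacity,
                                  min_after_dequeue=min_after_dequeue)

# In the session, start the queue runners before running the batch tensors
# returned by read_batch:
#     coord = tf.train.Coordinator()
#     threads = tf.train.start_queue_runners(sess=sess, coord=coord)
#     ...
#     coord.request_stop()
#     coord.join(threads)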

My accuracy is only about 71%, which is not great. However, the example I referenced apparently reached only about 60%, so this is at least a slight improvement.

Issues to address

  • Is the dataset too small?
  • Should dropout be added to the fully connected layers to prevent overfitting to the training set? (A sketch follows this list.)
  • Plot the training loss over iterations together with the corresponding test-set loss, to judge whether the model overfits or underfits.
  • Try converting the dataset to TFRecord format and feeding it with queues and multiple threads.
  • Plot learning curves as more data is added, to improve the evaluation process.
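
For the dropout item, a minimal sketch along the lines of the commented-out code in inference (keep_prob, the layer sizes, and the helper name are assumptions, not part of the code above):

import tensorflow as tf

# Sketch: a keep_prob placeholder controls dropout, so it can be set to 0.5
# during training and 1.0 when evaluating on the test set.
keep_prob = tf.placeholder(tf.float32, name='keep-prob')

def dense_relu_dropout(inputs, in_dim, out_dim, keep_prob, scope):
    # One fully connected layer followed by ReLU and dropout.
    with tf.variable_scope(scope):
        w = tf.get_variable("weight", [in_dim, out_dim],
                            initializer=tf.truncated_normal_initializer(stddev=0.1))
        b = tf.get_variable("bias", [out_dim],
                            initializer=tf.random_normal_initializer(stddev=1.0))
        h = tf.nn.relu(tf.add(tf.matmul(inputs, w), b))
        return tf.nn.dropout(h, keep_prob)

# In train(), feed keep_prob=0.5 for the training batches and keep_prob=1.0
# when evaluating accuracy.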

References

  • http://blog.topspeedsnail.com/archives/9546
