[TensorFlow Hands-On Practice] 2 - Sentiment Analysis of Twitter Data

Preface

This exercise continues the previous post and focuses on the workflow for classifying text data. Compared with the first hands-on exercise, I will go over what is different and what was optimized.

  1. The dataset is much bigger: 1.6 million tweets, which my little MacBook Air simply cannot load in one go. There is still plenty of work to do on handling data at this scale (see the chunked-reading sketch after this list).
  2. With this much data we switch to TFRecord for storing and reading it, which makes the queue-based input pipeline later on much easier.
  3. Unlike the previous exercise, we no longer read line by line with readline; the raw data is preprocessed with pandas instead, again because all 1.6 million rows cannot fit into memory at once.
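
A minimal sketch of the chunked-reading idea from points 1 and 3: pandas can iterate over the 1.6M-row CSV in fixed-size chunks via the chunksize parameter instead of loading everything at once. The final scripts below take a different shortcut (they shuffle and keep only 3,000 rows), so treat this purely as one possible direction; the path and column names match the scripts, while the encoding argument is my own assumption.

# Sketch only: iterate over the training CSV in 100k-row chunks.
import pandas as pd

org_train_filepath = "../data/trainingandtestdata/training.1600000.processed.noemoticon.csv"

reader = pd.read_csv(org_train_filepath,
                     names=['polarity', 'id', 'data', 'Query', 'username', 'text'],
                     usecols=['polarity', 'text'],
                     encoding='latin-1',     # assumption: decode here instead of later
                     chunksize=100000)       # yields DataFrames of up to 100k rows

total = 0
for chunk in reader:
    # each chunk is a normal DataFrame holding only the two columns we need
    total += len(chunk)

print(total)  # should equal 1600000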

Experiment workflow

  • Build the vocabulary and save it. Building such a long vocabulary does not work well for variable-length data: it becomes very slow and quietly inflates the data size. I will revise this step once I find a better approach.
  • Create the training dataset: turn each tweet into a word2vector bag-of-words vector over the vocabulary plus a one-hot polarity label (see the toy example after this list), and save everything in TFRecord format.
  • Build a feed-forward neural network, train it in batches, read the TFRecord data through a queue, go over all the training data in each epoch, and evaluate accuracy on the test data every epoch.
  • Compare accuracies and keep the best model.
  • Load the model and predict on new tweet data.
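
To make the vectorization step concrete before the full scripts, here is a toy illustration with a made-up four-word vocabulary (not the real lexicon) of what ends up inside each TFRecord example: a bag-of-words presence vector for the text and a one-hot vector for the polarity, mirroring the word2vector and int_to_vector logic in create_dataset.py.

# Toy sketch of the vectorization; the real vocabulary comes from create_lexcion.py.
import numpy as np

wordList = ['happy', 'sad', 'cool', 'time']                  # toy vocabulary
int_to_vector = {0: [0, 0, 1], 2: [0, 1, 0], 4: [1, 0, 0]}   # polarity -> one-hot

def word2vector(words):
    feature = np.zeros(len(wordList))
    for word in words:
        if word in wordList:
            feature[wordList.index(word)] = 1   # mark presence, counts are ignored
    return list(feature)

print(word2vector(['happy', 'time']))  # [1.0, 0.0, 0.0, 1.0]
print(int_to_vector[4])                # [1, 0, 0] -> positive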

    Experimental data: http://help.sentiment140.com/for-students/

0 – the polarity of the tweet (0 = negative, 2 = neutral, 4 = positive)
1 – the id of the tweet (2087)
2 – the date of the tweet (Sat May 16 23:58:44 UTC 2009)
3 – the query (lyx). If there is no query, then this value is NO_QUERY.
4 – the user that tweeted (robotickilldozr)
5 – the text of the tweet (Lyx is cool)


The code is below.
create_lexcion.py

# -*- coding: UTF-8 -*-
import numpy as np
import tensorflow as tf
import pandas as pd
import nltk
from nltk.tokenize import word_tokenize
from collections import Counter
from nltk.stem import WordNetLemmatizer
import pickle


def create_vacabularyDict(filename):
    # 'filename' is actually an iterable of raw tweet texts (here a pandas Series)
    wordList = []
    for word in filename:
        # each 'word' is one tweet; tokenize it into individual tokens
        word = word_tokenize(word.lower().decode('latin-1'))
        wordList.extend(word)

    # Lemmatization (cats -> cat)
    lemmatizer = WordNetLemmatizer()
    wordList = [lemmatizer.lemmatize(word) for word in wordList]

    # Keep only words that are neither too common nor too rare
    wordSet = []
    word_count = Counter(wordList)
    for word in word_count:
        if word_count[word] < 100000 and word_count[word] > 100:
            wordSet.append(word)

    # char2int = dict((c, i) for i, c in enumerate(wordSet))
    # int2char = dict((i, c) for i, c in enumerate(wordSet))

    return wordSet

org_train_filepath = "../data/trainingandtestdata/training.1600000.processed.noemoticon.csv"
org_test_filepath = "../data/trainingandtestdata/testdata.manual.2009.06.14.csv"

df = pd.read_csv(org_train_filepath,
                 names=['polarity','id','data','Query','username','text'],
                 usecols=['polarity','text'])

# dftemp = df[:10000]
dftext = df['text']

wordList = create_vacabularyDict(dftext)

# Save the vocabulary to disk
with open('../lexcion.pickle', 'wb') as f:
    pickle.dump(wordList, f)
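
A quick sanity check that I find useful here (not part of the original script): load the pickle back and print the vocabulary length, because train.py below hardcodes N_INPUT = 8053 and the FixedLenFeature lengths in the TFRecord parser must match len(wordList).

# Sketch: confirm the saved vocabulary size matches N_INPUT in train.py.
import pickle

with open('../lexcion.pickle', 'rb') as f:
    wordList = pickle.load(f)

print(len(wordList))  # must equal N_INPUT (8053 in train.py); adjust it if different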

create_dataset.py

# -*- coding: UTF-8 -*-
import numpy as np
import tensorflow as tf
import pandas as pd
import nltk
from nltk.tokenize import word_tokenize
from collections import Counter
from nltk.stem import WordNetLemmatizer
import pickle
from sklearn.utils import shuffle

# Preprocessing
def process_for_train(file,wordList):
    dftemp = file.copy()
    # length = len(df)
    # num_process = 1000
    # dftemp = pd.DataFrame(columns=['polarity','text'])
    dataset = []
    lemmatizer = WordNetLemmatizer()
    # polarity -> one-hot label: 0 = negative, 2 = neutral, 4 = positive
    int_to_vector = {
        0:[0,0,1],
        2:[0,1,0],
        4:[1,0,0]
    }

    # Bag-of-words presence vector over the saved vocabulary
    def word2vector(text):
        line_text = word_tokenize(text.lower().decode('latin-1'))
        words = [lemmatizer.lemmatize(word) for word in line_text]
        feature = np.zeros(len(wordList))
        for word in words:
            if word in wordList:
                feature[wordList.index(word)] = 1
        feature = list(feature)
        return feature

    dftemp['polarity'] = dftemp['polarity'].map(int_to_vector)

    dftemp['text'] = dftemp['text'].map(word2vector)

    return dftemp

def _bytes_feature(value):
    return tf.train.Feature(bytes_list=tf.train.BytesList(value=[value]))

def _float_feature(value):
    return tf.train.Feature(float_list=tf.train.FloatList(value=[value]))

def _int64_feature(value):
    return tf.train.Feature(int64_list=tf.train.Int64List(value=value))

org_train_filepath = "../data/trainingandtestdata/training.1600000.processed.noemoticon.csv"
org_test_filepath = "../data/trainingandtestdata/testdata.manual.2009.06.14.csv"

df = pd.read_csv(org_train_filepath,
                 names=['polarity','id','data','Query','username','text'],
                 usecols=['polarity','text'])

df = shuffle(df)
df = df[:3000].reset_index(drop=True)

f = open('../lexcion.pickle', 'rb')
wordList = pickle.load(f)
f.close()

dftemp = process_for_train(df,wordList)
num_dataset = len(dftemp)
# len_dataset = 5000

filename = "../data/output_train.tfrecords"
writer = tf.python_io.TFRecordWriter(filename)
for i in range(num_dataset):
    # word = ",".join(map(str,dftemp.get_value(index,'text')))
    # label = ",".join(map(str,dftemp.get_value(index,'polarity')))
    # Store the bag-of-words vector and the one-hot label as int64 lists
    word = map(int,dftemp.get_value(i,'text'))
    label = map(int,dftemp.get_value(i,'polarity'))

    example = tf.train.Example(features=tf.train.Features(
        feature={
            'word': _int64_feature(word),
            'label': _int64_feature(label)
        }
    ))

    writer.write(example.SerializeToString())

writer.close()
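
To verify the TFRecord file was written correctly, here is a small sketch (assuming TensorFlow 1.x and the feature names 'word' and 'label' used above) that reads the first record back without building any queue:

# Sketch: inspect the first record of output_train.tfrecords.
import tensorflow as tf

filename = "../data/output_train.tfrecords"

for record in tf.python_io.tf_record_iterator(filename):
    example = tf.train.Example()
    example.ParseFromString(record)
    word = example.features.feature['word'].int64_list.value
    label = example.features.feature['label'].int64_list.value
    print(len(word), list(label))  # vector length (vocabulary size) and one-hot label
    break                          # only look at the first record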

train.py

# -*- coding: UTF-8 -*-
import numpy as np
import tensorflow as tf
import pandas as pd
import nltk
from nltk.tokenize import word_tokenize
from collections import Counter
from nltk.stem import WordNetLemmatizer
import pickle
# from create_dataset import process_for_train

def process_for_train(file,wordList):
    dftemp = file.copy()
    dataset = []
    lemmatizer = WordNetLemmatizer()
    int_to_vector = {
        0:[0,0,1],
        2:[0,1,0],
        4:[1,0,0]
    }

    def word2vector(text):
        line_text = word_tokenize(text.lower().decode('latin-1'))
        words = [lemmatizer.lemmatize(word) for word in line_text]
        feature = np.zeros(len(wordList))
        for word in words:
            if word in wordList:
                feature[wordList.index(word)] = 1
        feature = list(feature)
        return feature

    dftemp['polarity'] = dftemp['polarity'].map(int_to_vector)

    dftemp['text'] = dftemp['text'].map(word2vector)

    return dftemp

# Define the fully connected feed-forward network
def inference(input_tensor):
    with tf.variable_scope('layer1-fc'):
        fc1_W = tf.get_variable("weight",[N_INPUT,NUM_LAYER1]
                                 ,initializer=tf.truncated_normal_initializer(stddev=0.1))
        fc1_b = tf.get_variable("bias",[NUM_LAYER1],
                                initializer=tf.random_normal_initializer(stddev=1.0))

        fc1 = tf.add(tf.matmul(input_tensor,fc1_W),fc1_b)
        relu1 = tf.nn.relu(fc1)
        # if train:
        #     fc1 = tf.nn.dropout(fc1,0.5)
    with tf.variable_scope('layer2-fc'):
        fc2_W = tf.get_variable("weight", [NUM_LAYER1, NUM_LAYER2]
                                , initializer=tf.truncated_normal_initializer(stddev=0.1))
        fc2_b = tf.get_variable("bias", [NUM_LAYER2],
                                initializer=tf.random_normal_initializer(stddev=1.0))

        fc2 = tf.add(tf.matmul(relu1,fc2_W),fc2_b)
        relu2 = tf.nn.relu(fc2)

    with tf.variable_scope('output-fc'):
        output_W = tf.get_variable("weight", [NUM_LAYER2, N_OUTPUT]
                                , initializer=tf.truncated_normal_initializer(stddev=0.1))
        output_b = tf.get_variable("bias", [N_OUTPUT],
                                initializer=tf.random_normal_initializer(stddev=1.0))

        output = tf.add(tf.matmul(relu2,output_W),output_b)

    return output


N_INPUT = 8053      # must equal len(wordList), the size of the saved vocabulary
N_OUTPUT = 3        # positive / neutral / negative
NUM_LAYER1 = 1000
NUM_LAYER2 = 1000
TRAIN_EPOCHS = 1000

NUM_TRAIN = 3000    # number of examples written to output_train.tfrecords

org_test_filepath = "../data/trainingandtestdata/testdata.manual.2009.06.14.csv"

df = pd.read_csv(org_test_filepath,
                 names=['polarity','id','data','Query','username','text'],
                 usecols=['polarity','text'])

f = open('../lexcion.pickle', 'rb')
wordList = pickle.load(f)
f.close()

dftemp = process_for_train(df[:300],wordList)

X_test = np.reshape(np.array(dftemp['text'].tolist()),(len(dftemp),N_INPUT))
y_test = np.reshape(np.array(dftemp['polarity'].tolist()),(len(dftemp),N_OUTPUT))

file = "../data/output_train.tfrecords"
test_file = "output_test.tfrecords"
# string_input_producer also accepts a list of multiple input files
filename_queue = tf.train.string_input_producer([file], shuffle=False)

reader = tf.TFRecordReader()
_,serialized_example = reader.read(filename_queue)
features = tf.parse_single_example(
    serialized_example,
    features={
        'word':tf.FixedLenFeature([8053],tf.int64),
        'label':tf.FixedLenFeature([3],tf.int64)
    }
    )

X_train = features['word']
y_train = features['label']

# Batching parameters for tf.train.shuffle_batch
min_after_dequeue = 1000
batch_size = 100
capacity = min_after_dequeue + batch_size * 3

X_batch,y_batch = tf.train.shuffle_batch([X_train,y_train],
                                                 min_after_dequeue=min_after_dequeue,
                                                 batch_size=batch_size,
                                                 capacity=capacity)

def train():
    X = tf.placeholder(tf.float32,[None,N_INPUT],name='x-input')
    y = tf.placeholder(tf.float32,[None,N_OUTPUT],name='y-output')

    predict = inference(X)
    # Labels must come from the fed placeholder y, not the queue tensor y_batch,
    # otherwise the labels would belong to a different batch than the inputs.
    cost_func = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(labels=y,logits=predict))
    optimizer = tf.train.AdamOptimizer().minimize(cost_func)

    #eval statics
    # test_predict = inference(X)
    correct = tf.equal(tf.argmax(predict, 1), tf.argmax(y, 1))
    accuracy = tf.reduce_mean(tf.cast(correct, 'float'))

    with tf.Session() as sess:
        tf.global_variables_initializer().run()
        coord = tf.train.Coordinator()
        threads = tf.train.start_queue_runners(sess=sess,coord=coord)
        saver = tf.train.Saver()

        pre_accuracy = 0  # best test accuracy so far

        for i in range(TRAIN_EPOCHS):
            # for j in range()
            # cur_X_batch,cur_y_batch = sess.run([X_batch,y_batch])
            # feed_dict = {X: cur_X_batch, y: cur_y_batch}
            for j in range(int(NUM_TRAIN/batch_size)):
                cur_X_batch, cur_y_batch = sess.run([X_batch, y_batch])
                _, loss = sess.run([optimizer, cost_func],feed_dict = {X: cur_X_batch, y: cur_y_batch})
            if i % 10 == 0:
                print "After %d training step,loss is %g" %(i,loss)

            # if i % 10 == 0:
            temp_accuracy = sess.run(accuracy,feed_dict={X: X_test, y: y_test})
            if temp_accuracy > pre_accuracy:  # keep the model with the highest test accuracy
                print('accuracy is : ', temp_accuracy)
                pre_accuracy = temp_accuracy
                saver.save(sess, '../model/model.ckpt')  # save the checkpoint
        # print('accuracy: ', accuracy.eval({X: list(X_test), y: list(y_test)}))


        coord.request_stop()
        coord.join(threads)

train()

test.py

# -*- coding: UTF-8 -*-
import tensorflow as tf
import pickle
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
import numpy as np

f = open('../lexcion.pickle', 'rb')
wordList = pickle.load(f)
f.close()

N_INPUT = len(wordList)  # input layer size = vocabulary size
N_OUTPUT = 3
NUM_LAYER1 = 1000
NUM_LAYER2 = 1000


def word2vector(text):
    line_text = word_tokenize(text.lower().decode('latin-1'))
    lemmatizer = WordNetLemmatizer()
    words = [lemmatizer.lemmatize(word) for word in line_text]
    feature = np.zeros(len(wordList))
    for word in words:
        if word in wordList:
            feature[wordList.index(word)] = 1
    feature = list(feature)
    return feature

# Define the fully connected feed-forward network (same structure as in train.py)
def inference(input_tensor):
    with tf.variable_scope('layer1-fc'):
        fc1_W = tf.get_variable("weight",[N_INPUT,NUM_LAYER1]
                                 ,initializer=tf.truncated_normal_initializer(stddev=0.1))
        fc1_b = tf.get_variable("bias",[NUM_LAYER1],
                                initializer=tf.random_normal_initializer(stddev=1.0))

        fc1 = tf.add(tf.matmul(input_tensor,fc1_W),fc1_b)
        relu1 = tf.nn.relu(fc1)
        # if train:
        #     fc1 = tf.nn.dropout(fc1,0.5)
    with tf.variable_scope('layer2-fc'):
        fc2_W = tf.get_variable("weight", [NUM_LAYER1, NUM_LAYER2]
                                , initializer=tf.truncated_normal_initializer(stddev=0.1))
        fc2_b = tf.get_variable("bias", [NUM_LAYER2],
                                initializer=tf.random_normal_initializer(stddev=1.0))

        fc2 = tf.add(tf.matmul(relu1,fc2_W),fc2_b)
        relu2 = tf.nn.relu(fc2)

    with tf.variable_scope('output-fc'):
        output_W = tf.get_variable("weight", [NUM_LAYER2, N_OUTPUT]
                                , initializer=tf.truncated_normal_initializer(stddev=0.1))
        output_b = tf.get_variable("bias", [N_OUTPUT],
                                initializer=tf.random_normal_initializer(stddev=1.0))

        output = tf.add(tf.matmul(relu2,output_W),output_b)

    return output


def prediction(tweet_text):

    tweet_vector = word2vector(tweet_text)

    # Placeholder with no fixed shape; it is fed a single [1, N_INPUT] vector below
    X = tf.placeholder('float')

    predict = inference(X)

    with tf.Session() as sess:

        tf.global_variables_initializer().run()
        saver = tf.train.Saver()
        saver.restore(sess, '../model/model.ckpt')

        res = sess.run(tf.argmax(predict.eval(feed_dict={X: [tweet_vector]}),1))
        return res


print prediction("happy time ")
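
prediction returns the argmax index over the three outputs. Because of the int_to_vector mapping used when the dataset was built (4 -> [1,0,0], 2 -> [0,1,0], 0 -> [0,0,1]), index 0 means positive, 1 neutral, and 2 negative. A small helper (my own addition; note that inference builds variables with tf.get_variable, so prediction should only be called once per process) could replace the final print line:

# Sketch: turn the argmax index into a readable sentiment label.
labels = ['positive', 'neutral', 'negative']

res = prediction("happy time ")
print(labels[int(res[0])])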

Summary
To be honest, the results are not great: accuracy sits at around 80%, and prediction is worst on neutral tweets. I changed quite a few things without much improvement; suggestions are very welcome. The sketch below shows one way to quantify the per-class weakness.
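
One way to quantify the weakness on neutral tweets (my own suggestion, using scikit-learn's confusion_matrix; y_true and y_pred are assumed to be argmax indices obtained from y_test and from the restored model's predictions on X_test) is a per-class breakdown instead of a single accuracy number:

# Sketch: per-class accuracy from a confusion matrix.
# Assumption: y_true = np.argmax(y_test, 1) and
#             y_pred = sess.run(tf.argmax(predict, 1), feed_dict={X: X_test}).
from sklearn.metrics import confusion_matrix

def per_class_report(y_true, y_pred):
    cm = confusion_matrix(y_true, y_pred, labels=[0, 1, 2])
    print(cm)  # rows: true class, columns: predicted class
    for i, name in enumerate(['positive', 'neutral', 'negative']):
        total = cm[i].sum()
        print(name, float(cm[i][i]) / total if total else 0.0)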

Text classification is on hold for now; next I will probably try out TFLearn and TensorBoard visualization.
