Preface
This hands-on post continues the previous one. The topic is still the text-classification workflow; compared with the first exercise, I will focus on what is different and what was optimized:
- The dataset is much bigger: 1.6 million tweets, far more than my little MacBook Air can read in at once. There is still a lot of work to do on the big-data side.
- The larger dataset is what motivates storing and reading the data as TFRecord files, which makes the queue-based input pipeline later on much more convenient.
- Unlike the previous exercise, I no longer read the file line by line with readline; the raw data is handled with pandas instead. Again the reason is size: all 1.6M rows cannot be loaded into memory at once (see the chunked-reading sketch right after this list).
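For the memory problem above, here is a minimal sketch of chunked reading with pandas (my own addition, not part of the scripts below; process is a hypothetical placeholder, and the column layout is the one used throughout this post):

# -*- coding: UTF-8 -*-
import pandas as pd

path = "../data/trainingandtestdata/training.1600000.processed.noemoticon.csv"
# chunksize makes read_csv return an iterator of DataFrames,
# so only one chunk (100,000 rows here) is in memory at a time
reader = pd.read_csv(path,
                     names=['polarity', 'id', 'data', 'Query', 'username', 'text'],
                     usecols=['polarity', 'text'],
                     chunksize=100000)
for chunk in reader:
    process(chunk)  # hypothetical per-chunk work, e.g. updating a word counter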
Experiment workflow
- Build the vocabulary and save it. Building a very long vocabulary works poorly for variable-length data: it becomes slow and silently inflates the data volume. I will revise this once I think of a better approach.
- Create the training set: turn each tweet into a word vector (word2vector) and save it in TFRecord format.
- Build a feed-forward network and train it in batches, reading the TFRecord data through a queue; each epoch iterates over all the data, and accuracy is evaluated on the test data after every epoch.
- Compare accuracies and save the best model.
- Load the model and predict on new tweet data.

For reference, the fields in the raw CSV are:
0 – the polarity of the tweet (0 = negative, 2 = neutral, 4 = positive)
1 – the id of the tweet (2087)
2 – the date of the tweet (Sat May 16 23:58:44 UTC 2009)
3 – the query (lyx). If there is no query, then this value is NO_QUERY.
4 – the user that tweeted (robotickilldozr)
5 – the text of the tweet (Lyx is cool)
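Before the code, a toy illustration (values made up for this example) of the encoding used throughout: each tweet becomes a 0/1 bag-of-words vector over the vocabulary, and the polarity becomes a one-hot label:

# toy vocabulary, just to show the encoding; the real one has ~8000 words
wordList = ['cool', 'happy', 'is', 'lyx', 'sad']
# "Lyx is cool" -> tokens ['lyx', 'is', 'cool'] after lowercasing
feature = [1, 0, 1, 1, 0]  # 1 at each vocabulary word that appears in the tweet
label = [1, 0, 0]          # polarity 4 (positive), see int_to_vector below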
Now for the code.
create_lexcion.py
# -*- coding: UTF-8 -*-
import pandas as pd
from nltk.tokenize import word_tokenize
from collections import Counter
from nltk.stem import WordNetLemmatizer
import pickle

# build the vocabulary: tokenize, lemmatize, then keep only mid-frequency words
def create_vocabulary(texts):
    wordList = []
    for text in texts:
        words = word_tokenize(text.lower().decode('latin-1'))
        wordList.extend(words)
    # lemmatization (cats -> cat)
    lemmatizer = WordNetLemmatizer()
    wordList = [lemmatizer.lemmatize(word) for word in wordList]
    wordSet = []
    word_count = Counter(wordList)
    for word in word_count:
        # drop both very frequent tokens (stop-word-like) and very rare ones
        if word_count[word] < 100000 and word_count[word] > 100:
            wordSet.append(word)
    # char2int = dict((c, i) for i, c in enumerate(wordSet))
    # int2char = dict((i, c) for i, c in enumerate(wordSet))
    return wordSet

org_train_filepath = "../data/trainingandtestdata/training.1600000.processed.noemoticon.csv"
org_test_filepath = "../data/trainingandtestdata/testdata.manual.2009.06.14.csv"

df = pd.read_csv(org_train_filepath,
                 names=['polarity', 'id', 'data', 'Query', 'username', 'text'],
                 usecols=['polarity', 'text'])
dftext = df['text']
wordList = create_vocabulary(dftext)

# save the vocabulary
with open('../lexcion.pickle', 'wb') as f:
    pickle.dump(wordList, f)
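As a quick sanity check (my addition, not part of the original flow), the pickle can be reloaded to confirm the vocabulary size; this number is the input dimension used later in train.py:

import pickle

with open('../lexcion.pickle', 'rb') as f:
    wordList = pickle.load(f)
print len(wordList)  # 8053 on my run; this becomes N_INPUT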
create_dataset.py
# -*- coding: UTF-8 -*-
import numpy as np
import tensorflow as tf
import pandas as pd
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
import pickle
from sklearn.utils import shuffle

# preprocessing: map polarity to a one-hot label and text to a bag-of-words vector
def process_for_train(file, wordList):
    dftemp = file.copy()
    lemmatizer = WordNetLemmatizer()
    int_to_vector = {
        0: [0, 0, 1],
        2: [0, 1, 0],
        4: [1, 0, 0]
    }
    def word2vector(text):
        line_text = word_tokenize(text.lower().decode('latin-1'))
        words = [lemmatizer.lemmatize(word) for word in line_text]
        feature = np.zeros(len(wordList))
        for word in words:
            if word in wordList:
                feature[wordList.index(word)] = 1
        return list(feature)
    dftemp['polarity'] = dftemp['polarity'].map(int_to_vector)
    dftemp['text'] = dftemp['text'].map(word2vector)
    return dftemp

def _bytes_feature(value):
    return tf.train.Feature(bytes_list=tf.train.BytesList(value=[value]))

def _float_feature(value):
    return tf.train.Feature(float_list=tf.train.FloatList(value=[value]))

def _int64_feature(value):
    return tf.train.Feature(int64_list=tf.train.Int64List(value=value))

org_train_filepath = "../data/trainingandtestdata/training.1600000.processed.noemoticon.csv"
org_test_filepath = "../data/trainingandtestdata/testdata.manual.2009.06.14.csv"

df = pd.read_csv(org_train_filepath,
                 names=['polarity', 'id', 'data', 'Query', 'username', 'text'],
                 usecols=['polarity', 'text'])
df = shuffle(df)
df = df[:3000].reset_index(drop=True)  # only 3,000 shuffled rows, to stay within memory

f = open('../lexcion.pickle', 'rb')
wordList = pickle.load(f)
f.close()

dftemp = process_for_train(df, wordList)
num_dataset = len(dftemp)

filename = "../data/output_train.tfrecords"
writer = tf.python_io.TFRecordWriter(filename)
for i in range(num_dataset):
    word = map(int, dftemp.get_value(i, 'text'))
    label = map(int, dftemp.get_value(i, 'polarity'))
    example = tf.train.Example(features=tf.train.Features(
        feature={
            'word': _int64_feature(word),
            'label': _int64_feature(label)
        }
    ))
    writer.write(example.SerializeToString())
writer.close()
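To verify the records were written correctly, here is a small sketch (my addition) that parses the first example back with tf.python_io.tf_record_iterator, a TF 1.x API:

import tensorflow as tf

for record in tf.python_io.tf_record_iterator("../data/output_train.tfrecords"):
    example = tf.train.Example()
    example.ParseFromString(record)
    print example.features.feature['label'].int64_list.value  # e.g. [0, 0, 1]
    break  # only look at the first record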
train.py
# -*- coding: UTF-8 -*-
import numpy as np
import tensorflow as tf
import pandas as pd
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
import pickle

# same preprocessing as in create_dataset.py
def process_for_train(file, wordList):
    dftemp = file.copy()
    lemmatizer = WordNetLemmatizer()
    int_to_vector = {
        0: [0, 0, 1],
        2: [0, 1, 0],
        4: [1, 0, 0]
    }
    def word2vector(text):
        line_text = word_tokenize(text.lower().decode('latin-1'))
        words = [lemmatizer.lemmatize(word) for word in line_text]
        feature = np.zeros(len(wordList))
        for word in words:
            if word in wordList:
                feature[wordList.index(word)] = 1
        return list(feature)
    dftemp['polarity'] = dftemp['polarity'].map(int_to_vector)
    dftemp['text'] = dftemp['text'].map(word2vector)
    return dftemp

# define the fully connected network
def inference(input_tensor):
    with tf.variable_scope('layer1-fc'):
        fc1_W = tf.get_variable("weight", [N_INPUT, NUM_LAYER1],
                                initializer=tf.truncated_normal_initializer(stddev=0.1))
        fc1_b = tf.get_variable("bias", [NUM_LAYER1],
                                initializer=tf.random_normal_initializer(stddev=1.0))
        fc1 = tf.add(tf.matmul(input_tensor, fc1_W), fc1_b)
        relu1 = tf.nn.relu(fc1)
    with tf.variable_scope('layer2-fc'):
        fc2_W = tf.get_variable("weight", [NUM_LAYER1, NUM_LAYER2],
                                initializer=tf.truncated_normal_initializer(stddev=0.1))
        fc2_b = tf.get_variable("bias", [NUM_LAYER2],
                                initializer=tf.random_normal_initializer(stddev=1.0))
        fc2 = tf.add(tf.matmul(relu1, fc2_W), fc2_b)
        relu2 = tf.nn.relu(fc2)
    with tf.variable_scope('output-fc'):
        output_W = tf.get_variable("weight", [NUM_LAYER2, N_OUTPUT],
                                   initializer=tf.truncated_normal_initializer(stddev=0.1))
        output_b = tf.get_variable("bias", [N_OUTPUT],
                                   initializer=tf.random_normal_initializer(stddev=1.0))
        output = tf.add(tf.matmul(relu2, output_W), output_b)
    return output

# load the vocabulary first so the input size always matches it
f = open('../lexcion.pickle', 'rb')
wordList = pickle.load(f)
f.close()

N_INPUT = len(wordList)  # 8053 for this vocabulary
N_OUTPUT = 3
NUM_LAYER1 = 1000
NUM_LAYER2 = 1000
TRAIN_EPOCHS = 1000
NUM_TRAIN = 3000

org_test_filepath = "../data/trainingandtestdata/testdata.manual.2009.06.14.csv"
df = pd.read_csv(org_test_filepath,
                 names=['polarity', 'id', 'data', 'Query', 'username', 'text'],
                 usecols=['polarity', 'text'])
dftemp = process_for_train(df[:300], wordList)
X_test = np.reshape(np.array(dftemp['text'].tolist()), (len(dftemp), N_INPUT))
y_test = np.reshape(np.array(dftemp['polarity'].tolist()), (len(dftemp), N_OUTPUT))

train_file = "../data/output_train.tfrecords"
# string_input_producer also accepts a list of multiple input files
filename_queue = tf.train.string_input_producer([train_file], shuffle=False)
reader = tf.TFRecordReader()
_, serialized_example = reader.read(filename_queue)
features = tf.parse_single_example(
    serialized_example,
    features={
        'word': tf.FixedLenFeature([N_INPUT], tf.int64),
        'label': tf.FixedLenFeature([N_OUTPUT], tf.int64)
    }
)
X_train = features['word']
y_train = features['label']

min_after_dequeue = 1000
batch_size = 100
capacity = min_after_dequeue + batch_size * 3
X_batch, y_batch = tf.train.shuffle_batch([X_train, y_train],
                                          min_after_dequeue=min_after_dequeue,
                                          batch_size=batch_size,
                                          capacity=capacity)

def train():
    X = tf.placeholder(tf.float32, [None, N_INPUT], name='x-input')
    y = tf.placeholder(tf.float32, [None, N_OUTPUT], name='y-output')
    predict = inference(X)
    # note: labels must be the placeholder y, not y_batch, so the loss is
    # computed on the same examples that were fed through X
    cost_func = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(labels=y, logits=predict))
    optimizer = tf.train.AdamOptimizer().minimize(cost_func)
    # eval statistics
    correct = tf.equal(tf.argmax(predict, 1), tf.argmax(y, 1))
    accuracy = tf.reduce_mean(tf.cast(correct, 'float'))
    with tf.Session() as sess:
        tf.global_variables_initializer().run()
        coord = tf.train.Coordinator()
        threads = tf.train.start_queue_runners(sess=sess, coord=coord)
        saver = tf.train.Saver()
        pre_accuracy = 0  # best accuracy so far
        for i in range(TRAIN_EPOCHS):
            for j in range(int(NUM_TRAIN / batch_size)):
                cur_X_batch, cur_y_batch = sess.run([X_batch, y_batch])
                _, loss = sess.run([optimizer, cost_func],
                                   feed_dict={X: cur_X_batch, y: cur_y_batch})
            if i % 10 == 0:
                print "After %d training epochs, loss is %g" % (i, loss)
            temp_accuracy = sess.run(accuracy, feed_dict={X: X_test, y: y_test})
            if temp_accuracy > pre_accuracy:  # keep the model with the highest accuracy
                print 'accuracy is: %g' % temp_accuracy
                pre_accuracy = temp_accuracy
                saver.save(sess, '../model/model.ckpt')  # save the checkpoint
        coord.request_stop()
        coord.join(threads)

train()
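As an aside, on TensorFlow 1.4+ the same input pipeline can be written with tf.data instead of queue runners. This is only a sketch of the alternative, not what the training above uses (N_INPUT is the vocabulary size as before):

def _parse(serialized):
    features = tf.parse_single_example(
        serialized,
        features={
            'word': tf.FixedLenFeature([N_INPUT], tf.int64),
            'label': tf.FixedLenFeature([3], tf.int64)
        })
    return features['word'], features['label']

dataset = tf.data.TFRecordDataset("../data/output_train.tfrecords")
dataset = dataset.map(_parse).shuffle(1000).batch(100).repeat()
X_batch, y_batch = dataset.make_one_shot_iterator().get_next()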
test.py
# -*- coding: UTF-8 -*-
import tensorflow as tf
import pickle
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
import numpy as np

f = open('../lexcion.pickle', 'rb')
wordList = pickle.load(f)
f.close()

N_INPUT = len(wordList)  # input layer size
N_OUTPUT = 3
NUM_LAYER1 = 1000
NUM_LAYER2 = 1000

def word2vector(text):
    line_text = word_tokenize(text.lower().decode('latin-1'))
    lemmatizer = WordNetLemmatizer()
    words = [lemmatizer.lemmatize(word) for word in line_text]
    feature = np.zeros(len(wordList))
    for word in words:
        if word in wordList:
            feature[wordList.index(word)] = 1
    return list(feature)

# define the fully connected network (must match train.py exactly,
# otherwise the saved variables cannot be restored)
def inference(input_tensor):
    with tf.variable_scope('layer1-fc'):
        fc1_W = tf.get_variable("weight", [N_INPUT, NUM_LAYER1],
                                initializer=tf.truncated_normal_initializer(stddev=0.1))
        fc1_b = tf.get_variable("bias", [NUM_LAYER1],
                                initializer=tf.random_normal_initializer(stddev=1.0))
        fc1 = tf.add(tf.matmul(input_tensor, fc1_W), fc1_b)
        relu1 = tf.nn.relu(fc1)
    with tf.variable_scope('layer2-fc'):
        fc2_W = tf.get_variable("weight", [NUM_LAYER1, NUM_LAYER2],
                                initializer=tf.truncated_normal_initializer(stddev=0.1))
        fc2_b = tf.get_variable("bias", [NUM_LAYER2],
                                initializer=tf.random_normal_initializer(stddev=1.0))
        fc2 = tf.add(tf.matmul(relu1, fc2_W), fc2_b)
        relu2 = tf.nn.relu(fc2)
    with tf.variable_scope('output-fc'):
        output_W = tf.get_variable("weight", [NUM_LAYER2, N_OUTPUT],
                                   initializer=tf.truncated_normal_initializer(stddev=0.1))
        output_b = tf.get_variable("bias", [N_OUTPUT],
                                   initializer=tf.random_normal_initializer(stddev=1.0))
        output = tf.add(tf.matmul(relu2, output_W), output_b)
    return output

def prediction(tweet_text):
    tweet_vector = word2vector(tweet_text)
    X = tf.placeholder('float')
    predict = inference(X)
    with tf.Session() as sess:
        saver = tf.train.Saver()
        # restore overwrites every variable, so no initializer run is needed
        saver.restore(sess, '../model/model.ckpt')
        res = sess.run(tf.argmax(predict.eval(feed_dict={X: [tweet_vector]}), 1))
    return res

print prediction("happy time ")
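One caveat with test.py: inference() creates variables in the default graph, so calling prediction() a second time in the same process fails with a "variable already exists" error. A minimal workaround, assuming nothing else lives in the default graph:

def predict_many(tweets):
    results = []
    for tweet in tweets:
        tf.reset_default_graph()  # drop the variables built by the previous call
        results.append(prediction(tweet))
    return results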
Summary
To be honest, the results are not great: accuracy is only around 80%, and the model does worst on neutral tweets. I tweaked quite a few things without much improvement; suggestions from more experienced readers are very welcome.
Text classification stops here for now; next up I plan to try TFLearn and TensorBoard visualization.
References
- http://blog.topspeedsnail.com/archives/10420
- https://www.tensorflow.org/api_docs/
- TensorFlow: 实战Google深度学习框架, Caicloud