Chinese text classification with CNN and RNN (based on TensorFlow)

The code is largely based on this blog post, which explains things quite clearly: http://blog.csdn.net/u011439796/article/details/77692621


TensorFlow version:

   
   
    In [33]: tf.__version__
    Out[33]: '1.2.1'

First, get the data:

   
   
    curl -O "http://thuctc.thunlp.org/source/THUCNews.zip"

The data covers topics such as finance, lottery, real estate, stocks, and home living.

After downloading, just unzip the archive and then sample data for 10 topics. Following the referenced blog, 6,500 documents are sampled per topic: 5,000 for training, 1,000 for testing, and 500 for validation.


The Python code for sampling the data is below; the original blog used a shell script, I rewrote it in Python:

   
   
    import os
    import glob
    import shutil
    import random

    basepath = "/Users/shuubiasahi/Desktop/THUCNews/"
    newpath = "/Users/shuubiasahi/Desktop/tensorflow/text/"
    # One sub-directory per category; skip hidden entries such as .DS_Store
    listpath = list(map(lambda x: basepath + str(x) + "/",
                        list(filter(lambda x: not str(x).startswith("."), os.listdir(basepath)))))


    def copy(listpath, MAXCOUNT=6500):
        for path in listpath:
            newdir = newpath + str(path).split("/")[-2]
            print(newdir)
            if not os.path.exists(newdir):
                os.mkdir(newdir)
            files = glob.glob(path + "*.txt")
            if len(files) < MAXCOUNT:
                # Not enough documents: sample with replacement up to MAXCOUNT
                resultlist = []
                for i in range(MAXCOUNT):
                    resultlist.append(random.choice(files))
            else:
                resultlist = random.sample(files, MAXCOUNT)
            for file in resultlist:
                shutil.copy(file, newdir)


    if __name__ == '__main__':
        copy(listpath)
        print("Sampling done")



Next, merge the data into single files, one document per line, in the following format:

   
   
    label + "\t" + document text
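For illustration, one line in the merged files would look roughly like the following (the text below is a made-up placeholder rather than an actual THUCNews document; a real tab separates the label from the content):

    体育	昨晚的比赛中主队以2:1获胜,球迷反应热烈...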
The data is split into a training set, a test set, and a validation set; the relevant code is as follows:

   
   
    #!/usr/bin/python
    # -*- coding: utf-8 -*-
    """
    Merge the documents into the train/test/val files.
    """
    import os

    basepath = "/Users/shuubiasahi/Desktop/tensorflow/text/"
    trainpath = "/Users/shuubiasahi/Desktop/tensorflow/train/"


    def _read_file(filename):
        with open(filename, 'r', encoding='utf-8') as f:
            # \u3000 is the full-width (CJK) space
            return f.read().replace('\n', '').replace('\t', '').replace('\u3000', '')


    def save_file(dirname):
        """
        Merge the files of every category into three files.
        dirname: source data directory
        Output line format: category\tcontent
        """
        f_train = open(trainpath + "cnews.train.txt", 'w', encoding='utf-8')
        f_test = open(trainpath + "cnews.test.txt", 'w', encoding='utf-8')
        f_val = open(trainpath + "cnews.val.txt", 'w', encoding='utf-8')
        for category in os.listdir(dirname):
            catdir = os.path.join(dirname, category)
            if not os.path.isdir(catdir):
                continue
            files = os.listdir(catdir)
            print(len(files))
            count = 0
            for cur_file in files:
                filename = os.path.join(catdir, cur_file)
                content = _read_file(filename)
                if count < 5000:
                    f_train.write(category + "\t" + content + "\n")
                elif count < 6000:
                    f_test.write(category + "\t" + content + "\n")
                else:
                    f_val.write(category + '\t' + content + '\n')
                count += 1
            print("finish:", category)
        f_train.close()
        f_test.close()
        f_val.close()


    if __name__ == '__main__':
        save_file(basepath)
        print(len(open(trainpath + "cnews.train.txt", 'r', encoding='utf-8').readlines()))
        print(len(open(trainpath + "cnews.test.txt", 'r', encoding='utf-8').readlines()))
        print(len(open(trainpath + "cnews.val.txt", 'r', encoding='utf-8').readlines()))

Notes on the preprocessing code:

    
    
  1. _read_file(): reads a data file generated in the previous step and returns the contents and labels separately;
  2. build_vocab(): builds the vocabulary; the documents are not segmented into words, since character-level input already works well. The vocabulary is written to disk so it does not have to be rebuilt on every run;
  3. _read_vocab(): reads the stored vocabulary and converts it into a {character: id} mapping;
  4. _read_category(): fixes the category list and converts it into a {category: id} mapping;
  5. _file_to_ids(): uses the functions above to convert a data set from text to id representation;
  6. to_words(): converts an id-encoded example back into text;
  7. preocess_file(): processes all of the data in one pass and returns it;
  8. batch_iter(): prepares mini-batches for training the network.
     
     
    #!/usr/bin/python
    # -*- coding: utf-8 -*-
    from collections import Counter
    import tensorflow.contrib.keras as kr
    import numpy as np
    import os

    trainpath = "/Users/shuubiasahi/Desktop/tensorflow/train/"


    def _read_file(filename):
        """Read a data file, returning contents and labels."""
        counters = []
        labels = []
        with open(filename, 'r', encoding='utf-8') as f:
            for line in f.readlines():
                try:
                    label, contet = line.strip().split('\t')
                    counters.append(list(contet))
                    labels.append(label)
                except Exception:
                    pass
        return counters, labels


    def build_vocab(filename, vocab_size=5000):
        """Build the character vocabulary from the training file."""
        data, _ = _read_file(filename)
        all_data = []
        for content in data:
            all_data.extend(content)
        counter = Counter(all_data)
        count_parirs = counter.most_common(vocab_size - 1)
        words, _ = list(zip(*count_parirs))
        # Add <PAD> so that all texts can be padded to the same length
        words = ['<PAD>'] + list(words)
        open(trainpath + 'vocab_cnews.txt', 'w',
             encoding='utf-8').write('\n'.join(words))


    def _read_vocab(filename):
        """Read the vocabulary list."""
        words = list(map(lambda line: line.strip(),
                         open(filename, 'r', encoding='utf-8').readlines()))
        word_to_id = dict(zip(words, range(len(words))))
        return words, word_to_id


    def _read_category():
        """Fixed list of categories."""
        categories = ["财经", "彩票", "房产", "股票",
                      "家居", "教育", "科技", "社会", "时尚", "体育"]
        cat_to_id = dict(zip(categories, range(len(categories))))
        return categories, cat_to_id


    def to_words(content, words):
        """Convert id-encoded content back to text."""
        return ''.join(words[x] for x in content)


    def _file_to_ids(filename, word_to_id, max_len=600):
        """Convert a data file to id representation."""
        _, cat_to_id = _read_category()
        contents, labels = _read_file(filename)
        data_id = []
        label_id = []
        for i in range(len(contents)):
            data_id.append([word_to_id[x] for x in contents[i] if x in word_to_id])
            label_id.append(cat_to_id[labels[i]])
        # Use the keras pad_sequences helper to pad texts to a fixed length
        x_pad = kr.preprocessing.sequence.pad_sequences(data_id, max_len)
        y_pad = kr.utils.to_categorical(label_id)
        return x_pad, y_pad


    def preocess_file(data_path="/Users/shuubiasahi/Desktop/tensorflow/train/", seq_length=600):
        """Return all of the data in one pass."""
        words, word_to_id = _read_vocab(os.path.join(data_path, 'vocab_cnews.txt'))
        x_train, y_train = _file_to_ids(os.path.join(data_path, "cnews.train.txt"), word_to_id, seq_length)
        x_test, y_test = _file_to_ids(os.path.join(data_path, 'cnews.test.txt'), word_to_id, seq_length)
        x_val, y_val = _file_to_ids(os.path.join(data_path, 'cnews.val.txt'), word_to_id, seq_length)
        return x_train, y_train, x_test, y_test, x_val, y_val, words


    def batch_iter(data, batch_size=64, num_epochs=5):
        """Generate shuffled mini-batches."""
        data = np.array(data)
        data_size = len(data)
        num_batchs_per_epchs = int((data_size - 1) / batch_size) + 1
        for epoch in range(num_epochs):
            indices = np.random.permutation(np.arange(data_size))
            shufflfed_data = data[indices]
            for batch_num in range(num_batchs_per_epchs):
                start_index = batch_num * batch_size
                end_index = min((batch_num + 1) * batch_size, data_size)
                yield shufflfed_data[start_index:end_index]


    if __name__ == '__main__':
        if not os.path.exists(os.path.join(trainpath, "vocab_cnews.txt")):
            build_vocab(trainpath + 'cnews.train.txt')
        x_train, y_train, x_test, y_test, x_val, y_val, words = preocess_file()
        print(x_train.shape, y_train.shape)
        print(x_test.shape, y_test.shape)
        print(x_val.shape, y_val.shape)
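
As a quick sanity check of the loader, the sketch below builds the vocabulary if necessary and inspects the first mini-batch. The module name cnews_loader.py and the paths are assumptions carried over from above:

    # Hypothetical sanity check for the loader above; module name and paths are assumptions.
    import os
    from cnews_loader import build_vocab, preocess_file, batch_iter

    trainpath = "/Users/shuubiasahi/Desktop/tensorflow/train/"
    if not os.path.exists(os.path.join(trainpath, "vocab_cnews.txt")):
        build_vocab(trainpath + 'cnews.train.txt')

    x_train, y_train, _, _, _, _, words = preocess_file(trainpath)
    print(len(words))                     # vocabulary size (5000 with the default build_vocab)

    # batch_iter yields shuffled slices of the zipped (x, y) list
    first_batch = next(batch_iter(list(zip(x_train, y_train)), batch_size=4, num_epochs=1))
    x_batch, y_batch = zip(*first_batch)
    print(len(x_batch), len(x_batch[0]))  # 4 examples, each padded to 600 character ids
    print(y_batch[0])                     # one-hot label over the 10 categories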


Configuration parameters for the LSTM/GRU and CNN models:

   
   
    #!/usr/bin/python
    # -*- coding: utf-8 -*-


    class TCNNConfig(object):
        """CNN configuration parameters"""
        embedding_dim = 64        # embedding dimension
        seq_length = 600          # sequence length
        num_classes = 10          # number of classes
        num_filters = 256         # number of convolution filters
        kernel_size = 5           # convolution kernel size
        vocab_size = 5000         # vocabulary size
        hidden_dim = 128          # size of the fully connected layer
        dropout_keep_prob = 0.8   # dropout keep probability
        learning_rate = 1e-3      # learning rate
        batch_size = 128          # batch size
        num_epochs = 10           # number of epochs
        print_per_batch = 100     # report results every N batches


    class TRNNConfig(object):
        """RNN configuration parameters"""
        embedding_dim = 64        # embedding dimension
        seq_length = 600          # sequence length
        num_classes = 10          # number of classes
        vocab_size = 5000         # vocabulary size
        num_layers = 2            # number of hidden layers
        hidden_dim = 128          # hidden layer size
        rnn = 'gru'               # 'lstm' or 'gru'
        dropout_keep_prob = 0.8   # dropout keep probability
        learning_rate = 1e-3      # learning rate
        batch_size = 128          # batch size
        num_epochs = 100          # number of epochs
        print_per_batch = 10      # report results every N batches
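
Both configs are plain classes whose attributes can be overridden at run time (the training script below, for instance, resets vocab_size from the vocabulary that was actually built). As a small illustration, switching the RNN from the default GRU to an LSTM could look like this; the configuration module name is an assumption:

    # Hypothetical usage; the `configuration` module name is an assumption.
    from configuration import TRNNConfig

    config = TRNNConfig()
    config.rnn = 'lstm'          # TextRNN will build BasicLSTMCell instead of GRUCell
    config.vocab_size = 5000     # typically overridden later with len(words)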

CNN model code:

   
   
    #!/usr/bin/python
    # -*- coding: utf-8 -*-
    import tensorflow as tf


    class TextCNN(object):
        """Text classification, CNN model"""

        def __init__(self, config):
            self.config = config
            self.input_x = tf.placeholder(tf.int32, [None, self.config.seq_length], name='input_x')
            self.input_y = tf.placeholder(tf.float32, [None, self.config.num_classes], name="input_y")
            self.keep_prob = tf.placeholder(tf.float32, name='keep_prob')
            self.cnn()

        def input_embedding(self):
            """Character embedding"""
            with tf.device('/cpu:0'):
                embedding = tf.get_variable("embedding", [self.config.vocab_size, self.config.embedding_dim])
                _input = tf.nn.embedding_lookup(embedding, self.input_x)
            return _input

        def cnn(self):
            """CNN model"""
            embedding_inputs = self.input_embedding()

            with tf.name_scope("cnn"):
                # 1-D convolution followed by global max pooling
                conv = tf.layers.conv1d(embedding_inputs, self.config.num_filters,
                                        self.config.kernel_size, name="conv")
                gmp = tf.reduce_max(conv, reduction_indices=[1], name='gmp')

            with tf.name_scope("score"):
                # Fully connected layer with dropout and ReLU activation
                fc = tf.layers.dense(gmp, self.config.hidden_dim, name='fc1')
                fc = tf.contrib.layers.dropout(fc, self.keep_prob)
                fc = tf.nn.relu(fc)
                # Classifier
                self.logits = tf.layers.dense(fc, self.config.num_classes, name='fc2')
                self.pred_y = tf.nn.softmax(self.logits)

            with tf.name_scope("loss"):
                # Cross-entropy loss
                cross_entropy = tf.nn.softmax_cross_entropy_with_logits(
                    logits=self.logits, labels=self.input_y)
                self.loss = tf.reduce_mean(cross_entropy)

            with tf.name_scope("optimize"):
                # Optimizer
                optimizer = tf.train.AdamOptimizer(learning_rate=self.config.learning_rate)
                self.optim = optimizer.minimize(self.loss)

            with tf.name_scope("accuracy"):
                # Accuracy
                correct_pred = tf.equal(tf.argmax(self.input_y, 1), tf.argmax(self.pred_y, 1))
                self.acc = tf.reduce_mean(tf.cast(correct_pred, tf.float32))
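
To make the tensor shapes concrete, here is a rough sketch of how a batch flows through the CNN graph (the tensorflowexcise module layout matches the training script further below and should be treated as an assumption):

    # Hypothetical shape check for the CNN graph; module paths are assumptions.
    import tensorflow as tf
    from tensorflowexcise.configuration import TCNNConfig
    from tensorflowexcise.cnn_model import TextCNN

    tf.reset_default_graph()
    config = TCNNConfig()
    model = TextCNN(config)
    # embedding lookup:   [batch, 600]      -> [batch, 600, 64]
    # conv1d (256 x 5):   [batch, 600, 64]  -> [batch, 596, 256]   (valid padding)
    # global max pooling: [batch, 596, 256] -> [batch, 256]
    # fc1 + fc2:          [batch, 256]      -> [batch, 128] -> [batch, 10]
    print(model.logits.get_shape().as_list())   # [None, 10]
    print(model.pred_y.get_shape().as_list())   # [None, 10]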

RNN model code:

   
   
    #!/usr/bin/python
    # -*- coding: utf-8 -*-
    import tensorflow as tf


    class TextRNN(object):
        """Text classification, RNN model"""

        def __init__(self, config):
            self.config = config
            self.input_x = tf.placeholder(tf.int32,
                                          [None, self.config.seq_length], name='input_x')
            self.input_y = tf.placeholder(tf.float32,
                                          [None, self.config.num_classes], name='input_y')
            self.keep_prob = tf.placeholder(tf.float32, name='keep_prob')
            self.rnn()

        def input_embedding(self):
            """Character embedding"""
            with tf.device('/cpu:0'):
                embedding = tf.get_variable('embedding',
                                            [self.config.vocab_size, self.config.embedding_dim])
                _inputs = tf.nn.embedding_lookup(embedding, self.input_x)
            return _inputs

        def rnn(self):
            """RNN model"""
            def lstm_cell():
                """LSTM cell"""
                return tf.nn.rnn_cell.BasicLSTMCell(self.config.hidden_dim, state_is_tuple=True)

            def gru_cell():
                """GRU cell"""
                return tf.nn.rnn_cell.GRUCell(self.config.hidden_dim)

            def dropout():
                """Wrap each RNN cell with a dropout layer"""
                if self.config.rnn == 'lstm':
                    cell = lstm_cell()
                else:
                    cell = gru_cell()
                return tf.contrib.rnn.DropoutWrapper(cell, output_keep_prob=self.keep_prob)

            embedding_inputs = self.input_embedding()

            with tf.name_scope("rnn"):
                # Multi-layer RNN
                cells = [dropout() for _ in range(self.config.num_layers)]
                rnn_cell = tf.contrib.rnn.MultiRNNCell(cells, state_is_tuple=True)
                _outputs, _ = tf.nn.dynamic_rnn(cell=rnn_cell, inputs=embedding_inputs, dtype=tf.float32)
                last = _outputs[:, -1, :]  # take the output at the last time step

            with tf.name_scope("score"):
                # Fully connected layer with dropout and ReLU activation
                fc = tf.layers.dense(last, self.config.hidden_dim, name='fc1')
                fc = tf.contrib.layers.dropout(fc, self.keep_prob)
                fc = tf.nn.relu(fc)
                # Classifier
                self.logits = tf.layers.dense(fc, self.config.num_classes, name='fc2')
                self.pred_y = tf.nn.softmax(self.logits)

            with tf.name_scope("loss"):
                # Cross-entropy loss
                cross_entropy = tf.nn.softmax_cross_entropy_with_logits(
                    logits=self.logits, labels=self.input_y)
                self.loss = tf.reduce_mean(cross_entropy)

            with tf.name_scope("optimize"):
                # Optimizer
                optimizer = tf.train.AdamOptimizer(learning_rate=self.config.learning_rate)
                self.optim = optimizer.minimize(self.loss)

            with tf.name_scope("accuracy"):
                # Accuracy
                correct_pred = tf.equal(tf.argmax(self.input_y, 1), tf.argmax(self.pred_y, 1))
                self.acc = tf.reduce_mean(tf.cast(correct_pred, tf.float32))


Training and evaluation script:

   
   
    #!/usr/bin/python
    # -*- coding: utf-8 -*-
    from tensorflowexcise.cnn_model import TextCNN
    from tensorflowexcise.rnn_model import TextRNN
    from tensorflowexcise.configuration import TCNNConfig
    from tensorflowexcise.configuration import TRNNConfig
    from tensorflowexcise.cnews_loader import preocess_file, batch_iter, build_vocab
    import time
    import tensorflow as tf
    import os
    from datetime import timedelta

    trainpath = "/Users/shuubiasahi/Desktop/tensorflow/train/"


    def run_epoch(cnn=True):
        # Load the data
        print('Loading data...')
        start_time = time.time()
        if not os.path.exists(trainpath + 'vocab_cnews.txt'):
            build_vocab(trainpath + 'cnews.train.txt')
        x_train, y_train, x_test, y_test, x_val, y_val, words = preocess_file()

        if cnn:
            print('Using CNN model...')
            config = TCNNConfig()
            config.vocab_size = len(words)
            model = TextCNN(config)
            tensorboard_dir = '/Users/shuubiasahi/Desktop/tensorflow/board.log'
        else:
            print('Using RNN model...')
            config = TRNNConfig()
            config.vocab_size = len(words)
            model = TextRNN(config)
            tensorboard_dir = '/Users/shuubiasahi/Desktop/tensorflow/board.log'

        end_time = time.time()
        time_dif = end_time - start_time
        time_dif = timedelta(seconds=int(round(time_dif)))
        print('Time usage:', time_dif)

        print('Constructing TensorFlow Graph...')
        session = tf.Session()
        session.run(tf.global_variables_initializer())

        # Configure tensorboard
        tf.summary.scalar("loss", model.loss)
        tf.summary.scalar("accuracy", model.acc)
        if not os.path.exists(tensorboard_dir):
            os.makedirs(tensorboard_dir)
        merged_summary = tf.summary.merge_all()
        writer = tf.summary.FileWriter(tensorboard_dir)
        writer.add_graph(session.graph)

        # Generate batches
        print('Generating batch...')
        batch_train = batch_iter(list(zip(x_train, y_train)),
                                 config.batch_size, config.num_epochs)

        def feed_data(batch):
            """Build the feed_dict for one batch"""
            x_batch, y_batch = zip(*batch)
            feed_dict = {
                model.input_x: x_batch,
                model.input_y: y_batch
            }
            return feed_dict, len(x_batch)

        def evaluate(x_, y_):
            """
            Model evaluation.
            Running all of the data at once would OOM, so evaluate in batches and aggregate.
            """
            batch_eval = batch_iter(list(zip(x_, y_)), 128, 1)
            total_loss = 0.0
            total_acc = 0.0
            cnt = 0
            for batch in batch_eval:
                feed_dict, cur_batch_len = feed_data(batch)
                feed_dict[model.keep_prob] = 1.0
                loss, acc = session.run([model.loss, model.acc], feed_dict=feed_dict)
                total_loss += loss * cur_batch_len
                total_acc += acc * cur_batch_len
                cnt += cur_batch_len
            return total_loss / cnt, total_acc / cnt

        # Training and validation
        print('Training and evaluating...')
        start_time = time.time()
        print_per_batch = config.print_per_batch
        for i, batch in enumerate(batch_train):
            feed_dict, _ = feed_data(batch)
            feed_dict[model.keep_prob] = config.dropout_keep_prob
            if i % 5 == 0:  # write training summaries to tensorboard every 5 batches
                s = session.run(merged_summary, feed_dict=feed_dict)
                writer.add_summary(s, i)
            if i % print_per_batch == print_per_batch - 1:  # report train/val performance every print_per_batch batches
                loss_train, acc_train = session.run([model.loss, model.acc], feed_dict=feed_dict)
                loss, acc = evaluate(x_val, y_val)
                # Elapsed time
                end_time = time.time()
                time_dif = end_time - start_time
                time_dif = timedelta(seconds=int(round(time_dif)))
                msg = 'Iter: {0:>6}, Train Loss: {1:>6.2}, Train Acc: {2:>7.2%},' \
                      + ' Val Loss: {3:>6.2}, Val Acc: {4:>7.2%}, Time: {5}'
                print(msg.format(i + 1, loss_train, acc_train, loss, acc, time_dif))
            session.run(model.optim, feed_dict=feed_dict)  # run one optimization step

        # Final evaluation on the test set
        print('Evaluating on test set...')
        loss_test, acc_test = evaluate(x_test, y_test)
        msg = 'Test Loss: {0:>6.2}, Test Acc: {1:>7.2%}'
        print(msg.format(loss_test, acc_test))
        session.close()


    if __name__ == '__main__':
        run_epoch(cnn=True)
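
The script above only trains and evaluates; it closes the session and never saves a checkpoint. If you keep the session open (or add a tf.train.Saver), predicting the category of a raw string is just a matter of repeating the preprocessing steps. A hypothetical helper, not part of the original code:

    # Hypothetical inference helper; assumes a live `session` and `model` after training,
    # plus the vocabulary, category list, and `trainpath` used above.
    import tensorflow.contrib.keras as kr
    from tensorflowexcise.cnews_loader import _read_vocab, _read_category


    def predict_text(session, model, text, seq_length=600):
        """Convert one raw string to character ids, pad it, and return the predicted label."""
        _, word_to_id = _read_vocab(trainpath + 'vocab_cnews.txt')
        categories, _ = _read_category()
        data_id = [[word_to_id[c] for c in text if c in word_to_id]]
        x_pad = kr.preprocessing.sequence.pad_sequences(data_id, seq_length)
        pred = session.run(model.pred_y,
                           feed_dict={model.input_x: x_pad, model.keep_prob: 1.0})
        return categories[pred.argmax(axis=1)[0]]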



Training output:

   
   
    Iter: 3100, Train Loss: 0.0043, Train Acc: 100.00%, Val Loss: 0.54, Val Acc: 87.42%, Time: 1:36:34
    Iter: 3200, Train Loss: 0.0057, Train Acc: 100.00%, Val Loss: 0.55, Val Acc: 87.58%, Time: 1:38:34
    Iter: 3300, Train Loss: 0.0068, Train Acc: 100.00%, Val Loss: 0.57, Val Acc: 87.58%, Time: 1:40:36
    Iter: 3400, Train Loss: 0.0097, Train Acc: 100.00%, Val Loss: 0.62, Val Acc: 87.16%, Time: 1:42:40
    Iter: 3500, Train Loss: 0.0093, Train Acc: 100.00%, Val Loss: 0.58, Val Acc: 87.68%, Time: 1:44:43
    Iter: 3600, Train Loss: 0.0047, Train Acc: 100.00%, Val Loss: 0.57, Val Acc: 88.04%, Time: 1:46:45
    Iter: 3700, Train Loss: 0.0037, Train Acc: 100.00%, Val Loss: 0.6, Val Acc: 87.74%, Time: 1:48:49
    Iter: 3800, Train Loss: 0.0013, Train Acc: 100.00%, Val Loss: 0.6, Val Acc: 87.18%, Time: 1:50:54
    Iter: 3900, Train Loss: 0.019, Train Acc: 99.22%, Val Loss: 0.63, Val Acc: 87.08%, Time: 1:52:59
    Evaluating on test set...
    Test Loss: 0.48, Test Acc: 90.77%



TensorBoard:

(The loss and accuracy scalars written to board.log can be viewed in TensorBoard; the original screenshots are not reproduced here.)






If you have any questions, or if this infringes on your rights, contact me: xuxu_ge
