Deep Learning: Classifying Toutiao Articles with TextCNN

Dataset: Toutiao Articles

The data comes from a resource collected online (GitHub: Toutiao dataset).
Data overview
The preprocessing strategy for this dataset is word segmentation and keyword extraction: the classification feature words combine the keywords extracted by the original crawler with keywords I extracted from the titles (a rough sketch of this step follows below).
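As an illustration only (the post does not show this code), combining the two keyword sources for one sample might look like the sketch below; using jieba for segmentation and a comma-separated keyword field are my assumptions about the raw data layout, not the author's confirmed pipeline:

import jieba

def build_feature_words(title, crawler_keywords):
    # Hypothetical record layout: `title` is the article title and
    # `crawler_keywords` is the comma-separated keyword string produced
    # by the original crawler.
    title_words = [w for w in jieba.cut(title) if len(w) > 1]  # drop single characters
    keywords = [k for k in crawler_keywords.split(',') if k]
    # Deduplicate while keeping order: crawler keywords first, then title words.
    seen, features = set(), []
    for w in keywords + title_words:
        if w not in seen:
            seen.add(w)
            features.append(w)
    return features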

Model

The model follows a tutorial found online:

import tensorflow as tf

class TextCNN(object):
    """
    TextCNN model
    """
    def __init__(self, sequence_length, num_classes, vocab_size, embedding_size,
                 filter_sizes, num_filters, l2_reg_lambda=0.0):
        # Placeholders: word-id sequences, one-hot labels, dropout keep probability
        self.input_x = tf.placeholder(tf.int32, shape=[None, sequence_length], name='input_x')
        self.input_y = tf.placeholder(tf.float32, shape=[None, num_classes], name='input_y')
        self.dropout_keep_prob = tf.placeholder(tf.float32, name='dropout_keep_prob')

        l2_loss = tf.constant(0.0)  # accumulator for L2 regularization of the output layer

        # Embedding layer: map word ids to dense vectors and add a channel
        # dimension so conv2d can treat the sequence as a one-channel "image"
        with tf.name_scope('embedding'):
            self.W = tf.Variable(tf.random_uniform([vocab_size, embedding_size], -1.0, 1.0, dtype=tf.float32), name='W')
            self.embedded_chars = tf.nn.embedding_lookup(self.W, self.input_x)
            self.embedded_chars_expanded = tf.expand_dims(self.embedded_chars, -1)
        # One convolution + max-pooling branch per filter size
        pooled_outputs = []
        for i, filter_size in enumerate(filter_sizes):
            with tf.name_scope('conv-maxpool-%s' % filter_size):
                # Filters span the full embedding width, so convolution slides over time only
                filter_shape = [filter_size, embedding_size, 1, num_filters]
                W = tf.Variable(tf.truncated_normal(filter_shape, stddev=0.1, dtype=tf.float32), name='W')
                b = tf.Variable(tf.constant(0.1, shape=[num_filters], dtype=tf.float32), name='b')
                conv = tf.nn.conv2d(self.embedded_chars_expanded, W, strides=[1, 1, 1, 1], padding="VALID", name='conv')
                h = tf.nn.relu(tf.nn.bias_add(conv, b), name='relu')
                # Max-pool over the whole sequence: one feature per filter
                pooled = tf.nn.max_pool(h, ksize=[1, sequence_length - filter_size + 1, 1, 1], strides=[1, 1, 1, 1], padding='VALID', name='pool')
                pooled_outputs.append(pooled)
        # Concatenate the pooled features from all filter sizes and flatten
        num_filters_total = num_filters * len(filter_sizes)
        self.h_pool = tf.concat(pooled_outputs, 3)
        self.h_pool_flat = tf.reshape(self.h_pool, [-1, num_filters_total])

        with tf.name_scope('dropout'):
            self.h_drop = tf.nn.dropout(self.h_pool_flat, keep_prob=self.dropout_keep_prob)

        # Fully connected output layer; W and b are L2-regularized
        with tf.name_scope('output'):
            W = tf.get_variable("W", shape=[num_filters_total, num_classes], initializer=tf.contrib.layers.xavier_initializer())
            b = tf.Variable(tf.constant(0.1, shape=[num_classes]), name='b')
            l2_loss += tf.nn.l2_loss(W)
            l2_loss += tf.nn.l2_loss(b)
            self.scores = tf.nn.xw_plus_b(self.h_drop, W, b, name='scores')
            self.predictions = tf.argmax(self.scores, 1, name='predictions')

        with tf.name_scope("loss"):
            losses = tf.nn.softmax_cross_entropy_with_logits_v2(logits=self.scores,labels=self.input_y)
            self.loss = tf.reduce_mean(losses) + l2_reg_lambda * l2_loss

        with tf.name_scope('accuracy'):
            correct_predictions = tf.equal(self.predictions, tf.argmax(self.input_y, 1))
            self.accuracy = tf.reduce_mean(tf.cast(correct_predictions, "float"), name='accuracy')
        # Adam with a fixed learning rate of 1e-4
        self.train_op = tf.train.AdamOptimizer(1e-4).minimize(self.loss)

from dl_learn.utils import split_toutiao_to_train_test
from dl_learn.utils import read_vocab

def main(batch_size=32, train_epochs=1000, dropout_prob=0.5, save_path='./model/toutiao/textcnn/cnn.ckpt'):
    # Load the vocabulary
    vocab = read_vocab()
    vocab_size = len(vocab)
    train_x, train_y, test_x, test_y, label = split_toutiao_to_train_test(test_size=0.02)
    num_classes = len(label)
    train_rows = train_x.shape[0]
    sequence_length = train_x.shape[-1]

    # Build the TextCNN model
    textcnn = TextCNN(sequence_length=sequence_length,num_classes=num_classes,vocab_size=vocab_size
                      ,embedding_size=64,filter_sizes=[3,4,5],num_filters=16,l2_reg_lambda=0.01)
    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        saver = tf.train.Saver(max_to_keep=3)
        for epoch in range(1,1+train_epochs):
            # Regenerate the batch data every epoch (a tf.data.Dataset could produce the batches instead)
            if epoch>1:
                train_x, train_y, test_x, test_y, label = split_toutiao_to_train_test(test_size=0.02)
            step_start = 0
            go_batch = True
            all_steps = 1
            while go_batch:
                x = train_x[step_start:step_start+batch_size]
                y = train_y[step_start:step_start+batch_size]
                if x.shape[0] == 0:
                    print('---------next epoch---------')
                    go_batch = False
                else:
                    step_start += batch_size
                    _,train_loss,train_accuracy = sess.run([textcnn.train_op,textcnn.loss,textcnn.accuracy],
                                                           feed_dict={textcnn.input_x:x,textcnn.input_y:y,textcnn.dropout_keep_prob:dropout_prob})
                    if all_steps % 10 == 0:
                        test_loss, test_accuracy = sess.run([textcnn.loss, textcnn.accuracy],
                                                                 feed_dict={textcnn.input_x: test_x, textcnn.input_y: test_y,
                                                                            textcnn.dropout_keep_prob: 1.0})
                        print('epoch:{},steps:{},train loss:{},train accuracy:{},test loss:{},test accuracy:{}'.format(epoch, all_steps, train_loss,
                                                                                                                     train_accuracy, test_loss,test_accuracy))
                    all_steps+=1
            saver.save(sess,save_path=save_path,global_step=epoch)

if __name__ == '__main__':
    main(batch_size=128)

split_toutiao_to_train_test() is a data-loading function: its argument is the fraction of the data held out as the test set, and it returns train X, train Y, test X, test Y, and the set of class labels.
read_vocab() loads the full vocabulary dictionary; mine contains about 600,000 words.
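Neither helper from dl_learn.utils is shown in the post. A minimal sketch of what they might look like; the file paths, the storage format (padded word-id matrix and one-hot labels saved as .npy), and the one-word-per-line vocabulary file are all assumptions, not the author's actual implementation:

import numpy as np

def read_vocab(path='./data/toutiao/vocab.txt'):
    # Hypothetical format: one word per line, index = line number.
    with open(path, encoding='utf-8') as f:
        return {word.strip(): i for i, word in enumerate(f)}

def split_toutiao_to_train_test(test_size=0.02,
                                feature_path='./data/toutiao/features.npy',
                                label_path='./data/toutiao/labels.npy'):
    x = np.load(feature_path)  # [num_samples, sequence_length] padded word ids
    y = np.load(label_path)   # [num_samples, num_classes] one-hot labels
    # Random shuffle, then hold out the first `test_size` fraction as the test set.
    idx = np.random.permutation(len(x))
    n_test = int(len(x) * test_size)
    test_idx, train_idx = idx[:n_test], idx[n_test:]
    label = list(range(y.shape[1]))  # class-label set
    return x[train_idx], y[train_idx], x[test_idx], y[test_idx], label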

Results

Trained for 6 epochs.
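Since the training loop saves a checkpoint after every epoch (keeping the three most recent), the trained model can later be restored for inference. A minimal sketch, assuming the graph is rebuilt with the same hyperparameter values as training; `batch_ids` is a hypothetical padded word-id batch prepared the same way as the training data:

import tensorflow as tf

# sequence_length, num_classes and vocab_size must match the values used
# during training.
textcnn = TextCNN(sequence_length=sequence_length, num_classes=num_classes,
                  vocab_size=vocab_size, embedding_size=64,
                  filter_sizes=[3, 4, 5], num_filters=16)
with tf.Session() as sess:
    saver = tf.train.Saver()
    # Pick up the newest checkpoint written by saver.save() in the training loop.
    ckpt = tf.train.latest_checkpoint('./model/toutiao/textcnn')
    saver.restore(sess, ckpt)
    predictions = sess.run(textcnn.predictions,
                           feed_dict={textcnn.input_x: batch_ids,       # hypothetical padded id batch
                                      textcnn.dropout_keep_prob: 1.0})  # disable dropout at inference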
