A simple TextCNN text classification implementation

1. Import libraries
(This code targets TensorFlow 1.x; tf.contrib is not available in TensorFlow 2.x.)
import pandas as pd
import numpy as np
import tensorflow as tf
import jieba
import os
from tensorflow.contrib import slim
2. Read the data
train_data = pd.read_csv("train.txt", sep='\t', names=['label', 'content'])
test_data = pd.read_csv("test.txt", sep='\t', names=['label', 'content'])
val_data = pd.read_csv("val.txt", sep='\t', names=['label', 'content'])
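Each of the three files is expected to be tab-separated, one sample per line, with the label first and the raw text second. The lines below only illustrate the format and are not real data:

体育	昨晚的比赛中主队以3比1获胜……
财经	股市今日小幅收涨……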


3. Process the labels
label_int = {label: idx for idx, label in enumerate(sorted(set(train_data['label'])))}  # label -> integer id; sorted() keeps the mapping reproducible across runs
int_label = {idx: label for label, idx in label_int.items()}                             # integer id -> label
def process_data(data):
    data['label'] = data['label'].apply(lambda x: label_int[x])
process_data(train_data)
process_data(val_data)
process_data(test_data)


4. Tokenize the content with jieba and collect the word set
contents = []
seq_length = 0
train_con = []
val_con = []
test_con = []
def data_fenci(data, con):
    global seq_length
    for i in data['content']:
        h = jieba.lcut(i)            # segment one piece of text into a list of words
        con.append(h)
        if len(h) > seq_length:      # track the longest sequence for padding
            seq_length = len(h)
        contents.extend(h)
data_fenci(train_data, train_con)
data_fenci(test_data, test_con)
data_fenci(val_data, val_con)
contents = set(contents)
vocab_size = len(contents) + 1       # +1 reserves index 0 for padding
print(vocab_size)
print(seq_length)
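For reference, jieba.lcut simply returns the token list for one piece of text; the exact segmentation depends on the jieba version and dictionary, but in default mode it looks like this (the sentence is the example from jieba's README):

jieba.lcut("我来到北京清华大学")
# ['我', '来到', '北京', '清华大学']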
5. Build the word-to-index vocabulary
vocab_int = {word: idx for idx, word in enumerate(contents, start=1)}  # indices start at 1; index 0 is the padding token
6. Convert the tokenized content into index sequences with the vocabulary
train_x = []
val_x = []
test_x = []
def data_transform(data, data_x):
    for i in data:
        con = [vocab_int[x] for x in i]
        con = con + [0] * (seq_length - len(con))   # pad every sequence to seq_length with 0
        data_x.append(con)
# note: the tokenized lists (train_con/test_con/val_con), not the raw DataFrames, are transformed
data_transform(train_con, train_x)
data_transform(test_con, test_x)
data_transform(val_con, val_x)
7. One-hot encode the labels (and convert the features to arrays)
train_x = np.array(train_x)
val_x = np.array(val_x)
test_x = np.array(test_x)
num_classes = len(label_int)        # number of categories, rather than a hard-coded 10
train_y = np.array(train_data['label'])
val_y = np.array(val_data['label'])
train_y = np.eye(num_classes)[train_y]
val_y = np.eye(num_classes)[val_y]
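A quick sanity check on the resulting shapes (the exact sample counts depend on your data):

print(train_x.shape)   # (number of training samples, seq_length)
print(train_y.shape)   # (number of training samples, num_classes)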
8. The TextCNN model
class TextCNN(object):
    def __init__(self,
            num_classes,
            seq_length,
            vocab_size,
            embedding_size,
            learning_rate,
            learning_decay_rate,
            learning_decay_steps,
            epoch,
            filter_sizes,
            num_filters,
            dropout_keep_prob,
            l2_lambda
            ):
        self.num_classes = num_classes
        self.seq_length = seq_length
        self.vocab_size = vocab_size
        self.embedding_size = embedding_size
        self.learning_rate = learning_rate
        self.learning_decay_rate = learning_decay_rate
        self.learning_decay_steps = learning_decay_steps
        self.epoch = epoch
        self.filter_sizes = filter_sizes
        self.num_filters = num_filters
        self.dropout_keep_prob = dropout_keep_prob
        self.l2_lambda = l2_lambda
        self.inputs = tf.placeholder(tf.int32, [None, self.seq_length], name='inputs')
        self.targets = tf.placeholder(tf.float32, [None, self.num_classes], name='targets')
        self.l2_loss = tf.constant(0.0)
        self.model()

    def model(self):
        # Embedding layer
        with tf.name_scope("embedding"):
            self.embedding = tf.Variable(tf.random_uniform([self.vocab_size, self.embedding_size], -1.0, 1.0),
                                        name="embedding")
            self.embedding_inputs = tf.nn.embedding_lookup(self.embedding, self.inputs)
            self.embedding_inputs = tf.expand_dims(self.embedding_inputs, -1)

        # Convolution + max-pooling layer for each filter size
        pooled_outputs = []
        for i, filter_size in enumerate(self.filter_sizes):
            with tf.name_scope("conv_{0}".format(filter_size)):
                filter_shape = [filter_size, self.embedding_size, 1, self.num_filters]
                W = tf.Variable(tf.truncated_normal(filter_shape, stddev=0.1), name="W")
                b = tf.Variable(tf.constant(0.1, shape=[self.num_filters]), name="b")
                conv = tf.nn.conv2d(
                    self.embedding_inputs,
                    W,
                    strides=[1, 1, 1, 1],
                    padding="VALID",
                    name="conv"
                )
                h = tf.nn.relu(tf.nn.bias_add(conv, b), name="relu")
                pooled = tf.nn.max_pool(
                    h,
                    ksize=[1, self.seq_length - filter_size + 1, 1, 1],
                    strides=[1, 1, 1, 1],
                    padding='VALID',
                    name="pool"
                )
                pooled_outputs.append(pooled)

        # Concatenate the pooled features from every filter size
        num_filters_total = self.num_filters * len(self.filter_sizes)
        h_pool = tf.concat(pooled_outputs, 3)
        h_pool_flat = tf.reshape(h_pool, [-1, num_filters_total])
        # Apply dropout to the final sentence vector
        with tf.name_scope("dropout"):
            h_drop = tf.nn.dropout(h_pool_flat, self.dropout_keep_prob)

        # Fully connected output layer
        with tf.variable_scope("output"):
            W = tf.get_variable("W", shape=[num_filters_total, self.num_classes],
                                initializer=tf.contrib.layers.xavier_initializer())
            b = tf.Variable(tf.constant(0.1, shape=[self.num_classes]), name="b")
            self.l2_loss += tf.nn.l2_loss(W)
            self.l2_loss += tf.nn.l2_loss(b)
            self.logits = tf.nn.xw_plus_b(h_drop, W, b, name="scores")
            self.pred = tf.argmax(self.logits, 1, name="predictions")

        # Loss function
        with tf.name_scope('loss'):
            self.loss = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(logits=self.logits, labels=self.targets)) + self.l2_lambda * self.l2_loss

        # Optimizer
        with tf.name_scope('optimizer'):
            self.global_step = tf.train.get_or_create_global_step()
            learning_rate = tf.train.exponential_decay(self.learning_rate, self.global_step,
                                                       self.learning_decay_steps, self.learning_decay_rate,
                                                       staircase=True)

            optimizer = tf.train.AdamOptimizer(learning_rate)
            update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
            self.optim = slim.learning.create_train_op(total_loss=self.loss, optimizer=optimizer, update_ops=update_ops)

        # Accuracy
        with tf.name_scope('accuracy'):
            correct_predictions = tf.equal(self.pred, tf.argmax(self.targets, 1))
            self.acc = tf.reduce_mean(tf.cast(correct_predictions, "float"), name="accuracy")

    def fit(self, train_x, train_y, val_x, val_y, batch_size):
        # Create the directories for checkpoints and logs
        if not os.path.exists('./saves/textcnn'):
            os.makedirs('./saves/textcnn')
        if not os.path.exists('./train_logs/textcnn'):
            os.makedirs('./train_logs/textcnn')

        # Start training
        train_steps = 0
        best_val_acc = 0
        # summary
        tf.summary.scalar('val_loss', self.loss)
        tf.summary.scalar('val_acc', self.acc)
        merged = tf.summary.merge_all()

        # Initialize variables and create the session
        sess = tf.Session()
        writer = tf.summary.FileWriter('./train_logs/textcnn', sess.graph)
        saver = tf.train.Saver(max_to_keep=10)
        sess.run(tf.global_variables_initializer())

        for i in range(self.epoch):
            batch_train = self.batch_iter(train_x, train_y, batch_size)
            for batch_x, batch_y in batch_train:
                train_steps += 1
                feed_dict = {self.inputs: batch_x, self.targets: batch_y}
                _, train_loss, train_acc = sess.run([self.optim, self.loss, self.acc], feed_dict=feed_dict)
                if train_steps % 1000 == 0:
                    feed_dict = {self.inputs: val_x, self.targets: val_y}
                    val_loss, val_acc = sess.run([self.loss, self.acc], feed_dict=feed_dict)

                    summary = sess.run(merged, feed_dict=feed_dict)
                    writer.add_summary(summary, global_step=train_steps)

                    if val_acc >= best_val_acc:
                        best_val_acc = val_acc
                        saver.save(sess, "./saves/textcnn/", global_step=train_steps)

                    msg = 'epoch:%d/%d,train_steps:%d,train_loss:%.4f,train_acc:%.4f,val_loss:%.4f,val_acc:%.4f'
                    print(msg % (i, self.epoch, train_steps, train_loss, train_acc, val_loss, val_acc))

        sess.close()

    def batch_iter(self, x, y, batch_size=32, shuffle=True):
        """
        生成batch数据
        :param x: 训练集特征变量
        :param y: 训练集标签
        :param batch_size: 每个batch的大小
        :param shuffle: 是否在每个epoch时打乱数据
        :return:
        """
        data_len = len(x)
        num_batch = int((data_len - 1) / batch_size) + 1

        if shuffle:
            shuffle_indices = np.random.permutation(np.arange(data_len))
            x_shuffle = x[shuffle_indices]
            y_shuffle = y[shuffle_indices]
        else:
            x_shuffle = x
            y_shuffle = y
        for i in range(num_batch):
            start_index = i * batch_size
            end_index = min((i + 1) * batch_size, data_len)
            yield (x_shuffle[start_index:end_index], y_shuffle[start_index:end_index])

    def predict(self, x):
        # Note: dropout_keep_prob is a fixed constant in this model, so dropout is also applied
        # at prediction time; turning it into a placeholder and feeding 1.0 here is the usual fix.
        sess = tf.Session()
        sess.run(tf.global_variables_initializer())
        saver = tf.train.Saver(tf.global_variables())
        ckpt = tf.train.get_checkpoint_state('./saves/textcnn/')
        saver.restore(sess, ckpt.model_checkpoint_path)

        feed_dict = {self.inputs: x}
        logits = sess.run(self.logits, feed_dict=feed_dict)
        y_pred = np.argmax(logits, 1)
        return y_pred

The TensorFlow implementation of TextCNN itself is covered in the previous post.

9. Train the model
model = TextCNN(num_classes=len(set(train_data['label'])),
                seq_length=seq_length,
                vocab_size=vocab_size,
                embedding_size=128,
                learning_rate=0.01,
                learning_decay_rate=0.9,
                learning_decay_steps=10,
                epoch=20,
                filter_sizes=[3, 4, 5],
                num_filters=2,
                dropout_keep_prob=0.8,
                l2_lambda=0.0,
                )
model.fit(train_x, train_y, val_x, val_y, 4)
model.predict(test_x)
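To get a rough feel for the test-set performance, the predictions can be compared with the integer labels produced in step 3 (a minimal sketch; test_y is introduced here only for this check):

test_y = np.array(test_data['label'])
y_pred = model.predict(test_x)
print('test accuracy: %.4f' % np.mean(y_pred == test_y))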

PS: This is only a simple implementation; no hyperparameter tuning was done and the data has not been preprocessed.

