Text Classification with an RNN: A Python Implementation

1. This post grew out of the week 3 assignment of the Oxford NLP deep learning course, which asks you to use an LSTM for text classification. As in the previous post on CNN text classification, the code follows the sklearn style and its three-step workflow (instantiate the model, train it, predict with it); see the usage sketch after the class code below. Because training takes a long time and it is hard to know in advance when the model is sufficiently trained, a resume-training feature has been added on top of this.
2. Build the RNN class for text classification (saved in the file ClassifierRNN.py).
2.1 Since the configuration parameters are numerous and hurt readability, the code follows the style of the TensorFlow source and splits them into network configuration parameters (nn_config) and computation configuration parameters (calc_config), with the corresponding classes NN_config and CALC_config.
2.2 Declare the ClassifierRNN class. Its main methods are __init__, build_inputs, build_rnns, build_loss, build_optimizer, random_batches, fit, load_model, predict_accuracy, and predict. The code is as follows:

```
import tensorflow as tf
import numpy as np
import matplotlib.pyplot as plt
import os
import time
class NN_config(object):
    def __init__(self,num_seqs=1000,num_steps=10,num_units=128,num_classes = 8,\
                num_layers = 1,embedding_size=100,vocab_size = 10000,\
                use_embeddings=False,embedding_init=None):
        self.num_seqs   = num_seqs
        self.num_steps  = num_steps
        self.num_units  = num_units
        self.num_classes = num_classes
        self.num_layers = num_layers
        self.vocab_size = vocab_size
        self.embedding_size = embedding_size
        self.use_embeddings = use_embeddings
        self.embedding_init = embedding_init

class CALC_config(object):
    def __init__(self,batch_size=64,num_epoches = 20,learning_rate = 1.0e-3, \
                 keep_prob=0.5,show_every_steps = 10,save_every_steps=100):
        self.batch_size     = batch_size
        self.num_epoches    = num_epoches
        self.learning_rate  = learning_rate
        self.keep_prob      = keep_prob
        self.show_every_steps = show_every_steps
        self.save_every_steps = save_every_steps

class ClassifierRNN(object):
    def __init__(self, nn_config, calc_config):
        # assign the relevant network parameters
        self.num_seqs   = nn_config.num_seqs
        self.num_steps  = nn_config.num_steps
        self.num_units  = nn_config.num_units
        self.num_layers = nn_config.num_layers
        self.num_classes    = nn_config.num_classes
        self.embedding_size = nn_config.embedding_size
        self.vocab_size     = nn_config.vocab_size
        self.use_embeddings = nn_config.use_embeddings
        self.embedding_init = nn_config.embedding_init
        # assign the relevant calculation parameters
        self.batch_size     = calc_config.batch_size
        self.num_epoches    = calc_config.num_epoches
        self.learning_rate  = calc_config.learning_rate
        self.train_keep_prob= calc_config.keep_prob
        self.show_every_steps = calc_config.show_every_steps
        self.save_every_steps = calc_config.save_every_steps
        # create networks models
        tf.reset_default_graph()
        self.build_inputs()
        self.build_rnns()
        self.build_loss()
        self.build_optimizer()
        self.saver = tf.train.Saver()

    def build_inputs(self):
        with tf.name_scope('inputs'):
            self.inputs = tf.placeholder(tf.int32, shape=[None,self.num_seqs],\
                                                                name='inputs')
            self.targets = tf.placeholder(tf.int32, shape=[None, self.num_classes],\
                                                                name='classes')
            self.keep_prob = tf.placeholder(tf.float32,name='keep_prob')
            self.embedding_ph = tf.placeholder(tf.float32, name='embedding_ph')

            if not self.use_embeddings:
                # learn the embedding matrix from scratch
                self.embeddings = tf.Variable(tf.random_uniform([self.vocab_size,
                                self.embedding_size], -0.1, 0.1), name='embedding_false')
                self.rnn_inputs = tf.nn.embedding_lookup(self.embeddings,self.inputs)
            else:
                # load pretrained embeddings through the embedding_ph placeholder
                embeddings = tf.Variable(tf.constant(0.0, shape=[self.vocab_size, self.embedding_size]),
                                         trainable=False, name='embeddings_true')
                self.embeddings = embeddings.assign(self.embedding_ph)
                self.rnn_inputs = tf.nn.embedding_lookup(self.embeddings,self.inputs)
                print('self.rnn_inputs.shape:',self.rnn_inputs.shape)

    def build_rnns(self):
        def get_a_cell(num_units, keep_prob):
            # a single LSTM layer wrapped with output dropout
            rnn_cell = tf.contrib.rnn.BasicLSTMCell(num_units=num_units)
            drop = tf.contrib.rnn.DropoutWrapper(rnn_cell, output_keep_prob=keep_prob)
            return drop
        with tf.name_scope('rnns'):
            self.cell = tf.contrib.rnn.MultiRNNCell(
                [get_a_cell(self.num_units, self.keep_prob) for _ in range(self.num_layers)])
            # derive the batch size from the inputs so that prediction batches
            # need not match the training batch_size
            self.initial_state = self.cell.zero_state(tf.shape(self.inputs)[0], tf.float32)
            self.outputs, self.final_state = tf.nn.dynamic_rnn(
                self.cell, tf.cast(self.rnn_inputs, tf.float32),
                initial_state=self.initial_state)
            print('rnn_outputs', self.outputs.shape)

    def build_loss(self):
        # NOTE: the original post is truncated from this point on; the methods
        # below are a reconstructed sketch of the interface listed in 2.2,
        # not the author's exact code.
        with tf.name_scope('loss'):
            # classify from the RNN output at the last time step
            last_output = self.outputs[:, -1, :]
            softmax_w = tf.Variable(tf.truncated_normal(
                [self.num_units, self.num_classes], stddev=0.1), name='softmax_w')
            softmax_b = tf.Variable(tf.zeros(self.num_classes), name='softmax_b')
            self.logits = tf.matmul(last_output, softmax_w) + softmax_b
            self.predictions = tf.nn.softmax(self.logits, name='predictions')
            self.loss = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(
                logits=self.logits, labels=tf.cast(self.targets, tf.float32)))

    def build_optimizer(self):
        with tf.name_scope('optimizer'):
            self.optimizer = tf.train.AdamOptimizer(self.learning_rate).minimize(self.loss)

    def random_batches(self, data, shuffle=True):
        # yield mini-batches of (x, y) pairs for num_epoches passes over the
        # data, reshuffling at the start of every epoch
        data = np.array(data, dtype=object)
        batches_per_epoch = len(data) // self.batch_size
        for _ in range(self.num_epoches):
            if shuffle:
                data = data[np.random.permutation(len(data))]
            for i in range(batches_per_epoch):
                yield data[i * self.batch_size:(i + 1) * self.batch_size]

    def fit(self, x, y, restart=False):
        # sklearn-style training entry point; pass restart=True to resume
        # training from the most recent checkpoint
        if not os.path.exists('checkpoints'):
            os.makedirs('checkpoints')
        with tf.Session() as sess:
            if restart:
                self.load_model(sess)
            else:
                sess.run(tf.global_variables_initializer())
            step = 0
            start = time.time()
            for batch in self.random_batches(list(zip(x, y))):
                xs, ys = zip(*batch)
                step += 1
                feed = {self.inputs: np.array(list(xs)),
                        self.targets: np.array(list(ys)),
                        self.keep_prob: self.train_keep_prob}
                if self.use_embeddings:
                    # the lookup above runs through the assign op, so the
                    # pretrained matrix must be fed on every run
                    feed[self.embedding_ph] = self.embedding_init
                batch_loss, _ = sess.run([self.loss, self.optimizer], feed_dict=feed)
                if step % self.show_every_steps == 0:
                    print('step {}: loss {:.4f} ({:.1f}s)'.format(
                        step, batch_loss, time.time() - start))
                if step % self.save_every_steps == 0:
                    self.saver.save(sess, 'checkpoints/model.ckpt', global_step=step)
            self.saver.save(sess, 'checkpoints/model.ckpt', global_step=step)

    def load_model(self, sess):
        # restore the most recent checkpoint for resuming or predicting
        checkpoint = tf.train.latest_checkpoint('checkpoints')
        self.saver.restore(sess, checkpoint)
        print('model restored from {}'.format(checkpoint))

    def predict(self, x):
        # return class probabilities for a batch of int-encoded sequences
        with tf.Session() as sess:
            self.load_model(sess)
            feed = {self.inputs: x, self.keep_prob: 1.0}
            if self.use_embeddings:
                feed[self.embedding_ph] = self.embedding_init
            return sess.run(self.predictions, feed_dict=feed)

    def predict_accuracy(self, x, y):
        # accuracy of the restored model against one-hot labels y
        probs = self.predict(x)
        return np.mean(np.argmax(probs, axis=1) == np.argmax(y, axis=1))
```
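
To make the three-step, sklearn-style workflow concrete, here is a minimal usage sketch. The random placeholder data and the exact fit/predict calls are assumptions matching the reconstructed methods above, not a driver script from the original post:

```
import numpy as np
from ClassifierRNN import NN_config, CALC_config, ClassifierRNN

# placeholder data: 2000 int-encoded sequences plus one-hot labels
x = np.random.randint(0, 10000, size=(2000, 1000))
y = np.eye(8, dtype=np.int32)[np.random.randint(0, 8, size=2000)]

nn_config = NN_config(num_seqs=1000, num_classes=8, vocab_size=10000)
calc_config = CALC_config(batch_size=64, num_epoches=2)

model = ClassifierRNN(nn_config, calc_config)   # step 1: instantiate
model.fit(x, y)                                 # step 2: train
# model.fit(x, y, restart=True)                 # ...or resume a saved run
probs = model.predict(x[:64])                   # step 3: predict
print(probs.shape)                              # (64, 8)
```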
Below is another example of text classification in Python, this time with Keras:

```
import numpy as np
import pandas as pd
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Dense, Embedding, LSTM, SpatialDropout1D
from sklearn.model_selection import train_test_split

# load the data
data = pd.read_csv('data.csv')
data = data[['text', 'label']]
data = data[data['text'].notnull()]

# convert the texts to integer sequences
tokenizer = Tokenizer(num_words=5000, split=" ")
tokenizer.fit_on_texts(data['text'].values)
X = tokenizer.texts_to_sequences(data['text'].values)
X = pad_sequences(X)

# split the data into training and test sets
Y = pd.get_dummies(data['label']).values
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.3, random_state=42)

# build the model
model = Sequential()
model.add(Embedding(5000, 128, input_length=X.shape[1]))
model.add(SpatialDropout1D(0.4))
model.add(LSTM(256, dropout=0.2, recurrent_dropout=0.2))
model.add(Dense(2, activation='softmax'))
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

# train the model
model.fit(X_train, Y_train, validation_data=(X_test, Y_test), epochs=10, batch_size=32)

# evaluate the model on the test set
score, accuracy = model.evaluate(X_test, Y_test, verbose=2)
print("Test Score: ", score)
print("Test Accuracy: ", accuracy)
```

Here `data.csv` is a dataset with a `text` column holding the texts and a `label` column holding the labels. The code trains an LSTM classifier that assigns each text to one of two classes and outputs a probability for each class. You can adjust and modify the code as needed.
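
As a quick usage example (not part of the original snippet), running the trained model on a new piece of text with the same tokenizer might look like this; `new_text` is a hypothetical input:

```
# classify a new text with the fitted tokenizer and model
new_text = ["an example sentence to classify"]   # hypothetical input
seq = tokenizer.texts_to_sequences(new_text)
seq = pad_sequences(seq, maxlen=X.shape[1])      # pad to the training length
probs = model.predict(seq)                       # shape (1, 2)
print("predicted class:", np.argmax(probs, axis=1)[0])
```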
