An LSTM Text Classification Example

The data to be classified is a text file that has already been word-segmented: each line represents one article. The segmentation is fairly coarse and no stop-word filtering was applied; filtering out stop words should noticeably improve the results.
1. Load the data

# -*- coding: utf-8 -*-

import sys
reload(sys)
sys.setdefaultencoding('utf-8')



def loadData(fileName):  # read the segmented data into a list of lists; each inner list holds the words of one sentence/article
    f = open(fileName, 'r')
    senLists = []
    for row in f.readlines():
        senList = row.split(' ')
        sens = []
        for word in senList:
            if word.strip() not in ['', ',', '。', ';', '!', '#', '?', '“', '”', '(', ')', '1', '2', '3', ':', '.', '《', '》', '【', '】']:
                sens.append(word.strip())  # strip trailing whitespace/newlines before storing
        senLists.append(sens)
    f.close()
    print 'Loaded', len(senLists), 'sentence lists'
    return senLists
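
For reference, here is a minimal sketch of the expected input format (the sample lines below are made-up illustrations, not taken from the real data set): sen_cut.txt holds one space-separated, pre-segmented article per line, and labels.txt holds the matching 0/1 label on the corresponding line.

# sen_cut.txt -- one segmented article per line, words separated by spaces (made-up sample):
#   今天 天气 很好 , 适合 出去 散步
#   股市 大跌 , 投资者 情绪 低落
# labels.txt -- one 0/1 label per line, aligned with sen_cut.txt (made-up sample):
#   1
#   0

from LoadData import loadData
sentences = loadData('./sen_cut.txt')
print ' '.join(sentences[0])  # first article, punctuation already filtered out by loadData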

2. Train word vectors with word2vec

# -*- coding: utf-8 -*-

import sys
from gensim.models import Word2Vec
from LoadData import loadData
import pickle
from gensim.corpora.dictionary import Dictionary
reload(sys)
sys.setdefaultencoding('utf-8')


def saveWordIndex(model):
    word2vec_dict = Dictionary()
    word2vec_dict.doc2bow(model.wv.vocab.keys(), allow_update=True)
    w2v_index = {w: i + 1 for i, w in word2vec_dict.items()}  # word-to-index dict, e.g. {'中国': 1}; indices start at 1, 0 is left free and is later used for words outside the dict
    print w2v_index.keys()[:20]
    w2v_vec = {w: model[w.encode('utf-8')] for w in w2v_index.keys()}  # word-to-vector dict, e.g. {'中国': [0.01, 0.25, ...]}
    pickle.dump(w2v_index, open('./w2v_index.pkl', 'wb'))
    pickle.dump(w2v_vec, open('./w2v_vec.pkl', 'wb'))


def trainWord2Vec():  # train the word2vec model and save it together with the index/vector dicts
    sentences = loadData(r'./sen_cut.txt')
    model = Word2Vec(sentences=sentences, size=100, min_count=8, window=5)
    model.save('./word2vec.model')
    # model = Word2Vec.load('./word2vec.model')
    saveWordIndex(model=model)


if __name__=='__main__':
    trainWord2Vec()
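
After the script has run, the saved model and index can be sanity-checked in a separate session. The snippet below is only an illustrative sketch using the same gensim API as above; the query word '中国' is an arbitrary example and must actually appear in the trained vocabulary.

# -*- coding: utf-8 -*-
from gensim.models import Word2Vec
import pickle

model = Word2Vec.load('./word2vec.model')
w2v_index = pickle.load(open('./w2v_index.pkl', 'rb'))
print len(w2v_index), 'words kept by word2vec (min_count=8)'

# nearest neighbours of an example word (the word must exist in the vocabulary)
for word, sim in model.wv.most_similar('中国', topn=5):
    print word, sim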

3. Train the LSTM model

# -*- coding: utf-8 -*-

import sys
from gensim.models import Word2Vec
from LoadData import loadData
import numpy as np
from keras.preprocessing import sequence
from keras.models import Sequential
from keras.layers import Dropout,Dense,Embedding,LSTM,Activation
import pickle
from sklearn.model_selection import train_test_split
from gensim.corpora.dictionary import Dictionary
reload(sys)
sys.setdefaultencoding('utf-8')




# hyper-parameter settings
vocab_dim = 100  # word vector dimension (must match the word2vec size above)
maxlen = 150  # maximum number of words kept per article
batch_size = 120
n_epoch = 5
input_length = 150

def getLabels():
    labels = []
    for label in open('./labels.txt', 'r'):
        labels.append(int(label.strip()))  # one 0/1 label per line; cast to int so binary_crossentropy gets numeric targets
    return labels

def text2index(index_dic, sentences):
    """
    Map words to their integer indices, e.g. [['中国','安徽','合肥'], ['安徽财经大学','今天','天气','很好']]
    becomes [[1, 5, 30], [2, 3, 105, 89]]. Words missing from the index dict are mapped to 0.
    """
    new_sentences = []
    for sen in sentences:
        new_sen = []
        for word in sen:
            try:
                new_sen.append(index_dic[word])
            except KeyError:
                new_sen.append(0)  # out-of-vocabulary word
        new_sentences.append(new_sen)
    return new_sentences
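
# Quick illustration of text2index (the toy dictionary below is made up, not the real w2v_index.pkl):
#   text2index({'中国': 1, '安徽': 5, '合肥': 30}, [['中国', '合肥', '未知词']])
#   returns [[1, 30, 0]] -- the unseen word falls back to index 0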


def train_lstm(p_n_symbols, p_embedding_weights, p_X_train, p_y_train, p_X_test, p_y_test):
    """
    :param p_n_symbols: number of words kept by word2vec, plus one for the reserved index 0
    :param p_embedding_weights: matrix mapping each word index to its word vector
    :param p_X_train: training X
    :param p_y_train: training y
    :param p_X_test: test X
    :param p_y_test: test y
    :return:
    """
    print 'Building the model...'
    model = Sequential()
    model.add(Embedding(input_dim=p_n_symbols,
                        output_dim=vocab_dim,
                        mask_zero=True,  # index 0 (padding and out-of-vocabulary words) is masked out
                        weights=[p_embedding_weights],
                        input_length=input_length, trainable=False))  # embeddings stay frozen to the word2vec vectors
    model.add(LSTM(units=50))
    model.add(Dropout(0.3))
    model.add(Dense(1))
    model.add(Activation('sigmoid'))  # single sigmoid unit: binary classification

    print 'Compiling the model...'
    model.compile(loss='binary_crossentropy',
                  optimizer='adam',
                  metrics=['accuracy'])

    print 'Training...'
    model.fit(p_X_train, p_y_train, batch_size=batch_size, epochs=n_epoch,
              validation_data=(p_X_test, p_y_test), verbose=1)

    print 'Evaluating...'
    score, acc = model.evaluate(p_X_test, p_y_test, batch_size=batch_size)
    print 'Test score:', score
    print 'Test accuracy:', acc


def createModel():
    index_dict = pickle.load(open('./w2v_index.pkl', 'rb'))
    vec_dict = pickle.load(open('./w2v_vec.pkl', 'rb'))
    n_words = len(index_dict.keys())
    print n_words
    vec_matrix = np.zeros((n_words + 1, vocab_dim))  # row 0 stays all-zero for padding/OOV words
    for k, i in index_dict.items():  # fill each word's row with its word2vec vector
        try:
            vec_matrix[i, :] = vec_dict[k]
        except:
            print k, i
            print vec_dict[k]
            exit(1)
    labels = getLabels()
    sentences = loadData('./sen_cut.txt')
    X_train, X_test, y_train, y_test = train_test_split(sentences, labels, test_size=0.2)
    X_train = text2index(index_dict, X_train)
    X_test = text2index(index_dict, X_test)
    print 'Training set shape: ', np.shape(X_train)
    print 'Test set shape: ', np.shape(X_test)
    y_train = np.array(y_train)
    y_test = np.array(y_test)
    print 'Pad sequences (samples x time)'
    X_train = sequence.pad_sequences(X_train, maxlen=maxlen)  # pad short articles with 0, truncate long ones
    X_test = sequence.pad_sequences(X_test, maxlen=maxlen)
    print 'Training set shape: ', np.shape(X_train)
    print 'Test set shape: ', np.shape(X_test)
    train_lstm(n_words + 1, vec_matrix, X_train, y_train, X_test, y_test)


if __name__=="__main__":
    createModel()
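
Once training has finished, the same preprocessing pipeline can be reused to score a new article. The sketch below is only illustrative: it assumes train_lstm is extended to persist the fitted model (for example with model.save('./lstm_model.h5'), which the code above does not do), that text2index from this script is available, and the file name and sample sentence are made up.

# -*- coding: utf-8 -*-
from keras.models import load_model
from keras.preprocessing import sequence
import pickle

model = load_model('./lstm_model.h5')  # hypothetical path, saved at the end of train_lstm
index_dict = pickle.load(open('./w2v_index.pkl', 'rb'))

new_doc = ['今天', '天气', '很好']  # an already-segmented article (made-up example)
x = text2index(index_dict, [new_doc])  # reuse text2index defined above
x = sequence.pad_sequences(x, maxlen=150)
print model.predict(x)  # probability of the positive class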