【python实现基于深度学习的文本情感分类(4)】——pretrained词向量+1dCNN神经网络实现

最新推荐文章于 2023-04-16 20:58:21 发布

UCAS菌皓

最新推荐文章于 2023-04-16 20:58:21 发布

阅读量2.4k

点赞数 3

分类专栏： Python 文章标签： python 人工智能深度学习 keras

本文链接：https://blog.csdn.net/qq_41831350/article/details/87353147

版权

Python 专栏收录该内容

4 篇文章 1 订阅

订阅专栏

主要模块：keras, gensim, pandas
利用训练好的词向量，基于keras使用1dCNN神经网络完成文本情感分类。
keras参考代码链接
 参考博客

准备工作

1.训练好的词向量模型"f.model";
2.把原始数据集中的文本内容分词，存放到data.xlsx中。

import openpyxl
import jieba

# Read the source workbook and take its first worksheet.
fname = "classfied_data.xlsx"
excelBook = openpyxl.load_workbook(r'E:\python\Deep_Text_Classfication\data\classfied_data.xlsx')
# `sheetnames` and item access replace the deprecated
# get_sheet_names() / get_sheet_by_name() helpers.
sheetNames = excelBook.sheetnames
sheet1 = excelBook[sheetNames[0]]

# Create the output workbook.
myBook = openpyxl.Workbook()
mySheet = myBook.active

# Header row of the output sheet.
mySheet['A1'] = 'content'
mySheet['B1'] = 'label'

# Punctuation marks stripped from the segmented text, removed in one pass.
PUNCTUATION_TABLE = str.maketrans('', '', '，。？！“”：…（）—《》、‘’')

# Row 1 of the source sheet is skipped (the output gets its own header
# above); data rows start at row 2 and are written to the same row number.
rows = sheet1.max_row
for row in range(2, rows + 1):
    try:
        # Columns 1 and 2 are concatenated into one text.
        text = sheet1.cell(row=row, column=1).value + sheet1.cell(row=row, column=2).value
    except TypeError:
        # An empty cell yields None and cannot be concatenated; keep the
        # original best-effort fallback of an empty text for that row.
        text = ''
    new_text = jieba.cut(text, cut_all=False)  # accurate (non-full) mode
    text_out = ' '.join(new_text).translate(PUNCTUATION_TABLE)  # strip punctuation
    label = sheet1.cell(row=row, column=3).value
    mySheet.cell(row=row, column=1).value = text_out
    mySheet.cell(row=row, column=2).value = label

myBook.save('data.xlsx')

文本情感分类

import os
import sys
import numpy as np
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical
from keras.layers import Dense, Input, Flatten
from keras.layers import Conv1D, MaxPooling1D, Embedding
from keras.models import Model
import gensim
import pandas as pd

MAX_SEQUENCE_LENGTH = 1000 # number of tokens kept per article (pad/truncate length)
MAX_NB_WORDS = 10000 # vocabulary size cap: 10,000 words
EMBEDDING_DIM = 200 # dimensionality of the word vectors
VALIDATION_SPLIT = 0.2 # fraction of the samples held out for validation

第一步：得到一份字典

# STEP 1
# Build the word -> vector lookup (embeddings_index) from the pretrained
# Word2Vec model, keeping only words whose vocabulary index is below
# MAX_NB_WORDS (presumably the most frequent ones -- verify against how the
# model was trained).
embeddings_index = {}

print('Indexing word vectors.')
if os.path.exists('f.model'):     # make sure the model file is present
    model = gensim.models.Word2Vec.load('f.model')
else:
    print('Model not found.')
    # Stop with a clear error; carrying on would only fail a line later
    # with a NameError because `model` was never assigned.
    raise FileNotFoundError('f.model')
word_vectors = model.wv
# gensim >= 4 exposes the vocabulary as `key_to_index` (word -> index);
# gensim 3 exposes `vocab` (word -> object carrying `.index`).
if hasattr(word_vectors, 'key_to_index'):
    word_ranks = word_vectors.key_to_index.items()
else:
    word_ranks = ((word, vocab_obj.index)
                  for word, vocab_obj in word_vectors.vocab.items())
for word, rank in word_ranks:
    if int(rank) < MAX_NB_WORDS:
        embeddings_index[word] = word_vectors[word]
del model, word_vectors, word_ranks # drop the gensim model to free memory
print('Found %s word vectors.' % len(embeddings_index))

# print out:
# Indexing word vectors.
# Found 10000 word vectors.

第二步：获取训练文本和标签

训练数据有两列，一列是包括标题和文章内容的content，一列是文本情感标签label（1或-1）。之前的准备工作已将content分好词。

# STEP 2
print('Processing text dataset')

# Load the segmented corpus written by the preparation step.
data = pd.read_excel('data.xlsx')

# Extract the text and label columns.
# Empty cells are read back as NaN (a float), which the tokenizer cannot
# process; normalise every entry to a string, with '' for missing text.
texts = data['content'].fillna('').astype(str).values.tolist()  # list of text samples
labels = data['label'].values.tolist()  # list of label ids
del data

print('Found %s texts.' % len(texts))

# print out
# Processing text dataset
# Found 6946 texts.

第三步：准备文本（vectorize the text samples into a 2D integer tensor）

# STEP 3
# Fit a tokenizer on the corpus, then turn every text into a sequence of
# word indices using the vocabulary it learned.
tokenizer = Tokenizer(num_words=MAX_NB_WORDS)
tokenizer.fit_on_texts(texts)
sequences = tokenizer.texts_to_sequences(texts)

word_index = tokenizer.word_index
print('Found %s unique tokens.' % len(word_index))

# Bring every article to the same length.
data = pad_sequences(sequences, maxlen=MAX_SEQUENCE_LENGTH)

# Map label -1 to class 0 (other labels are kept as they are), then
# one-hot encode.
labels = np.asarray(labels)
labels = to_categorical(np.where(labels == -1, 0, labels))
print('Shape of data tensor:', data.shape)
print('Shape of label tensor:', labels.shape)

# print out
# Found 100323 unique tokens.
# Shape of data tensor: (6946, 1000)
# Shape of label tensor: (6946, 2) # two sentiment classes

第四步：切割数据为训练集和测试集

# STEP 4
# Prepare the training and validation sets.

# Shuffle samples and labels with the same permutation.
indices = np.arange(data.shape[0])
np.random.shuffle(indices)
data = data[indices]
labels = labels[indices]
num_validation_samples = int(VALIDATION_SPLIT * data.shape[0])

# Split at an explicit boundary index. Slicing with -num_validation_samples
# breaks when that count is 0 (tiny datasets): data[:-0] is empty and
# data[-0:] is the whole array, which would swap the two sets.
split_at = data.shape[0] - num_validation_samples
x_train = data[:split_at]
y_train = labels[:split_at]
x_val = data[split_at:]
y_val = labels[split_at:]

第五步：准备embedding matrix

# STEP 5
# Prepare the embedding layer.

# Tokenizer word indices start at 1, so the matrix needs one row more than
# the largest index in use. Without the "+ 1", a vocabulary smaller than
# MAX_NB_WORDS makes the last word's index fall outside the matrix.
num_words = min(MAX_NB_WORDS, len(word_index) + 1)
embedding_matrix = np.zeros((num_words, EMBEDDING_DIM))
for word, i in word_index.items():
    if i >= MAX_NB_WORDS:
        continue
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        # Words found in the pretrained dictionary get their vector;
        # words missing from it keep the all-zero row.
        embedding_matrix[i] = embedding_vector

# Load the pretrained word vectors into the embedding layer.
# trainable=False keeps the vectors fixed during training.
embedding_layer = Embedding(num_words,
                            EMBEDDING_DIM,
                            weights=[embedding_matrix],
                            input_length=MAX_SEQUENCE_LENGTH,
                            trainable=False)

第六步：搭建1dcnn并训练模型

# STEP 6
# Build and train the model: a 1D convolutional network with MaxPooling1D.
sequence_input = Input(shape=(MAX_SEQUENCE_LENGTH,), dtype='int32')
embedded_sequences = embedding_layer(sequence_input)

# Three Conv1D + MaxPooling1D stages that differ only in pool size.
# NOTE(review): the final pool size of 35 looks tuned to
# MAX_SEQUENCE_LENGTH = 1000 -- re-check it if that constant changes.
x = embedded_sequences
for pool_size in (5, 5, 35):
    x = Conv1D(filters=128, kernel_size=5, activation='relu')(x)
    x = MaxPooling1D(pool_size=pool_size)(x)

# Classification head: flatten, one hidden layer, 2-way softmax.
x = Flatten()(x)
x = Dense(128, activation='relu')(x)
preds = Dense(2, activation='softmax')(x)

model = Model(sequence_input, preds)
model.compile(
    loss='categorical_crossentropy',
    optimizer='adam',
    metrics=['acc'],
)

# Lower `epochs` for a shorter training run.
model.fit(
    x_train,
    y_train,
    validation_data=(x_val, y_val),
    epochs=10,
    batch_size=128,
)

# Evaluated on the validation split.
score, acc = model.evaluate(x_val, y_val)
print('Test score:',score)
print('Test accuracy:', acc)

model.save('1dcnn_model.h5')

训练结果

1.程序运行时输出:
在这里插入图片描述
2.训练过程输出：

3.训练评估结果:

总结

训练集准确率能达到98%，验证集（即文中所说的测试集）准确率能达到90%。

UCAS菌皓

关注

3
点赞
踩
26

收藏

觉得还不错? 一键收藏
11
评论
【python实现基于深度学习的文本情感分类(4)】——pretained词向量+1dCNN神经网络实现

主要模块：keras, gensim, pandas利用训练好的词向量，基于keras使用1dCNN神经网络完成文本情感分类。keras参考代码链接准备工作1.训练好的词向量模型"f.model";2.把原始数据集中的文本内容分词，存放到data.xlsx中。import openpyxlimport jieba# read origin excelfname = "class...
复制链接

扫一扫

专栏目录