Chinese NLP -- Sentiment Classification of Stock Forum (Guba) Data Based on TextCNN

Using comments on ZTE (中兴通讯) posted in the stock forum (Guba) in the run-up to its share-unlock date, against the backdrop of the China-US trade war, the sentiment is labelled by hand and then classified with a CNN. Negative, neutral and positive are encoded as 0, 1 and 2 respectively.

import pandas as pd
import numpy as np
import jieba
import random
import keras
from keras.preprocessing import sequence
from keras.models import Sequential
from keras.layers import Dense, Dropout, Activation
from keras.layers import Embedding
from keras.layers import Conv1D, GlobalMaxPooling1D
from keras.datasets import imdb  # IMDB movie-review dataset (imported here but not used below)
from keras.models import model_from_json
from keras.utils import np_utils
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer

# The whole text-classification workflow consists of the following 6 steps:
# Chinese corpus;
# word segmentation;
# cleaning rules;
# feature vectors;
# model building;
# sentiment analysis.

# Load the stop-word list and the corpus
'''
How is an uneven class distribution usually handled? Consider the following:
resampling, i.e. over-sampling, under-sampling or a combination of both;
changing the classification algorithm, e.g. weighting the classes differently so that the model pays more attention to the minority classes;
choosing sensible evaluation metrics;
cost-sensitive learning.
'''
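As a minimal, self-contained sketch of two of these options (over-sampling with pandas and class weighting in Keras), using a made-up toy DataFrame; the real data is loaded below:

# Toy illustration only: the DataFrame, its rows and the resulting weights are assumptions.
toy = pd.DataFrame({'content': ['利好', '观望', '跌停', '涨停', '横盘'],
                    'label':   [2, 1, 0, 2, 1]})
counts = toy['label'].value_counts()
max_n = counts.max()
# over-sample every class up to the size of the largest class
balanced = pd.concat([toy[toy['label'] == lab].sample(max_n, replace=True, random_state=0)
                      for lab in counts.index])
# alternatively, keep the data as-is and weight each class inversely to its frequency,
# then pass class_weight=weights to model.fit(...)
weights = {lab: max_n / n for lab, n in counts.items()}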
dir = "./datas/"
# Without quoting set, pandas strips the double quotes by default and keeps only what is inside them; quoting=3 (csv.QUOTE_NONE) reads the content as-is
stopwords = pd.read_csv(dir +"stopwords.txt", index_col=False, quoting=3, sep="\t", names=['stopword'], encoding='utf-8')
stopwords = stopwords['stopword'].values
data_1 = pd.read_csv(dir + "data.csv", encoding='gbk')
# drop rows whose content contains missing values
data_1.dropna(inplace=True)
print(data_1.head())
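Since the dataset itself is not distributed (see the note at the end of the article), data.csv is assumed to contain at least a content column with the raw comment text and a label column with the 0/1/2 sentiment tag. A hypothetical stand-in with that layout could be written like this (kept commented out so it does not clobber a real data.csv):

# Hypothetical stand-in for data.csv; the rows are made up, only the column layout matters.
# demo_df = pd.DataFrame({'content': ['解禁压力大,看空', '贸易战缓和,看多'],
#                         'label':   [0, 2]})
# demo_df.to_csv(dir + "data.csv", index=False, encoding='gbk')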

# Extract the text and the labels
# pull out the corpora labelled 0 (negative), 1 (neutral) and 2 (positive)
data_label_0 = data_1.loc[data_1['label'] == 0, :]
data_label_1 = data_1.loc[data_1['label'] == 1, :]
data_label_2 = data_1.loc[data_1['label'] == 2, :]

# Define the segmentation / cleaning function
def preprocess_text(content_lines, sentences, category):
    for line in content_lines:
        try:
            segs = jieba.lcut(line)                                  # segment with jieba
            segs = list(filter(lambda x: len(x) > 1, segs))          # drop single-character tokens
            segs = [v for v in segs if not str(v).isdigit()]         # drop pure digits
            segs = list(filter(lambda x: x.strip(), segs))           # drop whitespace-only tokens
            segs = list(filter(lambda x: x not in stopwords, segs))  # drop stop words
            temp = " ".join(segs)
            if len(temp) > 1:
                sentences.append((temp, category))
        except Exception:
            print(line)
            continue
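A quick sanity check on a single made-up comment (the input sentence is an assumption; the exact tokens depend on jieba's dictionary and on the stop-word list):

demo = []
preprocess_text(["中兴通讯明天解禁,是利好还是利空?"], demo, 1)
print(demo)  # e.g. [('中兴通讯 明天 解禁 利好 利空', 1)]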

# Generate the segmented training data and shuffle it so the classes are well mixed
# fetch the raw text of each class
data_label_0_content = data_label_0['content'].values.tolist()
data_label_1_content = data_label_1['content'].values.tolist()
data_label_2_content = data_label_2['content'].values.tolist()
# build the training samples
sentences = []
preprocess_text(data_label_0_content, sentences, 0)
preprocess_text(data_label_1_content, sentences, 1)
preprocess_text(data_label_2_content, sentences, 2)
# shuffle the order to get a more reliable training set
random.shuffle(sentences)
print(sentences)

# Split the dataset into a training set and a test set at a 7:3 ratio,
# using sklearn's built-in train_test_split.
x, y = zip(*sentences)
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.3, random_state=1234)
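Since the classes are imbalanced (see the note above), it can help to stratify the split so that both sets keep the same 0/1/2 proportions; a minimal variant of the call above:

# Stratified variant (an alternative to the plain split above)
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.3, random_state=1234, stratify=y)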

# Feature extraction: bag-of-words features for the text
'''
CountVectorizer converts a collection of text documents into a sparse matrix of token counts.
'''
# Convert the words in the text into a term-frequency matrix,
# where element a[i][j] is the frequency of term j in document i.
# CountVectorizer is a common text feature-extraction method: for each training document it only
# considers how often each term occurs in that document; fit_transform builds the vocabulary
# and counts the occurrences of every term.
# analyzer: usually left at the default; can be a string such as 'word', 'char' or 'char_wb',
#   or a callable.
# max_features: None by default; if set to an int, terms are ranked by frequency and only the
#   top max_features are kept as the vocabulary.
vec = CountVectorizer(
    analyzer='word',    # tokenise on whitespace-separated words (the text was pre-segmented by jieba)
    max_features=4000,  # keep only the 4000 most frequent terms
)
vec.fit(x_train)
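To see what the vectorizer produces, here is a tiny self-contained illustration on a made-up mini-corpus (not the actual data):

# Toy CountVectorizer example; documents and expected output are assumptions
toy_vec = CountVectorizer(analyzer='word')
toy_docs = ["利好 解禁 利好", "解禁 跌停"]
toy_matrix = toy_vec.fit_transform(toy_docs)
vocab = (toy_vec.get_feature_names_out() if hasattr(toy_vec, 'get_feature_names_out')
         else toy_vec.get_feature_names())  # API name differs across sklearn versions
print(vocab)                 # ['利好', '解禁', '跌停']
print(toy_matrix.toarray())  # [[2 1 0], [0 1 1]]: term counts per document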

# Define the model hyper-parameters
# max_features is the Embedding layer's input_dim, so every value fed into the network must be < max_features
max_features = 1001
maxlen = 100
batch_size = 32
embedding_dims = 50
filters = 250
kernel_size = 3
hidden_dims = 250
epochs = 5
nclasses = 3

# Convert the input features to arrays and process the labels
x_train = vec.transform(x_train)
x_test = vec.transform(x_test)
x_train = x_train.toarray()
x_test = x_test.toarray()
# one-hot encode the labels
y_train = np_utils.to_categorical(y_train, nclasses)
y_test = np_utils.to_categorical(y_test, nclasses)
x_train = sequence.pad_sequences(x_train, maxlen=maxlen)
x_test = sequence.pad_sequences(x_test, maxlen=maxlen)
print('x_train shape:', x_train.shape)
print(x_train[:3])
print(y_train[:3])
print('x_test shape:', x_test.shape)
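Note that what goes into the network here are word counts from the CountVectorizer, truncated/padded to maxlen columns by pad_sequences, rather than word-index sequences; the Embedding layer only requires every value to be an integer smaller than max_features. A small sanity check on the arrays built above:

# Each value is looked up in a max_features-row embedding table, so counts must stay below it
assert x_train.max() < max_features and x_test.max() < max_features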

# Define a callback class that records the loss/accuracy history and plots the curves
class LossHistory(keras.callbacks.Callback):
    def on_train_begin(self, logs={}):
        self.losses = {'batch': [], 'epoch': []}
        self.accuracy = {'batch': [], 'epoch': []}
        self.val_loss = {'batch': [], 'epoch': []}
        self.val_acc = {'batch': [], 'epoch': []}

    def on_batch_end(self, batch, logs={}):
        self.losses['batch'].append(logs.get('loss'))
        # with older Keras versions the metric keys are 'acc'/'val_acc' instead of 'accuracy'/'val_accuracy'
        self.accuracy['batch'].append(logs.get('accuracy'))
        # validation metrics are only computed at the end of an epoch, so these stay None per batch
        # (they are only plotted for loss_type == 'epoch' below)
        self.val_loss['batch'].append(logs.get('val_loss'))
        self.val_acc['batch'].append(logs.get('val_accuracy'))

    def on_epoch_end(self, epoch, logs={}):
        self.losses['epoch'].append(logs.get('loss'))
        self.accuracy['epoch'].append(logs.get('accuracy'))
        self.val_loss['epoch'].append(logs.get('val_loss'))
        self.val_acc['epoch'].append(logs.get('val_accuracy'))

    def loss_plot(self, loss_type):
        iters = range(len(self.losses[loss_type]))
        plt.figure()
        # acc
        # print(self.accuracy[loss_type])
        plt.plot(iters, self.accuracy[loss_type], 'r', label='train acc')
        # loss
        # print(self.losses[loss_type])
        plt.plot(iters, self.losses[loss_type], 'g', label='train loss')
        if loss_type == 'epoch':
            # val_acc
            # print(self.val_acc[loss_type])
            plt.plot(iters, self.val_acc[loss_type], 'b', label='val acc')
            # val_loss
            # print(self.val_loss[loss_type])
            plt.plot(iters, self.val_loss[loss_type], 'k', label='val loss')
        plt.grid(True)
        plt.xlabel(loss_type)
        plt.ylabel('acc-loss')
        plt.legend(loc="upper right")
        plt.show()

    def on_train_end(self, logs={}):
        self.loss_plot('batch')
        self.loss_plot('epoch')

# Instantiate the callback above, pass it to model.fit, and train the model
history = LossHistory()

print('Build model...')
model = Sequential()
model.add(Embedding(max_features,
                    embedding_dims,
                    input_length=maxlen))
model.add(Dropout(0.5))
model.add(Conv1D(filters,
                 kernel_size,
                 padding='valid',
                 activation='relu',
                 strides=1))
model.add(GlobalMaxPooling1D())
model.add(Dense(hidden_dims))
model.add(Dropout(0.5))
model.add(Activation('relu'))
model.add(Dense(nclasses))
model.add(Activation('softmax'))
model.compile(loss='categorical_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])
model.summary()
model.fit(x_train, y_train,
          batch_size=batch_size,
          epochs=epochs,
          validation_data=(x_test, y_test),
          callbacks=[history])
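After training, the test set can be scored and the predicted probabilities mapped back to the 0/1/2 sentiment labels; the sketch below also persists the model using the model_from_json import from the top of the script (the file names are assumptions):

# Evaluate on the held-out set and inspect a few predictions
score, acc = model.evaluate(x_test, y_test, batch_size=batch_size)
print('Test loss:', score, 'Test accuracy:', acc)
pred = model.predict(x_test)
print('predicted labels of the first 5 test samples:', np.argmax(pred[:5], axis=1))

# Save the architecture as JSON and the weights as HDF5 (hypothetical file names)
with open('textcnn_model.json', 'w') as f:
    f.write(model.to_json())
model.save_weights('textcnn_weights.h5')
# to reload later: model = model_from_json(open('textcnn_model.json').read()); model.load_weights('textcnn_weights.h5')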

Original article (the dataset does not seem to be available, so I built random data myself by following the original):
https://soyoger.blog.csdn.net/article/details/108729401
