CNN Application: Text Sentiment Prediction

A sample of the data is shown below:

UserName,ScreenName,Location,TweetAt,OriginalTweet,Sentiment

3799,48751,London,16-03-2020,@MeNyrbie @Phil_Gahan @Chrisitv https://t.co/iFz9FAn2Pa and https://t.co/xX6ghGFzCC and https://t.co/I2NlzdxNo8,Neutral

3800,48752,UK,16-03-2020,advice Talk to your neighbours family to exchange phone numbers create contact list with phone numbers of neighbours schools employer chemist GP set up online shopping accounts if poss adequate supplies of regular meds but not over order,Positive

3801,48753,Vagabonds,16-03-2020,"Coronavirus Australia: Woolworths to give elderly, disabled dedicated shopping hours amid COVID-19 outbreak https://t.co/bInCA9Vp8P",Positive

3802,48754,,16-03-2020,"My food stock is not the only one which is empty...

... ... 

The data is read as follows:

import pandas as pd
import torch
from torch import nn
from gensim.models import Word2Vec

train_path = r'./train.csv'
test_path = r'./test.csv'

# Load the training data

# Convert sentiment labels to numeric labels
def label_numeric_transfer(labels):
    label_names = ['Neutral', 'Positive', 'Negative', 'Extremely Positive', 'Extremely Negative']

    numeric_labels = []
    for label in labels:
        if label not in label_names:
            print('WARNING! Invalid label named {}'.format(label))
            continue
        numeric_labels.append(label_names.index(label))

    return numeric_labels

def read_file(file_name=r'./train.csv'):
    all_data = pd.read_csv(file_name, engine='python')

    # Text features
    comment_data = all_data['OriginalTweet'].values.tolist()

    # Label data
    labels = all_data.Sentiment.values.tolist()

    # Split each tweet on whitespace and drop empty tokens
    tokens = [element.strip().split(' ') for element in comment_data]

    new_tokens = []
    for token in tokens:
        new_tokens.append([x.strip() for x in token if x.strip() != ''])
    labels = label_numeric_transfer(labels)

    return new_tokens, labels

train_tokens, train_labels = read_file(file_name=train_path)

# Load the test data (the function must be defined before it is called)

def read_test_file(file_name=r'./test.csv'):
    all_data = pd.read_csv(file_name, engine='python')
    comment_data = all_data['OriginalTweet'].values.tolist()

    tokens = [element.strip().split(' ') for element in comment_data]
    new_tokens = []
    for token in tokens:
        new_tokens.append([x.strip() for x in token if x.strip() != ''])

    return new_tokens

test_tokens = read_test_file(file_name=test_path)

# Pad or truncate every text to the same length; the CNN model requires fixed-length inputs
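# Note: truncation_and_padding is not defined in the original post; a minimal
# sketch (an assumption, not necessarily the author's implementation) follows.
def truncation_and_padding(tokens_list, pad_token, seq_len):
    padded = []
    for tokens in tokens_list:
        tokens = tokens[:seq_len]                                      # truncate long sequences
        padded.append(tokens + [pad_token] * (seq_len - len(tokens)))  # pad short ones
    return padded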
train_padding_tokens = truncation_and_padding(train_tokens, '<pad>', 64)
test_padding_tokens = truncation_and_padding(test_tokens, '<pad>', 64)

# Vectorize the tokens: train Word2Vec on train + test so every token is in the
# vocabulary (gensim 4.x API; the default vector_size of 100 matches embed_size below)
all_padding_tokens = train_padding_tokens + test_padding_tokens
word2vec = Word2Vec(all_padding_tokens, min_count=1)

# Map every token of each padded sentence to its Word2Vec vocabulary index
def corpus(padding_token, word2vec):
    corpus_tokens = []
    
    for sentence in padding_token:
        corpus_tokens.append([word2vec.wv.get_index(token) for token in sentence])
    
    return corpus_tokens

train_corpus_tokens = corpus(train_padding_tokens, word2vec)

# Split into training and validation sets (80/20)
split_len = int(len(train_corpus_tokens) * 0.8)

val_corpus_tokens = train_corpus_tokens[split_len:]
val_labels = train_labels[split_len:]
train_corpus_tokens = train_corpus_tokens[:split_len]
train_labels = train_labels[:split_len]

test_corpus_tokens = corpus(test_padding_tokens, word2vec)
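# Sanity check (illustrative): a corpus index should map back to its original token
tok = train_padding_tokens[0][0]
assert word2vec.wv.index_to_key[word2vec.wv.get_index(tok)] == tok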

The network model is designed as follows:

class TextCNN(nn.Module):
    def __init__(self, vocab_size, embed_size, kernel_sizes, num_channels, embed, **kwargs):
        super(TextCNN, self).__init__(**kwargs)

        # Trainable embedding layer initialized with the word2vec vectors.
        # Note: nn.Embedding.from_pretrained is a class method that returns a new
        # module; calling it on an existing instance does not load weights in place.
        self.embedding = nn.Embedding.from_pretrained(torch.tensor(embed), freeze=False)

        # A second, frozen copy of the same pretrained embeddings
        self.constant_embedding = nn.Embedding.from_pretrained(torch.tensor(embed), freeze=True)

        # Dropout to reduce overfitting, and a fully connected decoder
        # mapping onto the 5 sentiment classes
        self.dropout = nn.Dropout(0.5)
        self.decoder = nn.Linear(sum(num_channels), 5)

        # The max-over-time pooling layer has no parameters,
        # so one instance can be shared by all convolution branches
        self.pool = nn.AdaptiveMaxPool1d(1)
        self.relu = nn.ReLU()

        # Create several one-dimensional convolutional layers
        self.convs = nn.ModuleList()

        # Each branch extracts n-gram features of a different width
        for c, k in zip(num_channels, kernel_sizes):
            # in_channels is 2 * embed_size (two embeddings concatenated),
            # out_channels is c, kernel width is k
            self.convs.append(nn.Conv1d(2 * embed_size, c, k))

    def forward(self, inputs):
        # Concatenate the two embedding layers along the vector dimension.
        # Each embedding output has shape (batch size, token count, vector dim).
        embeddings = torch.cat((
            self.embedding(inputs), self.constant_embedding(inputs)), dim=2)

        # Rearrange the tensor so channels form the 2nd dimension, as nn.Conv1d expects
        embeddings = embeddings.permute(0, 2, 1)

        # After max-over-time pooling, each branch yields (batch size, channels, 1);
        # drop the last dimension and concatenate along the channel dimension
        encoding = torch.cat([
            torch.squeeze(self.relu(self.pool(conv(embeddings))), dim=-1)
            for conv in self.convs], dim=1)

        # The fully connected layer outputs one score (logit) per class
        outputs = self.decoder(self.dropout(encoding))
        return outputs

Model initialization:

embed_size, kernel_sizes, nums_channels = 100, [3, 4, 5], [100, 100, 100]
net = TextCNN(len(word2vec.wv.vectors), embed_size, kernel_sizes, nums_channels, word2vec.wv.vectors)
def init_weights(m):
    if type(m) in (nn.Linear, nn.Conv1d):
        nn.init.xavier_uniform_(m.weight)
net.apply(init_weights)
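# Quick shape check (illustrative): a dummy batch of 8 token-index sequences of
# length 64 should produce one score per sentiment class
dummy = torch.randint(0, len(word2vec.wv.vectors), (8, 64))
print(net(dummy).shape)  # torch.Size([8, 5])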

Model training and evaluation:

def evaluate(net, comments_data, labels_data):
    sum_correct, i = 0, 0

    while i < len(comments_data):

        # Compute accuracy in batches of 64 comments
        comments = comments_data[i: min(i + 64, len(comments_data))]

        with torch.no_grad():
            tokens_X = torch.tensor(comments)
            res = net(tokens_X)                                      # model predictions

        y = torch.tensor(labels_data[i: min(i + 64, len(labels_data))]).reshape(-1)

        sum_correct += (res.argmax(axis=1) == y).sum().item()        # accumulate correct predictions
        i += 64

    return sum_correct / len(comments_data)                          # accuracy = correct / total samples

def train(net, optimizer, criterion, train_comments, train_labels, val_comments, val_labels, num_epochs):
    max_value = 0.5                       # best validation accuracy seen so far
    for epoch in range(num_epochs):
        sum_loss, i = 0, 0                # running loss and batch cursor

        while i < len(train_comments):
            # Train on batches of 64 samples
            comments = train_comments[i: min(i + 64, len(train_comments))]

            inputs_X = torch.tensor(comments)
            y = torch.tensor(train_labels[i: min(i + 64, len(train_comments))])

            res = net(inputs_X)

            loss = criterion(res, y)
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            sum_loss += loss.item()       # .item() avoids retaining the computation graph

            i += 64                       # advance by the batch size (was 16, which re-used samples)

        train_acc = evaluate(net, train_comments, train_labels)
        val_acc = evaluate(net, val_comments, val_labels)

        # Save the best model on the validation set (assumes ./save_models exists)
        if val_acc >= max_value:
            max_value = val_acc
            torch.save(net.state_dict(), r'./save_models/text_cnn_best.pth.tar')

        print('-epoch:\t', epoch + 1,
              '\t-loss:\t', sum_loss / len(train_comments),
              '\ttrain-acc:', train_acc,
              '\tval-acc:', val_acc)

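The post stops short of invoking the training loop; a plausible invocation is sketched below, where the choice of optimizer, learning rate, and epoch count are assumptions rather than the author's settings:

optimizer = torch.optim.Adam(net.parameters(), lr=1e-3)
criterion = nn.CrossEntropyLoss()
train(net, optimizer, criterion, train_corpus_tokens, train_labels,
      val_corpus_tokens, val_labels, num_epochs=10)

# Reload the best checkpoint and predict sentiment classes for the test set
net.load_state_dict(torch.load(r'./save_models/text_cnn_best.pth.tar'))
net.eval()
with torch.no_grad():
    test_preds = net(torch.tensor(test_corpus_tokens)).argmax(axis=1)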
Contact us

oceannedlg@outlook.com
