pt第6章 情感分类

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import os
import re
import string
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
import seaborn as sns
from wordcloud import WordCloud
import time
import copy
import torch
from torch import nn
import torch.nn.functional as F
import torch.optim as optim
from torchvision import transforms
from torchtext import data
from torchtext.vocab import Vectors, GloVe
# 定义加载文本数据的函数
def load_text_data(path):
    # 初始化存储文本和标签的列表
    text_data = []
    label = []
    # 遍历情感标签,'pos' 代表正面,'neg' 代表负面
    for dset in ["pos", "neg"]:
        # 拼接数据集的路径
        path_dset = os.path.join(path, dset)
        # 获取路径下的所有文件名
        path_list = os.listdir(path_dset)
        # 遍历文件名列表
        for fname in path_list:
            # 检查文件是否为文本文件
            if fname.endswith(".txt"):
                # 拼接文件的完整路径
                filename = os.path.join(path_dset, fname)
                # 打开文件并读取内容,指定utf-8编码避免编码错误
                with open(filename, 'r', encoding='utf-8') as f:
                    text_data.append(f.read())
                # 根据数据集类型添加相应的标签
                if dset == "pos":
                    label.append(1)
                else:
                    label.append(0)
    # 将列表转换为NumPy数组并返回
    return np.array(text_data), np.array(label)
##读取训练集和测试集
train_path = "E:\\Anaconda3\\jupyter\\pt第六章 卷积神经网络\\data\\chap6\\imdb\\train"
train_text,train_label = load_text_data(train_path)
test_path = "E:\\Anaconda3\\jupyter\\pt第六章 卷积神经网络\\data\\chap6\\imdb\\test"
test_text,test_label = load_text_data(test_path)
print(len(train_text),len(train_label))
print(len(test_text),len(test_label))
25000 25000
25000 25000
def text_preprocess(text_data):
    text_pre = []
    for text1 in text_data:
        ##去除指定的字符
        text1 = re.sub("<br /><br />"," ",text1)
        ##转化为小写,去除数字,去除标点符号
        text1 = text1.lower() ##将输入字符串text1中所有的"<br /><br />"子串替换为单个空格" "
        text1 = re.sub("\d+","",text1) ##使用正则表达式"\d+"来匹配字符串中的一个或多个数字,并将这些数字替换为空字符串
        text1 = text1.translate(str.maketrans("","",string.punctuation.replace("'","")))##将所有标点符号替换为空字符串
        text1 = text1.strip()##去除字符串text1两端的空白字符
        text_pre.append(text1)
    return np.array(text_pre)
train_text_pre = text_preprocess(train_text)
test_text_pre = text_preprocess(test_text)
def stop_stem_word(datalist, stop_words):
    datalist_pre = []
    for text in datalist:
        text_words = word_tokenize(text)
        # 去除停用词
        text_words = [word for word in text_words if word.lower() not in stop_words]
        # 删除带有"'"的词语
        text_words = [word for word in text_words if "'" not in word]
        datalist_pre.append(text_words)
    return datalist_pre

# 文本符号化处理,去除停用词
stop_words = stopwords.words("english") ##获取英文的停用词列表
stop_words = set(stop_words) ##停用词列表转换成一个集合

train_text_pre2 = stop_stem_word(train_text_pre, stop_words) 
test_text_pre2 = stop_stem_word(test_text_pre, stop_words)

print(train_text_pre[10000])  
print("=" * 10)
print(test_text_pre2[10000]) 
i really liked tom barman's awtwb you just have to let it come over you and enjoy it while it lasts and don't expect anything it's like sitting on a caféterrace with a beer in the summer sun and watching the people go by it definitely won't keep you pondering afterwards that's true but that's not a prerequisite for a good film it's just the experience during the movie that's great i felt there were a few strands that could have been worked out a little more but being a lynch fan i don't care that much anymore  and i loved the style or flair of this movie it's slick but fresh and the soundtrack is a beauty any musiclover will get his kicks out of awtwb i can assure you i'll give it  out  musicwise  out of
==========
['words', 'really', 'describe', 'series', 'premise', 'behind', 'concept', 'highly', 'hyperactive', 'girl', 'eccentric', 'personality', 'ends', 'whirling', 'team', 'oddballs', 'rendering', 'world', 'creation', 'haruhi', 'since', 'wants', 'world', 'aliens', 'espers', 'time', 'travelers', 'breath', 'fresh', 'air', 'world', 'ridden', 'repetitive', 'anime', 'series', 'non', 'innovative', 'tv', 'shows', 'characters', 'well', 'developed', 'end', 'loving', 'less', 'others', 'word', 'describe', 'animation', 'job', 'exist', 'since', 'excellent', 'would', 'really', 'fall', 'short', 'describe', 'done', 'many', 'funny', 'situations', 'either', 'make', 'smile', 'put', 'deep', 'thoughts', 'fall', 'impression', 'first', 'episode', 'since', 'tip', 'iceberg', 'novels', 'yet', 'come', 'problem', 'comes', 'due', 'lack', 'chronological', 'order', 'episodes', 'solve', 'problem', 'conclusion', 'unquestionably', 'one', 'best', 'series']
##将处理好的文本保存到CSV中
texts = [" ".join(words) for words in train_text_pre2]
traindatasave = pd.DataFrame({"text":texts,"label":train_label})
texts = [" ".join(words) for words in test_text_pre2]
testdatasave = pd.DataFrame({"text":texts,"label":test_label})
textdatasave = pd.DataFrame({"text":texts,"label":test_label})
traindatasave.to_csv("data/chap6/imdb_train.csv")
testdatasave.to_csv("data/chap6/imdb_test.csv")
##将预处理好的文本转化为数据表
traindata = pd.DataFrame({"train_text":train_text,
                          "train_word":train_text_pre2,
                          "train_label":train_label})
##计算每个影评使用词的数量
train_word_num = [len(text) for text in train_text_pre2]
traindata["train_word_num"] = train_word_num
##可视化影评词语长度分布
plt.figure(figsize=(8,5))
_ = plt.hist(train_word_num, bins=100)
plt.xlabel("word number")
plt.ylabel("Freq")
plt.show()


在这里插入图片描述

##使用词云可视化两种情感的词频差距
plt.figure(figsize=(16,10))
for ii in np.unique(train_label):
    ##准备每种情感所有词语
    text = np.array(traindata.train_word[traindata.train_label == ii])
    text = " ".join(np.concatenate(text))
    plt.subplot(1,2,ii+1)
    ##生成词云
    wordcod = WordCloud(margin=5,width=1800,
                        height=1000,max_words=500,
                        min_font_size=5,background_color='white',
                        max_font_size=150)
    wordcod.generate_from_text(text)
    plt.imshow(wordcod)
    plt.axis("off")
    if ii == 1:
        plt.title("Positive")
    else:
        plt.title("Negative")
    plt.subplots_adjust(wspace=0.05)
plt.show()


在这里插入图片描述

##使用torchtext库进行数据准备,定义文件中对文本和标签所要做的操作
##定义文本切分方法,因为前面已经做过处理,直接使用空格切分
mytonkenize = lambda x: x.split()
TEXT = data.Field(sequential=True,tokenize=mytonkenize,include_lengths=True,
                  use_vocab=True, batch_first=True,fix_length=200)
LABEL = data.Field(sequential=False, use_vocab=False,
                   pad_token=None, unk_token=False)
##对要读取的数据集的列进行处理
train_test_fields = [("label", LABEL),
                    ("text", TEXT)]

##读取数据
traindata, testdata = data.TabularDataset.splits(
    path = "E:\\Anaconda3\\jupyter\\pt第六章 卷积神经网络\\data\\chap6",
    format="csv",
    train = "imdb_train.csv",
    fields=train_test_fields,
    test = "imdb_test.csv",
    skip_header = True
)
print(len(traindata),len(testdata))
25000 25000
ex0 = traindata.examples[0]
print(ex0.label)
print(ex0.text)
0
['bromwell', 'high', 'cartoon', 'comedy', 'ran', 'time', 'programs', 'school', 'life', 'teachers', 'years', 'teaching', 'profession', 'lead', 'believe', 'bromwell', 'high', 'satire', 'much', 'closer', 'reality', 'teachers', 'scramble', 'survive', 'financially', 'insightful', 'students', 'see', 'right', 'pathetic', 'teachers', 'pomp', 'pettiness', 'whole', 'situation', 'remind', 'schools', 'knew', 'students', 'saw', 'episode', 'student', 'repeatedly', 'tried', 'burn', 'school', 'immediately', 'recalled', 'high', 'classic', 'line', 'inspector', 'sack', 'one', 'teachers', 'student', 'welcome', 'bromwell', 'high', 'expect', 'many', 'adults', 'age', 'think', 'bromwell', 'high', 'far', 'fetched', 'pity']
##训练集切分为训练集和测试集
train_data, val_data = traindata.split(split_ratio=0.7)
print(len(train_data),len(val_data))
17500 7500
##加载预训练的词向量和构建词汇表
glove_dir = "E:\\Anaconda3\\jupyter\\pt第六章 卷积神经网络\\data\\chap6" # 使用绝对路径
vec = Vectors(name='glove.6B.100d.txt', cache=glove_dir)
##将训练集转化为词向量,使用训练集构建单词表,导入预先训练的词嵌入
TEXT.build_vocab(train_data,max_size=20000,vectors=vec)
LABEL.build_vocab(train_data)
##训练集中的前10个高频词
print(TEXT.vocab.freqs.most_common(n=10))
print("词典的词数:",len(TEXT.vocab.itos))
print("前10个单词",len(TEXT.vocab.itos))
##类别标签的数量和类别
print("类别标签的情况: ",LABEL.vocab.freqs)
C:\Anaconda3\envs\DL_01\lib\site-packages\torchtext\vocab.py:432: FutureWarning: You are using `torch.load` with `weights_only=False` (the current default value), which uses the default pickle module implicitly. It is possible to construct malicious pickle data which will execute arbitrary code during unpickling (See https://github.com/pytorch/pytorch/blob/main/SECURITY.md#untrusted-models for more details). In a future release, the default value for `weights_only` will be flipped to `True`. This limits the functions that could be executed during unpickling. Arbitrary objects will no longer be allowed to be loaded via this mode unless they are explicitly allowlisted by the user via `torch.serialization.add_safe_globals`. We recommend you start setting `weights_only=True` for any use case where you don't have full control of the loaded file. Please open an issue on GitHub for any issues related to this experimental feature.
  self.itos, self.stoi, self.vectors, self.dim = torch.load(path_pt)
##定义一个加载器,将类似长度的示例一起批处理
BATCH_SIZE = 32
train_iter = data.BucketIterator(train_data,batch_size=BATCH_SIZE)
val_iter = data.BucketIterator(val_data,batch_size=BATCH_SIZE)
test_iter = data.BucketIterator(testdata,batch_size=BATCH_SIZE)
##获得一个batch的数据,对数据内容进行介绍
for step,batch in enumerate(train_iter):
    if step > 0:
        break
    ##针对一个batch的数据,可以使用batch.label获得数据的类别标签
    print("数据的类别标签:\n",batch.label)
    ##batch.text[0]是文本对应标签向量
    print("数据的尺寸:\n",batch.text[0].shape)
    ##batch.text[1]是文本对应的标签向量
    print("数据的样本数:",len(batch.text[1]))
数据的类别标签:
 tensor([16500, 11093,  7106, 16327, 17562, 12520,    63, 20618, 15454, 13820,
        18862, 16437, 12299, 13938,  1844, 18661, 19621,  5788, 17920, 14664,
        17872, 13718,  7421, 17244, 16627, 14934,  6069, 21741, 12151, 15489,
         6129, 14208])
数据的尺寸:
 torch.Size([32, 200])
数据的样本数: 32
class CNN_Text(nn.Module):
    def __init__(self, vocab_size, embedding_dim, n_filters, filter_sizes, output_dim, dropout, pad_idx):
        super(CNN_Text, self).__init__()
        # 对文本进行词嵌入操作
        self.embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx=pad_idx)
        # 卷积操作,创建多个不同尺寸的卷积核
        self.convs = nn.ModuleList([
            nn.Conv2d(in_channels=1, out_channels=n_filters, kernel_size=(fs, embedding_dim))
            for fs in filter_sizes
        ])
        # 全连接层和 Dropout 层
        self.fc = nn.Linear(len(filter_sizes) * n_filters, output_dim)
        self.dropout = nn.Dropout(dropout)

    def forward(self, text):
        embedded = self.embedding(text)
        embedded = embedded.unsqueeze(1)  # 增加一个维度以匹配 Conv2d 的输入需求
        conved = [F.relu(conv(embedded)).squeeze(3) for conv in self.convs]  # 应用卷积并激活
        pooled = [F.max_pool1d(conv, conv.shape[2]).squeeze(2) for conv in conved]
        # 将所有卷积层的输出合并
        cat = self.dropout(torch.cat(pooled, dim=1))
        # 通过一个全连接层
        return self.fc(cat)
INPUT_DIM = len(TEXT.vocab)  # 词典的大小
EMBEDDING_DIM = 100  # 词向量的维度
N_FILTERS = 100  # 每个卷积核的个数
FILTER_SIZES = [3, 4, 5]  # 卷积核的尺寸
OUTPUT_DIM = 1  # 输出的维度
DROPOUT = 0.5  # Dropout 的比率
PAD_IDX = TEXT.vocab.stoi[TEXT.pad_token]  # 填充词的索引
# 实例化模型
model = CNN_Text(INPUT_DIM, EMBEDDING_DIM, N_FILTERS, FILTER_SIZES, OUTPUT_DIM, DROPOUT, PAD_IDX)
# 实例化模型
model = CNN_Text(INPUT_DIM, EMBEDDING_DIM, N_FILTERS, FILTER_SIZES, OUTPUT_DIM, DROPOUT, PAD_IDX)

pretrained_embeddings = TEXT.vocab.vectors  # 获取预训练的词向量
model.embedding.weight.data.copy_(pretrained_embeddings) # 将预训练的词嵌入作为 embedding.weight 的初始值

# 将无法识别的词<unk> 的向量初始化为 0
UNK_IDX = TEXT.vocab.stoi[TEXT.unk_token]
model.embedding.weight.data[UNK_IDX] = torch.zeros(EMBEDDING_DIM)
# 将填充词'<pad>'的向量初始化为 0
PAD_IDX = TEXT.vocab.stoi[TEXT.pad_token]
model.embedding.weight.data[PAD_IDX] = torch.zeros(EMBEDDING_DIM)

optimizer = optim.Adam(model.parameters())
criterion = nn.BCEWithLogitsLoss()
# 定义一个对数据集训练一轮的函数
def train_epoch(model, iterator, optimizer, criterion):
    epoch_loss = 0
    epoch_acc = 0
    train_corrects = 0
    train_num = 0

    model.train()  # 设置模型为训练模式
    for batch in iterator:
        optimizer.zero_grad()  # 清除之前的梯度
        predictions = model(batch.text[0]).squeeze(1)  # 获取模型的预测结果
        loss = criterion(predictions, batch.label.type(torch.FloatTensor))  # 计算损失
        predicted_labels = torch.round(torch.sigmoid(predictions))
        train_corrects += torch.sum(predicted_labels.long() == batch.label).item()
        train_num += len(batch.label)  # 样本数量
        loss.backward()  # 反向传播
        optimizer.step()  # 更新参数
        epoch_loss += loss.item()  # 累加损失
    # 计算所有样本的平均损失和精度
    epoch_loss = epoch_loss / train_num
    epoch_acc = train_corrects / train_num
    return epoch_loss, epoch_acc
# 定义一个对数据集验证一轮的函数
def evaluate(model, iterator, criterion):
    epoch_loss = 0
    epoch_acc = 0
    train_corrects = 0
    train_num = 0

    model.eval()  # 设置模型为评估模式
    with torch.no_grad():  # 禁止梯度计算
        for batch in iterator:
            predictions = model(batch.text[0]).squeeze(1)
            loss = criterion(predictions, batch.label.type(torch.FloatTensor))

            predicted_labels = torch.round(torch.sigmoid(predictions))
            train_corrects += torch.sum(predicted_labels.long() == batch.label).item()
            train_num += len(batch.label)  # 样本数量

            epoch_loss += loss.item()

    # 计算所有样本的平均损失和精度
    epoch_loss = epoch_loss / train_num
    epoch_acc = train_corrects / train_num

    return epoch_loss, epoch_acc
##使用训练集训练模型,使用验证集测试模型
EPOCHS = 10
best_val_loss = float("inf")
best_acc = float(0)
for epoch in range(EPOCHS):
    start_time = time.time()
    train_loss, train_acc = train_epoch(model, train_iter,optimizer, criterion)
    val_loss, val_acc = evaluate(model, val_iter, criterion)
    end_time = time.time()
    print("Epoch:",epoch+1,"|","Epoch time:",end_time - start_time, "s")
    print("train_loss:",train_loss,"|","train acc:",train_acc)
    print("val_loss:", val_loss, "|", "val acc:", val_acc)
    ##保存效果较好的模型
    if (val_acc < best_val_loss) & (val_acc > best_acc):
        best_model_wts = copy.deepcopy(model.state_dict())
        best_val_loss = val_loss
        best_acc = val_acc
        model.load_state_dict(best_model_wts)

##使用evaluate函数对测试集进行预测
test_loss, test_acc = evaluate(model, test_iter, criterion)
print("在测试集上的预测精度为:",test_acc)
Epoch: 1 | Epoch time: 99.17507767677307 s
train_loss: -7057299.832365144 | train acc: 0.0
val_loss: -28801235.370666668 | val acc: 0.00013333333333333334
Epoch: 2 | Epoch time: 97.63081002235413 s
train_loss: -107327369.1318857 | train acc: 0.0
val_loss: -229042840.84906667 | val acc: 0.00013333333333333334
Epoch: 3 | Epoch time: 96.93891739845276 s
train_loss: -448399362.7209143 | train acc: 0.0
val_loss: -734407364.608 | val acc: 0.00013333333333333334
Epoch: 4 | Epoch time: 97.02122902870178 s
train_loss: -1133050154.7154286 | train acc: 0.0
val_loss: -1620770057.4208 | val acc: 0.00013333333333333334
Epoch: 5 | Epoch time: 96.99107241630554 s
train_loss: -2223178940.181943 | train acc: 0.0
val_loss: -2945424951.4325333 | val acc: 0.00013333333333333334
Epoch: 6 | Epoch time: 97.05503940582275 s
train_loss: -3758460059.179886 | train acc: 0.0
val_loss: -4728479283.063467 | val acc: 0.00013333333333333334
Epoch: 7 | Epoch time: 97.12549257278442 s
train_loss: -5777058173.981257 | train acc: 0.0
val_loss: -7028308528.3328 | val acc: 0.00013333333333333334
Epoch: 8 | Epoch time: 97.04441905021667 s
train_loss: -8301495497.5232 | train acc: 0.0
val_loss: -9825870090.513067 | val acc: 0.00013333333333333334
Epoch: 9 | Epoch time: 97.49299716949463 s
train_loss: -11376795386.792229 | train acc: 0.0
val_loss: -13201752339.2512 | val acc: 0.00013333333333333334
Epoch: 10 | Epoch time: 97.42624521255493 s
train_loss: -14967613801.852343 | train acc: 0.0
val_loss: -17156713148.142933 | val acc: 0.00013333333333333334
在测试集上的预测精度为: 4e-05

评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值