CNN Code Walkthrough (Using Text Processing as an Example)
Overall structure of the code (the detailed explanations are in the comments):
1. Model training code
Below is run.py; see the comments in the code:
# coding: UTF-8
import time
import torch
import numpy as np
from train_eval import train, init_network
from importlib import import_module
import argparse
from utils import build_dataset, build_iterator, get_time_dif
# Command-line arguments
parser = argparse.ArgumentParser(description='Chinese Text Classification')
parser.add_argument('--model', default='TextCNN', type=str, required=False, help='TextCNN')
args = parser.parse_args()

if __name__ == '__main__':
    dataset = 'data'  # dataset directory
    model_name = args.model  # 'TextCNN'
    # x is the module defined in TextCNN.py
    x = import_module(model_name)  # import_module comes from the importlib package
    # Initialize the config
    config = x.Config(dataset, 'random')
    # Fix the random seeds so every run gives the same result
    np.random.seed(1)
    torch.manual_seed(1)
    torch.cuda.manual_seed_all(1)
    torch.backends.cudnn.deterministic = True
    # Record the start time
    start_time = time.time()
    print("Loading data...")
    # Build the datasets and the vocabulary
    vocab, train_data, dev_data, test_data = build_dataset(config)
    # Convert the data into the model's input format (build_iterator is defined in utils.py)
    train_iter = build_iterator(train_data, config)
    dev_iter = build_iterator(dev_data, config)
    test_iter = build_iterator(test_data, config)
    time_dif = get_time_dif(start_time)
    print("Time usage:", time_dif)
    # train
    config.n_vocab = len(vocab)
    # Initialize the model
    model = x.Model(config).to(config.device)
    init_network(model)
    print(model.parameters)  # printing the bound method also displays the model structure
    # Train the model
    train(config, model, train_iter, dev_iter, test_iter)
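With the argparse defaults above, training starts with `python run.py` or, equivalently, `python run.py --model TextCNN`; the value of --model must name a module that import_module can find (here, TextCNN.py in the working directory).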
2. Model code
Below is TextCNN.py; this part defines the structure of the model.
# coding: UTF-8
import torch
import torch.nn as nn
import torch.nn.functional as F
import numpy as np
class Config(object):
    """Configuration parameters"""
    def __init__(self, dataset, embedding):
        self.model_name = 'TextCNN'
        self.train_path = dataset + '/train.txt'  # training-set path ('+' concatenates strings)
        self.dev_path = dataset + '/dev.txt'  # validation-set path
        self.test_path = dataset + '/test.txt'  # test-set path
        self.class_list = [x.strip() for x in open(
            dataset + '/class.txt', encoding='utf-8').readlines()]  # list of class names
        self.vocab_path = dataset + '/vocab.pkl'  # vocabulary path
        self.save_path = 'ckpt/' + self.model_name + '.ckpt'  # where the trained model is saved
        self.embedding_pretrained = torch.tensor(
            np.load(embedding)["embeddings"].astype('float32'))\
            if embedding != 'random' else None  # pre-trained word vectors
        self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')  # device
        self.dropout = 0.5  # dropout probability
        self.require_improvement = 1000  # stop training early if there is no improvement after 1000 batches
        self.num_classes = len(self.class_list)  # number of classes
        self.n_vocab = 0  # vocabulary size, assigned at runtime
        self.num_epochs = 20  # number of training epochs
        # Terminology: batch_size is the number of samples processed in one training step.
        # If it is too small, training converges with difficulty and can underfit; if it is
        # larger, processing is faster but more memory is needed.
        # One iteration = training on batch_size samples once;
        # one epoch = training on the entire training set once.
        # Example: with 1000 training samples and batch_size=10, one epoch takes 100 iterations.
        self.batch_size = 64  # batch size: 64 texts per batch
        self.pad_size = 24  # every sentence is padded or truncated to this length
        self.learning_rate = 1e-3  # learning rate
        self.embed = self.embedding_pretrained.size(1)\
            if self.embedding_pretrained is not None else 200  # embedding dimension (200 if random)
        self.filter_sizes = (2, 3, 4)  # convolution kernel sizes
        self.num_filters = 256  # number of kernels (output channels)
        self.inner_size = 128  # width of the hidden fully connected layer
        # For more background on convolution kernels, see:
        # https://blog.csdn.net/qq_42414972/article/details/118416422
'''Convolutional Neural Networks for Sentence Classification'''
class Model(nn.Module):
    def __init__(self, config):  # pass in the config
        super(Model, self).__init__()  # initialize the parent class
        if config.embedding_pretrained is not None:
            self.embedding = nn.Embedding.from_pretrained(config.embedding_pretrained, freeze=False)
        else:
            self.embedding = nn.Embedding(config.n_vocab, config.embed, padding_idx=config.n_vocab - 1)
        # Text is usually convolved in one dimension (nn.Conv1d) and images in two; here each
        # Conv2d kernel spans the full embedding width, with config.num_filters = 256 kernels
        self.convs = nn.ModuleList(
            [nn.Conv2d(1, config.num_filters, (k, config.embed)) for k in config.filter_sizes])
        self.dropout = nn.Dropout(config.dropout)
        # Linear layer after the convolutions: the pooled outputs of the three kernel sizes
        # are concatenated, so the input width is num_filters * len(filter_sizes)
        self.fc = nn.Linear(config.num_filters * len(config.filter_sizes), config.inner_size)
        self.softmax = nn.Softmax(dim=1)
        self.bn = nn.BatchNorm1d(num_features=config.num_filters)
        self.fc2 = nn.Linear(config.inner_size, config.num_classes)  # project down to the number of classes

    def conv_and_pool(self, x, conv):  # convolution followed by pooling (max pooling here; average pooling is an alternative)
        # Pass the data through the convolution conv(x) and the ReLU activation;
        # squeeze(3) removes the last dimension, which is always 1 after the convolution
        x = F.relu(conv(x)).squeeze(3)
        # Max pooling over the whole sequence: keep the largest activation per filter
        x = F.max_pool1d(x, x.size(2)).squeeze(2)
        x = self.bn(x)
        # x = self.dropout(x)
        return x

    def forward(self, x):
        out = self.embedding(x)  # first pass through the embedding layer
        out = out.unsqueeze(1)
        # Concatenate the pooled outputs of all kernel sizes; the network structure can be tuned here
        out = torch.cat([self.conv_and_pool(out, conv) for conv in self.convs], 1)
        # out = self.bn(out)
        out = self.dropout(out)
        out = self.fc(out)
        out = self.dropout(out)
        out = self.fc2(out)
        # out = self.softmax(out)
        return out
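To see why fc expects num_filters * len(filter_sizes) inputs, here is a sketch of the tensor shapes through forward, assuming the Config values above (batch_size=64, pad_size=24, embed=200, num_filters=256):

# Shape walk-through for one batch (a sketch; values follow the Config above):
# x                      : [64, 24]            token ids
# embedding(x)           : [64, 24, 200]
# unsqueeze(1)           : [64, 1, 24, 200]    add a channel dimension for Conv2d
# conv with k=2          : [64, 256, 23, 1]    since 24 - 2 + 1 = 23
# squeeze(3)             : [64, 256, 23]
# max_pool1d + squeeze(2): [64, 256]           one value per filter
# cat over k in (2, 3, 4): [64, 768]           256 * 3 = fc's input width
# fc -> fc2              : [64, 128] -> [64, num_classes]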
3. Some helper functions
Below is utils.py.
# coding: UTF-8
import os
import time
from datetime import timedelta
import pickle as pkl
import numpy as np
import torch
from tqdm import tqdm

# Module-level constants referenced below (typical values; not shown in the original excerpt)
MAX_VOCAB_SIZE = 10000  # upper bound on the vocabulary size
UNK, PAD = '<UNK>', '<PAD>'  # unknown token and padding token

# Build the vocabulary
def build_vocab(file_path, tokenizer, max_size, min_freq):
    vocab_dic = {}
    with open(file_path, 'r', encoding='UTF-8') as f:
        for line in tqdm(f):
            lin = line.strip()
            if not lin:
                continue
            content = lin.split('\t')[0]  # the text precedes the tab; the label follows it
            for word in tokenizer(content):
                vocab_dic[word] = vocab_dic.get(word, 0) + 1
    # Keep the max_size most frequent tokens whose frequency is at least min_freq
    vocab_list = sorted([_ for _ in vocab_dic.items() if _[1] >= min_freq],
                        key=lambda x: x[1], reverse=True)[:max_size]
    vocab_dic = {word_count[0]: idx for idx, word_count in enumerate(vocab_list)}
    vocab_dic.update({UNK: len(vocab_dic), PAD: len(vocab_dic) + 1})
    return vocab_dic
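As a hypothetical sanity check (toy data, not part of the project), build_vocab behaves like this:

# Toy illustration of build_vocab (hypothetical file and contents):
# toy_train.txt contains the lines  "the cat sat\t0"  and  "the dog sat\t1"
# Token counts: {'the': 2, 'cat': 1, 'sat': 2, 'dog': 1}
# After sorting by frequency and appending the special tokens, the result is:
# {'the': 0, 'sat': 1, 'cat': 2, 'dog': 3, '<UNK>': 4, '<PAD>': 5}
toy_vocab = build_vocab('toy_train.txt', tokenizer=lambda s: s.split(), max_size=100, min_freq=1)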
# Build the datasets
def build_dataset(config):
    tokenizer = lambda x: [y for y in x.split()]  # tokenizer: split the text on whitespace
    if os.path.exists(config.vocab_path):
        vocab = pkl.load(open(config.vocab_path, 'rb'))
    else:
        vocab = build_vocab(config.train_path, tokenizer=tokenizer, max_size=MAX_VOCAB_SIZE, min_freq=1)
        pkl.dump(vocab, open(config.vocab_path, 'wb'))
    print(f"Vocab size: {len(vocab)}")

    def load_dataset(path, pad_size=32):
        contents = []
        with open(path, 'r', encoding='UTF-8') as f:
            for line in tqdm(f):
                lin = line.strip()
                if not lin:
                    continue
                try:
                    content, label = lin.split('\t')
                except Exception:
                    continue
                words_line = []
                token = tokenizer(content)
                seq_len = len(token)  # real length (unused in this variant, kept for reference)
                if pad_size:
                    if len(token) < pad_size:
                        token.extend([PAD] * (pad_size - len(token)))  # pad short sentences
                    else:
                        token = token[:pad_size]  # truncate long sentences
                        seq_len = pad_size
                # word to id
                for word in token:
                    words_line.append(vocab.get(word, vocab.get(UNK)))
                contents.append((words_line, int(label)))
        return contents  # [([...], 0), ([...], 1), ...]

    train = load_dataset(config.train_path, config.pad_size)
    dev = load_dataset(config.dev_path, config.pad_size)
    test = load_dataset(config.test_path, config.pad_size)
    return vocab, train, dev, test
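A quick illustration of the pad/truncate rule ("pad the short, cut the long") with a hypothetical pad_size of 5:

# tokens = ['a', 'b', 'c']                 -> ['a', 'b', 'c', '<PAD>', '<PAD>']  (seq_len stays 3)
# tokens = ['a', 'b', 'c', 'd', 'e', 'f']  -> ['a', 'b', 'c', 'd', 'e']          (seq_len becomes 5)
# Each token is then mapped to its id, with unknown tokens falling back to vocab['<UNK>'].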
# Iterator that controls how many samples are fed to the model at a time
class DatasetIterater(object):
    def __init__(self, batches, batch_size, device):
        self.batch_size = batch_size
        self.batches = batches
        self.n_batches = len(batches) // batch_size
        self.residue = False  # whether a leftover partial batch exists
        if len(batches) % batch_size != 0:  # note: the original checked "% self.n_batches", which is a bug
            self.residue = True
        self.index = 0
        self.device = device

    def _to_tensor(self, datas):  # convert the data to LongTensor and move it to the device (GPU if available)
        x = torch.LongTensor([_[0] for _ in datas]).to(self.device)
        y = torch.LongTensor([_[1] for _ in datas]).to(self.device)
        return x, y

    def __next__(self):
        if self.residue and self.index == self.n_batches:  # the last, partial batch
            batches = self.batches[self.index * self.batch_size: len(self.batches)]
            self.index += 1
            batches = self._to_tensor(batches)
            return batches
        elif self.index >= self.n_batches:
            self.index = 0
            raise StopIteration
        else:
            batches = self.batches[self.index * self.batch_size: (self.index + 1) * self.batch_size]
            self.index += 1
            batches = self._to_tensor(batches)
            return batches

    def __iter__(self):
        return self

    def __len__(self):
        if self.residue:
            return self.n_batches + 1
        else:
            return self.n_batches

def build_iterator(dataset, config):
    iter = DatasetIterater(dataset, config.batch_size, config.device)
    return iter
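A sketch of how the iterator behaves, with hypothetical numbers:

# Hypothetical: 130 samples with batch_size=64 -> n_batches = 2 full batches
# plus a residue batch of 2 samples, so len(train_iter) == 3.
train_iter = build_iterator(train_data, config)
for x, y in train_iter:
    # x: LongTensor of shape [batch, pad_size], y: LongTensor of shape [batch]
    print(x.shape, y.shape)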
def get_time_dif(start_time):
    """Return the time elapsed since start_time"""
    end_time = time.time()
    time_dif = end_time - start_time
    return timedelta(seconds=int(round(time_dif)))
if __name__ == "__main__":
    '''Extract the pre-trained word vectors'''
    # Change the directories and file names below as needed.
    train_dir = "./THUCNews/data/train.txt"
    vocab_dir = "./THUCNews/data/vocab.pkl"
    pretrain_dir = "./THUCNews/data/sgns.sogou.char"
    emb_dim = 300
    filename_trimmed_dir = "./THUCNews/data/embedding_SougouNews"
    if os.path.exists(vocab_dir):
        word_to_id = pkl.load(open(vocab_dir, 'rb'))
    else:
        # tokenizer = lambda x: x.split(' ')  # word-level vocabulary (words separated by spaces in the dataset)
        tokenizer = lambda x: [y for y in x]  # character-level vocabulary
        word_to_id = build_vocab(train_dir, tokenizer=tokenizer, max_size=MAX_VOCAB_SIZE, min_freq=1)
        pkl.dump(word_to_id, open(vocab_dir, 'wb'))
    embeddings = np.random.rand(len(word_to_id), emb_dim)
    f = open(pretrain_dir, "r", encoding='UTF-8')
    for i, line in enumerate(f.readlines()):
        # if i == 0:  # skip the first line if it is a header
        #     continue
        lin = line.strip().split(" ")
        if lin[0] in word_to_id:
            idx = word_to_id[lin[0]]
            emb = [float(x) for x in lin[1:301]]
            embeddings[idx] = np.asarray(emb, dtype='float32')
    f.close()
    np.savez_compressed(filename_trimmed_dir, embeddings=embeddings)
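Once the script above has written embedding_SougouNews.npz (np.savez_compressed appends the .npz suffix), the file can be passed to Config instead of 'random'. A sketch, assuming the file is reachable from run.py:

# In run.py, load the extracted embeddings instead of using random initialization:
config = x.Config(dataset, 'embedding_SougouNews.npz')
# Config then runs np.load(embedding)["embeddings"], and Model picks the result up via
# nn.Embedding.from_pretrained(config.embedding_pretrained, freeze=False).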
4. Model training module
Below is train_eval.py.
# coding: UTF-8
import time
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
from sklearn import metrics
from utils import get_time_dif

# Weight initialization, Xavier by default
def init_network(model, method='xavier', exclude='embedding', seed=123):
    for name, w in model.named_parameters():
        if exclude not in name:  # skip the embedding layer
            if 'weight' in name:
                if method == 'xavier':
                    if len(w.shape) < 2:
                        nn.init.xavier_normal_(w.unsqueeze(0))  # Xavier needs at least 2 dims
                    else:
                        nn.init.xavier_normal_(w)
                elif method == 'kaiming':
                    nn.init.kaiming_normal_(w)
                else:
                    nn.init.normal_(w)
            elif 'bias' in name:
                nn.init.constant_(w, 0)
            else:
                pass
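For the TextCNN above, this loop re-initializes the conv and fc weights with Xavier and zeroes the biases, while embedding.weight is skipped because 'embedding' appears in its name. A quick sketch to check which parameters are visited:

# List the parameter names init_network iterates over:
for name, _ in model.named_parameters():
    print(name)  # e.g. embedding.weight, convs.0.weight, convs.0.bias, fc.weight, bn.weight, ...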
def train(config, model, train_iter, dev_iter, test_iter):  # the training loop
    start_time = time.time()
    # Put the model into training mode
    model.train()
    # Define the optimizer
    optimizer = torch.optim.Adam(model.parameters(), lr=config.learning_rate)
    # Exponential learning-rate decay: after each epoch, lr = gamma * lr
    # scheduler = torch.optim.lr_scheduler.ExponentialLR(optimizer, gamma=0.9)
    total_batch = 0  # number of batches processed so far
    dev_best_loss = float('inf')
    last_improve = 0  # batch count at the last validation-loss improvement
    flag = False  # whether there has been no improvement for a long time
    for epoch in range(config.num_epochs):
        print('Epoch [{}/{}]'.format(epoch + 1, config.num_epochs))
        # scheduler.step()  # learning-rate decay
        for i, (trains, labels) in enumerate(train_iter):
            outputs = model(trains)  # forward pass for one batch
            model.zero_grad()  # zero the gradients
            # Cross-entropy loss
            loss = F.cross_entropy(outputs, labels)
            # Backpropagation
            loss.backward()
            optimizer.step()  # the optimizer updates the parameters
            if total_batch % 100 == 0:
                # Every 100 batches, report performance on the training and validation sets
                true = labels.data.cpu()
                predic = torch.max(outputs.data, 1)[1].cpu()
                train_acc = metrics.accuracy_score(true, predic)  # accuracy, computed directly by sklearn
                dev_acc, dev_loss = evaluate(config, model, dev_iter)  # validation set
                if dev_loss < dev_best_loss:
                    dev_best_loss = dev_loss
                    torch.save(model.state_dict(), config.save_path)  # save the best model
                    improve = '*'
                    last_improve = total_batch
                else:
                    improve = ''
                time_dif = get_time_dif(start_time)
                msg = 'Iter: {0:>6}, Train Loss: {1:>5.2}, Train Acc: {2:>6.2%}, Val Loss: {3:>5.2}, Val Acc: {4:>6.2%}, Time: {5} {6}'
                print(msg.format(total_batch, loss.item(), train_acc, dev_loss, dev_acc, time_dif, improve))
                model.train()
            total_batch += 1
            if total_batch - last_improve > config.require_improvement:
                # Validation loss has not dropped for more than 1000 batches: stop training
                print("No optimization for a long time, auto-stopping...")
                flag = True
                break
        if flag:
            break
    test(config, model, test_iter)  # test set
def test(config, model, test_iter):
    # test
    model.load_state_dict(torch.load(config.save_path))
    # Put the model into evaluation mode; no parameters are updated
    model.eval()
    start_time = time.time()
    test_acc, test_loss, test_report, test_confusion = evaluate(config, model, test_iter, test=True)
    msg = 'Test Loss: {0:>5.2}, Test Acc: {1:>6.2%}'
    print(msg.format(test_loss, test_acc))
    print("Precision, Recall and F1-Score...")
    print(test_report)
    print("Confusion Matrix...")
    print(test_confusion)
    time_dif = get_time_dif(start_time)
    print("Time usage:", time_dif)
def evaluate(config, model, data_iter, test=False):  # evaluation
    model.eval()  # stop updating parameters: we are evaluating the model
    loss_total = 0
    predict_all = np.array([], dtype=int)
    labels_all = np.array([], dtype=int)
    # torch.no_grad(): on the validation set we only want to measure how training is going,
    # not update the network, so gradient tracking is disabled
    with torch.no_grad():
        for texts, labels in data_iter:
            outputs = model(texts)
            loss = F.cross_entropy(outputs, labels)
            loss_total += loss
            labels = labels.data.cpu().numpy()
            predic = torch.max(outputs.data, 1)[1].cpu().numpy()
            labels_all = np.append(labels_all, labels)  # collect all labels and predictions
            predict_all = np.append(predict_all, predic)
    acc = metrics.accuracy_score(labels_all, predict_all)  # accuracy from the two arrays
    if test:  # on the test set, also report per-class metrics
        report = metrics.classification_report(labels_all, predict_all, target_names=config.class_list, digits=4)  # precision/recall/F1 per class
        confusion = metrics.confusion_matrix(labels_all, predict_all)  # confusion matrix
        return acc, loss_total / len(data_iter), report, confusion
    return acc, loss_total / len(data_iter)
About torch.nn
torch.nn is a modular interface designed specifically for neural networks. nn is built on top of autograd and can be used to define and run neural networks.
nn.Module is the key class in nn; it holds the definitions of the network layers together with the forward method.
When defining your own network, note the following points (a minimal example follows this list):
1) The class must inherit from nn.Module and implement the forward method; as long as forward is defined in an nn.Module subclass, the backward function is generated automatically (via the autograd mechanism).
2) Layers with learnable parameters should generally go in the constructor __init__(); layers without learnable parameters, such as ReLU, may go in the constructor or be applied in forward via nn.functional.
3) forward may use any operation that tensors (Variable in older PyTorch) support; it is tensors that flow through the graph PyTorch builds, and you can also use for loops, print, logging, and so on.
4) Models built on nn.Module only accept mini-batch input, e.g. N x C x H x W.
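A minimal sketch of such a custom module (illustrative only; the names are made up):

import torch
import torch.nn as nn
import torch.nn.functional as F

class TinyNet(nn.Module):
    def __init__(self):
        super(TinyNet, self).__init__()
        self.fc = nn.Linear(4, 2)  # a layer with learnable parameters belongs in __init__

    def forward(self, x):  # x: a mini-batch of shape [N, 4]
        return F.relu(self.fc(x))  # parameter-free ops can be applied functionally here

net = TinyNet()
out = net(torch.randn(3, 4))  # mini-batch input
out.sum().backward()  # backward comes for free from autograd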
About model optimization
Tuning the network structure
For example, add a batch-norm layer and an extra fully connected layer inside Model(),
define inner_size in the configuration parameters,
and finally wire them up in forward (collected in the sketch below).
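Collected in one place, those three edits (already present in the TextCNN.py listing above) are:

# In Config.__init__:
self.inner_size = 128  # width of the extra hidden layer
# In Model.__init__:
self.bn = nn.BatchNorm1d(num_features=config.num_filters)
self.fc = nn.Linear(config.num_filters * len(config.filter_sizes), config.inner_size)
self.fc2 = nn.Linear(config.inner_size, config.num_classes)
# In conv_and_pool and forward:
x = self.bn(x)  # batch norm right after pooling
out = self.fc2(self.dropout(self.fc(out)))  # two-stage projection down to the classes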
Tuning the numbers
Experiment with the various hyperparameters (dropout, learning_rate, num_filters, filter_sizes, batch_size, pad_size, and so on).
That is a brief walkthrough of the code, together with a few small notes on how it can be optimized.