史上最全DARTS源代码分析

仅对CIFAR10的搜索和重新训练做了全面的注释

  1. train_search.py 搜索最好的cell
  2. model_search.py 搜索cell时用到的模型,8层,包含所有操作连接
  3. architect.py 使用文中提出的方法更新α
  4. train.py 重训练,将搜到的最好的cell堆叠20层,从头开始训练
  5. model.py 重训练时用到的模型,20层,离散后的模型
    新手上路,都是一些自己的理解,供大家参考,若有问题,欢迎给我留言。
    以下是代码注释:
    model_search.py -> class Network
class Network(nn.Module):
  """Over-parameterised search network used while learning the cell architecture.

  The network is a convolutional stem, followed by ``layers`` stacked ``Cell``
  modules, a global average pool and a linear classifier. Every cell receives
  the softmax of one of two architecture-parameter matrices
  (``alphas_normal`` / ``alphas_reduce``) as its edge/operation weights.

  Args:
    C: base channel count (the original notes use 16).
    num_classes: size of the classifier output.
    layers: number of stacked cells (the original notes use 8).
    criterion: loss callable applied to ``(logits, target)`` in ``_loss``.
    steps: number of intermediate nodes per cell whose inputs are searched.
    multiplier: number of trailing node outputs concatenated as the cell output.
    stem_multiplier: the stem outputs ``stem_multiplier * C`` channels.
  """

  def __init__(self, C, num_classes, layers, criterion, steps=4, multiplier=4, stem_multiplier=3):
    super(Network, self).__init__()
    self._C = C
    self._num_classes = num_classes
    self._layers = layers
    self._criterion = criterion
    self._steps = steps
    self._multiplier = multiplier
    # Remembered so that new() can rebuild a structurally identical network.
    self._stem_multiplier = stem_multiplier

    # The stem lifts the 3 input channels to stem_multiplier * C channels.
    C_curr = stem_multiplier*C
    self.stem = nn.Sequential(
      nn.Conv2d(3, C_curr, 3, padding=1, bias=False),
      nn.BatchNorm2d(C_curr)
    )

    # Both inputs of the first cell come from the stem; the cells themselves
    # start again from C channels.
    C_prev_prev, C_prev, C_curr = C_curr, C_curr, C
    self.cells = nn.ModuleList()
    reduction_prev = False  # whether the previously built cell was a reduction cell
    for i in range(layers):
      # Cells at 1/3 and 2/3 of the depth are reduction cells and double the channels.
      if i in [layers//3, 2*layers//3]:
        C_curr *= 2
        reduction = True
      else:
        reduction = False
      cell = Cell(steps, multiplier, C_prev_prev, C_prev, C_curr, reduction, reduction_prev)
      reduction_prev = reduction
      self.cells += [cell]
      # A cell concatenates `multiplier` node outputs, so it emits multiplier * C_curr channels.
      C_prev_prev, C_prev = C_prev, multiplier*C_curr

    self.global_pooling = nn.AdaptiveAvgPool2d(1)  # 1x1 spatial output
    self.classifier = nn.Linear(C_prev, num_classes)

    self._initialize_alphas()

  def new(self):
    """Return a fresh CUDA ``Network`` with the same hyper-parameters and a copy of the architecture parameters.

    Network weights are NOT copied; they are freshly initialised.
    """
    model_new = Network(
      self._C, self._num_classes, self._layers, self._criterion,
      steps=self._steps,
      multiplier=self._multiplier,
      stem_multiplier=self._stem_multiplier,
    ).cuda()
    for x, y in zip(model_new.arch_parameters(), self.arch_parameters()):
      x.data.copy_(y.data)
    return model_new

  def forward(self, input):
    """Run the stem, all cells, global pooling and the classifier; return the logits."""
    s0 = s1 = self.stem(input)
    for cell in self.cells:
      # Each cell type uses its own architecture matrix, normalised per edge (row).
      if cell.reduction:
        weights = F.softmax(self.alphas_reduce, dim=-1)
      else:
        weights = F.softmax(self.alphas_normal, dim=-1)
      # Cell k consumes the outputs of cells k-2 and k-1.
      s0, s1 = s1, cell(s0, s1, weights)
    out = self.global_pooling(s1)
    logits = self.classifier(out.view(out.size(0), -1))
    return logits

  def _loss(self, input, target):
    """Forward ``input`` through the network and return ``criterion(logits, target)``."""
    logits = self(input)
    return self._criterion(logits, target)

  def _initialize_alphas(self):
    """Create the two architecture-parameter matrices with small random values on the GPU."""
    # One row per edge: node i has 2 + i incoming edges (14 rows when steps == 4).
    k = sum(1 for i in range(self._steps) for n in range(2+i))
    num_ops = len(PRIMITIVES)  # one column per candidate operation

    self.alphas_normal = Variable(1e-3*torch.randn(k, num_ops).cuda(), requires_grad=True)
    self.alphas_reduce = Variable(1e-3*torch.randn(k, num_ops).cuda(), requires_grad=True)
    self._arch_parameters = [
      self.alphas_normal,
      self.alphas_reduce,
    ]

  def arch_parameters(self):
    """Return the list ``[alphas_normal, alphas_reduce]``."""
    return self._arch_parameters

  def genotype(self):
    """Discretise the current architecture parameters into a ``Genotype``.

    For every intermediate node the two incoming edges with the strongest
    non-'none' operation are kept, and on each kept edge the strongest
    non-'none' operation is selected.
    """

    def _parse(weights):
      # `weights` is the row-wise softmax of an alpha matrix as a numpy array.
      none_idx = PRIMITIVES.index('none')  # hoisted: looked up once instead of per comparison
      candidates = [k for k in range(weights.shape[1]) if k != none_idx]

      gene = []
      n = 2
      start = 0  # first row belonging to the current node
      for i in range(self._steps):
        end = start + n  # rows [start, end) are the incoming edges of node i
        W = weights[start:end].copy()
        # Rank edges by their best non-'none' weight (negated for descending order), keep two.
        edges = sorted(range(i + 2), key=lambda x: -max(W[x][k] for k in candidates))[:2]
        for j in edges:
          # max() returns the first maximal candidate, matching a strict '>' scan.
          k_best = max(candidates, key=lambda k: W[j][k])
          # Each entry is (operation name, index of the predecessor state).
          gene.append((PRIMITIVES[k_best], j))
        start = end
        n += 1  # the next node has one more predecessor
      return gene

    gene_normal = _parse(F.softmax(self.alphas_normal, dim=-1).data.cpu().numpy())
    gene_reduce = _parse(F.softmax(self.alphas_reduce, dim=-1).data.cpu().numpy())

    # Indices of the states concatenated as the cell output (2..5 with the defaults).
    concat = range(2+self._steps-self._multiplier, self._steps+2)
    genotype = Genotype(
      normal=gene_normal, normal_concat=concat,
      reduce=gene_reduce, reduce_concat=concat
    )
    return genotype

model_search.py -> class Cell

class Cell(nn.Module):
  """One searchable cell: two preprocessed inputs feeding ``steps`` intermediate nodes.

  Every intermediate node sums one ``MixedOp`` per predecessor state (the two
  cell inputs plus all earlier nodes). The cell output is the channel-wise
  concatenation of the last ``multiplier`` states.
  """

  def __init__(self, steps, multiplier, C_prev_prev, C_prev, C, reduction, reduction_prev):
    super(Cell, self).__init__()
    self.reduction = reduction

    # The input nodes are fixed (not searched). The first input is the output
    # of cell k-2; it goes through FactorizedReduce when the previous cell was
    # a reduction cell, otherwise through a 1x1 ReLU-Conv-BN.
    self.preprocess0 = (
      FactorizedReduce(C_prev_prev, C, affine=False)
      if reduction_prev
      else ReLUConvBN(C_prev_prev, C, 1, 1, 0, affine=False)
    )
    # The second input is the output of cell k-1.
    self.preprocess1 = ReLUConvBN(C_prev, C, 1, 1, 0, affine=False)
    self._steps = steps
    self._multiplier = multiplier

    self._ops = nn.ModuleList()
    self._bns = nn.ModuleList()
    for node in range(self._steps):
      # Node `node` has 2 + node predecessors: both inputs and every earlier node.
      for src in range(node + 2):
        # In a reduction cell only the edges leaving the two cell inputs use stride 2.
        downsample = reduction and src < 2
        self._ops.append(MixedOp(C, 2 if downsample else 1))

  def forward(self, s0, s1, weights):
    """Compute the cell output; ``weights`` holds one row of operation weights per edge."""
    states = [self.preprocess0(s0), self.preprocess1(s1)]
    offset = 0  # index of the first edge belonging to the current node
    for _ in range(self._steps):
      # Sum the mixed operation of every predecessor state (each call runs MixedOp.forward).
      node_out = 0
      for j, h in enumerate(states):
        edge = offset + j
        node_out = node_out + self._ops[edge](h, weights[edge])
      offset += len(states)
      states.append(node_out)

    # Concatenate the last `multiplier` states along the channel dimension.
    return torch.cat(states[-self._multiplier:], dim=1)

model_search.py -> class MixedOp

class MixedOp(nn.Module):
  """Weighted mixture of every candidate operation listed in ``PRIMITIVES`` on one edge."""

  def __init__(self, C, stride):
    super(MixedOp, self).__init__()

    def build(primitive):
      # OPS maps a primitive name to a factory called as (C, stride, affine).
      op = OPS[primitive](C, stride, False)
      if 'pool' in primitive:
        # Pooling candidates are followed by a non-affine batch norm.
        op = nn.Sequential(op, nn.BatchNorm2d(C, affine=False))
      return op

    self._ops = nn.ModuleList([build(primitive) for primitive in PRIMITIVES])

  def forward(self, x, weights):
    """Return the sum over candidates of ``weight * op(x)``."""
    mixed = 0
    for w, op in zip(weights, self._ops):
      mixed = mixed + w * op(x)
    return mixed

train_search.py -> def main

def main():
  """Run the architecture search: build the search network, split the data, then train epoch by epoch.

  Exits with status 1 when no CUDA device is available. After every epoch the
  model is saved to ``<args.save>/weights.pt``.
  """
  if not torch.cuda.is_available():
    logging.info('no gpu device available')
    sys.exit(1)

  # Seed numpy and torch (CPU and CUDA) and select the GPU.
  np.random.seed(args.seed)
  torch.cuda.set_device(args.gpu)
  cudnn.benchmark = True
  torch.manual_seed(args.seed)
  cudnn.enabled=True
  torch.cuda.manual_seed(args.seed)
  logging.info('gpu device = %d' % args.gpu)
  logging.info("args = %s", args)

  criterion = nn.CrossEntropyLoss()
  criterion = criterion.cuda()
  # Build the search network (the original notes use init_channels=16, layers=8).
  model = Network(args.init_channels, CIFAR_CLASSES, args.layers, criterion)
  model = model.cuda()
  logging.info("param size = %fMB", utils.count_parameters_in_MB(model))

  # SGD over model.parameters() (the network weights w).
  optimizer = torch.optim.SGD(
      model.parameters(),
      args.learning_rate,
      momentum=args.momentum,
      weight_decay=args.weight_decay)

  train_transform, valid_transform = utils._data_transforms_cifar10(args)
  train_data = dset.CIFAR10(root=args.data, train=True, download=True, transform=train_transform)

  num_train = len(train_data)
  indices = list(range(num_train))  # [0, 1, ..., num_train - 1]
  split = int(np.floor(args.train_portion * num_train))

  # The first `split` indices of the training set are used to train the weights ...
  train_queue = torch.utils.data.DataLoader(
      train_data, batch_size=args.batch_size,
      sampler=torch.utils.data.sampler.SubsetRandomSampler(indices[:split]),
      pin_memory=True, num_workers=2)

  # ... and the remaining indices form the validation queue.
  valid_queue = torch.utils.data.DataLoader(
      train_data, batch_size=args.batch_size,
      sampler=torch.utils.data.sampler.SubsetRandomSampler(indices[split:num_train]),
      pin_memory=True, num_workers=2)

  # Cosine annealing of the weight learning rate down to args.learning_rate_min.
  scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(
        optimizer, float(args.epochs), eta_min=args.learning_rate_min)

  architect = Architect(model, args)  # optimises the architecture parameters

  for epoch in range(args.epochs):
    # NOTE(review): recent PyTorch releases recommend calling scheduler.step()
    # after the optimizer update and reading the rate with get_last_lr(); the
    # original ordering is kept here to preserve the schedule - confirm
    # against the installed PyTorch version.
    scheduler.step()
    lr = scheduler.get_lr()[0]
    logging.info('epoch %d lr %e', epoch, lr)

    # Log the architecture that discretising the current alphas would give.
    genotype = model.genotype()
    logging.info('genotype = %s', genotype)

    # Print the softmax-normalised alpha matrices of both cell types.
    print(F.softmax(model.alphas_normal, dim=-1))
    print(F.softmax(model.alphas_reduce, dim=-1))

    # training
    train_acc, train_obj = train(train_queue, valid_queue, model, architect, criterion, optimizer, lr)
    logging.info('train_acc %f', train_acc)

    # validation
    valid_acc, valid_obj = infer(valid_queue, model, criterion)
    logging.info('valid_acc %f', valid_acc)

    utils.save(model, os.path.join(args.save, 'weights.pt'))

utils.py -> class Cutout

```python
class Cutout(object):
    """Cutout augmentation: zero a square patch centred on a random pixel.

    The patch has side ``2 * (length // 2)`` before being clipped to the image
    borders. The input tensor is modified in place and returned.
    """

    def __init__(self, length):
        self.length = length

    def __call__(self, img):
        # img is indexed as (channels, height, width) via size(1) / size(2).
        height, width = img.size(1), img.size(2)
        half = self.length // 2

        # Draw the patch centre: row first, then column.
        cy = np.random.randint(height)
        cx = np.random.randint(width)

        # Clip the patch bounds so they stay inside the image.
        top, bottom = np.clip((cy - half, cy + half), 0, height)
        left, right = np.clip((cx - half, cx + half), 0, width)

        keep = np.ones((height, width), dtype=np.float32)
        keep[top:bottom, left:right] = 0.

        # Broadcast the 2-D mask over every channel and apply it in place.
        keep = torch.from_numpy(keep).expand_as(img)
        img *= keep
        return img


def _data_transforms_cifar10(args):
  """Build the CIFAR-10 training and validation transform pipelines.

  Returns:
    ``(train_transform, valid_transform)``. Cutout is appended to the training
    pipeline when ``args.cutout`` is truthy, using ``args.cutout_length``.
  """
  CIFAR_MEAN = [0.49139968, 0.48215827, 0.44653124]
  CIFAR_STD = [0.24703233, 0.24348505, 0.26158768]

  train_steps = [
    transforms.RandomCrop(32, padding=4),  # random 32x32 crop after padding 4 px
    transforms.RandomHorizontalFlip(),  # random left-right flip
    transforms.ToTensor(),  # PIL image / ndarray -> tensor
    transforms.Normalize(CIFAR_MEAN, CIFAR_STD),  # per-channel normalisation
  ]
  if args.cutout:
    # Cutout runs last, on the already normalised tensor.
    train_steps.append(Cutout(args.cutout_length))
  train_transform = transforms.Compose(train_steps)

  # Validation images are only converted and normalised, never augmented.
  valid_transform = transforms.Compose([
    transforms.ToTensor(),
    transforms.Normalize(CIFAR_MEAN, CIFAR_STD),
  ])
  return train_transform, valid_transform

utils.py -> def create_exp_dir

def create_exp_dir(path, scripts_to_save=None):
  """Create the experiment directory and optionally snapshot scripts into it.

  Args:
    path: experiment directory; missing parent directories are created too.
    scripts_to_save: optional iterable of file paths; each file is copied to
      ``<path>/scripts/<basename>``.

  Calling this again for an existing directory is safe: existing directories
  are reused and script copies are overwritten.
  """
  # makedirs(exist_ok=True) also handles nested paths and repeated runs, where
  # a plain mkdir would raise.
  os.makedirs(path, exist_ok=True)
  print('Experiment dir : {}'.format(path))

  if scripts_to_save is not None:
    scripts_dir = os.path.join(path, 'scripts')
    os.makedirs(scripts_dir, exist_ok=True)
    for script in scripts_to_save:
      dst_file = os.path.join(scripts_dir, os.path.basename(script))
      shutil.copyfile(script, dst_file)

architect.py -> class Architect

class Architect(object):
  """Updates the architecture parameters of ``model`` with Adam.

  ``step`` either back-propagates the validation loss directly (first-order)
  or, when ``unrolled`` is true, uses a one-step unrolled copy of the network
  and a finite-difference Hessian-vector product (second-order).
  """

  def __init__(self, model, args):
    self.network_momentum = args.momentum
    self.network_weight_decay = args.weight_decay
    self.model = model
    # Adam optimises only model.arch_parameters(), never the network weights.
    self.optimizer = torch.optim.Adam(self.model.arch_parameters(),
        lr=args.arch_learning_rate, betas=(0.5, 0.999), weight_decay=args.arch_weight_decay)

  def _compute_unrolled_model(self, input, target, eta, network_optimizer):
    """Return a model copy whose weights are w' = w - eta * (momentum * buffer + dL_train/dw + weight_decay * w)."""
    loss = self.model._loss(input, target)  # L_train(w, alpha)
    theta = _concat(self.model.parameters()).data  # all weights as one flat tensor
    try:
      moment = _concat(network_optimizer.state[v]['momentum_buffer'] for v in self.model.parameters()).mul_(self.network_momentum)
    except KeyError:
      # No momentum buffer has been stored yet (e.g. before the first optimizer step).
      moment = torch.zeros_like(theta)
    # Gradient of the training loss plus the weight-decay term.
    dtheta = _concat(torch.autograd.grad(loss, self.model.parameters())).data + self.network_weight_decay*theta
    # Plain arithmetic instead of the deprecated positional-scalar sub(eta, tensor) overload.
    unrolled_model = self._construct_model_from_theta(theta - eta*(moment+dtheta))
    return unrolled_model

  def step(self, input_train, target_train, input_valid, target_valid, eta, network_optimizer, unrolled):
    """Perform one update of the architecture parameters.

    ``eta`` is the current learning rate of the network weights; it is only
    used on the unrolled path.
    """
    self.optimizer.zero_grad()  # clear gradients left from the previous step
    if unrolled:
      self._backward_step_unrolled(input_train, target_train, input_valid, target_valid, eta, network_optimizer)
    else:
      self._backward_step(input_valid, target_valid)
    self.optimizer.step()

  def _backward_step(self, input_valid, target_valid):
    """First-order variant: back-propagate the validation loss at the current weights."""
    loss = self.model._loss(input_valid, target_valid)
    loss.backward()

  def _backward_step_unrolled(self, input_train, target_train, input_valid, target_valid, eta, network_optimizer):
    """Second-order variant: write the unrolled architecture gradient into ``model.arch_parameters()[i].grad``."""
    # w' = w - xi * dL_train(w, alpha)/dw
    unrolled_model = self._compute_unrolled_model(input_train, target_train, eta, network_optimizer)
    # L_val(w', alpha)
    unrolled_loss = unrolled_model._loss(input_valid, target_valid)
    unrolled_loss.backward()
    # dL_val(w', alpha)/dalpha
    dalpha = [v.grad for v in unrolled_model.arch_parameters()]
    # dL_val(w', alpha)/dw'
    vector = [v.grad.data for v in unrolled_model.parameters()]
    # (dL_train(w+, alpha)/dalpha - dL_train(w-, alpha)/dalpha) / (2 * epsilon)
    implicit_grads = self._hessian_vector_product(vector, input_train, target_train)

    # dL_val(w', alpha)/dalpha minus eta times the finite-difference term.
    for g, ig in zip(dalpha, implicit_grads):
      g.data.sub_(eta * ig.data)

    # Copy the result onto the real model so self.optimizer can apply it.
    for v, g in zip(self.model.arch_parameters(), dalpha):
      if v.grad is None:
        v.grad = Variable(g.data)
      else:
        v.grad.data.copy_(g.data)

  def _construct_model_from_theta(self, theta):
    """Build a new model (via ``model.new()``) whose parameters are taken from the flat tensor ``theta``."""
    model_new = self.model.new()  # carries a copy of the architecture parameters
    model_dict = self.model.state_dict()

    # Slice theta back into tensors shaped like the named parameters.
    params, offset = {}, 0
    for k, v in self.model.named_parameters():
      v_length = v.numel()
      params[k] = theta[offset: offset+v_length].view(v.size())
      offset += v_length

    assert offset == len(theta)
    model_dict.update(params)  # overwrite parameter entries, keep the remaining state
    model_new.load_state_dict(model_dict)
    return model_new.cuda()

  def _hessian_vector_product(self, vector, input, target, r=1e-2):
    """Finite-difference estimate of (dL_train(w+)/dalpha - dL_train(w-)/dalpha) / (2 * epsilon).

    ``vector`` is dL_val(w', alpha)/dw'; w+ = w + epsilon * vector and
    w- = w - epsilon * vector with epsilon = r / ||vector||. The model weights
    are perturbed in place and restored before returning.
    """
    R = r / _concat(vector).norm()  # epsilon
    # w -> w+
    for p, v in zip(self.model.parameters(), vector):
      p.data.add_(v * R)
    loss = self.model._loss(input, target)
    grads_p = torch.autograd.grad(loss, self.model.arch_parameters())

    # w+ -> w-
    for p, v in zip(self.model.parameters(), vector):
      p.data.sub_(v * (2*R))
    loss = self.model._loss(input, target)
    grads_n = torch.autograd.grad(loss, self.model.arch_parameters())

    # w- -> w (restore the original weights)
    for p, v in zip(self.model.parameters(), vector):
      p.data.add_(v * R)

    return [(x-y).div_(2*R) for x, y in zip(grads_p, grads_n)]
  • 9
    点赞
  • 39
    收藏
    觉得还不错? 一键收藏
  • 1
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论 1
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值