PyTorch MNIST loss question: why does the version with transposed weights reduce the loss faster?

While learning PyTorch I ran into a question about how quickly the training loss goes down. The code for both variants is posted below; I would appreciate an explanation of the difference.

import torch
import torch.optim as optim
import torch.nn.functional as F
from torchvision.datasets import MNIST
from torchvision.transforms import Compose,ToTensor,Normalize
from torch.utils.data import DataLoader

# Weights are stored as (out_features, in_features), the same layout nn.Linear uses,
# so the forward pass below multiplies by the transpose: x@w.t()+b
w1,b1=torch.randn(200,784,requires_grad=True),torch.zeros(200,requires_grad=True)

w2,b2=torch.randn(200,200,requires_grad=True),torch.zeros(200,requires_grad=True)

w3,b3=torch.randn(10,200,requires_grad=True),torch.zeros(10,requires_grad=True)
torch.nn.init.kaiming_normal_(w1)
torch.nn.init.kaiming_normal_(w2)
torch.nn.init.kaiming_normal_(w3)
# 0. Prepare the data
def get_data_loader(BATCH_SIZE=128,train=True):
    transform_fn=Compose([
        ToTensor(),
        Normalize(mean=(0.1307,),std=(0.3081,))
    ])

    dataset=MNIST(root="./data",train=train,transform=transform_fn,download=True)

    data_loader=DataLoader(dataset,batch_size=BATCH_SIZE,shuffle=True)

    return data_loader


def forward(x):
    x=x@w1.t()+b1
    x=F.relu(x)

    x=x@w2.t()+b2
    x=F.relu(x)

    # note: ReLU is also applied to the output layer, so CrossEntropyLoss receives non-negative values
    x=x@w3.t()+b3
    x=F.relu(x)

    return x
optimizer=optim.SGD([w1,b1,w2,b2,w3,b3],lr=1e-3)
criterion=torch.nn.CrossEntropyLoss()
epochs=5
for epoch in range(epochs):
    train_loader=get_data_loader()
    for batch_idx,(data,target) in enumerate(train_loader):
        # reset the gradients
        optimizer.zero_grad()
        # flatten each 28x28 image into a 784-dim vector
        data=data.view(-1,28*28)
        logits=forward(data)
        loss=criterion(logits,target)
        loss.backward()
        optimizer.step()
        if batch_idx%100==0:
            print(epoch,batch_idx,loss.item())

# quick sanity check on a single test sample
test_dataloader=get_data_loader(BATCH_SIZE=1,train=False)
for idx,(x,target) in enumerate(test_dataloader):
    print(target)
    data=x.view(-1,28*28)
    with torch.no_grad():
        output=forward(data)
        pred=output.max(dim=-1)[-1]
        print(pred)
    break

The results are as follows:

0 0 2.587043285369873
0 100 1.9474238157272339
0 200 1.5906157493591309
0 300 1.3711450099945068
0 400 1.0317896604537964
1 0 1.205894112586975
1 100 1.0516211986541748
1 200 0.8303559422492981
1 300 0.8546297550201416
1 400 0.733218252658844
2 0 0.7834727764129639
2 100 0.6731536388397217
2 200 0.5265911817550659
2 300 0.5402327179908752
2 400 0.5170225501060486
3 0 0.575351357460022
3 100 0.6737942099571228
3 200 0.554531991481781
3 300 0.47510600090026855
3 400 0.4966387450695038
4 0 0.43965044617652893
4 100 0.40633752942085266
4 200 0.3536660075187683
4 300 0.391279935836792
4 400 0.4432230591773987

As you can see, on MNIST, with three matrix-multiply + ReLU stages and 5 passes over the full training set, the loss drops to 0.4432. The forward pass here computes x@w1.t()+b1, so I wondered: can I instead create the weight matrices with the corresponding shapes directly, so that no transpose is needed? The answer is yes, but convergence becomes noticeably slower; the full no-transpose script is below, after a short equivalence check.
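First the check: storing a weight as (out_features, in_features) and using x@w.t(), as in the first script, gives the same result as storing its transpose as (in_features, out_features) and using x@w_alt, as in the second. This is only an illustration; x, w and w_alt are throwaway names used just for this snippet:

import torch

x=torch.randn(128,784)           # a fake batch of flattened images
w=torch.randn(200,784)           # first script's layout: (out_features, in_features)
w_alt=w.t().contiguous()         # second script's layout: (in_features, out_features)

print(torch.allclose(x@w.t(),x@w_alt))   # True: the two forward passes match

So, given identical weight values, the two layouts compute the same thing, and whatever slows the second version down must come from somewhere else. The full no-transpose script: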

import torch
import torch.optim as optim
import torch.nn.functional as F
from torchvision.datasets import MNIST
from torchvision.transforms import Compose,ToTensor,Normalize
from torch.utils.data import DataLoader

# Weights are now stored directly as (in_features, out_features), so the forward
# pass below uses x@w+b with no transpose
w1,b1=torch.randn(784,200,requires_grad=True),torch.zeros(200,requires_grad=True)

w2,b2=torch.randn(200,200,requires_grad=True),torch.zeros(200,requires_grad=True)

w3,b3=torch.randn(200,10,requires_grad=True),torch.zeros(10,requires_grad=True)
torch.nn.init.kaiming_normal_(w1)
torch.nn.init.kaiming_normal_(w2)
torch.nn.init.kaiming_normal_(w3)
# 0. Prepare the data
def get_data_loader(BATCH_SIZE=128,train=True):
    transform_fn=Compose([
        ToTensor(),
        Normalize(mean=(0.1307,),std=(0.3081,))
    ])

    dataset=MNIST(root="./data",train=train,transform=transform_fn,download=True)

    data_loader=DataLoader(dataset,batch_size=BATCH_SIZE,shuffle=True)

    return data_loader


def forward(x):
    x=x@w1+b1
    x=F.relu(x)

    x=x@w2+b2
    x=F.relu(x)

    # note: ReLU is also applied to the output layer, so CrossEntropyLoss receives non-negative values
    x=x@w3+b3
    x=F.relu(x)
    return x
optimizer=optim.SGD([w1,b1,w2,b2,w3,b3],lr=1e-3)
criterion=torch.nn.CrossEntropyLoss()
epochs=5
for epoch in range(epochs):
    train_loader=get_data_loader()
    for batch_idx,(data,target) in enumerate(train_loader):
        # reset the gradients
        optimizer.zero_grad()
        # flatten each 28x28 image into a 784-dim vector
        data=data.view(-1,28*28)
        logits=forward(data)
        loss=criterion(logits,target)
        loss.backward()
        optimizer.step()
        if batch_idx%100==0:
            print(epoch,batch_idx,loss.item())

# quick sanity check on a single test sample
test_dataloader=get_data_loader(BATCH_SIZE=1,train=False)
for idx,(x,target) in enumerate(test_dataloader):
    print(target)
    data=x.view(-1,28*28)
    with torch.no_grad():
        output=forward(data)
        pred=output.max(dim=-1)[-1]
        print(pred)
    break

The training results are as follows:

0 0 12.452335357666016
0 100 2.323484182357788
0 200 1.9058589935302734
0 300 1.9633543491363525
0 400 1.8302357196807861
1 0 1.880374789237976
1 100 1.7841637134552002
1 200 1.8975179195404053
1 300 1.8928344249725342
1 400 1.8756489753723145
2 0 1.9496670961380005
2 100 1.7696168422698975
2 200 1.968169927597046
2 300 1.777085781097412
2 400 1.7634800672531128
3 0 1.8082146644592285
3 100 1.8717498779296875
3 200 1.7398643493652344
3 300 1.5511747598648071
3 400 1.9726353883743286
4 0 1.72966468334198
4 100 1.5711679458618164
4 200 1.6703119277954102
4 300 1.8036141395568848
4 400 1.7966620922088623

Compared with the transposed-weight model, the loss ends up four times as large or more. What is the reason? Could someone explain?
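One detail I am not sure about, and that might matter here, is the initialization. torch.nn.init.kaiming_normal_ defaults to mode='fan_in', and for a 2D tensor it takes fan_in from size(1), so the two weight layouts are initialized with different standard deviations even though they describe the same layer. A minimal check (w_a and w_b are throwaway names for this snippet only):

import torch

# first script's layout: (out_features, in_features) -> fan_in = 784
w_a=torch.nn.init.kaiming_normal_(torch.empty(200,784))
# second script's layout: (in_features, out_features) -> fan_in = 200
w_b=torch.nn.init.kaiming_normal_(torch.empty(784,200))

print(w_a.std().item())   # roughly sqrt(2/784), about 0.05
print(w_b.std().item())   # roughly sqrt(2/200), about 0.10

Could this difference in initialization scale be what makes the no-transpose version converge so much more slowly, or is something else going on?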
