PointNet:从pytorch代码角度理解
前言
最近准备入门3D视觉,主要应用于3D点云的深度学习检测。所以从点云处理的开篇之作pointnet入手,定期做做笔记,不然容易忘记,与大家共勉哈。
论文地址:pointnet
源码地址:源码
点云
点云是某个坐标系下的点的数据集。点包含了丰富的信息,包括三维坐标X,Y,Z、颜色、分类值、强度值、时间等等。从论文中作者主要介绍了点云的三个主要特征:
- 无序性。点云的本质就是一系列的点,无论以什么顺序出现,所指代的信息并不改变。
- 点与点之间的关系。每个点都包含空间的位置信息,所有点构成了集合空间关系。
- 旋转性(变换不变性)。点云集合整体做相同的刚性变换(旋转和平移)时,各点的空间坐标位置会发生变化,但它所表示的物体并没有变,因此分类或分割的结果也不应该改变。
pointnet网络
下面正式进入论文的学习。首先先看网络的整体结构:
- 首先网络的输入是n×3,n是点云中点的数量,3指代x、y、z的空间坐标。论文的最大亮点就是对点云数据进行了两次transform变换,第一次的input transform结构如上图,最终输出的是一个3×3的变换矩阵(可以近似理解为旋转矩阵),主要目的就是将原始的点云集合变换(对齐)到有利于分类或分割操作的角度。
具体的结构(T-Net)可以从代码理解:
class STN3d(nn.Module):
    """Input T-Net: predict one 3x3 transform matrix per point cloud.

    ``forward`` takes a batch shaped ``(B, 3, N)`` (the first convolution
    expects 3 input channels) and returns a tensor shaped ``(B, 3, 3)``.
    """

    def __init__(self):
        super(STN3d, self).__init__()
        # Kernel-size-1 convolutions act as a per-point MLP: 3 -> 64 -> 128 -> 1024.
        self.conv1 = torch.nn.Conv1d(3, 64, 1)
        self.conv2 = torch.nn.Conv1d(64, 128, 1)
        self.conv3 = torch.nn.Conv1d(128, 1024, 1)
        # Fully connected head regressing the 9 matrix entries.
        self.fc1 = nn.Linear(1024, 512)
        self.fc2 = nn.Linear(512, 256)
        self.fc3 = nn.Linear(256, 9)
        # Not used by forward() (which calls F.relu); kept so the attribute
        # still exists for any external code that reads it.
        self.relu = nn.ReLU()

        self.bn1 = nn.BatchNorm1d(64)
        self.bn2 = nn.BatchNorm1d(128)
        self.bn3 = nn.BatchNorm1d(1024)
        self.bn4 = nn.BatchNorm1d(512)
        self.bn5 = nn.BatchNorm1d(256)

    def forward(self, x):
        """Return the predicted transforms, shape ``(B, 3, 3)``."""
        x = F.relu(self.bn1(self.conv1(x)))  # (B, 3, N)   -> (B, 64, N)
        x = F.relu(self.bn2(self.conv2(x)))  # (B, 64, N)  -> (B, 128, N)
        x = F.relu(self.bn3(self.conv3(x)))  # (B, 128, N) -> (B, 1024, N)
        # Max over the point axis: one value per channel, independent of
        # the order in which the points are listed.
        x = torch.max(x, 2, keepdim=True)[0]  # (B, 1024, 1)
        x = x.view(-1, 1024)                  # (B, 1024)

        x = F.relu(self.bn4(self.fc1(x)))  # (B, 512)
        x = F.relu(self.bn5(self.fc2(x)))  # (B, 256)
        x = self.fc3(x)                    # (B, 9)

        # Add the flattened identity so that a zero output of fc3 maps to
        # the identity transform. Building it with x's dtype and device
        # (instead of a float32 CPU array followed by .cuda()) keeps it on
        # the same device as x and avoids silently changing the dtype.
        iden = torch.eye(3, dtype=x.dtype, device=x.device).view(1, 9)
        x = x + iden  # broadcast over the batch dimension
        return x.view(-1, 3, 3)  # (B, 9) -> (B, 3, 3)
- 之后进行MLP操作,为什么用MLP呢?主要因为点云的稀疏性使得直接使用3D卷积操作变得困难,PointNet作者没有使用3D卷积操作,而是使用了MLP进行点云特征的提取。(缺陷是虽然避开了点云稀疏特性,但是直接进行全局的感知机会缺失局部信息的特征)
- 再之后就是第二次的transform——feature transform,是对提取的特征进行变换(在特征层面对齐64维特征,话有点官方呀,我的理解就和第一个transform作用一样吧,代码也基本一样)
class STNkd(nn.Module):
    """Feature T-Net: predict one k x k transform matrix per sample.

    ``forward`` takes a batch shaped ``(B, k, N)`` (the first convolution
    expects ``k`` input channels) and returns a tensor shaped ``(B, k, k)``.

    Args:
        k: number of input feature channels and side length of the
            predicted square matrix.
    """

    def __init__(self, k=64):
        super(STNkd, self).__init__()
        # Kernel-size-1 convolutions act as a per-point MLP: k -> 64 -> 128 -> 1024.
        self.conv1 = torch.nn.Conv1d(k, 64, 1)
        self.conv2 = torch.nn.Conv1d(64, 128, 1)
        self.conv3 = torch.nn.Conv1d(128, 1024, 1)
        # Fully connected head regressing the k*k matrix entries.
        self.fc1 = nn.Linear(1024, 512)
        self.fc2 = nn.Linear(512, 256)
        self.fc3 = nn.Linear(256, k*k)
        # Not used by forward() (which calls F.relu); kept so the attribute
        # still exists for any external code that reads it.
        self.relu = nn.ReLU()

        self.bn1 = nn.BatchNorm1d(64)
        self.bn2 = nn.BatchNorm1d(128)
        self.bn3 = nn.BatchNorm1d(1024)
        self.bn4 = nn.BatchNorm1d(512)
        self.bn5 = nn.BatchNorm1d(256)

        self.k = k

    def forward(self, x):
        """Return the predicted transforms, shape ``(B, k, k)``."""
        x = F.relu(self.bn1(self.conv1(x)))  # (B, k, N)   -> (B, 64, N)
        x = F.relu(self.bn2(self.conv2(x)))  # (B, 64, N)  -> (B, 128, N)
        x = F.relu(self.bn3(self.conv3(x)))  # (B, 128, N) -> (B, 1024, N)
        # Max over the point axis: one value per channel, independent of
        # the order in which the points are listed.
        x = torch.max(x, 2, keepdim=True)[0]  # (B, 1024, 1)
        x = x.view(-1, 1024)                  # (B, 1024)

        x = F.relu(self.bn4(self.fc1(x)))  # (B, 512)
        x = F.relu(self.bn5(self.fc2(x)))  # (B, 256)
        x = self.fc3(x)                    # (B, k*k)

        # Add the flattened k x k identity so that a zero output of fc3 maps
        # to the identity transform. Building it with x's dtype and device
        # (instead of a float32 CPU array followed by .cuda()) keeps it on
        # the same device as x and avoids silently changing the dtype.
        iden = torch.eye(self.k, dtype=x.dtype, device=x.device)
        x = x + iden.view(1, self.k * self.k)  # broadcast over the batch
        return x.view(-1, self.k, self.k)      # (B, k*k) -> (B, k, k)
- 接着往下走,MLP(64,128,1024)在代码中都是用一维卷积实现的,之后再进行全局最大池化`max pool`,并根据`global_feat`选择输出的形式——主要是为了区分之后进行的是分类还是分割。
class PointNetfeat(nn.Module):
    """Shared PointNet feature extractor.

    Args:
        global_feat: when true, ``forward`` returns only the pooled
            1024-channel feature per sample; otherwise that feature is
            tiled over the points and concatenated with the 64-channel
            per-point features.
        feature_transform: when true, an ``STNkd(k=64)`` is created and
            applied to the 64-channel per-point features.
    """

    def __init__(self, global_feat=True, feature_transform=False):
        super(PointNetfeat, self).__init__()
        self.stn = STN3d()
        # Per-point MLP implemented as kernel-size-1 convolutions.
        self.conv1 = torch.nn.Conv1d(3, 64, 1)
        self.conv2 = torch.nn.Conv1d(64, 128, 1)
        self.conv3 = torch.nn.Conv1d(128, 1024, 1)
        self.bn1 = nn.BatchNorm1d(64)
        self.bn2 = nn.BatchNorm1d(128)
        self.bn3 = nn.BatchNorm1d(1024)
        self.global_feat = global_feat
        self.feature_transform = feature_transform
        if self.feature_transform:
            self.fstn = STNkd(k=64)

    def forward(self, x):
        """Return ``(features, trans, trans_feat)`` for input ``(B, 3, N)``.

        ``trans_feat`` is ``None`` when the feature transform is disabled.
        """
        num_points = x.size()[2]

        # Input transform: (B, N, 3) @ (B, 3, 3), then back to (B, 3, N).
        trans = self.stn(x)
        x = torch.bmm(x.transpose(2, 1), trans).transpose(2, 1)
        x = F.relu(self.bn1(self.conv1(x)))  # (B, 64, N)

        # Optional feature transform on the 64-channel per-point features.
        trans_feat = None
        if self.feature_transform:
            trans_feat = self.fstn(x)
            x = torch.bmm(x.transpose(2, 1), trans_feat).transpose(2, 1)

        pointfeat = x  # per-point features, (B, 64, N)
        x = F.relu(self.bn2(self.conv2(x)))  # (B, 128, N)
        x = self.bn3(self.conv3(x))          # (B, 1024, N)
        # Max over the point axis, then drop the singleton axis: (B, 1024).
        x = torch.max(x, 2, keepdim=True)[0].view(-1, 1024)

        if self.global_feat:
            return x, trans, trans_feat

        # Copy the pooled feature to every point and stack it on top of the
        # per-point features: 1024 + 64 = 1088 channels.
        tiled = x.view(-1, 1024, 1).repeat(1, 1, num_points)  # (B, 1024, N)
        return torch.cat([tiled, pointfeat], 1), trans, trans_feat
这里补充个知识点`max pool`主要为了解决点云的无序问题,在维度上任意打乱的时候,为了表述同一个物体,最简单的就是使用对称函数。论文使用的是Max,无论顺序如何变化,最大值是不会变的。
还有个问题:如果点云特征为2500*3,在空间维度x,y,z上直接进行最大池化后就变为1*3,这样做损失的特征太多了,所以论文将点云的每个点先映射到一个冗余的高维空间后(例如1024维),再去进行max的对称函数操作,损失的特征就没那么多了。代码中是从[32, 1024,2500]变为[32,1024,1]。32是batchsize的大小,从每个维度上选取最大值。
- 如果是进行分类,直接将这个全局特征再经过MLP去输出每一类的概率即可。(经过一个mlp(代码中运用全连接)得到k个score。分类网络最后接的是softmax,代码中返回的是log_softmax的结果)
class PointNetCls(nn.Module):
    """PointNet classification head on top of the global feature.

    Args:
        k: number of output classes.
        feature_transform: forwarded to ``PointNetfeat``.
    """

    def __init__(self, k=2, feature_transform=False):
        super(PointNetCls, self).__init__()
        self.feature_transform = feature_transform
        self.feat = PointNetfeat(global_feat=True, feature_transform=feature_transform)
        # Fully connected classifier: 1024 -> 512 -> 256 -> k.
        self.fc1 = nn.Linear(1024, 512)
        self.fc2 = nn.Linear(512, 256)
        self.fc3 = nn.Linear(256, k)
        self.dropout = nn.Dropout(p=0.3)
        self.bn1 = nn.BatchNorm1d(512)
        self.bn2 = nn.BatchNorm1d(256)
        self.relu = nn.ReLU()

    def forward(self, x):
        """Return ``(log_probs, trans, trans_feat)``; ``log_probs`` is ``(B, k)``."""
        global_feature, trans, trans_feat = self.feat(x)  # (B, 1024)

        hidden = F.relu(self.bn1(self.fc1(global_feature)))  # (B, 512)
        # Dropout is applied to the fc2 output before batch norm and ReLU.
        hidden = self.dropout(self.fc2(hidden))
        hidden = F.relu(self.bn2(hidden))  # (B, 256)

        logits = self.fc3(hidden)  # (B, k)
        return F.log_softmax(logits, dim=1), trans, trans_feat
- 如果是分割,将局部信息和全局信息简单地连接起来,就得到用于分割的全部信息(需要输出的是逐点的类别,因此其将全局特征拼接在了点云64维的逐点特征上,最后通过MLP,输出逐点的分类概率)。
class PointNetDenseCls(nn.Module):
    """PointNet segmentation head producing per-point class log-probabilities.

    Args:
        k: number of classes predicted for every point.
        feature_transform: forwarded to ``PointNetfeat``.
    """

    def __init__(self, k=2, feature_transform=False):
        super(PointNetDenseCls, self).__init__()
        self.k = k
        self.feature_transform = feature_transform
        self.feat = PointNetfeat(global_feat=False, feature_transform=feature_transform)
        # Per-point MLP as kernel-size-1 convolutions: 1088 -> 512 -> 256 -> 128 -> k.
        # 1088 is the channel count PointNetfeat emits with global_feat=False
        # (1024 tiled global channels + 64 per-point channels).
        self.conv1 = torch.nn.Conv1d(1088, 512, 1)
        self.conv2 = torch.nn.Conv1d(512, 256, 1)
        self.conv3 = torch.nn.Conv1d(256, 128, 1)
        self.conv4 = torch.nn.Conv1d(128, self.k, 1)
        self.bn1 = nn.BatchNorm1d(512)
        self.bn2 = nn.BatchNorm1d(256)
        self.bn3 = nn.BatchNorm1d(128)

    def forward(self, x):
        """Return ``(log_probs, trans, trans_feat)``; ``log_probs`` is ``(B, N, k)``."""
        size = x.size()
        batchsize, n_pts = size[0], size[2]

        x, trans, trans_feat = self.feat(x)  # (B, 1088, N)

        hidden_layers = (
            (self.conv1, self.bn1),  # -> (B, 512, N)
            (self.conv2, self.bn2),  # -> (B, 256, N)
            (self.conv3, self.bn3),  # -> (B, 128, N)
        )
        for conv, bn in hidden_layers:
            x = F.relu(bn(conv(x)))

        scores = self.conv4(x)                       # (B, k, N)
        scores = scores.transpose(2, 1).contiguous()  # (B, N, k)
        # Flatten to one row per point, normalise over classes, restore shape.
        log_probs = F.log_softmax(scores.view(-1, self.k), dim=-1)  # (B*N, k)
        log_probs = log_probs.view(batchsize, n_pts, self.k)
        return log_probs, trans, trans_feat
不足
这里其实是自己有些不理解的地方哈哈。下面这个代码我看有的解释是控制最后的loss来对变换矩阵进行调整。不是太理解具体是怎么调整的
def feature_transform_regularizer(trans):
    """Penalise transform matrices that are far from orthogonal.

    For every matrix ``A`` in the batch this computes the Frobenius norm
    of ``A @ A^T - I`` and returns the mean over the batch. The value is
    zero exactly when every ``A @ A^T`` equals the identity.

    Args:
        trans: batch of square matrices, shape ``(B, d, d)``.

    Returns:
        A scalar tensor with the mean norm.
    """
    d = trans.size()[1]
    # Create the identity with the same dtype and on the same device as
    # `trans`; a CPU tensor moved with .cuda() would land on the current
    # CUDA device, which need not be the device holding `trans`.
    identity = torch.eye(d, dtype=trans.dtype, device=trans.device)[None, :, :]  # (1, d, d)
    gram = torch.bmm(trans, trans.transpose(2, 1))  # A @ A^T, (B, d, d)
    loss = torch.mean(torch.norm(gram - identity, dim=(1, 2)))
    return loss
同样的还有个地方:生成单位矩阵加到T-Net最后一层全连接的输出上(代码中是`x = x + iden`,并不是加到输入点云上),就能起到旋转的作用了?
iden = Variable(torch.from_numpy(np.eye(self.k).flatten().astype(np.float32))).view(1,self.k*self.k).repeat(batchsize,1)
这两个应该是同一类型的问题,脑子里完全没这方面的概念哈哈,下去再找点资料看看吧。也希望有大佬能帮忙解决下,感谢感谢🙏
参考资料
链接: PointNet原理详解.
点云的无序性_三维点云分类与分割-PointNet
三维深度学习之pointnet系列详解(一)
PointNet:论文总结及pytorch源码详解