复现程序
#!/bin/bash
# Reproducer: connect /dev/nbd0, then issue a disconnect in the background
# and immediately try to connect again.
cd /tmp
# Create a 256M backing file and format it as ext4.
fallocate -l 256M disk
mkfs.ext4 disk
# Export the backing file on port 8000.
nbd-server 8000 /tmp/disk
# Disconnect first (presumably to clear any leftover connection), then make the first connection.
nbd-client -d /dev/nbd0
nbd-client localhost 8000 /dev/nbd0
sleep 1s
nbd-client -d /dev/nbd0 & # run in the background so that the second connect starts immediately
nbd-client localhost 8000 /dev/nbd0
rm -f disk
第二次connect并没有在disconnect阻塞期间就开始执行,为什么?
测试用例本身没有问题:disconnect进程在执行期间持有genl_lock()获取的锁,而connect进程恰好也需要这把锁,因此connect进程必须等disconnect进程结束之后才能执行。
具体的代码分析如下:
建立连接会调用:nbd_genl_connect
断开连接会调用:nbd_genl_disconnect
drivers/block/nbd.c中:
// Table mapping each NBD netlink command (.cmd) to its handler (.doit).
// Connect and disconnect have different handlers (nbd_genl_connect and
// nbd_genl_disconnect), but both are registered as .doit callbacks, so both
// are invoked through the same ops->doit call in genl_family_rcv_msg.
// The leading numbers on each line are the line numbers of the quoted source.
2100 static const struct genl_ops nbd_connect_genl_ops[] = {
2101 {
2102 .cmd = NBD_CMD_CONNECT,
2103 .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP,
2104 .doit = nbd_genl_connect,
2105 },
2106 {
2107 .cmd = NBD_CMD_DISCONNECT,
2108 .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP,
2109 .doit = nbd_genl_disconnect,
2110 },
2111 {
2112 .cmd = NBD_CMD_RECONFIGURE,
2113 .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP,
2114 .doit = nbd_genl_reconfigure,
2115 },
2116 {
2117 .cmd = NBD_CMD_STATUS,
2118 .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP,
2119 .doit = nbd_genl_status,
2120 },
2121 };
net/netlink/genetlink.c
genl_rcv_msg
genl_family_rcv_msg
err = ops->doit(skb, &info);
net/netlink/genetlink.c中
// Looks up the generic netlink family by nlh->nlmsg_type (returning -ENOENT
// if none is found) and hands the message to genl_family_rcv_msg. When the
// family does not set parallel_ops, the call is bracketed by
// genl_lock()/genl_unlock().
// The leading numbers on each line are the line numbers of the quoted source.
641 static int genl_rcv_msg(struct sk_buff *skb, struct nlmsghdr *nlh,
642 struct netlink_ext_ack *extack)
643 {
644 const struct genl_family *family;
645 int err;
646
647 family = genl_family_find_byid(nlh->nlmsg_type);
648 if (family == NULL)
649 return -ENOENT;
650
651 if (!family->parallel_ops) // check whether parallel operation is allowed; if not, take the lock
652 genl_lock(); // this is why the operations cannot run in parallel: in the test script the second connect has to wait for the disconnect
653
654 err = genl_family_rcv_msg(family, skb, nlh, extack); // both connect and disconnect are invoked through this call
655
656 if (!family->parallel_ops)
657 genl_unlock(); // release the lock
658
659 return err;
660 }