Linux Open-Source Storage Notes (7): A First Look at SPDK vhost-user-scsi

The two earlier articles, "SPDK, a Software Acceleration Library for Storage Performance" and "virtio vhost & SPDK vhost", covered the principles and architecture of SPDK and vhost in I/O virtualization. This article walks through I/O acceleration with SPDK vhost-user-scsi in QEMU.

Setting Up the SPDK Runtime Environment

For unbinding the NVMe kernel driver, see the SPDK environment section (steps 1 and 2) in part (5) of this series.

Step 1: run setup.sh to bind the NVMe device to vfio-pci and configure huge pages

root@nvme:~# cd /data/github/spdk
root@nvme:/data/github/spdk# HUGEMEM=8192 ./scripts/setup.sh
0000:02:00.0 (2646 5017): nvme -> vfio-pci
root@nvme:/data/github/spdk# cat /proc/meminfo | grep -i huge
......
HugePages_Total:    4096
HugePages_Free:     4096
HugePages_Rsvd:        0
HugePages_Surp:        0
Hugepagesize:       2048 kB
Hugetlb:         8388608 kB
root@nvme:/data/github/spdk#
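
To tear this setup down again later (for example, before the local-disk test in the last section), setup.sh also has status and reset subcommands; a minimal sketch:

# show current device bindings (vfio-pci/uio) and hugepage allocation
./scripts/setup.sh status
# rebind the devices to their original kernel drivers (vfio-pci -> nvme)
./scripts/setup.sh reset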

Step 2: start SPDK's bundled vhost-user target by running the vhost application (-S sets the directory where the vhost-user sockets are created, -s the amount of hugepage memory in MB, and -m the reactor core mask)

root@nvme:/data/github/spdk# ./build/bin/vhost -S /var/tmp -s 1024 -m 0x3
[2023-05-21 01:50:54.338207] Starting SPDK v23.05-pre / DPDK 22.11.1 initialization...
[2023-05-21 01:50:54.338370] [ DPDK EAL parameters: vhost --no-shconf -c 0x1 --huge-unlink --log-level=lib.eal:6 --log-level=lib.cryptodev:5 --log-level=user1:6 --iova-mode=pa --base-virtaddr=0x200000000000 --match-allocations --file-prefix=spdk_pid1836 ]
TELEMETRY: No legacy callbacks, legacy socket not created
[2023-05-21 01:50:54.350808] app.c: 738:spdk_app_start: *NOTICE*: Total cores available: 1
[2023-05-21 01:50:54.380674] reactor.c: 937:reactor_run: *NOTICE*: Reactor started on core 0
[2023-05-21 01:50:54.388704] accel_sw.c: 601:sw_accel_module_init: *NOTICE*: Accel framework software module initialized.
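
The vhost application keeps running in the foreground, so the following configuration is issued from a second shell. To confirm the target is up before configuring it, it can be queried over its RPC socket (by default /var/run/spdk.sock); a quick check:

# list the reactors/threads the vhost target is polling on
./scripts/rpc.py framework_get_reactors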

Step 3: create SPDK bdevs and configure the vhost-user-scsi and vhost-user-blk controllers

# create nvme bdev

root@nvme:/data/github/spdk# ./scripts/rpc.py bdev_nvme_attach_controller -b Nvme0 -t pcie -a 0000:02:00.0
Nvme0n1

root@nvme:/data/github/spdk# ./scripts/rpc.py vhost_create_scsi_controller --cpumask 0x1 vhost.0

root@nvme:/data/github/spdk# ./scripts/rpc.py vhost_scsi_controller_add_target vhost.0 0 Nvme0n1
0
root@nvme:/data/github/spdk# ./scripts/rpc.py bdev_malloc_create 512 8192 -b Malloc0
root@nvme:/data/github/spdk# ./scripts/rpc.py vhost_scsi_controller_add_target vhost.0 1 Malloc0
root@nvme:/data/github/spdk# ./scripts/rpc.py bdev_malloc_create 256 2048 -b Malloc1
root@nvme:/data/github/spdk# ./scripts/rpc.py vhost_create_blk_controller --cpumask 0x2 vhost.1 Malloc1
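
At this point the SPDK side is fully configured: vhost.0 is a vhost-user-scsi controller with Nvme0n1 and Malloc0 as SCSI targets 0 and 1, and vhost.1 is a vhost-user-blk controller backed by Malloc1. The configuration can be read back with standard RPCs; a quick verification sketch:

# list all bdevs (Nvme0n1, Malloc0 and Malloc1 should appear)
./scripts/rpc.py bdev_get_bdevs
# list the vhost controllers and their backends
./scripts/rpc.py vhost_get_controllers
# the vhost-user sockets QEMU will connect to
ls -l /var/tmp/vhost.*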

Step 4: launch the QEMU virtual machine (the base VM setup is described in part (1), the environment preparation article). Note that vhost-user requires the guest memory to be shared with the vhost target, hence the hugepage-backed memory-backend-file object with share=on in the script below.

# Create a tap device and attach it to the kvmbr0 bridge
root@nvme:/data/kvm# tunctl -t tap1
root@nvme:/data/kvm# brctl addif kvmbr0 tap1
root@nvme:/data/kvm# ifconfig tap1 up
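
tunctl and brctl come from the older uml-utilities and bridge-utils packages; on a host with only iproute2, an equivalent sketch (assuming the kvmbr0 bridge already exists) would be:

# create the tap device, attach it to kvmbr0 and bring it up
ip tuntap add dev tap1 mode tap
ip link set tap1 master kvmbr0
ip link set tap1 up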

# Generate the QEMU launch script, using iscsi-vm.img as the boot disk, then start the VM
root@nvme:/data/kvm# cat > start-vhost.sh << EOF
> taskset -c 2,3 qemu-system-x86_64 \
> --enable-kvm \
> -cpu host -smp 2 \
> -m 4G -object memory-backend-file,id=mem0,size=4G,mem-path=/dev/hugepages,share=on -numa node,memdev=mem0 \
> -drive file=iscsi-vm.img,if=none,id=disk \
> -drive if=virtio,format=raw,file=seed.img \
> -device ide-hd,drive=disk,bootindex=0 \
> -chardev socket,id=spdk_vhost_scsi0,path=/var/tmp/vhost.0 \
> -device vhost-user-scsi-pci,id=scsi0,chardev=spdk_vhost_scsi0,num_queues=2 \
> -device virtio-net-pci,netdev=net0 \
> -netdev tap,ifname=tap1,script=no,id=net0 \
> -nographic
> EOF
root@nvme:/data/kvm# chmod a+x start-vhost.sh
root@nvme:/data/kvm# ./start-vhost.sh
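
The script above only attaches the vhost-user-scsi controller (vhost.0). The vhost-blk controller vhost.1 created in step 3 could be attached in the same way by adding two more options to the QEMU command line; a sketch, not used in the test below:

-chardev socket,id=spdk_vhost_blk0,path=/var/tmp/vhost.1 \
-device vhost-user-blk-pci,chardev=spdk_vhost_blk0 \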

Step 5: log in to the VM, inspect the block devices, and run fio to measure SPDK vhost-user-scsi performance

root@vm-nvme:~/fio# lsblk --output "NAME,KNAME,MODEL,HCTL,SIZE,VENDOR,SUBSYSTEMS"
NAME    KNAME MODEL         HCTL         SIZE VENDOR   SUBSYSTEMS
......
sda     sda   QEMU HARDDISK 1:0:0:0     32.2G ATA      block:scsi:pci
├─sda1  sda1                            32.1G          block:scsi:pci
├─sda14 sda14                              4M          block:scsi:pci
└─sda15 sda15                            106M          block:scsi:pci
sdb     sdb   NVMe disk     2:0:0:0    931.5G INTEL    block:scsi:virtio:pci
sdc     sdc   Malloc disk   2:0:1:0        0B INTEL    block:scsi:virtio:pci
vda     vda                              366K 0x1af4   block:virtio:pci
root@vm-nvme:~/fio# 

# From the QEMU options and the device info above, sdb is the local NVMe SSD exposed through vhost-user-scsi
root@vm-nvme:~/fio# cat nread.fio
[global]
bs=4096
rw=read
ioengine=libaio
size=50G
direct=1
iodepth=256
iodepth_batch=128
iodepth_low=128
iodepth_batch_complete=128
userspace_reap
group_reporting
[test]
numjobs=1
filename=/dev/sdb
root@vm-nvme:~/fio# fio nread.fio
test: (g=0): rw=read, bs=(R) 4096B-4096B, (W) 4096B-4096B, (T) 4096B-4096B, ioengine=libaio, iodepth=256
fio-3.28
Starting 1 process
Jobs: 1 (f=1): [R(1)][100.0%][r=2705MiB/s][r=693k IOPS][eta 00m:00s]
test: (groupid=0, jobs=1): err= 0: pid=1053: Sun May 21 02:39:07 2023
  read: IOPS=673k, BW=2629MiB/s (2757MB/s)(50.0GiB/19472msec)
    slat (usec): min=65, max=264, avg=77.68, stdev= 7.36
    clat (usec): min=178, max=5722, avg=286.70, stdev=62.48
     lat (usec): min=268, max=5913, avg=364.39, stdev=62.20
    clat percentiles (usec):
     |  1.00th=[  210],  5.00th=[  223], 10.00th=[  235], 20.00th=[  247],
     | 30.00th=[  258], 40.00th=[  265], 50.00th=[  273], 60.00th=[  285],
     | 70.00th=[  293], 80.00th=[  314], 90.00th=[  355], 95.00th=[  396],
     | 99.00th=[  515], 99.50th=[  562], 99.90th=[  676], 99.95th=[  725],
     | 99.99th=[  840]
   bw (  MiB/s): min= 2068, max= 2717, per=99.97%, avg=2628.55, stdev=185.41, samples=38
   iops        : min=529408, max=695552, avg=672909.47, stdev=47464.30, samples=38
  lat (usec)   : 250=22.17%, 500=76.67%, 750=1.12%, 1000=0.03%
  lat (msec)   : 2=0.01%, 10=0.01%
  cpu          : usr=18.30%, sys=44.66%, ctx=162897, majf=0, minf=267
  IO depths    : 1=0.0%, 2=0.0%, 4=0.0%, 8=0.0%, 16=0.0%, 32=0.0%, >=64=100.0%
     submit    : 0=0.0%, 4=0.0%, 8=0.0%, 16=0.0%, 32=0.0%, 64=0.0%, >=64=100.0%
     complete  : 0=0.0%, 4=0.0%, 8=0.0%, 16=0.0%, 32=0.0%, 64=0.0%, >=64=100.0%
     issued rwts: total=13107200,0,0,0 short=0,0,0,0 dropped=0,0,0,0
     latency   : target=0, window=0, percentile=100.00%, depth=256

Run status group 0 (all jobs):
   READ: bw=2629MiB/s (2757MB/s), 2629MiB/s-2629MiB/s (2757MB/s-2757MB/s), io=50.0GiB (53.7GB), run=19472-19472msec

Disk stats (read/write):
  sdb: ios=204006/0, merge=12852539/0, ticks=58434/0, in_queue=58434, util=99.55%
root@vm-nvme:~/fio# 
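
It is worth watching the host side while fio runs. SPDK reactors are poll-mode, so the core running the vhost target sits near 100% CPU even when idle, and raw CPU usage is therefore not a meaningful load indicator; a simple way to observe the process (assuming a single vhost instance):

# on the host: watch the SPDK vhost process
top -p $(pgrep -f build/bin/vhost)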

The sequential-read fio result shows IOPS=673k and BW=2629MiB/s. Compared with the host's kernel NVMe driver, IOPS=728k and BW=2842MiB/s (see part (2), the fio benchmarking article), that is less than an 8% performance loss.

For mixed random read/write, the vhost-user-scsi figures are clearly better than the host's kernel NVMe driver, roughly a 30% improvement. The tables below compare the two configurations; a sketch of the mixed-workload job file follows them.

iodepth/_low/_batch/_complete | numjobs | iops(read) kernel/spdk | bw(read) kernel/spdk | iops(write) kernel/spdk | bw(write) kernel/spdk
32/8/8/8                      | 8       | 37.9k/51.1k            | 148MiB/200MiB        | 9500/12.8k              | 37.1MiB/50.0MiB

Mixed random read/write results for vhost-user-scsi:

iodepth/_low/_batch/_complete | numjobs | iops(read) | bw(read) | iops(write) | bw(write)
64/16/16/16                   | 16      | 51.1k      | 200MiB/s | 12.8k       | 49.9MiB/s
32/8/8/8                      | 16      | 51.3k      | 200MiB/s | 12.8k       | 50.1MiB/s
32/8/8/8                      | 8       | 51.0k      | 199MiB/s | 12.8k       | 50.0MiB/s
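
The mixed-workload job file is not shown in the original; based on the parameters in the tables and the roughly 4:1 read/write split in the results, it would look something like the sketch below (rwmixread=80, time_based and runtime are assumptions):

[global]
bs=4096
rw=randrw
rwmixread=80
ioengine=libaio
direct=1
time_based
runtime=60
iodepth=32
iodepth_batch=8
iodepth_low=8
iodepth_batch_complete=8
group_reporting
[test]
numjobs=8
filename=/dev/sdb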

Part (3), the Linux-IO iSCSI article, measured the network bandwidth between the VM and the bridge at 3743 MBytes/sec, far above the disk I/O bandwidth, so the network should have had very little influence on the Linux-IO iSCSI and SPDK iSCSI target results. iSCSI is a network block storage solution and is not directly comparable with virtualized storage, but since network bandwidth barely affected those results, they still offer a useful point of reference for the high performance of SPDK vhost.

Performance of QEMU with a Locally Attached NVMe Device

Now let's see how the guest performs when the host's nvme0n1 device is attached to it directly. To reduce interference from other factors, the host is rebooted first; after the reboot the NVMe device is bound back to the kernel nvme driver (the vfio-pci binding done by setup.sh does not persist across reboots), so /dev/nvme0n1 is visible on the host again.

# Recreate the tap device and attach it to the kvmbr0 bridge (the tap created earlier is not persistent in my environment, so it has to be recreated after the OS reboot)
root@nvme:/data/kvm# tunctl -t tap1
root@nvme:/data/kvm# brctl addif kvmbr0 tap1
root@nvme:/data/kvm# ifconfig tap1 up

# List block devices on the host
root@nvme:/data/kvm# lsblk
NAME    MAJ:MIN RM   SIZE RO TYPE MOUNTPOINTS
......
sda       8:0    0   1.8T  0 disk
├─sda1    8:1    0     1M  0 part
├─sda2    8:2    0   200G  0 part /
├─sda3    8:3    0   200G  0 part /data
├─sda4    8:4    0   1.4T  0 part
└─sda5    8:5    0  1007K  0 part
nvme0n1 259:0    0 931.5G  0 disk

# Create an ext4 filesystem on the device (output omitted). SPDK itself only provides the very simple blobfs, which cannot be used by QEMU directly.
root@nvme:/data/kvm# mkfs -t ext4 /dev/nvme0n1

# The modified QEMU launch script
root@nvme:/data/kvm# cat start-kvm.iscsi.sh
taskset -c 2,3 qemu-system-x86_64 \
  --enable-kvm \
  -cpu host -smp 2 \
  -m 8G \
  -drive file=iscsi-vm.img,if=none,id=disk \
  -drive if=virtio,format=raw,file=seed.img \
  -device ide-hd,drive=disk,bootindex=0 \
  -hdb /dev/nvme0n1 \
  -device virtio-net-pci,netdev=net0 \
  -netdev tap,ifname=tap1,script=no,id=net0 \
  -nographic
root@nvme:/data/kvm# 
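
Note that -hdb attaches the NVMe device to the guest through emulated IDE. A more common way to hand a host block device to a guest is virtio-blk; a sketch of the alternative option, not used in this test:

-drive file=/dev/nvme0n1,if=virtio,format=raw,cache=none,aio=native \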

# Start the VM (output omitted)
root@nvme:/data/kvm# ./start-kvm.iscsi.sh
......

Run the test inside the VM

# List block devices; /dev/sda is the host's /dev/nvme0n1 attached to the guest
root@vm-nvme:~/fio# lsblk
NAME    MAJ:MIN RM   SIZE RO TYPE MOUNTPOINTS
fd0       2:0    1     4K  0 disk
loop0     7:0    0  63.5M  1 loop /snap/core20/1891
loop1     7:1    0  63.3M  1 loop /snap/core20/1879
loop2     7:2    0 111.9M  1 loop /snap/lxd/24322
loop3     7:3    0  53.2M  1 loop /snap/snapd/19122
loop4     7:4    0  53.2M  1 loop /snap/snapd/18933
sda       8:0    0 931.5G  0 disk
sdb       8:16   0  32.2G  0 disk
├─sdb1    8:17   0  32.1G  0 part /
├─sdb14   8:30   0     4M  0 part
└─sdb15   8:31   0   106M  0 part /boot/efi
vda     252:0    0   366K  0 disk

# Change filename to /dev/sda in nread.fio, then run fio nread.fio
root@vm-nvme:~/fio# vim nread.fio
root@vm-nvme:~/fio# fio nread.fio
test: (g=0): rw=read, bs=(R) 4096B-4096B, (W) 4096B-4096B, (T) 4096B-4096B, ioengine=libaio, iodepth=256
fio-3.28
Starting 1 process
Jobs: 1 (f=1): [R(1)][100.0%][r=1284MiB/s][r=329k IOPS][eta 00m:00s]
test: (groupid=0, jobs=1): err= 0: pid=985: Sun May 21 23:52:20 2023
  read: IOPS=328k, BW=1279MiB/s (1341MB/s)(50.0GiB/40021msec)
    slat (usec): min=61, max=2421, avg=73.21, stdev=21.88
    clat (usec): min=298, max=10620, avg=692.49, stdev=74.15
     lat (usec): min=532, max=11037, avg=765.71, stdev=80.10
    clat percentiles (usec):
     |  1.00th=[  603],  5.00th=[  619], 10.00th=[  627], 20.00th=[  644],
     | 30.00th=[  660], 40.00th=[  676], 50.00th=[  685], 60.00th=[  701],
     | 70.00th=[  717], 80.00th=[  734], 90.00th=[  766], 95.00th=[  783],
     | 99.00th=[  840], 99.50th=[  865], 99.90th=[  996], 99.95th=[ 1418],
     | 99.99th=[ 2311]
   bw (  MiB/s): min= 1110, max= 1303, per=100.00%, avg=1279.89, stdev=30.31, samples=80
   iops        : min=284388, max=333568, avg=327651.12, stdev=7759.72, samples=80
  lat (usec)   : 500=0.08%, 750=86.41%, 1000=13.42%
  lat (msec)   : 2=0.08%, 4=0.02%, 10=0.01%, 20=0.01%
  cpu          : usr=7.00%, sys=17.14%, ctx=134265, majf=0, minf=267
  IO depths    : 1=0.0%, 2=0.0%, 4=0.0%, 8=0.0%, 16=0.0%, 32=0.0%, >=64=100.0%
     submit    : 0=0.0%, 4=0.0%, 8=0.0%, 16=0.0%, 32=0.0%, 64=0.0%, >=64=100.0%
     complete  : 0=0.0%, 4=0.0%, 8=0.0%, 16=0.0%, 32=0.0%, 64=0.0%, >=64=100.0%
     issued rwts: total=13107200,0,0,0 short=0,0,0,0 dropped=0,0,0,0
     latency   : target=0, window=0, percentile=100.00%, depth=256

Run status group 0 (all jobs):
   READ: bw=1279MiB/s (1341MB/s), 1279MiB/s-1279MiB/s (1341MB/s-1341MB/s), io=50.0GiB (53.7GB), run=40021-40021msec

Disk stats (read/write):
  sda: ios=102071/0, merge=12963159/0, ticks=65240/0, in_queue=65240, util=99.79%
root@vm-nvme:~/fio#

Compared with the kernel NVMe driver results in part (2), the fio benchmarking article, this is IOPS 328k vs 728k and BW 1279MiB/s vs 2842MiB/s, roughly a 55% performance loss. If you watch the host's load and memory while fio runs, you will see the buffer cache usage (the buff column reported by tools such as vmstat) climb sharply; in other words, when host resources are relatively tight, the VM's I/O numbers get even worse. Against SPDK vhost-user-scsi the comparison is IOPS 328k vs 673k and BW 1279MiB/s vs 2629MiB/s, a very large gap, and for mixed random read/write the gap is larger still: only about 12% of the SPDK vhost-user-scsi figures.

Taking both the test results and host resource consumption into account, SPDK vhost-user-scsi is a virtualization I/O acceleration solution with a very clear performance advantage.

Reposting is welcome; please credit the source.
