全局概览
测试环境为CentOS 7 X64
从RPM获取源码
$ cd ~/rpmbuild/
$ yumdownloader --source device-mapper-multipath
$ rpm -ivh device-mapper-multipath-0.4.9-85.el7_2.6.src.rpm
$ rpmbuild -ba SPECS/device-mapper-multipath.spec
$ ll RPMS/x86_64/ | awk '{print $9}'
device-mapper-multipath-0.4.9-85.el7.centos.6.x86_64.rpm
device-mapper-multipath-debuginfo-0.4.9-85.el7.centos.6.x86_64.rpm
device-mapper-multipath-libs-0.4.9-85.el7.centos.6.x86_64.rpm
device-mapper-multipath-sysvinit-0.4.9-85.el7.centos.6.x86_64.rpm
kpartx-0.4.9-85.el7.centos.6.x86_64.rpm
从官方或github获取源码
$ mkdir -pv /opt/multipath
$ cd /opt/multipath
$ git clone http://git.opensvc.com/multipath-tools/.git src
# or
$ git clone https://github.com/cathay4t/multipath-tools.git src
独立编译RPM源码
$ rpmbuild -bp SPECS/device-mapper-multipath.spec
$ mkdir -pv /opt/multipath
$ cp -rv BUILD/multipath-tools-130222/ /opt/multipath/
$ cd /opt/multipath/multipath-tools-130222/
$ make LIB=lib64
$ make install DESTDIR=/opt/multipath/target \
bindir=/usr/sbin \
syslibdir=/usr/lib64 \
libdir=/usr/lib64/multipath \
rcdir=/etc/rc.d/init.d \
unitdir=/usr/lib/systemd/system
查看目标文件
$ cd /opt/multipath/target/
$ tree
.
├── etc
│ └── rc.d
│ └── init.d
│ └── multipathd
└── usr
├── lib
│ ├── systemd
│ │ └── system
│ │ └── multipathd.service
│ └── udev
│ └── rules.d
│ ├── 11-dm-mpath.rules
│ └── 62-multipath.rules
├── lib64
│ ├── libmpathpersist.so -> libmpathpersist.so.0
│ ├── libmpathpersist.so.0
│ ├── libmultipath.so -> libmultipath.so.0
│ ├── libmultipath.so.0
│ └── multipath
│ ├── libcheckcciss_tur.so
│ ├── libcheckdirectio.so
│ ├── libcheckemc_clariion.so
│ ├── libcheckhp_sw.so
│ ├── libcheckhp_tur.so
│ ├── libcheckrdac.so
│ ├── libcheckreadsector0.so
│ ├── libchecktur.so
│ ├── libprioalua.so
│ ├── libprioconst.so
│ ├── libpriodatacore.so
│ ├── libprioemc.so
│ ├── libpriohds.so
│ ├── libpriohp_sw.so
│ ├── libprioiet.so
│ ├── libprioontap.so
│ ├── libpriorandom.so
│ ├── libpriordac.so
│ └── libprioweightedpath.so
├── sbin
│ ├── kpartx
│ ├── mpathconf
│ ├── mpathpersist
│ ├── multipath
│ └── multipathd
└── share
└── man
├── man3
│ ├── mpath_persistent_reserve_in.3.gz
│ └── mpath_persistent_reserve_out.3.gz
├── man5
│ └── multipath.conf.5.gz
└── man8
├── kpartx.8.gz
├── mpathconf.8.gz
├── mpathpersist.8.gz
├── multipath.8.gz
└── multipathd.8.gz
17 directories, 40 files
主要的文件有:
SysV和systemd启动脚本
/etc/rc.d/init.d/multipathd
/usr/lib/systemd/system/multipathd.service
udev命名规则
/usr/lib/udev/rules.d/11-dm-mpath.rules
/usr/lib/udev/rules.d/62-multipath.rules
用户工具
/usr/sbin/multipathd 守护进程,监听系统中路径状态的变化,并做相应的处理。
/usr/sbin/mpathpersist SCSI PR命令工具,主要用于隔离。
/usr/sbin/mpathconf 修改多路径配置
/usr/sbin/kpartx DeviceMapper虚拟设备创建工具
/usr/sbin/multipath 多路径命令工具
用户库
/usr/lib64/libmpathpersist.so.0
/usr/lib64/libmultipath.so
/usr/lib64/libmpathpersist.so
/usr/lib64/libmultipath.so.0
用户手册
/usr/share/man/*
插件
/usr/lib64/multipath/*
实例分析
这里有一个iSCSI设备,名称为iqn.2016-10.org.lr:storage,有两条路径,都处于活动状态。现在使用一些简单命令查看它们的状态,并简要分析其相互关系,以便对其有一个初步认识。
查看多路径配置
$ cat /etc/multipath.conf
blacklist {
devnode "^(ram|raw|loop|fd|md|dm-|sr|scd|st)[0-9]*"
devnode "^sd[a-b][0-9]*"
}
defaults {
user_friendly_names yes
path_grouping_policy multibus
failback immediate
no_path_retry fail
}
multipaths {
multipath {
wwid 360000000000000000e00000000010001
alias iscsi
}
}
除了一些基本的配置,这里把wwid为360000000000000000e00000000010001的SCSI设备命名为iscsi。
查看DeviceMapper映射表
$ dmsetup table
iscsi: 0 209715200 multipath 0 0 1 1 service-time 0 2 2 8:64 1 1 8:80 1 1
可以看到,多路径设备iscsi的全部扇区都是用multipath驱动进行映射,传递给驱动的参数是“0 0 1 1 service-time 0 2 2 8:64 1 1 8:80 1 1”,多路径对应的两个设备的设备号为“8:64”和“8:80”。
查看磁盘块设备路径及其设备号
$ ll /dev/sd*
brw-rw---- 1 root disk 8, 0 11月 10 14:24 /dev/sda
brw-rw---- 1 root disk 8, 1 11月 10 14:24 /dev/sda1
brw-rw---- 1 root disk 8, 2 11月 10 14:24 /dev/sda2
brw-rw---- 1 root disk 8, 3 11月 10 14:24 /dev/sda3
brw-rw---- 1 root disk 8, 16 11月 10 14:24 /dev/sdb
brw-rw---- 1 root disk 8, 17 11月 10 14:24 /dev/sdb1
brw-rw---- 1 root disk 8, 32 11月 10 14:24 /dev/sdc
brw-rw---- 1 root disk 8, 33 11月 10 14:24 /dev/sdc1
brw-rw---- 1 root disk 8, 48 11月 10 14:25 /dev/sdd
brw-rw---- 1 root disk 8, 49 11月 30 11:49 /dev/sdd1
brw-rw---- 1 root disk 8, 50 11月 10 14:25 /dev/sdd2
brw-rw---- 1 root disk 8, 64 11月 10 14:24 /dev/sde
brw-rw---- 1 root disk 8, 80 11月 17 15:04 /dev/sdf
上述多路径设备号对应的两个设备的路径分别为/dev/sde和/dev/sdf。
查看被映射的多路径设备
$ ll /dev/mapper/
crw------- 1 root root 10, 236 11月 8 17:03 control
lrwxrwxrwx 1 root root 7 11月 30 10:47 iscsi -> ../dm-0
被映射的多路径设备/dev/mapper/iscsi实际上是/dev/dm-0的符号链接(其实LVM也是这样的)。
查看多路径创建的DeviceMapper设备
$ ll /dev/dm-*
brw-rw---- 1 root disk 253, 0 11月 30 10:47 /dev/dm-0
多路径创建的DeviceMapper设备的主设备号都是253。
查看全部块设备
$ ll /dev/block/
lrwxrwxrwx 1 root root 6 11月 8 17:03 11:0 -> ../sr0
lrwxrwxrwx 1 root root 7 11月 30 10:47 253:0 -> ../dm-0
lrwxrwxrwx 1 root root 6 11月 10 14:24 8:0 -> ../sda
lrwxrwxrwx 1 root root 7 11月 10 14:24 8:1 -> ../sda1
lrwxrwxrwx 1 root root 6 11月 10 14:24 8:16 -> ../sdb
lrwxrwxrwx 1 root root 7 11月 10 14:24 8:17 -> ../sdb1
lrwxrwxrwx 1 root root 7 11月 10 14:24 8:2 -> ../sda2
lrwxrwxrwx 1 root root 7 11月 10 14:24 8:3 -> ../sda3
lrwxrwxrwx 1 root root 6 11月 10 14:24 8:32 -> ../sdc
lrwxrwxrwx 1 root root 7 11月 10 14:24 8:33 -> ../sdc1
lrwxrwxrwx 1 root root 6 11月 10 14:25 8:48 -> ../sdd
lrwxrwxrwx 1 root root 7 11月 30 11:49 8:49 -> ../sdd1
lrwxrwxrwx 1 root root 7 11月 10 14:25 8:50 -> ../sdd2
lrwxrwxrwx 1 root root 6 11月 10 14:24 8:64 -> ../sde
lrwxrwxrwx 1 root root 6 11月 17 15:04 8:80 -> ../sdf
查看设备的WWID
$ /lib/udev/scsi_id -g -u /dev/sde
360000000000000000e00000000010001
$ /lib/udev/scsi_id -g -u /dev/sdf
360000000000000000e00000000010001
$ /lib/udev/scsi_id -g -u /dev/mapper/iscsi
360000000000000000e00000000010001
同一设备的两条路径,以及被映射的新设备的WWID相同
查看当前多路径信息
$ multipath -ll
iscsi (360000000000000000e00000000010001) dm-0 IET ,VIRTUAL-DISK
size=100G features='0' hwhandler='0' wp=rw
`-+- policy='service-time 0' prio=1 status=active
|- 3:0:0:1 sde 8:64 active ready running
`- 4:0:0:1 sdf 8:80 active ready running
总结
- Initiator在使用两个路径连接名称为iqn.2016-10.org.lr:storage、WWID为360000000000000000e00000000010001的设备时创建两个块设备文件,路径分别为/dev/sde和/dev/sdf,它们的设备号分别为8:64和8:80;
- 多路径软件根据两个iSCSI块设备的相关信息,把它们确定为同一个设备的多条路径,并根据配置使用libdevicemapper的函数创建一个虚拟的块设备;
- libdevicemapper函数根据multipathd或者multipath提供的参数,创建DeviceMapper映射表,并加载multipath驱动,创建虚拟磁盘/dev/dm-0,设备号为253:0,同时创建其符号链接/dev/mapper/iscsi。
驱动层分析
源码获取
多路径的驱动源码在内核源码的drivers/md目录下,获取内核的基本步骤如下:
$ yum install rpm-build rpmdevtools
$ rpmdev-setuptree
$ yumdownloader --source kernel
$ rpm -ivh kernel-*.src.rpm
$ rpmbuild -bp ~/rpmbuild/SPECS/kernel.spec
$ ls ~/rpmbuild/BUILD/kernel-*/linux-*/
编译配置选项分析
查看Kconfig
$ vi drivers/md/Kconfig
...
config MD_MULTIPATH
tristate "Multipath I/O support"
depends on BLK_DEV_MD
help
MD_MULTIPATH provides a simple multi-path personality for use
the MD framework. It is not under active development. New
projects should consider using DM_MULTIPATH which has more
features and more testing.
If unsure, say N.
config DM_MULTIPATH
tristate "Multipath target"
depends on BLK_DEV_DM
# nasty syntax but means make DM_MULTIPATH independent
# of SCSI_DH if the latter isn't defined but if
# it is, DM_MULTIPATH must depend on it. We get a build
# error if SCSI_DH=m and DM_MULTIPATH=y
depends on SCSI_DH || !SCSI_DH
---help---
Allow volume managers to support multipath hardware.
config DM_MULTIPATH_QL
tristate "I/O Path Selector based on the number of in-flight I/Os"
depends on DM_MULTIPATH
---help---
This path selector is a dynamic load balancer which selects
the path with the least number of in-flight I/Os.
If unsure, say N.
config DM_MULTIPATH_ST
tristate "I/O Path Selector based on the service time"
depends on DM_MULTIPATH
---help---
This path selector is a dynamic load balancer which selects
the path expected to complete the incoming I/O in the shortest
time.
If unsure, say N.
...
其中MD_MULTIPATH为旧版驱动,已被抛弃,不建议使用。DM_MULTIPATH为新版驱动,DM_MULTIPATH_QL和DM_MULTIPATH_ST为两种路径选择算法。
查看Makefile
$ vi drivers/md/Makefile
...
obj-$(CONFIG_MD_MULTIPATH) += multipath.o
dm-multipath-y += dm-path-selector.o dm-mpath.o
obj-$(CONFIG_DM_MULTIPATH) += dm-multipath.o dm-round-robin.o
obj-$(CONFIG_DM_MULTIPATH_QL) += dm-queue-length.o
obj-$(CONFIG_DM_MULTIPATH_ST) += dm-service-time.o
drivers/md/dm-mpath.o
drivers/md/dm-path-selector.o
drivers/md/dm-round-robin.o
drivers/md/dm-queue-length.o
drivers/md/dm-service-time.o
...
如果使用新版多路径,则会编译dm-path-selector.c,dm-mpath.c和dm-round-robin.c。dm-mpath.c是多路径驱动的核心,主要是负责初始化一些数据结构,以及注册DeviceMapper的Target Type;而dm-path-selector.c是负责管理路径选择算法的库函数;dm-round-robin.c是必备的路径选择算法,在一条路径上完成指定的IO次数后就切换到下一条,不断循环。另外就是可选的两种路径选择算法:dm-service-time.c根据路径的吞吐量以及未完成的字节数选择负荷较轻的路径,dm-queue-length.c则选择正在处理的IO个数较少的那条路径。
dm-mpath.c 分析
初始化
/*
 * Target type descriptor for the "multipath" device-mapper target.
 * dm_multipath_init() registers it with dm_register_target(); the
 * .name string is the target name that appears in a device-mapper
 * table line (e.g. "0 209715200 multipath ..." in `dmsetup table`).
 * The remaining members point at the multipath_* callbacks defined
 * in this file.
 */
static struct target_type multipath_target = {
.name = "multipath",
.version = {1, 9, 0},
.module = THIS_MODULE,
.ctr = multipath_ctr,
.dtr = multipath_dtr,
.map_rq = multipath_map,
.clone_and_map_rq = multipath_clone_and_map,
.release_clone_rq = multipath_release_clone,
.rq_end_io = multipath_end_io,
.presuspend = multipath_presuspend,
.postsuspend = multipath_postsuspend,
.resume = multipath_resume,
.status = multipath_status,
.message = multipath_message,
.ioctl = multipath_ioctl,
.iterate_devices = multipath_iterate_devices,
.busy = multipath_busy,
};
static int __init dm_multipath_init(void)
{
int r;
/* allocate a slab for the dm_ios */
_mpio_cache = KMEM_CACHE(dm_mpath_io, 0);
if (!_mpio_cache)
return -ENOMEM;
r = dm_register_target(&multipath_target);
if (r < 0) {
DMERR("register failed %d", r);
r = -EINVAL;
goto bad_register_target;
}
kmultipathd = alloc_workqueue("kmpathd", WQ_MEM_RECLAIM, 0);
if (!kmultipathd) {
DMERR("failed to create workqueue kmpathd");
r = -ENOMEM;
goto bad_alloc_kmultipathd;
}
/*
* A separate workqueue is used to handle the device handlers
* to avoid overloading existing workqueue. Overloading the
* old workqueue would also create a bottleneck in the
* path of the storage hardware device activat