iostat计算元数据diskstat

Configure-Handler

已于 2022-10-31 07:53:06 修改

阅读量342

点赞数

分类专栏： Linux 内核文章标签： bash linux 运维

于 2022-10-30 21:30:55 首次发布

本文链接：https://blog.csdn.net/qq_42931917/article/details/127356750

版权

Linux 内核专栏收录该内容

21 篇文章 0 订阅

订阅专栏

iostat

iostat按磁盘（分区）分别输出I/O统计信息，可以提供IOPS、吞吐量、I/O请求时长，以及使用率等信息。iostat需要获取的内核统计数据需要开启相应的内核选项，额外开销忽略不计。

常用参数 -dxz 1，显示磁盘使用率（-d），使用更多的列（-x），忽略指标为0的设备（-z），相关数据输出的间隔时间（1）。

有两列提供了有关磁盘I/O合并的信息：当系统发现一个新的读或者写I/O请求与队列中的另外一个I/O位置相邻时，这两个IO请求就会被合并，目的就是提高性能。

$ iostat -dxz 1
Device            r/s     rkB/s   rrqm/s  %rrqm r_await rareq-sz     w/s     wkB/s   wrqm/s  %wrqm w_await wareq-sz     d/s     dkB/s   drqm/s  %drqm d_await dareq-sz  aqu-sz  %util
sda              0.00      0.00     0.00   0.00    0.00     0.00    3.00     28.00     3.00  50.00    1.33     9.33    0.00      0.00     0.00   0.00    0.00     0.00    0.01   0.80

性能指标	含义	说明
Device	设备或者是分区的Name	在/dev/目录下可以找到对应设备名
r/s	每秒发送给磁盘的读请求数	合并之后的请求数
w/s	每秒发送给磁盘的写请求数	合并之后的请求数
rkB/s	每秒从磁盘读取的数据量	单位为kB
wkB/s	每秒向磁盘写入的数据量	单位为kB
rrqm/s	每秒合并的读请求数	%rrqm表示合并读请求的百分比
wrqm/s	每秒合并的写请求数	%wrqm表示合并写请求的百分比
avgrq-sz	平均请求尺寸，单位为扇区（512个字节）
avgqu-sz	等待队列平均长度，包括在驱动程序队列中等待与在设备内部队列中等待的请求
await	平均IO请求时长（也就是设备的响应时间），包括在驱动程序队列中等待的时间，以及设备实际响应的时长（单位为ms）
r_await	与await一致，但是仅包含读请求（单位为ms）
w_await	与await一致，但是仅包含写请求（单位为ms）
svctm	平均（推测的）设备IO响应时间（单位为ms）
util	设备忙于处理 IO请求的时间百分比

cat /proc/diskstats 显示结果说明

iostat相关指标的计算依赖内核中磁盘I/O相关的统计数据，这些统计数据来源于/proc/diskstats。

# 内核版本
$ uname -a
Linux curtis-Aspire-E5-471G 5.15.0-41-generic #44~20.04.1-Ubuntu SMP Fri Jun 24 13:27:29 UTC 2022 x86_64 x86_64 x86_64 GNU/Linux

$ cat /proc/diskstats
8       0 sda 26490 15433 2535970 60218 8936 13415 314058 24165 0 31064 85633 0 0 0 0 579 1248
8       1 sda1 148 29 11102 161 2 0 2 3 0 136 164 0 0 0 0 0 0
8       2 sda2 26196 15404 2520410 59739 8934 13415 314056 24162 0 30964 83901 0 0 0 0 0 0

内核Documentation/ABI/testing/procfs-diskstats 中对以上各个字段的说明：

What:		/proc/diskstats
Date:		February 2008
Contact:	Jerome Marchand <jmarchan@redhat.com>
Description:
		The /proc/diskstats file displays the I/O statistics
		of block devices. Each line contains the following 14
		fields:

		==  ===================================
		 1  major number	//主设备号
		 2  minor number	//次设备号
		 3  device name	//设备名
		 4  reads completed successfully	//成功读请求次数
		 5  reads merged	//读请求合并次数
		 6  sectors read	//总读扇区数
		 7  time spent reading (ms)	//读消耗的总时间
		 8  writes completed	//成功写请求次数
		 9  writes merged	//写请求合并次数
		10  sectors written	//总写扇区数
		11  time spent writing (ms)	//写消耗的总时间
		12  I/Os currently in progress	//系统正在处理的I/O请求数
		13  time spent doing I/Os (ms)	//I/O操作花费的ms数，对应内核中的io_ticks
		14  weighted time spent doing I/Os (ms)	//I/O操作花费的加权ms数
		==  ===================================

		/* 4.18+内核为SSD磁盘discard功能追踪 */
		Kernel 4.18+ appends four more fields for discard
		tracking putting the total at 18:

		==  ===================================
		15  discards completed successfully	//成功discard次数
		16  discards merged	//discard合并请求次数
		17  sectors discarded	//discard总扇区数
		18  time spent discarding	//discard操作花费的总时间
		==  ===================================

		/* 内核5.5+为磁盘刷新请求追加两个字段	*/ 
		Kernel 5.5+ appends two more fields for flush requests:

		==  =====================================
		19  flush requests completed successfully	//成功刷新磁盘次数
		20  time spent flushing	//flush耗费的总时间
		==  =====================================

		For more details refer to Documentation/admin-guide/iostats.rst

磁盘discard是什么

在SSD出现之后，产生了一个问题：由于SSD底层的存储实现上和传统的机械硬盘截然不同，导致两者在数据写入和删除上表现出了极大的差异，传统机械硬盘删除数据时仅仅需要将元数据标记为删除，而真正的数据block实际上并没有立即被删除，当下一次写入数据时，只需要用新的数据覆盖掉旧的数据即可。

而SSD则不一样，新的数据只能往空白区域写入，不能像机械硬盘那样直接覆盖，因此一次覆盖写操作将变成一个 read-erase-modify-write 的循环，操作系统如果还是按照机械硬盘的覆盖写入方式来操作SSD的话，将带来性能上的大打折扣和硬盘使用寿命的缩短。

因此对于SSD的操作人们提出了新的需求——需要在数据删除的时候通知硬盘，将数据立即清理掉，在这种需求下，trim指令[1]诞生了，trim指令允许操作系统在SSD上将不再使用的数据通知到SSD底层并在其内部将数据擦除掉。

获取统计数据的内核接口

//block/genhd.c
/*
 * aggregate disk stat collector.  Uses the same stats that the sysfs
 * entries do, above, but makes them available through one seq_file.
 *
 * The output looks suspiciously like /proc/partitions with a bunch of
 * extra fields.
 */
//遍历系统所有的磁盘分区
static int diskstats_show(struct seq_file *seqf, void *v)
{
	struct gendisk *gp = v;
	struct block_device *hd;
	unsigned int inflight;
	struct disk_stats stat;
	unsigned long idx;

	/*
	if (&disk_to_dev(gp)->kobj.entry == block_class.devices.next)
		seq_puts(seqf,	"major minor name"
				"     rio rmerge rsect ruse wio wmerge "
				"wsect wuse running use aveq"
				"\n\n");
	*/

	rcu_read_lock();
	xa_for_each(&gp->part_tbl, idx, hd) {
		if (bdev_is_partition(hd) && !bdev_nr_sectors(hd))
			continue;
		part_stat_read_all(hd, &stat);
		if (queue_is_mq(gp->queue))
			inflight = blk_mq_in_flight(gp->queue, hd);
		else
			inflight = part_in_flight(hd);

		seq_printf(seqf, "%4d %7d %pg "
			   "%lu %lu %lu %u "
			   "%lu %lu %lu %u "
			   "%u %u %u "
			   "%lu %lu %lu %u "
			   "%lu %u"
			   "\n",
			   MAJOR(hd->bd_dev), MINOR(hd->bd_dev), hd,
			   stat.ios[STAT_READ],
			   stat.merges[STAT_READ],
			   stat.sectors[STAT_READ],
			   (unsigned int)div_u64(stat.nsecs[STAT_READ],
							NSEC_PER_MSEC),
			   stat.ios[STAT_WRITE],
			   stat.merges[STAT_WRITE],
			   stat.sectors[STAT_WRITE],
			   (unsigned int)div_u64(stat.nsecs[STAT_WRITE],
							NSEC_PER_MSEC),
			   inflight,
			   jiffies_to_msecs(stat.io_ticks),
			   (unsigned int)div_u64(stat.nsecs[STAT_READ] +
						 stat.nsecs[STAT_WRITE] +
						 stat.nsecs[STAT_DISCARD] +
						 stat.nsecs[STAT_FLUSH],
							NSEC_PER_MSEC),
			   stat.ios[STAT_DISCARD],
			   stat.merges[STAT_DISCARD],
			   stat.sectors[STAT_DISCARD],
			   (unsigned int)div_u64(stat.nsecs[STAT_DISCARD],
						 NSEC_PER_MSEC),
			   stat.ios[STAT_FLUSH],
			   (unsigned int)div_u64(stat.nsecs[STAT_FLUSH],
						 NSEC_PER_MSEC)
			);
	}
	rcu_read_unlock();

	return 0;
}

/*
 * part_stat_read - read one statistics field from the given partition.
 *
 * Expands to a direct member access on the partition's embedded dkstats
 * structure.  This form matches the layout where dkstats is a plain
 * struct disk_stats member (the non-CONFIG_SMP branch of struct hd_struct
 * shown in this file); it does not apply when dkstats is a __percpu
 * pointer.
 */
#define part_stat_read(part, field)	((part)->dkstats.field)

/*
 * struct hd_struct - per-partition bookkeeping, including its I/O counters.
 *
 * NOTE(review): the diskstats_show() listing in this file iterates
 * struct block_device entries rather than struct hd_struct, so the two
 * snippets likely come from different kernel versions - confirm before
 * relying on this layout.
 */
struct hd_struct {
	sector_t start_sect;	/* presumably the partition's first sector - TODO confirm */
	/*
	 * nr_sects is protected by sequence counter. One might extend a
	 * partition while IO is happening to it and update of nr_sects
	 * can be non-atomic on 32bit machines with 64bit sector_t.
	 */
	sector_t nr_sects;
	seqcount_t nr_sects_seq;	/* presumably the sequence counter guarding nr_sects */
	sector_t alignment_offset;
	unsigned int discard_alignment;
	struct device __dev;
	struct kobject *holder_dir;
	int policy, partno;
	struct partition_meta_info *info;
#ifdef CONFIG_FAIL_MAKE_REQUEST
	int make_it_fail;	/* only present with CONFIG_FAIL_MAKE_REQUEST */
#endif
	unsigned long stamp;
	atomic_t in_flight[2];	/* two atomic counters; presumably read/write - TODO confirm */
	/* I/O statistics: a per-CPU pointer on SMP builds, embedded otherwise. */
#ifdef	CONFIG_SMP
	struct disk_stats __percpu *dkstats;
#else
	struct disk_stats dkstats;
#endif
	struct percpu_ref ref;
	struct rcu_work rcu_work;
};

/* How the device name is derived. */
/*
 * disk_name() is used by partition check code and the genhd driver.
 * It formats the devicename of the indicated disk into
 * the supplied buffer (of size at least 32), and returns
 * a pointer to that same buffer (for convenience).
 *
 * @hd:     disk whose disk_name is used as the base name
 * @partno: partition number; 0 selects the whole disk
 * @buf:    destination buffer, written with at most BDEVNAME_SIZE bytes
 *
 * Naming rules:
 *   - partno == 0:              "<disk_name>"
 *   - disk_name ends in a digit: "<disk_name>p<partno>"
 *   - otherwise:                "<disk_name><partno>"
 */

char *disk_name(struct gendisk *hd, int partno, char *buf)
{
	size_t len = strlen(hd->disk_name);

	if (!partno)
		snprintf(buf, BDEVNAME_SIZE, "%s", hd->disk_name);
	/*
	 * Guard on len: with an empty disk_name, indexing [len - 1] would
	 * read outside the array.  An empty name falls through to the plain
	 * "<name><partno>" form.
	 */
	else if (len && isdigit((unsigned char)hd->disk_name[len - 1]))
		snprintf(buf, BDEVNAME_SIZE, "%sp%d", hd->disk_name, partno);
	else
		snprintf(buf, BDEVNAME_SIZE, "%s%d", hd->disk_name, partno);

	return buf;
}

以下为Linux内核更新io_ticks的函数：

/*
 * Account busy time (io_ticks) for a block device.
 *
 * @part: device to account; when it is a partition (bd_partno != 0) the
 *        same accounting is then repeated for bdev_whole(part)
 * @now:  current timestamp, compared against part->bd_stamp
 * @end:  when true, add the elapsed time (now - stamp); otherwise add 1
 *
 * bd_stamp is advanced with cmpxchg, and io_ticks is only added by the
 * caller whose exchange succeeded.
 */
static void update_io_ticks(struct block_device *part, unsigned long now,
		bool end)
{
	for (;;) {
		unsigned long last = READ_ONCE(part->bd_stamp);

		if (unlikely(time_after(now, last))) {
			if (likely(cmpxchg(&part->bd_stamp, last, now) == last))
				__part_stat_add(part, io_ticks,
						end ? now - last : 1);
		}

		/* Stop once the whole-disk device has been handled. */
		if (!part->bd_partno)
			break;

		part = bdev_whole(part);
	}
}

/* I/O accounting paths that call the io_ticks update (update_io_ticks). */
void blk_account_io_done(struct request *req, u64 now);
void blk_account_io_start(struct request *rq);
static unsigned long __part_start_io_acct(struct block_device *part,
					  unsigned int sectors, unsigned int op,
					  unsigned long start_time);
static void __part_end_io_acct(struct block_device *part, unsigned int op,
			       unsigned long start_time);