在我们的云机系统上,某个计算节点的CPU利用率极高,该计算节点上起了66个虚拟机
[o***g@compute-13 ~]$ ps -eLf |grep kvm |wc -l
469
其中8070进程的CPU利用率极高
仔细分析8070进程,发现它下面还派生了5个线程
[olinjg@compute-13 ~]$ ps -eLf |grep kvm |grep 8070 |awk '{print $2 " " $3 " "$4}'
8070 1 8070
8070 1 8072
8070 1 8073
8070 1 8075
8070 1 8078
8070 1 8087
top -p 8078
PID USER PR NI VIRT RES SHR S %CPU %MEM TIME+ COMMAND
8078 qemu 20 0 8416m 4.0g 2192 R 180.1 2.8 2187:57 qemu-kvm
[olinjg@compute-13 ~]$ ps -eLf |grep kvm |grep 8070 |awk '{print $2 " " $3 " "$4}'
8070 1 8070
8070 1 8072
8070 1 8073
8070 1 8075
8070 1 8078
8070 1 8087
8070 1 8070
8070 1 8072
8070 1 8073
8070 1 8075
8070 1 8078
8070 1 8087
实时追踪:
先看看这些线程都在干什么:
[root@compute-13 olinjg]# strace -p 8072
Process 8072 attached - interrupt to quit
rt_sigtimedwait([BUS USR1], 0x7f33174e7c00, {0, 0}, 8) = -1 EAGAIN (Resource temporarily unavailable)
rt_sigpending([]) = 0
ioctl(15, 0xae80, 0) = 0
ioctl(15, 0xae80, 0) = 0
ioctl(15, 0xae80, 0) = 0
ioctl(11, 0xffffffffc008ae67, 0x7f33174e7a70) = 0
ioctl(11, 0xffffffffc008ae67, 0x7f33174e7aa0) = 0
ioctl(15, 0xae80, 0) = 0
Process 8072 attached - interrupt to quit
rt_sigtimedwait([BUS USR1], 0x7f33174e7c00, {0, 0}, 8) = -1 EAGAIN (Resource temporarily unavailable)
rt_sigpending([]) = 0
ioctl(15, 0xae80, 0) = 0
ioctl(15, 0xae80, 0) = 0
ioctl(15, 0xae80, 0) = 0
ioctl(11, 0xffffffffc008ae67, 0x7f33174e7a70) = 0
ioctl(11, 0xffffffffc008ae67, 0x7f33174e7aa0) = 0
ioctl(15, 0xae80, 0) = 0
[root@compute-13 olinjg]# strace -p 8073
Process 8073 attached - interrupt to quit
rt_sigtimedwait([BUS USR1], 0x7f3316ae6c00, {0, 0}, 8) = -1 EAGAIN (Resource temporarily unavailable)
rt_sigpending([]) = 0
ioctl(16, 0xae80, 0) = 0
ioctl(16, 0xae80, 0) = 0
ioctl(16, 0xae80, 0) = 0
ioctl(11, 0xffffffffc008ae67, 0x7f3316ae6a70) = 0
ioctl(11, 0xffffffffc008ae67, 0x7f3316ae6aa0) = 0
ioctl(16, 0xae80, 0) = 0
ioctl(11, 0xffffffffc008ae67, 0x7f3316ae6aa0) = 0
ioctl(16, 0xae80, 0) = 0
ioctl(16, 0xae80, 0) = 0
ioctl(16, 0xae80, 0) = 0
ioctl(16, 0xae80, 0) = 0
ioctl(16, 0xae80, 0) = 0
Process 8073 attached - interrupt to quit
rt_sigtimedwait([BUS USR1], 0x7f3316ae6c00, {0, 0}, 8) = -1 EAGAIN (Resource temporarily unavailable)
rt_sigpending([]) = 0
ioctl(16, 0xae80, 0) = 0
ioctl(16, 0xae80, 0) = 0
ioctl(16, 0xae80, 0) = 0
ioctl(11, 0xffffffffc008ae67, 0x7f3316ae6a70) = 0
ioctl(11, 0xffffffffc008ae67, 0x7f3316ae6aa0) = 0
ioctl(16, 0xae80, 0) = 0
ioctl(11, 0xffffffffc008ae67, 0x7f3316ae6aa0) = 0
ioctl(16, 0xae80, 0) = 0
ioctl(16, 0xae80, 0) = 0
ioctl(16, 0xae80, 0) = 0
ioctl(16, 0xae80, 0) = 0
ioctl(16, 0xae80, 0) = 0
再看看进程8070在干什么:
似乎只执行了read、write、select等几个简单的系统调用
read(9, 0x7fff02954ad0, 1) = -1 EIO (Input/output error)
read(3, 0x7fff02955a40, 128) = -1 EAGAIN (Resource temporarily unavailable)
read(51, 0x7f331c68d53c, 69632) = -1 EAGAIN (Resource temporarily unavailable)
read(51, 0x7f331c68d53c, 69632) = -1 EAGAIN (Resource temporarily unavailable)
read(9, 0x7fff02954ad0, 1) = -1 EIO (Input/output error)
read(51, 0x7f331c68d53c, 69632) = -1 EAGAIN (Resource temporarily unavailable)
read(9, 0x7fff02954ad0, 1) = -1 EIO (Input/output error)
找一个正常的计算节点,用strace看看其上的虚拟机在干什么,做个比较
在Read Only计算节点上写一个while程序,看看利用率是不是也会飙高
CPU利用率过高的几个原因:
如果可用内存过小,会导致频繁读写交换文件(swap),使得CPU利用率过高
linux系统性能分析,很不错:
http://www.blogjava.net/qileilove/archive/2013/03/25/396949.html