# top
top - 09:25:02 up 1055 days, 13:10, 5 users, load average: 162.98, 236.47, 262.56
Tasks: 561 total, 1 running, 559 sleeping, 0 stopped, 1 zombie
Cpu(s): 5.2%us, 4.4%sy, 0.0%ni, 90.2%id, 0.0%wa, 0.0%hi, 0.2%si, 0.0%st
Mem: 132042872k total, 124209800k used, 7833072k free, 1321092k buffers
Swap: 0k total, 0k used, 0k free, 103341292k cached
PID USER PR NI VIRT RES SHR S %CPU %MEM TIME+ COMMAND
42493 root 20 0 20.7g 418m 3284 S 276.0 0.3 8885:56 python
42518 root 20 0 52.8g 183m 3252 S 74.3 0.1 6:26.57 python
top命令的各个参数意义:
第一行是任务队列信息
09:25:02 当前时间
up 1055 days, 13:10 系统运行时间
5 users 当前登录用户数
load average: 162.98, 236.47, 262.56 系统负载,即任务队列的平均长度。三个数值分别为 1分钟、5分钟、15分钟前到现在的平均值。
第二行为进程的信息
total 进程总数
running 正在运行的进程数
sleeping 睡眠的进程数
stopped 停止的进程数
zombie 僵尸进程数
第三行为CPU的信息
5.2% us 用户空间占用CPU百分比
4.4% sy 内核空间占用CPU百分比
0.0% ni 用户进程空间内改变过优先级的进程占用CPU百分比
90.2% id 空闲CPU百分比
0.0% wa 等待输入输出的CPU时间百分比
0.0%hi:硬件CPU中断占用百分比
0.2%si:软中断占用百分比
0.0%st:虚拟化环境中被 hypervisor 分配给其他虚拟机而被“偷走”(steal)的CPU时间百分比
第四行为内存信息
132042872k total 物理内存总量
124209800k used 使用的物理内存总量
7833072k free 空闲内存总量
1321092k buffers 用作内核缓存的内存量
第五行为交换区(swap)信息
0k total 交换区总量
0k used 使用的交换区总量
0k free 空闲交换区总量
103341292k cached 用作页缓存(page cache)的内存量。注意:它虽然显示在 Swap 这一行,但统计的是缓存文件内容所占用的物理内存,而不是交换区的使用量(本例中交换区总量为 0k,cached 却有约 103G);这部分内存在需要时可以被内核回收。
看到上面的信息指标,当前主机的负载已经很高了,那么要查一下原因了,看一下进程使用情况
# ps aux | grep ma_server
root 42492 0.0 0.0 316416 44080 ? S Apr08 0:00 python /export/servers/app/ma_server.py start
root 42493 1231 0.3 21657704 428116 ? Sl Apr08 8886:16 python /export/servers/app/ma_server.py start
。。。。。。
可以看到 pid:42493 的进程占用了很长的CPU时间(8886:16),这是不正常的值,查看这个进程打开了哪些文件
lsof -p 42493
COMMAND PID USER FD TYPE DEVICE SIZE/OFF NODE NAME
python 42493 root cwd DIR 8,2 4096 2 /
python 42493 root rtd DIR 8,2 4096 2 /
python 42493 root txt REG 8,2 6224753 2235542 /usr/local/bin/python2.7
python 42493 root mem REG 8,2 90880 1704142 /lib64/libgcc_s-4.4.7-20120601.so.1
python 42493 root mem REG 8,2 24374 2626716 /usr/local/lib/python2.7/site-packages/psutil-4.4.0-py2.7-linux-x86_64.egg/psutil/_psutil_posix.so
python 42493 root mem REG 8,2 404779 2756184 /usr/local/lib/python2.7/site-packages/thriftpy-0.3.9-py2.7-linux-x86_64.egg/thriftpy/protocol/cybin.so
python 42493 root mem REG 8,2 109073 2756205 /usr/local/lib/python2.7/site-packages/thriftpy-0.3.9-py2.7-linux-x86_64.egg/thriftpy/transport/memory/cymemory.so
python 42493 root mem REG 8,2 190975 2756199 /usr/local/lib/python2.7/site-packages/thriftpy-0.3.9-py2.7-linux-x86_64.egg/thriftpy/transport/buffered/cybuffered.so
python 42493 root mem REG 8,2 74926 3014689 /root/.python-eggs/pymongo-3.3.0-py2.7-linux-x86_64.egg-tmp/pymongo/_cmessage.so
python 42493 root mem REG 8,2 151980 3014691 /root/.python-eggs/pymongo-3.3.0-py2.7-linux-x86_64.egg-tmp/bson/_cbson.so
python 42493 root mem REG 8,2 27424 1704280 /lib64/libnss_dns-2.12.so
python 42493 root mem REG 8,2 65928 1704158 /lib64/libnss_files-2.12.so
python 42493 root 20u sock 0,6 0t0 1563627155 can't identify protocol
python 42493 root 21u sock 0,6 0t0 1563645588 can't identify protocol
python 42493 root 22u sock 0,6 0t0 1563626889 can't identify protocol
python 42493 root 23u sock 0,6 0t0 1563634945 can't identify protocol
python 42493 root 24u sock 0,6 0t0 1563703398 can't identify protocol
看到报错的信息 'can't identify protocol',它们都对应同一个进程id 42493,查一下这个进程的系统调用跟踪信息
strace -p 42493
Process 42493 attached - interrupt to quit
select(20, [19], [], [], {0, 57202}) = 0 (Timeout)
futex(0x148bbd0, FUTEX_WAIT_PRIVATE, 0, NULL) = -1 EAGAIN (Resource temporarily unavailable)
futex(0x148bbd0, FUTEX_WAIT_PRIVATE, 0, NULL) = -1 EAGAIN (Resource temporarily unavailable)
futex(0x148bbd0, FUTEX_WAIT_PRIVATE, 0, NULL) = 0
futex(0x148bbd0, FUTEX_WAIT_PRIVATE, 0, NULL) = 0
futex(0x148bbd0, FUTEX_WAIT_PRIVATE, 0, NULL) = 0
futex(0x148bbd0, FUTEX_WAIT_PRIVATE, 0, NULL) = -1 EAGAIN (Resource temporarily unavailable)
futex(0x148bbd0, FUTEX_WAIT_PRIVATE, 0, NULL) = 0
futex(0x148bbd0, FUTEX_WAIT_PRIVATE, 0, NULL) = -1 EAGAIN (Resource temporarily unavailable)
futex(0x148bbd0, FUTEX_WAKE_PRIVATE, 1) = 1
select(20, [19], [], [], {0, 500000}) = 0 (Timeout)
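发现了好多 'Resource temporarily unavailable'(EAGAIN),看起来像是系统资源不够用的问题(猜测是程序的进程数太多了,没有释放或者释放比较慢),先看系统的进程数设置吧。需要注意:futex 的 FUTEX_WAIT 返回 EAGAIN 只表示调用时 futex 的值已经不等于期望值,通常说明多个线程在频繁争抢同一把锁,并不一定是系统资源耗尽。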
# ulimit -a
core file size (blocks, -c) 0
data seg size (kbytes, -d) unlimited
scheduling priority (-e) 0
file size (blocks, -f) unlimited
pending signals (-i) 1031447
max locked memory (kbytes, -l) 64
max memory size (kbytes, -m) unlimited
open files (-n) 755350
pipe size (512 bytes, -p) 8
POSIX message queues (bytes, -q) 819200
real-time priority (-r) 0
stack size (kbytes, -s) 10240
cpu time (seconds, -t) unlimited
max user processes (-u) 65535
virtual memory (kbytes, -v) unlimited
file locks (-x) unlimited
用户的可用进程数已经开到 65535(max user processes (-u) 65535),这个上限已经足够大了,看来问题不在这里
未完待续
接着写:
事情有了进展,执行top命令
top
PID USER PR NI VIRT RES SHR S %CPU %MEM TIME+ COMMAND
98486 root 20 0 20.5g 431m 3284 S 302.2 0.3 814:01.93 python
190523 root 20 0 403m 81m 5504 S 21.4 0.1 6:43.93 python
看到pid=98486的占cpu最高,可以查看这个进程里的线程哪个有问题
pstree -p 98486 | wc # 查看线程共有多少个
426 426 14057 # 有426个线程
top -p 98486 -H # 会显示出占cpu高的线程,并看到线程id
PID USER PR NI VIRT RES SHR S %CPU %MEM TIME+ COMMAND
109995 root 20 0 20.5g 431m 3284 S 300.3 0.3 1:56.64 python
106677 root 20 0 20.5g 431m 3284 S 1.0 0.3 1:57.19 python
107572 root 20 0 20.5g 431m 3284 S 1.0 0.3 1:57.82 python
108507 root 20 0 20.5g 431m 3284 S 1.0 0.3 1:56.50 python
108549 root 20 0 20.5g 431m 3284 S 1.0 0.3 1:56.24 python
可以看到第一个是最高的,线程id=109995(虽然这一列标记为PID,但在 -H 模式下它显示的确实是线程id)
进入gdb调试中
gdb # 进入gdb
(gdb) thread 109995 # 看看这个线程做了哪些事
Thread 109995 ():
#0 0x00007f5d95225930 in sem_wait () from /lib64/libpthread.so.0
#1 0x00000000004de048 in PyThread_acquire_lock ()
#2 0x00000000004a3034 in PyEval_RestoreThread ()
#3 0x00007f5d8e557aa6 in sock_recv_guts () from /usr/local/lib/python2.7/lib-dynload/_socket.so
#4 0x00007f5d8e557c5e in sock_recv () from /usr/local/lib/python2.7/lib-dynload/_socket.so
#5 0x00000000004a90ac in PyEval_EvalFrameEx ()
#6 0x00000000004aa0e6 in PyEval_EvalFrameEx ()
#7 0x00000000004aa0e6 in PyEval_EvalFrameEx ()
#8 0x00000000004aab07 in PyEval_EvalCodeEx ()
#9 0x000000000050e21e in function_call ()
#10 0x00000000004199c7 in PyObject_Call ()
#11 0x000000000042256f in instancemethod_call ()
#12 0x00000000004199c7 in PyObject_Call ()
#13 0x00000000004a2843 in PyEval_CallObjectWithKeywords ()
#14 0x0000000000424836 in PyInstance_New ()
#15 0x00000000004199c7 in PyObject_Call ()
#16 0x00007f5d948e18fd in clone () from /lib64/libc.so.6
quit
在这里我看到了 PyThread_acquire_lock (),感觉是程序里调用线程锁的那部分代码导致的,所以据此定位代码,并且根据业务的实际情况找到问题点,得以解决。
pstack 进程id
会列出这个进程下所有线程当前的调用栈(即每个线程正在执行的函数)