使用ULOGD打SYSLOG到SYSLOG-NG,当NFLOG拿到的数据包在6K左右时,CPU有两个核心占了15%左右
使用修改过的ULOGD直接打TCP,CPU只有一个核心占15%左右(修改版在https://github.com/InsZVA/ulogd-with-syslog-tcp)
使用perf top查看,
8.89% libc-2.12.so [.] vfprintf
3.49% libc-2.12.so [.] _IO_default_xsputn
2.10% [kernel] [k] find_busiest_group
1.94% libc-2.12.so [.] _IO_vfscanf
1.70% libc-2.12.so [.] __strchrnul
1.66% libpython2.7.so.1.0 [.] PyEval_EvalFrameEx
1.64% libc-2.12.so [.] _itoa_word
1.45% [kernel] [k] schedule
1.13% [kernel] [k] ixgbe_clean_rx_irq
1.08% ulogd [.] ulogd_propagate_results
1.00% [kernel] [k] _spin_lock
0.98% [kernel] [k] ipt_do_table
0.97% [kernel] [k] net_rx_action
0.83% [kernel] [k] memcpy
0.76% [kernel] [k] _spin_unlock_irqrestore
0.72% [kernel] [k] menu_select
0.71% libc-2.12.so [.] _IO_str_init_static_internal
0.70% [kernel] [k] irq_entries_start
0.65% libc-2.12.so [.] _IO_vsprintf
0.59% [kernel] [k] hrtimer_interrupt
0.57% [kernel] [k] __hrtimer_start_range_ns
0.55% [kernel] [k] ixgbe_poll
0.54% [kernel] [k] rb_erase
0.54% libc-2.12.so [.] _IO_old_init
0.53% [kernel] [k] find_next_bit
0.53% [kernel] [k] getnstimeofday
0.49% [kernel] [k] _spin_lock_irqsave
0.47% [kernel] [k] ktime_get_real
0.47% [kernel] [k] tick_nohz_stop_sched_tick
0.46% libpython2.7.so.1.0 [.] lookdict_string
0.46% [kernel] [k] nf_conntrack_in
0.45% traffic_server [.] NetHandler::mainNetEvent
0.45% [kernel] [k] native_read_tsc
0.45% [kernel] [k] kfree
0.43% libc-2.12.so [.] sprintf
0.42% [kernel] [k] __audit_syscall_exit
0.42% [kernel] [k] __switch_to
0.40% [kernel] [k] ip_route_input
0.40% [kernel] [k] __remove_hrtimer
0.38% [kernel] [k] cpumask_next_and
0.37% libc-2.12.so [.] __offtime
0.37% libc-2.12.so [.] free
大量CPU消耗在标准输入输出库函数中,那么看来应该是解析字符串的问题了。
stack=log3:NFLOG,base1:BASE,ifi1:IFINDEX,ip2str1:IP2STR,print1:PRINTPKT,sys1:SYSLOGTCP
写了个代码测sprintf的性能:
#include <stdio.h>
#include <stdlib.h>
#include <sys/time.h>
#define NULL 0
#define TIME_ELAPSED(codeToTime) do{ \
struct timeval beginTime, endTime; \
gettimeofday(&beginTime, NULL); \
{codeToTime;} \
gettimeofday(&endTime, NULL); \
long secTime = endTime.tv_sec - beginTime.tv_sec; \
long usecTime = endTime.tv_usec - beginTime.tv_usec; \
printf("[%s(%d)]Elapsed Time: SecTime = %lds, UsecTime = %ldus!\n", __FILE__, __LINE__, secTime, usecTime); \
}while(0)
uint16_t randu16() {
return (uint16_t)rand();
}
uint32_t randu32() {
return (uint32_t)rand();
}
char* f_puts(char* buf, const char* data) {
while((*buf++ = *data++));
return --buf;
}
char* f_putn(char* buf, int n) {
while (n) {
int digit = n % 10;
*buf++ = (char)('0' + digit);
n /= 10;
}
return --buf;
}
int optimized() {
char buf[1024];
char* buf_cur = buf;
buf_cur = f_puts(buf_cur, "PROTO=TCP SPT=");
buf_cur = f_putn(buf_cur, randu16());
buf_cur = f_puts(buf_cur, " DPT=");
buf_cur = f_putn(buf_cur, randu16());
buf_cur = f_puts(buf_cur, " SEQ=");
buf_cur = f_putn(buf_cur, randu32());
buf_cur = f_puts(buf_cur, " ACK=");
buf_cur = f_putn(buf_cur, randu32());
buf_cur = f_puts(buf_cur, " WINDOW=");
buf_cur = f_putn(buf_cur, randu16());
buf_cur = f_puts(buf_cur, " URG ACK PSH RST SYN FIN URGP=");
buf_cur = f_putn(buf_cur, randu16());
buf_cur = f_puts(buf_cur, " ");
return (int)(buf_cur - buf);
}
int work() {
char buf[1024];
char* buf_cur = buf;
buf_cur += sprintf(buf_cur, "PROTO=TCP ");
buf_cur += sprintf(buf_cur, "SPT=%u DPT=%u ",
randu16(),
randu16());
/* FIXME: config */
buf_cur += sprintf(buf_cur, "SEQ=%u ACK=%u ",
randu32(),
randu32());
buf_cur += sprintf(buf_cur, "WINDOW=%u ",
randu16());
buf_cur += sprintf(buf_cur, "URG ");
buf_cur += sprintf(buf_cur, "ACK ");
buf_cur += sprintf(buf_cur, "PSH ");
buf_cur += sprintf(buf_cur, "RST ");
buf_cur += sprintf(buf_cur, "SYN ");
buf_cur += sprintf(buf_cur, "FIN ");
buf_cur += sprintf(buf_cur, "URGP=%u ",
randu16());
return (int)(buf_cur - buf);
}
int main() {
TIME_ELAPSED(
for (int i = 0; i < 1000; i++)
work();
);
TIME_ELAPSED(
for (int i = 0; i < 1000; i++)
optimized();
);
return 0;
}
结果:
[main.c(99)]Elapsed Time: SecTime = 0s, UsecTime = 1158us!
[main.c(103)]Elapsed Time: SecTime = 0s, UsecTime = 159us!
注意:
千万当心itoa、sprinf等库函数,性能可能低的可怕。