结论: 在Linux上,libpcap(1.0.0及以后的版本)在内核支持的情况下默认使用mmap方式(PACKET_RX_RING)读取包,所以一般不需要过多地去考虑抓包的效率问题,内部已经做得很好了。
ref: http://www.diybl.com/course/3_program/c++/cppxl/20100408/202002_3.html
在libpcap-1.0.0中引入了zerocopy BPF,那么这个zerocopy BPF又是什么呢?答案是:PACKET_MMAP。
查看两个版本libpcap编译的程序的strace的差异,除了poll之外,对于setsockopt还有一个差异:
setsockopt(4, SOL_PACKET, PACKET_RX_RING, "\0@\0\0\376\0\0\0@ \0\0\376\0\0\0", 16) = 0
mmap2(NULL, 4161536, PROT_READ|PROT_WRITE, MAP_SHARED, 4, 0) = 0xb7a54000
从字面上来猜猜看:setsockopt设置socket的PACKET_RX_RING选项,至于这个选项是做什么的,只能够猜测是一个接收环形缓冲区相关的东西,具体其他的要看其他的参数了。
mmap2将一段内核空间地址映射到用户空间,这样用户空间就可以直接操作内核缓冲区中的数据了,至于内核缓冲区中的数据如何来的,就是所谓的zerocopy BPF底层实现的了。
这个zerocopy叫做PACKET_MMAP,之前也叫做PACKET_RING,查看kernel的config文件的话是:
CONFIG_PACKET_MMAP=y【viktor:3.1版本内核里面没有这个选项,它已经被下面这个内核提交删除了:
packet: Kill CONFIG_PACKET_MMAP.
Early on this was an experimental facility that few
people other than Alexey Kuznetsov played with.
Now it's a pretty fundamental thing and as people add
more features to AF_PACKET sockets this config options
creates ifdef spaghetti.
So kill it off.
】
以前的时候有一个专门的PACKET_MMAP版本的libpcap,但是在libpcap-1.0.0中已经增加了部分平台的PACKET_MMAP/PACKET_RING支持。
以上为转载。
源码分析:libpcap 1.3.0-pre-git
pcap_open_live调用的……pcap-linux.c: pcap_activate_linux
/*
 * Activate a Linux capture handle (excerpt from pcap-linux.c).
 *
 * NOTE: this is an abridged quotation. The "......" lines mark code that
 * the note's author left out, and the function is cut off before its end.
 *
 * As shown here, the function:
 *   1. installs the non-mmap operation callbacks on the handle,
 *   2. calls activate_new() and jumps to "fail" if it returns < 0,
 *   3. if activate_new() returned 1, calls activate_mmap(); when that
 *      returns 1 the function returns immediately with the status that
 *      activate_mmap() stored, and when it returns 0 it continues with
 *      non-memory-mapped access.
 */
static int
pcap_activate_linux(pcap_t *handle)
{
const char *device;
int status = 0;
//viktor: first install the ordinary (non-mmap) operations; activate_mmap() replaces several of them if it succeeds.
device = handle->opt.source;
handle->inject_op = pcap_inject_linux;
handle->setfilter_op = pcap_setfilter_linux;
handle->setdirection_op = pcap_setdirection_linux;
handle->set_datalink_op = NULL; /* can't change data link type */
handle->getnonblock_op = pcap_getnonblock_fd;
handle->setnonblock_op = pcap_setnonblock_fd;
handle->cleanup_op = pcap_cleanup_linux;
handle->read_op = pcap_read_linux;
handle->stats_op = pcap_stats_linux;
......
/*
* viktor: activate_new activates the PF_PACKET connection. Older kernels
* use SOCK_PACKET, which is kept for compatibility. All error-checking
* code has been removed from this excerpt.
*/
status = activate_new(handle);
if (status < 0) {
goto fail;
}
if (status == 1) {
/*
* Success.
* Try to use memory-mapped access.
*/
switch (activate_mmap(handle, &status)) { // <<----------------------------- here: the mmap path is attempted
case 1:
/*
* We succeeded. status has been
* set to the status to return,
* which might be 0, or might be
* a PCAP_WARNING_ value.
*/
return status;
case 0:
/*
* Kernel doesn't support it - just continue
* with non-memory-mapped access.
*/
break;
......
pcap-linux.c: activate_mmap
#ifdef HAVE_PACKET_RING
/*
 * Switch a capture handle over to memory-mapped (ring buffer) access.
 *
 * viktor: returns 1 on success, 0 if not supported, -1 on error.
 * All error-checking code has been removed from the excerpt below, so as
 * shown here the results of malloc(), prepare_tpacket_socket() and
 * create_ring() are never inspected and the function always returns 1.
 *
 * handle: the capture handle being activated.
 * status: passed through to create_ring(), which stores the status to
 *         report to the caller.
 */
static int
activate_mmap(pcap_t *handle, int *status)
{
int ret;
/* buffer of snapshot bytes; its result is unchecked in this excerpt */
handle->md.oneshot_buffer = malloc(handle->snapshot);
if (handle->opt.buffer_size == 0) {
/* by default request 2M for the ring buffer */
handle->opt.buffer_size = 2*1024*1024;
}
ret = prepare_tpacket_socket(handle);
ret = create_ring(handle, status); // <<----------------------------- here: the ring is created and mmap()ed
/*
* Success. Replace the operations with their mmap variants......
*/
handle->read_op = pcap_read_linux_mmap;
handle->cleanup_op = pcap_cleanup_linux_mmap;
handle->setfilter_op = pcap_setfilter_linux_mmap;
handle->setnonblock_op = pcap_setnonblock_mmap;
handle->getnonblock_op = pcap_getnonblock_mmap;
handle->oneshot_callback = pcap_oneshot_mmap;
handle->selectable_fd = handle->fd;
return 1;
}
#else /* HAVE_PACKET_RING */
/*
 * Built without HAVE_PACKET_RING: always return 0, which the caller
 * treats as "not supported" and continues with non-memory-mapped access.
 */
static int
activate_mmap(pcap_t *handle _U_, int *status _U_)
{
return 0;
}
#endif /* HAVE_PACKET_RING */
pcap-linux.c: create_ring
/*
 * Attempt to set up memory-mapped access.
 * (The rest of the upstream comment is elided in this excerpt.)
 *
 * NOTE: this is an abridged quotation of pcap-linux.c. The dotted lines
 * and the "Line #NNNN" markers are the note author's elision markers, not
 * code.
 *
 * handle: capture handle whose socket (handle->fd) gets the RX ring.
 * status: set to 0 on entry and to PCAP_ERROR on failure.
 *
 * Returns 1 on success, 0 if setsockopt() fails with ENOPROTOOPT (no ring
 * buffer support in this kernel), and -1 on any other failure, with a
 * message left in handle->errbuf.
 *
 * On success handle->md.mmapbuf/mmapbuflen describe the mapped ring,
 * handle->buffer holds handle->cc frame pointers, handle->bufsize is the
 * frame size and handle->offset is 0.
 */
static int
create_ring(pcap_t *handle, int *status)
{
	unsigned i, j, frames_per_block;
	struct tpacket_req req;
	socklen_t len;
	unsigned int sk_type, tp_reserve, maclen, tp_hdrlen, netoff, macoff;
	unsigned int frame_size;
	/*
	 * Start out assuming no warnings or errors.
	 */
	*status = 0;
.....................
Line #3471
	req.tp_frame_size = TPACKET_ALIGN(macoff + frame_size);
	req.tp_frame_nr = handle->opt.buffer_size/req.tp_frame_size;
	/* compute the minimum block size that will handle this frame.
	 * The block has to be page size aligned.
	 * The max block size allowed by the kernel is arch-dependent and
	 * it's not explicitly checked here. */
	req.tp_block_size = getpagesize();
	while (req.tp_block_size < req.tp_frame_size)
		req.tp_block_size <<= 1;
	frames_per_block = req.tp_block_size/req.tp_frame_size;
.....................
Line #3588
retry:
	req.tp_block_nr = req.tp_frame_nr / frames_per_block;
	/* req.tp_frame_nr is requested to match frames_per_block*req.tp_block_nr */
	req.tp_frame_nr = req.tp_block_nr * frames_per_block;
	if (setsockopt(handle->fd, SOL_PACKET, PACKET_RX_RING, //<<------------------------ here: ask the kernel for the RX ring
	    (void *) &req, sizeof(req))) {
		if ((errno == ENOMEM) && (req.tp_block_nr > 1)) {
			/*
			 * Memory failure; try to reduce the requested ring
			 * size.
			 *
			 * We used to reduce this by half -- do 5% instead.
			 * That may result in more iterations and a longer
			 * startup, but the user will be much happier with
			 * the resulting buffer size.
			 */
			if (req.tp_frame_nr < 20)
				req.tp_frame_nr -= 1;
			else
				req.tp_frame_nr -= req.tp_frame_nr/20;
			goto retry;
		}
		if (errno == ENOPROTOOPT) {
			/*
			 * We don't have ring buffer support in this kernel.
			 */
			return 0;
		}
		snprintf(handle->errbuf, PCAP_ERRBUF_SIZE,
		    "can't create rx ring on packet socket: %s",
		    pcap_strerror(errno));
		*status = PCAP_ERROR;
		return -1;
	}
	/* memory map the rx ring */
	handle->md.mmapbuflen = req.tp_block_nr * req.tp_block_size;
	handle->md.mmapbuf = mmap(0, handle->md.mmapbuflen, // <<----------------------------- here: map the ring into user space
	    PROT_READ|PROT_WRITE, MAP_SHARED, handle->fd, 0);
	if (handle->md.mmapbuf == MAP_FAILED) {
		snprintf(handle->errbuf, PCAP_ERRBUF_SIZE,
		    "can't mmap rx ring: %s", pcap_strerror(errno));
		/* clear the allocated ring on error*/
		destroy_ring(handle);
		*status = PCAP_ERROR;
		return -1;
	}
	/* allocate a ring for each frame header pointer*/
	handle->cc = req.tp_frame_nr;
	handle->buffer = malloc(handle->cc * sizeof(union thdr *));
	if (!handle->buffer) {
		snprintf(handle->errbuf, PCAP_ERRBUF_SIZE,
		    "can't allocate ring of frame headers: %s",
		    pcap_strerror(errno));
		destroy_ring(handle);
		*status = PCAP_ERROR;
		return -1;
	}
	/*
	 * fill the header ring with proper frame ptr
	 *
	 * Walk the mapping block by block. Each block contributes
	 * frames_per_block frame pointers, spaced tp_frame_size bytes apart
	 * from the start of the block. handle->offset is the running slot
	 * index; RING_GET_FRAME(handle) presumably addresses
	 * handle->buffer[handle->offset] (the macro is defined outside this
	 * excerpt -- confirm).
	 */
	handle->offset = 0;
	for (i=0; i<req.tp_block_nr; ++i) {
		void *base = &handle->md.mmapbuf[i*req.tp_block_size];
		for (j=0; j<frames_per_block; ++j, ++handle->offset) {
			RING_GET_FRAME(handle) = base;
			base += req.tp_frame_size;
		}
	}
	handle->bufsize = req.tp_frame_size;
	handle->offset = 0;
	return 1;
}
我自己的strace结果:
bind(3, {sa_family=AF_PACKET, proto=0x03, if3, pkttype=PACKET_HOST, addr(0)={0, }, 20) = 0
getsockopt(3, SOL_SOCKET, SO_ERROR, [0], [4]) = 0
setsockopt(3, SOL_PACKET, PACKET_AUXDATA, [1], 4) = 0
getsockopt(3, SOL_PACKET, PACKET_HDRLEN, [28], [4]) = 0
setsockopt(3, SOL_PACKET, PACKET_VERSION, [1], 4) = 0
setsockopt(3, SOL_PACKET, PACKET_RESERVE, [4], 4) = 0
setsockopt(3, SOL_PACKET, PACKET_RX_RING, {block_size=131072, block_nr=31, frame_size=65600, frame_nr=31}, 16) = 0
mmap2(NULL, 4063232, PROT_READ|PROT_WRITE, MAP_SHARED, 3, 0) = 0xb714d000 //<<----------------这里
write(2, "capture_short.cpp:42: Start capt"..., 47capture_short.cpp:42: Start capture on ��...
) = 47
之后就是抓包-读时间-输出的循环:
poll([{fd=3, events=POLLIN}], 1, 1000) = 1 ([{fd=3, revents=POLLIN}])
stat64("/etc/localtime", {st_mode=S_IFREG|0644, st_size=834, ...}) = 0
write(2, "capture_short.cpp:46: 10:16:29,4"..., 49capture_short.cpp:46: 10:16:29,483605 us, len:79
) = 49