虚拟网络设备tun多用于网络数据的转发,常在vpn等软件中使用,是vpn软件中的关键部分,其数据包的读写速度将会直接影响vpn软件的网络速度和流量。在linux内核3.0以前,tun设备的读写都是以一次一包的方式进行,数据量巨大的时候就会导致对tun设备的频繁读操作,同时还需要大量系统信号来提示上层有数据需要读,这就使得tun的读操作占用大量的系统资源,拖慢软件速度。好在这一弊端在linux内核升级到3.0后得到了解决:采用mmap的读写方式可以一次性将大量的数据包从tun设备层读到应用层,成功地解决了一次只能读一个包的问题,使得软件在这一环节的系统消耗大大降低。
首先我们创建一个RAW socket,然后调用setsockopt指定socket使用TPACKET_V3版本(该版本只有内核3.0以上的linux系统才支持),接着对ring->req进行赋值,指定帧的大小和数量、块的大小和数量及超时时间(单位:毫秒),然后调用mmap进行内存映射,映射成功之后为每个块分配对应的iovec空间,之后调用bind将fd绑定到指定网卡,最后调用setsockopt设置fd的fanout(分流)模式,具体代码如下:
/*
 * Create an AF_PACKET raw socket configured for a TPACKET_V3 mmap'ed
 * RX ring, map the ring into user space, build the per-block iovec
 * table, bind the socket to the interface named by `netdev`, and join
 * the fanout group described by the file-scope fanout_id/fanout_type.
 *
 * Returns the socket fd; exits the process on any setup failure.
 */
static int setup_socket(struct ring *ring, char *netdev)
{
	int err, i, fd, v = TPACKET_V3;
	struct sockaddr_ll ll;
	unsigned int blocksiz = 1 << 17, framesiz = 1 << 11;
	unsigned int blocknum = 64;
	unsigned int ifindex;
	int fanout_arg;

	fd = socket(AF_PACKET, SOCK_RAW, htons(ETH_P_ALL));
	if (fd < 0) {
		perror("socket");
		exit(1);
	}

	/* TPACKET_V3 must be selected before PACKET_RX_RING is configured. */
	err = setsockopt(fd, SOL_PACKET, PACKET_VERSION, &v, sizeof(v));
	if (err < 0) {
		perror("setsockopt(PACKET_VERSION)");
		exit(1);
	}

	memset(&ring->req, 0, sizeof(ring->req));
	ring->req.tp_block_size = blocksiz;
	ring->req.tp_frame_size = framesiz;
	ring->req.tp_block_nr = blocknum;
	ring->req.tp_frame_nr = (blocksiz * blocknum) / framesiz;
	ring->req.tp_retire_blk_tov = 60;	/* block retire timeout in ms */
	ring->req.tp_feature_req_word = TP_FT_REQ_FILL_RXHASH;

	err = setsockopt(fd, SOL_PACKET, PACKET_RX_RING, &ring->req,
			 sizeof(ring->req));
	if (err < 0) {
		perror("setsockopt(PACKET_RX_RING)");
		exit(1);
	}

	ring->map = mmap(NULL,
			 (size_t) ring->req.tp_block_size * ring->req.tp_block_nr,
			 PROT_READ | PROT_WRITE, MAP_SHARED | MAP_LOCKED, fd, 0);
	if (ring->map == MAP_FAILED) {
		perror("mmap");
		exit(1);
	}

	/*
	 * One iovec per block so the reader can address blocks by index.
	 * Check malloc explicitly: an assert() disappears under NDEBUG.
	 */
	ring->rd = malloc(ring->req.tp_block_nr * sizeof(*ring->rd));
	if (!ring->rd) {
		perror("malloc");
		exit(1);
	}
	for (i = 0; i < ring->req.tp_block_nr; ++i) {
		/* Cast to char *: arithmetic on void * is a GNU extension. */
		ring->rd[i].iov_base = (char *) ring->map +
				       (size_t) i * ring->req.tp_block_size;
		ring->rd[i].iov_len = ring->req.tp_block_size;
	}

	/* if_nametoindex() returns 0 for an unknown interface; binding to
	 * ifindex 0 would silently capture on ALL interfaces. */
	ifindex = if_nametoindex(netdev);
	if (ifindex == 0) {
		perror("if_nametoindex");
		exit(1);
	}

	memset(&ll, 0, sizeof(ll));
	ll.sll_family = PF_PACKET;
	ll.sll_protocol = htons(ETH_P_ALL);
	ll.sll_ifindex = ifindex;
	ll.sll_hatype = 0;
	ll.sll_pkttype = 0;
	ll.sll_halen = 0;

	err = bind(fd, (struct sockaddr *) &ll, sizeof(ll));
	if (err < 0) {
		perror("bind");
		exit(1);
	}

	/* Join the fanout group so multiple sockets can share the load. */
	fanout_arg = (fanout_id | (fanout_type << 16));
	err = setsockopt(fd, SOL_PACKET, PACKET_FANOUT,
			 &fanout_arg, sizeof(fanout_arg));
	if (err) {
		perror("setsockopt(PACKET_FANOUT)");
		exit(1);
	}

	return fd;
}
将创建的fd放到poll中进行监听,当ring->req设置的超时到期或者读取的数据包装满一个块时,fd被通知可读,此时我们就可以遍历ring->rd空间,查看是否有可读取的数据包,如果有,则拷贝出来,同时将该块归还给内核(置为内核可用);如果没有则继续等待,代码如下:
/*
 * Walk every packet in a retired RX-ring block and hand each
 * tpacket3_hdr to display().  Accumulates the global packets_total and
 * bytes_total counters.
 *
 * pbd       - start of the block; TP_STATUS_USER must already be set
 * block_num - block index, used only in the trace message
 */
static void walk_block(struct block_desc *pbd, const int block_num)
{
	int num_pkts = pbd->h1.num_pkts, i;
	unsigned long bytes = 0;
	struct tpacket3_hdr *ppd;

	/* pthread_t has no portable printf conversion; printing it with a
	 * mismatched specifier is undefined behavior, so cast explicitly. */
	printf("walk_block start:%d ,pthreadId:%lu\n", block_num,
	       (unsigned long) pthread_self());

	/* The first packet header lives at a block-relative offset. */
	ppd = (struct tpacket3_hdr *) ((uint8_t *) pbd +
				       pbd->h1.offset_to_first_pkt);
	for (i = 0; i < num_pkts; ++i) {
		bytes += ppd->tp_snaplen;
		display(ppd);
		/* tp_next_offset chains packet headers within the block. */
		ppd = (struct tpacket3_hdr *) ((uint8_t *) ppd +
					       ppd->tp_next_offset);
	}

	packets_total += num_pkts;
	bytes_total += bytes;
}
/*
 * Main receive loop: consume TPACKET_V3 ring blocks in order.
 *
 * Blocks retire in sequence, so only the block at the current cursor
 * needs checking.  If its status word shows TP_STATUS_USER the block is
 * walked, handed back to the kernel via flush_block(), and the cursor
 * advances; otherwise we block in poll() until the kernel retires it
 * (full block or tp_retire_blk_tov timeout).  On SIGINT the loop exits
 * and ring statistics are printed before teardown.
 */
void read_tun()
{
	struct block_desc *pbd;
	struct tpacket_stats_v3 stats;
	/* `blocks` must match blocknum in setup_socket(). */
	unsigned int block_num = 0, blocks = 64;
	int fd, err;
	socklen_t len;
	struct ring ring;
	struct pollfd pfd;

	memset(&ring, 0, sizeof(ring));
	fd = setup_socket(&ring, dev);
	assert(fd > 0);

	memset(&pfd, 0, sizeof(pfd));
	pfd.fd = fd;
	pfd.events = POLLIN | POLLERR;
	pfd.revents = 0;

	while (likely(!sigint)) {
		pbd = (struct block_desc *) ring.rd[block_num].iov_base;
		if ((pbd->h1.block_status & TP_STATUS_USER) == 0) {
			/* Block still owned by the kernel: sleep until it
			 * retires.  A single poll here suffices; the old
			 * code also polled at the top of the loop, waiting
			 * twice per block for no benefit. */
			poll(&pfd, 1, -1);
			continue;
		}
		walk_block(pbd, block_num);
		flush_block(pbd);	/* return the block to the kernel */
		block_num = (block_num + 1) % blocks;
	}

	len = sizeof(stats);
	err = getsockopt(fd, SOL_PACKET, PACKET_STATISTICS, &stats, &len);
	if (err < 0) {
		perror("getsockopt");
		exit(1);
	}

	fflush(stdout);
	printf("\nReceived %u packets, %lu bytes, %u dropped, freeze_q_cnt: %u\n",
	       stats.tp_packets, bytes_total, stats.tp_drops,
	       stats.tp_freeze_q_cnt);

	teardown_socket(&ring, fd);
}
如此我们就能实现一次系统通知、循环读取多个数据包,大大地减少系统消耗。