用raw_socket实现多线程icmp收发包遇到的坑

一、问题描述

因为工作需要,需要同时对多台主机进行ping探测,计算时延。不像tcp和udp,icmp报文需要自己手动构造报文头。简单的demo代码如下(python实现):

import os
import threading
import time
import socket
import struct
import select


## 这部分为IP报文头解析部分,非必要部分,为了后面调试用 ##
class IpHeader(object):
    """Holds the ten fixed fields of a 20-byte IPv4 header (debugging aid).

    All fields start as None and are filled in by decodeIP(). Attribute names
    are kept exactly as the rest of the file spells them.
    """

    # struct layout of the fixed IPv4 header, network byte order.
    _LAYOUT = '>BBHHHBBHII'

    # Attribute names in wire order, matching _LAYOUT one to one.
    _FIELD_NAMES = (
        'versionAndLen',   # 4bit version + 4bit header length
        'tos',             # 8bit type of service
        'tolalLen',        # 16bit total length
        'identification',  # 16bit header identification
        'fragment',        # 16bit flags and fragment offset
        'ttl',             # 8bit time to live
        'protocal',        # 8bit protocol number
        'checksum',        # 16bit header checksum
        'srcIP',           # 32bit source IP address
        'dstIP',           # 32bit destination IP address
    )

    def __init__(self):
        for field_name in self._FIELD_NAMES:
            setattr(self, field_name, None)

    def decodeIP(self, code_stream):
        """Unpack a 20-byte header from ``code_stream`` into the attributes."""
        values = struct.unpack(self._LAYOUT, code_stream)
        for field_name, value in zip(self._FIELD_NAMES, values):
            setattr(self, field_name, value)

    def IntToIP(self, ipInt):
        """Return the dotted-quad string for a 32-bit host-order integer."""
        packed = struct.pack('I', socket.htonl(ipInt))
        return socket.inet_ntoa(packed)

    def __str__(self):
        template = ''.join((
            'IP header:\n',
            'Version and Length:{}\n',
            'Type of service:{}\n',
            'Total length:{}\n',
            'Header identification:{}\n',
            'Fragment offset:{}\n',
            'Time to live(TTL):{}\n',
            'Type of protocal:{}\n',
            'Header checksum:{}\n',
            'Source IP address:{}\n',
            'Destination IP address:{}\n',
        ))
        return template.format(
            self.versionAndLen,
            self.tos,
            self.tolalLen,
            self.identification,
            self.fragment,
            self.ttl,
            self.protocal,
            self.checksum,
            self.IntToIP(self.srcIP),
            self.IntToIP(self.dstIP),
        )

## Ping delay of each machine, in milliseconds, keyed by the Pinger's uuid
netDelay = {} 
# ICMP message type of an echo request.
ICMP_ECHO_REQUEST = 8
# Protocol number for "icmp" as reported by the system protocol database;
# passed as the protocol argument when the raw socket is created.
ICMP_PROTOCAL_NUM = socket.getprotobyname("icmp") 
class Pinger:
    """Ping one host over a raw ICMP socket and record the round-trip delay.

    The instance's ``uuid`` is embedded in the echo payload and is the key
    under which the measured delay (milliseconds) is stored in the
    module-level ``netDelay`` dict.
    """

    # ICMP message type of an echo reply.
    _ICMP_ECHO_REPLY = 0

    def __init__(self, uuid, target_host, timeout=2):
        """Create a pinger.

        Args:
            uuid: Text identifier of the target; copied into the payload.
            target_host: Destination address handed to ``sendto()``.
            timeout: Seconds to wait for the reply. ``send_ping`` reads this
                attribute, which was previously never set.
        """
        self.uuid = uuid
        self.target_host = target_host
        self.timeout = timeout

    def do_checksum(self, source_string):
        """Return the 16-bit one's-complement Internet checksum of a buffer.

        Args:
            source_string: ``bytes`` to checksum; any length, including odd
                lengths and the empty buffer.

        Returns:
            The checksum as an int in the range 0..0xffff, byte-swapped the
            same way as before so that callers can keep applying
            ``socket.htons`` to it.
        """
        total = 0
        # Integer division: with "/" the bound became a float equal to the
        # full length, so odd-sized buffers indexed one byte past the end.
        even_len = (len(source_string) // 2) * 2
        for offset in range(0, even_len, 2):
            total += source_string[offset + 1] * 256 + source_string[offset]
        if even_len < len(source_string):
            # Trailing odd byte; indexing bytes already yields an int.
            total += source_string[-1]
        # Fold the carries back into the low 16 bits.
        while total >> 16:
            total = (total >> 16) + (total & 0xffff)
        answer = ~total & 0xffff
        return answer >> 8 | (answer << 8 & 0xff00)

    def send_ping(self, ID):
        """Send one echo request and wait for the reply carrying ``ID``.

        On success the delay is stored in ``netDelay[self.uuid]`` and printed.

        Args:
            ID: 16-bit value placed in the ICMP identifier field.

        Returns:
            ``None`` when a matching reply was received; ``self.timeout``
            when no reply arrived in time or the request could not be sent.
        """
        ## Build the ICMP header ##
        my_checksum = 0
        # Dummy header with a zero checksum; the sequence number is 1.
        header = struct.pack("bbHHh", ICMP_ECHO_REQUEST, 0, my_checksum, ID, 1)
        bytes_In_double = struct.calcsize("d")
        # Pad the uuid with "X" so that timestamp + text is 64 bytes.
        data = self.uuid + (64 - bytes_In_double - len(self.uuid)) * "X"
        data = struct.pack("d", time.time()) + bytes(data, encoding="utf8")
        # Checksum over the dummy header plus the data, then rebuild.
        my_checksum = self.do_checksum(header + data)
        header = struct.pack(
            "bbHHh", ICMP_ECHO_REQUEST, 0, socket.htons(my_checksum), ID, 1
        )
        packet = header + data

        try:
            sock = socket.socket(socket.AF_INET, socket.SOCK_RAW, ICMP_PROTOCAL_NUM)
        except OSError as e:
            # Without a socket there is nothing to wait on.
            print(e)
            return self.timeout

        try:
            try:
                # NOTE: the socket is not connect()ed, so it carries no
                # remote address and replies are told apart only by the
                # ICMP identifier checked below.
                sock.sendto(packet, (self.target_host, 1))
            except OSError as e:
                print(e)
                return self.timeout

            # One overall deadline, so unrelated packets cannot restart
            # the full timeout on every loop iteration.
            deadline = time.monotonic() + self.timeout
            while True:
                remaining = deadline - time.monotonic()
                if remaining <= 0:
                    return self.timeout
                readable = select.select([sock], [], [], remaining)
                if readable[0] == []:  # Timeout
                    return self.timeout
                time_received = time.time()
                recv_packet, _ = sock.recvfrom(1024)
                if len(recv_packet) < 28:
                    # Too short to hold an IP header plus an ICMP header.
                    continue
                ## The next three lines print the IP header of the packet
                ipHeader = IpHeader()
                ipHeader.decodeIP(recv_packet[0:20])
                print(ipHeader)
                # NOTE(review): assumes a 20-byte IP header without options.
                icmp_header = recv_packet[20:28]
                icmp_type, _, _, packet_ID, _ = struct.unpack(
                    "bbHHh", icmp_header
                )
                # Skip anything that is not an echo reply for our ID, e.g. a
                # looped-back copy of our own request.
                if icmp_type != self._ICMP_ECHO_REPLY or packet_ID != ID:
                    continue
                payload = recv_packet[28:]
                if len(payload) < bytes_In_double:
                    continue
                time_sent = struct.unpack("d", payload[:bytes_In_double])[0]
                uuid_len = len(self.uuid)
                packet_uuid = payload[bytes_In_double:bytes_In_double + uuid_len]
                netDelay[self.uuid] = (time_received - time_sent) * 1000.0
                ## Print the delay measured for this machine ##
                print(packet_uuid, netDelay[self.uuid])
                return
        finally:
            # Runs on every exit path, so the socket no longer leaks.
            sock.close()

    def ping(self):
        """Ping the target once, using the low 16 bits of the pid as ICMP id."""
        my_ID = os.getpid() & 0xFFFF
        self.send_ping(my_ID)


if __name__ == '__main__':
    ## Demo: three ping threads, one per target host ##
    targets = (
        ('07fd057f-e116-46b8-a18d-eb5ce3a2cdf8', '10.243.4.91'),
        ('c17c5b9f-63f1-4131-a706-b70bd6dec3a1', '10.243.4.5'),
        ('c17c5b9f-63f1-4131-a706-b70bd6dec555', '10.57.36.19'),
    )
    pingers = [Pinger(host_uuid, host_addr) for host_uuid, host_addr in targets]

    # Create and start the threads in order, then wait for all of them.
    workers = []
    for pinger in pingers:
        worker = threading.Thread(target=pinger.ping)
        worker.start()
        workers.append(worker)
    for worker in workers:
        worker.join()

本文不赘述ping构造原理,而是主要介绍多线程收发ping包所遇到的问题。

运行后结果如下:

IP header:                  
Version and Length:69
Type of service:0
Total length:92
Header identification:35542
Fragment offset:0
Time to live(TTL):64
Type of protocal:1
Header checksum:37684
Source IP address:10.57.36.19
Destination IP address:10.57.36.18
b'c17c5b9f-63f1-4131-a706-b70bd6dec555' 0.35858154296875  ## 打印的延时信息

IP header:
Version and Length:69
Type of service:0
Total length:92
Header identification:35542
Fragment offset:0
Time to live(TTL):64
Type of protocal:1
Header checksum:37684
Source IP address:10.57.36.19
Destination IP address:10.57.36.18
b'c17c5b9f-63f1-4131-a706-b70bd6dec555' 0.37860870361328125  ## 打印的延时信息

IP header:
Version and Length:69
Type of service:0
Total length:92
Header identification:35542
Fragment offset:0
Time to live(TTL):64
Type of protocal:1
Header checksum:37684
Source IP address:10.57.36.19
Destination IP address:10.57.36.18
b'c17c5b9f-63f1-4131-a706-b70bd6dec555' 0.39696693420410156  ## 打印的延时信息

令人费解的事情发生了,收到的三个icmp响应报文头部竟然一模一样,都是从同一个ip地址发过来的,此处为10.57.36.19,本机地址为10.57.36.18。看了一下时延信息,远程机器的uuid一模一样,时延虽有一点区别,但是很小,大概只有0.02ms差距。为了确定是否是发送或者接收出了问题,先抓个包看看:

可以看出,在抓包结果上看不出任何问题,三个请求包,三个响应包。

 

二、问题分析

通过抓包结果可以判断:发包这个流程没有问题,因为对端主机收到报文并正确响应了,收到的包至少在协议栈收包之前也没有问题,因为在物理接口抓到了三个响应报文。那分析的重点就是协议栈处理流程了。

之前用tcp和udp收发包都没有这种问题,那raw_socket和tcp_socket、udp_socket有何区别呢?和tcp相比,代码中没有connect的过程,udp虽然没有显式的connect的过程,但会明确指定对端的端口号(本例中sendto函数虽然带了端口号为1,但是不起作用,可以随便写)。左思右想,觉得可能和socket匹配有关系。所以得先了解一下内核socket匹配原则。直接找到内核代码和raw_socket收包相关的部分:

/*
 * Hand an incoming IPv4 packet to the raw sockets chained in hash bucket
 * `hash`. Every socket returned by __raw_v4_lookup() that also passes the
 * icmp_filter() / ip_mc_sf_allow() checks receives its own clone of skb
 * through raw_rcv().
 *
 * Returns 1 if the lookup matched at least one socket (even when those
 * checks or a failed clone prevented the actual delivery), 0 otherwise.
 */
static int raw_v4_input(struct sk_buff *skb, const struct iphdr *iph, int hash)
{
	int sdif = inet_sdif(skb);
	struct sock *sk;
	struct hlist_head *head;
	int delivered = 0;
	struct net *net;

	read_lock(&raw_v4_hashinfo.lock);
	head = &raw_v4_hashinfo.ht[hash];
	if (hlist_empty(head))
		goto out;

	net = dev_net(skb->dev);
    // Key call: look up the first raw socket in the chain that matches this packet
	sk = __raw_v4_lookup(net, __sk_head(head), iph->protocol,
			     iph->saddr, iph->daddr,
			     skb->dev->ifindex, sdif);

	while (sk) {
		delivered = 1;
		if ((iph->protocol != IPPROTO_ICMP || !icmp_filter(sk, skb)) &&
		    ip_mc_sf_allow(sk, iph->daddr, iph->saddr,
				   skb->dev->ifindex, sdif)) {
			struct sk_buff *clone = skb_clone(skb, GFP_ATOMIC);

			/* Not releasing hash table! */
			if (clone)
				raw_rcv(sk, clone);
		}
        // Key call: continue the lookup from the socket after the current one
		sk = __raw_v4_lookup(net, sk_next(sk), iph->protocol,
				     iph->saddr, iph->daddr,
				     skb->dev->ifindex, sdif);
	}
out:
	read_unlock(&raw_v4_hashinfo.lock);
	return delivered;
}


 该函数有个while循环不断匹配的过程,匹配成功后,会走raw_rcv进行后面的处理。继续看__raw_v4_lookup函数:

/*
 * Starting at `sk`, walk the socket chain and return the first raw socket
 * that matches the packet described by the arguments, or NULL if none does.
 *
 * num is the protocol number; raddr/laddr are the addresses to compare with
 * the socket's inet_daddr/inet_rcv_saddr (raw_v4_input passes the packet's
 * source and destination address); dif/sdif are compared with the socket's
 * bound device index. A socket field that is zero is not compared at all,
 * so it matches any value.
 */
struct sock *__raw_v4_lookup(struct net *net, struct sock *sk,
			     unsigned short num, __be32 raddr, __be32 laddr,
			     int dif, int sdif)
{
	sk_for_each_from(sk) {
		struct inet_sock *inet = inet_sk(sk);
        // the familiar address matching
		if (net_eq(sock_net(sk), net) && inet->inet_num == num	&&  // protocol number
		    !(inet->inet_daddr && inet->inet_daddr != raddr) 	&&  // socket's remote address vs. packet source address
		    !(inet->inet_rcv_saddr && inet->inet_rcv_saddr != laddr) &&  // socket's local address vs. packet destination address
		    !(sk->sk_bound_dev_if && sk->sk_bound_dev_if != dif &&   // bound-device check, not discussed here
		      sk->sk_bound_dev_if != sdif))
			goto found; /* gotcha */
	}
	sk = NULL;
found:
	return sk;
}

 上面核心的部分就是进行几个要素的匹配:这里的匹配规则包括我们熟悉的协议号、源地址、目的地址。说成大白话就是:如果收到的包的协议号和socket的一样,源地址和socket对应的对端地址(raddr,ipheader可获取)一致,目的地址和socket对应本端地址(laddr,ipheader可获取)一致,即匹配成功。但是!(inet->inet_daddr && inet->inet_daddr != raddr) 和!(inet->inet_rcv_saddr && inet->inet_rcv_saddr != laddr)的判断有些奇怪,inet->inet_daddr和inet->inet_rcv_saddr为0的时候,条件直接为真,难道这两个字段还会为0,难道不应该是对应到ipheader中的源地址和目的地址么?

 

三、柳暗花明

与其自我臆断,还不如直接看看socket长得啥样子,为了便于观察,在ping代码sock.close()之前加个延迟(time.sleep(100)),避免socket立马close观察不到。

➜ czh@localhost  ~  cat /proc/net/raw
  sl  local_address rem_address   st tx_queue rx_queue tr tm->when retrnsmt   uid  timeout inode ref pointer drops
   1: 00000000:0001 00000000:0000 07 00000000:00001200 00:00000000 00000000     0        0 17924076 2 000000000ae1ed49 0
   1: 00000000:0001 00000000:0000 07 00000000:00001200 00:00000000 00000000     0        0 17931963 2 00000000398dbe49 0
   1: 00000000:0001 00000000:0000 07 00000000:00001200 00:00000000 00000000     0        0 17941800 2 000000001572e89d 0

可以看出,刚好对应了代码中创建的三个raw_socket。再看关键的地址信息一栏,地址字段确实是全零。再回看__raw_v4_lookup中的代码,只要协议号相等,即可匹配成功。顺着这个线索,再来分析前面诡异的问题。由于代码中创建的三个socket协议号均相等,所以全部都能匹配,在那个while大循环中,会将clone的sk_buff同时传递到这三个socket中。这个sk_buff对应第一个收到的icmp报文。所以我们每次跑这个代码总是能收到延迟最小的那个icmp报文,且重复三次收到。0.02ms的时间差别是因为raw_rcv调用的时间差导致的。既然这样,如果让socket对应的地址不为0不就行了么,其实connect函数、bind函数都会设置地址。之前误认为connect只有tcp socket才会用到,其实只要了解了connect的实质,就会明白,其实只是对socket的一些字段进行了设置而已,我们再改一下代码:

       sock.connect((self.target_host,1))  ##通过connect设置socket的一些地址字段
       sock.sendto(packet, (self.target_host, 1)) 
       ##由于已经调用过connect,明确了发送对象,上面一行可改为:sock.send(packet)

修改后再运行:

IP header:
Version and Length:69
Type of service:0
Total length:92
Header identification:8904
Fragment offset:0
Time to live(TTL):64
Type of protocal:1
Header checksum:64322
Source IP address:10.57.36.19
Destination IP address:10.57.36.18
b'c17c5b9f-63f1-4131-a706-b70bd6dec555' 0.3306865692138672

IP header:
Version and Length:69
Type of service:0
Total length:92
Header identification:3760
Fragment offset:0
Time to live(TTL):58
Type of protocal:1
Header checksum:13487
Source IP address:10.243.4.5
Destination IP address:10.57.36.18
b'c17c5b9f-63f1-4131-a706-b70bd6dec3a1' 1.951456069946289

IP header:
Version and Length:69
Type of service:0
Total length:92
Header identification:6128
Fragment offset:0
Time to live(TTL):58
Type of protocal:1
Header checksum:11033
Source IP address:10.243.4.91
Destination IP address:10.57.36.18
b'07fd057f-e116-46b8-a18d-eb5ce3a2cdf8' 101.85647010803223

可以看出,这下子正常了,三个ip头分别来自三个不同的远端主机,另外一看,10.57.36.19确实是时延最小的。

再来看看socket信息:

​
➜ czh@localhost  ~  cat /proc/net/raw
  sl  local_address rem_address   st tx_queue rx_queue tr tm->when retrnsmt   uid  timeout inode ref pointer drops
   1: 1224390A:0001 1324390A:0000 01 00000000:00000000 00:00000000 00000000     0        0 17929792 2 00000000c2e0413a 0
   1: 1224390A:0001 0504F30A:0000 01 00000000:00000000 00:00000000 00000000     0        0 17946699 2 0000000077a7fb40 0
   1: 1224390A:0001 5B04F30A:0000 01 00000000:00000000 00:00000000 00000000     0        0 17939918 2 00000000e71ff973 0

可以看到,调用connect后,确实填充了addr字段。至此问题解决。

 

四、问题拓展

linux自带的ping命令,其实也是基于上面的raw_socket,执行ping命令后查看raw_socket:

[root@host10573619 ~]# cat /proc/net/raw
  sl  local_address rem_address   st tx_queue rx_queue tr tm->when retrnsmt   uid  timeout inode ref pointer drops
   1: 00000000:0001 00000000:0000 07 00000000:00000000 00:00000000 00000000     0        0 137674 2 ffff881025ea8000 0

可以看出,也存在前述问题,地址字段为全零。所以ping命令应该是做了过滤处理,只处理自己对应的报文,就如同上面的python实现,通过将进程号填充到icmp的id区域,收到回包后解析该字段,如果和自己进程号相等,则继续解析,处理下面的步骤。

但是当笔者在比较高的内核版本(5.3.15-300.fc31.x86_64)上跑ping命令,却找不到raw_socket,用strace跟踪一下系统调用信息,发现一处如下:

socket(AF_INET, SOCK_DGRAM, IPPROTO_ICMP) = 3

在低版本内核上(3.10.0-693.25.4.el7.x86_64)对应:

socket(PF_INET, SOCK_DGRAM, IPPROTO_ICMP) = -1 EACCES (Permission denied)
socket(PF_INET, SOCK_RAW, IPPROTO_ICMP) = 3

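可以看出,高版本内核基于SOCK_DGRAM方式创建icmp,低版本内核则调用失败,提示(Permission denied),改用成了SOCK_RAW方式。严格来说,这并不完全是内核版本本身的差异:SOCK_DGRAM类型的icmp套接字自2.6.39内核起就已支持,能否创建成功取决于sysctl参数net.ipv4.ping_group_range是否包含当前进程所属的组(内核默认值为"1 0",即对所有组禁用;较新的发行版通常会放开该范围)。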

➜ czh@localhost  ~  sudo lsof -p 3844792
COMMAND     PID USER   FD   TYPE DEVICE  SIZE/OFF     NODE NAME
ping    3844792  czh  cwd    DIR  253,0      4096   786434 /home/czh
ping    3844792  czh  rtd    DIR  253,0      4096        2 /
ping    3844792  czh  txt    REG  253,0     82952   932277 /usr/bin/ping
ping    3844792  czh  mem    REG  253,0 217990208   918461 /usr/lib/locale/locale-archive
ping    3844792  czh  mem    REG  253,0    328192   925214 /usr/lib64/libpthread-2.30.so
ping    3844792  czh  mem    REG  253,0     36392   925193 /usr/lib64/libdl-2.30.so
ping    3844792  czh  mem    REG  253,0    105336   918543 /usr/lib64/libz.so.1.2.11
ping    3844792  czh  mem    REG  253,0   1662240   929405 /usr/lib64/libunistring.so.2.1.0
ping    3844792  czh  mem    REG  253,0   3182768   925191 /usr/lib64/libc-2.30.so
ping    3844792  czh  mem    REG  253,0    126984   925216 /usr/lib64/libresolv-2.30.so
ping    3844792  czh  mem    REG  253,0   3151112   932434 /usr/lib64/libcrypto.so.1.1.1d
ping    3844792  czh  mem    REG  253,0    137968   925549 /usr/lib64/libidn2.so.0.3.7
ping    3844792  czh  mem    REG  253,0     25504   928835 /usr/lib64/libcap.so.2.26
ping    3844792  czh  mem    REG  253,0     26998  1315022 /usr/lib64/gconv/gconv-modules.cache
ping    3844792  czh  mem    REG  253,0    260904   919996 /usr/lib64/ld-2.30.so
ping    3844792  czh    0u   CHR  136,3       0t0        6 /dev/pts/3
ping    3844792  czh    1u   CHR  136,3       0t0        6 /dev/pts/3
ping    3844792  czh    2u   CHR  136,3       0t0        6 /dev/pts/3
ping    3844792  czh    3u  icmp              0t0 18151002 00000000:03F7->00000000:0000
ping    3844792  czh    4u  sock    0,9       0t0 18151003 protocol: PINGv6

利用lsof可以看到占用的sock,可以看到有这么一行:

ping    3844792  czh    3u  icmp              0t0 18151002 00000000:03F7->00000000:0000

然后竟然多了个icmp统计:

➜ czh@localhost  ~  cat /proc/net/icmp
  sl  local_address rem_address   st tx_queue rx_queue tr tm->when retrnsmt   uid  timeout inode ref pointer drops             
   42: 00000000:03F7 00000000:0000 07 00000000:00000000 00:00000000 00000000  1000        0 18151002 2 00000000a8ae2e61 0   

当调用多个ping命令时:

➜ czh@localhost  ~  cat /proc/net/icmp
  sl  local_address rem_address   st tx_queue rx_queue tr tm->when retrnsmt   uid  timeout inode ref pointer drops             
   42: 00000000:03F7 00000000:0000 07 00000000:00000000 00:00000000 00000000  1000        0 18151002 2 00000000a8ae2e61 0      
   43: 00000000:03F8 00000000:0000 07 00000000:00000000 00:00000000 00000000  1000        0 18150084 2 00000000924bce8d 0   

可以看出对应的local_address的本地端口号是不一样的,这就解决了匹配问题,通过端口号便能区分到对应的sock的。前面的python代码,如果在高版本内核下,创建套接字可以直接写成:

sock = socket.socket(socket.AF_INET, socket.SOCK_DGRAM, ICMP_PROTOCAL_NUM)

由于是基于SOCK_DGRAM,所以收包解析时,我们不需要跳过ip报文头,直接从第0字节开始解析,例如icmp报头解析可以修改一下:

  icmp_header = recv_packet[20:28]  
  -> 修改为:icmp_header = recv_packet[0:8]

后面的解析以此类推。

评论 7
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值