一、问题描述
因为工作需要,需要同时对多台主机进行ping探测,计算时延。不像tcp和udp,icmp报文需要自己手动构造报文头。简单的demo代码如下(python实现):
import os
import threading
import time
import socket
import struct
import select
## IP header parsing. Not required for pinging; kept for the debugging done later on ##
class IpHeader(object):
    """Fields of a fixed-size 20-byte IPv4 header (no options).

    Every attribute is ``None`` until ``decodeIP`` has been called.
    """

    def __init__(self):
        self.versionAndLen = None   # 4bit version 4bit header length
        self.tos = None             # 8bit type of service
        self.tolalLen = None        # 16bit total length
        self.identification = None  # 16bit header identification
        self.fragment = None        # 16bit others and fragment offset
        self.ttl = None             # 8bit time to live
        self.protocal = None        # 8bit type of protocal
        self.checksum = None        # 16bit header checksum
        self.srcIP = None           # 32bit src IP address
        self.dstIP = None           # 32bit dst IP address

    def decodeIP(self, code_stream):
        """Fill the attributes from ``code_stream``.

        Args:
            code_stream: Exactly 20 bytes in network (big-endian) byte order.

        Raises:
            struct.error: If ``code_stream`` is not 20 bytes long.
        """
        (self.versionAndLen, self.tos, self.tolalLen, self.identification,
         self.fragment, self.ttl, self.protocal, self.checksum,
         self.srcIP, self.dstIP) = struct.unpack('>BBHHHBBHII', code_stream)

    def IntToIP(self, ipInt):
        """Return the dotted-quad text form of the 32-bit address ``ipInt``."""
        # Packing in network order yields the four address bytes directly,
        # with no htonl()/native-order round trip.
        return socket.inet_ntoa(struct.pack('!I', ipInt))

    def __str__(self):
        """Return a multi-line, human-readable dump of every header field."""
        # An undecoded header still holds None for both addresses; show that
        # instead of failing inside IntToIP().
        src = None if self.srcIP is None else self.IntToIP(self.srcIP)
        dst = None if self.dstIP is None else self.IntToIP(self.dstIP)
        return ('IP header:\n'
                'Version and Length:{}\n'
                'Type of service:{}\n'
                'Total length:{}\n'
                'Header identification:{}\n'
                'Fragment offset:{}\n'
                'Time to live(TTL):{}\n'
                'Type of protocal:{}\n'
                'Header checksum:{}\n'
                'Source IP address:{}\n'
                'Destination IP address:{}\n'.format(
                    self.versionAndLen, self.tos, self.tolalLen,
                    self.identification, self.fragment, self.ttl,
                    self.protocal, self.checksum, src, dst))
## Ping delay of every probed machine in milliseconds, keyed by the Pinger's uuid ##
netDelay = {}
# ICMP message type placed in the header of every request that is sent.
ICMP_ECHO_REQUEST = 8
# Protocol number handed to socket.socket(). The constant has the same value
# as getprotobyname("icmp") but needs no protocols-database lookup, which
# raises OSError on systems where that database is missing.
ICMP_PROTOCAL_NUM = socket.IPPROTO_ICMP
class Pinger:
    """Probe one host with a single ICMP echo request over a raw socket.

    The measured delay is stored in the module-level ``netDelay`` dict under
    this instance's ``uuid``.
    """

    def __init__(self, uuid, target_host, timeout=2.0):
        """Remember whom to ping.

        Args:
            uuid: Text label of the target. It is embedded in the request
                payload and used as the key into ``netDelay``.
            target_host: Destination address handed to ``sendto``.
            timeout: Seconds to wait for a reply carrying our id.
        """
        self.uuid = uuid
        self.target_host = target_host
        # send_ping() reads self.timeout, so every instance has to carry it.
        self.timeout = timeout

    def do_checksum(self, source_string):
        """Return the 16-bit one's-complement checksum of ``source_string``.

        Args:
            source_string: ``bytes`` (or ``bytearray``) to sum; may be empty
                or of odd length.

        Returns:
            The checksum as an int in the range 0..0xffff.
        """
        total = 0
        # Floor division keeps the bound an int and leaves a trailing odd
        # byte out of the pairwise loop.
        max_count = (len(source_string) // 2) * 2
        for count in range(0, max_count, 2):
            total += source_string[count + 1] * 256 + source_string[count]
            total &= 0xffffffff
        if max_count < len(source_string):
            # Indexing bytes already yields an int, so no ord() is needed.
            total += source_string[-1]
            total &= 0xffffffff
        # Fold the carries back into the low 16 bits.
        total = (total >> 16) + (total & 0xffff)
        total += total >> 16
        answer = ~total & 0xffff
        # The words were summed low byte first; swap the result's bytes.
        return answer >> 8 | (answer << 8 & 0xff00)

    def send_ping(self, ID):
        """Send one echo request tagged with ``ID`` and wait for a reply.

        Every received packet has its IP header printed. The first packet
        whose ICMP id equals ``ID`` ends the wait: its delay is stored in
        ``netDelay[self.uuid]`` and printed together with the uuid bytes
        found in its payload.

        Args:
            ID: 16-bit identifier written into the ICMP header.

        Returns:
            ``self.timeout`` if no packet carrying ``ID`` arrived in time.
            ``None`` after a matching packet was recorded, or when the socket
            could not be created or the request could not be sent (the error
            is printed).
        """
        ## Build the ICMP header ##
        sequence = 1
        # Dummy header with a 0 checksum. It must carry exactly the fields of
        # the real header, otherwise the checksum covers different bytes.
        header = struct.pack("bbHHh", ICMP_ECHO_REQUEST, 0, 0, ID, sequence)
        bytes_in_double = struct.calcsize("d")
        padding = (64 - bytes_in_double - len(self.uuid)) * "X"
        # Payload: send timestamp, then the uuid, padded with "X" to 64 bytes.
        data = struct.pack("d", time.time()) + bytes(self.uuid + padding, encoding="utf8")
        # Checksum over the dummy header plus the data.
        my_checksum = self.do_checksum(header + data)
        header = struct.pack(
            "bbHHh", ICMP_ECHO_REQUEST, 0, socket.htons(my_checksum), ID, sequence
        )
        packet = header + data

        try:
            sock = socket.socket(socket.AF_INET, socket.SOCK_RAW, ICMP_PROTOCAL_NUM)
        except OSError as e:
            # Without a socket there is nothing to wait on.
            print(e)
            return None

        try:
            # NOTE: the socket is intentionally left unconnected here; how the
            # kernel then matches incoming packets to it is the subject of the
            # surrounding article.
            try:
                sock.sendto(packet, (self.target_host, 1))
            except OSError as e:
                print(e)
                return None

            # One overall deadline, so unrelated packets cannot keep
            # re-arming the full timeout.
            deadline = time.monotonic() + self.timeout
            while True:
                remaining = deadline - time.monotonic()
                if remaining <= 0:
                    return self.timeout
                readable = select.select([sock], [], [], remaining)
                if not readable[0]:  # Timeout
                    return self.timeout
                time_received = time.time()
                recv_packet, addr = sock.recvfrom(1024)
                if len(recv_packet) < 28:
                    # Too short to hold a 20-byte IP header plus an 8-byte
                    # ICMP header.
                    continue
                ## The next three lines print the IP header of the received packet
                ipHeader = IpHeader()
                ipHeader.decodeIP(recv_packet[0:20])
                print(ipHeader)
                icmp_header = recv_packet[20:28]
                icmp_type, code, checksum, packet_ID, reply_sequence = struct.unpack(
                    "bbHHh", icmp_header
                )
                if ID == packet_ID:
                    time_sent = struct.unpack("d", recv_packet[28:28 + bytes_in_double])[0]
                    uuid_start = 28 + bytes_in_double
                    packet_uuid = recv_packet[uuid_start:uuid_start + len(self.uuid)]
                    netDelay[self.uuid] = (time_received - time_sent) * 1000.0
                    ## Print the delay recorded for this machine ##
                    print(packet_uuid, netDelay[self.uuid])
                    return
        finally:
            # Runs on every exit path, so the socket is released on timeout
            # and on errors too.
            sock.close()

    def ping(self):
        """Ping the target once, using the low 16 bits of the process id as ICMP id."""
        # The id comes from the process id, so every Pinger running in the
        # same process sends the same id.
        my_ID = os.getpid() & 0xFFFF
        self.send_ping(my_ID)
if __name__ == '__main__':
    # Three ping threads are started as a demonstration.
    targets = (
        ('07fd057f-e116-46b8-a18d-eb5ce3a2cdf8', '10.243.4.91'),
        ('c17c5b9f-63f1-4131-a706-b70bd6dec3a1', '10.243.4.5'),
        ('c17c5b9f-63f1-4131-a706-b70bd6dec555', '10.57.36.19'),
    )
    pingers = [Pinger(host_uuid, host_ip) for host_uuid, host_ip in targets]
    workers = []
    for pinger in pingers:
        worker = threading.Thread(target=pinger.ping)
        worker.start()
        workers.append(worker)
    # Wait for every probe to finish before the script exits.
    for worker in workers:
        worker.join()
本文不赘述ping构造原理,而是主要介绍多线程收发ping包所遇到的问题。
运行后结果如下:
IP header:
Version and Length:69
Type of service:0
Total length:92
Header identification:35542
Fragment offset:0
Time to live(TTL):64
Type of protocal:1
Header checksum:37684
Source IP address:10.57.36.19
Destination IP address:10.57.36.18
b'c17c5b9f-63f1-4131-a706-b70bd6dec555' 0.35858154296875 ## 打印的延时信息
IP header:
Version and Length:69
Type of service:0
Total length:92
Header identification:35542
Fragment offset:0
Time to live(TTL):64
Type of protocal:1
Header checksum:37684
Source IP address:10.57.36.19
Destination IP address:10.57.36.18
b'c17c5b9f-63f1-4131-a706-b70bd6dec555' 0.37860870361328125 ## 打印的延时信息
IP header:
Version and Length:69
Type of service:0
Total length:92
Header identification:35542
Fragment offset:0
Time to live(TTL):64
Type of protocal:1
Header checksum:37684
Source IP address:10.57.36.19
Destination IP address:10.57.36.18
b'c17c5b9f-63f1-4131-a706-b70bd6dec555' 0.39696693420410156 ## 打印的延时信息
令人费解的事情发生了,收到的三个icmp响应报文头部竟然一模一样,都是从同一个ip地址发过来的,此处为10.57.36.19,本机地址为10.57.36.18。看了一下时延信息,远程机器的uuid一模一样,时延有一点区别,但是很小,大概只有0.02ms差距。为了确定是否是发送或者接收出了问题,先抓个包看看:
可以看出,在抓包结果上看不出任何问题,三个请求包,三个响应包。
二、问题分析
通过抓包结果可以判断:发包这个流程没有问题,因为对端主机收到报文并正确响应了,收到的包至少在协议栈收包之前也没有问题,因为在物理接口抓到了三个响应报文。那分析的重点就是协议栈处理流程了。
之前用tcp和udp收发包都没有这种问题,那raw_socket和tcp_socket、udp_socket有何区别呢?和tcp相比,代码中没有connect的过程,udp虽然没有显式的connect的过程,但会明确指定对端的端口号(本例中sendto函数虽然带了端口号为1,但是不起作用,可以随便写)。左思右想,觉得可能和socket匹配有关系。所以得先了解一下内核socket匹配原则。直接找到内核代码和raw_socket收包相关的部分:
/*
 * Hand an incoming IPv4 packet to every raw socket in hash bucket `hash`
 * that matches it. The whole walk runs under the read lock of
 * raw_v4_hashinfo. Returns 1 if at least one socket matched the lookup
 * (even if that socket's filter then rejected the packet), 0 otherwise.
 */
static int raw_v4_input(struct sk_buff *skb, const struct iphdr *iph, int hash)
{
int sdif = inet_sdif(skb);
struct sock *sk;
struct hlist_head *head;
int delivered = 0;
struct net *net;
read_lock(&raw_v4_hashinfo.lock);
head = &raw_v4_hashinfo.ht[hash];
/* Nothing to do when no raw socket is hashed into this bucket. */
if (hlist_empty(head))
goto out;
net = dev_net(skb->dev);
// Key function: find the first raw socket matching this packet
sk = __raw_v4_lookup(net, __sk_head(head), iph->protocol,
iph->saddr, iph->daddr,
skb->dev->ifindex, sdif);
while (sk) {
delivered = 1;
/* For ICMP the socket's icmp_filter() must not reject the packet, and
 * ip_mc_sf_allow() (presumably the multicast source filter) must accept it. */
if ((iph->protocol != IPPROTO_ICMP || !icmp_filter(sk, skb)) &&
ip_mc_sf_allow(sk, iph->daddr, iph->saddr,
skb->dev->ifindex, sdif)) {
/* Each matching socket receives its own clone of the packet. */
struct sk_buff *clone = skb_clone(skb, GFP_ATOMIC);
/* Not releasing hash table! */
if (clone)
raw_rcv(sk, clone);
}
// Key function: continue the search from the socket after the current one
sk = __raw_v4_lookup(net, sk_next(sk), iph->protocol,
iph->saddr, iph->daddr,
skb->dev->ifindex, sdif);
}
out:
read_unlock(&raw_v4_hashinfo.lock);
return delivered;
}
该函数有个while循环不断匹配的过程,匹配成功后,会走raw_rcv进行后面的处理。继续看__raw_v4_lookup函数:
/*
 * Starting at `sk`, walk the socket list and return the first socket that
 * matches the given namespace, protocol number, addresses and interface.
 * Returns NULL when no socket in the rest of the list matches.
 * A socket address field that is 0 is not compared, so it matches any packet.
 */
struct sock *__raw_v4_lookup(struct net *net, struct sock *sk,
unsigned short num, __be32 raddr, __be32 laddr,
int dif, int sdif)
{
sk_for_each_from(sk) {
struct inet_sock *inet = inet_sk(sk);
// The familiar address matching
if (net_eq(sock_net(sk), net) && inet->inet_num == num && // protocol number
!(inet->inet_daddr && inet->inet_daddr != raddr) && // socket remote address vs. raddr; 0 matches anything
!(inet->inet_rcv_saddr && inet->inet_rcv_saddr != laddr) && // socket local address vs. laddr; 0 matches anything
!(sk->sk_bound_dev_if && sk->sk_bound_dev_if != dif && // socket bound to a device must match dif or sdif; not discussed here
sk->sk_bound_dev_if != sdif))
goto found; /* gotcha */
}
sk = NULL;
found:
return sk;
}
上面核心的部分就是进行几个要素的匹配:这里的匹配规则包括我们熟悉的协议号、源地址、目的地址。说成大白话就是:如果收到的包的协议号和socket的一样,源地址和socket对应的对端地址(raddr,ipheader可获取)一致,目的地址和socket对应本端地址(laddr,ipheader可获取)一致,即匹配成功。但是!(inet->inet_daddr && inet->inet_daddr != raddr) 和!(inet->inet_rcv_saddr && inet->inet_rcv_saddr != laddr)的判断有些奇怪,inet->inet_daddr和inet->inet_rcv_saddr为0的时候,条件直接为真,难道这两个字段还会为0,难道不应该是对应到ipheader中的源地址和目的地址么?
三、柳暗花明
与其自我臆断,还不如直接看看socket长得啥样子,为了便于观察,在ping代码sock.close()之前加个延迟(time.sleep(100)),避免socket立马close观察不到。
➜ czh@localhost ~ cat /proc/net/raw
sl local_address rem_address st tx_queue rx_queue tr tm->when retrnsmt uid timeout inode ref pointer drops
1: 00000000:0001 00000000:0000 07 00000000:00001200 00:00000000 00000000 0 0 17924076 2 000000000ae1ed49 0
1: 00000000:0001 00000000:0000 07 00000000:00001200 00:00000000 00000000 0 0 17931963 2 00000000398dbe49 0
1: 00000000:0001 00000000:0000 07 00000000:00001200 00:00000000 00000000 0 0 17941800 2 000000001572e89d 0
可以看出,刚好对应了代码中创建的三个raw_socket,再看关键的地址信息一栏,地址字段确实是全零。再回看__raw_v4_lookup中的代码,只要协议号相等,即可匹配成功。顺着这个线索,再来分析前面诡异的问题。由于代码中创建的三个socket协议号均相等,所以全部都能匹配,在那个while大循环中,会将clone的sk_buff同时传递到这三个socket中。这个sk_buff对应第一个收到的icmp报文。又因为三个线程都用同一个进程号作为icmp的id,代码中ID == packet_ID的校验对每一份拷贝都能通过。所以我们每次跑这个代码总是能收到延迟最小的那个icmp报文,且重复三次收到。0.02ms的时间差别是因为raw_rcv调用的时间差导致的。既然这样,如果让socket对应的地址不为0不就行了么,其实connect函数、bind函数都会设置地址。之前误认为connect只有tcp socket才会用到,其实只要了解了connect的实质,就会明白,它只是对socket的一些字段进行了设置而已,我们再改一下代码:
sock.connect((self.target_host,1)) ##通过connect设置socket的一些地址字段
sock.sendto(packet, (self.target_host, 1))
##由于已经调用过connect,明确了发送对象,上面一行可改为:sock.send(packet)
修改后再运行:
IP header:
Version and Length:69
Type of service:0
Total length:92
Header identification:8904
Fragment offset:0
Time to live(TTL):64
Type of protocal:1
Header checksum:64322
Source IP address:10.57.36.19
Destination IP address:10.57.36.18
b'c17c5b9f-63f1-4131-a706-b70bd6dec555' 0.3306865692138672
IP header:
Version and Length:69
Type of service:0
Total length:92
Header identification:3760
Fragment offset:0
Time to live(TTL):58
Type of protocal:1
Header checksum:13487
Source IP address:10.243.4.5
Destination IP address:10.57.36.18
b'c17c5b9f-63f1-4131-a706-b70bd6dec3a1' 1.951456069946289
IP header:
Version and Length:69
Type of service:0
Total length:92
Header identification:6128
Fragment offset:0
Time to live(TTL):58
Type of protocal:1
Header checksum:11033
Source IP address:10.243.4.91
Destination IP address:10.57.36.18
b'07fd057f-e116-46b8-a18d-eb5ce3a2cdf8' 101.85647010803223
可以看出,这下子正常了,三个ip头分别来自三个不同的远端主机,另外一看,10.57.36.19确实是时延最小的。
再来看看socket信息:
➜ czh@localhost ~ cat /proc/net/raw
sl local_address rem_address st tx_queue rx_queue tr tm->when retrnsmt uid timeout inode ref pointer drops
1: 1224390A:0001 1324390A:0000 01 00000000:00000000 00:00000000 00000000 0 0 17929792 2 00000000c2e0413a 0
1: 1224390A:0001 0504F30A:0000 01 00000000:00000000 00:00000000 00000000 0 0 17946699 2 0000000077a7fb40 0
1: 1224390A:0001 5B04F30A:0000 01 00000000:00000000 00:00000000 00000000 0 0 17939918 2 00000000e71ff973 0
可以看到,调用connect后,确实填充了addr字段。至此问题解决。
四、问题拓展
linux自带的ping命令,其实也是基于上面的raw_socket,执行ping命令后查看raw_socket:
[root@host10573619 ~]# cat /proc/net/raw
sl local_address rem_address st tx_queue rx_queue tr tm->when retrnsmt uid timeout inode ref pointer drops
1: 00000000:0001 00000000:0000 07 00000000:00000000 00:00000000 00000000 0 0 137674 2 ffff881025ea8000 0
可以看出,也存在前述问题,地址字段为全零。所以ping命令应该是做了过滤处理,只处理自己对应的报文,就如同上面的python实现,通过将进程号填充到icmp的id区域,收到回包后解析该字段,如果和自己进程号相等,则继续解析,处理下面的步骤。
但是当笔者在比较高的内核版本(5.3.15-300.fc31.x86_64)上跑ping命令,却找不到raw_socket,用strace跟踪一下系统调用信息,发现一处如下:
socket(AF_INET, SOCK_DGRAM, IPPROTO_ICMP) = 3
在低版本内核上(3.10.0-693.25.4.el7.x86_64)对应:
socket(PF_INET, SOCK_DGRAM, IPPROTO_ICMP) = -1 EACCES (Permission denied)
socket(PF_INET, SOCK_RAW, IPPROTO_ICMP) = 3
可以看出,高版本内核基于SOCK_DGRAM方式创建icmp,低版本内核则调用失败,提示(Permission denied),改用成了SOCK_RAW方式。
➜ czh@localhost ~ sudo lsof -p 3844792
COMMAND PID USER FD TYPE DEVICE SIZE/OFF NODE NAME
ping 3844792 czh cwd DIR 253,0 4096 786434 /home/czh
ping 3844792 czh rtd DIR 253,0 4096 2 /
ping 3844792 czh txt REG 253,0 82952 932277 /usr/bin/ping
ping 3844792 czh mem REG 253,0 217990208 918461 /usr/lib/locale/locale-archive
ping 3844792 czh mem REG 253,0 328192 925214 /usr/lib64/libpthread-2.30.so
ping 3844792 czh mem REG 253,0 36392 925193 /usr/lib64/libdl-2.30.so
ping 3844792 czh mem REG 253,0 105336 918543 /usr/lib64/libz.so.1.2.11
ping 3844792 czh mem REG 253,0 1662240 929405 /usr/lib64/libunistring.so.2.1.0
ping 3844792 czh mem REG 253,0 3182768 925191 /usr/lib64/libc-2.30.so
ping 3844792 czh mem REG 253,0 126984 925216 /usr/lib64/libresolv-2.30.so
ping 3844792 czh mem REG 253,0 3151112 932434 /usr/lib64/libcrypto.so.1.1.1d
ping 3844792 czh mem REG 253,0 137968 925549 /usr/lib64/libidn2.so.0.3.7
ping 3844792 czh mem REG 253,0 25504 928835 /usr/lib64/libcap.so.2.26
ping 3844792 czh mem REG 253,0 26998 1315022 /usr/lib64/gconv/gconv-modules.cache
ping 3844792 czh mem REG 253,0 260904 919996 /usr/lib64/ld-2.30.so
ping 3844792 czh 0u CHR 136,3 0t0 6 /dev/pts/3
ping 3844792 czh 1u CHR 136,3 0t0 6 /dev/pts/3
ping 3844792 czh 2u CHR 136,3 0t0 6 /dev/pts/3
ping 3844792 czh 3u icmp 0t0 18151002 00000000:03F7->00000000:0000
ping 3844792 czh 4u sock 0,9 0t0 18151003 protocol: PINGv6
利用lsof可以看到占用的sock,可以看到有这么一行:
ping 3844792 czh 3u icmp 0t0 18151002 00000000:03F7->00000000:0000
然后竟然多了个icmp统计:
➜ czh@localhost ~ cat /proc/net/icmp
sl local_address rem_address st tx_queue rx_queue tr tm->when retrnsmt uid timeout inode ref pointer drops
42: 00000000:03F7 00000000:0000 07 00000000:00000000 00:00000000 00000000 1000 0 18151002 2 00000000a8ae2e61 0
当调用多个ping命令时:
➜ czh@localhost ~ cat /proc/net/icmp
sl local_address rem_address st tx_queue rx_queue tr tm->when retrnsmt uid timeout inode ref pointer drops
42: 00000000:03F7 00000000:0000 07 00000000:00000000 00:00000000 00000000 1000 0 18151002 2 00000000a8ae2e61 0
43: 00000000:03F8 00000000:0000 07 00000000:00000000 00:00000000 00000000 1000 0 18150084 2 00000000924bce8d 0
可以看出对应的local_address的本地端口号是不一样的,这就解决了匹配问题,通过端口号便能区分到对应的sock。前面的python代码,如果在高版本内核下,创建套接字可以直接写成:
sock = socket.socket(socket.AF_INET, socket.SOCK_DGRAM, ICMP_PROTOCAL_NUM)
由于是基于SOCK_DGRAM,所以收包解析时,我们不需要跳过ip报文头,直接从第0字节开始解析,例如icmp报头解析可以修改一下:
icmp_header = recv_packet[20:28]
-> 修改为:icmp_header = recv_packet[0:8]
后面的解析以此类推。