Background: a couple of days ago someone argued with me about what port the socket returned by accept(), the one describing the new connection, actually uses. I said it is the original listening port, because I often look at connections with netstat; take ssh, port 22: every connection established between a client and the server shows port 22 on the server side. My colleague, however, insisted that a random available port is picked, his argument being: if the ports were the same, how would you tell the connections apart? My brain froze and I couldn't come up with a rebuttal on the spot. It was exactly that feeling of "you're wrong, but I can't find the argument to prove it", simply because I had never looked closely enough. So let's sort it out now:
There are two possibilities:
1. The new socket shares the port with the listening socket.
2. A random available port is picked from the system.
The way to get the answer is to study the kernel protocol stack.
Before digging into the code, let's recall the TCP three-way handshake:
Step 1: the client sends a SYN packet and waits for the server's response.
Step 2: the server receives the SYN packet and replies with a SYN+ACK packet.
Step 3: the client receives the SYN+ACK packet and replies with an ACK packet; the connection is established.
Mapped onto the Linux kernel protocol stack:
On the server side:
the socket() call creates and initializes a sock object, bind() binds the local address and listening port, and listen() allocates the listen queue for the sock object and adds the sock to the listening hash table, one of TCP's three global hash tables. When a connection request arrives, the matching sock object is looked up in the listening hash table and handles the request.
That handling is exactly the three-way handshake:
Step 1: the client sends a SYN packet to request a connection.
Step 2: the server receives the SYN packet and, using the packet's destination address and destination port, finds the matching sock object in the listening hash table. It then searches the sock's pending-request queue for this request (for the first SYN packet there is no request_sock object yet); since none is found, a request_sock object representing the request is created in the request queue, and a SYN+ACK packet is sent back to the client.
Step 3: the client receives the SYN+ACK packet and replies with an ACK. When that ACK reaches the server, the server again looks up the sock object in the listening hash table, finds the request's request_sock object in the sock's queue (this time it is found, because it was created while handling the SYN), builds a new sock object describing the connection, and appends the request_sock object together with the newly built sock object to the tail of the accept queue icsk_accept_queue.
Then, when a user-space process calls accept(), the request_sock object and the newly built sock object are taken off the accept queue and the new sock object is returned to the process. That is the sock object behind the new socket we get from accept(); let's call it the child sock object. So what is the port of this child sock object? Is it picked at random, or is it the same as the original sock's?
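You can already check this from user space before touching any kernel code. Here is a minimal sketch (assuming a hypothetical listening port 8080, error handling omitted): run it, connect from another terminal with telnet or nc, and it prints the local port of the accepted socket via getsockname().

#include <stdio.h>
#include <string.h>
#include <arpa/inet.h>
#include <netinet/in.h>
#include <sys/socket.h>
#include <unistd.h>

int main(void)
{
    int lfd = socket(AF_INET, SOCK_STREAM, 0);
    struct sockaddr_in addr;

    memset(&addr, 0, sizeof(addr));
    addr.sin_family = AF_INET;
    addr.sin_addr.s_addr = htonl(INADDR_ANY);
    addr.sin_port = htons(8080);            /* hypothetical listening port */
    bind(lfd, (struct sockaddr *)&addr, sizeof(addr));
    listen(lfd, 5);

    int cfd = accept(lfd, NULL, NULL);      /* blocks until a client connects */

    struct sockaddr_in local;
    socklen_t len = sizeof(local);
    getsockname(cfd, (struct sockaddr *)&local, &len);
    /* Prints 8080, i.e. the listening port, not a randomly picked one. */
    printf("accepted socket local port: %hu\n", ntohs(local.sin_port));

    close(cfd);
    close(lfd);
    return 0;
}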
Let's first look at inet_sock, which derives from sock and represents a connection in the INET protocol family:
/** struct inet_sock - representation of INET sockets
*
* @sk - ancestor class
* @pinet6 - pointer to IPv6 control block
* @daddr - Foreign IPv4 addr
* @rcv_saddr - Bound local IPv4 addr
* @dport - Destination port
* @num - Local port
* @saddr - Sending source
* @uc_ttl - Unicast TTL
* @sport - Source port
* @id - ID counter for DF pkts
* @tos - TOS
* @mc_ttl - Multicasting TTL
* @is_icsk - is this an inet_connection_sock?
* @mc_index - Multicast device index
* @mc_list - Group array
* @cork - info to build ip hdr on each ip frag while socket is corked
*/
struct inet_sock {
/* sk and pinet6 has to be the first two members of inet_sock */
struct sock sk;
#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
struct ipv6_pinfo *pinet6;
#endif
/* Socket demultiplex comparisons on incoming packets. */
__be32 daddr;
__be32 rcv_saddr;
__be16 dport;
__u16 num;// local (source) port, host byte order
__be32 saddr;
__s16 uc_ttl;
__u16 cmsg_flags;
struct ip_options *opt;
__be16 sport;// source port, network byte order
__u16 id;
__u8 tos;
__u8 mc_ttl;
__u8 pmtudisc;
__u8 recverr:1,
is_icsk:1,
freebind:1,
hdrincl:1,
mc_loop:1,
transparent:1,
mc_all:1;
int mc_index;
__be32 mc_addr;
struct ip_mc_socklist *mc_list;
struct {
unsigned int flags;
unsigned int fragsize;
struct ip_options *opt;
struct dst_entry *dst;
int length; /* Total length of all frames */
__be32 addr;
struct flowi fl;
} cork;
};
You can see it has two fields representing the local port: num in host byte order and sport in network byte order.
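To make the difference concrete, here is a tiny user-space sketch (not kernel code): for port 22, num would hold 22 directly while sport would hold htons(22).

#include <stdio.h>
#include <arpa/inet.h>

int main(void)
{
    unsigned short num   = 22;           /* host byte order, like inet_sock.num */
    unsigned short sport = htons(22);    /* network byte order, like inet_sock.sport */

    /* On a little-endian machine this prints: num=22 (0x0016), sport=5632 (0x1600) */
    printf("num=%hu (0x%04hx), sport=%hu (0x%04hx)\n", num, num, sport, sport);
    return 0;
}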
Next, let's analyze the three-way handshake and the accept() path against the protocol stack source.
When data arrives at the NIC, a TCP packet roughly goes through the following call chain:
NIC driver --> netif_receive_skb() --> ip_rcv() --> ip_local_deliver_finish() --> tcp_v4_rcv()
Let's look at tcp_v4_rcv():
int tcp_v4_rcv(struct sk_buff *skb)
{
...
/*
 * Look up the sock object by source port, destination port and the receiving
 * interface: first in the established-connections hash table, and if nothing
 * is found there, in the listening hash table.
 */
sk = __inet_lookup_skb(&tcp_hashinfo, skb, th->source, th->dest);
if (!sk)
goto no_tcp_socket;
...
else
#endif
{
if (!tcp_prequeue(sk, skb))
ret = tcp_v4_do_rcv(sk, skb);// process the skb with the sock object we found
}
} else
...
}
Essentially it looks up the sock object that should handle the packet (for the first SYN this comes from the listen hash table, keyed by destination address and destination port), and then tcp_v4_do_rcv() processes the packet with that sock object.
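Note the order of that lookup: the established hash table is tried first with the full four-tuple, and only when nothing matches does the stack fall back to the listening hash table. Below is a much-simplified user-space model of the idea (plain arrays instead of the kernel's hash tables, listening entries matched by local port only; a sketch, not the kernel code):

#include <stdio.h>

/* Simplified model: the kernel uses hash tables, we just scan small arrays. */
struct conn {
    unsigned int   saddr, daddr;   /* peer and local IPv4 address */
    unsigned short sport, dport;   /* peer and local port */
};

static struct conn established[16];   /* established sockets: full 4-tuple */
static unsigned short listening[16];  /* listening sockets: local port only */
static int n_est, n_listen;

/* Mimics the spirit of __inet_lookup(): established table first, then listen table. */
static const char *lookup(unsigned int saddr, unsigned short sport,
                          unsigned int daddr, unsigned short dport)
{
    for (int i = 0; i < n_est; i++)
        if (established[i].saddr == saddr && established[i].sport == sport &&
            established[i].daddr == daddr && established[i].dport == dport)
            return "established sock";
    for (int i = 0; i < n_listen; i++)
        if (listening[i] == dport)
            return "listening sock";
    return "no sock (send reset)";
}

int main(void)
{
    listening[n_listen++] = 22;   /* the server listens on port 22 */
    established[n_est++] = (struct conn){ .saddr = 0x0a000002, .daddr = 0x0a000001,
                                          .sport = 50000,      .dport = 22 };

    /* A segment from the known peer matches the established entry ... */
    printf("%s\n", lookup(0x0a000002, 50000, 0x0a000001, 22));
    /* ... while a SYN from a new peer falls through to the listening entry. */
    printf("%s\n", lookup(0x0a000003, 40000, 0x0a000001, 22));
    return 0;
}

The two printf calls print "established sock" and "listening sock" respectively, which is why a SYN in the middle of a handshake keeps landing on the same listening sock.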
/* The socket must have it's spinlock held when we get
* here.
*
* We have a potential double-lock case here, so even when
* doing backlog processing we use the BH locking scheme.
* This is because we cannot sleep with the original spinlock
* held.
*/
int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)
{
struct sock *rsk;
#ifdef CONFIG_TCP_MD5SIG
/*
* We really want to reject the packet as early as possible
* if:
* o We're expecting an MD5'd packet and this is no MD5 tcp option
* o There is an MD5 option and we're not expecting one
*/
if (tcp_v4_inbound_md5_hash(sk, skb))
goto discard;
#endif
if (sk->sk_state == TCP_ESTABLISHED) { /* Fast path */
TCP_CHECK_TIMER(sk);
if (tcp_rcv_established(sk, skb, tcp_hdr(skb), skb->len)) {
rsk = sk;
goto reset;
}
TCP_CHECK_TIMER(sk);
return 0;
}
/*****************************************************************/
if (skb->len < tcp_hdrlen(skb) || tcp_checksum_complete(skb))
goto csum_err;
if (sk->sk_state == TCP_LISTEN) {// in LISTEN state: handle the handshake
/*
 * Search the pending-request queue for the request object (request_sock).
 * For the first SYN packet neither a request_sock nor a child sock exists
 * yet, so tcp_v4_hnd_req() simply returns the sk that was passed in.
 */
struct sock *nsk = tcp_v4_hnd_req(sk, skb);
if (!nsk)
goto discard;
/* For the first SYN packet nsk equals sk, so the if block is skipped and execution continues below. */
if (nsk != sk) {
if (tcp_child_process(sk, nsk, skb)) {// runs tcp_rcv_state_process(), the state machine
rsk = nsk;
goto reset;
}
return 0;
}
}
TCP_CHECK_TIMER(sk);
if (tcp_rcv_state_process(sk, skb, tcp_hdr(skb), skb->len)) {// run the state machine
rsk = sk;
goto reset;
}
TCP_CHECK_TIMER(sk);
return 0;
reset:
tcp_v4_send_reset(rsk, skb);
discard:
kfree_skb(skb);
/* Be careful here. If this function gets more complicated and
* gcc suffers from register pressure on the x86, sk (in %ebx)
* might be destroyed here. This current version compiles correctly,
* but you have been warned.
*/
return 0;
csum_err:
TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_INERRS);
goto discard;
}
Before explaining the function, let's look at the sock object's accept queue:
struct inet_connection_sock {
/* inet_sock has to be the first member! */
struct inet_sock icsk_inet;
struct request_sock_queue icsk_accept_queue;// accept queue
struct inet_bind_bucket *icsk_bind_hash;
unsigned long icsk_timeout;
struct timer_list icsk_retransmit_timer;
struct timer_list icsk_delack_timer;
...
}
It is a member of the connection sock; its type is:
struct request_sock_queue {
struct request_sock *rskq_accept_head;
struct request_sock *rskq_accept_tail;
rwlock_t syn_wait_lock;
u8 rskq_defer_accept;
/* 3 bytes hole, try to pack */
struct listen_sock *listen_opt;
};
There are the head and tail of the queue of request objects. Now look at listen_sock:
/** struct listen_sock - listen state
*
* @max_qlen_log - log_2 of maximal queued SYNs/REQUESTs
*/
struct listen_sock {
u8 max_qlen_log;
/* 3 bytes hole, try to use */
int qlen;
int qlen_young;
int clock_hand;
u32 hash_rnd;
u32 nr_table_entries;
struct request_sock *syn_table[0];
};
With those three data structures covered, let's return to tcp_v4_do_rcv(), which calls tcp_v4_hnd_req(), and explain how the kernel handles the three-way handshake.
Step 1: a SYN packet requesting a connection arrives and the sock object that handles it is looked up. tcp_v4_hnd_req() first searches that sock's pending-request queue for a request_sock object; since this is the first SYN of the request, no request_sock exists yet and nothing is found. It then searches the established-connections hash table, where the connection cannot possibly be found either, so it simply returns sk.
static struct sock *tcp_v4_hnd_req(struct sock *sk, struct sk_buff *skb)
{
struct tcphdr *th = tcp_hdr(skb);
const struct iphdr *iph = ip_hdr(skb);
struct sock *nsk;
struct request_sock **prev;
/* Find possible connection requests. */
struct request_sock *req = inet_csk_search_req(sk, &prev, th->source,// search the listening sock's pending-request queue for the request object
iph->saddr, iph->daddr);
/* A request_sock was found in the queue: the SYN was already received and the SYN+ACK sent, so the packet now being processed is the ACK of the third handshake step; build the new sock object. */
if (req)
return tcp_check_req(sk, skb, req, prev);
nsk = inet_lookup_established(sock_net(sk), &tcp_hashinfo, iph->saddr,// not found above: the connection may already be established and the sock moved to the established hash table, so search there
th->source, iph->daddr, th->dest, inet_iif(skb));
if (nsk) {
if (nsk->sk_state != TCP_TIME_WAIT) {
bh_lock_sock(nsk);
return nsk;
}
inet_twsk_put(inet_twsk(nsk));
return NULL;
}
#ifdef CONFIG_SYN_COOKIES
if (!th->rst && !th->syn && th->ack)
sk = cookie_v4_check(sk, skb, &(IPCB(skb)->opt));
#endif
return sk;
}
Back in tcp_v4_do_rcv(): as analyzed above, the value returned is sk itself, so the if condition does not hold, the if block is not executed, and execution continues with the code after it.
/* For the first SYN packet nsk equals sk, so the if block is skipped and execution continues below. */
if (nsk != sk) {
if (tcp_child_process(sk, nsk, skb)) {// runs tcp_rcv_state_process(), the state machine
rsk = nsk;
goto reset;
}
return 0;
}
So we enter tcp_rcv_state_process(). The sock is obviously in the LISTEN state and the packet received is a SYN:
int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb,
struct tcphdr *th, unsigned len)
{
struct tcp_sock *tp = tcp_sk(sk);
struct inet_connection_sock *icsk = inet_csk(sk);
int queued = 0;
int res;
tp->rx_opt.saw_tstamp = 0;
switch (sk->sk_state) {
case TCP_CLOSE:
goto discard;
case TCP_LISTEN:
if (th->ack)
return 1;
if (th->rst)
goto discard;
/* in LISTEN state only SYN packets are handled */
if (th->syn) {
if (icsk->icsk_af_ops->conn_request(sk, skb) < 0) /* tcp_v4_init_sock --> ipv4_specific --> tcp_v4_conn_request */
return 1;
/* Now we have several options: In theory there is
* nothing else in the frame. KA9Q has an option to
* send data with the syn, BSD accepts data with the
* syn up to the [to be] advertised window and
* Solaris 2.1 gives you a protocol error. For now
* we just ignore it, that fits the spec precisely
* and avoids incompatibilities. It would be nice in
* future to drop through and process the data.
*
* Now that TTCP is starting to be used we ought to
* queue this data.
* But, this leaves one open to an easy denial of
* service attack, and SYN cookies can't defend
* against this problem. So, we drop the data
* in the interest of security over speed unless
* it's still in use.
*/
kfree_skb(skb);
return 0;
}
goto discard;
case TCP_SYN_SENT:
queued = tcp_rcv_synsent_state_process(sk, skb, th, len);
if (queued >= 0)
return queued;
...
}
So the following block is executed:
/* in LISTEN state only SYN packets are handled */
if (th->syn) {
if (icsk->icsk_af_ops->conn_request(sk, skb) < 0) /* tcp_v4_init_sock --> ipv4_specific --> tcp_v4_conn_request */
What actually gets called is tcp_v4_conn_request(). What exactly does it do? It builds and initializes a request_sock object, hangs that request object on the listening sock's pending-request queue, and replies with a SYN+ACK packet; in other words, it performs the second handshake step. One more thing to pay close attention to: the request object gets initialized here, and that initialization includes setting the ports. The port is exactly what we care about, but let's not analyze it yet; we will come back to it. For now, just keep it in mind.
Here is the key code of that function:
int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
{
...
req = inet_reqsk_alloc(&tcp_request_sock_ops); /* build the request_sock object (for TCP actually a tcp_request_sock); cf. tcp_prot / proto_register */
if (!req)
goto drop;
...
tmp_opt.tstamp_ok = tmp_opt.saw_tstamp;
/* initialize ireq: set the ports, etc. */
tcp_openreq_init(req, &tmp_opt, skb);
ireq = inet_rsk(req);
ireq->loc_addr = daddr;
ireq->rmt_addr = saddr;
ireq->no_srccheck = inet_sk(sk)->transparent;
ireq->opt = tcp_v4_save_options(sk, skb);
...
/* reply with the SYN+ACK packet */
if (__tcp_v4_send_synack(sk, req, dst) || want_cookie)
goto drop_and_free;
inet_csk_reqsk_queue_hash_add(sk, req, TCP_TIMEOUT_INIT);// hash the request into the pending-request (SYN) queue
...
}
With the SYN+ACK sent, the second handshake step is done, and the server now waits for the client's ACK.
The client then receives the SYN+ACK and sends another packet, an ACK, completing the third handshake step. When that ACK reaches the server we are back in tcp_v4_rcv(): the sock object handling the packet is looked up again and we enter tcp_v4_do_rcv().
tcp_v4_hnd_req() is called once more to search the sock's queue for the matching request_sock object. Since a request_sock was built and queued when the SYN was first received, this time the lookup is bound to succeed:
static struct sock *tcp_v4_hnd_req(struct sock *sk, struct sk_buff *skb)
{
struct tcphdr *th = tcp_hdr(skb);
const struct iphdr *iph = ip_hdr(skb);
struct sock *nsk;
struct request_sock **prev;
/* Find possible connection requests. */
struct request_sock *req = inet_csk_search_req(sk, &prev, th->source,// search the listening sock's pending-request queue for the request object
iph->saddr, iph->daddr);
/* A request_sock was found in the queue: the SYN was already received and the SYN+ACK sent, so the packet now being processed is the ACK of the third handshake step; build the new sock object. */
if (req)
return tcp_check_req(sk, skb, req, prev);
Looking at tcp_v4_hnd_req(), this time the if block runs and tcp_check_req() is executed. What does that function do? Mainly two things:
1) It builds the child sock corresponding to the request object (note that this is the very sock we get back from accept()) and attaches that sock object to the request_sock.
2) It moves the request_sock from the pending-request queue to the accept queue and returns the child sock object.
struct sock *tcp_check_req(struct sock *sk, struct sk_buff *skb,
struct request_sock *req,
struct request_sock **prev)
{
...
child = inet_csk(sk)->icsk_af_ops->syn_recv_sock(sk, skb, req, NULL); /* tcp_v4_init_sock --> ipv4_specific --> tcp_v4_syn_recv_sock */
if (child == NULL)
goto listen_overflow;
inet_csk_reqsk_queue_unlink(sk, req, prev);
inet_csk_reqsk_queue_removed(sk, req);
inet_csk_reqsk_queue_add(sk, req, child);// add the request_sock and its child sock to the listening sock's accept queue
return child;
}
So now the new connection has its child sock object. When accept() is called from user space, the request object is taken off the accept queue and the corresponding child sock is returned. Which brings us back to our question: what is the port of this new sock? A randomly chosen available port, or the port of the original listening sock?
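Before answering, note that the handoff to user space is just a FIFO dequeue: tcp_check_req() queued the request together with its child sock at the tail of icsk_accept_queue, and accept() (inet_csk_accept() inside the kernel) later takes the oldest entry off the head and returns the child sock. Here is a bare-bones model of that queue discipline (a sketch with toy types, not the kernel code):

#include <stdio.h>
#include <stdlib.h>

/* Toy stand-ins for the kernel's sock / request_sock, just enough to model the queue. */
struct child_sock { unsigned short local_port; };
struct pending_req {
    struct pending_req *next;
    struct child_sock  *child;    /* the child sock built in tcp_check_req() */
};

static struct pending_req *head, *tail;   /* models icsk_accept_queue */

/* Third handshake step: the request plus its freshly built child sock go to the tail. */
static void queue_add(struct pending_req *req, struct child_sock *child)
{
    req->child = child;
    req->next  = NULL;
    if (tail)
        tail->next = req;
    else
        head = req;
    tail = req;
}

/* accept(): take the oldest request off the head and hand back its child sock. */
static struct child_sock *queue_accept(void)
{
    struct pending_req *req = head;
    struct child_sock *child;

    if (!req)
        return NULL;                      /* the real accept() would block here */
    head = req->next;
    if (!head)
        tail = NULL;
    child = req->child;
    free(req);
    return child;
}

int main(void)
{
    struct child_sock *c = malloc(sizeof(*c));
    struct pending_req *r = malloc(sizeof(*r));

    c->local_port = 22;                   /* inherited from the listening sock */
    queue_add(r, c);                      /* a connection finished its handshake */

    struct child_sock *got = queue_accept();
    printf("accept() returned a child sock with local port %hu\n", got->local_port);
    free(got);
    return 0;
}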
Let's see how the new child sock is built. Back in tcp_check_req() above, the following code performs the construction of the new sock object:
/* OK, ACK is valid, create big socket and
* feed this segment to it. It will repeat all
* the tests. THIS SEGMENT MUST MOVE SOCKET TO
* ESTABLISHED STATE. If it will be dropped after
* socket is created, wait for troubles.
*
* Build the sock object.
*
*/
child = inet_csk(sk)->icsk_af_ops->syn_recv_sock(sk, skb, req, NULL); /* tcp_v4_init_sock --> ipv4_specific --> tcp_v4_syn_recv_sock */
if (child == NULL)
goto listen_overflow;
What actually gets called is tcp_v4_syn_recv_sock(). Let's look at it:
/*
 * The three way handshake has completed - we got a valid synack -
 * now create the new socket.
 *
 * The third handshake step succeeded; build the new sock object.
 */
struct sock *tcp_v4_syn_recv_sock(struct sock *sk, struct sk_buff *skb,
struct request_sock *req,
struct dst_entry *dst)
{
struct inet_request_sock *ireq;
struct inet_sock *newinet;
struct tcp_sock *newtp;
struct sock *newsk;
#ifdef CONFIG_TCP_MD5SIG
struct tcp_md5sig_key *key;
#endif
if (sk_acceptq_is_full(sk))
goto exit_overflow;
if (!dst && (dst = inet_csk_route_req(sk, req)) == NULL)
goto exit;
newsk = tcp_create_openreq_child(sk, req, skb);//构建新的sock对象
if (!newsk)
goto exit;
newsk->sk_gso_type = SKB_GSO_TCPV4;
sk_setup_caps(newsk, dst);
...
}
Scanning the whole function we find nothing about ports, so let's step into the construction function tcp_create_openreq_child():
/* This is not only more efficient than what we used to do, it eliminates
* a lot of code duplication between IPv4/IPv6 SYN recv processing. -DaveM
*
* Actually, we could lots of memory writes here. tp of listening
* socket contains all necessary default parameters.
*/
struct sock *tcp_create_openreq_child(struct sock *sk, struct request_sock *req, struct sk_buff *skb)
{
struct sock *newsk = inet_csk_clone(sk, req, GFP_ATOMIC);// clone the sock object
if (newsk != NULL) {
const struct inet_request_sock *ireq = inet_rsk(req);
struct tcp_request_sock *treq = tcp_rsk(req);
struct inet_connection_sock *newicsk = inet_csk(newsk);
struct tcp_sock *newtp;
/* Now setup tcp_sock */
newtp = tcp_sk(newsk);
newtp->pred_flags = 0;
newtp->rcv_wup = newtp->copied_seq = newtp->rcv_nxt = treq->rcv_isn + 1;
newtp->snd_sml = newtp->snd_una = newtp->snd_nxt = treq->snt_isn + 1;
newtp->snd_up = treq->snt_isn + 1;
tcp_prequeue_init(newtp);
...
Reading through this function there is still nothing obviously related to ports either, but we do see that the new sock object is obtained by cloning, in inet_csk_clone():
struct sock *inet_csk_clone(struct sock *sk, const struct request_sock *req,
const gfp_t priority)
{
struct sock *newsk = sk_clone(sk, priority);
if (newsk != NULL) {
struct inet_connection_sock *newicsk = inet_csk(newsk);
newsk->sk_state = TCP_SYN_RECV;
newicsk->icsk_bind_hash = NULL;
inet_sk(newsk)->dport = inet_rsk(req)->rmt_port;
inet_sk(newsk)->num = ntohs(inet_rsk(req)->loc_port);// port in host byte order
inet_sk(newsk)->sport = inet_rsk(req)->loc_port;// port in network byte order
newsk->sk_write_space = sk_stream_write_space;
newicsk->icsk_retransmits = 0;
newicsk->icsk_backoff = 0;
newicsk->icsk_probes_out = 0;
/* Deinitialize accept_queue to trap illegal accesses. */
memset(&newicsk->icsk_accept_queue, 0, sizeof(newicsk->icsk_accept_queue));
security_inet_csk_clone(newsk, req);
}
return newsk;
}
At last, here is the port information: it is taken from the request object. Remember when I asked you to keep the request object's initialization in mind? Let's now look at how the request object was initialized.
Here is the code that builds the request object again:
/* initialize ireq: set the ports, etc. */
tcp_openreq_init(req, &tmp_opt, skb);
ireq = inet_rsk(req);
ireq->loc_addr = daddr;
ireq->rmt_addr = saddr;
ireq->no_srccheck = inet_sk(sk)->transparent;
ireq->opt = tcp_v4_save_options(sk, skb);
Let's go straight into the function:
static inline void tcp_openreq_init(struct request_sock *req,
struct tcp_options_received *rx_opt,
struct sk_buff *skb)
{
struct inet_request_sock *ireq = inet_rsk(req);
req->rcv_wnd = 0; /* So that tcp_send_synack() knows! */
req->cookie_ts = 0;
tcp_rsk(req)->rcv_isn = TCP_SKB_CB(skb)->seq;
req->mss = rx_opt->mss_clamp;
req->ts_recent = rx_opt->saw_tstamp ? rx_opt->rcv_tsval : 0;
ireq->tstamp_ok = rx_opt->tstamp_ok;
ireq->sack_ok = rx_opt->sack_ok;
ireq->snd_wscale = rx_opt->snd_wscale;
ireq->wscale_ok = rx_opt->wscale_ok;
ireq->acked = 0;
ireq->ecn_ok = 0;
ireq->rmt_port = tcp_hdr(skb)->source;// the client's (remote) port
ireq->loc_port = tcp_hdr(skb)->dest;// the local port
}
Putting the pieces together, we get the following assignments:
inet_sk(newsk)->num = ntohs(inet_rsk(req)->loc_port) = ntohs(tcp_hdr(skb)->dest)
inet_sk(newsk)->sport = inet_rsk(req)->loc_port = tcp_hdr(skb)->dest
The destination port!!! And when the client sends its SYN request, isn't the destination port exactly the listening port? At this point everything is clear: there is no such thing as randomly picking an available port from the system; the child sock uses the listening sock's port. As for how multiple socks sharing the same local port are told apart: don't we still have the client's address and port?
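To close the loop from user space: accept a couple of clients and every accepted socket reports the same local port, while the peer address and port differ, which is exactly what distinguishes the connections. A minimal sketch, again assuming a hypothetical port 8080 with error handling omitted (connect twice from other terminals):

#include <stdio.h>
#include <string.h>
#include <arpa/inet.h>
#include <netinet/in.h>
#include <sys/socket.h>
#include <unistd.h>

int main(void)
{
    int lfd = socket(AF_INET, SOCK_STREAM, 0);
    struct sockaddr_in addr;

    memset(&addr, 0, sizeof(addr));
    addr.sin_family = AF_INET;
    addr.sin_addr.s_addr = htonl(INADDR_ANY);
    addr.sin_port = htons(8080);               /* hypothetical listening port */
    bind(lfd, (struct sockaddr *)&addr, sizeof(addr));
    listen(lfd, 5);

    for (int i = 0; i < 2; i++) {
        int cfd = accept(lfd, NULL, NULL);
        struct sockaddr_in local, peer;
        socklen_t llen = sizeof(local), plen = sizeof(peer);

        getsockname(cfd, (struct sockaddr *)&local, &llen);
        getpeername(cfd, (struct sockaddr *)&peer, &plen);
        /* Same local port every time; only the peer side differs between connections. */
        printf("local port %hu <- peer %s:%hu\n",
               ntohs(local.sin_port),
               inet_ntoa(peer.sin_addr), ntohs(peer.sin_port));
        close(cfd);
    }
    close(lfd);
    return 0;
}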