Linux Kernel Protocol Stack (Appendix 4): What is the local port of the new socket returned by accept()?

Background: a couple of days ago I argued with someone about the local port of the new connected socket obtained from accept(). I said it is the original listening port, because I often inspect connections with netstat: for ssh, which listens on port 22, every connection a client establishes with the server shows 22 as the server-side port. My colleague insisted the kernel picks a random free port, on the grounds that if the port were the same, how could the connections be told apart? At the time my mind went blank and I could not refute him. It was the classic case of "you are wrong, but I can't find the argument against you", simply because I had never looked closely. So let's sort it out now:

There are two possibilities:

1. The new socket shares the port with the listening socket.

2. The kernel picks a random free port for it.

We will get the answer by studying the kernel protocol stack.

Before reading the code, let's review the TCP three-way handshake:

Step 1: the client sends a SYN and waits for the server's reply.

Step 2: the server receives the SYN and replies with SYN+ACK.

Step 3: the client receives the SYN+ACK and replies with an ACK; the connection is established.
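To make the flag combinations concrete, here is a small illustrative sketch (not from the original article) that fills in the three handshake segments using the traditional Linux-style field names of glibc's struct tcphdr; the helper name fill_handshake and the port parameters are mine. Note in particular that the server's SYN+ACK is sent from the listening port:

#include <netinet/tcp.h>
#include <arpa/inet.h>
#include <stdint.h>
#include <string.h>

/* Illustrative only: which flags each handshake segment carries, and
 * between which ports it travels. */
static void fill_handshake(struct tcphdr seg[3],
			   uint16_t client_port, uint16_t listen_port)
{
	memset(seg, 0, 3 * sizeof(struct tcphdr));

	/* Step 1: client -> server, SYN; the destination is the listening port. */
	seg[0].syn    = 1;
	seg[0].source = htons(client_port);
	seg[0].dest   = htons(listen_port);

	/* Step 2: server -> client, SYN+ACK, sent *from* the listening port. */
	seg[1].syn    = 1;
	seg[1].ack    = 1;
	seg[1].source = htons(listen_port);
	seg[1].dest   = htons(client_port);

	/* Step 3: client -> server, ACK; the connection is established. */
	seg[2].ack    = 1;
	seg[2].source = htons(client_port);
	seg[2].dest   = htons(listen_port);
}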

Now map this onto the Linux kernel protocol stack.

On the server side:

The socket() call creates and initializes a sock object; bind() binds the local address and listening port; listen() allocates the listen-queue space for the sock object and inserts the sock into the listening hash table, one of TCP's three global hash tables. When a connection request arrives, the matching sock object is looked up in the listening hash table to handle it.

The handling is exactly the three-way handshake:

Step 1: the client sends a SYN to request a connection.

Step 2: the server receives the SYN and, using the packet's destination address and port, finds the matching sock object in the listening hash table. It then checks whether a request_sock for this connection already exists in that sock's request (SYN) queue. For the first SYN there is none, so a request_sock representing the pending request is created in the queue space, and a SYN+ACK is sent back to the client.

Step 3: the client receives the SYN+ACK and replies with an ACK. When the ACK reaches the server, the server again finds the listening sock in the listening hash table, looks up the request_sock for this connection in its request queue (this time it is found, because it was created when the SYN was processed), builds a new sock object describing the connection, and appends the request_sock together with that new sock to the tail of the accept queue icsk_accept_queue.

Then, when the user-space process calls accept(), the request_sock and the newly built sock are taken off the accept queue and the new sock is handed back to user space. This is the sock behind the new socket returned by accept(); let's call it the child sock. So what is the child sock's local port: is it chosen at random, or the same as the original listening sock's? The sketch below shows how to check this empirically from user space before we read the kernel.
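A minimal user-space check (a sketch with error handling omitted; port 2200 is an arbitrary choice for illustration): getsockname() on the descriptor returned by accept() reports the child socket's local address and port, and getpeername() reports the client side.

#include <arpa/inet.h>
#include <netinet/in.h>
#include <stdio.h>
#include <string.h>
#include <sys/socket.h>
#include <unistd.h>

int main(void)
{
	struct sockaddr_in addr, local, peer;
	socklen_t len;

	int lfd = socket(AF_INET, SOCK_STREAM, 0);

	memset(&addr, 0, sizeof(addr));
	addr.sin_family = AF_INET;
	addr.sin_addr.s_addr = htonl(INADDR_ANY);
	addr.sin_port = htons(2200);              /* arbitrary listening port */

	bind(lfd, (struct sockaddr *)&addr, sizeof(addr));
	listen(lfd, 16);

	int cfd = accept(lfd, NULL, NULL);        /* the "child" socket */

	len = sizeof(local);
	getsockname(cfd, (struct sockaddr *)&local, &len);
	len = sizeof(peer);
	getpeername(cfd, (struct sockaddr *)&peer, &len);

	/* If the kernel reuses the listening port, this prints 2200. */
	printf("local port %u, peer %s:%u\n",
	       (unsigned)ntohs(local.sin_port),
	       inet_ntoa(peer.sin_addr), (unsigned)ntohs(peer.sin_port));

	close(cfd);
	close(lfd);
	return 0;
}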

Let's first look at inet_sock, which embeds struct sock and represents a connection in the INET protocol family:

/** struct inet_sock - representation of INET sockets
 *
 * @sk - ancestor class
 * @pinet6 - pointer to IPv6 control block
 * @daddr - Foreign IPv4 addr
 * @rcv_saddr - Bound local IPv4 addr
 * @dport - Destination port
 * @num - Local port
 * @saddr - Sending source
 * @uc_ttl - Unicast TTL
 * @sport - Source port
 * @id - ID counter for DF pkts
 * @tos - TOS
 * @mc_ttl - Multicasting TTL
 * @is_icsk - is this an inet_connection_sock?
 * @mc_index - Multicast device index
 * @mc_list - Group array
 * @cork - info to build ip hdr on each ip frag while socket is corked
 */
struct inet_sock {
	/* sk and pinet6 has to be the first two members of inet_sock */
	struct sock		sk;
#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
	struct ipv6_pinfo	*pinet6;
#endif
	/* Socket demultiplex comparisons on incoming packets. */
	__be32			daddr;
	__be32			rcv_saddr;
	__be16			dport;
	__u16			num;//local (source) port in host byte order
	__be32			saddr;
	__s16			uc_ttl;
	__u16			cmsg_flags;
	struct ip_options	*opt;
	__be16			sport;//local port in network byte order
	__u16			id;
	__u8			tos;
	__u8			mc_ttl;
	__u8			pmtudisc;
	__u8			recverr:1,
				is_icsk:1,
				freebind:1,
				hdrincl:1,
				mc_loop:1,
				transparent:1,
				mc_all:1;
	int			mc_index;
	__be32			mc_addr;
	struct ip_mc_socklist	*mc_list;
	struct {
		unsigned int		flags;
		unsigned int		fragsize;
		struct ip_options	*opt;
		struct dst_entry	*dst;
		int			length; /* Total length of all frames */
		__be32			addr;
		struct flowi		fl;
	} cork;
};

Notice the two fields that represent the local port: num in host byte order, and sport in network byte order.
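As a quick aside on how the two representations relate (a tiny illustration of my own, not kernel code; port 22 is used as the example):

#include <arpa/inet.h>
#include <assert.h>
#include <stdint.h>

int main(void)
{
	uint16_t sport = htons(22);    /* like inet_sock.sport: network byte order */
	uint16_t num   = ntohs(sport); /* like inet_sock.num:   host byte order    */

	assert(num == 22);             /* the same port, just two representations */
	return 0;
}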

Next, let's walk through the three-way handshake and the accept path in the protocol-stack source.

 

When a packet arrives at the NIC, a TCP segment roughly goes through this call chain:

NIC driver --> netif_receive_skb() --> ip_rcv() --> ip_local_deliver_finish() --> tcp_v4_rcv()

Let's analyse tcp_v4_rcv():

int tcp_v4_rcv(struct sk_buff *skb)
{
...
	/*
	 * Look up the sock object by source port, destination port and the
	 * receiving interface: first in the established-connections hash
	 * table, and if nothing is found there, in the listening hash table.
	 */
	sk = __inet_lookup_skb(&tcp_hashinfo, skb, th->source, th->dest);
	if (!sk)
		goto no_tcp_socket;
...
		else
#endif
		{
			if (!tcp_prequeue(sk, skb))
				ret = tcp_v4_do_rcv(sk, skb);//process the skb with the sock we found
		}
	} else
...
}

The key step is the lookup: __inet_lookup_skb() first searches the established-connections hash table with the full four-tuple and, failing that, searches the listening hash table by destination address and port. tcp_v4_do_rcv() then processes the packet with whichever sock object was found.
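This lookup order is exactly what lets many connections share the listening port: an established connection is identified by its full four-tuple, and only when no established sock matches does the kernel fall back to a listener keyed by local address and port. Below is a much-simplified sketch of that logic; the demo_sock type, the fixed-size tables and the helper names are made up for illustration and are not the kernel's own.

#include <stdint.h>
#include <stddef.h>
#include <string.h>

struct flow_key {                 /* four-tuple, network byte order */
	uint32_t saddr, daddr;
	uint16_t sport, dport;
};

struct demo_sock {
	struct flow_key key;      /* for a listener only daddr/dport are meaningful */
};

#define TABLE_SIZE 16
static struct demo_sock established[TABLE_SIZE];
static int n_established;
static struct demo_sock listeners[TABLE_SIZE];
static int n_listeners;

/* Exact four-tuple match, like the established-connections hash table. */
static struct demo_sock *lookup_established(const struct flow_key *k)
{
	for (int i = 0; i < n_established; i++)
		if (memcmp(&established[i].key, k, sizeof(*k)) == 0)
			return &established[i];
	return NULL;
}

/* Match on local address and port only, like the listening hash table. */
static struct demo_sock *lookup_listener(uint32_t daddr, uint16_t dport)
{
	for (int i = 0; i < n_listeners; i++)
		if (listeners[i].key.daddr == daddr && listeners[i].key.dport == dport)
			return &listeners[i];
	return NULL;
}

/* Mirrors the shape of __inet_lookup_skb(): try the established table
 * first, then fall back to a listening sock. */
static struct demo_sock *demux(const struct flow_key *k)
{
	struct demo_sock *sk = lookup_established(k);
	if (sk)
		return sk;
	return lookup_listener(k->daddr, k->dport);
}

With that picture in mind, back to tcp_v4_do_rcv(), which processes the packet: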

/* The socket must have it's spinlock held when we get
 * here.
 *
 * We have a potential double-lock case here, so even when
 * doing backlog processing we use the BH locking scheme.
 * This is because we cannot sleep with the original spinlock
 * held.
 */
int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)
{
	struct sock *rsk;
#ifdef CONFIG_TCP_MD5SIG
	/*
	 * We really want to reject the packet as early as possible
	 * if:
	 *  o We're expecting an MD5'd packet and this is no MD5 tcp option
	 *  o There is an MD5 option and we're not expecting one
	 */
	if (tcp_v4_inbound_md5_hash(sk, skb))
		goto discard;
#endif

	if (sk->sk_state == TCP_ESTABLISHED) { /* Fast path */
		TCP_CHECK_TIMER(sk);
		if (tcp_rcv_established(sk, skb, tcp_hdr(skb), skb->len)) {
			rsk = sk;
			goto reset;
		}
		TCP_CHECK_TIMER(sk);
		return 0;
	}
	/*****************************************************************/
	if (skb->len < tcp_hdrlen(skb) || tcp_checksum_complete(skb))
		goto csum_err;

	if (sk->sk_state == TCP_LISTEN) {//LISTEN state: handle the handshake

		/*
		 * Look up the request_sock for this connection. For the first
		 * SYN neither a request_sock nor a child sock exists yet, so
		 * tcp_v4_hnd_req() simply returns the listening sk itself.
		 */
		struct sock *nsk = tcp_v4_hnd_req(sk, skb);
		if (!nsk)
			goto discard;

		/* For the first SYN, nsk is sk itself, so this block is skipped. */
		if (nsk != sk) {
			if (tcp_child_process(sk, nsk, skb)) {//runs tcp_rcv_state_process() on the child sock
				rsk = nsk;
				goto reset;
			}
			return 0;
		}
	}

	TCP_CHECK_TIMER(sk);
	if (tcp_rcv_state_process(sk, skb, tcp_hdr(skb), skb->len)) {//run the TCP state machine
		rsk = sk;
		goto reset;
	}
	TCP_CHECK_TIMER(sk);
	return 0;

reset:
	tcp_v4_send_reset(rsk, skb);
discard:
	kfree_skb(skb);
	/* Be careful here. If this function gets more complicated and
	 * gcc suffers from register pressure on the x86, sk (in %ebx)
	 * might be destroyed here. This current version compiles correctly,
	 * but you have been warned.
	 */
	return 0;

csum_err:
	TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_INERRS);
	goto discard;
}

 

Before explaining the function, let's look at the accept queue that lives inside the sock object:

struct inet_connection_sock {
	/* inet_sock has to be the first member! */
	struct inet_sock	  icsk_inet;
	struct request_sock_queue icsk_accept_queue;//accept queue
	struct inet_bind_bucket	  *icsk_bind_hash;
	unsigned long		  icsk_timeout;
 	struct timer_list	  icsk_retransmit_timer;
 	struct timer_list	  icsk_delack_timer;
...
}

It is a member of the connection sock (inet_connection_sock); its type is:

struct request_sock_queue {
	struct request_sock	*rskq_accept_head;
	struct request_sock	*rskq_accept_tail;
	rwlock_t		syn_wait_lock;
	u8			rskq_defer_accept;
	/* 3 bytes hole, try to pack */
	struct listen_sock	*listen_opt;
};

Here you can see the head and tail of the queue of request objects. Now look at listen_sock:

/** struct listen_sock - listen state
 *
 * @max_qlen_log - log_2 of maximal queued SYNs/REQUESTs
 */
struct listen_sock {
	u8			max_qlen_log;
	/* 3 bytes hole, try to use */
	int			qlen;
	int			qlen_young;
	int			clock_hand;
	u32			hash_rnd;
	u32			nr_table_entries;
	struct request_sock	*syn_table[0];
};
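To see how these structures are used together: rskq_accept_head/rskq_accept_tail form a plain FIFO of completed connections; the third handshake step appends to it, and accept() later drains it. Below is a much-simplified user-space sketch of that FIFO; the demo_ types and helper names are made up for illustration and only mirror what inet_csk_reqsk_queue_add() and the accept path roughly do.

#include <stddef.h>

/* Hypothetical, stripped-down stand-ins for request_sock / request_sock_queue,
 * just to show the FIFO discipline of the accept queue. */
struct demo_request_sock {
	struct demo_request_sock *dl_next;   /* next in the accept queue */
	void *child_sk;                      /* the child sock built in step 3 */
};

struct demo_accept_queue {
	struct demo_request_sock *head;      /* like rskq_accept_head */
	struct demo_request_sock *tail;      /* like rskq_accept_tail */
};

/* Roughly what adding to the accept queue boils down to: append at the tail. */
static void queue_add(struct demo_accept_queue *q, struct demo_request_sock *req)
{
	req->dl_next = NULL;
	if (q->head == NULL)
		q->head = req;
	else
		q->tail->dl_next = req;
	q->tail = req;
}

/* Roughly what accept() ultimately does: take the request at the head and
 * hand its child sock back to user space. */
static void *queue_remove(struct demo_accept_queue *q)
{
	struct demo_request_sock *req = q->head;
	if (req == NULL)
		return NULL;                 /* nothing to accept yet */
	q->head = req->dl_next;
	if (q->head == NULL)
		q->tail = NULL;
	return req->child_sk;
}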


With these three data structures in mind, let's get back to tcp_v4_do_rcv(), which calls tcp_v4_hnd_req(), and see how the kernel handles the three-way handshake.

Step 1: a SYN requesting a connection arrives. The kernel picks the sock object that should handle it and searches that sock's request (SYN) queue for a matching request_sock. Since this is the first SYN of the connection, no request_sock exists yet and nothing is found. The established-connections hash table is searched next, where of course nothing is found either, so the function simply returns sk.

static struct sock *tcp_v4_hnd_req(struct sock *sk, struct sk_buff *skb)
{
	struct tcphdr *th = tcp_hdr(skb);
	const struct iphdr *iph = ip_hdr(skb);
	struct sock *nsk;
	struct request_sock **prev;
	/* Find possible connection requests. */
	struct request_sock *req = inet_csk_search_req(sk, &prev, th->source,//search the sock's request (SYN) queue
						       iph->saddr, iph->daddr);

	/* A request_sock was found: the SYN was already received and a SYN+ACK
	 * sent, so this packet is the final ACK of the handshake; go build the
	 * new child sock. */
	if (req)
		return tcp_check_req(sk, skb, req, prev);

	/* No pending request: the connection may already be established and its
	 * sock moved to the established hash table, so search there. */
	nsk = inet_lookup_established(sock_net(sk), &tcp_hashinfo, iph->saddr,
			th->source, iph->daddr, th->dest, inet_iif(skb));

	if (nsk) {
		if (nsk->sk_state != TCP_TIME_WAIT) {
			bh_lock_sock(nsk);
			return nsk;
		}
		inet_twsk_put(inet_twsk(nsk));
		return NULL;
	}

#ifdef CONFIG_SYN_COOKIES
	if (!th->rst && !th->syn && th->ack)
		sk = cookie_v4_check(sk, skb, &(IPCB(skb)->opt));
#endif
	return sk;
}

Back in tcp_v4_do_rcv(): as analysed above, what comes back is sk itself, so the condition of the if is false, the block inside it is not executed, and we fall through to the code after it.

		/* For the first SYN, nsk is sk itself, so this block is skipped. */
		if (nsk != sk) {
			if (tcp_child_process(sk, nsk, skb)) {//runs tcp_rcv_state_process() on the child sock
				rsk = nsk;
				goto reset;
			}
			return 0;
		}

So we step into tcp_rcv_state_process(). The sock is in the LISTEN state and the packet received is a SYN:

int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb,
			  struct tcphdr *th, unsigned len)
{
	struct tcp_sock *tp = tcp_sk(sk);
	struct inet_connection_sock *icsk = inet_csk(sk);
	int queued = 0;
	int res;

	tp->rx_opt.saw_tstamp = 0;

	switch (sk->sk_state) {
	case TCP_CLOSE:
		goto discard;

	case TCP_LISTEN:
		if (th->ack)
			return 1;

		if (th->rst)
			goto discard;
		
		/* In the LISTEN state only SYN packets are processed. */
		if (th->syn) {
			if (icsk->icsk_af_ops->conn_request(sk, skb) < 0) /* tcp_v4_init_sock -> ipv4_specific -> tcp_v4_conn_request */
				return 1;

			/* Now we have several options: In theory there is
			 * nothing else in the frame. KA9Q has an option to
			 * send data with the syn, BSD accepts data with the
			 * syn up to the [to be] advertised window and
			 * Solaris 2.1 gives you a protocol error. For now
			 * we just ignore it, that fits the spec precisely
			 * and avoids incompatibilities. It would be nice in
			 * future to drop through and process the data.
			 *
			 * Now that TTCP is starting to be used we ought to
			 * queue this data.
			 * But, this leaves one open to an easy denial of
			 * service attack, and SYN cookies can't defend
			 * against this problem. So, we drop the data
			 * in the interest of security over speed unless
			 * it's still in use.
			 */
			kfree_skb(skb);
			return 0;
		}
		goto discard;

	case TCP_SYN_SENT:
		queued = tcp_rcv_synsent_state_process(sk, skb, th, len);
		if (queued >= 0)
			return queued;
...
}

So the following block is executed:

		/* In the LISTEN state only SYN packets are processed. */
		if (th->syn) {
			if (icsk->icsk_af_ops->conn_request(sk, skb) < 0) /* tcp_v4_init_sock -> ipv4_specific -> tcp_v4_conn_request */

What is actually called is tcp_v4_conn_request(). What does it do? It allocates and initializes a request_sock object, hangs it on the listening sock's request (SYN) queue, and replies with a SYN+ACK packet; in other words, it performs the second step of the handshake. One thing to keep in mind: the request object is initialized here, and that includes setting its ports. The port is exactly what we are after, but let's not analyse it yet and come back to it later.

The key lines of the function:

int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
{
...

	req = inet_reqsk_alloc(&tcp_request_sock_ops);/* build the request_sock object; for TCP it is really a tcp_request_sock (see tcp_prot / proto_register) */
	if (!req)
		goto drop;
...


	tmp_opt.tstamp_ok = tmp_opt.saw_tstamp;
	
	/* initialize the request: ports, options, etc. */
	tcp_openreq_init(req, &tmp_opt, skb);

	ireq = inet_rsk(req);
	ireq->loc_addr = daddr;
	ireq->rmt_addr = saddr;
	ireq->no_srccheck = inet_sk(sk)->transparent;
	ireq->opt = tcp_v4_save_options(sk, skb);
...
	/* reply with a SYN+ACK */
	if (__tcp_v4_send_synack(sk, req, dst) || want_cookie)
		goto drop_and_free;

	inet_csk_reqsk_queue_hash_add(sk, req, TCP_TIMEOUT_INIT);//hash the request into the listening sock's SYN queue
...
}


The SYN+ACK has been sent and the second step of the handshake is done; now the server waits for the client's ACK.

Next, the client receives the SYN+ACK and sends back an ACK, performing the third step of the handshake. When the ACK reaches the server, we are back in tcp_v4_rcv(), the sock object that should handle the packet is found again, and we enter tcp_v4_do_rcv().

tcp_v4_hnd_req() is called and searches the sock's request queue for the corresponding request_sock. Because one was built and queued when the first SYN was processed, this time it is found:

static struct sock *tcp_v4_hnd_req(struct sock *sk, struct sk_buff *skb)
{
	struct tcphdr *th = tcp_hdr(skb);
	const struct iphdr *iph = ip_hdr(skb);
	struct sock *nsk;
	struct request_sock **prev;
	/* Find possible connection requests. */
	struct request_sock *req = inet_csk_search_req(sk, &prev, th->source,//search the sock's request (SYN) queue
						       iph->saddr, iph->daddr);

	/* A request_sock was found: the SYN was already received and a SYN+ACK
	 * sent, so this packet is the final ACK of the handshake; go build the
	 * new child sock. */
	if (req)
		return tcp_check_req(sk, skb, req, prev);

Looking at tcp_v4_hnd_req(), it now takes the if branch and calls tcp_check_req(). What does that function do? Mainly two things:

1) It builds the child sock corresponding to the request. Note that this is the very sock we get when we call accept(), and it is attached to the request_sock.

2) It moves the request_sock from the request (SYN) queue to the accept queue and returns the child sock.

struct sock *tcp_check_req(struct sock *sk, struct sk_buff *skb,
			   struct request_sock *req,
			   struct request_sock **prev)
{
...
	child = inet_csk(sk)->icsk_af_ops->syn_recv_sock(sk, skb, req, NULL); /* tcp_v4_init_sock -> ipv4_specific -> tcp_v4_syn_recv_sock */
	if (child == NULL)
		goto listen_overflow;

	inet_csk_reqsk_queue_unlink(sk, req, prev);
	inet_csk_reqsk_queue_removed(sk, req);

	inet_csk_reqsk_queue_add(sk, req, child);//add the request_sock and its child sock to the listening sock's accept queue
	return child;
}


Now there is a child sock for the new connection. When user space calls accept(), the request is taken off the accept queue and the corresponding child sock is returned. Back to our question: what is the new sock's local port? A randomly chosen free port, or the port of the original listening sock?

Let's see how the new child sock is built. Returning to tcp_check_req() above, the construction happens in the following code:

	/* OK, ACK is valid, create big socket and
	 * feed this segment to it. It will repeat all
	 * the tests. THIS SEGMENT MUST MOVE SOCKET TO
	 * ESTABLISHED STATE. If it will be dropped after
	 * socket is created, wait for troubles.
	 *
	 * Build the new sock object.
	 *
	 */ 
	child = inet_csk(sk)->icsk_af_ops->syn_recv_sock(sk, skb, req, NULL); /* tcp_v4_init_sock -> ipv4_specific -> tcp_v4_syn_recv_sock */
	if (child == NULL)
		goto listen_overflow;

What is actually called is tcp_v4_syn_recv_sock(); let's look at it:

/*
 * The three way handshake has completed - we got a valid synack -
 * now create the new socket.
 *
 * The third handshake step succeeded; build the new sock object.
 * (Called from tcp_check_req().)
 */
struct sock *tcp_v4_syn_recv_sock(struct sock *sk, struct sk_buff *skb,
				  struct request_sock *req,
				  struct dst_entry *dst)
{
	struct inet_request_sock *ireq;
	struct inet_sock *newinet;
	struct tcp_sock *newtp;
	struct sock *newsk;
#ifdef CONFIG_TCP_MD5SIG
	struct tcp_md5sig_key *key;
#endif

	if (sk_acceptq_is_full(sk))
		goto exit_overflow;

	if (!dst && (dst = inet_csk_route_req(sk, req)) == NULL)
		goto exit;

	newsk = tcp_create_openreq_child(sk, req, skb);//build the new sock object
	if (!newsk)
		goto exit;

	newsk->sk_gso_type = SKB_GSO_TCPV4;
	sk_setup_caps(newsk, dst);
...
}

Scanning the whole function, there is nothing about ports yet, so let's go into the construction function tcp_create_openreq_child():

/* This is not only more efficient than what we used to do, it eliminates
 * a lot of code duplication between IPv4/IPv6 SYN recv processing. -DaveM
 *
 * Actually, we could lots of memory writes here. tp of listening
 * socket contains all necessary default parameters.
 */
struct sock *tcp_create_openreq_child(struct sock *sk, struct request_sock *req, struct sk_buff *skb)
{
	struct sock *newsk = inet_csk_clone(sk, req, GFP_ATOMIC);//clone the listening sock object

	if (newsk != NULL) {
		const struct inet_request_sock *ireq = inet_rsk(req);
		struct tcp_request_sock *treq = tcp_rsk(req);
		struct inet_connection_sock *newicsk = inet_csk(newsk);
		struct tcp_sock *newtp;

		/* Now setup tcp_sock */
		newtp = tcp_sk(newsk);
		newtp->pred_flags = 0;
		newtp->rcv_wup = newtp->copied_seq = newtp->rcv_nxt = treq->rcv_isn + 1;
		newtp->snd_sml = newtp->snd_una = newtp->snd_nxt = treq->snt_isn + 1;
		newtp->snd_up = treq->snt_isn + 1;

		tcp_prequeue_init(newtp);
...

Reading through this function there is still nothing obviously related to ports, but note that the new sock object is obtained by cloning, via inet_csk_clone():

struct sock *inet_csk_clone(struct sock *sk, const struct request_sock *req,
			    const gfp_t priority)
{
	struct sock *newsk = sk_clone(sk, priority);

	if (newsk != NULL) {
		struct inet_connection_sock *newicsk = inet_csk(newsk);

		newsk->sk_state = TCP_SYN_RECV;
		newicsk->icsk_bind_hash = NULL;

		inet_sk(newsk)->dport = inet_rsk(req)->rmt_port;
		inet_sk(newsk)->num = ntohs(inet_rsk(req)->loc_port);//local port in host byte order
		inet_sk(newsk)->sport = inet_rsk(req)->loc_port;//local port in network byte order
		newsk->sk_write_space = sk_stream_write_space;

		newicsk->icsk_retransmits = 0;
		newicsk->icsk_backoff	  = 0;
		newicsk->icsk_probes_out  = 0;

		/* Deinitialize accept_queue to trap illegal accesses. */
		memset(&newicsk->icsk_accept_queue, 0, sizeof(newicsk->icsk_accept_queue));

		security_inet_csk_clone(newsk, req);
	}
	return newsk;
}

Finally, here is the port information: it is taken from the request object. Remember the earlier reminder about how the request object is initialized? Let's look at that now.

Here is the request-object construction code again:

	
	/* initialize the request: ports, options, etc. */
	tcp_openreq_init(req, &tmp_opt, skb);

	ireq = inet_rsk(req);
	ireq->loc_addr = daddr;
	ireq->rmt_addr = saddr;
	ireq->no_srccheck = inet_sk(sk)->transparent;
	ireq->opt = tcp_v4_save_options(sk, skb);

Going straight into tcp_openreq_init():

static inline void tcp_openreq_init(struct request_sock *req,
				    struct tcp_options_received *rx_opt,
				    struct sk_buff *skb)
{
	struct inet_request_sock *ireq = inet_rsk(req);

	req->rcv_wnd = 0;		/* So that tcp_send_synack() knows! */
	req->cookie_ts = 0;
	tcp_rsk(req)->rcv_isn = TCP_SKB_CB(skb)->seq;
	req->mss = rx_opt->mss_clamp;
	req->ts_recent = rx_opt->saw_tstamp ? rx_opt->rcv_tsval : 0;
	ireq->tstamp_ok = rx_opt->tstamp_ok;
	ireq->sack_ok = rx_opt->sack_ok;
	ireq->snd_wscale = rx_opt->snd_wscale;
	ireq->wscale_ok = rx_opt->wscale_ok;
	ireq->acked = 0;
	ireq->ecn_ok = 0;
	ireq->rmt_port = tcp_hdr(skb)->source;//the client's (remote) port
	ireq->loc_port = tcp_hdr(skb)->dest;//the local port = the packet's destination port
}


Putting the code above together, the assignments amount to:

		inet_sk(newsk)->num   == ntohs(inet_rsk(req)->loc_port) == ntohs(tcp_hdr(skb)->dest)
		inet_sk(newsk)->sport == inet_rsk(req)->loc_port        == tcp_hdr(skb)->dest

The destination port! When the client sends its SYN, isn't the destination port precisely the listening port? At this point everything is clear: there is no "randomly chosen free port" at all; the child sock uses the listening sock's port. And how do we tell apart multiple socks that share the same local port? That is what the client's address and port are for, as the sketch below illustrates.
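To close the loop on that last point, here is a tiny illustration of my own (the conn_key type is hypothetical and the addresses arbitrary): two ssh sessions both have local port 22 on the server, yet their four-tuples differ on the client side, so they are distinct keys in the established hash table.

#include <arpa/inet.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>

/* Hypothetical key: conceptually the same four-tuple the kernel uses to
 * identify an established connection. */
struct conn_key {
	uint32_t laddr, raddr;   /* local and remote IPv4 address */
	uint16_t lport, rport;   /* local and remote port */
};

static int same_connection(const struct conn_key *a, const struct conn_key *b)
{
	/* No padding in this layout, so a byte compare is enough here. */
	return memcmp(a, b, sizeof(*a)) == 0;
}

int main(void)
{
	/* Two clients connected to the same ssh server, both to local port 22. */
	struct conn_key c1 = { inet_addr("192.168.1.10"), inet_addr("192.168.1.100"),
			       htons(22), htons(50001) };
	struct conn_key c2 = { inet_addr("192.168.1.10"), inet_addr("192.168.1.101"),
			       htons(22), htons(40123) };

	/* Same local port, yet clearly different connections. */
	printf("same local port: %d, same connection: %d\n",
	       ntohs(c1.lport) == ntohs(c2.lport), same_connection(&c1, &c2));
	return 0;
}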


 
