Linux内核网络栈1.2.13-tcp.c概述

参考资料
<<linux内核网络栈源代码情景分析>>
af_inet.c文件中调用函数在协议层的实现

本文主要分析af_inet.c文件中的套接字调用在协议层(TCP)的具体实现。af_inet.c中的inet_create函数会根据传入的套接字类型,选择对应协议的操作函数集。

/*
 * Excerpt of inet_create(): only the stream-socket branch is shown, the
 * "..." lines mark code elided by the article.  For SOCK_STREAM and
 * SOCK_SEQPACKET the socket is bound to the TCP operation table tcp_prot.
 * Returns -EPROTONOSUPPORT when a non-zero protocol other than
 * IPPROTO_TCP is requested for these socket types.
 */
static int inet_create(struct socket *sock, int protocol)
{
	...	
	switch(sock->type)  									// choose the protocol operations from the socket type
	{
		case SOCK_STREAM: 									// stream sockets use the TCP operation set
		case SOCK_SEQPACKET:
			if (protocol && protocol != IPPROTO_TCP)        // an explicit protocol other than IPPROTO_TCP is rejected
			{
				kfree_s((void *)sk, sizeof(*sk));
				return(-EPROTONOSUPPORT);
			}
			protocol = IPPROTO_TCP; 						// protocol 0 (unspecified) becomes TCP
			/* Default checksum policy comes from TCP_NO_CHECK.  NOTE(review): going by the
			   name, a value of 1 would mean "no checksum by default"; the macro's value is
			   not visible in this excerpt -- confirm against the header.	*/
			sk->no_check = TCP_NO_CHECK;
			prot = &tcp_prot; 								// select the TCP operation table
			break;
		...
	}
	...
}

此时选择的是TCP协议,对应的协议操作函数集就是tcp_prot,其类型struct proto的定义如下:

/*
 * Per-protocol operation table.  Each transport protocol supplies one
 * instance (tcp_prot below is the TCP one) and the socket layer calls
 * through these pointers.  The member order matters: instances are
 * filled in with positional initializers.
 */
struct proto {
  /* Buffer allocation / release and space queries for the write (w) and read (r) side. */
  struct sk_buff *	(*wmalloc)(struct sock *sk,
						    unsigned long size, int force,
						    int priority);
  struct sk_buff *	(*rmalloc)(struct sock *sk,
						    unsigned long size, int force,
						    int priority);
  void			(*wfree)(struct sock *sk, struct sk_buff *skb,
						    unsigned long size);
  void			(*rfree)(struct sock *sk, struct sk_buff *skb,
						    unsigned long size);
  unsigned long	(*rspace)(struct sock *sk);
  unsigned long	(*wspace)(struct sock *sk);
  /* Socket-level operations. */
  void			(*close)(struct sock *sk, int timeout);
  int				(*read)(struct sock *sk, unsigned char *to,
						int len, int nonblock, unsigned flags);
  int				(*write)(struct sock *sk, unsigned char *to,
						 int len, int nonblock, unsigned flags);
  int				(*sendto)(struct sock *sk,
				     	         unsigned char *from, int len, int noblock,
						  unsigned flags, struct sockaddr_in *usin,
						  int addr_len);
  int				(*recvfrom)(struct sock *sk,
						    unsigned char *from, int len, int noblock,
						    unsigned flags, struct sockaddr_in *usin,
						    int *addr_len);
  /* Builds the lower-layer header in front of the payload (TCP plugs in ip_build_header). */
  int				(*build_header)(struct sk_buff *skb,
							unsigned long saddr,
							unsigned long daddr,
							struct device **dev, int type,
							struct options *opt, int len, int tos, int ttl);
  int				(*connect)(struct sock *sk,
						  struct sockaddr_in *usin, int addr_len);
  struct sock *	(*accept) (struct sock *sk, int flags);
  /* Hands a finished buffer to the lower layer for transmission (TCP plugs in ip_queue_xmit). */
  void			(*queue_xmit)(struct sock *sk,
							      struct device *dev, struct sk_buff *skb,
							      int free);
  void			(*retransmit)(struct sock *sk, int all);
  void			(*write_wakeup)(struct sock *sk);
  void			(*read_wakeup)(struct sock *sk);
  /* Receive entry point for incoming packets of this protocol. */
	  int			(*rcv)(struct sk_buff *buff, struct device *dev,
					       struct options *opt, unsigned long daddr,
					       unsigned short len, unsigned long saddr,
					       int redo, struct inet_protocol *protocol);
  int			(*select)(struct sock *sk, int which,
				  select_table *wait);
  int			(*ioctl)(struct sock *sk, int cmd,
				 unsigned long arg);
  int			(*init)(struct sock *sk);
  void		(*shutdown)(struct sock *sk, int how);
  int			(*setsockopt)(struct sock *sk, int level, int optname,
  						 char *optval, int optlen);
  int			(*getsockopt)(struct sock *sk, int level, int optname,
  						char *optval, int *option);  	 
  
  /* Data members. */
  unsigned short	max_header;	// NOTE(review): presumably the maximum header space to reserve; callers add it to allocation sizes
  unsigned long	retransmits;
  struct sock *	sock_array[SOCK_ARRAY_SIZE];	// table of sock pointers for this protocol
  char			name[80];	// protocol name, e.g. "TCP"
  int				inuse, 			// NOTE(review): paired with highestinuse, so presumably a usage count rather than a flag -- confirm
  				highestinuse;
};


// 定义tcp_prot如下
/*
 * TCP instance of struct proto.  The initializer is positional, so each
 * entry must stay in the same order as the members of struct proto; the
 * trailing comment names the member each entry fills.
 */
struct proto tcp_prot = {
	sock_wmalloc,		// wmalloc
	sock_rmalloc,		// rmalloc
	sock_wfree,		// wfree
	sock_rfree,		// rfree
	sock_rspace,		// rspace
	sock_wspace,		// wspace
	tcp_close,		// close
	tcp_read,		// read
	tcp_write,		// write
	tcp_sendto,		// sendto
	tcp_recvfrom,		// recvfrom
	ip_build_header,	// build_header
	tcp_connect,		// connect
	tcp_accept,		// accept
	ip_queue_xmit,		// queue_xmit
	tcp_retransmit,		// retransmit
	tcp_write_wakeup,	// write_wakeup
	tcp_read_wakeup,	// read_wakeup
	tcp_rcv,		// rcv
	tcp_select,		// select
	tcp_ioctl,		// ioctl
	NULL,				// init (TCP provides none)
	tcp_shutdown,		// shutdown
	tcp_setsockopt,		// setsockopt
	tcp_getsockopt,		// getsockopt
	128,			// max_header
	0,			// retransmits
	{NULL,},		// sock_array, all entries start out NULL
	"TCP",			// name
	0, 0			// inuse, highestinuse
};

如上可知,在af_inet文件中调用的协议处理函数,如connect对应的协议函数为tcp_connect,accept对应的函数就是tcp_accept,inet_release中通过prot调用的close方法对应tcp_close等。本文就来分析一下其中几个主要函数的具体操作。

socket执行过程

主要就是操作连接connect,accept等操作。

tcp_connect函数
/*
 *	This will initiate an outgoing connection. 
 *
 *	Transport-layer side of connect(): validates the request, picks the
 *	initial sequence number, builds a SYN segment carrying a 4-byte
 *	maximum-segment-size option, moves the socket to TCP_SYN_SENT, arms
 *	the retransmit timer and hands the segment to queue_xmit.
 *
 *	sk       - socket to connect; must be in TCP_CLOSE.
 *	usin     - destination address and port.
 *	addr_len - length of *usin, must be at least 8.
 *
 *	Returns 0 once the SYN has been queued, or a negative errno:
 *	-EISCONN, -EINVAL, -EAFNOSUPPORT, -ENETUNREACH or -ENOMEM.
 */
 
static int tcp_connect(struct sock *sk, struct sockaddr_in *usin, int addr_len)
{
	struct sk_buff *buff;
	struct device *dev=NULL;
	unsigned char *ptr;
	int tmp;
	int atype;
	struct tcphdr *t1;
	struct rtable *rt;

	if (sk->state != TCP_CLOSE)  															// only a closed socket may start a connection
	{
		return(-EISCONN);
	}
	
	if (addr_len < 8)  																		// address structure too short
		return(-EINVAL);

	if (usin->sin_family && usin->sin_family != AF_INET)  									// a family, if given, must be AF_INET
		return(-EAFNOSUPPORT);

  	/*
  	 *	connect() to INADDR_ANY means loopback (BSD'ism).
  	 */
  	
  	if(usin->sin_addr.s_addr==INADDR_ANY)
		usin->sin_addr.s_addr=ip_my_addr(); 												// substitute the address returned by ip_my_addr()
		  
	/*
	 *	Don't want a TCP connection going to a broadcast address 
	 */

	if ((atype=ip_chk_addr(usin->sin_addr.s_addr)) == IS_BROADCAST || atype==IS_MULTICAST)  		// classify the destination; refuse broadcast and multicast
		return -ENETUNREACH;
  
	sk->inuse = 1; 																					// mark the socket busy while its fields are set up
	sk->daddr = usin->sin_addr.s_addr;
	sk->write_seq = tcp_init_seq(); 																// initial send sequence number
	sk->window_seq = sk->write_seq;
	sk->rcv_ack_seq = sk->write_seq -1; 															// nothing acknowledged yet: one before the initial sequence
	sk->err = 0;
	sk->dummy_th.dest = usin->sin_port;
	release_sock(sk);

	buff = sk->prot->wmalloc(sk,MAX_SYN_SIZE,0, GFP_KERNEL); 										// buffer for the SYN segment
	if (buff == NULL) 
	{
		return(-ENOMEM);
	}
	sk->inuse = 1;
	buff->len = 24;																				// 24 = TCP header plus the 4 option bytes checksummed below (sizeof(struct tcphdr) + 4)
	buff->sk = sk;
	buff->free = 0;
	buff->localroute = sk->localroute;
	
	t1 = (struct tcphdr *) buff->data;

	/*
	 *	Put in the IP header and routing stuff. 
	 */
	 
	rt=ip_rt_route(sk->daddr, NULL, NULL); 														// route lookup; only used below for window clamp and MSS hints
	

	/*
	 *	We need to build the routing stuff from the things saved in skb. 
	 */

	tmp = sk->prot->build_header(buff, sk->saddr, sk->daddr, &dev,
					IPPROTO_TCP, NULL, MAX_SYN_SIZE,sk->ip_tos,sk->ip_ttl); 					// build the lower-layer headers; tmp is their length, or negative on failure
	if (tmp < 0) 
	{
		sk->prot->wfree(sk, buff->mem_addr, buff->mem_len); 									// header build failed: give the buffer back
		release_sock(sk);
		return(-ENETUNREACH);
	}

	buff->len += tmp;
	t1 = (struct tcphdr *)((char *)t1 +tmp);													// the TCP header starts right after the lower-layer headers

	memcpy(t1,(void *)&(sk->dummy_th), sizeof(*t1)); 											// start from the socket's template TCP header
	t1->seq = ntohl(sk->write_seq++);															// the SYN consumes one sequence number
	sk->sent_seq = sk->write_seq;
	buff->h.seq = sk->write_seq;
	t1->ack = 0;
	t1->window = 2;
	t1->res1=0;
	t1->res2=0;
	t1->rst = 0;
	t1->urg = 0;
	t1->psh = 0;
	t1->syn = 1;
	t1->urg_ptr = 0;
	t1->doff = 6;																				// header length in 32-bit words, including the 4-byte option
	/* use 512 or whatever user asked for */
	
	if(rt!=NULL && (rt->rt_flags&RTF_WINDOW))
		sk->window_clamp=rt->rt_window;
	else
		sk->window_clamp=0;

	/* MTU choice: user setting first, then the route's value, then a default. */
	if (sk->user_mss)
		sk->mtu = sk->user_mss;
	else if(rt!=NULL && (rt->rt_flags&RTF_MTU))
		sk->mtu = rt->rt_mss;
	else 
	{
		/* Source and destination differ under the mask: use the conservative 576-byte datagram size. */
#ifdef CONFIG_INET_SNARL
		if ((sk->saddr ^ sk->daddr) & default_mask(sk->saddr))
#else
		if ((sk->saddr ^ sk->daddr) & dev->pa_mask)												// dev is expected to have been filled in by build_header via &dev
#endif
			sk->mtu = 576 - HEADER_SIZE;
		else
			sk->mtu = MAX_WINDOW;
	}
	/*
	 *	but not bigger than device MTU 
	 */

	if(sk->mtu <32)
		sk->mtu = 32;	/* Sanity limit */
		
	sk->mtu = min(sk->mtu, dev->mtu - HEADER_SIZE);
	
	/*
	 *	Put in the TCP options to say MTU. 
	 */

	ptr = (unsigned char *)(t1+1);																// option bytes follow the fixed header
	ptr[0] = 2;																					// option kind 2
	ptr[1] = 4;																					// option length 4
	ptr[2] = (sk->mtu) >> 8;																	// value, high byte first
	ptr[3] = (sk->mtu) & 0xff;
	tcp_send_check(t1, sk->saddr, sk->daddr,
		  sizeof(struct tcphdr) + 4, sk); 												// checksum over the header plus the 4 option bytes

	/*
	 *	This must go first otherwise a really quick response will get reset. 
	 */

	tcp_set_state(sk,TCP_SYN_SENT); 													// enter SYN_SENT before the segment leaves
	sk->rto = TCP_TIMEOUT_INIT;
#if 0 /* we already did this */
	init_timer(&sk->retransmit_timer); 
#endif
	sk->retransmit_timer.function=&retransmit_timer; 									// timer callback and its argument (the socket)
	sk->retransmit_timer.data = (unsigned long)sk;
	reset_xmit_timer(sk, TIME_WRITE, sk->rto);	/* Timer for repeating the SYN until an answer */
	sk->retransmits = TCP_SYN_RETRIES;

	sk->prot->queue_xmit(sk, dev, buff, 0);   											// hand the SYN to the lower layer
	reset_xmit_timer(sk, TIME_WRITE, sk->rto);
	tcp_statistics.TcpActiveOpens++;
	tcp_statistics.TcpOutSegs++;
  
	release_sock(sk);
	return(0);
}

该函数是connect系统调用在传输层的实现,主要工作是发送SYN连接请求数据包。在主动打开的一端,主要进行了参数检查、TCP首部各字段的赋值以及socket状态的更新,最后启动超时重发定时器。

tcp_accept函数
/*
 *	This will accept the next outstanding connection.
 *
 *	Transport-layer side of accept(): takes the next buffer from
 *	tcp_dequeue_established() and returns the sock attached to it.
 *	If none is available it either fails at once (O_NONBLOCK) or sleeps
 *	on sk->sleep until woken.
 *
 *	Returns the new sock, or NULL with sk->err set to EINVAL (socket not
 *	listening), EAGAIN (non-blocking, nothing pending) or ERESTARTSYS
 *	(a signal arrived while sleeping).
 */
 
static struct sock *tcp_accept(struct sock *sk, int flags)
{
	struct sock *newsk;
	struct sk_buff *skb;
  
    /*
   * We need to make sure that this socket is listening,
   * and that it has something pending.
   */

	if (sk->state != TCP_LISTEN)  							// only a listening socket can accept
	{
		sk->err = EINVAL;
		return(NULL); 
	}

	/* Avoid the race. */
	cli(); 													// interrupts off while the queue is examined
	sk->inuse = 1;

	while((skb = tcp_dequeue_established(sk)) == NULL)  	// loop until an established connection can be dequeued
	{
		if (flags & O_NONBLOCK)  							// non-blocking: do not wait
		{
			sti();
			release_sock(sk);
			sk->err = EAGAIN;
			return(NULL); 									// nothing pending
		}

		release_sock(sk); 									// release the socket before going to sleep
		interruptible_sleep_on(sk->sleep);
		if (current->signal & ~current->blocked) 			// woken by an unblocked signal
		{
			sti();
			sk->err = ERESTARTSYS;
			return(NULL);
		}
		sk->inuse = 1;
  	}
	sti(); 													// interrupts back on; skb holds a pending connection

	/*
	 *	Now all we need to do is return skb->sk. 
	 */

	newsk = skb->sk; 										// the new connection's sock travels in skb->sk

	kfree_skb(skb, FREE_READ);
	sk->ack_backlog--;										// one fewer connection pending on the listener
	release_sock(sk); 										
	return(newsk); 											// hand the new sock to the caller
}

该函数是accept系统调用的传输层的实现,该函数主要是从侦听套接字接受队列中取数据包,查看其是否完成TCP三次握手建立过程,如果没有则等待完成,否则返回数据包对应的sock,实际上该函数获取了处理结果,其过程都由其他函数如tcp_conn_request,tcp_ack完成。

tcp_write函数
/*
 *	This routine copies from a user buffer into a socket,
 *	and starts the transmit system.
 *
 *	Transport-layer handler behind write() (installed as the write
 *	member of tcp_prot).  NOTE(review): the article states that send and
 *	sendto on a TCP socket also end up here; that path is not visible in
 *	this excerpt.
 *
 *	sk       - socket to send on.
 *	from     - user-space source buffer.
 *	len      - number of bytes to send.
 *	nonblock - non-zero: never sleep.
 *	flags    - MSG_OOB and MSG_DONTROUTE are examined here.
 *
 *	Returns the number of bytes accepted.  If nothing was accepted it
 *	returns a negative errno instead: the pending sk->err, -EPIPE,
 *	-EAGAIN, -ERESTARTSYS, or the error from header construction.
 */

static int tcp_write(struct sock *sk, unsigned char *from,
	  int len, int nonblock, unsigned flags)
{
	int copied = 0;
	int copy;
	int tmp;
	struct sk_buff *skb;
	struct sk_buff *send_tmp;
	unsigned char *buff;
	struct proto *prot;
	struct device *dev = NULL;

	sk->inuse=1; 											// mark the socket busy
	prot = sk->prot; 										// protocol operation table of this socket
	while(len > 0)  										// one pass per packet built or extended
	{
		if (sk->err)  										// a pending socket error stops the write
		{			/* Stop on an error */
			release_sock(sk);
			if (copied) 
				return(copied);								// partial success wins over the error
			tmp = -sk->err;
			sk->err = 0;
			return(tmp);
		}

		/*
		 *	First thing we do is make sure that we are established. 
		 */
	
		if (sk->shutdown & SEND_SHUTDOWN)  					// the sending side has been shut down
		{
			release_sock(sk);
			sk->err = EPIPE;
			if (copied) 
				return(copied);
			sk->err = 0;
			return(-EPIPE);
		}

		/* 
		 *	Wait for a connection to finish.
		 */
	
		while(sk->state != TCP_ESTABLISHED && sk->state != TCP_CLOSE_WAIT)   	// data may only be sent in ESTABLISHED or CLOSE_WAIT
		{
			if (sk->err)  														// error while waiting
			{
				release_sock(sk);
				if (copied)  													// report bytes already accepted, if any
					return(copied);
				tmp = -sk->err;
				sk->err = 0;
				return(tmp);
			}

			if (sk->state != TCP_SYN_SENT && sk->state != TCP_SYN_RECV)  	  // not in the middle of a handshake either: give up
			{
				release_sock(sk);  												
				if (copied) 
					return(copied);

				if (sk->err) 
				{
					tmp = -sk->err;
					sk->err = 0;
					return(tmp);
				}

				if (sk->keepopen) 
				{
					send_sig(SIGPIPE, current, 0);
				}
				return(-EPIPE);
			}

			if (nonblock || copied)  										// do not wait if non-blocking or if some data was already accepted
			{
				release_sock(sk);
				if (copied) 
					return(copied);
				return(-EAGAIN);
			}

			release_sock(sk);
			cli(); 															// interrupts off so the state test and the sleep cannot race
		
			if (sk->state != TCP_ESTABLISHED &&
		    		sk->state != TCP_CLOSE_WAIT && sk->err == 0)  			// still not connected and no error: sleep
		    	{
				interruptible_sleep_on(sk->sleep);
				if (current->signal & ~current->blocked)  					// woken by an unblocked signal
				{
					sti();
					if (copied) 
						return(copied);
					return(-ERESTARTSYS);
				}
			}
			sk->inuse = 1; 													// take the socket again and re-test the state
			sti();
		}

	/*
	 * The following code can result in copy <= if sk->mss is ever
	 * decreased.  It shouldn't be.  sk->mss is min(sk->mtu, sk->max_window).
	 * sk->mtu is constant once SYN processing is finished.  I.e. we
	 * had better not get here until we've seen his SYN and at least one
	 * valid ack.  (The SYN sets sk->mtu and the ack sets sk->max_window.)
	 * But ESTABLISHED should guarantee that.  sk->max_window is by definition
	 * non-decreasing.  Note that any ioctl to set user_mss must be done
	 * before the exchange of SYN's.  If the initial ack from the other
	 * end has a window of 0, max_window and thus mss will both be 0.
	 */

	/* 
	 *	Now we need to check if we have a half built packet. 
	 */

		if ((skb = tcp_dequeue_partial(sk)) != NULL)  							// a partially filled packet is waiting
		{
		        int hdrlen;

		         /* IP header + TCP header */
			hdrlen = ((unsigned long)skb->h.th - (unsigned long)skb->data)
			         + sizeof(struct tcphdr); 									// bytes in front of the payload
	
			/* Add more stuff to the end of skb->len */
			if (!(flags & MSG_OOB)) 
			{
				copy = min(sk->mss - (skb->len - hdrlen), len); 				// room left up to one MSS, capped by what the caller has
				/* FIXME: this is really a bug. */
				if (copy <= 0) 
				{
			  		printk("TCP: **bug**: \"copy\" <= 0!!\n");
			  		copy = 0;
				}
	  
				memcpy_fromfs(skb->data + skb->len, from, copy); 				// append user data to the packet
				skb->len += copy;
				from += copy;
				copied += copy;
				len -= copy;
				sk->write_seq += copy; 											// advance the write sequence by the bytes queued
			}
			if ((skb->len - hdrlen) >= sk->mss ||
				(flags & MSG_OOB) || !sk->packets_out) 							// packet full, urgent data, or nothing in flight: send now
				tcp_send_skb(sk, skb);
			else
				tcp_enqueue_partial(skb, sk);	 								// otherwise keep it as the partial packet
			continue;
		}

	/*
	 * We also need to worry about the window.
 	 * If window < 1/2 the maximum window we've seen from this
 	 *   host, don't use it.  This is sender side
 	 *   silly window prevention, as specified in RFC1122.
 	 *   (Note that this is different than earlier versions of
 	 *   SWS prevention, e.g. RFC813.).  What we actually do is 
	 *   use the whole MSS.  Since the results in the right
	 *   edge of the packet being outside the window, it will
	 *   be queued for later rather than sent.
	 */

		copy = sk->window_seq - sk->write_seq; 									// space left in the send window
		if (copy <= 0 || copy < (sk->max_window >> 1) || copy > sk->mss) 		// window closed, under half the maximum, or above MSS: use a full MSS
			copy = sk->mss;
		if (copy > len) 														// never more than the caller still has
			copy = len;

	/*
	 *	We should really check the window here also. 
	 */
	 
		send_tmp = NULL;
		if (copy < sk->mss && !(flags & MSG_OOB))  								// short non-urgent write: allocate a full-size buffer so later writes can be appended
		{
			/*
			 *	We will release the socket in case we sleep here. 
			 */
			release_sock(sk);
			/*
			 *	NB: following must be mtu, because mss can be increased.
			 *	mss is always <= mtu 
			 */
			skb = prot->wmalloc(sk, sk->mtu + 128 + prot->max_header, 0, GFP_KERNEL);
			sk->inuse = 1;
			send_tmp = skb; 															// remember it as a candidate partial packet
		} 
		else 
		{
			/*
			 *	We will release the socket in case we sleep here. 
			 */
			release_sock(sk); 															
			skb = prot->wmalloc(sk, copy + prot->max_header , 0, GFP_KERNEL); 			// buffer sized for exactly this packet
  			sk->inuse = 1;
		}

		/*
		 *	If we didn't get any memory, we need to sleep. 
		 */

		if (skb == NULL)  																// allocation failed
		{
			sk->socket->flags |= SO_NOSPACE;
			if (nonblock)  																// non-blocking: return instead of waiting
			{
				release_sock(sk);
				if (copied) 
					return(copied);
				return(-EAGAIN);
			}

			/*
			 *	FIXME: here is another race condition. 
			 */

			tmp = sk->wmem_alloc; 														// snapshot of write memory in use
			release_sock(sk);
			cli();
			/*
			 *	Again we will try to avoid it. 
			 */
			if (tmp <= sk->wmem_alloc &&
				  (sk->state == TCP_ESTABLISHED||sk->state == TCP_CLOSE_WAIT) 			// sleep only if no write memory was freed meanwhile and the connection is still usable
				&& sk->err == 0) 
			{
				sk->socket->flags &= ~SO_NOSPACE;
				interruptible_sleep_on(sk->sleep);
				if (current->signal & ~current->blocked) 								// woken by an unblocked signal
				{
					sti();
					if (copied) 
						return(copied);
					return(-ERESTARTSYS);
				}
			}
			sk->inuse = 1;
			sti();
			continue;																	// retry from the top of the loop
		}

		skb->len = 0; 																	// fresh buffer: initialise its bookkeeping
		skb->sk = sk;
		skb->free = 0;
		skb->localroute = sk->localroute|(flags&MSG_DONTROUTE);
	
		buff = skb->data;
	
		/*
		 * FIXME: we need to optimize this.
		 * Perhaps some hints here would be good.
		 */
		
		tmp = prot->build_header(skb, sk->saddr, sk->daddr, &dev,
				 IPPROTO_TCP, sk->opt, skb->mem_len,sk->ip_tos,sk->ip_ttl); 			// lower-layer headers; tmp is their length, or negative on failure
		if (tmp < 0 ) 
		{
			prot->wfree(sk, skb->mem_addr, skb->mem_len);
			release_sock(sk);
			if (copied) 
				return(copied);
			return(tmp);
		}
		skb->len += tmp;
		skb->dev = dev;
		buff += tmp;
		skb->h.th =(struct tcphdr *) buff; 												// remember where the TCP header starts
		tmp = tcp_build_header((struct tcphdr *)buff, sk, len-copy); 					// TCP header; the last argument is the data still to come after this packet
		if (tmp < 0)
		{
			prot->wfree(sk, skb->mem_addr, skb->mem_len);
			release_sock(sk);
			if (copied) 
				return(copied);
			return(tmp);
		}

		if (flags & MSG_OOB) 
		{
			((struct tcphdr *)buff)->urg = 1; 											// mark the segment urgent
			((struct tcphdr *)buff)->urg_ptr = ntohs(copy);
		}
		skb->len += tmp;
		memcpy_fromfs(buff+tmp, from, copy); 											// user payload goes right after the TCP header

		from += copy;
		copied += copy;
		len -= copy;
		skb->len += copy;
		skb->free = 0;
		sk->write_seq += copy;
	
		if (send_tmp != NULL && sk->packets_out)  										// full-size buffer and packets still in flight: hold it back for coalescing
		{
			tcp_enqueue_partial(send_tmp, sk);
			continue;
		}
		tcp_send_skb(sk, skb); 															// otherwise send it now
	}
	sk->err = 0; 																		// everything was accepted: clear the error field

/*
 *	Nagle's rule. Turn Nagle off with TCP_NODELAY for highly
 *	interactive fast network servers. It's meant to be on and
 *	it really improves the throughput though not the echo time
 *	on my slow slip link - Alan
 */

/*
 *	Avoid possible race on send_tmp - c/o Johannes Stille 
 */
 
	if(sk->partial && ((!sk->packets_out) 
     /* If not nagling we can send on the before case too.. */
	      || (sk->nonagle && before(sk->write_seq , sk->window_seq)) 				// flush the partial packet if nothing is in flight, or if Nagle is off and write_seq is still before window_seq
      	))
  		tcp_send_partial(sk);

	release_sock(sk);
	return(copied); 																// number of bytes accepted
}

该函数是系统调用write函数的传输层处理函数,对于TCP协议而言,send,sendto系统调用最后都将调用tcp_write函数进行处理。该函数本身由一个while循环构成,循环条件就是len>0。首先判断之前套接字操作是否出现错误,如果出现错误,则返回错误;在发送数据之前必须确定套接字状态为可发送数据状态,如果状态条件不满足,则根据是否阻塞来决定是否等待。当套接字状态允许发送数据时,则进行内核数据结构的创建,复制用户缓冲区数据到内核缓冲区,在成功完成帧首部创建后,将封装后的数据发送给下层进一步处理。

tcp_read函数
/*
 *	This routine copies from a sock struct into the user buffer.
 *
 *	Transport-layer handler behind read() (installed as the read member
 *	of tcp_prot): copies data queued on sk->receive_queue to user space.
 *
 *	sk       - socket to read from.
 *	to       - user-space destination buffer.
 *	len      - maximum number of bytes to copy.
 *	nonblock - non-zero: return -EAGAIN instead of sleeping.
 *	flags    - MSG_OOB diverts to tcp_read_urg(); MSG_PEEK reads without
 *	           advancing sk->copied_seq.
 *
 *	Returns the number of bytes copied (0 once the stream is shut down
 *	for reading), or a negative errno when nothing was copied.
 */
 
static int tcp_read(struct sock *sk, unsigned char *to,
	int len, int nonblock, unsigned flags)
{
	struct wait_queue wait = { current, NULL };
	int copied = 0;
	unsigned long peek_seq;
	volatile unsigned long *seq;	/* So gcc doesn't overoptimise */
	unsigned long used;

	/* 
	 *	This error should be checked. 
	 */
	 
	if (sk->state == TCP_LISTEN) 					// a listening socket carries no data
		return -ENOTCONN;

	/*
	 *	Urgent data needs to be handled specially. 
	 */
	 
	if (flags & MSG_OOB) 							// caller asked for out-of-band data
		return tcp_read_urg(sk, nonblock, to, len, flags);

	/*
	 *	Copying sequence to update. This is volatile to handle
	 *	the multi-reader case neatly (memcpy_to/fromfs might be 
	 *	inline and thus not flush cached variables otherwise).
	 */
	 
	peek_seq = sk->copied_seq; 						// private copy of the read position, used for MSG_PEEK
	seq = &sk->copied_seq;  
	if (flags & MSG_PEEK) 							// peeking must not move the socket's real read position
		seq = &peek_seq;

	add_wait_queue(sk->sleep, &wait); 				// register on the socket's wait queue before testing for data
	sk->inuse = 1;
	while (len > 0)
	{
		struct sk_buff * skb;
		unsigned long offset;
	
		/*
		 * Are we at urgent data? Stop if we have read anything.
		 */
		 
		if (copied && sk->urg_data && sk->urg_seq == *seq)
			break;

		/*
		 *	Next get a buffer.
		 */
		 
		current->state = TASK_INTERRUPTIBLE; 					// set before scanning, so the schedule() below sleeps unless the state is changed first

		skb = skb_peek(&sk->receive_queue); 					// first buffer on the receive queue, not removed
		do 
		{
			if (!skb) 											// queue empty
				break;
			if (before(*seq, skb->h.th->seq)) 		 			// read position lies before this buffer: gap in the stream
				break;
			offset = *seq - skb->h.th->seq; 					// read position relative to the start of this buffer
			if (skb->h.th->syn) 								// a SYN takes one sequence number but carries no data byte
				offset--;
			if (offset < skb->len) 								// unread data in this buffer
				goto found_ok_skb;
			if (skb->h.th->fin) 								// fully read and it carries FIN
				goto found_fin_ok;
			if (!(flags & MSG_PEEK)) 							// fully consumed: mark it used, unless peeking
				skb->used = 1;
			skb = skb->next;
		}
		while (skb != (struct sk_buff *)&sk->receive_queue); 	// stop once the walk is back at the list head

		/* No readable data found: decide between returning and sleeping. */
		if (copied) 											
			break;

		if (sk->err)
		{
			copied = -sk->err; 									// report the pending error and clear it
			sk->err = 0;
			break;
		}

		if (sk->state == TCP_CLOSE)
		{
			if (!sk->done)  									// first read after close returns 0; later ones fail
			{
				sk->done = 1;
				break;
			}
			copied = -ENOTCONN;
			break;
		}

		if (sk->shutdown & RCV_SHUTDOWN)  						// receive side shut down: return 0
		{
			sk->done = 1;
			break;
		}
			
		if (nonblock)
		{
			copied = -EAGAIN;
			break;
		}

		cleanup_rbuf(sk); 										// tidy the receive queue before sleeping
		release_sock(sk);
		sk->socket->flags |= SO_WAITDATA; 
		schedule(); 											// give up the CPU until woken
		sk->socket->flags &= ~SO_WAITDATA;
		sk->inuse = 1;

		if (current->signal & ~current->blocked)  				// woken by an unblocked signal
		{
			copied = -ERESTARTSYS;
			break;
		}
		continue;

	found_ok_skb: 												// skb has unread data starting at offset
		/*
		 *	Lock the buffer. We can be fairly relaxed as
		 *	an interrupt will never steal a buffer we are 
		 *	using unless I've missed something serious in
		 *	tcp_data.
		 */
		
		skb->users++; 											// pin the buffer across the possibly sleeping copy
		
		/*
		 *	Ok so how much can we use ? 
		 */
		 
		used = skb->len - offset; 								// bytes available in this buffer from offset on
		if (len < used) 										// cap at what the caller asked for
			used = len;
		/*
		 *	Do we have urgent data here? 
		 */
		
		if (sk->urg_data)
		{
			unsigned long urg_offset = sk->urg_seq - *seq;
			if (urg_offset < used) 								// the urgent byte falls inside this copy
			{
				if (!urg_offset) 
				{
					if (!sk->urginline) 						// at the urgent byte and not inline: skip it
					{
						++*seq;
						offset++;
						used--;
					}
				}
				else
					used = urg_offset;							// stop just before the urgent byte
			}
		}
		
		/*
		 *	Copy it - We _MUST_ update *seq first so that we
		 *	don't ever double read when we have dual readers
		 */
		 
		*seq += used;

		/*
		 *	This memcpy_tofs can sleep. If it sleeps and we
		 *	do a second read it relies on the skb->users to avoid
		 *	a crash when cleanup_rbuf() gets called.
		 */
		 
		memcpy_tofs(to,((unsigned char *)skb->h.th) +
			skb->h.th->doff*4 + offset, used);					// payload starts doff*4 bytes after the TCP header start
		copied += used;
		len -= used;
		to += used;
		
		/*
		 *	We now will not sleep again until we are finished
		 *	with skb. Sorry if you are doing the SMP port
		 *	but you'll just have to fix it neatly ;)
		 */
		 
		skb->users --; 											// unpin the buffer
		
		if (after(sk->copied_seq,sk->urg_seq)) 					// read position has passed the urgent byte
			sk->urg_data = 0;
		if (used + offset < skb->len) 							// buffer not finished yet
			continue;
		
		/*
		 *	Process the FIN.
		 */

		if (skb->h.th->fin)
			goto found_fin_ok;
		if (flags & MSG_PEEK) 									// peeking leaves the buffer unmarked
			continue;
		skb->used = 1; 											// buffer fully consumed
		continue;

	found_fin_ok:
		++*seq; 												// the FIN takes one sequence number
		if (flags & MSG_PEEK)
			break;
			
		/*
		 *	All is done
		 */
		 
		skb->used = 1;
		sk->shutdown |= RCV_SHUTDOWN; 							// no more data will arrive on this connection
		break;

	}
	remove_wait_queue(sk->sleep, &wait);
	current->state = TASK_RUNNING; 								// undo the TASK_INTERRUPTIBLE set inside the loop

	/* Clean up data we have read: This will do ACK frames */
	cleanup_rbuf(sk);
	release_sock(sk);  
	return copied; 										
}

该函数就是系统调用read的传输层实现,其操作对象主要是sock结构的receive_queue队列,只有进入该队列的数据包,其中的数据才会交给应用程序处理。tcp_read主要就是从receive_queue队列中获取数据包,检查其中数据的合法性,并根据用户所要求读取的数据量,尽量拷贝该数量的数据到用户缓冲区中。大致流程:首先检查套接字状态是否正确,然后检查是否读取紧急数据,接着进入一个while循环,该循环在读取到指定数量的数据后退出;如果设置为非阻塞则会立即返回,不会等待。循环中先将当前进程状态设置为可中断状态,当条件不满足时进入睡眠等待,当有数据时就从receive_queue队列中取数据包。内层do-while循环的主要作用就是检查接收队列中是否存在可读的数据包:如果队列为空则跳出;检查数据流是否出现断裂,如果出现断裂则跳出;检查数据包是否有可用数据,如有则跳转到found_ok_skb标号处进行处理;检查数据包FIN字段的设置情况,如果被设置则表示该数据包还携带了关闭发送通道的请求;如果数据包中的数据已全部读过,则将其标记为已使用并处理下一个数据包。

tcp_rcv函数

分析完主要的函数后,再来看接收数据的入口tcp_rcv函数。该函数是TCP协议数据包接收的总入口函数,网络层协议在判断数据包使用的是TCP协议后,将调用tcp_rcv函数对该数据包进行传输层的处理。该函数更像是一个任务分发器,根据数据包中各标志位的设置,将数据包进一步分发给相关函数进行具体处理。数据包主要可分为如下几种:SYN请求连接数据包,ACK应答数据包,RST数据包,普通数据包,FIN断开连接数据包等。在TCP连接期间,ACK应答数据包和普通数据包通常作为一个数据包传输,即数据包中包含普通数据且TCP首部中ACK字段被设置为1。

/*
 *	A TCP packet has arrived.
 *
 *	Receive entry point for TCP (installed as the rcv member of
 *	tcp_prot).  Locates the owning socket, validates the segment, then
 *	dispatches on socket state and header flags (SYN, ACK, RST, urgent
 *	data, ordinary data) to the specialised handlers.
 *
 *	skb      - the received buffer; skb->h.th points at the TCP header.
 *	dev      - device the packet arrived on.
 *	opt      - options, passed through to the helpers.
 *	daddr    - destination address of the packet.
 *	len      - length passed to the checksum and sequence helpers.
 *	saddr    - source address of the packet.
 *	redo     - zero for a packet seen for the first time.  NOTE(review):
 *	           non-zero presumably marks a packet replayed from the
 *	           socket back_log -- confirm against the caller.
 *	protocol - not referenced in this function.
 *
 *	Always returns 0; the buffer is either queued, consumed by a helper
 *	or freed on the way.
 */
 
int tcp_rcv(struct sk_buff *skb, struct device *dev, struct options *opt,
	unsigned long daddr, unsigned short len,
	unsigned long saddr, int redo, struct inet_protocol * protocol)
{
	struct tcphdr *th;
	struct sock *sk;
	int syn_ok=0;
	
	if (!skb) 
	{
		printk("IMPOSSIBLE 1\n");
		return(0);
	}

	if (!dev)  										// no receiving device given
	{
		printk("IMPOSSIBLE 2\n");
		return(0);
	}
  
	tcp_statistics.TcpInSegs++;
  
	if(skb->pkt_type!= PACKET_HOST) 				// drop packets that are not addressed to this host
	{
	  	kfree_skb(skb,FREE_READ);
	  	return(0);
	}
  
	th = skb->h.th;

	/*
	 *	Find the socket.
	 */

	sk = get_sock(&tcp_prot, th->dest, saddr, th->source, daddr); 					// look up the socket from ports and addresses

	/*
	 *	If this socket has got a reset it's to all intents and purposes 
  	 *	really dead. Count closed sockets as dead.
  	 *
  	 *	Note: BSD appears to have a bug here. A 'closed' TCP in BSD
  	 *	simply drops data. This seems incorrect as a 'closed' TCP doesn't
  	 *	exist so should cause resets as if the port was unreachable.
  	 */
  	 
	if (sk!=NULL && (sk->zapped || sk->state==TCP_CLOSE)) 							// a zapped or closed socket is treated as if no socket existed
		sk=NULL;

	if (!redo)  																	// first time this packet is processed
	{
		if (tcp_check(th, len, saddr, daddr ))  									// non-zero result (presumably a checksum failure): drop silently
		{
			skb->sk = NULL;
			kfree_skb(skb,FREE_READ);
			/*
			 *	We don't release the socket because it was
			 *	never marked in use.
			 */
			return(0);
		}
		th->seq = ntohl(th->seq); 													// convert the sequence number to host order in place

		/* See if we know about the socket. */
		if (sk == NULL)  															// no usable socket for this segment
		{
			/*
			 *	No such TCB. If th->rst is 0 send a reset (checked in tcp_reset)
			 */
			tcp_reset(daddr, saddr, th, &tcp_prot, opt,dev,skb->ip_hdr->tos,255);
			skb->sk = NULL;
			/*
			 *	Discard frame
			 */
			kfree_skb(skb, FREE_READ);
			return(0);
		}

		skb->len = len; 																// initialise the buffer's bookkeeping
		skb->acked = 0;
		skb->used = 0;
		skb->free = 0;
		skb->saddr = daddr;																// note: stored swapped relative to the packet's addresses
		skb->daddr = saddr;
	
		/* We may need to add it to the backlog here. */
		cli();
		if (sk->inuse)  																// socket busy: park the packet on back_log and return
		{
			skb_queue_tail(&sk->back_log, skb);
			sti();
			return(0);
		}
		sk->inuse = 1; 																	// claim the socket
		sti(); 															
	}
	else
	{
		if (sk==NULL) 
		{
			tcp_reset(daddr, saddr, th, &tcp_prot, opt,dev,skb->ip_hdr->tos,255); 		// socket vanished meanwhile: reset and drop
			skb->sk = NULL;
			kfree_skb(skb, FREE_READ);
			return(0);
		}
	}


	if (!sk->prot)  																	// socket without a protocol table
	{
		/* NOTE(review): this path returns without freeing skb or releasing sk. */
		printk("IMPOSSIBLE 3\n");
		return(0);
	}


	/*
	 *	Charge the memory to the socket. 
	 */
	 
	if (sk->rmem_alloc + skb->mem_len >= sk->rcvbuf)  									// receive buffer would overflow: drop the packet
	{
		kfree_skb(skb, FREE_READ);
		release_sock(sk);
		return(0);
	}

	skb->sk=sk;
	sk->rmem_alloc += skb->mem_len; 																// account the buffer against the socket's receive memory

	/*
	 *	This basically follows the flow suggested by RFC793, with the corrections in RFC1122. We
	 *	don't implement precedence and we process URG incorrectly (deliberately so) for BSD bug
	 *	compatibility. We also set up variables more thoroughly [Karn notes in the
	 *	KA9Q code the RFC793 incoming segment rules don't initialise the variables for all paths].
	 */

	if(sk->state!=TCP_ESTABLISHED)		/* Skip this lot for normal flow */
	{
	
		/*
		 *	Now deal with unusual cases.
		 */
	 
		if(sk->state==TCP_LISTEN) 																	// listening socket: only a plain SYN is accepted
		{
			if(th->ack)	/* These use the socket TOS.. might want to be the received TOS */ 			// an ACK sent to a listener gets a reset
				tcp_reset(daddr,saddr,th,sk->prot,opt,dev,sk->ip_tos, sk->ip_ttl);

			/*
			 *	We don't care for RST, and non SYN are absorbed (old segments)
			 *	Broadcast/multicast SYN isn't allowed. Note - bug if you change the
			 *	netmask on a running connection it can go broadcast. Even Sun's have
			 *	this problem so I'm ignoring it 
			 */
			   
			if(th->rst || !th->syn || th->ack || ip_chk_addr(daddr)!=IS_MYADDR) 					// drop RST, non-SYN, ACK, or anything not sent to one of our own addresses
			{
				kfree_skb(skb, FREE_READ);
				release_sock(sk);
				return 0;
			}
		
			/*	
			 *	Guess we need to make a new socket up 
			 */
		
			tcp_conn_request(sk, skb, daddr, saddr, opt, dev, tcp_init_seq()); 						// valid SYN: hand it to the connection-request handler
		
			/*
			 *	Now we have several options: In theory there is nothing else
			 *	in the frame. KA9Q has an option to send data with the syn,
			 *	BSD accepts data with the syn up to the [to be] advertised window
			 *	and Solaris 2.1 gives you a protocol error. For now we just ignore
			 *	it, that fits the spec precisely and avoids incompatibilities. It
			 *	would be nice in future to drop through and process the data.
			 */
			 
			release_sock(sk);
			return 0;
		}
	
		/* retransmitted SYN? */
		if (sk->state == TCP_SYN_RECV && th->syn && th->seq+1 == sk->acked_seq) 				 	// same SYN as the one already answered: drop it
		{
			kfree_skb(skb, FREE_READ);
			release_sock(sk);
			return 0;
		}
		
		/*
		 *	SYN sent means we have to look for a suitable ack and either reset
		 *	for bad matches or go to connected 
		 */
	   
		if(sk->state==TCP_SYN_SENT) 														// we sent a SYN and are waiting for the answer
		{
			/* Crossed SYN or previous junk segment */
			if(th->ack) 																	// the segment carries an ACK
			{
				/* We got an ack, but it's not a good ack */
				if(!tcp_ack(sk,th,saddr,len)) 												// tcp_ack returned 0: answer with a reset and drop
				{
					/* Reset the ack - its an ack from a 
					   different connection  [ th->rst is checked in tcp_reset()] */
					tcp_statistics.TcpAttemptFails++;
					tcp_reset(daddr, saddr, th,
						sk->prot, opt,dev,sk->ip_tos,sk->ip_ttl);
					kfree_skb(skb, FREE_READ);
					release_sock(sk);
					return(0);
				}
				if(th->rst) 																// peer reset the connection attempt
					return tcp_std_reset(sk,skb);
				if(!th->syn) 																// good ACK but no SYN: drop
				{
					/* A valid ack from a different connection
					   start. Shouldn't happen but cover it */
					kfree_skb(skb, FREE_READ);
					release_sock(sk);
					return 0;
				}
				/*
				 *	Ok.. it's good. Set up sequence numbers and
				 *	move to established.
				 */
				syn_ok=1;	/* Don't reset this connection for the syn */
				sk->acked_seq=th->seq+1;
				sk->fin_seq=th->seq;
				tcp_send_ack(sk->sent_seq,sk->acked_seq,sk,th,sk->daddr);					// acknowledge the peer's SYN
				tcp_set_state(sk, TCP_ESTABLISHED);
				tcp_options(sk,th); 														// process the options in the peer's header
				sk->dummy_th.dest=th->source;
				sk->copied_seq = sk->acked_seq;
				if(!sk->dead)
				{
					sk->state_change(sk); 													// notify through the socket's state_change callback
					sock_wake_async(sk->socket, 0);
				}
				if(sk->max_window==0)
				{
					sk->max_window = 32;													// fall back to 32 when no window has been recorded
					sk->mss = min(sk->max_window, sk->mtu);
				}
			}
			else
			{
				/* See if SYN's cross. Drop if boring */
				if(th->syn && !th->rst) 													// SYN without ACK: both ends opened at the same time
				{
					/* Crossed SYN's are fine - but talking to
					   yourself is right out... */
					if(sk->saddr==saddr && sk->daddr==daddr &&
						sk->dummy_th.source==th->source &&
						sk->dummy_th.dest==th->dest)
					{
						tcp_statistics.TcpAttemptFails++;
						return tcp_std_reset(sk,skb);
					}
					tcp_set_state(sk,TCP_SYN_RECV);
					
					/*
					 *	FIXME:
					 *	Must send SYN|ACK here
					 */
				}		
				/* Discard junk segment */
				kfree_skb(skb, FREE_READ);
				release_sock(sk);
				return 0;
			}
			/*
			 *	SYN_RECV with data maybe.. drop through
			 */
			goto rfc_step6;
		}

	/*
	 *	BSD has a funny hack with TIME_WAIT and fast reuse of a port. There is
	 *	a more complex suggestion for fixing these reuse issues in RFC1644
	 *	but not yet ready for general use. Also see RFC1379.
	 */
	
#define BSD_TIME_WAIT
#ifdef BSD_TIME_WAIT
		if (sk->state == TCP_TIME_WAIT && th->syn && sk->dead && 
			after(th->seq, sk->acked_seq) && !th->rst) 									// new SYN for a dead TIME_WAIT socket: close it and retry on a listener
		{
			long seq=sk->write_seq;
			if(sk->debug)
				printk("Doing a BSD time wait\n");
			tcp_statistics.TcpEstabResets++;	   
			sk->rmem_alloc -= skb->mem_len;												// undo the memory charge made above
			skb->sk = NULL;
			sk->err=ECONNRESET;
			tcp_set_state(sk, TCP_CLOSE);
			sk->shutdown = SHUTDOWN_MASK;
			release_sock(sk);
			sk=get_sock(&tcp_prot, th->dest, saddr, th->source, daddr); 				// look the socket up again
			if (sk && sk->state==TCP_LISTEN) 											// a listener now matches: treat the SYN as a new request
			{
				sk->inuse=1;
				skb->sk = sk;
				sk->rmem_alloc += skb->mem_len;
				tcp_conn_request(sk, skb, daddr, saddr,opt, dev,seq+128000); 			// start well above the old connection's sequence number
				release_sock(sk);
				return 0;
			}
			kfree_skb(skb, FREE_READ);
			return 0;
		}
#endif	
	}

	/*
	 *	We are now in normal data flow (see the step list in the RFC)
	 *	Note most of these are inline now. I'll inline the lot when
	 *	I have time to test it hard and look at what gcc outputs 
	 */
	
	if(!tcp_sequence(sk,th,len,opt,saddr,dev)) 								// sequence number not acceptable: drop
	{
		kfree_skb(skb, FREE_READ);
		release_sock(sk);
		return 0;
	}

	if(th->rst) 															// RST segment: reset handling
		return tcp_std_reset(sk,skb);
	
	/*
	 *	!syn_ok is effectively the state test in RFC793.
	 */
	 
	if(th->syn && !syn_ok) 													// unexpected SYN: send a reset and reset the connection
	{
		tcp_reset(daddr,saddr,th, &tcp_prot, opt, dev, skb->ip_hdr->tos, 255);
		return tcp_std_reset(sk,skb);	
	}

	/*
	 *	Process the ACK
	 */
	 

	if(th->ack && !tcp_ack(sk,th,saddr,len)) 								// process the incoming ACK; a zero result means it was rejected
	{
		/*
		 *	Our three way handshake failed.
		 */
		 
		if(sk->state==TCP_SYN_RECV) 										// rejected ACK during the handshake: reset the peer
		{
			tcp_reset(daddr, saddr, th,sk->prot, opt, dev,sk->ip_tos,sk->ip_ttl);
		}
		kfree_skb(skb, FREE_READ);
		release_sock(sk);
		return 0;
	}
	
rfc_step6:		/* I'll clean this up later */

	/*
	 *	Process urgent data
	 */
	 	
	if(tcp_urg(sk, th, saddr, len)) 						// non-zero result: drop the segment
	{
		kfree_skb(skb, FREE_READ);
		release_sock(sk);
		return 0;
	}
	
	
	/*
	 *	Process the encapsulated data
	 */
	
	if(tcp_data(skb,sk, saddr, len)) 					// non-zero result: drop the segment
	{
		kfree_skb(skb, FREE_READ);
		release_sock(sk);
		return 0;
	}

	/*
	 *	And done
	 */	
	
	release_sock(sk);
	return 0;
}

该函数主要包括以下几个模块,数据包合法性检测模块tcp_sequence,请求连接处理模块tcp_conn_request,RST数据包处理模块tcp_reset,应答处理模块tcp_ack,数据处理模块tcp_urg、tcp_data,以及断开连接处理模块tcp_fin。主要是根据不同的任务进行不同的数据处理。

总结

本文只是简单的对照书籍了解了TCP协议部分的内容,由内容可知TCP协议本身较为复杂,因为该协议要求的可靠性数据传输保证以及流式传输方式使得实现上必须进行数据重传以及重新排序等操作,TCP协议实现要考虑的方面很多,其主要核心思想就是对数据进行编号和应答机制。本文还有大量相关的函数没有列出,大家可对照书本读阅。由于本人才疏学浅,如有错误请批评指正。

  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值