Analysis of the Linux kernel protocol stack: the TCP server-side send path

Based on Linux 2.4.0, this article walks through the kernel path triggered when the server calls send. Some parts may be slightly off where I have not fully understood the code; corrections are welcome.

Many related topics are only touched on here and will be filled in later; for example, sock_alloc involves file-system internals.

The send system call eventually reaches the kernel's sys_socketcall function.

The call chain from there is sys_send=>sys_sendto=>sock_sendmsg=>sock->ops->sendmsg

sock->ops->sendmsg is a function pointer, so to know which function actually runs we have to look at the jump table installed at initialization time, or at accept time; by the time send is called, the connection has already completed the three-way handshake. To answer that question, let's first look at the accept path.
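For reference, the dispatch point itself is tiny. The following is a simplified sketch of sock_sendmsg from net/socket.c (details such as error handling may differ slightly in the real 2.4.0 source); it only shows where sock->ops->sendmsg is invoked:

/* Simplified sketch of sock_sendmsg (net/socket.c); the exact 2.4.0 code
 * may differ slightly, this only illustrates the ops->sendmsg dispatch. */
int sock_sendmsg(struct socket *sock, struct msghdr *msg, int size)
{
	int err;
	struct scm_cookie scm;

	err = scm_send(sock, msg, &scm);	/* gather control-message state */
	if (err >= 0) {
		/* for a TCP socket, sock->ops is inet_stream_ops,
		 * so this call lands in inet_sendmsg */
		err = sock->ops->sendmsg(sock, msg, size, &scm);
		scm_destroy(&scm);
	}
	return err;
}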

The three-way handshake (a minimal user-space sketch follows this list):

  1. The server process, inside the accept system call, creates a socket object (newsock) and waits for a client to connect. The client initiates the connection with the connect system call; its socket state changes from SS_UNCONNECTED (set when sys_socket created it) to SS_CONNECTING in inet_stream_connect, while the sock state is set to TCP_SYN_SENT in tcp_v4_connect=>tcp_connect, and a SYN packet is sent to the server (first handshake packet);
  2. The server receives the SYN. The NIC interrupt handler runs first, queues the packet on the per-CPU input queue and raises the receive softirq; the softirq handler net_rx_action then dequeues the packet and hands it to the handler registered in ptype_base for its protocol, here ip_packet_type's ip_rcv. After a series of steps (routing and so on) the packet reaches the transport layer: the transport protocol ID is used to index inet_protos, here yielding tcp_protocol, and tcp_v4_rcv is called. It looks for an established connection, finds none, and goes through tcp_v4_do_rcv, then tcp_v4_hnd_req, then tcp_rcv_state_process, which enters the SYN branch and calls ipv4_specific.tcp_v4_conn_request to handle the connection request. A req is created and queued on the listening sock's SYN (half-open) queue, and tcp_v4_send_synack sends the SYN+ACK packet (second handshake packet);
  3. When the client receives the server's packet, the receive path is essentially the same and also ends up in tcp_v4_do_rcv. In tcp_rcv_state_process, the current sock state selects the TCP_SYN_SENT branch, and tcp_rcv_synsent_state_process sets the sock to TCP_ESTABLISHED, wakes up any process waiting on the connection, and sends the third handshake packet, the ACK, to the server.
  4. When the server receives the third handshake packet, tcp_v4_do_rcv runs again; this time tcp_v4_hnd_req does find the matching request structure req, enters tcp_check_req, and calls tcp_v4_syn_recv_sock to create a sock whose state is set to TCP_SYN_RECV (in tcp_create_openreq_child). The new sock inherits the server sock's jump tables, req->sk is pointed at it, and req is moved onto the server's accept queue. tcp_child_process then sets the sock's state to TCP_ESTABLISHED (via tcp_rcv_state_process) and wakes the server process, which resumes in wait_for_connect and returns the sock representing the client. Control returns to inet_accept, which binds the socket and sock together and sets the socket state to SS_CONNECTED; finally sock_map_fd returns the file descriptor of the newly created socket, which is used from then on to talk to the client, and accept returns to the user-mode server process to handle the connection.
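To tie the kernel path to user space, here is a minimal server sketch (not kernel code; the port number and payload are arbitrary and error handling is omitted) showing which system calls drive the code analyzed below:

/* Minimal user-space server.  The comments note which kernel path
 * each call takes in the analysis below. */
#include <netinet/in.h>
#include <string.h>
#include <sys/socket.h>
#include <unistd.h>

int main(void)
{
	struct sockaddr_in addr;
	int lfd = socket(AF_INET, SOCK_STREAM, 0);   /* sys_socket => inet_create */

	memset(&addr, 0, sizeof(addr));
	addr.sin_family = AF_INET;
	addr.sin_addr.s_addr = htonl(INADDR_ANY);
	addr.sin_port = htons(8000);                 /* arbitrary example port */

	bind(lfd, (struct sockaddr *)&addr, sizeof(addr));
	listen(lfd, 5);                              /* sock enters TCP_LISTEN */

	int cfd = accept(lfd, NULL, NULL);           /* may block in wait_for_connect */
	send(cfd, "hello", 5, 0);                    /* sys_send => ... => tcp_sendmsg */

	close(cfd);
	close(lfd);
	return 0;
}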

Code analysis:

asmlinkage long sys_accept(int fd, struct sockaddr *upeer_sockaddr, int *upeer_addrlen)
{
	struct socket *sock, *newsock;
	int err, len;
	char address[MAX_SOCK_ADDR];

	sock = sockfd_lookup(fd, &err);/*find the socket object behind the listening fd*/
	if (!sock)
		goto out;

	err = -EMFILE;
	if (!(newsock = sock_alloc())) 
		goto out_put;

	newsock->type = sock->type;/*both fields are inherited from the listening socket; SOCK_STREAM for TCP*/
	newsock->ops = sock->ops;/*for the listening socket this was set to &inet_stream_ops in inet_create*/
    /*this calls inet_accept, which in turn calls tcp_prot's accept function tcp_accept:
      wait for a connection and pair the client's sock with the new socket*/
	err = sock->ops->accept(sock, newsock, sock->file->f_flags);
	if (err < 0)
		goto out_release;

	if (upeer_sockaddr) {
		if(newsock->ops->getname(newsock, (struct sockaddr *)address, &len, 2)<0) {
			err = -ECONNABORTED;
			goto out_release;
		}
		err = move_addr_to_user(address, len, upeer_sockaddr, upeer_addrlen);
		if (err < 0)
			goto out_release;
	}

	/* File flags are not inherited via accept() unlike another OSes. */

	if ((err = sock_map_fd(newsock)) < 0)
		goto out_release;

out_put:
	sockfd_put(sock);
out:
	return err;

out_release:
	sock_release(newsock);
	goto out_put;
}

Initialization of sock->ops:

Let's see how the server's sock->ops ends up pointing to inet_stream_ops, starting from socket creation.

int server_fd = socket(AF_INET, SOCK_STREAM, 0);

sys_socket:

Kernel call path: sys_socketcall=>sys_socket


asmlinkage long sys_socket(int family, int type, int protocol)
{
	int retval;
	struct socket *sock;

	retval = sock_create(family, type, protocol, &sock);/*create the socket*/
	if (retval < 0)
		goto out;

	retval = sock_map_fd(sock);/*tie it into the file system: set up the f_op and d_op jump tables, etc.*/
	if (retval < 0)
		goto out_release;

out:
	/* It may be already another descriptor 8) Not kernel problem. */
	return retval;

out_release:
	sock_release(sock);
	return retval;
}

sys_socketcall=>sys_socket=>sock_create



int sock_create(int family, int type, int protocol, struct socket **res)
{
	int i;
	struct socket *sock;
    //some sanity checks omitted
    ......
    /*allocate the socket object*/
	if (!(sock = sock_alloc())) 
	{
		printk(KERN_WARNING "socket: no more sockets\n");
		i = -ENFILE;		/* Not exactly a match, but its the
					   closest posix thing */
		goto out;
	}
    /*record the socket type; SOCK_STREAM here*/
	sock->type  = type;
    /*call the AF_INET family's create function; which function that is becomes clear from the protocol-stack initialization below*/
	if ((i = net_families[family]->create(sock, protocol)) < 0) 
	{
		sock_release(sock);
		goto out;
	}

	*res = sock;

out:
	net_family_read_unlock();
	return i;
}

Initialization of net_families

inet_init=>sock_register



/*
 *	Called by socket.c on kernel startup.  
 */
 
static int __init inet_init(void)
{
	struct sk_buff *dummy_skb;
	struct inet_protocol *p;

	printk(KERN_INFO "NET4: Linux TCP/IP 1.0 for NET4.0\n");

	if (sizeof(struct inet_skb_parm) > sizeof(dummy_skb->cb))
	{
		printk(KERN_CRIT "inet_proto_init: panic\n");
		return -EINVAL;
	}

	/*
	 *	Tell SOCKET that we are alive... 
	 */
    /*registers inet_family_ops in net_families*/
  	(void) sock_register(&inet_family_ops);

	/*
	 *	Add all the protocols. 
	 */

	printk(KERN_INFO "IP Protocols: ");
	for(p = inet_protocol_base; p != NULL;) 
	{
		struct inet_protocol *tmp = (struct inet_protocol *) p->next;
		inet_add_protocol(p);
		printk("%s%s",p->name,tmp?", ":"\n");
		p = tmp;
	}

	/*
	 *	Set the ARP module up
	 */

	arp_init();

  	/*
  	 *	Set the IP module up
  	 */

	ip_init();

	tcp_v4_init(&inet_family_ops);

	return 0;
}


#define PF_INET		AF_INET/*AF_INET and PF_INET are equivalent*/

struct net_proto_family inet_family_ops = {
	PF_INET,
	inet_create
};


int sock_register(struct net_proto_family *ops)
{
	int err;

	if (ops->family >= NPROTO) {
		printk(KERN_CRIT "protocol %d >= NPROTO(%d)\n", ops->family, NPROTO);
		return -ENOBUFS;
	}
	net_family_write_lock();
	err = -EEXIST;
	if (net_families[ops->family] == NULL) {/*store ops in the array, indexed by PF_INET*/
		net_families[ops->family]=ops;
		err = 0;
	}
	net_family_write_unlock();
	return err;
}

inet_create:

sys_socketcall=>sys_socket=>sock_create=>inet_create


/*
 *	Create an inet socket.
 */

static int inet_create(struct socket *sock, int protocol)
{
	struct sock *sk;
	struct proto *prot;

	sock->state = SS_UNCONNECTED;
	sk = sk_alloc(PF_INET, GFP_KERNEL, 1);/**/
	if (sk == NULL) 
		goto do_oom;

	switch (sock->type) {
	case SOCK_STREAM:
		if (protocol && protocol != IPPROTO_TCP)/*protocol is 0, so this does not bail out*/
			goto free_and_noproto;
		protocol = IPPROTO_TCP;
		prot = &tcp_prot;
		sock->ops = &inet_stream_ops;/*here the socket's ops jump table is installed*/
		break;
    ......
	}

	if (ipv4_config.no_pmtu_disc)
		sk->protinfo.af_inet.pmtudisc = IP_PMTUDISC_DONT;
	else
		sk->protinfo.af_inet.pmtudisc = IP_PMTUDISC_WANT;
    /*this sets sock_def_readable, which wakes up waiting processes when data arrives on the socket*/
	sock_init_data(sock,sk);

	sk->destruct = inet_sock_destruct;

	sk->zapped = 0;
	sk->family = PF_INET;
	sk->protocol = protocol;

	sk->prot = prot;/*point at the tcp_prot jump table*/
	sk->backlog_rcv = prot->backlog_rcv;

	sk->protinfo.af_inet.ttl=sysctl_ip_default_ttl;

	sk->protinfo.af_inet.mc_loop=1;
	sk->protinfo.af_inet.mc_ttl=1;
	sk->protinfo.af_inet.mc_index=0;
	sk->protinfo.af_inet.mc_list=NULL;

#ifdef INET_REFCNT_DEBUG
	atomic_inc(&inet_sock_nr);
#endif

	if (sk->num) {
		/* It assumes that any protocol which allows
		 * the user to assign a number at socket
		 * creation time automatically
		 * shares.
		 */
		sk->sport = htons(sk->num);

		/* Add to protocol hash chains. */
		sk->prot->hash(sk);
	}

	if (sk->prot->init) {
        /*this calls tcp_v4_init_sock, which installs the important ipv4_specific jump table*/
		int err = sk->prot->init(sk);
		if (err != 0) {
			inet_sock_release(sk);
			return(err);
		}
	}
	return(0);

free_and_badtype:
	sk_free(sk);
	return -ESOCKTNOSUPPORT;

free_and_badperm:
	sk_free(sk);
	return -EPERM;

free_and_noproto:
	sk_free(sk);
	return -EPROTONOSUPPORT;

do_oom:
	return -ENOBUFS;
}

inet_accept:

Now let's see what happens when the accept system call reaches inet_accept.

sys_socketcall=>sys_accept=>inet_accept


/*
 *	Accept a pending connection. The TCP layer now gives BSD semantics.
 */

int inet_accept(struct socket *sock, struct socket *newsock, int flags)
{
	struct sock *sk1 = sock->sk;
	struct sock *sk2;
	int err = -EINVAL;
    /*run the accept function through the server socket's sock object; from the
      sys_socket walk-through above we know this is tcp_accept*/
	if((sk2 = sk1->prot->accept(sk1,flags,&err)) == NULL)
		goto do_err;

	lock_sock(sk2);

	BUG_TRAP((1<<sk2->state)&(TCPF_ESTABLISHED|TCPF_CLOSE_WAIT|TCPF_CLOSE));

	sock_graft(sk2, newsock);

	newsock->state = SS_CONNECTED;
	release_sock(sk2);
	return 0;

do_err:
	return err;
}

Tracing tcp_accept

sys_socketcall=>sys_accept=>inet_accept=>tcp_accept


/*
 *	This will accept the next outstanding connection.
 */

struct sock *tcp_accept(struct sock *sk, int flags, int *err)
{
	struct tcp_opt *tp = &sk->tp_pinfo.af_tcp;
	struct open_request *req;
	struct sock *newsk;
	int error;

	lock_sock(sk); 

	/* We need to make sure that this socket is listening,
	 * and that it has something pending.
	 */
	error = -EINVAL;
	if (sk->state != TCP_LISTEN)
		goto out;

	/* Find already established connection */
	if (!tp->accept_queue) {
       /*no client has connected yet, so sleep.  The req is created when a client
         connects, i.e. when the server receives the SYN (in tcp_v4_conn_request,
         reached via tcp_v4_hnd_req/tcp_rcv_state_process).
        */
		long timeo = sock_rcvtimeo(sk, flags & O_NONBLOCK);
        /*work out how long to sleep*/
		/* If this is a non blocking socket don't sleep */
		error = -EAGAIN;
		if (!timeo)
			goto out;
        /*sleep waiting for a connection; the process blocks here*/
		error = wait_for_connect(sk, timeo);
		if (error)
			goto out;
	}

	req = tp->accept_queue;
    /*unlink req (connection establishment is complete); if there is no next node, clear the list head and tail pointers*/
	if ((tp->accept_queue = req->dl_next) == NULL)
		tp->accept_queue_tail = NULL;

 	newsk = req->sk;/*the sock of the new connection*/
	tcp_acceptq_removed(sk);
	tcp_openreq_fastfree(req);
	BUG_TRAP(newsk->state != TCP_SYN_RECV);
	release_sock(sk);
	return newsk;

out:
	release_sock(sk);
	*err = error; 
	return NULL;
}

sys_socketcall=>sys_accept=>inet_accept=>tcp_accept=>wait_for_connect


/*
 *	Wait for an incoming connection, avoid race
 *	conditions. This must be called with the socket locked.
 */
static int wait_for_connect(struct sock * sk, long timeo)
{
	DECLARE_WAITQUEUE(wait, current);/*set up a wait-queue entry for the current process*/
	int err;

	/*
	 * True wake-one mechanism for incoming connections: only
	 * one process gets woken up, not the 'whole herd'.
	 * Since we do not 'race & poll' for established sockets
	 * anymore, the common case will execute the loop only once.
	 *
	 * Subtle issue: "add_wait_queue_exclusive()" will be added
	 * after any current non-exclusive waiters, and we know that
	 * it will always _stay_ after any new non-exclusive waiters
	 * because all non-exclusive waiters are added at the
	 * beginning of the wait-queue. As such, it's ok to "drop"
	 * our exclusiveness temporarily when we get woken up without
	 * having to remove and re-insert us on the wait queue.
	 */
	add_wait_queue_exclusive(sk->sleep, &wait);/*join the sock's wait queue*/
	for (;;) {
		current->state = TASK_INTERRUPTIBLE;/*mark ourselves interruptible*/
		release_sock(sk);/*drop the sock lock so other processes can make progress*/
		if (sk->tp_pinfo.af_tcp.accept_queue == NULL)
			timeo = schedule_timeout(timeo);
		lock_sock(sk);
		err = 0;
		if (sk->tp_pinfo.af_tcp.accept_queue)/*a connection is pending: stop looping*/
			break;
		err = -EINVAL;
		if (sk->state != TCP_LISTEN)/*error out if the server sock is no longer listening*/
			break;
		err = sock_intr_errno(timeo);
		if (signal_pending(current))
			break;
		err = -EAGAIN;
		if (!timeo)
			break;
	}
	current->state = TASK_RUNNING;/*back to runnable*/
	remove_wait_queue(sk->sleep, &wait);/*remove our wait-queue entry*/
	return err;
}

The client sends the SYN packet:

Below is a brief look at how the client's SYN packet, once it reaches the server, triggers the processing described above.

Interrupt registration in the CS8900 driver

Taking the CS8900 driver as an example:

net_open registers the NIC's interrupt handler net_interrupt. When a packet arrives, the handler calls net_rx, which allocates an skb and sets the layer-2 protocol, then calls netif_rx, which appends the skb to the per-CPU softnet_data input_pkt_queue and raises NET_RX_SOFTIRQ.

The code:


/* Open/initialize the board.  This is called (in the current kernel)
   sometime after booting when the 'ifconfig' program is run.

   This routine should set everything up anew at each open, even
   registers that "should" only need to be set once at boot, so that
   there is non-reboot way to recover if something goes wrong.
   */

/* AKPM: do we need to do any locking here? */

static int
net_open(struct net_device *dev)
{
	struct net_local *lp = (struct net_local *)dev->priv;
	int result = 0;
	int i;
	int ret;

	if (dev->irq < 2) {
		/* Allow interrupts to be generated by the chip */
/* Cirrus' release had this: */
#if 0
		writereg(dev, PP_BusCTL, readreg(dev, PP_BusCTL)|ENABLE_IRQ );
#endif
/* And 2.3.47 had this: */
		writereg(dev, PP_BusCTL, ENABLE_IRQ | MEMORY_ON);

		for (i = 2; i < CS8920_NO_INTS; i++) {
			if ((1 << dev->irq) & lp->irq_map) {
				if (request_irq(i, net_interrupt, 0, dev->name, dev) == 0) {
					dev->irq = i;
					write_irq(dev, lp->chip_type, i);
					/* writereg(dev, PP_BufCFG, GENERATE_SW_INTERRUPT); */
					break;
				}
			}
		}

		if (i >= CS8920_NO_INTS) {
			writereg(dev, PP_BusCTL, 0);	/* disable interrupts. */
			printk(KERN_ERR "cs89x0: can't get an interrupt\n");
			ret = -EAGAIN;
			goto bad_out;
		}
	} else {
		if (((1 << dev->irq) & lp->irq_map) == 0) {
			printk(KERN_ERR "%s: IRQ %d is not in our map of allowable IRQs, which is %x\n",
                               dev->name, dev->irq, lp->irq_map);
			ret = -EAGAIN;
			goto bad_out;
		}
/* FIXME: Cirrus' release had this: */
		writereg(dev, PP_BusCTL, readreg(dev, PP_BusCTL)|ENABLE_IRQ );
/* And 2.3.47 had this: */
#if 0
		writereg(dev, PP_BusCTL, ENABLE_IRQ | MEMORY_ON);
#endif
		write_irq(dev, lp->chip_type, dev->irq);
        /*register the interrupt handler*/
		ret = request_irq(dev->irq, &net_interrupt, 0, dev->name, dev);
		if (ret) {
			if (net_debug)
				printk(KERN_DEBUG "cs89x0: request_irq(%d) failed\n", dev->irq);
			goto bad_out;
		}
	}



	}

	return 0;
bad_out:
	return ret;
}

/* The typical workload of the driver:
   Handle the network interface interrupts. */
   
static void net_interrupt(int irq, void *dev_id, struct pt_regs * regs)
{
	struct net_device *dev = dev_id;
	struct net_local *lp;
	int ioaddr, status;
 
	ioaddr = dev->base_addr;
	lp = (struct net_local *)dev->priv;

	/* we MUST read all the events out of the ISQ, otherwise we'll never
           get interrupted again.  As a consequence, we can't have any limit
           on the number of times we loop in the interrupt handler.  The
           hardware guarantees that eventually we'll run out of events.  Of
           course, if you're on a slow machine, and packets are arriving
           faster than you can read them off, you're screwed.  Hasta la
           vista, baby!  */
	while ((status = readword(dev, ISQ_PORT))) {
		if (net_debug > 4)printk("%s: event=%04x\n", dev->name, status);
		switch(status & ISQ_EVENT_MASK) {
		case ISQ_RECEIVER_EVENT:
			/* Got a packet(s). */
			net_rx(dev);/*handle the received packet(s)*/
			break;
	......
		}
	}
}

The net_rx receive function

net_interrupt=> net_rx


/* We have a good packet(s), get it/them out of the buffers. */
static void
net_rx(struct net_device *dev)
{
	struct net_local *lp = (struct net_local *)dev->priv;
	struct sk_buff *skb;
	int status, length;

	int ioaddr = dev->base_addr;
	status = inw(ioaddr + RX_FRAME_PORT);
	length = inw(ioaddr + RX_FRAME_PORT);

	if ((status & RX_OK) == 0) {
		count_rx_errors(status, lp);
		return;
	}

	/* Malloc up new buffer. */
	skb = dev_alloc_skb(length + 2);/*allocate the sk_buff and its data buffer*/
	if (skb == NULL) {
#if 0		/* Again, this seems a cruel thing to do */
		printk(KERN_WARNING "%s: Memory squeeze, dropping packet.\n", dev->name);
#endif
		lp->stats.rx_dropped++;
		return;
	}
	skb_reserve(skb, 2);	/* longword align L3 header */
	skb->dev = dev;

	insw(ioaddr + RX_FRAME_PORT, skb_put(skb, length), length >> 1);
	if (length & 1)
		skb->data[length-1] = inw(ioaddr + RX_FRAME_PORT);

	if (net_debug > 3) {
		printk(	"%s: received %d byte packet of type %x\n",
			dev->name, length,
			(skb->data[ETH_ALEN+ETH_ALEN] << 8) | skb->data[ETH_ALEN+ETH_ALEN+1]);
	}

        skb->protocol=eth_type_trans(skb,dev);/*set the layer-2 protocol*/
	netif_rx(skb);/*hand the packet up the stack*/
	lp->stats.rx_packets++;
	lp->stats.rx_bytes+=skb->len;
	return;
}

netif_rx:

net_interrupt=> net_rx=>netif_rx



/**
 *	netif_rx	-	post buffer to the network code
 *	@skb: buffer to post
 *
 *	This function receives a packet from a device driver and queues it for
 *	the upper (protocol) levels to process.  It always succeeds. The buffer
 *	may be dropped during processing for congestion control or by the 
 *	protocol layers.
 *      
 *	return values:
 *	NET_RX_SUCCESS	(no congestion)           
 *	NET_RX_CN_LOW     (low congestion) 
 *	NET_RX_CN_MOD     (moderate congestion)
 *	NET_RX_CN_HIGH    (high congestion) 
 *	NET_RX_DROP    (packet was dropped)
 *      
 *      
 */

int netif_rx(struct sk_buff *skb)
{
	int this_cpu = smp_processor_id();
	struct softnet_data *queue;
	unsigned long flags;

	if (skb->stamp.tv_sec == 0)
		get_fast_time(&skb->stamp);

	/* The code is rearranged so that the path is the most
	   short when CPU is congested, but is still operating.
	 */
	queue = &softnet_data[this_cpu];/*per-CPU softnet data for this CPU*/

	local_irq_save(flags);

	netdev_rx_stat[this_cpu].total++;
	if (queue->input_pkt_queue.qlen <= netdev_max_backlog) {
		if (queue->input_pkt_queue.qlen) {
			if (queue->throttle)
				goto drop;

enqueue:
			dev_hold(skb->dev);
			__skb_queue_tail(&queue->input_pkt_queue,skb);/*queue the packet*/
			__cpu_raise_softirq(this_cpu, NET_RX_SOFTIRQ);/*raise the packet-received softirq*/
			local_irq_restore(flags);
#ifndef OFFLINE_SAMPLE
			get_sample_stats(this_cpu);
#endif
			return softnet_data[this_cpu].cng_level;
		}

		if (queue->throttle) {
			queue->throttle = 0;
#ifdef CONFIG_NET_HW_FLOWCONTROL
			if (atomic_dec_and_test(&netdev_dropping))
				netdev_wakeup();
#endif
		}
		goto enqueue;
	}

	if (queue->throttle == 0) {
		queue->throttle = 1;
		netdev_rx_stat[this_cpu].throttled++;
#ifdef CONFIG_NET_HW_FLOWCONTROL
		atomic_inc(&netdev_dropping);
#endif
	}

drop:
	netdev_rx_stat[this_cpu].dropped++;
	local_irq_restore(flags);

	kfree_skb(skb);
	return NET_RX_DROP;
}

The softirq:

Next, how the softirq processes received packets; first, how the handlers are registered at initialization time.



/*
 *       Callers must hold the rtnl semaphore.  See the comment at the
 *       end of Space.c for details about the locking.
 */
int __init net_dev_init(void)
{
	struct net_device *dev, **dp;
	int i;

#ifdef CONFIG_NET_SCHED
	pktsched_init();
#endif

#ifdef CONFIG_NET_DIVERT
	dv_init();
#endif /* CONFIG_NET_DIVERT */
	
	/*
	 *	Initialise the packet receive queues.
	 */
    //per-CPU data initialization
	for (i = 0; i < NR_CPUS; i++) {
		struct softnet_data *queue;

		queue = &softnet_data[i];
		skb_queue_head_init(&queue->input_pkt_queue);/*initialize the packet receive queue*/
		queue->throttle = 0;
		queue->cng_level = 0;
		queue->avg_blog = 10; /* arbitrary non-zero */
		queue->completion_queue = NULL;
	}
	
     ......
	}

    ......
	dev_boot_phase = 0;
    /*softirq handlers*/
	open_softirq(NET_TX_SOFTIRQ, net_tx_action, NULL);/*transmit*/
	open_softirq(NET_RX_SOFTIRQ, net_rx_action, NULL);/*receive*/

	dst_init();
	dev_mcast_init();

	/*
	 *	Initialise network devices
	 */
	 
	net_device_init();

	return 0;
}

net_rx_action:

So let's now look at how net_rx_action processes the queue:

do_softirq=>net_rx_action



static void net_rx_action(struct softirq_action *h)
{
	int this_cpu = smp_processor_id();
	struct softnet_data *queue = &softnet_data[this_cpu];/*per-CPU softnet data for this CPU*/
	unsigned long start_time = jiffies;
	int bugdet = netdev_max_backlog;

	br_read_lock(BR_NETPROTO_LOCK);
    
	for (;;) {
		struct sk_buff *skb;
		struct net_device *rx_dev;

		local_irq_disable();
		skb = __skb_dequeue(&queue->input_pkt_queue);/*dequeue a packet from the receive queue*/
		local_irq_enable();

		if (skb == NULL)
			break;

		skb_bond(skb);

		rx_dev = skb->dev;

#ifdef CONFIG_NET_FASTROUTE
		if (skb->pkt_type == PACKET_FASTROUTE) {
			netdev_rx_stat[this_cpu].fastroute_deferred_out++;
			dev_queue_xmit(skb);
			dev_put(rx_dev);
			continue;
		}
#endif
		skb->h.raw = skb->nh.raw = skb->data;
		{
			struct packet_type *ptype, *pt_prev;
			unsigned short type = skb->protocol;

			pt_prev = NULL;
            ......
            /*IP packets are dispatched from here; the initialization below shows which handler runs*/
			for (ptype=ptype_base[ntohs(type)&15];ptype;ptype=ptype->next) {
				if (ptype->type == type &&
				    (!ptype->dev || ptype->dev == skb->dev)) {
					if (pt_prev) {
						if (!pt_prev->data)
							deliver_to_old_ones(pt_prev, skb, 0);
						else {
							atomic_inc(&skb->users);
							pt_prev->func(skb,
								      skb->dev,
								      pt_prev);/*ip_rcv*/
						}
					}
					pt_prev = ptype;
				}
			}

			if (pt_prev) {
				if (!pt_prev->data)
					deliver_to_old_ones(pt_prev, skb, 1);
				else
					pt_prev->func(skb, skb->dev, pt_prev);
			} else
				kfree_skb(skb);
		}

		dev_put(rx_dev);

		if (bugdet-- < 0 || jiffies - start_time > 1)
			goto softnet_break;

#ifdef CONFIG_NET_HW_FLOWCONTROL
	if (queue->throttle && queue->input_pkt_queue.qlen < no_cong_thresh ) {
		if (atomic_dec_and_test(&netdev_dropping)) {
			queue->throttle = 0;
			netdev_wakeup();
			goto softnet_break;
		}
	}
#endif

	}
	br_read_unlock(BR_NETPROTO_LOCK);

	local_irq_disable();
	if (queue->throttle) {
		queue->throttle = 0;
#ifdef CONFIG_NET_HW_FLOWCONTROL
		if (atomic_dec_and_test(&netdev_dropping))
			netdev_wakeup();
#endif
	}
	local_irq_enable();

	NET_PROFILE_LEAVE(softnet_process);
	return;

softnet_break:
	br_read_unlock(BR_NETPROTO_LOCK);

	local_irq_disable();
	netdev_rx_stat[this_cpu].time_squeeze++;
	__cpu_raise_softirq(this_cpu, NET_RX_SOFTIRQ);
	local_irq_enable();

	NET_PROFILE_LEAVE(softnet_process);
	return;
}

Initialization of ptype_base:

Let's look at how ptype_base is initialized.

inet_init=>ip_init

static struct packet_type ip_packet_type =
{
	__constant_htons(ETH_P_IP),
	NULL,	/* All devices */
	ip_rcv,
	(void*)1,
	NULL,
};
/*
 *	IP registers the packet type and then calls the subprotocol initialisers
 */

void __init ip_init(void)
{
	dev_add_pack(&ip_packet_type);/*add ip_packet_type to ptype_base, since its type is ETH_P_IP*/

	ip_rt_init();
	inet_initpeers();

#ifdef CONFIG_IP_MULTICAST
	proc_net_create("igmp", 0, ip_mc_procinfo);
#endif
}

inet_init=>ip_init=>dev_add_pack 

 
void dev_add_pack(struct packet_type *pt)
{
	int hash;

	br_write_lock_bh(BR_NETPROTO_LOCK);

#ifdef CONFIG_NET_FASTROUTE
	/* Hack to detect packet socket */
	if (pt->data) {
		netdev_fastroute_obstacles++;
		dev_clear_fastroute(pt->dev);
	}
#endif
	if (pt->type == htons(ETH_P_ALL)) {
		netdev_nit++;
		pt->next=ptype_all;
		ptype_all=pt;
	} else {/*the IP protocol handler goes into ptype_base*/
		hash=ntohs(pt->type)&15;
		pt->next = ptype_base[hash];
		ptype_base[hash] = pt;
	}
	br_write_unlock_bh(BR_NETPROTO_LOCK);
}

ip_rcv:

Next, let's see what ip_rcv does:

do_softirq=>net_rx_action=>ip_rcv


/*
 * 	Main IP Receive routine.
 */ 
int ip_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt)
{
	struct iphdr *iph = skb->nh.iph;

	/* When the interface is in promisc. mode, drop all the crap
	 * that it receives, do not try to analyse it.
	 */
	if (skb->pkt_type == PACKET_OTHERHOST)/*drop packets not addressed to this host*/
		goto drop;

	IP_INC_STATS_BH(IpInReceives);

    ....../*some validation code omitted*/

	return NF_HOOK(PF_INET, NF_IP_PRE_ROUTING, skb, dev, NULL,
		       ip_rcv_finish);

inhdr_error:
	IP_INC_STATS_BH(IpInHdrErrors);
drop:
        kfree_skb(skb);
out:
        return NET_RX_DROP;
}

ip_rcv_finish:

do_softirq=>net_rx_action=>ip_rcv=>ip_rcv_finish


static inline int ip_rcv_finish(struct sk_buff *skb)
{
	struct net_device *dev = skb->dev;
	struct iphdr *iph = skb->nh.iph;

	/*
	 *	Initialise the virtual path cache for the packet. It describes
	 *	how the packet travels inside Linux networking.
	 */ 
	if (skb->dst == NULL) {/*look up (or create) the route; for packets addressed to this host dst->input is set to ip_local_deliver*/
		if (ip_route_input(skb, iph->daddr, iph->saddr, iph->tos, dev))
			goto drop; 
	}

    .......
    /*IP header longer than 20 bytes: options are present*/
	if (iph->ihl > 5) {
		struct ip_options *opt;

		/* It looks as overkill, because not all
		   IP options require packet mangling.
		   But it is the easiest for now, especially taking
		   into account that combination of IP options
		   and running sniffer is extremely rare condition.
		                                      --ANK (980813)
		*/

		skb = skb_cow(skb, skb_headroom(skb));
		if (skb == NULL)
			return NET_RX_DROP;
		iph = skb->nh.iph;

		skb->ip_summed = 0;
		if (ip_options_compile(NULL, skb))/*validate the IP options*/
			goto inhdr_error;

		opt = &(IPCB(skb)->opt);
		if (opt->srr) {
			struct in_device *in_dev = in_dev_get(dev);
			if (in_dev) {
				if (!IN_DEV_SOURCE_ROUTE(in_dev)) {
					if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit())
						printk(KERN_INFO "source route option %u.%u.%u.%u -> %u.%u.%u.%u\n",
						       NIPQUAD(iph->saddr), NIPQUAD(iph->daddr));
					in_dev_put(in_dev);
					goto drop;
				}
				in_dev_put(in_dev);
			}
			if (ip_options_rcv_srr(skb))/*source routing specified?*/
				goto drop;
		}
	}

	return skb->dst->input(skb);/*calls ip_local_deliver*/

inhdr_error:
	IP_INC_STATS_BH(IpInHdrErrors);
drop:
        kfree_skb(skb);
        return NET_RX_DROP;
}

ip_local_deliver_finish:

 do_softirq=>net_rx_action=>ip_rcv=>ip_rcv_finish=>ip_local_deliver

which internally calls ip_local_deliver_finish

int ip_local_deliver(struct sk_buff *skb)
{
	struct iphdr *iph = skb->nh.iph;

    .......

	return NF_HOOK(PF_INET, NF_IP_LOCAL_IN, skb, skb->dev, NULL,
		       ip_local_deliver_finish);
}

 do_softirq=>net_rx_action=>ip_rcv=>ip_rcv_finish=>ip_local_deliver=>ip_local_deliver_finish


static inline int ip_local_deliver_finish(struct sk_buff *skb)
{
	struct iphdr *iph = skb->nh.iph;

#ifdef CONFIG_NETFILTER_DEBUG
	nf_debug_ip_local_deliver(skb);
#endif /*CONFIG_NETFILTER_DEBUG*/

        /* Point into the IP datagram, just past the header. */
        skb->h.raw = skb->nh.raw + iph->ihl*4;

	{
		/* Note: See raw.c and net/raw.h, RAWV4_HTABLE_SIZE==MAX_INET_PROTOS */
		int hash = iph->protocol & (MAX_INET_PROTOS - 1);/*get the transport-layer protocol*/
		struct sock *raw_sk = raw_v4_htable[hash];
		struct inet_protocol *ipprot;
		int flag;

		/* If there maybe a raw socket we must check - if not we
		 * don't care less
		 */
		if(raw_sk != NULL)
			raw_sk = raw_v4_input(skb, iph, hash);
        /*look up tcp_protocol by protocol number*/
		ipprot = (struct inet_protocol *) inet_protos[hash];
		flag = 0;
		if(ipprot != NULL) {
			if(raw_sk == NULL &&
			   ipprot->next == NULL &&
			   ipprot->protocol == iph->protocol) {
				int ret;
				
				/* Fast path... */
                /*calls tcp_v4_rcv*/
				ret = ipprot->handler(skb, (ntohs(iph->tot_len) -
							    (iph->ihl * 4)));

				return ret;
			} else {
				flag = ip_run_ipprot(skb, iph, ipprot, (raw_sk != NULL));
			}
		}
        .......
	}

	return 0;
}

Initialization of inet_protos:

inet_protos is also initialized in inet_init:

static struct inet_protocol tcp_protocol = 
{
	tcp_v4_rcv,		/* TCP handler		*/
	tcp_v4_err,		/* TCP error control	*/  
	IPPROTO_PREVIOUS,
	IPPROTO_TCP,		/* protocol ID		*/
	0,			/* copy			*/
	NULL,			/* data			*/
	"TCP"			/* name			*/
};

#define IPPROTO_PREVIOUS &tcp_protocol

struct inet_protocol *inet_protocol_base = IPPROTO_PREVIOUS;

/*
 *	Called by socket.c on kernel startup.  
 */
 
static int __init inet_init(void)
{
   ......

	printk(KERN_INFO "IP Protocols: ");
	for(p = inet_protocol_base; p != NULL;) 
	{
		struct inet_protocol *tmp = (struct inet_protocol *) p->next;
		inet_add_protocol(p);/*register the transport-layer protocol handlers*/
		printk("%s%s",p->name,tmp?", ":"\n");
		p = tmp;
	}


    ......


	return 0;
}

inet_init=>inet_add_protocol

/* Standard well-defined IP protocols.  */
enum {
......
  IPPROTO_TCP = 6,		/* Transmission Control Protocol	*/
......

  IPPROTO_RAW	 = 255,		/* Raw IP packets			*/
  IPPROTO_MAX
};


#define MAX_INET_PROTOS	32		/* Must be a power of 2		*/

/*
 *	Add a protocol handler to the hash tables
 */

void inet_add_protocol(struct inet_protocol *prot)
{
	unsigned char hash;
	struct inet_protocol *p2;

	hash = prot->protocol & (MAX_INET_PROTOS - 1);/*index into inet_protos by protocol number, masked with MAX_INET_PROTOS-1*/
	br_write_lock_bh(BR_NETPROTO_LOCK);
	prot ->next = inet_protos[hash];
	inet_protos[hash] = prot;
	prot->copy = 0;

	/*
	 *	Set the copy bit if we need to. 
	 */
	 
	p2 = (struct inet_protocol *) prot->next;
	while(p2 != NULL) 
	{
		if (p2->protocol == prot->protocol) 
		{
			prot->copy = 1;
			break;
		}
		p2 = (struct inet_protocol *) p2->next;
	}
	br_write_unlock_bh(BR_NETPROTO_LOCK);
}

tcp_v4_rcv:

Continuing with the processing of the received packet.

 do_softirq=>net_rx_action=>ip_rcv=>ip_rcv_finish=>ip_local_deliver=>ip_local_deliver_finish=>tcp_v4_rcv 


/*
 *	From tcp_input.c
 */

int tcp_v4_rcv(struct sk_buff *skb, unsigned short len)
{
	struct tcphdr *th;
	struct sock *sk;
	int ret;

    ......
    /*look up the sock for this packet; no established connection exists yet, so the lookup returns the server's listening sock*/

	sk = __tcp_v4_lookup(skb->nh.iph->saddr, th->source,
			     skb->nh.iph->daddr, ntohs(th->dest), tcp_v4_iif(skb));

	if (!sk)
		goto no_tcp_socket;

process:
    ......

	if (sk->state == TCP_TIME_WAIT)
		goto do_time_wait;

	skb->dev = NULL;

	bh_lock_sock(sk);
	ret = 0;
	if (!sk->lock.users) {
		if (!tcp_prequeue(sk, skb))/*try to put it on the prequeue*/
			ret = tcp_v4_do_rcv(sk, skb);/*process the packet*/
	} else
		sk_add_backlog(sk, skb);
	bh_unlock_sock(sk);

	sock_put(sk);

	return ret;

    ......
}

  do_softirq=>net_rx_action=>ip_rcv=>ip_rcv_finish=>ip_local_deliver=>ip_local_deliver_finish=>tcp_v4_rcv =>tcp_v4_do_rcv



/* The socket must have it's spinlock held when we get
 * here.
 *
 * We have a potential double-lock case here, so even when
 * doing backlog processing we use the BH locking scheme.
 * This is because we cannot sleep with the original spinlock
 * held.
 */
int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)
{
    ......

	if (sk->state == TCP_ESTABLISHED) { /* Fast path: already established */
		TCP_CHECK_TIMER(sk);
		if (tcp_rcv_established(sk, skb, skb->h.th, skb->len))/*process the packet*/
			goto reset;
		TCP_CHECK_TIMER(sk);
		return 0; 
	}

	......
    /*listening state*/
	if (sk->state == TCP_LISTEN) { 
		struct sock *nsk = tcp_v4_hnd_req(sk, skb);/*find or create the sock representing the client*/
		if (!nsk)
			goto discard;

		if (nsk != sk) {/*if this is not the server's own sock, it represents the client*/
			if (tcp_child_process(sk, nsk, skb))
				goto reset;
			return 0;
		}
	}

	TCP_CHECK_TIMER(sk);
	if (tcp_rcv_state_process(sk, skb, skb->h.th, skb->len))/*process the packet according to the sock state*/
		goto reset;
	TCP_CHECK_TIMER(sk);
	return 0;

reset:
	tcp_v4_send_reset(skb);//send an RST to the client
discard:
	kfree_skb(skb);
	/* Be careful here. If this function gets more complicated and
	 * gcc suffers from register pressure on the x86, sk (in %ebx) 
	 * might be destroyed here. This current version compiles correctly,
	 * but you have been warned.
	 */
	return 0;

csum_err:
	TCP_INC_STATS_BH(TcpInErrs);
	goto discard;
}

   do_softirq=>net_rx_action=>ip_rcv=>ip_rcv_finish=>ip_local_deliver=>ip_local_deliver_finish=>tcp_v4_rcv =>tcp_v4_do_rcv=>tcp_v4_hnd_req


static struct sock *tcp_v4_hnd_req(struct sock *sk,struct sk_buff *skb)
{
	struct open_request *req, **prev;
	struct tcphdr *th = skb->h.th;
	struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
	struct sock *nsk;

	/* Find possible connection requests. */
	req = tcp_v4_search_req(tp, skb->nh.iph, th, &prev);/*look up a pending connection request*/
	if (req)//a request was found
		return tcp_check_req(sk, skb, req, prev);//create the sock representing the client and queue the request on the listening sock's accept queue
    //otherwise search the hash of established connections
	nsk = __tcp_v4_lookup_established(skb->nh.iph->saddr,
					  th->source,
					  skb->nh.iph->daddr,
					  ntohs(th->dest),
					  tcp_v4_iif(skb));

	if (nsk) {
		if (nsk->state != TCP_TIME_WAIT) {
			bh_lock_sock(nsk);
			return nsk;
		}
		tcp_tw_put((struct tcp_tw_bucket*)sk);
		return NULL;
	}

#ifdef CONFIG_SYN_COOKIES
	if (!th->rst && !th->syn && th->ack)
		sk = cookie_v4_check(sk, skb, &(IPCB(skb)->opt));
#endif
	return sk;
}

/* 
 *	Process an incoming packet for SYN_RECV sockets represented
 *	as an open_request.
 */

struct sock *tcp_check_req(struct sock *sk,struct sk_buff *skb,
			   struct open_request *req,
			   struct open_request **prev)
{
	struct tcphdr *th = skb->h.th;
	struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
	u32 flg = tcp_flag_word(th) & (TCP_FLAG_RST|TCP_FLAG_SYN|TCP_FLAG_ACK);
	int paws_reject = 0;
	struct tcp_opt ttp;
	struct sock *child;
    ......
    //calls tcp_v4_syn_recv_sock; how this pointer gets set up is covered below
	child = tp->af_specific->syn_recv_sock(sk, skb, req, NULL);
	if (child == NULL)
		goto listen_overflow;

	tcp_synq_unlink(tp, req, prev);
	tcp_synq_removed(sk, req);

	tcp_acceptq_queue(sk, req, child);/*queue req on sk's accept queue, ready for accept()*/
	return child;

listen_overflow:
	if (!sysctl_tcp_abort_on_overflow) {
		req->acked = 1;
		return NULL;
	}

embryonic_reset:
	NET_INC_STATS_BH(EmbryonicRsts);
	if (!(flg & TCP_FLAG_RST))
		req->class->send_reset(skb);

	tcp_synq_drop(sk, req, prev);
	return NULL;
}

On the first SYN there is no connection request to find for this client; since nothing is found, tcp_v4_hnd_req ends up returning the server's own sock.

tcp_rcv_state_process:

Back in tcp_v4_do_rcv, we now enter tcp_rcv_state_process, which handles the packet according to the server sock's state.

 do_softirq=>net_rx_action=>ip_rcv=>ip_rcv_finish=>ip_local_deliver=>ip_local_deliver_finish=>tcp_v4_rcv =>tcp_v4_do_rcv=>tcp_rcv_state_process



/*
 *	This function implements the receiving procedure of RFC 793 for
 *	all states except ESTABLISHED and TIME_WAIT. 
 *	It's called from both tcp_v4_rcv and tcp_v6_rcv and should be
 *	address independent.
 */
	
int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb,
			  struct tcphdr *th, unsigned len)
{
	struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
	int queued = 0;

	tp->saw_tstamp = 0;

	switch (sk->state) {
	case TCP_CLOSE:
		goto discard;

	case TCP_LISTEN:
		if(th->ack)
			return 1;

		if(th->syn) {
            /*this is the branch we take; another function pointer, so we need the
              initialization below - at this point af_specific points to ipv4_specific*/
			if(tp->af_specific->conn_request(sk, skb) < 0)
				return 1;

.......
			goto discard;
		}
		goto discard;

	case TCP_SYN_SENT:
		queued = tcp_rcv_synsent_state_process(sk, skb, th, len);
		if (queued >= 0)
			return queued;
		queued = 0;
		goto step6;
	}



......
	return 0;
}

sys_socket=>sock_create=>inet_create=>tcp_prot->init(sk->prot->init)=>tcp_v4_init_sock

How the sock gets initialized:

struct tcp_func ipv4_specific = {
	ip_queue_xmit,
	tcp_v4_send_check,
	tcp_v4_rebuild_header,
	tcp_v4_conn_request,
	tcp_v4_syn_recv_sock,/*called when the client's final ACK of the handshake arrives: creates the server-side sock representing the client, binds it to the req, then wakes the waiting server process*/
	tcp_v4_hash_connecting,
	tcp_v4_remember_stamp,
	sizeof(struct iphdr),

	ip_setsockopt,
	ip_getsockopt,
	v4_addr2sockaddr,
	sizeof(struct sockaddr_in)
};

/* NOTE: A lot of things set to zero explicitly by call to
 *       sk_alloc() so need not be done here.
 */
static int tcp_v4_init_sock(struct sock *sk)
{
	struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);

	skb_queue_head_init(&tp->out_of_order_queue);
	tcp_init_xmit_timers(sk);
	tcp_prequeue_init(tp);
    ......
    /*install the corresponding jump table*/
	sk->tp_pinfo.af_tcp.af_specific = &ipv4_specific;

	sk->sndbuf = sysctl_tcp_wmem[1];
	sk->rcvbuf = sysctl_tcp_rmem[1];

	atomic_inc(&tcp_sockets_allocated);

	return 0;
}

  do_softirq=>net_rx_action=>ip_rcv=>ip_rcv_finish=>ip_local_deliver=>ip_local_deliver_finish=>tcp_v4_rcv =>tcp_v4_do_rcv=>tcp_rcv_state_process=>tcp_v4_conn_request


int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
{
	struct tcp_opt tp;
	struct open_request *req;
	__u32 saddr = skb->nh.iph->saddr;
	__u32 daddr = skb->nh.iph->daddr;
	__u32 isn = TCP_SKB_CB(skb)->when;
	struct dst_entry *dst = NULL;
    /*allocate the connection-request object*/
	req = tcp_openreq_alloc();
	if (req == NULL)
		goto drop;

	tcp_clear_options(&tp);
	tp.mss_clamp = 536;
	tp.user_mss = sk->tp_pinfo.af_tcp.user_mss;

	tcp_parse_options(skb, &tp, 0);

	if (want_cookie) {
		tcp_clear_options(&tp);
		tp.saw_tstamp = 0;
	}

	if (tp.saw_tstamp && tp.rcv_tsval == 0) {
		/* Some OSes (unknown ones, but I see them on web server, which
		 * contains information interesting only for windows'
		 * users) do not send their stamp in SYN. It is easy case.
		 * We simply do not advertise TS support.
		 */
		tp.saw_tstamp = 0;
		tp.tstamp_ok = 0;
	}
	tp.tstamp_ok = tp.saw_tstamp;
    /*initialize the request object*/
	tcp_openreq_init(req, &tp, skb);

    ......
	if (tcp_v4_send_synack(sk, req, dst))/*send the SYN+ACK to the client: the second step of the TCP handshake*/
		goto drop_and_free;

	if (want_cookie) {
	   	tcp_openreq_free(req); 
	} else {
		tcp_v4_synq_add(sk, req);/*queue the request on the server's listening sock (SYN queue)*/
	}
	return 0;

drop_and_free:
	tcp_openreq_free(req); 
drop:
	TCP_INC_STATS_BH(TcpAttemptFails);
	return 0;
}

do_softirq=>net_rx_action=>ip_rcv=>ip_rcv_finish=>ip_local_deliver=>ip_local_deliver_finish=>tcp_v4_rcv =>tcp_v4_do_rcv=>tcp_rcv_state_process=>tcp_v4_conn_request=>tcp_v4_synq_add


static void tcp_v4_synq_add(struct sock *sk, struct open_request *req)
{
	struct tcp_opt *tp = &sk->tp_pinfo.af_tcp;
	struct tcp_listen_opt *lopt = tp->listen_opt;
	unsigned h = tcp_v4_synq_hash(req->af.v4_req.rmt_addr, req->rmt_port);

	req->expires = jiffies + TCP_TIMEOUT_INIT;/*set the expiry time*/
	req->retrans = 0;//retransmission counter
	req->sk = NULL;/*no sock attached yet*/
	req->index = h;
	req->dl_next = lopt->syn_table[h];//link to the previous request in this hash bucket

	write_lock(&tp->syn_wait_lock);
	lopt->syn_table[h] = req;//insert at the head of the listen structure's hash table
	write_unlock(&tp->syn_wait_lock);

	tcp_synq_added(sk);
}

When the server sends the SYN+ACK, it arrives at the client's NIC and goes through the same receive path as above, eventually reaching tcp_rcv_state_process via tcp_v4_do_rcv to handle the received SYN+ACK. The key part:



/*
 *	This function implements the receiving procedure of RFC 793 for
 *	all states except ESTABLISHED and TIME_WAIT. 
 *	It's called from both tcp_v4_rcv and tcp_v6_rcv and should be
 *	address independent.
 */
	
int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb,
			  struct tcphdr *th, unsigned len)
{
	struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
	int queued = 0;

	tp->saw_tstamp = 0;

	switch (sk->state) {//switch on the sock state
......

	case TCP_SYN_SENT://in SYN_SENT state
		queued = tcp_rcv_synsent_state_process(sk, skb, th, len);//handle the SYN+ACK
		if (queued >= 0)
			return queued;
		queued = 0;
		goto step6;
	}


......

	if (!queued) { 
discard:
		__kfree_skb(skb);
	}
	return 0;
}

As we will cover later, when the client issues the connection request, tcp_connect sets the sock state to SYN_SENT: tcp_set_state(sk, TCP_SYN_SENT);

So the client's sock is already in TCP_SYN_SENT after the first handshake step, and when the server's second handshake packet arrives the client enters tcp_rcv_synsent_state_process. In that function,

the client builds a new packet and calls tcp_send_ack to send the third handshake packet to the server, and changes its own sock state to TCP_ESTABLISHED.
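tcp_rcv_synsent_state_process itself is long; for a valid SYN+ACK, the part that matters here boils down to roughly the following (an abridged paraphrase, not the literal 2.4.0 code):

/* Heavily abridged sketch of what tcp_rcv_synsent_state_process does
 * for a valid SYN+ACK; the real function handles many more cases. */
	tcp_set_state(sk, TCP_ESTABLISHED);	/* connection is up on the client side */
	sk->state_change(sk);			/* wake processes blocked in connect() */
	tcp_send_ack(sk);			/* third handshake packet back to the server */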

tcp_v4_hnd_req:

When the server receives the client's third handshake packet, it reaches tcp_v4_do_rcv again and goes into tcp_v4_hnd_req to look up the connection request and the sock representing the client.


static struct sock *tcp_v4_hnd_req(struct sock *sk,struct sk_buff *skb)
{
	struct open_request *req, **prev;
	struct tcphdr *th = skb->h.th;
	struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
	struct sock *nsk;

	/* Find possible connection requests. */
	req = tcp_v4_search_req(tp, skb->nh.iph, th, &prev);//look up the pending request
	if (req)//this time the request is found
		return tcp_check_req(sk, skb, req, prev);//create the sock representing the client and queue the request on the listening sock's accept queue

......
	return sk;
}

do_softirq=>net_rx_action=>ip_rcv=>ip_rcv_finish=>ip_local_deliver=>ip_local_deliver_finish=>tcp_v4_rcv =>tcp_v4_do_rcv=>tcp_v4_hnd_req=>tcp_check_req

struct sock *tcp_check_req(struct sock *sk,struct sk_buff *skb,
			   struct open_request *req,
			   struct open_request **prev)
{
	struct tcphdr *th = skb->h.th;
	struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
	u32 flg = tcp_flag_word(th) & (TCP_FLAG_RST|TCP_FLAG_SYN|TCP_FLAG_ACK);
	int paws_reject = 0;
	struct tcp_opt ttp;
	struct sock *child;

......
    //create and initialize the sock representing the client
	child = tp->af_specific->syn_recv_sock(sk, skb, req, NULL);
	if (child == NULL)
		goto listen_overflow;

	tcp_synq_unlink(tp, req, prev);//unlink the request from the SYN queue
	tcp_synq_removed(sk, req);//update SYN-queue accounting and timers

	tcp_acceptq_queue(sk, req, child);//queue the request (now carrying the child sock) on the listening sock's accept queue
	return child;
......
}

Most of this function validates the packet contents; we only look at the core part here. tp->af_specific->syn_recv_sock is ipv4_specific.tcp_v4_syn_recv_sock, whose implementation is:

do_softirq=>net_rx_action=>ip_rcv=>ip_rcv_finish=>ip_local_deliver=>ip_local_deliver_finish=>tcp_v4_rcv =>tcp_v4_do_rcv=>tcp_v4_hnd_req=>tcp_check_req=>tcp_v4_syn_recv_sock

struct sock * tcp_v4_syn_recv_sock(struct sock *sk, struct sk_buff *skb,
				   struct open_request *req,
				   struct dst_entry *dst)
{
	struct tcp_opt *newtp;
	struct sock *newsk;

	if (tcp_acceptq_is_full(sk))
		goto exit_overflow;

	if (dst == NULL &&
	    (dst = tcp_v4_route_req(sk, req)) == NULL)
		goto exit;

	newsk = tcp_create_openreq_child(sk, req, skb);//clone the server's sock to create the client's sock
	if (!newsk)
		goto exit;

	newsk->dst_cache = dst;
    //initialize source and destination addresses etc.
	newtp = &(newsk->tp_pinfo.af_tcp);
	newsk->daddr = req->af.v4_req.rmt_addr;
	newsk->saddr = req->af.v4_req.loc_addr;
	newsk->rcv_saddr = req->af.v4_req.loc_addr;
	newsk->protinfo.af_inet.opt = req->af.v4_req.opt;
	req->af.v4_req.opt = NULL;
	newsk->protinfo.af_inet.mc_index = tcp_v4_iif(skb);
	newsk->protinfo.af_inet.mc_ttl = skb->nh.iph->ttl;
	newtp->ext_header_len = 0;
	if (newsk->protinfo.af_inet.opt)
		newtp->ext_header_len = newsk->protinfo.af_inet.opt->optlen;

	tcp_sync_mss(newsk, dst->pmtu);
	newtp->advmss = dst->advmss;
	tcp_initialize_rcv_mss(newsk);

	__tcp_v4_hash(newsk);//hash the new connection into tcp_ehash
	__tcp_inherit_port(sk, newsk);
	return newsk;
......
}

 do_softirq=>net_rx_action=>ip_rcv=>ip_rcv_finish=>ip_local_deliver=>ip_local_deliver_finish=>tcp_v4_rcv =>tcp_v4_do_rcv=>tcp_v4_hnd_req=>tcp_check_req=>tcp_v4_syn_recv_sock=>tcp_create_openreq_child



/* This is not only more efficient than what we used to do, it eliminates
 * a lot of code duplication between IPv4/IPv6 SYN recv processing. -DaveM
 *
 * Actually, we could lots of memory writes here. tp of listening
 * socket contains all necessary default parameters.
 */
struct sock *tcp_create_openreq_child(struct sock *sk, struct open_request *req, struct sk_buff *skb)
{
	struct sock *newsk = sk_alloc(PF_INET, GFP_ATOMIC, 0);

	if(newsk != NULL) {
		struct tcp_opt *newtp;
#ifdef CONFIG_FILTER
		struct sk_filter *filter;
#endif

		memcpy(newsk, sk, sizeof(*newsk));//inherit the server sock's contents, e.g. tcp_prot
		newsk->state = TCP_SYN_RECV;//set the connection state
        //initialization code omitted here
	    ......
		newsk->socket = NULL;//clear the socket back-pointer

	}
	return newsk;
}

With the creation covered, back in tcp_check_req: once the child sock has been returned, tcp_acceptq_queue is called to associate the request structure with the client's sock

static inline void tcp_acceptq_queue(struct sock *sk, struct open_request *req,
					 struct sock *child)
{
	struct tcp_opt *tp = &sk->tp_pinfo.af_tcp;

	req->sk = child;//the request now records the client's sock; tcp_accept (called from the accept syscall) later picks it up from accept_queue
	tcp_acceptq_added(sk);//bump the listening sock's accept-queue counter

	if (!tp->accept_queue_tail) {//the queue is empty
		tp->accept_queue = req;//so this request becomes the head
	} else {
		tp->accept_queue_tail->dl_next = req;//otherwise append at the tail
	}
	tp->accept_queue_tail = req;//update the tail pointer
	req->dl_next = NULL;//no next request
}

Returning from tcp_v4_hnd_req to tcp_v4_do_rcv, we now hold the client's sock. After comparing it with the server's sock and confirming they differ, we enter tcp_child_process, which wakes the server process to accept the connection; both the server and client sock pointers are passed in.

tcp_child_process:

int tcp_child_process(struct sock *parent, struct sock *child,
		      struct sk_buff *skb)
{
	int ret = 0;
	int state = child->state;

	if (child->lock.users == 0) {//is the child sock free to process the packet now?
		ret = tcp_rcv_state_process(child, skb, skb->h.th, skb->len);//process the packet according to the child sock's state

		/* Wakeup parent, send SIGIO */
		if (state == TCP_SYN_RECV && child->state != state)//the child sock's state changed
			parent->data_ready(parent, 0);//wake the listener to handle the connection
	} else {
		/* Alas, it is possible again, because we do lookup
		 * in main socket hash table and lock on listening
		 * socket does not protect us more.
		 */
		sk_add_backlog(child, skb);//otherwise put it on the backlog queue
	}

	bh_unlock_sock(child);
	sock_put(child);
	return ret;
}
int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb,
			  struct tcphdr *th, unsigned len)
{
	struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
	int queued = 0;

	switch (sk->state) {

	......
	/* step 5: check the ACK field */
	if (th->ack) {//the ACK flag is set in the header
		int acceptable = tcp_ack(sk, skb, FLAG_SLOWPATH);//validate the ACK

		switch(sk->state) {
		case TCP_SYN_RECV:
			if (acceptable) {//the ACK is acceptable
				tp->copied_seq = tp->rcv_nxt;//record the next expected sequence number
				mb();
				tcp_set_state(sk, TCP_ESTABLISHED);//move the child sock to ESTABLISHED

				/* Note, that this wakeup is only for marginal
				 * crossed SYN case. Passively open sockets
				 * are not waked up, because sk->sleep == NULL
				 * and sk->socket == NULL.
				 */
				if (sk->socket) {
					sk->state_change(sk);//invoke the state-change callback
					sk_wake_async(sk,0,POLL_OUT);
				}
.......
			} else {
				return 1;
			}
			break;

......
}

Here the client's sock is set to TCP_ESTABLISHED, meaning the connection is up. Back in tcp_child_process, the child's state is seen to have changed, i.e. the three-way handshake has completed, so the server sock's data_ready callback is called. That callback was set in sock_init_data (sys_socket=>inet_create) to sock_def_readable.

sock_def_readable:


void sock_def_readable(struct sock *sk, int len)
{
	read_lock(&sk->callback_lock);
	if (sk->sleep && waitqueue_active(sk->sleep))
		wake_up_interruptible(sk->sleep);
	sk_wake_async(sk,1,POLL_IN);
	read_unlock(&sk->callback_lock);
}

It wakes up the processes waiting on the server's sock, i.e. the server program blocked in wait_for_connect, so the server can pick up the client's connection request, matching the accept path described earlier.

Then, back in inet_accept, sock_graft(sk2, newsock) binds the sock and the socket to each other. newsock->ops was copied from the server socket's ops, i.e. inet_stream_ops, so its sendmsg is inet_sendmsg. At this point the connection is fully set up, and later packets for it find the established sock directly.

That wraps up the accept path and answers the question from the beginning: sock->ops->sendmsg is inet_sendmsg.
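For completeness, sock_graft itself is just a few pointer assignments. The following is an approximate sketch of the 2.4-era inline (check include/net/sock.h for the exact body; field names may differ slightly in 2.4.0):

/* Approximate sketch of sock_graft from include/net/sock.h (2.4-era). */
static inline void sock_graft(struct sock *sk, struct socket *parent)
{
	write_lock_bh(&sk->callback_lock);
	sk->sleep = &parent->wait;	/* sleepers on the sock wait on the socket's queue */
	parent->sk = sk;		/* socket -> sock */
	sk->socket = parent;		/* sock -> socket */
	write_unlock_bh(&sk->callback_lock);
}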

inet_sendmsg:

 sys_send=>sys_sendto=>sock_sendmsg=>inet_sendmsg

int inet_sendmsg(struct socket *sock, struct msghdr *msg, int size,
		 struct scm_cookie *scm)
{
	struct sock *sk = sock->sk;

	/* We may need to bind the socket. */
	if (sk->num==0 && inet_autobind(sk) != 0)
		return -EAGAIN;

	return sk->prot->sendmsg(sk, msg, size);//tcp_sendmsg
}

 sys_send=>sys_sendto=>sock_sendmsg=>inet_sendmsg=>tcp_sendmsg

int tcp_sendmsg(struct sock *sk, struct msghdr *msg, int size)
{
	struct iovec *iov;//pointer to the user buffer descriptors (iovec)
	struct tcp_opt *tp;
	struct sk_buff *skb;//packet (sk_buff) pointer
	int iovlen, flags;
	int mss_now;
	int err, copied;
	long timeo;

	err = 0;
	tp = &(sk->tp_pinfo.af_tcp);

	lock_sock(sk);//take the sock lock; if another process holds it, sleep and wait
	TCP_CHECK_TIMER(sk);

	flags = msg->msg_flags;//flags from the msghdr

	timeo = sock_sndtimeo(sk, flags&MSG_DONTWAIT);//work out the send timeout

	/* Wait (with a timeout) for the sock to enter a connected state. */
	if ((1 << sk->state) & ~(TCPF_ESTABLISHED | TCPF_CLOSE_WAIT))
		if((err = wait_for_tcp_connect(sk, flags, &timeo)) != 0)
			goto out_unlock;

	/* This should be in poll */
	clear_bit(SOCK_ASYNC_NOSPACE, &sk->socket->flags);

	mss_now = tcp_current_mss(sk);

	/* Ok commence sending. */
	iovlen = msg->msg_iovlen;//number of iovec entries
	iov = msg->msg_iov;//iovec array pointer
	copied = 0;

	while (--iovlen >= 0) {//send the data of each iovec in turn
		int seglen=iov->iov_len;//length of this iovec
		unsigned char * from=iov->iov_base;//user-space address of this iovec

		iov++;

		while (seglen > 0) {//while data remains in this iovec
			int copy, tmp, queue_it;

			if (err)
				goto do_fault2;

			/* Stop on errors. */
			if (sk->err)
				goto do_sock_err;

			/* Make sure that we are established. */
			if (sk->shutdown & SEND_SHUTDOWN)
				goto do_shutdown;
	
			/* Now we need to check if we have a half
			 * built packet we can tack some data onto.
			 */
			skb = sk->write_queue.prev;//the last skb on the write queue
			if (tp->send_head &&
			    (mss_now - skb->len) > 0) {
				copy = skb->len;
				if (skb_tailroom(skb) > 0) {
					int last_byte_was_odd = (copy % 4);

					copy = mss_now - copy;
					if(copy > skb_tailroom(skb))
						copy = skb_tailroom(skb);
					if(copy > seglen)
						copy = seglen;
					if(last_byte_was_odd) {
						if(copy_from_user(skb_put(skb, copy),
								  from, copy))
							err = -EFAULT;
						skb->csum = csum_partial(skb->data,
									 skb->len, 0);
					} else {
						skb->csum =
							csum_and_copy_from_user(
							from, skb_put(skb, copy),
							copy, skb->csum, &err);
					}
					/*
					 * FIXME: the *_user functions should
					 *	  return how much data was
					 *	  copied before the fault
					 *	  occurred and then a partial
					 *	  packet with this data should
					 *	  be sent.  Unfortunately
					 *	  csum_and_copy_from_user doesn't
					 *	  return this information.
					 *	  ATM it might send partly zeroed
					 *	  data in this case.
					 */
					tp->write_seq += copy;
					TCP_SKB_CB(skb)->end_seq += copy;
					from += copy;
					copied += copy;
					seglen -= copy;
					if (PSH_NEEDED ||
					    after(tp->write_seq, tp->pushed_seq+(tp->max_window>>1))) {
						TCP_SKB_CB(skb)->flags |= TCPCB_FLAG_PSH;
						tp->pushed_seq = tp->write_seq;
					}
					if (flags&MSG_OOB) {
						tp->urg_mode = 1;
						tp->snd_up = tp->write_seq;
						TCP_SKB_CB(skb)->sacked |= TCPCB_URG;
					}
					continue;
				} else {
					TCP_SKB_CB(skb)->flags |= TCPCB_FLAG_PSH;
					tp->pushed_seq = tp->write_seq;
				}
			}

			copy = min(seglen, mss_now);

			/* Determine how large of a buffer to allocate.  */
			tmp = MAX_TCP_HEADER + 15 + tp->mss_cache;
			if (copy < mss_now && !(flags & MSG_OOB)) {
				/* What is happening here is that we want to
				 * tack on later members of the users iovec
				 * if possible into a single frame.  When we
				 * leave this loop our we check to see if
				 * we can send queued frames onto the wire.
				 */
				queue_it = 1;
			} else {
				queue_it = 0;
			}

			skb = NULL;
			if (tcp_memory_free(sk))
				skb = tcp_alloc_skb(sk, tmp, sk->allocation);//allocate an sk_buff
			if (skb == NULL) {
				/* If we didn't get any memory, we need to sleep. */
				set_bit(SOCK_ASYNC_NOSPACE, &sk->socket->flags);
				set_bit(SOCK_NOSPACE, &sk->socket->flags);

				__tcp_push_pending_frames(sk, tp, mss_now, 1);

				if (!timeo) {
					err = -EAGAIN;
					goto do_interrupted;
				}
				if (signal_pending(current)) {
					err = sock_intr_errno(timeo);
					goto do_interrupted;
				}
				timeo = wait_for_tcp_memory(sk, timeo);//allocation failed: wait for memory

				/* If SACK's were formed or PMTU events happened,
				 * we must find out about it.
				 */
				mss_now = tcp_current_mss(sk);
				continue;
			}

			seglen -= copy;

			/* Prepare control bits for TCP header creation engine. */
			if (PSH_NEEDED ||
			    after(tp->write_seq+copy, tp->pushed_seq+(tp->max_window>>1))) {
				TCP_SKB_CB(skb)->flags = TCPCB_FLAG_ACK|TCPCB_FLAG_PSH;
				tp->pushed_seq = tp->write_seq + copy;
			} else {
				TCP_SKB_CB(skb)->flags = TCPCB_FLAG_ACK;
			}
			TCP_SKB_CB(skb)->sacked = 0;
			if (flags & MSG_OOB) {
				TCP_SKB_CB(skb)->sacked |= TCPCB_URG;
				tp->urg_mode = 1;
				tp->snd_up = tp->write_seq + copy;
			}

			/* TCP data bytes are SKB_PUT() on top, later
			 * TCP+IP+DEV headers are SKB_PUSH()'d beneath.
			 * Reserve header space and checksum the data.
			 */
			skb_reserve(skb, MAX_TCP_HEADER);
			skb->csum = csum_and_copy_from_user(from,
					skb_put(skb, copy), copy, 0, &err);

			if (err)
				goto do_fault;

			from += copy;
			copied += copy;

			TCP_SKB_CB(skb)->seq = tp->write_seq;
			TCP_SKB_CB(skb)->end_seq = TCP_SKB_CB(skb)->seq + copy;

			/* This advances tp->write_seq for us. */
			tcp_send_skb(sk, skb, queue_it, mss_now);//queue and (possibly) transmit the packet
		}
	}
......
}

  sys_send=>sys_sendto=>sock_sendmsg=>inet_sendmsg=>tcp_sendmsg=>tcp_send_skb

void tcp_send_skb(struct sock *sk, struct sk_buff *skb, int force_queue, unsigned cur_mss)
{
	struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);

	/* Advance write_seq and place onto the write_queue. */
	tp->write_seq = TCP_SKB_CB(skb)->end_seq;
	__skb_queue_tail(&sk->write_queue, skb);
	tcp_charge_skb(sk, skb);

	if (!force_queue && tp->send_head == NULL && tcp_snd_test(tp, skb, cur_mss, tp->nonagle)) {
		/* Send it out now. */
		TCP_SKB_CB(skb)->when = tcp_time_stamp;
		if (tcp_transmit_skb(sk, skb_clone(skb, sk->allocation)) == 0) {//transmit the packet
			tp->snd_nxt = TCP_SKB_CB(skb)->end_seq;
			tcp_minshall_update(tp, cur_mss, skb);
			if (tp->packets_out++ == 0)
				tcp_reset_xmit_timer(sk, TCP_TIME_RETRANS, tp->rto);
			return;
		}
	}
	/* Queue it, remembering where we must start sending. */
	if (tp->send_head == NULL)
		tp->send_head = skb;
}

   sys_send=>sys_sendto=>sock_sendmsg=>inet_sendmsg=>tcp_sendmsg=>tcp_send_skb=>tcp_transmit_skb

int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb)
{
	if(skb != NULL) {
		struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
		struct tcp_skb_cb *tcb = TCP_SKB_CB(skb);
		int tcp_header_size = tp->tcp_header_len;
		struct tcphdr *th;
		int sysctl_flags;
		int err;

        ......
        //tcp_v4_send_check: compute and record the checksum
		tp->af_specific->send_check(sk, th, skb->len, skb);

		if (tcb->flags & TCPCB_FLAG_ACK)
			tcp_event_ack_sent(sk);

		if (skb->len != tcp_header_size)
			tcp_event_data_sent(tp, skb);

		TCP_INC_STATS(TcpOutSegs);
        //ip_queue_xmit: hand the packet to the IP layer
		err = tp->af_specific->queue_xmit(skb);
		if (err <= 0)
			return err;

		tcp_enter_cwr(tp);

		/* NET_XMIT_CN is special. It does not guarantee,
		 * that this packet is lost. It tells that device
		 * is about to start to drop packets or already
		 * drops some packets of the same priority and
		 * invokes us to send less aggressively.
		 */
		return err == NET_XMIT_CN ? 0 : err;
	}
	return -ENOBUFS;
#undef SYSCTL_FLAG_TSTAMPS
#undef SYSCTL_FLAG_WSCALE
#undef SYSCTL_FLAG_SACK
}

Turning it into an IP packet

    sys_send=>sys_sendto=>sock_sendmsg=>inet_sendmsg=>tcp_sendmsg=>tcp_send_skb=>tcp_transmit_skb=>ip_queue_xmit


int ip_queue_xmit(struct sk_buff *skb)
{
	struct sock *sk = skb->sk;
	struct ip_options *opt = sk->protinfo.af_inet.opt;
	struct rtable *rt;
	struct iphdr *iph;

	/* Make sure we can route this packet. */
	rt = (struct rtable *)__sk_dst_check(sk, 0);//check the cached route and cast it to a rtable pointer
	if (rt == NULL) {//no cached route
		u32 daddr;

		/* Use correct destination address if we have options. */
		daddr = sk->daddr;//destination address
		if(opt && opt->srr)//source routing requested
			daddr = opt->faddr;//use the first-hop address as the destination

		/* If this fails, retransmit mechanism of transport layer will
		 * keep trying until route appears or the connection times itself
		 * out.
		 */
		if (ip_route_output(&rt, daddr, sk->saddr,
				    RT_TOS(sk->protinfo.af_inet.tos) | RTO_CONN | sk->localroute,
				    sk->bound_dev_if))//look up or create the route
			goto no_route;
		__sk_dst_set(sk, &rt->u.dst);
	}
	skb->dst = dst_clone(&rt->u.dst);

	if (opt && opt->is_strictroute && rt->rt_dst != rt->rt_gateway)
		goto no_route;

	/* OK, we know where to send it, allocate and build IP header. */
	iph = (struct iphdr *) skb_push(skb, sizeof(struct iphdr) + (opt ? opt->optlen : 0));
	*((__u16 *)iph)	= htons((4 << 12) | (5 << 8) | (sk->protinfo.af_inet.tos & 0xff));
	iph->tot_len = htons(skb->len);
	iph->frag_off = 0;
	iph->ttl      = sk->protinfo.af_inet.ttl;
	iph->protocol = sk->protocol;
	iph->saddr    = rt->rt_src;
	iph->daddr    = rt->rt_dst;
	skb->nh.iph   = iph;
	/* Transport layer set skb->h.foo itself. */

	if(opt && opt->optlen) {
		iph->ihl += opt->optlen >> 2;
		ip_options_build(skb, opt, sk->daddr, rt, 0);
	}

	return NF_HOOK(PF_INET, NF_IP_LOCAL_OUT, skb, NULL, rt->u.dst.dev,
		       ip_queue_xmit2);//continue sending the packet

no_route:
	IP_INC_STATS(IpOutNoRoutes);
	kfree_skb(skb);
	return -EHOSTUNREACH;
}

   sys_send=>sys_sendto=>sock_sendmsg=>inet_sendmsg=>tcp_sendmsg=>tcp_send_skb=>tcp_transmit_skb=>ip_queue_xmit=>ip_queue_xmit2

static inline int ip_queue_xmit2(struct sk_buff *skb)
{
	struct sock *sk = skb->sk;
	struct rtable *rt = (struct rtable *)skb->dst;
	struct net_device *dev;
	struct iphdr *iph = skb->nh.iph;

......
	skb->priority = sk->priority;
	return skb->dst->output(skb);//keep going down the stack

fragment:
	if (ip_dont_fragment(sk, &rt->u.dst)) {
		/* Reject packet ONLY if TCP might fragment
		 * it itself, if were careful enough.
		 */
		iph->frag_off |= __constant_htons(IP_DF);
		NETDEBUG(printk(KERN_DEBUG "sending pkt_too_big to self\n"));

		icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED,
			  htonl(rt->u.dst.pmtu));
		kfree_skb(skb);
		return -EMSGSIZE;
	}
	ip_select_ident(iph, &rt->u.dst);
	return ip_fragment(skb, skb->dst->output);
}

 sys_send=>sys_sendto=>sock_sendmsg=>inet_sendmsg=>tcp_sendmsg=>tcp_send_skb=>tcp_transmit_skb=>ip_queue_xmit=>ip_queue_xmit2=>ip_output

Routing is covered in a separate article.

int ip_output(struct sk_buff *skb)
{
#ifdef CONFIG_IP_ROUTE_NAT
	struct rtable *rt = (struct rtable*)skb->dst;
#endif

	IP_INC_STATS(IpOutRequests);

#ifdef CONFIG_IP_ROUTE_NAT
	if (rt->rt_flags&RTCF_NAT)
		ip_do_nat(skb);
#endif

	return ip_finish_output(skb);
}

 sys_send=>sys_sendto=>sock_sendmsg=>inet_sendmsg=>tcp_sendmsg=>tcp_send_skb=>tcp_transmit_skb=>ip_queue_xmit=>ip_queue_xmit2=>ip_output=>ip_finish_output

__inline__ int ip_finish_output(struct sk_buff *skb)
{
	struct net_device *dev = skb->dst->dev;

	skb->dev = dev;
	skb->protocol = __constant_htons(ETH_P_IP);

	return NF_HOOK(PF_INET, NF_IP_POST_ROUTING, skb, NULL, dev,
		       ip_finish_output2);
}

 sys_send=>sys_sendto=>sock_sendmsg=>inet_sendmsg=>tcp_sendmsg=>tcp_send_skb=>tcp_transmit_skb=>ip_queue_xmit=>ip_queue_xmit2=>ip_output=>ip_finish_output=>ip_finish_output2

static inline int ip_finish_output2(struct sk_buff *skb)
{
	struct dst_entry *dst = skb->dst;
	struct hh_cache *hh = dst->hh;

#ifdef CONFIG_NETFILTER_DEBUG
	nf_debug_ip_finish_output2(skb);
#endif /*CONFIG_NETFILTER_DEBUG*/

	if (hh) {
		read_lock_bh(&hh->hh_lock);
  		memcpy(skb->data - 16, hh->hh_data, 16);
		read_unlock_bh(&hh->hh_lock);
	        skb_push(skb, hh->hh_len);
		return hh->hh_output(skb);
	} else if (dst->neighbour)
		return dst->neighbour->output(skb);

	printk(KERN_DEBUG "khm\n");
	kfree_skb(skb);
	return -EINVAL;
}

The output here goes through the neighbour subsystem. The packet finally becomes an Ethernet frame, is handed to dev_queue_xmit, and reaches the NIC, where the driver's transmit function (net_send_packet for the CS8900) puts it on the wire.
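To make that last hop concrete, here is a deliberately simplified sketch of the hand-off to the driver. It is an illustration only: the real dev_queue_xmit enqueues the skb on the device's qdisc, lets qdisc_run drain it, and handles locking and congestion, all of which are omitted here.

/* Conceptual sketch only, not the real 2.4.0 dev_queue_xmit. */
static int dev_xmit_sketch(struct sk_buff *skb)
{
	struct net_device *dev = skb->dev;

	/* For the cs89x0 driver, dev->hard_start_xmit points to
	 * net_send_packet, so this call lands in the NIC driver. */
	if (dev->hard_start_xmit(skb, dev) == 0)
		return 0;	/* frame accepted by the hardware */

	kfree_skb(skb);		/* simplified: the real code requeues/retries */
	return -1;
}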

To be continued...
