linux 协议栈 准备连接请求

基于linux2.4.0分析。

我们从客户端socket程序出发,分析客户端与服务端的连接过程。

简单的客户端代码:

/*
 * Minimal TCP client used as the article's running example: resolve the
 * server host, connect, send a fixed 12-byte message, read the reply,
 * and exit.  Exit codes: 1 usage, 2 resolve, 3 socket, 4 connect,
 * 5 send, 6 recv.
 *
 * Fixes vs. the original listing: ANSI prototype instead of K&R,
 * undefined tcperror() replaced by perror(), the misaligned
 * *(unsigned long *)h_addr cast replaced by memcpy, diagnostics and
 * close(s) added on every error path after the socket is created.
 */
int main(int argc, char **argv)
{
    unsigned short port;       /* port client will connect to         */
    char buf[12];              /* data buffer for sending & receiving */
    struct hostent *hostnm;    /* server host name information        */
    struct sockaddr_in server; /* server address                      */
    int s;                     /* client socket                       */

    if (argc != 3)
    {
        fprintf(stderr, "Usage: %s hostname port\n", argv[0]);
        exit(1);
    }

    hostnm = gethostbyname(argv[1]);
    if (hostnm == (struct hostent *) 0)
    {
        fprintf(stderr, "Gethostbyname failed\n");
        exit(2);
    }

    port = (unsigned short) atoi(argv[2]);

    strcpy(buf, "the message");  /* 11 chars + NUL exactly fills buf[12] */

    server.sin_family      = AF_INET;
    server.sin_port        = htons(port);
    /* h_addr is a char* with no alignment guarantee; dereferencing it as
     * unsigned long is undefined behavior on strict-alignment targets
     * (and wrong on LP64, where long is 8 bytes).  Copy the 4-byte
     * address instead. */
    memcpy(&server.sin_addr.s_addr, hostnm->h_addr,
           sizeof(server.sin_addr.s_addr));

    if ((s = socket(AF_INET, SOCK_STREAM, 0)) < 0)
    {
        perror("socket");
        exit(3);
    }

    if (connect(s, (struct sockaddr *)&server, sizeof(server)) < 0)
    {
        perror("connect");
        close(s);
        exit(4);
    }

    if (send(s, buf, sizeof(buf), 0) < 0)
    {
        perror("send");
        close(s);
        exit(5);
    }

    if (recv(s, buf, sizeof(buf), 0) < 0)
    {
        perror("recv");  /* original called undefined tcperror() */
        close(s);
        exit(6);
    }

    close(s);

    printf("Client Ended Successfully\n");
    exit(0);
}

上面大致流程,首先创建客户端socket,这个实现在另一个博客中已经讲解,目前还有connect函数还没分析,这个函数负责客户端与服务器的连接,如果该函数没有执行成功,那么服务器就无法接收到连接请求,并建立它们的通信桥梁。

内核的连接函数:

最后也是走到sys_socketcall函数中。



asmlinkage long sys_socketcall(int call, unsigned long *args)
{
......
	
	switch(call) 
	{
......
		case SYS_CONNECT:
			err = sys_connect(a0, (struct sockaddr *)a1, a[2]);
			break;
	......
		default:
			err = -EINVAL;
			break;
	}
	return err;
}

sys_connect 的三个参数依次为:socket 文件描述符、指向服务器地址的指针,以及地址长度。

sys_socketcall=> sys_connect

/* connect(2) system-call body: look up the socket behind fd, copy the
 * user-space sockaddr into the kernel, then dispatch to the protocol
 * family's connect operation (inet_stream_connect for INET/stream). */
asmlinkage long sys_connect(int fd, struct sockaddr *uservaddr, int addrlen)
{
	struct socket *sock;
	char address[MAX_SOCK_ADDR];
	int err;

	sock = sockfd_lookup(fd, &err);/* find the client's socket by fd */
	if (!sock)
		goto out;
	err = move_addr_to_kernel(uservaddr, addrlen, address);/* copy the address into kernel space */
	if (err < 0)
		goto out_put;
	err = sock->ops->connect(sock, (struct sockaddr *) address, addrlen,
				 sock->file->f_flags);/* call the protocol-specific connect function */
out_put:
	sockfd_put(sock);/* drop the reference taken by sockfd_lookup */
out:
	return err;
}

重点为sock->ops->connect,inet协议族的话对应的是inet_stream_ops,因此为inet_stream_connect函数

struct proto_ops inet_stream_ops = {
......
	connect:	inet_stream_connect,
......
};

sys_socketcall=> sys_connect=>inet_stream_connect

/* AF_INET stream connect: drive the socket from SS_UNCONNECTED through
 * SS_CONNECTING to SS_CONNECTED, starting the TCP handshake via
 * sk->prot->connect() and (for blocking sockets) sleeping until it
 * completes, fails, or times out. */
int inet_stream_connect(struct socket *sock, struct sockaddr * uaddr,
			int addr_len, int flags)
{
	struct sock *sk=sock->sk;
	int err;
	long timeo;

	lock_sock(sk);/* lock the sock; sleep if another process holds it */

	if (uaddr->sa_family == AF_UNSPEC) {/* AF_UNSPEC means "disconnect" */
		err = sk->prot->disconnect(sk, flags);
		sock->state = err ? SS_DISCONNECTING : SS_UNCONNECTED;
		goto out;
	}

	switch (sock->state) {/* act on the client socket's current state */
	default:
		err = -EINVAL;
		goto out;
	case SS_CONNECTED:/* already connected: return -EISCONN */
		err = -EISCONN;
		goto out;
	case SS_CONNECTING:/* connect already in progress: fall through and wait */
		err = -EALREADY;
		/* Fall out of switch with err, set for this state */
		break;
	case SS_UNCONNECTED:/* first connect: initiate the handshake */
		err = -EISCONN;
		if (sk->state != TCP_CLOSE) 
			goto out;

		err = -EAGAIN;
		if (sk->num == 0) {
			if (sk->prot->get_port(sk, 0) != 0)/* no local port bound yet: let the kernel auto-assign an unused (ephemeral) one */
				goto out;
			sk->sport = htons(sk->num);
		}

		err = sk->prot->connect(sk, uaddr, addr_len);/* tcp_v4_connect: build and send the SYN */
		if (err < 0)
			goto out;

  		sock->state = SS_CONNECTING;/* mark connection in progress */

		/* Just entered SS_CONNECTING state; the only
		 * difference is that return value in non-blocking
		 * case is EINPROGRESS, rather than EALREADY.
		 */
		err = -EINPROGRESS;
		break;
	}

	timeo = sock_sndtimeo(sk, flags&O_NONBLOCK);/* send timeout; 0 for non-blocking sockets */

	if ((1<<sk->state)&(TCPF_SYN_SENT|TCPF_SYN_RECV)) {
		/* Error code is set above */
		if (!timeo || !inet_wait_for_connect(sk, timeo))/* sleep until the handshake finishes or the timeout expires */
			goto out;

		err = sock_intr_errno(timeo);/* pick the errno for the interrupted case */
		if (signal_pending(current))/* woken by a signal rather than by the handshake */
			goto out;
	}

	/* Connection was closed by RST, timeout, ICMP error
	 * or another process disconnected us.
	 */
	if (sk->state == TCP_CLOSE)
		goto sock_error;

	/* sk->err may be not zero now, if RECVERR was ordered by user
	 * and error was received after socket entered established state.
	 * Hence, it is handled normally after connect() return successfully.
	 */

	sock->state = SS_CONNECTED;/* handshake done: mark the client socket connected */
	err = 0;
out:
	release_sock(sk);/* unlock and wake any other process waiting on the sock lock */
	return err;

sock_error:
	err = sock_error(sk) ? : -ECONNABORTED;
	sock->state = SS_UNCONNECTED;
	if (sk->prot->disconnect(sk, flags))/* tear the connection state down */
		sock->state = SS_DISCONNECTING;
	goto out;
}

这个函数先对客户端sock结构加锁,然后判断服务器地址是否属于AF_UNSPEC(未确定协议族),如果相同就执行tcp_prot结构中的disconnect函数(sk->prot 挂入的是tcp_prot结构),被设置为tcp_disconnect,这个函数会断开连接、复位socket的相关结构,这个过程我们不关心,但是由此可以看出connect函数也可以用来切断连接。
上面的switch语句根据客户端sock的状态来执行相关的语句,客户端在创建过程中调用inet_create函数将sock状态设置为未连接状态SS_UNCONNECTED。这里的例子是第一次连接,所以执行SS_UNCONNECTED分支的代码,由于系统调用函数可以被中断和其他进程抢占,这里还要检查下客户端sock结构有没有被关闭。如果没关闭,执行connect函数,实际是tcp_prot的connect函数

struct proto tcp_prot = {
	name:		"TCP",
	close:		tcp_close,
	connect:	tcp_v4_connect,
	disconnect:	tcp_disconnect,
......
};

sys_socketcall=> sys_connect=>inet_stream_connect=>tcp_v4_connect


/* This will initiate an outgoing connection. */
int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
{
	struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
	struct sockaddr_in *usin = (struct sockaddr_in *) uaddr;
	struct sk_buff *buff;
	struct rtable *rt;
	u32 daddr, nexthop;
	int tmp;
	int err;

	if (addr_len < sizeof(struct sockaddr_in))//地址长度是否相符
		return(-EINVAL);

	if (usin->sin_family != AF_INET)//是否属于INET协议族
		return(-EAFNOSUPPORT);

	nexthop = daddr = usin->sin_addr.s_addr;//记录服务器IP地址
	if (sk->protinfo.af_inet.opt && sk->protinfo.af_inet.opt->srr) {//是否设置了IP选项结构,并指定了源路由
		if (daddr == 0)
			return -EINVAL;
		nexthop = sk->protinfo.af_inet.opt->faddr;//跳转地址为转发地址
	}
    //查找路由表
	tmp = ip_route_connect(&rt, nexthop, sk->saddr,
			       RT_TOS(sk->protinfo.af_inet.tos)|RTO_CONN|sk->localroute, sk->bound_dev_if);
	if (tmp < 0)
		return tmp;
    //路由表是组播或者广播类型就放弃使用
	if (rt->rt_flags&(RTCF_MULTICAST|RTCF_BROADCAST)) {
		ip_rt_put(rt);
		return -ENETUNREACH;
	}

	__sk_dst_set(sk, &rt->u.dst);
    //检查ip选项
	if (!sk->protinfo.af_inet.opt || !sk->protinfo.af_inet.opt->srr)
		daddr = rt->rt_dst;//使用路由表中的地址作为目标地址

	err = -ENOBUFS;
	buff = alloc_skb(MAX_TCP_HEADER + 15, GFP_KERNEL);

	if (buff == NULL)
		goto failure;

	if (!sk->saddr)//没有指定源地址
		sk->saddr = rt->rt_src;//使用路由表中的地址作为源地址
	sk->rcv_saddr = sk->saddr;//接收地址和源地址相同

	if (tp->ts_recent_stamp && sk->daddr != daddr) {//接收过但地址已经改变,需要复位
		/* Reset inherited state */
		tp->ts_recent = 0;
		tp->ts_recent_stamp = 0;
		tp->write_seq = 0;
	}

	if (sysctl_tcp_tw_recycle &&
	    !tp->ts_recent_stamp &&
	    rt->rt_dst == daddr) {//接收过且地址未改变
		struct inet_peer *peer = rt_get_peer(rt);//获取对方信息

		/* VJ's idea. We save last timestamp seen from
		 * the destination in peer table, when entering state TIME-WAIT
		 * and initialize ts_recent from it, when trying new connection.
		 */

		if (peer && peer->tcp_ts_stamp + TCP_PAWS_MSL >= xtime.tv_sec) {//调整时间戳
			tp->ts_recent_stamp = peer->tcp_ts_stamp;
			tp->ts_recent = peer->tcp_ts;
		}
	}

	sk->dport = usin->sin_port;//记录指定的端口
	sk->daddr = daddr;//记录路由表的目的地址

	if (!tp->write_seq)
		tp->write_seq = secure_tcp_sequence_number(sk->saddr, sk->daddr,
							   sk->sport, usin->sin_port);

	tp->ext_header_len = 0;//初始化网络层头部
	if (sk->protinfo.af_inet.opt)
		tp->ext_header_len = sk->protinfo.af_inet.opt->optlen;//记录ip选项规定长度

	tp->mss_clamp = 536;//设置MSS最大分段值

	err = tcp_connect(sk, buff);//发送syn数据包
	if (err == 0)
		return 0;

failure:
	__sk_dst_reset(sk);
	sk->dport = 0;
	return err;
}

sys_socketcall=> sys_connect=>inet_stream_connect=>tcp_v4_connect=>tcp_connect


/* Build the SYN segment in 'buff', initialize the connection's window,
 * MSS and sequence bookkeeping, move the sock to TCP_SYN_SENT, queue
 * and transmit the SYN, and arm the retransmit timer. */
int tcp_connect(struct sock *sk, struct sk_buff *buff)
{
	struct dst_entry *dst = __sk_dst_get(sk);
	struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);

	/* Reserve space for headers. */
	skb_reserve(buff, MAX_TCP_HEADER);/* leave headroom for TCP/IP/link headers */

	/* We'll fix this up when we get a response from the other end.
	 * See tcp_input.c:tcp_rcv_state_process case TCP_SYN_SENT.
	 */
	tp->tcp_header_len = sizeof(struct tcphdr) +
		(sysctl_tcp_timestamps ? TCPOLEN_TSTAMP_ALIGNED : 0);/* header length, including timestamp option if enabled */

	/* If user gave his TCP_MAXSEG, record it to clamp */
	if (tp->user_mss)
		tp->mss_clamp = tp->user_mss;/* honor the user-specified MSS */
	tp->max_window = 0;
	tcp_sync_mss(sk, dst->pmtu);/* derive the send MSS from the path MTU */

	if (!tp->window_clamp)
		tp->window_clamp = dst->window;
	tp->advmss = dst->advmss;
	tcp_initialize_rcv_mss(sk);/* seed the receive-side MSS estimate */

	tcp_select_initial_window(tcp_full_space(sk),
				  tp->advmss - (tp->ts_recent_stamp ? tp->tcp_header_len - sizeof(struct tcphdr) : 0),
				  &tp->rcv_wnd,
				  &tp->window_clamp,
				  sysctl_tcp_window_scaling,
				  &tp->rcv_wscale);/* choose the initial window and window-scale factor */

	tp->rcv_ssthresh = tp->rcv_wnd;/* start the receive-window growth threshold at the current window */

	/* Socket identity change complete, no longer
	 * in TCP_CLOSE, so enter ourselves into the
	 * hash tables.
	 */
	tcp_set_state(sk,TCP_SYN_SENT);/* move the sock into SYN_SENT */
	if (tp->af_specific->hash_connecting(sk))
		goto err_out;

	sk->err = 0;
	sk->done = 0;
	tp->snd_wnd = 0;/* peer's advertised window (unknown yet) */
	tcp_init_wl(tp, tp->write_seq, 0);/* record the sequence used for window updates */
	tp->snd_una = tp->write_seq;/* oldest unacknowledged sequence */
	tp->snd_sml = tp->write_seq;/* last byte of the most recent small segment */
	tp->rcv_nxt = 0;/* next sequence expected from the peer */
	tp->rcv_wup = 0;/* rcv_nxt at the last window update */
	tp->copied_seq = 0;/* head of data not yet read by the user */

	tp->rto = TCP_TIMEOUT_INIT;/* initial retransmission timeout */
	tp->retransmits = 0;/* retransmission counter */
	tcp_clear_retrans(tp);/* reset retransmit bookkeeping */

	TCP_SKB_CB(buff)->flags = TCPCB_FLAG_SYN;/* this segment carries SYN */
	TCP_ECN_send_syn(tp, buff);
	TCP_SKB_CB(buff)->sacked = 0;
	buff->csum = 0;
	TCP_SKB_CB(buff)->seq = tp->write_seq++;/* SYN consumes one sequence number */
	TCP_SKB_CB(buff)->end_seq = tp->write_seq;
	tp->snd_nxt = tp->write_seq;/* next sequence to send */
	tp->pushed_seq = tp->write_seq;

	/* Send it off. */
	TCP_SKB_CB(buff)->when = tcp_time_stamp;/* timestamp for RTT measurement */
	tp->retrans_stamp = TCP_SKB_CB(buff)->when;
	__skb_queue_tail(&sk->write_queue, buff);/* keep the SYN on the write queue for retransmit */
	tcp_charge_skb(sk, buff);/* charge the skb against the sock's write memory */
	tp->packets_out++;/* one more packet in flight */
	tcp_transmit_skb(sk, skb_clone(buff, GFP_KERNEL));/* transmit a clone; the original stays queued */
	TCP_INC_STATS(TcpActiveOpens);

	/* Timer for repeating the SYN until an answer. */
	tcp_reset_xmit_timer(sk, TCP_TIME_RETRANS, tp->rto);/* arm the SYN retransmit timer */
	return 0;

err_out:
	tcp_set_state(sk,TCP_CLOSE);
	kfree_skb(buff);
	return -EADDRNOTAVAIL;
}

这里的路由项dst结构体指针,重点是input、output函数指针。我们看下其定义:

/* Protocol-independent routing cache entry ("destination entry").
 * The key members for the transmit path are the input/output function
 * pointers and the cached link-layer header (hh). */
struct dst_entry
{
	struct dst_entry        *next;
	atomic_t		__refcnt;	/* client references (reference count) */
	int			__use;
	struct net_device       *dev;
	int			obsolete;
	int			flags;
#define DST_HOST		1
	unsigned long		lastuse;
	unsigned long		expires;	/* expiry time of this entry */

	unsigned		mxlock;
	unsigned		pmtu;		/* path MTU */
	unsigned		window;
	unsigned		rtt;
	unsigned		rttvar;
	unsigned		ssthresh;
	unsigned		cwnd;
	unsigned		advmss;
	unsigned		reordering;

	unsigned long		rate_last;	/* rate limiting for ICMP */
	unsigned long		rate_tokens;

	int			error;

	struct neighbour	*neighbour;	/* neighbour (ARP) entry */
	struct hh_cache		*hh;		/* cached link-layer header */

	int			(*input)(struct sk_buff*);	/* receive function */
	int			(*output)(struct sk_buff*);	/* transmit function */

#ifdef CONFIG_NET_CLS_ROUTE
	__u32			tclassid;
#endif

	struct  dst_ops	        *ops;	/* per-family operations table */
		
	char			info[0];
};

sys_socketcall=> sys_connect=>inet_stream_connect=>tcp_v4_connect=>tcp_connect=>tcp_sync_mss

/* Recompute the cached send MSS from the given path MTU, clamping it
 * by mss_clamp, subtracting IP/TCP option overhead, and bounding it by
 * half the largest window seen.  Returns the resulting MSS. */
int tcp_sync_mss(struct sock *sk, u32 pmtu)
{
	struct tcp_opt *tp = &sk->tp_pinfo.af_tcp;
	int mss_now;

	/* Calculate base mss without TCP options:
	   It is MMS_S - sizeof(tcphdr) of rfc1122
	 */

	mss_now = pmtu - tp->af_specific->net_header_len - sizeof(struct tcphdr);/* base MSS derived from the MTU */

	/* Clamp it (mss_clamp does not include tcp options) */
	if (mss_now > tp->mss_clamp)
		mss_now = tp->mss_clamp;

	/* Now subtract optional transport overhead */
	mss_now -= tp->ext_header_len;

	/* Then reserve room for full set of TCP options and 8 bytes of data */
	if (mss_now < 48)
		mss_now = 48;

	/* Now subtract TCP options size, not including SACKs */
	mss_now -= tp->tcp_header_len - sizeof(struct tcphdr);

	/* Bound mss with half of window */
	if (tp->max_window && mss_now > (tp->max_window>>1))
		mss_now = max((tp->max_window>>1), 68 - tp->tcp_header_len);/* cap the MSS at half the max window */

	/* And store cached results */
	tp->pmtu_cookie = pmtu;/* remember the MTU this MSS was computed for */
	tp->mss_cache = mss_now;/* cache the computed MSS */
	return mss_now;
}

sys_socketcall=> sys_connect=>inet_stream_connect=>tcp_v4_connect=>tcp_connect=>tcp_initialize_rcv_mss

/* Seed the estimate of the peer's sending MSS, bounded by our own MSS
 * values and half the receive window. */
static inline void tcp_initialize_rcv_mss(struct sock *sk)
{
	struct tcp_opt *tp = &sk->tp_pinfo.af_tcp;
	int hint = min(tp->advmss, tp->mss_cache);/* smaller of advertised and cached MSS */

	hint = min(hint, tp->rcv_wnd/2);/* also no larger than half the receive window */
		
	tp->ack.rcv_mss = max(min(hint, TCP_MIN_RCVMSS), TCP_MIN_MSS);/* clamp into [TCP_MIN_MSS, TCP_MIN_RCVMSS] and store */
}

构造发送tcp数据包

sys_socketcall=> sys_connect=>inet_stream_connect=>tcp_v4_connect=>tcp_connect=>tcp_transmit_skb

/* Build the TCP header (and options) in front of the payload in 'skb',
 * checksum it, and hand the finished segment to the network layer via
 * tp->af_specific->queue_xmit (ip_queue_xmit for IPv4). */
int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb)
{
	if(skb != NULL) {
		struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
		struct tcp_skb_cb *tcb = TCP_SKB_CB(skb);
		int tcp_header_size = tp->tcp_header_len;/* base TCP header length */
		struct tcphdr *th;
		int sysctl_flags;
		int err;

#define SYSCTL_FLAG_TSTAMPS	0x1
#define SYSCTL_FLAG_WSCALE	0x2
#define SYSCTL_FLAG_SACK	0x4

		sysctl_flags = 0;
		if (tcb->flags & TCPCB_FLAG_SYN) {/* SYN segments advertise options explicitly */
			tcp_header_size = sizeof(struct tcphdr) + TCPOLEN_MSS;/* header plus the MSS option */
			if(sysctl_tcp_timestamps) {/* timestamps enabled */
				tcp_header_size += TCPOLEN_TSTAMP_ALIGNED;/* room for the timestamp option */
				sysctl_flags |= SYSCTL_FLAG_TSTAMPS;
			}
			if(sysctl_tcp_window_scaling) {/* window scaling enabled */
				tcp_header_size += TCPOLEN_WSCALE_ALIGNED;/* room for the window-scale option */
				sysctl_flags |= SYSCTL_FLAG_WSCALE;
			}
			if(sysctl_tcp_sack) {/* SACK enabled */
				sysctl_flags |= SYSCTL_FLAG_SACK;
				if(!(sysctl_flags & SYSCTL_FLAG_TSTAMPS))
					tcp_header_size += TCPOLEN_SACKPERM_ALIGNED;/* SACK-permitted option when not folded into timestamps */
			}
		} else if (tp->eff_sacks) {
			/* A SACK is 2 pad bytes, a 2 byte header, plus
			 * 2 32-bit sequence numbers for each SACK block.
			 */
			tcp_header_size += (TCPOLEN_SACK_BASE_ALIGNED +
					    (tp->eff_sacks * TCPOLEN_SACK_PERBLOCK));
		}
		th = (struct tcphdr *) skb_push(skb, tcp_header_size);/* make room for the TCP header and get its address */
		skb->h.th = th;/* record the transport-header pointer in the skb */
		skb_set_owner_w(skb, sk);/* tie the skb to the sock; destructor is sock_wfree */

		/* Build TCP header and checksum it. */
		th->source		= sk->sport;/* source port (network byte order) */
		th->dest		= sk->dport;/* destination port */
		th->seq			= htonl(tcb->seq);/* sequence number */
		th->ack_seq		= htonl(tp->rcv_nxt);/* acknowledgment number */
		*(((__u16 *)th) + 6)	= htons(((tcp_header_size >> 2) << 12) | tcb->flags);/* data offset and flag bits in one 16-bit store */
		if (tcb->flags & TCPCB_FLAG_SYN) {/* SYN segment */
			/* RFC1323: The window in SYN & SYN/ACK segments
			 * is never scaled.
			 */
			th->window	= htons(tp->rcv_wnd);/* raw (unscaled) window */
		} else {
			th->window	= htons(tcp_select_window(sk));
		}
		th->check		= 0;/* checksum computed below */
		th->urg_ptr		= 0;/* no urgent data by default */

		if (tp->urg_mode &&
		    between(tp->snd_up, tcb->seq+1, tcb->seq+0xFFFF)) {
			th->urg_ptr		= htons(tp->snd_up-tcb->seq);/* urgent pointer offset */
			th->urg			= 1;/* mark urgent pointer valid */
		}

		if (tcb->flags & TCPCB_FLAG_SYN) {/* SYN: write the full option set chosen above */
			tcp_syn_build_options((__u32 *)(th + 1),
					      tcp_advertise_mss(sk),
					      (sysctl_flags & SYSCTL_FLAG_TSTAMPS),
					      (sysctl_flags & SYSCTL_FLAG_SACK),
					      (sysctl_flags & SYSCTL_FLAG_WSCALE),
					      tp->rcv_wscale,
					      tcb->when,
		      			      tp->ts_recent);
		} else {
			tcp_build_and_update_options((__u32 *)(th + 1),
						     tp, tcb->when);

			TCP_ECN_send(sk, tp, skb, tcp_header_size);/* set ECN bits (CWR/ECE) as needed */
		}
		tp->af_specific->send_check(sk, th, skb->len, skb);/* compute and store the TCP checksum */

		if (tcb->flags & TCPCB_FLAG_ACK)/* segment carries an ACK */
			tcp_event_ack_sent(sk);/* update delayed-ACK state */

		if (skb->len != tcp_header_size)/* segment carries payload */
			tcp_event_data_sent(tp, skb);

		TCP_INC_STATS(TcpOutSegs);/* count the outgoing segment */

		err = tp->af_specific->queue_xmit(skb);/* hand off to the network layer (ip_queue_xmit) */
		if (err <= 0)
			return err;

		tcp_enter_cwr(tp);/* positive return means congestion: enter CWR state */

		/* NET_XMIT_CN is special. It does not guarantee,
		 * that this packet is lost. It tells that device
		 * is about to start to drop packets or already
		 * drops some packets of the same priority and
		 * invokes us to send less aggressively.
		 */
		return err == NET_XMIT_CN ? 0 : err;/* congestion notification is reported as success */
	}
	return -ENOBUFS;
#undef SYSCTL_FLAG_TSTAMPS
#undef SYSCTL_FLAG_WSCALE
#undef SYSCTL_FLAG_SACK
}

        tp->af_specific->send_check(sk, th, skb->len, skb);
连接函数表在tcp_v4_init_sock函数中挂入了ipv4_specific。

    sk->tp_pinfo.af_tcp.af_specific = &ipv4_specific;
 

struct tcp_func ipv4_specific = {
	ip_queue_xmit,
	tcp_v4_send_check,
......
};

最后看err = tp->af_specific->queue_xmit(skb);,根据上面的代码知道调用的函数为ip_queue_xmit,由它来完成发送数据包的任务,这个函数放在后面分析,现在假定发送成功。则进入tcp_enter_cwr函数。

sys_socketcall=> sys_connect=>inet_stream_connect=>tcp_v4_connect=>tcp_connect=>tcp_transmit_skb=>tcp_enter_cwr

/* Enter the congestion-window-reduced (CWR) state if the congestion
 * state machine is not already at or past it. */
static inline void tcp_enter_cwr(struct tcp_opt *tp)
{
	tp->prior_ssthresh = 0;	/* discard any saved ssthresh */
	if (tp->ca_state < TCP_CA_CWR) {
		__tcp_enter_cwr(tp);	/* shrink cwnd/ssthresh */
		tp->ca_state = TCP_CA_CWR;
	}
}

这个函数根据拥塞状态设置拥塞窗口等内容。

进化成ip数据包

sys_socketcall=> sys_connect=>inet_stream_connect=>tcp_v4_connect=>tcp_connect=>tcp_transmit_skb=>ip_queue_xmit


/* Network-layer transmit for locally generated, connection-oriented
 * packets: ensure a valid route, build the IP header in front of the
 * transport payload, then pass through the LOCAL_OUT netfilter hook
 * to ip_queue_xmit2. */
int ip_queue_xmit(struct sk_buff *skb)
{
	struct sock *sk = skb->sk;/* owning sock of this skb */
	struct ip_options *opt = sk->protinfo.af_inet.opt;/* per-socket IP options */
	struct rtable *rt;
	struct iphdr *iph;

	/* Make sure we can route this packet. */
	rt = (struct rtable *)__sk_dst_check(sk, 0);/* validate the cached route and cast to rtable */
	if (rt == NULL) {
		u32 daddr;

		/* Use correct destination address if we have options. */
		daddr = sk->daddr;
		if(opt && opt->srr)/* source routing: next hop is the first route address */
			daddr = opt->faddr;

		/* If this fails, retransmit mechanism of transport layer will
		 * keep trying until route appears or the connection times itself
		 * out.
		 */
		if (ip_route_output(&rt, daddr, sk->saddr,
				    RT_TOS(sk->protinfo.af_inet.tos) | RTO_CONN | sk->localroute,
				    sk->bound_dev_if))/* look up or create a route */
			goto no_route;
		__sk_dst_set(sk, &rt->u.dst);/* cache the new route on the sock */
	}
	skb->dst = dst_clone(&rt->u.dst);/* skb takes its own reference on the route */

	if (opt && opt->is_strictroute && rt->rt_dst != rt->rt_gateway)
		goto no_route;

	/* OK, we know where to send it, allocate and build IP header. */
	iph = (struct iphdr *) skb_push(skb, sizeof(struct iphdr) + (opt ? opt->optlen : 0));/* reserve IP header (+options) space */
	*((__u16 *)iph)	= htons((4 << 12) | (5 << 8) | (sk->protinfo.af_inet.tos & 0xff));/* version 4, ihl 5, TOS in one store */
	iph->tot_len = htons(skb->len);
	iph->frag_off = 0;
	iph->ttl      = sk->protinfo.af_inet.ttl;
	iph->protocol = sk->protocol;
	iph->saddr    = rt->rt_src;/* source address from the route */
	iph->daddr    = rt->rt_dst;/* destination address from the route */
	skb->nh.iph   = iph;/* record the network-header pointer */
	/* Transport layer set skb->h.foo itself. */

	if(opt && opt->optlen) {/* IP options configured */
		iph->ihl += opt->optlen >> 2;/* extend the header length accordingly */
		ip_options_build(skb, opt, sk->daddr, rt, 0);/* write the options into the header */
	}

	return NF_HOOK(PF_INET, NF_IP_LOCAL_OUT, skb, NULL, rt->u.dst.dev,
		       ip_queue_xmit2);/* continue transmission after the LOCAL_OUT hook */

no_route:
	IP_INC_STATS(IpOutNoRoutes);
	kfree_skb(skb);
	return -EHOSTUNREACH;
}

sys_socketcall=> sys_connect=>inet_stream_connect=>tcp_v4_connect=>tcp_connect=>tcp_transmit_skb=>ip_queue_xmit=>ip_queue_xmit2

/* Second half of IP transmit, run after the LOCAL_OUT hook: fix up
 * headroom for the output device, fragment if the packet exceeds the
 * path MTU, set the IP id and checksum, then call the route's output
 * function (ip_output). */
static inline int ip_queue_xmit2(struct sk_buff *skb)
{
	struct sock *sk = skb->sk;
	struct rtable *rt = (struct rtable *)skb->dst;
	struct net_device *dev;
	struct iphdr *iph = skb->nh.iph;

	dev = rt->u.dst.dev;

	/* This can happen when the transport layer has segments queued
	 * with a cached route, and by the time we get here things are
	 * re-routed to a device with a different MTU than the original
	 * device.  Sick, but we must cover it.
	 */
	if (skb_headroom(skb) < dev->hard_header_len && dev->hard_header) {/* not enough headroom: reallocate and copy the original contents into a new skb */
		struct sk_buff *skb2;

		skb2 = skb_realloc_headroom(skb, (dev->hard_header_len + 15) & ~15);
		kfree_skb(skb);/* release the original skb */
		if (skb2 == NULL)
			return -ENOMEM;
		if (sk)
			skb_set_owner_w(skb2, sk);/* attach the new skb to the sock */
		skb = skb2;
		iph = skb->nh.iph;
	}

	if (skb->len > rt->u.dst.pmtu)/* larger than path MTU: needs fragmentation */
		goto fragment;

	if (ip_dont_fragment(sk, &rt->u.dst))
		iph->frag_off |= __constant_htons(IP_DF);/* set Don't Fragment */

	ip_select_ident(iph, &rt->u.dst);/* choose the IP identification field */

	/* Add an IP checksum. */
	ip_send_check(iph);/* compute the header checksum */

	skb->priority = sk->priority;
	return skb->dst->output(skb);/* continue down the stack (ip_output) */

fragment:
	if (ip_dont_fragment(sk, &rt->u.dst)) {
		/* Reject packet ONLY if TCP might fragment
		 * it itself, if were careful enough.
		 */
		iph->frag_off |= __constant_htons(IP_DF);
		NETDEBUG(printk(KERN_DEBUG "sending pkt_too_big to self\n"));

		icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED,
			  htonl(rt->u.dst.pmtu));/* tell ourselves the packet was too big */
		kfree_skb(skb);
		return -EMSGSIZE;
	}
	ip_select_ident(iph, &rt->u.dst);
	return ip_fragment(skb, skb->dst->output);/* split into fragments and send each */
}

return skb->dst->output(skb); 这行代码调用的 output 函数,是在创建路由表项的 ip_route_output_slow 函数中被设置为 ip_output 的。

    rth->u.dst.output=ip_output;
 

sys_socketcall=> sys_connect=>inet_stream_connect=>tcp_v4_connect=>tcp_connect=>tcp_transmit_skb=>ip_queue_xmit=>ip_queue_xmit2=>ip_output

/* Route-output entry point: bump statistics, apply NAT when configured,
 * then continue with ip_finish_output. */
int ip_output(struct sk_buff *skb)
{
#ifdef CONFIG_IP_ROUTE_NAT
	struct rtable *rt = (struct rtable*)skb->dst;
#endif

	IP_INC_STATS(IpOutRequests);	/* count the output request */

#ifdef CONFIG_IP_ROUTE_NAT
	if (rt->rt_flags&RTCF_NAT)
		ip_do_nat(skb);		/* rewrite addresses for NAT routes */
#endif

	return ip_finish_output(skb);
}

sys_socketcall=> sys_connect=>inet_stream_connect=>tcp_v4_connect=>tcp_connect=>tcp_transmit_skb=>ip_queue_xmit=>ip_queue_xmit2=>ip_output=>ip_finish_output

/* Bind the skb to the route's output device, stamp the protocol, and
 * pass through the POST_ROUTING hook to ip_finish_output2. */
__inline__ int ip_finish_output(struct sk_buff *skb)
{
	struct net_device *dev = skb->dst->dev;/* device chosen by the route */

	skb->dev = dev;/* record the output device on the skb */
	skb->protocol = __constant_htons(ETH_P_IP);/* mark as an IP frame */

	return NF_HOOK(PF_INET, NF_IP_POST_ROUTING, skb, NULL, dev,
		       ip_finish_output2);/* continue after the POST_ROUTING hook */
}

sys_socketcall=> sys_connect=>inet_stream_connect=>tcp_v4_connect=>tcp_connect=>tcp_transmit_skb=>ip_queue_xmit=>ip_queue_xmit2=>ip_output=>ip_finish_output=>ip_finish_output2

/* Last IP-layer step: prepend the link-layer header, either from the
 * route's cached header (hh) or by invoking the neighbour's output
 * function, which resolves the hardware address. */
static inline int ip_finish_output2(struct sk_buff *skb)
{
	struct dst_entry *dst = skb->dst;/* route entry for this packet */
	struct hh_cache *hh = dst->hh;/* cached link-layer header, if any */

#ifdef CONFIG_NETFILTER_DEBUG
	nf_debug_ip_finish_output2(skb);
#endif /*CONFIG_NETFILTER_DEBUG*/

	if (hh) {/* a link-layer header is cached */
		read_lock_bh(&hh->hh_lock);
  		memcpy(skb->data - 16, hh->hh_data, 16);/* copy the prebuilt header in front of the data */
		read_unlock_bh(&hh->hh_lock);
	        skb_push(skb, hh->hh_len);
		return hh->hh_output(skb);/* transmit via the cached header's output function */
	} else if (dst->neighbour)/* otherwise, is a neighbour entry attached? */
		return dst->neighbour->output(skb);/* transmit via the neighbour's output function */

	printk(KERN_DEBUG "khm\n");
	kfree_skb(skb);
	return -EINVAL;
}

struct hh_cache
 

/* Cached, prebuilt link-layer (hardware) header for a neighbour. */
struct hh_cache
{
	struct hh_cache *hh_next;	/* Next entry in the neighbour's header-cache list */
	atomic_t	hh_refcnt;	/* number of users (reference count) */
	unsigned short  hh_type;	/* protocol identifier, f.e ETH_P_IP */
	int		hh_len;		/* length of header, in bytes */
	int		(*hh_output)(struct sk_buff *skb);	/* transmit function for frames using this header */
	rwlock_t	hh_lock;
	/* cached hardware header; allow for machine alignment needs.        */
	unsigned long	hh_data[16/sizeof(unsigned long)];
};

如果路由项没有缓存链路层头部,则调用邻居结构的output函数。这个需要结合邻居子系统的arp_constructor函数分析。

        if (dev->hard_header_cache)
            neigh->ops = &arp_hh_ops;
        else
            neigh->ops = &arp_generic_ops;
        if (neigh->nud_state&NUD_VALID)
            neigh->output = neigh->ops->connected_output;
        else
            neigh->output = neigh->ops->output;

/* Per-neighbour-type operations table: how to solicit, report errors,
 * and transmit depending on the neighbour's resolution state. */
struct neigh_ops
{
	int			family;		/* address family (e.g. AF_INET) */
	void			(*destructor)(struct neighbour *);
	void			(*solicit)(struct neighbour *, struct sk_buff*);	/* send a resolution request (ARP) */
	void			(*error_report)(struct neighbour *, struct sk_buff*);
	int			(*output)(struct sk_buff*);		/* transmit when unresolved */
	int			(*connected_output)(struct sk_buff*);	/* transmit when the neighbour is valid */
	int			(*hh_output)(struct sk_buff*);		/* transmit with a cached header */
	int			(*queue_xmit)(struct sk_buff*);		/* final device transmit */
};

/* ARP neighbour ops for devices that support hardware-header caching:
 * unresolved and connected output both go through neigh_resolve_output;
 * cached-header output goes straight to dev_queue_xmit. */
static struct neigh_ops arp_hh_ops =
{
	AF_INET,
	NULL,
	arp_solicit,
	arp_error_report,
	neigh_resolve_output,	/* output */
	neigh_resolve_output,	/* connected_output */
	dev_queue_xmit,		/* hh_output */
	dev_queue_xmit		/* queue_xmit */
};

此时邻居结构是新建的,它的状态不是NUD_VALID,因此邻居结构的发送函数挂入的是neigh_resolve_output,但是从定义上面看connected_output也是neigh_resolve_output。

继续发送:

sys_socketcall=> sys_connect=>inet_stream_connect=>tcp_v4_connect=>tcp_connect=>tcp_transmit_skb=>ip_queue_xmit=>ip_queue_xmit2=>ip_output=>ip_finish_output=>ip_finish_output2=>neigh_resolve_output

/* Transmit a packet through a neighbour that may still need address
 * resolution: if the neighbour is usable, build (and possibly cache)
 * the link-layer header and hand the frame to queue_xmit; otherwise
 * neigh_event_send queues the packet and triggers ARP. */
int neigh_resolve_output(struct sk_buff *skb)
{
	struct dst_entry *dst = skb->dst;/* route entry */
	struct neighbour *neigh;

	if (!dst || !(neigh = dst->neighbour))/* no route or no neighbour: drop */
		goto discard;

	__skb_pull(skb, skb->nh.raw - skb->data);/* trim back to the network header before prepending the link header */

	if (neigh_event_send(neigh, skb) == 0) {/* neighbour usable: continue sending (otherwise skb is queued for ARP) */
		int err;
		struct net_device *dev = neigh->dev;/* neighbour's network device */
		if (dev->hard_header_cache && dst->hh == NULL) {/* device can cache headers but the route has none yet */
			write_lock_bh(&neigh->lock);
			if (dst->hh == NULL)/* re-check under the lock */
				neigh_hh_init(neigh, dst, dst->ops->protocol);/* create and initialize the header cache */
			err = dev->hard_header(skb, dev, ntohs(skb->protocol), neigh->ha, NULL, skb->len);/* build the link-layer header on the skb */
			write_unlock_bh(&neigh->lock);
		} else {
			read_lock_bh(&neigh->lock);
			err = dev->hard_header(skb, dev, ntohs(skb->protocol), neigh->ha, NULL, skb->len);
			read_unlock_bh(&neigh->lock);
		}
		if (err >= 0)
			return neigh->ops->queue_xmit(skb);/* hand to the device layer (dev_queue_xmit) */
		kfree_skb(skb);
		return -EINVAL;
	}
	return 0;

discard:
	NEIGH_PRINTK1("neigh_resolve_output: dst=%p neigh=%p\n", dst, dst ? dst->neighbour : NULL);
	kfree_skb(skb);
	return -EINVAL;
}

进化成以太网数据包

sys_socketcall=> sys_connect=>inet_stream_connect=>tcp_v4_connect=>tcp_connect=>tcp_transmit_skb=>ip_queue_xmit=>ip_queue_xmit2=>ip_output=>ip_finish_output=>ip_finish_output2=>neigh_resolve_output=>neigh_hh_init


/* Find or create the neighbour's cached link-layer header for the given
 * protocol, fill it via the device's hard_header_cache callback, pick
 * its output function based on the neighbour state, and attach it to
 * the route entry. */
static void neigh_hh_init(struct neighbour *n, struct dst_entry *dst, u16 protocol)
{
	struct hh_cache	*hh = NULL;
	struct net_device *dev = dst->dev;/* output device */

	for (hh=n->hh; hh; hh = hh->hh_next)/* reuse an existing cache entry for this protocol */
		if (hh->hh_type == protocol)
			break;

	if (!hh && (hh = kmalloc(sizeof(*hh), GFP_ATOMIC)) != NULL) {/* none found: allocate a new one */
		memset(hh, 0, sizeof(struct hh_cache));
		hh->hh_lock = RW_LOCK_UNLOCKED;
		hh->hh_type = protocol;/* record the protocol */
		atomic_set(&hh->hh_refcnt, 0);/* start with zero references */
		hh->hh_next = NULL;/* not linked yet */
		if (dev->hard_header_cache(n, hh)) {/* let the device fill in the cached header (eth_header_cache) */
			kfree(hh);
			hh = NULL;
		} else {
			atomic_inc(&hh->hh_refcnt);/* reference held by the neighbour's list */
			hh->hh_next = n->hh;/* link at the head of the list */
			n->hh = hh;
			if (n->nud_state&NUD_CONNECTED)/* neighbour already resolved */
				hh->hh_output = n->ops->hh_output;/* fast path (dev_queue_xmit) */
			else
				hh->hh_output = n->ops->output;/* still needs resolution (neigh_resolve_output) */
		}
	}
	if (hh)	{
		atomic_inc(&hh->hh_refcnt);/* reference held by the route entry */
		dst->hh = hh;
	}
}

注意参数protocol传递的是路由项函数表的协议标识。dst_alloc挂入的是ipv4_dst_ops,这个结构体的协议标识为ETH_P_IP

/* IPv4 routing-cache operations table; its protocol field (ETH_P_IP)
 * is what neigh_hh_init receives as the header-cache protocol. */
struct dst_ops ipv4_dst_ops =
{
	AF_INET,
	__constant_htons(ETH_P_IP),	/* protocol identifier */
	0,

	rt_garbage_collect,
	ipv4_dst_check,
	ipv4_dst_reroute,
	ipv4_dst_destroy,
	ipv4_negative_advice,
	ipv4_link_failure,
	sizeof(struct rtable),
};

代码先在邻居结构的缓冲头队列hh_cache中查找,查看是否建立了ETH_P_IP协议的缓冲头部。如果找到了则直接跳过循环并记录到路由项中,如果在队列中没有找到,则分配一个缓冲头部并进行一些初始化操作。具体的初始化操作要结合其他代码来看

    dev->hard_header_cache    = eth_header_cache;
 

sys_socketcall=> sys_connect=>inet_stream_connect=>tcp_v4_connect=>tcp_connect=>tcp_transmit_skb=>ip_queue_xmit=>ip_queue_xmit2=>ip_output=>ip_finish_output=>ip_finish_output2=>neigh_resolve_output=>neigh_hh_init=>eth_header_cache

/* Fill an hh_cache entry with a complete Ethernet header for this
 * neighbour.  Returns 0 on success, -1 for ETH_P_802_3 (length field
 * instead of type, so it cannot be cached). */
int eth_header_cache(struct neighbour *neigh, struct hh_cache *hh)
{
	unsigned short type = hh->hh_type;/* protocol type to cache */
	struct ethhdr *eth = (struct ethhdr*)(((u8*)hh->hh_data) + 2);/* +2 aligns the 14-byte header inside the 16-byte buffer */
	struct net_device *dev = neigh->dev;

	if (type == __constant_htons(ETH_P_802_3))
		return -1;

	eth->h_proto = type;/* record the protocol */
	memcpy(eth->h_source, dev->dev_addr, dev->addr_len);/* device address becomes the source MAC */
	memcpy(eth->h_dest, neigh->ha, dev->addr_len);/* neighbour's hardware address becomes the destination MAC */
	hh->hh_len = ETH_HLEN;/* cached header length */
	return 0;
}

这个函数对链路层头部进行初始化,结构体struct ethhdr 为以太网头部结构。这里将协议类型、网卡设备的地址以及邻居结构的地址记录到头部中。

接下来还要增加头部缓存结构的计数器,链入到邻居结构的缓存头队列中。如果邻居状态为连接状态,则设置缓存头结构的发送函数为dev_queue_xmit,否则为neigh_resolve_output。

回到neigh_resolve_output函数中,接着调用dev->hard_header函数初始化数据包的链路层头部,具体为eth_header。

sys_socketcall=> sys_connect=>inet_stream_connect=>tcp_v4_connect=>tcp_connect=>tcp_transmit_skb=>ip_queue_xmit=>ip_queue_xmit2=>ip_output=>ip_finish_output=>ip_finish_output2=>neigh_resolve_output=>neigh_hh_init=>eth_header_cache=>eth_header

/* On-the-wire Ethernet header layout. */
struct ethhdr 
{
	unsigned char	h_dest[ETH_ALEN];	/* destination eth addr	*/
	unsigned char	h_source[ETH_ALEN];	/* source ether addr	*/
	unsigned short	h_proto;		/* packet type ID field	*/
};


/* Build the Ethernet header at the front of the skb.  Returns the
 * header length on success, or minus the header length when the
 * destination address is still unknown (caller must resolve it). */
int eth_header(struct sk_buff *skb, struct net_device *dev, unsigned short type,
	   void *daddr, void *saddr, unsigned len)
{
	struct ethhdr *eth = (struct ethhdr *)skb_push(skb,ETH_HLEN);/* extend the data area downward to make room for the link-layer header */

	/* 
	 *	Set the protocol type. For a packet of type ETH_P_802_3 we put the length
	 *	in here instead. It is up to the 802.2 layer to carry protocol information.
	 */
	
	if(type!=ETH_P_802_3) 
		eth->h_proto = htons(type);
	else
		eth->h_proto = htons(len);

	/*
	 *	Set the source hardware address. 
	 */
	 
	if(saddr)
		memcpy(eth->h_source,saddr,dev->addr_len);
	else
		memcpy(eth->h_source,dev->dev_addr,dev->addr_len);/* no explicit source: use the device's own address */

	/*
	 *	Anyway, the loopback-device should never use this function... 
	 */

	if (dev->flags & (IFF_LOOPBACK|IFF_NOARP)) /* loopback or no-ARP device */
	{
		memset(eth->h_dest, 0, dev->addr_len);/* destination address left zeroed */
		return(dev->hard_header_len);
	}
	
	if(daddr)
	{
		memcpy(eth->h_dest,daddr,dev->addr_len);/* copy the destination address */
		return dev->hard_header_len;/* return the link-layer header length */
	}
	
	return -dev->hard_header_len;/* negative: destination still unresolved */
}

发送以太网数据包

            return neigh->ops->queue_xmit(skb);
对照arp_hh_ops定义我们知道queue_xmit实际为dev_queue_xmit。继续发送。

/* Device-layer transmit: enqueue on the device's queueing discipline
 * when it has one, otherwise (queueless software devices) call the
 * driver's hard_start_xmit directly under the xmit lock. */
int dev_queue_xmit(struct sk_buff *skb)
{
	struct net_device *dev = skb->dev;/* output device */
	struct Qdisc  *q;

	/* Grab device queue */
	spin_lock_bh(&dev->queue_lock);
	q = dev->qdisc;/* device's queueing discipline */
	if (q->enqueue) {/* qdisc provides an enqueue function */
		int ret = q->enqueue(skb, q);/* queue the packet */

		qdisc_run(dev);/* kick the qdisc to transmit */

		spin_unlock_bh(&dev->queue_lock);
		return ret == NET_XMIT_BYPASS ? NET_XMIT_SUCCESS : ret;
	}

	/* The device has no queue. Common case for software devices:
	   loopback, all the sorts of tunnels...

	   Really, it is unlikely that xmit_lock protection is necessary here.
	   (f.e. loopback and IP tunnels are clean ignoring statistics counters.)
	   However, it is possible, that they rely on protection
	   made by us here.

	   Check this and shot the lock. It is not prone from deadlocks.
	   Either shot noqueue qdisc, it is even simpler 8)
	 */
	if (dev->flags&IFF_UP) {/* device is up: try a direct transmit */
		int cpu = smp_processor_id();

		if (dev->xmit_lock_owner != cpu) {/* avoid recursing on our own xmit lock */
			spin_unlock(&dev->queue_lock);
			spin_lock(&dev->xmit_lock);
			dev->xmit_lock_owner = cpu;

			if (!netif_queue_stopped(dev)) {/* device able to transmit */
				if (netdev_nit)
					dev_queue_xmit_nit(skb,dev);/* deliver a copy to packet taps */

				if (dev->hard_start_xmit(skb, dev) == 0) {/* driver accepted the packet */
					dev->xmit_lock_owner = -1;
					spin_unlock_bh(&dev->xmit_lock);
					return 0;
				}
			}
			dev->xmit_lock_owner = -1;
			spin_unlock_bh(&dev->xmit_lock);
			if (net_ratelimit())
				printk(KERN_DEBUG "Virtual device %s asks to queue packet!\n", dev->name);
			kfree_skb(skb);
			return -ENETDOWN;
		} else {
			/* Recursion is detected! It is possible, unfortunately */
			if (net_ratelimit())
				printk(KERN_DEBUG "Dead loop on virtual device %s, fix it urgently!\n", dev->name);
		}
	}
	spin_unlock_bh(&dev->queue_lock);

	kfree_skb(skb);
	return -ENETDOWN;
}

重点关注这个函数dev->hard_start_xmit 。由具体的网卡设备决定执行的是什么函数,比如cs890x系列:

    dev->hard_start_xmit     = net_send_packet;
 


/* cs89x0 driver hard_start_xmit: program the chip with the transmit
 * command and frame length, verify it has buffer space, copy the frame
 * into the chip's transmit port, and free the skb.  Returns 1 to ask
 * the stack to requeue when the chip isn't ready. */
static int net_send_packet(struct sk_buff *skb, struct net_device *dev)
{
	struct net_local *lp = (struct net_local *)dev->priv;

	if (net_debug > 3) {
		printk("%s: sent %d byte packet of type %x\n",
			dev->name, skb->len,
			(skb->data[ETH_ALEN+ETH_ALEN] << 8) | skb->data[ETH_ALEN+ETH_ALEN+1]);
	}

	/* keep the upload from being interrupted, since we
                  ask the chip to start transmitting before the
                  whole packet has been completely uploaded. */

	spin_lock_irq(&lp->lock);
	netif_stop_queue(dev);	/* hold further packets until TX completes */

	/* initiate a transmit sequence */
	writeword(dev, TX_CMD_PORT, lp->send_cmd);
	writeword(dev, TX_LEN_PORT, skb->len);	/* tell the chip the frame length */

	/* Test to see if the chip has allocated memory for the packet */
	if ((readreg(dev, PP_BusST) & READY_FOR_TX_NOW) == 0) {
		/*
		 * Gasp!  It hasn't.  But that shouldn't happen since
		 * we're waiting for TxOk, so return 1 and requeue this packet.
		 */
		
		spin_unlock_irq(&lp->lock);
		if (net_debug) printk("cs89x0: Tx buffer not free!\n");
		return 1;
	}
	/* Write the contents of the packet */
	outsw(dev->base_addr + TX_FRAME_PORT,skb->data,(skb->len+1) >>1);	/* copy the frame to the chip, one word at a time */
	spin_unlock_irq(&lp->lock);
	dev->trans_start = jiffies;	/* record the transmit start time */
	dev_kfree_skb (skb);	/* the data is on the chip now; free the skb */

	/*
	 * We DO NOT call netif_wake_queue() here.
	 * We also DO NOT call netif_start_queue().
	 *
	 * Either of these would cause another bottom half run through
	 * net_send_packet() before this packet has fully gone out.  That causes
	 * us to hit the "Gasp!" above and the send is rescheduled.  it runs like
	 * a dog.  We just return and wait for the Tx completion interrupt handler
	 * to restart the netdevice layer
	 */

	return 0;
}

大致的流程到这里就分析完了

  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值