我们接着昨天分析到的tcp_connect()函数继续
为了阅读方便再次把函数贴出来
/* Build and send the initial SYN for an active open, then arm the
 * retransmission timer.  Returns 0 on success or -ENOBUFS if no skb
 * could be allocated. */
int tcp_connect(struct sock *sk)
{
	struct tcp_sock *tp = tcp_sk(sk);
	struct sk_buff *buff;

	/* Initialise MSS, window and sequence state before building the SYN. */
	tcp_connect_init(sk);

	/* NOTE(review): the "+ 15" presumably pads for alignment of the
	 * headroom — confirm against __alloc_skb(). */
	buff = alloc_skb_fclone(MAX_TCP_HEADER + 15, sk->sk_allocation);
	if (unlikely(buff == NULL))
		return -ENOBUFS;

	/* Reserve space for headers. */
	skb_reserve(buff, MAX_TCP_HEADER);

	tp->snd_nxt = tp->write_seq;
	/* The SYN consumes one sequence number, hence write_seq++. */
	tcp_init_nondata_skb(buff, tp->write_seq++, TCPCB_FLAG_SYN);
	TCP_ECN_send_syn(sk, buff);

	/* Send it off. */
	TCP_SKB_CB(buff)->when = tcp_time_stamp;
	/* Remember when the first SYN went out, for RTO/stamp bookkeeping. */
	tp->retrans_stamp = TCP_SKB_CB(buff)->when;
	skb_header_release(buff);
	/* Queue the SYN on the write queue and charge its memory to the socket. */
	__tcp_add_write_queue_tail(sk, buff);
	sk->sk_wmem_queued += buff->truesize;
	sk_mem_charge(sk, buff->truesize);
	/* Account the SYN as an in-flight packet. */
	tp->packets_out += tcp_skb_pcount(buff);
	tcp_transmit_skb(sk, buff, 1, GFP_KERNEL);

	/* We change tp->snd_nxt after the tcp_transmit_skb() call
	 * in order to make this packet get counted in tcpOutSegs.
	 */
	tp->snd_nxt = tp->write_seq;
	tp->pushed_seq = tp->write_seq;
	TCP_INC_STATS(TCP_MIB_ACTIVEOPENS);

	/* Timer for repeating the SYN until an answer. */
	inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
				  inet_csk(sk)->icsk_rto, TCP_RTO_MAX);
	return 0;
}
函数中首先是调用了tcp_connect_init()来初始化tcp的sock结构
/* Do all connect socket setups that can be done AF-independently:
 * header-length estimate, MSS/MTU probing state, receive window and
 * scale selection, and initial sequence-number bookkeeping. */
static void tcp_connect_init(struct sock *sk)
{
	struct dst_entry *dst = __sk_dst_get(sk);
	struct tcp_sock *tp = tcp_sk(sk);
	__u8 rcv_wscale;

	/* We'll fix this up when we get a response from the other end.
	 * See tcp_input.c:tcp_rcv_state_process case TCP_SYN_SENT.
	 */
	tp->tcp_header_len = sizeof(struct tcphdr) +
		(sysctl_tcp_timestamps ? TCPOLEN_TSTAMP_ALIGNED : 0);

#ifdef CONFIG_TCP_MD5SIG
	/* An MD5 signature option enlarges every TCP header we send. */
	if (tp->af_specific->md5_lookup(sk, sk) != NULL)
		tp->tcp_header_len += TCPOLEN_MD5SIG_ALIGNED;
#endif

	/* If user gave his TCP_MAXSEG, record it to clamp */
	if (tp->rx_opt.user_mss)
		tp->rx_opt.mss_clamp = tp->rx_opt.user_mss;
	/* No window has been seen from the peer yet. */
	tp->max_window = 0;
	tcp_mtup_init(sk);
	/* Derive the initial MSS from the route's path MTU. */
	tcp_sync_mss(sk, dst_mtu(dst));

	/* Fall back to the route metrics when the user set no clamp. */
	if (!tp->window_clamp)
		tp->window_clamp = dst_metric(dst, RTAX_WINDOW);
	tp->advmss = dst_metric(dst, RTAX_ADVMSS);
	tcp_initialize_rcv_mss(sk);

	/* Pick the initial receive window and window-scale factor; the
	 * second argument deducts the timestamp option overhead when
	 * timestamps were seen before (ts_recent_stamp set). */
	tcp_select_initial_window(tcp_full_space(sk),
				  tp->advmss - (tp->rx_opt.ts_recent_stamp ? tp->tcp_header_len -sizeof(struct tcphdr) : 0),
				  &tp->rcv_wnd,
				  &tp->window_clamp,
				  sysctl_tcp_window_scaling,
				  &rcv_wscale);

	tp->rx_opt.rcv_wscale = rcv_wscale;
	tp->rcv_ssthresh = tp->rcv_wnd;

	/* Reset per-connection state for a fresh handshake. */
	sk->sk_err = 0;
	sock_reset_flag(sk, SOCK_DONE);
	tp->snd_wnd = 0;
	tcp_init_wl(tp, tp->write_seq, 0);
	tp->snd_una = tp->write_seq;
	tp->snd_sml = tp->write_seq;
	tp->rcv_nxt = 0;
	tp->rcv_wup = 0;
	tp->copied_seq = 0;

	/* Start with the default RTO; no retransmissions have happened yet. */
	inet_csk(sk)->icsk_rto = TCP_TIMEOUT_INIT;
	inet_csk(sk)->icsk_retransmits = 0;
	tcp_clear_retrans(tp);
}
代码中首先是struct dst_entry结构
/* Protocol-independent destination cache entry — the result of a route
 * lookup, cached on the socket (sk->sk_dst_cache) and consulted for
 * per-route metrics such as MTU, window and advertised MSS. */
struct dst_entry {
	struct rcu_head		rcu_head;	/* for deferred (RCU) freeing */
	struct dst_entry	*child;
	struct net_device	*dev;		/* output device */
	short			error;
	short			obsolete;
	int			flags;
#define DST_HOST	1
#define DST_NOXFRM	2
#define DST_NOPOLICY	4
#define DST_NOHASH	8
	unsigned long		expires;

	unsigned short		header_len;	/* more space at head required */
	unsigned short		trailer_len;	/* space to reserve at tail */

	unsigned int		rate_tokens;
	unsigned long		rate_last;	/* rate limiting for ICMP */

	struct dst_entry	*path;

	struct neighbour	*neighbour;	/* next-hop neighbour entry */
	struct hh_cache		*hh;		/* cached hardware header */
	struct xfrm_state	*xfrm;

	int			(*input)(struct sk_buff*);
	int			(*output)(struct sk_buff*);

	struct dst_ops		*ops;

	/* Per-route metrics, indexed via dst_metric() with RTAX_* ids. */
	u32			metrics[RTAX_MAX];

#ifdef CONFIG_NET_CLS_ROUTE
	__u32			tclassid;
#endif

	/*
	 * __refcnt wants to be on a different cache line from
	 * input/output/ops or performance tanks badly
	 */
	atomic_t		__refcnt;	/* client references */
	int			__use;
	unsigned long		lastuse;
	union {
		struct dst_entry *next;
		struct rtable	 *rt_next;
		struct rt6_info	 *rt6_next;
		struct dn_route	 *dn_next;
	};
};
这个结构体是专门用于路由目的而使用的,所以代码中首先是从sock的sk_dst_cache得到这个结构指针。然后是确定tcp_header_len的长度,接着是调用tcp_mtup_init()函数对inet_connection_sock结构进行一些设置。
/* Initialise the MTU-probing state kept in inet_connection_sock:
 * the probing on/off flag and the [search_low, search_high] MTU range. */
void tcp_mtup_init(struct sock *sk)
{
	struct tcp_sock *tp = tcp_sk(sk);
	struct inet_connection_sock *icsk = inet_csk(sk);

	/* Probing is enabled only when the sysctl is set above 1. */
	icsk->icsk_mtup.enabled = sysctl_tcp_mtu_probing > 1;
	/* Upper search bound: the largest MTU the clamped MSS could imply. */
	icsk->icsk_mtup.search_high = tp->rx_opt.mss_clamp + sizeof(struct tcphdr) +
				      icsk->icsk_af_ops->net_header_len;
	/* Lower search bound: MTU corresponding to the base MSS sysctl. */
	icsk->icsk_mtup.search_low = tcp_mss_to_mtu(sk, sysctl_tcp_base_mss);
	icsk->icsk_mtup.probe_size = 0;
}
接着是同步mss,我们说过mss是最大分段值,即Maximum Segment Size,上一节在http://blog.chinaunix.net/u2/64681/showart_1411408.html 那章中我们曾经看到过。相关mss的介绍网上很多。推荐朋友们阅读http://www.net130.com/CMS/Pub/network/network_protocal/2005_09_22_97176.htm 关于其实现原理,找到一段论坛的文字节摘于此:出处是http://forum.h3c.com/viewthread.php?tid=35493
关TCP MSS值 中低端路由器tcp mss的实现原理为何经常要在内网口和外网口都要配置tcp mss 值呢? PC1(192.168.0.1)―――Router――――Internet―――-www server(238.135.1.1) 建立tcp连接的两端在三次握手时会协商tcp mss大小,具体如下: pc1发出syn报文,其中option选项填充的mss字段一般为1460,同样www server收到syn报文后,会发送syn+ack报文应答,option选项填充的mss字段也为1460;协商双方会比较syn和syn+ack报文中mss字段大小,选择较小的mss作为发送tcp分片的大小。通过比较,协商双方的tcp mss都是1460。 对于涉及mpls l3vpn、pppoe+nat、ipsec、l2tp、gre等组网,通常由于报文太大需要分片,一般可以通过设置tcp mss解决。 针对上例说明tcp mss如何实现 1、 假设在路由器内网口配置tcp mss 1200 a) 路由器收到www server的syn+ack报文时会修改option选项中的mss字段为1200,然后再转发给PC1,PC1收到报文后认为对端的tcp mss为1200,这样PC1发送数据给www server时会以1200作为分片大小;但路由器修改tcp mss为1200的操作www server是不知道的,因此www server还会以1460作为分片大小发送报文。 2、 假设再路由器外网口配置tcp mss 1200 a) 路由器收到PC1的syn报文时会修改option选项中的mss字段为1200,然后再转发给www server,同样www server发送数据给PC1时会以1200作为分片大小;同样PC1不知道路由器修改tcp mss为1200,因为PC1还会以1460作为分片大小发送报文。 3、 因此在实现双向大包传输时需要在内外网同时修改tcp mss 综上所述:在路由器接口上配置的tcp mss命令仅对出接口方向的syn报文和syn+ack报文有效,对于入接口方向的syn和syn+ack报文无效。 |
我们看到其进入了
/* Recompute and cache the MSS for the given path MTU (pmtu).
 * Stores the MTU in icsk_pmtu_cookie and the resulting MSS in
 * tp->mss_cache, and returns that MSS. */
unsigned int tcp_sync_mss(struct sock *sk, u32 pmtu)
{
	struct tcp_sock *tp = tcp_sk(sk);
	struct inet_connection_sock *icsk = inet_csk(sk);
	int mss_now;

	/* The MTU-probe search ceiling can never exceed the path MTU. */
	if (icsk->icsk_mtup.search_high > pmtu)
		icsk->icsk_mtup.search_high = pmtu;

	mss_now = tcp_mtu_to_mss(sk, pmtu);
	/* Keep segments no larger than half the peer's biggest window. */
	mss_now = tcp_bound_to_half_wnd(tp, mss_now);

	/* And store cached results */
	icsk->icsk_pmtu_cookie = pmtu;
	/* While probing, never send above the current probe floor. */
	if (icsk->icsk_mtup.enabled)
		mss_now = min(mss_now, tcp_mtu_to_mss(sk, icsk->icsk_mtup.search_low));
	tp->mss_cache = mss_now;

	return mss_now;
}
在这个函数中,首先是调用了
/* Convert a path MTU into the corresponding MSS, accounting for the
 * network header, the user's MSS clamp, extension-header overhead and
 * the TCP options this connection sends in every segment. */
int tcp_mtu_to_mss(struct sock *sk, int pmtu)
{
	struct inet_connection_sock *icsk = inet_csk(sk);
	struct tcp_sock *tp = tcp_sk(sk);
	int mss;

	/* Base MSS without TCP options: MMS_S - sizeof(tcphdr), per rfc1122. */
	mss = pmtu - icsk->icsk_af_ops->net_header_len - sizeof(struct tcphdr);

	/* Apply the user/route clamp (mss_clamp excludes tcp options). */
	if (mss > tp->rx_opt.mss_clamp)
		mss = tp->rx_opt.mss_clamp;

	/* Remove optional transport overhead. */
	mss -= icsk->icsk_ext_hdr_len;

	/* Floor: keep room for a full set of TCP options plus 8 data bytes. */
	if (mss < 48)
		mss = 48;

	/* Finally deduct the TCP options size, not counting SACK blocks. */
	mss -= tp->tcp_header_len - sizeof(struct tcphdr);

	return mss;
}
我们看到在进入这个函数时,参数pmtu是层层从tcp_connect_init()函数中传递下来的,我们看到在那里是dst_mtu(dst),如果朋友们阅读了我上边提到的mss的资料会有涉及到mtu的概念,MTU: Maximum Transmission Unit 最大传输单元,所以确定了mtu就作为参数传递到了上面的tcp_mtu_to_mss()函数中。我们看到在函数中要根据mtu计算出一个合适的mss。然后回到tcp_sync_mss函数中将计算后的mss传递给tcp_bound_to_half_wnd()函数
/* Bound a packet size to half of the largest window the peer has ever
 * advertised, but never below the minimum of 68 bytes minus the TCP
 * header length. */
static int tcp_bound_to_half_wnd(struct tcp_sock *tp, int pktsize)
{
	/* No window seen from the peer yet: nothing to bound against. */
	if (!tp->max_window)
		return pktsize;

	/* Sizes up to half the maximum window pass through unchanged.
	 * (Comparison is against u32, matching the original promotion.) */
	if (pktsize <= (tp->max_window >> 1))
		return pktsize;

	return max(tp->max_window >> 1, 68U - tp->tcp_header_len);
}
结合tcp的最大窗口一半大小来进一步确定mss值,然后在tcp_sync_mss()函数中将这个计算到的mss值保存到tcp的sock结构的mss_cache中。同时也要将计算到的mtu值保存到inet_connection_sock结构中。我们在分析代码时未对协议进行理论分析,主要原因是我们避免太学术化,我们追求实现过程,以实践来学理论,所以请朋友们注意这里可能涉及到滑动窗口协议,不过我们暂不探讨这个协议的具体内容和细节。回到tcp_connect_init()函数中
我们看到代码
if (!tp->window_clamp) tp->window_clamp = dst_metric(dst, RTAX_WINDOW); |
检查是否设置了tcp的sock的窗口最大值,如果没有设置就再分配一个。这个值是在struct dst_entry 结构中的metrics数组中的,根据RTAX_WINDOW为下标得到数组中已经设置好的数值。而在上面我们看到dst_mtu ()函数时使用的却是下标RTAX_MTU。我们看一下这个公用的函数
/* Fetch one routing metric (RTAX_MTU, RTAX_WINDOW, RTAX_ADVMSS, ...)
 * from the destination entry's metrics[] array.
 * NOTE(review): the "-1" implies the RTAX_* identifiers are 1-based
 * while metrics[] is 0-indexed — confirm against the RTAX_* enum. */
static inline u32 dst_metric(const struct dst_entry *dst, int metric)
{
	return dst->metrics[metric-1];
}
我们接下看到设置了tcp的sock的向外公开的mss值
tp->advmss = dst_metric(dst, RTAX_ADVMSS); |
函数接着进入了tcp_initialize_rcv_mss()中初始化接收mss的大小
/* Initialise the receive-side MSS estimate (icsk_ack.rcv_mss) used by
 * the delayed-ACK machinery: the smallest of the advertised MSS, our
 * send MSS cache and half the receive window, clamped to the
 * [TCP_MIN_MSS, TCP_MIN_RCVMSS] bounds. */
void tcp_initialize_rcv_mss(struct sock *sk)
{
	struct tcp_sock *tp = tcp_sk(sk);
	unsigned int hint = min_t(unsigned int, tp->advmss, tp->mss_cache);

	hint = min(hint, tp->rcv_wnd / 2);
	hint = min(hint, TCP_MIN_RCVMSS);
	hint = max(hint, TCP_MIN_MSS);

	inet_csk(sk)->icsk_ack.rcv_mss = hint;
}
我们看到这里根据对外公开的mss和上面我们的tcp的mss来选一个最小值,然后再与接收窗口大小的一半来对比,再与系统中要求的mss的值相比较,最后得到一个接收的mss的值。
接下来我们进入了tcp_select_initial_window()函数
/* Choose the initial receive window (*rcv_wnd), the window clamp
 * (*window_clamp) and the window-scale shift (*rcv_wscale) from the
 * available buffer space and the MSS.  wscale_ok says whether the
 * RFC1323 window-scaling option may be used at all. */
void tcp_select_initial_window(int __space, __u32 mss,
			       __u32 *rcv_wnd, __u32 *window_clamp,
			       int wscale_ok, __u8 *rcv_wscale)
{
	/* Negative space means "no space": treat it as zero. */
	unsigned int space = (__space < 0 ? 0 : __space);

	/* If no clamp set the clamp to the max possible scaled window */
	if (*window_clamp == 0)
		(*window_clamp) = (65535 << 14);
	space = min(*window_clamp, space);

	/* Quantize space offering to a multiple of mss if possible. */
	if (space > mss)
		space = (space / mss) * mss;

	/* NOTE: offering an initial window larger than 32767
	 * will break some buggy TCP stacks. If the admin tells us
	 * it is likely we could be speaking with such a buggy stack
	 * we will truncate our initial window offering to 32K-1
	 * unless the remote has sent us a window scaling option,
	 * which we interpret as a sign the remote TCP is not
	 * misinterpreting the window field as a signed quantity.
	 */
	if (sysctl_tcp_workaround_signed_windows)
		(*rcv_wnd) = min(space, MAX_TCP_WINDOW);
	else
		(*rcv_wnd) = space;

	(*rcv_wscale) = 0;
	if (wscale_ok) {
		/* Set window scaling on max possible window
		 * See RFC1323 for an explanation of the limit to 14
		 */
		space = max_t(u32, sysctl_tcp_rmem[2], sysctl_rmem_max);
		space = min_t(u32, space, *window_clamp);
		/* Grow the scale shift until the window fits in 16 bits. */
		while (space > 65535 && (*rcv_wscale) < 14) {
			space >>= 1;
			(*rcv_wscale)++;
		}
	}

	/* Set initial window to value enough for senders,
	 * following RFC2414. Senders, not following this RFC,
	 * will be satisfied with 2.
	 */
	if (mss > (1 << *rcv_wscale)) {
		int init_cwnd = 4;
		if (mss > 1460 * 3)
			init_cwnd = 2;
		else if (mss > 1460)
			init_cwnd = 3;
		if (*rcv_wnd > init_cwnd * mss)
			*rcv_wnd = init_cwnd * mss;
	}

	/* Set the clamp no higher than max representable value */
	(*window_clamp) = min(65535U << (*rcv_wscale), *window_clamp);
}
这个函数就是初始化我们的tcp的窗口相关数值。回到tcp_connect_init()函数中我们接下来看到继续到tcp的sock结构进行了相关的设置后,就返回到了我们的tcp_connect()函数中,我们接下看到
buff = alloc_skb_fclone(MAX_TCP_HEADER + 15, sk->sk_allocation); |
转而调用了
/* Allocate an skb from the fclone (fast-clone) cache: the third
 * argument to __alloc_skb selects fclone allocation, the fourth (-1)
 * the NUMA node. */
static inline struct sk_buff *alloc_skb_fclone(unsigned int size,
					       gfp_t priority)
{
	return __alloc_skb(size, priority, 1, -1);
}
内部的__alloc_skb函数我们在unix的socket过程中曾经读过了http://blog.chinaunix.net/u2/64681/showart_1355078.html 我是无名小卒,转载的朋友请注明出处。在那篇文章中详细叙述了如何分配一个用于socket的数据缓冲结构sk_buff结构。接着我们看到
skb_reserve(buff, MAX_TCP_HEADER); |
在缓冲区头部预留出MAX_TCP_HEADER大小的空间,以满足各层协议头部的空间要求。然后函数进入了tcp_init_nondata_skb()中
/* Initialise a data-less skb (SYN, FIN, pure ACK, RST) for sending:
 * zero the checksum and GSO state, set the TCP flags and the
 * sequence range in the skb's TCP control block. */
static void tcp_init_nondata_skb(struct sk_buff *skb, u32 seq, u8 flags)
{
	skb->csum = 0;
	TCP_SKB_CB(skb)->flags = flags;
	TCP_SKB_CB(skb)->sacked = 0;
	/* A non-data segment is always exactly one (non-GSO) segment. */
	skb_shinfo(skb)->gso_segs = 1;
	skb_shinfo(skb)->gso_size = 0;
	skb_shinfo(skb)->gso_type = 0;
	TCP_SKB_CB(skb)->seq = seq;
	/* SYN and FIN each occupy one sequence number. */
	if (flags & (TCPCB_FLAG_SYN | TCPCB_FLAG_FIN))
		seq++;
	TCP_SKB_CB(skb)->end_seq = seq;
}
这里调用了一个宏
#define TCP_SKB_CB(__skb) ((struct tcp_skb_cb *)&((__skb)->cb[0])) |
我们知道在sk_buff结构中有一个char类型的数组cb[48];这里可以看出这个数组中保存的是struct tcp_skb_cb的结构指针。这个函数主要是围绕着sk_buff结构做一些关于tcp方面的初始化。回到tcp_connect()函数中,可以看到除了对sk_buff的时间设置和一些初始操作外,还执行了__tcp_add_write_queue_tail()将这个sk_buff结构挂入到了自己本身的sock结构等待写的队列sk_write_queue中。接着调整一下tcp的sock结构中的packets_out,记录已经处于“飞行”状态的数据包。然后就要进入tcp_transmit_skb()函数中将这个数据包发送出去。在进入这个函数之前我们先看完tcp_connect()函数余下的代码,如果tcp_transmit_skb()函数正常发送完毕,则需要调整一下tcp的sock中的关于发送数据的计数器,然后调整一下tcp的状态,然后调用inet_csk_reset_xmit_timer()来设置一个定时器,如果超时没有得到服务器端的应答就会调整定时器重新发送数据,关于定时器我们暂且放到将来再讲述,这里重点是论述tcp_transmit_skb()这个关键的函数了,函数比较长,时间关系,明天继续。
转自:http://blog.chinaunix.net/uid-7960587-id-2035556.html