1、TCP基本概念
传输控制协议TCP是一种面向连接的、可靠的、基于字节流的运输层通信协议。TCP层是位于IP层之上,应用层之下的传输层。
2、TCP连接时三次握手示意
3、TCP协议栈从上到下提供的接口
创建socket
创建TCP socket调用接口
在创建socket套接字描述符时,sys_socket内核函数会根据指定的协议(例如socket(PF_INET, SOCK_DGRAM, IPPROTO_UDP))挂载对应的协议处理函数
250 static int inet_create(struct net *net, struct socket *sock, int protocol,int kern)
251{
...
262 /* Look for the requested type/protocol pair. */
263 lookup_protocol:
264 err = -ESOCKTNOSUPPORT;
265 rcu_read_lock();
// TCP套接字、UDP套接字、原始套接字的inet_protosw实例都在inetsw_array数组中定义,
//这些实例会调inet_register_protosw()注册到inetsw中
//根据protocol查找要创建的套接字对应的四层传输协议。
266 list_for_each_entry_rcu(answer, &inetsw[sock->type], list) {
268 ...
283 }
284
//如果没有找到,则调用request_module()来尝试加载协议所属的模块,正常情况下不会发生。
285 if (unlikely(err)) {
286 if (try_loading_module < 2) {
287 rcu_read_unlock();
...
}
三次握手
结构体变量struct proto tcp_prot指定了TCP协议栈的访问接口函数
首先客户端发送SYN报文
调用tcp_v4_connect函数建立与服务器联系并发送SYN段:
tcp_v4_connect函数
140/*This will initiate an outgoing connection.*/141int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, intaddr_len)142{
...171 rt = ip_route_connect(fl4, nexthop, inet->inet_saddr,172 RT_CONN_FLAGS(sk), sk->sk_bound_dev_if,173IPPROTO_TCP,174orig_sport, orig_dport, sk);
...214215 /*Socket identity is still unknown (sport may be zero).
216 * However we set state to SYN-SENT and not releasing socket
217 * lock select source port, enter ourselves into the hash tables and
218 * complete initialization after this.
219*/
220tcp_set_state(sk, TCP_SYN_SENT);
...227 rt =ip_route_newports(fl4, rt, orig_sport, orig_dport,228 inet->inet_sport, inet->inet_dport, sk);
...246 err =tcp_connect(sk);
...
}
265EXPORT_SYMBOL(tcp_v4_connect);
此函数前面部分是确定socket的源端口,目的ip及端口。目的IP和目的端口是由connect系统调用的入参指定。tcp_connect函数用于构建并发送一个SYN请求。
tcp_connect函数
构造一个携带SYN标志位的TCP头,tcp_init_nondata_skb函数实现
发送带有SYN的TCP报文,tcp_transmit_skb函数实现
设置计时器超时重发,inet_csk_reset_xmit_timer函数实现
3090/*Build a SYN and send it off.*/3091int tcp_connect(struct sock *sk)3092{
...3108 /*Reserve space for headers.*/
3109skb_reserve(buff, MAX_TCP_HEADER);3110
3111 tcp_init_nondata_skb(buff, tp->write_seq++, TCPHDR_SYN);3112 tp->retrans_stamp =tcp_time_stamp;3113tcp_connect_queue_skb(sk, buff);3114tcp_ecn_send_syn(sk, buff);3115
3116 /*Send off SYN; include data in Fast Open.*/
3117 err = tp->fastopen_req ?tcp_send_syn_data(sk, buff) :3118 tcp_transmit_skb(sk, buff, 1, sk->sk_allocation);
...3129 /*Timer for repeating the SYN until an answer.*/
3130inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,3131 inet_csk(sk)->icsk_rto, TCP_RTO_MAX);
...
}
3134EXPORT_SYMBOL(tcp_connect);
tcp_transmit_skb函数
__tcp_transmit_skb函数的主要任务是向ip层发送数据包,其中包括
初始化TCP协议头等数据结构
根据clone_it判断是否需要克隆Socket Buffer:该Socket Buffer可能正被其他进程使用,此时就要克隆一份
构建TCP协议选项
拥塞控制,确定网络上同时有多少数据包最合适
构建TCP协议头主要的数据域:源端口、目的端口、数据段初始序列号,计算窗口大小,如果是SYN请求包就不需要计算窗口大小
发送数据包到ip层,发送过程状态机切换,发送SYN包之后切换为SYN_SENT
//net/ipv4/tcp_output.c
/*
 * Convenience wrapper around __tcp_transmit_skb().
 *
 * Forwards sk, skb, clone_it and gfp_mask unchanged and supplies the
 * socket's current tcp_sk(sk)->rcv_nxt as the rcv_nxt argument.
 * Returns whatever __tcp_transmit_skb() returns.
 */
static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it,
			    gfp_t gfp_mask)
{
	return __tcp_transmit_skb(sk, skb, clone_it, gfp_mask,
				  tcp_sk(sk)->rcv_nxt);
}
tcp_transmit_skb是对__tcp_transmit_skb的封装,继续调用,进入__tcp_transmit_skb发送SYN报文
__tcp_transmit_skb函数
//net/ipv4/tcp_output.c
/*
 * Build the TCP header for skb and hand the segment to the address-family
 * specific transmit hook (icsk->icsk_af_ops->queue_xmit).
 *
 * @sk:       socket the segment belongs to
 * @skb:      buffer to transmit
 * @clone_it: when non-zero, a clone (or a private copy if skb is already
 *            cloned) is transmitted and the original is kept in oskb
 * @gfp_mask: allocation flags used for the clone/copy
 * @rcv_nxt:  value written into the header's ack_seq field and passed to
 *            tcp_event_ack_sent()
 *
 * Returns -ENOBUFS if the clone/copy could not be allocated; otherwise the
 * result of queue_xmit(), with positive results mapped through
 * net_xmit_eval().
 */
static int __tcp_transmit_skb(struct sock *sk, struct sk_buff *skb,
			      int clone_it, gfp_t gfp_mask, u32 rcv_nxt)
{
	const struct inet_connection_sock *icsk = inet_csk(sk);
	struct inet_sock *inet;
	struct tcp_sock *tp;
	struct tcp_skb_cb *tcb;
	struct tcp_out_options opts;
	unsigned int tcp_options_size, tcp_header_size;
	struct sk_buff *oskb = NULL;
	struct tcp_md5sig_key *md5;
	struct tcphdr *th;
	u64 prior_wstamp;
	int err;

	BUG_ON(!skb || !tcp_skb_pcount(skb));
	tp = tcp_sk(sk);

	if (clone_it) {
		/*
		 * Socket Buffer: transmit a clone (or a copy when the buffer
		 * is already cloned) and remember the original in oskb.
		 */
		TCP_SKB_CB(skb)->tx.in_flight = TCP_SKB_CB(skb)->end_seq
			- tp->snd_una;
		oskb = skb;

		tcp_skb_tsorted_save(oskb) {
			if (unlikely(skb_cloned(oskb)))
				skb = pskb_copy(oskb, gfp_mask);
			else
				skb = skb_clone(oskb, gfp_mask);
		} tcp_skb_tsorted_restore(oskb);

		if (unlikely(!skb))
			return -ENOBUFS;
	}

	/* Keep the previous stamp; it is passed to tcp_update_skb_after_send(). */
	prior_wstamp = tp->tcp_wstamp_ns;
	tp->tcp_wstamp_ns = max(tp->tcp_wstamp_ns, tp->tcp_clock_cache);

	skb->skb_mstamp_ns = tp->tcp_wstamp_ns;

	inet = inet_sk(sk);
	tcb = TCP_SKB_CB(skb);
	memset(&opts, 0, sizeof(opts));

	/* SYN segments use a different option builder than all others. */
	if (unlikely(tcb->tcp_flags & TCPHDR_SYN))
		tcp_options_size = tcp_syn_options(sk, skb, &opts, &md5);
	else
		tcp_options_size = tcp_established_options(sk, skb, &opts,
							   &md5);
	tcp_header_size = tcp_options_size + sizeof(struct tcphdr);

	/* if no packet is in qdisc/device queue, then allow XPS to select
	 * another queue. We can be called from tcp_tsq_handler()
	 * which holds one reference to sk.
	 *
	 * TODO: Ideally, in-flight pure ACK packets should not matter here.
	 * One way to get this would be to set skb->truesize = 2 on them.
	 */
	skb->ooo_okay = sk_wmem_alloc_get(sk) < SKB_TRUESIZE(1);

	/* If we had to use memory reserve to allocate this skb,
	 * this might cause drops if packet is looped back :
	 * Other socket might not have SOCK_MEMALLOC.
	 * Packets not looped back do not care about pfmemalloc.
	 */
	skb->pfmemalloc = 0;

	/* Make room for the TCP header in front of the payload. */
	skb_push(skb, tcp_header_size);
	skb_reset_transport_header(skb);

	skb_orphan(skb);
	skb->sk = sk;
	skb->destructor = skb_is_tcp_pure_ack(skb) ? __sock_wfree : tcp_wfree;
	skb_set_hash_from_sk(skb, sk);
	refcount_add(skb->truesize, &sk->sk_wmem_alloc);

	skb_set_dst_pending_confirm(skb, sk->sk_dst_pending_confirm);

	/* Build TCP header and checksum it. */
	th = (struct tcphdr *)skb->data;
	th->source = inet->inet_sport;
	th->dest = inet->inet_dport;
	th->seq = htonl(tcb->seq);
	th->ack_seq = htonl(rcv_nxt);
	/*
	 * Seventh 16-bit word of the header: header length in 32-bit words
	 * in the top four bits, OR-ed with the TCP flags.
	 */
	*(((__be16 *)th) + 6) = htons(((tcp_header_size >> 2) << 12) |
				      tcb->tcp_flags);

	th->check = 0;
	th->urg_ptr = 0;

	/* The urg_mode check is necessary during a below snd_una win probe */
	if (unlikely(tcp_urg_mode(tp) && before(tcb->seq, tp->snd_up))) {
		if (before(tp->snd_up, tcb->seq + 0x10000)) {
			th->urg_ptr = htons(tp->snd_up - tcb->seq);
			th->urg = 1;
		} else if (after(tcb->seq + 0xFFFF, tp->snd_nxt)) {
			th->urg_ptr = htons(0xFFFF);
			th->urg = 1;
		}
	}

	/* Options are written directly after the fixed header. */
	tcp_options_write((__be32 *)(th + 1), tp, &opts);
	skb_shinfo(skb)->gso_type = sk->sk_gso_type;
	if (likely(!(tcb->tcp_flags & TCPHDR_SYN))) {
		th->window = htons(tcp_select_window(sk));
		tcp_ecn_send(sk, skb, th, tcp_header_size);
	} else {
		/* RFC1323: The window in SYN & SYN/ACK segments
		 * is never scaled.
		 */
		th->window = htons(min(tp->rcv_wnd, 65535U));
	}
#ifdef CONFIG_TCP_MD5SIG
	/* Calculate the MD5 hash, as we have all we need now */
	if (md5) {
		sk_nocaps_add(sk, NETIF_F_GSO_MASK);
		tp->af_specific->calc_md5_hash(opts.hash_location,
					       md5, sk, skb);
	}
#endif

	icsk->icsk_af_ops->send_check(sk, skb);

	if (likely(tcb->tcp_flags & TCPHDR_ACK))
		tcp_event_ack_sent(sk, tcp_skb_pcount(skb), rcv_nxt);

	/* skb->len beyond the header means the segment carries payload. */
	if (skb->len != tcp_header_size) {
		tcp_event_data_sent(tp, sk);
		tp->data_segs_out += tcp_skb_pcount(skb);
		tp->bytes_sent += skb->len - tcp_header_size;
	}

	if (after(tcb->end_seq, tp->snd_nxt) || tcb->seq == tcb->end_seq)
		TCP_ADD_STATS(sock_net(sk), TCP_MIB_OUTSEGS,
			      tcp_skb_pcount(skb));

	tp->segs_out += tcp_skb_pcount(skb);
	/* OK, it's time to fill skb_shinfo(skb)->gso_{segs|size} */
	skb_shinfo(skb)->gso_segs = tcp_skb_pcount(skb);
	skb_shinfo(skb)->gso_size = tcp_skb_mss(skb);

	/* Leave earliest departure time in skb->tstamp (skb->skb_mstamp_ns) */

	/* Cleanup our debris for IP stacks */
	memset(skb->cb, 0, max(sizeof(struct inet_skb_parm),
			       sizeof(struct inet6_skb_parm)));

	err = icsk->icsk_af_ops->queue_xmit(sk, skb, &inet->cork.fl);

	if (unlikely(err > 0)) {
		tcp_enter_cwr(sk);
		err = net_xmit_eval(err);
	}
	/* Only a successful send of a cloned skb updates the original. */
	if (!err && oskb) {
		tcp_update_skb_after_send(sk, oskb, prior_wstamp);
		tcp_rate_skb_sent(sk, oskb);
	}
	return err;
}
至此,客户端TCP层完成了SYN包的发送,报文经过下层协议栈传输到网卡。之后服务端接收客户端发来的TCP报文,并回复SYN+ACK。