TCP协议发送过程的数据包来源有两种:第一是应用层产生的数据,需要复制到内核发送缓冲区,由tcp_sendmsg函数完成;第二是TCP协议层自己产生的数据包(例如连接管理报文)以及重传的数据包,由函数tcp_transmit_skb完成。首先介绍tcp_sendmsg函数,tcp_sendmsg函数主要做了三件事情:
(1)、将数据包复制到Socket Buffer中。
(2)、把Socket Buffer加入到发送队列。
(3)、设置TCP控制块结构,用于构造TCP协议头信息。
1、判断套接字状态
发送数据包时首先要检查套接字状态,如果不是TCP_ESTABLISHED或者TCP_CLOSE_WAIT,表明连接还没有建立、不能发送数据包,此时调用sk_stream_wait_connect等待连接建立,只有等待失败(返回值非0)才跳转到out_err返回错误。接着初始化一些局部变量:iov应用层数据块(iovec)数组的起始地址、iovlen数据块个数、mss_now当前的最大报文段长度、timeo发送超时时间。
int tcp_sendmsg(struct kiocb *iocb, struct socket *sock, struct msghdr *msg,
size_t size)
{
struct sock *sk = sock->sk;
struct iovec *iov;
struct tcp_sock *tp = tcp_sk(sk);
struct sk_buff *skb;
int iovlen, flags;
int mss_now, size_goal;
int sg, err, copied;
long timeo;
//锁定套接字
lock_sock(sk);
TCP_CHECK_TIMER(sk);
flags = msg->msg_flags;
//设置超时时间
timeo = sock_sndtimeo(sk, flags & MSG_DONTWAIT);
/* Wait for a connection to finish. */
//状态不是ESTABLISHED或CLOSE_WAIT表示还没有建立连接
//先等待连接建立,等待失败才返回错误
if ((1 << sk->sk_state) & ~(TCPF_ESTABLISHED | TCPF_CLOSE_WAIT))
if ((err = sk_stream_wait_connect(sk, &timeo)) != 0)
goto out_err;
/* This should be in poll */
clear_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags);
//可发送TCP最大数据包长度,根据mtu决定
mss_now = tcp_send_mss(sk, &size_goal, flags);
/* Ok commence sending. */
//msg_iov数组中数据块的个数
iovlen = msg->msg_iovlen;
//指向用户空间数据块(iovec)数组的起始地址
iov = msg->msg_iov;
copied = 0;
err = -EPIPE;
if (sk->sk_err || (sk->sk_shutdown & SEND_SHUTDOWN))
goto out_err;
sg = sk->sk_route_caps & NETIF_F_SG;
...
2、初始化Socket Buffer
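接下来进入主循环,复制数据到内核缓冲区直到seglen等于0。首先用tcp_write_queue_tail取得发送队列中最后一个Socket Buffer,如果tcp_send_head非空,就计算这个skb的剩余空间copy = max - skb->len;如果没有剩余空间(copy <= 0)就需要一个新的skb:先调用sk_stream_memory_free判断发送缓冲区是否有剩余空间,如果没有就跳转到wait_for_sndbuf等待发送缓冲区空闲,如果还有剩余空间就调用sk_stream_alloc_skb分配一个Socket Buffer,然后调用skb_entail把这个skb加入到队列中。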
...
while (seglen > 0) {
int copy = 0;
int max = size_goal;
//指向最后一个skb
skb = tcp_write_queue_tail(sk);
if (tcp_send_head(sk)) {
//tcp_send_head非空,说明队列中还有待发送的数据包
if (skb->ip_summed == CHECKSUM_NONE)
max = mss_now;
//剩余空间大小
copy = max - skb->len;
}
if (copy <= 0) {
//分配新的内存存放数据
new_segment:
/* Allocate new segment. If the interface is SG,
* allocate skb fitting to single page.
*/
//发送缓冲区没有空闲空间,就等待空间释放
if (!sk_stream_memory_free(sk))
goto wait_for_sndbuf;
//分配socket buffer
skb = sk_stream_alloc_skb(sk,
select_size(sk, sg),
sk->sk_allocation);
if (!skb)
goto wait_for_memory;
/*
* Check whether we can use HW checksum.
*/
if (sk->sk_route_caps & NETIF_F_ALL_CSUM)
skb->ip_summed = CHECKSUM_PARTIAL;
//将新分配的skb加入到队列中
skb_entail(sk, skb);
copy = size_goal;
max = size_goal;
}
...
3、复制数据到内核缓冲区
这里我们要知道数据包存放的位置有两个地方,第一个是Socket Buffer里面的缓冲区,也就是线性存储;第二个是struct skb_shared_info中的frags页面数组,称为非线性存储。数据包先保存在Socket Buffer线性存储中,如果线性存储没有空间了就存储在struct skb_shared_info的frags数组中。所以先调用skb_tailroom查看skb缓冲区是否有空间,如果有就调用skb_add_data将数据包复制到Socket Buffer缓冲区空间;如果Socket Buffer没有空间了,就查看struct skb_shared_info的frags页面数组的最后一个页面是否还有空间:调用skb_can_coalesce判断新数据能否合并到最后一个页面片段中(同时要求页内偏移off不等于PAGE_SIZE),如果可以就把数据追加到该页面,最后更新这个片段(frags[i - 1])中存储的数据长度。如果不能合并,并且当前缓存页已经写满或者还没有缓存页,就调用sk_stream_alloc_page分配一个新的页。
...
/* Where to copy to? */
//判断socket buffer的缓冲区空间是否还有剩余空间
if (skb_tailroom(skb) > 0) {
/* We have some space in skb head. Superb! */
if (copy > skb_tailroom(skb))
//返回剩余空间大小
copy = skb_tailroom(skb);
//最终调用copy_from_user将数据包复制
//到剩余空间
if ((err = skb_add_data(skb, from, copy)) != 0)
goto do_fault;
} else {
int merge = 0;
int i = skb_shinfo(skb)->nr_frags;
struct page *page = TCP_PAGE(sk);
int off = TCP_OFF(sk);
//判断frags页面数组中是否还有空间
if (skb_can_coalesce(skb, i, page, off) &&
off != PAGE_SIZE) {
/* We can extend the last page
* fragment. */
merge = 1;
} else if (i == MAX_SKB_FRAGS || !sg) {
/* Need to add new fragment and cannot
* do this because interface is non-SG,
* or because all the page slots are
* busy. */
//页面数量已经达到最大(或者不支持SG),设置PSH标志
//表示数据段可以发送了
tcp_mark_push(tp, skb);
goto new_segment;
} else if (page) {
if (off == PAGE_SIZE) {
put_page(page);
TCP_PAGE(sk) = page = NULL;
off = 0;
}
} else
off = 0;
if (copy > PAGE_SIZE - off)
copy = PAGE_SIZE - off;
if (!sk_wmem_schedule(sk, copy))
goto wait_for_memory;
if (!page) {
/* Allocate new cache page. */
//页面数组中最后一个页面也满了就分配一个新页
if (!(page = sk_stream_alloc_page(sk)))
goto wait_for_memory;
}
/* Time to copy data. We are close to
* the end! */
//将数据复制到页缓冲区
err = skb_copy_to_page(sk, from, skb, page,
off, copy);
if (err) {
/* If this page was new, give it to the
* socket so it does not get leaked.
*/
if (!TCP_PAGE(sk)) {
TCP_PAGE(sk) = page;
TCP_OFF(sk) = 0;
}
goto do_error;
}
/* Update the skb. */
if (merge) {
//更新第i个页面存储的数据大小
skb_shinfo(skb)->frags[i - 1].size +=
copy;
} else {
//新增页面则要填充页面描述信息
skb_fill_page_desc(skb, i, page, off, copy);
if (TCP_PAGE(sk)) {
get_page(page);
} else if (off + copy < PAGE_SIZE) {
get_page(page);
TCP_PAGE(sk) = page;
}
}
TCP_OFF(sk) = off + copy;
}
//TCP协议头信息设置在socket buffer 控制缓冲区
//当数据段发送时才创建TCP协议头
if (!copied)
TCP_SKB_CB(skb)->flags &= ~TCPCB_FLAG_PSH;
//写更新序列号
tp->write_seq += copy;
//更新缓冲区中的序列号
TCP_SKB_CB(skb)->end_seq += copy;
skb_shinfo(skb)->gso_segs = 0;
from += copy;
copied += copy;
if ((seglen -= copy) == 0 && iovlen == 0)
goto out;
if (skb->len < max || (flags & MSG_OOB))
continue;
...
4、发送数据
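TCP协议头信息不设置在数据包中,而是保存在Socket Buffer控制缓冲区,当有数据段从队列取出发送时才创建TCP协议头。forced_push查看是否需要立即发送数据包,如果是就调用tcp_mark_push设置TCP协议头PSH标志,然后调用__tcp_push_pending_frames立即发送数据;否则如果当前skb就是tcp_send_head,则调用tcp_push_one只发送这一个数据段。如果发送缓冲区或内存不足(跳转到wait_for_sndbuf、wait_for_memory),就先把已经复制的数据用tcp_push推送出去,再调用sk_stream_wait_memory等待内存可用后继续复制。最终调用的发送函数是tcp_transmit_skb。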
...
//查看是否立即发送数据包
if (forced_push(tp)) {
//设置立即发送数据包标志PSH
tcp_mark_push(tp, skb);
//发送数据包,实际调用的是ip_queue_xmit
__tcp_push_pending_frames(sk, mss_now, TCP_NAGLE_PUSH);
} else if (skb == tcp_send_head(sk))
tcp_push_one(sk, mss_now);
continue;
wait_for_sndbuf:
//设置SOCK_NOSPACE标志,表示发送缓冲区已经没有空闲空间
set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
wait_for_memory:
if (copied)
tcp_push(sk, flags & ~MSG_MORE, mss_now, TCP_NAGLE_PUSH);
if ((err = sk_stream_wait_memory(sk, &timeo)) != 0)
goto do_error;
mss_now = tcp_send_mss(sk, &size_goal, flags);
}
}
...
tcp_sendmsg完整代码:
/*
 * tcp_sendmsg - copy the user data described by msg's iovec array onto the
 * socket's write queue and push it towards transmission.
 *
 * For every iovec segment the data is appended to the skb at the tail of the
 * write queue: into the skb's linear tailroom when there is some, otherwise
 * into page fragments. A new skb is allocated whenever the tail skb cannot
 * take more data.
 *
 * Returns copied (the number of bytes queued) on the normal path, and also
 * when an error happens after some data was already queued; otherwise
 * returns the value produced by sk_stream_error().
 *
 * NOTE(review): iocb and size are not referenced anywhere in this body.
 */
int tcp_sendmsg(struct kiocb *iocb, struct socket *sock, struct msghdr *msg,
size_t size)
{
struct sock *sk = sock->sk;
struct iovec *iov;
struct tcp_sock *tp = tcp_sk(sk);
struct sk_buff *skb;
int iovlen, flags;
int mss_now, size_goal;
int sg, err, copied;
long timeo;
/* Lock the socket; both exit paths (out, out_err) call release_sock(). */
lock_sock(sk);
TCP_CHECK_TIMER(sk);
flags = msg->msg_flags;
/* Send timeout, derived from the socket and the MSG_DONTWAIT flag. */
timeo = sock_sndtimeo(sk, flags & MSG_DONTWAIT);
/* Wait for a connection to finish. */
/*
 * Any state other than ESTABLISHED or CLOSE_WAIT: wait for the connection
 * with sk_stream_wait_connect() and bail out only if that wait fails.
 */
if ((1 << sk->sk_state) & ~(TCPF_ESTABLISHED | TCPF_CLOSE_WAIT))
if ((err = sk_stream_wait_connect(sk, &timeo)) != 0)
goto out_err;
/* This should be in poll */
clear_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags);
/* Current segment size; tcp_send_mss() also fills in size_goal. */
mss_now = tcp_send_mss(sk, &size_goal, flags);
/* Ok commence sending. */
/* Number of entries in the msg_iov array. */
iovlen = msg->msg_iovlen;
/* First iovec entry describing the user data. */
iov = msg->msg_iov;
copied = 0;
/*
 * Preload -EPIPE and take the error path if the socket has a pending
 * error or its send side has been shut down.
 */
err = -EPIPE;
if (sk->sk_err || (sk->sk_shutdown & SEND_SHUTDOWN))
goto out_err;
/* Non-zero when the route capabilities include NETIF_F_SG. */
sg = sk->sk_route_caps & NETIF_F_SG;
/* Walk the iovec array, one segment per outer iteration. */
while (--iovlen >= 0) {
/* Bytes still to be copied from the current iovec segment. */
int seglen = iov->iov_len;
/* User-space address of the next byte to copy from this segment. */
unsigned char __user *from = iov->iov_base;
iov++;
while (seglen > 0) {
int copy = 0;
int max = size_goal;
/* Last skb on the write queue. */
skb = tcp_write_queue_tail(sk);
if (tcp_send_head(sk)) {
/*
 * A send head exists, so the tail skb may still accept data.
 * An skb marked CHECKSUM_NONE is capped at mss_now rather than
 * size_goal.
 */
if (skb->ip_summed == CHECKSUM_NONE)
max = mss_now;
/* Room left in the tail skb; <= 0 means it is full. */
copy = max - skb->len;
}
if (copy <= 0) {
/* No usable room in the tail skb: start a new one. */
new_segment:
/* Allocate new segment. If the interface is SG,
* allocate skb fitting to single page.
*/
/*
 * No free space in the send buffer: go wait for send-buffer
 * space instead of allocating.
 */
if (!sk_stream_memory_free(sk))
goto wait_for_sndbuf;
/* Allocate the new skb. */
skb = sk_stream_alloc_skb(sk,
select_size(sk, sg),
sk->sk_allocation);
if (!skb)
goto wait_for_memory;
/*
* Check whether we can use HW checksum.
*/
if (sk->sk_route_caps & NETIF_F_ALL_CSUM)
skb->ip_summed = CHECKSUM_PARTIAL;
/* Append the freshly allocated skb to the write queue. */
skb_entail(sk, skb);
copy = size_goal;
max = size_goal;
}
/* Try to append data to the end of skb. */
if (copy > seglen)
copy = seglen;
/* Where to copy to? */
/* Case 1: the skb's linear area still has tailroom. */
if (skb_tailroom(skb) > 0) {
/* We have some space in skb head. Superb! */
if (copy > skb_tailroom(skb))
/* Clamp the copy length to the available tailroom. */
copy = skb_tailroom(skb);
/* Copy from user space into the skb; a failure goes to do_fault. */
if ((err = skb_add_data(skb, from, copy)) != 0)
goto do_fault;
} else {
/* Case 2: no tailroom, store the data in a page fragment. */
int merge = 0;
int i = skb_shinfo(skb)->nr_frags;
struct page *page = TCP_PAGE(sk);
int off = TCP_OFF(sk);
/*
 * Can the data be merged into the last page fragment, with the
 * cached page not yet full?
 */
if (skb_can_coalesce(skb, i, page, off) &&
off != PAGE_SIZE) {
/* We can extend the last page
* fragment. */
merge = 1;
} else if (i == MAX_SKB_FRAGS || !sg) {
/* Need to add new fragment and cannot
* do this because interface is non-SG,
* or because all the page slots are
* busy. */
/* Mark this skb for push and continue in a new segment. */
tcp_mark_push(tp, skb);
goto new_segment;
} else if (page) {
/* Cached page is full: drop the reference and forget it. */
if (off == PAGE_SIZE) {
put_page(page);
TCP_PAGE(sk) = page = NULL;
off = 0;
}
} else
off = 0;
/* Never copy past the end of the page. */
if (copy > PAGE_SIZE - off)
copy = PAGE_SIZE - off;
if (!sk_wmem_schedule(sk, copy))
goto wait_for_memory;
if (!page) {
/* Allocate new cache page. */
if (!(page = sk_stream_alloc_page(sk)))
goto wait_for_memory;
}
/* Time to copy data. We are close to
* the end! */
/* Copy the user data into the page at offset off. */
err = skb_copy_to_page(sk, from, skb, page,
off, copy);
if (err) {
/* If this page was new, give it to the
* socket so it does not get leaked.
*/
if (!TCP_PAGE(sk)) {
TCP_PAGE(sk) = page;
TCP_OFF(sk) = 0;
}
goto do_error;
}
/* Update the skb. */
if (merge) {
/* Grow the last fragment (index i - 1) by the bytes just copied. */
skb_shinfo(skb)->frags[i - 1].size +=
copy;
} else {
/* New fragment: fill in the descriptor in slot i. */
skb_fill_page_desc(skb, i, page, off, copy);
if (TCP_PAGE(sk)) {
/* Page is already cached on the socket: take another reference. */
get_page(page);
} else if (off + copy < PAGE_SIZE) {
/* Page still has room: take a reference and cache it on the socket. */
get_page(page);
TCP_PAGE(sk) = page;
}
}
/* Record the next free offset for the cached page. */
TCP_OFF(sk) = off + copy;
}
/* Nothing copied yet in this call: clear PSH in the skb control block. */
if (!copied)
TCP_SKB_CB(skb)->flags &= ~TCPCB_FLAG_PSH;
/* Advance the socket's write sequence number. */
tp->write_seq += copy;
/* Extend the end sequence number recorded for this skb. */
TCP_SKB_CB(skb)->end_seq += copy;
skb_shinfo(skb)->gso_segs = 0;
from += copy;
copied += copy;
/* Last byte of the last iovec segment consumed: finish up. */
if ((seglen -= copy) == 0 && iovlen == 0)
goto out;
/* skb below its limit (or MSG_OOB set): keep filling before pushing. */
if (skb->len < max || (flags & MSG_OOB))
continue;
/* Check whether a push is being forced now. */
if (forced_push(tp)) {
/* Mark this skb for push. */
tcp_mark_push(tp, skb);
/* Push the pending frames with TCP_NAGLE_PUSH. */
__tcp_push_pending_frames(sk, mss_now, TCP_NAGLE_PUSH);
} else if (skb == tcp_send_head(sk))
/* This skb is the send head: push just this one. */
tcp_push_one(sk, mss_now);
continue;
wait_for_sndbuf:
/* Flag the socket as having no send-buffer space. */
set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
wait_for_memory:
/*
 * Push whatever was copied so far (with MSG_MORE cleared), then
 * wait for memory; a failed wait goes to do_error.
 */
if (copied)
tcp_push(sk, flags & ~MSG_MORE, mss_now, TCP_NAGLE_PUSH);
if ((err = sk_stream_wait_memory(sk, &timeo)) != 0)
goto do_error;
/* Recompute mss_now and size_goal after waiting. */
mss_now = tcp_send_mss(sk, &size_goal, flags);
}
}
out:
/* Normal exit: push queued data using the socket's nonagle setting. */
if (copied)
tcp_push(sk, flags, mss_now, tp->nonagle);
TCP_CHECK_TIMER(sk);
release_sock(sk);
/* Report the copied byte count for the current uid. */
if (copied > 0)
uid_stat_tcp_snd(current_uid(), copied);
return copied;
do_fault:
/* skb_add_data() failed: if the skb is empty, unlink and free it. */
if (!skb->len) {
tcp_unlink_write_queue(skb, sk);
/* It is the one place in all of TCP, except connection
* reset, where we can be unlinking the send_head.
*/
tcp_check_send_head(sk, skb);
sk_wmem_free_skb(sk, skb);
}
do_error:
/* If some data was already queued, return that byte count via out. */
if (copied)
goto out;
out_err:
/* Nothing was queued: let sk_stream_error() produce the return value. */
err = sk_stream_error(sk, flags, err);
TCP_CHECK_TIMER(sk);
release_sock(sk);
return err;
}