连接建立完成后,应用进程可以使用send、sendto、sendmsg、write、writev系统调用来发送TCP数据,其中sendmsg和writev可以发送位于多个不连续内存中的数据。使用这些函数发送数据时需指定socket文件描述符、要发送的数据所在的缓冲区首地址及数据长度等信息。上述发包系统调用对应的内核函数都会调用__sock_sendmsg_nosec函数。现在以send系统调用和write系统调用为例来证明这个结论。
send系统调用原型:
ssize_t send(int sockfd, const void *buf, size_t len, int flags);
参数:sockfd是要发送数据的socket的文件描述符;buf是数据所在缓存的首地址;len是要发送的数据的长度;flags用于设置功能。成功时此函数的返回值是已经成功发送的字节数。
send函数对应的内核代码为:
1800 SYSCALL_DEFINE4(send, int, fd, void __user *, buff, size_t, len,
1801 unsigned int, flags)
1802 {
1803 return sys_sendto(fd, buff, len, flags, NULL, 0);
1804 }
send_to系统调用对应的内核函数也是sys_sendto函数:
1753 SYSCALL_DEFINE6(sendto, int, fd, void __user *, buff, size_t, len,
1754 unsigned int, flags, struct sockaddr __user *, addr,
1755 int, addr_len)
1756 {
1757 struct socket *sock;
1758 struct sockaddr_storage address;
1759 int err;
1760 struct msghdr msg;
1761 struct iovec iov;
1762 int fput_needed;
1763
1764 if (len > INT_MAX)
1765 len = INT_MAX;
1766 sock = sockfd_lookup_light(fd, &err, &fput_needed);
1767 if (!sock)
1768 goto out;
1769
1770 iov.iov_base = buff;
1771 iov.iov_len = len;
1772 msg.msg_name = NULL;
1773 msg.msg_iov = &iov;
1774 msg.msg_iovlen = 1;
1775 msg.msg_control = NULL;
1776 msg.msg_controllen = 0;
1777 msg.msg_namelen = 0;
1778 if (addr) {
1779 err = move_addr_to_kernel(addr, addr_len, &address);
1780 if (err < 0)
1781 goto out_put;
1782 msg.msg_name = (struct sockaddr *)&address;
1783 msg.msg_namelen = addr_len;
1784 }
1785 if (sock->file->f_flags & O_NONBLOCK)
1786 flags |= MSG_DONTWAIT;
1787 msg.msg_flags = flags;
1788 err = sock_sendmsg(sock, &msg, len); //核心处理函数
1789
1790 out_put:
1791 fput_light(sock->file, fput_needed);
1792 out:
1793 return err;
1794 }
1770-1777:struct msghdr可以支持一次输入多个不连续的缓存,但对于send和sendto系统调用而言一次只输入一段缓存,故这段代码就是用入参初始化msghdr结构。
sock_sendmsg函数的定义:
636 int sock_sendmsg(struct socket *sock, struct msghdr *msg, size_t size)
637 {
638 struct kiocb iocb;
639 struct sock_iocb siocb;
640 int ret;
641
642 init_sync_kiocb(&iocb, NULL);
643 iocb.private = &siocb;
644 ret = __sock_sendmsg(&iocb, sock, msg, size);
645 if (-EIOCBQUEUED == ret)
646 ret = wait_on_sync_kiocb(&iocb);
647 return ret;
648 }
sock_sendmsg函数是通过调用__sock_sendmsg函数来完成发送数据的功能的。先暂时停下,我们来看看write系统调用的原型:
ssize_t write(int fd, const void *buf, size_t count);
参数:fd是要发送数据的socket的文件描述符;buf是数据所在缓存的首地址;count是要发送的数据的长度。成功时此函数的返回值是已经成功发送的字节数。
write系统调用对应的内核函数为:
486 SYSCALL_DEFINE3(write, unsigned int, fd, const char __user *, buf,
487 size_t, count)
488 {
489 struct fd f = fdget(fd);
490 ssize_t ret = -EBADF;
491
492 if (f.file) {
493 loff_t pos = file_pos_read(f.file);
494 ret = vfs_write(f.file, buf, count, &pos);
495 file_pos_write(f.file, pos);
496 fdput(f);
497 }
498
499 return ret;
500 }
vfs_write函数:
430 ssize_t vfs_write(struct file *file, const char __user *buf, size_t count, loff_t *pos)
431 {
432 ssize_t ret;
433
434 if (!(file->f_mode & FMODE_WRITE))
435 return -EBADF;
436 if (!file->f_op || (!file->f_op->write && !file->f_op->aio_write))
437 return -EINVAL;
438 if (unlikely(!access_ok(VERIFY_READ, buf, count)))
439 return -EFAULT;
440
441 ret = rw_verify_area(WRITE, file, pos, count);
442 if (ret >= 0) {
443 count = ret;
444 file_start_write(file);
445 if (file->f_op->write)
446 ret = file->f_op->write(file, buf, count, pos);
447 else
448 ret = do_sync_write(file, buf, count, pos);
449 if (ret > 0) {
450 fsnotify_modify(file);
451 add_wchar(current, ret);
452 }
453 inc_syscw(current);
454 file_end_write(file);
455 }
456
457 return ret;
458 }
445-448:对应socket类型的文件系统,file->f_op->wiret一定是NULL,即会调用do_sync_write函数:
383 ssize_t do_sync_write(struct file *filp, const char __user *buf, size_t len, loff_t *ppos)
384 {
385 struct iovec iov = { .iov_base = (void __user *)buf, .iov_len = len };
386 struct kiocb kiocb;
387 ssize_t ret;
388
389 init_sync_kiocb(&kiocb, filp);
390 kiocb.ki_pos = *ppos;
391 kiocb.ki_left = len;
392 kiocb.ki_nbytes = len;
393
394 ret = filp->f_op->aio_write(&kiocb, &iov, 1, kiocb.ki_pos);
395 if (-EIOCBQUEUED == ret)
396 ret = wait_on_sync_kiocb(&kiocb);
397 *ppos = kiocb.ki_pos;
398 return ret;
399 }
394:参见2.1章,socket文件系统中filp->f_op->aio_wirte指向的函数是
sock_aio_write:
938 static ssize_t do_sock_write(struct msghdr *msg, struct kiocb *iocb,
939 struct file *file, const struct iovec *iov,
940 unsigned long nr_segs)
941 {
942 struct socket *sock = file->private_data;
943 size_t size = 0;
944 int i;
945
946 for (i = 0; i < nr_segs; i++)
947 size += iov[i].iov_len;
948
949 msg->msg_name = NULL;
950 msg->msg_namelen = 0;
951 msg->msg_control = NULL;
952 msg->msg_controllen = 0;
953 msg->msg_iov = (struct iovec *)iov;
954 msg->msg_iovlen = nr_segs;
955 msg->msg_flags = (file->f_flags & O_NONBLOCK) ? MSG_DONTWAIT : 0;
956 if (sock->type == SOCK_SEQPACKET)
957 msg->msg_flags |= MSG_EOR;
958
959 return __sock_sendmsg(iocb, sock, msg, size);
960 }
961
962 static ssize_t sock_aio_write(struct kiocb *iocb, const struct iovec *iov,
963 unsigned long nr_segs, loff_t pos)
964 {
965 struct sock_iocb siocb, *x;
966
967 if (pos != 0)
968 return -ESPIPE;
969
970 x = alloc_sock_iocb(iocb, &siocb);
971 if (!x)
972 return -ENOMEM;
973
974 return do_sock_write(&x->async_msg, iocb, iocb->ki_filp, iov, nr_segs);
975 }
可见,与send、sendto系统调用一样,write系统调用在内核中最终调用的也是__sock_sendmsg。下面来分析__sock_sendmsg函数的代码:
615 static inline int __sock_sendmsg_nosec(struct kiocb *iocb, struct socket *sock,
616 struct msghdr *msg, size_t size)
617 {
618 struct sock_iocb *si = kiocb_to_siocb(iocb);
619
620 si->sock = sock;
621 si->scm = NULL;
622 si->msg = msg;
623 si->size = size;
624
625 return sock->ops->sendmsg(iocb, sock, msg, size);
626 }
627
628 static inline int __sock_sendmsg(struct kiocb *iocb, struct socket *sock,
629 struct msghdr *msg, size_t size)
630 {
631 int err = security_socket_sendmsg(sock, msg, size);
632
633 return err ?: __sock_sendmsg_nosec(iocb, sock, msg, size);
634 }
625:这个函数指针指向的是
tcp_sendmsg函数:
1016 int tcp_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
1017 size_t size)
1018 {
1019 struct iovec *iov;
1020 struct tcp_sock *tp = tcp_sk(sk);
1021 struct sk_buff *skb;
1022 int iovlen, flags, err, copied = 0;
1023 int mss_now = 0, size_goal, copied_syn = 0, offset = 0;
1024 bool sg;
1025 long timeo;
1026
1027 lock_sock(sk);
1028
1029 flags = msg->msg_flags;
1030 if (flags & MSG_FASTOPEN) { //使用TFO功能
1031 err = tcp_sendmsg_fastopen(sk, msg, &copied_syn);//发送TFO数据
1032 if (err == -EINPROGRESS && copied_syn > 0)
1033 goto out;
1034 else if (err)
1035 goto out_err;
1036 offset = copied_syn;
1037 }
1038
1039 timeo = sock_sndtimeo(sk, flags & MSG_DONTWAIT);
1040
1041 /* Wait for a connection to finish. One exception is TCP Fast Open
1042 * (passive side) where data is allowed to be sent before a connection
1043 * is fully established.
1044 */
1045 if (((1 << sk->sk_state) & ~(TCPF_ESTABLISHED | TCPF_CLOSE_WAIT)) &&
1046 !tcp_passive_fastopen(sk)) {
1047 if ((err = sk_stream_wait_connect(sk, &timeo)) != 0)
1048 goto do_error;
1049 }
1050
1051 if (unlikely(tp->repair)) {
1052 if (tp->repair_queue == TCP_RECV_QUEUE) {
1053 copied = tcp_send_rcvq(sk, msg, size);
1054 goto out;
1055 }
1056
1057 err = -EINVAL;
1058 if (tp->repair_queue == TCP_NO_QUEUE)
1059 goto out_err;
1060
1061 /* 'common' sending to sendq */
1062 }
1063
1064 /* This should be in poll */
1065 clear_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags);
1066
1067 mss_now = tcp_send_mss(sk, &size_goal, flags);//获取当前最大报文段的值
1068
1069 /* Ok commence sending. */
1070 iovlen = msg->msg_iovlen;//数据块总个数
1071 iov = msg->msg_iov;//起始数据块
1072 copied = 0;
1073
1074 err = -EPIPE;
1075 if (sk->sk_err || (sk->sk_shutdown & SEND_SHUTDOWN))//如果socket出错或不允许再发送数据
1076 goto out_err;
1077
1078 sg = !!(sk->sk_route_caps & NETIF_F_SG);//如果网卡支持分散-聚集IO(Scatter/Gather IO),则sg为1
1079
1080 while (--iovlen >= 0) {//如果有数据块没有发送
1081 size_t seglen = iov->iov_len;
1082 unsigned char __user *from = iov->iov_base;
1083
1084 iov++;
1085 if (unlikely(offset > 0)) { /* Skip bytes copied in SYN */
1086 if (offset >= seglen) {//当前块的内容都已经在SYN中发送过了
1087 offset -= seglen;
1088 continue;
1089 }
1090 seglen -= offset;
1091 from += offset;
1092 offset = 0;
1093 }
1094
1095 while (seglen > 0) {//块中有数据没有发送
1096 int copy = 0;
1097 int max = size_goal;
1098
1099 skb = tcp_write_queue_tail(sk);
1100 if (tcp_send_head(sk)) {//如果还有包没有发送
1101 if (skb->ip_summed == CHECKSUM_NONE)
1102 max = mss_now;
1103 copy = max - skb->len;//计算包中的剩余空间大小
1104 }
1105
1106 if (copy <= 0) {//没有剩余空间
1107 new_segment:
1108 /* Allocate new segment. If the interface is SG,
1109 * allocate skb fitting to single page.
1110 */
1111 if (!sk_stream_memory_free(sk))//检查发送队列中SKB占用的内存是否超出了socket的限制
1112 goto wait_for_sndbuf;
1113
1114 skb = sk_stream_alloc_skb(sk,
1115 select_size(sk, sg),
1116 sk->sk_allocation);//申请新的SKB,其数据部分空间大小是一个最大报文段的值
1117 if (!skb)
1118 goto wait_for_memory;
1119
1120 /*
1121 * All packets are restored as if they have
1122 * already been sent.
1123 */
1124 if (tp->repair)
1125 TCP_SKB_CB(skb)->when = tcp_time_stamp;
1126
1127 /*
1128 * Check whether we can use HW checksum.
1129 */
1130 if (sk->sk_route_caps & NETIF_F_ALL_CSUM)
1131 skb->ip_summed = CHECKSUM_PARTIAL;
1132
1133 skb_entail(sk, skb);//将SKB放入发送队列末尾
1134 copy = size_goal;
1135 max = size_goal;
1136 }
1137
1138 /* Try to append data to the end of skb. */
1139 if (copy > seglen)
1140 copy = seglen;
1141
1142 /* Where to copy to? */
1143 if (skb_availroom(skb) > 0) {//线性区还有空间
1144 /* We have some space in skb head. Superb! */
1145 copy = min_t(int, copy, skb_availroom(skb));
1146 err = skb_add_data_nocache(sk, skb, from, copy);//将用户态内存中的数据copy到SKB中
1147 if (err)
1148 goto do_fault;
1149 } else {
1150 bool merge = true;
1151 int i = skb_shinfo(skb)->nr_frags;//已经分配非连续页的数量
1152 struct page_frag *pfrag = sk_page_frag(sk);
1153
1154 if (!sk_page_frag_refill(sk, pfrag))//判断当前页是否有空间可写;如果没有则申请新页,申请不到则需要等待有内存可用
1155 goto wait_for_memory;
1156
1157 if (!skb_can_coalesce(skb, i, pfrag->page,
1158 pfrag->offset)) {//判断当前页是否需要加入到skb_shinfo(skb)->frags数组中
1159 if (i == MAX_SKB_FRAGS || !sg) {
1160 tcp_mark_push(tp, skb);
1161 goto new_segment;
1162 }
1163 merge = false;//不需要加入,因为当前页是skb_shinfo(skb)->frags数组最后的成员且有空间可写
1164 }
1165
1166 copy = min_t(int, copy, pfrag->size - pfrag->offset);
1167
1168 if (!sk_wmem_schedule(sk, copy))//查看socket的内存限制
1169 goto wait_for_memory;
1170
1171 err = skb_copy_to_page_nocache(sk, from, skb,
1172 pfrag->page,
1173 pfrag->offset,
1174 copy);//将用户态空间中的数据copy到页中
1175 if (err)
1176 goto do_error;
1177
1178 /* Update the skb. */
1179 if (merge) {//没有申请新页
1180 skb_frag_size_add(&skb_shinfo(skb)->frags[i - 1], copy);//更新页大小
1181 } else {
1182 skb_fill_page_desc(skb, i, pfrag->page,
1183 pfrag->offset, copy);//将新页加入到skb_shinfo(skb)->frags数组中
1184 get_page(pfrag->page);
1185 }
1186 pfrag->offset += copy;
1187 }
1188
1189 if (!copied)
1190 TCP_SKB_CB(skb)->tcp_flags &= ~TCPHDR_PSH;
1191
1192 tp->write_seq += copy;
1193 TCP_SKB_CB(skb)->end_seq += copy;
1194 skb_shinfo(skb)->gso_segs = 0;
1195
1196 from += copy;
1197 copied += copy;
1198 if ((seglen -= copy) == 0 && iovlen == 0)//数据全部copy到内核空间
1199 goto out;
1200
1201 if (skb->len < max || (flags & MSG_OOB) || unlikely(tp->repair))
1202 continue;//如果要发送的数据是紧急数据,则立刻跳出当前循环,尽快copy全部数据并发送
1203
1204 if (forced_push(tp)) {//如果积累了一定长度的数据没有push,在数据全部copy进内核前也可以先发送一次
1205 tcp_mark_push(tp, skb);//在SKB中增加PSH标记,更新pushed_seq标记
1206 __tcp_push_pending_frames(sk, mss_now, TCP_NAGLE_PUSH);//发送数据
1207 } else if (skb == tcp_send_head(sk)) //如果当前包是发送队列中第一个包,先发送之
1208 tcp_push_one(sk, mss_now);//只发送一个包
1209 continue;
1210
1211 wait_for_sndbuf:
1212 set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
1213 wait_for_memory:
1214 if (copied)//如果已经有copy进内核的数据,在睡眠等待内存前先发送一次
1215 tcp_push(sk, flags & ~MSG_MORE, mss_now, TCP_NAGLE_PUSH);
1216
1217 if ((err = sk_stream_wait_memory(sk, &timeo)) != 0)//睡眠直到有空余内存或超时
1218 goto do_error;
1219
1220 mss_now = tcp_send_mss(sk, &size_goal, flags);
1221 }
1222 }
1223
1224 out:
1225 if (copied)//如果有数据copy到内核
1226 tcp_push(sk, flags, mss_now, tp->nonagle);//发送数据
1227 release_sock(sk);//解除软中断不能访问socekt的标记,并处理backlog中积累的数据包
1228 return copied + copied_syn;
1229
1230 do_fault:
1231 if (!skb->len) {
1232 tcp_unlink_write_queue(skb, sk);
1233 /* It is the one place in all of TCP, except connection
1234 * reset, where we can be unlinking the send_head.
1235 */
1236 tcp_check_send_head(sk, skb);
1237 sk_wmem_free_skb(sk, skb);
1238 }
1239
1240 do_error:
1241 if (copied + copied_syn)
1242 goto out;
1243 out_err:
1244 err = sk_stream_error(sk, flags, err);
1245 release_sock(sk);
1246 return err;
1247 }
1027:标识socket正在被系统调用访问,软中断收到这个socket的包后只能挂到backlog队列中然后返回,待系统调用结束时调用release_sock(sk)处理
1150-1163:线性区没有空间,但还允许向这个skb中写入数据,原因是mss变大了或者网卡支持分散-聚集IO,只能将数据保存在非连续空间中;如果网卡不支持分散-聚集IO,则系统会在将数据发送到驱动前将非线性区中的数据线性化
1201:skb->len < max为真意味着skb还可以申请一些空间保存数据,故先不发送,要尽量填满skb以提高发送效率
tcp_sendmsg函数的主要功能是:
1、如果发送队列尾部的skb尚未发送而且还有剩余空间,则将用户缓存中的数据copy进去;如果没有这样的空间则申请一个数据空间固定大小的skb,再copy数据;
2、一个skb的空间如果不够就再申请一个固定大小的skb,再copy数据,直到数据全部copy完毕,或skb的缓存空间无法申请,或发送缓存达到限制为止;若是后两种情况,如果socket是非阻塞的则立即返回,否则会等待能够得到空间或超时
3、将申请的skb放入发送队列尾部,再调用tcp_push、__tcp_push_pending_frames或tcp_push_one函数发送队列中的skb
tcp_push函数:
619 static inline void tcp_push(struct sock *sk, int flags, int mss_now,
620 int nonagle)
621 {
622 if (tcp_send_head(sk)) { //有数据未发送
623 struct tcp_sock *tp = tcp_sk(sk);
624
625 if (!(flags & MSG_MORE) || forced_push(tp))
626 tcp_mark_push(tp, tcp_write_queue_tail(sk));
627
628 tcp_mark_urg(tp, flags); //如果客户设置要发送OOB数据,则记录紧急数据的下一个字节的序列号,其实紧急数据就是当前数据包的最后一个字节数据
629 __tcp_push_pending_frames(sk, mss_now,
630 (flags & MSG_MORE) ? TCP_NAGLE_CORK : nonagle);
631 }
632 }
__tcp_push_pending_frames、tcp_push_one都会调用tcp_write_xmit函数发送数据:
1811 static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle,
1812 int push_one, gfp_t gfp)
1813 {
1814 struct tcp_sock *tp = tcp_sk(sk);
1815 struct sk_buff *skb;
1816 unsigned int tso_segs, sent_pkts;
1817 int cwnd_quota;
1818 int result;
1819
1820 sent_pkts = 0;
1821
1822 if (!push_one) {
1823 /* Do MTU probing. */
1824 result = tcp_mtu_probe(sk);
1825 if (!result) {
1826 return false;
1827 } else if (result > 0) {
1828 sent_pkts = 1;
1829 }
1830 }
1831
1832 while ((skb = tcp_send_head(sk))) {//获取没有发送的最老的skb
1833 unsigned int limit;
1834
1835
1836 tso_segs = tcp_init_tso_segs(sk, skb, mss_now);//获取网卡将这个skb分割成的段的个数
1837 BUG_ON(!tso_segs);
1838
1839 if (unlikely(tp->repair) && tp->repair_queue == TCP_SEND_QUEUE)
1840 goto repair; /* Skip network transmission */
1841
1842 cwnd_quota = tcp_cwnd_test(tp, skb);//计算拥塞窗口允许发送的字节数
1843 if (!cwnd_quota) {
1844 if (push_one == 2)//如果要发送TCP探测包
1845 /* Force out a loss probe pkt. */
1846 cwnd_quota = 1;//允许发送一个字节
1847 else
1848 break;
1849 }
1850
1851 if (unlikely(!tcp_snd_wnd_test(tp, skb, mss_now))) //检查发送窗口是否允许发送数据
1852 break;
1853
1854 if (tso_segs == 1) {//网卡会将此skb按一个包发送
1855 if (unlikely(!tcp_nagle_test(tp, skb, mss_now,
1856 (tcp_skb_is_last(sk, skb) ?
1857 nonagle : TCP_NAGLE_PUSH))))//查看nagle算法是否运行发送当前包
1858 break;
1859 } else {//网卡会将此skb分割成多个包发送
1860 if (!push_one && tcp_tso_should_defer(sk, skb))//判断在希望发送多个包时TOS是否会延迟发送该包
1861 break;
1862 }
1863
1864 /* TSQ : sk_wmem_alloc accounts skb truesize,
1865 * including skb overhead. But thats OK.
1866 */
1867 if (atomic_read(&sk->sk_wmem_alloc) >= sysctl_tcp_limit_output_bytes) {
1868 set_bit(TSQ_THROTTLED, &tp->tsq_flags);
1869 break;
1870 }
1871 limit = mss_now;
1872 if (tso_segs > 1 && !tcp_urg_mode(tp))//网卡会将此skb分割成多个包发送并且没有紧急数据
1873 limit = tcp_mss_split_point(sk, skb, mss_now,
1874 min_t(unsigned int,
1875 cwnd_quota,
1876 sk->sk_gso_max_segs));//计算网卡能一次发送的字节数
1877
1878 if (skb->len > limit && //如果包过大
1879 unlikely(tso_fragment(sk, skb, limit, mss_now, gfp)))//就得拆成两个包,当前包的大小会减小为与limit一致
1880 break;
1881
1882 TCP_SKB_CB(skb)->when = tcp_time_stamp;//记录发送时间
1883
1884 if (unlikely(tcp_transmit_skb(sk, skb, 1, gfp)))//发送数据的副本
1885 break;
1886
1887 repair:
1888 /* Advance the send_head. This one is sent out.
1889 * This call will increment packets_out.
1890 */
1891 tcp_event_new_data_sent(sk, skb);//send_head指向下一个要发送的包
1892
1893 tcp_minshall_update(tp, mss_now, skb);
1894 sent_pkts += tcp_skb_pcount(skb);//计算已经发送的包的个数
1895
1896 if (push_one)
1897 break;
1898 }
1899
1900 if (likely(sent_pkts)) {//至少发送了一个包
1901 if (tcp_in_cwnd_reduction(sk))
1902 tp->prr_out += sent_pkts;
1903
1904 /* Send one loss probe per tail loss episode. */
1905 if (push_one != 2)
1906 tcp_schedule_loss_probe(sk);
1907 tcp_cwnd_validate(sk);
1908 return false;
1909 }
1910 return (push_one == 2) || (!tp->packets_out && tcp_send_head(sk));
1911 }
tcp_write_xmit函数会根据拥塞窗口、发送窗口、nagle算法等条件判断是否发送数据以及发送多少,即新放入发送队列中的数据不一定会立即发送。而且发送包时是发送skb的副本,原来的skb会一直呆在发送队列中,
如果发生了数据丢失则TCP会将发送队列中的skb再发送一次,直到数据被确认时才能删除。
可以发送时tcp_write_xmit函数会调用tcp_transmit_skb函数发送skb:
828 static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it,
829 gfp_t gfp_mask)
830 {
831 const struct inet_connection_sock *icsk = inet_csk(sk);
832 struct inet_sock *inet;
833 struct tcp_sock *tp;
834 struct tcp_skb_cb *tcb;
835 struct tcp_out_options opts;
836 unsigned int tcp_options_size, tcp_header_size;
837 struct tcp_md5sig_key *md5;
838 struct tcphdr *th;
839 int err;
840
841 BUG_ON(!skb || !tcp_skb_pcount(skb));
842
843 /* If congestion control is doing timestamping, we must
844 * take such a timestamp before we potentially clone/copy.
845 */
846 if (icsk->icsk_ca_ops->flags & TCP_CONG_RTT_STAMP)
847 __net_timestamp(skb);
848
849 if (likely(clone_it)) {//clone一个副本发送出去,原本留在队列中等待ACK确认后再删除
850 const struct sk_buff *fclone = skb + 1;
851
852 if (unlikely(skb->fclone == SKB_FCLONE_ORIG &&
853 fclone->fclone == SKB_FCLONE_CLONE))
854 NET_INC_STATS_BH(sock_net(sk),
855 LINUX_MIB_TCPSPURIOUS_RTX_HOSTQUEUES);
856
857 if (unlikely(skb_cloned(skb)))
858 skb = pskb_copy(skb, gfp_mask);
859 else
860 skb = skb_clone(skb, gfp_mask);
861 if (unlikely(!skb))
862 return -ENOBUFS;
863 }
864
865 inet = inet_sk(sk);
866 tp = tcp_sk(sk);
867 tcb = TCP_SKB_CB(skb);
868 memset(&opts, 0, sizeof(opts));
869
870 if (unlikely(tcb->tcp_flags & TCPHDR_SYN))
871 tcp_options_size = tcp_syn_options(sk, skb, &opts, &md5);//构建SYN包的选项信息
872 else
873 tcp_options_size = tcp_established_options(sk, skb, &opts,//构建非SYN包的选项信息
874 &md5);
875 tcp_header_size = tcp_options_size + sizeof(struct tcphdr);//计算TCP头长度
876
877 if (tcp_packets_in_flight(tp) == 0)//没有停留在网络中的包(即发送后未被确认的包,包括重传包)
878 tcp_ca_event(sk, CA_EVENT_TX_START);
879
880 /* if no packet is in qdisc/device queue, then allow XPS to select
881 * another queue.
882 */
883 skb->ooo_okay = sk_wmem_alloc_get(sk) == 0;
884
885 skb_push(skb, tcp_header_size);//skb->data指向TCP头
886 skb_reset_transport_header(skb);
887
888 skb_orphan(skb);
889 skb->sk = sk;
890 skb->destructor = (sysctl_tcp_limit_output_bytes > 0) ?
891 tcp_wfree : sock_wfree;
892 atomic_add(skb->truesize, &sk->sk_wmem_alloc);
893
894 /* Build TCP header and checksum it. */
895 th = tcp_hdr(skb);
896 th->source = inet->inet_sport;//设置源端口
897 th->dest = inet->inet_dport;//设置目的端口
898 th->seq = htonl(tcb->seq);//设置序列号
899 th->ack_seq = htonl(tp->rcv_nxt);//设置确认号
900 *(((__be16 *)th) + 6) = htons(((tcp_header_size >> 2) << 12) |
901 tcb->tcp_flags);//设置控制标记位
902
903 if (unlikely(tcb->tcp_flags & TCPHDR_SYN)) {
904 /* RFC1323: The window in SYN & SYN/ACK segments
905 * is never scaled.
906 */
907 th->window = htons(min(tp->rcv_wnd, 65535U));
908 } else {
909 th->window = htons(tcp_select_window(sk));
910 }
911 th->check = 0;
912 th->urg_ptr = 0;
913
914 /* The urg_mode check is necessary during a below snd_una win probe */
915 if (unlikely(tcp_urg_mode(tp) && before(tcb->seq, tp->snd_up))) {//有紧急数据且当前包的序列号小于紧急数据的下一字节的序列号
916 if (before(tp->snd_up, tcb->seq + 0x10000)) {//紧急数据下一字节的序列号与当前包的序列号的差值小于等于紧急指针的最大值(65535)
917 th->urg_ptr = htons(tp->snd_up - tcb->seq);//设置紧急指针指向紧急数据的下一字节
918 th->urg = 1;//标识紧急指针有效,即当前包的数据是紧急数据
919 } else if (after(tcb->seq + 0xFFFF, tp->snd_nxt)) {
920 th->urg_ptr = htons(0xFFFF);//设置紧急指针为最大值,以示紧急数据并不在当前包内
921 th->urg = 1;
922 }
923 }
924
925 tcp_options_write((__be32 *)(th + 1), tp, &opts);//将之前构建的选项写入SKB中
926 if (likely((tcb->tcp_flags & TCPHDR_SYN) == 0))
927 TCP_ECN_send(sk, skb, tcp_header_size);
928
929 #ifdef CONFIG_TCP_MD5SIG
930 /* Calculate the MD5 hash, as we have all we need now */
931 if (md5) {
932 sk_nocaps_add(sk, NETIF_F_GSO_MASK);
933 tp->af_specific->calc_md5_hash(opts.hash_location,
934 md5, sk, NULL, skb);
935 }
936 #endif
937
938 icsk->icsk_af_ops->send_check(sk, skb);//调用tcp_v4_send_check或tcp_v6_send_check计算TCP检验和
939
940 if (likely(tcb->tcp_flags & TCPHDR_ACK))//包中有ACK标记
941 tcp_event_ack_sent(sk, tcp_skb_pcount(skb));//可以取消延迟ACK定时器,因为即将发送的包中已经携带ACK标记
942
943 if (skb->len != tcp_header_size)
944 tcp_event_data_sent(tp, sk);
945
946 if (after(tcb->end_seq, tp->snd_nxt) || tcb->seq == tcb->end_seq)
947 TCP_ADD_STATS(sock_net(sk), TCP_MIB_OUTSEGS,
948 tcp_skb_pcount(skb));
949
950 err = icsk->icsk_af_ops->queue_xmit(skb, &inet->cork.fl);//调用ip_queue_xmit或inet6_csk_xmit构建IP头并将包发送出去
951 if (likely(err <= 0))
952 return err;
953
954 tcp_enter_cwr(sk, 1);
955
956 return net_xmit_eval(err);
957 }
919-921:紧急数据下一字节的序列号与当前包的序列号的差值大于紧急指针65535,且最新发送的数据序列与紧急数据下一字节的序列号的差值小于65535。这意味着当前包的序列号太旧,没有必要在这里设置紧急指针
923:所有序列号小于紧急数据最后一字节的序列号,且与在最新已发送数据的的序列号差值小于等于紧急指针的最大值的数据(包括重传)在发送时都会被设置紧急指针,故TCP没法设置紧急数据从何处开始,也不需要,因为紧急数据只有1字节而已,只要能通过紧急指针找到这字节数据即可
关于紧急指针的功能后续章节再讨论。
综上,TCP发送数据的系统调用会将用户指定缓冲区的内容复制到内核中,内核存储数据的结构是skb。每个skb存储数据的空间大小的固定的,即,用户进程指定的连续缓存中的数据可能会被TCP拆成多个固定长度的skb来存储并发送,这被称为“分段”。发送时一个skb会被封成一个TCP报文,进而封装成一个IP报文。skb包在放入队列时是一个一个添加到队列尾部顺序放入,发送时是从队列头开始一个一个按序发送,从而保证了发送数据的顺序与用户缓存中的顺序是一致的。发数据的系统调用返回值是“成功发送的字节数”,但实际上只是“成功放入TCP发送队列中的字节数”,放入队列的数据不一定会立即发送,但TCP会负责将这些数据可靠的发送给对端,在成功之前不会删除。也就是说,用户进程可以认为数据已经发送成功了。
TCP也可以copy大段数据到一个skb中从而减少分段,这种技术名为“TSO (TCP Segmentation Offload)”。