splice is a zero-copy data-transfer system call introduced in the Linux 2.6 kernel; it is used to move data into a pipe or out of a pipe. A similar zero-copy call is sendfile, the difference being that sendfile cannot take its input from a socket. "Zero copy" means that, unlike the traditional read/write model, no data is copied between user space and kernel space while the data is being sent. In proxy mode, forwarding socket data with the classic read/write approach looks like this:
buf = malloc(len);        // allocate a buffer of len bytes
read(sockfd1, buf, len);  // read len bytes from the first socket, sockfd1, into buf
write(sockfd2, buf, len); // send the data in buf out through sockfd2
Forwarding data this way copies it twice between the kernel socket buffer and the user-space buf (once for the read, once for the write).
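Putting that together, here is a minimal sketch of the classic forwarding loop (assumptions: sockfd1 and sockfd2 are connected TCP sockets, the helper name forward_rw is illustrative, and error handling is trimmed for brevity):

#include <unistd.h>

/* Classic read/write forwarding: every byte crosses the
 * user/kernel boundary twice. */
static void forward_rw(int sockfd1, int sockfd2)
{
        char buf[4096];        /* user-space staging buffer */
        ssize_t n;

        while ((n = read(sockfd1, buf, sizeof(buf))) > 0) {  /* copy #1: kernel -> user */
                ssize_t off = 0;
                while (off < n) {
                        ssize_t w = write(sockfd2, buf + off, n - off); /* copy #2: user -> kernel */
                        if (w < 0)
                                return;
                        off += w;
                }
        }
}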
To make data forwarding more efficient, splice came to the rescue. The prototype of the splice system call is:
ssize_t splice(int fd_in, loff_t *off_in, int fd_out,
loff_t *off_out, size_t len, unsigned int flags);
Parameters:
fd_in: the file descriptor to read data from
off_in: the starting offset for the read
fd_out: the file descriptor to write data to
off_out: the starting offset for the write
len: the number of bytes to move
flags: flag bits
One point deserves special emphasis: at least one of fd_in and fd_out must be a pipe descriptor.
The flow for forwarding data with splice is (a complete sketch follows this list):
1. Create a pipe with pipe() (or a FIFO with mkfifo).
2. splice(sockfd1, NULL, pipe_fd[1], NULL, len, flags); // move the data in sockfd1 to the write end of the pipe
3. splice(pipe_fd[0], NULL, sockfd2, NULL, len, flags); // drain the read end of the pipe out through sockfd2
Note that both offset pointers must be NULL here: as the do_splice code below shows, neither sockets nor pipes are seekable, and a non-NULL offset makes the call fail with ESPIPE.
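Here is a minimal sketch of a splice-based forwarder built on these steps (assumptions: sockfd1 and sockfd2 are connected TCP sockets, the helper name forward_splice is illustrative, and error handling is trimmed):

#define _GNU_SOURCE
#include <fcntl.h>
#include <unistd.h>

static int forward_splice(int sockfd1, int sockfd2)
{
        int pipefd[2];
        ssize_t n;

        if (pipe(pipefd) < 0)
                return -1;

        for (;;) {
                /* socket -> pipe write end: pages are linked into the pipe, not copied */
                n = splice(sockfd1, NULL, pipefd[1], NULL, 65536,
                           SPLICE_F_MOVE | SPLICE_F_MORE);
                if (n <= 0)
                        break;
                /* pipe read end -> socket: drain exactly what was spliced in */
                while (n > 0) {
                        ssize_t m = splice(pipefd[0], NULL, sockfd2, NULL, n,
                                           SPLICE_F_MOVE | SPLICE_F_MORE);
                        if (m <= 0)
                                goto out;
                        n -= m;
                }
        }
out:
        close(pipefd[0]);
        close(pipefd[1]);
        return 0;
}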
Why is splice more efficient than the ordinary read/write approach? Let's look at the code. The kernel entry for the splice system call is:
1721 SYSCALL_DEFINE6(splice, int, fd_in, loff_t __user *, off_in,
1722 int, fd_out, loff_t __user *, off_out,
1723 size_t, len, unsigned int, flags)
1724 {
1725 struct fd in, out;
1726 long error;
1727
1728 if (unlikely(!len))
1729 return 0;
1730
1731 error = -EBADF;
1732 in = fdget(fd_in);
1733 if (in.file) {
1734 if (in.file->f_mode & FMODE_READ) { //fd_in is readable
1735 out = fdget(fd_out);
1736 if (out.file) {
1737 if (out.file->f_mode & FMODE_WRITE) //fd_out is writable
1738 error = do_splice(in.file, off_in,
1739 out.file, off_out,
1740 len, flags);
1741 fdput(out);
1742 }
1743 }
1744 fdput(in);
1745 }
1746 return error;
1747 }
The do_splice function:
1324 static long do_splice(struct file *in, loff_t __user *off_in,
1325 struct file *out, loff_t __user *off_out,
1326 size_t len, unsigned int flags)
1327 {
1328 struct pipe_inode_info *ipipe;
1329 struct pipe_inode_info *opipe;
1330 loff_t offset;
1331 long ret;
1332
1333 ipipe = get_pipe_info(in);
1334 opipe = get_pipe_info(out);
1335
1336 if (ipipe && opipe) { //both in and out are pipes
...
1350 return splice_pipe_to_pipe(ipipe, opipe, len, flags);
1351 }
1352
1353 if (ipipe) { //in is a pipe
1354 if (off_in)
1355 return -ESPIPE;
1356 if (off_out) {
1357 if (!(out->f_mode & FMODE_PWRITE))
1358 return -EINVAL;
1359 if (copy_from_user(&offset, off_out, sizeof(loff_t)))
1360 return -EFAULT;
1361 } else {
1362 offset = out->f_pos;
1363 }
1364
1365 ret = do_splice_from(ipipe, out, &offset, len, flags); //read data from the pipe and send it to out
1366
1367 if (!off_out)
1368 out->f_pos = offset;
1369 else if (copy_to_user(off_out, &offset, sizeof(loff_t)))
1370 ret = -EFAULT;
1371
1372 return ret;
1373 }
1374
1375 if (opipe) { //out is a pipe
1376 if (off_out)
1377 return -ESPIPE;
1378 if (off_in) {
1379 if (!(in->f_mode & FMODE_PREAD))
1380 return -EINVAL;
1381 if (copy_from_user(&offset, off_in, sizeof(loff_t)))
1382 return -EFAULT;
1383 } else {
1384 offset = in->f_pos;
1385 }
1386
1387 ret = do_splice_to(in, &offset, opipe, len, flags); //read data from in and put it into the pipe
1388
1389 if (!off_in)
1390 in->f_pos = offset;
1391 else if (copy_to_user(off_in, &offset, sizeof(loff_t)))
1392 ret = -EFAULT;
1393
1394 return ret;
1395 }
1396
1397 return -EINVAL;
1398 }
In proxy mode one of in and out is always a socket, so they can never both be pipes, and the splice_pipe_to_pipe branch starting at line 1336 is never taken.
Let's first analyze how data is read from the socket into the pipe, which is the job of the do_splice_to function:
1127 static long do_splice_to(struct file *in, loff_t *ppos,
1128 struct pipe_inode_info *pipe, size_t len,
1129 unsigned int flags)
1130 {
1131 ssize_t (*splice_read)(struct file *, loff_t *,
1132 struct pipe_inode_info *, size_t, unsigned int);
1133 int ret;
1134
1135 if (unlikely(!(in->f_mode & FMODE_READ)))
1136 return -EBADF;
1137
1138 ret = rw_verify_area(READ, in, ppos, len);
1139 if (unlikely(ret < 0))
1140 return ret;
1141
1142 if (in->f_op && in->f_op->splice_read) //if the file is a socket, in->f_op->splice_read points to sock_splice_read
1143 splice_read = in->f_op->splice_read;
1144 else
1145 splice_read = default_file_splice_read;
1146
1147 return splice_read(in, ppos, pipe, len, flags);
1148 }
The sock_splice_read function:
871 static ssize_t sock_splice_read(struct file *file, loff_t *ppos,
872 struct pipe_inode_info *pipe, size_t len,
873 unsigned int flags)
874 {
875 struct socket *sock = file->private_data;
876
877 if (unlikely(!sock->ops->splice_read))
878 return -EINVAL;
879
880 return sock->ops->splice_read(sock, ppos, pipe, len, flags); //points to tcp_splice_read for TCP sockets
881 }
The tcp_splice_read function:
670 ssize_t tcp_splice_read(struct socket *sock, loff_t *ppos,
671 struct pipe_inode_info *pipe, size_t len,
672 unsigned int flags)
673 {
674 struct sock *sk = sock->sk;
675 struct tcp_splice_state tss = {
676 .pipe = pipe,
677 .len = len,
678 .flags = flags,
679 };
680 long timeo;
681 ssize_t spliced;
682 int ret;
683
684 sock_rps_record_flow(sk);
685 /*
686 * We can't seek on a socket input
687 */
688 if (unlikely(*ppos))
689 return -ESPIPE;
690
691 ret = spliced = 0;
692
693 lock_sock(sk);
694
695 timeo = sock_rcvtimeo(sk, sock->file->f_flags & O_NONBLOCK);
696 while (tss.len) {
697 ret = __tcp_splice_read(sk, &tss); //move data from the socket receive buffer into the pipe
698 if (ret < 0)
699 break;
700 else if (!ret) { //no data was read
701 if (spliced)
702 break;
703 if (sock_flag(sk, SOCK_DONE))
704 break;
705 if (sk->sk_err) {
706 ret = sock_error(sk);
707 break;
708 }
709 if (sk->sk_shutdown & RCV_SHUTDOWN)
710 break;
711 if (sk->sk_state == TCP_CLOSE) {
712 /*
713 * This occurs when user tries to read
714 * from never connected socket.
715 */
716 if (!sock_flag(sk, SOCK_DONE))
717 ret = -ENOTCONN;
718 break;
719 }
720 if (!timeo) { //waiting is not allowed
721 ret = -EAGAIN;
722 break;
723 }
724 sk_wait_data(sk, &timeo); //no data available yet, wait for more to arrive
725 if (signal_pending(current)) {
726 ret = sock_intr_errno(timeo);
727 break;
728 }
729 continue;
730 }
731 tss.len -= ret;
732 spliced += ret;
733
734 if (!timeo)
735 break;
736 release_sock(sk);
737 lock_sock(sk);
738
739 if (sk->sk_err || sk->sk_state == TCP_CLOSE ||
740 (sk->sk_shutdown & RCV_SHUTDOWN) ||
741 signal_pending(current))
742 break;
743 }
744
745 release_sock(sk);
746
747 if (spliced)
748 return spliced;
749
750 return ret;
751 }
The __tcp_splice_read function:
647 static int __tcp_splice_read(struct sock *sk, struct tcp_splice_state *tss)
648 {
649 /* Store TCP splice context information in read_descriptor_t. */
650 read_descriptor_t rd_desc = {
651 .arg.data = tss,
652 .count = tss->len,
653 };
654
655 return tcp_read_sock(sk, &rd_desc, tcp_splice_data_recv);
656 }
The tcp_read_sock function:
1465 int tcp_read_sock(struct sock *sk, read_descriptor_t *desc,
1466 sk_read_actor_t recv_actor)
1467 {
1468 struct sk_buff *skb;
1469 struct tcp_sock *tp = tcp_sk(sk);
1470 u32 seq = tp->copied_seq;
1471 u32 offset;
1472 int copied = 0;
1473
1474 if (sk->sk_state == TCP_LISTEN)
1475 return -ENOTCONN;
1476 while ((skb = tcp_recv_skb(sk, seq, &offset)) != NULL) { //get the first skb in the receive queue
1477 if (offset < skb->len) { //offset is how many bytes of this skb have already been copied
1478 int used;
1479 size_t len;
1480
1481 len = skb->len - offset;
1482 /* Stop reading if we hit a patch of urgent data */
1483 if (tp->urg_data) {
1484 u32 urg_offset = tp->urg_seq - seq;
1485 if (urg_offset < len)
1486 len = urg_offset;
1487 if (!len)
1488 break;
1489 }
1490 used = recv_actor(desc, skb, offset, len); //call tcp_splice_data_recv to feed the data into the pipe
1491 if (used <= 0) {
1492 if (!copied)
1493 copied = used;
1494 break;
1495 } else if (used <= len) {
1496 seq += used;
1497 copied += used;
1498 offset += used;
1499 }
1500 /* If recv_actor drops the lock (e.g. TCP splice
1501 * receive) the skb pointer might be invalid when
1502 * getting here: tcp_collapse might have deleted it
1503 * while aggregating skbs from the socket queue.
1504 */
1505 skb = tcp_recv_skb(sk, seq - 1, &offset);
1506 if (!skb)
1507 break;
1508 /* TCP coalescing might have appended data to the skb.
1509 * Try to splice more frags
1510 */
1511 if (offset + 1 != skb->len)
1512 continue;
1513 }
1514 if (tcp_hdr(skb)->fin) {
1515 sk_eat_skb(sk, skb, false);
1516 ++seq;
1517 break;
1518 }
1519 sk_eat_skb(sk, skb, false); //free the skb
1520 if (!desc->count)
1521 break;
1522 tp->copied_seq = seq;
1523 }
1524 tp->copied_seq = seq;
1525
1526 tcp_rcv_space_adjust(sk); //update receive-buffer space accounting
1527
1528 /* Clean up data we have read: This will do ACK frames. */
1529 if (copied > 0) {
1530 tcp_recv_skb(sk, seq, &offset); //free skbs whose data has been fully copied
1531 tcp_cleanup_rbuf(sk, copied); //send an ACK to update the advertised window if necessary
1532 }
1533 return copied;
1534 }
The tcp_splice_data_recv function:
634 static int tcp_splice_data_recv(read_descriptor_t *rd_desc, struct sk_buff *skb,
635 unsigned int offset, size_t len)
636 {
637 struct tcp_splice_state *tss = rd_desc->arg.data;
638 int ret;
639
640 ret = skb_splice_bits(skb, offset, tss->pipe, min(rd_desc->count, len),
641 tss->flags); //map the data in the skb's linear and paged (frag) areas into the pipe
642 if (ret > 0)
643 rd_desc->count -= ret;
644 return ret;
645 }
The skb_splice_bits function passes data to the pipe by mapping pages rather than copying bytes, which is what makes this path copy-free.
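Conceptually, the mapping works as sketched below. This is simplified pseudo-kernel code of my own, not the real implementation (which lives in skb_splice_bits and splice_to_pipe); it only illustrates how a pipe buffer can end up pointing at the very page that already holds the socket payload:

/* Conceptual sketch only, NOT the actual kernel code: link one skb data
 * page into the pipe. Instead of copying the payload, take a reference
 * on its page and record (page, offset, len) in a free pipe buffer slot. */
static void map_page_into_pipe(struct pipe_inode_info *pipe,
                               struct page *page,
                               unsigned int offset, unsigned int len)
{
        struct pipe_buffer *buf =
                pipe->bufs + ((pipe->curbuf + pipe->nrbufs) & (pipe->buffers - 1));

        get_page(page);        /* hold the page instead of copying its bytes */
        buf->page = page;      /* the pipe buffer now points at the skb's page */
        buf->offset = offset;
        buf->len = len;
        pipe->nrbufs++;
}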
The tcp_recv_skb function fetches an skb from the receive queue:
1432 static struct sk_buff *tcp_recv_skb(struct sock *sk, u32 seq, u32 *off)
1433 {
1434 struct sk_buff *skb;
1435 u32 offset;
1436
1437 while ((skb = skb_peek(&sk->sk_receive_queue)) != NULL) { //peek at the head of the queue
1438 offset = seq - TCP_SKB_CB(skb)->seq; //offset is the number of bytes already copied
1439 if (tcp_hdr(skb)->syn)
1440 offset--; //discount the one sequence number consumed by the SYN flag
1441 if (offset < skb->len || tcp_hdr(skb)->fin) { //this skb still has unread data, or carries a FIN
1442 *off = offset;
1443 return skb; //this is the one; return it
1444 }
1445 /* This looks weird, but this can happen if TCP collapsing
1446 * splitted a fat GRO packet, while we released socket lock
1447 * in skb_splice_bits()
1448 */
1449 sk_eat_skb(sk, skb, false); //the whole skb has been read; discard it
1450 }
1451 return NULL;
1452 }
At this point the splice system call has finished moving the socket's data into the pipe. Next comes sending the pipe's data out through sockfd2, which is the job of the do_splice_from function:
1096 static long do_splice_from(struct pipe_inode_info *pipe, struct file *out,
1097 loff_t *ppos, size_t len, unsigned int flags)
1098 {
1099 ssize_t (*splice_write)(struct pipe_inode_info *, struct file *,
1100 loff_t *, size_t, unsigned int);
1101 int ret;
1102
1103 if (unlikely(!(out->f_mode & FMODE_WRITE)))
1104 return -EBADF;
1105
1106 if (unlikely(out->f_flags & O_APPEND))
1107 return -EINVAL;
1108
1109 ret = rw_verify_area(WRITE, out, ppos, len);
1110 if (unlikely(ret < 0))
1111 return ret;
1112
1113 if (out->f_op && out->f_op->splice_write) //for a socket file, out->f_op->splice_write points to generic_splice_sendpage
1114 splice_write = out->f_op->splice_write;
1115 else
1116 splice_write = default_file_splice_write;
1117
1118 file_start_write(out);
1119 ret = splice_write(pipe, out, ppos, len, flags);
1120 file_end_write(out);
1121 return ret;
1122 }
The generic_splice_sendpage function:
1085 ssize_t generic_splice_sendpage(struct pipe_inode_info *pipe, struct file *out,
1086 loff_t *ppos, size_t len, unsigned int flags)
1087 {
1088 return splice_from_pipe(pipe, out, ppos, len, flags, pipe_to_sendpage);
1089 }
The splice_from_pipe function:
958 ssize_t splice_from_pipe(struct pipe_inode_info *pipe, struct file *out,
959 loff_t *ppos, size_t len, unsigned int flags,
960 splice_actor *actor)
961 {
962 ssize_t ret;
963 struct splice_desc sd = {
964 .total_len = len,
965 .flags = flags,
966 .pos = *ppos,
967 .u.file = out,
968 };
969
970 pipe_lock(pipe);
971 ret = __splice_from_pipe(pipe, &sd, actor); //the actor here is really pipe_to_sendpage
972 pipe_unlock(pipe);
973
974 return ret;
975 }
The __splice_from_pipe function:
927 ssize_t __splice_from_pipe(struct pipe_inode_info *pipe, struct splice_desc *sd,
928 splice_actor *actor)
929 {
930 int ret;
931
932 splice_from_pipe_begin(sd);
933 do {
934 ret = splice_from_pipe_next(pipe, sd); //check whether there is data to read
935 if (ret > 0)
936 ret = splice_from_pipe_feed(pipe, sd, actor); //move the pipe's data into the destination file
937 } while (ret > 0);
938 splice_from_pipe_end(pipe, sd);
939
940 return sd->num_spliced ? sd->num_spliced : ret;
941 }
The splice_from_pipe_feed function:
795 int splice_from_pipe_feed(struct pipe_inode_info *pipe, struct splice_desc *sd,
796 splice_actor *actor)
797 {
798 int ret;
799
800 while (pipe->nrbufs) { //walk every pipe buffer that holds data
801 struct pipe_buffer *buf = pipe->bufs + pipe->curbuf;
...
815 ret = actor(pipe, buf, sd); //call pipe_to_sendpage to send the data
816 if (ret <= 0)
817 return ret;
...
The pipe_to_sendpage function:
691 static int pipe_to_sendpage(struct pipe_inode_info *pipe,
692 struct pipe_buffer *buf, struct splice_desc *sd)
693 {
694 struct file *file = sd->u.file;
695 loff_t pos = sd->pos;
696 int more;
697
698 if (!likely(file->f_op && file->f_op->sendpage))
699 return -EINVAL;
700
701 more = (sd->flags & SPLICE_F_MORE) ? MSG_MORE : 0;
702
703 if (sd->len < sd->total_len && pipe->nrbufs > 1)
704 more |= MSG_SENDPAGE_NOTLAST;
705
706 return file->f_op->sendpage(file, buf->page, buf->offset,
707 sd->len, &pos, more); //for a socket this calls sock_sendpage
708 }
The sock_sendpage function:
856 static ssize_t sock_sendpage(struct file *file, struct page *page,
857 int offset, size_t size, loff_t *ppos, int more)
858 {
859 struct socket *sock;
860 int flags;
861
862 sock = file->private_data;
863
864 flags = (file->f_flags & O_NONBLOCK) ? MSG_DONTWAIT : 0;
865 /* more is a combination of MSG_MORE and MSG_SENDPAGE_NOTLAST */
866 flags |= more;
867
868 return kernel_sendpage(sock, page, offset, size, flags);
869 }
The kernel_sendpage function:
3432 int kernel_sendpage(struct socket *sock, struct page *page, int offset,
3433 size_t size, int flags)
3434 {
3435 if (sock->ops->sendpage)
3436 return sock->ops->sendpage(sock, page, offset, size, flags); //calls inet_sendpage for AF_INET sockets
3437
3438 return sock_no_sendpage(sock, page, offset, size, flags);
3439 }
The inet_sendpage function:
776 ssize_t inet_sendpage(struct socket *sock, struct page *page, int offset,
777 size_t size, int flags)
778 {
779 struct sock *sk = sock->sk;
780
781 sock_rps_record_flow(sk);
782
783 /* We may need to bind the socket. */
784 if (!inet_sk(sk)->inet_num && !sk->sk_prot->no_autobind &&
785 inet_autobind(sk))
786 return -EAGAIN;
787
788 if (sk->sk_prot->sendpage)
789 return sk->sk_prot->sendpage(sk, page, offset, size, flags); //points to tcp_sendpage for TCP
790 return sock_no_sendpage(sock, page, offset, size, flags);
791 }
The tcp_sendpage function:
944 int tcp_sendpage(struct sock *sk, struct page *page, int offset,
945 size_t size, int flags)
946 {
947 ssize_t res;
948
949 if (!(sk->sk_route_caps & NETIF_F_SG) || //NIC does not support scatter-gather I/O
950 !(sk->sk_route_caps & NETIF_F_ALL_CSUM)) //NIC does not support hardware checksumming
951 return sock_no_sendpage(sk->sk_socket, page, offset, size,
952 flags);
953
954 lock_sock(sk);
955 res = do_tcp_sendpages(sk, page, offset, size, flags);
956 release_sock(sk);
957 return res;
958 }
When the NIC supports scatter-gather I/O and hardware checksumming, tcp_sendpage calls the do_tcp_sendpages function:
827 static ssize_t do_tcp_sendpages(struct sock *sk, struct page *page, int offset,
828 size_t size, int flags)
829 {
830 struct tcp_sock *tp = tcp_sk(sk);
831 int mss_now, size_goal;
832 int err;
833 ssize_t copied;
834 long timeo = sock_sndtimeo(sk, flags & MSG_DONTWAIT);
835
836 /* Wait for a connection to finish. One exception is TCP Fast Open
837 * (passive side) where data is allowed to be sent before a connection
838 * is fully established.
839 */
840 if (((1 << sk->sk_state) & ~(TCPF_ESTABLISHED | TCPF_CLOSE_WAIT)) &&
841 !tcp_passive_fastopen(sk)) {
842 if ((err = sk_stream_wait_connect(sk, &timeo)) != 0)
843 goto out_err;
844 }
845
846 clear_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags);
847
848 mss_now = tcp_send_mss(sk, &size_goal, flags);
849 copied = 0;
850
851 err = -EPIPE;
852 if (sk->sk_err || (sk->sk_shutdown & SEND_SHUTDOWN))
853 goto out_err;
854
855 while (size > 0) {
856 struct sk_buff *skb = tcp_write_queue_tail(sk);
857 int copy, i;
858 bool can_coalesce;
859
860 if (!tcp_send_head(sk) || (copy = size_goal - skb->len) <= 0) {
861 new_segment:
862 if (!sk_stream_memory_free(sk))
863 goto wait_for_sndbuf;
864
865 skb = sk_stream_alloc_skb(sk, 0, sk->sk_allocation); //allocate an skb with a zero-length linear area; all data will live in the paged (frag) area
866 if (!skb)
867 goto wait_for_memory;
868
869 skb_entail(sk, skb);
870 copy = size_goal;
871 }
872
873 if (copy > size)
874 copy = size;
875
876 i = skb_shinfo(skb)->nr_frags;
877 can_coalesce = skb_can_coalesce(skb, i, page, offset);
878 if (!can_coalesce && i >= MAX_SKB_FRAGS) {
879 tcp_mark_push(tp, skb);
880 goto new_segment;
881 }
882 if (!sk_wmem_schedule(sk, copy))
883 goto wait_for_memory;
884
885 if (can_coalesce) { //the page is already the skb's last frag and the new data is contiguous with it
886 skb_frag_size_add(&skb_shinfo(skb)->frags[i - 1], copy); //just grow that frag's size
887 } else {
888 get_page(page);
889 skb_fill_page_desc(skb, i, page, offset, copy); //put the pipe's page directly into the skb's frag array; no data copy needed
890 }
891 skb_shinfo(skb)->tx_flags |= SKBTX_SHARED_FRAG;
892
893 skb->len += copy;
894 skb->data_len += copy;
895 skb->truesize += copy;
896 sk->sk_wmem_queued += copy;
897 sk_mem_charge(sk, copy);
898 skb->ip_summed = CHECKSUM_PARTIAL;
899 tp->write_seq += copy;
900 TCP_SKB_CB(skb)->end_seq += copy;
901 skb_shinfo(skb)->gso_segs = 0;
902
903 if (!copied)
904 TCP_SKB_CB(skb)->tcp_flags &= ~TCPHDR_PSH;
905
906 copied += copy;
907 offset += copy;
908 if (!(size -= copy))
909 goto out;
910
911 if (skb->len < size_goal || (flags & MSG_OOB))
912 continue;
913
914 if (forced_push(tp)) {
915 tcp_mark_push(tp, skb);
916 __tcp_push_pending_frames(sk, mss_now, TCP_NAGLE_PUSH);
917 } else if (skb == tcp_send_head(sk))
918 tcp_push_one(sk, mss_now);
919 continue;
920
921 wait_for_sndbuf:
922 set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
923 wait_for_memory:
924 tcp_push(sk, flags & ~MSG_MORE, mss_now, TCP_NAGLE_PUSH);
925
926 if ((err = sk_stream_wait_memory(sk, &timeo)) != 0)
927 goto do_error;
928
929 mss_now = tcp_send_mss(sk, &size_goal, flags);
930 }
931
932 out:
933 if (copied && !(flags & MSG_SENDPAGE_NOTLAST))
934 tcp_push(sk, flags, mss_now, tp->nonagle);
935 return copied;
936
937 do_error:
938 if (copied)
939 goto out;
940 out_err:
941 return sk_stream_error(sk, flags, err);
942 }
As we can see, do_tcp_sendpages places the pipe's pages directly into the skb's frag array and lets the NIC's scatter-gather I/O send them, avoiding any data copy.
If the NIC does not support scatter-gather I/O or does not support hardware checksumming, the pages in the skb frag array cannot be sent directly; in that case the data is sent with the sock_no_sendpage function:
2116 ssize_t sock_no_sendpage(struct socket *sock, struct page *page, int offset, size_t size, int flags)
2117 {
2118 ssize_t res;
2119 struct msghdr msg = {.msg_flags = flags};
2120 struct kvec iov;
2121 char *kaddr = kmap(page);
2122 iov.iov_base = kaddr + offset;
2123 iov.iov_len = size;
2124 res = kernel_sendmsg(sock, &msg, &iov, 1, size);
2125 kunmap(page);
2126 return res;
2127 }
This function treats the page as ordinary data and sends it with kernel_sendmsg, which ultimately reaches tcp_sendmsg and copies the page's data into an skb. In other words, when the NIC does not support scatter-gather I/O or hardware checksumming, the splice system call incurs one data copy.
At this point the whole TCP splice flow is clear: forwarding one chunk of data with splice costs two system calls and either zero data copies (when the NIC supports scatter-gather I/O and hardware checksumming) or one (when it does not), saving one to two copies compared with ordinary forwarding. However, the technique needs two splice calls per chunk plus a pipe as a staging area, which limits the efficiency gain. Being able to move data directly between two TCP sockets, achieving true zero copy, would be truly exciting.
The splice machinery can also make sending files more efficient. Ordinarily, sending a file's data works like this:
(1) read the file's data with the read system call; this involves two data copies (a DMA transfer from the device into the kernel cache, then a copy from the kernel cache to the user-space buffer) and two context switches;
(2) send the user-space buffer with write, send, sendmsg, or the like; this involves two more data copies (from the user-space buffer into the kernel socket buffer, then a DMA transfer from the socket buffer out to the device) and two more context switches.
Sending the data once therefore costs four data copies and four context switches.
With sendfile the flow becomes:
(1) switch to kernel mode;
(2) DMA the file's data from the device into the kernel cache;
(3) hand the kernel-cache pages over to the socket buffer, with zero copy;
(4) DMA the data in the socket buffer out to the device;
(5) switch back to user mode.
Sending the data once costs two data copies and two context switches. Because sendfile completes the transfer entirely in kernel mode, it greatly reduces overhead and improves performance.
The sendfile system call prototype (a minimal usage sketch follows):
ssize_t sendfile(int out_fd, int in_fd, off_t *offset, size_t count);
Here out_fd is the descriptor of the file being written to and in_fd is the descriptor of the file being read from. in_fd must not be a socket; out_fd may be either a regular file or a socket.
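Here is a minimal sketch of sending a whole file over a connected socket with sendfile (assumptions: sockfd is a connected TCP socket, path names a regular file, the helper name send_file is illustrative, and error handling is trimmed):

#include <sys/sendfile.h>
#include <sys/stat.h>
#include <fcntl.h>
#include <unistd.h>

static int send_file(int sockfd, const char *path)
{
        struct stat st;
        off_t off = 0;
        int fd = open(path, O_RDONLY);

        if (fd < 0)
                return -1;
        if (fstat(fd, &st) < 0) {
                close(fd);
                return -1;
        }
        while (off < st.st_size) {
                /* sendfile advances off by the number of bytes it sent */
                ssize_t n = sendfile(sockfd, fd, &off, st.st_size - off);
                if (n <= 0)
                        break;
        }
        close(fd);
        return off == st.st_size ? 0 : -1;
}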
The kernel code behind the sendfile system call is sys_sendfile, which calls do_sendfile:
1061 static ssize_t do_sendfile(int out_fd, int in_fd, loff_t *ppos,
1062 size_t count, loff_t max)
1063 {
1064 struct fd in, out;
1065 struct inode *in_inode, *out_inode;
1066 loff_t pos;
1067 loff_t out_pos;
1068 ssize_t retval;
1069 int fl;
1070
1071 /*
1072 * Get input file, and verify that it is ok..
1073 */
1074 retval = -EBADF;
1075 in = fdget(in_fd);
1076 if (!in.file)
1077 goto out;
1078 if (!(in.file->f_mode & FMODE_READ))
1079 goto fput_in;
1080 retval = -ESPIPE;
1081 if (!ppos) {
1082 pos = in.file->f_pos;
1083 } else {
1084 pos = *ppos;
1085 if (!(in.file->f_mode & FMODE_PREAD))
1086 goto fput_in;
1087 }
1088 retval = rw_verify_area(READ, in.file, &pos, count); //compute how many bytes can be read
1089 if (retval < 0)
1090 goto fput_in;
1091 count = retval;
1092
1093 /*
1094 * Get output file, and verify that it is ok..
1095 */
1096 retval = -EBADF;
1097 out = fdget(out_fd);
1098 if (!out.file)
1099 goto fput_in;
1100 if (!(out.file->f_mode & FMODE_WRITE))
1101 goto fput_out;
1102 retval = -EINVAL;
1103 in_inode = file_inode(in.file);
1104 out_inode = file_inode(out.file);
1105 out_pos = out.file->f_pos;
1106 retval = rw_verify_area(WRITE, out.file, &out_pos, count); //compute how many bytes can be written
1107 if (retval < 0)
1108 goto fput_out;
1109 count = retval;
1110
1111 if (!max)
1112 max = min(in_inode->i_sb->s_maxbytes, out_inode->i_sb->s_maxbytes);
1113
1114 if (unlikely(pos + count > max)) {
1115 retval = -EOVERFLOW;
1116 if (pos >= max)
1117 goto fput_out;
1118 count = max - pos;
1119 }
1120
1121 fl = 0;
1122 #if 0
1123 /*
1124 * We need to debate whether we can enable this or not. The
1125 * man page documents EAGAIN return for the output at least,
1126 * and the application is arguably buggy if it doesn't expect
1127 * EAGAIN on a non-blocking file descriptor.
1128 */
1129 if (in.file->f_flags & O_NONBLOCK)
1130 fl = SPLICE_F_NONBLOCK;
1131 #endif
1132 retval = do_splice_direct(in.file, &pos, out.file, &out_pos, count, fl); //read data from the source file and write it to the destination
1133
1134 if (retval > 0) {
1135 add_rchar(current, retval);
1136 add_wchar(current, retval);
1137 fsnotify_access(in.file);
1138 fsnotify_modify(out.file);
1139 out.file->f_pos = out_pos;
1140 if (ppos)
1141 *ppos = pos;
1142 else
1143 in.file->f_pos = pos;
1144 }
1145
1146 inc_syscr(current);
1147 inc_syscw(current);
1148 if (pos > max)
1149 retval = -EOVERFLOW;
1150
1151 fput_out:
1152 fdput(out);
1153 fput_in:
1154 fdput(in);
1155 out:
1156 return retval;
1157 }
1158
1159 SYSCALL_DEFINE4(sendfile, int, out_fd, int, in_fd, off_t __user *, offset, size_t, count)
1160 {
1161 loff_t pos;
1162 off_t off;
1163 ssize_t ret;
1164
1165 if (offset) { //an offset was specified
1166 if (unlikely(get_user(off, offset)))
1167 return -EFAULT;
1168 pos = off;
1169 ret = do_sendfile(out_fd, in_fd, &pos, count, MAX_NON_LFS); //read data from the given offset and send it
1170 if (unlikely(put_user(pos, offset)))
1171 return -EFAULT;
1172 return ret;
1173 }
1174
1175 return do_sendfile(out_fd, in_fd, NULL, count, 0); //read data from the file's current offset and send it
1176 }
The core function is do_splice_direct:
1297 long do_splice_direct(struct file *in, loff_t *ppos, struct file *out,
1298 loff_t *opos, size_t len, unsigned int flags)
1299 {
1300 struct splice_desc sd = {
1301 .len = len,
1302 .total_len = len,
1303 .flags = flags,
1304 .pos = *ppos,
1305 .u.file = out,
1306 .opos = opos,
1307 };
1308 long ret;
1309
1310 ret = splice_direct_to_actor(in, &sd, direct_splice_actor); //pass direct_splice_actor in as the actor
1311 if (ret > 0)
1312 *ppos = sd.pos;
1313
1314 return ret;
1315 }
On the read side sendfile uses the do_splice_to function; since in_fd cannot be a socket here, that path never reaches the socket read method, so we skip its analysis. The work is driven by splice_direct_to_actor:
1163 ssize_t splice_direct_to_actor(struct file *in, struct splice_desc *sd,
1164 splice_direct_actor *actor)
1165 {
1166 struct pipe_inode_info *pipe;
1167 long ret, bytes;
1168 umode_t i_mode;
1169 size_t len;
1170 int i, flags;
...
1185 pipe = current->splice_pipe;
1186 if (unlikely(!pipe)) {
1187 pipe = alloc_pipe_info(); //allocate a pipe to stage the data
...
1196 pipe->readers = 1;
1197
1198 current->splice_pipe = pipe;
1199 }
1200
1201 /*
1202 * Do the splice.
1203 */
1204 ret = 0;
1205 bytes = 0;
1206 len = sd->total_len;
1207 flags = sd->flags;
1208
1209 /*
1210 * Don't block on output, we have to drain the direct pipe.
1211 */
1212 sd->flags &= ~SPLICE_F_NONBLOCK;
1213
1214 while (len) {
1215 size_t read_len;
1216 loff_t pos = sd->pos, prev_pos = pos;
1217
1218 ret = do_splice_to(in, &pos, pipe, len, flags); //read data into the pipe
1219 if (unlikely(ret <= 0))
1220 goto out_release;
1221
1222 read_len = ret;
1223 sd->total_len = read_len;
1224
1225 /*
1226 * NOTE: nonblocking mode only applies to the input. We
1227 * must not do the output in nonblocking mode as then we
1228 * could get stuck data in the internal pipe:
1229 */
1230 ret = actor(pipe, sd); //call direct_splice_actor to send the data
...
1245
1246 done:
1247 pipe->nrbufs = pipe->curbuf = 0;
1248 file_accessed(in);
1249 return bytes;
...
The write side is the direct_splice_actor function:
1272 static int direct_splice_actor(struct pipe_inode_info *pipe,
1273 struct splice_desc *sd)
1274 {
1275 struct file *file = sd->u.file;
1276
1277 return do_splice_from(pipe, file, sd->opos, sd->total_len,
1278 sd->flags);
1279 }
As we can see, this function simply calls do_splice_from; from here on, the flow is identical to that of the splice system call.
In summary, sendfile DMAs the file's data into the pipe's pages, and on the send side those pages are hung directly onto the destination file's buffers (for a socket file, the skb's frag array) before being DMAed out to the device, which cuts out data copies and improves performance. If the NIC supports scatter-gather I/O and hardware checksumming, the data in the skb frags can be sent out directly, avoiding the overhead of reassembling the frags in software; otherwise, just as with the splice system call, there is one extra data copy.