4.3 TCP Splice

  splice is a zero-copy data-transfer function added in the Linux 2.6 kernel; it is used to move data into a pipe or out of a pipe. A similar zero-copy function is sendfile, the difference being that sendfile cannot read the data it sends out of a socket. "Zero copy" means that, in contrast to the traditional read/write model, no data is copied between user space and kernel space during the transfer. In proxy mode, forwarding socket data with the classic read/write approach works like this:

buf = malloc(len);        // allocate a buffer of len bytes
read(sockfd1, buf, len);  // read len bytes from the first socket, sockfd1, into buf
write(sockfd2, buf, len); // send the data in buf out through sockfd2

  Forwarding data this way copies it twice between the kernel-space socket buffer and the user-space buf: once for the read and once for the write.

  To make data forwarding more efficient, splice was introduced. The prototype of the splice system call is:

ssize_t splice(int fd_in, loff_t *off_in, int fd_out,
               loff_t *off_out, size_t len, unsigned int flags);
  Parameters:

  fd_in: the file descriptor to read data from

  off_in: the starting offset of the data to read

  fd_out: the file descriptor to write data to

  off_out: the starting offset of the data to write

  len: the number of bytes to transfer

  flags: flag bits (a combination of SPLICE_F_MOVE, SPLICE_F_NONBLOCK, SPLICE_F_MORE and SPLICE_F_GIFT)

  One point deserves special emphasis: at least one of fd_in and fd_out must be a pipe descriptor.

  The flow for forwarding data with splice is:

1. Call pipe (or mkfifo) to create a pipe

2. splice(sockfd1, NULL, pipe_fd_w, NULL, len, 0)  // move data from sockfd1 into the write end of the pipe

3. splice(pipe_fd_r, NULL, sockfd2, NULL, len, 0)  // drain the read end of the pipe, sending the data out through sockfd2

Note that every offset argument is NULL: as the kernel code below shows, splice returns -ESPIPE when an offset is supplied for a pipe, and sockets are not seekable either. A minimal forwarding loop built on these steps is sketched right after this list.
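
  Putting the three steps together, a minimal user-space sketch of a splice-based forwarding loop might look as follows (illustrative only, not from the original text: splice_forward is a hypothetical helper, sockfd1 and sockfd2 are assumed to be connected TCP sockets, and EAGAIN/partial-transfer handling is trimmed):

#define _GNU_SOURCE
#include <fcntl.h>
#include <unistd.h>

/* Forward data from sockfd1 to sockfd2 through a pipe, 64 KiB at a time.
 * Returns 0 when sockfd1 reaches EOF, -1 on error. */
static int splice_forward(int sockfd1, int sockfd2)
{
    int pipefd[2];
    int ret = 0;

    if (pipe(pipefd) < 0)
        return -1;

    for (;;) {
        /* Offsets must be NULL: neither sockets nor pipes are seekable. */
        ssize_t n = splice(sockfd1, NULL, pipefd[1], NULL,
                           65536, SPLICE_F_MOVE | SPLICE_F_MORE);
        if (n == 0)
            break;                          /* peer closed the connection */
        if (n < 0) {
            ret = -1;
            break;
        }
        /* Drain exactly the bytes that were just put into the pipe. */
        while (n > 0) {
            ssize_t m = splice(pipefd[0], NULL, sockfd2, NULL,
                               n, SPLICE_F_MOVE | SPLICE_F_MORE);
            if (m <= 0) {
                ret = -1;
                goto out;
            }
            n -= m;
        }
    }
out:
    close(pipefd[0]);
    close(pipefd[1]);
    return ret;
}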

  Why is splice more efficient than the plain read/write approach? Let's examine the code. The kernel entry for the splice system call is:

1721 SYSCALL_DEFINE6(splice, int, fd_in, loff_t __user *, off_in,
1722         int, fd_out, loff_t __user *, off_out,
1723         size_t, len, unsigned int, flags)
1724 {
1725     struct fd in, out;   
1726     long error;
1727 
1728     if (unlikely(!len))  
1729         return 0;        
1730 
1731     error = -EBADF;      
1732     in = fdget(fd_in);
1733     if (in.file) {
1734         if (in.file->f_mode & FMODE_READ) { // fd_in is readable
1735             out = fdget(fd_out);           
1736             if (out.file) {
1737                 if (out.file->f_mode & FMODE_WRITE) // fd_out is writable
1738                     error = do_splice(in.file, off_in,
1739                               out.file, off_out,             
1740                               len, flags);
1741                 fdput(out);
1742             }
1743         }
1744         fdput(in);
1745     }
1746     return error;
1747 }
  The do_splice function:
1324 static long do_splice(struct file *in, loff_t __user *off_in,
1325               struct file *out, loff_t __user *off_out,
1326               size_t len, unsigned int flags)
1327 {
1328     struct pipe_inode_info *ipipe;
1329     struct pipe_inode_info *opipe;
1330     loff_t offset;
1331     long ret;
1332 
1333     ipipe = get_pipe_info(in);
1334     opipe = get_pipe_info(out);
1335 
1336     if (ipipe && opipe) {  // both in and out are pipes
... 
1350         return splice_pipe_to_pipe(ipipe, opipe, len, flags);
1351     }
1352 
1353     if (ipipe) {  // in is a pipe
1354         if (off_in)
1355             return -ESPIPE;
1356         if (off_out) {
1357             if (!(out->f_mode & FMODE_PWRITE))
1358                 return -EINVAL;
1359             if (copy_from_user(&offset, off_out, sizeof(loff_t)))
1360                 return -EFAULT;
1361         } else {
1362             offset = out->f_pos;
1363         }
1364 
1365         ret = do_splice_from(ipipe, out, &offset, len, flags); // read data from the pipe and send it to out
1366 
1367         if (!off_out)
1368             out->f_pos = offset;
1369         else if (copy_to_user(off_out, &offset, sizeof(loff_t)))
1370             ret = -EFAULT;
1371 
1372         return ret;
1373     }
1374 
1375     if (opipe) {  // out is a pipe
1376         if (off_out)
1377             return -ESPIPE;
1378         if (off_in) {
1379             if (!(in->f_mode & FMODE_PREAD))
1380                 return -EINVAL;
1381             if (copy_from_user(&offset, off_in, sizeof(loff_t)))
1382                 return -EFAULT;
1383         } else {
1384             offset = in->f_pos;
1385         }
1386 
1387         ret = do_splice_to(in, &offset, opipe, len, flags); // read data from in into the pipe
1388 
1389         if (!off_in)
1390             in->f_pos = offset;
1391         else if (copy_to_user(off_in, &offset, sizeof(loff_t)))
1392             ret = -EFAULT;
1393 
1394         return ret;
1395     }
1396 
1397     return -EINVAL;
1398 }
  In proxy mode one of in and out must be a socket, so the branch at line 1336 (both ends are pipes) is never taken. Let's first look at reading data from the socket into the pipe, which is done by the do_splice_to function:
1127 static long do_splice_to(struct file *in, loff_t *ppos,
1128              struct pipe_inode_info *pipe, size_t len,
1129              unsigned int flags)
1130 {
1131     ssize_t (*splice_read)(struct file *, loff_t *,
1132                    struct pipe_inode_info *, size_t, unsigned int);
1133     int ret;
1134
1135     if (unlikely(!(in->f_mode & FMODE_READ)))
1136         return -EBADF;   
1137
1138     ret = rw_verify_area(READ, in, ppos, len);
1139     if (unlikely(ret < 0))
1140         return ret;      
1141
1142     if (in->f_op && in->f_op->splice_read) // if the file is a socket, in->f_op->splice_read points to sock_splice_read
1143         splice_read = in->f_op->splice_read;
1144     else
1145         splice_read = default_file_splice_read;
1146
1147     return splice_read(in, ppos, pipe, len, flags);
1148 }
  The sock_splice_read function:
 871 static ssize_t sock_splice_read(struct file *file, loff_t *ppos,
 872                 struct pipe_inode_info *pipe, size_t len,
 873                 unsigned int flags)
 874 {
 875     struct socket *sock = file->private_data;
 876
 877     if (unlikely(!sock->ops->splice_read))
 878         return -EINVAL;  
 879
 880     return sock->ops->splice_read(sock, ppos, pipe, len, flags); // points to tcp_splice_read
 881 }
  The tcp_splice_read function:
 670 ssize_t tcp_splice_read(struct socket *sock, loff_t *ppos,
 671             struct pipe_inode_info *pipe, size_t len,
 672             unsigned int flags)
 673 {   
 674     struct sock *sk = sock->sk;
 675     struct tcp_splice_state tss = {
 676         .pipe = pipe,
 677         .len = len,
 678         .flags = flags,
 679     };
 680     long timeo;    
 681     ssize_t spliced;
 682     int ret;
 683     
 684     sock_rps_record_flow(sk);
 685     /*
 686      * We can't seek on a socket input  
 687      */
 688     if (unlikely(*ppos))
 689         return -ESPIPE;
 690
 691     ret = spliced = 0;
 692     
 693     lock_sock(sk);
 694
 695     timeo = sock_rcvtimeo(sk, sock->file->f_flags & O_NONBLOCK);
 696     while (tss.len) {
 697         ret = __tcp_splice_read(sk, &tss); // move data from the socket receive buffer into the pipe
 698         if (ret < 0)
 699             break;
 700         else if (!ret) { // no data was read
 701             if (spliced)
 702                 break;
 703             if (sock_flag(sk, SOCK_DONE))
 704                 break;
 705             if (sk->sk_err) {
 706                 ret = sock_error(sk);
 707                 break;
 708             }
 709             if (sk->sk_shutdown & RCV_SHUTDOWN)
 710                 break;
 711             if (sk->sk_state == TCP_CLOSE) {
 712                 /*
 713                  * This occurs when user tries to read
 714                  * from never connected socket.
 715                  */
 716                 if (!sock_flag(sk, SOCK_DONE))
 717                     ret = -ENOTCONN;
 718                 break;
 719             }
 720             if (!timeo) { // waiting is not allowed
 721                 ret = -EAGAIN;
 722                 break;
 723             }
 724             sk_wait_data(sk, &timeo); // no data available yet, wait for more to arrive
 725             if (signal_pending(current)) {
 726                 ret = sock_intr_errno(timeo);
 727                 break;
 728             }
 729             continue;
 730         }
 731         tss.len -= ret;
 732         spliced += ret;
 733
 734         if (!timeo)
 735             break;
 736         release_sock(sk);
 737         lock_sock(sk);
 738
 739         if (sk->sk_err || sk->sk_state == TCP_CLOSE ||
 740             (sk->sk_shutdown & RCV_SHUTDOWN) ||
 741             signal_pending(current))
 742             break;
 743     }
 744
 745     release_sock(sk);
 746
 747     if (spliced)
 748         return spliced;
 749
 750     return ret;
 751 }
  The __tcp_splice_read function:
 647 static int __tcp_splice_read(struct sock *sk, struct tcp_splice_state *tss)
 648 {
 649     /* Store TCP splice context information in read_descriptor_t. */
 650     read_descriptor_t rd_desc = {      
 651         .arg.data = tss,
 652         .count    = tss->len,
 653     };
 654
 655     return tcp_read_sock(sk, &rd_desc, tcp_splice_data_recv);
 656 }
  The tcp_read_sock function:
1465 int tcp_read_sock(struct sock *sk, read_descriptor_t *desc,
1466           sk_read_actor_t recv_actor)
1467 {
1468     struct sk_buff *skb;
1469     struct tcp_sock *tp = tcp_sk(sk);
1470     u32 seq = tp->copied_seq;
1471     u32 offset;
1472     int copied = 0;
1473
1474     if (sk->sk_state == TCP_LISTEN)
1475         return -ENOTCONN;
1476     while ((skb = tcp_recv_skb(sk, seq, &offset)) != NULL) { // get the first skb of the receive queue
1477         if (offset < skb->len) { // offset is the number of bytes of this skb already copied
1478             int used;
1479             size_t len;
1480
1481             len = skb->len - offset;
1482             /* Stop reading if we hit a patch of urgent data */
1483             if (tp->urg_data) {
1484                 u32 urg_offset = tp->urg_seq - seq;
1485                 if (urg_offset < len)
1486                     len = urg_offset;
1487                 if (!len)
1488                     break;
1489             }
1490             used = recv_actor(desc, skb, offset, len); // call tcp_splice_data_recv to feed the data into the pipe
1491             if (used <= 0) {
1492                 if (!copied)
1493                     copied = used;
1494                 break;
1495             } else if (used <= len) {
1496                 seq += used;
1497                 copied += used;
1498                 offset += used;
1499             }
1500             /* If recv_actor drops the lock (e.g. TCP splice
1501              * receive) the skb pointer might be invalid when
1502              * getting here: tcp_collapse might have deleted it
1503              * while aggregating skbs from the socket queue.
1504              */
1505             skb = tcp_recv_skb(sk, seq - 1, &offset);
1506             if (!skb)
1507                 break;
1508             /* TCP coalescing might have appended data to the skb.
1509              * Try to splice more frags
1510              */
1511             if (offset + 1 != skb->len)
1512                 continue;
1513         }
1514         if (tcp_hdr(skb)->fin) {
1515             sk_eat_skb(sk, skb, false);
1516             ++seq;
1517             break;
1518         }
1519         sk_eat_skb(sk, skb, false); // free the skb
1520         if (!desc->count)
1521             break;
1522         tp->copied_seq = seq;
1523     }
1524     tp->copied_seq = seq;
1525
1526     tcp_rcv_space_adjust(sk); // update receive buffer space accounting
1527
1528     /* Clean up data we have read: This will do ACK frames. */
1529     if (copied > 0) {
1530         tcp_recv_skb(sk, seq, &offset); // free skbs whose data has been fully consumed
1531         tcp_cleanup_rbuf(sk, copied); // send an ACK to update the advertised window if necessary
1532     }
1533     return copied;
1534 }
  The tcp_splice_data_recv function:
 634 static int tcp_splice_data_recv(read_descriptor_t *rd_desc, struct sk_buff *skb,
 635                 unsigned int offset, size_t len)
 636 {
 637     struct tcp_splice_state *tss = rd_desc->arg.data;
 638     int ret;
 639
 640     ret = skb_splice_bits(skb, offset, tss->pipe, min(rd_desc->count, len),
 641                   tss->flags); // map data from the skb's linear and paged (frag) areas into the pipe
 642     if (ret > 0)
 643         rd_desc->count -= ret;
 644     return ret;
 645 }
  The skb_splice_bits function passes data to the pipe by mapping pages instead of copying them, which is how the copy is avoided.

  The tcp_recv_skb function fetches an skb from the receive queue:

1432 static struct sk_buff *tcp_recv_skb(struct sock *sk, u32 seq, u32 *off)                                                                    
1433 {
1434     struct sk_buff *skb; 
1435     u32 offset;
1436 
1437     while ((skb = skb_peek(&sk->sk_receive_queue)) != NULL) { // peek at the head of the queue
1438         offset = seq - TCP_SKB_CB(skb)->seq; // offset is the number of bytes already copied
1439         if (tcp_hdr(skb)->syn) 
1440             offset--;    // subtract the one sequence number consumed by the SYN flag
1441         if (offset < skb->len || tcp_hdr(skb)->fin) { // this skb still has unread data, or carries a FIN
1442             *off = offset;
1443             return skb;    // this is the one, return it
1444         }
1445         /* This looks weird, but this can happen if TCP collapsing
1446          * splitted a fat GRO packet, while we released socket lock                                                                        
1447          * in skb_splice_bits()
1448          */              
1449         sk_eat_skb(sk, skb, false); // the whole skb has been consumed, discard it
1450     }
1451     return NULL;         
1452 }
  At this point, the splice call's task of reading data from the socket into the pipe is complete. Next, the data in the pipe is sent out through sockfd2; that is the job of the do_splice_from function:
1096 static long do_splice_from(struct pipe_inode_info *pipe, struct file *out,
1097                loff_t *ppos, size_t len, unsigned int flags)
1098 {
1099     ssize_t (*splice_write)(struct pipe_inode_info *, struct file *,
1100                 loff_t *, size_t, unsigned int);
1101     int ret;
1102
1103     if (unlikely(!(out->f_mode & FMODE_WRITE)))
1104         return -EBADF;   
1105
1106     if (unlikely(out->f_flags & O_APPEND))
1107         return -EINVAL;  
1108
1109     ret = rw_verify_area(WRITE, out, ppos, len);
1110     if (unlikely(ret < 0))
1111         return ret;      
1112
1113     if (out->f_op && out->f_op->splice_write) // for a socket-type file, out->f_op->splice_write points to generic_splice_sendpage
1114         splice_write = out->f_op->splice_write;
1115     else
1116         splice_write = default_file_splice_write;
1117
1118     file_start_write(out);
1119     ret = splice_write(pipe, out, ppos, len, flags);
1120     file_end_write(out);
1121     return ret;
1122 }
  The generic_splice_sendpage function:
1085 ssize_t generic_splice_sendpage(struct pipe_inode_info *pipe, struct file *out,
1086                 loff_t *ppos, size_t len, unsigned int flags)
1087 {
1088     return splice_from_pipe(pipe, out, ppos, len, flags, pipe_to_sendpage);
1089 }
  The splice_from_pipe function:
 958 ssize_t splice_from_pipe(struct pipe_inode_info *pipe, struct file *out,
 959              loff_t *ppos, size_t len, unsigned int flags,
 960              splice_actor *actor)           
 961 {
 962     ssize_t ret;
 963     struct splice_desc sd = {
 964         .total_len = len,
 965         .flags = flags,  
 966         .pos = *ppos,
 967         .u.file = out,
 968     };
 969
 970     pipe_lock(pipe);
 971     ret = __splice_from_pipe(pipe, &sd, actor); // here the actor is really pipe_to_sendpage
 972     pipe_unlock(pipe);
 973
 974     return ret;
 975 }
  The __splice_from_pipe function:
 927 ssize_t __splice_from_pipe(struct pipe_inode_info *pipe, struct splice_desc *sd,
 928                splice_actor *actor)
 929 {
 930     int ret;
 931
 932     splice_from_pipe_begin(sd);
 933     do {
 934         ret = splice_from_pipe_next(pipe, sd); // check whether there is data to read
 935         if (ret > 0)
 936             ret = splice_from_pipe_feed(pipe, sd, actor); // move data from the pipe into the destination file
 937     } while (ret > 0);
 938     splice_from_pipe_end(pipe, sd);
 939
 940     return sd->num_spliced ? sd->num_spliced : ret;
 941 }
  The splice_from_pipe_feed function:
 795 int splice_from_pipe_feed(struct pipe_inode_info *pipe, struct splice_desc *sd,
 796               splice_actor *actor)
 797 {   
 798     int ret;
 799     
 800     while (pipe->nrbufs) { // walk every pipe buffer that holds data
 801         struct pipe_buffer *buf = pipe->bufs + pipe->curbuf;
 ...
 815         ret = actor(pipe, buf, sd); // call pipe_to_sendpage to send the data
 816         if (ret <= 0)
 817             return ret;
 ...
  The pipe_to_sendpage function:

 691 static int pipe_to_sendpage(struct pipe_inode_info *pipe,
 692                 struct pipe_buffer *buf, struct splice_desc *sd)
 693 {
 694     struct file *file = sd->u.file;
 695     loff_t pos = sd->pos;
 696     int more;
 697
 698     if (!likely(file->f_op && file->f_op->sendpage))
 699         return -EINVAL;
 700    
 701     more = (sd->flags & SPLICE_F_MORE) ? MSG_MORE : 0;
 702    
 703     if (sd->len < sd->total_len && pipe->nrbufs > 1)
 704         more |= MSG_SENDPAGE_NOTLAST;
 705    
 706     return file->f_op->sendpage(file, buf->page, buf->offset,
 707                     sd->len, &pos, more); // for a socket this calls sock_sendpage
 708 }  
  The sock_sendpage function:
 856 static ssize_t sock_sendpage(struct file *file, struct page *page,
 857                  int offset, size_t size, loff_t *ppos, int more)
 858 {
 859     struct socket *sock;
 860     int flags;
 861
 862     sock = file->private_data;
 863
 864     flags = (file->f_flags & O_NONBLOCK) ? MSG_DONTWAIT : 0;
 865     /* more is a combination of MSG_MORE and MSG_SENDPAGE_NOTLAST */
 866     flags |= more;
 867
 868     return kernel_sendpage(sock, page, offset, size, flags);
 869 }
  The kernel_sendpage function:
3432 int kernel_sendpage(struct socket *sock, struct page *page, int offset,
3433             size_t size, int flags)
3434 {
3435     if (sock->ops->sendpage)
3436         return sock->ops->sendpage(sock, page, offset, size, flags); // calls inet_sendpage
3437
3438     return sock_no_sendpage(sock, page, offset, size, flags);
3439 }   
  The inet_sendpage function:
 776 ssize_t inet_sendpage(struct socket *sock, struct page *page, int offset,
 777               size_t size, int flags)
 778 {
 779     struct sock *sk = sock->sk;
 780     
 781     sock_rps_record_flow(sk);
 782
 783     /* We may need to bind the socket. */
 784     if (!inet_sk(sk)->inet_num && !sk->sk_prot->no_autobind &&
 785         inet_autobind(sk))
 786         return -EAGAIN;
 787     
 788     if (sk->sk_prot->sendpage)
 789         return sk->sk_prot->sendpage(sk, page, offset, size, flags); // points to tcp_sendpage
 790     return sock_no_sendpage(sock, page, offset, size, flags);
 791 }   
  The tcp_sendpage function:
 944 int tcp_sendpage(struct sock *sk, struct page *page, int offset,
 945          size_t size, int flags)
 946 {   
 947     ssize_t res;
 948     
 949     if (!(sk->sk_route_caps & NETIF_F_SG) || // no scatter-gather I/O support
 950         !(sk->sk_route_caps & NETIF_F_ALL_CSUM)) // no hardware checksum support
 951         return sock_no_sendpage(sk->sk_socket, page, offset, size,
 952                     flags);
 953
 954     lock_sock(sk);
 955     res = do_tcp_sendpages(sk, page, offset, size, flags);
 956     release_sock(sk);
 957     return res;     
 958 }   
  When the NIC supports scatter-gather I/O and hardware checksum offload, tcp_sendpage calls the do_tcp_sendpages function:
 827 static ssize_t do_tcp_sendpages(struct sock *sk, struct page *page, int offset,
 828                 size_t size, int flags)
 829 {
 830     struct tcp_sock *tp = tcp_sk(sk);
 831     int mss_now, size_goal;
 832     int err;
 833     ssize_t copied;
 834     long timeo = sock_sndtimeo(sk, flags & MSG_DONTWAIT);
 835
 836     /* Wait for a connection to finish. One exception is TCP Fast Open
 837      * (passive side) where data is allowed to be sent before a connection
 838      * is fully established.
 839      */
 840     if (((1 << sk->sk_state) & ~(TCPF_ESTABLISHED | TCPF_CLOSE_WAIT)) &&
 841         !tcp_passive_fastopen(sk)) {
 842         if ((err = sk_stream_wait_connect(sk, &timeo)) != 0)
 843             goto out_err;
 844     }
 845
 846     clear_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags);
 847
 848     mss_now = tcp_send_mss(sk, &size_goal, flags);
 849     copied = 0;
 850
 851     err = -EPIPE;
 852     if (sk->sk_err || (sk->sk_shutdown & SEND_SHUTDOWN))
 853         goto out_err;
 854
 855     while (size > 0) {
 856         struct sk_buff *skb = tcp_write_queue_tail(sk);
 857         int copy, i;
 858         bool can_coalesce;
 859
 860         if (!tcp_send_head(sk) || (copy = size_goal - skb->len) <= 0) {
 861 new_segment:
 862             if (!sk_stream_memory_free(sk))
 863                 goto wait_for_sndbuf;
 864
 865             skb = sk_stream_alloc_skb(sk, 0, sk->sk_allocation); // allocate an skb with a zero-length linear area; all data goes into the non-linear area
 866             if (!skb)
 867                 goto wait_for_memory;
 868
 869             skb_entail(sk, skb);
 870             copy = size_goal;
 871         }
 872
 873         if (copy > size)
 874             copy = size;
 875
 876         i = skb_shinfo(skb)->nr_frags;
 877         can_coalesce = skb_can_coalesce(skb, i, page, offset);
 878         if (!can_coalesce && i >= MAX_SKB_FRAGS) {
 879             tcp_mark_push(tp, skb);
 880             goto new_segment;
 881         }
 882         if (!sk_wmem_schedule(sk, copy))
 883             goto wait_for_memory;
 884
 885         if (can_coalesce) { // this page is already at the tail of the frag array and the new data from the pipe continues it
 886             skb_frag_size_add(&skb_shinfo(skb)->frags[i - 1], copy); // just enlarge the existing fragment
 887         } else {
 888             get_page(page);
 889             skb_fill_page_desc(skb, i, page, offset, copy); // put the pipe's page directly into the skb's frag array, no data copy needed
 890         }
 891         skb_shinfo(skb)->tx_flags |= SKBTX_SHARED_FRAG;
 892
 893         skb->len += copy;
 894         skb->data_len += copy;
 895         skb->truesize += copy;
 896         sk->sk_wmem_queued += copy;
 897         sk_mem_charge(sk, copy);
 898         skb->ip_summed = CHECKSUM_PARTIAL;
 899         tp->write_seq += copy;
 900         TCP_SKB_CB(skb)->end_seq += copy;
 901         skb_shinfo(skb)->gso_segs = 0;
 902
 903         if (!copied)
 904             TCP_SKB_CB(skb)->tcp_flags &= ~TCPHDR_PSH;
 905
 906         copied += copy;
 907         offset += copy;
 908         if (!(size -= copy))
 909             goto out;
 910
 911         if (skb->len < size_goal || (flags & MSG_OOB))
 912             continue;
 913
 914         if (forced_push(tp)) {
 915             tcp_mark_push(tp, skb);
 916             __tcp_push_pending_frames(sk, mss_now, TCP_NAGLE_PUSH);
 917         } else if (skb == tcp_send_head(sk))
 918             tcp_push_one(sk, mss_now);
 919         continue;
 920
 921 wait_for_sndbuf:
 922         set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
 923 wait_for_memory:
 924         tcp_push(sk, flags & ~MSG_MORE, mss_now, TCP_NAGLE_PUSH);
 925
 926         if ((err = sk_stream_wait_memory(sk, &timeo)) != 0)
 927             goto do_error;
 928
 929         mss_now = tcp_send_mss(sk, &size_goal, flags);
 930     }
 931
 932 out:
 933     if (copied && !(flags & MSG_SENDPAGE_NOTLAST))
 934         tcp_push(sk, flags, mss_now, tp->nonagle);
 935     return copied;
 936
 937 do_error:
 938     if (copied)
 939         goto out;
 940 out_err:
 941     return sk_stream_error(sk, flags, err);
 942 }
  As we can see, do_tcp_sendpages places the pipe's pages directly into the skb's frag array and then transmits them using the NIC's scatter-gather I/O capability, avoiding any data copy.
  If the NIC lacks scatter-gather I/O or hardware checksum offload, the pages in the skb frag array cannot be sent directly, and the data must be sent with the sock_no_sendpage function:

2116 ssize_t sock_no_sendpage(struct socket *sock, struct page *page, int offset, size_t size, int flags)
2117 {
2118     ssize_t res;
2119     struct msghdr msg = {.msg_flags = flags};
2120     struct kvec iov;
2121     char *kaddr = kmap(page);
2122     iov.iov_base = kaddr + offset;
2123     iov.iov_len = size;
2124     res = kernel_sendmsg(sock, &msg, &iov, 1, size);
2125     kunmap(page);
2126     return res;
2127 }
  This function treats the page as ordinary data and sends it with kernel_sendmsg, which ultimately reaches tcp_sendmsg and copies the page's data into an skb. In other words, when the NIC lacks scatter-gather I/O or hardware checksum offload, the splice system call incurs one data copy.

  At this point the whole TCP splice flow is clear: forwarding one chunk of data with splice takes two system calls and either zero data copies (when the NIC supports scatter-gather I/O and hardware checksum offload) or one data copy (when it does not), saving one to two copies compared with ordinary forwarding. The technique does, however, require two splice system calls per chunk plus a pipe as an intermediate staging area, which limits the efficiency gain. Being able to pass data directly between two TCP sockets, achieving true zero copy, would be truly exciting.

  The splice machinery can also make sending a file more efficient. Normally, the flow for sending a file's data is:

(1) Read the file's data with the read system call. This involves two data copies (a DMA copy from the device into the kernel cache, then a copy from the kernel cache to the user-space buffer) and two context switches.
(2) Send the user-space buffer with the write, send or sendmsg system call. This also involves two data copies (user-space buffer to the kernel socket buffer, then socket buffer to the device by DMA) and two context switches.

Sending the data once thus costs four data copies and four context switches.

With sendfile, the flow is:
(1) The process switches to kernel mode;
(2) the file's data is DMA-copied from the device into the kernel cache;
(3) the data is handed from the kernel cache to the socket buffer, with zero copies;
(4) the data in the socket buffer is DMA-copied to the device;
(5) the process switches back to user mode.

Sending the data once now costs two data copies and two context switches. Because sendfile completes the whole transfer in kernel mode, it greatly reduces overhead and improves performance.

The prototype of the sendfile system call is:
ssize_t sendfile(int out_fd, int in_fd, off_t *offset, size_t count);
  Here out_fd is the descriptor of the file the data is written to, and in_fd is the descriptor of the file the data is read from. in_fd must not be a socket; out_fd may be a regular file or a socket.
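
  For illustration, a typical way to send a whole regular file over a connected socket looks like the sketch below (a minimal example, not from the original text: send_whole_file is a hypothetical helper, and error handling is abbreviated):

#include <sys/sendfile.h>
#include <sys/stat.h>
#include <fcntl.h>
#include <unistd.h>

/* Send the entire regular file at `path` through the connected socket
 * `sockfd`. Returns the number of bytes sent, or -1 on error. */
static ssize_t send_whole_file(int sockfd, const char *path)
{
    struct stat st;
    off_t off = 0;
    ssize_t sent = 0;
    int fd = open(path, O_RDONLY);

    if (fd < 0)
        return -1;
    if (fstat(fd, &st) < 0) {
        close(fd);
        return -1;
    }
    while (off < st.st_size) {
        /* sendfile advances `off` by the number of bytes transferred. */
        ssize_t n = sendfile(sockfd, fd, &off, st.st_size - off);
        if (n <= 0) {
            sent = -1;
            break;
        }
        sent += n;
    }
    close(fd);
    return sent;
}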
  In the kernel, the sendfile system call enters through sys_sendfile, whose real work is done by do_sendfile:
    1061 static ssize_t do_sendfile(int out_fd, int in_fd, loff_t *ppos,
    1062                size_t count, loff_t max)
    1063 {
    1064     struct fd in, out;   
    1065     struct inode *in_inode, *out_inode;
    1066     loff_t pos;
    1067     loff_t out_pos;
    1068     ssize_t retval;
    1069     int fl;
    1070
    1071     /*
    1072      * Get input file, and verify that it is ok..
    1073      */
    1074     retval = -EBADF;
    1075     in = fdget(in_fd);
    1076     if (!in.file)
    1077         goto out;
    1078     if (!(in.file->f_mode & FMODE_READ))
    1079         goto fput_in;
    1080     retval = -ESPIPE;
    1081     if (!ppos) {
    1082         pos = in.file->f_pos;
    1083     } else {
    1084         pos = *ppos;
    1085         if (!(in.file->f_mode & FMODE_PREAD))
    1086             goto fput_in;
    1087     }
    1088     retval = rw_verify_area(READ, in.file, &pos, count); // compute the number of bytes that may be read
    1089     if (retval < 0)
    1090         goto fput_in;
    1091     count = retval;
    1092
    1093     /*
    1094      * Get output file, and verify that it is ok..
    1095      */
    1096     retval = -EBADF;
    1097     out = fdget(out_fd);
    1098     if (!out.file)
    1099         goto fput_in;
    1100     if (!(out.file->f_mode & FMODE_WRITE))
    1101         goto fput_out;
    1102     retval = -EINVAL;
    1103     in_inode = file_inode(in.file);
    1104     out_inode = file_inode(out.file);
    1105     out_pos = out.file->f_pos;
    1106     retval = rw_verify_area(WRITE, out.file, &out_pos, count); // compute the number of bytes that may be written
    1107     if (retval < 0)
    1108         goto fput_out;
    1109     count = retval;
    1110
    1111     if (!max)
    1112         max = min(in_inode->i_sb->s_maxbytes, out_inode->i_sb->s_maxbytes);
    1113
    1114     if (unlikely(pos + count > max)) {
    1115         retval = -EOVERFLOW;
    1116         if (pos >= max)
    1117             goto fput_out;
    1118         count = max - pos;
    1119     }
    1120
    1121     fl = 0;
    1122 #if 0
    1123     /*
    1124      * We need to debate whether we can enable this or not. The
    1125      * man page documents EAGAIN return for the output at least,
    1126      * and the application is arguably buggy if it doesn't expect
    1127      * EAGAIN on a non-blocking file descriptor.
    1128      */
    1129     if (in.file->f_flags & O_NONBLOCK)
    1130         fl = SPLICE_F_NONBLOCK;
    1131 #endif
    1132     retval = do_splice_direct(in.file, &pos, out.file, &out_pos, count, fl); // read data from the source file and write it to the destination
    1133
    1134     if (retval > 0) {
    1135         add_rchar(current, retval);
    1136         add_wchar(current, retval);
    1137         fsnotify_access(in.file);
    1138         fsnotify_modify(out.file);
    1139         out.file->f_pos = out_pos;
    1140         if (ppos)
    1141             *ppos = pos;
    1142         else
    1143             in.file->f_pos = pos;
    1144     }
    1145
    1146     inc_syscr(current);
    1147     inc_syscw(current);
    1148     if (pos > max)
    1149         retval = -EOVERFLOW;
    1150
    1151 fput_out:
    1152     fdput(out);
    1153 fput_in:
    1154     fdput(in);
    1155 out:
    1156     return retval;
    1157 }
    1158 
    1159 SYSCALL_DEFINE4(sendfile, int, out_fd, int, in_fd, off_t __user *, offset, size_t, count)
    1160 {
    1161     loff_t pos;
    1162     off_t off;
    1163     ssize_t ret;
    1164
    1165     if (offset) { // an offset was specified
    1166         if (unlikely(get_user(off, offset)))
    1167             return -EFAULT;
    1168         pos = off;
    1169         ret = do_sendfile(out_fd, in_fd, &pos, count, MAX_NON_LFS); // read from the given offset and send
    1170         if (unlikely(put_user(pos, offset)))
    1171             return -EFAULT;
    1172         return ret;
    1173     }
    1174
    1175     return do_sendfile(out_fd, in_fd, NULL, count, 0); // read from the file's current offset and send
    1176 }
  The core function is do_splice_direct:
    1297 long do_splice_direct(struct file *in, loff_t *ppos, struct file *out,
    1298               loff_t *opos, size_t len, unsigned int flags)
    1299 {       
    1300     struct splice_desc sd = {
    1301         .len        = len,
    1302         .total_len  = len,
    1303         .flags      = flags,
    1304         .pos        = *ppos,
    1305         .u.file     = out,
    1306         .opos       = opos,
    1307     };
    1308     long ret;
    1309     
    1310     ret = splice_direct_to_actor(in, &sd, direct_splice_actor); // pass direct_splice_actor as the actor callback
    1311     if (ret > 0)
    1312         *ppos = sd.pos;
    1313     
    1314     return ret;
    1315 }   
  The splice_direct_to_actor function:
    1163 ssize_t splice_direct_to_actor(struct file *in, struct splice_desc *sd,
    1164                    splice_direct_actor *actor)
    1165 {
    1166     struct pipe_inode_info *pipe;
    1167     long ret, bytes;
    1168     umode_t i_mode;
    1169     size_t len;
    1170     int i, flags;
    ...
    1185     pipe = current->splice_pipe;
    1186     if (unlikely(!pipe)) {
    1187         pipe = alloc_pipe_info(); // create a pipe for moving the data
    ...
    1196         pipe->readers = 1;
    1197
    1198         current->splice_pipe = pipe;
    1199     }
    1200
    1201     /*
    1202      * Do the splice.
    1203      */
    1204     ret = 0;
    1205     bytes = 0;
    1206     len = sd->total_len;
    1207     flags = sd->flags;
    1208
    1209     /*
    1210      * Don't block on output, we have to drain the direct pipe.
    1211      */
    1212     sd->flags &= ~SPLICE_F_NONBLOCK;
    1213
    1214     while (len) {
    1215         size_t read_len;
    1216         loff_t pos = sd->pos, prev_pos = pos;
    1217
    1218         ret = do_splice_to(in, &pos, pipe, len, flags); // read data into the pipe
    1219         if (unlikely(ret <= 0))
    1220             goto out_release;
    1221
    1222         read_len = ret;
    1223         sd->total_len = read_len;
    1224
    1225         /*
    1226          * NOTE: nonblocking mode only applies to the input. We
    1227          * must not do the output in nonblocking mode as then we
    1228          * could get stuck data in the internal pipe:
    1229          */
    1230         ret = actor(pipe, sd); // call direct_splice_actor to send the data
    ...
    1245
    1246 done:
    1247     pipe->nrbufs = pipe->curbuf = 0;
    1248     file_accessed(in);
    1249     return bytes;
    ...
  sendfile reads data with the do_splice_to function; since in_fd cannot be a socket, this path never reaches the socket read methods, so we do not analyze it here. Writing is done by the direct_splice_actor function:
    1272 static int direct_splice_actor(struct pipe_inode_info *pipe,
    1273                    struct splice_desc *sd)
    1274 {
    1275     struct file *file = sd->u.file;
    1276
    1277     return do_splice_from(pipe, file, sd->opos, sd->total_len,
    1278                   sd->flags);
    1279 }
  As we can see, this function uses do_splice_from; from here on the flow is identical to that of the splice system call.
  In summary, sendfile DMA-reads the file's data into the pipe's pages and, on the send side, hangs those pages directly on the destination file's buffers (for a socket-type file, the skb's frag array) before they are DMA-copied to the device, cutting out data copies and improving performance. If the NIC supports scatter-gather I/O and hardware checksum offload, the data in the skb frags can be transmitted directly, avoiding the overhead of reassembling the frags in software; otherwise, exactly as with the splice system call, one extra data copy is incurred.