当数据到达TCP接收缓存后,TCP会唤醒应用进程。应用进程在被内核唤醒后,就可以使用read、readv、recv、recvfrom、recvmsg系统调用读取TCP数据。现以read和recv系统调用为例分析应用进程从TCP收取数据的方法。
read系统调用原型:
ssize_t read(int fd, void *buf, size_t count);
fd:socket文件描述符;buf:用于存放数据的缓存的首地址;count:buf的长度(字节)
成功时返回从TCP中读取到的字节数,如果返回0则意味着TCP收到了FIN,应用进程可以选择关闭连接。
read系统调用对应的内核函数为:
472 SYSCALL_DEFINE3(read, unsigned int, fd, char __user *, buf, size_t, count)
473 {
474 struct fd f = fdget(fd);
475 ssize_t ret = -EBADF;
476
477 if (f.file) {
478 loff_t pos = file_pos_read(f.file);
479 ret = vfs_read(f.file, buf, count, &pos);
480 file_pos_write(f.file, pos);
481 fdput(f);
482 }
483 return ret;
484 }
vfs_read:
353 ssize_t vfs_read(struct file *file, char __user *buf, size_t count, loff_t *pos)
354 {
355 ssize_t ret;
356
357 if (!(file->f_mode & FMODE_READ))
358 return -EBADF;
359 if (!file->f_op || (!file->f_op->read && !file->f_op->aio_read))
360 return -EINVAL;
361 if (unlikely(!access_ok(VERIFY_WRITE, buf, count)))
362 return -EFAULT;
363
364 ret = rw_verify_area(READ, file, pos, count);
365 if (ret >= 0) {
366 count = ret;
367 if (file->f_op->read) //对于socket类型的文件系统这个判断为假
368 ret = file->f_op->read(file, buf, count, pos);
369 else
370 ret = do_sync_read(file, buf, count, pos);
371 if (ret > 0) {
372 fsnotify_access(file);
373 add_rchar(current, ret);
374 }
375 inc_syscr(current);
376 }
377
378 return ret;
379 }
socket类型的文件系统会使用do_sync_read函数:
333 ssize_t do_sync_read(struct file *filp, char __user *buf, size_t len, loff_t *ppos)
334 {
335 struct iovec iov = { .iov_base = buf, .iov_len = len };
336 struct kiocb kiocb;
337 ssize_t ret;
338
339 init_sync_kiocb(&kiocb, filp);
340 kiocb.ki_pos = *ppos;
341 kiocb.ki_left = len;
342 kiocb.ki_nbytes = len;
343
344 ret = filp->f_op->aio_read(&kiocb, &iov, 1, kiocb.ki_pos); //指向sock_aio_read
345 if (-EIOCBQUEUED == ret)
346 ret = wait_on_sync_kiocb(&kiocb);
347 *ppos = kiocb.ki_pos;
348 return ret;
349 }
sock_aio_read函数:
898 static ssize_t do_sock_read(struct msghdr *msg, struct kiocb *iocb,
899 struct file *file, const struct iovec *iov,
900 unsigned long nr_segs)
901 {
902 struct socket *sock = file->private_data;
903 size_t size = 0;
904 int i;
905
906 for (i = 0; i < nr_segs; i++)
907 size += iov[i].iov_len;
908
909 msg->msg_name = NULL;
910 msg->msg_namelen = 0;
911 msg->msg_control = NULL;
912 msg->msg_controllen = 0;
913 msg->msg_iov = (struct iovec *)iov;
914 msg->msg_iovlen = nr_segs;
915 msg->msg_flags = (file->f_flags & O_NONBLOCK) ? MSG_DONTWAIT : 0;
916
917 return __sock_recvmsg(iocb, sock, msg, size, msg->msg_flags);
918 }
919
920 static ssize_t sock_aio_read(struct kiocb *iocb, const struct iovec *iov,
921 unsigned long nr_segs, loff_t pos)
922 {
923 struct sock_iocb siocb, *x;
924
925 if (pos != 0)
926 return -ESPIPE;
927
928 if (iocb->ki_left == 0) /* Match SYS5 behaviour */
929 return 0;
930
931
932 x = alloc_sock_iocb(iocb, &siocb);
933 if (!x)
934 return -ENOMEM;
935 return do_sock_read(&x->async_msg, iocb, iocb->ki_filp, iov, nr_segs);
936 }
可见read系统调用在内核中最终会调用__sock_recvmsg函数。再来看recv系统调用:
ssize_t recv(int sockfd, void *buf, size_t len, int flags);
sockfd:socket文件描述符;buf:用于存放数据的缓存的首地址;len:buf的长度(字节);flags:用于设置功能。
成功时返回从TCP中读取到的字节数,如果返回0则意味着TCP收到了FIN,应用进程可以选择关闭连接。
recv函数对应的内核函数为sys_recv,它直接调用sys_recvfrom(即下面由SYSCALL_DEFINE6(recvfrom, ...)定义的函数),只是不关心对端地址:
1812 SYSCALL_DEFINE6(recvfrom, int, fd, void __user *, ubuf, size_t, size,
1813 unsigned int, flags, struct sockaddr __user *, addr,
1814 int __user *, addr_len)
1815 {
1816 struct socket *sock;
1817 struct iovec iov;
1818 struct msghdr msg;
1819 struct sockaddr_storage address;
1820 int err, err2;
1821 int fput_needed;
1822
1823 if (size > INT_MAX)
1824 size = INT_MAX;
1825 sock = sockfd_lookup_light(fd, &err, &fput_needed);
1826 if (!sock)
1827 goto out;
1828
1829 msg.msg_control = NULL;
1830 msg.msg_controllen = 0;
1831 msg.msg_iovlen = 1;
1832 msg.msg_iov = &iov;
1833 iov.iov_len = size;
1834 iov.iov_base = ubuf;
1835 msg.msg_name = (struct sockaddr *)&address;
1836 msg.msg_namelen = sizeof(address);
1837 if (sock->file->f_flags & O_NONBLOCK)
1838 flags |= MSG_DONTWAIT;
1839 err = sock_recvmsg(sock, &msg, size, flags);
1840
1841 if (err >= 0 && addr != NULL) {
1842 err2 = move_addr_to_user(&address, //将数据发送端的地址信息返回给应用进程
1843 msg.msg_namelen, addr, addr_len);
1844 if (err2 < 0)
1845 err = err2;
1846 }
1847
1848 fput_light(sock->file, fput_needed);
1849 out:
1850 return err;
1851 }
1852
1853 /*
1854 * Receive a datagram from a socket.
1855 */
1856
1857 asmlinkage long sys_recv(int fd, void __user *ubuf, size_t size,
1858 unsigned int flags)
1859 {
1860 return sys_recvfrom(fd, ubuf, size, flags, NULL, NULL);
1861 }
sock_recvmsg是收包的核心函数:
787 int sock_recvmsg(struct socket *sock, struct msghdr *msg,
788 size_t size, int flags)
789 {
790 struct kiocb iocb;
791 struct sock_iocb siocb;
792 int ret;
793
794 init_sync_kiocb(&iocb, NULL);
795 iocb.private = &siocb;
796 ret = __sock_recvmsg(&iocb, sock, msg, size, flags);
797 if (-EIOCBQUEUED == ret)
798 ret = wait_on_sync_kiocb(&iocb);
799 return ret;
800 }
可见recv系统调用也会调用__sock_recvmsg函数来完成收数据的功能。
__sock_recvmsg函数:
765 static inline int __sock_recvmsg_nosec(struct kiocb *iocb, struct socket *sock,
766 struct msghdr *msg, size_t size, int flags)
767 {
768 struct sock_iocb *si = kiocb_to_siocb(iocb);
769
770 si->sock = sock;
771 si->scm = NULL;
772 si->msg = msg;
773 si->size = size;
774 si->flags = flags;
775
776 return sock->ops->recvmsg(iocb, sock, msg, size, flags);//指向inet_recvmsg函数
777 }
778
779 static inline int __sock_recvmsg(struct kiocb *iocb, struct socket *sock,
780 struct msghdr *msg, size_t size, int flags)
781 {
782 int err = security_socket_recvmsg(sock, msg, size, flags);
783
784 return err ?: __sock_recvmsg_nosec(iocb, sock, msg, size, flags);
785 }
inet_recvmsg函数:
794 int inet_recvmsg(struct kiocb *iocb, struct socket *sock, struct msghdr *msg,
795 size_t size, int flags)
796 {
797 struct sock *sk = sock->sk;
798 int addr_len = 0;
799 int err;
800
801 sock_rps_record_flow(sk);
802
803 err = sk->sk_prot->recvmsg(iocb, sk, msg, size, flags & MSG_DONTWAIT,
804 flags & ~MSG_DONTWAIT, &addr_len);//指向tcp_recvmsg函数
805 if (err >= 0)
806 msg->msg_namelen = addr_len;
807 return err;
808 }
tcp_recvmsg函数:
1545 int tcp_recvmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
1546 size_t len, int nonblock, int flags, int *addr_len)
1547 {
1548 struct tcp_sock *tp = tcp_sk(sk);
1549 int copied = 0;
1550 u32 peek_seq;
1551 u32 *seq;
1552 unsigned long used;
1553 int err;
1554 int target; /* Read at least this many bytes */
1555 long timeo;
1556 struct task_struct *user_recv = NULL;
1557 bool copied_early = false;
1558 struct sk_buff *skb;
1559 u32 urg_hole = 0;
1560
1561 lock_sock(sk);
1562
1563 err = -ENOTCONN;
1564 if (sk->sk_state == TCP_LISTEN)//listening socket不允许读
1565 goto out;
1566
1567 timeo = sock_rcvtimeo(sk, nonblock); //获取超时参数
1568
1569 /* Urgent data needs to be handled specially. */
1570 if (flags & MSG_OOB)//以带外方式读取紧急数据
1571 goto recv_urg;
...
1588 seq = &tp->copied_seq;
1589 if (flags & MSG_PEEK) { //用MSG_PEEK收取数据的话数据不会从TCP缓存中删除,下次还能读取到
1590 peek_seq = tp->copied_seq;
1591 seq = &peek_seq; //改变临时变量,而非tp->copied_seq,使得下次读数据时tp->copied_seq不变
1592 }
1593
1594 target = sock_rcvlowat(sk, flags & MSG_WAITALL, len); //得到本次至少要copy到应用缓存的字节数
...
1618 do {
1619 u32 offset;
1620
1621 /* Are we at urgent data? Stop if we have read anything or have SIGURG pending. */
1622 if (tp->urg_data && tp->urg_seq == *seq) {//有紧急指针且下一个要copy的字节就是紧急数据
1623 if (copied)//已经copy了至少1字节数据
1624 break; //已经读了一些数据,马上就要读到紧急数据了,立马停住,以免将紧急数据混在普通数据中
1625 if (signal_pending(current)) { //有信号等待处理,可能是在紧急数据到达慢速处理路径时TCP发送给进程的SIGURG信号
1626 copied = timeo ? sock_intr_errno(timeo) : -EAGAIN; //设置返回值
1627 break; //跳出循环,系统调用返回去处理SIGURG信号
1628 }
1629 }
1630
1631 /* Next get a buffer. */
1632
1633 skb_queue_walk(&sk->sk_receive_queue, skb) {//遍历接收队列
1634 /* Now that we have two receive queues this
1635 * shouldn't happen.
1636 */
1637 if (WARN(before(*seq, TCP_SKB_CB(skb)->seq), //出现数据空洞,BUG!
1638 "recvmsg bug: copied %X seq %X rcvnxt %X fl %X\n",
1639 *seq, TCP_SKB_CB(skb)->seq, tp->rcv_nxt,
1640 flags))
1641 break;
1642
1643 offset = *seq - TCP_SKB_CB(skb)->seq; //offset是skb中已经被读过的长度
1644 if (tcp_hdr(skb)->syn)
1645 offset--;
1646 if (offset < skb->len) //包中有未读的新数据
1647 goto found_ok_skb; //处理skb中的数据
1648 if (tcp_hdr(skb)->fin)
1649 goto found_fin_ok; //处理FIN标记
1650 WARN(!(flags & MSG_PEEK), //skb中的数据全部被读过,但还没有被删除,一定是开启了PEEK模式
1651 "recvmsg bug 2: copied %X seq %X rcvnxt %X fl %X\n",
1652 *seq, TCP_SKB_CB(skb)->seq, tp->rcv_nxt, flags);
1653 }
1654
1655 /* Well, if we have backlog, try to process it now yet. */
1656
1657 if (copied >= target && !sk->sk_backlog.tail) //完成读取任务且没有skb在backlog队列中
1658 break; //功成身退
1659
1660 if (copied) {
1661 if (sk->sk_err ||
1662 sk->sk_state == TCP_CLOSE ||
1663 (sk->sk_shutdown & RCV_SHUTDOWN) ||
1664 !timeo ||
1665 signal_pending(current))
1666 break;
1667 } else { //没有copy任何数据
1668 if (sock_flag(sk, SOCK_DONE))
1669 break;
1670
1671 if (sk->sk_err) {
1672 copied = sock_error(sk);
1673 break;
1674 }
1675
1676 if (sk->sk_shutdown & RCV_SHUTDOWN)
1677 break;
1678
1679 if (sk->sk_state == TCP_CLOSE) {
1680 if (!sock_flag(sk, SOCK_DONE)) {
1681 /* This occurs when user tries to read
1682 * from never connected socket.
1683 */
1684 copied = -ENOTCONN;
1685 break;
1686 }
1687 break;
1688 }
1689
1690 if (!timeo) {
1691 copied = -EAGAIN;
1692 break;
1693 }
1694
1695 if (signal_pending(current)) {
1696 copied = sock_intr_errno(timeo);
1697 break;
1698 }
1699 }
1700
1701 tcp_cleanup_rbuf(sk, copied); //被读取到应用进程的数据无需保留,需要从接收缓存中清除,并相机发送ACK
...
1758 if (copied >= target) {//copy数据的字节数达到目标
1759 /* Do not sleep, just process backlog. */
1760 release_sock(sk);
1761 lock_sock(sk);
1762 } else//copy数据的字节数未达到目标
1763 sk_wait_data(sk, &timeo);//睡眠,等待新数据到来
...
1793 if ((flags & MSG_PEEK) && //PEEK模式
1794 (peek_seq - copied - urg_hole != tp->copied_seq)) {
1795 net_dbg_ratelimited("TCP(%s:%d): Application bug, race in MSG_PEEK\n",
1796 current->comm,
1797 task_pid_nr(current));
1798 peek_seq = tp->copied_seq; //出现BUG,校正peek_seq
1799 }
1800 continue;
1801
1802 found_ok_skb:
1803 /* Ok so how much can we use? */
1804 used = skb->len - offset;//计算skb中新数据的长度
1805 if (len < used)//用户缓存空间不够
1806 used = len;
1807
1808 /* Do we have urgent data here? */
1809 if (tp->urg_data) {
1810 u32 urg_offset = tp->urg_seq - *seq;
1811 if (urg_offset < used) {//紧急数据在能copy的数据中
1812 if (!urg_offset) {//下一个要copy的字节就是紧急数据
1813 if (!sock_flag(sk, SOCK_URGINLINE)) {//没有设置以inline方式读取紧急数据
1814 ++*seq;//跳过紧急数据
1815 urg_hole++;
1816 offset++;
1817 used--;
1818 if (!used)
1819 goto skip_copy;
1820 }
1821 } else
1822 used = urg_offset;//copy到紧急数据为止
1823 }
1824 }
1825
1826 if (!(flags & MSG_TRUNC)) { //没有设置MSG_TRUNC标记
...
1855 {
1856 err = skb_copy_datagram_iovec(skb, offset,
1857 msg->msg_iov, used);//copy数据到用户缓存
1858 if (err) {
1859 /* Exception. Bailout! */
1860 if (!copied)
1861 copied = -EFAULT;
1862 break;
1863 }
1864 }
1865 }
1866
1867 *seq += used;
1868 copied += used;
1869 len -= used;
1870
1871 tcp_rcv_space_adjust(sk);//调整接收缓存空间
1872
1873 skip_copy:
1874 if (tp->urg_data && after(tp->copied_seq, tp->urg_seq)) {//紧急指针已经处理完毕
1875 tp->urg_data = 0;
1876 tcp_fast_path_check(sk); //重启快速处理路径
1877 }
1878 if (used + offset < skb->len)//skb中还有数据没有copy完毕
1879 continue;
1880
1881 if (tcp_hdr(skb)->fin)
1882 goto found_fin_ok;
1883 if (!(flags & MSG_PEEK)) {//不以PEEK方式读包则需要释放数据已经全部copy完毕的skb
1884 sk_eat_skb(sk, skb, copied_early); //释放skb
1885 copied_early = false;
1886 }
1887 continue;//copy下一个skb中的数据
1888
1889 found_fin_ok:
1890 /* Process the FIN. */
1891 ++*seq;
1892 if (!(flags & MSG_PEEK)) {//PEEK方式读包不释放skb
1893 sk_eat_skb(sk, skb, copied_early);
1894 copied_early = false;
1895 }
1896 break;
1897 } while (len > 0);
...
1932 /* Clean up data we have read: This will do ACK frames. */
1933 tcp_cleanup_rbuf(sk, copied); //被读取到应用进程的数据无需保留,需要从接收缓存中清除,并相机发送ACK
1934
1935 release_sock(sk); //释放socket,允许其它进程或内核软中断访问socket
1936 return copied; //返回已copy的字节数
1937
1938 out:
1939 release_sock(sk);
1940 return err;
1941
1942 recv_urg:
1943 err = tcp_recv_urg(sk, msg, len, flags);//以带外方式读取紧急数据
1944 goto out;
...
1949 }
1594:如果flags设置了MSG_WAITALL则target = len,否则为sk->sk_rcvlowat和len中较小的一个,但至少为1;而sk->sk_rcvlowat可以通过SO_RCVLOWAT socket选项来设置。
1657:tcp_recvmsg函数在访问tcp_sock之前会调用lock_sock将其锁定,在进程调用release_sock释放锁之前内核收包软中断无法访问sock,只能将收到的包放入backlog队列中,在release_sock函数中会处理这些积压的包。因此即使copied已经达到target,只要backlog队列非空,也不能立即跳出循环,而是要继续执行到1760-1761行,通过release_sock/lock_sock把积压的包处理掉。
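1826:MSG_TRUNC标记位表明数据的结尾被截短,因为接收缓冲区太小不足以接收全部的数据。但从代码上来看,如果设置了这个标记位则应用进程将无法从TCP收取数据:此时会跳过skb_copy_datagram_iovec,数据不会被copy到用户缓存,但1867-1869行仍然会推进*seq、累加copied并减少len,也就是说这些数据会被计入返回值并被消费掉,效果相当于直接丢弃。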
在应用进程通过收包系统调用读取完毕数据后,TCP对数据的可靠交付才算真正完成。以上讨论的是普通的收包方式,下面几节探讨一下特殊的收包方式(与这些方式有关的代码在tcp_recvmsg中已经隐去了)。