tcp_sendmsg函数

wwwlyj123321

已于 2022-11-02 10:23:57 修改

阅读量810

点赞数 1

分类专栏：工具类、服务器运维　文章标签：tcp/ip 网络服务器

于 2022-11-01 20:44:37 首次发布

本文链接：https://blog.csdn.net/wwwlyj123321/article/details/127640946

版权

服务器运维同时被 2 个专栏收录

35 篇文章 3 订阅

订阅专栏

工具类

26 篇文章 2 订阅

订阅专栏

tcp_sendmsg()的主要工作是把用户层的数据，填充到skb中，然后加入到sock的发送队列。

之后调用tcp_write_xmit()来把sock发送队列中的skb尽量地发送出去。

内核版本：4.9.130

/*
 * Message header handed to the send path.  tcp_sendmsg() in this file reads
 * msg_flags, msg_controllen and msg_iter from it.
 */
struct msghdr {
	void		*msg_name;	/* ptr to socket address structure */
	int		msg_namelen;	/* size of socket address structure */
	struct iov_iter	msg_iter;	/* data; msg_data_left() reports how much remains */
	void		*msg_control;	/* ancillary data */
	__kernel_size_t	msg_controllen;	/* ancillary data buffer length; non-zero triggers cmsg parsing in tcp_sendmsg() */
	unsigned int	msg_flags;	/* flags on received message; tcp_sendmsg() also reads it as the send flags */
	struct kiocb	*msg_iocb;	/* ptr to iocb for async requests */
};

/*
 * Iterator over the payload of a message; embedded in struct msghdr as
 * msg_iter.  The first union offers four alternative backing pointers and
 * the second union holds per-backing bookkeeping.
 *
 * NOTE(review): the helpers that interpret 'type' and the unions are not
 * shown here, so the field notes below marked "presumably" need confirming.
 */
struct iov_iter {
	int type;		/* presumably selects which union members are active -- confirm */
	size_t iov_offset;	/* presumably an offset into the current segment -- confirm */
	size_t count;		/* value returned by iov_iter_count(); msg_data_left() uses it as the data left */
	union {
		const struct iovec *iov;	/* backing: array of struct iovec */
		const struct kvec *kvec;	/* backing: array of struct kvec */
		const struct bio_vec *bvec;	/* backing: array of struct bio_vec */
		struct pipe_inode_info *pipe;	/* backing: a pipe */
	};
	union {
		unsigned long nr_segs;	/* presumably the segment count for the array backings -- confirm */
		struct {
			int idx;	/* NOTE(review): looks pipe-specific (buffer index) -- confirm */
			int start_idx;	/* NOTE(review): looks pipe-specific (starting index) -- confirm */
		};
	};
};

/*
 * msg_data_left - amount of payload still held by a message.
 * @msg: message header whose embedded iterator is queried.
 *
 * Returns whatever iov_iter_count() reports for msg->msg_iter.
 */
static inline size_t msg_data_left(struct msghdr *msg)
{
	const struct iov_iter *iter = &msg->msg_iter;

	return iov_iter_count(iter);
}
/*
 * iov_iter_count - read the iterator's 'count' field.
 * @i: iterator to query (not modified).
 *
 * Returns i->count unchanged.
 */
static inline size_t iov_iter_count(const struct iov_iter *i)
{
	const size_t remaining = i->count;

	return remaining;
}


/*
 * sk_stream_memory_free - may more data be queued on this stream socket?
 * @sk: socket to test (not modified).
 *
 * Returns false once the queued write memory has reached the send-buffer
 * limit.  Below the limit, the answer is delegated to the protocol's
 * optional stream_memory_free() hook; with no hook installed it is true.
 */
static inline bool sk_stream_memory_free(const struct sock *sk)
{
	/* The send queue already holds at least sk_sndbuf bytes. */
	if (sk->sk_wmem_queued >= sk->sk_sndbuf)
		return false;

	/* No protocol-specific check: being under the limit is enough. */
	if (!sk->sk_prot->stream_memory_free)
		return true;

	return sk->sk_prot->stream_memory_free(sk);
}

/*
 * tcp_sendmsg - copy a message into skbs on the socket's write queue and
 *               push them towards transmission.
 * @sk:   socket to send on; held with lock_sock() for the whole call and
 *        released on every return path.
 * @msg:  message header; msg_flags, msg_controllen and msg_iter are read.
 * @size: passed through to tcp_sendmsg_fastopen() and tcp_send_rcvq(); the
 *        copy loop itself is driven by msg_data_left(msg), not by @size.
 *
 * Returns copied + copied_syn on the normal path and whenever an error is
 * hit after some bytes were already accepted.  Error paths that jump
 * straight to out_err, or that reach do_error with nothing copied, return
 * the value produced by sk_stream_error().
 */
int tcp_sendmsg(struct sock *sk, struct msghdr *msg, size_t size)
{
	struct tcp_sock *tp = tcp_sk(sk);
	struct sk_buff *skb;
	struct sockcm_cookie sockc;
	int flags, err, copied = 0;
	int mss_now = 0, size_goal, copied_syn = 0;
	bool process_backlog = false;
	bool sg;
	long timeo;

	lock_sock(sk);

	flags = msg->msg_flags;

	/* Send data in TCP SYN.
	 * With TCP Fast Open, data is carried on the SYN itself.
	 */
	if ((flags & MSG_FASTOPEN) && !tp->repair) {
		err = tcp_sendmsg_fastopen(sk, msg, &copied_syn, size);
		if (err == -EINPROGRESS && copied_syn > 0)
			goto out;
		else if (err)
			goto out_err;
	}

	/* Send timeout; presumably 0 when MSG_DONTWAIT (non-blocking) is set. */
	timeo = sock_sndtimeo(sk, flags & MSG_DONTWAIT);

	/* See https://blog.csdn.net/sinat_20184565/article/details/106109415 */
	tcp_rate_check_app_limited(sk);  /* is sending application-limited? */

	/* Wait for a connection to finish. One exception is TCP Fast Open
	 * (passive side) where data is allowed to be sent before a connection
	 * is fully established.
	 */
	/* Any state other than ESTABLISHED or CLOSE_WAIT must first wait for
	 * the connection, unless this is the passive side of a Fast Open.
	 */
	if (((1 << sk->sk_state) & ~(TCPF_ESTABLISHED | TCPF_CLOSE_WAIT)) &&
	    !tcp_passive_fastopen(sk)) {
		err = sk_stream_wait_connect(sk, &timeo);
		if (err != 0)
			goto do_error;
	}

	/* Socket is in repair mode (TCP_REPAIR option). */
	if (unlikely(tp->repair)) {
		if (tp->repair_queue == TCP_RECV_QUEUE) {
			/* Inject the data into the receive queue instead. */
			copied = tcp_send_rcvq(sk, msg, size);
			goto out_nopush;
		}

		err = -EINVAL;
		if (tp->repair_queue == TCP_NO_QUEUE)
			goto out_err;

		/* 'common' sending to sendq */
	}

	sockc.tsflags = sk->sk_tsflags;
	if (msg->msg_controllen) {
		err = sock_cmsg_send(sk, msg, &sockc);
		if (unlikely(err)) {
			err = -EINVAL;
			goto out_err;
		}
	}


	/* This should be in poll.
	 * Clear the "no space" flag used for asynchronous notification
	 * (set when the send queue was full).
	 */
	sk_clear_bit(SOCKWQ_ASYNC_NOSPACE, sk);

	/* Ok commence sending. */
	copied = 0;

restart:

	/* Get the currently effective MSS.
	 * Background notes from the original annotations:
	 * mtu: max transmission unit.
	 * mss: max segment size. (mtu - (ip header size) - (tcp header size)).
	 * GSO: Generic Segmentation Offload.
	 * size_goal is the largest data length a segment should have when it
	 * reaches the network device; it is used to split the data, and no
	 * skb built here should exceed it.  Without GSO, size_goal equals the
	 * MSS; with GSO it is a multiple of the MSS and the final split by
	 * MSS happens later, at the device.
	 */
	mss_now = tcp_send_mss(sk, &size_goal, flags);

	err = -EPIPE;
	if (sk->sk_err || (sk->sk_shutdown & SEND_SHUTDOWN))
		goto do_error;

	sg = !!(sk->sk_route_caps & NETIF_F_SG);

	/* Copy the message data into skbs, ready to be sent. */
	while (msg_data_left(msg)) {
		int copy = 0;
		int max = size_goal;

		/* Take the last skb on the write queue; new data is appended to it when possible. */
		skb = tcp_write_queue_tail(sk);
		/* Only consider appending when there is still unsent data queued. */
		if (tcp_send_head(sk)) { /* non-NULL send head: presumably some queued skb is still unsent */
			/* No checksum offload on this skb (CHECKSUM_NONE): cap it
			 * at mss_now instead of size_goal, i.e. no GSO-sized skb.
			 */
			if (skb->ip_summed == CHECKSUM_NONE)
				max = mss_now;

			/* max - skb->len tells whether the skb is full: a positive
			 * result means there is room to append more data to it.
			 */
			copy = max - skb->len; /* bytes that can still be appended to this skb */
		}

		/* The tail skb has no room left, or it may not be extended
		 * (e.g. it is marked end-of-record): start a new skb.
		 */
		if (copy <= 0 || !tcp_skb_can_collapse_to(skb)) {
			bool first_skb;

new_segment:
			/* Allocate new segment. If the interface is SG,
			 * allocate skb fitting to single page.
			 */
			/* If the queued write memory has reached the send-buffer
			 * limit, or the protocol's stream_memory_free hook refuses
			 * (see sk_stream_memory_free()), wait for buffer space.
			 */
			if (!sk_stream_memory_free(sk))
				goto wait_for_sndbuf;

			if (process_backlog && sk_flush_backlog(sk)) {
				process_backlog = false;
				goto restart;
			}
			first_skb = skb_queue_empty(&sk->sk_write_queue);
			/* Allocate an skb sized by select_size().  If the
			 * allocation fails, go wait for memory.
			 * NOTE(review): the original notes say the linear area is
			 * the select_size() payload plus the maximum header length,
			 * and that a system-level accounting refusal also counts as
			 * failure -- confirm in sk_stream_alloc_skb().
			 */
			skb = sk_stream_alloc_skb(sk,
						  select_size(sk, sg, first_skb),
						  sk->sk_allocation,
						  first_skb);
			if (!skb)
				goto wait_for_memory;

			process_backlog = true;
			/*
			 * Check whether we can use HW checksum.
			 */
			if (sk_check_csum_caps(sk))
				skb->ip_summed = CHECKSUM_PARTIAL;

			/* Append the new skb to the tail of the write queue. */
			skb_entail(sk, skb);
			/* A fresh skb may take up to size_goal bytes. */
			copy = size_goal;
			max = size_goal;

			/* All packets are restored as if they have
			 * already been sent. skb_mstamp isn't set to
			 * avoid wrong rtt estimation.
			 */
			if (tp->repair)
				TCP_SKB_CB(skb)->sacked |= TCPCB_REPAIRED;
		}

		/* Try to append data to the end of skb. */
		if (copy > msg_data_left(msg))
			copy = msg_data_left(msg); /* never copy more than the message still holds */

		/* Where to copy to? */
		if (skb_availroom(skb) > 0) /* room reported by skb_availroom(): copy into the skb head first */
        {
			/* We have some space in skb head. Superb! */
			copy = min_t(int, copy, skb_availroom(skb));
			/* Copy the data into the contiguous (linear) area. */
			err = skb_add_data_nocache(sk, skb, &msg->msg_iter, copy);
			if (err)
				goto do_fault;
		} else {
			/* No room left in the skb head: copy into the page
			 * fragments referenced from the skb's shared info, which
			 * need not be contiguous.
			 */
			bool merge = true;
			int i = skb_shinfo(skb)->nr_frags; /* current number of fragments */
			struct page_frag *pfrag = sk_page_frag(sk); /* page fragment cached from earlier use */

			/* Make sure the page fragment has usable space, refilling
			 * it if necessary.  On failure go wait for memory.
			 * NOTE(review): the original notes add that this sets the
			 * TCP memory-pressure flag and shrinks the send-buffer
			 * limit -- not visible here, confirm.
			 */
			if (!sk_page_frag_refill(sk, pfrag))
				goto wait_for_memory;

			/* Can the data be appended to the last fragment? */
			if (!skb_can_coalesce(skb, i, pfrag->page,
					      pfrag->offset)) {

				/* It cannot.  If the fragment count has reached its
				 * limit, or scatter-gather is unavailable, mark this
				 * skb for push so it goes out promptly and jump to
				 * new_segment to continue in a fresh skb.
				 */
				if (i >= sysctl_max_skb_frags || !sg) {
					tcp_mark_push(tp, skb);
					goto new_segment;
				}
				merge = false;
			}

			copy = min_t(int, copy, pfrag->size - pfrag->offset);

			/* Check that charging 'copy' bytes of write memory is allowed. */
			if (!sk_wmem_schedule(sk, copy))
				goto wait_for_memory;

			/* Copy the message data into the page.
			 * NOTE(review): per the original notes this also computes
			 * the checksum and updates skb length, the socket's queued
			 * size and its forward allocation -- confirm in the helper.
			 */
			err = skb_copy_to_page_nocache(sk, &msg->msg_iter, skb,
						       pfrag->page,
						       pfrag->offset,
						       copy);
			if (err)
				goto do_error;

			/* Update the skb. */
			if (merge) { /* data was appended to the last fragment: grow its size */
				skb_frag_size_add(&skb_shinfo(skb)->frags[i - 1], copy);
			} else {
				/* Describe the newly added fragment and take a page reference. */
				skb_fill_page_desc(skb, i, pfrag->page,
						   pfrag->offset, copy);
				get_page(pfrag->page);
			}
			pfrag->offset += copy;
		}

		/* First copy of this call: clear the PSH flag. */
		if (!copied)
			TCP_SKB_CB(skb)->tcp_flags &= ~TCPHDR_PSH;

		tp->write_seq += copy;             /* advance the last sequence number of the send queue */
		TCP_SKB_CB(skb)->end_seq += copy;  /* advance the skb's end sequence number */
		/* Reset the skb's packet count (GSO segment count) to 0. */
		tcp_skb_pcount_set(skb, 0);


		copied += copy;  /* total bytes copied into the send queue so far */
		if (!msg_data_left(msg)) {
			if (unlikely(flags & MSG_EOR))
				TCP_SKB_CB(skb)->eor = 1;  /* #define MSG_EOR 0x80 -- End of record */
			/* All message data has been copied: go send. */
			goto out;
		}


		/* If the skb can still take more data, or this is out-of-band
		 * data, or the socket is in repair mode, keep copying and do
		 * not send yet.
		 */
		if (skb->len < max || (flags & MSG_OOB) || unlikely(tp->repair))
			continue;

		/* Too much has accumulated (forced_push() says so): send now. */
		if (forced_push(tp)) {
			tcp_mark_push(tp, skb);
			/* Push out as many queued skbs as possible, with Nagle disabled. */
			__tcp_push_pending_frames(sk, mss_now, TCP_NAGLE_PUSH);
		} else if (skb == tcp_send_head(sk))
			tcp_push_one(sk, mss_now); /* this skb is the send head: push just this one segment */
		continue;

wait_for_sndbuf:
		/* The send buffer limit was hit: set SOCK_NOSPACE on the socket. */
		set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
wait_for_memory:
		/* Before sleeping, push out whatever has already been copied. */
		if (copied)
			tcp_push(sk, flags & ~MSG_MORE, mss_now,
				 TCP_NAGLE_PUSH, size_goal);

		/* Sleep until memory becomes available or the timeout expires. */
		/* NOTE(review): per the original notes there are two cases:
		 * 1. The socket's send buffer is full: wait for a writable
		 *    event on the socket, or the timeout.
		 * 2. TCP-wide memory is short: wait a random 2~202ms.
		 * Neither is visible here -- confirm in sk_stream_wait_memory().
		 */
		err = sk_stream_wait_memory(sk, &timeo);
		if (err != 0)
			goto do_error;
		/* MSS and size_goal may have changed while sleeping: recompute. */
		mss_now = tcp_send_mss(sk, &size_goal, flags);
	} /* while (msg_data_left(msg)) */

out:
	/* Reached on normal completion, and from do_error when some data had
	 * already been copied; anything copied by the loop is pushed out here.
	 */
	if (copied) {
		tcp_tx_timestamp(sk, sockc.tsflags, tcp_write_queue_tail(sk));
		/* Whether data goes out immediately depends on the Nagle setting (tp->nonagle). */
		tcp_push(sk, flags, mss_now, tp->nonagle, size_goal);
	}
out_nopush:
	release_sock(sk);
	return copied + copied_syn;

do_fault:
	if (!skb->len) /* the skb carries no payload */
    {
		tcp_unlink_write_queue(skb, sk); /* remove the skb from the write queue */
		/* It is the one place in all of TCP, except connection
		 * reset, where we can be unlinking the send_head.
		 */
		tcp_check_send_head(sk, skb); /* fix up the send head if it pointed at this skb */
		sk_wmem_free_skb(sk, skb);   /* free the skb (and presumably uncharge its write memory) */
	}

do_error:
	/* Partial success wins over the error: report the bytes accepted. */
	if (copied + copied_syn)
		goto out;
out_err:
	err = sk_stream_error(sk, flags, err);
	/* make sure we wake any epoll edge trigger waiter */
	if (unlikely(skb_queue_len(&sk->sk_write_queue) == 0 && err == -EAGAIN))
		sk->sk_write_space(sk);
	release_sock(sk);
	return err;
}
EXPORT_SYMBOL(tcp_sendmsg);

ref：

TCP的发送系列 — tcp_sendmsg()的实现（一）_zhangskd的博客-CSDN博客

[内核源码] 网络协议栈 - write (tcp) 发送数据

TCP层sendmsg系统调用的实现分析 - AlexAlex - 博客园