Linux网络发送消息

发送消息流程

发送消息框架

  • APP 调用send函数会触发系统调用,系统调用将数据拷贝到内核空间进入协议栈

  • 经过协议栈处理后,进入到RingBuffer

  • 然后经过驱动将数据真正的发出去

  • 发送完成后,触发中断通知CPU

  • CPU接收到中断请求后,清理RingBuffer区

源码

int main()
{
    ......
    // 调用send 发送数据
    send(fd,buf,sizeof(bnf),0);
    return 0;
}
SYSCALL_DEFINE6(sendto, int, fd, void __user *, buff, size_t, len,
		unsigned int, flags, struct sockaddr __user *, addr,
		int, addr_len)
{
    struct socket *sock;
	struct sockaddr_storage address;
    struct msghdr msg;
    .......
    // 根据fd查找对应的socket
    sock = sockfd_lookup_light(fd, &err, &fput_needed);
    .....
    // 构造消息头
    msg.msg_name = (struct sockaddr *)&address;
    msg.msg_namelen = addr_len;
    msg.msg_flags = flags;
    
    ....
    //发送数据
    sock_sendmsg(sock, &msg);
}
int sock_sendmsg(struct socket *sock, struct msghdr *msg)
{
	int err = security_socket_sendmsg(sock, msg,
					  msg_data_left(msg));

	return err ?: sock_sendmsg_nosec(sock, msg);
}
static inline int sock_sendmsg_nosec(struct socket *sock, struct msghdr *msg)
{
	int ret = sock->ops->sendmsg(sock, msg, msg_data_left(msg));
	BUG_ON(ret == -EIOCBQUEUED);
	return ret;
}

接着调用协议栈相关函数进行处理

// net/ipv4/af_inet.c
int inet_sendmsg(struct socket *sock, struct msghdr *msg, size_t size)
{
	struct sock *sk = sock->sk;

	sock_rps_record_flow(sk);

	/* We may need to bind the socket. */
	if (!inet_sk(sk)->inet_num && !sk->sk_prot->no_autobind &&
	    inet_autobind(sk))
		return -EAGAIN;

	return sk->sk_prot->sendmsg(sk, msg, size);
}

进入传输层处理

static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it,
			    gfp_t gfp_mask)
{
	return __tcp_transmit_skb(sk, skb, clone_it, gfp_mask,
				  tcp_sk(sk)->rcv_nxt);
}
static int __tcp_transmit_skb(struct sock *sk, struct sk_buff *skb,
			      int clone_it, gfp_t gfp_mask, u32 rcv_nxt)
{
    const struct inet_connection_sock *icsk = inet_csk(sk);
    ........
    // 封装TCP头部
    th = tcp_hdr(skb);
	th->source		= inet->inet_sport;
	th->dest		= inet->inet_dport;
	th->seq			= htonl(tcb->seq);
	th->ack_seq		= htonl(rcv_nxt);
    
    ......
    //调用网络层接口发送
    err = icsk->icsk_af_ops->queue_xmit(sk, skb, &inet->cork.fl);
}

进入网络层

int ip_queue_xmit(struct sock *sk, struct sk_buff *skb, struct flowi *fl)
{
    struct iphdr *iph;
    // 合作IP层头部
    iph->ttl      = ip_select_ttl(inet, &rt->dst);
	iph->protocol = sk->sk_protocol;
    res = ip_local_out(net, sk, skb);
}
int __ip_local_out(struct net *net, struct sock *sk, struct sk_buff *skb)
{
	struct iphdr *iph = ip_hdr(skb);

	iph->tot_len = htons(skb->len);
	ip_send_check(iph);

	skb->protocol = htons(ETH_P_IP);

	return nf_hook(NFPROTO_IPV4, NF_INET_LOCAL_OUT,
		       net, sk, skb, NULL, skb_dst(skb)->dev,
		       dst_output);
}
static int ip_finish_output2(struct net *net, struct sock *sk, struct sk_buff *skb)
{
    ....
    int res = dst_neigh_output(dst, neigh, skb);
    ......
}

进入数据链路层

// include/net/dst.h
static inline int dst_neigh_output(struct dst_entry *dst, struct neighbour *n,
				   struct sk_buff *skb)
{
	const struct hh_cache *hh;

	if (dst->pending_confirm) {
		unsigned long now = jiffies;

		dst->pending_confirm = 0;
		/* avoid dirtying neighbour */
		if (n->confirmed != now)
			n->confirmed = now;
	}

	hh = &n->hh;
	if ((n->nud_state & NUD_CONNECTED) && hh->hh_len)
		return neigh_hh_output(hh, skb);
}
static inline int neigh_hh_output(const struct hh_cache *hh, struct sk_buff *skb)
{
	unsigned int seq;
	int hh_len;

	do {
		seq = read_seqbegin(&hh->hh_lock);
		hh_len = hh->hh_len;
		if (likely(hh_len <= HH_DATA_MOD)) {
			/* this is inlined by gcc */
			memcpy(skb->data - HH_DATA_MOD, hh->hh_data, HH_DATA_MOD);
		} else {
			int hh_alen = HH_DATA_ALIGN(hh_len);

			memcpy(skb->data - hh_alen, hh->hh_data, hh_alen);
		}
	} while (read_seqretry(&hh->hh_lock, seq));

	skb_push(skb, hh_len);
	return dev_queue_xmit(skb);
}

进入网络子系统

int dev_queue_xmit(struct sk_buff *skb)
{
	return __dev_queue_xmit(skb, NULL);
}
static int __dev_queue_xmit(struct sk_buff *skb, void *accel_priv)
{
    ....
    // 选择发送队列并获取qdisc
    txq = netdev_pick_tx(dev, skb, accel_priv);
	q = rcu_dereference_bh(txq->qdisc);
    ....
    // 继续发送
    rc = __dev_xmit_skb(skb, q, dev, txq);
}

最终调用驱动程序发送

// drivers/net/ethernet/intel/igb/igb_main.c
static const struct net_device_ops igb_netdev_ops = {
	.ndo_open		= igb_open,
	.ndo_stop		= igb_close,
    // send
	.ndo_start_xmit		= igb_xmit_frame,
	.ndo_get_stats64	= igb_get_stats64,
	.ndo_set_rx_mode	= igb_set_rx_mode,
	.ndo_set_mac_address	= igb_set_mac,
	.ndo_change_mtu		= igb_change_mtu,
	.ndo_do_ioctl		= igb_ioctl,
	.ndo_tx_timeout		= igb_tx_timeout,
	.ndo_validate_addr	= eth_validate_addr,
	.ndo_vlan_rx_add_vid	= igb_vlan_rx_add_vid,
	.ndo_vlan_rx_kill_vid	= igb_vlan_rx_kill_vid,
	.ndo_set_vf_mac		= igb_ndo_set_vf_mac,
	.ndo_set_vf_vlan	= igb_ndo_set_vf_vlan,
	.ndo_set_vf_rate	= igb_ndo_set_vf_bw,
	.ndo_set_vf_spoofchk	= igb_ndo_set_vf_spoofchk,
	.ndo_get_vf_config	= igb_ndo_get_vf_config,
#ifdef CONFIG_NET_POLL_CONTROLLER
	.ndo_poll_controller	= igb_netpoll,
#endif
	.ndo_fix_features	= igb_fix_features,
	.ndo_set_features	= igb_set_features,
	.ndo_features_check	= passthru_features_check,
};
static netdev_tx_t igb_xmit_frame(struct sk_buff *skb,
				  struct net_device *netdev)
{
	struct igb_adapter *adapter = netdev_priv(netdev);

	......
	return igb_xmit_frame_ring(skb, igb_tx_queue_mapping(adapter, skb));
}
netdev_tx_t igb_xmit_frame_ring(struct sk_buff *skb,
				struct igb_ring *tx_ring)
{
    // 获取TX queue中的下一个可用的缓冲信息
    first = &tx_ring->tx_buffer_info[tx_ring->next_to_use];
	first->skb = skb;
	first->bytecount = skb->len;
    
    // igb_tx_map 函数准备给设备发送的数据
    igb_tx_map(tx_ring, first, hdr_len);
}

数据发送完成后,还需要释放缓存队列

static irqreturn_t igb_msix_ring(int irq, void *data)
{
	struct igb_q_vector *q_vector = data;

	/* Write the ITR value calculated from the previous interrupt. */
    // 写中断
	igb_write_itr(q_vector);
	// 调用软中断
	napi_schedule(&q_vector->napi);

	return IRQ_HANDLED;
}
/* Called with irq disabled */
static inline void ____napi_schedule(struct softnet_data *sd,
				     struct napi_struct *napi)
{
	list_add_tail(&napi->poll_list, &sd->poll_list);
	__raise_softirq_irqoff(NET_RX_SOFTIRQ);
}
static void net_rx_action(struct softirq_action *h)
{
    .....
    budget -= napi_poll(n, &repoll);
    .....
}
static int napi_poll(struct napi_struct *n, struct list_head *repoll)
{
	.....
    work = n->poll(n, weight);
    .....
}
/**
 *  igb_poll - NAPI Rx polling callback
 *  @napi: napi polling structure
 *  @budget: count of how many packets we should handle
 **/
static int igb_poll(struct napi_struct *napi, int budget)
{
    bool clean_complete = true;
	int work_done = 0;

    #ifdef CONFIG_IGB_DCA
    if (q_vector->adapter->flags & IGB_FLAG_DCA_ENABLED)
        igb_update_dca(q_vector);
    #endif
    if (q_vector->tx.ring)
        // 清理TX_Buffer
        clean_complete = igb_clean_tx_irq(q_vector);
}
static bool igb_clean_tx_irq(struct igb_q_vector *q_vector)
{
    /* clear last DMA location and unmap remaining buffers */
    // 清除DMA区域
    while (tx_desc != eop_desc) {
        ......
    }
}

网卡的准备

  • 服务器的网卡是支持多个队列的,每一个队列都是一个RingBuffer表示

  • 网卡在启动的时候就会初始化分配相关的队列

源码

//file: drivers/net/ethernet/intel/igb/igb_main.c
static int __igb_open(struct net_device *netdev, bool resuming)
{
    struct igb_adapter *adapter = netdev_priv(netdev);

    //分配传输描述符数组
    err = igb_setup_all_tx_resources(adapter);

    //分配接收描述符数组
    err = igb_setup_all_rx_resources(adapter);

    //开启全部队列
    netif_tx_start_all_queues(netdev);
}

__igb_open调用相关的函数分配了所以的RingBuffer(包括传输和接收队列)

//file: drivers/net/ethernet/intel/igb/igb_main.c
static int igb_setup_all_tx_resources(struct igb_adapter *adapter)
{
    //根据队列的个数分配 RingBuffer
    for (i = 0; i < adapter->num_tx_queues; i++) {
        igb_setup_tx_resources(adapter->tx_ring[i]);
    }
}
//file: drivers/net/ethernet/intel/igb/igb_main.c
int igb_setup_tx_resources(struct igb_ring *tx_ring)
{
 //1.申请 igb_tx_buffer 数组内存
 size = sizeof(struct igb_tx_buffer) * tx_ring->count;
 tx_ring->tx_buffer_info = vzalloc(size);

 //2.申请 e1000_adv_tx_desc DMA 数组内存
 tx_ring->size = tx_ring->count * sizeof(union e1000_adv_tx_desc);
 tx_ring->size = ALIGN(tx_ring->size, 4096);
 tx_ring->desc = dma_alloc_coherent(dev, tx_ring->size,
        &tx_ring->dma, GFP_KERNEL);

 //3.初始化队列成员
 tx_ring->next_to_use = 0;
 tx_ring->next_to_clean = 0;
}

评论 1
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值