The Message Send Flow

Overview of the send path

- When an application calls send(), it triggers a system call; the syscall copies the data into kernel space, where it enters the protocol stack (see the runnable sketch below).
- After the protocol stack has processed the data, it is placed into the RingBuffer.
- The driver then transmits the data onto the wire.
- Once transmission completes, the NIC raises an interrupt to notify the CPU.
- On receiving the interrupt, the CPU reclaims the RingBuffer entries.
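To ground the first step, here is a minimal, runnable userspace version of the abridged main() shown under Source below; the address 127.0.0.1 and port 8080 are assumptions for illustration:

#include <arpa/inet.h>
#include <netinet/in.h>
#include <stdio.h>
#include <sys/socket.h>
#include <unistd.h>

int main(void)
{
    // Create a TCP socket; the send() below is what enters the kernel path traced here
    int fd = socket(AF_INET, SOCK_STREAM, 0);
    if (fd < 0) { perror("socket"); return 1; }

    struct sockaddr_in addr = { 0 };
    addr.sin_family = AF_INET;
    addr.sin_port = htons(8080);                      // assumed port
    inet_pton(AF_INET, "127.0.0.1", &addr.sin_addr);  // assumed server address

    if (connect(fd, (struct sockaddr *)&addr, sizeof(addr)) < 0) {
        perror("connect");
        close(fd);
        return 1;
    }

    const char buf[] = "hello";
    if (send(fd, buf, sizeof(buf), 0) < 0)            // triggers the sendto syscall
        perror("send");

    close(fd);
    return 0;
}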
Source
int main()
{
	......
	// Call send() to transmit the data
	send(fd, buf, sizeof(buf), 0);
	return 0;
}
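A detail worth noting: in the kernel, send() is just sendto() with no destination address, which is why the walkthrough continues at the sendto syscall. Abridged from net/socket.c (the exact form varies across kernel versions):

SYSCALL_DEFINE4(send, int, fd, void __user *, buff, size_t, len,
		unsigned int, flags)
{
	return sys_sendto(fd, buff, len, flags, NULL, 0);
}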
SYSCALL_DEFINE6(sendto, int, fd, void __user *, buff, size_t, len,
unsigned int, flags, struct sockaddr __user *, addr,
int, addr_len)
{
struct socket *sock;
struct sockaddr_storage address;
struct msghdr msg;
.......
	// Look up the socket that corresponds to fd
	sock = sockfd_lookup_light(fd, &err, &fput_needed);
	.....
	// Build the message header
	msg.msg_name = (struct sockaddr *)&address;
	msg.msg_namelen = addr_len;
	msg.msg_flags = flags;
	....
	// Send the data
	err = sock_sendmsg(sock, &msg);
	......
	return err;
}
int sock_sendmsg(struct socket *sock, struct msghdr *msg)
{
int err = security_socket_sendmsg(sock, msg,
msg_data_left(msg));
return err ?: sock_sendmsg_nosec(sock, msg);
}
static inline int sock_sendmsg_nosec(struct socket *sock, struct msghdr *msg)
{
int ret = sock->ops->sendmsg(sock, msg, msg_data_left(msg));
BUG_ON(ret == -EIOCBQUEUED);
return ret;
}
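For a TCP socket, sock->ops points to inet_stream_ops, so the indirect call above resolves to inet_sendmsg. Abridged from net/ipv4/af_inet.c:

const struct proto_ops inet_stream_ops = {
	.family		= PF_INET,
	// sock->ops->sendmsg lands here for TCP sockets
	.sendmsg	= inet_sendmsg,
	/* ... other callbacks elided ... */
};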
Next, the protocol stack's functions take over:
// net/ipv4/af_inet.c
int inet_sendmsg(struct socket *sock, struct msghdr *msg, size_t size)
{
struct sock *sk = sock->sk;
sock_rps_record_flow(sk);
/* We may need to bind the socket. */
if (!inet_sk(sk)->inet_num && !sk->sk_prot->no_autobind &&
inet_autobind(sk))
return -EAGAIN;
return sk->sk_prot->sendmsg(sk, msg, size);
}
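In turn, sk->sk_prot for TCP is tcp_prot, so this dispatches to tcp_sendmsg, which copies the user data into the socket's send queue; the actual transmission is then driven through tcp_transmit_skb below. Abridged from net/ipv4/tcp_ipv4.c:

struct proto tcp_prot = {
	.name		= "TCP",
	// sk->sk_prot->sendmsg lands here
	.sendmsg	= tcp_sendmsg,
	/* ... other callbacks elided ... */
};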
Entering transport-layer processing:
static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it,
gfp_t gfp_mask)
{
return __tcp_transmit_skb(sk, skb, clone_it, gfp_mask,
tcp_sk(sk)->rcv_nxt);
}
static int __tcp_transmit_skb(struct sock *sk, struct sk_buff *skb,
int clone_it, gfp_t gfp_mask, u32 rcv_nxt)
{
const struct inet_connection_sock *icsk = inet_csk(sk);
........
	// Fill in the TCP header
th = tcp_hdr(skb);
th->source = inet->inet_sport;
th->dest = inet->inet_dport;
th->seq = htonl(tcb->seq);
th->ack_seq = htonl(rcv_nxt);
......
	// Hand off to the network layer for transmission
err = icsk->icsk_af_ops->queue_xmit(sk, skb, &inet->cork.fl);
}
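For IPv4, icsk->icsk_af_ops points to ipv4_specific, so queue_xmit resolves to ip_queue_xmit. Abridged from net/ipv4/tcp_ipv4.c:

const struct inet_connection_sock_af_ops ipv4_specific = {
	// icsk->icsk_af_ops->queue_xmit lands here
	.queue_xmit	= ip_queue_xmit,
	.send_check	= tcp_v4_send_check,
	/* ... other callbacks elided ... */
};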
Into the network layer
int ip_queue_xmit(struct sock *sk, struct sk_buff *skb, struct flowi *fl)
{
	struct iphdr *iph;
	......
	// Build the IP header
	iph = ip_hdr(skb);
	iph->ttl = ip_select_ttl(inet, &rt->dst);
	iph->protocol = sk->sk_protocol;
	......
	res = ip_local_out(net, sk, skb);
	......
}
int __ip_local_out(struct net *net, struct sock *sk, struct sk_buff *skb)
{
struct iphdr *iph = ip_hdr(skb);
	iph->tot_len = htons(skb->len);	/* fill in the total length */
	ip_send_check(iph);		/* compute the IP header checksum */
	skb->protocol = htons(ETH_P_IP);
	// Traverse the netfilter LOCAL_OUT hook, then continue via dst_output
	return nf_hook(NFPROTO_IPV4, NF_INET_LOCAL_OUT,
		       net, sk, skb, NULL, skb_dst(skb)->dev,
		       dst_output);
}
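Once the netfilter hook passes, dst_output dispatches through the route's output callback; for a unicast IPv4 route that is ip_output, which eventually reaches ip_finish_output2 shown next. From include/net/dst.h:

static inline int dst_output(struct net *net, struct sock *sk, struct sk_buff *skb)
{
	// For unicast IPv4 routes, skb_dst(skb)->output is ip_output
	return skb_dst(skb)->output(net, sk, skb);
}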
static int ip_finish_output2(struct net *net, struct sock *sk, struct sk_buff *skb)
{
	....
	// Hand the packet to the neighbour subsystem for link-layer output
	int res = dst_neigh_output(dst, neigh, skb);
......
}
Into the data link layer
// include/net/dst.h
static inline int dst_neigh_output(struct dst_entry *dst, struct neighbour *n,
struct sk_buff *skb)
{
const struct hh_cache *hh;
if (dst->pending_confirm) {
unsigned long now = jiffies;
dst->pending_confirm = 0;
/* avoid dirtying neighbour */
if (n->confirmed != now)
n->confirmed = now;
}
hh = &n->hh;
	if ((n->nud_state & NUD_CONNECTED) && hh->hh_len)
		// Fast path: a cached L2 header is available
		return neigh_hh_output(hh, skb);
	else
		// Slow path: let the neighbour subsystem resolve it (e.g. via ARP)
		return n->output(n, skb);
}
static inline int neigh_hh_output(const struct hh_cache *hh, struct sk_buff *skb)
{
unsigned int seq;
int hh_len;
do {
seq = read_seqbegin(&hh->hh_lock);
hh_len = hh->hh_len;
if (likely(hh_len <= HH_DATA_MOD)) {
/* this is inlined by gcc */
memcpy(skb->data - HH_DATA_MOD, hh->hh_data, HH_DATA_MOD);
} else {
int hh_alen = HH_DATA_ALIGN(hh_len);
memcpy(skb->data - hh_alen, hh->hh_data, hh_alen);
}
} while (read_seqretry(&hh->hh_lock, seq));
skb_push(skb, hh_len);
return dev_queue_xmit(skb);
}
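What hh_data caches is essentially the 14-byte Ethernet header (stored 16-byte aligned, hence HH_DATA_MOD), so the memcpy above prepends destination MAC, source MAC, and EtherType in one shot. For reference, from include/uapi/linux/if_ether.h:

#define ETH_ALEN 6	/* octets in one ethernet address */

struct ethhdr {
	unsigned char	h_dest[ETH_ALEN];	/* destination MAC address */
	unsigned char	h_source[ETH_ALEN];	/* source MAC address */
	__be16		h_proto;		/* packet type (EtherType) */
} __attribute__((packed));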
Into the network device subsystem
int dev_queue_xmit(struct sk_buff *skb)
{
return __dev_queue_xmit(skb, NULL);
}
static int __dev_queue_xmit(struct sk_buff *skb, void *accel_priv)
{
	....
	// Pick a TX queue and fetch its queueing discipline (qdisc)
	txq = netdev_pick_tx(dev, skb, accel_priv);
	q = rcu_dereference_bh(txq->qdisc);
	....
	// Continue transmission through the qdisc
	rc = __dev_xmit_skb(skb, q, dev, txq);
}
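Whether the skb is queued on the qdisc or sent directly, it eventually reaches dev_hard_start_xmit, whose essential job is to dispatch into the driver through net_device_ops. A simplified sketch of that final hop (hard_start_xmit_sketch is illustrative; the real function also handles skb lists, xmit_more, and error paths):

// Illustrative sketch: the core step of dev_hard_start_xmit is the
// dispatch into the driver's ndo_start_xmit callback.
static netdev_tx_t hard_start_xmit_sketch(struct sk_buff *skb,
					  struct net_device *dev)
{
	const struct net_device_ops *ops = dev->netdev_ops;

	// For the igb driver this resolves to igb_xmit_frame (see the ops table below)
	return ops->ndo_start_xmit(skb, dev);
}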
Finally, the driver is invoked to actually transmit:
// drivers/net/ethernet/intel/igb/igb_main.c
static const struct net_device_ops igb_netdev_ops = {
.ndo_open = igb_open,
.ndo_stop = igb_close,
	// Transmit entry point
	.ndo_start_xmit = igb_xmit_frame,
.ndo_get_stats64 = igb_get_stats64,
.ndo_set_rx_mode = igb_set_rx_mode,
.ndo_set_mac_address = igb_set_mac,
.ndo_change_mtu = igb_change_mtu,
.ndo_do_ioctl = igb_ioctl,
.ndo_tx_timeout = igb_tx_timeout,
.ndo_validate_addr = eth_validate_addr,
.ndo_vlan_rx_add_vid = igb_vlan_rx_add_vid,
.ndo_vlan_rx_kill_vid = igb_vlan_rx_kill_vid,
.ndo_set_vf_mac = igb_ndo_set_vf_mac,
.ndo_set_vf_vlan = igb_ndo_set_vf_vlan,
.ndo_set_vf_rate = igb_ndo_set_vf_bw,
.ndo_set_vf_spoofchk = igb_ndo_set_vf_spoofchk,
.ndo_get_vf_config = igb_ndo_get_vf_config,
#ifdef CONFIG_NET_POLL_CONTROLLER
.ndo_poll_controller = igb_netpoll,
#endif
.ndo_fix_features = igb_fix_features,
.ndo_set_features = igb_set_features,
.ndo_features_check = passthru_features_check,
};
static netdev_tx_t igb_xmit_frame(struct sk_buff *skb,
struct net_device *netdev)
{
struct igb_adapter *adapter = netdev_priv(netdev);
......
return igb_xmit_frame_ring(skb, igb_tx_queue_mapping(adapter, skb));
}
netdev_tx_t igb_xmit_frame_ring(struct sk_buff *skb,
				struct igb_ring *tx_ring)
{
	struct igb_tx_buffer *first;
	......
	// Grab the next available buffer slot in the TX queue
	first = &tx_ring->tx_buffer_info[tx_ring->next_to_use];
	first->skb = skb;
	first->bytecount = skb->len;
	......
	// igb_tx_map prepares the data (descriptors + DMA mappings) for the device
	igb_tx_map(tx_ring, first, hdr_len);
	......
}
After the data has been sent, the driver still has to reclaim the buffers in the TX queue. Note that the completion interrupt schedules NAPI, which raises NET_RX_SOFTIRQ, so even TX cleanup runs in the same softirq context as packet reception.
static irqreturn_t igb_msix_ring(int irq, void *data)
{
struct igb_q_vector *q_vector = data;
	/* Write the ITR value calculated from the previous interrupt. */
	igb_write_itr(q_vector);
	// Schedule NAPI; this raises the softirq
	napi_schedule(&q_vector->napi);
return IRQ_HANDLED;
}
/* Called with irq disabled */
static inline void ____napi_schedule(struct softnet_data *sd,
struct napi_struct *napi)
{
list_add_tail(&napi->poll_list, &sd->poll_list);
__raise_softirq_irqoff(NET_RX_SOFTIRQ);
}
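The softirq raised here is NET_RX_SOFTIRQ, whose handler was registered at boot; that registration is what connects the interrupt path to net_rx_action below. Abridged from net_dev_init in net/core/dev.c:

static int __init net_dev_init(void)
{
	......
	open_softirq(NET_TX_SOFTIRQ, net_tx_action);
	open_softirq(NET_RX_SOFTIRQ, net_rx_action);
	......
}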
static void net_rx_action(struct softirq_action *h)
{
.....
budget -= napi_poll(n, &repoll);
.....
}
static int napi_poll(struct napi_struct *n, struct list_head *repoll)
{
.....
work = n->poll(n, weight);
.....
}
/**
* igb_poll - NAPI Rx polling callback
* @napi: napi polling structure
* @budget: count of how many packets we should handle
**/
static int igb_poll(struct napi_struct *napi, int budget)
{
	struct igb_q_vector *q_vector = container_of(napi,
						     struct igb_q_vector,
						     napi);
	bool clean_complete = true;
	int work_done = 0;
#ifdef CONFIG_IGB_DCA
	if (q_vector->adapter->flags & IGB_FLAG_DCA_ENABLED)
		igb_update_dca(q_vector);
#endif
	if (q_vector->tx.ring)
		// Reclaim completed TX buffers
		clean_complete = igb_clean_tx_irq(q_vector);
	......
}
static bool igb_clean_tx_irq(struct igb_q_vector *q_vector)
{
	/* clear last DMA location and unmap remaining buffers */
	// Walk the completed descriptors, unmapping DMA and freeing buffers
while (tx_desc != eop_desc) {
......
}
}
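The elided loop follows the usual NIC TX-completion pattern: advance next_to_clean up to the descriptor hardware has marked done, unmapping and freeing each buffer along the way. A simplified sketch of that pattern (clean_tx_ring_sketch and its done parameter are illustrative, not the literal igb code):

// Simplified sketch of the TX-completion pattern behind igb_clean_tx_irq:
// walk next_to_clean forward to the slot hardware reports as finished,
// releasing each completed buffer.
static void clean_tx_ring_sketch(struct igb_ring *tx_ring, unsigned int done)
{
	unsigned int i = tx_ring->next_to_clean;

	while (i != done) {
		struct igb_tx_buffer *buf = &tx_ring->tx_buffer_info[i];

		/* the real code also dma_unmap()s the buffer here */
		dev_kfree_skb_any(buf->skb);	/* free the transmitted skb */
		buf->skb = NULL;		/* clear the slot for reuse */

		if (++i == tx_ring->count)	/* wrap around the ring */
			i = 0;
	}
	tx_ring->next_to_clean = i;
}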
NIC preparation

- A server NIC supports multiple queues, each of which is represented by a RingBuffer.
- The NIC initializes and allocates these queues when it is brought up.
Source
//file: drivers/net/ethernet/intel/igb/igb_main.c
static int __igb_open(struct net_device *netdev, bool resuming)
{
struct igb_adapter *adapter = netdev_priv(netdev);
	//Allocate the transmit descriptor arrays
	err = igb_setup_all_tx_resources(adapter);
	//Allocate the receive descriptor arrays
	err = igb_setup_all_rx_resources(adapter);
	//Start all the queues
	netif_tx_start_all_queues(netdev);
}
__igb_open calls helper functions that allocate all of the RingBuffers (both transmit and receive queues):
//file: drivers/net/ethernet/intel/igb/igb_main.c
static int igb_setup_all_tx_resources(struct igb_adapter *adapter)
{
	//Allocate one RingBuffer per TX queue
for (i = 0; i < adapter->num_tx_queues; i++) {
igb_setup_tx_resources(adapter->tx_ring[i]);
}
}
//file: drivers/net/ethernet/intel/igb/igb_main.c
int igb_setup_tx_resources(struct igb_ring *tx_ring)
{
	//1. Allocate the igb_tx_buffer array
	size = sizeof(struct igb_tx_buffer) * tx_ring->count;
	tx_ring->tx_buffer_info = vzalloc(size);
	//2. Allocate the e1000_adv_tx_desc DMA descriptor array
	tx_ring->size = tx_ring->count * sizeof(union e1000_adv_tx_desc);
	tx_ring->size = ALIGN(tx_ring->size, 4096);
	tx_ring->desc = dma_alloc_coherent(dev, tx_ring->size,
					   &tx_ring->dma, GFP_KERNEL);
	//3. Initialize the queue cursors
	tx_ring->next_to_use = 0;
	tx_ring->next_to_clean = 0;
}
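next_to_use and next_to_clean are the two cursors that turn the flat descriptor array into a ring: transmission (igb_xmit_frame_ring above) advances next_to_use, completion (igb_clean_tx_irq) advances next_to_clean, and the ring is full when the producer would catch up with the consumer. A minimal standalone model of that index arithmetic (illustrative, not driver code):

#include <stdbool.h>

// Minimal model of the two-cursor ring used by the TX queue.
struct ring_model {
	unsigned int count;		// number of descriptor slots
	unsigned int next_to_use;	// producer cursor (driver fills slots)
	unsigned int next_to_clean;	// consumer cursor (completion frees slots)
};

// Slots still available for new transmissions.
static unsigned int ring_unused(const struct ring_model *r)
{
	// One slot is kept empty so "full" and "empty" are distinguishable
	return (r->next_to_clean + r->count - r->next_to_use - 1) % r->count;
}

static bool ring_post(struct ring_model *r)
{
	if (ring_unused(r) == 0)
		return false;	// ring full: the stack must back off
	r->next_to_use = (r->next_to_use + 1) % r->count;
	return true;
}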