版权声明:本文为博主原创文章,未经博主允许不得转载。 https://blog.csdn.net/hz5034/article/details/79952195
dev_queue_xmit()
协议栈出口函数是dev_queue_xmit(),协议栈通过dev_queue_xmit()将skb下送网卡驱动
dev_queue_xmit() -> dev_hard_start_xmit() -> ops->ndo_start_xmit()即ixgbe_xmit_frame() -> ixgbe_tx_map() + ixgbe_tx_queue()
/**
* dev_queue_xmit - transmit a buffer
* @skb: buffer to transmit
*
* Queue a buffer for transmission to a network device. The caller must
* have set the device and priority and built the buffer before calling
* this function. The function can be called from an interrupt.
*
* A negative errno code is returned on a failure. A success does not
* guarantee the frame will be transmitted as it may be dropped due
* to congestion or traffic shaping.
*
* -----------------------------------------------------------------------------------
* I notice this method can also return errors from the queue disciplines,
* including NET_XMIT_DROP, which is a positive value. So, errors can also
* be positive.
*
* Regardless of the return value, the skb is consumed, so it is currently
* difficult to retry a send to this method. (You can bump the ref count
* before sending to hold a reference for retry if you are careful.)
*
* When calling this method, interrupts MUST be enabled. This is because
* the BH enable code must have IRQs enabled so that it will not deadlock.
* --BLG
*/
int dev_queue_xmit(struct sk_buff *skb)
{
struct net_device *dev = skb->dev;
struct netdev_queue *txq;
struct Qdisc *q;
int rc = -ENOMEM;
/* GSO will handle the following emulations directly. */
// 若skb是GSO(Generic Segmentation Offload)包且网络设备支持GSO
if (netif_needs_gso(dev, skb))
goto gso;
if (skb_has_frags(skb) &&
!(dev->features & NETIF_F_FRAGLIST) &&
__skb_linearize(skb))
goto out_kfree_skb;
/* Fragmented skb is linearized if device does not support SG,
* or if at least one of fragments is in highmem and device
* does not support DMA from it.
*/
if (skb_shinfo(skb)->nr_frags &&
(!(dev->features & NETIF_F_SG) || illegal_highdma(dev, skb)) &&
__skb_linearize(skb))
goto out_kfree_skb;
/* If packet is not checksummed and device does not support
* checksumming for this protocol, complete checksumming here.
*/
if (skb->ip_summed == CHECKSUM_PARTIAL) {
skb_set_transport_header(skb, skb->csum_start -
skb_headroom(skb));
if (!dev_can_checksum(dev, skb) && skb_checksum_help(skb))
goto out_kfree_skb;
}
gso:
/* Disable soft irqs for various locks below. Also
* stops preemption for RCU.
*/
rcu_read_lock_bh();
txq = dev_pick_tx(dev, skb); // 选择发送队列
q = rcu_dereference(txq->qdisc); // 得到网络设备的排队规则
#ifdef CONFIG_NET_CLS_ACT
skb->tc_verd = SET_TC_AT(skb->tc_verd, AT_EGRESS);
#endif
if (q->enqueue) { // 若定义了入队操作(使用qdisc)
rc = __dev_xmit_skb(skb, q, dev, txq); // qdisc入出队
goto out;
}
/* The device has no queue. Common case for software devices:
loopback, all the sorts of tunnels...
Really, it is unlikely that netif_tx_lock protection is necessary
here. (f.e. loopback and IP tunnels are clean ignoring statistics
counters.)
However, it is possible, that they rely on protection
made by us here.
Check this and shot the lock. It is not prone from deadlocks.
Either shot noqueue qdisc, it is even simpler 8)
*/
if (dev->flags & IFF_UP) {
int cpu = smp_processor_id(); /* ok because BHs are off */
if (txq->xmit_lock_owner != cpu) {
HARD_TX_LOCK(dev, txq, cpu);
if (!netif_tx_queue_stopped(txq)) {
rc = NET_XMIT_SUCCESS;
// dev_queue_xmit() -> dev_hard_start_xmit() -> ops->ndo_start_xmit()
if (!dev_hard_start_xmit(skb, dev, txq)) {
HARD_TX_UNLOCK(dev, txq);
goto out;
}
}
HARD_TX_UNLOCK(dev, txq);
if (net_ratelimit())
printk(KERN_CRIT "Virtual device %s asks to "
"queue packet!\n", dev->name);
} else {
/* Recursion is detected! It is possible,
* unfortunately */
if (net_ratelimit())
printk(KERN_CRIT "Dead loop on virtual device "
"%s, fix it urgently!\n", dev->name);
}
}
rc = -ENETDOWN;
rcu_read_unlock_bh();
out_kfree_skb:
kfree_skb(skb);
return rc;
out:
rcu_read_unlock_bh();
return rc;
}
dev_pick_tx()
static struct netdev_queue *dev_pick_tx(struct net_device *dev,
struct sk_buff *skb)
{
const struct net_device_ops *ops = dev->netdev_ops;
u16 queue_index = 0;
if (ops->ndo_select_queue) // ixgbe为ixgbe_select_queue()
queue_index = ops->ndo_select_queue(dev, skb);
else if (dev->real_num_tx_queues > 1)
queue_index = skb_tx_hash(dev, skb);
skb_set_queue_mapping(skb, queue_index);
return netdev_get_tx_queue(dev, queue_index);
}
static u16 ixgbe_select_queue(struct net_device *dev, struct sk_buff *skb)
{
struct ixgbe_adapter *adapter = netdev_priv(dev);
int txq = smp_processor_id(); // 得到当前CPU id
if (adapter->flags & IXGBE_FLAG_FDIR_HASH_CAPABLE) {
while (unlikely(txq >= dev->real_num_tx_queues))
txq -= dev->real_num_tx_queues;
return txq; // 返回当前CPU id
}
if (adapter->flags & IXGBE_FLAG_DCB_ENABLED)
return (skb->vlan_tci & IXGBE_TX_FLAGS_VLAN_PRIO_MASK) >> 13;
return skb_tx_hash(dev, skb);
}
static inline void skb_set_queue_mapping(struct sk_buff *skb, u16 queue_mapping)
{
skb->queue_mapping = queue_mapping;
}
static inline
struct netdev_queue *netdev_get_tx_queue(const struct net_device *dev,
unsigned int index)
{
return &dev->_tx[index]; // 得到发送队列的netdev_queue
}
__dev_xmit_skb()
static inline int __dev_xmit_skb(struct sk_buff *skb, struct Qdisc *q,
struct net_device *dev,
struct netdev_queue *txq)
{
spinlock_t *root_lock = qdisc_lock(q);
int rc;
spin_lock(root_lock);
if (unlikely(test_bit(__QDISC_STATE_DEACTIVATED, &q->state))) {
kfree_skb(skb);
rc = NET_XMIT_DROP;
} else if ((q->flags & TCQ_F_CAN_BYPASS) && !qdisc_qlen(q) &&
!test_and_set_bit(__QDISC_STATE_RUNNING, &q->state)) {
/*
* This is a work-conserving queue; there are no old skbs
* waiting to be sent out; and the qdisc is not running -
* xmit the skb directly.
*/
__qdisc_update_bstats(q, skb->len);
if (sch_direct_xmit(skb, q, dev, txq, root_lock))
__qdisc_run(q);
else
clear_bit(__QDISC_STATE_RUNNING, &q->state);
rc = NET_XMIT_SUCCESS;
} else {
rc = qdisc_enqueue_root(skb, q);
qdisc_run(q);
}
spin_unlock(root_lock);
return rc;
}
static inline int qdisc_enqueue_root(struct sk_buff *skb, struct Qdisc *sch)
{
qdisc_skb_cb(skb)->pkt_len = skb->len;
return qdisc_enqueue(skb, sch) & NET_XMIT_MASK;
}
static inline void qdisc_run(struct Qdisc *q)
{
if (!test_and_set_bit(__QDISC_STATE_RUNNING, &q->state))
__qdisc_run(q);
}
qdisc_enqueue()
static inline int qdisc_enqueue(struct sk_buff *skb, struct Qdisc *sch)
{
#ifdef CONFIG_NET_SCHED
if (sch->stab)
qdisc_calculate_pkt_len(skb, sch->stab);
#endif
return sch->enqueue(skb, sch); // pfifo_fast_ops为pfifo_fast_enqueue()
}
static int pfifo_fast_enqueue(struct sk_buff *skb, struct Qdisc* qdisc)
{
/* 若发送队列未满
在ether_setup()中设置dev->tx_queue_len为1000 */
if (skb_queue_len(&qdisc->q) < qdisc_dev(qdisc)->tx_queue_len) {
int band = prio2band[skb->priority & TC_PRIO_MAX]; // prio2band数组
struct pfifo_fast_priv *priv = qdisc_priv(qdisc);
struct sk_buff_head *list = band2list(priv, band); // band2list()
priv->bitmap |= (1 << band);
qdisc->q.qlen++;
return __qdisc_enqueue_tail(skb, qdisc, list); // 入队
}
return qdisc_drop(skb, qdisc); // 若发送队列满,丢包
}
__qdisc_run()
/*
* Transmit one skb, and handle the return status as required. Holding the
* __QDISC_STATE_RUNNING bit guarantees that only one CPU can execute this
* function.
*
* Returns to the caller:
* 0 - queue is empty or throttled.
* >0 - queue is not empty.
*/
int sch_direct_xmit(struct sk_buff *skb, struct Qdisc *q,
struct net_device *dev, struct netdev_queue *txq,
spinlock_t *root_lock)
{
int ret = NETDEV_TX_BUSY;
/* And release qdisc */
spin_unlock(root_lock);
HARD_TX_LOCK(dev, txq, smp_processor_id());
if (!netif_tx_queue_stopped(txq) &&
!netif_tx_queue_frozen(txq))
// dev_queue_xmit() -> dev_hard_start_xmit() -> ops->ndo_start_xmit()
ret = dev_hard_start_xmit(skb, dev, txq);
HARD_TX_UNLOCK(dev, txq);
spin_lock(root_lock);
switch (ret) {
case NETDEV_TX_OK:
/* Driver sent out skb successfully */
ret = qdisc_qlen(q);
break;
case NETDEV_TX_LOCKED:
/* Driver try lock failed */
ret = handle_dev_cpu_collision(skb, txq, q);
break;
default:
/* Driver returned NETDEV_TX_BUSY - requeue skb */
if (unlikely (ret != NETDEV_TX_BUSY && net_ratelimit()))
printk(KERN_WARNING "BUG %s code %d qlen %d\n",
dev->name, ret, q->q.qlen);
ret = dev_requeue_skb(skb, q);
break;
}
if (ret && (netif_tx_queue_stopped(txq) ||
netif_tx_queue_frozen(txq)))
ret = 0;
return ret;
}
/*
* NOTE: Called under qdisc_lock(q) with locally disabled BH.
*
* __QDISC_STATE_RUNNING guarantees only one CPU can process
* this qdisc at a time. qdisc_lock(q) serializes queue accesses for
* this queue.
*
* netif_tx_lock serializes accesses to device driver.
*
* qdisc_lock(q) and netif_tx_lock are mutually exclusive,
* if one is grabbed, another must be free.
*
* Note, that this procedure can be called by a watchdog timer
*
* Returns to the caller:
* 0 - queue is empty or throttled.
* >0 - queue is not empty.
*
*/
static inline int qdisc_restart(struct Qdisc *q)
{
struct netdev_queue *txq;
struct net_device *dev;
spinlock_t *root_lock;
struct sk_buff *skb;
/* Dequeue packet */
skb = dequeue_skb(q);
if (unlikely(!skb))
return 0;
root_lock = qdisc_lock(q);
dev = qdisc_dev(q);
txq = netdev_get_tx_queue(dev, skb_get_queue_mapping(skb));
return sch_direct_xmit(skb, q, dev, txq, root_lock);
}
void __qdisc_run(struct Qdisc *q)
{
unsigned long start_time = jiffies;
while (qdisc_restart(q)) {
/*
* Postpone processing if
* 1. another process needs the CPU;
* 2. we've been doing it for too long.
*/
if (need_resched() || jiffies != start_time) {
// __netif_schedule() -> __netif_reschedule() -> raise_softirq_irqoff()
__netif_schedule(q);
break;
}
}
clear_bit(__QDISC_STATE_RUNNING, &q->state);
}
netif_receive_skb()
协议栈入口函数是netif_receive_skb(),网卡驱动通过netif_receive_skb()将skb上送协议栈
ixgbe_receive_skb() -> napi_gro_receive() -> napi_skb_finish() -> netif_receive_skb() -> ip_rcv()
/**
* netif_receive_skb - process receive buffer from network
* @skb: buffer to process
*
* netif_receive_skb() is the main receive data processing function.
* It always succeeds. The buffer may be dropped during processing
* for congestion control or by the protocol layers.
*
* This function may only be called from softirq context and interrupts
* should be enabled.
*
* Return values (usually ignored):
* NET_RX_SUCCESS: no congestion
* NET_RX_DROP: packet was dropped
*/
int netif_receive_skb(struct sk_buff *skb)
{
struct packet_type *ptype, *pt_prev;
struct net_device *orig_dev;
struct net_device *null_or_orig;
int ret = NET_RX_DROP;
__be16 type;
if (!skb->tstamp.tv64)
net_timestamp(skb);
if (skb->vlan_tci && vlan_hwaccel_do_receive(skb))
return NET_RX_SUCCESS;
/* if we've gotten here through NAPI, check netpoll */
if (netpoll_receive_skb(skb))
return NET_RX_DROP;
if (!skb->iif)
skb->iif = skb->dev->ifindex;
null_or_orig = NULL;
orig_dev = skb->dev;
if (orig_dev->master) {
if (skb_bond_should_drop(skb))
null_or_orig = orig_dev; /* deliver only exact match */
else
skb->dev = orig_dev->master;
}
__get_cpu_var(netdev_rx_stat).total++;
skb_reset_network_header(skb);
skb_reset_transport_header(skb);
skb->mac_len = skb->network_header - skb->mac_header;
pt_prev = NULL;
rcu_read_lock();
#ifdef CONFIG_NET_CLS_ACT
if (skb->tc_verd & TC_NCLS) {
skb->tc_verd = CLR_TC_NCLS(skb->tc_verd);
goto ncls;
}
#endif
// 遍历ptype_all hlist
list_for_each_entry_rcu(ptype, &ptype_all, list) {
if (ptype->dev == null_or_orig || ptype->dev == skb->dev ||
ptype->dev == orig_dev) {
if (pt_prev)
ret = deliver_skb(skb, pt_prev, orig_dev);
pt_prev = ptype;
}
}
#ifdef CONFIG_NET_CLS_ACT
skb = handle_ing(skb, &pt_prev, &ret, orig_dev);
if (!skb)
goto out;
ncls:
#endif
skb = handle_bridge(skb, &pt_prev, &ret, orig_dev);
if (!skb)
goto out;
skb = handle_macvlan(skb, &pt_prev, &ret, orig_dev);
if (!skb)
goto out;
type = skb->protocol;
// 遍历ptype_base hlist,通过type得到packet_type
list_for_each_entry_rcu(ptype,
&ptype_base[ntohs(type) & PTYPE_HASH_MASK], list) {
if (ptype->type == type &&
(ptype->dev == null_or_orig || ptype->dev == skb->dev ||
ptype->dev == orig_dev)) {
if (pt_prev)
ret = deliver_skb(skb, pt_prev, orig_dev);
pt_prev = ptype;
}
}
if (pt_prev) {
ret = pt_prev->func(skb, skb->dev, pt_prev, orig_dev); // 调用packet_type的func
} else {
kfree_skb(skb);
/* Jamal, now you will not able to escape explaining
* me how you were going to use this. :-)
*/
ret = NET_RX_DROP;
}
out:
rcu_read_unlock();
return ret;
}
ip_packet_type
static struct packet_type ip_packet_type __read_mostly = {
.type = cpu_to_be16(ETH_P_IP),
.func = ip_rcv, // ip_packet_type的func
.gso_send_check = inet_gso_send_check,
.gso_segment = inet_gso_segment,
.gro_receive = inet_gro_receive,
.gro_complete = inet_gro_complete,
};
eth_type_trans()
/**
* eth_type_trans - determine the packet's protocol ID.
* @skb: received socket data
* @dev: receiving network device
*
* The rule here is that we
* assume 802.3 if the type field is short enough to be a length.
* This is normal practice and works for any 'now in use' protocol.
*/
__be16 eth_type_trans(struct sk_buff *skb, struct net_device *dev)
{
struct ethhdr *eth;
unsigned char *rawp;
skb->dev = dev;
skb_reset_mac_header(skb);
skb_pull(skb, ETH_HLEN);
eth = eth_hdr(skb);
if (unlikely(is_multicast_ether_addr(eth->h_dest))) { // mac地址的最低位为1表示多/广播,为0表示单播
if (!compare_ether_addr_64bits(eth->h_dest, dev->broadcast))
skb->pkt_type = PACKET_BROADCAST;
else
skb->pkt_type = PACKET_MULTICAST;
}
/*
* This ALLMULTI check should be redundant by 1.4
* so don't forget to remove it.
*
* Seems, you forgot to remove it. All silly devices
* seems to set IFF_PROMISC.
*/
else if (1 /*dev->flags&IFF_PROMISC */ ) {
if (unlikely(compare_ether_addr_64bits(eth->h_dest, dev->dev_addr))) // 比较skb的目的mac地址和网卡的mac地址
skb->pkt_type = PACKET_OTHERHOST; // 默认为PACKET_HOST
}
/*
* Some variants of DSA tagging don't have an ethertype field
* at all, so we check here whether one of those tagging
* variants has been configured on the receiving interface,
* and if so, set skb->protocol without looking at the packet.
*/
if (netdev_uses_dsa_tags(dev))
return htons(ETH_P_DSA);
if (netdev_uses_trailer_tags(dev))
return htons(ETH_P_TRAILER);
if (ntohs(eth->h_proto) >= 1536) // type/size大于0x0600表示type
return eth->h_proto; // 返回L3协议
rawp = skb->data;
/*
* This is a magic hack to spot IPX packets. Older Novell breaks
* the protocol design and runs IPX over 802.3 without an 802.2 LLC
* layer. We look for FFFF which isn't a used 802.2 SSAP/DSAP. This
* won't work for fault tolerant netware but does for the rest.
*/
if (*(unsigned short *)rawp == 0xFFFF)
return htons(ETH_P_802_3);
/*
* Real 802.2 LLC
*/
return htons(ETH_P_802_2);
}
EXPORT_SYMBOL(eth_type_trans);
- 上一篇 网络性能测试工具
查看评论