Protocol Stack Entry and Exit Functions

dev_queue_xmit()

The exit function of the protocol stack is dev_queue_xmit(): the stack calls dev_queue_xmit() to hand an skb down to the NIC driver.

dev_queue_xmit() -> dev_hard_start_xmit() -> ops->ndo_start_xmit() (ixgbe_xmit_frame()) -> ixgbe_tx_map() + ixgbe_tx_queue()
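As the kernel comment below states, a caller must set the output device and priority and fully build the frame before calling dev_queue_xmit(), and the skb is consumed whether the call succeeds or fails. A minimal, hypothetical caller sketch (example_xmit and its arguments are made up for illustration; the real callers are the L3 output paths such as ip_finish_output2()):

/* Hypothetical fragment: hand a fully built Ethernet frame to
 * dev_queue_xmit(), following the contract described in the comment
 * below (device and priority set, IRQs enabled when calling).
 */
static int example_xmit(struct net_device *dev, const void *frame, unsigned int len)
{
    struct sk_buff *skb = alloc_skb(len + LL_RESERVED_SPACE(dev), GFP_ATOMIC);

    if (!skb)
        return -ENOMEM;

    skb_reserve(skb, LL_RESERVED_SPACE(dev));
    memcpy(skb_put(skb, len), frame, len);   /* complete frame incl. Ethernet header */
    skb_reset_mac_header(skb);

    skb->dev = dev;                          /* caller must set the output device */
    skb->priority = 0;                       /* ... and the priority */
    skb->protocol = htons(ETH_P_IP);         /* assuming an IPv4 payload */

    /* the skb is consumed regardless of the return value */
    return dev_queue_xmit(skb);
}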

/**
 * dev_queue_xmit - transmit a buffer
 * @skb: buffer to transmit
 *
 * Queue a buffer for transmission to a network device. The caller must
 * have set the device and priority and built the buffer before calling
 * this function. The function can be called from an interrupt.
 *
 * A negative errno code is returned on a failure. A success does not
 * guarantee the frame will be transmitted as it may be dropped due
 * to congestion or traffic shaping.
 *
 * -----------------------------------------------------------------------------------
 *      I notice this method can also return errors from the queue disciplines,
 *      including NET_XMIT_DROP, which is a positive value.  So, errors can also
 *      be positive.
 *
 *      Regardless of the return value, the skb is consumed, so it is currently
 *      difficult to retry a send to this method.  (You can bump the ref count
 *      before sending to hold a reference for retry if you are careful.)
 *
 *      When calling this method, interrupts MUST be enabled.  This is because
 *      the BH enable code must have IRQs enabled so that it will not deadlock.
 *          --BLG
 */
int dev_queue_xmit(struct sk_buff *skb)
{
   struct net_device *dev = skb->dev;
   struct netdev_queue *txq;
   struct Qdisc *q;
   int rc = -ENOMEM;

   /* GSO will handle the following emulations directly. */
   // If the skb is a GSO packet the device cannot segment itself, skip ahead: software GSO in dev_hard_start_xmit() handles the emulations below
   if (netif_needs_gso(dev, skb))
      goto gso;

   if (skb_has_frags(skb) &&
       !(dev->features & NETIF_F_FRAGLIST) &&
       __skb_linearize(skb))
      goto out_kfree_skb;

   /* Fragmented skb is linearized if device does not support SG,
    * or if at least one of fragments is in highmem and device
    * does not support DMA from it.
    */
   if (skb_shinfo(skb)->nr_frags &&
       (!(dev->features & NETIF_F_SG) || illegal_highdma(dev, skb)) &&
       __skb_linearize(skb))
      goto out_kfree_skb;

   /* If packet is not checksummed and device does not support
    * checksumming for this protocol, complete checksumming here.
    */
   if (skb->ip_summed == CHECKSUM_PARTIAL) {
      skb_set_transport_header(skb, skb->csum_start -
                     skb_headroom(skb));
      if (!dev_can_checksum(dev, skb) && skb_checksum_help(skb))
         goto out_kfree_skb;
   }

gso:
   /* Disable soft irqs for various locks below. Also
    * stops preemption for RCU.
    */
   rcu_read_lock_bh();

   txq = dev_pick_tx(dev, skb); // pick a transmit queue
   q = rcu_dereference(txq->qdisc); // get the queueing discipline (qdisc) attached to that queue

#ifdef CONFIG_NET_CLS_ACT
   skb->tc_verd = SET_TC_AT(skb->tc_verd, AT_EGRESS);
#endif
   if (q->enqueue) { // an enqueue hook is defined, i.e. a real qdisc is attached
      rc = __dev_xmit_skb(skb, q, dev, txq); // enqueue to / dequeue from the qdisc
      goto out;
   }

   /* The device has no queue. Common case for software devices:
      loopback, all the sorts of tunnels...

      Really, it is unlikely that netif_tx_lock protection is necessary
      here.  (f.e. loopback and IP tunnels are clean ignoring statistics
      counters.)
      However, it is possible, that they rely on protection
      made by us here.

      Check this and shot the lock. It is not prone from deadlocks.
      Either shot noqueue qdisc, it is even simpler 8)
    */
   if (dev->flags & IFF_UP) {
      int cpu = smp_processor_id(); /* ok because BHs are off */

      if (txq->xmit_lock_owner != cpu) {

         HARD_TX_LOCK(dev, txq, cpu);

         if (!netif_tx_queue_stopped(txq)) {
            rc = NET_XMIT_SUCCESS;
            // dev_queue_xmit() -> dev_hard_start_xmit() -> ops->ndo_start_xmit()
            if (!dev_hard_start_xmit(skb, dev, txq)) {
               HARD_TX_UNLOCK(dev, txq);
               goto out;
            }
         }
         HARD_TX_UNLOCK(dev, txq);
         if (net_ratelimit())
            printk(KERN_CRIT "Virtual device %s asks to "
                   "queue packet!\n", dev->name);
      } else {
         /* Recursion is detected! It is possible,
          * unfortunately */
         if (net_ratelimit())
            printk(KERN_CRIT "Dead loop on virtual device "
                   "%s, fix it urgently!\n", dev->name);
      }
   }

   rc = -ENETDOWN;
   rcu_read_unlock_bh();

out_kfree_skb:
   kfree_skb(skb);
   return rc;
out:
   rcu_read_unlock_bh();
   return rc;
}

dev_pick_tx()

static struct netdev_queue *dev_pick_tx(struct net_device *dev,
                    struct sk_buff *skb)
{
    const struct net_device_ops *ops = dev->netdev_ops;
    u16 queue_index = 0;

    if (ops->ndo_select_queue) // for ixgbe this is ixgbe_select_queue()
        queue_index = ops->ndo_select_queue(dev, skb);
    else if (dev->real_num_tx_queues > 1)
        queue_index = skb_tx_hash(dev, skb);

    skb_set_queue_mapping(skb, queue_index);
    return netdev_get_tx_queue(dev, queue_index);
}

static u16 ixgbe_select_queue(struct net_device *dev, struct sk_buff *skb)
{
    struct ixgbe_adapter *adapter = netdev_priv(dev);
    int txq = smp_processor_id(); // current CPU id

    if (adapter->flags & IXGBE_FLAG_FDIR_HASH_CAPABLE) {
        while (unlikely(txq >= dev->real_num_tx_queues))
            txq -= dev->real_num_tx_queues;
        return txq; // return the current CPU id (wrapped into the valid queue range)
    }

    if (adapter->flags & IXGBE_FLAG_DCB_ENABLED)
        return (skb->vlan_tci & IXGBE_TX_FLAGS_VLAN_PRIO_MASK) >> 13;

    return skb_tx_hash(dev, skb);
}
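When the driver does not supply ndo_select_queue(), dev_pick_tx() falls back to skb_tx_hash(), which hashes the flow (e.g. sk->sk_hash, or a recorded rx queue) and scales the 32-bit hash into the range of real tx queues. A simplified illustration of that scaling (not the exact kernel code):

/* Simplified sketch of the skb_tx_hash() idea: a multiply-and-shift
 * maps a 32-bit flow hash into [0, real_num_tx_queues) without a modulo.
 */
static u16 example_tx_hash(u32 flow_hash, u16 real_num_tx_queues)
{
    return (u16)(((u64)flow_hash * real_num_tx_queues) >> 32);
}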

static inline void skb_set_queue_mapping(struct sk_buff *skb, u16 queue_mapping)
{
    skb->queue_mapping = queue_mapping;
}

static inline
struct netdev_queue *netdev_get_tx_queue(const struct net_device *dev,
                     unsigned int index)
{
    return &dev->_tx[index]; // the netdev_queue for this tx queue index
}

__dev_xmit_skb()

static inline int __dev_xmit_skb(struct sk_buff *skb, struct Qdisc *q,
                 struct net_device *dev,
                 struct netdev_queue *txq)
{
    spinlock_t *root_lock = qdisc_lock(q);
    int rc;

    spin_lock(root_lock);
    if (unlikely(test_bit(__QDISC_STATE_DEACTIVATED, &q->state))) {
        kfree_skb(skb);
        rc = NET_XMIT_DROP;
    } else if ((q->flags & TCQ_F_CAN_BYPASS) && !qdisc_qlen(q) &&
           !test_and_set_bit(__QDISC_STATE_RUNNING, &q->state)) {
        /*
         * This is a work-conserving queue; there are no old skbs
         * waiting to be sent out; and the qdisc is not running -
         * xmit the skb directly.
         */
        __qdisc_update_bstats(q, skb->len);
        if (sch_direct_xmit(skb, q, dev, txq, root_lock))
            __qdisc_run(q);
        else
            clear_bit(__QDISC_STATE_RUNNING, &q->state);

        rc = NET_XMIT_SUCCESS;
    } else {
        rc = qdisc_enqueue_root(skb, q);
        qdisc_run(q);
    }
    spin_unlock(root_lock);

    return rc;
}

static inline int qdisc_enqueue_root(struct sk_buff *skb, struct Qdisc *sch)
{
    qdisc_skb_cb(skb)->pkt_len = skb->len;
    return qdisc_enqueue(skb, sch) & NET_XMIT_MASK;
}

static inline void qdisc_run(struct Qdisc *q)
{
    if (!test_and_set_bit(__QDISC_STATE_RUNNING, &q->state))
        __qdisc_run(q);
}

qdisc_enqueue()

static inline int qdisc_enqueue(struct sk_buff *skb, struct Qdisc *sch)
{
#ifdef CONFIG_NET_SCHED
    if (sch->stab)
        qdisc_calculate_pkt_len(skb, sch->stab);
#endif
    return sch->enqueue(skb, sch); // for pfifo_fast_ops this is pfifo_fast_enqueue()
}

static int pfifo_fast_enqueue(struct sk_buff *skb, struct Qdisc* qdisc)
{
    /* If the transmit queue is not full
       (ether_setup() sets dev->tx_queue_len to 1000) */
    if (skb_queue_len(&qdisc->q) < qdisc_dev(qdisc)->tx_queue_len) {
        int band = prio2band[skb->priority & TC_PRIO_MAX]; // map priority to a band via the prio2band table
        struct pfifo_fast_priv *priv = qdisc_priv(qdisc);
        struct sk_buff_head *list = band2list(priv, band); // the sk_buff_head for this band, see band2list()

        priv->bitmap |= (1 << band);
        qdisc->q.qlen++;
        return __qdisc_enqueue_tail(skb, qdisc, list); // enqueue at the tail of the band's list
    }

    return qdisc_drop(skb, qdisc); // queue full: drop the packet
}
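The prio2band table and band2list() helper referenced above live in net/sched/sch_generic.c; in this kernel generation they look roughly as follows (minor details may differ between versions):

/* Map skb->priority (TC_PRIO_*) to one of the three pfifo_fast bands;
 * band 0 has the highest priority and is dequeued first.
 */
static const u8 prio2band[TC_PRIO_MAX + 1] = {
    1, 2, 2, 2, 1, 2, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1
};

struct pfifo_fast_priv {
    u32 bitmap;                           /* which bands currently hold skbs */
    struct sk_buff_head q[PFIFO_FAST_BANDS];
};

static inline struct sk_buff_head *band2list(struct pfifo_fast_priv *priv,
                                             int band)
{
    return priv->q + band;                /* the sk_buff_head for this band */
}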

__qdisc_run()

/*
 * Transmit one skb, and handle the return status as required. Holding the
 * __QDISC_STATE_RUNNING bit guarantees that only one CPU can execute this
 * function.
 *
 * Returns to the caller:
 *              0  - queue is empty or throttled.
 *              >0 - queue is not empty.
 */
int sch_direct_xmit(struct sk_buff *skb, struct Qdisc *q,
            struct net_device *dev, struct netdev_queue *txq,
            spinlock_t *root_lock)
{
    int ret = NETDEV_TX_BUSY;

    /* And release qdisc */
    spin_unlock(root_lock);

    HARD_TX_LOCK(dev, txq, smp_processor_id());
    if (!netif_tx_queue_stopped(txq) &&
        !netif_tx_queue_frozen(txq))
        // dev_queue_xmit() -> dev_hard_start_xmit() -> ops->ndo_start_xmit()
        ret = dev_hard_start_xmit(skb, dev, txq);
    HARD_TX_UNLOCK(dev, txq);

    spin_lock(root_lock);

    switch (ret) {
    case NETDEV_TX_OK:
        /* Driver sent out skb successfully */
        ret = qdisc_qlen(q);
        break;

    case NETDEV_TX_LOCKED:
        /* Driver try lock failed */
        ret = handle_dev_cpu_collision(skb, txq, q);
        break;

    default:
        /* Driver returned NETDEV_TX_BUSY - requeue skb */
        if (unlikely (ret != NETDEV_TX_BUSY && net_ratelimit()))
            printk(KERN_WARNING "BUG %s code %d qlen %d\n",
                   dev->name, ret, q->q.qlen);

        ret = dev_requeue_skb(skb, q);
        break;
    }

    if (ret && (netif_tx_queue_stopped(txq) ||
            netif_tx_queue_frozen(txq)))
        ret = 0;

    return ret;
}
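On NETDEV_TX_BUSY the skb is put back with dev_requeue_skb(); in this kernel generation that amounts to parking it in the qdisc's single requeue slot and rescheduling the qdisc, roughly:

/* Park the skb in q->gso_skb and let the TX softirq retry later. */
static inline int dev_requeue_skb(struct sk_buff *skb, struct Qdisc *q)
{
    q->gso_skb = skb;
    q->qstats.requeues++;
    q->q.qlen++;                /* it still counts as queued */
    __netif_schedule(q);

    return 0;
}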

/*
 * NOTE: Called under qdisc_lock(q) with locally disabled BH.
 *
 * __QDISC_STATE_RUNNING guarantees only one CPU can process
 * this qdisc at a time. qdisc_lock(q) serializes queue accesses for
 * this queue.
 *
 *  netif_tx_lock serializes accesses to device driver.
 *
 *  qdisc_lock(q) and netif_tx_lock are mutually exclusive,
 *  if one is grabbed, another must be free.
 *
 * Note, that this procedure can be called by a watchdog timer
 *
 * Returns to the caller:
 *              0  - queue is empty or throttled.
 *              >0 - queue is not empty.
 *
 */
static inline int qdisc_restart(struct Qdisc *q)
{
    struct netdev_queue *txq;
    struct net_device *dev;
    spinlock_t *root_lock;
    struct sk_buff *skb;

    /* Dequeue packet */
    skb = dequeue_skb(q);
    if (unlikely(!skb))
        return 0;

    root_lock = qdisc_lock(q);
    dev = qdisc_dev(q);
    txq = netdev_get_tx_queue(dev, skb_get_queue_mapping(skb));

    return sch_direct_xmit(skb, q, dev, txq, root_lock);
}
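dequeue_skb(), called above, prefers a previously requeued skb in q->gso_skb over the qdisc's own dequeue, and leaves it parked if its tx queue is still stopped or frozen; roughly:

static inline struct sk_buff *dequeue_skb(struct Qdisc *q)
{
    struct sk_buff *skb = q->gso_skb;

    if (unlikely(skb)) {
        /* a requeued skb: only release it if its tx queue can accept it */
        struct netdev_queue *txq;

        txq = netdev_get_tx_queue(qdisc_dev(q), skb_get_queue_mapping(skb));
        if (!netif_tx_queue_stopped(txq) && !netif_tx_queue_frozen(txq)) {
            q->gso_skb = NULL;
            q->q.qlen--;
        } else {
            skb = NULL;
        }
    } else {
        skb = q->dequeue(q);    /* e.g. pfifo_fast_dequeue() */
    }

    return skb;
}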

void __qdisc_run(struct Qdisc *q)
{
    unsigned long start_time = jiffies;

    while (qdisc_restart(q)) {
        /*
         * Postpone processing if
         * 1. another process needs the CPU;
         * 2. we've been doing it for too long.
         */
        if (need_resched() || jiffies != start_time) {
            // __netif_schedule() -> __netif_reschedule() -> raise_softirq_irqoff()
            __netif_schedule(q);
            break;
        }
    }

    clear_bit(__QDISC_STATE_RUNNING, &q->state);
}
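The __netif_schedule() call above defers further transmission to the TX softirq: __netif_reschedule() links the qdisc onto the per-CPU output_queue and raises NET_TX_SOFTIRQ, whose handler net_tx_action() eventually calls qdisc_run() again. Roughly:

static inline void __netif_reschedule(struct Qdisc *q)
{
    struct softnet_data *sd;
    unsigned long flags;

    local_irq_save(flags);
    sd = &__get_cpu_var(softnet_data);
    q->next_sched = sd->output_queue;     /* chain onto this CPU's output queue */
    sd->output_queue = q;
    raise_softirq_irqoff(NET_TX_SOFTIRQ); /* net_tx_action() will run the qdisc */
    local_irq_restore(flags);
}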

netif_receive_skb()

The entry function of the protocol stack is netif_receive_skb(): the NIC driver calls netif_receive_skb() to hand an skb up to the protocol stack.

ixgbe_receive_skb() -> napi_gro_receive() -> napi_skb_finish() -> netif_receive_skb() -> ip_rcv()
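On the receive side a NAPI poll handler typically sets skb->protocol with eth_type_trans() and then pushes the skb up the chain above. A simplified, hypothetical fragment (example_rx_one is made up for illustration; the real ixgbe path goes through ixgbe_receive_skb()/napi_gro_receive()):

/* Hypothetical NAPI receive fragment: how a driver hands a received
 * frame to the stack.
 */
static void example_rx_one(struct net_device *dev, struct napi_struct *napi,
                           struct sk_buff *skb)
{
    skb->protocol = eth_type_trans(skb, dev); /* also sets skb->dev and skb->pkt_type */
    napi_gro_receive(napi, skb);              /* ends up in netif_receive_skb() */
}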

/**
 * netif_receive_skb - process receive buffer from network
 * @skb: buffer to process
 *
 * netif_receive_skb() is the main receive data processing function.
 * It always succeeds. The buffer may be dropped during processing
 * for congestion control or by the protocol layers.
 *
 * This function may only be called from softirq context and interrupts
 * should be enabled.
 *
 * Return values (usually ignored):
 * NET_RX_SUCCESS: no congestion
 * NET_RX_DROP: packet was dropped
 */
int netif_receive_skb(struct sk_buff *skb)
{
   struct packet_type *ptype, *pt_prev;
   struct net_device *orig_dev;
   struct net_device *null_or_orig;
   int ret = NET_RX_DROP;
   __be16 type;

   if (!skb->tstamp.tv64)
      net_timestamp(skb);

   if (skb->vlan_tci && vlan_hwaccel_do_receive(skb))
      return NET_RX_SUCCESS;

   /* if we've gotten here through NAPI, check netpoll */
   if (netpoll_receive_skb(skb))
      return NET_RX_DROP;

   if (!skb->iif)
      skb->iif = skb->dev->ifindex;

   null_or_orig = NULL;
   orig_dev = skb->dev;
   if (orig_dev->master) {
      if (skb_bond_should_drop(skb))
         null_or_orig = orig_dev; /* deliver only exact match */
      else
         skb->dev = orig_dev->master;
   }

   __get_cpu_var(netdev_rx_stat).total++;

   skb_reset_network_header(skb);
   skb_reset_transport_header(skb);
   skb->mac_len = skb->network_header - skb->mac_header;

   pt_prev = NULL;

   rcu_read_lock();

#ifdef CONFIG_NET_CLS_ACT
   if (skb->tc_verd & TC_NCLS) {
      skb->tc_verd = CLR_TC_NCLS(skb->tc_verd);
      goto ncls;
   }
#endif

   // walk the ptype_all list (taps registered for ETH_P_ALL, e.g. packet sockets)
   list_for_each_entry_rcu(ptype, &ptype_all, list) {
      if (ptype->dev == null_or_orig || ptype->dev == skb->dev ||
          ptype->dev == orig_dev) {
         if (pt_prev)
            ret = deliver_skb(skb, pt_prev, orig_dev);
         pt_prev = ptype;
      }
   }

#ifdef CONFIG_NET_CLS_ACT
   skb = handle_ing(skb, &pt_prev, &ret, orig_dev);
   if (!skb)
      goto out;
ncls:
#endif

   skb = handle_bridge(skb, &pt_prev, &ret, orig_dev);
   if (!skb)
      goto out;
   skb = handle_macvlan(skb, &pt_prev, &ret, orig_dev);
   if (!skb)
      goto out;

   type = skb->protocol;
   // walk the ptype_base hash bucket for this protocol type to find the matching packet_type
   list_for_each_entry_rcu(ptype,
         &ptype_base[ntohs(type) & PTYPE_HASH_MASK], list) {
      if (ptype->type == type &&
          (ptype->dev == null_or_orig || ptype->dev == skb->dev ||
           ptype->dev == orig_dev)) {
         if (pt_prev)
            ret = deliver_skb(skb, pt_prev, orig_dev);
         pt_prev = ptype;
      }
   }

   if (pt_prev) {
      ret = pt_prev->func(skb, skb->dev, pt_prev, orig_dev); // invoke the packet_type handler, e.g. ip_rcv() for ETH_P_IP
   } else {
      kfree_skb(skb);
      /* Jamal, now you will not able to escape explaining
       * me how you were going to use this. :-)
       */
      ret = NET_RX_DROP;
   }

out:
   rcu_read_unlock();
   return ret;
}

ip_packet_type

static struct packet_type ip_packet_type __read_mostly = {
   .type = cpu_to_be16(ETH_P_IP),
   .func = ip_rcv, // the handler netif_receive_skb() invokes for ETH_P_IP frames
   .gso_send_check = inet_gso_send_check,
   .gso_segment = inet_gso_segment,
   .gro_receive = inet_gro_receive,
   .gro_complete = inet_gro_complete,
};
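ip_packet_type is registered at boot by inet_init() via dev_add_pack(&ip_packet_type). dev_add_pack() hashes the protocol type into ptype_base (or appends to ptype_all for ETH_P_ALL), which is exactly the list netif_receive_skb() walks above; in this kernel generation it looks roughly like:

void dev_add_pack(struct packet_type *pt)
{
    int hash;

    spin_lock_bh(&ptype_lock);
    if (pt->type == htons(ETH_P_ALL))
        list_add_rcu(&pt->list, &ptype_all);          /* taps see every packet */
    else {
        hash = ntohs(pt->type) & PTYPE_HASH_MASK;     /* e.g. ETH_P_IP */
        list_add_rcu(&pt->list, &ptype_base[hash]);
    }
    spin_unlock_bh(&ptype_lock);
}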

eth_type_trans()

/**
 * eth_type_trans - determine the packet's protocol ID.
 * @skb: received socket data
 * @dev: receiving network device
 *
 * The rule here is that we
 * assume 802.3 if the type field is short enough to be a length.
 * This is normal practice and works for any 'now in use' protocol.
 */
__be16 eth_type_trans(struct sk_buff *skb, struct net_device *dev)
{
   struct ethhdr *eth;
   unsigned char *rawp;

   skb->dev = dev;
   skb_reset_mac_header(skb);
   skb_pull(skb, ETH_HLEN);
   eth = eth_hdr(skb);

   if (unlikely(is_multicast_ether_addr(eth->h_dest))) { // I/G bit (lowest bit of the first MAC octet): 1 = multicast/broadcast, 0 = unicast
      if (!compare_ether_addr_64bits(eth->h_dest, dev->broadcast))
         skb->pkt_type = PACKET_BROADCAST;
      else
         skb->pkt_type = PACKET_MULTICAST;
   }

   /*
    *      This ALLMULTI check should be redundant by 1.4
    *      so don't forget to remove it.
    *
    *      Seems, you forgot to remove it. All silly devices
    *      seems to set IFF_PROMISC.
    */

   else if (1 /*dev->flags&IFF_PROMISC */ ) {
      if (unlikely(compare_ether_addr_64bits(eth->h_dest, dev->dev_addr))) // destination MAC differs from the device's own MAC
         skb->pkt_type = PACKET_OTHERHOST; // pkt_type defaults to PACKET_HOST
   }

   /*
    * Some variants of DSA tagging don't have an ethertype field
    * at all, so we check here whether one of those tagging
    * variants has been configured on the receiving interface,
    * and if so, set skb->protocol without looking at the packet.
    */
   if (netdev_uses_dsa_tags(dev))
      return htons(ETH_P_DSA);
   if (netdev_uses_trailer_tags(dev))
      return htons(ETH_P_TRAILER);

   if (ntohs(eth->h_proto) >= 1536) // a type/length field >= 0x0600 (1536) is an EtherType
      return eth->h_proto; // return the L3 protocol

   rawp = skb->data;

   /*
    *      This is a magic hack to spot IPX packets. Older Novell breaks
    *      the protocol design and runs IPX over 802.3 without an 802.2 LLC
    *      layer. We look for FFFF which isn't a used 802.2 SSAP/DSAP. This
    *      won't work for fault tolerant netware but does for the rest.
    */
   if (*(unsigned short *)rawp == 0xFFFF)
      return htons(ETH_P_802_3);

   /*
    *      Real 802.2 LLC
    */
   return htons(ETH_P_802_2);
}
EXPORT_SYMBOL(eth_type_trans);