How a Packet Travels from the NIC to the Corresponding Protocol in Linux
------------------------------------------------------------------------

First, let's look at how the networking code is initialized. The call chain is:

start_kernel -> init -> do_basic_setup -> sock_init -> each protocol's init_func
                        e.g. inet_proto_init -> ip_init -> dev_add_pack

The following structure defines the initialization entry point of a network protocol:

struct net_proto
{
        const char *name;                       /* Protocol name */
        void (*init_func)(struct net_proto *);  /* Bootstrap */
} protocols[];

Each protocol supplies its own init_func; IP, for example, supplies ip_init.
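
For concreteness, here is a sketch of how the protocols[] table and the bootstrap loop fit together. The entry names follow the 2.2-era net/protocols.c from memory; treat this as an illustration, not verbatim source:

/* Sketch: the table of configured protocol families. */
struct net_proto protocols[] = {
        { "UNIX", af_unix_proto_init },  /* PF_UNIX sockets */
        { "INET", inet_proto_init },     /* TCP/IP; inet_proto_init ends up calling ip_init */
        { NULL,   NULL }
};

/* sock_init bootstraps every configured protocol roughly like this: */
void proto_init(void)
{
        struct net_proto *pro;

        for (pro = protocols; pro->name != NULL; pro++)
                (*pro->init_func)(pro);
}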

dev_add_pack performs the actual protocol registration. The system maintains two protocol tables: a singly linked list and a hash table (a bucket-chained hash table).

Each protocol is described by a struct packet_type, whose func member is its entry point. When the system reads a packet from the NIC, it calls the matching protocol's func to do the actual processing.

struct packet_type
{
        unsigned short type;    /* This is really htons(ether_type). */
        struct device *dev;     /* NULL is wildcarded here */
        int (*func)(struct sk_buff *, struct device *,
                    struct packet_type *);
        void *data;             /* Private to the packet type */
        struct packet_type *next;
};

// Hash table of protocols
struct packet_type *ptype_base[16]; /* Hashed types */
// Head of the singly linked protocol list
struct packet_type *ptype_all = NULL; /* Taps */

What dev_add_pack does is actually quite simple: it inserts a struct packet_type pointer into the appropriate list. Here is the source:
void dev_add_pack(struct packet_type *pt)
{
        int hash;

        // pt->type decides which of the two lists the entry goes on
        if (pt->type == htons(ETH_P_ALL))
        {
                netdev_nit++;
                // prepend to the singly linked list of taps
                pt->next = ptype_all;
                ptype_all = pt;
        }
        else
        {
                // insert into the hash table; the hash function simply
                // takes the low 4 bits of the protocol type
                hash = ntohs(pt->type) & 15;
                pt->next = ptype_base[hash];
                ptype_base[hash] = pt;
        }
}
ip_init uses dev_add_pack to perform the actual registration.
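
As a concrete example, here is a sketch of how IP registers its receive entry point, modeled on the 2.2-era ip_packet_type definition (the initializer layout is illustrative rather than verbatim):

/* ip_rcv becomes the func that net_bh calls for every IP packet. */
static struct packet_type ip_packet_type =
{
        __constant_htons(ETH_P_IP),     /* 0x0800; ntohs(0x0800) & 15 == 0, so bucket 0 */
        NULL,                           /* dev == NULL: match packets from any device */
        ip_rcv,                         /* entry point invoked by net_bh */
        NULL,                           /* data */
        NULL                            /* next, filled in by dev_add_pack */
};

void ip_init(void)
{
        dev_add_pack(&ip_packet_type);
        /* ... the rest of IP initialization (routing etc.) omitted ... */
}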

Below is the core data structure, sk_buff:
struct sk_buff {
        struct sk_buff *next;           /* Next buffer in list */
        struct sk_buff *prev;           /* Previous buffer in list */
        struct sk_buff_head *list;      /* List we are on */
        struct sock *sk;                /* Socket we are owned by */
        struct timeval stamp;           /* Time we arrived */
        struct device *dev;             /* Device we arrived on/are leaving by */

        /* Transport layer header */
        union
        {
                struct tcphdr *th;
                struct udphdr *uh;
                struct icmphdr *icmph;
                struct igmphdr *igmph;
                struct iphdr *ipiph;
                struct spxhdr *spxh;
                unsigned char *raw;
        } h;

        /* Network layer header */
        union
        {
                struct iphdr *iph;
                struct ipv6hdr *ipv6h;
                struct arphdr *arph;
                struct ipxhdr *ipxh;
                unsigned char *raw;
        } nh;

        /* Link layer header */
        union
        {
                struct ethhdr *ethernet;
                unsigned char *raw;
        } mac;

        struct dst_entry *dst;

        char cb[48];

        unsigned int len;               /* Length of actual data */
        unsigned int csum;              /* Checksum */
        volatile char used;             /* Data moved to user and not MSG_PEEK */
        unsigned char is_clone,         /* We are a clone */
                cloned,                 /* head may be cloned (check refcnt to be sure) */
                pkt_type,               /* Packet class */
                pkt_bridged,            /* Tracker for bridging */
                ip_summed;              /* Driver fed us an IP checksum */
        __u32 priority;                 /* Packet queueing priority */
        atomic_t users;                 /* User count - see datagram.c,tcp.c */
        unsigned short protocol;        /* Packet protocol from driver */
        unsigned short security;        /* Security level of packet */
        unsigned int truesize;          /* Buffer size */

        unsigned char *head;            /* Head of buffer */
        unsigned char *data;            /* Data head pointer */
        unsigned char *tail;            /* Tail pointer */
        unsigned char *end;             /* End pointer */
        void (*destructor)(struct sk_buff *);   /* Destruct function */
#ifdef CONFIG_IP_FIREWALL
        __u32 fwmark;                   /* Label made by fwchains, used by pktsched */
#endif
#if defined(CONFIG_SHAPER) || defined(CONFIG_SHAPER_MODULE)
        __u32 shapelatency;             /* Latency on frame */
        __u32 shapeclock;               /* Time it should go out */
        __u32 shapelen;                 /* Frame length in clocks */
        __u32 shapestamp;               /* Stamp for shaper */
        __u16 shapepend;                /* Pending */
#endif

#if defined(CONFIG_HIPPI)
        union {
                __u32 ifield;
        } private;
#endif
};
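
The four pointers head, data, tail and end are what make sk_buff efficient: headers are added or stripped by moving data and tail rather than by copying. A minimal sketch of how a driver carves up a freshly allocated buffer (alloc_skb, skb_reserve and skb_put are the standard helpers; pkt_len and rx_buffer are hypothetical placeholders, and error handling is omitted):

/* After alloc_skb: head == data == tail, end == head + size. */
struct sk_buff *skb = alloc_skb(pkt_len + 2, GFP_ATOMIC);

skb_reserve(skb, 2);    /* advance data and tail by 2 bytes so the IP
                           header behind the 14-byte Ethernet header
                           ends up 4-byte aligned */

/* skb_put advances tail by pkt_len, grows skb->len, and returns the
   old tail so the caller can copy the frame into place. */
memcpy(skb_put(skb, pkt_len), rx_buffer, pkt_len);

/* Invariants now: head <= data <= tail <= end, skb->len == pkt_len. */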

* arch/i386/kernel/irq.c

void do_IRQ(unsigned int irq, struct pt_regs *regs)
{
    int cpu = smp_processor_id();

    kstat.irqs[cpu][irq]++;
    irq_desc[irq].handler->handle(irq, regs);

    /*
     * This should be conditional: we should really get
     * a return code from the irq handler to tell us
     * whether the handler wants us to do software bottom
     * half handling or not..
     */
    // Every interrupt currently ends with bottom-half processing; this
    // does not yet match the original intent of bottom halves.

    if (1) {
        if (bh_active & bh_mask)
            do_bottom_half();
    }
    __sti();/*VY*/
}
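
net_bh gets onto this path because it is registered as the NET_BH bottom half at boot. A sketch of the wiring, following the 2.2-era sources (treat the exact placement inside net_dev_init as approximate):

/* net/core/dev.c: register net_bh as the NET_BH handler. */
__initfunc(int net_dev_init(void))
{
        /* ... device and backlog queue setup omitted ... */
        init_bh(NET_BH, net_bh);        /* bh_base[NET_BH] = net_bh */
        return 0;
}

/* netif_rx later calls mark_bh(NET_BH), which sets the NET_BH bit in
   bh_active; the (bh_active & bh_mask) test in do_IRQ then makes
   do_bottom_half() run net_bh. */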

* net/core/dev.c

/*
 * Device drivers call our routines to queue packets here. We empty the
 * queue in the bottom half handler.
 */
// All incoming packets are queued here first; net_bh later hands them
// to the appropriate network-layer module, such as ip_input.

static struct sk_buff_head backlog;

/*
 * Receive a packet from a device driver and queue it for the upper
 * (protocol) levels.  It always succeeds.
 */
/*
 * Simply inserts a newly arrived packet at the tail of backlog.
 * NIC drivers call this function.
 */

void netif_rx(struct sk_buff *skb)
{
        if (skb->stamp.tv_sec == 0)
                get_fast_time(&skb->stamp);     // record the arrival time

        /* The code is rearranged so that the path is the most
           short when CPU is congested, but is still operating.
         */

        if (backlog.qlen <= netdev_max_backlog) {
                if (backlog.qlen) {
                        if (netdev_dropping == 0) {
                                skb_queue_tail(&backlog, skb);
                                mark_bh(NET_BH);
                                return;
                        }
                        atomic_inc(&netdev_rx_dropped);
                        kfree_skb(skb);
                        return;
                }

                netdev_dropping = 0;

                skb_queue_tail(&backlog, skb);
                mark_bh(NET_BH);
                return;
        }
        netdev_dropping = 1;
        atomic_inc(&netdev_rx_dropped);
        kfree_skb(skb);
}
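
To show where netif_rx fits, here is a sketch of the receive half of a typical 2.2-era NIC driver, modeled on the classic skeleton drivers; my_card_rx, pkt_len and copy_frame_from_card are hypothetical names:

static void my_card_rx(struct device *dev, int pkt_len)
{
        struct sk_buff *skb = dev_alloc_skb(pkt_len + 2);
        if (skb == NULL)
                return;                 /* out of memory: the frame is lost */

        skb->dev = dev;
        skb_reserve(skb, 2);            /* align the IP header */
        copy_frame_from_card(skb_put(skb, pkt_len));    /* hypothetical copy from NIC memory */

        /* eth_type_trans sets skb->mac.raw, pulls the Ethernet header,
           and returns the protocol ID that net_bh later matches on. */
        skb->protocol = eth_type_trans(skb, dev);

        netif_rx(skb);                  /* queue on backlog and mark NET_BH */
}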

/*
 * The network bottom-half handler. It dequeues packets from backlog
 * and hands them to the appropriate network-layer module, such as
 * ip_input. Note one important point: a single packet may be
 * processed by more than one protocol layer.
 */

void net_bh(void)
{
        struct packet_type *ptype;
        struct packet_type *pt_prev;
        unsigned short type;
        unsigned long start_time = jiffies;

        NET_PROFILE_ENTER(net_bh);

        /*
         * Can we send anything now? We want to clear the
         * decks for any more sends that get done as we
         * process the input. This also minimises the
         * latency on a transmit interrupt bh.
         */

        if (qdisc_head.forw != &qdisc_head)
                qdisc_run_queues();

        /*
         * Any data left to process. This may occur because a
         * mark_bh() is done after we empty the queue including
         * that from the device which does a mark_bh() just after
         */

        /*
         * While the queue is not empty..
         *
         * Note that the queue never shrinks due to
         * an interrupt, so we can do this test without
         * disabling interrupts.
         */

        while (!skb_queue_empty(&backlog))
        {
                struct sk_buff *skb;

                /* Give chance to other bottom halves to run */
                if (jiffies - start_time > 1)
                        goto net_bh_break;

                /* We have a packet. Therefore the queue has shrunk */
                skb = skb_dequeue(&backlog);    // dequeue one packet from backlog

                /*
                 * Bump the pointer to the next structure.
                 *
                 * On entry to the protocol layer. skb->data and
                 * skb->nh.raw point to the MAC and encapsulated data
                 */

                /* XXX until we figure out every place to modify.. */
                skb->h.raw = skb->nh.raw = skb->data;

                if (skb->mac.raw < skb->head || skb->mac.raw > skb->data) {
                        printk(KERN_CRIT "%s: wrong mac.raw ptr, proto=%04x\n",
                               skb->dev->name, skb->protocol);
                        kfree_skb(skb);
                        continue;
                }

                /* Fetch the packet protocol ID. */

                type = skb->protocol;   // the network-layer protocol ID

                /*
                 * We got a packet ID. Now loop over the "known protocols"
                 * list. There are two lists. The ptype_all list of taps (normally empty)
                 * and the main protocol list which is hashed perfectly for normal protocols.
                 */

                pt_prev = NULL;
                for (ptype = ptype_all; ptype != NULL; ptype = ptype->next)
                {
                        if (!ptype->dev || ptype->dev == skb->dev) {
                                if (pt_prev)
                                {
                                        struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC);
                                        if (skb2)
                                                pt_prev->func(skb2, skb->dev, pt_prev);
                                }
                                pt_prev = ptype;
                        }
                }

                for (ptype = ptype_base[ntohs(type) & 15]; ptype != NULL; ptype = ptype->next)
                {
                        if (ptype->type == type && (!ptype->dev || ptype->dev == skb->dev))
                        {
                                /*
                                 * We already have a match queued. Deliver
                                 * to it and then remember the new match
                                 */

                                if (pt_prev)
                                {
                                        struct sk_buff *skb2;

                                        skb2 = skb_clone(skb, GFP_ATOMIC);

                                        /*
                                         * Kick the protocol handler. This should be fast
                                         * and efficient code.
                                         */

                                        if (skb2)
                                                pt_prev->func(skb2, skb->dev, pt_prev);
                                }
                                /* Remember the current last to do */
                                pt_prev = ptype;
                        }
                } /* End of protocol list loop */

                /* Is there a last item to send to ? */

                if (pt_prev)
                        pt_prev->func(skb, skb->dev, pt_prev);
                else {
                        /* An unknown packet has been received: drop it */
                        kfree_skb(skb);
                }
        } /* End of queue loop */

        /* We have emptied the queue */

        /* One last output flush. */

        if (qdisc_head.forw != &qdisc_head)
                qdisc_run_queues();

        netdev_dropping = 0;

        NET_PROFILE_LEAVE(net_bh);
        return;

net_bh_break:
        mark_bh(NET_BH);
        NET_PROFILE_LEAVE(net_bh);
        return;
}
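
The pt_prev shuffle in the two loops has a purpose: every matching handler except the last receives a clone (skb_clone), while the last match consumes the original skb, saving one copy in the common single-handler case. To see both lists in action, here is a sketch of a hypothetical tap registered on ptype_all (sniffer_rcv and sniffer_pt are illustrative names, not kernel code):

/* A tap on ptype_all sees a clone of every frame from every device,
   while the real protocol (e.g. ip_rcv) still gets the original skb. */
static int sniffer_rcv(struct sk_buff *skb, struct device *dev,
                       struct packet_type *pt)
{
        printk(KERN_DEBUG "tap: dev=%s proto=%04x len=%u\n",
               dev->name, ntohs(skb->protocol), skb->len);
        kfree_skb(skb);         /* a handler must consume its skb */
        return 0;
}

static struct packet_type sniffer_pt =
{
        __constant_htons(ETH_P_ALL),    /* placed on ptype_all by dev_add_pack */
        NULL,                           /* any device */
        sniffer_rcv,
        NULL,
        NULL
};

/* After dev_add_pack(&sniffer_pt), net_bh delivers an ETH_P_IP frame
   twice: sniffer_rcv gets a clone, ip_rcv gets the original. */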

To untangle the threads, let me restate the process described above:

The NIC driver calls netif_rx to place a newly received packet on the
backlog queue. In bottom-half processing, net_bh then calls the
appropriate protocol module(s) to handle the packet. And in the current
Linux implementation, every interrupt ends with bottom-half processing.
