报文是如何从网卡传递到相应协议的
------------------------------------
先来看一看网络部分是如何被初始化的. 下面是函数被调用的过程:
start_kernel -> init -> do_basic_setup -> sock_init -> proto_init ->
inet_proto_init -> ip_init -> dev_add_pack
下面的结构定义了网络协议的初始化入口:
/* Registration record for a network protocol family.  The kernel walks
 * the protocols[] table at boot (from sock_init) and calls each entry's
 * init_func to bootstrap that protocol. */
struct net_proto
{
const char *name; /* Protocol name */
void (*init_func)(struct net_proto *); /* Bootstrap, e.g. inet_proto_init for IP */
} protocols[];
每一个协议提供了一个自己的init_func. 如IP提供了ip_init.
dev_add_pack完成了实际的协议添加过程. 系统维护了两个协议表. 一个是单向链表,
另一个是hash表(使用了桶形的hash表).
每一个协议用一个struct packet_type来描述, 其中的func是其入口函数. 当系统从
网卡读到一个报文就会调用相应协议的func来完成实际的处理工作.
/* Describes one registered packet handler.  func is the entry point the
 * bottom half (net_bh) invokes for every matching received packet; dev
 * restricts the handler to one device when non-NULL. */
struct packet_type
{
unsigned short type; /* This is really htons(ether_type). */
struct device *dev; /* NULL is wildcarded here */
int (*func) (struct sk_buff *, struct device *,
struct packet_type *);
void *data; /* Private to the packet type */
struct packet_type *next;
};
// Hash table of protocol handlers (bucketed; keyed by low 4 bits of type)
struct packet_type *ptype_base[16]; /* Hashed types */
// Head of the singly linked list of ETH_P_ALL taps
struct packet_type *ptype_all = NULL; /* Taps */
dev_add_pack完成的工作其实很简单, 它将一个struct packet_type指针加入到相应
的链表中. 请看源码:
/*
 * Register a protocol handler.  An ETH_P_ALL handler (a "tap") is
 * prepended to the ptype_all list and will see every packet; any other
 * type is hashed into ptype_base[] by the low four bits of the
 * host-order protocol type.
 */
void dev_add_pack(struct packet_type *pt)
{
	int hash;

	/* pt->type decides which list the handler joins */
	if (pt->type == htons(ETH_P_ALL))
	{
		netdev_nit++;
		/* prepend to the singly linked tap list */
		pt->next = ptype_all;
		ptype_all = pt;
	}
	else
	{
		/* insert into the hash table; the hash function is simply
		 * the low 4 bits of the protocol type */
		hash = ntohs(pt->type) & 15;
		pt->next = ptype_base[hash];
		ptype_base[hash] = pt;
	}
}
ip_init使用dev_add_pack完成了实际的添加过程.
下面是核心数据结构sk_buff
/* The core packet buffer.  One sk_buff carries a single packet through
 * the stack; the h / nh / mac unions point at the transport, network
 * and link-layer headers inside the data area (head..end). */
struct sk_buff {
struct sk_buff * next; /* Next buffer in list */
struct sk_buff * prev; /* Previous buffer in list */
struct sk_buff_head * list; /* List we are on */
struct sock *sk; /* Socket we are owned by */
struct timeval stamp; /* Time we arrived */
struct device *dev; /* Device we arrived on/are leaving by */
/* Transport layer header */
union
{
struct tcphdr *th;
struct udphdr *uh;
struct icmphdr *icmph;
struct igmphdr *igmph;
struct iphdr *ipiph;
struct spxhdr *spxh;
unsigned char *raw;
} h;
/* Network layer header */
union
{
struct iphdr *iph;
struct ipv6hdr *ipv6h;
struct arphdr *arph;
struct ipxhdr *ipxh;
unsigned char *raw;
} nh;
/* Link layer header */
union
{
struct ethhdr *ethernet;
unsigned char *raw;
} mac;
struct dst_entry *dst;
char cb[48];
unsigned int len; /* Length of actual data */
unsigned int csum; /* Checksum */
volatile char used; /* Data moved to user and not MSG_PEEK */
unsigned char is_clone, /* We are a clone */
cloned, /* head may be cloned(check refcnt to be sure)*/
pkt_type, /* Packet class */
pkt_bridged, /* Tracker for bridging */
ip_summed; /* Driver fed us an IP checksum */
__u32 priority; /* Packet queueing priority */
atomic_t users; /* User count - see datagram.c,tcp.c */
unsigned short protocol; /* Packet protocol from driver. */
unsigned short security; /* Security level of packet */
unsigned int truesize; /* Buffer size */
unsigned char *head; /* Head of buffer */
unsigned char *data; /* Data head pointer */
unsigned char *tail; /* Tail pointer */
unsigned char *end; /* End pointer */
void (*destructor)(struct sk_buff *); /* Destruct function */
#ifdef CONFIG_IP_FIREWALL
__u32 fwmark; /* Label made by fwchains, used by pktsched */
#endif
#if defined(CONFIG_SHAPER) || defined(CONFIG_SHAPER_MODULE)
__u32 shapelatency; /* Latency on frame */
__u32 shapeclock; /* Time it should go out */
__u32 shapelen; /* Frame length in clocks */
__u32 shapestamp; /* Stamp for shaper */
__u16 shapepend; /* Pending */
#endif
#if defined(CONFIG_HIPPI)
union{
__u32 ifield;
} private;
#endif
};
* arch/i386/kernel/irq.c
/* Top-level hardware interrupt dispatcher: account the IRQ, invoke its
 * registered handler, then run any pending bottom halves before return. */
void do_IRQ(unsigned int irq, struct pt_regs *regs)
{
int cpu = smp_processor_id();
kstat.irqs[cpu][irq]++;
irq_desc[irq].handler->handle(irq, regs);
/*
 * This should be conditional: we should really get
 * a return code from the irq handler to tell us
 * whether the handler wants us to do software bottom
 * half handling or not..
 */
// Bottom halves are run on every interrupt exit here; this is not yet
// the conditional deferral the original "bottom half" design intended.
if (1) {
if (bh_active & bh_mask)
do_bottom_half();
}
__sti();/*VY*/
}
* net/core/dev.c
/*
 * Device drivers call our routines to queue packets here. We empty the
 * queue in the bottom half handler.
 */
// All received packets are first queued here; net_bh later hands them
// to the proper network-layer module, e.g. ip_input.
static struct sk_buff_head backlog;
/*
 * Receive a packet from a device driver and queue it for the upper
 * (protocol) levels. It always succeeds.
 *
 * Called by network card drivers: the new skb is simply appended to
 * the backlog queue and NET_BH is marked so net_bh will process it.
 * Under congestion (queue over netdev_max_backlog, or still draining
 * after an overflow) the packet is counted as dropped and freed.
 */
void netif_rx(struct sk_buff *skb)
{
	/* Stamp arrival time unless the driver already did */
	if (skb->stamp.tv_sec == 0)
		get_fast_time(&skb->stamp);

	/* Queue overflowed: enter drop mode and discard this packet */
	if (backlog.qlen > netdev_max_backlog) {
		netdev_dropping = 1;
		atomic_inc(&netdev_rx_dropped);
		kfree_skb(skb);
		return;
	}

	if (backlog.qlen == 0) {
		/* Queue fully drained: leave drop mode */
		netdev_dropping = 0;
	} else if (netdev_dropping != 0) {
		/* Still draining after an overflow: keep discarding */
		atomic_inc(&netdev_rx_dropped);
		kfree_skb(skb);
		return;
	}

	skb_queue_tail(&backlog, skb);
	mark_bh(NET_BH);
}
/*
* 网络的底半处理程序, 将报文由backlog中取出, 发给相应的网络层模块,
* 如 ip_input. 有一点请大家注意一个报文可以被多个协议层所处理. 这
* 一点很重要.
*/
void net_bh(void)
{
struct packet_type *ptype;
struct packet_type *pt_prev;
unsigned short type;
unsigned long start_time = jiffies;
NET_PROFILE_ENTER(net_bh);
/*
* Can we send anything now? We want to clear the
* decks for any more sends that get done as we
* process the input. This also minimises the
* latency on a transmit interrupt bh.
*/
if (qdisc_head.forw != &qdisc_head)
qdisc_run_queues();
/*
* Any data left to process. This may occur because a
* mark_bh() is done after we empty the queue including
* that from the device which does a mark_bh() just after
*/
/*
* While the queue is not empty..
*
* Note that the queue never shrinks due to
* an interrupt, so we can do this test without
* disabling interrupts.
*/
while (!skb_queue_empty(&backlog))
{
struct sk_buff * skb;
/* Give chance to other bottom halves to run */
if (jiffies - start_time > 1)
goto net_bh_break;
/* We have a packet. Therefore the queue has shrunk */
skb = skb_dequeue(&backlog); //从backlog中取出一个包
/*
* Bump the pointer to the next structure.
*
* On entry to the protocol layer. skb->data and
* skb->nh.raw point to the MAC and encapsulated data
*/
/* XXX until we figure out every place to modify.. */
skb->h.raw = skb->nh.raw = skb->data;
if (skb->mac.raw < skb->head || skb->mac.raw > skb->data) {
printk(KERN_CRIT "%s: wrong mac.raw ptr, proto=%04x
",
skb->dev->name, skb->protocol);
kfree_skb(skb);
continue;
}
/* Fetch the packet protocol ID. */
type = skb->protocol; //这里是指网络层协议
/*
* We got a packet ID. Now loop over the "known protocols"
* list. There are two lists. The ptype_all list of taps (normally empty)
* and the main protocol list which is hashed perfectly for normal protocols.
*/
pt_prev = NULL;
for (ptype = ptype_all; ptype!=NULL; ptype=ptype->next)
{
if (!ptype->dev || ptype->dev == skb->dev) {
if(pt_prev)
{
struct sk_buff *skb2=skb_clone(skb, GFP_ATOMIC);
if(skb2)
pt_prev->func(skb2,skb->dev, pt_prev);
}
pt_prev=ptype;
}
}
for (ptype = ptype_base[ntohs(type)&15]; ptype != NULL; ptype = ptype->next)
{
if (ptype->type == type && (!ptype->dev || ptype->dev==skb->dev))
{
/*
* We already have a match queued. Deliver
* to it and then remember the new match
*/
if(pt_prev)
{
struct sk_buff *skb2;
skb2=skb_clone(skb, GFP_ATOMIC);
/*
* Kick the protocol handler. This should be fast
* and efficient code.
*/
if(skb2)
pt_prev->func(skb2, skb->dev, pt_prev);
}
/* Remember the current last to do */
pt_prev=ptype;
}
} /* End of protocol list loop */
/* Is there a last item to send to ? */
if(pt_prev)
pt_prev->func(skb, skb->dev, pt_prev);
/* Has an unknown packet has been received ? */
else {
kfree_skb(skb);
}
} /* End of queue loop */
/* We have emptied the queue */
/* One last output flush. */
if (qdisc_head.forw != &qdisc_head)
qdisc_run_queues();
netdev_dropping = 0;
NET_PROFILE_LEAVE(net_bh);
return;
net_bh_break:
mark_bh(NET_BH);
NET_PROFILE_LEAVE(net_bh);
return;
}
为了理清头绪, 我重新描述一下上面的过程:
网卡驱动程序调用netif_rx将新收到的报文存在backlog队列中.
在底半处理中, net_bh调用相应的协议模块来处理报文. 而目前
linux的实现中, 每次中断都会调用底半处理.
Linux中报文是如何从网卡传递到相应协议的
最新推荐文章于 2021-04-30 07:28:58 发布