本文分析基于Linux Kernel 1.2.13
原创作品,转载请标明http://blog.csdn.net/yming0221/article/details/7497260
更多请看专栏,地址http://blog.csdn.net/column/details/linux-kernel-net.html
作者:闫明
注:标题中的”(上)“,”(下)“表示分析过程基于数据包的传递方向:”(上)“表示分析是从底层向上分析、”(下)“表示分析是从上向下分析。
经过前面两篇博文的分析,已经对Linux的内核网络栈的结构有了一个模糊的认识,这里我们开始从底层开始详细分析Linux内核网络栈的实现。由于这是早期版本,代码的层次隔离做的还不是很好,这里说是从底层分析,但是不免会牵扯上层或下层的函数,许多关键代码都在驱动的文件夹下。
我们首先有第一篇博文中知道在网络栈初始化的时候在net/socket.c中的函数sock_init()函数中当proto_init()完成后会执行dev_init()来进行网络设备模块的初始化。
首先说明一下,在drivers/net/space.c中定义了设备首节点地址dev_base,其实际上是回环设备的地址。
struct device loopback_dev = {
"lo", /* Software Loopback interface */
0x0, /* recv memory end */
0x0, /* recv memory start */
0x0, /* memory end */
0x0, /* memory start */
0, /* base I/O address */
0, /* IRQ */
0, 0, 0, /* flags */
NEXT_DEV, /* next device */
loopback_init /* loopback_init should set up the rest */
};
struct device *dev_base = &loopback_dev;
而NEXT_DEV宏定义即定义了下一个网络设备的地址,这样可以把设备串成链。
附网络设备的定义(include/linux/netdevice.h)如下:
/*
* The DEVICE structure.
* Actually, this whole structure is a big mistake. It mixes I/O
* data with strictly "high-level" data, and it has to know about
* almost every data structure used in the INET module.
*/
struct device
{
/*
* This is the first field of the "visible" part of this structure
* (i.e. as seen by users in the "Space.c" file). It is the name
* the interface.
*/
char *name;
/* I/O specific fields - FIXME: Merge these and struct ifmap into one */
unsigned long rmem_end; /* shmem "recv" end */
unsigned long rmem_start; /* shmem "recv" start */
unsigned long mem_end; /* sahared mem end */
unsigned long mem_start; /* shared mem start */
unsigned long base_addr; /* device I/O address */
unsigned char irq; /* device IRQ number */
/* Low-level status flags. */
volatile unsigned char start, /* start an operation */
tbusy, /* transmitter busy */
interrupt; /* interrupt arrived */
struct device *next;
/* The device initialization function. Called only once. */
int (*init)(struct device *dev);
/* Some hardware also needs these fields, but they are not part of the
usual set specified in Space.c. */
unsigned char if_port; /* Selectable AUI, TP,..*/
unsigned char dma; /* DMA channel */
struct enet_statistics* (*get_stats)(struct device *dev);
/*
* This marks the end of the "visible" part of the structure. All
* fields hereafter are internal to the system, and may change at
* will (read: may be cleaned up at will).
*/
/* These may be needed for future network-power-down code. */
unsigned long trans_start; /* Time (in jiffies) of last Tx */
unsigned long last_rx; /* Time of last Rx */
unsigned short flags; /* interface flags (a la BSD) */
unsigned short family; /* address family ID (AF_INET) */
unsigned short metric; /* routing metric (not used) */
unsigned short mtu; /* interface MTU value */
unsigned short type; /* interface hardware type */
unsigned short hard_header_len; /* hardware hdr length */
void *priv; /* pointer to private data */
/* Interface address info. */
unsigned char broadcast[MAX_ADDR_LEN]; /* hw bcast add */
unsigned char dev_addr[MAX_ADDR_LEN]; /* hw address */
unsigned char addr_len; /* hardware address length */
unsigned long pa_addr; /* protocol address */
unsigned long pa_brdaddr; /* protocol broadcast addr */
unsigned long pa_dstaddr; /* protocol P-P other side addr */
unsigned long pa_mask; /* protocol netmask */
unsigned short pa_alen; /* protocol address length */
struct dev_mc_list *mc_list; /* Multicast mac addresses */
int mc_count; /* Number of installed mcasts */
struct ip_mc_list *ip_mc_list; /* IP multicast filter chain */
/* For load balancing driver pair support */
unsigned long pkt_queue; /* Packets queued */
struct device *slave; /* Slave device */
/* Pointer to the interface buffers. */
struct sk_buff_head buffs[DEV_NUMBUFFS];
/* Pointers to interface service routines. */
int (*open)(struct device *dev);
int (*stop)(struct device *dev);
int (*hard_start_xmit) (struct sk_buff *skb,
struct device *dev);
int (*hard_header) (unsigned char *buff,
struct device *dev,
unsigned short type,
void *daddr,
void *saddr,
unsigned len,
struct sk_buff *skb);
int (*rebuild_header)(void *eth, struct device *dev,
unsigned long raddr, struct sk_buff *skb);
unsigned short (*type_trans) (struct sk_buff *skb,
struct device *dev);
#define HAVE_MULTICAST
void (*set_multicast_list)(struct device *dev,
int num_addrs, void *addrs);
#define HAVE_SET_MAC_ADDR
int (*set_mac_address)(struct device *dev, void *addr);
#define HAVE_PRIVATE_IOCTL
int (*do_ioctl)(struct device *dev, struct ifreq *ifr, int cmd);
#define HAVE_SET_CONFIG
int (*set_config)(struct device *dev, struct ifmap *map);
};
dev_init()网络设备的初始化函数如下:
/*
* Initialize the DEV module. At boot time this walks the device list and
* unhooks any devices that fail to initialise (normally hardware not
* present) and leaves us with a valid list of present and active devices.
*
* The PCMCIA code may need to change this a little, and add a pair
* of register_inet_device() unregister_inet_device() calls. This will be
* needed for ethernet as modules support.
*/
void dev_init(void)
{
struct device *dev, *dev2;
/*
* Add the devices.
* If the call to dev->init fails, the dev is removed
* from the chain disconnecting the device until the
* next reboot.
*/
dev2 = NULL;
for (dev = dev_base; dev != NULL; dev=dev->next) //循环移除设备由璞傅絛ev_base指向的网络设备链表
{
if (dev->init && dev->init(dev)) //如果设备有初始化函数并且初始化失败,则从链表摘除设备(init()函数成功返回0)
{
/*
* It failed to come up. Unhook it.这个函数还挺有技巧性的,从默认配置的设备中扫描不存在的设备,将其移除
*/
if (dev2 == NULL)
dev_base = dev->next;
else
dev2->next = dev->next;
}
else
{
dev2 = dev;
}
}
}
这里我们看一下dev_base这个队列是如何定义的,这里我们仅仅看eth网卡的定义方式即可
/* "eth0" defaults to autoprobe (== 0), other use a base of 0xffe0 (== -0x20),
which means "don't probe". These entries exist to only to provide empty
slots which may be enabled at boot-time. */
static struct device eth3_dev = {
"eth3", 0,0,0,0,0xffe0 /* I/O base*/, 0,0,0,0, NEXT_DEV, ethif_probe };
static struct device eth2_dev = {
"eth2", 0,0,0,0,0xffe0 /* I/O base*/, 0,0,0,0, eth3_dev, ethif_probe };
static struct device eth1_dev = {
"eth1", 0,0,0,0,0xffe0 /* I/O base*/, 0,0,0,0, eth2_dev, ethif_probe };
static struct device eth0_dev = {
"eth0", 0, 0, 0, 0, ETH0_ADDR, ETH0_IRQ, 0, 0, 0, eth1_dev, ethif_probe };
# undef NEXT_DEV
# define NEXT_DEV (eth0_dev)
可以看出eth系列网卡设备的init函数定义为ethif_probe(),该函数会调用具体网卡的探测函数,我们还是以NS8390 ethernet网卡为例来分析,该网卡的驱动实现文件为drivers/net/ne.c
ethif_probe()函数会调用函数ne_probe()探测函数,而该函数对设备地址进行检查后调用ne_probe1()函数,具体工作有ne_probe1()函数完成。
函数如下:
static int ne_probe1(struct device *dev, int ioaddr)
{
.....................//合法性检查
/* Fixup for users that don't know that IRQ 2 is really IRQ 9,
or don't know which one to set. */
dev->irq = 9;//设置中断类型号
/* Snarf the interrupt now. There's no point in waiting since we cannot
share and the board will usually be enabled. */
{
int irqval = request_irq (dev->irq, ei_interrupt, 0, wordlength==2 ? "ne2000":"ne1000");//注册申请中断,中断处理函数为ei_interrupt
if (irqval) {
printk (" unable to get IRQ %d (irqval=%d).\n", dev->irq, irqval);
return EAGAIN;
}
}
dev->base_addr = ioaddr;
request_region(ioaddr, NE_IO_EXTENT, wordlength==2 ? "ne2000":"ne1000");//申请内存空间
for(i = 0; i < ETHER_ADDR_LEN; i++)
dev->dev_addr[i] = SA_prom[i];
ethdev_init(dev);//调用函数对dev设备结构体进行初始化
printk("\n%s: %s found at %#x, using IRQ %d.\n",
dev->name, name, ioaddr, dev->irq);
if (ei_debug > 0)
printk(version);
ei_status.name = name;
ei_status.tx_start_page = start_page;
ei_status.stop_page = stop_page;
ei_status.word16 = (wordlength == 2);
ei_status.rx_start_page = start_page + TX_PAGES;
#ifdef PACKETBUF_MEMSIZE
/* Allow the packet buffer size to be overridden by know-it-alls. */
ei_status.stop_page = ei_status.tx_start_page + PACKETBUF_MEMSIZE;
#endif
ei_status.reset_8390 = &ne_reset_8390;
ei_status.block_input = &ne_block_input;
ei_status.block_output = &ne_block_output;
NS8390_init(dev, 0);//配置网卡中的寄存器等到默认状态
return 0;
}
初始化函数ethdev_init()在文件drivers/net/8390.c中。如下:
/* Initialize the rest of the 8390 device structure. */
int ethdev_init(struct device *dev)
{
if (ei_debug > 1)
printk(version);
if (dev->priv == NULL) {//申请私有空间存储具体网卡的结构体信息
struct ei_device *ei_local;//8390网卡设备的结构体
dev->priv = kmalloc(sizeof(struct ei_device), GFP_KERNEL);//申请内核内存空间
memset(dev->priv, 0, sizeof(struct ei_device));
ei_local = (struct ei_device *)dev->priv;
#ifndef NO_PINGPONG
ei_local->pingpong = 1;
#endif
}
/* The open call may be overridden by the card-specific code. */
if (dev->open == NULL)
dev->open = &ei_open;//设备的打开函数
/* We should have a dev->stop entry also. */
dev->hard_start_xmit = &ei_start_xmit;//设备的发送函数,定义在8390.c中
dev->get_stats = get_stats;
#ifdef HAVE_MULTICAST
dev->set_multicast_list = &set_multicast_list;
#endif
ether_setup(dev);//进一步调用函数设置dev设备结构体
return 0;
}
ether_setup()函数的实现如下:
void ether_setup(struct device *dev)
{
int i;
/* Fill in the fields of the device structure with ethernet-generic values.
This should be in a common file instead of per-driver. */
for (i = 0; i < DEV_NUMBUFFS; i++)
skb_queue_head_init(&dev->buffs[i]);//缓冲队列初始化
/* register boot-defined "eth" devices */
if (dev->name && (strncmp(dev->name, "eth", 3) == 0)) {//定义eth网卡的名称
i = simple_strtoul(dev->name + 3, NULL, 0);
if (ethdev_index[i] == NULL) {
ethdev_index[i] = dev;
}
else if (dev != ethdev_index[i]) {
/* Really shouldn't happen! */
printk("ether_setup: Ouch! Someone else took %s\n",
dev->name);
}
}
dev->hard_header = eth_header;//该函数的作用是创建链路层首部,定义在eth.c中
dev->rebuild_header = eth_rebuild_header;//该函数的作用是重建链路层首部,用于ARP协议
dev->type_trans = eth_type_trans;
dev->type = ARPHRD_ETHER;
dev->hard_header_len = ETH_HLEN;
dev->mtu = 1500; /* eth_mtu */
dev->addr_len = ETH_ALEN;
for (i = 0; i < ETH_ALEN; i++) {
dev->broadcast[i]=0xff;
}
/* New-style flags. */
dev->flags = IFF_BROADCAST|IFF_MULTICAST;
dev->family = AF_INET;
dev->pa_addr = 0;
dev->pa_brdaddr = 0;
dev->pa_mask = 0;
dev->pa_alen = sizeof(unsigned long);
}
这样,网络设备的初始化工作就完成了。
在drivers/net/8390.c中实现了该网卡的设备的基本操作函数,
设备的打开函数ei_open()比较简单,下面列出该设备的发送和接收函数,在这里不做具体的分析,如果想更多了解请点击前面分析过的DM9000网卡驱动,下面给出链接:
- ARM-Linux驱动--DM9000网卡驱动分析(一)
- ARM-Linux驱动--DM9000网卡驱动分析(二)
- ARM-Linux驱动--DM9000网卡驱动分析(三)
- ARM-Linux驱动--DM9000网卡驱动分析(四)
ei_start_xmit()
static int ei_start_xmit(struct sk_buff *skb, struct device *dev)
{
int e8390_base = dev->base_addr;
struct ei_device *ei_local = (struct ei_device *) dev->priv;
int length, send_length;
unsigned long flags;
/*
* We normally shouldn't be called if dev->tbusy is set, but the
* existing code does anyway. If it has been too long since the
* last Tx, we assume the board has died and kick it.
*/
if (dev->tbusy) { /* Do timeouts, just like the 8003 driver. */
int txsr = inb(e8390_base+EN0_TSR), isr;
int tickssofar = jiffies - dev->trans_start;
if (tickssofar < TX_TIMEOUT || (tickssofar < (TX_TIMEOUT+5) && ! (txsr & ENTSR_PTX))) {
return 1;
}
isr = inb(e8390_base+EN0_ISR);
if (dev->start == 0) {
printk("%s: xmit on stopped card\n", dev->name);
return 1;
}
printk(KERN_DEBUG "%s: transmit timed out, TX status %#2x, ISR %#2x.\n",
dev->name, txsr, isr);
/* Does the 8390 thinks it has posted an interrupt? */
if (isr)
printk(KERN_DEBUG "%s: Possible IRQ conflict on IRQ%d?\n", dev->name, dev->irq);
else {
/* The 8390 probably hasn't gotten on the cable yet. */
printk(KERN_DEBUG "%s: Possible network cable problem?\n", dev->name);
if(ei_local->stat.tx_packets==0)
ei_local->interface_num ^= 1; /* Try a different xcvr. */
}
/* Try to restart the card. Perhaps the user has fixed something. */
ei_reset_8390(dev);
NS8390_init(dev, 1);
dev->trans_start = jiffies;
}
/* Sending a NULL skb means some higher layer thinks we've missed an
tx-done interrupt. Caution: dev_tint() handles the cli()/sti()
itself. */
if (skb == NULL) {
dev_tint(dev);
return 0;
}
length = skb->len;
if (skb->len <= 0)
return 0;
save_flags(flags);
cli();
/* Block a timer-based transmit from overlapping. */
if ((set_bit(0, (void*)&dev->tbusy) != 0) || ei_local->irqlock) {
printk("%s: Tx access conflict. irq=%d lock=%d tx1=%d tx2=%d last=%d\n",
dev->name, dev->interrupt, ei_local->irqlock, ei_local->tx1,
ei_local->tx2, ei_local->lasttx);
restore_flags(flags);
return 1;
}
/* Mask interrupts from the ethercard. */
outb(0x00, e8390_base + EN0_IMR);
ei_local->irqlock = 1;
restore_flags(flags);
send_length = ETH_ZLEN < length ? length : ETH_ZLEN;
if (ei_local->pingpong) {
int output_page;
if (ei_local->tx1 == 0) {
output_page = ei_local->tx_start_page;
ei_local->tx1 = send_length;
if (ei_debug && ei_local->tx2 > 0)
printk("%s: idle transmitter tx2=%d, lasttx=%d, txing=%d.\n",
dev->name, ei_local->tx2, ei_local->lasttx,
ei_local->txing);
} else if (ei_local->tx2 == 0) {
output_page = ei_local->tx_start_page + 6;
ei_local->tx2 = send_length;
if (ei_debug && ei_local->tx1 > 0)
printk("%s: idle transmitter, tx1=%d, lasttx=%d, txing=%d.\n",
dev->name, ei_local->tx1, ei_local->lasttx,
ei_local->txing);
} else { /* We should never get here. */
if (ei_debug)
printk("%s: No Tx buffers free. irq=%d tx1=%d tx2=%d last=%d\n",
dev->name, dev->interrupt, ei_local->tx1,
ei_local->tx2, ei_local->lasttx);
ei_local->irqlock = 0;
dev->tbusy = 1;
outb_p(ENISR_ALL, e8390_base + EN0_IMR);
return 1;
}
ei_block_output(dev, length, skb->data, output_page);
if (! ei_local->txing) {
ei_local->txing = 1;
NS8390_trigger_send(dev, send_length, output_page);
dev->trans_start = jiffies;
if (output_page == ei_local->tx_start_page)
ei_local->tx1 = -1, ei_local->lasttx = -1;
else
ei_local->tx2 = -1, ei_local->lasttx = -2;
} else
ei_local->txqueue++;
dev->tbusy = (ei_local->tx1 && ei_local->tx2);
} else { /* No pingpong, just a single Tx buffer. */
ei_block_output(dev, length, skb->data, ei_local->tx_start_page);
ei_local->txing = 1;
NS8390_trigger_send(dev, send_length, ei_local->tx_start_page);
dev->trans_start = jiffies;
dev->tbusy = 1;
}
/* Turn 8390 interrupts back on. */
ei_local->irqlock = 0;
outb_p(ENISR_ALL, e8390_base + EN0_IMR);
dev_kfree_skb (skb, FREE_WRITE);
return 0;
}
ei_receive()函数
static void ei_receive(struct device *dev)
{
int e8390_base = dev->base_addr;
struct ei_device *ei_local = (struct ei_device *) dev->priv;
int rxing_page, this_frame, next_frame, current_offset;
int rx_pkt_count = 0;
struct e8390_pkt_hdr rx_frame;
int num_rx_pages = ei_local->stop_page-ei_local->rx_start_page;
while (++rx_pkt_count < 10) {
int pkt_len;
/* Get the rx page (incoming packet pointer). */
outb_p(E8390_NODMA+E8390_PAGE1, e8390_base + E8390_CMD);
rxing_page = inb_p(e8390_base + EN1_CURPAG);
outb_p(E8390_NODMA+E8390_PAGE0, e8390_base + E8390_CMD);
/* Remove one frame from the ring. Boundary is always a page behind. */
this_frame = inb_p(e8390_base + EN0_BOUNDARY) + 1;
if (this_frame >= ei_local->stop_page)
this_frame = ei_local->rx_start_page;
/* Someday we'll omit the previous, iff we never get this message.
(There is at least one clone claimed to have a problem.) */
if (ei_debug > 0 && this_frame != ei_local->current_page)
printk("%s: mismatched read page pointers %2x vs %2x.\n",
dev->name, this_frame, ei_local->current_page);
if (this_frame == rxing_page) /* Read all the frames? */
break; /* Done for now */
current_offset = this_frame << 8;
ei_block_input(dev, sizeof(rx_frame), (char *)&rx_frame,
current_offset);
pkt_len = rx_frame.count - sizeof(rx_frame);
next_frame = this_frame + 1 + ((pkt_len+4)>>8);
/* Check for bogosity warned by 3c503 book: the status byte is never
written. This happened a lot during testing! This code should be
cleaned up someday. */
if (rx_frame.next != next_frame
&& rx_frame.next != next_frame + 1
&& rx_frame.next != next_frame - num_rx_pages
&& rx_frame.next != next_frame + 1 - num_rx_pages) {
ei_local->current_page = rxing_page;
outb(ei_local->current_page-1, e8390_base+EN0_BOUNDARY);
ei_local->stat.rx_errors++;
continue;
}
if (pkt_len < 60 || pkt_len > 1518) {
if (ei_debug)
printk("%s: bogus packet size: %d, status=%#2x nxpg=%#2x.\n",
dev->name, rx_frame.count, rx_frame.status,
rx_frame.next);
ei_local->stat.rx_errors++;
} else if ((rx_frame.status & 0x0F) == ENRSR_RXOK) {
struct sk_buff *skb;
skb = alloc_skb(pkt_len, GFP_ATOMIC);
if (skb == NULL) {
if (ei_debug > 1)
printk("%s: Couldn't allocate a sk_buff of size %d.\n",
dev->name, pkt_len);
ei_local->stat.rx_dropped++;
break;
} else {
skb->len = pkt_len;
skb->dev = dev;
ei_block_input(dev, pkt_len, (char *) skb->data,
current_offset + sizeof(rx_frame));
netif_rx(skb);
ei_local->stat.rx_packets++;
}
} else {
int errs = rx_frame.status;
if (ei_debug)
printk("%s: bogus packet: status=%#2x nxpg=%#2x size=%d\n",
dev->name, rx_frame.status, rx_frame.next,
rx_frame.count);
if (errs & ENRSR_FO)
ei_local->stat.rx_fifo_errors++;
}
next_frame = rx_frame.next;
/* This _should_ never happen: it's here for avoiding bad clones. */
if (next_frame >= ei_local->stop_page) {
printk("%s: next frame inconsistency, %#2x\n", dev->name,
next_frame);
next_frame = ei_local->rx_start_page;
}
ei_local->current_page = next_frame;
outb_p(next_frame-1, e8390_base+EN0_BOUNDARY);
}
/* If any worth-while packets have been received, dev_rint()
has done a mark_bh(NET_BH) for us and will work on them
when we get to the bottom-half routine. */
/* Record the maximum Rx packet queue. */
if (rx_pkt_count > high_water_mark)
high_water_mark = rx_pkt_count;
/* Bug alert! Reset ENISR_OVER to avoid spurious overruns! */
outb_p(ENISR_RX+ENISR_RX_ERR+ENISR_OVER, e8390_base+EN0_ISR);
return;
}