Linux_1.0_TCP-IP协议栈分析
Author: Wenxy
Version: 1.0
Begin date: 2009-2-10
End date:
1. TCP/IP协议栈启动分析 (按源代码中运行的顺序分析,Kernel startup)
1.1 ./net/inet/sock.c
/* Called by ddi.c on kernel startup. */
void inet_proto_init(struct ddi_proto *pro)
{
struct inet_protocol [Wenxy1] *p;
int i;
printk("Swansea University Computer Society Net2Debugged [1.30]/n");
/* Set up our UNIX VFS major device. */
if (register_chrdev(AF_INET_MAJOR, "af_inet", &inet_fops) < 0) {
printk("%s: cannot register major device %d!/n",
pro->name, AF_INET_MAJOR);
return;
}
/* Tell SOCKET that we are alive... */
(void) sock_register(inet_proto_ops.family, &inet_proto_ops);
seq_offset = CURRENT_TIME*250;
/* Add all the protocols. */
for(i = 0; i < SOCK_ARRAY_SIZE; i++) {
tcp_prot.sock_array[i] = NULL;
udp_prot.sock_array[i] = NULL;
raw_prot.sock_array[i] = NULL;
}
printk("IP Protocols: ");
for(p = inet_protocol_base; p != NULL;) [Wenxy2] {
struct inet_protocol *tmp;
tmp = (struct inet_protocol *) p->next;
inet_add_protocol(p); /* init TCP/IP stack */
printk("%s%s",p->name,tmp?", ":"/n");
p = tmp;
}
/* Initialize the DEV module. */
dev_init();
/* Initialize the "Buffer Head" pointers. */
bh_base[INET_BH].routine = inet_bh;
}
1.2 ./net/inet/protocol.c
/*
 * Link a protocol descriptor into the inet_protos hash table.
 *
 * prot is pushed onto the front of the chain selected by its protocol
 * number (masked with MAX_INET_PROTOS - 1).  prot->copy is set to 1 when
 * an entry already on that chain carries the same protocol number, and to
 * 0 otherwise.
 *
 * prot: descriptor to add; its next and copy fields are overwritten.
 */
void
inet_add_protocol(struct inet_protocol *prot)
{
	unsigned char hash;
	struct inet_protocol *p2;

	hash = prot->protocol & (MAX_INET_PROTOS - 1);
	prot->next = inet_protos[hash];
	inet_protos[hash] = prot;
	prot->copy = 0;

	/* Set the copy bit if we need to. */
	p2 = (struct inet_protocol *) prot->next;
	while (p2 != NULL) {
		if (p2->protocol == prot->protocol) {
			prot->copy = 1;
			break;
		}
		/*
		 * Step to the next chain entry.  This must follow p2->next:
		 * re-reading prot->next would reload the same node forever
		 * whenever the first chained entry has a different protocol.
		 */
		p2 = (struct inet_protocol *) p2->next;
	}
}
1.3 ./net/inet/dev.c
/*
 * Initialize the DEV module.
 *
 * Walks the dev_base list and calls each device's init hook, if it has
 * one.  A device whose hook returns nonzero is unlinked from the list.
 * Afterwards ip_bcast is set from the string "255.255.255.255".
 */
void
dev_init(void)
{
	struct device *dev, *dev2;

	/* Add the devices.
	 * If the call to dev->init fails, the dev is removed
	 * from the chain disconnecting the device until the
	 * next reboot.
	 */
	dev2 = NULL;	/* most recent device kept on the list, if any */
	for (dev = dev_base; dev != NULL; dev = dev->next) {
		if (dev->init && dev->init(dev) /* [Wenxy3] */) {
			/*
			 * init reported failure: unlink dev.  Its own next
			 * pointer is left untouched, so the loop increment
			 * still reaches the following device.
			 */
			if (dev2 == NULL)
				dev_base = dev->next;
			else
				dev2->next = dev->next;
		} else {
			dev2 = dev;
		}
	}

	/* Set up some IP addresses. */
	ip_bcast = in_aton("255.255.255.255");
}
1.4 ./drivers/net/eexpress.c (assume an Intel EtherExpress NIC is installed)
/* Check for a network adaptor of this type, and return '0' iff one exists.
   If dev->base_addr == 0, probe all likely locations.
   If dev->base_addr == 1, always return failure.
   If dev->base_addr == 2, (detachable devices only) allocate space for the
   device and return success.

   As implemented below: a base_addr above 0x1ff is probed directly via
   eexp_probe1(); any other nonzero base_addr returns ENXIO without
   probing; zero scans the ports[] table.  Failure codes are positive
   errno values.
   */
int
express_probe(struct device *dev)
{
	/* Don't probe all settable addresses, 0x[23][0-7]0, just common ones. */
	int *port, ports[] = {0x300, 0x270, 0x320, 0x340, 0};
	int base_addr = dev->base_addr;

	if (base_addr > 0x1ff)		/* Check a single specified location. */
		return eexp_probe1(dev, base_addr);
	else if (base_addr > 0)
		return ENXIO;		/* Don't probe at all. */

	/* ports[] is zero-terminated. */
	for (port = &ports[0]; *port; port++) {
		short id_addr = *port + ID_PORT;
		unsigned short sum = 0;
		int i;
#ifdef notdef
		for (i = 16; i > 0; i--)
			sum += inb(id_addr);
		printk("EtherExpress ID checksum is %04x.\n", sum);
#else
		/*
		 * Four reads of the ID port.  Each value contributes its bits
		 * above bit 3, shifted left by 4 * (value & 3), so the reads
		 * are OR-ed into nibble positions of sum.
		 */
		for (i = 4; i > 0; i--) {
			short id_val = inb(id_addr);
			sum |= (id_val >> 4) << ((id_val & 3) << 2);
		}
#endif
		/* Signature matched: let eexp_probe1() validate and set up. */
		if (sum == 0xbaba
		    && eexp_probe1(dev, *port) == 0)
			return 0;
	}

	return ENODEV;		/* No board accepted at any probed port. */
}
int eexp_probe1(struct device *dev, short ioaddr)
{
unsigned short station_addr[3];
int i;
printk("%s: EtherExpress at %#x,", dev->name, ioaddr);
/* The station address is stored !backwards! in the EEPROM, reverse
after reading. (Hmmm, a little brain-damage there at Intel, eh?) */
station_addr[0] = read_eeprom(ioaddr, 2);
station_addr[1] = read_eeprom(ioaddr, 3);
station_addr[2] = read_eeprom(ioaddr, 4);
/* Check the first three octets of the S.A. for the manufactor's code. */
if (station_addr[2] != 0x00aa || (station_addr[1] & 0xff00) != 0x0000) {
printk(" rejected (invalid address %04x%04x%04x)./n",
station_addr[2], station_addr[1], station_addr[0]);
return ENODEV;
}
/* We've committed to using the board, and can start filling in *dev. */
snarf_region(ioaddr, 16);
dev->base_addr = ioaddr;
for (i = 0; i < 6; i++) {
dev->dev_addr[i] = ((unsigned char*)station_addr)[5-i];
printk(" %02x", dev->dev_addr[i]);
}
/* There is no reason for the driver to care, but I print out the
interface to minimize bogus bug reports. */
{
char irqmap[] = {0, 9, 3, 4, 5, 10, 11, 0};
char *ifmap[] = {"AUI", "BNC", "10baseT"};
enum iftype {AUI=0, BNC=1, TP=2};
unsigned short setupval = read_eeprom(ioaddr, 0);
dev->irq = irqmap[setupval >> 13];
dev->if_port = (setupval & 0x1000) == 0 ? AUI :
read_eeprom(ioaddr, 5) & 0x1 ? TP : BNC;
printk(", IRQ %d, Interface %s./n", dev->irq, ifmap[dev->if_port]);
/* Release the IRQ line so that it can be shared if we don't use the
ethercard. */
outb(0x00, ioaddr + SET_IRQ);
}
/* It's now OK to leave the board in reset, pending the open(). */
outb(ASIC_RESET, ioaddr + EEPROM_Ctrl);
if ((dev->mem_start & 0xf) > 0)
net_debug = dev->mem_start & 7;
if (net_debug)
printk(version);
/* Initialize the device structure. */
dev->priv = kmalloc(sizeof(struct net_local), GFP_KERNEL);
memset(dev->priv, 0, sizeof(struct net_local));
dev->open = eexp_open;
dev->stop = eexp_close;
dev->hard_start_xmit = eexp_send_packet;
dev->get_stats = eexp_get_stats;
#ifdef HAVE_MULTICAST
dev->set_multicast_list = &set_multicast_list;
#endif
/* Fill in the fields of the device structure with ethernet-generic values.
This should be in a common file instead of per-driver. */
for (i = 0; i < DEV_NUMBUFFS; i++)
dev->buffs[i] = NULL;
dev->hard_header = eth_header;
dev->add_arp = eth_add_arp;
dev->queue_xmit = dev_queue_xmit;
dev->rebuild_header = eth_rebuild_header;
dev->type_trans = eth_type_trans;
dev->type = ARPHRD_ETHER;
dev->hard_header_len = ETH_HLEN;
dev->mtu = 1500; /* eth_mtu */
dev->addr_len = ETH_ALEN;
for (i = 0; i < ETH_ALEN; i++) {
dev->broadcast[i]=0xff;
}
/* New-style flags. */
dev->flags = IFF_BROADCAST;
dev->family = AF_INET;
dev->pa_addr = 0;
dev->pa_brdaddr = 0;
dev->pa_mask = 0;
dev->pa_alen = sizeof(unsigned long);
return 0;
}
Note: at this point the TCP/IP stack and the NIC device have both been initialized, so network communication is ready.
1.5 驱动接收到网络数据包后,让TCP/IP协议栈处理的流程,通常kernel的设计是:驱动程序处理上半部分的工作,内核的中断处理程序来处理下半部分的工作。
由于在网卡驱动中接收到了一个网络数据包,上半部分的工作是把该数据包(sk_buff)加入到skbuff链表(backlog队列)的tail。以8390网卡驱动为例,代码如下:
/* We have a good packet(s), get it/them out of the buffers. */
static void ei_receive(struct device *dev)
{
int e8390_base = dev->base_addr;
struct ei_device *ei_local = (struct ei_device *) dev->priv;
int rxing_page, this_frame, next_frame, current_offset;
int rx_pkt_count = 0;
struct e8390_pkt_hdr rx_frame;
int num_rx_pages = ei_local->stop_page-ei_local->rx_start_page;
while (++rx_pkt_count < 10) {
int pkt_len;
/* Get the rx page (incoming packet pointer). */
outb_p(E8390_NODMA+E8390_PAGE1, e8390_base + E8390_CMD);
rxing_page = inb_p(e8390_base + EN1_CURPAG);
outb_p(E8390_NODMA+E8390_PAGE0, e8390_base + E8390_CMD);
/* Remove one frame from the ring. Boundary is alway a page behind. */
this_frame = inb_p(e8390_base + EN0_BOUNDARY) + 1;
if (this_frame >= ei_local->stop_page)
this_frame = ei_local->rx_start_page;
/* Someday we'll omit the previous, iff we never get this message.
(There is at least one clone claimed to have a problem.) */
if (ei_debug > 0 && this_frame != ei_local->current_page)
printk("%s: mismatched read page pointers %2x vs %2x./n",
dev->name, this_frame, ei_local->current_page);
if (this_frame == rxing_page) /* Read all the frames? */
break; /* Done for now */
current_offset = this_frame << 8;
ei_block_input(dev, sizeof(rx_frame), (char *)&rx_frame,
current_offset);
pkt_len = rx_frame.count - sizeof(rx_frame);
next_frame = this_frame + 1 + ((pkt_len+4)>>8);
/* Check for bogosity warned by 3c503 book: the status byte is never
written. This happened a lot during testing! This code should be
cleaned up someday. */
if (rx_frame.next != next_frame
&& rx_frame.next != next_frame + 1
&& rx_frame.next != next_frame - num_rx_pages
&& rx_frame.next != next_frame + 1 - num_rx_pages) {
ei_local->current_page = rxing_page;
outb(ei_local->current_page-1, e8390_base+EN0_BOUNDARY);
ei_local->stat.rx_errors++;
continue;
}
if (pkt_len < 60 || pkt_len > 1518) {
if (ei_debug)
printk("%s: bogus packet size: %d, status=%#2x nxpg=%#2x./n",
dev->name, rx_frame.count, rx_frame.status,
rx_frame.next);
ei_local->stat.rx_errors++;
} else if ((rx_frame.status & 0x0F) == ENRSR_RXOK) {
int sksize = sizeof(struct sk_buff) + pkt_len;
struct sk_buff *skb;
skb = alloc_skb(sksize, GFP_ATOMIC); [Wenxy5]
if (skb == NULL) {
if (ei_debug)
printk("%s: Couldn't allocate a sk_buff of size %d./n",
dev->name, sksize);
ei_local->stat.rx_dropped++;
break;
} else {
skb->mem_len = sksize;
skb->mem_addr = skb;
skb->len = pkt_len;
skb->dev = dev;
ei_block_input(dev, pkt_len, (char *) skb->data,
current_offset + sizeof(rx_frame));
netif_rx(skb); [Wenxy6]
ei_local->stat.rx_packets++;
}
} else {
int errs = rx_frame.status;
if (ei_debug)
printk("%s: bogus packet: status=%#2x nxpg=%#2x size=%d/n",
dev->name, rx_frame.status, rx_frame.next,
rx_frame.count);
if (errs & ENRSR_FO)
ei_local->stat.rx_fifo_errors++;
}
next_frame = rx_frame.next;
/* This _should_ never happen: it's here for avoiding bad clones. */
if (next_frame >= ei_local->stop_page) {
printk("%s: next frame inconsistency, %#2x..", dev->name,
next_frame);
next_frame = ei_local->rx_start_page;
}
ei_local->current_page = next_frame;
outb(next_frame-1, e8390_base+EN0_BOUNDARY);
}
/* If any worth-while packets have been received, dev_rint()
has done a mark_bh(INET_BH) for us and will work on them
when we get to the bottom-half routine. */
/* Record the maximum Rx packet queue. */
if (rx_pkt_count > high_water_mark)
high_water_mark = rx_pkt_count;
/* Bug alert! Reset ENISR_OVER to avoid spurious overruns! */
outb_p(ENISR_RX+ENISR_RX_ERR+ENISR_OVER, e8390_base+EN0_ISR);
return;
}
./net/inet/skbuff.c
/*
 *	Insert an sk_buff at the end of a list.
 *
 *	The list is a circular doubly linked ring and *list points at its
 *	head, so the tail is (*list)->prev.  An empty list is *list == NULL.
 *	The links are updated between cli() and restore_flags().
 *
 *	list:  address of the list head pointer.
 *	newsk: buffer to append; a warning is printed if it already claims
 *	       to be on a list, but it is appended anyway.
 */
void skb_queue_tail(struct sk_buff *volatile* list, struct sk_buff *newsk)
{
	unsigned long flags;

	if (newsk->list)
		printk("Suspicious queue tail: sk_buff on list!\n");

	IS_SKB(newsk);
	save_flags(flags);
	cli();

	newsk->list = list;
	if (*list) {
		/* Splice newsk between the current tail and the head. */
		(*list)->prev->next = newsk;
		newsk->prev = (*list)->prev;
		newsk->next = *list;
		(*list)->prev = newsk;
	} else {
		/* First element: it is its own neighbour in both directions. */
		newsk->next = newsk;
		newsk->prev = newsk;
		*list = newsk;
	}

	IS_SKB(newsk->prev);
	IS_SKB(newsk->next);
	restore_flags(flags);
}
./net/inet/dev.c
/*
 * Receive a packet from a device driver and queue it for the upper
 * (protocol) levels.  It always succeeds.
 *
 * The buffer is detached from any socket (sk = NULL), flagged free = 1,
 * appended to the global backlog queue, and the INET bottom half is
 * marked so the queue gets processed.
 *
 * skb: the received buffer; this function takes it over from the driver.
 */
void
netif_rx(struct sk_buff *skb)
{
	/* Set any necessary flags. */
	skb->sk = NULL;
	skb->free = 1;

	/* and add it to the "backlog" queue. */
	IS_SKB(skb);
	skb_queue_tail(&backlog, skb);

	/* If any packet arrived, mark it for processing. */
	if (backlog != NULL)
		mark_bh(INET_BH);	/* [Wenxy7] */

	return;
}
1.6 进入协议栈的流程: ./kernel/irq.c
/*
* do_bottom_half() runs at normal kernel priority: all interrupts
* enabled. do_bottom_half() is atomic with respect to itself: a
* bottom_half handler need not be re-entrant.
*/
asmlinkage void do_bottom_half [Wenxy8] (void)
{
unsigned long active;
unsigned long mask, left;
struct bh_struct *bh;
bh = bh_base;
active = bh_active & bh_mask;
for (mask = 1, left = ~0 ; left & active ; bh++,mask += mask,left += left) {
if (mask & active) {
void (*fn)(void *);
bh_active &= ~mask;
fn = bh->routine; [Wenxy9]
if (!fn)
goto bad_bh;
fn(bh->data);
}
}
return;
bad_bh:
printk ("irq.c:bad bottom half entry/n");
}
1.7 接下来执行:./net/inet/dev.c
/*
* This function gets called periodically, to see if we can
* process any data that came in from some interface.
*
*/
void
inet_bh(void *tmp)
struct sk_buff *skb;
struct packet_type *ptype;
unsigned short type;
unsigned char flag = 0;
int nitcount;
/* Atomically check and mark our BUSY state. */
if (set_bit(1, (void*)&in_bh))
return;
/* Can we send anything now? */
dev_transmit(); [Wenxy11]
/* Any data left to process? */
while((skb=skb_dequeue(&backlog))!=NULL) [Wenxy12]
{
nitcount=dev_nit;
flag=0;
sti();
/*
* Bump the pointer to the next structure.
* This assumes that the basic 'skb' pointer points to
* the MAC header, if any (as indicated by its "length"
* field). Take care now!
*/
skb->h.raw = skb->data + skb->dev->hard_header_len;
skb->len -= skb->dev->hard_header_len;
/*
* Fetch the packet protocol ID. This is also quite ugly, as
* it depends on the protocol driver (the interface itself) to
* know what the type is, or where to get it from. The Ethernet
* interfaces fetch the ID from the two bytes in the Ethernet MAC
* header (the h_proto field in struct ethhdr), but drivers like
* SLIP and PLIP have no alternative but to force the type to be
* IP or something like that. Sigh- FvK
*/
type = skb->dev->type_trans(skb, skb->dev);
/*
* We got a packet ID. Now loop over the "known protocols"
* table (which is actually a linked list, but this will
* change soon if I get my way- FvK), and forward the packet
* to anyone who wants it.
*/
for (ptype = ptype_base; ptype != NULL; ptype = ptype->next) {
if (ptype->type == type || ptype->type == NET16(ETH_P_ALL)) {
struct sk_buff *skb2;
if (ptype->type==NET16(ETH_P_ALL))
nitcount--;
if (ptype->copy || nitcount) { /* copy if we need to */
skb2 = alloc_skb(skb->mem_len, GFP_ATOMIC);
if (skb2 == NULL)
continue;
memcpy(skb2, (const void *) skb, skb->mem_len);
skb2->mem_addr = skb2;
skb2->h.raw = (unsigned char *)(
(unsigned long) skb2 +
(unsigned long) skb->h.raw -
(unsigned long) skb
);
skb2->free = 1;
} else {
skb2 = skb;
}
/* This used to be in the 'else' part, but then
* we don't have this flag set when we get a
* protocol that *does* require copying... -FvK
*/
flag = 1;
/* Kick the protocol handler. */
ptype->func(skb2, skb->dev, ptype);
}
}
/*
* That's odd. We got an unknown packet. Who's using
* stuff like Novell or Amoeba on this network??
*/
if (!flag) {
DPRINTF((DBG_DEV,
"INET: unknown packet type 0x%04X (ignored)/n", type));
skb->sk = NULL;
kfree_skb(skb, FREE_WRITE);
}
/* Again, see if we can transmit anything now. */
dev_transmit();
cli();
}
in_bh = 0;
sti();
dev_transmit();
}
请注意:每个NIC driver有一个接收skbuff list, 也有一个发送skbuff list,实际进入TCP/IP协议栈的只有一个skbuff node(结点),这样便于封装,协议栈处理只针对某个skbuff。
2. TCP/IP协议栈接收/发送数据流程分析(以应用程序调用UDP socket为例)
2.1 《Linux_1.0_TCP-IP协议栈数据处理流程.doc》
参见《Linux_1.0_TCP-IP协议栈数据处理流程.doc》。
2.2 UDP socket发送数据,TCP/IP协议栈处理数据的流程
参见《LinuxTCPIP协议栈数据处理流程图.vsd》。
代码的执行进入TCP/IP协议栈。下面这段代码(位于inet_proto_init()中)初始化了这个函数指针:
/* Initialize the "Buffer Head" pointers. */
bh_base[INET_BH].routine = inet_bh;
/* Bottom-half entry table (32 slots), indexed by the *_BH constants. */
extern struct bh_struct bh_base[32];
/*
 * Slot assignments in bh_base.  Entries expected to fire most often
 * get the lowest indices.
 */
enum {
	TIMER_BH    = 0,
	CONSOLE_BH  = 1,
	SERIAL_BH   = 2,
	TTY_BH      = 3,
	INET_BH     = 4,
	KEYBOARD_BH = 5
};