Linux kernel version 1.0 TCP/IP 协议栈源代码分析2, TCP/IP协议栈启动分析 (按源代码中运行的顺序分析,Kernel startup)

 

Linux_1.0_TCP-IP协议栈分析

Author: Wenxy

Version: 1.0

Begin date: 2009-2-10

End date:

1.    TCP/IP协议栈启动分析 (按源代码中运行的顺序分析,Kernel startup)

1.1 ./net/inet/sock.c

/* Called by ddi.c on kernel startup.  */

void inet_proto_init(struct ddi_proto *pro)

{

  struct inet_protocol [Wenxy1]   *p;

  int i;

 

  printk("Swansea University Computer Society Net2Debugged [1.30]/n");

  /* Set up our UNIX VFS major device. */

  if (register_chrdev(AF_INET_MAJOR, "af_inet", &inet_fops) < 0) {

       printk("%s: cannot register major device %d!/n",

                                   pro->name, AF_INET_MAJOR);

       return;

  } 

 

  /* Tell SOCKET that we are alive... */

  (void) sock_register(inet_proto_ops.family, &inet_proto_ops);

 

  seq_offset = CURRENT_TIME*250;

 

  /* Add all the protocols. */

  for(i = 0; i < SOCK_ARRAY_SIZE; i++) {

       tcp_prot.sock_array[i] = NULL;                                                                                     

       udp_prot.sock_array[i] = NULL;

       raw_prot.sock_array[i] = NULL;

  }

  printk("IP Protocols: ");

  for(p = inet_protocol_base; p != NULL;) [Wenxy2]  {

       struct inet_protocol *tmp;

 

       tmp = (struct inet_protocol *) p->next;

       inet_add_protocol(p);    /* init TCP/IP stack */

       printk("%s%s",p->name,tmp?", ":"/n");

       p = tmp;

  }

 

  /* Initialize the DEV module. */

  dev_init();

 

  /* Initialize the "Buffer Head" pointers. */

  bh_base[INET_BH].routine = inet_bh;

}

 

1.2 ./net/inet/protocol.c

/*
 * inet_add_protocol - insert a protocol handler into the inet_protos table.
 *
 * prot: handler to add.  Its next and copy fields are overwritten.
 *
 * The handler is pushed on the front of the chain selected by
 * prot->protocol & (MAX_INET_PROTOS - 1).  prot->copy is set to 1 when an
 * entry with the same protocol number is already on that chain, and is left
 * at 0 otherwise.
 */
void
inet_add_protocol(struct inet_protocol *prot)
{
  unsigned char hash;
  struct inet_protocol *p2;

  hash = prot->protocol & (MAX_INET_PROTOS - 1);
  prot ->next = inet_protos[hash];
  inet_protos[hash] = prot;
  prot->copy = 0;

  /* Set the copy bit if we need to. */
  p2 = (struct inet_protocol *) prot->next;
  while(p2 != NULL) {
       if (p2->protocol == prot->protocol) {
              prot->copy = 1;
              break;
       }
       /* Advance along the chain.  Reloading prot->next here would test
        * the same entry forever whenever the first entry does not match. */
       p2 = (struct inet_protocol *) p2->next;
  }
}

1.3 ./net/inet/dev.c

/* Initialize the DEV module. */

void

dev_init(void)

{

  struct device *dev, *dev2;

 

  /* Add the devices.

   * If the call to dev->init fails, the dev is removed

   * from the chain disconnecting the device until the

   * next reboot.

   */

  dev2 = NULL;

  for (dev = dev_base; dev != NULL; dev=dev->next) {

       if (dev->init && dev->init(dev) [Wenxy3]  ) {

              if (dev2 == NULL) dev_base = dev->next;

                else dev2->next = dev->next;

       } else {

              dev2 = dev;

       }

  }

 

  /* Set up some IP addresses. */

  ip_bcast = in_aton("255.255.255.255");

}

 

1.4 ./drivers/net/eexpress.c (assuming an Intel EtherExpress NIC is installed)

/* Check for a network adaptor of this type, and return '0' iff one exists.

   If dev->base_addr == 0, probe all likely locations.

   If dev->base_addr == 1, always return failure.

   If dev->base_addr == 2, (detachable devices only) alloate space for the

   device and return success.

   */

int

express_probe(struct device *dev)

{

       /* Don't probe all settable addresses, 0x[23][0-7]0, just common ones. */

       int *port, ports[] = {0x300, 0x270, 0x320, 0x340, 0};

       int base_addr = dev->base_addr;

 

       if (base_addr > 0x1ff)    /* Check a single specified location. */

              return eexp_probe1(dev, base_addr);

       else if (base_addr > 0)

              return ENXIO;              /* Don't probe at all. */

 

       for (port = &ports[0]; *port; port++) {

              short id_addr = *port + ID_PORT;

              unsigned short sum = 0;

              int i;

#ifdef notdef

              for (i = 16; i > 0; i--)

                     sum += inb(id_addr);

              printk("EtherExpress ID checksum is %04x./n", sum);

#else

              for (i = 4; i > 0; i--) {

                     short id_val = inb(id_addr);

                     sum |= (id_val >> 4) << ((id_val & 3) << 2);

              }

#endif

              if (sum == 0xbaba

                     && eexp_probe1(dev, *port) == 0)

                     return 0;

       }

 

       return ENODEV;                  /* ENODEV would be more accurate. */

}

 

 

int eexp_probe1(struct device *dev, short ioaddr)

{

       unsigned short station_addr[3];

       int i;

 

       printk("%s: EtherExpress at %#x,", dev->name, ioaddr);

 

       /* The station address is stored !backwards! in the EEPROM, reverse

          after reading.  (Hmmm, a little brain-damage there at Intel, eh?) */

       station_addr[0] = read_eeprom(ioaddr, 2);

       station_addr[1] = read_eeprom(ioaddr, 3);

       station_addr[2] = read_eeprom(ioaddr, 4);

 

       /* Check the first three octets of the S.A. for the manufactor's code. */

       if (station_addr[2] != 0x00aa || (station_addr[1] & 0xff00) != 0x0000) {

              printk(" rejected (invalid address %04x%04x%04x)./n",

                        station_addr[2], station_addr[1], station_addr[0]);

              return ENODEV;

       }

 

       /* We've committed to using the board, and can start filling in *dev. */

       snarf_region(ioaddr, 16);

       dev->base_addr = ioaddr;

 

       for (i = 0; i < 6; i++) {

              dev->dev_addr[i] = ((unsigned char*)station_addr)[5-i];

              printk(" %02x", dev->dev_addr[i]);

       }

 

       /* There is no reason for the driver to care, but I print out the

          interface to minimize bogus bug reports. */

       {

              char irqmap[] = {0, 9, 3, 4, 5, 10, 11, 0};

              char *ifmap[] = {"AUI", "BNC", "10baseT"};

              enum iftype {AUI=0, BNC=1, TP=2};

              unsigned short setupval = read_eeprom(ioaddr, 0);

 

              dev->irq = irqmap[setupval >> 13];

              dev->if_port = (setupval & 0x1000) == 0 ? AUI :

                     read_eeprom(ioaddr, 5) & 0x1 ? TP : BNC;

              printk(", IRQ %d, Interface %s./n", dev->irq, ifmap[dev->if_port]);

              /* Release the IRQ line so that it can be shared if we don't use the

                 ethercard. */

              outb(0x00, ioaddr + SET_IRQ);

       }

 

       /* It's now OK to leave the board in reset, pending the open(). */

       outb(ASIC_RESET, ioaddr + EEPROM_Ctrl);

 

       if ((dev->mem_start & 0xf) > 0)

              net_debug = dev->mem_start & 7;

 

       if (net_debug)

              printk(version);

 

       /* Initialize the device structure. */

       dev->priv = kmalloc(sizeof(struct net_local), GFP_KERNEL);

       memset(dev->priv, 0, sizeof(struct net_local));

 

       dev->open             = eexp_open;

       dev->stop              = eexp_close;

       dev->hard_start_xmit = eexp_send_packet;

       dev->get_stats       = eexp_get_stats;

#ifdef HAVE_MULTICAST

       dev->set_multicast_list = &set_multicast_list;

#endif

[Wenxy4]  

       /* Fill in the fields of the device structure with ethernet-generic values.

          This should be in a common file instead of per-driver.  */

       for (i = 0; i < DEV_NUMBUFFS; i++)

              dev->buffs[i] = NULL;

 

       dev->hard_header   = eth_header;

       dev->add_arp  = eth_add_arp;

       dev->queue_xmit = dev_queue_xmit;

       dev->rebuild_header = eth_rebuild_header;

       dev->type_trans = eth_type_trans;

 

       dev->type              = ARPHRD_ETHER;

       dev->hard_header_len = ETH_HLEN;

       dev->mtu              = 1500; /* eth_mtu */

       dev->addr_len = ETH_ALEN;

       for (i = 0; i < ETH_ALEN; i++) {

              dev->broadcast[i]=0xff;

       }

 

       /* New-style flags. */

       dev->flags             = IFF_BROADCAST;

       dev->family           = AF_INET;

       dev->pa_addr  = 0;

       dev->pa_brdaddr = 0;

       dev->pa_mask       = 0;

       dev->pa_alen  = sizeof(unsigned long);

 

       return 0;

}

 

 

Note: at this point the TCP/IP stack and the NIC device have both been initialized, so network communication is ready.

 

1.5 驱动接收到网络数据包后,让TCP/IP协议栈处理的流程,通常kernel的设计是:驱动程序处理上半部分的工作,内核的中断处理程序来处理下半部分的工作。

当网卡驱动接收到一个网络数据包时,上半部分的工作是把该数据包加入到skbuff链表的tail。以8390网卡驱动为例,代码如下:

/* We have a good packet(s), get it/them out of the buffers. */

 

static void ei_receive(struct device *dev)

{

    int e8390_base = dev->base_addr;

    struct ei_device *ei_local = (struct ei_device *) dev->priv;

    int rxing_page, this_frame, next_frame, current_offset;

    int rx_pkt_count = 0;

    struct e8390_pkt_hdr rx_frame;

    int num_rx_pages = ei_local->stop_page-ei_local->rx_start_page;

   

    while (++rx_pkt_count < 10) {

              int pkt_len;

             

              /* Get the rx page (incoming packet pointer). */

              outb_p(E8390_NODMA+E8390_PAGE1, e8390_base + E8390_CMD);

              rxing_page = inb_p(e8390_base + EN1_CURPAG);

              outb_p(E8390_NODMA+E8390_PAGE0, e8390_base + E8390_CMD);

             

              /* Remove one frame from the ring.  Boundary is alway a page behind. */

              this_frame = inb_p(e8390_base + EN0_BOUNDARY) + 1;

              if (this_frame >= ei_local->stop_page)

                     this_frame = ei_local->rx_start_page;

             

              /* Someday we'll omit the previous, iff we never get this message.

                 (There is at least one clone claimed to have a problem.)  */

              if (ei_debug > 0  &&  this_frame != ei_local->current_page)

                     printk("%s: mismatched read page pointers %2x vs %2x./n",

                               dev->name, this_frame, ei_local->current_page);

             

              if (this_frame == rxing_page)       /* Read all the frames? */

                     break;                          /* Done for now */

             

              current_offset = this_frame << 8;

              ei_block_input(dev, sizeof(rx_frame), (char *)&rx_frame,

                                      current_offset);

             

              pkt_len = rx_frame.count - sizeof(rx_frame);

             

              next_frame = this_frame + 1 + ((pkt_len+4)>>8);

             

              /* Check for bogosity warned by 3c503 book: the status byte is never

                 written.  This happened a lot during testing! This code should be

                 cleaned up someday. */

              if (rx_frame.next != next_frame

                     && rx_frame.next != next_frame + 1

                     && rx_frame.next != next_frame - num_rx_pages

                     && rx_frame.next != next_frame + 1 - num_rx_pages) {

                     ei_local->current_page = rxing_page;

                     outb(ei_local->current_page-1, e8390_base+EN0_BOUNDARY);

                     ei_local->stat.rx_errors++;

                     continue;

              }

 

              if (pkt_len < 60  ||  pkt_len > 1518) {

                     if (ei_debug)

                            printk("%s: bogus packet size: %d, status=%#2x nxpg=%#2x./n",

                                      dev->name, rx_frame.count, rx_frame.status,

                                      rx_frame.next);

                     ei_local->stat.rx_errors++;

              } else if ((rx_frame.status & 0x0F) == ENRSR_RXOK) {

                     int sksize = sizeof(struct sk_buff) + pkt_len;

                     struct sk_buff *skb;

                    

                     skb = alloc_skb(sksize, GFP_ATOMIC); [Wenxy5]  

                     if (skb == NULL) {

                            if (ei_debug)

                                   printk("%s: Couldn't allocate a sk_buff of size %d./n",

                                             dev->name, sksize);

                            ei_local->stat.rx_dropped++;

                            break;

                     } else {

                            skb->mem_len = sksize;

                            skb->mem_addr = skb;

                            skb->len = pkt_len;

                            skb->dev = dev;

                           

                            ei_block_input(dev, pkt_len, (char *) skb->data,

                                                    current_offset + sizeof(rx_frame));

                            netif_rx(skb); [Wenxy6]  

                            ei_local->stat.rx_packets++;

                     }

              } else {

                     int errs = rx_frame.status;

                     if (ei_debug)

                            printk("%s: bogus packet: status=%#2x nxpg=%#2x size=%d/n",

                                      dev->name, rx_frame.status, rx_frame.next,

                                      rx_frame.count);

                     if (errs & ENRSR_FO)

                            ei_local->stat.rx_fifo_errors++;

              }

              next_frame = rx_frame.next;

             

              /* This _should_ never happen: it's here for avoiding bad clones. */

              if (next_frame >= ei_local->stop_page) {

                     printk("%s: next frame inconsistency, %#2x..", dev->name,

                               next_frame);

                     next_frame = ei_local->rx_start_page;

              }

              ei_local->current_page = next_frame;

              outb(next_frame-1, e8390_base+EN0_BOUNDARY);

    }

    /* If any worth-while packets have been received, dev_rint()

       has done a mark_bh(INET_BH) for us and will work on them

       when we get to the bottom-half routine. */

 

       /* Record the maximum Rx packet queue. */

       if (rx_pkt_count > high_water_mark)

              high_water_mark = rx_pkt_count;

 

    /* Bug alert!  Reset ENISR_OVER to avoid spurious overruns! */

    outb_p(ENISR_RX+ENISR_RX_ERR+ENISR_OVER, e8390_base+EN0_ISR);

    return;

}

 

./net/inet/skbuff.c

/*
 *    Insert an sk_buff at the end of a list.
 *
 *    list:  address of the list head pointer; *list is NULL for an empty list.
 *    newsk: buffer to append.  It should not be on any list already; if
 *           newsk->list is set a warning is printed and the insert still
 *           goes ahead.
 *
 *    The list is circular and doubly linked, so the tail is the element
 *    just before the head.  Interrupts are disabled while the links are
 *    changed and the saved flags are restored afterwards.
 */

void skb_queue_tail(struct sk_buff *volatile* list, struct sk_buff *newsk)
{
       unsigned long flags;

       if(newsk->list)
              printk("Suspicious queue tail: sk_buff on list!\n");

       IS_SKB(newsk);
       save_flags(flags);
       cli();

       newsk->list=list;
       if(*list)
       {
              /* Splice newsk in between the old tail and the head. */
              (*list)->prev->next=newsk;
              newsk->prev=(*list)->prev;
              newsk->next=*list;
              (*list)->prev=newsk;
       }
       else
       {
              /* First element: it links to itself and becomes the head. */
              newsk->next=newsk;
              newsk->prev=newsk;
              *list=newsk;
       }
       IS_SKB(newsk->prev);
       IS_SKB(newsk->next);
       restore_flags(flags);

}

 

 

./net/inet/dev.c

/*
 * netif_rx - hand a received packet from a device driver to the protocol
 * layers.
 *
 * skb: the received buffer.  Its sk field is cleared and its free field is
 *      set to 1 before it is appended to the global "backlog" queue.
 *
 * Nothing is returned; the call always succeeds.  Whenever the backlog is
 * non-empty afterwards, the INET bottom half is marked for execution.
 */
void
netif_rx(struct sk_buff *skb)
{
  /* Set any necessary flags. */
  skb->free = 1;
  skb->sk = NULL;

  /* Queue the buffer on the backlog. */
  IS_SKB(skb);
  skb_queue_tail(&backlog, skb);

  /* If any packet arrived, mark it for processing.  [Wenxy7] */
  if (backlog != NULL) {
       mark_bh(INET_BH);
  }
}

 

1.6 进入协议栈的流程: ./kernel/irq.c

/*

 * do_bottom_half() runs at normal kernel priority: all interrupts

 * enabled.  do_bottom_half() is atomic with respect to itself: a

 * bottom_half handler need not be re-entrant.

 */

asmlinkage void do_bottom_half [Wenxy8]  (void)

{

       unsigned long active;

       unsigned long mask, left;

       struct bh_struct *bh;

 

       bh = bh_base;

       active = bh_active & bh_mask;

       for (mask = 1, left = ~0 ; left & active ; bh++,mask += mask,left += left) {

              if (mask & active) {

                     void (*fn)(void *);

                     bh_active &= ~mask;

                     fn = bh->routine; [Wenxy9]  

                     if (!fn)

                            goto bad_bh;

                     fn(bh->data);

              }

       }

       return;

bad_bh:

       printk ("irq.c:bad bottom half entry/n");

}

 

1.7 接下来执行:/net/inet/dev.c

 

/*

 * This function gets called periodically, to see if we can

 * process any data that came in from some interface.

 *

 */

void

inet_bh(void *tmp)

[Wenxy10]  {

  struct sk_buff *skb;

  struct packet_type *ptype;

  unsigned short type;

  unsigned char flag = 0;

  int nitcount;

 

  /* Atomically check and mark our BUSY state. */

  if (set_bit(1, (void*)&in_bh))

      return;

 

  /* Can we send anything now? */

  dev_transmit(); [Wenxy11]  

 

  /* Any data left to process? */

  while((skb=skb_dequeue(&backlog))!=NULL) [Wenxy12]  

  {

      nitcount=dev_nit;

       flag=0;

       sti();

       /*

       * Bump the pointer to the next structure.

       * This assumes that the basic 'skb' pointer points to

       * the MAC header, if any (as indicated by its "length"

       * field).  Take care now!

       */

       skb->h.raw = skb->data + skb->dev->hard_header_len;

       skb->len -= skb->dev->hard_header_len;

 

       /*

       * Fetch the packet protocol ID.  This is also quite ugly, as

       * it depends on the protocol driver (the interface itself) to

       * know what the type is, or where to get it from.  The Ethernet

       * interfaces fetch the ID from the two bytes in the Ethernet MAC

       * header (the h_proto field in struct ethhdr), but drivers like

       * SLIP and PLIP have no alternative but to force the type to be

       * IP or something like that.  Sigh- FvK

       */

       type = skb->dev->type_trans(skb, skb->dev);

 

       /*

        * We got a packet ID.  Now loop over the "known protocols"

        * table (which is actually a linked list, but this will

        * change soon if I get my way- FvK), and forward the packet

        * to anyone who wants it.

        */

       for (ptype = ptype_base; ptype != NULL; ptype = ptype->next) {

              if (ptype->type == type || ptype->type == NET16(ETH_P_ALL)) {

                     struct sk_buff *skb2;

 

                     if (ptype->type==NET16(ETH_P_ALL))

                            nitcount--;

                     if (ptype->copy || nitcount) { /* copy if we need to     */

                            skb2 = alloc_skb(skb->mem_len, GFP_ATOMIC);

                            if (skb2 == NULL)

                                   continue;

                            memcpy(skb2, (const void *) skb, skb->mem_len);

                            skb2->mem_addr = skb2;

                            skb2->h.raw = (unsigned char *)(

                                (unsigned long) skb2 +

                                (unsigned long) skb->h.raw -

                                (unsigned long) skb

                            );

                            skb2->free = 1;

                     } else {

                            skb2 = skb;

                     }

 

                     /* This used to be in the 'else' part, but then

                      * we don't have this flag set when we get a

                      * protocol that *does* require copying... -FvK

                      */

                     flag = 1;

 

                     /* Kick the protocol handler. */

                     ptype->func(skb2, skb->dev, ptype);

              }

       }

 

       /*

        * That's odd.  We got an unknown packet.  Who's using

        * stuff like Novell or Amoeba on this network??

        */

       if (!flag) {

              DPRINTF((DBG_DEV,

                     "INET: unknown packet type 0x%04X (ignored)/n", type));

              skb->sk = NULL;

              kfree_skb(skb, FREE_WRITE);

       }

 

       /* Again, see if we can transmit anything now. */

       dev_transmit();                                                                                

       cli();

  }

  in_bh = 0;

  sti();

  dev_transmit();

}

 

请注意每个NIC driver有一个接收skbuff list, 也有一个发送skbuff list,实际进入TCP/IP协议栈的只有一个skbuff node(结点),这样便于封装,协议栈处理只针对某个skbuff

 

2.         TCP/IP协议栈接收/发送数据流程分析(以应用程序调用UDP socket为例)

2.1 Linux_1.0_TCP-IP协议栈数据处理流程.doc

参见《Linux_1.0_TCP-IP协议栈数据处理流程.doc》。

2.2 UDP socket发送数据,TCP/IP协议栈处理数据的流程

参见《LinuxTCPIP协议栈数据处理流程图.vsd》。

 


 

  [Wenxy1]

Defined in file: ./net/inet/protocol.h

  [Wenxy2]

初始化TCP/IP协议栈每一层的接口API。参见 ./net/inet/protocol.c中的全局变量:  struct inet_protocol *inet_protocol_base = &icmp_protocol;

  [Wenxy3]

struct device *dev_base = NEXT_DEV;

#   define NEXT_DEV (&eth0_dev)

Call ethif_probe which is a function pointer.

Then call NIC driver

#ifdef CONFIG_EEXPRESS        /* Intel EtherExpress */

       && express_probe(dev)

  [Wenxy4]

The NIC driver API of device driver.

  [Wenxy5]

申请一个skbuff结点的内存.

  [Wenxy6]

此函数调用了./net/inet/dev.c中的netif_rx(struct sk_buff *skb),在其中又调用了  skb_queue_tail(&backlog,skb);

  [Wenxy7]

通知内核的中断来处理下半部分工作(就是让TCP/IP协议栈来处理)

  [Wenxy8]

顾名思义:处理中断下半部分的工作。

  [Wenxy9]

代码的执行进入TCP/IP协议栈。  /* Initialize the "Buffer Head" pointers. */

  bh_base[INET_BH].routine = inet_bh;这里初始化了这个函数指针。

extern struct bh_struct bh_base[32];

 

/* Who gets which entry in bh_base.  Things which will occur most often

   should come first.

   Each enumerator is an index into bh_base[].  Values follow declaration

   order, starting at 0. */

enum {

TIMER_BH = 0,

CONSOLE_BH,     /* 1 */

SERIAL_BH,      /* 2 */

TTY_BH,         /* 3 */

INET_BH,        /* 4: entry whose routine inet_proto_init() sets to inet_bh */

KEYBOARD_BH     /* 5 */

};

  [Wenxy10]

代码执行从fn = bh->routine;  [Wenxy9] 进入到这里

  [Wenxy11]

TCP/IP协议栈也可以发送数据了。

  [Wenxy12]

把接收队列中的所有数据包结点全部处理完。

  • 0
    点赞
  • 2
    收藏
    觉得还不错? 一键收藏
  • 0
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值