从今天开始写 larbin 的源码分析。希望一个月后,自己能够对 larbin 的源码了如指掌,也希望到那时自己可以写出一个爬虫。

 

下面是 global 结构体的定义。从它的成员中,我们可以对 larbin 的整体设计有一个大致的了解。

  /** Shared state of the crawler.
   * Every data member below is static, so this struct effectively acts
   * as a namespace for program-wide state: queues of URLs, site caches,
   * connection tables, configuration values and the poll() bookkeeping.
   * NOTE(review): the members are only declared here; their definitions
   * presumably live in global.cc (the constructor comment points there).
   */
  struct global {
  /** Constructor : see global.cc for details */
  global (int argc, char * argv[]);          
  /** Destructor : never used */
  ~global ();
  /** current time : avoids too many calls to time(NULL) */
  static time_t now;                                          // current time
  /** List of pages already seen (one bit per page) */
  static hashTable *seen;                                  // pages that have already been scanned
#ifdef NO_DUP
  /** Hashtable for suppressing duplicates */
  static hashDup *hDuplicate;
#endif // NO_DUP
  /** URLs for the sequencer with high priority */               
  static SyncFifo<url> *URLsPriority;                       // SyncFifo is a synchronized first-in-first-out queue

  /* NOTE(review): presumably a secondary "waiting" queue paired with
   * URLsPriority, and a counter of reads from it -- confirm against
   * fetch/sequencer.cc */
   static SyncFifo<url> *URLsPriorityWait;    
  static uint readPriorityWait;
  /** This one has a lower priority : see fetch/sequencer.cc */
  static PersistentFifo *URLsDisk;                          // PersistentFifo is a queue stored on disk
  /* NOTE(review): presumably the "waiting" counterpart of URLsDisk and
   * its read counter -- confirm against fetch/sequencer.cc */
  static PersistentFifo *URLsDiskWait;
  static uint readWait;
  /** hashtables of the sites we accessed (cache) */
  static NamedSite *namedSiteList;                      // NamedSite stores the sites already visited; each NamedSite corresponds to an IPSite

  static IPSite *IPSiteList;
  /** Sites which have at least one url to fetch */
  static Fifo<IPSite> *okSites;         // Fifo is the standard, non-synchronized queue held in RAM
  /** Sites which have at least one url to fetch
   * but need a dns call
   */
  static Fifo<NamedSite> *dnsSites;
  /** Information for the fetch
   * This array contains all the connections (empty or not)
   */
  static Connexion *connexions;
  /** Internal state of adns */
  static adns_state ads;
  /** Number of pending dns calls */
  static uint nbDnsCalls;
  /** free connection for fetchOpen : connections with state==EMPTY */
  static ConstantSizedFifo<Connexion> *freeConns;
#ifdef THREAD_OUTPUT
  /** free connection for fetchOpen : connections waiting for end user */
  static ConstantSizedFifo<Connexion> *userConns;
#endif
  /** Sum of the sizes of a fifo in Sites */
  static Interval *inter;
  /** How deep should we go inside a site */
  static int8_t depthInSite;
  /** Follow external links ? */
  static bool externalLinks;
  /** how many seconds should we wait between 2 calls to the same server
   * 0 if you are only on a personal server, >=30 otherwise
   */
  static time_t waitDuration;
  /** Name of the bot */
  static char *userAgent;
  /** Name of the person who launches the bot */
  static char *sender;
  /** http headers to send with requests
   * sends name of the robot, from field...
   */
  static char *headers;
  static char *headersRobots;  // used when asking for a robots.txt
  /** internet address of the proxy (if any) */
  static sockaddr_in *proxyAddr;
  /** connect to this server through a proxy using connection conn
   * return >0 in case of success (connecting or connected), 0 otherwise
   */
  static char getProxyFds (Connexion *conn);
  /** Limit to domain */
  static Vector<char> *domains;
  /** forbidden extensions
   * extensions which are always to be avoided : .ps, .pdf...
   */
  static Vector<char> forbExt;
  /** number of parallel connexions
   * your kernel must support a little more than nb_conn file descriptors
   */
  static uint nb_conn;
  /** number of parallel dns calls */
  static uint dnsConn;
  /** number of urls in IPSites */
  static int IPUrl;
  /** port on which the http statistics webserver is launched */
  static unsigned short int httpPort;
  /** port on which input waits for queries */
  static unsigned short int inputPort;
  /** parse configuration file */
  static void parseFile (char *file);
  /** read the domain limit */
  static void manageDomain (char **posParse);
  /** read the forbidden extensions */
  static void manageExt (char **posParse);
  /// POLL ///
  /** array used by poll */
  static struct pollfd *pollfds;
  /** pos of the max used field in pollfds */
  static uint posPoll;
  /** size of pollfds */
  static uint sizePoll;
  /** array used for dealing with answers */
  static short *ansPoll;
  /** number of the biggest file descriptor */
  static int maxFds;
  /** make sure the new socket is not too big for ansPoll */
  static void verifMax (int fd);
#ifdef MAXBANDWIDTH
  /** number of bits still allowed during this second */
  static long int remainBand;
#endif // MAXBANDWIDTH
};