Customization larbin

最新推荐文章于 2024-10-03 08:44:33 发布

wangran51

最新推荐文章于 2024-10-03 08:44:33 发布

阅读量127

点赞数

分类专栏： Crawler 文章标签：运维 java 爬虫

Crawler 专栏收录该内容

17 篇文章 0 订阅

订阅专栏

配置英文原版说明：

http://larbin.sourceforge.net/custom-eng.html#larbin.conf

larbin.conf

###############################################
# Who are you ?
# mail of the one who launched larbin (YOUR mail)
From larbin2.6.3@unspecified.mail
# name of the bot (sent with http headers)
UserAgent larbin_2.6.3

############################################
# What are the inputs and outputs of larbin
# port on which is launched the http statistic webserver
# if unset or set to 0, no webserver is launched//larbin在运行时可以通过 http://localhost:8081查看运行情况；如果值为0，则不启动web服务器。
httpPort 8081
# port on which you can submit urls to fetch
# no input is possible if you comment this line or use port 0
#inputPort 1976

############################################
# parameters to adapt depending on your network
# Number of connexions in parallel (to adapt depending of your network speed)//并行获取网页的数量
pagesConnexions 100
# Number of dns calls in parallel//并行解析dns的数量
dnsConnexions 5
# How deep do you want to go in a site//网页抓取深度
depthInSite 5
# do you want to follow external links//是否允许抓取域名外连接
#noExternalLinks
# time between 2 calls on the same server (in sec) : NEVER less than 30//对同一个服务器获取网页的间隔时间
waitDuration 60
# Make requests through a proxy (use with care)
#proxy www 8080

##############################################
# now, let's customize the search

# first page to fetch (you can specify several urls)
# 抓取网页的起始URL，可指定多个
startUrl http://www.csdn.net/

# Do you want to limit your search to a specific domain ?
# if yes, uncomment the following line//限制爬虫抓取的网址域名后缀。
#limitToDomain .fr .dk .uk end

# What are the extensions you surely don't want//限制不被下载的对象的后缀,可通过注释或者增加后缀控制下载
# never forbid .html, .htm and so on : larbin needs them
forbiddenExtensions
#.tar .gz .tgz .zip .Z .rpm .deb
#.ps .dvi .pdf
#.png .jpg .jpeg .bmp .smi .tiff .gif
#.mov .avi .mpeg .mpg .mp3 .qt .wav .ram .rm
#.jar .java .class .diff
#.doc .xls .ppt .mdb .rtf .exe .pps .so .psd
end

option.h

// Larbin
// Sebastien Ailleret
// 27-05-01 -> 09-03-02

#ifndef LARBIN_CONFIG
#define LARBIN_CONFIG

#include "config.h"

/* This files allows a lot of customizations of larbin
* see doc/custom-eng.html for more details
*/

//
// Select the output module you want to use
// (the related code is in src/interf/useroutput.cc)

//#define DEFAULT_OUTPUT // do nothing with the pages apart from statistics (nothing is saved)
#define SIMPLE_SAVE // save in files named save/dxxxxxx/fyyyyyy; per the original note, each such directory also holds an index file listing the URLs already downloaded
//#define MIRROR_SAVE // save in files (respect sites hierarchy): pages go under save/dxxxxx/url, where url is the start URL
//#define STATS_OUTPUT // do some stats on pages; viewable at http://localhost:8081/output.html

// Set up a specific search
// (the related code is in src/fetch/specbuf.cc)

//#define SPECIFICSEARCH // look for a specific kind of document
//#define contentTypes ((char *[]) { "audio/mpeg", NULL }) // content types to look for
//#define privilegedExts ((char *[]) { ".mp3", NULL }) // file extensions to look for

// how do you want to manage specific pages (select one of the followings)
//#define DEFAULT_SPECIFIC // default: save them the same way as html pages
//#define SAVE_SPECIFIC // save the specific pages on disk
//#define DYNAMIC_SPECIFIC // use dynamically allocated buffers, for large files

//
// What do you want the crawler to do

// do you want to follow links in pages
// (if this option is not set, HTML pages are not parsed and links are not followed)
#define FOLLOW_LINKS

// do you want the crawler to associate to each page the list of its sons
// (i.e. the list of links each page contains)
//#define LINKS_INFO

// do you want to associate a tag to pages (given in input)
// this allows to follow a page from input to output (and follow redirection)
//#define URL_TAGS

// do you want to suppress duplicate pages
// (if set, larbin does not report success for a page whose content matches one already seen)
#define NO_DUP

// do you want larbin to stop when everything has been fetched
//#define EXIT_AT_END

// do you want to fetch images
// if you enable this option, update forbiddenExtensions in larbin.conf
//#define IMAGES

// download everything (ie no check of content type in http headers)
//#define ANYTYPE

// do you want to manage cookies
//#define COOKIES

//
// Various options

// do you want to get cgi
// 0 : yes ; 1 : no ; 2 : NO !
// (0 accepts all cgi, 1 rejects urls containing '?' or '=', 2 forbids all cgi)
#define CGILEVEL 0

// limit bandwidth usage (in octets/sec); no limit if left undefined
// be careful, larbin might use 10 to 20% more
//#define MAXBANDWIDTH 200000

// the depth is initialized each time a link goes to another site
// (if this option is not set, the depth is never re-initialized)
#define DEPTHBYSITE

//
// Efficiency vs feature

// do we need a special thread for output
// (according to the original annotation, without it the program uses a single thread)
// This is compulsory if it can block
// (not needed if you did not add code yourself)
//#define THREAD_OUTPUT

// if this option is set, larbin saves the hashtable from time to time
// this way it can restart from where it last stopped
// by reloading the table
//#define RELOAD

//
// now it's just if you need to know how it works

// do not launch the webserver
// this can be useful in order to launch no thread at all
//#define NOWEBSERVER

// do you want nice graphs in the stats page (histograms shown live on that page)
#define GRAPH

// uncomment if you are not interested in debugging information
//#define NDEBUG // no debugging information shown in the web interface

// enable this if you really dislike stats (in the webserver)
//#define NOSTATS

// enable this if you really like stats (on stdout)
#define STATS // per the original note, prints statistics every 8 seconds
//#define BIGSTATS // prints every fetched page on screen, but slows larbin down

// Please enable this option if you want to report a crash
// then compile with "make debug"
//#define CRASH

#endif // LARBIN_CONFIG

types.h

// Larbin
// Sebastien Ailleret
// 12-01-00 -> 10-12-01

#ifndef TYPES_H
#define TYPES_H

// Size of the HashSize (max number of urls that can be fetched)
#define hashSize 64000000

// Size of the duplicate hashTable
// (presumably the table backing duplicate detection -- confirm against its users)
#define dupSize hashSize
#define dupFile "dupfile.bak"

// Size of the arrays of Sites in main memory
#define namedSiteListSize 20000
#define IPSiteListSize 10000

// Max number of urls in ram
#define ramUrls 100000
#define maxIPUrls 80000 // this should allow less dns call

// Max number of urls per site in Url
#define maxUrlsBySite 254 // must fit in uint8_t

// time out when reading a page (in sec)
#define timeoutPage 30 // default time out
#define timeoutIncr 2000 // number of bytes for 1 more sec

// How long do we keep dns answers and robots.txt
// (2*24*3600, i.e. two days when counted in seconds)
// The replacement list is parenthesized so the macro behaves as a single
// value inside larger expressions such as `t / dnsValidTime` or
// `t % dnsValidTime`; without the parentheses those expand with the wrong
// operator precedence.
#define dnsValidTime (2*24*3600)

// Maximum size of a page
#define maxPageSize 1000000
#define nearlyFullPage 90000

// Maximum size of a robots.txt that is read
// the value used is min(maxPageSize, maxRobotsSize)
#define maxRobotsSize 10000

// How many forbidden items do we accept in a robots.txt
#define maxRobotsItem 100

// file name used for storing urls on disk
#define fifoFile "fifo"
#define fifoFileWait "fifowait"

// number of urls per file on disk
// should be equal to ramUrls for good interaction with restart
#define urlByFile ramUrls

// Size of the buffer used to read sockets
#define BUF_SIZE 16384
#define STRING_SIZE 1024

// Max size for a url
#define maxUrlSize 512
#define maxSiteSize 40 // max size for the name of a site

// max size for cookies
#define maxCookieSize 128

// Standard size of a fifo in a Site
#define StdVectSize maxRobotsItem

// maximum number of input connections
#define maxInput 5

// if we save files, how many files per directory and where
#define filesPerDir 2000 // number of saved pages per directory
#define saveDir "save/" // directory where downloaded pages are saved
#define indexFile "index.html" // for MIRROR_SAVE
#define nbDir 1000 // for MIRROR_SAVE

// options for SPECIFICSEARCH (except with DEFAULT_SPECIFIC)
#define specDir "specific/" // directory where specific files are saved
#define maxSpecSize 5000000 // maximum size of a specific file

// Outcome of fetching a page: `success`, or one of the reasons the fetch
// did not produce a page.
// nbAnswers equals the number of enumerators listed in FetchError (16);
// update both together when adding or removing a value.
#define nbAnswers 16
enum FetchError
{
  success         = 0,
  noDNS           = 1,
  noConnection    = 2,
  forbiddenRobots = 3,
  timeout         = 4,
  badType         = 5,
  tooBig          = 6,
  err30X          = 7,
  err40X          = 8,
  earlyStop       = 9,
  duplicate       = 10,
  fastRobots      = 11,
  fastNoConn      = 12,
  fastNoDns       = 13,
  tooDeep         = 14,
  urlDup          = 15
};

// short alias for the plain unsigned integer type
typedef unsigned uint;

#endif // TYPES_H