1、并发版爬虫架构:
//单任务版架构,耗时最多的是fetcher
//fetcher的输出其实就是Parser的输入,可以把两者放一个模块
//送很多种子
// Run drives the single-task crawler: it seeds the work queue with the
// given requests, then repeatedly pops one request, fetches/parses it via
// worker, enqueues any newly discovered requests, and logs every item found.
// It returns when the queue is empty. Fetch errors are logged inside worker
// and the failing request is simply skipped.
func Run(seeds ...Request) {
	var requests []Request
	// NOTE: the original code used `requests := append(...)` inside the
	// loop, which shadowed the outer slice — the queue stayed empty and
	// the crawl loop never ran. Appending the seeds directly fixes that.
	requests = append(requests, seeds...)
	for len(requests) > 0 {
		// Pop the request at the head of the queue (FIFO / breadth-first).
		r := requests[0]
		requests = requests[1:]
		parseResult, err := worker(r)
		if err != nil {
			// worker already logged the failure; skip this request.
			continue
		}
		// `...` expands the slice so each element is appended individually.
		requests = append(requests, parseResult.Requests...)
		for _, item := range parseResult.Items {
			// %v prints the value in its default format (number or string).
			log.Printf("Got item %v", item)
		}
	}
}
//提取worker
func worker(r Request) (ParseResult,err){
log.Printf("Fetching %s",r.Url)
body,err := fetcher.Fetch(r.Url)
if err != nil {
log.Printf("Fetcher: error " + "fetching url %s:%v",r.Url,err)
return ParseResult{
},err //这里ParseResult不能返回nil
}
return r.ParserFunc(body),nil
}
//把worker并发
还需要一个Scheduler来分配任务
Engine用一个goroutine,Scheduler用一个goroutine,worker开多个goroutine
复杂点在Scheduler的设计,可以做复杂,也可以简单做
简单的做法是Scheduler维护一个公共的输入channel,所有Worker共用这一个输入
所有的Worker都从一个Scheduler拿Request,拿到一个处理一个
2、简单调度器:
// SimpleEngine is the simplest (sequential) crawler engine: it holds no
// state of its own and processes requests one at a time from an in-memory
// queue — no goroutines, no scheduler yet.
type SimpleEngine struct {
}
//送很多种子
func (e SimpleEngine) Run(seeds ...Request){
var requests []Request
for _,r := range seeds {
requests := append(requests,r)
}
for len(requests) >0 {
r := requests[0]
requests = requests[1:]
parseResult,err :=e.w