Preface
This post shows two ways to solve the Go tour exercise: Web Crawler.
Official exercise page: A Tour of Go, Concurrency, Exercise: Web Crawler
1. Using a Mutex
The idea: store the visited URLs in a map, and guard that map with a sync.Mutex so the data stays consistent when multiple goroutines read and write it.
package main

import (
    "fmt"
    "sync"
    "time"
)

// UrlMap counts how often each URL has been queued; mux guards m.
type UrlMap struct {
    m   map[string]int
    mux sync.Mutex
}

// getUrl returns the count for url (0 if never seen).
func (urlMap *UrlMap) getUrl(url string) int {
    urlMap.mux.Lock()
    defer urlMap.mux.Unlock()
    return urlMap.m[url]
}

// setUrl increments the count for url.
func (urlMap *UrlMap) setUrl(url string) {
    urlMap.mux.Lock()
    defer urlMap.mux.Unlock()
    urlMap.m[url]++
}
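Since getUrl only reads the map, a sync.RWMutex would let concurrent readers proceed in parallel. A minimal sketch of that variant (RWUrlMap is not part of the original, just an illustration):

// Sketch only: same idea as UrlMap, but readers share an RLock.
type RWUrlMap struct {
    m   map[string]int
    mux sync.RWMutex
}

func (u *RWUrlMap) getUrl(url string) int {
    u.mux.RLock() // multiple readers may hold this at once
    defer u.mux.RUnlock()
    return u.m[url]
}

func (u *RWUrlMap) setUrl(url string) {
    u.mux.Lock() // writers still need exclusive access
    defer u.mux.Unlock()
    u.m[url]++
}

The rest of the solution sticks with the plain Mutex version above.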
// Crawl uses fetcher to recursively crawl
// pages starting with url, to a maximum of depth.
func Crawl(url string, depth int, fetcher Fetcher, urlMap *UrlMap) {
    if depth <= 0 {
        return
    }
    // Safety guard: skip a url that was somehow queued more than once.
    if urlMap.getUrl(url) > 1 {
        return
    }
    body, urls, err := fetcher.Fetch(url)
    if err != nil {
        fmt.Println(err)
        return
    }
    // Record and queue only URLs that have not been seen yet.
    var newUrls []string
    for _, u := range urls {
        if urlMap.getUrl(u) == 0 {
            urlMap.setUrl(u)
            newUrls = append(newUrls, u)
        }
    }
    fmt.Printf("found: %s %q\n", url, body)
    for _, u := range newUrls {
        go Crawl(u, depth-1, fetcher, urlMap)
    }
}
func main() {
    // Seed the map so the root URL is never queued again by a child page.
    urlMap := UrlMap{m: map[string]int{"https://golang.org/": 1}}
    Crawl("https://golang.org/", 4, fetcher, &urlMap)
    // Crude: give the spawned goroutines time to finish.
    time.Sleep(100 * time.Millisecond)
}
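One caveat: the final time.Sleep is a guess at how long the goroutines need, so the program races against its own crawler. A more robust finish, sketched here under the assumption that the crawl function is reworked to signal completion (waitCrawl and mainWithWaitGroup are hypothetical names, not part of the original), uses sync.WaitGroup:

// Sketch only: same crawl logic, but completion is tracked with a
// sync.WaitGroup instead of sleeping.
func waitCrawl(url string, depth int, fetcher Fetcher, urlMap *UrlMap, wg *sync.WaitGroup) {
    defer wg.Done()
    if depth <= 0 {
        return
    }
    _, urls, err := fetcher.Fetch(url)
    if err != nil {
        fmt.Println(err)
        return
    }
    for _, u := range urls {
        if urlMap.getUrl(u) == 0 {
            urlMap.setUrl(u)
            wg.Add(1) // register the child before spawning it
            go waitCrawl(u, depth-1, fetcher, urlMap, wg)
        }
    }
}

func mainWithWaitGroup() { // hypothetical replacement for main
    var wg sync.WaitGroup
    urlMap := UrlMap{m: map[string]int{"https://golang.org/": 1}}
    wg.Add(1)
    waitCrawl("https://golang.org/", 4, fetcher, &urlMap, &wg)
    wg.Wait() // returns once every spawned crawl has finished
}

Calling wg.Add(1) before each go statement matters: the counter can never hit zero while a child is still being launched.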
2. Using Channels
A master goroutine manages the workers. Each worker does one fetch (the I/O work), while the master alone reads and writes the map of fetched URLs. Because the map is confined to a single goroutine, no lock is needed.
type Fetcher interface {
    // Fetch returns the body of URL and
    // a slice of URLs found on that page.
    Fetch(url string) (body string, urls []string, err error)
}

// worker fetches one url and reports the URLs it found
// (or an empty slice on error) back to the master.
func worker(url string, ch chan []string, fetcher Fetcher) {
    _, urls, err := fetcher.Fetch(url)
    if err != nil {
        ch <- []string{}
    } else {
        ch <- urls
    }
}
// master owns the map of seen URLs; it counts outstanding workers
// in n and stops once every worker has reported back.
func master(ch chan []string, fetcher Fetcher) {
    n := 1 // one pending result: the seed sent by ConcurrentChannel
    urlMap := map[string]bool{}
    // Each message on ch is the URL list produced by one worker.
    for urls := range ch {
        for _, url := range urls {
            if !urlMap[url] {
                n++ // one more worker in flight
                urlMap[url] = true
                go worker(url, ch, fetcher)
            }
        }
        n-- // this message's worker is done
        if n == 0 {
            break
        }
    }
}
// ConcurrentChannel kicks off the crawl by seeding the channel
// with the start URL, then runs the master loop.
func ConcurrentChannel(url string, fetcher Fetcher) {
    // Channel on which workers deliver their fetch results.
    c := make(chan []string)
    // Send the seed URL from a goroutine: the unbuffered send
    // blocks until master starts receiving.
    go func() {
        c <- []string{url}
    }()
    master(c, fetcher)
}
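Note that this channel version quietly drops the exercise's depth parameter. One way to restore it, sketched here for illustration (fetchResult, depthWorker, and depthMaster are assumptions, not part of the original), is to carry the remaining depth alongside each result, and have the master expand only results that still have depth left:

// Hypothetical variant: each message records how much depth
// remains below the page that produced it.
type fetchResult struct {
    urls  []string
    depth int
}

func depthWorker(url string, depth int, ch chan fetchResult, fetcher Fetcher) {
    _, urls, err := fetcher.Fetch(url)
    if err != nil {
        urls = nil
    }
    ch <- fetchResult{urls: urls, depth: depth}
}

func depthMaster(ch chan fetchResult, fetcher Fetcher) {
    n := 1
    seen := map[string]bool{}
    for res := range ch {
        if res.depth > 0 { // expand children only while depth remains
            for _, url := range res.urls {
                if !seen[url] {
                    n++
                    seen[url] = true
                    go depthWorker(url, res.depth-1, ch, fetcher)
                }
            }
        }
        n--
        if n == 0 {
            break
        }
    }
}

ConcurrentChannel would then seed the channel with fetchResult{[]string{url}, depth} instead of a bare []string.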
func main() {
    fmt.Printf("=== ConcurrentChannel ===\n")
    ConcurrentChannel("https://golang.org/", fetcher)
}
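Both programs run against the tour's fakeFetcher test double; its type definitions, from the tour's boilerplate, are shown below (the tour also supplies a populated fetcher variable with canned results for a few golang.org pages):

// fakeFetcher is a Fetcher that returns canned results.
type fakeFetcher map[string]*fakeResult

type fakeResult struct {
    body string
    urls []string
}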
func (f fakeFetcher) Fetch(url string) (string, []string, error) {
    if res, ok := f[url]; ok {
        fmt.Printf("found: %s %q\n", url, res.body)
        return res.body, res.urls, nil
    }
    fmt.Printf("missing: %s\n", url)
    return "", nil, fmt.Errorf("not found: %s", url)
}