Preface
This post shows two ways to solve the Go tour exercise: Web Crawler.
Official exercise page: A Tour of Go, Concurrency, Exercise: Web Crawler
1. Using a Mutex
The idea: store the visited URLs in a map, and guard that map with a sync.Mutex so the data stays consistent when multiple goroutines read and write it.
package main

import (
    "fmt"
    "sync"
    "time"
)

// UrlMap counts how often each URL has been queued; mux guards m.
type UrlMap struct {
    m   map[string]int
    mux sync.Mutex
}

// getUrl returns the count for url (0 if never seen).
func (urlMap *UrlMap) getUrl(url string) int {
    urlMap.mux.Lock()
    defer urlMap.mux.Unlock()
    return urlMap.m[url]
}

// setUrl increments the count for url.
func (urlMap *UrlMap) setUrl(url string) {
    urlMap.mux.Lock()
    defer urlMap.mux.Unlock()
    urlMap.m[url]++
}
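Since getUrl only reads the map, a sync.RWMutex would let concurrent readers proceed in parallel. A minimal sketch of that variant (RWUrlMap is not part of the original, just an illustration):

// Sketch only: same idea as UrlMap, but readers share an RLock.
type RWUrlMap struct {
    m   map[string]int
    mux sync.RWMutex
}

func (u *RWUrlMap) getUrl(url string) int {
    u.mux.RLock() // multiple readers may hold this at once
    defer u.mux.RUnlock()
    return u.m[url]
}

func (u *RWUrlMap) setUrl(url string) {
    u.mux.Lock() // writers still need exclusive access
    defer u.mux.Unlock()
    u.m[url]++
}

The rest of the solution sticks with the plain Mutex version above.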
// Crawl uses fetcher to recursively crawl
// pages starting with url, to a maximum of depth.
func Crawl(url string, depth int, fetcher Fetcher, urlMap *UrlMap) {
    if depth <= 0 {
        return
    }
    // Safety guard: skip a url that was somehow queued more than once.
    if urlMap.getUrl(url) > 1 {
        return
    }
    body, urls, err := fetcher.Fetch(url)
    if err != nil {
        fmt.Println(err)
        return
    }
    // Record and queue only URLs that have not been seen yet.
    var newUrls []string
    for _, u := range urls {
        if urlMap.getUrl(u) == 0 {
            urlMap.setUrl(u)
            newUrls = append(newUrls, u)
        }
    }
    fmt.Printf("found: %s %q\n", url, body)
    for _, u := range newUrls {
        go Crawl(u, depth-1, fetcher, urlMap)
    }
}
func main() {
    // Seed the map so the root URL is never queued again by a child page.
    urlMap := UrlMap{m: map[string]int{"https://golang.org/": 1}}
    Crawl("https://golang.org/", 4, fetcher, &urlMap)
    // Crude: give the spawned goroutines time to finish.
    time.Sleep(100 * time.Millisecond)
}
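One caveat: the final time.Sleep is a guess at how long the goroutines need, so the program races against its own crawler. A more robust finish, sketched here under the assumption that the crawl function is reworked to signal completion (waitCrawl and mainWithWaitGroup are hypothetical names, not part of the original), uses sync.WaitGroup:

// Sketch only: same crawl logic, but completion is tracked with a
// sync.WaitGroup instead of sleeping.
func waitCrawl(url string, depth int, fetcher Fetcher, urlMap *UrlMap, wg *sync.WaitGroup) {
    defer wg.Done()
    if depth <= 0 {
        return
    }
    _, urls, err := fetcher.Fetch(url)
    if err != nil {
        fmt.Println(err)
        return
    }
    for _, u := range urls {
        if urlMap.getUrl(u) == 0 {
            urlMap.setUrl(u)
            wg.Add(1) // register the child before spawning it
            go waitCrawl(u, depth-1, fetcher, urlMap, wg)
        }
    }
}

func mainWithWaitGroup() { // hypothetical replacement for main
    var wg sync.WaitGroup
    urlMap := UrlMap{m: map[string]int{"https://golang.org/": 1}}
    wg.Add(1)
    waitCrawl("https://golang.org/", 4, fetcher, &urlMap, &wg)
    wg.Wait() // returns once every spawned crawl has finished
}

Calling wg.Add(1) before each go statement matters: the counter can never hit zero while a child is still being launched.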
2. Using Channels
A master goroutine manages the workers. Each worker does one fetch (the I/O work), while the master alone reads and writes the map of fetched URLs. Because the map is confined to a single goroutine, no lock is needed.
type Fetcher interface {
    // Fetch returns the body of URL and
    // a slice of URLs found on that page.
    Fetch(url string) (body string, urls []string, err error)
}

// worker fetches one url and reports the URLs it found
// (or an empty slice on error) back to the master.
func worker(url string, ch chan []string, fetcher Fetcher) {
    _, urls, err := fetcher.Fetch(url)
    if err != nil {
        ch <- []string{}
    } else {
        ch <- urls
    }
}
// master owns the map of seen URLs; it counts outstanding workers
// in n and stops once every worker has reported back.
func master(ch chan []string, fetcher Fetcher) {
    n := 1 // one pending result: the seed sent by ConcurrentChannel
    urlMap := map[string]bool{}
    // Each message on ch is the URL list produced by one worker.
    for urls := range ch {
        for _, url := range urls {
            if !urlMap[url] {
                n++ // one more worker in flight
                urlMap[url] = true
                go worker(url, ch, fetcher)
            }
        }
        n-- // this message's worker is done
        if n == 0 {
            break
        }
    }
}
// ConcurrentChannel kicks off the crawl by seeding the channel
// with the start URL, then runs the master loop.
func ConcurrentChannel(url string, fetcher Fetcher) {
    // Channel on which workers deliver their fetch results.
    c := make(chan []string)
    // Send the seed URL from a goroutine: the unbuffered send
    // blocks until master starts receiving.
    go func() {
        c <- []string{url}
    }()
    master(c, fetcher)
}
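Note that this channel version quietly drops the exercise's depth parameter. One way to restore it, sketched here for illustration (fetchResult, depthWorker, and depthMaster are assumptions, not part of the original), is to carry the remaining depth alongside each result, and have the master expand only results that still have depth left:

// Hypothetical variant: each message records how much depth
// remains below the page that produced it.
type fetchResult struct {
    urls  []string
    depth int
}

func depthWorker(url string, depth int, ch chan fetchResult, fetcher Fetcher) {
    _, urls, err := fetcher.Fetch(url)
    if err != nil {
        urls = nil
    }
    ch <- fetchResult{urls: urls, depth: depth}
}

func depthMaster(ch chan fetchResult, fetcher Fetcher) {
    n := 1
    seen := map[string]bool{}
    for res := range ch {
        if res.depth > 0 { // expand children only while depth remains
            for _, url := range res.urls {
                if !seen[url] {
                    n++
                    seen[url] = true
                    go depthWorker(url, res.depth-1, ch, fetcher)
                }
            }
        }
        n--
        if n == 0 {
            break
        }
    }
}

ConcurrentChannel would then seed the channel with fetchResult{[]string{url}, depth} instead of a bare []string.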
func main() {
    fmt.Printf("=== ConcurrentChannel ===\n")
    ConcurrentChannel("https://golang.org/", fetcher)
}
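Both programs run against the tour's fakeFetcher test double; its type definitions, from the tour's boilerplate, are shown below (the tour also supplies a populated fetcher variable with canned results for a few golang.org pages):

// fakeFetcher is a Fetcher that returns canned results.
type fakeFetcher map[string]*fakeResult

type fakeResult struct {
    body string
    urls []string
}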
func (f fakeFetcher) Fetch(url string) (string, []string, error) {
    if res, ok := f[url]; ok {
        fmt.Printf("found: %s %q\n", url, res.body)
        return res.body, res.urls, nil
    }
    fmt.Printf("missing: %s\n", url)
    return "", nil, fmt.Errorf("not found: %s", url)
}