【Go】爬虫

最新推荐文章于 2024-04-17 07:47:06 发布

寒士°、

最新推荐文章于 2024-04-17 07:47:06 发布

阅读量824

点赞数

分类专栏： Golang学习文章标签： golang

本文链接：https://blog.csdn.net/weixin_43877853/article/details/130564537

版权

Golang学习专栏收录该内容

9 篇文章 0 订阅

订阅专栏

package links

import(
	"fmt"
	"net/http"
	"net/net1/html"
)

// Extract makes an HTTP GET request to the specified URL, parses the
// response as HTML, and returns the values of the href attributes of the
// document's <a> elements, each resolved relative to resp.Request.URL.
//
// It returns an error if the request fails, if the response status is not
// 200 OK, or if the body cannot be parsed as HTML. An href that cannot be
// parsed as a URL is skipped rather than reported.
func Extract(url string) ([]string, error) {
	resp, err := http.Get(url)
	if err != nil {
		return nil, err
	}
	// Close the body on every return path, including the error paths below.
	defer resp.Body.Close()

	if resp.StatusCode != http.StatusOK {
		return nil, fmt.Errorf("getting %s: %s", url, resp.Status)
	}

	doc, err := html.Parse(resp.Body)
	if err != nil {
		// %w keeps the underlying error reachable through errors.Is/errors.As
		// while producing the same message text as before.
		return nil, fmt.Errorf("parsing %s as HTML: %w", url, err)
	}

	var links []string
	visitNode := func(n *html.Node) {
		if n.Type != html.ElementNode || n.Data != "a" {
			return
		}
		for _, a := range n.Attr {
			if a.Key != "href" {
				continue
			}
			// Resolve relative hrefs against the URL of the request that
			// produced this response.
			link, err := resp.Request.URL.Parse(a.Val)
			if err != nil {
				continue // ignore bad URLs
			}
			links = append(links, link.String())
		}
	}
	forEachNode(doc, visitNode, nil)
	return links, nil
}

// forEachNode walks the tree rooted at n. For each node x it calls pre(x)
// before visiting x's children and post(x) after all of them have been
// visited. Either callback may be nil, in which case it is skipped.
func forEachNode(n *html.Node, pre, post func(n *html.Node)) {
	if pre != nil {
		pre(n)
	}

	child := n.FirstChild
	for child != nil {
		forEachNode(child, pre, post)
		child = child.NextSibling
	}

	if post != nil {
		post(n)
	}
}

需要引入一个非标准库的包 golang.org/x/net/html（import 路径应写作 "golang.org/x/net/html"），用它的 html.Parse 函数解析 HTML，该函数返回一个 *html.Node 和一个 error。

然后定义一个匿名函数visitNode，用来遍历Node中包含的所有超链接，再将链接加入links中，等待后续遍历。

定义forEachNode遍历根为n的每个子节点，定义两个函数变量pre和post，表示前序访问子节点或是后序访问子节点。

package main

import (
	"fmt"
	"log"
	"sync"
	"flag"
	"links"
)

//!+sema
// Package-level state shared by all crawl goroutines.
var (
	// tokens is a counting semaphore used to enforce a limit of
	// 20 concurrent requests.
	tokens = make(chan struct{}, 20)

	// maxDepth is the crawl depth limit; main binds it to the -depth flag.
	maxDepth int

	// seen records the links that have already been handed to a crawl
	// goroutine.
	seen = make(map[string]bool)

	// seenLock guards seen, which is read and written from crawl goroutines.
	seenLock sync.Mutex
)

// crawl prints url together with its depth, fetches the page, and starts a
// new crawl goroutine for every link on it that has not been seen before.
//
// depth is the distance of url from the command-line roots; nothing at depth
// maxDepth or deeper is printed or fetched. The caller must have called
// wg.Add(1) for this invocation; crawl calls wg.Done when it returns.
func crawl(url string, depth int, wg *sync.WaitGroup) {
	defer wg.Done()
	if depth >= maxDepth {
		return
	}
	fmt.Println(depth, url)

	tokens <- struct{}{} // acquire a token
	list, err := links.Extract(url)
	<-tokens // release the token
	if err != nil {
		log.Print(err)
		return
	}

	// Links found on this page would be crawled at depth+1. If that is
	// already at the limit they would never be fetched, so they must not be
	// marked as seen: doing so would hide them from another goroutine that
	// reaches the same link along a shorter path. The page itself is still
	// fetched above so that fetch errors keep being reported.
	if depth+1 >= maxDepth {
		return
	}

	for _, link := range list {
		seenLock.Lock()
		dup := seen[link]
		seen[link] = true
		seenLock.Unlock()
		if dup {
			continue
		}
		wg.Add(1)
		go crawl(link, depth+1, wg)
	}
}

// main parses the -depth flag, starts one crawl goroutine at depth 0 for
// each distinct URL given on the command line, and waits for the whole
// crawl to finish.
func main() {
	flag.IntVar(&maxDepth, "depth", 3, "max crawl depth")
	flag.Parse()

	wg := &sync.WaitGroup{}
	for _, link := range flag.Args() {
		// Record the roots in seen too; otherwise a page that links back to
		// a root (or a root given twice) would be crawled a second time.
		// The lock is required because crawl goroutines started in earlier
		// iterations may already be using seen.
		seenLock.Lock()
		dup := seen[link]
		seen[link] = true
		seenLock.Unlock()
		if dup {
			continue
		}
		wg.Add(1)
		go crawl(link, 0, wg)
	}
	wg.Wait()
}

首先定义一个容量为20的缓冲通道作为计数信号量（令牌），用来限制同时进行的请求数量：goroutine 只有拿到令牌后才能调用 links.Extract 发起请求，请求结束后归还令牌。
定义seen标记访问过的链接，不嵌套访问造成死循环。
由于多个 goroutine 会并发地读取和修改 seen，故需要加锁保护。
用flag库解析命令行，设定爬虫深度，默认为3
使用sync.WaitGroup等待所有爬虫执行完成，并为命令行中的每个链接启动一个 goroutine。
在爬虫中，解析页面得到其中包含的链接，并将深度加一继续爬取；若深度达到（大于或等于）设定的最大深度，则直接返回。

package main

import (
	"fmt"
	"log"
	"flag"
	"links"
)

// linklist is a single crawl work item: a URL paired with a depth.
type linklist struct {
	url   string // address of the page
	depth int    // depth of url; main seeds command-line URLs with 0
}

// maxDepth is the crawl depth limit; main binds it to the -depth flag.
var maxDepth int

// crawl prints link with its depth, fetches the page, and returns the links
// found on it as work items one level deeper.
//
// It returns nil when link is at or beyond maxDepth, when the fetch fails
// (the error is logged), or when the links on the page would themselves be
// at or beyond maxDepth.
func crawl(link linklist) []linklist {
	if link.depth >= maxDepth {
		return nil
	}
	fmt.Println(link.depth, link.url)

	urls, err := links.Extract(link.url)
	if err != nil {
		log.Print(err)
		return nil
	}

	// Children that would sit at the depth limit are never fetched. Returning
	// them anyway would let the caller record them as seen, which would hide
	// them if they are later reached along a shorter path. The page itself is
	// still fetched above so that fetch errors keep being reported.
	childDepth := link.depth + 1
	if childDepth >= maxDepth {
		return nil
	}

	list := make([]linklist, 0, len(urls))
	for _, url := range urls {
		list = append(list, linklist{url, childDepth})
	}
	return list
}

//!+
// main parses the -depth flag, seeds the crawl with the command-line URLs at
// depth 0, and runs 20 crawler goroutines. The main goroutine de-duplicates
// the links the crawlers report and returns once no work is outstanding.
func main() {
	flag.IntVar(&maxDepth, "depth", 3, "max crawl depth")
	flag.Parse()
	fmt.Println(maxDepth)
	worklist := make(chan []linklist)  // lists of URLs, may have duplicates
	unseenLinks := make(chan linklist) // de-duplicated URLs

	// Add command-line arguments to worklist.
	go func() {
		var list []linklist
		for _, url := range flag.Args() {
			list = append(list, linklist{url, 0})
		}
		worklist <- list
	}()

	// Create 20 crawler goroutines to fetch each unseen link.
	for i := 0; i < 20; i++ {
		go func() {
			for link := range unseenLinks {
				foundLinks := crawl(link)
				// Send from a separate goroutine so a crawler never blocks
				// on worklist while main is blocked on unseenLinks.
				go func() { worklist <- foundLinks }()
			}
		}()
	}

	// The main goroutine de-duplicates worklist items
	// and sends the unseen ones to the crawlers.
	//
	// pending counts the sends to worklist that are still outstanding: one
	// for the initial list, plus one for every link handed to a crawler,
	// since each crawled link produces exactly one send. Ranging over
	// worklist instead would block forever, because nothing closes it.
	seen := make(map[string]bool)
	for pending := 1; pending > 0; pending-- {
		list := <-worklist
		for _, link := range list {
			if !seen[link.url] {
				seen[link.url] = true
				pending++
				unseenLinks <- link
			}
		}
	}
	// No work is left; let the crawler goroutines exit their range loops.
	close(unseenLinks)
}

另外的一种写法，使用结构体记录链接的深度。

思路为通过worklist通道传输链接列表，然后遍历worklist中的链接，如果未被访问过则通过unseenLinks通道传输给爬虫。

通过一个循环20次的 for 循环创建20个爬虫 goroutine，爬虫返回的链接再重复上面的通道传输过程。

相比之下，前一种写法是深度优先，后一种是广度优先。(某种程度上可以这么说)

寒士°、

关注

0
点赞
踩
0

收藏

觉得还不错? 一键收藏
0
评论
【Go】爬虫

思路为通过worklist通道传输链接列表，然后遍历worklist中的链接，如果未被访问过则通过unseenLinks通道传输给爬虫。定义forEachNode遍历根为n的每个子节点，定义两个函数变量pre和post，表示前序访问子节点或是后序访问子节点。然后定义一个匿名函数visitNode，用来遍历Node中包含的所有超链接，再将链接加入links中，等待后续遍历。通过一个循环20次的 for 循环创建20个爬虫 goroutine，爬虫返回的链接再重复上面的通道传输过程。另外的一种写法，使用结构体记录链接的深度。
复制链接

扫一扫