// Notes:
// Crawls web pages using at most 20 concurrent fetches, visiting links up to
// depth 3, and prints each visited URL.
// Known bug (fixed below in getUrl): the program used to crash when a fetched
// URL was not a reachable HTML page.
package main
import (
"fmt"
"log"
"net/http"
"golang.org/x/net/html"
)
// Link is a crawl work item: a URL together with its depth in the crawl
// tree (the seed URL has depth 1; its children have depth 2, and so on).
type Link struct{
// link is the absolute URL to fetch.
link string
// dep is the crawl depth of this URL; crawling stops past depth 3.
dep int
}
// main drives a concurrent breadth-style crawl starting from a seed URL.
// Worklists of links arrive on turl; n counts the number of sends to turl
// that are still pending, so the loop terminates once every spawned crawl
// goroutine has delivered its results and no new URLs remain.
func main() {
	turl := make(chan []Link)

	// Seed the worklist from a goroutine so the unbuffered send does not
	// block main before it starts receiving.
	seed := []Link{{"https://blog.csdn.net/tianlongtc/", 1}}
	go func() {
		turl <- seed
	}()

	vis := make(map[string]bool) // URLs already scheduled, to avoid re-crawling
	n := 1                       // pending sends on turl (the seed counts as one)
	for ; n > 0; n-- {
		list := <-turl
		for _, link := range list {
			if !vis[link.link] {
				vis[link.link] = true
				n++ // one more crawl goroutine will send on turl
				go func(link Link) {
					turl <- tcrawl(link)
				}(link)
			}
		}
	}
}
// tokens is a counting semaphore that bounds concurrent page fetches at 20.
var tokens = make(chan struct{}, 20)

// tcrawl prints the given link and its depth, fetches the page (holding a
// semaphore slot for the duration of the fetch), and returns the links found
// on it. Links deeper than 3 yield nil without fetching.
func tcrawl(link Link) []Link {
	if link.dep > 3 {
		return nil
	}
	fmt.Println(link.link, " ", link.dep)

	tokens <- struct{}{} // acquire a fetch slot
	found, err := getUrl(link)
	<-tokens // release the slot

	if err != nil {
		log.Print(err)
	}
	return found
}
// getUrl fetches link's page, parses it as HTML, and returns every hyperlink
// found on it at depth link.dep+1. Relative hrefs are resolved against the
// final request URL (so redirects are accounted for). Any fetch or parse
// failure is returned as an error with nil links.
func getUrl(link Link) ([]Link, error) {
	resp, err := http.Get(link.link)
	if err != nil {
		// BUG FIX: previously the error was only logged and execution fell
		// through to resp.StatusCode, dereferencing a nil resp and crashing
		// whenever the URL was not reachable.
		return nil, err
	}
	defer resp.Body.Close()

	if resp.StatusCode != http.StatusOK {
		return nil, fmt.Errorf("getting %s: %s", link.link, resp.Status)
	}

	doc, err := html.Parse(resp.Body)
	if err != nil {
		// Report parse failures (e.g. the resource is not an HTML page) to
		// the caller instead of killing the whole program with log.Fatal.
		return nil, fmt.Errorf("parsing %s as HTML: %v", link.link, err)
	}

	var links []Link
	visNode := func(n *html.Node) {
		if n.Type == html.ElementNode && n.Data == "a" {
			for _, a := range n.Attr {
				if a.Key != "href" {
					continue
				}
				lin, err := resp.Request.URL.Parse(a.Val)
				if err != nil {
					continue // ignore bad URLs
				}
				links = append(links, Link{lin.String(), link.dep + 1})
			}
		}
	}
	forEach(doc, visNode, nil)
	return links, nil
}
// forEach walks the node tree rooted at n in depth-first order, calling pre
// before descending into n's children and post after returning from them.
// Either callback may be nil, in which case it is skipped.
func forEach(n *html.Node, pre, post func(n *html.Node)) {
	if pre != nil {
		pre(n)
	}
	for child := n.FirstChild; child != nil; child = child.NextSibling {
		forEach(child, pre, post)
	}
	if post != nil {
		post(n)
	}
}