自学 Golang 期间,编写 goroutine、chan 相关代码,实现并发爬取某主站每个页面上的链接。
暂未想到好办法,让程序在抓取完网页上的所有链接后自动退出。
抓取性能:内存(8G)占用97%,抓取链接数量:923571个。
最后抓取的链接如下:
923550 https://jobs.51job.com/xian/97026549.html?t=2&s=01
923551 https://jobs.51job.com/xian-ytq/100531159.html?t=2&s=01
923552 https://jobs.51job.com/xian-ytq/co4962830.html
923553 https://jobs.51job.com/xian/102536289.html?t=2&s=01
923554 https://jobs.51job.com/xian/co4541067.html
923555 https://jobs.51job.com/xian-ytq/103266474.html?t=2&s=01
923556 https://jobs.51job.com/xian-ytq/co4427712.html
923557 https://jobs.51job.com/xian/100993960.html?t=2&s=01
923558 https://jobs.51job.com/xian-jjjs/101527308.html?t=2&s=01
923559 https://jobs.51job.com/xian-jjjs/co4429285.html
923560 https://jobs.51job.com/xian/98443048.html?t=2&s=01
923561 https://jobs.51job.com/xian/co4204829.html
923562 https://jobs.51job.com/xian-ytq/101144457.html?t=2&s=01
923563 https://jobs.51job.com/xian-ytq/co4979864.html
923564 https://jobs.51job.com/xian-ytq/91962903.html?t=2&s=01
923565 https://jobs.51job.com/xian-ytq/co4411060.html
923566 https://jobs.51job.com/xian/98252112.html?t=2&s=01
923567 https://jobs.51job.com/xian/co4868985.html
923568 https://jobs.51job.com/xian-lhq/95451726.html?t=2&s=01
923569 https://jobs.51job.com/xian-lhq/co4054606.html
923570 https://jobs.51job.com/xian/103780877.html?t=2&s=01
923571 https://jobs.51job.com/xian/co4301859.html
具体代码如下:
package main
import (
	"fmt"
	"net/http"
	"net/url"
	"strings"
	"time"

	"golang.org/x/net/html"
)
// analyseNode walks the HTML tree rooted at node depth-first and calls f with
// the href value of every <a> element that looks like a navigable link.
//
// The following href values are skipped and never reach f:
//   - empty (or whitespace-only) values;
//   - same-page fragments, i.e. values starting with "#";
//   - javascript: pseudo-URLs such as "javascript:;" or "javascript:void(0)",
//     matched case-insensitively by prefix.
//
// The value handed to f is trimmed of surrounding whitespace but is otherwise
// unresolved: it may be relative, protocol-relative ("//host/path") or
// absolute, so the caller is responsible for resolving it against the page URL.
//
// link is the URL of the page being analysed. It is not inspected here; it is
// only threaded through the recursion so the signature stays stable for
// callers. A nil node is a no-op.
func analyseNode(node *html.Node, link string, f func(string)) {
	if node == nil {
		return
	}

	if node.Type == html.ElementNode && node.Data == "a" {
		for _, attr := range node.Attr {
			if attr.Key != "href" {
				continue
			}
			href := strings.TrimSpace(attr.Val)
			switch {
			case href == "":
				continue
			case strings.HasPrefix(href, "#"):
				continue
			case strings.HasPrefix(strings.ToLower(href), "javascript:"):
				// Prefix match: the script body after the colon varies.
				continue
			}
			f(href)
		}
	}

	for child := node.FirstChild; child != nil; child = child.NextSibling {
		analyseNode(child, link, f)
	}
}
// httpClient is shared by every fetch so that connections can be reused. The
// timeout bounds the whole request (connect, headers and body) so a stalled
// server cannot block a crawler worker forever, which the timeout-less
// default client used by http.Get would allow.
var httpClient = &http.Client{Timeout: 30 * time.Second}

// requestPage downloads link, parses the response body as HTML and reports
// every link found on the page through f (see analyseNode for which hrefs are
// reported and in what form).
//
// Fetching is best-effort: if the request fails, the server answers with a
// status other than 200 OK, or the body cannot be parsed, the page is skipped
// silently and f is never called.
func requestPage(link string, f func(string)) {
	resp, err := httpClient.Get(link)
	if err != nil {
		return
	}
	defer resp.Body.Close()

	// Error pages are not part of the site's content; do not harvest them.
	if resp.StatusCode != http.StatusOK {
		return
	}

	doc, err := html.Parse(resp.Body)
	if err != nil {
		return
	}
	analyseNode(doc, link, f)
}
// resolveLink turns href, as found on the page whose URL is base, into an
// absolute URL string.
//
// Relative ("x.html", "../x"), root-relative ("/x") and protocol-relative
// ("//host/x") references are resolved against base. The fragment is dropped
// so that "page#a" and "page#b" are recognised as the same page. The second
// result is false when href cannot be parsed or does not resolve to an http or
// https URL (for example mailto: or javascript: links).
func resolveLink(base *url.URL, href string) (string, bool) {
	ref, err := url.Parse(strings.TrimSpace(href))
	if err != nil {
		return "", false
	}
	abs := base.ResolveReference(ref)
	if abs.Scheme != "http" && abs.Scheme != "https" {
		return "", false
	}
	abs.Fragment = ""
	return abs.String(), true
}

// crawl explores the link graph reachable from seed and returns the number of
// distinct URLs discovered (the seed itself is not counted).
//
// workers goroutines (at least one) call fetch concurrently; fetch must invoke
// its callback once per href found on the page, synchronously, before it
// returns. Every newly discovered absolute URL is passed exactly once to
// visit, together with its 1-based discovery number. visit is always called
// from the goroutine that called crawl, so it needs no locking of its own.
//
// Unlike a goroutine-per-URL design, the frontier is kept in a plain slice
// owned by the coordinating goroutine, so memory grows with the number of
// queued URLs rather than with the number of blocked goroutines. crawl
// returns once the queue is empty and no page is still being fetched.
func crawl(seed string, workers int, fetch func(link string, f func(string)), visit func(n uint64, link string)) uint64 {
	if workers < 1 {
		workers = 1
	}

	worklist := make(chan string)  // pages handed to workers
	results := make(chan []string) // one slice of absolute links per fetched page

	for w := 0; w < workers; w++ {
		go func() {
			for page := range worklist {
				var links []string
				if base, err := url.Parse(page); err == nil {
					fetch(page, func(href string) {
						if abs, ok := resolveLink(base, href); ok {
							links = append(links, abs)
						}
					})
				}
				// Always report back, even with no links, so the
				// coordinator can account for this page being done.
				results <- links
			}
		}()
	}

	seen := map[string]bool{seed: true}
	queue := []string{seed}
	pending := 0 // pages sent to workers whose result has not arrived yet
	var count uint64

	for len(queue) > 0 || pending > 0 {
		// A nil channel never becomes ready, which disables the send case
		// while the queue is empty.
		var send chan<- string
		var next string
		if len(queue) > 0 {
			send = worklist
			next = queue[0]
		}

		select {
		case send <- next:
			queue = queue[1:]
			pending++
		case links := <-results:
			pending--
			for _, link := range links {
				if seen[link] {
					continue
				}
				seen[link] = true
				count++
				visit(count, link)
				queue = append(queue, link)
			}
		}
	}

	close(worklist) // lets the idle workers exit
	return count
}

// main crawls https://www.51job.com with 20 concurrent workers, printing each
// newly discovered URL together with its running number, and exits once no
// unvisited links remain.
func main() {
	fmt.Println("main runing")
	crawl("https://www.51job.com", 20, requestPage, func(n uint64, link string) {
		fmt.Println(n, link)
	})
}
有待进一步优化