// 思路:每生成一个url就用go func() 去执行,然后通过goquery 处理请求返回的html页面,把想要的数据清洗出来,插入mongo数据库保存。
// (Approach: spawn a goroutine per generated URL, parse the returned HTML with goquery, clean the wanted data, and store it in mongo.)
package main
import (
	"context"
	"fmt"
	"io"
	"log"
	"net/http"
	"strconv"
	"strings"
	"sync"
	"time"

	"github.com/PuerkitoBio/goquery"
	"go.mongodb.org/mongo-driver/bson/primitive"
	"go.mongodb.org/mongo-driver/mongo"
	"go.mongodb.org/mongo-driver/mongo/options"
)
// Package-level state shared by main and the crawler goroutines.
// NOTE(review): Mdata is appended to from concurrently running SpiderPage
// goroutines — confirm that access is synchronized somewhere.
var (
dberr error // mongo connection error, checked in main
client *mongo.Client // shared mongo client, created in main
db *mongo.Database // handle to the "go" database
collection *mongo.Collection // handle to the "moive" collection
id primitive.ObjectID // declared but not used in this file — TODO confirm it is needed
Mdata []interface{} // scraped Moive records, batch-inserted by main (Moive array)
)
// Moive (sic: "Movie") is one scraped film entry, persisted to the
// go.moive mongo collection.
type Moive struct {
Title string // film title, with spaces stripped by the parser
Url string // href of the film's detail page
Score float32 // rating parsed from the page's <b> tag (0 on parse failure)
}
// InsertRecord inserts a single Moive document and returns the generated
// ObjectID. On failure the error is logged and the zero ObjectID is
// returned, which callers can use as a failure signal.
//
// Bug fix: the original unconditionally overwrote the collect parameter,
// silently ignoring whatever collection the caller passed in. The caller's
// collection is now honored; the "go.moive" default is kept as a fallback
// when collect is nil, preserving the old default behavior.
func InsertRecord(client *mongo.Client, collect *mongo.Collection, moive Moive) (insertID primitive.ObjectID) {
	if collect == nil {
		collect = client.Database("go").Collection("moive")
	}
	insertRest, err := collect.InsertOne(context.TODO(), moive)
	if err != nil {
		fmt.Println(err)
		return // zero ObjectID signals failure
	}
	insertID = insertRest.InsertedID.(primitive.ObjectID)
	fmt.Println("插入Id:", insertID.Hex())
	return insertID
}
// SpiderOnePage performs an HTTP GET on url and returns the response body
// as a string.
//
// Fixes over the original: the 4 KiB chunk loop grew the result with
// string concatenation (quadratic in body size) — io.ReadAll does the same
// job in one linear pass; and non-EOF read errors were only printed, never
// returned — they are now propagated to the caller.
func SpiderOnePage(url string) (result string, err error) {
	resp, err := http.Get(url)
	if err != nil {
		fmt.Println("http.Get err = ", err)
		return "", err
	}
	defer resp.Body.Close()
	body, err := io.ReadAll(resp.Body)
	if err != nil {
		fmt.Println("resp.Body read err = ", err)
		return "", err
	}
	// NOTE(review): non-2xx responses are still returned as content,
	// matching the original behavior — confirm whether they should error.
	return string(body), nil
}
// mdataMu serializes appends to the shared Mdata slice: getBodyData is
// reached from multiple SpiderPage goroutines at once, and the original
// unsynchronized append was a data race.
var mdataMu sync.Mutex

// getBodyData parses the fetched HTML with goquery (jQuery-like selectors),
// extracts title/url/score from each div.text entry and appends a Moive
// record to the global Mdata slice.
//
// Fixes over the original: log.Fatalln killed the whole process on one bad
// page (now log-and-skip); the detail accumulator was re-printed in full on
// every iteration (now printed per item); the score parse error is reported.
func getBodyData(srcResult string) {
	dom, err := goquery.NewDocumentFromReader(strings.NewReader(srcResult))
	if err != nil {
		log.Println("goquery parse err = ", err)
		return
	}
	dom.Find("div[class=text]").Each(func(i int, sel *goquery.Selection) {
		link := sel.Find("a").First()
		title := strings.ReplaceAll(link.Text(), " ", "")
		url, _ := link.Attr("href")
		url = strings.ReplaceAll(url, " ", "")
		score := sel.Find("b").First().Text()
		fmt.Println("data = ", "title:"+title+",url:"+url+",score:"+score+"\r\n")
		fscore, err := strconv.ParseFloat(score, 32)
		if err != nil {
			fmt.Println("score parse err = ", err) // Score stays 0 on failure
		}
		moive := Moive{Title: title, Url: url, Score: float32(fscore)} // assemble record
		mdataMu.Lock()
		Mdata = append(Mdata, moive)
		mdataMu.Unlock()
	})
}
// GetResponse fetches the HTML for url and hands it to the goquery parser,
// which appends the scraped movies to the global Mdata slice.
//
// Bug fix: the original declared a named result but unconditionally
// returned "", discarding the fetched page; the page content is now
// returned so callers can actually use it.
func GetResponse(url string) (result string, err error) {
	srcResult, err := SpiderOnePage(url)
	if err != nil {
		return "", err
	}
	// Extract title/url/score from the page into Mdata.
	getBodyData(srcResult)
	return srcResult, nil
}
// SpiderPage builds the list-page URL for page i (a movie site, year 1908),
// crawls it, and signals completion on the page channel.
//
// Bug fix: the original returned early on error WITHOUT sending on the
// channel, which deadlocked StartSpider's receive loop forever. The
// completion signal is now sent unconditionally via defer.
func SpiderPage(i int, page chan int) {
	defer func() { page <- i }() // always signal, even on error
	url := "https://www.xxx.com/mdb/film/list/year-1908/o0d0p" + strconv.Itoa(i) + ".html"
	fmt.Println("url=", url)
	// Crawl the page; the scraped records land in the global Mdata.
	result, err := GetResponse(url)
	if err != nil {
		fmt.Println("StartSpider err=", err)
		return
	}
	fmt.Println("result=", result)
}
// StartSpider crawls every page in [startPage, endPage] concurrently and
// blocks until all of them have reported completion.
func StartSpider(startPage int, endPage int) {
	done := make(chan int)
	// Fan out: one goroutine per page index.
	for n := startPage; n <= endPage; n++ {
		go SpiderPage(n, done)
	}
	// Fan in: collect exactly one completion signal per page before
	// returning, so no goroutine is left blocked on the channel.
	for n := startPage; n <= endPage; n++ {
		fmt.Printf("第%d个网页爬取完成\n", <-done)
	}
}
// main connects to the local mongo instance, crawls pages 0..7 of the
// movie site concurrently, and batch-inserts the scraped records.
func main() {
	start := time.Now() // for the total-runtime report at the end

	// Connection context: give the dial at most 10s overall
	// (WithTimeout replaces the original WithCancel + time.AfterFunc).
	ctx, cancelFunc := context.WithTimeout(context.Background(), 10*time.Second)
	defer cancelFunc()

	// 1. Connect to mongo.
	if client, dberr = mongo.Connect(ctx, options.Client().ApplyURI("mongodb://localhost:27017").SetConnectTimeout(5*time.Second)); dberr != nil {
		fmt.Print(dberr)
		return
	}
	defer func() {
		// Release the connection pool on exit (the original leaked it).
		if err := client.Disconnect(context.TODO()); err != nil {
			fmt.Println("mongo disconnect err = ", err)
		}
	}()
	// 2. Select database and 3. collection.
	db = client.Database("go")
	collection = db.Collection("moive")

	startPage := 0
	endPage := 7
	fmt.Println(startPage)
	fmt.Println(endPage)
	StartSpider(startPage, endPage)
	fmt.Println("数据:", Mdata)
	fmt.Println("长度:", len(Mdata))

	// Batch insert; InsertMany errors on an empty document slice, so skip
	// the write when nothing was scraped. The original also ignored the
	// error and used Println with a %s verb (should be Printf).
	if len(Mdata) > 0 {
		ids, err := collection.InsertMany(context.TODO(), Mdata)
		if err != nil {
			fmt.Println("InsertMany err = ", err)
		} else {
			fmt.Printf("mongo存储消息流水:[%v]\n", ids.InsertedIDs)
		}
	}

	elapsed := time.Since(start)
	fmt.Println("该函数执行完成耗时:", elapsed)
}