我这里只写了爬取豆瓣 Top250 电影名称的代码。
如果还要爬取其他标签的内容,可以自行修改代码中的选择器。
goquery 官方文档:
https://pkg.go.dev/github.com/PuerkitoBio/goquery#pkg-examples
建议先阅读官方文档,理解各个函数的用法。
package main
import (
"fmt"
"net/http"
"strconv"
"github.com/PuerkitoBio/goquery"
)
// fetch downloads url with the default HTTP client and parses the response
// body as HTML into a goquery document.
//
// It is a standalone parsing example; the original author notes that it may
// be deleted.
//
// It returns an error when the request fails, when the server answers with
// anything other than 200 OK, or when the body cannot be parsed. The returned
// document is nil whenever the error is non-nil.
func fetch(url string) (*goquery.Document, error) {
	resp, err := http.Get(url)
	if err != nil {
		return nil, fmt.Errorf("fetch %q: %w", url, err)
	}
	defer resp.Body.Close()

	// Reject non-200 answers so that an error or block page is never parsed
	// as if it were the requested content.
	if resp.StatusCode != http.StatusOK {
		return nil, fmt.Errorf("fetch %q: unexpected status %s", url, resp.Status)
	}

	// Parse the page content with goquery.
	doc, err := goquery.NewDocumentFromReader(resp.Body)
	if err != nil {
		return nil, fmt.Errorf("fetch %q: parsing HTML: %w", url, err)
	}
	return doc, nil
}
// spider crawls the Douban Top 250 movie list and prints every movie title
// to standard output, numbered from 1.
//
// The list is paginated through the "start" query parameter in steps of 25,
// so the offsets 0, 25, ..., 225 are requested in turn. A page that fails is
// reported and skipped; the remaining pages are still crawled.
func spider() {
	const (
		pageSize = 25  // entries per page; also the step of the "start" offset
		total    = 250 // number of entries in the whole list
	)

	// A single client is shared by all page requests.
	// NOTE(review): the client has no timeout; consider setting
	// http.Client.Timeout so a stalled connection cannot hang the crawl.
	client := &http.Client{}

	for start := 0; start < total; start += pageSize {
		crawlPage(client, start, pageSize)
	}
}

// crawlPage requests the list page that begins at offset start and prints the
// titles of its pageSize entries, one per line, as "name<rank>:<title>".
//
// Each page is handled in its own call so that the deferred Body.Close runs
// as soon as the page is finished instead of piling up until the whole crawl
// ends. Any failure is printed and ends the page early, so a nil request,
// response or document is never dereferenced.
func crawlPage(client *http.Client, start, pageSize int) {
	pageURL := "https://movie.douban.com/top250?start=" + strconv.Itoa(start) + "&filter="
	req, err := http.NewRequest("GET", pageURL, nil)
	if err != nil {
		fmt.Println("req err", err)
		return
	}

	// Browser-like headers, so the request is less likely to be identified
	// as coming from a crawler.
	//
	// Accept-Encoding is deliberately not set: when the caller sets it, the
	// Go transport stops decompressing the response automatically and the
	// body arrives as compressed, unreadable bytes.
	headers := []struct{ name, value string }{
		{"Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7"},
		{"Accept-Language", "zh-CN,zh;q=0.9,en;q=0.8,en-GB;q=0.7,en-US;q=0.6"},
		{"Cache-Control", "max-age=0"},
		{"Connection", "keep-alive"},
		{"User-Agent", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36 Edg/120.0.0.0"},
		{"Sec-Ch-Ua-Mobile", "?0"},
		{"Sec-Fetch-Dest", "document"},
		{"Sec-Fetch-Mode", "navigate"},
		{"Sec-Fetch-Site", "none"},
		{"Sec-Fetch-User", "?1"},
		{"Upgrade-Insecure-Requests", "1"},
	}
	for _, h := range headers {
		req.Header.Set(h.name, h.value)
	}

	resp, err := client.Do(req)
	if err != nil {
		fmt.Println("请求失败", err)
		return
	}
	defer resp.Body.Close()

	// Anything other than 200 OK is not the list page; do not parse it.
	if resp.StatusCode != http.StatusOK {
		fmt.Println("请求失败", resp.Status)
		return
	}

	doc, err := goquery.NewDocumentFromReader(resp.Body)
	if err != nil {
		fmt.Println("解析失败", err)
		return
	}

	// nth-child is 1-based, so j runs from 1 to pageSize; start+j is the
	// entry's overall rank in the list.
	for j := 1; j <= pageSize; j++ {
		selector := "#content > div > div.article > ol > li:nth-child( " + strconv.Itoa(j) + ") > div > div.info > div.hd > a > span:nth-child(1)"
		title := doc.Find(selector).Text()
		fmt.Printf("name%d:%s \n", start+j, title)
	}
}
// main is the program entry point: it runs the crawler once and returns when
// spider has finished.
func main() {
	spider()
}
VS Code 运行结果截图如下: