colly框架学习(二)
上一篇处理的响应内容是 HTML,本篇爬取 XHR 接口返回的 JSON 响应。
1、网站
- https://spa1.scrape.center
2、回调
- XHR 响应格式为 JSON,对应的解析回调是 OnResponse
- 列表 XHR
type Result struct { Id int `json:"id"` Alias string `json:"alias"` Categories []string `json:"categories"` Cover string `json:"cover"` Minute int `json:"minute"` PublishedAt string `json:"published_at"` Name string `json:"name"` Regions []string `json:"regions"` Score float32 `json:"score"` } // 定义数据结构体 需要和响应结构保持一致 type Data struct { Count int `json:"count"` Results [10]Result `json:"results"` } c.OnResponse(func(response *colly.Response) { // 这里需要对url过滤,不然所有的url都会调用 if strings.Contains(response.Request.URL.String(), "limit"){ //bodyString := string(response.Body) fmt.Printf("parse list page, %s.......\n", response.Request.URL.String()) // 将json专为结构体 var data Data if err := json.Unmarshal(response.Body, &data); err != nil { fmt.Printf("json --> error:%s\n", err) } //fmt.Println("数据:", data.Results) for _, result := range data.Results { id := result.Id detailUrl := fmt.Sprintf("https://spa1.scrape.center/api/movie/%d/", id) c.Visit(detailUrl) } } })
- 详情页 XHR
type Director struct { Name string `json:"name"` Image string `json:"image"` } type Actor struct { Name string `json:"name"` Image string `json:"image"` Role string `json:"role"` } type Detail struct { Id int `json:"id"` Actors []Actor `json:"actors"` Alias string `json:"alias"` Categories []string `json:"categories"` Cover string `json:"cover"` Directors []Director `json:"directors"` Drama string `json:"drama"` Minute int `json:"minute"` PublishedAt string `json:"published_at"` Name string `json:"name"` Photos []string `json:"photos"` Rank int `json:"rank"` Regions []string `json:"regions"` Score float32 `json:"score"` UpdatedAt string `json:"updated_at"` } c.OnResponse(func(response *colly.Response) { // 这里需要对url过滤,不然所有的url都会调用 if !strings.Contains(response.Request.URL.String(), "limit") { fmt.Printf("parse detail page, %s.......\n", response.Request.URL.String()) var detail Detail if err := json.Unmarshal(response.Body, &detail); err != nil { fmt.Printf("json2 --> error:%s\n", err) } movieName := detail.Name //fmt.Printf("电影名:%s\n", movieName) categories := strings.Join(detail.Categories, ",") //fmt.Printf("类别:%s\n", categories) location := strings.Join(detail.Regions, ",") //fmt.Printf("地区:%s\n", location) } })
3、完整代码
package main
import (
"encoding/json"
"fmt"
"github.com/gocolly/colly"
"github.com/gocolly/colly/extensions"
"strings"
)
// main crawls the movie JSON API on spa1.scrape.center with colly. It requests
// the paginated list endpoint, queues a detail request for every movie id
// found in each list response, and prints the name, categories and regions of
// every movie whose detail response decodes successfully.
func main() {
	// Result is one movie entry in a list-page response.
	type Result struct {
		ID          int      `json:"id"`
		Alias       string   `json:"alias"`
		Categories  []string `json:"categories"`
		Cover       string   `json:"cover"`
		Minute      int      `json:"minute"`
		PublishedAt string   `json:"published_at"`
		Name        string   `json:"name"`
		Regions     []string `json:"regions"`
		Score       float32  `json:"score"`
	}
	// Data mirrors the JSON body of a list-page response.
	type Data struct {
		Count int `json:"count"`
		// Results is a slice rather than a fixed-size array: encoding/json
		// zero-fills the unused elements of an array, so a short or empty
		// page would otherwise yield phantom entries with id 0.
		Results []Result `json:"results"`
	}
	// Director is one entry of the "directors" array in a detail response.
	type Director struct {
		Name  string `json:"name"`
		Image string `json:"image"`
	}
	// Actor is one entry of the "actors" array in a detail response.
	type Actor struct {
		Name  string `json:"name"`
		Image string `json:"image"`
		Role  string `json:"role"`
	}
	// Detail mirrors the JSON body of a movie detail response.
	type Detail struct {
		ID          int        `json:"id"`
		Actors      []Actor    `json:"actors"`
		Alias       string     `json:"alias"`
		Categories  []string   `json:"categories"`
		Cover       string     `json:"cover"`
		Directors   []Director `json:"directors"`
		Drama       string     `json:"drama"`
		Minute      int        `json:"minute"`
		PublishedAt string     `json:"published_at"`
		Name        string     `json:"name"`
		Photos      []string   `json:"photos"`
		Rank        int        `json:"rank"`
		Regions     []string   `json:"regions"`
		Score       float32    `json:"score"`
		UpdatedAt   string     `json:"updated_at"`
	}

	const (
		pageSize  = 10 // value of the "limit" query parameter
		pageCount = 11 // number of list pages requested
	)

	fmt.Println("start crawling...")

	c := colly.NewCollector(
		func(collector *colly.Collector) {
			collector.Async = true
			extensions.RandomUserAgent(collector)
		},
		colly.AllowedDomains("spa1.scrape.center"),
	)

	// List-page callback. Every OnResponse callback runs for every response,
	// so each one filters by URL: only list requests carry "limit".
	c.OnResponse(func(response *colly.Response) {
		pageURL := response.Request.URL.String()
		if !strings.Contains(pageURL, "limit") {
			return
		}
		fmt.Printf("parse list page, %s.......\n", pageURL)

		var data Data
		if err := json.Unmarshal(response.Body, &data); err != nil {
			fmt.Printf("json --> error:%s\n", err)
			return // nothing usable was decoded
		}

		for _, result := range data.Results {
			detailURL := fmt.Sprintf("https://spa1.scrape.center/api/movie/%d/", result.ID)
			if err := c.Visit(detailURL); err != nil {
				fmt.Printf("visit %s --> error:%s\n", detailURL, err)
			}
		}
	})

	// Detail-page callback: handles every response that is not a list page.
	c.OnResponse(func(response *colly.Response) {
		pageURL := response.Request.URL.String()
		if strings.Contains(pageURL, "limit") {
			return
		}
		fmt.Printf("parse detail page, %s.......\n", pageURL)

		var detail Detail
		if err := json.Unmarshal(response.Body, &detail); err != nil {
			fmt.Printf("json2 --> error:%s\n", err)
			return // do not print fields of a detail that failed to decode
		}

		fmt.Printf("电影名:%s\n", detail.Name)
		fmt.Printf("类别:%s\n", strings.Join(detail.Categories, ","))
		fmt.Printf("地区:%s\n", strings.Join(detail.Regions, ","))
	})

	c.OnRequest(func(request *colly.Request) {
		fmt.Println("Visiting", request.URL.String())
	})

	for page := 0; page < pageCount; page++ {
		listURL := fmt.Sprintf("https://spa1.scrape.center/api/movie/?limit=%d&offset=%d", pageSize, page*pageSize)
		if err := c.Visit(listURL); err != nil {
			fmt.Printf("visit %s --> error:%s\n", listURL, err)
		}
		// The collector is asynchronous; waiting here lets this page and the
		// detail requests it queued finish before the next page is requested.
		c.Wait()
	}
}