colly框架学习（二）

最新推荐文章于 2024-07-20 08:27:55 发布

凯风寒泉

最新推荐文章于 2024-07-20 08:27:55 发布

阅读量664

点赞数 13

文章标签：学习爬虫 golang

本文链接：https://blog.csdn.net/weixin_43829589/article/details/139423323

版权

colly框架学习（二）

上一篇爬取的响应内容是 HTML，本篇爬取 XHR（Ajax）接口返回的响应。

1、网站

https://spa1.scrape.center

2、回调

XHR 响应的格式为 JSON，使用 OnResponse 回调进行解析。

列表xhr

// Result is one movie entry inside a list-page response.
type Result struct {
	Id          int      `json:"id"`
	Alias       string   `json:"alias"`
	Categories  []string `json:"categories"`
	Cover       string   `json:"cover"`
	Minute      int      `json:"minute"`
	PublishedAt string   `json:"published_at"`
	Name        string   `json:"name"`
	Regions     []string `json:"regions"`
	Score       float32  `json:"score"`
}

// Data mirrors the JSON body of a list-page response; the field tags must
// stay in sync with the keys of the response.
type Data struct {
	Count int `json:"count"`
	// Results is a slice, not a fixed-size array: decoding into an array
	// zero-fills every slot the response does not provide, so a page with
	// fewer entries would yield bogus results whose Id is 0.
	Results []Result `json:"results"`
}

// List-page handler: decodes the JSON body and queues one detail request per
// movie in the page.
c.OnResponse(func(response *colly.Response) {
	// Every registered OnResponse callback runs for every response, so
	// filter on the URL: only list requests carry the "limit" parameter.
	pageURL := response.Request.URL.String()
	if !strings.Contains(pageURL, "limit") {
		return
	}
	fmt.Printf("parse list page, %s.......\n", pageURL)

	// Decode the JSON body into the struct.
	var data Data
	if err := json.Unmarshal(response.Body, &data); err != nil {
		fmt.Printf("json --> error:%s\n", err)
		// Nothing usable was decoded; do not queue requests from zero values.
		return
	}

	for _, result := range data.Results {
		detailUrl := fmt.Sprintf("https://spa1.scrape.center/api/movie/%d/", result.Id)
		// Visit reports an error instead of queuing the request in some
		// cases; surface it instead of dropping it silently.
		if err := c.Visit(detailUrl); err != nil {
			fmt.Printf("visit %s --> error:%s\n", detailUrl, err)
		}
	}
})

详情页xhr

// Director is one entry of the "directors" array in a detail response.
type Director struct {
	Name  string `json:"name"`
	Image string `json:"image"`
}

// Actor is one entry of the "actors" array in a detail response.
type Actor struct {
	Name  string `json:"name"`
	Image string `json:"image"`
	Role  string `json:"role"`
}

// Detail mirrors the JSON body of a movie detail response; each field tag
// names the response key it is decoded from.
type Detail struct {
	Id          int        `json:"id"`
	Actors      []Actor    `json:"actors"`
	Alias       string     `json:"alias"`
	Categories  []string   `json:"categories"`
	Cover       string     `json:"cover"`
	Directors   []Director `json:"directors"`
	Drama       string     `json:"drama"`
	Minute      int        `json:"minute"`
	PublishedAt string     `json:"published_at"`
	Name        string     `json:"name"`
	Photos      []string   `json:"photos"`
	Rank        int        `json:"rank"`
	Regions     []string   `json:"regions"`
	Score       float32    `json:"score"`
	UpdatedAt   string     `json:"updated_at"`
}

// Detail-page handler: decodes the JSON body and prints the movie's name,
// categories and regions.
c.OnResponse(func(response *colly.Response) {
	// Every registered OnResponse callback runs for every response, so
	// filter on the URL: list requests carry "limit", detail requests do not.
	pageURL := response.Request.URL.String()
	if strings.Contains(pageURL, "limit") {
		return
	}
	fmt.Printf("parse detail page, %s.......\n", pageURL)

	var detail Detail
	if err := json.Unmarshal(response.Body, &detail); err != nil {
		fmt.Printf("json2 --> error:%s\n", err)
		// Nothing usable was decoded; skip printing empty fields.
		return
	}

	// The values must actually be used: Go rejects locals that are declared
	// but never read, so the prints cannot be commented out on their own.
	movieName := detail.Name
	fmt.Printf("电影名：%s\n", movieName)
	categories := strings.Join(detail.Categories, ",")
	fmt.Printf("类别：%s\n", categories)
	location := strings.Join(detail.Regions, ",")
	fmt.Printf("地区：%s\n", location)
})

3、完整代码

package main

import (
	"encoding/json"
	"fmt"
	"github.com/gocolly/colly"
	"github.com/gocolly/colly/extensions"
	"strings"
)

// main crawls the movie list API of spa1.scrape.center page by page and, for
// every movie in a page, requests the detail API and prints the movie's name,
// categories and regions.
func main() {
	// Result is one movie entry inside a list-page response.
	type Result struct {
		Id          int      `json:"id"`
		Alias       string   `json:"alias"`
		Categories  []string `json:"categories"`
		Cover       string   `json:"cover"`
		Minute      int      `json:"minute"`
		PublishedAt string   `json:"published_at"`
		Name        string   `json:"name"`
		Regions     []string `json:"regions"`
		Score       float32  `json:"score"`
	}

	// Data mirrors the JSON body of a list-page response.
	type Data struct {
		Count int `json:"count"`
		// Results is a slice, not a fixed-size array: decoding into an array
		// zero-fills every slot the response does not provide, which would
		// make the list handler request /api/movie/0/ for short pages.
		Results []Result `json:"results"`
	}

	// Director is one entry of the "directors" array in a detail response.
	type Director struct {
		Name  string `json:"name"`
		Image string `json:"image"`
	}

	// Actor is one entry of the "actors" array in a detail response.
	type Actor struct {
		Name  string `json:"name"`
		Image string `json:"image"`
		Role  string `json:"role"`
	}

	// Detail mirrors the JSON body of a movie detail response.
	type Detail struct {
		Id          int        `json:"id"`
		Actors      []Actor    `json:"actors"`
		Alias       string     `json:"alias"`
		Categories  []string   `json:"categories"`
		Cover       string     `json:"cover"`
		Directors   []Director `json:"directors"`
		Drama       string     `json:"drama"`
		Minute      int        `json:"minute"`
		PublishedAt string     `json:"published_at"`
		Name        string     `json:"name"`
		Photos      []string   `json:"photos"`
		Rank        int        `json:"rank"`
		Regions     []string   `json:"regions"`
		Score       float32    `json:"score"`
		UpdatedAt   string     `json:"updated_at"`
	}

	fmt.Println("start crawling...")

	c := colly.NewCollector(
		colly.AllowedDomains("spa1.scrape.center"),
		colly.Async(true),
	)
	extensions.RandomUserAgent(c)

	// List-page handler. Every OnResponse callback runs for every response,
	// so each handler filters on the URL: only list requests carry "limit".
	c.OnResponse(func(response *colly.Response) {
		pageURL := response.Request.URL.String()
		if !strings.Contains(pageURL, "limit") {
			return
		}
		fmt.Printf("parse list page, %s.......\n", pageURL)

		var data Data
		if err := json.Unmarshal(response.Body, &data); err != nil {
			fmt.Printf("json --> error:%s\n", err)
			// Nothing usable was decoded; do not queue requests from zero values.
			return
		}

		for _, result := range data.Results {
			detailUrl := fmt.Sprintf("https://spa1.scrape.center/api/movie/%d/", result.Id)
			if err := c.Visit(detailUrl); err != nil {
				fmt.Printf("visit %s --> error:%s\n", detailUrl, err)
			}
		}
	})

	// Detail-page handler: everything that is not a list request.
	c.OnResponse(func(response *colly.Response) {
		pageURL := response.Request.URL.String()
		if strings.Contains(pageURL, "limit") {
			return
		}
		fmt.Printf("parse detail page, %s.......\n", pageURL)

		var detail Detail
		if err := json.Unmarshal(response.Body, &detail); err != nil {
			fmt.Printf("json2 --> error:%s\n", err)
			// Nothing usable was decoded; skip printing empty fields.
			return
		}

		fmt.Printf("电影名：%s\n", detail.Name)
		fmt.Printf("类别：%s\n", strings.Join(detail.Categories, ","))
		fmt.Printf("地区：%s\n", strings.Join(detail.Regions, ","))
	})

	c.OnRequest(func(request *colly.Request) {
		fmt.Println("Visiting", request.URL.String())
	})

	for i := 0; i < 11; i++ {
		listURL := fmt.Sprintf("https://spa1.scrape.center/api/movie/?limit=10&offset=%d", i*10)
		if err := c.Visit(listURL); err != nil {
			fmt.Printf("visit %s --> error:%s\n", listURL, err)
		}
		// The collector is asynchronous; wait for this page's requests to
		// finish before requesting the next list page.
		c.Wait()
	}
}