colly框架学习(二)

colly框架学习(二)

上一篇爬取的响应内容是 HTML,本篇改为爬取 XHR(Ajax)接口返回的 JSON 响应。

1、网站

  • https://spa1.scrape.center

2、回调

  • XHR 响应的格式为 JSON,因此在 OnResponse 回调中解析响应体

  • 列表xhr

    // Result is one movie entry of the list API response; the json tags name
    // the response keys each field is decoded from.
    type Result struct {
    	Id          int      `json:"id"`
    	Alias       string   `json:"alias"`
    	Categories  []string `json:"categories"`
    	Cover       string   `json:"cover"`
    	Minute      int      `json:"minute"`
    	PublishedAt string   `json:"published_at"`
    	Name        string   `json:"name"`
    	Regions     []string `json:"regions"`
    	Score       float32  `json:"score"`
    }
    // Data mirrors the JSON body of the list API response; its shape must match
    // the response structure.
    //
    // Results is a slice rather than a fixed-size array: encoding/json fills the
    // unused elements of an array with zero values (and drops extra elements),
    // so a page holding fewer than 10 movies would yield entries with Id 0.
    type Data struct {
    	Count   int      `json:"count"`
    	Results []Result `json:"results"`
    }
    
    // List-page callback: decode the JSON body and queue one detail request
    // per movie found in it.
    c.OnResponse(func(response *colly.Response) {
    	// Every registered OnResponse callback runs for every response, so filter
    	// by URL: this callback only handles URLs containing "limit".
    	pageURL := response.Request.URL.String()
    	if !strings.Contains(pageURL, "limit") {
    		return
    	}
    	fmt.Printf("parse list page, %s.......\n", pageURL)

    	// Decode the JSON body into the Data struct.
    	var data Data
    	if err := json.Unmarshal(response.Body, &data); err != nil {
    		fmt.Printf("json --> error:%s\n", err)
    		// Nothing usable was decoded, so there is nothing to queue.
    		return
    	}

    	for _, result := range data.Results {
    		// Skip entries without an id (e.g. zero-valued padding) instead of
    		// requesting /api/movie/0/.
    		// NOTE(review): assumes real movie ids are never 0 — confirm.
    		if result.Id == 0 {
    			continue
    		}
    		detailUrl := fmt.Sprintf("https://spa1.scrape.center/api/movie/%d/", result.Id)
    		if err := c.Visit(detailUrl); err != nil {
    			fmt.Printf("visit --> error:%s\n", err)
    		}
    	}
    })
    
  • 详情页xhr

    // Director is one element of the "directors" array in the detail response.
    type Director struct {
    	Name  string `json:"name"`
    	Image string `json:"image"`
    }
    
    // Actor is one element of the "actors" array in the detail response.
    type Actor struct {
    	Name  string `json:"name"`
    	Image string `json:"image"`
    	Role  string `json:"role"`
    }
    
    // Detail mirrors the JSON body of the movie detail API response; the json
    // tags name the response keys each field is decoded from.
    type Detail struct {
    	Id          int        `json:"id"`
    	Actors      []Actor    `json:"actors"`
    	Alias       string     `json:"alias"`
    	Categories  []string   `json:"categories"`
    	Cover       string     `json:"cover"`
    	Directors   []Director `json:"directors"`
    	Drama       string     `json:"drama"`
    	Minute      int        `json:"minute"`
    	PublishedAt string     `json:"published_at"`
    	Name        string     `json:"name"`
    	Photos      []string   `json:"photos"`
    	Rank        int        `json:"rank"`
    	Regions     []string   `json:"regions"`
    	Score       float32    `json:"score"`
    	UpdatedAt   string     `json:"updated_at"`
    }
    
    // Detail-page callback: decode the JSON body and print the movie's name,
    // categories and regions.
    c.OnResponse(func(response *colly.Response) {
    	// Every registered OnResponse callback runs for every response, so filter
    	// by URL: URLs containing "limit" are left to the list-page callback.
    	pageURL := response.Request.URL.String()
    	if strings.Contains(pageURL, "limit") {
    		return
    	}
    	fmt.Printf("parse detail page, %s.......\n", pageURL)

    	var detail Detail
    	if err := json.Unmarshal(response.Body, &detail); err != nil {
    		fmt.Printf("json2 --> error:%s\n", err)
    		// Nothing usable was decoded, so there is nothing to print.
    		return
    	}

    	// The values are printed directly: Go rejects locals that are declared
    	// but never used, so they cannot be left assigned-only.
    	fmt.Printf("电影名:%s\n", detail.Name)
    	fmt.Printf("类别:%s\n", strings.Join(detail.Categories, ","))
    	fmt.Printf("地区:%s\n", strings.Join(detail.Regions, ","))
    })
    

3、完整代码

package main

import (
	"encoding/json"
	"fmt"
	"github.com/gocolly/colly"
	"github.com/gocolly/colly/extensions"
	"strings"
)

// main crawls the movie list API of spa1.scrape.center, follows every movie
// to its detail API and prints the name, categories and regions of each one.
func main() {
	// Result is one movie entry of the list API response.
	type Result struct {
		Id          int      `json:"id"`
		Alias       string   `json:"alias"`
		Categories  []string `json:"categories"`
		Cover       string   `json:"cover"`
		Minute      int      `json:"minute"`
		PublishedAt string   `json:"published_at"`
		Name        string   `json:"name"`
		Regions     []string `json:"regions"`
		Score       float32  `json:"score"`
	}

	// Data mirrors the list API response. Results is a slice, not a fixed-size
	// array: encoding/json zero-fills the unused elements of an array, so a
	// short or empty page would otherwise produce entries with Id 0.
	type Data struct {
		Count   int      `json:"count"`
		Results []Result `json:"results"`
	}

	// Director is one element of the "directors" array in the detail response.
	type Director struct {
		Name  string `json:"name"`
		Image string `json:"image"`
	}

	// Actor is one element of the "actors" array in the detail response.
	type Actor struct {
		Name  string `json:"name"`
		Image string `json:"image"`
		Role  string `json:"role"`
	}

	// Detail mirrors the detail API response.
	type Detail struct {
		Id          int        `json:"id"`
		Actors      []Actor    `json:"actors"`
		Alias       string     `json:"alias"`
		Categories  []string   `json:"categories"`
		Cover       string     `json:"cover"`
		Directors   []Director `json:"directors"`
		Drama       string     `json:"drama"`
		Minute      int        `json:"minute"`
		PublishedAt string     `json:"published_at"`
		Name        string     `json:"name"`
		Photos      []string   `json:"photos"`
		Rank        int        `json:"rank"`
		Regions     []string   `json:"regions"`
		Score       float32    `json:"score"`
		UpdatedAt   string     `json:"updated_at"`
	}

	const (
		pageSize  = 10 // movies requested per list page
		pageCount = 11 // number of list pages requested
		// NOTE(review): the requested maximum number of concurrent requests;
		// the value is a guess — tune as needed.
		parallelism     = 4
		listURLFormat   = "https://spa1.scrape.center/api/movie/?limit=%d&offset=%d"
		detailURLFormat = "https://spa1.scrape.center/api/movie/%d/"
	)

	fmt.Println("start crawling...")

	c := colly.NewCollector(
		colly.AllowedDomains("spa1.scrape.center"),
		colly.Async(true),
	)
	extensions.RandomUserAgent(c)
	if err := c.Limit(&colly.LimitRule{DomainGlob: "*", Parallelism: parallelism}); err != nil {
		fmt.Printf("limit --> error:%s\n", err)
		return
	}

	// List-page callback. Every OnResponse callback runs for every response,
	// so each one filters by URL; this one handles URLs containing "limit".
	c.OnResponse(func(response *colly.Response) {
		pageURL := response.Request.URL.String()
		if !strings.Contains(pageURL, "limit") {
			return
		}
		fmt.Printf("parse list page, %s.......\n", pageURL)

		var data Data
		if err := json.Unmarshal(response.Body, &data); err != nil {
			fmt.Printf("json --> error:%s\n", err)
			return
		}
		for _, result := range data.Results {
			if err := c.Visit(fmt.Sprintf(detailURLFormat, result.Id)); err != nil {
				fmt.Printf("visit --> error:%s\n", err)
			}
		}
	})

	// Detail-page callback: handles every URL the list callback skips.
	c.OnResponse(func(response *colly.Response) {
		pageURL := response.Request.URL.String()
		if strings.Contains(pageURL, "limit") {
			return
		}
		fmt.Printf("parse detail page, %s.......\n", pageURL)

		var detail Detail
		if err := json.Unmarshal(response.Body, &detail); err != nil {
			fmt.Printf("json2 --> error:%s\n", err)
			return
		}
		// One Printf per movie keeps its three lines together when several
		// callbacks print concurrently.
		fmt.Printf("电影名:%s\n类别:%s\n地区:%s\n",
			detail.Name,
			strings.Join(detail.Categories, ","),
			strings.Join(detail.Regions, ","),
		)
	})

	c.OnRequest(func(request *colly.Request) {
		fmt.Println("Visiting", request.URL.String())
	})

	// In async mode Visit returns before the request completes, so failures
	// of the request itself are only reported here.
	c.OnError(func(response *colly.Response, err error) {
		fmt.Printf("request %s --> error:%s\n", response.Request.URL.String(), err)
	})

	// Queue every list page first, then wait once for all outstanding
	// requests, including the detail requests queued by the callbacks.
	for page := 0; page < pageCount; page++ {
		if err := c.Visit(fmt.Sprintf(listURLFormat, pageSize, page*pageSize)); err != nil {
			fmt.Printf("visit --> error:%s\n", err)
		}
	}
	c.Wait()
}
  • 13
    点赞
  • 5
    收藏
    觉得还不错? 一键收藏
  • 0
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值