scrapy动态请求解析

# -*- coding: utf-8 -*-
import scrapy
from selenium import webdriver
from urllib import parse
import time
import json
from scrapy import Request, FormRequest
from SumSpider.items import ZhihuUserItem


# start_url = "https://www.zhihu.com/api/v4/members/zhu-yun-bei-79?include=allow_message%2Cis_followed%2Cis_following%2Cis_org%2Cis_blocking%2Cemployments%2Canswer_count%2Cfollower_count%2Carticles_count%2Cgender%2Cbadge%5B%3F(type%3Dbest_answerer)%5D.topics"
# start_url = "https://www.zhihu.com/api/v4/members/excited-vczh/followees?include=data%5B*%5D.answer_count%2Carticles_count%2Cgender%2Cfollower_count%2Cis_followed%2Cis_following%2Cbadge%5B%3F(type%3Dbest_answerer)%5D.topics&offset=40&limit=20"
class ZhihuSpider(scrapy.Spider):
    """Crawl Zhihu user profiles by walking the followee/follower graph.

    The crawl starts from ``user``. For every profile fetched, the spider
    emits a ``ZhihuUserItem`` and then requests that user's followee and
    follower lists; every user found in those lists is fetched in turn.
    All endpoints are expected to return JSON.
    """

    name = 'zhihu'
    allowed_domains = ['www.zhihu.com']

    # url_token of the account the crawl starts from.
    user = "excited-vczh"

    # Size of the first page requested from the list endpoints.
    page_limit = 20

    # Profile endpoint for a single user (JSON).
    user_url = "https://www.zhihu.com/api/v4/members/{user}?include={include}"
    user_query = "allow_message%2Cis_followed%2Cis_following%2Cis_org%2Cis_blocking%2Cemployments%2Canswer_count%2Cfollower_count%2Carticles_count%2Cgender%2Cbadge%5B%3F(type%3Dbest_answerer)%5D.topics"

    # Paged list of the users that a given user follows.
    follows_url = "https://www.zhihu.com/api/v4/members/{user}/followees?include={include}&offset={offset}&limit={limit}"
    follows_query = "data%5B*%5D.answer_count%2Carticles_count%2Cgender%2Cfollower_count%2Cis_followed%2Cis_following%2Cbadge%5B%3F(type%3Dbest_answerer)%5D.topics"

    # Paged list of the users that follow a given user.
    followers_url = "https://www.zhihu.com/api/v4/members/{user}/followers?include={include}&offset={offset}&limit={limit}"
    followers_query = "data%5B*%5D.answer_count%2Carticles_count%2Cgender%2Cfollower_count%2Cis_followed%2Cis_following%2Cbadge%5B%3F(type%3Dbest_answerer)%5D.topics"

    def _user_request(self, url_token):
        """Build the profile request for ``url_token``."""
        url = self.user_url.format(user=url_token, include=self.user_query)
        return Request(url=url, callback=self.parse_user)

    def _follows_request(self, url_token):
        """Build the request for the first followee page of ``url_token``."""
        url = self.follows_url.format(
            user=url_token,
            include=self.follows_query,
            offset=0,
            limit=self.page_limit,
        )
        return Request(url=url, callback=self.parse_follows)

    def _followers_request(self, url_token):
        """Build the request for the first follower page of ``url_token``."""
        url = self.followers_url.format(
            user=url_token,
            include=self.followers_query,
            offset=0,
            limit=self.page_limit,
        )
        return Request(url=url, callback=self.parse_followers)

    def start_requests(self):
        """Seed the crawl with the start user's profile and both lists."""
        yield self._user_request(self.user)
        yield self._follows_request(self.user)
        yield self._followers_request(self.user)

    def parse_user(self, response):
        """Turn one profile response into an item and expand its lists.

        Yields a ``ZhihuUserItem`` filled with every item field that is
        present in the JSON body, followed by the requests for the user's
        followee and follower lists.
        """
        res_dict = json.loads(response.text)

        item = ZhihuUserItem()
        for field in item.fields:
            if field in res_dict:
                item[field] = res_dict[field]
        yield item

        # Without a token the list URLs would be built as ".../members/None/...",
        # so stop after emitting the item.
        url_token = res_dict.get("url_token")
        if not url_token:
            return
        yield self._follows_request(url_token)
        yield self._followers_request(url_token)

    def _parse_user_list(self, response, callback):
        """Shared handler for followee and follower list pages.

        Yields a profile request for each listed user that has a usable
        ``url_token`` and, unless the page is flagged as the last one, a
        request for the next page handled by ``callback``.
        """
        res_dict = json.loads(response.text)

        # "data" may be absent or null; treat both as an empty page.
        for entry in res_dict.get("data") or []:
            url_token = entry.get("url_token")
            # An empty token would produce a malformed profile URL.
            if url_token:
                yield self._user_request(url_token)

        paging = res_dict.get("paging") or {}
        next_page = paging.get("next")
        # Follow pagination only when the API explicitly says more pages
        # exist and actually supplies the next URL.
        if paging.get("is_end") is False and next_page:
            yield Request(url=next_page, callback=callback)

    def parse_follows(self, response):
        """Handle one page of the users someone follows."""
        return self._parse_user_list(response, self.parse_follows)

    def parse_followers(self, response):
        """Handle one page of the users following someone."""
        return self._parse_user_list(response, self.parse_followers)

转载于:https://www.cnblogs.com/lilied/p/8352857.html

  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值