一、爬取思路
1.利用上一篇完成登录,然后去抓取数据。
2.随便找一个用户作为入口,然后利用Chrome工具找到他关注用户的API接口网址和传递的参数。
3.获取API接口返回的JSON数据。然后对JSON数据进行处理。
4.从关注者数大于10000的用户中随机取一个,去抓取他关注的人。如此循环。
二、具体操作
1.随便找一个用户Crossin, 点击[关注了],查看他关注了哪些人。
https://www.zhihu.com/people/crossin/following
2.按F12,打开调试工具。点击network。找到请求API的URL和参数。
https://www.zhihu.com/api/v4/members/{user}/followees?include={include}&offset={offset}&limit={limit}
include:data[*].answer_count,articles_count,gender,follower_count,is_followed,is_following,badge[?(type=best_answerer)].topics
offset:20
limit:20
重点:把 authorization 的值保存到 headers 中,否则抓取不到数据,会报 401 错误。
点击preview,可以看到返回的JSON结构数据。点击下一页,可以看到后续分页的请求。
myspider.py
# -*- coding: utf-8 -*-
import scrapy
from scrapy.http import Request, FormRequest
from zhihu.items import ZhihuItem
import time
from PIL import Image
import json
import requests
from random import choice
class MyspiderSpider(scrapy.Spider):
    """Zhihu followee crawler.

    Logs in with a phone number + captcha, then walks the followee graph:
    starting from ``start_user`` it fetches each followee page of the v4 API,
    yields one ``ZhihuItem`` per followee, and — once the current user's list
    is exhausted — hops to a random previously-seen user with more than
    10000 followers and repeats.
    """
    name = 'myspider'
    allowed_domains = ['zhihu.com']
    start_urls = ['https://www.zhihu.com/']

    # Request headers. The `authorization` bearer token is mandatory for the
    # v4 API — without it every call fails with HTTP 401.
    # NOTE(review): the token below is account-specific and expires; refresh
    # it before running.
    headers_zhihu = {
        # Fixed: the original 'Host' and 'Referer' values carried a trailing
        # space ('www.zhihu.com '), producing invalid header values.
        'Host': 'www.zhihu.com',
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/59.0.3071.115 Safari/537.36',
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
        'Accept-Language': 'zh-CN,zh;q=0.8,en-US;q=0.5,en;q=0.3',
        'Accept-Encoding': 'gzip,deflate,sdch',
        'Referer': 'https://www.zhihu.com',
        'If-None-Match': "FpeHbcRb4rpt_GuDL6-34nrLgGKd.gz",
        'Cache-Control': 'max-age=0',
        'Connection': 'keep-alive',
        'authorization': 'Bearer Mi4wQUdCQzBlWkZEUXdBTU1MYmVyUVBEQmNBQUFCaEFsVk4tWXVlV1FCR3pSbE1WbVpQeU5OODdrTUNlM21jZ2ZTUnBB|1500970746|f8c1997fd0e539beec76bcbb15cffd29971d7c05',
    }

    # Followee-list API endpoint and the `include` fields the API should return.
    follows_url = 'https://www.zhihu.com/api/v4/members/{user}/followees?include={include}&offset={offset}&limit={limit}'
    follows_query = 'data[*].answer_count,articles_count,gender,follower_count,is_followed,is_following,badge[?(type=best_answerer)].topics'
    start_user = 'crossin'
    # url_tokens of followees with more than 10000 followers — candidates
    # for the next crawl hop.
    user_token = []

    def start_requests(self):
        """Open the home page first to obtain the _xsrf login token."""
        return [Request("https://www.zhihu.com/",
                        meta={'cookiejar': 1},
                        headers=self.headers_zhihu,
                        callback=self.captcha)]

    def captcha(self, response):
        """Extract _xsrf from the home page and request a login captcha."""
        # Fixed: extract()[0] raised a bare IndexError when the hidden input
        # is missing; fail with an explicit message instead.
        xsrf = response.xpath('//input[@name="_xsrf"]/@value').extract_first()
        if xsrf is None:
            raise ValueError('_xsrf token not found on the login page')
        t = str(int(time.time() * 1000))
        captcha_url = 'https://www.zhihu.com/captcha.gif?r=' + t + '&type=login&lang=en'
        return [Request(captcha_url,
                        callback=self.parser_captcha,
                        meta={'cookiejar': response.meta['cookiejar'], 'xsrf': xsrf})]

    def parser_captcha(self, response):
        """Show the captcha image, read the user's answer, submit the login form."""
        # The `with` block closes the file; the original's extra f.close()
        # inside it was redundant.
        with open('captcha.jpg', 'wb') as f:
            f.write(response.body)
        im = Image.open('captcha.jpg')
        im.show()
        im.close()
        # Fixed: raw_input() is Python 2 only, while the rest of this file
        # already uses print() calls — standardise on Python 3's input().
        captcha = input("请输入验证码:")
        xsrf = response.meta['xsrf']
        return FormRequest('https://www.zhihu.com/login/phone_num',
                           method='POST',
                           meta={'cookiejar': response.meta['cookiejar']},
                           callback=self.after_login,
                           dont_filter=True,
                           headers=self.headers_zhihu,
                           formdata={
                               'phone_num': '138*********',
                               'password': '*******',
                               '_xsrf': xsrf,
                               'captcha_type': 'en',
                               'captcha': captcha,
                           })

    def after_login(self, response):
        """Check the login result JSON; on success start crawling the seed user."""
        json_file = json.loads(response.text)  # login result JSON
        if json_file['r'] == 0:
            print('登录成功.....开始爬了。。。。')
            yield Request(self.follows_url.format(user=self.start_user,
                                                  include=self.follows_query,
                                                  offset=0, limit=20),
                          callback=self.parse_follows,
                          meta={'cookiejar': response.meta['cookiejar']},
                          headers=self.headers_zhihu)
        else:
            # Fixed: `print json_file[...]` was a Python 2 print statement —
            # a SyntaxError under Python 3, which this file otherwise targets.
            print(json_file['msg'])

    def parse_follows(self, response):
        """Yield one ZhihuItem per followee; paginate, then hop to a new user."""
        results = json.loads(response.text)
        if 'data' in results:
            for result in results.get('data'):  # walk the structured JSON rows
                # Fixed: the original built a single ZhihuItem before the loop
                # and mutated it on every iteration; create a fresh item per
                # followee so yielded items are independent.
                item = ZhihuItem()
                # Fixed: .encode('utf-8') crashed when 'name' is absent and
                # stored bytes instead of text; keep the plain string.
                item['name'] = result.get('name', '')
                url_token = result.get('url_token')
                item['url_token'] = 'https://www.zhihu.com/people/' + url_token
                item['answer_count'] = result.get('answer_count')
                item['articles_count'] = result.get('articles_count')
                # Fixed: a missing follower_count yielded None, which raises
                # TypeError on the `> 10000` comparison under Python 3.
                follower_count = result.get('follower_count') or 0
                item['follower_count'] = follower_count
                # gender 0 means female; anything else (including missing)
                # falls back to male, as in the original.
                item['gender'] = '女' if result.get('gender') == 0 else '男'
                if follower_count > 10000:
                    # Remember well-followed users as future entry points.
                    self.user_token.append(url_token)
                yield item
        paging = results.get('paging')
        if paging and paging.get('is_end') == False:
            # Not the last page yet — keep crawling this user via the
            # ready-made `next` URL.
            yield Request(paging.get('next'),
                          callback=self.parse_follows,
                          meta={'cookiejar': response.meta['cookiejar']},
                          headers=self.headers_zhihu)
        elif self.user_token:
            # Fixed: choice() on an empty list raised IndexError; only hop
            # when at least one candidate has been collected.
            start_user = choice(self.user_token)
            yield Request(self.follows_url.format(user=start_user,
                                                  include=self.follows_query,
                                                  offset=0, limit=20),
                          callback=self.parse_follows,
                          meta={'cookiejar': response.meta['cookiejar']},
                          headers=self.headers_zhihu)
items.py
import scrapy


class ZhihuItem(scrapy.Item):
    """Container for one crawled Zhihu user profile.

    One scrapy.Field is declared per column that the Excel pipeline exports.
    """
    name = scrapy.Field()            # display name
    gender = scrapy.Field()          # '男' / '女'
    answer_count = scrapy.Field()    # number of answers written
    articles_count = scrapy.Field()  # number of articles written
    follower_count = scrapy.Field()  # number of followers
    url_token = scrapy.Field()       # full profile URL
pipelines.py 保存为excel格式。
# -*- coding: utf-8 -*-
import time
from openpyxl import Workbook
class ZhihuPipeline(object):
    """Write crawled Zhihu users into a dated .xlsx workbook."""

    def __init__(self):
        self.wb = Workbook()
        self.ws = self.wb.active
        # Header row matching the columns appended in process_item().
        self.ws.append(['姓名', '性别', '回答', '文章', '关注者', '网址'])
        # Fixed: the original recomputed the filename on every item, so a
        # crawl crossing midnight silently split its output into two files.
        # Compute the dated path once, at startup.
        now = time.strftime('%Y-%m-%d', time.localtime())
        self.filename = '/home/soft/zhihu/' + now + '.xlsx'

    def process_item(self, item, spider):
        """Append one item as a worksheet row and pass it through."""
        self.ws.append([item['name'], item['gender'], item['answer_count'],
                        item['articles_count'], item['follower_count'],
                        item['url_token']])
        return item

    def close_spider(self, spider):
        """Save the workbook once when the spider finishes.

        Fixed: the original rewrote the whole .xlsx file after every single
        item (O(n^2) disk I/O); saving once at close is the standard Scrapy
        pipeline pattern.
        """
        self.wb.save(self.filename)
三、代码运行
[root@master zhihu]# scrapy crawl myspider
代码运行过程。
excel保存内容。
四、触类旁通
下面是关注者和问答的API接口URL和参数