This post builds on http://cuiqingcai.com/4380.html, with some additions of my own.
With the crawl rate throttled, the spider ran for roughly two days and collected profile data on about a million Zhihu users.
Zhihu's API is very friendly: the endpoints return clean JSON directly.
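For instance, fetching the members endpoint that the spider below uses shows the shape of that JSON. A minimal sketch for eyeballing the payload, assuming the requests library and a browser-like User-Agent (Zhihu tends to reject clients without one); the trimmed include list here is an assumption based on the full query used later:

import json

import requests

# Fetch one user's profile and pretty-print the raw JSON payload.
# The include list is a trimmed-down assumption; the spider below
# requests many more fields.
url = ('https://www.zhihu.com/api/v4/members/excited-vczh'
       '?include=answer_count,follower_count,following_count')
resp = requests.get(url, headers={'User-Agent': 'Mozilla/5.0'})
print(json.dumps(resp.json(), ensure_ascii=False, indent=2))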
Compared with the referenced article, I reworked the item definition and extraction. For the last few fields, only the nested name values in the response are really useful, so I pulled them out with a string of if branches, like this:
from scrapy import Item, Field


class UserItem(Item):
    # define the fields for your item here like:
    id = Field()                         # user uid
    url_token = Field()                  # user handle, used in the API URLs
    name = Field()                       # nickname
    gender = Field()                     # gender (bool)
    headline = Field()                   # one-line bio
    description = Field()                # self-introduction
    avatar_url = Field()                 # avatar image
    cover_url = Field()                  # cover image
    type = Field()                       # personal or organization account
    badge = Field()                      # badges
    answer_count = Field()               # number of answers
    articles_count = Field()             # number of articles
    commercial_question_count = Field()
    columns_count = Field()              # number of columns
    favorite_count = Field()             # user's own collections
    favorited_count = Field()            # times collected by others
    follower_count = Field()             # followers
    following_columns_count = Field()    # columns followed
    following_count = Field()            # users followed
    pins_count = Field()
    question_count = Field()
    thank_from_count = Field()
    thank_to_count = Field()
    thanked_count = Field()
    vote_from_count = Field()
    vote_to_count = Field()
    voteup_count = Field()
    following_favlists_count = Field()
    following_question_count = Field()
    following_topic_count = Field()
    marked_answers_count = Field()
    mutual_followees_count = Field()
    hosted_live_count = Field()
    participated_live_count = Field()
    locations_name = Field()             # location names, flattened
    business_name = Field()              # industry name, flattened
    educations_school_major = Field()    # school/major pairs, flattened
    employments_company_job = Field()    # company/job pairs, flattened
The spider code:
from scrapy import Spider, Request
import json
import time

import pymongo

from zhihu.items import UserItem


class Mongo(object):
    # Thin lookup helper; url_token is indexed (see note 1 below), so
    # existence checks stay fast even with hundreds of thousands of rows.
    client = pymongo.MongoClient('localhost')
    db = client['zhihu']
    tb = db['users']

    def isexist(self, url_token):
        return self.tb.find_one({'url_token': str(url_token)}) is not None


class ZhihuSpider(Spider):
    name = 'zhihu'
    allowed_domains = ['www.zhihu.com']
    user_info_url = 'https://www.zhihu.com/api/v4/members/{user}?include={include}'
    user_following_url = 'https://www.zhihu.com/api/v4/members/{user}/followees?include={include}&offset={offset}&limit={limit}'
    user_followers_url = 'https://www.zhihu.com/api/v4/members/{user}/followers?include={include}&offset={offset}&limit={limit}'
    followers_query = 'data%5B*%5D.answer_count%2Carticles_count%2Cgender%2Cfollower_count%2Cis_followed%2Cis_following%2Cbadge%5B%3F(type%3Dbest_answerer)%5D.topics'
    user_query = 'locations,employments,gender,educations,business,voteup_count,thanked_Count,follower_count,following_count,cover_url,following_topic_count,following_question_count,following_favlists_count,following_columns_count,answer_count,articles_count,pins_count,question_count,commercial_question_count,favorite_count,favorited_count,logs_count,marked_answers_count,marked_answers_text,message_thread_token,account_status,is_active,is_force_renamed,is_bind_sina,sina_weibo_url,sina_weibo_name,show_sina_weibo,is_blocking,is_blocked,is_following,is_followed,mutual_followees_count,vote_to_count,vote_from_count,thank_to_count,thank_from_count,thanked_count,description,hosted_live_count,participated_live_count,allow_message,industry_category,org_name,org_homepage,badge[?(type=best_answerer)].topics'
    following_query = 'data[*].answer_count,articles_count,gender,follower_count,is_followed,is_following,badge[?(type=best_answerer)].topics'
    request_count = 0     # requests submitted so far
    user_info_count = 0   # user profiles parsed so far
    time = time.time()    # timestamp of the last progress report
    mongo = Mongo()

    def start_requests(self):
        # Seed the crawl from one well-connected account.
        yield self.make_request_for_userInfo('excited-vczh', self.parse_userInfo)

    def parse_userInfo(self, response):
        self.timed_task(6)
        item = UserItem()
        result = json.loads(response.text)
        for key in result.keys():
            if key in item.fields:
                item[key] = result.get(key)
            elif key == 'locations':
                # Only the name of each location is useful.
                item['locations_name'] = [loc.get('name') for loc in result.get(key)]
            elif key == 'business':
                business = result.get(key)
                if business is not None:
                    item['business_name'] = business.get('name')
            elif key == 'educations':
                # Flatten each education entry to a school/major pair.
                item['educations_school_major'] = []
                for edu in result.get(key):
                    pair = {'school': '', 'major': ''}
                    if edu.get('school') is not None:
                        pair['school'] = edu.get('school').get('name')
                    if edu.get('major') is not None:
                        pair['major'] = edu.get('major').get('name')
                    item['educations_school_major'].append(pair)
            elif key == 'employments':
                # Flatten each employment entry to a company/job pair.
                item['employments_company_job'] = []
                for emp in result.get(key):
                    pair = {'company': '', 'job': ''}
                    if emp.get('company') is not None:
                        pair['company'] = emp.get('company').get('name')
                    if emp.get('job') is not None:
                        pair['job'] = emp.get('job').get('name')
                    item['employments_company_job'].append(pair)
        yield item
        self.user_info_count += 1
        url_token = result.get('url_token')
        # Walk both the follower and followee lists of this user.
        yield Request(self.user_followers_url.format(user=url_token, include=self.followers_query,
                                                     offset=0, limit=20),
                      callback=self.parse_followers)
        yield self.make_request_for_userFollowing(url_token, 0, callback=self.parse_following)

    def parse_following(self, response):
        results = json.loads(response.text)
        if 'data' in results:
            for user in results.get('data'):
                user_url = user.get('url_token')
                if not self.mongo.isexist(user_url):
                    self.request_count += 1
                    yield self.make_request_for_userInfo(user_url, self.parse_userInfo, priority=13)
        paging = results.get('paging')
        if paging and not paging.get('is_end'):
            yield Request(paging.get('next'), callback=self.parse_following)

    def parse_followers(self, response):
        results = json.loads(response.text)
        if 'data' in results:
            for user in results.get('data'):
                # optionally: only fetch followers who have followers themselves
                user_url = user.get('url_token')
                if not self.mongo.isexist(user_url):
                    self.request_count += 1
                    yield self.make_request_for_userInfo(user_url, self.parse_userInfo, priority=13)
        paging = results.get('paging')
        if paging and not paging.get('is_end'):
            yield Request(paging.get('next'), callback=self.parse_followers)

    def timed_task(self, delay):
        # Print a progress line at most once every `delay` seconds.
        if time.time() - self.time >= delay:
            self.time = time.time()
            msg = 'profiles parsed: {}  requests submitted: {}  clock: {}'
            print(msg.format(self.user_info_count, self.request_count, time.time()))

    def make_request_for_userInfo(self, url_token, callback, **kw):
        return Request(self.user_info_url.format(user=url_token, include=self.user_query),
                       callback=callback, **kw)

    def make_request_for_userFollowing(self, url_token, page, callback, limit=20, **kw):
        return Request(self.user_following_url.format(user=url_token, include=self.following_query,
                                                      offset=page * limit, limit=limit),
                       callback=callback, **kw)
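The item pipeline is not shown in this post; a minimal sketch of what a matching MongoDB pipeline could look like, assuming it is registered under ITEM_PIPELINES in settings.py (upserting on url_token so re-crawled profiles overwrite rather than duplicate):

import pymongo

class MongoPipeline(object):
    # Open one connection per crawl and upsert each user on url_token.
    def open_spider(self, spider):
        self.client = pymongo.MongoClient('localhost')
        self.tb = self.client['zhihu']['users']

    def close_spider(self, spider):
        self.client.close()

    def process_item(self, item, spider):
        self.tb.update_one({'url_token': item['url_token']},
                           {'$set': dict(item)}, upsert=True)
        return item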
Additional notes
1. Deduplication: when you crawl following/follower lists recursively, dedup is unavoidable, and once the data set reaches the hundreds of thousands, lookup performance really matters. I initially planned to dedup url_token values with a Redis set, but it turned out that simply building an index on the url_token field in MongoDB brings lookups down to the millisecond level, which is good enough; see the sketch below.
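Building that index is a one-off operation; a sketch against the collection used above:

from pymongo import MongoClient

# Index url_token so the spider's isexist() lookups stay at the
# millisecond level even with hundreds of thousands of documents.
client = MongoClient('localhost')
client['zhihu']['users'].create_index('url_token')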
2. Pause and resume: Scrapy can persist the scheduler's pending requests and related state with scrapy crawl zhihu -s JOBDIR=<directory>, which makes pausing (one Ctrl-C for a graceful shutdown) and resuming (rerun the same command) very convenient.
3. Zhihu's anti-scraping: not only is the API friendly, the anti-scraping measures are fairly lenient too. At roughly 300 pages/min nothing was throttled; with larger request volumes, the backend seems to run a check every hour or two and then starts serving CAPTCHAs. It also isn't very sensitive to moderately high concurrency. I kept the crawl rate limited the whole time and only opened up the concurrency at the very end as a test; a sketch of the relevant settings follows.
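The exact values I ran with are not in this post; a sketch of the relevant settings.py knobs, with numbers assumed so as to land near the ~300 pages/min mentioned above:

# settings.py -- throttling sketch; the numbers are assumptions
DOWNLOAD_DELAY = 0.2                     # ~5 requests/sec, i.e. ~300 pages/min
CONCURRENT_REQUESTS = 16                 # raise only for a final concurrency test
AUTOTHROTTLE_ENABLED = True              # back off automatically when responses slow down
AUTOTHROTTLE_TARGET_CONCURRENCY = 2.0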
4. Storage: a million records take up a few hundred MB on disk, but while running, MongoDB pulls both the index and the data into memory. That speeds up queries but is very memory-hungry: during queries, resident memory went straight to about 1 GB. On Linux it's easy to cap a process's resource usage (e.g., with cgroups); on Windows this seems possible on the server editions, but I couldn't find a way to do it on Windows 10. Annoying enough that I plan to switch to MySQL for storage.