This post builds on http://cuiqingcai.com/4380.html, with some additions of my own.
With the crawl rate throttled, the spider ran for roughly two days and collected profile data on about a million Zhihu users.
Zhihu's API is very friendly: the endpoints return clean JSON directly.
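For instance, fetching the members endpoint that the spider below uses shows the shape of that JSON. A minimal sketch for eyeballing the payload, assuming the requests library and a browser-like User-Agent (Zhihu tends to reject clients without one); the trimmed include list here is an assumption based on the full query used later:

import json

import requests

# Fetch one user's profile and pretty-print the raw JSON payload.
# The include list is a trimmed-down assumption; the spider below
# requests many more fields.
url = ('https://www.zhihu.com/api/v4/members/excited-vczh'
       '?include=answer_count,follower_count,following_count')
resp = requests.get(url, headers={'User-Agent': 'Mozilla/5.0'})
print(json.dumps(resp.json(), ensure_ascii=False, indent=2))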
Compared with the referenced article, I reworked the item definition and extraction. For the last few fields, only the nested name values in the response are really useful, so I pulled them out with a string of if branches, like this:
from scrapy import Item, Field


class UserItem(Item):
    # define the fields for your item here like:
    id = Field()                         # user uid
    url_token = Field()                  # user handle, used in the API URLs
    name = Field()                       # nickname
    gender = Field()                     # gender (bool)
    headline = Field()                   # one-line bio
    description = Field()                # self-introduction
    avatar_url = Field()                 # avatar image
    cover_url = Field()                  # cover image
    type = Field()                       # personal or organization account
    badge = Field()                      # badges
    answer_count = Field()               # number of answers
    articles_count = Field()             # number of articles
    commercial_question_count = Field()
    columns_count = Field()              # number of columns
    favorite_count = Field()             # user's own collections
    favorited_count = Field()            # times collected by others
    follower_count = Field()             # followers
    following_columns_count = Field()    # columns followed
    following_count = Field()            # users followed
    pins_count = Field()
    question_count = Field()
    thank_from_count = Field()
    thank_to_count = Field()
    thanked_count = Field()
    vote_from_count = Field()
    vote_to_count = Field()
    voteup_count = Field()
    following_favlists_count = Field()
    following_question_count = Field()
    following_topic_count = Field()
    marked_answers_count = Field()
    mutual_followees_count = Field()
    hosted_live_count = Field()
    participated_live_count = Field()
    locations_name = Field()             # location names, flattened
    business_name = Field()              # industry name, flattened
    educations_school_major = Field()    # school/major pairs, flattened
    employments_company_job = Field()    # company/job pairs, flattened
The spider code:
from scrapy import Spider, Request
import json
import time

import pymongo

from zhihu.items import UserItem


class Mongo(object):
    # Thin lookup helper; url_token is indexed (see note 1 below), so
    # existence checks stay fast even with hundreds of thousands of rows.
    client = pymongo.MongoClient('localhost')
    db = client['zhihu']
    tb = db['users']

    def isexist(self, url_token):
        return self.tb.find_one({'url_token': str(url_token)}) is not None


class ZhihuSpider(Spider):
    name = 'zhihu'
    allowed_domains = ['www.zhihu.com']
    user_info_url = 'https://www.zhihu.com/api/v4/members/{user}?include={include}'
    user_following_url = 'https://www.zhihu.com/api/v4/members/{user}/followees?include={include}&offset={offset}&limit={limit}'
    user_followers_url = 'https://www.zhihu.com/api/v4/members/{user}/followers?include={include}&offset={offset}&limit={limit}'
    followers_query = 'data%5B*%5D.answer_count%2Carticles_count%2Cgender%2Cfollower_count%2Cis_followed%2Cis_following%2Cbadge%5B%3F(type%3Dbest_answerer)%5D.topics'
    user_query = 'locations,employments,gender,educations,business,voteup_count,thanked_Count,follower_count,following_count,cover_url,following_topic_count,following_question_count,following_favlists_count,following_columns_count,answer_count,articles_count,pins_count,question_count,commercial_question_count,favorite_count,favorited_count,logs_count,marked_answers_count,marked_answers_text,message_thread_token,account_status,is_active,is_force_renamed,is_bind_sina,sina_weibo_url,sina_weibo_name,show_sina_weibo,is_blocking,is_blocked,is_following,is_followed,mutual_followees_count,vote_to_count,vote_from_count,thank_to_count,thank_from_count,thanked_count,description,hosted_live_count,participated_live_count,allow_message,industry_category,org_name,org_homepage,badge[?(type=best_answerer)].topics'
    following_query = 'data[*].answer_count,articles_count,gender,follower_count,is_followed,is_following,badge[?(type=best_answerer)].topics'
    request_count = 0     # requests submitted so far
    user_info_count = 0   # user profiles parsed so far
    time = time.time()    # timestamp of the last progress report
    mongo = Mongo()

    def start_requests(self):
        # Seed the crawl from one well-connected account.
        yield self.make_request_for_userInfo('excited-vczh', self.parse_userInfo)

    def parse_userInfo(self, response):
        self.timed_task(6)
        item = UserItem()
        result = json.loads(response.text)
        for key in result.keys():
            if key in item.fields:
                item[key] = result.get(key)
            elif key == 'locations':
                # Only the name of each location is useful.
                item['locations_name'] = [loc.get('name') for loc in result.get(key)]
            elif key == 'business':
                business = result.get(key)
                if business is not None:
                    item['business_name'] = business.get('name')
            elif key == 'educations':
                # Flatten each education entry to a school/major pair.
                item['educations_school_major'] = []
                for edu in result.get(key):
                    pair = {'school': '', 'major': ''}
                    if edu.get('school') is not None:
                        pair['school'] = edu.get('school').get('name')
                    if edu.get('major') is not None:
                        pair['major'] = edu.get('major').get('name')
                    item['educations_school_major'].append(pair)
            elif key == 'employments':
                # Flatten each employment entry to a company/job pair.
                item['employments_company_job'] = []
                for emp in result.get(key):
                    pair = {'company': '', 'job': ''}
                    if emp.get('company') is not None:
                        pair['company'] = emp.get('company').get('name')
                    if emp.get('job') is not None:
                        pair['job'] = emp.get('job').get('name')
                    item['employments_company_job'].append(pair)
        yield item
        self.user_info_count += 1
        url_token = result.get('url_token')
        # Walk both the follower and followee lists of this user.
        yield Request(self.user_followers_url.format(user=url_token, include=self.followers_query,
                                                     offset=0, limit=20),
                      callback=self.parse_followers)
        yield self.make_request_for_userFollowing(url_token, 0, callback=self.parse_following)

    def parse_following(self, response):
        results = json.loads(response.text)
        if 'data' in results:
            for user in results.get('data'):
                user_url = user.get('url_token')
                if not self.mongo.isexist(user_url):
                    self.request_count += 1
                    yield self.make_request_for_userInfo(user_url, self.parse_userInfo, priority=13)
        paging = results.get('paging')
        if paging and not paging.get('is_end'):
            yield Request(paging.get('next'), callback=self.parse_following)

    def parse_followers(self, response):
        results = json.loads(response.text)
        if 'data' in results:
            for user in results.get('data'):
                # optionally: only fetch followers who have followers themselves
                user_url = user.get('url_token')
                if not self.mongo.isexist(user_url):
                    self.request_count += 1
                    yield self.make_request_for_userInfo(user_url, self.parse_userInfo, priority=13)
        paging = results.get('paging')
        if paging and not paging.get('is_end'):
            yield Request(paging.get('next'), callback=self.parse_followers)

    def timed_task(self, delay):
        # Print a progress line at most once every `delay` seconds.
        if time.time() - self.time >= delay:
            self.time = time.time()
            msg = 'profiles parsed: {}  requests submitted: {}  clock: {}'
            print(msg.format(self.user_info_count, self.request_count, time.time()))

    def make_request_for_userInfo(self, url_token, callback, **kw):
        return Request(self.user_info_url.format(user=url_token, include=self.user_query),
                       callback=callback, **kw)

    def make_request_for_userFollowing(self, url_token, page, callback, limit=20, **kw):
        return Request(self.user_following_url.format(user=url_token, include=self.following_query,
                                                      offset=page * limit, limit=limit),
                       callback=callback, **kw)
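The item pipeline is not shown in this post; a minimal sketch of what a matching MongoDB pipeline could look like, assuming it is registered under ITEM_PIPELINES in settings.py (upserting on url_token so re-crawled profiles overwrite rather than duplicate):

import pymongo

class MongoPipeline(object):
    # Open one connection per crawl and upsert each user on url_token.
    def open_spider(self, spider):
        self.client = pymongo.MongoClient('localhost')
        self.tb = self.client['zhihu']['users']

    def close_spider(self, spider):
        self.client.close()

    def process_item(self, item, spider):
        self.tb.update_one({'url_token': item['url_token']},
                           {'$set': dict(item)}, upsert=True)
        return item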
Additional notes
1. Deduplication: when you crawl following/follower lists recursively, dedup is unavoidable, and once the data set reaches the hundreds of thousands, lookup performance really matters. I initially planned to dedup url_token values with a Redis set, but it turned out that simply building an index on the url_token field in MongoDB brings lookups down to the millisecond level, which is good enough; see the sketch below.
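Building that index is a one-off operation; a sketch against the collection used above:

from pymongo import MongoClient

# Index url_token so the spider's isexist() lookups stay at the
# millisecond level even with hundreds of thousands of documents.
client = MongoClient('localhost')
client['zhihu']['users'].create_index('url_token')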
2. Pause and resume: Scrapy can persist the scheduler's pending requests and related state with scrapy crawl zhihu -s JOBDIR=<directory>, which makes pausing (one Ctrl-C for a graceful shutdown) and resuming (rerun the same command) very convenient.
3. Zhihu's anti-scraping: not only is the API friendly, the anti-scraping measures are fairly lenient too. At roughly 300 pages/min nothing was throttled; with larger request volumes, the backend seems to run a check every hour or two and then starts serving CAPTCHAs. It also isn't very sensitive to moderately high concurrency. I kept the crawl rate limited the whole time and only opened up the concurrency at the very end as a test; a sketch of the relevant settings follows.
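The exact values I ran with are not in this post; a sketch of the relevant settings.py knobs, with numbers assumed so as to land near the ~300 pages/min mentioned above:

# settings.py -- throttling sketch; the numbers are assumptions
DOWNLOAD_DELAY = 0.2                     # ~5 requests/sec, i.e. ~300 pages/min
CONCURRENT_REQUESTS = 16                 # raise only for a final concurrency test
AUTOTHROTTLE_ENABLED = True              # back off automatically when responses slow down
AUTOTHROTTLE_TARGET_CONCURRENCY = 2.0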
4. Storage: a million records take up a few hundred MB on disk, but while running, MongoDB pulls both the index and the data into memory. That speeds up queries but is very memory-hungry: during queries, resident memory went straight to about 1 GB. On Linux it's easy to cap a process's resource usage (e.g., with cgroups); on Windows this seems possible on the server editions, but I couldn't find a way to do it on Windows 10. Annoying enough that I plan to switch to MySQL for storage.