1 Crawling Logic
Start from one user and crawl their follower list and followee list. Then, for each user in those lists, fetch their followers and followees in turn and keep recursing; expanding outward in this tree-like fashion reaches most users on the site.
Here we pick 轮子哥 (url_token: excited-vczh) as the starting point :). Inspecting the requests behind the followee list and the follower list shows that both return a JSON result.
We also notice that each entry in the followee list carries a url_token value. Opening the first followee's page shows that the URL of a user's profile page is built from exactly this url_token, so we can construct the profile link of every user and go on to crawl their followee and follower lists recursively.
When the mouse hovers over a user's avatar, an Ajax request fires, and its URL can be read from the developer tools.
Its parameters and response are JSON as well and contain part of the user's information (note that compared with a year ago the request now asks for fewer fields, but sending the old include parameters still returns results). Opening the profile page confirms that the data matches.
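Before writing the spider it helps to probe this endpoint directly. Below is a minimal sketch using the requests library; the filename is arbitrary, the include fields are one of the variants shown in the spider code later in this post, and the user-agent and authorization values are the ones from settings.py further down (Zhihu may change any of these at any time).
"""probe_api.py"""
# Minimal sketch: fetch one user's details from the member endpoint.
import requests
USER_URL = 'https://www.zhihu.com/api/v4/members/{user}?include={include}'
INCLUDE = 'allow_message,is_followed,is_following,is_org,is_blocking,employments,answer_count,follower_count,articles_count,gender,badge[?(type=best_answerer)].topics'
HEADERS = {
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.181 Safari/537.36',
    'authorization': 'oauth c3cef7c66a1843f8b3a9e6a1e3160e20',
}
resp = requests.get(USER_URL.format(user='excited-vczh', include=INCLUDE), headers=HEADERS)
data = resp.json()
print(data.get('name'), data.get('url_token'), data.get('follower_count'))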
To summarize:
- First pick a starting user; it should be a well-known user (大V) with many followees or followers so the crawl has plenty of branches to expand.
- Use the Zhihu API to get that user's follower list and followee list. (url_1, url_2)
- Use the Zhihu API to get the detailed information of every user in those lists. (url_3)
- For each user in those lists, fetch their followers and followees in turn, which makes the crawl recursive (a short sketch follows this list).
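Stripped of any framework, the expansion is just a breadth-first traversal over url_token values. The sketch below only illustrates the strategy; fetch_followees and fetch_followers are hypothetical helpers standing in for the two list endpoints above, each returning a list of url_token strings.
"""crawl_sketch.py"""
# Illustrative only: breadth-first expansion over followee/follower relations.
from collections import deque
def crawl(start_token, fetch_followees, fetch_followers, max_users=1000):
    seen = {start_token}          # url_tokens already discovered
    queue = deque([start_token])  # url_tokens still waiting to be expanded
    while queue and len(seen) < max_users:
        token = queue.popleft()
        # Every followee and follower of this user becomes a new node to expand.
        for neighbour in fetch_followees(token) + fetch_followers(token):
            if neighbour not in seen:
                seen.add(neighbour)
                queue.append(neighbour)
    return seen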
2 Project Code
First create the project: scrapy startproject zhihu_user
Then create the spider: cd into the zhihu_user directory and run scrapy genspider zhihu www.zhihu.com
This is the project's file structure.
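On a typical Scrapy install the generated layout looks roughly like this (the exact files vary a little between Scrapy versions):
zhihu_user/
    scrapy.cfg
    zhihu_user/
        __init__.py
        items.py
        middlewares.py
        pipelines.py
        settings.py
        spiders/
            __init__.py
            zhihu.py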
The detailed, commented code is given below; anything not shown is the default code generated by the Scrapy framework.
The code is also available on GitHub.
"""zhihu.py"""
# -*- coding: utf-8 -*-
import json
from scrapy import Request, Spider
from zhihu_user.items import UserItem
class ZhihuSpider(Spider):
    name = "zhihu"
    allowed_domains = ["www.zhihu.com"]
    start_urls = ['https://www.zhihu.com']
    # url_token of the starting user
    start_user = 'excited-vczh'
    # URL and include parameters for a single user's details; the include fields can be
    # varied (Zhihu has changed them compared with earlier versions of the API)
    user_url = 'https://www.zhihu.com/api/v4/members/{user}?include={include}'
user_query = "locations,employments,gender,educations,business,voteup_count,thanked_Count,follower_count,following_count,cover_url,following_topic_count,following_question_count,following_favlists_count,following_columns_count,avatar_hue,answer_count,articles_count,pins_count,question_count,columns_count,commercial_question_count,favorite_count,favorited_count,logs_count,marked_answers_count,marked_answers_text,message_thread_token,account_status,is_active,is_bind_phone,is_force_renamed,is_bind_sina,is_privacy_protected,sina_weibo_url,sina_weibo_name,show_sina_weibo,is_blocking,is_blocked,is_following,is_followed,mutual_followees_count,vote_to_count,vote_from_count,thank_to_count,thank_from_count,thanked_count,description,hosted_live_count,participated_live_count,allow_message,industry_category,org_name,org_homepage,badge[?(type=best_answerer)].topics"
# user_query = 'locations,employments,gender,educations,business,voteup_count,thanked_Count,follower_count,following_count,cover_url,following_topic_count,following_question_count,following_favlists_count,following_columns_count,answer_count,articles_count,pins_count,question_count,commercial_question_count,favorite_count,favorited_count,logs_count,marked_answers_count,marked_answers_text,message_thread_token,account_status,is_active,is_force_renamed,is_bind_sina,sina_weibo_url,sina_weibo_name,show_sina_weibo,is_blocking,is_blocked,is_following,is_followed,mutual_followees_count,vote_to_count,vote_from_count,thank_to_count,thank_from_count,thanked_count,description,hosted_live_count,participated_live_count,allow_message,industry_category,org_name,org_homepage,badge[?(type=best_answerer)].topics'
# user_query = 'allow_message,is_followed,is_following,is_org,is_blocking,employments,answer_count,follower_count,articles_count,gender,badge[?(type=best_answerer)].topics'
    # URL and include parameters for the followee list
    follows_url = 'https://www.zhihu.com/api/v4/members/{user}/followees?include={include}&offset={offset}&limit={limit}'
    follows_query = 'data[*].answer_count,articles_count,gender,follower_count,is_followed,is_following,badge[?(type=best_answerer)].topics'
    # URL and include parameters for the follower list
    followers_url = 'https://www.zhihu.com/api/v4/members/{user}/followers?include={include}&offset={offset}&limit={limit}'
    followers_query = 'data[*].answer_count,articles_count,gender,follower_count,is_followed,is_following,badge[?(type=best_answerer)].topics'
    def start_requests(self):
        # Request the details of the first user.
        yield Request(self.user_url.format(user=self.start_user, include=self.user_query), callback=self.parse_user)
        # Request the first user's followee list.
        yield Request(self.follows_url.format(user=self.start_user, include=self.follows_query, offset=0, limit=20), callback=self.parse_follows)
        # Request the first user's follower list.
        yield Request(self.followers_url.format(user=self.start_user, include=self.followers_query, offset=0, limit=20), callback=self.parse_followers)
    def parse_user(self, response):
        """
        Parse a user's details, then request that user's followee and follower lists.
        :param response:
        :return:
        """
        result = json.loads(response.text)  # convert the JSON response to a dict
        # Copy the user's details into an item.
        item = UserItem()
        # Item has a fields attribute containing all field names; copy each one that is present.
        for field in item.fields:
            if field in result.keys():
                item[field] = result.get(field)
        yield item
        # Request this user's followee list.
        yield Request(self.follows_url.format(user=result.get('url_token'), include=self.follows_query, offset=0, limit=20), callback=self.parse_follows)
        # Request this user's follower list.
        yield Request(self.followers_url.format(user=result.get('url_token'), include=self.followers_query, offset=0, limit=20), callback=self.parse_followers)
    def parse_follows(self, response):
        """
        Parse the followee list and request the next page.
        """
        results = json.loads(response.text)
        if 'data' in results.keys():
            for result in results.get('data'):
                # Request each followee's details.
                yield Request(self.user_url.format(user=result.get('url_token'), include=self.user_query), callback=self.parse_user)
        if 'paging' in results.keys() and results.get('paging').get('is_end') is False:
            next_page = results.get('paging').get('next')
            # Request the next page of the followee list.
            yield Request(next_page, callback=self.parse_follows)
    def parse_followers(self, response):
        """
        Parse the follower list and request the next page.
        """
        results = json.loads(response.text)
        if 'data' in results.keys():
            for result in results.get('data'):
                # Request each follower's details.
                yield Request(self.user_url.format(user=result.get('url_token'), include=self.user_query), callback=self.parse_user)
        if 'paging' in results.keys() and results.get('paging').get('is_end') is False:
            next_page = results.get('paging').get('next')
            # Request the next page of the follower list.
            yield Request(next_page, callback=self.parse_followers)
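To give the spider a quick test run, execute it from the project directory; the CLOSESPIDER_ITEMCOUNT setting of Scrapy's built-in CloseSpider extension is handy for stopping after a limited number of items (100 below is just an example), and -o writes the scraped items to a file:
scrapy crawl zhihu
scrapy crawl zhihu -s CLOSESPIDER_ITEMCOUNT=100 -o users.json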
"""items.py """
# -*- coding: utf-8 -*-
from scrapy import Item, Field
# The item container is defined here; you can customize which fields you want to scrape
class UserItem(Item):
    id = Field()
    name = Field()
    account_status = Field()
    allow_message = Field()
    answer_count = Field()
    articles_count = Field()
    avatar_hue = Field()
    avatar_url = Field()
    avatar_url_template = Field()
    badge = Field()
    business = Field()
    employments = Field()
    columns_count = Field()
    commercial_question_count = Field()
    cover_url = Field()
    description = Field()
    educations = Field()
    favorite_count = Field()
    favorited_count = Field()
    follower_count = Field()
    following_columns_count = Field()
    following_favlists_count = Field()
    following_question_count = Field()
    following_topic_count = Field()
    gender = Field()
    headline = Field()
    hosted_live_count = Field()
    is_active = Field()
    is_bind_sina = Field()
    is_blocked = Field()
    is_advertiser = Field()
    is_blocking = Field()
    is_followed = Field()
    is_following = Field()
    is_force_renamed = Field()
    is_privacy_protected = Field()
    locations = Field()
    is_org = Field()
    type = Field()
    url = Field()
    url_token = Field()
    user_type = Field()
    logs_count = Field()
    marked_answers_count = Field()
    marked_answers_text = Field()
    message_thread_token = Field()
    mutual_followees_count = Field()
    participated_live_count = Field()
    pins_count = Field()
    question_count = Field()
    show_sina_weibo = Field()
    thank_from_count = Field()
    thank_to_count = Field()
    thanked_count = Field()
    vote_from_count = Field()
    vote_to_count = Field()
    voteup_count = Field()
"""pipelines.py 存储到mongodb"""
# -*- coding: utf-8 -*-
import pymongo
class MongoPipeline(object):
    def __init__(self, mongo_uri, mongo_db):
        self.mongo_uri = mongo_uri
        self.mongo_db = mongo_db
    @classmethod
    def from_crawler(cls, crawler):
        return cls(
            mongo_uri=crawler.settings.get('MONGO_URI'),
            mongo_db=crawler.settings.get('MONGO_DATABASE')
        )
    def open_spider(self, spider):
        self.client = pymongo.MongoClient(self.mongo_uri)
        self.db = self.client[self.mongo_db]
    def close_spider(self, spider):
        self.client.close()
    def process_item(self, item, spider):
        # Upsert on url_token so that re-crawled users are updated instead of duplicated
        self.db['user'].update_one({'url_token': item['url_token']}, {'$set': dict(item)}, upsert=True)
        return item
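Because process_item upserts on url_token, re-crawling the same user updates the existing document instead of inserting a duplicate. A quick way to check the stored data (a minimal sketch, assuming pymongo 3.7+ and the MONGO_URI/MONGO_DATABASE values from settings.py below):
"""check_mongo.py"""
# Minimal sketch: inspect the users written by MongoPipeline.
import pymongo
client = pymongo.MongoClient('localhost')
db = client['zhihu']
print('users stored:', db['user'].count_documents({}))
# Look up the starting user by url_token to confirm the upsert worked.
print(db['user'].find_one({'url_token': 'excited-vczh'}, {'name': 1, 'follower_count': 1}))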
"""settings.py"""
# -*- coding: utf-8 -*-
# Only the settings that were changed are shown here
BOT_NAME = 'zhihu_user'
SPIDER_MODULES = ['zhihu_user.spiders']
NEWSPIDER_MODULE = 'zhihu_user.spiders'
# Note: this must be set to False; otherwise Scrapy obeys robots.txt and nothing can be crawled
# Obey robots.txt rules
ROBOTSTXT_OBEY = False
# Override the default request headers:
# A user-agent and an authorization header must be added; the authorization value stays the same as long as you are not logged in
DEFAULT_REQUEST_HEADERS = {
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Accept-Language': 'en',
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.181 Safari/537.36',
    'authorization': 'oauth c3cef7c66a1843f8b3a9e6a1e3160e20'
}
ITEM_PIPELINES = {
    # 'zhihu_user.pipelines.MongoPipeline': 300,
}
MONGO_URI = 'localhost'
MONGO_DATABASE = 'zhihu'
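Note that ITEM_PIPELINES above still has the MongoPipeline entry commented out; to actually write to MongoDB it has to be enabled, and if you want to crawl more politely Scrapy's standard DOWNLOAD_DELAY setting can be added here as well (0.25 is only an example value):
ITEM_PIPELINES = {
    'zhihu_user.pipelines.MongoPipeline': 300,
}
DOWNLOAD_DELAY = 0.25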