一、爬取思路
1.利用上一篇完成登录,然后去抓取数据。
2.随便找一个用户作为入口,然后利用Chrome工具找到他关注用户的API接口网址和传递的参数。
3.获取API接口返回的JSON数据。然后对JSON数据进行处理。
4.从关注者数大于10000的用户中随机取一个,去抓取他关注的人。如此循环。
二、具体操作
1.随便找一个用户Crossin, 点击[关注了],查看他关注了哪些人。
https://www.zhihu.com/people/crossin/following
2.按F12,打开调试工具。点击network。找到请求API的URL和参数。
https://www.zhihu.com/api/v4/members/{user}/followees?include={include}&offset={offset}&limit={limit}
include:data[*].answer_count,articles_count,gender,follower_count,is_followed,is_following,badge[?(type=best_answerer)].topics
offset:20
limit:20
重点:把 authorization 的值保存到 headers 中,否则抓取不到数据,会报 401 错误。
点击preview,可以看到返回的JSON结构数据。点击下一页,可以看到后续分页的请求。
myspider.py
# -*- coding: utf-8 -*-
import scrapy
from scrapy.http import Request, FormRequest
from zhihu.items import ZhihuItem
import time
from PIL import Image
import json
import requests
from random import choice
class MyspiderSpider(scrapy.Spider):
    """Zhihu followee crawler.

    Logs in with a phone number + captcha, then walks the followee graph:
    starting from ``start_user`` it fetches each followee page of the v4 API,
    yields one ``ZhihuItem`` per followee, and — once the current user's list
    is exhausted — hops to a random previously-seen user with more than
    10000 followers and repeats.
    """
    name = 'myspider'
    allowed_domains = ['zhihu.com']
    start_urls = ['https://www.zhihu.com/']

    # Request headers. The `authorization` bearer token is mandatory for the
    # v4 API — without it every call fails with HTTP 401.
    # NOTE(review): the token below is account-specific and expires; refresh
    # it before running.
    headers_zhihu = {
        # Fixed: the original 'Host' and 'Referer' values carried a trailing
        # space ('www.zhihu.com '), producing invalid header values.
        'Host': 'www.zhihu.com',
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/59.0.3071.115 Safari/537.36',
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
        'Accept-Language': 'zh-CN,zh;q=0.8,en-US;q=0.5,en;q=0.3',
        'Accept-Encoding': 'gzip,deflate,sdch',
        'Referer': 'https://www.zhihu.com',
        'If-None-Match': "FpeHbcRb4rpt_GuDL6-34nrLgGKd.gz",
        'Cache-Control': 'max-age=0',
        'Connection': 'keep-alive',
        'authorization': 'Bearer Mi4wQUdCQzBlWkZEUXdBTU1MYmVyUVBEQmNBQUFCaEFsVk4tWXVlV1FCR3pSbE1WbVpQeU5OODdrTUNlM21jZ2ZTUnBB|1500970746|f8c1997fd0e539beec76bcbb15cffd29971d7c05',
    }

    # Followee-list API endpoint and the `include` fields the API should return.
    follows_url = 'https://www.zhihu.com/api/v4/members/{user}/followees?include={include}&offset={offset}&limit={limit}'
    follows_query = 'data[*].answer_count,articles_count,gender,follower_count,is_followed,is_following,badge[?(type=best_answerer)].topics'
    start_user = 'crossin'
    # url_tokens of followees with more than 10000 followers — candidates
    # for the next crawl hop.
    user_token = []

    def start_requests(self):
        """Open the home page first to obtain the _xsrf login token."""
        return [Request("https://www.zhihu.com/",
                        meta={'cookiejar': 1},
                        headers=self.headers_zhihu,
                        callback=self.captcha)]

    def captcha(self, response):
        """Extract _xsrf from the home page and request a login captcha."""
        # Fixed: extract()[0] raised a bare IndexError when the hidden input
        # is missing; fail with an explicit message instead.
        xsrf = response.xpath('//input[@name="_xsrf"]/@value').extract_first()
        if xsrf is None:
            raise ValueError('_xsrf token not found on the login page')
        t = str(int(time.time() * 1000))
        captcha_url = 'https://www.zhihu.com/captcha.gif?r=' + t + '&type=login&lang=en'
        return [Request(captcha_url,
                        callback=self.parser_captcha,
                        meta={'cookiejar': response.meta['cookiejar'], 'xsrf': xsrf})]

    def parser_captcha(self, response):
        """Show the captcha image, read the user's answer, submit the login form."""
        # The `with` block closes the file; the original's extra f.close()
        # inside it was redundant.
        with open('captcha.jpg', 'wb') as f:
            f.write(response.body)
        im = Image.open('captcha.jpg')
        im.show()
        im.close()
        # Fixed: raw_input() is Python 2 only, while the rest of this file
        # already uses print() calls — standardise on Python 3's input().
        captcha = input("请输入验证码:")
        xsrf = response.meta['xsrf']
        return FormRequest('https://www.zhihu.com/login/phone_num',
                           method='POST',
                           meta={'cookiejar': response.meta['cookiejar']},
                           callback=self.after_login,
                           dont_filter=True,
                           headers=self.headers_zhihu,
                           formdata={
                               'phone_num': '138*********',
                               'password': '*******',
                               '_xsrf': xsrf,
                               'captcha_type': 'en',
                               'captcha': captcha,
                           })

    def after_login(self, response):
        """Check the login result JSON; on success start crawling the seed user."""
        json_file = json.loads(response.text)  # login result JSON
        if json_file['r'] == 0:
            print('登录成功.....开始爬了。。。。')
            yield Request(self.follows_url.format(user=self.start_user,
                                                  include=self.follows_query,
                                                  offset=0, limit=20),
                          callback=self.parse_follows,
                          meta={'cookiejar': response.meta['cookiejar']},
                          headers=self.headers_zhihu)
        else:
            # Fixed: `print json_file[...]` was a Python 2 print statement —
            # a SyntaxError under Python 3, which this file otherwise targets.
            print(json_file['msg'])

    def parse_follows(self, response):
        """Yield one ZhihuItem per followee; paginate, then hop to a new user."""
        results = json.loads(response.text)
        if 'data' in results:
            for result in results.get('data'):  # walk the structured JSON rows
                # Fixed: the original built a single ZhihuItem before the loop
                # and mutated it on every iteration; create a fresh item per
                # followee so yielded items are independent.
                item = ZhihuItem()
                # Fixed: .encode('utf-8') crashed when 'name' is absent and
                # stored bytes instead of text; keep the plain string.
                item['name'] = result.get('name', '')
                url_token = result.get('url_token')
                item['url_token'] = 'https://www.zhihu.com/people/' + url_token
                item['answer_count'] = result.get('answer_count')
                item['articles_count'] = result.get('articles_count')
                # Fixed: a missing follower_count yielded None, which raises
                # TypeError on the `> 10000` comparison under Python 3.
                follower_count = result.get('follower_count') or 0
                item['follower_count'] = follower_count
                # gender 0 means female; anything else (including missing)
                # falls back to male, as in the original.
                item['gender'] = '女' if result.get('gender') == 0 else '男'
                if follower_count > 10000:
                    # Remember well-followed users as future entry points.
                    self.user_token.append(url_token)
                yield item
        paging = results.get('paging')
        if paging and paging.get('is_end') == False:
            # Not the last page yet — keep crawling this user via the
            # ready-made `next` URL.
            yield Request(paging.get('next'),
                          callback=self.parse_follows,
                          meta={'cookiejar': response.meta['cookiejar']},
                          headers=self.headers_zhihu)
        elif self.user_token:
            # Fixed: choice() on an empty list raised IndexError; only hop
            # when at least one candidate has been collected.
            start_user = choice(self.user_token)
            yield Request(self.follows_url.format(user=start_user,
                                                  include=self.follows_query,
                                                  offset=0, limit=20),
                          callback=self.parse_follows,
                          meta={'cookiejar': response.meta['cookiejar']},
                          headers=self.headers_zhihu)
items.py
import scrapy


class ZhihuItem(scrapy.Item):
    """Container for one crawled Zhihu user profile.

    One scrapy.Field is declared per column that the Excel pipeline exports.
    """
    name = scrapy.Field()            # display name
    gender = scrapy.Field()          # '男' / '女'
    answer_count = scrapy.Field()    # number of answers written
    articles_count = scrapy.Field()  # number of articles written
    follower_count = scrapy.Field()  # number of followers
    url_token = scrapy.Field()       # full profile URL
pipelines.py 保存为excel格式。
# -*- coding: utf-8 -*-
import time
from openpyxl import Workbook
class ZhihuPipeline(object):
    """Write crawled Zhihu users into a dated .xlsx workbook."""

    def __init__(self):
        self.wb = Workbook()
        self.ws = self.wb.active
        # Header row matching the columns appended in process_item().
        self.ws.append(['姓名', '性别', '回答', '文章', '关注者', '网址'])
        # Fixed: the original recomputed the filename on every item, so a
        # crawl crossing midnight silently split its output into two files.
        # Compute the dated path once, at startup.
        now = time.strftime('%Y-%m-%d', time.localtime())
        self.filename = '/home/soft/zhihu/' + now + '.xlsx'

    def process_item(self, item, spider):
        """Append one item as a worksheet row and pass it through."""
        self.ws.append([item['name'], item['gender'], item['answer_count'],
                        item['articles_count'], item['follower_count'],
                        item['url_token']])
        return item

    def close_spider(self, spider):
        """Save the workbook once when the spider finishes.

        Fixed: the original rewrote the whole .xlsx file after every single
        item (O(n^2) disk I/O); saving once at close is the standard Scrapy
        pipeline pattern.
        """
        self.wb.save(self.filename)
三、代码运行
[root@master zhihu]# scrapy crawl myspider
代码运行过程。
excel保存内容。
四、触类旁通
下面是关注者和问答的API接口URL和参数