目标:爬取知乎首页前x个问题的详情及问题指定范围内的答案的摘要
power by:
- Python 3.6
- Scrapy 1.4
- json
- pymysql
项目地址:https://github.com/Dengqlbq/ZhiHuSpider.git
Step 1——相关简介
本文将注意力放在代码实现上,代码思路的描述将另开一文
代码思路:http://blog.csdn.net/sinat_34200786/article/details/78568894
Step 2——模拟登录
知乎如果不登录是爬取不到信息的,所以首先要做的就是模拟登录
主要步骤:
获取xsrf及验证码图片
填写验证码提交表单登录
登录是否成功
获取xsrf及验证码图片:
def start_requests(self):
yield scrapy.Request('https://www.zhihu.com/', callback=self.login_zhihu)
def login_zhihu(self, response):
""" 获取xsrf及验证码图片 """
xsrf = re.findall(r'name="_xsrf" value="(.*?)"/>', response.text)[0]
self.headers['X-Xsrftoken'] = xsrf
self.post_data['_xsrf'] = xsrf
times = re.findall(r'<script type="text/json" class="json-inline" data-n'
r'ame="ga_vars">{"user_created":0,"now":(\d+),', response.text)[0]
captcha_url = 'https://www.zhihu.com/' + 'captcha.gif?r=' + times + '&type=login&lang=cn'
yield scrapy.Request(captcha_url, headers=self.headers, meta={'post_data': self.post_data},
callback=self.veri_captcha)
填写验证码提交表单登录:
def veri_captcha(self, response):
""" 输入验证码信息进行登录 """
with open('captcha.jpg', 'wb') as f:
f.write(response.body)
print('只有一个倒立文字则第二个位置为0')
loca1 = input('input the loca 1:')
loca2 = input('input the loca 2:')
captcha = self.location(int(loca1), int(loca2))
self.post_data = response.meta.get('post_data', {})
self.post_data['captcha'] = captcha
post_url = 'https://www.zhihu.com/login/email'
yield scrapy.FormRequest(post_url, formdata=self.post_data, headers=self.headers,
callback=self.login_success)
def location(self, a, b):
""" 将输入的位置转换为相应信息 """
if b != 0:
captcha = "{\"img_size\":[200,44],\"input_points\":[%s,%s]}" %
(str(self.capacha_index[a - 1]),
str(self.capacha_index[b - 1]))
else:
captcha = "{\"img_size\":[200,44],\"input_points\":[%s]}" % str(self.capacha_index[a - 1])
return captcha
登录是否成功:
def login_success(self, response):
if 'err' in response.text:
print(response.text)
print("error!!!!!!")
else:
print("successful!!!!!!")
yield scrapy.Request('https://www.zhihu.com', headers=self.headers, dont_filter=True)
Step 3——获取首页问题
获取第一页的问题只需要将问题URL提取出来即可,不过第一页只有10个左右的问题,
如果想提取更多的问题就需要模拟翻页以便获取问题数据
def parse(self, response):
""" 获取首页问题 """
question_urls = re.findall(r'https://www.zhihu.com/question/(\d+)', response.text)
# 翻页用到的session_token 和 authorization都可在首页源代码找到
self.session_token = re.findall(r'session_token=([0-9,a-z]{32})', response.text)[0]
auto = re.findall(r'carCompose":"(.*?)"', response.text)[0]
self.headers['authorization'] = 'Bearer ' + auto
# 首页第一页问题
for url in question_urls:
question_detail = 'https://www.zhihu.com/question/' + url
yield scrapy.Request(question_detail, headers=self.headers, callback=self.parse_question)
# 获取指定数量问题
n = 10
while n < self.question_count:
yield scrapy.Request(self.next_page.format(self.session_token, n), headers=self.headers,
callback=self.get_more_question)
n += 10
def get_more_question(self, response):
""" 获取更多首页问题 """
question_url = 'https://www.zhihu.com/question/{0}'
questions = json.loads(response.text)
for que in questions['data']:
question_id = re.findall(r'(\d+)', que['target']['question']['url'])[0]
yield scrapy.Request(question_url.format(question_id), headers=self.headers,
callback=self.parse_question)
Step 4——获取问题详情
分析问题页获取问题详情及请求问题指定范围的指定数量答案
Item结构:
class ZhihuQuestionItem(scrapy.Item):
name = scrapy.Field()
url = scrapy.Field()
keywords = scrapy.Field()
answer_count = scrapy.Field()
comment_count = scrapy.Field()
flower_count = scrapy.Field()
date_created = scrapy.Field()
获取问题详情及请求指定范围答案
def parse_question(self, response):
""" 解析问题详情及获取指定范围答案 """
text = response.text
item = ZhihuQuestionItem()
item['name'] = re.findall(r'<meta itemprop="name" content="(.*?)"', text)[0]
item['url'] = re.findall(r'<meta itemprop="url" content="(.*?)"', text)[0]
item['keywords'] = re.findall(r'<meta itemprop="keywords" content="(.*?)"', text)[0]
item['answer_count'] = re.findall(r'<meta itemprop="answerCount" content="(.*?)"', text)[0]
item['comment_count'] = re.findall(r'<meta itemprop="commentCount" content="(.*?)"', text)[0]
item['flower_count'] = re.findall(r'<meta itemprop="zhihu:followerCount"
content="(.*?)"', text)[0]
item['date_created'] = re.findall(r'<meta itemprop="dateCreated" content="(.*?)"', text)[0]
count_answer = int(item['answer_count'])
yield item
question_id = int(re.match(r'https://www.zhihu.com/question/(\d+)', response.url).group(1))
# 从指定位置开始获取指定数量答案
if count_answer > self.answer_count:
count_answer = self.answer_count
n = self.answer_offset
while n + 20 <= count_answer:
yield scrapy.Request(self.more_answer_url.format(question_id, n, n + 20),
headers=self.headers, callback=self.parse_answer)
n += 20
Step 5——获取答案
在parse_question( )中请求的指定范围答案的url返回json数据
item结构:
class ZhihuAnswerItem(scrapy.Item):
question_id = scrapy.Field()
author = scrapy.Field()
ans_url = scrapy.Field()
comment_count = scrapy.Field()
upvote_count = scrapy.Field()
excerpt = scrapy.Field()
获取答案:
def parse_answer(self, response):
""" 解析获取到的指定范围答案 """
answers = json.loads(response.text)
for ans in answers['data']:
item = ZhihuAnswerItem()
item['question_id'] = re.match(r'http://www.zhihu.com/api/v4/questions/(\d+)',
ans['question']['url']).group(1)
item['author'] = ans['author']['name']
item['ans_url'] = ans['url']
item['comment_count'] = ans['comment_count']
item['upvote_count'] = ans['voteup_count']
item['excerpt'] = ans['excerpt']
yield item
Step 6——问题及答案入库
class ZhihuPipeline(object):
def __init__(self):
self.settings = get_project_settings()
self.connect = pymysql.connect(
host=self.settings['MYSQL_HOST'],
db=self.settings['MYSQL_DBNAME'],
user=self.settings['MYSQL_USER'],
passwd=self.settings['MYSQL_PASSWD'],
charset=self.settings['MYSQL_CHARSET'],
use_unicode=True
)
self.cursor = self.connect.cursor()
def process_item(self, item, spider):
if item.__class__.__name__ == 'ZhihuQuestionItem':
sql = 'insert into Scrapy_test.zhihuQuestion(name,url,keywords,answer_count,' \
'flower_count,comment_count,date_created) values (%s,%s,%s,%s,%s,%s,%s)'
self.cursor.execute(sql, (item['name'], item['url'], item['keywords'],
item['answer_count'],item['flower_count'],
item['comment_count'], item['date_created']))
else:
sql = 'insert intoScrapy_test.zhihuAnswer(question_id,author,ans_url',\
'upvote_count,comment_count,excerpt)values (%s,%s,%s,%s,%s,%s)'
self.cursor.execute(sql, (item['question_id'], item['author'],
item['ans_url'],
item['upvote_count'],item['comment_count'], item['excerpt']))
self.connect.commit()