《scrapy打造知乎后花园一》中,完美地验证了验证码登录,似乎登录了,还不能爬取任何内容。
这篇文章来验证cookie模拟登录+验证码爬取首页内容。到现在为止,只写了myspider.py一个文件。
一、代码分析:
# -*- coding: utf-8 -*-
import scrapy,urllib,re
from scrapy.http import Request, FormRequest
from zhihu.items import *
import time
from PIL import Image
import json
import requests
class MyspiderSpider(scrapy.Spider):
    """Log in to zhihu.com with phone number + password + a manually typed
    captcha, carrying session cookies via Scrapy's cookiejar meta key, then
    scrape author/title of each feed card on the logged-in home page.

    Flow: start_requests -> captcha (grab _xsrf + captcha image)
          -> parser_captcha (show image, POST login form)
          -> after_login (check JSON result) -> parse (extract items).
    """
    name = 'myspider'
    allowed_domains = ['zhihu.com']
    start_urls = ['https://www.zhihu.com/']
    # Browser-like headers so the request is not rejected as a bot.
    # NOTE: 'Host' and 'Referer' must NOT have trailing spaces — the
    # original values did, which can make the server reject the request.
    headers_zhihu = {
        'Host': 'www.zhihu.com',
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/59.0.3071.115 Safari/537.36',
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
        'Accept-Language': 'zh-CN,zh;q=0.8,en-US;q=0.5,en;q=0.3',
        'Accept-Encoding': 'gzip,deflate,sdch',
        'Referer': 'https://www.zhihu.com',
        'If-None-Match': "FpeHbcRb4rpt_GuDL6-34nrLgGKd.gz",
        'Cache-Control': 'max-age=0',
        'Connection': 'keep-alive',
    }

    def start_requests(self):
        """Fetch the home page first so the login _xsrf token can be read.

        meta={'cookiejar': 1} tells Scrapy's CookiesMiddleware to keep all
        cookies of this login session in jar #1.
        """
        return [Request("https://www.zhihu.com/",
                        meta={'cookiejar': 1},
                        headers=self.headers_zhihu,
                        callback=self.captcha)]

    def captcha(self, response):
        """Extract the _xsrf token and request a fresh captcha image.

        The millisecond timestamp in the URL defeats caching so a new
        captcha is generated for every attempt.
        """
        xsrf = response.xpath('//input[@name="_xsrf"]/@value').extract()[0]
        t = str(int(time.time() * 1000))
        captcha_url = 'https://www.zhihu.com/captcha.gif?r=' + t + '&type=login&lang=en'
        return [Request(captcha_url,
                        callback=self.parser_captcha,
                        meta={'cookiejar': response.meta['cookiejar'],
                              'xsrf': xsrf})]

    def parser_captcha(self, response):
        """Save/show the captcha, read it from the console, POST the login form."""
        # The `with` block closes the file itself; the original's explicit
        # f.close() inside it was redundant and has been dropped.
        with open('captcha.jpg', 'wb') as f:
            f.write(response.body)
        im = Image.open('captcha.jpg')
        im.show()
        im.close()
        captcha = raw_input("请输入验证码:")
        xsrf = response.meta['xsrf']
        return FormRequest(
            'https://www.zhihu.com/login/phone_num',
            method='POST',
            meta={'cookiejar': response.meta['cookiejar']},
            callback=self.after_login,
            dont_filter=True,
            headers=self.headers_zhihu,
            formdata={
                'phone_num': '138*******',  # TODO: fill in a real account
                'password': '******',       # TODO: fill in the real password
                '_xsrf': xsrf,
                'captcha_type': 'en',
                'captcha': captcha,
            })

    def after_login(self, response):
        """Check the login JSON result; on success (r == 0) re-request the
        home page with the session cookies and hand it to parse()."""
        json_file = json.loads(response.text)
        print(json_file)
        if json_file['r'] == 0:
            print('登录成功.....开始爬了。。。。')
            yield Request(
                'http://www.zhihu.com',
                meta={'cookiejar': response.meta['cookiejar']},
                headers=self.headers_zhihu,
                callback=self.parse,
                dont_filter=True,  # home page was already visited; don't dedupe it away
            )
        else:
            print('登录失败!')

    def parse(self, response):
        """Print author and title for every feed card on the home page."""
        leirong = response.xpath('//div[@class="Card TopstoryItem"]')
        for lr in leirong:
            author = lr.xpath('.//div[@class="AuthorInfo-content"]/div[1]/span/div/div/a/text()').extract()[0]
            title = lr.xpath('.//div[@class="ContentItem AnswerItem"]/h2/div/a/text()').extract()[0]
            print("作者:%s" % author.encode('utf-8'))
            print("标题:%s" % title.encode('utf-8'))
二、代码验证:
1.登录后,知乎首页内容。
2.运行代码:[root@master spiders]# scrapy crawl myspider 手动输入验证码。
3.运行结果,可以看到爬取内容正是首页的内容。只能爬取5条记录????
终于爬到内容了,知乎后花园的大门已经打开,要把它变成自己后花园,还远远不够。。。。。