《scrapy打造知乎后花园一》中,完美地验证了验证码登录,似乎登录了,还不能爬取任何内容。
这篇文章来验证cookie模拟登录+验证码爬取首页内容。到现在为止,只写了myspider.py一个文件。
一、代码分析:
# -*- coding: utf-8 -*-
import scrapy,urllib,re
from scrapy.http import Request, FormRequest
from zhihu.items import *
import time
from PIL import Image
import json
import requests
class MyspiderSpider(scrapy.Spider):
    """Log in to zhihu.com with phone number + password + a manually typed
    captcha, carrying session cookies via Scrapy's cookiejar meta key, then
    scrape author/title of each feed card on the logged-in home page.

    Flow: start_requests -> captcha (grab _xsrf + captcha image)
          -> parser_captcha (show image, POST login form)
          -> after_login (check JSON result) -> parse (extract items).
    """
    name = 'myspider'
    allowed_domains = ['zhihu.com']
    start_urls = ['https://www.zhihu.com/']
    # Browser-like headers so the request is not rejected as a bot.
    # NOTE: 'Host' and 'Referer' must NOT have trailing spaces — the
    # original values did, which can make the server reject the request.
    headers_zhihu = {
        'Host': 'www.zhihu.com',
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/59.0.3071.115 Safari/537.36',
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
        'Accept-Language': 'zh-CN,zh;q=0.8,en-US;q=0.5,en;q=0.3',
        'Accept-Encoding': 'gzip,deflate,sdch',
        'Referer': 'https://www.zhihu.com',
        'If-None-Match': "FpeHbcRb4rpt_GuDL6-34nrLgGKd.gz",
        'Cache-Control': 'max-age=0',
        'Connection': 'keep-alive',
    }

    def start_requests(self):
        """Fetch the home page first so the login _xsrf token can be read.

        meta={'cookiejar': 1} tells Scrapy's CookiesMiddleware to keep all
        cookies of this login session in jar #1.
        """
        return [Request("https://www.zhihu.com/",
                        meta={'cookiejar': 1},
                        headers=self.headers_zhihu,
                        callback=self.captcha)]

    def captcha(self, response):
        """Extract the _xsrf token and request a fresh captcha image.

        The millisecond timestamp in the URL defeats caching so a new
        captcha is generated for every attempt.
        """
        xsrf = response.xpath('//input[@name="_xsrf"]/@value').extract()[0]
        t = str(int(time.time() * 1000))
        captcha_url = 'https://www.zhihu.com/captcha.gif?r=' + t + '&type=login&lang=en'
        return [Request(captcha_url,
                        callback=self.parser_captcha,
                        meta={'cookiejar': response.meta['cookiejar'],
                              'xsrf': xsrf})]

    def parser_captcha(self, response):
        """Save/show the captcha, read it from the console, POST the login form."""
        # The `with` block closes the file itself; the original's explicit
        # f.close() inside it was redundant and has been dropped.
        with open('captcha.jpg', 'wb') as f:
            f.write(response.body)
        im = Image.open('captcha.jpg')
        im.show()
        im.close()
        captcha = raw_input("请输入验证码:")
        xsrf = response.meta['xsrf']
        return FormRequest(
            'https://www.zhihu.com/login/phone_num',
            method='POST',
            meta={'cookiejar': response.meta['cookiejar']},
            callback=self.after_login,
            dont_filter=True,
            headers=self.headers_zhihu,
            formdata={
                'phone_num': '138*******',  # TODO: fill in a real account
                'password': '******',       # TODO: fill in the real password
                '_xsrf': xsrf,
                'captcha_type': 'en',
                'captcha': captcha,
            })

    def after_login(self, response):
        """Check the login JSON result; on success (r == 0) re-request the
        home page with the session cookies and hand it to parse()."""
        json_file = json.loads(response.text)
        print(json_file)
        if json_file['r'] == 0:
            print('登录成功.....开始爬了。。。。')
            yield Request(
                'http://www.zhihu.com',
                meta={'cookiejar': response.meta['cookiejar']},
                headers=self.headers_zhihu,
                callback=self.parse,
                dont_filter=True,  # home page was already visited; don't dedupe it away
            )
        else:
            print('登录失败!')

    def parse(self, response):
        """Print author and title for every feed card on the home page."""
        leirong = response.xpath('//div[@class="Card TopstoryItem"]')
        for lr in leirong:
            author = lr.xpath('.//div[@class="AuthorInfo-content"]/div[1]/span/div/div/a/text()').extract()[0]
            title = lr.xpath('.//div[@class="ContentItem AnswerItem"]/h2/div/a/text()').extract()[0]
            print("作者:%s" % author.encode('utf-8'))
            print("标题:%s" % title.encode('utf-8'))
二、代码验证:
1.登录后,知乎首页内容。
2.运行代码:[root@master spiders]# scrapy crawl myspider 手动输入验证码。
3.运行结果,可以看到爬取内容正是首页的内容。只能爬取5条记录????
终于爬到内容了,知乎后花园的大门已经打开,要把它变成自己后花园,还远远不够。。。。。