scrapy的basic模板模拟登录、requests模拟登录

#一、通过requests模拟知乎登录

# -*- coding: utf-8 -*-
# author : seven
# time : 2017/7/21
import requests
import re
from bs4 import BeautifulSoup

# Python 3 renamed cookielib to http.cookiejar; fall back for Python 2.
try:
    import http.cookiejar as cookielib  # python3
except:
    import cookielib  # python2

# One shared session so every request reuses the same cookies (login state).
session = requests.session()
# LWPCookieJar can persist cookies to disk, letting a login survive restarts.
session.cookies = cookielib.LWPCookieJar(filename='cookies.txt')

try:
    # Load previously saved cookies; ignore_discard also keeps session cookies.
    session.cookies.load(ignore_discard=True)
except:
    print('cookie 未能加载')  # no saved cookie file yet -- a fresh login will be needed

userAgent = 'Mozilla/5.0 (iPhone; CPU iPhone OS 9_1 like Mac OS X) AppleWebKit/601.1.46 (KHTML, like Gecko) Version/9.0 Mobile/13B143 Safari/601.1'
# Default headers sent with every Zhihu request; the site validates User-Agent.
header = {
    'HOST': 'www.zhihu.com',
    'Referer': 'https://www.zhihu.com',
    'User-Agent': userAgent
}


def is_login():
    """Report whether the shared session is currently logged in.

    Hits a page that requires authentication: Zhihu answers 200 when
    logged in and 302 otherwise.  allow_redirects=False keeps the raw
    status code visible -- following the redirect would end in a 200
    either way and make every session look logged in.
    """
    profile_url = 'https://www.zhihu.com/settings/profile'
    resp = session.get(profile_url, headers=header, allow_redirects=False)
    return resp.status_code == 200


def get_xsrf():
    """Fetch Zhihu's anti-CSRF token (the hidden ``_xsrf`` form field).

    The request must carry a browser User-Agent -- some sites reject
    requests without one as crawlers.

    Returns:
        str: the token value, or '' when the hidden input is not present
        (instead of crashing with IndexError on an empty selection).
    """
    response = session.get('https://www.zhihu.com/#signin', headers=header)
    soup = BeautifulSoup(response.content, 'html.parser', from_encoding='utf-8')
    nodes = soup.select('input[name="_xsrf"]')
    if not nodes:
        return ''
    # NOTE: the previous regex attempt re.match('.*name="_xsrf" value="(.*?)"',
    # response.text) failed because '.' does not cross newlines by default;
    # it needs re.DOTALL (as the scrapy spider below demonstrates).
    return nodes[0].get('value', '')


def get_index():
    """Download the Zhihu home page and dump it to index_page.html."""
    resp = session.get('https://www.zhihu.com', headers=header)
    # Binary mode + explicit encode: the file is always written as UTF-8,
    # independent of the platform's default text encoding.
    with open('index_page.html', 'wb') as out:
        out.write(resp.text.encode('utf-8'))
    print('ok')


def zhihu_login(account, password):
    """Log in to Zhihu with either a phone number or an email address.

    An account matching 11 digits starting with '1' is treated as a
    Chinese mobile number; anything else goes through the email endpoint.
    On completion the cookies are saved to disk so later runs can skip
    the login step.

    Args:
        account: phone number or email address.
        password: account password.
    """
    if re.match(r'1\d{10}', account):
        # phone login
        post_url = 'http://www.zhihu.com/login/phone_num'
        post_data = {
            '_xsrf': get_xsrf(),
            'phone_num': account,
            'password': password
        }
    else:
        # email login -- BUG FIX: this form field must be 'email', not
        # 'phone_num', or the server never sees the email credential.
        post_url = 'http://www.zhihu.com/login/email'
        post_data = {
            '_xsrf': get_xsrf(),
            'email': account,
            'password': password
        }

    session.post(post_url, data=post_data, headers=header)
    session.cookies.save()  # persist cookies for future sessions


if __name__ == '__main__':
    if not is_login():
        zhihu_login('xxxxx','xxxxx.')
    else:
        # Already logged in: requests on the shared session automatically
        # carry the saved cookies, so other pages can be fetched directly.
        get_index()




#二、scrapy模拟登录
#1、在工程内创建spider:scrapy genspider zhihu www.zhihu.com
#2、spider文件夹下代码:
# -*- coding: utf-8 -*-
import scrapy
import re
import json
from PIL import Image


class ZhihuSpider(scrapy.Spider):
    """Spider that logs in to Zhihu (xsrf token + captcha) before crawling."""

    name = 'zhihu'
    allowed_domains = ['www.zhihu.com']
    start_urls = ['http://www.zhihu.com/']

    userAgent = 'Mozilla/5.0 (iPhone; CPU iPhone OS 9_1 like Mac OS X) AppleWebKit/601.1.46 (KHTML, like Gecko) Version/9.0 Mobile/13B143 Safari/601.1'
    headers = {
        'HOST': 'www.zhihu.com',
        'Referer': 'https://www.zhihu.com',
        'User-Agent': userAgent
    }

    def parse(self, response):
        """Handle pages fetched after a successful login."""
        print(response.body)

    def parse_detail(self, response):
        """Extract detailed data from a single page (not implemented yet)."""
        pass

    def start_requests(self):
        """Entry point: fetch the sign-in page first instead of start_urls.

        Every scrapy Request needs an explicit callback because requests
        are scheduled asynchronously.
        """
        return [scrapy.Request('https://www.zhihu.com/#signin', headers=self.headers,
                               callback=self.login)]

    def login(self, response):
        """Extract the _xsrf token, then request a fresh captcha image."""
        # re.DOTALL makes '.' match newlines; without it the pattern only
        # sees the first line of the page and never finds the token.
        match_obj = re.match('.*name="_xsrf" value="(.*?)"', response.text, re.DOTALL)
        if match_obj:
            xsrf = match_obj.group(1)

            import time
            # Millisecond timestamp in the query string forces a fresh captcha.
            t = str(int(time.time() * 1000))
            captcha_url = "http://www.zhihu.com/captcha.gif?r={0}&type=login".format(t)

            # Carry the token via meta so the next callback can reuse it.
            yield scrapy.Request(captcha_url, headers=self.headers, meta={'xsrf': xsrf},
                                 callback=self.login_after_captcha, dont_filter=True)

    def login_after_captcha(self, response):
        """Save and show the captcha, read it from stdin, then submit the form."""
        captcha_name = 'captcha.jpg'
        with open(captcha_name, 'wb') as f:
            f.write(response.body)
        try:
            im = Image.open(captcha_name)
            im.show()
            im.close()
        except Exception:
            # Best effort only: the user can still open captcha.jpg manually.
            print('打开图片出错')

        captcha = input("请输入验证码:")

        xsrf = response.meta.get('xsrf', '')
        return [scrapy.FormRequest(
            # Only phone login is implemented here for simplicity.
            url='http://www.zhihu.com/login/phone_num',
            formdata={
                '_xsrf': xsrf,
                'phone_num': 'sdsd',
                'password': 'asdasads.',
                'captcha': captcha
            },
            headers=self.headers,
            callback=self.check_login,
            # dont_filter=True bypasses scrapy's duplicate-request filter so
            # this POST is never dropped as "already seen" (it does NOT
            # relate to allowed_domains filtering).
            dont_filter=True
        )]

    def check_login(self, response):
        """Verify the login reply; on success start crawling start_urls.

        Scrapy's cookie middleware attaches the session cookies to every
        later request automatically -- no manual cookie handling needed.
        """
        response_text = response.text
        text_json = json.loads(response_text)  # server replies with a JSON string
        if 'msg' in text_json and text_json['msg'] == '登录成功':
            for url in self.start_urls:
                # No callback given -> scrapy defaults to self.parse.
                yield scrapy.Request(url, dont_filter=True, headers=self.headers)
        else:
            print('error %s' % response_text)
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值