# Part 1: simulate a Zhihu login with requests
# -*- coding: utf-8 -*-
# author : seven
# time : 2017/7/21
import requests
import re
from bs4 import BeautifulSoup

try:
    import http.cookiejar as cookielib  # python3
except ImportError:
    import cookielib  # python2

session = requests.session()
# Persist cookies to disk so a later run can skip the login step entirely.
session.cookies = cookielib.LWPCookieJar(filename='cookies.txt')
try:
    session.cookies.load(ignore_discard=True)
except Exception:
    print('cookie 未能加载')

userAgent = 'Mozilla/5.0 (iPhone; CPU iPhone OS 9_1 like Mac OS X) AppleWebKit/601.1.46 (KHTML, like Gecko) Version/9.0 Mobile/13B143 Safari/601.1'
header = {
    'HOST': 'www.zhihu.com',
    'Referer': 'https://www.zhihu.com',
    'User-Agent': userAgent
}


def is_login():
    """Check login state by visiting a page that requires authentication.

    Returns True when the server answers 200 (logged in), False on any
    other status (302 redirect to the login page means not logged in).
    """
    need_login_visit_url = 'https://www.zhihu.com/settings/profile'
    # allow_redirects=False is essential: if we followed the 302 to the
    # login page, the final status would be 200 and we would always
    # (wrongly) conclude we are logged in.
    response = session.get(need_login_visit_url, headers=header, allow_redirects=False)
    return response.status_code == 200


def get_xsrf():
    """Fetch the sign-in page and extract the anti-CSRF ``_xsrf`` token.

    The User-Agent header must be set: some sites reject requests
    without one to block crawlers. Returns '' when the token is absent.
    """
    response = session.get('https://www.zhihu.com/#signin', headers=header)
    soup = BeautifulSoup(response.content, 'html.parser', from_encoding='utf-8')
    # NOTE: a plain re.match('.*name="_xsrf" value="(.*?)"', ...) fails
    # here because the pattern must span newlines — it needs re.DOTALL
    # (see ZhihuSpider.login below); BeautifulSoup avoids the issue.
    xsrf_nodes = soup.select('input[name="_xsrf"]')
    if not xsrf_nodes:
        return ''
    return xsrf_nodes[0].get('value', '')


def get_index():
    """Download the Zhihu front page and save it to index_page.html."""
    response = session.get('https://www.zhihu.com', headers=header)
    with open('index_page.html', 'wb') as f:
        # The file is opened in binary mode, so encode the text as UTF-8.
        f.write(response.text.encode('utf-8'))
    print('ok')


def zhihu_login(account, password):
    """Log in to Zhihu with either a phone number or an email address.

    ``account`` strings of exactly 11 digits starting with 1 are treated
    as Chinese mobile numbers; anything else is treated as an email.
    On success the session cookies are saved to cookies.txt.
    """
    # The trailing $ anchors the match so an email that merely STARTS
    # with 11 digits (e.g. '12345678901@x.com') is not misclassified.
    if re.match(r'1\d{10}$', account):
        # phone login
        post_url = 'http://www.zhihu.com/login/phone_num'
        post_data = {
            '_xsrf': get_xsrf(),
            'phone_num': account,
            'password': password
        }
    else:
        # email login — the form field must be 'email' here, not
        # 'phone_num' (the original code reused the wrong key).
        post_url = 'http://www.zhihu.com/login/email'
        post_data = {
            '_xsrf': get_xsrf(),
            'email': account,
            'password': password
        }
    session.post(post_url, data=post_data, headers=header)
    session.cookies.save()  # persist the cookie for later runs


if __name__ == '__main__':
    if not is_login():
        zhihu_login('xxxxx', 'xxxxx.')
    else:
        # Already logged in: the saved cookie lets us visit any page.
        get_index()


# Part 2: simulate the login with scrapy
# 1. create the project: scrapy genspider zhihu www.zhihu.com
# 2. code under the spider folder:
# -*- coding: utf-8 -*-
import scrapy
import json
import time
from PIL import Image


class ZhihuSpider(scrapy.Spider):
    """Scrapy spider that logs in to Zhihu (xsrf token + captcha flow)."""

    name = 'zhihu'
    allowed_domains = ['www.zhihu.com']
    start_urls = ['http://www.zhihu.com/']
    userAgent = 'Mozilla/5.0 (iPhone; CPU iPhone OS 9_1 like Mac OS X) AppleWebKit/601.1.46 (KHTML, like Gecko) Version/9.0 Mobile/13B143 Safari/601.1'
    headers = {
        'HOST': 'www.zhihu.com',
        'Referer': 'https://www.zhihu.com',
        'User-Agent': userAgent
    }

    def parse(self, response):
        """Default callback for pages visited after a successful login."""
        print(response.body)

    def parse_detail(self, response):
        """Placeholder for per-page detail extraction."""
        pass

    def start_requests(self):
        """Entry point (overridden): fetch the sign-in page first.

        Every scrapy Request needs an explicit callback because all
        requests are asynchronous.
        """
        return [scrapy.Request('https://www.zhihu.com/#signin',
                               headers=self.headers,
                               callback=self.login)]

    def login(self, response):
        """Extract the _xsrf token, then request a fresh captcha image."""
        # re.DOTALL makes '.' match newlines too; without it the pattern
        # only matches within the first line of the response.
        match_obj = re.match('.*name="_xsrf" value="(.*?)"', response.text, re.DOTALL)
        if match_obj:
            xsrf = match_obj.group(1)
            # Millisecond timestamp busts any caching of the captcha URL.
            t = str(int(time.time() * 1000))
            captcha_url = "http://www.zhihu.com/captcha.gif?r={0}&type=login".format(t)
            yield scrapy.Request(captcha_url,
                                 headers=self.headers,
                                 meta={'xsrf': xsrf},  # pass the token along
                                 callback=self.login_after_captcha,
                                 dont_filter=True)

    def login_after_captcha(self, response):
        """Save/show the captcha, read it from stdin, then POST the login form."""
        captcha_name = 'captcha.jpg'
        with open(captcha_name, 'wb') as f:
            f.write(response.body)
        try:
            im = Image.open(captcha_name)
            im.show()
            im.close()
        except Exception:
            print('打开图片出错')
        captcha = input("请输入验证码:")
        xsrf = response.meta.get('xsrf', '')
        # FormRequest performs the form submission. For simplicity only
        # the phone login is implemented here.
        return [scrapy.FormRequest(
            url='http://www.zhihu.com/login/phone_num',
            formdata={
                '_xsrf': xsrf,
                'phone_num': 'sdsd',
                'password': 'asdasads.',
                'captcha': captcha
            },
            headers=self.headers,
            callback=self.check_login,  # every async scrapy Request needs a callback
            # By default scrapy filters out URLs not in allowed_domains;
            # dont_filter=True disables that filtering for this request.
            dont_filter=True
        )]

    def check_login(self, response):
        """Post-login callback.

        On success, re-issue the start URLs; scrapy carries the session
        cookie automatically, so no manual cookie handling is needed.
        """
        response_text = response.text
        # The server answers with a JSON string; parse it.
        text_json = json.loads(response_text)
        if 'msg' in text_json and text_json['msg'] == '登录成功':
            for url in self.start_urls:
                # No callback given: the default self.parse is used.
                yield scrapy.Request(url, dont_filter=True, headers=self.headers)
        else:
            print('error %s' % response_text)
# Article title: simulated login with scrapy's basic template, and simulated login with requests
# Latest recommended article published 2023-07-23 22:25:04