模拟登陆+代理IP

最新推荐文章于 2021-03-27 17:14:11 发布

weixin_44493666

最新推荐文章于 2021-03-27 17:14:11 发布

阅读量334

点赞数

分类专栏： python爬虫文章标签： python

本文链接：https://blog.csdn.net/weixin_44493666/article/details/108388572

版权

python爬虫专栏收录该内容

4 篇文章 0 订阅

订阅专栏

模拟登陆：
-爬取基于某些用户的用户信息。
需求：对人人网进行模拟登录。
-点击登录按钮后会发起一个post请求
-post请求中会携带登录之前的相关登录数据（用户名，密码，验证码…）
-验证码：每次请求都会变化
#编码流程：
1.验证码的识别，获取验证码图片的文字数据
2.对post请求进行发送（处理请求参数）
3.对响应数据进行持久化存储

'''
#编码流程：
	1.验证码的识别，获取验证码图片的文字数据
	2.对post请求进行发送（处理请求参数）
	3.对响应数据进行持久化存储
'''

import requests
from lxml import etree
import json
import numpy as np
import time

import yanzheng_1 as YZ


#封装识别验证码图片的函数
def getCodetext(imgPath,codeType):
    """Recognise a captcha image file through the ``YZ.FateadmApi`` client.

    Args:
        imgPath: Path of the captcha image, handed to ``PredictFromFile``.
        codeType: Recognition type code, passed through unchanged. Per the
            vendor notes, valid codes are listed on the service's price page.

    Returns:
        The ``pred_rsp.value`` attribute of the ``PredictFromFile`` response.
    """
    # pd_* can be looked up on the user-centre page; app_* is the developer
    # revenue-share account shown in the developer centre.
    pd_id, pd_key = "125806", "kzojlpQERHpH4S7aFpP0llCJaqSOqgCq"
    app_id, app_key = "325806", "XT5hk2ONzXlbp8/msasuDhof7IxaMZ4h"
    client = YZ.FateadmApi(app_id, app_key, pd_id, pd_key)

    # Balance query kept from the vendor sample; its result is not used here.
    client.QueryBalcExtend()

    # File-based recognition returning the detailed response object.
    # When several site types are involved an extra src_url argument is
    # needed, see the API docs: http://docs.fateadm.com/web/#/1?page_id=6
    # (For non-file input the vendor sample calls Predict(pred_type, data).)
    reply = client.PredictFromFile(codeType, imgPath)

    # Refund hook, disabled by default. Per the vendor notes it is meant only
    # for orders that were recognised successfully but rejected by the target
    # site; abusing it may get the account banned.
    refund_rejected_order = False
    if refund_rejected_order and reply.ret_code == 0:
        client.Justice(reply.request_id)

    return reply.pred_rsp.value



if __name__ == '__main__':
    # Simulated renren.com login without any cookie handling:
    # download the captcha, have it recognised, then POST the login form.

    # 1. Capture the captcha image shown on the login page.
    url = 'http://www.renren.com/SysHome.do'
    headers = {
        'User-Agent' : 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:80.0) Gecko/20100101 Firefox/80.0'
    }
    page_text = requests.get(url=url,headers=headers).text

    tree = etree.HTML(page_text)
    code_img_src = tree.xpath('//*[@id="verifyPic_login"]/@src')[0]
    code_img_data = requests.get(url=code_img_src,headers=headers).content
    code_img_path = 'F:/python_test/requests_test/python3/code.jpg'
    with open(code_img_path,'wb') as fp:
        fp.write(code_img_data)

    # 2. Recognise the captcha with the captcha-solving helper.
    code_text = getCodetext(code_img_path,'30600')
    print('识别结果为：',code_text)

    # 3. Send the login POST request.
    # NOTE(review): account data is hard-coded; keep real credentials out of
    # source control.
    login_url = 'http://www.renren.com/ajaxLogin/login?1=1&uniqueTimestamp=2020841713498'
    data = {
        "email":"17345717747",
        "icode":code_text,
        "origURL":"http://www.renren.com/home",
        "domain":"renren.com",
        "key_id":"1",
        "captcha_type":"web_login",
        "password":"1e6a6182f90ab16ebebc86df70e8c7c66cbe4e9d6ba0b7d0217b48f685a75ba8",
        "rkey":"f3b3f581ea74a37be4af0ce1d0a62e2b",
        "f":"http%3A%2F%2Fwww.renren.com%2F975048900%2Fnewsfeed%2Fphoto"
    }
    response = requests.post(url=login_url,headers=headers,data=data)
    print(response.status_code)
    # response.json is a method: it has to be called, otherwise only the
    # bound-method repr is printed. Fall back to the raw text when the body
    # is not valid JSON (requests raises a ValueError subclass in that case).
    try:
        print(response.json())
    except ValueError:
        print(response.text)

需求：爬取当前用户的相关的用户信息（个人主页中显示的用户信息）

http/https协议特性：无状态。
直接对用户页面发起get请求，没有请求到页面数据的原因：
发起的第二次基于个人主页页面请求的时候，服务器并不知道该请求是基于登录状态下的请求
cookie：用来让服务器端记录客户端的相关状态
-手动处理：通过抓包工具获取cookie值，将该值封装到headers中。(不建议)
-自动处理：
-cookie值的来源是哪里？
-模拟登录post请求后，由服务器端创建。
session会话对象：
-作用：
1.可以进行请求的发送。
2.如果请求过程中产生了cookie，则该cookie会被自动存储/携带在该session对象中
-创建session对象：session = requests.Session()
-使用session对象进行模拟登录post请求的发送（cookie就会被存储在session中）
-session对象对个人主页对应的get请求进行发送（携带了cookie）

'''
#编码流程：
	1.验证码的识别，获取验证码图片的文字数据
	2.对post请求进行发送（处理请求参数）
	3.对响应数据进行持久化存储
'''

import requests
from lxml import etree
import json
import numpy as np
import time

import yanzheng_1 as YZ


#封装识别验证码图片的函数
def getCodetext(imgPath,codeType):
    """Recognise a captcha image file through the ``YZ.FateadmApi`` client.

    Args:
        imgPath: Path of the captcha image, handed to ``PredictFromFile``.
        codeType: Recognition type code, passed through unchanged. Per the
            vendor notes, valid codes are listed on the service's price page.

    Returns:
        The ``pred_rsp.value`` attribute of the ``PredictFromFile`` response.
    """
    # pd_* can be looked up on the user-centre page; app_* is the developer
    # revenue-share account shown in the developer centre.
    pd_id, pd_key = "125806", "kzojlpQERHpH4S7aFpP0llCJaqSOqgCq"
    app_id, app_key = "325806", "XT5hk2ONzXlbp8/msasuDhof7IxaMZ4h"
    client = YZ.FateadmApi(app_id, app_key, pd_id, pd_key)

    # Balance query kept from the vendor sample; its result is not used here.
    client.QueryBalcExtend()

    # File-based recognition returning the detailed response object.
    # When several site types are involved an extra src_url argument is
    # needed, see the API docs: http://docs.fateadm.com/web/#/1?page_id=6
    # (For non-file input the vendor sample calls Predict(pred_type, data).)
    reply = client.PredictFromFile(codeType, imgPath)

    # Refund hook, disabled by default. Per the vendor notes it is meant only
    # for orders that were recognised successfully but rejected by the target
    # site; abusing it may get the account banned.
    refund_rejected_order = False
    if refund_rejected_order and reply.ret_code == 0:
        client.Justice(reply.request_id)

    return reply.pred_rsp.value



if __name__ == '__main__':
    # Simulated renren.com login followed by a profile-page fetch. Every
    # request goes through one Session so cookies set by any response are
    # stored and sent automatically with the following requests.
    session = requests.session()

    # 1. Capture the captcha image shown on the login page.
    url = 'http://www.renren.com/SysHome.do'
    headers = {
        'User-Agent' : 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:80.0) Gecko/20100101 Firefox/80.0'
    }
    # Fetch the login page and the captcha through the session as well:
    # with bare requests.get() any cookies set by these two responses were
    # discarded, so the login POST could not be associated with the captcha
    # that was actually served (presumably the server ties the captcha to a
    # cookie -- TODO confirm against the site).
    page_text = session.get(url=url,headers=headers).text

    tree = etree.HTML(page_text)
    code_img_src = tree.xpath('//*[@id="verifyPic_login"]/@src')[0]
    code_img_data = session.get(url=code_img_src,headers=headers).content
    code_img_path = 'F:/python_test/requests_test/python3/code.jpg'
    with open(code_img_path,'wb') as fp:
        fp.write(code_img_data)

    # 2. Recognise the captcha with the captcha-solving helper.
    code_text = getCodetext(code_img_path,'30600')
    print('识别结果为：',code_text)

    # 3. Send the login POST request through the session.
    # NOTE(review): account data is hard-coded; keep real credentials out of
    # source control.
    login_url = 'http://www.renren.com/ajaxLogin/login?1=1&uniqueTimestamp=2020841713498'
    data = {
        "email":"17345717747",
        "icode":code_text,
        "origURL":"http://www.renren.com/home",
        "domain":"renren.com",
        "key_id":"1",
        "captcha_type":"web_login",
        "password":"1e6a6182f90ab16ebebc86df70e8c7c66cbe4e9d6ba0b7d0217b48f685a75ba8",
        "rkey":"f3b3f581ea74a37be4af0ce1d0a62e2b",
        "f":"http%3A%2F%2Fwww.renren.com%2F975048900%2Fnewsfeed%2Fphoto"
    }
    response = session.post(url=login_url,headers=headers,data=data)
    print(response.status_code)

    # 4. Fetch the logged-in user's profile page with the same session.
    # (Manual alternative, not recommended: copy a captured cookie string
    # into headers['cookie'] instead of relying on the session.)
    detail_url = 'http://www.renren.com/975048900/profile'
    detail_page_text = session.get(url=detail_url,headers=headers).text
    with open('F:/python_test/requests_test/liber.html','w',encoding='utf-8') as fp:
        fp.write(detail_page_text)

代理：破解封IP这种反爬机制。
什么是代理：
-代理服务器。
代理的作用：
-突破自身IP访问的限制。
-可以隐藏自身真实IP
代理相关的网站：
-快代理
-西祠代理
-www.goubanjia.com
代理IP的类型：
-http：应用到http协议对应的url中
-https：应用到https协议对应的url中
代理IP的匿名度：
-透明：服务器知道该次请求使用了代理，也知道请求对应的真实IP
-匿名：知道使用了代理，不知道真实IP
-高匿名：不知道使用了代理，不知道真实IP

#需求
import requests
# Request a "what is my IP" search page through a proxy and save the HTML.
url = 'https://www.baidu.com/s?wd=ip'
headers = {
    'User-Agent' : 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:80.0) Gecko/20100101 Firefox/80.0'
}
# NOTE(review): no port is given; proxy lists normally publish "host:port",
# so append the port here if the proxy does not listen on the default one.
IP = '27.189.162.153'
# The dict KEY selects which target-URL scheme the proxy is used for; the
# scheme inside the VALUE is how requests talks to the proxy itself. The
# original used 'https://' there, which makes current urllib3 open a TLS
# connection to the proxy; 'http://' is used on the assumption that this is
# an ordinary HTTP proxy -- TODO confirm for the proxy in use.
proxy_url = 'http://'+IP
proxies = {
    "https":proxy_url,
    "http":proxy_url
}
# Free proxies are often dead: bound the wait so the script cannot hang.
page_text = requests.get(url=url,headers=headers,proxies=proxies,timeout=10).text
with open('F:/python_test/requests_test/ip.html','w',encoding='utf-8')as fp:
    fp.write(page_text)

#反爬机制：封IP
#反反爬策略：使用代理进行请求发送

weixin_44493666

关注

0
点赞
踩
0

收藏

觉得还不错? 一键收藏
0
评论
模拟登陆+代理IP

模拟登陆：-爬取基于某些用户的用户信息。需求：对人人网进行模拟登录。-点击登录按钮后会发起一个post请求-post请求中会携带登录之前的相关登录数据（用户名，密码，验证码…）-验证码：每次请求都会变化#编码流程：1.验证码的识别，获取验证码图片的文字数据2.对post请求进行发送（处理请求参数）3.对响应数据进行持久化存储'''#编码流程： 1.验证码的识别，获取验证码图片的文字数据 2.对post请求进行发送（处理请求参数） 3.对响应数据进行持久化存储'''impor
复制链接

扫一扫