云码识别验证码的使用:
import json
import time
import requests
import base64
class YdmVerify(object):
    """Client for the yunma (api.jfbym.com) captcha-recognition HTTP API."""

    _custom_url = "http://api.jfbym.com/api/YmServer/customApi"
    # NOTE(review): hard-coded account token — move to an env var / config
    # file before sharing this code; anyone with it can spend your balance.
    _token = "vlBj9JWH5p_IwM8UPPjnrgsdXzConMqx_keexofn-VU"
    _headers = {
        'Content-Type': 'application/json'
    }

    def common_verify(self, image, verify_type="10110"):
        """Send raw image bytes to the API and return the recognized text.

        :param image: raw image bytes (NOT base64 — encoded here).
        :param verify_type: API type code selecting the captcha category.
            Digits/letters: 10110 (1-4 chars), 10111 (5-8), 10112 (9-11),
            10113 (12+), 10103 (1-6 plus), 9001 (custom 5-char qcs),
            193 (custom 4-digit).
            Chinese chars: 10114 (1-2), 10115 (3-5), 10116 (6-8),
            10117 (9+), 10107 (custom).
            Arithmetic: 50100 (numeric), 50101 (Chinese), 452 (custom cni).
        :return: recognized text from the response's ``data.data`` field.
        :raises requests.RequestException: on network failure or timeout.
        :raises KeyError: if the API response lacks the expected keys.
        """
        payload = {
            "image": base64.b64encode(image).decode(),
            "token": self._token,
            "type": verify_type
        }
        # FIX: dropped ``print(payload)`` — it dumped the secret token and
        # the whole base64 image to stdout.  Added a timeout so a stalled
        # API call cannot hang the caller forever.
        resp = requests.post(self._custom_url, headers=self._headers,
                             data=json.dumps(payload), timeout=30)
        print(resp.text)
        return resp.json()['data']['data']
def image_to_base64(image_path):
    """Read the file at *image_path* and return its raw bytes.

    NOTE: despite the name, no base64 encoding happens here — the caller
    (``YdmVerify.common_verify``) performs the encoding itself.  The name
    is kept unchanged because other scripts in this file call it.
    """
    with open(image_path, "rb") as fh:
        return fh.read()
if __name__ == '__main__':
    # Demo: read the local captcha image and print the recognized text.
    client = YdmVerify()
    captcha_path = './RandCode.gif'
    raw_bytes = image_to_base64(captcha_path)
    result = client.common_verify(image=raw_bytes)
    print(result)
古诗文网验证码识别:
#1.将验证码图片本地下载
#2.调用平台提供的示例代码进行图片数据识别
import requests
from lxml import etree
import test01
headers = {'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:123.0) Gecko/20100101 Firefox/123.0'}
url = 'https://so.gushiwen.cn/user/login.aspx?from=http://so.gushiwen.cn/user/collect.aspx'
response = requests.get(url=url,headers=headers)
page_text = response.text
#解析验证码图片img中src属性值
tree = etree.HTML(page_text)
img_url = tree.xpath('//div[@class="mainreg2"]/img[@id="imgCode"]/@src')
print(img_url)
url_2 = 'https://so.gushiwen.cn' + img_url[0]
response_2 = requests.get(url=url_2,headers=headers)
img_data = response_2.content
img_name = './RandCode.gif'
with open(img_name,'wb') as fp:
fp.write(img_data)
print("下载成功")
y = test01.YdmVerify()
image_data = test01.image_to_base64(img_name)
img_json = y.common_verify(image=image_data)
print(img_json)
模拟登录
- 爬取基于某些用户的用户信息
需求:对古诗文网进行模拟登录
需求:爬取当前用户的个人主页对应的页面数据
http/https协议特性:无状态
没有请求到对应页面数据的原因:发起的第二次基于个人主页页面请求的时候,服务器端并不知道该请求是基于登录状态下的请求。
cookie:用来让服务器端记录客户端的相关状态
创建session对象模拟登录
# 1. Download the captcha image via a persistent Session (cookies persist,
#    so the captcha we solve is the one the server associates with us).
# 2. Recognize the captcha, then POST the login form on the same session.
import requests
from lxml import etree
import test01

headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:123.0) Gecko/20100101 Firefox/123.0'}
url = 'https://so.gushiwen.cn/user/login.aspx?from=http://so.gushiwen.cn/user/collect.aspx'
session = requests.Session()
response = session.get(url=url, headers=headers)
page_text = response.text

# Extract the src attribute of the captcha <img> on the login page.
tree = etree.HTML(page_text)
img_url = tree.xpath('//div[@class="mainreg2"]/img[@id="imgCode"]/@src')
print(img_url)
url_2 = 'https://so.gushiwen.cn' + img_url[0]
response_2 = session.get(url=url_2, headers=headers)
img_data = response_2.content
img_name = './RandCode.gif'
with open(img_name, 'wb') as fp:
    fp.write(img_data)
print("下载成功")

y = test01.YdmVerify()
image_data = test01.image_to_base64(img_name)
img_code = y.common_verify(image=image_data)
print(img_code)

# FIX: __VIEWSTATE / __VIEWSTATEGENERATOR are per-page ASP.NET tokens that
# change between requests — hard-coding stale values makes the POST fail.
# Scrape them from the page just fetched; fall back to the old constants
# only if the hidden inputs are not found.
viewstate = tree.xpath('//input[@id="__VIEWSTATE"]/@value')
viewstate_gen = tree.xpath('//input[@id="__VIEWSTATEGENERATOR"]/@value')

headers2 = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:124.0) Gecko/20100101 Firefox/124.0',
    'Referer': 'https://so.gushiwen.cn/user/login.aspx?from=http://so.gushiwen.cn/user/collect.aspx',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8',
    'Content-Type': 'application/x-www-form-urlencoded',
}
# NOTE(review): plaintext credentials committed in source — move to env
# vars / a secrets store before sharing this file.
data = {
    '__VIEWSTATE': viewstate[0] if viewstate else '2rhazQ4U9Ct2OaFVX45HRmnEPgGyGqkpLGEbX0DxDwjYhYDQyzXeBe09yW27psb0hrqz8ZHjdPKsKYr2ZofphjaR1YuUJZzCvVLl1q+1fbdRjIf66thkKELNkPB76ejhs9+xzGFnQebsZ3nYMzxduwnF9Rg=',
    '__VIEWSTATEGENERATOR': viewstate_gen[0] if viewstate_gen else 'C93BE1AE',
    'from': 'http://so.gushiwen.cn/user/collect.aspx',
    'email': '17713860652',
    'pwd': 'wshjj5244273..',
    'code': img_code,
    'denglu': '登录',
}
# POST on the same session so the captcha/login cookies match.
response_3 = session.post(url=url, data=data, headers=headers2)
page_text_2 = response_3.text
with open('./zz.html', 'w', encoding='utf-8') as fp:
    fp.write(page_text_2)
print("传输成功")
代理:破解封IP这种反爬机制
什么是代理:
- 代理服务器
代理的作用:
- 突破自身IP访问的限制
- 可以隐藏自身真实的IP
代理相关的网站:
- 快代理
#
# Demo: route a request through an HTTP proxy to hide the real client IP.
import requests
import json

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:124.0) Gecko/20100101 Firefox/124.0',
}
url = 'https://qifu-api.baidubce.com/ip/local/geo/v1/district?'
# FIX: the target URL is https, but the original proxies dict mapped only
# the "http" scheme, so requests silently bypassed the proxy entirely.
# Map both schemes, and include the scheme prefix requests expects in the
# proxy URL.  A timeout is added so a dead proxy cannot hang the script.
proxies = {
    "http": "http://39.165.0.137:9002",
    "https": "http://39.165.0.137:9002",
}
page_text = requests.get(url=url, headers=headers, proxies=proxies, timeout=10).json()
print(page_text)