关于做爬虫项目的一些小杂记–pyppeteer
1.启动参数解释(待补充)
# 'headless': False 是否以"无头"模式运行(无头模式下不显示浏览器窗口),默认为 True(不显示窗口)
# 'ignoreHTTPSErrors': True 是否忽略 Https 报错信息,默认为 False
# 'dumpio': True 将浏览器进程的 stdout/stderr 输出到当前进程,可防止多开导致的假死
# args常用配置
#### '--no-sandbox' 取消沙盒模式,放开权限
#### '--disable-infobars' 不显示信息栏,比如:chrome正在受到自动测试软件的控制
#### '--window-size=1920,1080' 设置窗口大小 和"--start-maximized"参数互斥
#### "--start-maximized" 窗口最大化
#### "--proxy-server=http://127.0.0.1:80" 设置代理
#### "--user-agent=Mozilla/5.0......" 设置UA
browser = await launch(
{'headless': False, 'dumpio': True, 'autoClose': False,
'ignoreHTTPSErrors': True,
'args': ['--no-sandbox', '--disable-infobars', '--window-size=1920,1080',
]})
2.清除特定参数(webdriver),达到绕过反爬检查,从而实现爬虫
await page.evaluateOnNewDocument('''
() => {
const newProto = navigator.__proto__;
delete newProto.webdriver;
navigator.__proto__ = newProto;
}
''')
3.拖拽验证码,缺口位置确定
def get_distance(bkg, blk):
    """Locate the slider piece inside the captcha background image.

    Both images are loaded as grayscale and compared with normalized
    correlation-coefficient template matching. The images are matched
    in memory; no intermediate files are written.

    :param bkg: path of the background image (the image that is searched)
    :param blk: path of the slider/block image (the patch to look for)
    :return: x coordinate (column index) of the best match's top-left corner
    :raises FileNotFoundError: if either image cannot be read
    """
    background = cv2.imread(bkg, cv2.IMREAD_GRAYSCALE)
    if background is None:
        # cv2.imread signals failure by returning None instead of raising.
        raise FileNotFoundError('cannot read background image: %s' % bkg)
    slider = cv2.imread(blk, cv2.IMREAD_GRAYSCALE)
    if slider is None:
        raise FileNotFoundError('cannot read block image: %s' % blk)

    # Documented argument order: the searched image first, the patch second.
    result = cv2.matchTemplate(background, slider, cv2.TM_CCOEFF_NORMED)
    # The result map is indexed (row, col): the first index is y, the second x.
    row, col = np.unravel_index(result.argmax(), result.shape)
    print(row, col, sep='\n')
    print('x坐标为:%d' % (col))
    return col
if __name__ == "__main__":
    # Download the captcha background and the slider piece.
    urls = {"bkg.jpg": "https://necaptcha.nosdn.127.net/b016d6ff32fc4c7ca5d40d0869ecde33.jpg",
            "blg.jpg": "https://necaptcha.nosdn.127.net/3a39df3bb5e443798136534e19b8c4f1.png"}
    for file_name, url in urls.items():
        # Named `response` so the standard `re` module name is not shadowed.
        response = requests.request("GET", url, timeout=10)
        # Fail loudly instead of saving an HTTP error page as an image file.
        response.raise_for_status()
        with open(file_name, 'bw') as f:
            f.write(response.content)

    # Crop the slider image down to the piece itself.
    image = cv2.imread("blg.jpg", 1)  # load as a 3-channel colour image
    img = cv2.medianBlur(image, 5)  # median filter: suppress noise in the dark border
    b = cv2.threshold(img, 15, 255, cv2.THRESH_BINARY)  # tune 15 to adjust the crop
    binary_image = b[1]  # thresholded image, still 3 channels
    binary_image = cv2.cvtColor(binary_image, cv2.COLOR_BGR2GRAY)  # to one channel
    print(binary_image)
    img_height = binary_image.shape[0]
    print("高度x=", img_height)
    img_width = binary_image.shape[1]
    print("宽度y=", img_width)

    # Row and column indices of every pixel that is fully white (== 255).
    rows, cols = (binary_image == 255).nonzero()
    if rows.size == 0:
        raise ValueError("no white pixels found in blg.jpg after thresholding")

    # Bounding box of the piece. Axis 0 is vertical (rows), axis 1 is
    # horizontal (columns); the +1 keeps the last row/column inside the box.
    top = int(rows.min())
    piece_height = int(rows.max()) - top + 1
    left = int(cols.min())
    piece_width = int(cols.max()) - left + 1
    print(top, piece_height, left, piece_width, sep="\n")
    pre1_picture = image[top:top + piece_height, left:left + piece_width]
    cv2.imwrite('blg_new.jpg', pre1_picture)

    # Crop the background to the same horizontal strip the piece occupies.
    image = cv2.imread("bkg.jpg", 1)
    image_new = image[top:top + piece_height, :]
    cv2.imwrite("bkg_new.jpg", image_new)

    # Find the horizontal offset of the gap.
    distance = get_distance("bkg_new.jpg", "blg_new.jpg")
    print(distance)
4.关于破解验证码时,资源格式为(data:image/png;base64)时的解决方法
a.data:image/png;base64其实就是Data URI scheme
Data URI scheme :Data URI scheme是在RFC2397中定义的,目的是将一些小的数据,直接嵌入到网页中,从而不用再从外部文件载入,例如下面的代码
<img src="data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAADIAAAAyCAYAAAAeP4ixAAAGyklEQVR42s1ZS29bRRQOC9jAghU/AFjwO6C/ARZAFBHsvBwnsR2cB0rdPJQ6iZvYiVMpKQgWEBAlVYWaSrxFF0UIJFiwII3UHRIq0NIqadw2HeZM5oy/mTvXXPcGxZaO7vXcmTvnm/Odx8xta4vxGxkZefH8ZFrMjw+L8ZG8krnxnKC2sbGxl9pa/ZfP519em8gKCeS/5Bvq26ogiqzoeiEnrhbT4sZSn7hfPZQb5X7Vtn6yDpTGtBSI0dHRWVbuymxaPFjpFveXk+r6UIJ4uNJjhNq/O502YGhsq1iinZX6aSF1qLgUAkFyUOlSQgBI7lU61fXnuTRapv24cTzGynxdHDKKI5jdcqdSngHUziYNqCunBwwYetexoZAKnFA+cSprrT5R6E41ZWS32q4AUB8EQkL+pMGcOE5arZESV4v9CgDTiZRFIHtrHQYA91F+VEmI74tpptfacTr5Dinxx2IyoDxagq2hnJ+cHgLAn+U+dvqd46RWjZTYO9ulFL+58ppZfQTAPmOEwVBolveaWrX/RcnuxfkoiU3J7dUuywrk2AHlQSx6Vfsiz6Mtt9rsakd68aczg2b12YFpxQ/eGVJi6KTb9j8uHD7TIZnaL0xnmgLzSEAqZwZMJFJU0Mr66IOhl5Tl/iaihQAJpZ7jT7GAcEjlaIRXd3IDSFuDgShraJD8zM03+D6rKtDvprZYQFgRpg4rbEUgLEMcIKwwhVoab57p/lzOuKHZt0ixgODLLb4D3XhVGzk4AkGA7vsDQABwLCABDvMqlhPWJDg5K4vC9LSoBcWkW1z6Ilx8IPBizOA+x2SHZqe2LOj6iI+aIC6F4wFxqOMCcX2FQSAQLCYZiPI3N1K54lD2SIC4Dm0SGuQOBGLoBQ4dAOKjljMPzh3b2S0+owNDtAmNWM4YK2ph6PVEMB7PVokFhA4O5t9+S2xOZ8X2SjZQauAqojMroBwQHCBuNAqNXE4ggTLlL7h+K2WoUCg8FVbZrvrKg7XJvPhlYSBoIQeIKU9CisVGQFxKRi1l5FZgwgtGonyCRHZ6XkpSdvySB12aHrBLjJDM7ssVbmRSyso8E4iG0Ifvr5V6xe5yr+pzZ7lH7JRS4rOZjFVU5nK5Z6LsQdI86PLMUHgiAxDo2L49ibtF9uUmU6powG40vb6YFh9OZpqrkGXnVxjMj3PphqvtAvHR0Uc73/tcKrr/b1V6xEYdzFTU3aGxzINqOM+RMi5tQgtPJ8zTOyyLQbuhp57n+pmUoZl0i6cjgTmnD9u2S2lvTvA5r+UblURo4RmIjJ7SJYyyF+s+k4pqlYTaXE1lG0akgAXCgDQST4QLK3O2S/1Mr88jARkeHn6OBsyND4c6uZsw3STYSHG3xvPR1ErGus/NiqkCfo8ERIdmlTTdl5v434Bm6LCuU1MybWRRn4+Z46hmDzRcILiz81kBkxtTwxfxmHa8hwnUYh6q4dz/LKeas4js+CwNKEpqBawQkgxxJ+jb7pp+ZQ2EypwGvsLnybXFTjM3BR8N5Iuozv46DfhoKhNQhidUZ77lhAFplA/JCSbhOf/DwOMBOS9S01Hr3MnDAb9R+HVKezMJryxQBWnHzwPJUzuvWmlfLvJVzvJdO8Xu5vKIrLv6zZ5lFfwAFGMgeK8Ug6pYHS1pZX0g90sdlpXYqj6f/HspITYKA9Ezu1WiFNN2iCwnjKLK3JrjPDHuMXwAUTFqU6eWLmg9Bwtn9I1Tg9FrLSxNtmYGLc6y4r59vctn17EtIEAVasevXdcWesXt8uE8t5beUP8bVr8HlTdFMpl8XDY+KR++QFl8vZQ3Ay6VsuLeu8EwaiVBpy4yX6/K9bDKkQnDMkcfXhgu8zenBpvfj1yYGggd8MN7OVG7KPcjH/QfHtyBFdDcWDga34A2tx2/m+D7+IvXI+0QedDE
aF4syDzxyWxG/Pr+kKhdzojalhYNhq1CSrnObOWFsl2tMucVfapOwAAhEPuVjvhnv+x0ygJbGVsADCvFNQ9mZh/1MJkhPXFR0DLxzn7LdYXIJ+6eT3nB0DPMG6wIZ1ymGnOe/QWVNhGJv/5qyh7JAR1mZJWppcKk/P5XOQNE3QMYDKnmi672G14UdGS2JuUMjF4uJeMB0VEFPy9YFJMAlMh7auckhqvM/xUg7cgGCIjpw5EKn8f9rGDMDYAoUimKSQB0VcD0PT3bm2u3wNA9ivGRSh2ANU8lEbDOkQAhoQ//1goDGAZCVmEwmIHpnsDtzr5aVxDKEQyzPJfPMrGA8MS8mqygoodLMYhimCOQXnRlhfkeSxWeB616JEBaUZoCslnob0kQzX6e/hc2174fGWvDEwAAAABJRU5ErkJggkoT">
在上面的Data URI中,data表示取得数据的协定名称,image/png 是数据类型名称,base64 是数据的编码方法,逗号后面就是这个image/png文件base64编码后的数据。 目前,Data URI scheme支持的类型有:
data:,文本数据
data:text/plain,文本数据
data:text/html,HTML代码
data:text/html;base64,base64编码的HTML代码
data:text/css,CSS代码
data:text/css;base64,base64编码的CSS代码
data:text/javascript,Javascript代码
data:text/javascript;base64,base64编码的Javascript代码
data:image/gif;base64,base64编码的gif图片数据
data:image/png;base64,base64编码的png图片数据
data:image/jpeg;base64,base64编码的jpeg图片数据
data:image/x-icon;base64,base64编码的icon图片数据
b. 在爬虫过程中对以上数据如何处理呢?请看下面代码
async def get_image(self, page, path, image_name):
    """Save an element's inline (data URI) image to a file.

    :param page: page object handed on to get_attribute
    :param path: selector of the image element
    :param image_name: file that receives the decoded image bytes
    """
    import base64

    # Read the element's src attribute, e.g. "data:image/png;base64,<payload>".
    data_uri = await get_attribute(page, path, attr="src")
    # The part following the "base64," marker is the base64-encoded image
    # (an encoding, not encryption).
    payload = data_uri.split("base64,")[1]
    raw_bytes = base64.b64decode(payload.encode())
    # Write the decoded bytes in binary mode to obtain the image file.
    with open(image_name, "bw") as out:
        out.write(raw_bytes)
async def get_attribute(page, path, attr="value"):
    """Return an attribute of the element matched by ``path``, retrying on failure.

    The lookup is attempted up to ``global_wait_time`` times; after each
    failed attempt ``page.waitFor(global_interval_time)`` is awaited.

    :param page: page object to query
    :param path: selector of the target element
    :param attr: name of the attribute to read; defaults to "value"
    :return: the value produced by ``el.getAttribute(attr)``
    :raises TimeoutError: if every attempt fails
    """
    last_error = None
    # range(global_wait_time) yields exactly global_wait_time attempts, so a
    # setting of 1 still performs one lookup.
    for _ in range(global_wait_time):
        try:
            return await page.querySelectorEval(
                path, "el => el.getAttribute('{}')".format(attr))
        except Exception as err:
            # Presumably the element is not present yet: wait, then retry.
            # Catching Exception (not a bare except) lets task cancellation
            # and KeyboardInterrupt propagate.
            last_error = err
            await page.waitFor(global_interval_time)
    # Actually raise: a constructed-but-unraised exception would make the
    # function silently return None on timeout.
    raise TimeoutError(
        "path=[{}]的元素的value值未找到({}秒超时)".format(path, global_wait_time)
    ) from last_error
关于page.select 方法的简单说明
简单的页面,包含一个下拉筛选
<!DOCTYPE html>
<html>
<body>
<!-- Drop-down used by the page.select demo: three options carry a value attribute, the "Opel" option has none. -->
<select id="select_01">
<option value="1">Volvo</option>
<option value="2">Saab</option>
<option>Opel</option>
<option value="4">41</option>
</select>
</body>
</html>
# -*- coding: utf-8 -*-
# Author:zschuan
# Date:2020/9/28 17:24
from pyppeteer import launch
async def login():
    """Open the local select.html demo page and choose an option in #select_01."""
    viewport_size = {'width': 1680, 'height': 1050}
    launch_options = {
        'headless': False,
        'dumpio': True,
        'autoClose': False,
        'args': ['--no-sandbox', '--disable-infobars', '--window-size=1680,1050'],
    }
    browser = await launch(launch_options)
    page = await browser.newPage()
    await page.setViewport(viewport=viewport_size)
    # Replace this with the location of your own copy of the demo page.
    await page.goto(r"D:\Programmer\data\DataPython\zschuan\puppeteer_test\select.html")
    await page.waitFor("#select_01")
    # The "Opel" option has no value attribute, so it is chosen by its text.
    await page.select("#select_01", "Opel")
    # For an option that does have a value attribute, pass that value instead:
    # await page.select("#select_01", "4")
if __name__ == "__main__":
    import asyncio

    # asyncio.get_event_loop() is deprecated when there is no current event
    # loop, and newer Python releases raise RuntimeError in that case, so the
    # loop is created and installed explicitly.
    loop = asyncio.new_event_loop()
    asyncio.set_event_loop(loop)
    loop.run_until_complete(login())
正常的下拉中都会有value值,此时我们可以直接通过page.select(元素,value值)来实现下拉选择,但是有时候option标签中并没有value属性,此时,我们可以通过option标签之间的文本来实现下拉选择(例如上述示例中的【Opel】)
注:当option标签中既有value属性,又有文本信息时,必须填写value属性值;直接填写文本信息无法实现下拉选择