微软“最强自动化工具”playwright实战项目
近期微软推出了一款号称“最强”的自动化测试工具,网上便出现了大批讲解文章,一看全是github上的readme.md的翻译。估计是来赚那点文章费的。。。
“只要和网络扯上关系,就能用来做爬虫。”
playwright相比selenium支持异步,相比pyppeteer背后有微软在维护,并且支持录制操作。唯一的缺点是国内好像没有详细的文档,只有官方的英文文档(这让不会英文的我苦逼了T。T)。但是“我爱学习”,用一个以前的工作项目来熟悉这个工具吧 ^。*
项目介绍
有谱么
爬取有谱么网站上吉他和尤克里里的每首谱子并保存为png格式,但谱子是渲染上去的,所以需使用自动化工具
playwright安装
# 安装python库
pip install playwright -i https://pypi.douban.com/simple
# 安装驱动
python -m playwright install
实现步骤
- 获取每首谱子的url
有谱么这个网站没什么反爬措施,全是api传输数据,稍微分析下就能拿到想要的数据。直接上代码吧。
import requests
from faker import Faker
import asyncio
import os
import json
import aiohttp
# Shared Faker instance, used to generate a random User-Agent per request
fake = Faker()
# The two instrument categories scraped from yoopu.me
YOOPU_TYPE = ["guitar", "ukulele"]
# 获得榜单用户id
def userCodeList(t):
    """Return the userCode of every user on the total-score ranking for instrument *t*."""
    ranking_url = f"https://yoopu.me/users/ranking/totalscore?instrument={t}"
    # Randomized UA to look less like a bot
    response = requests.get(ranking_url, headers={"User-Agent": fake.user_agent()})
    payload = response.json()
    codes = []
    for entry in payload["userRanks"]:
        codes.append(entry["userCode"])
    return codes
# 获得用户的曲子id,title
async def songInfo(userCode):
    """Fetch every sheet entry for one user via the paginated API.

    Returns a list of dicts with keys: type, title, artist, id.
    """
    headers = {
        "User-Agent": fake.user_agent(),
        "Referer": "https://yoopu.me/view-user",
    }
    song_list = []
    async with aiohttp.ClientSession() as session:
        # The API pages 20 songs at a time; 10000 is just a safety upper bound.
        for start in range(0, 10000, 20):
            url = f"https://yoopu.me/api/user/sheets?code={userCode}&start={start}&sort=views"
            print(url, "start")
            async with session.get(url, headers=headers) as resp:
                # Read the body once and parse it ourselves. The original code
                # called resp.text() AND resp.json(), decoding the body twice,
                # and compared the raw text against "[]" (fragile if the server
                # adds whitespace). json.loads also sidesteps aiohttp's
                # content-type check.
                data = json.loads(await resp.text())
            # An empty page means we walked past the user's last song.
            if not data:
                break
            for song in data:
                song_list.append({
                    "type": song["type"],
                    "title": song["title"],
                    "artist": song["artist"],
                    "id": song["id"],
                })
    return song_list
# 回调函数,保存数据
def callback_songInfo(future):
    """Done-callback: append each song dict from the finished task to songs.txt."""
    out_path = os.path.join(os.path.abspath(os.path.dirname(__file__)), "songs.txt")
    results = future.result()
    # One JSON object per line, appended so repeated runs accumulate
    with open(out_path, "a+", encoding="utf-8") as out:
        for entry in results:
            out.write(json.dumps(entry, ensure_ascii=False) + "\n")
            print(entry)
def async_run():
    """Schedule one songInfo task per ranked user and run them all to completion."""
    loop = asyncio.get_event_loop()
    tasks = []
    for instrument in YOOPU_TYPE:
        for code in userCodeList(instrument):
            print(code, "start")
            fut = asyncio.ensure_future(songInfo(code))
            # Persist results to songs.txt as each task finishes
            fut.add_done_callback(callback_songInfo)
            tasks.append(fut)
    loop.run_until_complete(asyncio.wait(tasks))
if __name__ == "__main__":
    # Entry point: crawl the song list of every ranked user to songs.txt.
    async_run()
运行完数据保存在本地txt文件里,当然也可以redis或其他什么里做个增量爬取。
- 获取谱子图片
单首谱子url格式为: f"https://yoopu.me/view/{id}"
这里使用playwright的异步操作,先来个简单实例
# 导入包
from playwright import async_playwright
import asyncio
async def func(url):
    # Minimal demo of the (pre-1.8, camelCase) playwright async API.
    async with async_playwright() as asp:
        # Chromium driver, headed mode so the browser window is visible
        browser = await asp.chromium.launch(headless=False)
        # Open a new page (tab)
        page = await browser.newPage()
        # Navigate to the target URL
        await page.goto(url)
        # Take a screenshot (no path given, so the bytes are only returned)
        await page.screenshot()
        # Block on stdin so the headed browser stays open for inspection
        input()
        # Close the browser
        await browser.close()
if __name__ == "__main__":
    loop = asyncio.get_event_loop()
    # Sample sheet page used for the demo run
    url = "https://yoopu.me/view/b19eym10"
    loop.run_until_complete(func(url))
上面是一个简单的例子,但项目却要复杂一点,所以我先写一下我的思路:
- 所有谱子一共有1.3w+,就是说要打开一个浏览器,1.3w+个网页。
- 一个网页完成任务就关闭,使用信号量 asyncio.Semaphore() 控制并发量,防止网页打开太多电脑卡死,尤其是虚拟机。
- 有些谱子长度过长,需要截取长图,步骤如下
- 要先获取具体谱子在网页中的位置
- 然后修改网页大小至可以完全展示全部谱子长度
- 然后浏览器截图和再次记录谱子在网页中的位置
- 最后通过 pillow的 crop()方法截取最终图片并保存至本地
- 网页上还有些小功能键会影响最终效果,所以在打开网页加载完所有网站js后,注入自己写的js,清除掉这个功能键。
- 保存日志文件方便观察
上代码:
from playwright import async_playwright
import asyncio
import os
import json
from PIL import Image
from loguru import logger
import time
import re
# 异常处理
def errPro(func):
    """Decorator that re-raises any exception tagged with the wrapped function's name.

    Handles both sync and async callables. The original sync-only wrapper,
    when applied to an async function (e.g. screenshotPic), merely returned
    the coroutine object — exceptions raised while awaiting it were never
    caught or tagged here. functools.wraps also preserves the wrapped
    function's metadata, which the original discarded.
    """
    import functools

    if asyncio.iscoroutinefunction(func):
        @functools.wraps(func)
        async def async_inner(*args, **kwargs):
            try:
                return await func(*args, **kwargs)
            except Exception as e:
                raise Exception(f"{func.__name__} - {str(e)}")
        return async_inner

    @functools.wraps(func)
    def inner(*args, **kwargs):
        try:
            return func(*args, **kwargs)
        except Exception as e:
            raise Exception(f"{func.__name__} - {str(e)}")
    return inner
#去除特殊字符
def isSpec(l):
    """Parse one JSON line from songs.txt and strip unsafe filename characters.

    Some titles/artists contain "/" (and other punctuation) which breaks the
    screenshot file path, so only CJK characters, ASCII letters, digits and
    "-" survive. Returns the cleaned dict.
    """
    record = json.loads(l)
    # Raw string + "+" quantifier: the original non-raw "{0,}" pattern also
    # matched countless empty strings (findall returned them; join discarded
    # them) and relied on a "\-" escape that is deprecated outside raw strings.
    comp = re.compile(r"[\u4e00-\u9fa5a-zA-Z0-9-]+")
    for k, v in record.items():
        record[k] = "".join(comp.findall(v))
    return record
# 读取songs.txt
@errPro
def readSong():
    '''
    Load songs.txt (written by the crawler script) and return the list of
    cleaned song dicts, one per line.
    '''
    here = os.path.abspath(os.path.dirname(__file__))
    with open(os.path.join(here, "songs.txt"), "r", encoding="utf-8") as fp:
        return [isSpec(line) for line in fp.readlines()]
# 剪切图片
@errPro
def cropPic(pic_path, box):
    '''
    Crop the screenshot at *pic_path* down to *box* and overwrite it in place.
    '''
    image = Image.open(pic_path).convert("RGB").crop(box)
    image.save(pic_path)
    logger.info(f"{pic_path} ok")
@errPro
async def screenshotPic(songInfo, browser, semaphore):
    '''
    Capture one sheet page as a cropped PNG.

    songInfo:  dict with keys type/title/artist/id (one cleaned line of songs.txt)
    browser:   shared playwright Browser instance (pre-1.8 camelCase API)
    semaphore: asyncio.Semaphore capping how many pages are open at once
    '''
    # Sheet page URL
    url = f"https://yoopu.me/view/{songInfo['id']}"
    # Target image path: ./pus/<title>-<artist>-<type>.png
    pic_path = os.path.join(os.path.join(os.path.abspath(os.path.dirname(__file__)), "pus"), f"{songInfo['title']}-{songInfo['artist']}-{songInfo['type']}.png")
    async with semaphore:
        page = await browser.newPage()
        await page.goto(url)
        logger.info(f"{songInfo['title']} start")
        # Wait until the sheet element appears (playwright's implicit-wait
        # equivalent of selenium)
        await page.waitForSelector("//hexi-sheet")
        # Injected after the site's own JS has run: hide the two floating
        # controls that would otherwise appear in the screenshot
        await page.addScriptTag(content='''
document.getElementsByTagName("yp-slider-play")[0].style.display = "none";
document.getElementsByClassName("fullscreen-button yoopu3-icon")[0].style.display = "none";
''')
        # Handle to the sheet element
        pu = await page.querySelector("//hexi-sheet")
        # Bounding box {x, y, width, height} of the sheet
        location = await pu.boundingBox()
        # Grow the viewport ~20% past the sheet so long sheets fit entirely
        await page.setViewportSize(width=int(location['width']*1.2), height=int(location['height']*1.2))
        # Re-measure: resizing the viewport can move/re-layout the sheet
        location = await pu.boundingBox()
        # Full-page screenshot to disk
        await page.screenshot(path=pic_path)
        # Crop down to just the sheet: [left, top, right, bottom]
        box = [location["x"], location["y"], location["x"]+location["width"], location["y"]+location["height"]]
        cropPic(pic_path, box)
        # Close this tab (the shared browser stays open)
        await page.close()
        logger.info(f"{songInfo['title']} end")
async def main():
    """Open one browser and screenshot every song, at most 6 pages at a time."""
    async with async_playwright() as asp:
        browser = await asp.chromium.launch(headless=True)
        # Cap concurrent open pages — tune to your machine (the author ran
        # this in a VM, hence the conservative 6)
        limiter = asyncio.Semaphore(6)
        # Creating the futures schedules every job (state: Pending)
        jobs = [
            asyncio.ensure_future(screenshotPic(song, browser, limiter))
            for song in readSong()
        ]
        # Nested-await: run until every job has finished
        done, _pending = await asyncio.wait(jobs)
        # Re-raise any exception a task captured
        for task in done:
            task.result()
        await browser.close()
if __name__ == "__main__":
    # Time the whole run
    start = time.time()
    # Write logs next to this script
    log = logger.add(os.path.join(os.path.abspath(os.path.dirname(__file__)), "yoopu.log"))
    loop = asyncio.get_event_loop()
    loop.run_until_complete(main())
    end = time.time()
    logger.info(f"共耗时:{end - start}s")
总结一下
1.3w+个任务,每个任务生成3行log,log中也没有出现DEBUG。同一时间存在6个任务,总共耗时9318秒 约等于 2.5个小时,增加并发量还能更快一点。相比selenium同步实现,还算可以吧。。。
playwright常用API
playwright一个比较坑人的地方就是文档中的api名与实际不符,实际api用的是驼峰命名法;另外文档没有案例,不过可以直接看源码,源码里的参数详解也挺好的。
- waitForSelector [等待]
- addInitScript & addScriptTag [注入js,前者是在任何脚本之前执行,后者是在任何脚本之后执行,一般用后者]
- querySelector & querySelectorAll [查找对象,前者返回一个对象,后者返回一个列表]
- boundingBox [获得对象的边框值]
- getAttribute [获得属性值]
- innerHTML & innerText [前者获得对象html,后者获得对象内所有文本]
- fill & click [前者填充文本,后者点击,可以组合实现点击搜索功能]
代码块
from playwright import async_playwright
import asyncio
async def example():
    # Demo of the commonly used (pre-1.8, camelCase) playwright async APIs.
    async with async_playwright() as asp:
        browser = await asp.chromium.launch(headless=False)
        url = "https://yoopu.me/view/b19eym10"
        page = await browser.newPage()
        await page.goto(url)
        # Wait until the sheet element exists
        await page.waitForSelector("//hexi-sheet")
        # addInitScript runs BEFORE any of the page's own scripts
        await page.addInitScript(source="alert('hello, world')")
        # addScriptTag runs AFTER the page's scripts; hide the two controls
        await page.addScriptTag(content='''
document.getElementsByTagName("yp-slider-play")[0].style.display = "none";
document.getElementsByClassName("fullscreen-button yoopu3-icon")[0].style.display = "none";
''')
        # querySelector returns one element handle, querySelectorAll a list
        pu = await page.querySelector("//hexi-sheet")
        pu_all = await page.querySelectorAll("//hexi-sheet")
        print("pu:", pu)
        print("pu_all:", pu_all)
        # Element bounding box
        pu_location = await pu.boundingBox()
        print("pu_location:", pu_location)
        # Read an HTML attribute value
        instrument = await pu.getAttribute("instrument")
        print("instrument:", instrument)
        # innerHTML = element markup, innerText = contained text only
        innerhtml = await pu.innerHTML()
        innertext = await pu.innerText()
        print("innerHtml:", innerhtml[:50])
        print("innerText:", innertext[:50])
        # fill + click: type a query, then press the search button
        q = await page.querySelector("//form[@class='searchContainer']/input")
        yoopu3_icon = await page.querySelector("//form[@class='searchContainer']/a")
        await q.fill("再见")
        await yoopu3_icon.click()
        # Block on stdin so the headed browser stays open for inspection
        input()
        await browser.close()
if __name__ == "__main__":
    # Run the API demo end to end.
    asyncio.get_event_loop().run_until_complete(example())
我先写这些吧,一些情况也够用了,如果想实现一些复杂的动作链,可以看看文档,如果和我一样看不懂英文可以在这个网站下个百度翻译插件。如果代码有错误请一定要提出来。
看文档不易,点个赞吧 :)