提示:文章转载标明出处
文章目录
- 单个案例测试
- 一 基本用法
- 二 异步
- 总结
单个案例测试
from playwright.sync_api import sync_playwright

# Minimal synchronous Playwright example:
# open a visible Chromium window, load the page, wait for the network to go
# idle, print the fully rendered HTML, then close the browser.
with sync_playwright() as p:
    browser = p.chromium.launch(headless=False)  # headless=False -> show the browser window
    page = browser.new_page()
    page.goto('https://www.ebaina.com/')
    page.wait_for_load_state('networkidle')  # wait until there is no network activity
    html = page.content()
    print(html)
    browser.close()
一、基本用法
支持三种浏览器内核:chromium, firefox, webkit
browser = p.chromium.launch(headless=False)
browser = p.firefox.launch(headless=False)
browser = p.webkit.launch(headless=False)
headless=False 以有界面模式打开浏览器窗口,设置为 True 则为无头模式(不显示窗口)
browser = p.chromium.launch(headless=False)
设置等待时间1秒
# Pause for 1 second (1000 ms). Note: wait_for_load_state() expects a state
# name ('load' / 'domcontentloaded' / 'networkidle'), not a duration — the
# correct call for a fixed delay is wait_for_timeout().
page.wait_for_timeout(1000)
不加载图片
from playwright.sync_api import sync_playwright
import re

# Same basic flow as above, but abort every image request before it is sent,
# which speeds up scraping when only the HTML is needed.
with sync_playwright() as p:
    browser = p.chromium.launch(headless=False)
    page = browser.new_page()

    # Route handler: cancel the matched request outright.
    def cancel_request(route, request):
        route.abort()

    # Escape the dot so it matches a literal "." (the original r"(.png)|(.jpg)"
    # would also match e.g. "xpng" because "." matches any character).
    page.route(re.compile(r"\.(png|jpg)"), cancel_request)

    page.goto('https://www.ebaina.com/')
    page.wait_for_load_state('networkidle')
    html = page.content()
    print(html)
    browser.close()
关闭Webdriver属性
from playwright.sync_api import sync_playwright
import re

# As above, plus an init script that hides the navigator.webdriver flag,
# which many sites inspect for headless/automation detection.
with sync_playwright() as p:
    browser = p.chromium.launch(headless=False)
    page = browser.new_page()

    # Redefine navigator.webdriver so it reads as undefined in every page
    # created from this Page, before any site script runs.
    js = """
Object.defineProperties(navigator, {webdriver:{get:()=>undefined}});
"""
    page.add_init_script(js)

    # Block image requests to cut bandwidth and load time.
    def cancel_request(route, request):
        route.abort()

    # Escaped dot: match a literal ".png"/".jpg" suffix, not any character.
    page.route(re.compile(r"\.(png|jpg)"), cancel_request)

    page.goto('https://www.ebaina.com/')
    page.wait_for_load_state('networkidle')
    html = page.content()
    print(html)
    browser.close()
二、异步
1.引入库
单独强调一下是因为异步库有点不一样
from playwright.sync_api import sync_playwright # 非异步
from playwright.async_api import async_playwright # 异步库
2.异步用法
import asyncio
from playwright.async_api import async_playwright
async def page_request(project_url):
    """
    Fetch one page with Playwright's async Firefox (through a proxy) and
    print its rendered HTML.

    :param project_url: URL of the page to load.
    :return: None — the HTML (or the error) is printed, not returned.
    """
    async with async_playwright() as p:
        browser = await p.firefox.launch(
            # NOTE(review): proxy server/credentials are hard-coded sample
            # values from the article — replace with real ones before use.
            proxy={
                "server": "http://http-dyn.abuyun.com:9020",
                "username": "123",
                "password": "abc"
            },
            headless=False
        )
        user_context = await browser.new_context()
        page_info = await user_context.new_page()
        try:
            await page_info.goto(project_url)
            await page_info.wait_for_load_state('networkidle')
            html = await page_info.content()
            print(html)
        except Exception as e:
            # Best-effort example: report the failure but still fall through
            # to close the browser below.
            print(e)
        await browser.close()
async def main(li_project):
    """
    Start one concurrent page_request task per URL and wait for all of them.

    :param li_project: list of project URLs to fetch concurrently.
    """
    tasks = []
    # One detail-fetch task per project URL.
    for project_info in li_project:
        task = asyncio.ensure_future(page_request(project_info))
        tasks.append(task)
    print('异步个数:', len(tasks))
    await asyncio.gather(*tasks)
if __name__ == '__main__':
    # How many pages are fetched concurrently in each batch.
    async_time = 3
    li_projects = ["https://www.baidu.com/", 'https://www.ebaina.com/', 'https://www.cnblogs.com/']
    # Process the URL list in batches of async_time: a stepped range replaces
    # the original range(len(...)) + modulo test with identical slicing.
    for num in range(0, len(li_projects), async_time):
        asyncio.run(main(li_projects[num: num + async_time]))
总结
playwright 目前还比较新,不同于 selenium、pyppeteer,目前很多网站还是可以用的。
现在爬虫越来越难做了,太卷了,不会逆向寸步难行,慢慢学逆向吧