Playwright 爬虫
# Basic crawl: launch headless Chromium, block image requests, load `url`,
# and capture the rendered HTML into `text`.
with sync_playwright() as pw:
    browser = pw.chromium.launch(headless=True)
    try:
        context = browser.new_context()
        try:
            page = context.new_page()
            # Block image loading: abort any request whose resource type is
            # "image" and let every other request continue.
            page.route(
                "**/*",
                lambda route: route.abort()
                if route.request.resource_type == "image"
                else route.continue_(),
            )
            # Alternative: abort requests whose URL ends in an image
            # extension. The lambda may be replaced by a named function.
            page.route(
                re.compile(r"\.(jpg|png|svg)$"),
                lambda route: route.abort(),
            )
            page.goto(url)
            page.wait_for_load_state("networkidle")
            text = page.content()
            page.close()
        finally:
            # Release the context even if navigation or waiting raised.
            context.close()
    finally:
        # Release the browser even if an earlier step raised.
        browser.close()
滚动鼠标
# Dispatch a mouse-wheel event: no horizontal movement, vertical delta of 3000.
page.mouse.wheel(delta_x=0, delta_y=3000)
使用登录状态
# Save: export the context's storage state and write it to state.json in one
# call; `storage` still holds the returned state.
storage = context.storage_state(path="state.json")
# Load: create a new context from the same file that was saved above.
context = browser.new_context(storage_state="state.json")
点击标签
# Open the page, click the main link, then click a random point inside the
# "视频"/"Videos" tab using separate mouse move/down/up events.
# Pauses use page.wait_for_timeout (milliseconds) rather than time.sleep, as
# the Playwright docs advise for the sync API.
page.goto(url)
page.wait_for_timeout(random.uniform(3000, 5000))
# `delay` is the gap between mousedown and mouseup in milliseconds.
page.locator('//a[@id="main-link"]').first.click(delay=random.uniform(10, 200), force=True)
page.wait_for_timeout(random.uniform(3000, 5000))
video_element = page.locator('//div[text()="视频" or text()="Videos"]')
video_position = video_element.bounding_box()
if video_position is None:
    # bounding_box() returns None when the element is not visible.
    raise RuntimeError('"Videos" tab is not visible; cannot compute a click position')
# Pick a random point inside the element's bounding box.
x = random.uniform(video_position['x'], video_position['x'] + video_position['width'])
y = random.uniform(video_position['y'], video_position['y'] + video_position['height'])
page.mouse.move(x, y)
page.mouse.down()
# Hold the button for 10-500 ms before releasing.
page.wait_for_timeout(random.uniform(10, 500))
page.mouse.up()
page.wait_for_timeout(random.uniform(3000, 5000))