"""Scrape the 甄嬛传 (Empresses in the Palace) images from a Douban topic with Playwright and re, and save them to a folder."""
import os
import re
import urllib.parse
import urllib.request
from itertools import islice

from lxml import etree
from playwright.sync_api import Playwright, sync_playwright, expect
# User-agent string (a mobile Chrome/Edge browser on Android) sent with
# every image download request.
_USER_AGENT = (
    "Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) "
    "AppleWebKit/537.36 (KHTML, like Gecko) "
    "Chrome/128.0.0.0 Mobile Safari/537.36 Edg/128.0.0.0"
)

# HTTP headers used when fetching the image files.
header = {"user-agent": _USER_AGENT}
# Douban topic page that hosts the images.
TOPIC_URL = 'https://www.douban.com/group/topic/178075294/?_i=55379458etXyYW'
# Locally installed Chrome binary that Playwright drives.
CHROME_PATH = 'C:/Program Files/Google/Chrome/Application/chrome.exe'
# Element that only exists once the dynamically loaded images are in the DOM.
IMAGE_SELECTOR = '#link-report > div > div > div:nth-child(99) > div > img'
# Text expected somewhere in the topic; used as a sanity check on the page.
MARKER_TEXT = "男人都会让女人伤心的"
# Folder the images are written to.
OUTPUT_DIR = '甄嬛传'
# Matches the 500px-wide topic images. `[^"]*` keeps each capture inside its
# own attribute value so a match can never run across several <img> tags.
IMG_PATTERN = re.compile(
    r'<img height="auto" src="(?P<src>[^"]*)" style="[^"]*" width="500"'
)


def fetch_page_source(url=TOPIC_URL):
    """Return the rendered HTML of *url* once the topic images have loaded.

    The browser is always closed, even when navigation or waiting fails.
    """
    with sync_playwright() as playwright:
        browser = playwright.chromium.launch(
            executable_path=CHROME_PATH, headless=True
        )
        try:
            context = browser.new_context()
            page = context.new_page()
            page.goto(url)
            # Wait for the dynamic content to load.
            page.wait_for_selector(IMAGE_SELECTOR)
            return page.content()
        finally:
            browser.close()


def extract_image_urls(page_source):
    """Return the ``src`` of every 500px-wide image in *page_source*, in order."""
    return [match.group("src") for match in IMG_PATTERN.finditer(page_source)]


def file_name_from_url(url):
    """Return the last path segment of *url*, without query string or fragment.

    Returns an empty string when the URL path ends with ``/`` or is empty.
    """
    return urllib.parse.urlsplit(url).path.rsplit("/", 1)[-1]


def download_image(url, dest_dir, headers=None, timeout=30):
    """Download *url* into *dest_dir* and return the path of the saved file.

    Returns ``None`` without touching the network when *url* is not an
    http(s) URL or has no usable file name. Network and file errors are
    raised as ``OSError`` (``urllib.error.URLError`` is a subclass).
    """
    # urlopen also understands file: and other schemes; the URLs come from
    # a scraped page, so only plain web URLs are accepted.
    if urllib.parse.urlsplit(url).scheme not in ("http", "https"):
        return None
    name = file_name_from_url(url)
    if not name:
        return None
    request = urllib.request.Request(url, headers=headers or {})
    with urllib.request.urlopen(request, timeout=timeout) as response:
        data = response.read()
    path = os.path.join(dest_dir, name)
    with open(path, 'wb') as f:
        f.write(data)
    return path


def main():
    """Fetch the topic page, then save every matching image to OUTPUT_DIR."""
    page_source = fetch_page_source()
    print(page_source)
    if MARKER_TEXT in page_source:
        print("成功拿到数据")
    # Create the output folder; exist_ok avoids the check-then-create race.
    os.makedirs(OUTPUT_DIR, exist_ok=True)
    # Parse the page and download each image.
    for u in extract_image_urls(page_source):
        print(u)
        try:
            saved = download_image(u, OUTPUT_DIR, headers=header)
        except OSError as exc:
            # One failed image must not abort the remaining downloads.
            print(f"下载失败: {u} ({exc})")
            continue
        if saved is None:
            print(f"跳过无效链接: {u}")
    print("所有图片都已完成")


if __name__ == "__main__":
    main()