爬取甄嬛传图片

太焦虑了

已于 2024-09-08 11:00:53 修改

阅读量557

点赞数 7

文章标签： python beautifulsoup 正则表达式

于 2024-09-08 10:58:05 首次发布

本文链接：https://blog.csdn.net/m0_62568587/article/details/142023506

版权

通过 Playwright 渲染页面、用 re 模块提取图片链接，爬取甄嬛传图片并保存到文件夹。

import os
import re
import urllib.request
from itertools import islice

from lxml import etree
from playwright.sync_api import Playwright, sync_playwright, expect

# Scrape the images embedded in a Douban group topic and save them locally.
#
# Flow: render the topic page with Playwright, pull the image URLs out of the
# rendered HTML with a regular expression, then download each image into
# `dir_name` using the standard library.

# Request headers sent with every image download (mobile Chrome/Edge user-agent).
header={
    'user-agent':'Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/128.0.0.0 Mobile Safari/537.36 Edg/128.0.0.0'
}

# Folder that receives the downloaded images.
dir_name='甄嬛传'

# Matches <img height="auto" ... width="500"> tags and captures the image URL
# in the named group "src". Compiled once, outside the download loop.
img_pattern=re.compile(r'<img height="auto" src="(?P<src>.*?)" style=".*?" width="500"')

with sync_playwright() as playwright:

    browser = playwright.chromium.launch(executable_path='C:/Program Files/Google/Chrome/Application/chrome.exe', headless=True)
    try:
        context = browser.new_context()
        page = context.new_page()
        page.goto('https://www.douban.com/group/topic/178075294/?_i=55379458etXyYW')
        # Wait for the dynamically loaded content: block until the image in
        # the 99th child of the post body is present in the DOM.
        page.wait_for_selector('#link-report > div > div > div:nth-child(99) > div > img')
        page_source = page.content()
    finally:
        # The browser is only needed to render the page; release it as soon as
        # the HTML is captured, even when navigation or the wait fails.
        browser.close()

print(page_source)
if "男人都会让女人伤心的" in page_source:
    print("成功拿到数据")

# Create the output folder; exist_ok avoids the check-then-create race.
os.makedirs(dir_name, exist_ok=True)

# Parse the rendered HTML and download every matched image.
for match in img_pattern.finditer(page_source):
    u=match.group("src")
    print(u)
    # Use the last path segment of the URL as the local file name.
    fileName=u.split("/")[-1]
    if not fileName:
        # Empty src or a URL ending in "/": nothing usable to name the file.
        continue
    try:
        request=urllib.request.Request(u,headers=header)
        # The timeout keeps one stalled connection from hanging the whole run.
        with urllib.request.urlopen(request,timeout=30) as response:
            data=response.read()
    except (OSError, ValueError) as exc:
        # URLError/HTTPError/timeouts are OSError subclasses; ValueError covers
        # malformed or relative URLs. Report and move on to the next image.
        print(f"下载失败: {u} ({exc})")
        continue
    with open(os.path.join(dir_name,fileName),'wb') as f:
        f.write(data)

print("所有图片都已完成")