爬取甄嬛传图片

通过 Playwright 和 re 模块爬取《甄嬛传》图片,并保存到本地文件夹。


import os
import re
from itertools import islice

from lxml import etree
from playwright.sync_api import Playwright, sync_playwright, expect

# Script: open a Douban group topic in headless Chrome, pull the image URLs out
# of the rendered HTML with a regular expression, and save every image into a
# local folder.

# Extra headers sent with each image download (a mobile browser User-Agent).
header = {
    'user-agent': 'Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/128.0.0.0 Mobile Safari/537.36 Edg/128.0.0.0'
}

# Folder that receives the downloaded images.
dir_name = '甄嬛传'

# Matches the topic's <img> tags and captures the image URL in the "src" group.
# Compiled once, outside the download loop.
img_pattern = re.compile(
    r'<img height="auto" src="(?P<src>.*?)" style=".*?" width="500"'
)

with sync_playwright() as playwright:

    browser = playwright.chromium.launch(executable_path='C:/Program Files/Google/Chrome/Application/chrome.exe', headless=True)
    try:
        context = browser.new_context()
        page = context.new_page()
        page.goto('https://www.douban.com/group/topic/178075294/?_i=55379458etXyYW')
        # Wait for the dynamically loaded content: block until the image inside
        # the 99th child of the post body is attached to the DOM.
        page.wait_for_selector('#link-report > div > div > div:nth-child(99) > div > img')
        page_source = page.content()
        # Sanity check: report success when a known phrase is present in the HTML.
        if "男人都会让女人伤心的" in page_source:
            print("成功拿到数据")

        # Create the output folder; exist_ok avoids the check-then-create race.
        os.makedirs(dir_name, exist_ok=True)

        # Parse the page and download each matched image.
        for match in img_pattern.finditer(page_source):
            u = match.group("src")
            print(u)
            # Last path segment, without any "?query" suffix (a "?" is not a
            # valid file-name character on Windows).
            file_name = u.split("?", 1)[0].rsplit("/", 1)[-1]
            if not file_name:
                # URL ends with "/": there is nothing usable as a file name.
                print(f"跳过无法解析文件名的链接:{u}")
                continue
            # Download through the browser context's own HTTP client, so no
            # separate HTTP library is needed.
            response = context.request.get(u, headers=header)
            if not response.ok:
                # Do not save an HTTP error page as if it were an image.
                print(f"下载失败({response.status}):{u}")
                continue
            with open(os.path.join(dir_name, file_name), 'wb') as f:
                f.write(response.body())
    finally:
        # Always shut the browser down, even if navigation or a download fails.
        browser.close()

print("所有图片都已完成")

评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值