【初学Python爬虫】使用selenium、BeautifulSoup爬图片

ppphhz

已于 2022-09-21 17:20:34 修改

阅读量355

点赞数

文章标签： python 爬虫 selenium

于 2022-09-21 17:19:13 首次发布

本文链接：https://blog.csdn.net/phhzhhh/article/details/126976527

版权

准备工作：利用selenium自动刷新网页
python代码：

'''
Author: lu
Date: 2022-09-19 14:27:31
LastEditors: lu
LastEditTime: 2022-09-21 17:13:16
FilePath: \study\savePic.py
Description: 爬图片
'''
#python savePic.py

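# Note: os is part of the Python standard library, so no pip install is needed.
# Third-party packages required: pip install requests beautifulsoup4 selenium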

import os
from urllib.parse import urljoin

import bs4
import requests
from bs4 import BeautifulSoup #pip install BeautifulSoup4
from selenium.webdriver import Chrome
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.support.select import Select



# Scrape every <img> on a page rendered by Chrome and save the image bytes
# into ./phhz as phhz<N>.jpg, where N is the image's position on the page.

# Selenium 4 takes the chromedriver location through a Service object; the
# older `executable_path` keyword is deprecated.
webdrive = Chrome(service=Service(r"C:\Users\18851\AppData\Local\Programs\Python\Python310\Scripts\chromedriver.exe"))  # chromedriver path
url = "https://www.csdn.net/"  # site to scrape

try:
    webdrive.get(url)  # load the page in the browser
    text = webdrive.page_source  # HTML of the rendered page
finally:
    # The HTML is all that is needed from the browser, so close it here;
    # the finally clause guarantees it is closed even if loading fails.
    webdrive.quit()

tree = BeautifulSoup(text, "html.parser")
imgs = tree.find_all('img')

path = r'./phhz'  # output folder next to the working directory
print(os.path.exists(path))
os.makedirs(path, exist_ok=True)  # create the folder only if it is missing

# Start at index 0 so the first image is not skipped; images that were
# previously saved keep the same phhz<N>.jpg names.
for i, img_tag in enumerate(imgs):
    print('imgs[i] ', img_tag)  # the tag itself
    print('imgs[i] ', type(img_tag))  # a BeautifulSoup Tag

    # Prefer data-src when present, otherwise fall back to src.
    # Tag.get returns None instead of raising when an attribute is missing.
    img_url = img_tag.get('data-src') or img_tag.get('src')
    if not img_url:
        continue  # nothing to download for this tag

    # Resolve relative and protocol-relative ("//host/x.png") addresses
    # against the page URL so requests receives an absolute URL.
    img_url = urljoin(url, img_url)
    if not img_url.startswith(('http://', 'https://')):
        continue  # e.g. inline "data:" URIs cannot be fetched over HTTP

    try:
        img = requests.get(img_url, timeout=10)
        img.raise_for_status()
    except requests.RequestException as err:
        # One broken image should not abort the whole run.
        print('skipped', img_url, err)
        continue

    filename = 'phhz' + str(i) + '.jpg'
    with open(os.path.join(path, filename), 'wb') as f:
        f.write(img.content)

结果