Scraping with bs4

import os
import re
from urllib.parse import urljoin

import requests
from bs4 import BeautifulSoup

# Task: use any scraping technique to crawl the first 10 pages of images from
# https://www.umei.cc/bizhitupian/diannaobizhi/, print the final count, and
# save them to the images folder.

# Create the folder that holds the downloaded images
os.makedirs('images', exist_ok=True)

# Browser-like User-Agent so the site serves normal pages
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 "
                  "(KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36 Edg/120.0.0.0"
}

count = 0
for i in range(1, 11):
    # Page 1 has no index suffix; later pages are index_2.htm, index_3.htm, ...
    if i == 1:
        url = 'https://www.umei.cc/bizhitupian/diannaobizhi/'
    else:
        url = 'https://www.umei.cc/bizhitupian/diannaobizhi/index_{}.htm'.format(i)

    response = requests.get(url=url, headers=headers)
    response.encoding = "utf-8"
    soup = BeautifulSoup(response.text, 'lxml')

    # Each child div of #infinite_scroll is one thumbnail card on the list page
    image_elements = soup.select('#infinite_scroll > div')
    print('Page {} has {} images'.format(i, len(image_elements)))

    # Visit each detail page to find the full-size image
    for image_element in image_elements:
        # Detail-page URL; urljoin avoids a double slash when the href
        # already starts with '/'
        detail_url = urljoin('https://www.umei.cc/', image_element.select('div:nth-child(1) > div:nth-child(1) > div > a')[0]['href'])
        title = image_element.select('span > a')[0].text

        detail_response = requests.get(url=detail_url, headers=headers)
        if detail_response.status_code != 200:
            print('Request for the detail page failed, skipping')
            continue

        # Pull the image address out of the detail page
        detail_soup = BeautifulSoup(detail_response.text, 'lxml')
        image_url = detail_soup.select('body > div:nth-child(3) > div:nth-child(2) > div:nth-child(6) > a > img')[0]['src']
        image_data = requests.get(url=image_url, headers=headers).content
        count += 1

        # Page titles can contain characters that are illegal in file names
        safe_title = re.sub(r'[\\/:*?"<>|]', '_', title).strip()
        with open('images/{}.jpg'.format(safe_title), 'wb') as f:
            f.write(image_data)
        print(title, 'downloaded')

    print('Page {} done'.format(i))

print('Downloaded {} images in total'.format(count))
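One thing the script above glosses over: any timeout or connection error in the inner loop raises an exception and kills the whole run, and there is no pause between requests. A minimal hardening sketch follows; the helper name download_image and the retry/timeout/delay values are assumptions of mine, not part of the original exercise.

import time

import requests

def download_image(url, path, headers, retries=3, timeout=10):
    # Hypothetical helper: fetch url and write it to path, retrying on failure.
    for attempt in range(1, retries + 1):
        try:
            resp = requests.get(url, headers=headers, timeout=timeout)
            resp.raise_for_status()  # turn 4xx/5xx responses into exceptions
            with open(path, 'wb') as f:
                f.write(resp.content)
            return True
        except requests.RequestException as exc:
            print('Attempt {} for {} failed: {}'.format(attempt, url, exc))
            time.sleep(1)  # brief pause before retrying, also politer to the server
    return False

With this helper, the bare requests.get(...).content call in the inner loop would become download_image(image_url, 'images/{}.jpg'.format(safe_title), headers), and count would only be incremented when it returns True.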