Learning objectives:
Use automation to batch-save images from the web to the local computer.
For example:
- Batch-save the images provided on the Umei gallery site (umei.cc)
Learning content:
- Locate the part of the Umei listing page's source code that contains the images to save
- Follow the link to each image's child page one by one
- Extract the required image information from each child page
- Save the downloaded images
Learning output:
# coding=UTF-8
# Grab the listing page source and extract each child page's href
# Follow each href to the child page and pull the image download address: img -> src
# Download the image
import requests
import io
import sys
import re
from bs4 import BeautifulSoup

sys.stdout = io.TextIOWrapper(sys.stdout.buffer, encoding='UTF-8')
domain = "https://www.umei.cc"
url = "https://www.umei.cc/tags/xinggannvshen-9.htm"
headers = {
    "user-agent": "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/95.0.4638.69 "
                  "Safari/537.36 "
}
resp = requests.get(url, headers=headers)
obj = re.compile(r'<div class="picbox"><a href="(?P<page1>.*?)">', re.S)  # capture the link to each image's child page
obj1 = obj.finditer(resp.text)
ulli_list = []
for it in obj1:  # join each relative href with the domain to form a complete URL
    ul = it.group("page1")
    ul_li = domain + ul
    ulli_list.append(ul_li)
for liurl in ulli_list:
    # fetch the child page source
    child_page_resp = requests.get(liurl)
    child_page_text = child_page_resp.text
    # extract the download URL
    child_page = BeautifulSoup(child_page_text, "html.parser")
    div = child_page.find("div", class_="big-pic")
    img = div.find("img")
    src = img.get("src")
    # download the image
    img_resp = requests.get(src)
    img_name = src.split("/")[-1]  # the part after the last "/" in the image URL
    with open("img/" + img_name, mode="wb") as f:  # the with block closes the file automatically
        f.write(img_resp.content)
        print(img_name, ",over")
print("all_over")
Because of the limits of my current skills, automatic pagination has not been implemented yet. I will keep studying, and I sincerely invite experts from all fields to offer their valuable suggestions so we can learn from each other and improve together.
Today's revised code builds on the previous version and adds pagination over all pages, timing for a single image, a single page, and the whole run, a progress-bar display, and a uniform naming and storage scheme for the files. The complete code is below:
# coding=UTF-8
# Grab the listing page source and extract each child page's href
# Follow each href to the child page and pull the image download address: img -> src
# Download the image
import requests
import io
import sys
import re
from bs4 import BeautifulSoup
import datetime
from tqdm import tqdm

all_start = datetime.datetime.now()
domain = "https://www.umei.cc"
for page in range(1, 16):  # check the site first to see how many pages there are, then use n + 1
    page_start = datetime.datetime.now()
    print('Start fetching image info for page {}'.format(page))
    url = f"https://www.umei.cc/tags/xinggannvshen-{page}.htm"
    headers = {
        "user-agent": "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/95.0.4638.69 "
                      "Safari/537.36 "
    }
    resp = requests.get(url, headers=headers)
    obj = re.compile(r'<div class="picbox"><a href="(?P<page1>.*?)">', re.S)  # capture the link to each image's child page
    obj1 = obj.finditer(resp.text)
    ulli_list = []
    for it in obj1:  # join each relative href with the domain to form a complete URL
        ul = it.group("page1")
        ul_li = domain + ul
        ulli_list.append(ul_li)
    x = 0
    pbar = tqdm(total=len(ulli_list), desc="Current page progress", unit="img")  # progress bar setup
    for liurl in ulli_list:
        start = datetime.datetime.now()
        # fetch the child page source
        child_page_resp = requests.get(liurl)
        child_page_text = child_page_resp.text
        # extract the download URL
        child_page = BeautifulSoup(child_page_text, "html.parser")
        div = child_page.find("div", class_="big-pic")
        x += 1
        img = div.find("img")
        # name = img.get("alt") + ".jpg"
        src = img.get("src")
        # download the image
        img_resp = requests.get(src)
        img_name = src.split("/")[-1]  # the part after the last "/" in the image URL
        # img_name = "umeicc_{}.jpg".format(x)  # uniform naming format for all images
        with open("img1/" + img_name, mode="wb") as f:  # create the destination folder beforehand
            f.write(img_resp.content)
        pbar.update(1)  # advance the progress bar
        delta = (datetime.datetime.now() - start).total_seconds()
        print("\n" + img_name, f", download time: {delta}s")
        child_page_resp.close()
    page_delta = (datetime.datetime.now() - page_start).total_seconds()
    resp.close()
    print('\nFinished fetching image info for page {}'.format(page) + f", time taken: {page_delta}s")
all_delta = (datetime.datetime.now() - all_start).total_seconds()
print(f"All downloads finished, total time: {all_delta}s")
When running the code above, I found that the uniform numbering is still problematic: the source site does not show the same number of items on every page, so the uniform-naming issue remains and still needs to be solved.
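One way to keep the numbering consistent no matter how many items each page happens to show is to draw file names from a single global counter instead of a per-page one. A minimal sketch under that assumption; the itertools counter and the make_name helper are illustrative and not part of the original code:

import itertools

img_counter = itertools.count(1)  # one global counter for the whole run

def make_name(page):
    # numbering no longer depends on how many items a page contains
    return "umeicc_{:05d}_page{}.jpg".format(next(img_counter), page)

# inside the download loop, instead of deriving the name from the URL:
# img_name = make_name(page)

This version adds uniform file naming.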
# coding=UTF-8
# Grab the listing page source and extract each child page's href
# Follow each href to the child page and pull the image download address: img -> src
# Download the image
import requests
import io
import sys
import re
from bs4 import BeautifulSoup
import datetime
from tqdm import tqdm
import time

all_start = datetime.datetime.now()
domain = "https://www.umei.cc"
for page in range(1, 252):  # check the site first for the page count, then use n + 1; pages 28, 29, 62, 65, 72, 111, 112, 115, 125, 127, 152, 162, 197, 202 contain broken links
    page_start = datetime.datetime.now()
    print('Start fetching image info for page {}'.format(page))
    url = f"https://www.umei.cc/meinvtupian/xingganmeinv/index_{page}.htm"
    headers = {
        "user-agent": "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/95.0.4638.69 "
                      "Safari/537.36 "
    }
    resp = requests.get(url, headers=headers)
    obj = re.compile(r'<div class="title"><span><a href="(?P<page1>.*?)">', re.S)  # capture the link to each image's child page
    obj1 = obj.finditer(resp.text)
    ulli_list = []
    for it in obj1:  # join each relative href with the domain to form a complete URL
        ul = it.group("page1")
        ul_li = domain + ul
        print('Link found on page {}: '.format(page) + ul_li)
        ulli_list.append(ul_li)
    x = 0
    pbar = tqdm(total=len(ulli_list), desc="Current page progress", unit="img")  # progress bar setup
    for liurl in ulli_list:
        start = datetime.datetime.now()
        # fetch the child page source
        child_page_resp = requests.get(liurl)
        child_page_text = child_page_resp.text
        # extract the download URL
        child_page = BeautifulSoup(child_page_text, "html.parser")  # TODO: handle the interruption caused by fetching a blank page
        div = child_page.find("div", class_="big-pic")  # TODO: handle the interruption caused by a page with no image
        img = div.find("img")
        # name = img.get("alt") + ".jpg"
        src = img.get("src")
        # download the image
        x += 1
        img_resp = requests.get(src)
        # img_name = src.split("/")[-1]  # the part after the last "/" in the image URL
        img_name1 = "xingganmeinv_{}".format(x)  # uniform naming format for all images
        img_name = img_name1 + "_page{}.jpg".format(page)
        with open("img1/" + img_name, mode="wb") as f:  # create the destination folder beforehand
            f.write(img_resp.content)
        pbar.update(1)  # advance the progress bar
        time.sleep(1)
        delta = int((datetime.datetime.now() - start).total_seconds())
        print("\n" + img_name, f", download time: {delta}s")
        child_page_resp.close()
    page_delta = int((datetime.datetime.now() - page_start).total_seconds())
    resp.close()
    print('\nFinished fetching image info for page {}'.format(page) + f", time taken: {page_delta}s")
all_delta = int((datetime.datetime.now() - all_start).total_seconds())
print(f"All downloads finished, total time: {all_delta}s")
In actual testing, various system issues occasionally interrupted the task, so the run had to be restarted from a new breakpoint based on how far it had got by that time.
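A simple way to make the loop above both restart-friendly and tolerant of the blank or image-less pages flagged in the TODO comments is to skip files that already exist and wrap the per-image work in a try/except. A minimal sketch, assuming the same liurl, headers, and img1/ folder as above; the save_one helper is illustrative, not part of the original code:

import os
import requests
from bs4 import BeautifulSoup

def save_one(liurl, img_name, headers):
    path = os.path.join("img1", img_name)
    if os.path.exists(path):  # already downloaded on a previous run: skip it
        return False
    try:
        child = BeautifulSoup(requests.get(liurl, headers=headers, timeout=30).text, "html.parser")
        div = child.find("div", class_="big-pic")
        if div is None or div.find("img") is None:  # blank page or page without an image
            return False
        src = div.find("img").get("src")
        with open(path, "wb") as f:
            f.write(requests.get(src, timeout=30).content)
        return True
    except requests.RequestException as e:  # network hiccup: log it and move on
        print("skipped {}: {}".format(liurl, e))
        return False

Because already-saved files are skipped, re-running the script after an interruption continues roughly where it left off instead of starting from scratch.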
# coding=UTF-8
import asyncio
import aiohttp
import aiofiles
import requests
from lxml import etree
import time
from concurrent.futures import ThreadPoolExecutor

async def download_pic(url, sem):
    name = url.rsplit('/', 1)[1]  # take the file name from the URL
    timeout = aiohttp.ClientTimeout(total=300)
    headers = {
        "user-agent": "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/95.0.4638.69 "
                      "Safari/537.36 ",
        "Content-Type": "application/x-www-form-urlencoded; charset=UTF-8",
    }
    conn = aiohttp.TCPConnector(limit=10)
    async with sem:
        async with aiohttp.ClientSession(connector=conn, timeout=timeout) as session:  # async counterpart of requests
            async with session.get(url, headers=headers) as resp:  # async counterpart of requests.get()
                async with aiofiles.open("pic/" + name, 'wb') as f:
                    await f.write(await resp.content.read())  # reading the response body must also be awaited
    print('Downloaded {}'.format(name))

def get_page_url():
    # single-page synchronous version, kept for reference; not called below
    urls = []
    for page_num in range(1, 2):
        if page_num == 1:
            url = 'https://www.umei.cc/bizhitupian/diannaobizhi'
        else:
            url = 'https://www.umei.cc/bizhitupian/diannaobizhi/index_{}.htm'.format(page_num)
        resp = requests.get(url)
        html = etree.HTML(resp.text)
        table = html.xpath('//div[contains(@class,"item masonry_brick")]')
        if table:
            url_list = table[0].xpath('//div[contains(@class,"img")]//@href')
            urls.extend(url_list)
    return urls

page_urls = []

def get_page_urls1(page_num):
    if page_num == 1:
        url = 'https://www.umei.cc/bizhitupian/diannaobizhi'
    else:
        url = 'https://www.umei.cc/bizhitupian/diannaobizhi/index_{}.htm'.format(page_num)
    resp = requests.get(url)
    html = etree.HTML(resp.text)
    table = html.xpath('//div[contains(@class,"item masonry_brick")]')
    if table:
        url_list = table[0].xpath('//div[contains(@class,"img")]/a/@href')
        page_urls.extend(url_list)

def get_page_urls():
    with ThreadPoolExecutor(100) as t:
        for i in range(1, 1044):  # try not to fetch everything in one go; memory usage gets very high
            args = [i]  # if the job dies halfway, there is no way to tell how far it got
            t.submit(lambda p: get_page_urls1(*p), args)

clean_urls = list()

def get_pic_url1(url):
    url = 'https://www.umei.cc{}'.format(url)
    resp = requests.get(url)
    html = etree.HTML(resp.text)
    pic_link = html.xpath('//div[contains(@class,"big-pic")]/a/img/@src')[0]
    clean_urls.append(pic_link)

def get_pic_url():
    with ThreadPoolExecutor(100) as t:
        for i in page_urls:
            args = [i]
            t.submit(lambda p: get_pic_url1(*p), args)

async def main():
    tasks = []
    get_page_urls()
    get_pic_url()
    print('Total number of files to download: {}'.format(len(clean_urls)))
    sem = asyncio.Semaphore(15)
    for url in clean_urls:
        task = asyncio.create_task(download_pic(url, sem))
        tasks.append(task)
    await asyncio.wait(tasks)

if __name__ == '__main__':
    start = time.time()
    print("Task started; timing in the background")
    asyncio.run(main())
    end = time.time()
    all_time = int(end - start)
    print("Task finished; timing stopped")
    print('Total scraping time: {}s'.format(all_time))
Adding async coroutines made downloading much faster, but before the first download starts the script collects every image child-page link in one pass in the background, which uses a lot of system resources, and after an error it is hard to tell exactly how far the run got.
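One way to avoid collecting every child-page link up front, and to keep progress visible as the run proceeds, is to stream URLs through an asyncio.Queue: a producer pushes links one listing page at a time while a small pool of consumers processes them as they arrive. A minimal sketch under those assumptions; the list_page_links and worker helpers are illustrative and simply reuse the XPath expressions from the code above:

import asyncio
import aiohttp
from lxml import etree

WORKERS = 5

async def list_page_links(session, queue, pages):
    # producer: push child-page URLs onto the queue one listing page at a time
    for page_num in pages:
        url = ('https://www.umei.cc/bizhitupian/diannaobizhi'
               if page_num == 1 else
               'https://www.umei.cc/bizhitupian/diannaobizhi/index_{}.htm'.format(page_num))
        async with session.get(url) as resp:
            html = etree.HTML(await resp.text())
        for href in html.xpath('//div[contains(@class,"img")]/a/@href'):
            await queue.put('https://www.umei.cc' + href)
    for _ in range(WORKERS):
        await queue.put(None)  # one sentinel per worker signals "no more work"

async def worker(session, queue):
    # consumer: handle one child-page URL at a time, so memory stays bounded
    while (link := await queue.get()) is not None:
        async with session.get(link) as resp:
            html = etree.HTML(await resp.text())
        srcs = html.xpath('//div[contains(@class,"big-pic")]/a/img/@src')
        if srcs:
            print('would download', srcs[0])  # hand off to download_pic() here
        queue.task_done()

async def run(pages=range(1, 4)):
    queue = asyncio.Queue(maxsize=50)  # a bounded queue caps memory use
    async with aiohttp.ClientSession() as session:
        await asyncio.gather(list_page_links(session, queue, pages),
                             *(worker(session, queue) for _ in range(WORKERS)))

# asyncio.run(run())

With this shape, downloads begin as soon as the first listing page has been parsed, and the console output shows which links have actually been handled, which also makes it easier to judge progress after an error.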