Python 爬虫批量下载图片：从列表页到详情页

最新推荐文章于 2022-12-16 18:08:33 发布

Nobita Chen

最新推荐文章于 2022-12-16 18:08:33 发布

阅读量1.2k

点赞数 2

分类专栏： python 文章标签： python 爬虫

本文链接：https://blog.csdn.net/chenxiong103/article/details/101865251

版权

python 专栏收录该内容

8 篇文章 0 订阅

订阅专栏

#!/usr/bin/python
# -*- coding: UTF-8 -*-

import os, sys, time
import urllib.request, requests, bs4

#os._exit(0)

def downfiles(imglist, dest_dir="D:\\360Downloads", timeout=10):
    """Download every image URL in ``imglist`` into ``dest_dir``.

    Args:
        imglist: Iterable of image URLs. The last ``/``-separated segment
            of each URL is used as the file name.
        dest_dir: Directory the files are written to. It is created on
            first use if it does not exist yet.
        timeout: Seconds to wait for the server before giving up on one
            image.

    Returns:
        None. Progress is reported on stdout.
    """
    saved = 0
    for imgurl in imglist:
        try:
            imgres = requests.get(imgurl, timeout=timeout)
            # Without this check an HTTP error page would be saved as an image.
            imgres.raise_for_status()
        except requests.RequestException as err:
            # Best effort: one broken image must not abort the whole batch.
            print("下载失败 =>", imgurl, err)
            continue

        fname = imgurl.split('/')[-1]
        # Only add ".jpg" when the URL name carries no extension, so that
        # "a.jpg" is not stored as "a.jpg.jpg".
        if not os.path.splitext(fname)[1]:
            fname += ".jpg"

        # Created lazily so that an empty batch leaves no directory behind.
        os.makedirs(dest_dir, exist_ok=True)
        with open(os.path.join(dest_dir, fname), "wb") as f:
            f.write(imgres.content)
        saved += 1
        print("第", saved, "张")
    print("下载完毕")


def getdetails(url):
    """Fetch one detail page and download the images it shows.

    Selects the ``<img>`` elements that are direct children of
    ``.reveal-work-wrap``, collects their ``src`` values and hands the
    resulting list to ``downfiles``.

    Args:
        url: Address of the detail page.

    Raises:
        requests.HTTPError: If the page request returns an error status.
    """
    # A timeout keeps one stalled server from hanging the whole crawl.
    res = requests.get(url, timeout=10)
    res.raise_for_status()
    html = bs4.BeautifulSoup(res.text, 'html5lib')

    downloadedList = []  # image URLs found on this page
    for img in html.select('.reveal-work-wrap > img'):
        src = img.get('src')
        if not src:
            # An <img> without src has nothing to download; the original
            # crashed here with None.split.
            continue
        # Keep only the part before the first '@' (presumably a resize
        # suffix added by the image host -- TODO confirm).
        target = src.split('@')[0]
        downloadedList.append(target)
        print(target)

    downfiles(downloadedList)

# Fetch the listing page and follow every card link to its detail page.
url = 'https://www.zcool.com.cn/'
# A timeout keeps a stalled server from hanging the script forever.
res = requests.get(url, timeout=10)
downloadedList = []  # detail-page URLs found on the listing page

res.raise_for_status()
html = bs4.BeautifulSoup(res.text, 'html5lib')
data = html.select('.card-img > a')  # anchors that are direct children of .card-img

for path in data:
    target = path.get('href')  # link to the detail page
    if not target:
        # An anchor without href cannot be followed; skip it instead of
        # crashing on None.split.
        continue
    target = target.split('@')[0]  # keep only the part before the first '@'
    downloadedList.append(target)
    print(target)
    getdetails(target)  # download the images of this detail page

print(type(downloadedList))

优化版（按日期和标题分目录保存，并兼容两种详情页模板）

#!/usr/bin/python
# -*- coding: UTF-8 -*-

import os, sys, time
import urllib.request, requests, bs4

# Characters removed from titles before they are used as folder names:
# the ones the original implementation stripped, plus the backslash,
# ASCII question mark and double quote, which Windows also rejects.
_FILENAME_STRIP_TABLE = dict.fromkeys(map(ord, '？、/╲*<>|:～！\\?"'))


def strreplace(str):
    """Return ``str`` cleaned up for use as a Windows file or folder name.

    Leading whitespace is dropped, every character listed in
    ``_FILENAME_STRIP_TABLE`` is removed, and trailing spaces and dots are
    stripped because Windows does not allow names to end with them.

    Args:
        str: The raw title. The parameter name shadows the builtin but is
            kept so existing keyword callers continue to work.

    Returns:
        The cleaned name; it may be empty if nothing usable is left.
    """
    # A single translate pass replaces the former chain of .replace() calls.
    cleaned = str.lstrip().translate(_FILENAME_STRIP_TABLE)
    return cleaned.rstrip(' .')


def downfiles(imglist, title, timeout=10):
    """Download ``imglist`` into a per-day, per-title folder.

    The folder is ``D:\\360Downloads\\<YYYYMMDD>\\<cleaned title>\\``. If it
    already exists the whole batch is skipped, so a work that was handled
    earlier on the same day is not downloaded twice.

    Args:
        imglist: Iterable of image URLs. The last ``/``-separated segment
            of each URL is used as the file name.
        title: Title of the work; cleaned with ``strreplace`` before it is
            used as the folder name.
        timeout: Seconds to wait for the server before giving up on one
            image.

    Returns:
        None. Progress is reported on stdout.
    """
    folder = 'D:\\360Downloads\\' + time.strftime("%Y%m%d", time.localtime()) + '\\' + strreplace(title) + '\\'

    # Guard clause: an existing folder marks the work as already handled.
    if os.path.exists(folder):
        print('忽略 => ' + title)
        return

    os.makedirs(folder)
    print(folder + ' => 已创建')

    saved = 0
    for imgurl in imglist:
        try:
            imgres = requests.get(imgurl, timeout=timeout)
            # Without this check an HTTP error page would be saved as an image.
            imgres.raise_for_status()
        except requests.RequestException as err:
            # Best effort: one broken image must not abort the whole batch.
            print('下载失败 =>', imgurl, err)
            continue

        fname = imgurl.split('/')[-1]
        with open(folder + fname, "wb") as f:
            f.write(imgres.content)
        saved += 1
        print("第", saved, "张")

def geturl(url, timeout=10):
    """Fetch ``url`` and return the page parsed with BeautifulSoup.

    Args:
        url: Address of the page to load.
        timeout: Seconds to wait for the server; without a timeout a
            stalled connection would block the crawl indefinitely.

    Returns:
        A ``bs4.BeautifulSoup`` tree built with the ``html5lib`` parser.

    Raises:
        requests.HTTPError: If the server answers with an error status.
    """
    res = requests.get(url, timeout=timeout)
    res.raise_for_status()
    return bs4.BeautifulSoup(res.text, 'html5lib')

def getdetails(url, title):
    """Collect the image URLs of one detail page and download them.

    Two page templates are handled: images inside ``.reveal-work-wrap``
    and, when that selector matches nothing, images inside
    ``.article-content-wraper``.

    Args:
        url: Address of the detail page.
        title: Title of the work, passed on to ``downfiles``.
    """
    from urllib.parse import urljoin

    html = geturl(url)
    data = html.select('.reveal-work-wrap img')

    # Only URLs from the first template get the '@...' part cut off; the
    # fallback template's src values are used unchanged.
    strip_suffix = True
    if not data:
        data = html.select('.article-content-wraper img')
        strip_suffix = False

    downloadedList = []  # image URLs found on this page
    for img in data:
        src = img.get('src')
        if not src:
            # An <img> without src has nothing to download; the original
            # crashed on None.split or passed None on to downfiles.
            continue
        if strip_suffix:
            src = src.split('@')[0]
        # Resolve relative and protocol-relative ("//host/...") sources
        # against the page URL; absolute URLs pass through unchanged.
        target = urljoin(url, src)
        downloadedList.append(target)
        print(target)

    downfiles(downloadedList, title)

def getlinks(url='https://www.zcool.com.cn/'):
    """Entry point: find the works on a listing page and download each one.

    Every anchor matching ``.card-img > a`` is followed to its detail page
    via ``getdetails``.

    Args:
        url: Address of the listing page.
    """
    from urllib.parse import urljoin

    html = geturl(url)

    for anchor in html.select('.card-img > a'):
        href = anchor.get('href')  # link to the detail page
        if not href:
            # An anchor without href cannot be followed; the original
            # crashed here with None.split.
            continue
        # Keep only the part before the first '@', then resolve relative
        # links against the listing URL (absolute links pass through).
        target = urljoin(url, href.split('@')[0])

        # A missing title attribute used to raise TypeError in the string
        # concatenation below; fall back to the last path segment.
        title = anchor.get('title') or target.rstrip('/').split('/')[-1]

        print(title + ' => ' + target)
        getdetails(target, title)  # download the images of this work

# Script entry point: start the crawl from this listing URL; getlinks
# follows every card link on the page and downloads its images.
getlinks('https://www.zcool.com.cn/discover/607!0!0!0!0!!!!2!-1!1')

Nobita Chen

关注

2
点赞
踩
4

收藏

觉得还不错? 一键收藏
0
评论
python爬虫批量下载图片之从列表至详情页

#!/usr/bin/python# -*- coding: UTF-8 -*-import os, sys, timeimport urllib.request, requests, bs4#os._exit(0)'''下载文件'''def downfiles(imglist): #fname = time.strftime("%Y%m%d%H%M%S", time....
复制链接

扫一扫