Python抓取图片

Python 抓取图片(记录)

记录过程,怕忘了。复制就能用。

# coding=utf-8
import os
import platform
from multiprocessing.pool import ThreadPool

import lxml
import requests
from lxml import etree
import time
from apscheduler.schedulers.blocking import BlockingScheduler
import logging
import random
import bs4
import sys
from random import randint
from clint.textui import progress

# Listing to crawl. Other wallhaven listings: https://wallhaven.cc/toplist (ranking),
# latest (newest), hot (popular), random.
url = "https://wallhaven.cc/latest"

# Download directory (Windows-style absolute path).
path = "D:\\Download\\wallhaven\\latest"

# User-Agent string sent with every HTTP request.
headers = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.111 Safari/537.36'

# Request headers that make the requests look like they come from a browser.
Hostreferer = {
    'User-Agent': headers,
    # 'Connection': 'keep-alive',
    'Referer': 'https://wallhaven.cc/toplist',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9'
}


# Create a directory and make it the working directory.
def createFile(file_path):
    """Ensure ``file_path`` exists as a directory, then ``chdir`` into it.

    Missing parent directories are created as well. Note that this changes
    the process-wide current working directory, so relative paths used
    afterwards resolve inside ``file_path``.
    """
    already_there = os.path.exists(file_path)
    if not already_there:
        os.makedirs(file_path)
    # Switch into the directory (freshly created or pre-existing).
    os.chdir(file_path)


# Timestamp captured once when the module is loaded; it is not referenced
# anywhere else in the visible code.
times = time.strftime("%Y-%m-%d_%H:%M:%S")
# Configure the root logger: INFO and above are appended to "log.txt"
# (resolved against the working directory at the time this line runs), each
# record prefixed with timestamp, source file name, line number and level.
logging.basicConfig(level=logging.INFO,
                    format='%(asctime)s %(filename)s[line:%(lineno)d] %(levelname)s %(message)s',
                    datefmt='%Y-%m-%d %H:%M:%S',
                    filename="log.txt",
                    filemode='a')


def imgs(lv):
    """Crawl the configured wallhaven listing and download every wallpaper found.

    For each of the first ``max_pages`` listing pages this opens the detail
    page of every thumbnail, resolves the full-size image URL, and finally
    downloads all collected images through ``url_response``. Every image URL
    is also appended to ``imgurl.txt`` in the current working directory.

    A failure of a single listing page, detail page or download is recorded
    and reported in the summary printed at the end instead of aborting the
    crawl. Any other error aborts the current attempt and the whole crawl is
    retried.

    Args:
        lv: 1-based number of the current attempt. The crawl is retried
            until three attempts have been made.
    """
    # These names were module globals in the original implementation; they
    # stay global for backward compatibility and are initialised up front so
    # they are always defined, whatever step fails first.
    global s, t, k, page
    s, t, k, page = 0, 0, "", url

    max_pages = 3     # number of listing pages to crawl
    max_attempts = 3  # attempts of the whole crawl before giving up

    # Listing pages, detail pages and image downloads that could not be fetched.
    pagenumber = {}
    pictures = {}
    piececode = {}
    # [local file path, image URL] pairs collected for the download phase.
    urls = []
    try:
        # Hit the listing once up front: if the site is unreachable this
        # raises and the attempt is aborted before any directory is created.
        requests.get(url, headers=Hostreferer, timeout=5).close()
        createFile(path)

        # Read the total page count from the header of a second page.
        # NOTE(review): this always queries the "toplist" listing, regardless
        # of ``url``, and the value is only displayed - confirm whether it
        # should follow ``url`` and/or drive ``max_pages``.
        kom = requests.get("https://wallhaven.cc/toplist?page=2", headers=Hostreferer, timeout=5)
        try:
            ls = etree.HTML(kom.text)
        finally:
            kom.close()
        pagenum = ls.xpath("//div[@id='thumbs']/section/header/h2/text()")[1]
        pagenum = pagenum[3:]
        print("总页数:" + pagenum)

        for i in range(1, max_pages + 1):
            page = url + "?page=" + str(i)
            print("图片页码:" + page)
            # One sub-directory per listing page.
            file = os.path.join(path, str(i))
            createFile(file)
            try:
                two = requests.get(page, headers=Hostreferer, timeout=5)
                try:
                    two.encoding = "utf-8"
                    two_j = etree.HTML(two.text)
                finally:
                    two.close()
                pic = two_j.xpath("//div[@id='thumbs']/section/ul/li/figure/a/@href")
            except Exception as e:
                print(e)
                pagenumber["第" + str(i) + "页"] = page
                pic = []

            # Number thumbnails from 1 for display while still visiting every
            # entry (the first one used to be skipped).
            for j, pich in enumerate(pic, start=1):
                t = j
                echo("success", "第" + str(i) + "页第" + str(j) + "个图片")
                try:
                    imgurl = requests.get(pich, headers=Hostreferer, timeout=5)
                    try:
                        imgurl.encoding = "utf-8"
                        img = etree.HTML(imgurl.text)
                    finally:
                        imgurl.close()
                    image = img.xpath("//section[@id='showcase']/div/img/@src")
                    echo("success", "图片下载路径:" + image[0])
                    # The last URL segment is used as the local file name.
                    file_name = image[0].split('/')[-1]
                    fileimg = os.path.join(file, str(file_name))
                    k = fileimg
                    urls.append([fileimg, image[0]])
                except Exception as e:
                    print("下载出了个问题")
                    print(e)
                    pictures["第" + str(i) + "页第" + str(j) + "个套图"] = pich
            s = i

        print(urls)
        imgurls = "imgurl.txt"
        print("开始下载...")
        print("图片数量:" + str(len(urls)))
        # Open the URL list once and let the context manager close it.
        with open(imgurls, "a+") as g:
            for n, (fileimg, src) in enumerate(urls, start=1):
                g.write(str(src) + "\n")
                try:
                    url_response(src, fileimg, n)
                except Exception as e:
                    # One broken image must not restart the whole crawl.
                    print(e)
                    piececode["第" + str(n) + "个图片"] = src
        print("图片抓取完成")

        if pagenumber:
            print("无法抓取页面:")
            print(pagenumber)
        else:
            print("无法抓取页面:0")
        if pictures:
            print("无法抓取套图:")
            print(pictures)
        else:
            print("无法抓取套图:0")
        if piececode:
            print("无法抓取张码:")
            print(piececode)
        else:
            print("无法抓取张码:0")
    except Exception as e:
        print(e)
        # Retry the whole crawl until the attempt budget is used up.
        if lv < max_attempts:
            imgs(lv + 1)


# Download helper.
def url_response(url, imglen, n):
    """Stream the image at ``url`` into the file ``imglen`` with a progress bar.

    Args:
        url: Address of the image to download.
        imglen: Path of the local file to write. An existing file is
            overwritten, so re-running a download does not append a second
            copy of the data to it.
        n: Ordinal of the image, used only in the final size message.

    Raises:
        requests.RequestException: if the request fails or times out.
        OSError: if the target file cannot be written.
    """
    # Bytes per streamed chunk.
    chunk_size = 1024
    size = 0
    r = requests.get(url, headers=Hostreferer, timeout=5, stream=True)
    try:
        # The header is absent for some responses (e.g. chunked transfer);
        # treat that as "total size unknown" instead of failing.
        content_size = int(r.headers.get('content-length', 0))
        with open(imglen, 'wb') as f:
            # Write to disk while downloading, one chunk at a time.
            for chunl in r.iter_content(chunk_size=chunk_size):
                f.write(chunl)
                size += len(chunl)  # bytes downloaded so far
                # '\r' returns to the start of the line so that, together
                # with end='', each update overwrites the previous one.
                if content_size > 0:
                    print('\r' + '[下载进度]: %s%.2f%%' % ('>' * int(size * 50 / content_size), float(size / content_size * 100)),
                          end='')
                else:
                    print('\r' + '[下载进度]: %d B' % size, end='')
    finally:
        r.close()
    # Fall back to the byte count actually received when the server did not
    # announce a size.
    total = content_size if content_size > 0 else size
    print('[第' + str(n) + '个图片大小]: %0.2f MB' % (total / chunk_size / 1024))


def echo(color, *args):
    """Print ``args`` joined by spaces, coloured with ANSI codes when possible.

    Args:
        color: One of ``'error'``, ``'success'`` or ``'info'``. Any other
            value prints the text without colour.
        *args: Strings to print, joined with single spaces.

    On Windows the text is always printed without colour codes.
    """
    colors = {'error': '\033[91m', 'success': '\033[94m', 'info': '\033[93m'}
    text = ' '.join(args)
    if color not in colors or platform.system() == 'Windows':
        print(text)
        # Stop here: falling through would look up an unknown colour
        # (KeyError) or print the message a second time on Windows.
        return
    print(colors[color], text, '\033[0m')


if __name__ == '__main__':
    # Make sure the URL list file exists, closing the handle right away
    # instead of leaking it for the lifetime of the process.
    imgurl = "imgurl.txt"
    with open(imgurl, "a+"):
        pass

    # Run one crawl and report how long it took.
    start = time.time()
    imgs(1)
    end = time.time()
    print('\n' + "全部下载完成!用时%s秒" % (end - start))

    # NOTE(review): no job is added to the scheduler in the visible code, so
    # start() only keeps the process alive - confirm whether a periodic
    # imgs() job was intended here.
    scheduler = BlockingScheduler()
    scheduler._logger = logging
    scheduler.start()

  • 1
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值