描述
利用python3爬取 图虫网图片,请按照国家规范爬取资源,勿占用社会资源,学习可以,请不要商用
思路
- 分析网页内容
- 获取图片地址
- 下载保存图片
- 上传到图库
- 供大家欣赏使用
注意事项
爬取图片时 请注意不要占用资源
代码讲解
导包
pip3 install selenium requests fake_useragent （time、os、random、urllib 均为 Python 标准库，无需安装）
需要用到的包
from selenium import webdriver
from selenium.webdriver.common.by import By
import time
import os
import requests
import random
from urllib import parse
from fake_useragent import UserAgent
代码部分
#!/usr/bin/python3
# -*- coding:utf-8 -*-
def get_page(driver, url, path):
    """Load *url* in the driven browser, then scrape and download its images."""
    print('open chrome ...')
    driver.get(url)
    print('Waiting for the chrome browser to load resources')
    # Give the page a moment to finish rendering its dynamic content.
    time.sleep(2)
    page_html = driver.page_source
    print("resources load OK content len : ", len(page_html))
    downloadImages(getDownloadUrl(driver), path)
def getDownloadUrl(driver):
    """Collect "imageId,downloadUrl" entries for every search result on the page.

    The CDN serves the large rendition at
    https://cdn3-banquan.ituchong.com/weili/image/l/{imageId}.jpeg
    (the old comment said /ml/, which did not match the code).

    Returns a list of strings of the form "<imageId>,<url>", the format
    downloadImages() expects.
    """
    items = driver.find_elements(By.CLASS_NAME, value='search-result__item')
    urls = []
    for item in items:
        href = item.get_attribute("href")
        # Robustness fix: the original crashed with a KeyError/TypeError when a
        # result's link was missing or carried no imageId query parameter.
        # Skip such entries instead of aborting the whole scrape.
        if not href:
            continue
        try:
            image_id = get_params(href)[0]
        except (KeyError, IndexError):
            continue
        url = "https://cdn3-banquan.ituchong.com/weili/image/l/{}.jpeg".format(image_id)
        urls.append(image_id + "," + url)
    return urls
def downloadImages(urls, path):
    """Download every "id,url" entry produced by getDownloadUrl into *path*."""
    print(" start download images")
    for entry in urls:
        parts = entry.split(",")
        download_img(parts[1], parts[0], path)
        # Pause between downloads so repeated requests don't get the IP banned.
        time.sleep(2)
def download_img(url, name, path):
    """Fetch one image from *url* and save it under *path* as <name><timestamp>.jpg.

    Fixes over the original:
    - os.mkdir raised when parent directories were missing, and raced if the
      directory appeared between the exists() check and the call;
      os.makedirs(..., exist_ok=True) handles both.
    - path + name silently produced a wrong filename when *path* lacked a
      trailing slash; os.path.join is separator-safe.
    """
    os.makedirs(path, exist_ok=True)
    print('download images ID:', name, " url: ", url)
    file_path = os.path.join(path, name + str(time.time()) + ".jpg")
    # Random back-off so the scraper stays polite and avoids an IP ban.
    time.sleep(random.randint(5, 20))
    content = request_get(url, "image")
    with open(file_path, 'wb') as f:
        f.write(content)
def get_params(url):
    """Return the list of 'imageId' values from *url*'s query string.

    Raises KeyError when the query string has no 'imageId' parameter.
    """
    query_string = parse.urlparse(url).query
    return parse.parse_qs(query_string)['imageId']
# HTTP request helper
def request_get(url, ret_type="text", timeout=20, encoding="GBK"):
    """GET *url* with a random User-Agent header.

    ret_type "text" returns the decoded body, "image" returns the raw
    bytes; any other value returns None (implicitly), matching the
    original contract.
    """
    response = requests.get(
        url=url,
        headers={'User-Agent': UserAgent().random},
        timeout=timeout,
    )
    response.encoding = encoding
    if ret_type == "image":
        return response.content
    if ret_type == "text":
        return response.text
if __name__ == '__main__':
    driver = webdriver.Chrome()
    # Bug fix: the original URL hard-coded page=1 and contained no "{}"
    # placeholder, so url.format(i) returned the identical page-1 URL on
    # every iteration and the same page was scraped four times.
    url_template = ("https://stock.tuchong.com/search?isTrusted=true&page={}"
                    "&platform=image&term=%E6%97%B6%E5%B0%9A")
    path = "D:/images/"
    urls = []
    for page in range(1, 5):
        page_url = url_template.format(page)
        urls.append(page_url)
        print("download images list URL info :", page_url)
    try:
        for url in urls:
            get_page(driver, url, path)
            print("start download next page images ")  # typo fix: "dowmload"
    finally:
        # quit() shuts down the browser AND the chromedriver process;
        # close() only closes the current window and leaks the driver.
        driver.quit()
运行效果预览
赏析地址传送门: http://lccsetsun.free.nf/