python爬取图片地址，并将图片保存到服务器

最新推荐文章于 2021-11-15 10:38:55 发布

一起学python吧

最新推荐文章于 2021-11-15 10:38:55 发布

阅读量2.9k

点赞数

分类专栏：图片下载 python 文章标签：图片下载

本文链接：https://blog.csdn.net/myli_binbin/article/details/98957593

版权

python 同时被 2 个专栏收录

59 篇文章 4 订阅

订阅专栏

图片下载

4 篇文章 0 订阅

订阅专栏

最近在做一项下载图片的任务，所有的图片地址都是使用爬虫爬取下来并保存在mongodb数据库里面的，

由于当时图片部分不是我自己抓取的，不知道不同的库之间还会存在相同的图片地址，所以使用下面代码导致了部分图片下载两遍的情况，但是数据库只存了一遍，浪费了服务器空间（可以在下载之前查一下数据库中是否已经下载了该图片，当然，如果确定图片肯定没有重复的就不用做此判断了）

# -*- coding: utf-8 -*-
import hashlib
import uuid
import os
import urllib.request
from pymongo import MongoClient
import datetime


# 生成新图片唯一名称
def get_unique_name():
    uuid_val = uuid.uuid4()
    uuid_str = str(uuid_val).encode("utf-8")
    md5 = hashlib.md5()
    md5.update(uuid_str)
    return md5.hexdigest()


# 生成对应关系时“_id”的值
def get_old_image_md5(addr):
    addr = str(addr).encode("utf-8")
    md5 = hashlib.md5()
    md5.update(addr)
    return md5.hexdigest()


conn = MongoClient('xx.xx.xx.xx', 27017)
数据库名 = conn.数据库名
集合名 = 数据库名.集合名
print("连接原始图片库成功")
print("本次需要下载的图片个数为:%s" % sourcePS_url.find().count())


cli = MongoClient('xx.xx.xx.xx', 27017)
数据库名 = cli.数据库名
集合名 = 数据库名.集合名
errorUrl = vgs_source.errorUrl
print("连接保存图片库成功")
base_file_path = '/data/images/images/'


# 下载图片
def getPictures():
    num = 0
    for x in source.find().batch_size(80):
        # 找出属于那个网站的数据
        source = x.get("source")
        if source == "TVSOU" or source == "CISCP" or source == "CCS":
            print("不要", source + "数据")
            continue
        else:
            print(source)
            file_path = base_file_path + source.lower() + "/"
            # 找出属于那种类型
            type = x.get("type")
            if type == "other":
                file_path = file_path + "other" + "/"
            elif type == "音乐":
                file_path = file_path + "music" + "/"
            elif type == "儿童":
                file_path = file_path + "child" + "/"
            elif type == "动漫":
                file_path = file_path + "cartoon" + "/"
            elif type == "戏曲":
                file_path = file_path + "traditional_opera" + "/"
            elif type == "新闻":
                file_path = file_path + "news" + "/"
            elif type == "综艺":
                file_path = file_path + "variety" + "/"
            elif type == "电视剧":
                file_path = file_path + "teleplay" + "/"
            elif type == "其他":
                file_path = file_path + "other" + "/"
            elif type == "电影":
                file_path = file_path + "movie" + "/"
            elif type == "少儿":
                file_path = file_path + "child" + "/"
            elif type == "体育":
                file_path = file_path + "sports" + "/"
            elif type == "教育":
                file_path = file_path + "education" + "/"
            elif type == "栏目":
                file_path = file_path + "column" + "/"
            elif type == "纪录片":
                file_path = file_path + "docomentaires" + "/"
            # 如果没有这个path则直接创建
            if not os.path.exists(file_path):
                os.makedirs(file_path)
            if x.get("poster"):
                image_url = x.get("poster")
                if image_url.split("//")[0] == "":
                    image_url = "http:" + image_url
                if "" in image_url:
                    image_url = image_url.replace("", "//")
                # 取源url地址的后缀
                # file_suffix = os.path.splitext(image_url)[1]
                filename = file_path + get_unique_name() + ".jpg"
                try:
                    urllib.request.urlretrieve(image_url, filename=filename)
                except:
                    print("%s下载失败" % x.get("poster"))
                    errorUrl.save({"image_url": x.get("poster"), "table": "sourcePS"})
                    with open("sourcePS.txt", "a") as f:
                        f.write("num: %s," % num)
                        f.write("%s下载失败\n" % x.get("poster"))
                    continue
                sourceDownPics.save({"_id": get_old_image_md5(image_url), "sourcePicUrl": image_url, "downPicPath": filename, "createTime": datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")})
                print("%s下载成功" % x.get("poster"))
                num += 1
                print("num", num)
            if x.get("photo"):
                image_url = x.get("photo")
                if image_url.split("//")[0] == "":
                    image_url = "http:" + image_url
                if "" in image_url:
                    image_url = image_url.replace("", "//")
                # 取源url地址的后缀
                # file_suffix = os.path.splitext(image_url)[1]
                filename = file_path + get_unique_name() + ".jpg"
                try:
                    urllib.request.urlretrieve(image_url, filename=filename)
                except:
                    print("%s下载失败" % x.get("photo"))
                    errorUrl.save({"image_url": x.get("photo"), "table": "sourcePS"})
                    with open("sourcePS.txt", "a") as f:
                        f.write("num: %s," % num)
                        f.write("%s下载失败\n" % x.get("photo"))
                    continue
                sourceDownPics.save({"_id": get_old_image_md5(image_url), "sourcePicUrl": image_url, "downPicPath": filename, "createTime": datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")})
                print("%s下载成功" % x.get("photo"))
                num += 1
                print("num", num)


if __name__ == '__main__':
    getPictures()
    print("下载结束")