python爬取图片地址,并将图片保存到服务器

最近在做一项下载图片的任务,所有的图片地址都是使用爬虫爬取下来并保存在mongodb数据库里面的,

由于图片部分不是我自己抓取的,当时不知道不同的库之间还存在相同的图片地址,所以使用下面的代码导致部分图片被下载了两遍,而数据库只存了一遍,浪费了服务器空间。可以在下载之前先查询数据库中是否已经下载过该图片;当然,如果确定图片地址没有重复,就不必做此判断了。

# -*- coding: utf-8 -*-
import hashlib
import uuid
import os
import urllib.request
from pymongo import MongoClient
import datetime


def get_unique_name():
    """Return a unique hex name for a new image file.

    A random UUID4 is hashed with MD5 so the name is a fixed-width
    32-char lowercase hex string, safe to use as a filename.
    """
    random_id = str(uuid.uuid4()).encode("utf-8")
    return hashlib.md5(random_id).hexdigest()


def get_old_image_md5(addr):
    """Return the MD5 hex digest of *addr* (used as the mapping "_id").

    *addr* is coerced to ``str`` first, so non-string inputs (e.g. ints)
    hash the same as their string representation.
    """
    encoded = str(addr).encode("utf-8")
    return hashlib.md5(encoded).hexdigest()


# ---- Read side: DB holding the scraped image URLs -------------------------
# NOTE(review): the original post replaced real DB/collection names with the
# placeholders 数据库名/集合名, yet the rest of the script reads the names
# `source`, `sourceDownPics` and `errorUrl` (and L37 referenced an undefined
# `sourcePS_url`).  The bindings below define exactly the names the rest of
# the file uses; substitute the real host/DB/collection names when deploying.
conn = MongoClient('xx.xx.xx.xx', 27017)
source_db = conn.source_db
source = source_db.sourcePS_url  # collection of scraped image URLs to fetch
print("连接原始图片库成功")
print("本次需要下载的图片个数为:%s" % source.find().count())


# ---- Write side: DB recording url -> local-path mappings and failures -----
cli = MongoClient('xx.xx.xx.xx', 27017)
vgs_source = cli.vgs_source
sourceDownPics = vgs_source.sourceDownPics  # successful download mappings
errorUrl = vgs_source.errorUrl              # URLs that failed to download
print("连接保存图片库成功")

# Root directory under which all images are saved.
base_file_path = '/data/images/images/'


# Maps the record's "type" field to a subdirectory name; replaces the original
# 15-branch if/elif chain.  Types not listed get no subdirectory, matching the
# original fall-through behavior.
_TYPE_DIRS = {
    "other": "other",
    "其他": "other",
    "音乐": "music",
    "儿童": "child",
    "少儿": "child",
    "动漫": "cartoon",
    "戏曲": "traditional_opera",
    "新闻": "news",
    "综艺": "variety",
    "电视剧": "teleplay",
    "电影": "movie",
    "体育": "sports",
    "教育": "education",
    "栏目": "column",
    # keep the original (misspelled) directory name so paths stay consistent
    # with images already downloaded
    "纪录片": "docomentaires",
}

# Sources whose records are skipped entirely.
_SKIP_SOURCES = ("TVSOU", "CISCP", "CCS")


def _normalize_url(raw_url):
    """Return a downloadable http URL built from the scraped address."""
    image_url = raw_url
    # Protocol-relative address ("//host/a.jpg") -> prepend "http:".
    if image_url.startswith("//"):
        image_url = "http:" + image_url
    # NOTE(review): the original line was `replace("", "//")`, which inserts
    # "//" between every character and can never download — the blog engine
    # evidently swallowed backslashes.  Presumed intent: turn "\" into "//".
    # TODO confirm against real data.
    if "\\" in image_url:
        image_url = image_url.replace("\\", "//")
    return image_url


def _log_failure(raw_url, num):
    """Record a failed download: print, save to errorUrl, append local log."""
    print("%s下载失败" % raw_url)
    errorUrl.save({"image_url": raw_url, "table": "sourcePS"})
    with open("sourcePS.txt", "a") as f:
        f.write("num: %s," % num)
        f.write("%s下载失败\n" % raw_url)


def _download_one(raw_url, file_path, num):
    """Download one image and record the url->path mapping.

    Returns True on success, False on failure (failure is logged).
    """
    image_url = _normalize_url(raw_url)
    # Always store as .jpg; a random MD5 name guarantees uniqueness.
    filename = file_path + get_unique_name() + ".jpg"
    try:
        urllib.request.urlretrieve(image_url, filename=filename)
    except Exception:  # narrowed from the original bare `except:`
        _log_failure(raw_url, num)
        return False
    # save() upserts on _id (MD5 of the source url), so a duplicate url
    # only ever keeps one mapping record.
    sourceDownPics.save({
        "_id": get_old_image_md5(image_url),
        "sourcePicUrl": image_url,
        "downPicPath": filename,
        "createTime": datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
    })
    print("%s下载成功" % raw_url)
    return True


# 下载图片
def getPictures():
    """Walk the `source` collection and download each record's "poster" and
    "photo" images into base_file_path/<source>/<type>/, recording every
    url -> local-path mapping in `sourceDownPics`.
    """
    num = 0  # count of successfully downloaded images
    # BUG FIX: the original assigned x.get("source") to `source` inside this
    # function, making `source` a local and raising UnboundLocalError at
    # `source.find()`.  The per-record value is named `site` instead.
    for x in source.find().batch_size(80):
        site = x.get("source")
        if site in _SKIP_SOURCES:
            print("不要", site + "数据")
            continue
        print(site)
        file_path = base_file_path + site.lower() + "/"
        subdir = _TYPE_DIRS.get(x.get("type"))
        if subdir is not None:
            file_path = file_path + subdir + "/"
        # Create the target directory on first use.
        if not os.path.exists(file_path):
            os.makedirs(file_path)
        # BUG FIX: the original `continue` after a failed poster download
        # also skipped the photo field; the two fields are now independent.
        for field in ("poster", "photo"):
            raw_url = x.get(field)
            if raw_url and _download_one(raw_url, file_path, num):
                num += 1
                print("num", num)


def _main():
    """Script entry point: download everything, then print a done marker."""
    getPictures()
    print("下载结束")


if __name__ == '__main__':
    _main()

 

评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值