I've recently been working on a task that downloads images whose URLs were scraped by a crawler and stored in MongoDB.
Because I wasn't the one who scraped the image data, I didn't know that the same image URL could show up in more than one collection. As a result, the code below downloaded some images twice, even though the database recorded each of them only once, wasting server space. (You can query the database before downloading to check whether an image has already been fetched, as in the sketch right after this paragraph; of course, if you are certain there are no duplicate URLs, you can skip that check.)
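A minimal sketch of that pre-download check. It assumes the sourceDownPics collection and the URL-md5 "_id" scheme used in the full script below; the helper name and the projection are my own additions:

import hashlib

def already_downloaded(collection, image_url):
    # Records in sourceDownPics use the md5 of the source URL as their "_id",
    # so an existence check on that key tells us whether the image was fetched.
    url_md5 = hashlib.md5(str(image_url).encode("utf-8")).hexdigest()
    return collection.find_one({"_id": url_md5}, {"_id": 1}) is not None

# Hypothetical use inside the download loop:
# if already_downloaded(sourceDownPics, image_url):
#     continue  # already on disk, skip the download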
# -*- coding: utf-8 -*-
import hashlib
import uuid
import os
import urllib.request
from pymongo import MongoClient
import datetime
# Generate a unique file name for each downloaded image
def get_unique_name():
    uuid_val = uuid.uuid4()
    uuid_str = str(uuid_val).encode("utf-8")
    md5 = hashlib.md5()
    md5.update(uuid_str)
    return md5.hexdigest()
# Generate the "_id" that maps a source URL to its download record
def get_old_image_md5(addr):
    addr = str(addr).encode("utf-8")
    md5 = hashlib.md5()
    md5.update(addr)
    return md5.hexdigest()
# Connection to the database holding the scraped image URLs.
# The database/collection names below were redacted in the original post;
# they are reconstructed here from how they are used in the rest of the script.
conn = MongoClient('xx.xx.xx.xx', 27017)
source_db = conn.source_db
sourcePS_url = source_db.sourcePS_url
print("Connected to the source image database")
# Cursor.count() was removed in PyMongo 4; count_documents() is the equivalent
print("Number of images to download this run: %s" % sourcePS_url.count_documents({}))

# Connection to the database that records downloaded images and failures
cli = MongoClient('xx.xx.xx.xx', 27017)
vgs_source = cli.vgs_source
sourceDownPics = vgs_source.sourceDownPics
errorUrl = vgs_source.errorUrl
print("Connected to the download record database")

base_file_path = '/data/images/images/'

# Map each record's type to an English subdirectory name
TYPE_DIRS = {
    "other": "other", "其他": "other",
    "音乐": "music",
    "儿童": "child", "少儿": "child",
    "动漫": "cartoon",
    "戏曲": "traditional_opera",
    "新闻": "news",
    "综艺": "variety",
    "电视剧": "teleplay",
    "电影": "movie",
    "体育": "sports",
    "教育": "education",
    "栏目": "column",
    "纪录片": "documentary",
}
# Download the images
def getPictures():
    num = 0
    for x in sourcePS_url.find().batch_size(80):
        # Determine which site this record came from
        source = x.get("source")
        if source in ("TVSOU", "CISCP", "CCS"):
            print("Skipping %s data" % source)
            continue
        print(source)
        file_path = base_file_path + source.lower() + "/"
        # Pick the subdirectory for this record's type; unknown types stay
        # directly under the source directory, as in the original elif chain
        pic_type = x.get("type")
        if pic_type in TYPE_DIRS:
            file_path = file_path + TYPE_DIRS[pic_type] + "/"
        # Create the directory if it does not exist yet
        if not os.path.exists(file_path):
            os.makedirs(file_path)
if x.get("poster"):
image_url = x.get("poster")
if image_url.split("//")[0] == "":
image_url = "http:" + image_url
if "" in image_url:
image_url = image_url.replace("", "//")
# 取源url地址的后缀
# file_suffix = os.path.splitext(image_url)[1]
filename = file_path + get_unique_name() + ".jpg"
try:
urllib.request.urlretrieve(image_url, filename=filename)
except:
print("%s下载失败" % x.get("poster"))
errorUrl.save({"image_url": x.get("poster"), "table": "sourcePS"})
with open("sourcePS.txt", "a") as f:
f.write("num: %s," % num)
f.write("%s下载失败\n" % x.get("poster"))
continue
sourceDownPics.save({"_id": get_old_image_md5(image_url), "sourcePicUrl": image_url, "downPicPath": filename, "createTime": datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")})
print("%s下载成功" % x.get("poster"))
num += 1
print("num", num)
if x.get("photo"):
image_url = x.get("photo")
if image_url.split("//")[0] == "":
image_url = "http:" + image_url
if "" in image_url:
image_url = image_url.replace("", "//")
# 取源url地址的后缀
# file_suffix = os.path.splitext(image_url)[1]
filename = file_path + get_unique_name() + ".jpg"
try:
urllib.request.urlretrieve(image_url, filename=filename)
except:
print("%s下载失败" % x.get("photo"))
errorUrl.save({"image_url": x.get("photo"), "table": "sourcePS"})
with open("sourcePS.txt", "a") as f:
f.write("num: %s," % num)
f.write("%s下载失败\n" % x.get("photo"))
continue
sourceDownPics.save({"_id": get_old_image_md5(image_url), "sourcePicUrl": image_url, "downPicPath": filename, "createTime": datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")})
print("%s下载成功" % x.get("photo"))
num += 1
print("num", num)
if __name__ == '__main__':
    getPictures()
    print("Download finished")