Python 按日期整理照片和视频:自动归档、去除重复、节省空间

随着手机拍照越来越方便,记录日常生活的照片和视频越来越多。每次手机存满了就备份到电脑里,一家人的手机备份加起来已经有 300 多 GB 了,而且我还要备份两份以防丢失。我总觉得里面的照片和视频有重复,苦于没时间写代码。最近学了一下 Python,觉得挺简单,有这么多库可以使用,于是花了点时间把多年来想整理却一直没做的事情完成了。代码很粗陋,请大家多多指教。

1、把“文件夹”里的文件移动到“文件夹_新”和“文件夹_新_重复”两个文件夹里。“文件夹_新”里是需要保留的内容,“文件夹_新_重复”里是可以删除的重复文件。

2、按时间整理,依次尝试:图片 EXIF 时间、视频拍摄时间、文件名里的时间戳、文件名里的日期,最后是文件备份到电脑的时间(文件创建时间)。整理后的目录结构为:年/月/图片或视频。

import hashlib
from logging import error
import os
import multiprocessing
import shutil
import re
import time
import exifread
import imghdr
import datetime
import pytz
import filetype
from win32com.propsys import propsys, pscon
from PIL import Image

# Root folder to organise; overwritten with the user's input when the file
# is run as a script.
GL_MYDIR = ""


# Patterns that capture (year, month, day) from a string, for dates written
# as YYYYMMDD, YYYY_MM_DD, YYYY:MM:DD and YYYY-MM-DD. Each pattern requires a
# non-digit character right after the day.
# The character classes contain no commas: "[1,2]" would also accept a
# literal "," as the leading character and make a later int() call fail.
GL_DATE_RE = [r"([12]\d{3})([01]\d)([0-3]\d)[^\d]",
              r"([12]\d{3})_([01]\d)_([0-3]\d)[^\d]",
              r"([12]\d{3}):([01]\d):([0-3]\d)[^\d]",
              r"([12]\d{3})-([01]\d)-([0-3]\d)[^\d]"]

# Years outside [GL_YAER_MIN, GL_YAER_MAX] are treated as implausible dates;
# the upper bound is the current local year at import time.
GL_YAER_MAX = time.localtime(time.time()).tm_year
GL_YAER_MIN = 2000
# =========================计算哈希码===========================


def calcmd5(q, filepath):
    """Compute a cheap fingerprint of a file and put it on a queue.

    The fingerprint is the MD5 of the file's first line (the bytes up to and
    including the first newline byte) followed by the file size. Reading the
    whole file was considered too slow, so this is a heuristic: two different
    files with the same first line and the same size get the same fingerprint.

    Args:
        q: Queue-like object; receives {"hash": <hex digest>, "path": filepath}.
        filepath: Path of the file to fingerprint.
    """
    filesize = os.path.getsize(filepath)
    md5obj = hashlib.md5()
    with open(filepath, 'rb') as f:
        md5obj.update(f.readline())
    # Encode the size in 8 bytes: a 4-byte field raises OverflowError for
    # files of 4 GiB or more, in which case nothing is ever put on the queue.
    md5obj.update(filesize.to_bytes(8, 'big'))
    q.put({"hash": md5obj.hexdigest(), "path": filepath})


# =========================进度提示===========================
def rate(q, k):
    """Collect k results from the queue while printing progress.

    Args:
        q: Queue-like object yielding dicts that have a "hash" key.
        k: Number of results to wait for.

    Returns:
        List of the received items in arrival order (empty when k <= 0).
    """
    data_list = []
    for i in range(1, k + 1):
        # Blocking get: waits for the next result instead of spinning on
        # q.empty(), which keeps a CPU core busy while nothing is available.
        value = q.get()
        data_list.append(value)
        print(i, 'Get %s ' % value["hash"], end='')
        print(" 总进度:%.2f %%,%d" % ((i*100/k), k))
    return data_list
# ==========================格式化时间=======================


def TimeStampToTime(timestamp):
    """Format a POSIX timestamp as "<year>年/<month>月" in local time."""
    local = time.localtime(timestamp)
    year_part = time.strftime("%Y", local)
    month_part = time.strftime("%m", local)
    return year_part + '年' + "/" + month_part + '月'


# =========================图片exif信息=======================
def image_exif_date2(path):
    """Read a date string from a file's EXIF tags via exifread.

    Returns:
        str() of the value of the last tag, in the order exifread yields
        them, whose name matches '.*Date.*'; "" when no tag matches.
    """
    with open(path, 'rb') as stream:
        tags = exifread.process_file(stream)
        dated = [str(value) for name, value in tags.items()
                 if re.match('.*Date.*', name)]
    return dated[-1] if dated else ""


def image_exif_date(path):
    """Return the EXIF capture date of an image, or "" when unavailable.

    Tries Pillow first (EXIF tag 36867) and, if that fails for any reason,
    falls back to image_exif_date2().

    Args:
        path: Path of the image file.

    Returns:
        The value stored under EXIF tag 36867, the fallback's result, or ""
        when the path does not exist.
    """
    try:
        if not os.path.exists(path):
            return ""
        # Close the image handle as soon as the EXIF data is read; the
        # original left it open.
        # NOTE(review): an open handle may block the later move on Windows.
        with Image.open(path) as img:
            exif_data = img._getexif()
        # A missing tag or exif_data being None raises here and triggers the
        # fallback below.
        return exif_data[36867]
    except Exception:
        # Best-effort by design: any Pillow failure hands over to exifread.
        return image_exif_date2(path)
# =========================视频exif信息=========================


def mp4_exif_date(path):
    """Return the PKEY_Media_DateEncoded property of a file, or "" on error.

    A datetime value (or any falsy value) is returned unchanged; any other
    truthy value is converted with int() and datetime.fromtimestamp().
    """
    try:
        store = propsys.SHGetPropertyStoreFromParsingName(path)
        encoded = store.GetValue(pscon.PKEY_Media_DateEncoded).GetValue()
        if isinstance(encoded, datetime.datetime) or not encoded:
            return encoded
        # NOTE(review): fromtimestamp() yields local time, which is then
        # labelled as UTC — confirm this is intended.
        converted = datetime.datetime.fromtimestamp(int(encoded))
        return converted.replace(tzinfo=pytz.timezone('UTC'))
    except Exception:
        return ""


# =========================判断是否为图片========================
def imge_flag(path):
    """Return True when the path's extension is a recognised image type."""
    image_suffixes = ['jpg', 'bmp', 'png', 'jpeg', 'rgb', 'tif', 'gif']
    # Decided purely by file extension; the file content is not inspected.
    return file_flag(path, image_suffixes)

# =========================判断是否为视频========================


def mp4_flag(path):
    """Return True when the path's extension is a recognised video type."""
    video_suffixes = ['mp4', 'm4v', 'mkv', 'webm',
                      'mov', 'wmv', 'avi', 'mpg', 'flv', '3gp']
    # Decided purely by file extension; the file content is not inspected.
    return file_flag(path, video_suffixes)

# =========================判断后缀========================


def file_flag(path, type_list):
    """Return True when the path's lower-cased extension is in type_list.

    Args:
        path: File path; only its final component is examined.
        type_list: Collection of extensions without the leading dot.
    """
    _, dotted_suffix = os.path.splitext(os.path.basename(path))
    return dotted_suffix.replace(".", "").lower() in type_list
# ==========================获得文件日期=======================


def _is_plausible_date(year, month):
    """Return True when year/month are ints within the accepted range."""
    try:
        year_num, month_num = int(year), int(month)
    except ValueError:
        # Captured text that is not a number cannot be a date.
        return False
    return (int(GL_YAER_MIN) <= year_num <= int(GL_YAER_MAX)
            and 1 <= month_num <= 12)


def _date_dir_from_text(text):
    """Return "<year>年/<month>月" for the first plausible GL_DATE_RE hit, else ""."""
    for pattern in GL_DATE_RE:
        found = re.search(pattern, text)
        if found and _is_plausible_date(found.group(1), found.group(2)):
            return "%s年/%s月" % (found.group(1), found.group(2))
    return ""


def getfiledate(path):
    """Work out the dated sub-directory a file belongs in.

    Sources are tried in order until one gives a plausible date: image EXIF
    date, video encoded date, a 13-digit millisecond timestamp in the file
    name, a date in the file name, and finally the file's ctime.

    Args:
        path: Path of the file to classify.

    Returns:
        "<year>年/<month>月", with "/图片" or "/视频" appended for recognised
        image or video extensions; "" when no plausible date was found.
    """
    date_str = ""
    # 1. Date stored in the image's EXIF data.
    if imge_flag(path):
        exif_text = image_exif_date(path)
        if exif_text:
            date_str = _date_dir_from_text(str(exif_text))
    # 2. Encoded date stored in the video's properties.
    if not date_str and mp4_flag(path):
        encoded = mp4_exif_date(path)
        if encoded:
            date_str = _date_dir_from_text(str(encoded))
    filename_str = os.path.basename(path)
    # 3. Millisecond timestamp in the file name. Lookarounds are used so a
    #    name that starts or ends with the 13 digits is matched as well.
    if not date_str:
        found = re.search(r"(?<!\d)(\d{13})(?!\d)", filename_str)
        if found is not None:
            stamp = time.localtime(int(found.group(1)) // 1000)
            if _is_plausible_date(stamp.tm_year, stamp.tm_mon):
                date_str = "%04d年/%02d月" % (stamp.tm_year, stamp.tm_mon)
    # 4. Date written in the file name.
    if not date_str:
        date_str = _date_dir_from_text(filename_str)
    # 5. Last resort: the file's ctime.
    if not date_str:
        created = TimeStampToTime(os.path.getctime(path))
        if _is_plausible_date(created[0:4], created[-3:-1]):
            date_str = created

    if date_str:
        if imge_flag(path):
            date_str = date_str + "/图片"
        elif mp4_flag(path):
            date_str = date_str + "/视频"
    return date_str

# ==========================得到新文件地址=======================
def getfilenewpath(newdir, oldpath):
    """Return the destination path for a file, creating its directory.

    Args:
        newdir: Root of the destination tree.
        oldpath: Current path of the file.

    Returns:
        newdir joined with the dated sub-directory from getfiledate() (when
        there is one) and the original file name.
    """
    date_str = getfiledate(oldpath)
    # Compare by value: "is not" on a string literal tests object identity.
    if date_str:
        newdir = os.path.join(newdir, date_str)
    newpath = os.path.join(newdir, os.path.basename(oldpath))
    # exist_ok avoids failing when the directory appears between a check
    # and the creation.
    os.makedirs(os.path.dirname(newpath), exist_ok=True)
    return newpath


# =========================移动文件===========================
def _free_destination(path):
    """Return path if nothing exists there, else an unused sibling name."""
    if not os.path.exists(path):
        return path
    folder, name = os.path.split(path)
    # The first fallback keeps the historical "new_" prefix; later ones add
    # a counter so an existing "new_" copy is never overwritten.
    candidate = os.path.join(folder, "new_" + name)
    serial = 2
    while os.path.exists(candidate):
        candidate = os.path.join(folder, "new%d_%s" % (serial, name))
        serial += 1
    return candidate


def movefile(newdata_list, newdir):
    """Move every listed file that still exists into its dated folder.

    Args:
        newdata_list: Iterable of dicts whose "path" key is the source path.
        newdir: Root of the destination tree; created if missing.
    """
    os.makedirs(newdir, exist_ok=True)
    moved = 0
    for item in newdata_list:
        source = item["path"]
        # Entries whose file is already gone (e.g. moved earlier) are skipped.
        if not os.path.exists(source):
            continue
        target = _free_destination(getfilenewpath(newdir, source))
        moved += 1
        shutil.move(source, target)
        print(moved, "move", source)
#  =========================主函数===========================


def main():
    """Fingerprint every file under GL_MYDIR and move files into new trees.

    First occurrences of each fingerprint go to GL_MYDIR + "_新"; the other
    files that share a fingerprint go to GL_MYDIR + "_新_重复".
    """
    # Walk once and keep only paths that exist, so the number of results
    # rate() waits for equals the number of hash tasks submitted.
    paths = []
    for root, dirs, files in os.walk(GL_MYDIR):
        for file in files:
            hashfile = os.path.join(root, file)
            if os.path.exists(hashfile):
                paths.append(hashfile)
            else:
                print("no filename")

    po = multiprocessing.Pool(4)
    q = multiprocessing.Manager().Queue()
    data = po.apply_async(rate, args=(q, len(paths)))
    for hashfile in paths:
        po.apply_async(calcmd5, args=(q, hashfile))
    po.close()

    # get() blocks until rate() returns. The original tested the bound
    # method `data.successful` without calling it, which is always truthy.
    # NOTE: if a calcmd5 task raises, its result never arrives and this
    # call keeps waiting.
    data_list = data.get()
    po.join()

    seen_hashes = set()
    repeated_hashes = set()
    newdata_list = []
    print("非重复的文件:")
    for item in data_list:
        if item["hash"] not in seen_hashes:
            seen_hashes.add(item["hash"])
            newdata_list.append(item)
            print(len(newdata_list), "only", item["hash"])
        else:
            repeated_hashes.add(item["hash"])

    # Every file whose fingerprint occurs more than once. This includes the
    # first occurrence, but that file is moved by the first movefile() call
    # and is then skipped by the second because its path no longer exists.
    print("重复的文件:")
    newdata2_list = []
    for item in data_list:
        if item["hash"] in repeated_hashes:
            newdata2_list.append(item)
            print(len(newdata2_list), "repeat", item["hash"])

    print("开始移动非重复的文件【%d】:" % len(newdata_list))
    movefile(newdata_list, GL_MYDIR+"_新")
    print("开始移动重复的文件【%d】:" % len(newdata2_list))
    movefile(newdata2_list, GL_MYDIR+"_新_重复")
    print("文件移动到%s成功!" % (GL_MYDIR+"_新"))
        


if __name__ == "__main__":
    # Ask for the folder, run the job, and report wall-clock duration.
    GL_MYDIR = input("请输入文件夹完整路径:")
    started_at = time.time()
    main()
    elapsed = time.time() - started_at
    print("运行时间", round(elapsed, 2), "秒")
    # Runs the shell "pause" command after the work is done.
    os.system("pause")

测试发现,单个文件大于 4G 时程序会出现卡死现象,暂时我还弄不懂,呵呵。刚开始我还以为是列表一次存储 4 万多个文件占用内存太大导致卡死,于是又写了个数据库版,结果一样卡死。原因是单个文件太大导致的。

数据库版

按日期整理照片视频【数据库版】.py

import hashlib
import os
import multiprocessing
import shutil
import re
import time
import exifread
import imghdr
import datetime
import pytz
import filetype
from win32com.propsys import propsys, pscon
from PIL import Image
import sqlite3


# Root folder to organise; overwritten with the user's input when the file
# is run as a script.
GL_MYDIR = ""


# Patterns that capture (year, month, day) from a string, for dates written
# as YYYYMMDD, YYYY_MM_DD, YYYY:MM:DD and YYYY-MM-DD. Each pattern requires a
# non-digit character right after the day.
# The character classes contain no commas: "[1,2]" would also accept a
# literal "," as the leading character and make a later int() call fail.
GL_DATE_RE = [r"([12]\d{3})([01]\d)([0-3]\d)[^\d]",
              r"([12]\d{3})_([01]\d)_([0-3]\d)[^\d]",
              r"([12]\d{3}):([01]\d):([0-3]\d)[^\d]",
              r"([12]\d{3})-([01]\d)-([0-3]\d)[^\d]"]

# Years outside [GL_YAER_MIN, GL_YAER_MAX] are treated as implausible dates;
# the upper bound is the current local year at import time.
GL_YAER_MAX = time.localtime(time.time()).tm_year
GL_YAER_MIN = 2000
# Module-wide SQLite connection, opened on the file 'test.db' at import time.
CONN = sqlite3.connect('test.db')
# =========================计算哈希码===========================


def calcmd5(q, filepath):
    """Compute a cheap fingerprint of a file and put it on a queue.

    The fingerprint is the MD5 of the file's first line (the bytes up to and
    including the first newline byte) followed by the file size. Reading the
    whole file was considered too slow, so this is a heuristic: two different
    files with the same first line and the same size get the same fingerprint.

    Args:
        q: Queue-like object; receives {"hash": <hex digest>, "path": filepath}.
        filepath: Path of the file to fingerprint.
    """
    filesize = os.path.getsize(filepath)
    md5obj = hashlib.md5()
    with open(filepath, 'rb') as f:
        md5obj.update(f.readline())
    # Encode the size in 8 bytes: a 4-byte field raises OverflowError for
    # files of 4 GiB or more, in which case nothing is ever put on the queue.
    md5obj.update(filesize.to_bytes(8, 'big'))
    q.put({"hash": md5obj.hexdigest(), "path": filepath})


# =========================进度提示===========================
def rate(q, k):
    """Store k results from the queue in the data_list table, with progress.

    The table is created (or emptied) first via create_table(); all rows are
    committed once, after the last result has been received.

    Args:
        q: Queue-like object yielding dicts with "hash" and "path" keys.
        k: Number of results to wait for.

    Returns:
        True once all k results are stored and committed.
    """
    cur = CONN.cursor()
    create_table("data_list")
    for i in range(1, k + 1):
        # Blocking get: waits for the next result instead of spinning on
        # q.empty(), which keeps a CPU core busy while nothing is available.
        value = q.get()
        # Placeholders let SQLite handle quoting, so a path containing an
        # apostrophe no longer produces a broken statement.
        cur.execute("INSERT INTO data_list VALUES(?, ?)",
                    (value["hash"], value["path"]))
        print(i, 'Get %s ' % value["hash"], end='')
        print(" 总进度:%.2f %%,%d" % (i*100/k, k))
    CONN.commit()
    print('hash完成')
    return True


def create_table(table_name):
    """Make sure an empty (hash, path) table called table_name exists.

    A missing table is created; an existing one has all its rows deleted.
    Nothing is committed here.
    """
    cursor = CONN.cursor()
    cursor.execute('''SELECT tbl_name FROM sqlite_master WHERE type = 'table' ''')
    existing = [row[0] for row in cursor.fetchall()]
    if table_name in existing:
        # Table already present: empty it so rows from a previous run do
        # not remain.
        CONN.cursor().execute("Delete from %s" % table_name)
        print(table_name + ' 已经存在')
        return
    cursor.execute("CREATE TABLE %s(hash TEXT,path TEXT)" % table_name)
    print(table_name + ' 创建成功')
# ==========================格式化时间=======================


def TimeStampToTime(timestamp):
    """Format a POSIX timestamp as "<year>年/<month>月" in local time."""
    local = time.localtime(timestamp)
    year_part = time.strftime("%Y", local)
    month_part = time.strftime("%m", local)
    return year_part + '年' + "/" + month_part + '月'


# =========================图片exif信息=======================
def image_exif_date2(path):
    """Read a date string from a file's EXIF tags via exifread.

    Returns:
        str() of the value of the last tag, in the order exifread yields
        them, whose name matches '.*Date.*'; "" when no tag matches.
    """
    with open(path, 'rb') as stream:
        tags = exifread.process_file(stream)
        dated = [str(value) for name, value in tags.items()
                 if re.match('.*Date.*', name)]
    return dated[-1] if dated else ""


def image_exif_date(path):
    """Return the EXIF capture date of an image, or "" when unavailable.

    Tries Pillow first (EXIF tag 36867) and, if that fails for any reason,
    falls back to image_exif_date2().

    Args:
        path: Path of the image file.

    Returns:
        The value stored under EXIF tag 36867, the fallback's result, or ""
        when the path does not exist.
    """
    try:
        if not os.path.exists(path):
            return ""
        # Close the image handle as soon as the EXIF data is read; the
        # original left it open.
        # NOTE(review): an open handle may block the later move on Windows.
        with Image.open(path) as img:
            exif_data = img._getexif()
        # A missing tag or exif_data being None raises here and triggers the
        # fallback below.
        return exif_data[36867]
    except Exception:
        # Best-effort by design: any Pillow failure hands over to exifread.
        return image_exif_date2(path)
# =========================视频exif信息=========================


def mp4_exif_date(path):
    """Return the PKEY_Media_DateEncoded property of a file, or "" on error.

    A datetime value (or any falsy value) is returned unchanged; any other
    truthy value is converted with int() and datetime.fromtimestamp().
    """
    try:
        store = propsys.SHGetPropertyStoreFromParsingName(path)
        encoded = store.GetValue(pscon.PKEY_Media_DateEncoded).GetValue()
        if isinstance(encoded, datetime.datetime) or not encoded:
            return encoded
        # NOTE(review): fromtimestamp() yields local time, which is then
        # labelled as UTC — confirm this is intended.
        converted = datetime.datetime.fromtimestamp(int(encoded))
        return converted.replace(tzinfo=pytz.timezone('UTC'))
    except Exception:
        return ""


# =========================判断是否为图片========================
def imge_flag(path):
    """Return True when the path's extension is a recognised image type."""
    image_suffixes = ['jpg', 'bmp', 'png', 'jpeg', 'rgb', 'tif', 'gif']
    # Decided purely by file extension; the file content is not inspected.
    return file_flag(path, image_suffixes)

# =========================判断是否为视频========================


def mp4_flag(path):
    """Return True when the path's extension is a recognised video type."""
    video_suffixes = ['mp4', 'm4v', 'mkv', 'webm',
                      'mov', 'wmv', 'avi', 'mpg', 'flv', '3gp']
    # Decided purely by file extension; the file content is not inspected.
    return file_flag(path, video_suffixes)

# =========================判断后缀========================


def file_flag(path, type_list):
    """Return True when the path's lower-cased extension is in type_list.

    Args:
        path: File path; only its final component is examined.
        type_list: Collection of extensions without the leading dot.
    """
    _, dotted_suffix = os.path.splitext(os.path.basename(path))
    return dotted_suffix.replace(".", "").lower() in type_list
# ==========================获得文件日期=======================


def _is_plausible_date(year, month):
    """Return True when year/month are ints within the accepted range."""
    try:
        year_num, month_num = int(year), int(month)
    except ValueError:
        # Captured text that is not a number cannot be a date.
        return False
    return (int(GL_YAER_MIN) <= year_num <= int(GL_YAER_MAX)
            and 1 <= month_num <= 12)


def _date_dir_from_text(text):
    """Return "<year>年/<month>月" for the first plausible GL_DATE_RE hit, else ""."""
    for pattern in GL_DATE_RE:
        found = re.search(pattern, text)
        if found and _is_plausible_date(found.group(1), found.group(2)):
            return "%s年/%s月" % (found.group(1), found.group(2))
    return ""


def getfiledate(path):
    """Work out the dated sub-directory a file belongs in.

    Sources are tried in order until one gives a plausible date: image EXIF
    date, video encoded date, a 13-digit millisecond timestamp in the file
    name, a date in the file name, and finally the file's ctime.

    Args:
        path: Path of the file to classify.

    Returns:
        "<year>年/<month>月", with "/图片" or "/视频" appended for recognised
        image or video extensions; "" when no plausible date was found.
    """
    date_str = ""
    # 1. Date stored in the image's EXIF data.
    if imge_flag(path):
        exif_text = image_exif_date(path)
        if exif_text:
            date_str = _date_dir_from_text(str(exif_text))
    # 2. Encoded date stored in the video's properties.
    if not date_str and mp4_flag(path):
        encoded = mp4_exif_date(path)
        if encoded:
            date_str = _date_dir_from_text(str(encoded))
    filename_str = os.path.basename(path)
    # 3. Millisecond timestamp in the file name. Lookarounds are used so a
    #    name that starts or ends with the 13 digits is matched as well.
    if not date_str:
        found = re.search(r"(?<!\d)(\d{13})(?!\d)", filename_str)
        if found is not None:
            stamp = time.localtime(int(found.group(1)) // 1000)
            if _is_plausible_date(stamp.tm_year, stamp.tm_mon):
                date_str = "%04d年/%02d月" % (stamp.tm_year, stamp.tm_mon)
    # 4. Date written in the file name.
    if not date_str:
        date_str = _date_dir_from_text(filename_str)
    # 5. Last resort: the file's ctime.
    if not date_str:
        created = TimeStampToTime(os.path.getctime(path))
        if _is_plausible_date(created[0:4], created[-3:-1]):
            date_str = created

    if date_str:
        if imge_flag(path):
            date_str = date_str + "/图片"
        elif mp4_flag(path):
            date_str = date_str + "/视频"
    return date_str

# ==========================得到新文件地址=======================


def getfilenewpath(newdir, oldpath):
    """Return the destination path for a file, creating its directory.

    Args:
        newdir: Root of the destination tree.
        oldpath: Current path of the file.

    Returns:
        newdir joined with the dated sub-directory from getfiledate() (when
        there is one) and the original file name.
    """
    date_str = getfiledate(oldpath)
    # Compare by value: "is not" on a string literal tests object identity.
    if date_str:
        newdir = os.path.join(newdir, date_str)
    newpath = os.path.join(newdir, os.path.basename(oldpath))
    # exist_ok avoids failing when the directory appears between a check
    # and the creation.
    os.makedirs(os.path.dirname(newpath), exist_ok=True)
    return newpath


# =========================移动文件===========================
def _free_destination(path):
    """Return path if nothing exists there, else an unused sibling name."""
    if not os.path.exists(path):
        return path
    folder, name = os.path.split(path)
    # The first fallback keeps the historical "new_" prefix; later ones add
    # a counter so an existing "new_" copy is never overwritten.
    candidate = os.path.join(folder, "new_" + name)
    serial = 2
    while os.path.exists(candidate):
        candidate = os.path.join(folder, "new%d_%s" % (serial, name))
        serial += 1
    return candidate


def movefile(newdata_list, newdir):
    """Move every listed file that still exists into its dated folder.

    Args:
        newdata_list: Iterable of sequences whose element at index 1 is the
            source path.
        newdir: Root of the destination tree; created if missing.
    """
    os.makedirs(newdir, exist_ok=True)
    moved = 0
    for item in newdata_list:
        source = item[1]
        # Entries whose file is already gone (e.g. moved earlier) are skipped.
        if not os.path.exists(source):
            continue
        target = _free_destination(getfilenewpath(newdir, source))
        moved += 1
        shutil.move(source, target)
        print(moved, "move", source)

#  =========================主函数===========================


def main():
    """Fingerprint every file under GL_MYDIR and move files into new trees.

    Fingerprints are written to the data_list table by rate() and read back
    here. First occurrences of each fingerprint go to GL_MYDIR + "_新"; the
    other files that share a fingerprint go to GL_MYDIR + "_新_重复".
    """
    # Walk once and keep only paths that exist, so the number of results
    # rate() waits for equals the number of hash tasks submitted.
    paths = []
    for root, dirs, files in os.walk(GL_MYDIR):
        for file in files:
            hashfile = os.path.join(root, file)
            if os.path.exists(hashfile):
                paths.append(hashfile)
            else:
                print("no filename")

    po = multiprocessing.Pool(4)
    q = multiprocessing.Manager().Queue()
    data = po.apply_async(rate, args=(q, len(paths)))
    for hashfile in paths:
        po.apply_async(calcmd5, args=(q, hashfile))
    po.close()

    # get() blocks until rate() has committed every row. The original tested
    # the bound method `data.successful` without calling it, which is always
    # truthy.
    # NOTE: if a calcmd5 task raises, its result never arrives and this
    # call keeps waiting.
    data.get()
    po.join()

    # Rows are read positionally: index 0 is used as the fingerprint and
    # index 1 (in movefile) as the path.
    rows = CONN.cursor().execute("SELECT * FROM data_list ").fetchall()

    seen_hashes = set()
    repeated_hashes = set()
    newdata_list = []
    print("非重复的文件:")
    for item in rows:
        if item[0] not in seen_hashes:
            seen_hashes.add(item[0])
            newdata_list.append(item)
            print(len(newdata_list), "only", item[0])
        else:
            repeated_hashes.add(item[0])

    # Every file whose fingerprint occurs more than once. This includes the
    # first occurrence, but that file is moved by the first movefile() call
    # and is then skipped by the second because its path no longer exists.
    print("重复的文件:")
    newdata2_list = []
    for item in rows:
        if item[0] in repeated_hashes:
            newdata2_list.append(item)
            print(len(newdata2_list), "repeat", item[0])

    print("开始移动非重复的文件【%d】:" % len(newdata_list))
    movefile(newdata_list, GL_MYDIR+"_新")
    print("开始移动重复的文件【%d】:" % len(newdata2_list))
    movefile(newdata2_list, GL_MYDIR+"_新_重复")
    print("文件移动到%s成功!" % (GL_MYDIR+"_新"))


if __name__ == "__main__":
    # Ask for the folder, run the job, and report wall-clock duration.
    GL_MYDIR = input("请输入文件夹完整路径:")
    started_at = time.time()
    main()
    elapsed = time.time() - started_at
    print("运行时间", round(elapsed, 2), "秒")
    # Runs the shell "pause" command after the work is done.
    os.system("pause")

安装 pypiwin32 后,成功解决了 win32api 无法安装的问题:

pip install pypiwin32

  • 2
    点赞
  • 5
    收藏
    觉得还不错? 一键收藏
  • 打赏
    打赏
  • 0
    评论
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包

打赏作者

qq_28928247

你的鼓励将是我创作的最大动力

¥1 ¥2 ¥4 ¥6 ¥10 ¥20
扫码支付:¥1
获取中
扫码支付

您的余额不足,请更换扫码支付或充值

打赏作者

实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值