图片去重:删除文件名不同但内容相同的重复图片

#!/usr/bin/env python
#coding:utf8

import os
import pickle

from hashlib import md5
from collections import defaultdict
from pprint import pprint
from random import choice

# File extensions treated as images; get_files() lower-cases each file's
# extension before checking it against this list.
IMG_EXTS = ['.jpg', '.gif', '.jpeg', '.png']
# Pickle file that get_obj() loads when it exists and that store_obj()
# writes to by default.
OBJ_FILE = 'obj.pickle'

# Root directory that get_obj() walks (via get_files) when hashing images.
IMG_PATH = r'E:\creatism_data\beautiful people'

def rm(path):
    """Announce on stdout that *path* is being removed, then delete it.

    Any OSError raised by the deletion propagates to the caller.
    """
    message = 'remove {}'.format(path)
    print(message)
    # This is the call that actually deletes the file from disk.
    os.remove(path)

def remove_dup(dup_list):
    """Keep one randomly chosen file from *dup_list* and delete the others.

    Args:
        dup_list: mutable list of file paths. It is updated in place so that
            on return it holds only the path that was kept, i.e. it
            describes what is still on disk.

    Returns:
        None.

    Raises:
        OSError: propagated from rm() when a file cannot be deleted.
    """
    if not dup_list:
        # Nothing to choose from; choice() would raise IndexError here.
        return

    keep = choice(dup_list)
    print('Keep %s' % keep)

    # Skip every entry equal to the kept path, so a path that happens to be
    # listed twice can never cause the surviving file itself to be deleted.
    doomed = [f for f in dup_list if f != keep]
    for f in doomed:
        rm(f)

    # Leave the survivor (not the deleted files) in the caller's list.
    dup_list[:] = [keep]

def store_obj(obj, fp=OBJ_FILE):
    """Pickle *obj* into the file at *fp*, reporting progress on stdout.

    Args:
        obj: the object to serialize.
        fp: destination file path; defaults to OBJ_FILE. The file is opened
            in binary write mode, so existing content is replaced.
    """
    print('Dump obj to {}'.format(fp))
    with open(fp, 'wb') as sink:
        pickle.dump(obj, sink)
    print('Done')

def calc_md5(fp):
    """Return the hexadecimal MD5 digest of the file at *fp*.

    The file is opened in binary mode and fed to the hash in 4 KiB pieces,
    so the whole file is never held in memory at once.
    """
    block_size = 4 * 1024
    digest = md5()
    with open(fp, 'rb') as stream:
        # Two-argument iter() keeps calling read() until it returns the
        # empty bytes sentinel, which signals end of file.
        for piece in iter(lambda: stream.read(block_size), b''):
            digest.update(piece)
    return digest.hexdigest()

def get_files(path):
    """Yield the full path of each image file found anywhere under *path*.

    A file counts as an image when its extension, lower-cased, is listed in
    IMG_EXTS. Directories are traversed with os.walk.
    """
    for folder, _subdirs, names in os.walk(path):
        for name in names:
            extension = os.path.splitext(name)[1].lower()
            if extension not in IMG_EXTS:
                continue
            yield os.path.join(folder, name)

def get_obj():
    """Return the mapping of MD5 digest -> list of image paths.

    When OBJ_FILE exists, the object pickled in it is loaded and returned
    as-is. Otherwise every image under IMG_PATH is hashed and the paths are
    grouped by digest in a defaultdict(list).
    """
    if not os.path.exists(OBJ_FILE):
        print('Calculating all the image files md5 value ...')
        groups = defaultdict(list)
        for image_path in get_files(IMG_PATH):
            digest = calc_md5(image_path)
            groups[digest].append(image_path)
        print('Done')
        return groups

    print('Obj file exists, we can get result from that :)')
    with open(OBJ_FILE, 'rb') as cache:
        cached = pickle.load(cache)
    return cached

def main():
    """Run the deduplication pass and persist the digest mapping.

    Obtains the digest -> paths mapping from get_obj(), hands every group
    holding two or more paths to remove_dup(), then saves the mapping with
    store_obj().
    """
    groups = get_obj()
    for paths in groups.values():
        if len(paths) > 1:
            remove_dup(paths)

    store_obj(groups)

if __name__ == '__main__':
    main()
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值