该楼层疑似违规已被系统折叠 隐藏此楼查看此楼
#!/usr/bin/env python3
import hashlib
import os
import sys
# Scratch buffer shared by every get_digest() call so that hashing many files
# does not allocate a new chunk per read.  The memoryview lets the filled
# prefix be sliced without copying it.
g_buffer = bytearray(8192)
g_buffer_view = memoryview(g_buffer)


def get_digest(path, size):
    """Return the raw MD5 digest of the file at *path*.

    Args:
        path: File to hash; it is opened unbuffered in binary mode.
        size: Number of bytes the caller expects the file to contain
            (normally its stat size).  Reading proceeds in whole buffer-sized
            chunks and stops once at least this many bytes have been
            consumed or the file ends, whichever comes first.

    Returns:
        The 16-byte MD5 digest as ``bytes``.

    Raises:
        OSError: If the file cannot be opened or read.
    """
    md5 = hashlib.md5()
    with open(path, 'rb', buffering=0) as fp:
        while size > 0:
            n = fp.readinto(g_buffer)
            if not n:
                # End of file reached before `size` bytes were read (the file
                # shrank after it was stat'ed).  Without this check `size`
                # would never decrease and the loop would spin forever.
                break
            md5.update(g_buffer_view[:n])
            size -= n
    return md5.digest()
def main():
    """Print groups of files with identical content found under a directory.

    The directory is the first command-line argument, or the current working
    directory when no argument is given.  Files are first bucketed by size;
    only buckets holding two or more files are hashed with get_digest().
    For every set of files that share both size and digest, the number of
    files is printed, followed by one path per line.

    Symbolic links and empty files are ignored.  Files that cannot be
    stat'ed or read are reported on stderr and skipped instead of aborting
    the scan.  Exits with status 1 if the start path is not a directory.
    """
    if len(sys.argv) > 1:
        start_dir = os.path.abspath(sys.argv[1])
    else:
        start_dir = os.getcwd()
    if not os.path.isdir(start_dir):
        print('"{}" is not a directory'.format(start_dir), file=sys.stderr)
        sys.exit(1)

    # Pass 1: bucket candidate files by size; files of different sizes can
    # never be duplicates, so most files are never hashed at all.
    by_size = {}
    for root, _dirs, files in os.walk(start_dir, followlinks=False):
        for name in files:
            # `root` is already absolute because `start_dir` is.
            path = os.path.join(root, name)
            if os.path.islink(path):
                continue
            try:
                size = os.path.getsize(path)
            except OSError as err:
                # The file vanished or is inaccessible; keep scanning.
                print('skipping "{}": {}'.format(path, err), file=sys.stderr)
                continue
            if size == 0:
                continue
            by_size.setdefault(size, []).append(path)

    # Pass 2: within each size bucket, group files by content digest.
    for size, paths in by_size.items():
        if len(paths) < 2:
            continue
        by_digest = {}
        for path in paths:
            try:
                digest = get_digest(path, size)
            except OSError as err:
                # Unreadable file: report it and continue with the rest.
                print('skipping "{}": {}'.format(path, err), file=sys.stderr)
                continue
            by_digest.setdefault(digest, []).append(path)
        for group in by_digest.values():
            if len(group) < 2:
                continue
            print(len(group))
            for path in group:
                print(path)
# Script entry point: run the scan only when this file is executed directly,
# not when it is imported as a module.
if __name__ == "__main__":
    main()