![b2972696a7ff449530de0c733f0d3e47.png](https://img-blog.csdnimg.cn/img_convert/b2972696a7ff449530de0c733f0d3e47.png)
一般拿到一批数据后需要先清洗数据,比如有部分数据标记错误,还有一些数据重复。为了提高效率,需要把这部分重复的数据去掉。本文介绍一种在速度和准确率上都相对有优势的图片相似度识别算法dHash。
算法步骤
- 将图片resize成9x8
- 灰度化处理
- 每一行中相邻的两个像素进行大小比较:如果前一个(左侧)像素大于后一个(右侧)像素取1,否则取0。每行9个像素得到8个比较结果,共得到8x8=64位的二值序列。
- 计算图片之间的汉明距离,距离越小,相似度越高
代码实现
class DHash(object):
    """Difference hash (dHash) for near-duplicate image detection.

    An image is resized to 9x8, converted to grayscale, and each pixel is
    compared with its right-hand neighbour. That yields 8 bits per row and
    64 bits in total. Two images are compared through the Hamming distance
    between their bit sequences: the smaller the distance, the more similar
    the images.
    """

    @staticmethod
    def calculate_hash(image):
        """Return the dHash of ``image`` as a hexadecimal string.

        :param image: PIL.Image
        :return: dHash value (str), two hex digits per group of 8 bits
        """
        difference = DHash.__difference(image)
        hex_bytes = []
        # Only complete groups of 8 bits are emitted. Within a group the
        # first bit is the least significant one.
        for start in range(0, len(difference) - 7, 8):
            byte_value = 0
            for offset in range(8):
                if difference[start + offset]:
                    byte_value += 1 << offset
            # Zero-pad to two digits so that 0xf is written as "0f".
            hex_bytes.append(format(byte_value, "02x"))
        return "".join(hex_bytes)

    @staticmethod
    def hamming_distance(first, second):
        """Return the dHash-based Hamming distance between two images.

        Each argument may independently be an image or a dHash string
        produced by ``calculate_hash``. When only one side is a hash
        string, the other side is hashed first so the two can be compared.

        :param first: PIL.Image or dHash value (str)
        :param second: PIL.Image or dHash value (str)
        :return: Hamming distance (int). A larger value means the images
            differ more; a smaller value means they are more similar.
        """
        first_is_hash = isinstance(first, str)
        second_is_hash = isinstance(second, str)

        # A. At least one dHash string: compare in hash space.
        if first_is_hash or second_is_hash:
            if not first_is_hash:
                first = DHash.calculate_hash(first)
            if not second_is_hash:
                second = DHash.calculate_hash(second)
            return DHash.__hamming_distance_with_hash(first, second)

        # B. Two images: count the positions where the bits disagree.
        first_bits = DHash.__difference(first)
        second_bits = DHash.__difference(second)
        return sum(
            1 for bit1, bit2 in zip(first_bits, second_bits) if bit1 != bit2
        )

    @staticmethod
    def __difference(image):
        """Compute the neighbour-comparison bits of ``image``.

        *Private method*

        :param image: PIL.Image
        :return: list of 64 booleans, row by row; an entry is True when a
            pixel is brighter than the pixel immediately to its right
        """
        resize_width = 9
        resize_height = 8
        # 1. Resize to (9, 8): 9 pixels per row give 8 comparisons per row.
        smaller_image = image.resize((resize_width, resize_height))
        # 2. Grayscale.
        grayscale_image = smaller_image.convert("L")
        # 3. Compare horizontally adjacent pixels; pixels are stored
        #    row-major, so a row starts at row * resize_width.
        pixels = list(grayscale_image.getdata())
        return [
            pixels[row * resize_width + col]
            > pixels[row * resize_width + col + 1]
            for row in range(resize_height)
            for col in range(resize_width - 1)
        ]

    @staticmethod
    def __hamming_distance_with_hash(dhash1, dhash2):
        """Compute the Hamming distance between two dHash strings.

        *Private method*

        :param dhash1: str, hexadecimal dHash value
        :param dhash2: str, hexadecimal dHash value
        :return: Hamming distance (int)
        """
        # XOR leaves a 1 exactly where the two hashes differ.
        difference = int(dhash1, 16) ^ int(dhash2, 16)
        return bin(difference).count("1")
from PIL import Image,ImageEnhance
# Demo: compare a picture with a lower-contrast copy of itself.
source_path = './scr.jpg'
image1 = Image.open(source_path)
contrast_enhancer = ImageEnhance.Contrast(image1)
image2 = contrast_enhancer.enhance(0.4)
# Show the modified copy so it can be checked visually.
image2.show()
# To compare against a second file instead, load it here:
# image2 = Image.open('./scr1.jpg')
hamming_distance = DHash.hamming_distance(image1, image2)
print(hamming_distance)
参考文献
- https://github.com/hjaurum/DHash
- https://www.lizenghai.com/archives/31913.html