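# 淘宝亿级图片素材调度与处理的工程实践 (Engineering practice of scheduling and processing image assets at Taobao's hundred-million scale)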


import cv2
import numpy as np
from tfsClient import tfsClient
from PIL import Image
from io import BytesIO
# 感知hash算法
def pic_p_hash(img, hash_size=32):
    """Compute a DCT-based perceptual hash of an image as a hex string.

    The image is resized to ``hash_size`` x ``hash_size``, converted to
    grayscale and passed through a 2-D DCT. Each coefficient is compared
    with the mean of all coefficients, producing ``hash_size * hash_size``
    bits in row-major order. The bits are rendered as one hexadecimal
    number whose most significant bit is the first coefficient.

    Args:
        img: Image array accepted by ``cv2.resize`` and by ``cv2.cvtColor``
            with ``COLOR_BGR2GRAY`` (for example the result of
            ``cv2.imread``).
        hash_size: Side length of the resized image and of the DCT block.
            NOTE(review): OpenCV documents ``cv2.dct`` as supporting only
            even-sized arrays, so odd values are expected to fail.

    Returns:
        Lowercase hex string that is always
        ``ceil(hash_size * hash_size / 4)`` characters long, so two hashes
        built with the same ``hash_size`` can be compared position by
        position.
    """
    resized = cv2.resize(img, (hash_size, hash_size))
    gray = cv2.cvtColor(resized, cv2.COLOR_BGR2GRAY)
    # cv2.dct needs a floating-point input.
    dct = cv2.dct(np.float32(gray))
    # Every coefficient of the hash_size x hash_size block takes part in the
    # hash. NOTE(review): the classic pHash keeps only the low-frequency
    # top-left 8x8 corner; switching to that would change every hash value
    # and its length, so it is deliberately not done here.
    avg = np.mean(dct)
    bits = (dct > avg).astype(int).flatten()
    bit_str = ''.join(str(bit) for bit in bits)
    # Pad to the full digit count. Converting through int() drops leading
    # zero digits, which would otherwise make the hash length depend on the
    # image content.
    hex_width = (len(bit_str) + 3) // 4
    return hex(int(bit_str, 2))[2:].zfill(hex_width)
#均值哈希算法
def pic_avg_hash(img):
    """Compute the 64-bit average hash (aHash) of an image.

    The image is resized to 8x8 with bicubic interpolation and converted to
    grayscale; each pixel is then compared with the mean gray level.

    Args:
        img: Image array accepted by ``cv2.resize`` and by ``cv2.cvtColor``
            with ``COLOR_BGR2GRAY``.

    Returns:
        A 64-character string of ``'0'``/``'1'`` in row-major order, with
        ``'1'`` where the pixel is strictly brighter than the mean.
    """
    small = cv2.resize(img, (8, 8), interpolation=cv2.INTER_CUBIC)
    gray = cv2.cvtColor(small, cv2.COLOR_BGR2GRAY)
    # mean() accumulates in floating point. Adding the uint8 pixels one by
    # one onto a Python int keeps the running sum as uint8 under NumPy 2
    # promotion rules, so it wraps around at 256 and skews the average.
    avg = gray.mean()
    brighter = (gray > avg).flatten()
    return ''.join('1' if flag else '0' for flag in brighter)
#差值感知算法
def pic_dif_hash(img):
    """Compute the 64-bit difference hash (dHash) of an image.

    Args:
        img: Image array accepted by ``cv2.resize`` and by ``cv2.cvtColor``
            with ``COLOR_BGR2GRAY``.

    Returns:
        A 64-character string of ``'0'``/``'1'`` in row-major order, with
        ``'1'`` where a pixel is strictly brighter than its right-hand
        neighbour.
    """
    # cv2.resize takes (width, height): 9 columns by 8 rows, which yields
    # 8 left/right comparisons in each of the 8 rows.
    small = cv2.resize(img, (9, 8), interpolation=cv2.INTER_CUBIC)
    gray = cv2.cvtColor(small, cv2.COLOR_BGR2GRAY)
    # Compare columns 0..7 against columns 1..8 in one vectorised step.
    left_brighter = gray[:, :-1] > gray[:, 1:]
    return ''.join('1' if flag else '0' for flag in left_brighter.flatten())
#Hash值对比
def hash_cmp(hash1, hash2):
    """Count the positions at which two equal-length hashes differ.

    Elements are compared one by one, so for ``'0'``/``'1'`` strings the
    result is the bit-level Hamming distance, while for hex strings it is
    the number of differing hex digits.

    Args:
        hash1: First hash (a string or other sized, iterable sequence).
        hash2: Second hash of the same kind.

    Returns:
        -1 if the two hashes have different lengths (invalid arguments);
        otherwise the number of differing positions, where 0 means
        identical and larger values mean less similar.
    """
    if len(hash1) != len(hash2):
        return -1
    return sum(1 for left, right in zip(hash1, hash2) if left != right)
if __name__ == '__main__':
    # Demo: hash five sample images with pic_p_hash, print the hash of the
    # last one, then print how far images 2-5 are from image 1.
    image_paths = [
        '/Users/bixi/Documents/整理归档/产业中心/图库/图库治理/测试图片/a.jpeg',
        '/Users/bixi/Documents/整理归档/产业中心/图库/图库治理/测试图片/b.jpeg',
        '/Users/bixi/Documents/整理归档/产业中心/图库/图库治理/测试图片/c.jpeg',
        '/Users/bixi/Documents/整理归档/产业中心/图库/图库治理/测试图片/d.png',
        '/Users/bixi/Documents/整理归档/产业中心/图库/图库治理/测试图片/e.jpeg',
    ]
    hashes = []
    for path in image_paths:
        image = cv2.imread(path)
        # cv2.imread reports an unreadable or missing file by returning
        # None instead of raising, so stop with a clear message here rather
        # than failing later inside cv2.resize.
        if image is None:
            raise SystemExit('cannot read image: {}'.format(path))
        hashes.append(pic_p_hash(image, 32))
    reference, *others = hashes
    print(hashes[-1])
    for other in others:
        print(hash_cmp(reference, other))