增加了图像重复度检测方法

This commit is contained in:
jinye_huang 2025-05-06 15:41:29 +08:00
parent d828165547
commit 92a60af402
17 changed files with 539 additions and 0 deletions

Binary file not shown.

After

Width:  |  Height:  |  Size: 33 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 25 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 32 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 33 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 31 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 27 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 30 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 32 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 30 KiB

View File

@ -0,0 +1,4 @@
image1,image2,ahash,color_hist,dhash,md5,phash,sift,ssim
additional_1.jpg,additional_2.jpg,0.984375,0.9870948370376991,0.921875,0.0,0.875,0.579540165689286,0.19861068520387853
additional_1.jpg,additional_3.jpg,1.0,0.921953366880821,0.96875,0.0,0.9375,0.5848237706872704,0.18287979964654466
additional_3.jpg,additional_2.jpg,0.984375,0.9566729875886791,0.921875,0.0,0.9375,0.5776319463836918,0.2204994764088436
1 image1 image2 ahash color_hist dhash md5 phash sift ssim
2 additional_1.jpg additional_2.jpg 0.984375 0.9870948370376991 0.921875 0.0 0.875 0.579540165689286 0.19861068520387853
3 additional_1.jpg additional_3.jpg 1.0 0.921953366880821 0.96875 0.0 0.9375 0.5848237706872704 0.18287979964654466
4 additional_3.jpg additional_2.jpg 0.984375 0.9566729875886791 0.921875 0.0 0.9375 0.5776319463836918 0.2204994764088436

View File

@ -0,0 +1,22 @@
image1,image2,algorithm,similarity,compute_time
additional_1.jpg,additional_3.jpg,md5,0.0,0.0008852481842041016
additional_1.jpg,additional_3.jpg,phash,0.9375,0.03219485282897949
additional_1.jpg,additional_3.jpg,ahash,1.0,0.014646530151367188
additional_1.jpg,additional_3.jpg,dhash,0.96875,0.015238761901855469
additional_1.jpg,additional_3.jpg,color_hist,0.921953366880821,0.05095839500427246
additional_1.jpg,additional_3.jpg,sift,0.5848237706872704,0.9035642147064209
additional_1.jpg,additional_3.jpg,ssim,0.18287979964654466,0.09257769584655762
additional_1.jpg,additional_2.jpg,md5,0.0,0.0008139610290527344
additional_1.jpg,additional_2.jpg,phash,0.875,0.017491579055786133
additional_1.jpg,additional_2.jpg,ahash,0.984375,0.015668869018554688
additional_1.jpg,additional_2.jpg,dhash,0.921875,0.015781641006469727
additional_1.jpg,additional_2.jpg,color_hist,0.9870948370376991,0.013017416000366211
additional_1.jpg,additional_2.jpg,sift,0.579540165689286,0.8017916679382324
additional_1.jpg,additional_2.jpg,ssim,0.19861068520387853,0.08038783073425293
additional_3.jpg,additional_2.jpg,md5,0.0,0.0008535385131835938
additional_3.jpg,additional_2.jpg,phash,0.9375,0.017593860626220703
additional_3.jpg,additional_2.jpg,ahash,0.984375,0.01582646369934082
additional_3.jpg,additional_2.jpg,dhash,0.921875,0.015839099884033203
additional_3.jpg,additional_2.jpg,color_hist,0.9566729875886791,0.013316154479980469
additional_3.jpg,additional_2.jpg,sift,0.5776319463836918,0.8478097915649414
additional_3.jpg,additional_2.jpg,ssim,0.2204994764088436,0.08063888549804688
1 image1 image2 algorithm similarity compute_time
2 additional_1.jpg additional_3.jpg md5 0.0 0.0008852481842041016
3 additional_1.jpg additional_3.jpg phash 0.9375 0.03219485282897949
4 additional_1.jpg additional_3.jpg ahash 1.0 0.014646530151367188
5 additional_1.jpg additional_3.jpg dhash 0.96875 0.015238761901855469
6 additional_1.jpg additional_3.jpg color_hist 0.921953366880821 0.05095839500427246
7 additional_1.jpg additional_3.jpg sift 0.5848237706872704 0.9035642147064209
8 additional_1.jpg additional_3.jpg ssim 0.18287979964654466 0.09257769584655762
9 additional_1.jpg additional_2.jpg md5 0.0 0.0008139610290527344
10 additional_1.jpg additional_2.jpg phash 0.875 0.017491579055786133
11 additional_1.jpg additional_2.jpg ahash 0.984375 0.015668869018554688
12 additional_1.jpg additional_2.jpg dhash 0.921875 0.015781641006469727
13 additional_1.jpg additional_2.jpg color_hist 0.9870948370376991 0.013017416000366211
14 additional_1.jpg additional_2.jpg sift 0.579540165689286 0.8017916679382324
15 additional_1.jpg additional_2.jpg ssim 0.19861068520387853 0.08038783073425293
16 additional_3.jpg additional_2.jpg md5 0.0 0.0008535385131835938
17 additional_3.jpg additional_2.jpg phash 0.9375 0.017593860626220703
18 additional_3.jpg additional_2.jpg ahash 0.984375 0.01582646369934082
19 additional_3.jpg additional_2.jpg dhash 0.921875 0.015839099884033203
20 additional_3.jpg additional_2.jpg color_hist 0.9566729875886791 0.013316154479980469
21 additional_3.jpg additional_2.jpg sift 0.5776319463836918 0.8478097915649414
22 additional_3.jpg additional_2.jpg ssim 0.2204994764088436 0.08063888549804688

View File

@ -0,0 +1,97 @@
#!/bin/bash
# 用于运行简化版图像相似度检测工具的脚本
# 确保脚本在出错时退出
set -e
# 颜色定义
GREEN='\033[0;32m'
YELLOW='\033[1;33m'
RED='\033[0;31m'
NC='\033[0m' # No Color
# 打印带颜色的消息
print_green() {
echo -e "${GREEN}$1${NC}"
}
print_yellow() {
echo -e "${YELLOW}$1${NC}"
}
print_red() {
echo -e "${RED}$1${NC}"
}
# 安装必要的Python依赖
install_dependencies() {
print_yellow "安装必要的Python依赖..."
pip install numpy opencv-python pillow matplotlib scikit-image scipy pandas imagehash
print_green "依赖安装完成!"
}
# 检查测试图像目录
check_images_dir() {
TEST_IMAGES_DIR="./scripts/image_test/test_images"
mkdir -p "$TEST_IMAGES_DIR"
# 检查测试图像目录中是否有图像文件
image_count=$(find "$TEST_IMAGES_DIR" -type f \( -name "*.jpg" -o -name "*.jpeg" -o -name "*.png" -o -name "*.bmp" \) | wc -l)
if [ "$image_count" -eq 0 ]; then
print_red "错误: 测试图像目录 '$TEST_IMAGES_DIR' 中没有图像文件!"
print_yellow "请在此目录中放置要检测相似度的图像文件后再运行此脚本。"
print_yellow "您至少需要放置2个或更多图像文件进行比较。"
exit 1
else
print_green "找到 $image_count 个图像文件,可以进行检测。"
fi
}
# 确保输出目录存在
prepare_output_dir() {
print_yellow "准备输出目录..."
OUTPUT_DIR="./scripts/image_test/results"
mkdir -p "$OUTPUT_DIR"
print_green "输出目录准备完成!"
}
# 运行检测脚本
run_check() {
print_yellow "运行图像相似度检测..."
TEST_IMAGES_DIR="./scripts/image_test/test_images"
OUTPUT_DIR="./scripts/image_test/results"
python ./scripts/image_test/simple_dupe_checker.py --input_dir "$TEST_IMAGES_DIR" --output_dir "$OUTPUT_DIR"
print_green "检测完成!"
print_yellow "结果保存在: $OUTPUT_DIR"
}
# 主函数
main() {
print_green "===== 简化版图像相似度检测工具 ====="
# 检查Python是否可用
if ! command -v python &> /dev/null; then
print_red "错误: 未找到Python。请确保已安装Python 3.6+。"
exit 1
fi
# 安装依赖
install_dependencies
# 检查测试图像目录
check_images_dir
# 准备输出目录
prepare_output_dir
# 运行检测
run_check
print_green "===== 检测完毕 ====="
}
# 执行主函数
main

View File

@ -0,0 +1,416 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
简化版图像相似度检测器
直接计算指定目录中图片之间的相似度输出表格结果
"""
import os
import sys
import hashlib
import argparse
import time
import numpy as np
import cv2
from PIL import Image
import matplotlib.pyplot as plt
import pandas as pd
from skimage.metrics import structural_similarity as ssim
import imagehash
from datetime import datetime
class SimpleImageDupeChecker:
"""简化版图像相似度检测器"""
def __init__(self):
"""初始化检测器"""
# 算法列表
self.algorithms = {
'md5': self.compare_md5,
'phash': self.compare_phash,
'ahash': self.compare_ahash,
'dhash': self.compare_dhash,
'color_hist': self.compare_color_histogram,
'sift': self.compare_sift,
'ssim': self.compare_ssim
}
# 初始化SIFT检测器
self.sift = cv2.SIFT_create()
# FLANN匹配器参数
FLANN_INDEX_KDTREE = 1
index_params = dict(algorithm=FLANN_INDEX_KDTREE, trees=5)
search_params = dict(checks=50)
self.flann = cv2.FlannBasedMatcher(index_params, search_params)
def get_md5(self, image_path):
"""计算图像的MD5哈希值"""
with open(image_path, 'rb') as f:
md5hash = hashlib.md5(f.read()).hexdigest()
return md5hash
def compare_md5(self, image1_path, image2_path):
"""比较两张图像的MD5哈希值相似度"""
hash1 = self.get_md5(image1_path)
hash2 = self.get_md5(image2_path)
# 如果哈希完全相同返回1.0否则返回0.0
return 1.0 if hash1 == hash2 else 0.0
def get_phash(self, image_path, hash_size=8):
"""计算图像的感知哈希(pHash)"""
img = Image.open(image_path).convert('L').resize((hash_size * 4, hash_size * 4), Image.LANCZOS)
return imagehash.phash(img, hash_size=hash_size)
def compare_phash(self, image1_path, image2_path):
"""比较两张图像的感知哈希(pHash)相似度"""
hash1 = self.get_phash(image1_path)
hash2 = self.get_phash(image2_path)
# 计算哈希相似度64位哈希的汉明距离
hash_diff = hash1 - hash2
max_diff = 64.0 # 8x8哈希的最大汉明距离
similarity = 1.0 - (hash_diff / max_diff)
return max(0.0, similarity) # 确保相似度不低于0
def get_ahash(self, image_path, hash_size=8):
"""计算图像的平均哈希(aHash)"""
img = Image.open(image_path).convert('L').resize((hash_size, hash_size), Image.LANCZOS)
return imagehash.average_hash(img, hash_size=hash_size)
def compare_ahash(self, image1_path, image2_path):
"""比较两张图像的平均哈希(aHash)相似度"""
hash1 = self.get_ahash(image1_path)
hash2 = self.get_ahash(image2_path)
# 计算哈希相似度
hash_diff = hash1 - hash2
max_diff = 64.0 # 8x8哈希的最大汉明距离
similarity = 1.0 - (hash_diff / max_diff)
return max(0.0, similarity)
def get_dhash(self, image_path, hash_size=8):
"""计算图像的差值哈希(dHash)"""
img = Image.open(image_path).convert('L').resize((hash_size + 1, hash_size), Image.LANCZOS)
return imagehash.dhash(img, hash_size=hash_size)
def compare_dhash(self, image1_path, image2_path):
"""比较两张图像的差值哈希(dHash)相似度"""
hash1 = self.get_dhash(image1_path)
hash2 = self.get_dhash(image2_path)
# 计算哈希相似度
hash_diff = hash1 - hash2
max_diff = 64.0 # 8x8哈希的最大汉明距离
similarity = 1.0 - (hash_diff / max_diff)
return max(0.0, similarity)
def get_color_histogram(self, image_path):
"""计算图像的颜色直方图"""
img = cv2.imread(image_path)
img = cv2.cvtColor(img, cv2.COLOR_BGR2HSV)
# 计算HSV颜色空间的直方图
hist = cv2.calcHist([img], [0, 1, 2], None, [8, 8, 8], [0, 180, 0, 256, 0, 256])
cv2.normalize(hist, hist, 0, 1.0, cv2.NORM_MINMAX)
return hist.flatten()
def compare_color_histogram(self, image1_path, image2_path):
"""比较两张图像的颜色直方图相似度"""
hist1 = self.get_color_histogram(image1_path)
hist2 = self.get_color_histogram(image2_path)
# 计算直方图相似度
similarity = cv2.compareHist(hist1, hist2, cv2.HISTCMP_CORREL)
return max(0.0, similarity) # 确保相似度不低于0
def get_sift_features(self, image_path):
"""获取图像的SIFT特征点和描述符"""
img = cv2.imread(image_path, cv2.IMREAD_GRAYSCALE)
keypoints, descriptors = self.sift.detectAndCompute(img, None)
return keypoints, descriptors
def compare_sift(self, image1_path, image2_path):
"""比较两张图像的SIFT特征点相似度"""
kp1, des1 = self.get_sift_features(image1_path)
kp2, des2 = self.get_sift_features(image2_path)
# 如果没有足够的特征点,返回低相似度
if des1 is None or des2 is None or len(des1) < 2 or len(des2) < 2:
return 0.0
# 使用FLANN匹配器查找最佳匹配
matches = self.flann.knnMatch(des1, des2, k=2)
# 应用比率测试,筛选好的匹配
good_matches = []
for m, n in matches:
if m.distance < 0.7 * n.distance:
good_matches.append(m)
# 计算相似度
similarity = len(good_matches) / max(len(kp1), len(kp2)) if max(len(kp1), len(kp2)) > 0 else 0
return min(1.0, similarity) # 确保相似度不超过1
def compare_ssim(self, image1_path, image2_path):
"""比较两张图像的结构相似性(SSIM)"""
img1 = cv2.imread(image1_path, cv2.IMREAD_GRAYSCALE)
img2 = cv2.imread(image2_path, cv2.IMREAD_GRAYSCALE)
# 确保两张图像尺寸相同
h, w = min(img1.shape[0], img2.shape[0]), min(img1.shape[1], img2.shape[1])
img1 = cv2.resize(img1, (w, h))
img2 = cv2.resize(img2, (w, h))
# 计算SSIM
similarity = ssim(img1, img2)
return max(0.0, similarity) # 确保相似度不低于0
def check_images(self, input_dir, output_dir=None):
"""
检查目录中所有图像之间的相似度
Args:
input_dir: 输入图像目录
output_dir: 输出结果目录如果为None则使用input_dir
Returns:
包含相似度信息的DataFrame
"""
# 如果未指定输出目录,使用输入目录
if output_dir is None:
output_dir = input_dir
os.makedirs(output_dir, exist_ok=True)
# 获取所有图像文件
print("正在读取图像文件...")
image_files = []
for file in os.listdir(input_dir):
file_path = os.path.join(input_dir, file)
if os.path.isfile(file_path) and file.lower().endswith(('.png', '.jpg', '.jpeg', '.bmp', '.tiff', '.gif')):
image_files.append(file_path)
# 检查图像数量
if len(image_files) == 0:
print(f"错误: 在目录 '{input_dir}' 中没有找到图像文件")
return None
print(f"找到 {len(image_files)} 个图像文件,开始计算相似度...")
# 结果列表
results = []
# 计算所有图像对的相似度
total_pairs = len(image_files) * (len(image_files) - 1) // 2
pair_count = 0
for i, img1 in enumerate(image_files):
img1_name = os.path.basename(img1)
for j, img2 in enumerate(image_files):
if j <= i: # 跳过重复比较
continue
img2_name = os.path.basename(img2)
pair_count += 1
print(f"比较图像对 {pair_count}/{total_pairs}: {img1_name} vs {img2_name}")
# 对每种算法计算相似度
for alg_name, compare_func in self.algorithms.items():
try:
start_time = time.time()
similarity = compare_func(img1, img2)
compute_time = time.time() - start_time
results.append({
'image1': img1_name,
'image2': img2_name,
'algorithm': alg_name,
'similarity': similarity,
'compute_time': compute_time
})
except Exception as e:
print(f" 算法 {alg_name} 比较出错: {e}")
results.append({
'image1': img1_name,
'image2': img2_name,
'algorithm': alg_name,
'similarity': None,
'compute_time': None,
'error': str(e)
})
# 创建DataFrame
df = pd.DataFrame(results)
# 保存结果
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
csv_path = os.path.join(output_dir, f"similarity_results_{timestamp}.csv")
df.to_csv(csv_path, index=False)
print(f"结果已保存至: {csv_path}")
# 创建透视表
pivot_df = df.pivot_table(
index=['image1', 'image2'],
columns='algorithm',
values='similarity',
aggfunc='first'
).reset_index()
# 保存透视表
pivot_path = os.path.join(output_dir, f"similarity_pivot_{timestamp}.csv")
pivot_df.to_csv(pivot_path, index=False)
print(f"透视表已保存至: {pivot_path}")
# 显示结果摘要
self.print_summary(df)
# 生成可视化
self.visualize_results(df, output_dir)
return df
def print_summary(self, df):
"""打印结果摘要"""
print("\n===== 图像相似度检测结果摘要 =====")
# 按算法分组计算平均相似度
alg_summary = df.groupby('algorithm')['similarity'].agg(['mean', 'min', 'max', 'std']).reset_index()
alg_summary = alg_summary.sort_values('mean', ascending=False)
print("\n各算法平均相似度:")
for _, row in alg_summary.iterrows():
print(f" {row['algorithm']:<12}: 平均值 = {row['mean']:.4f} (最小值: {row['min']:.4f}, 最大值: {row['max']:.4f}, 标准差: {row['std']:.4f})")
# 显示最相似的图像对
print("\n相似度最高的图像对:")
for alg in df['algorithm'].unique():
alg_df = df[df['algorithm'] == alg]
if not alg_df.empty:
max_idx = alg_df['similarity'].idxmax()
max_row = alg_df.loc[max_idx]
print(f" {alg:<12}: {max_row['image1']}{max_row['image2']} (相似度: {max_row['similarity']:.4f})")
# 显示最不相似的图像对
print("\n相似度最低的图像对:")
for alg in df['algorithm'].unique():
alg_df = df[df['algorithm'] == alg]
if not alg_df.empty:
min_idx = alg_df['similarity'].idxmin()
min_row = alg_df.loc[min_idx]
print(f" {alg:<12}: {min_row['image1']}{min_row['image2']} (相似度: {min_row['similarity']:.4f})")
def visualize_results(self, df, output_dir):
"""可视化结果"""
print("\n生成结果可视化...")
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
# 1. 绘制条形图 - 各算法的平均相似度
plt.figure(figsize=(10, 6))
avg_similarity = df.groupby('algorithm')['similarity'].mean().reset_index()
avg_similarity = avg_similarity.sort_values('similarity', ascending=False)
plt.bar(avg_similarity['algorithm'], avg_similarity['similarity'], color='skyblue')
plt.xlabel('算法')
plt.ylabel('平均相似度')
plt.title('各算法的平均相似度')
plt.xticks(rotation=45)
plt.grid(True, linestyle='--', alpha=0.7)
plt.tight_layout()
bar_chart_path = os.path.join(output_dir, f"algorithm_avg_similarity_{timestamp}.png")
plt.savefig(bar_chart_path)
# 2. 绘制热力图 - 各图像对的相似度矩阵
# 为每个算法生成一个热力图
image_names = sorted(list(set(df['image1'].unique()) | set(df['image2'].unique())))
for alg in df['algorithm'].unique():
plt.figure(figsize=(10, 8))
# 创建热力图数据
heatmap_data = np.zeros((len(image_names), len(image_names)))
# 填充热力图数据
for idx, row in df[df['algorithm'] == alg].iterrows():
i = image_names.index(row['image1'])
j = image_names.index(row['image2'])
heatmap_data[i, j] = row['similarity']
heatmap_data[j, i] = row['similarity'] # 对称矩阵
# 热力图对角线设为1.0自己与自己相似度为1
for i in range(len(image_names)):
heatmap_data[i, i] = 1.0
# 创建热力图
plt.imshow(heatmap_data, cmap='viridis')
plt.colorbar(label='相似度')
plt.title(f'{alg} 算法的图像相似度热力图')
# 设置坐标轴刻度
plt.xticks(np.arange(len(image_names)), image_names, rotation=90)
plt.yticks(np.arange(len(image_names)), image_names)
# 添加文本标签
for i in range(len(image_names)):
for j in range(len(image_names)):
text = plt.text(j, i, f"{heatmap_data[i, j]:.2f}",
ha="center", va="center",
color="w" if heatmap_data[i, j] < 0.7 else "black")
plt.tight_layout()
heatmap_path = os.path.join(output_dir, f"similarity_heatmap_{alg}_{timestamp}.png")
plt.savefig(heatmap_path)
# 3. 箱线图 - 各算法的相似度分布
plt.figure(figsize=(10, 6))
# 准备数据
data = []
labels = []
for alg in sorted(df['algorithm'].unique()):
alg_data = df[df['algorithm'] == alg]['similarity']
if not alg_data.empty:
data.append(alg_data)
labels.append(alg)
# 创建箱线图
plt.boxplot(data, labels=labels)
plt.ylabel('相似度')
plt.title('各算法的相似度分布')
plt.grid(True, linestyle='--', alpha=0.7)
plt.tight_layout()
boxplot_path = os.path.join(output_dir, f"algorithm_boxplot_{timestamp}.png")
plt.savefig(boxplot_path)
# 关闭所有图形
plt.close('all')
print(f"可视化图表已保存到:\n 条形图: {bar_chart_path}\n 箱线图: {boxplot_path}")
print(f" 各算法热力图已保存到输出目录")
def main():
"""主函数"""
parser = argparse.ArgumentParser(description='简化版图像相似度检测工具')
parser.add_argument('--input_dir', '-i', type=str, default='./scripts/image_test/test_images',
help='输入图像目录路径')
parser.add_argument('--output_dir', '-o', type=str, default=None,
help='输出结果目录路径,默认与输入目录相同')
args = parser.parse_args()
print("===== 简化版图像相似度检测工具 =====")
print(f"输入目录: {args.input_dir}")
print(f"输出目录: {args.output_dir or args.input_dir}")
checker = SimpleImageDupeChecker()
checker.check_images(args.input_dir, args.output_dir)
print("\n===== 检测完成 =====")
if __name__ == '__main__':
main()

Binary file not shown.

After

Width:  |  Height:  |  Size: 234 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 224 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 243 KiB