import cv2
import os
import shutil
import numpy as np
from sklearn.cluster import KMeans
from PIL import Image
import pytesseract
from skimage.filters import threshold_otsu
from skimage.segmentation import clear_border
from skimage.measure import label, regionprops
from skimage.morphology import closing, square
from skimage.color import rgb2gray
from tqdm import tqdm
# Configure the OCR engine: tell pytesseract which Tesseract executable to run.
# NOTE(review): the path is hard-coded; presumably Tesseract is installed
# there on the target machine -- confirm before running elsewhere.
pytesseract.pytesseract.tesseract_cmd = r'/usr/bin/tesseract'
def preprocess_text_area(image):
    """Binarize a grayscale image and keep only its largest connected regions.

    The image is thresholded with Otsu's method, components touching the
    image border are removed, and, when more than two components remain,
    every component smaller than the second-largest one is erased. The
    surviving mask is finally closed with a 3x3 square footprint.

    NOTE(review): only the two largest components (plus any ties) survive
    the filtering step, which may discard most glyphs before OCR -- confirm
    that this is the intended behaviour.

    Args:
        image: Grayscale image array (presumably 2-D -- confirm against
            callers).

    Returns:
        A ``uint8`` array in which foreground pixels are 255 and background
        pixels are 0.
    """
    binary = image > threshold_otsu(image)
    cleared = clear_border(binary)
    label_image = label(cleared)

    # Measure the regions once and reuse the result for both the size
    # ranking and the filtering below.
    regions = regionprops(label_image)
    areas = sorted(region.area for region in regions)

    if len(areas) > 2:
        second_largest = areas[-2]
        small_labels = [
            region.label for region in regions if region.area < second_largest
        ]
        # Erase all undersized regions with one vectorized mask instead of
        # visiting every pixel of every region in a Python loop.
        label_image[np.isin(label_image, small_labels)] = 0

    # Convert the boolean mask to uint8 (0 / 255) so that it can be turned
    # into an image correctly by downstream consumers.
    binary_uint8 = (label_image > 0).astype(np.uint8) * 255
    return closing(binary_uint8, square(3))
def analyze_font_and_layout(image):
    """Score how much the glyph layout of an image resembles a webpage.

    Candidate text regions are the external contours of the inverted Otsu
    threshold whose bounding box is roughly square (aspect ratio between
    0.5 and 1.5) and whose contour area exceeds 100 pixels. The score is the
    ratio between the smaller and the larger of two averages: the horizontal
    spacing between neighbouring regions and the region height.

    Args:
        image: Image array in OpenCV's BGR channel order (it is converted
            with ``cv2.COLOR_BGR2GRAY``).

    Returns:
        A float between 0 and 1; values closer to 1 mean the image is more
        likely to be a webpage screenshot. 0.0 is returned when fewer than
        two candidate regions are found, because no spacing can be measured.
    """
    gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
    thresh = cv2.threshold(
        gray, 0, 255, cv2.THRESH_BINARY_INV + cv2.THRESH_OTSU
    )[1]

    # Find all connected components. findContours returns either two or
    # three values depending on the OpenCV version; pick the contour list.
    cnts = cv2.findContours(thresh, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
    cnts = cnts[0] if len(cnts) == 2 else cnts[1]

    # Keep only components whose aspect ratio and area are plausible for a
    # single character.
    text_regions = []
    for contour in cnts:
        x, y, w, h = cv2.boundingRect(contour)
        aspect_ratio = w / float(h)
        if 0.5 <= aspect_ratio <= 1.5 and cv2.contourArea(contour) > 100:
            text_regions.append((x, y, w, h))

    # Spacing needs at least two regions. Returning early also avoids taking
    # the mean of an empty list, which yields NaN plus a RuntimeWarning.
    if len(text_regions) < 2:
        return 0.0

    # Contours arrive in no particular horizontal order, so sort the x
    # positions first; otherwise the mean of consecutive differences only
    # reflects the first and last contour and can even be negative.
    x_positions = sorted(x for x, _, _, _ in text_regions)
    avg_spacing = np.mean(np.diff(x_positions))
    avg_height = np.mean([h for _, _, _, h in text_regions])

    if avg_spacing > 0 and avg_height > 0:
        low, high = sorted((avg_spacing, avg_height))
        return float(low / high)
    return 0.0
def is_webpage_screenshot(image_path):
    """Heuristically decide whether an image file is a webpage screenshot.

    The image counts as a screenshot only when all of the following hold:
    more than 10 long straight line segments are detected, the font/layout
    score exceeds 0.5, OCR extracts more than 100 characters, and the colour
    check passes. The checks run from cheapest to most expensive and stop at
    the first failure, so OCR and clustering are skipped whenever an earlier
    check already rules the image out.

    Args:
        image_path: Path of the image file to inspect.

    Returns:
        True if every check passes, otherwise False.

    Raises:
        ValueError: If OpenCV cannot read ``image_path`` as an image.
    """
    img = cv2.imread(image_path)
    if img is None:
        # cv2.imread reports failure by returning None; fail here with a
        # clear message rather than with an opaque error further down.
        raise ValueError(f"Could not read image: {image_path}")
    gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)

    # Edge detection followed by a probabilistic Hough transform to count
    # long straight segments.
    edges = cv2.Canny(gray, 50, 150, apertureSize=3)
    lines = cv2.HoughLinesP(
        edges, 1, np.pi / 180, 100, minLineLength=100, maxLineGap=10
    )
    # HoughLinesP returns None when no line is found.
    line_count = 0 if lines is None else len(lines)
    if line_count <= 10:
        return False

    # Font and layout analysis.
    font_layout_score = analyze_font_and_layout(img)
    if not font_layout_score > 0.5:
        return False

    # OCR on the pre-processed text mask. OpenCV loads images as BGR, so the
    # channels are reordered before rgb2gray applies its RGB weights.
    rgb = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
    text_mask = preprocess_text_area(rgb2gray(rgb))
    # The mask is already uint8 with values 0/255; scaling it by 255 again
    # would wrap around in uint8 and hand OCR an almost black image.
    text = pytesseract.image_to_string(text_mask, lang='eng')
    if len(text) <= 100:
        return False

    # Colour analysis. The context manager closes the file handle promptly.
    with Image.open(image_path) as pil_file:
        img_array = np.array(pil_file.convert('RGB'))
    img_flatten = img_array.reshape(
        (img_array.shape[0] * img_array.shape[1], 3)
    )
    kmeans = KMeans(n_clusters=16).fit(img_flatten)
    unique_colors = len(np.unique(kmeans.labels_))
    # NOTE(review): with n_clusters=16 there can be at most 16 distinct
    # labels, so this comparison against 128 cannot fail -- confirm the
    # intended colour criterion.
    return unique_colors < 128
# Paths: the tree that is scanned and the folder that receives every image
# judged not to be a webpage screenshot.
source_dir = "GPT_images"
dest_dir = "non_webpage_images"

# Create the destination folder; exist_ok makes this safe to re-run.
os.makedirs(dest_dir, exist_ok=True)

# Collect every image path before moving anything, so the directory tree is
# not modified while it is being walked and one progress bar can cover the
# whole run with an accurate total.
image_extensions = ('.png', '.jpg', '.jpeg')
image_paths = []
for root, dirs, files in os.walk(source_dir):
    image_paths.extend(
        os.path.join(root, file)
        for file in files
        if file.lower().endswith(image_extensions)
    )
total_files = len(image_paths)

for image_path in tqdm(image_paths, total=total_files, desc="Processing Images"):
    try:
        is_screenshot = is_webpage_screenshot(image_path)
    except (cv2.error, ValueError) as exc:
        # A single unreadable or degenerate image must not abort the whole
        # batch; report it and leave the file where it is.
        tqdm.write(f"Skipping {image_path}: {exc}")
        continue
    if is_screenshot:
        continue

    # Build a file name that does not collide with one already present in
    # the destination folder.
    base_name, ext = os.path.splitext(os.path.basename(image_path))
    new_file_name = base_name
    counter = 1
    while os.path.exists(os.path.join(dest_dir, new_file_name + ext)):
        new_file_name = f"{base_name}_{counter}"
        counter += 1

    # Move the image.
    shutil.move(image_path, os.path.join(dest_dir, new_file_name + ext))

print("Non-webpage images have been moved.")
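# Detect whether a downloaded image is a webpage screenshot, v1.0
# (Source page footer: latest recommended article published 2024-09-07 00:28:17)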