检测网图是不是网页截图v1.0

import cv2
import os
import shutil
import numpy as np
from sklearn.cluster import KMeans
from PIL import Image
import pytesseract
from skimage.filters import threshold_otsu
from skimage.segmentation import clear_border
from skimage.measure import label, regionprops
from skimage.morphology import closing, square
from skimage.color import rgb2gray
from tqdm import tqdm

# 设置OCR引擎
pytesseract.pytesseract.tesseract_cmd = r'/usr/bin/tesseract'

def preprocess_text_area(image):
    """Binarize a grayscale image and keep only its largest connected regions.

    The image is thresholded with Otsu's method and components touching the
    border are cleared. When more than two components remain, every component
    whose area is smaller than the second-largest area is erased. The
    surviving mask is scaled to 0/255 ``uint8`` and passed through a 3x3
    morphological closing, whose result is returned.
    """
    foreground = image > threshold_otsu(image)
    labelled = label(clear_border(foreground))

    regions = regionprops(labelled)
    sizes = sorted(region.area for region in regions)
    if len(sizes) > 2:
        cutoff = sizes[-2]
        for region in regions:
            if region.area >= cutoff:
                continue
            # Erase the whole undersized region in one indexed assignment.
            rows, cols = region.coords[:, 0], region.coords[:, 1]
            labelled[rows, cols] = 0

    # Boolean -> uint8 0/255 so the mask converts cleanly to an image later.
    mask = (labelled > 0).astype(np.uint8) * 255
    return closing(mask, square(3))

def analyze_font_and_layout(image):
    """Score how screenshot-like the text layout of an image looks.

    The image is binarized with an inverted Otsu threshold and its external
    contours are filtered down to roughly square blobs (bounding-box aspect
    ratio between 0.5 and 1.5, contour area above 100). The score is the
    ratio of the smaller to the larger of two averages: the mean x-offset
    between consecutive blobs and the mean blob height.

    Args:
        image: Image array that ``cv2.cvtColor`` can convert with
            ``cv2.COLOR_BGR2GRAY``.

    Returns:
        A score between 0 and 1; values closer to 1 are meant to indicate a
        webpage screenshot. ``0.0`` is returned when fewer than two candidate
        regions are found or when either average is not positive.
    """
    gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
    thresh = cv2.threshold(gray, 0, 255, cv2.THRESH_BINARY_INV + cv2.THRESH_OTSU)[1]

    # findContours may return a 2-tuple or a 3-tuple; the contours are the
    # first element in the former case and the second in the latter.
    found = cv2.findContours(thresh, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
    contours = found[0] if len(found) == 2 else found[1]

    # Keep only the contours whose shape and size make them plausible glyphs.
    text_regions = []
    for contour in contours:
        x, y, w, h = cv2.boundingRect(contour)
        if 0.5 <= w / float(h) <= 1.5 and cv2.contourArea(contour) > 100:
            text_regions.append((x, y, w, h))

    # A spacing needs at least two regions. Without this guard a single
    # region made np.mean() run on an empty list (NaN plus a RuntimeWarning).
    if len(text_regions) < 2:
        return 0.0

    # NOTE(review): regions stay in the order findContours produced them
    # (they are not sorted by x), so individual offsets can be negative.
    x_positions = [x for x, _, _, _ in text_regions]
    avg_spacing = np.mean(np.diff(x_positions))
    avg_height = np.mean([h for _, _, _, h in text_regions])

    if avg_spacing > 0 and avg_height > 0:
        return min(avg_spacing, avg_height) / max(avg_spacing, avg_height)
    return 0.0

def is_webpage_screenshot(image_path):
    """Heuristically decide whether an image file is a webpage screenshot.

    All four criteria must hold:
      * more than 10 line segments from a probabilistic Hough transform on
        the Canny edge map;
      * more than 100 characters of OCR output from the pre-processed image;
      * fewer than 128 colour clusters;
      * a font/layout score above 0.5.

    The criteria are evaluated in that order and the function returns as
    soon as one fails, so the expensive OCR and clustering steps are skipped
    whenever an earlier check already rules the image out.

    Args:
        image_path: Path of the image file to inspect.

    Returns:
        True when every criterion is met, False otherwise.

    Raises:
        ValueError: If OpenCV cannot read the file as an image.
    """
    img = cv2.imread(image_path)
    if img is None:
        # cv2.imread reports failure by returning None instead of raising;
        # fail here with the offending path rather than deep inside cvtColor.
        raise ValueError(f"Could not read image: {image_path}")
    gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)

    # Edge detection followed by line-segment detection
    edges = cv2.Canny(gray, 50, 150, apertureSize=3)
    lines = cv2.HoughLinesP(edges, 1, np.pi/180, 100, minLineLength=100, maxLineGap=10)

    # HoughLinesP returns None when no segment is found
    line_count = 0 if lines is None else len(lines)
    if line_count <= 10:
        return False

    # cv2.imread yields BGR channel order while rgb2gray expects RGB.
    rgb = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
    text_mask = preprocess_text_area(rgb2gray(rgb))
    # The mask is already 0/255 uint8; multiplying it by 255 again (as the
    # previous code did) wraps around to 0/1 and hands OCR a black image.
    text = pytesseract.image_to_string(text_mask, lang='eng')
    if len(text) <= 100:
        return False

    # Colour analysis; the context manager closes the file handle promptly.
    with Image.open(image_path) as source:
        pixels = np.array(source.convert('RGB')).reshape(-1, 3)
    kmeans = KMeans(n_clusters=16).fit(pixels)
    unique_colors = len(np.unique(kmeans.labels_))
    # NOTE(review): with n_clusters=16 there are at most 16 distinct labels,
    # so this comparison against 128 can never fail. Counting the image's
    # actual distinct colours may have been intended - confirm with the author.
    if unique_colors >= 128:
        return False

    # Font and layout analysis
    font_layout_score = analyze_font_and_layout(img)
    return bool(font_layout_score > 0.5)


# Configure the source and destination folders
source_dir = "GPT_images"
dest_dir = "non_webpage_images"

# Create the destination folder (no error if it already exists)
os.makedirs(dest_dir, exist_ok=True)

# Collect every image path up front. This gives the progress bar an exact
# total and avoids moving files out of the tree while os.walk is iterating it.
image_extensions = ('.png', '.jpg', '.jpeg')
image_paths = []
for root, dirs, files in os.walk(source_dir):
    image_paths.extend(
        os.path.join(root, file)
        for file in files
        if file.lower().endswith(image_extensions)
    )
total_files = len(image_paths)

# One progress bar over all images (previously a new bar was created for each
# directory, each claiming the global total). Skipped when nothing was found.
if image_paths:
    for image_path in tqdm(image_paths, total=total_files, desc="Processing Images"):
        # Webpage screenshots stay where they are
        if is_webpage_screenshot(image_path):
            continue

        # Pick a file name that does not collide with one already moved
        base_name, ext = os.path.splitext(os.path.basename(image_path))
        new_file_name = base_name
        counter = 1
        while os.path.exists(os.path.join(dest_dir, new_file_name + ext)):
            new_file_name = f"{base_name}_{counter}"
            counter += 1

        # Move the image
        shutil.move(image_path, os.path.join(dest_dir, new_file_name + ext))

print("Non-webpage images have been moved.")
  • 1
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值