Process each day's folder one at a time

Based on your requirements, I've reworked the code to process the directory day by day, handling each day's folder one at a time.
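
The script assumes the base directory contains one subdirectory per day, named in YYYYMMDD format; the exact dates below are illustrative:

/var/www/OceanXECM/data/DOCS/Site-1303174917/
├── 20250326/
├── 20250327/
└── 20250328/

Here is the optimized code: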

import sys
import os
import time
from datetime import datetime
from typing import Dict, List, Tuple, Generator, Optional
import humanize
import concurrent.futures

# Set up the Django environment
os.chdir('/var/www/OceanXECM')
sys.path.append('/var/www/OceanXECM')
import django_setup
from filestorage.models import FileModel
from doc.version_util import VersionedItem


def list_daily_directories(base_dir: str) -> List[str]:
    """
    列出所有按日期命名的子目录,并按日期排序(从旧到新)
    
    参数:
        base_dir: 基础目录路径
        
    返回:
        按日期排序的目录路径列表
    """
    dirs = []
    for name in os.listdir(base_dir):
        full_path = os.path.join(base_dir, name)
        if os.path.isdir(full_path) and name.isdigit() and len(name) == 8:
            try:
                # Verify that the name is a valid date
                datetime.strptime(name, "%Y%m%d")
                dirs.append((name, full_path))
            except ValueError:
                continue
    
    # Sort by date (YYYYMMDD names sort chronologically as plain strings)
    dirs.sort(key=lambda x: x[0])
    return [path for _, path in dirs]
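

# Example (hypothetical layout): if base_dir contains "20250101", "20250103",
# "tmp" and "20251399", this returns the full paths of "20250101" and
# "20250103" in that order; "tmp" fails the 8-digit check and "20251399"
# fails datetime.strptime validation, so both are skipped.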


def list_files_with_metadata_optimized(
        directory: str,
        max_workers: int = (os.cpu_count() or 4) * 2
) -> Generator[Tuple[str, float, int], None, None]:
    """
    优化的文件列表生成器,支持大目录(500万+文件)

    参数:
        directory: 要扫描的目录路径
        max_workers: 并行工作线程数,默认CPU核心数*2

    返回:
        生成器,每次产生 (文件路径, 修改时间, 文件大小) 的元组
    """
    file_paths = []
    for root, _, files in os.walk(directory):
        for file in files:
            file_paths.append(os.path.join(root, file))

    def _process_file(file_path):
        try:
            stat = os.stat(file_path, follow_symlinks=False)
            return (file_path, stat.st_mtime, stat.st_size)
        except OSError:  # PermissionError is a subclass of OSError
            return None

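    # Note: executor.map submits every path up front and yields results in
    # input order, so failed stat calls (None) are filtered out as they arrive.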
    with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as executor:
        for result in executor.map(_process_file, file_paths):
            if result is not None:
                yield result


def batch_query_filemodels(file_paths: List[str], batch_size: int = 2000) -> Dict[str, FileModel]:
    """
    分批次批量查询FileModel记录
    """
    filemodel_map = {}
    total_files = len(file_paths)
    total_batches = (total_files + batch_size - 1) // batch_size
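    # Ceiling division: e.g. 5001 paths with batch_size=2000 gives 3 batches
    # of 2000, 2000 and 1 paths.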

    for batch_index in range(total_batches):
        start_index = batch_index * batch_size
        end_index = min(start_index + batch_size, total_files)
        batch_paths = file_paths[start_index:end_index]

        batch_filemodels = FileModel.objects.filter(file__in=batch_paths).only('file', 'objectid')
        for fm in batch_filemodels:
            filemodel_map[fm.file] = fm

    return filemodel_map


def batch_query_versioned_items(uuids: List[str], batch_size: int = 2000) -> Dict[str, VersionedItem]:
    """
    分批次批量查询VersionedItem记录
    """
    vi_map = {}
    total_uuids = len(uuids)
    total_batches = (total_uuids + batch_size - 1) // batch_size

    for batch_index in range(total_batches):
        start_index = batch_index * batch_size
        end_index = min(start_index + batch_size, total_uuids)
        batch_uuids = uuids[start_index:end_index]

        batch_versioned_items = VersionedItem.objects.filter(file_uuid__in=batch_uuids).only('file_uuid')
        for vi in batch_versioned_items:
            vi_map[vi.file_uuid] = vi

    return vi_map


def process_daily_directory(
    day_dir: str,
    global_stats: Dict,
    day_start_time: float
) -> None:
    """
    处理单个日目录的所有文件
    
    参数:
        day_dir: 日目录路径
        global_stats: 全局统计字典
        day_start_time: 处理开始时间
    """
    day_name = os.path.basename(day_dir)
    print(f"\nProcessing day directory: {day_name}")
    
    # Initialize per-day statistics
    daily_stats = {
        'total_files': 0,
        'storage_model_missing': {'count': 0, 'size': 0},
        'version_item_missing': {'count': 0, 'size': 0}
    }
    
    # Collect all files in this daily directory
    file_infos = list(list_files_with_metadata_optimized(day_dir))
    daily_stats['total_files'] = len(file_infos)
    
    if not file_infos:
        print(f"No files found in {day_name}, skipping...")
        return
    
    # Gather all file paths
    file_paths = [info[0] for info in file_infos]
    
    # Query FileModel records
    print(f"Querying FileModel records for {day_name} ({len(file_paths)} files)...")
    filemodel_map = batch_query_filemodels(file_paths)
    
    # Build the list of UUIDs whose VersionedItem records need checking
    uuids_to_check = [fm.objectid for fm in filemodel_map.values()]
    
    # Query VersionedItem records
    print(f"Querying VersionedItem records for {day_name} ({len(uuids_to_check)} UUIDs)...")
    vi_map = batch_query_versioned_items(uuids_to_check)
    
    # Process files and accumulate statistics (mtime is unused here)
    for file_path, _mtime, size in file_infos:
        # Check for a FileModel record
        storage_model = filemodel_map.get(file_path)
        if not storage_model:
            daily_stats['storage_model_missing']['count'] += 1
            daily_stats['storage_model_missing']['size'] += size
            continue
        
        # Check for a VersionedItem record
        if storage_model.objectid not in vi_map:
            daily_stats['version_item_missing']['count'] += 1
            daily_stats['version_item_missing']['size'] += size
    
    # Update global statistics
    global_stats['total_files_processed'] += daily_stats['total_files']
    year_month = day_name[:6]  # the YYYYMM portion of the directory name
    for stat_type in ['storage_model_missing', 'version_item_missing']:
        if year_month not in global_stats[stat_type]:
            global_stats[stat_type][year_month] = {'count': 0, 'size': 0}
        
        global_stats[stat_type][year_month]['count'] += daily_stats[stat_type]['count']
        global_stats[stat_type][year_month]['size'] += daily_stats[stat_type]['size']
    
    # Print the per-day processing log
    elapsed = time.time() - day_start_time
    print(f"Completed processing {day_name}:")
    print(f"  Total files: {daily_stats['total_files']}")
    print(f"  StorageModel missing: {daily_stats['storage_model_missing']['count']} files, "
          f"{humanize.naturalsize(daily_stats['storage_model_missing']['size'])}")
    print(f"  VersionedItem missing: {daily_stats['version_item_missing']['count']} files, "
          f"{humanize.naturalsize(daily_stats['version_item_missing']['size'])}")
    print(f"  Time taken: {humanize.naturaldelta(elapsed)}\n")


def check_missing_records_by_day(base_dir: str) -> Dict[str, Dict]:
    """
    按天检查并统计丢失的记录
    
    参数:
        base_dir: 基础目录路径
        
    返回:
        包含统计结果的字典
    """
    start_time = time.time()
    
    # Initialize global statistics
    global_stats = {
        'storage_model_missing': {},
        'version_item_missing': {},
        'total_time': 0,
        'total_files_processed': 0
    }
    
    # Find all daily directories, sorted by date
    daily_dirs = list_daily_directories(base_dir)
    if not daily_dirs:
        print("No valid daily directories found!")
        return global_stats
    
    print(f"Found {len(daily_dirs)} daily directories to process")
    
    # Process daily directories one at a time
    for day_dir in daily_dirs:
        day_start_time = time.time()
        process_daily_directory(day_dir, global_stats, day_start_time)
    
    # Record the total processing time (file counts were accumulated per day)
    global_stats['total_time'] = time.time() - start_time
    
    return global_stats


def print_missing_statistics(result: Dict):
    """打印缺失数据统计信息"""
    print("\n=== Final Statistics ===")
    print("=== StorageModel Missing Records ===")
    for ym, data in sorted(result['storage_model_missing'].items()):
        print(f"{ym}: {data['count']} files, {humanize.naturalsize(data['size'])}")

    total_count = sum(data['count'] for data in result['storage_model_missing'].values())
    total_size = sum(data['size'] for data in result['storage_model_missing'].values())
    print(f"TOTAL: {humanize.intcomma(total_count)} files, {humanize.naturalsize(total_size)}")

    print("\n=== VersionedItem Missing Records ===")
    for ym, data in sorted(result['version_item_missing'].items()):
        print(f"{ym}: {data['count']} files, {humanize.naturalsize(data['size'])}")

    total_count = sum(data['count'] for data in result['version_item_missing'].values())
    total_size = sum(data['size'] for data in result['version_item_missing'].values())
    print(f"TOTAL: {humanize.intcomma(total_count)} files, {humanize.naturalsize(total_size)}")

    print(f"\nTotal processing time: {humanize.naturaldelta(result['total_time'])}")


if __name__ == "__main__":
    base_directory = '/var/www/OceanXECM/data/DOCS/Site-1303174917'
    print("Starting optimized missing records check by day...")
    result = check_missing_records_by_day(base_directory)
    print_missing_statistics(result)
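
For reference, each day prints a short progress summary like the following (the numbers are illustrative, but the format comes straight from the print statements above):

Processing day directory: 20250328
Querying FileModel records for 20250328 (41872 files)...
Querying VersionedItem records for 20250328 (41860 UUIDs)...
Completed processing 20250328:
  Total files: 41872
  StorageModel missing: 12 files, 734.0 MB
  VersionedItem missing: 3 files, 52.4 MB
  Time taken: 3 minutes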

Key optimizations:

  1. Per-day directory processing: a new list_daily_directories function identifies directories named in date format (such as 20250328), sorts them, and processes them from oldest to newest.

  2. Independent daily processing: a new process_daily_directory function handles all files in one daily directory, which involves:

    • collecting all files under that day's directory
    • querying the related database records
    • tallying that day's missing records
    • updating the global statistics
    • printing a detailed processing log

  3. Improved statistics structure: statistics are now grouped by year and month (YYYYMM format), which is clearer and more intuitive; see the sketch after this list.

  4. Real-time progress feedback: as soon as each daily directory is finished, its results are printed immediately, including the file count, the number of missing records, and the time taken.

  5. Memory optimization: because processing now happens day by day, there is no longer any need to load metadata for every file into memory at once.
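
To make point 3 concrete, here is a minimal sketch of the dict returned by check_missing_records_by_day after a run (the numbers are made up for illustration):

# Hypothetical result after processing a few days from March 2025:
result = {
    'storage_model_missing': {
        '202503': {'count': 12, 'size': 734003200},   # 12 files, ~734 MB
    },
    'version_item_missing': {
        '202503': {'count': 3, 'size': 52428800},     # 3 files, ~52 MB
    },
    'total_time': 184.6,             # seconds since the run started
    'total_files_processed': 41872,
}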

This optimized version should handle large numbers of files better and provide clearer progress feedback. Because each daily directory is processed independently, even if something fails partway through, you still know which dates have already been processed; a small checkpoint sketch for skipping those dates on restart follows below.
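
If you want to turn that into actual skip-on-restart behavior, a minimal sketch follows; the checkpoint path and both helpers are hypothetical additions of mine, not part of the code above:

CHECKPOINT = '/var/www/OceanXECM/processed_days.txt'  # hypothetical location

def load_processed_days() -> set:
    # Day names (YYYYMMDD) completed in a previous run, one per line.
    try:
        with open(CHECKPOINT) as f:
            return {line.strip() for line in f if line.strip()}
    except FileNotFoundError:
        return set()

def mark_day_processed(day_name: str) -> None:
    # Record a day as done once its directory has been fully processed.
    with open(CHECKPOINT, 'a') as f:
        f.write(day_name + '\n')

Inside check_missing_records_by_day you would then skip any day_dir whose os.path.basename is already in load_processed_days(), and call mark_day_processed(os.path.basename(day_dir)) after process_daily_directory returns.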
