Based on your requirements, I have reworked the code to walk the directory tree day by day, handling each day's folder in turn. Here is the optimized version:
```python
import sys
import os
import time
from datetime import datetime
from typing import Dict, List, Tuple, Generator, Optional
import humanize
import concurrent.futures
from django.db.models import Q

# Set up the Django environment
os.chdir('/var/www/OceanXECM')
sys.path.append('/var/www/OceanXECM')
import django_setup
from filestorage.models import FileModel
from doc.version_util import VersionedItem

def list_daily_directories(base_dir: str) -> List[str]:
    """
    List all date-named (YYYYMMDD) subdirectories, sorted from oldest to newest.

    Args:
        base_dir: base directory path

    Returns:
        List of directory paths sorted by date.
    """
    dirs = []
    for name in os.listdir(base_dir):
        full_path = os.path.join(base_dir, name)
        if os.path.isdir(full_path) and name.isdigit() and len(name) == 8:
            try:
                # Make sure the name is a valid date
                datetime.strptime(name, "%Y%m%d")
                dirs.append((name, full_path))
            except ValueError:
                continue
    # Sort by date (YYYYMMDD names sort correctly as strings)
    dirs.sort(key=lambda x: x[0])
    return [path for _, path in dirs]

def list_files_with_metadata_optimized(
    directory: str,
    max_workers: int = (os.cpu_count() or 4) * 2  # cpu_count() may return None
) -> Generator[Tuple[str, float, int], None, None]:
    """
    Optimized file listing generator that handles very large directories (5M+ files).

    Args:
        directory: directory to scan
        max_workers: number of worker threads, defaults to twice the CPU core count

    Yields:
        Tuples of (file path, modification time, file size).
    """
    file_paths = []
    for root, _, files in os.walk(directory):
        for file in files:
            file_paths.append(os.path.join(root, file))

    def _process_file(file_path):
        try:
            stat = os.stat(file_path, follow_symlinks=False)
            return (file_path, stat.st_mtime, stat.st_size)
        except OSError:
            return None

    with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as executor:
        for result in executor.map(_process_file, file_paths):
            if result is not None:
                yield result

def batch_query_filemodels(file_paths: List[str], batch_size: int = 2000) -> Dict[str, FileModel]:
    """
    Query FileModel records in batches, keyed by file path.
    """
    filemodel_map = {}
    total_files = len(file_paths)
    total_batches = (total_files + batch_size - 1) // batch_size
    for batch_index in range(total_batches):
        start_index = batch_index * batch_size
        end_index = min(start_index + batch_size, total_files)
        batch_paths = file_paths[start_index:end_index]
        batch_filemodels = FileModel.objects.filter(file__in=batch_paths).only('file', 'objectid')
        for fm in batch_filemodels:
            filemodel_map[fm.file] = fm
    return filemodel_map


def batch_query_versioned_items(uuids: List[str], batch_size: int = 2000) -> Dict[str, VersionedItem]:
    """
    Query VersionedItem records in batches, keyed by file UUID.
    """
    vi_map = {}
    total_uuids = len(uuids)
    total_batches = (total_uuids + batch_size - 1) // batch_size
    for batch_index in range(total_batches):
        start_index = batch_index * batch_size
        end_index = min(start_index + batch_size, total_uuids)
        batch_uuids = uuids[start_index:end_index]
        batch_versioned_items = VersionedItem.objects.filter(file_uuid__in=batch_uuids).only('file_uuid')
        for vi in batch_versioned_items:
            vi_map[vi.file_uuid] = vi
    return vi_map

def process_daily_directory(
    day_dir: str,
    global_stats: Dict,
    day_start_time: float
) -> None:
    """
    Process all files under a single daily directory.

    Args:
        day_dir: daily directory path
        global_stats: global statistics dict
        day_start_time: time at which processing of this day started
    """
    day_name = os.path.basename(day_dir)
    print(f"\nProcessing day directory: {day_name}")

    # Initialize per-day statistics
    daily_stats = {
        'total_files': 0,
        'storage_model_missing': {'count': 0, 'size': 0},
        'version_item_missing': {'count': 0, 'size': 0}
    }

    # Collect every file under this day's directory
    file_infos = list(list_files_with_metadata_optimized(day_dir))
    daily_stats['total_files'] = len(file_infos)
    if not file_infos:
        print(f"No files found in {day_name}, skipping...")
        return

    # All file paths for this day
    file_paths = [info[0] for info in file_infos]

    # Query FileModel records
    print(f"Querying FileModel records for {day_name} ({len(file_paths)} files)...")
    filemodel_map = batch_query_filemodels(file_paths)

    # UUIDs whose VersionedItem records need to be checked
    uuids_to_check = [fm.objectid for fm in filemodel_map.values()]

    # Query VersionedItem records
    print(f"Querying VersionedItem records for {day_name} ({len(uuids_to_check)} UUIDs)...")
    vi_map = batch_query_versioned_items(uuids_to_check)

    # Walk the files and accumulate statistics
    for file_path, mtime, size in file_infos:
        # Check the FileModel record
        storage_model = filemodel_map.get(file_path)
        if not storage_model:
            daily_stats['storage_model_missing']['count'] += 1
            daily_stats['storage_model_missing']['size'] += size
            continue
        # Check the VersionedItem record
        if storage_model.objectid not in vi_map:
            daily_stats['version_item_missing']['count'] += 1
            daily_stats['version_item_missing']['size'] += size

    # Fold the day's numbers into the global statistics
    year_month = day_name[:6]  # the YYYYMM part
    for stat_type in ['storage_model_missing', 'version_item_missing']:
        if year_month not in global_stats[stat_type]:
            global_stats[stat_type][year_month] = {'count': 0, 'size': 0}
        global_stats[stat_type][year_month]['count'] += daily_stats[stat_type]['count']
        global_stats[stat_type][year_month]['size'] += daily_stats[stat_type]['size']
    global_stats['total_files_processed'] += daily_stats['total_files']

    # Per-day progress log
    elapsed = time.time() - day_start_time
    print(f"Completed processing {day_name}:")
    print(f" Total files: {daily_stats['total_files']}")
    print(f" StorageModel missing: {daily_stats['storage_model_missing']['count']} files, "
          f"{humanize.naturalsize(daily_stats['storage_model_missing']['size'])}")
    print(f" VersionedItem missing: {daily_stats['version_item_missing']['count']} files, "
          f"{humanize.naturalsize(daily_stats['version_item_missing']['size'])}")
    print(f" Time taken: {humanize.naturaldelta(elapsed)}\n")

def check_missing_records_by_day(base_dir: str) -> Dict[str, Dict]:
    """
    Check for missing records day by day and collect statistics.

    Args:
        base_dir: base directory path

    Returns:
        Dict containing the aggregated statistics.
    """
    start_time = time.time()

    # Initialize the global statistics
    global_stats = {
        'storage_model_missing': {},
        'version_item_missing': {},
        'total_time': 0,
        'total_files_processed': 0
    }

    # Collect all daily directories, sorted by date
    daily_dirs = list_daily_directories(base_dir)
    if not daily_dirs:
        print("No valid daily directories found!")
        return global_stats

    print(f"Found {len(daily_dirs)} daily directories to process")

    # Process the daily directories one by one
    for day_dir in daily_dirs:
        day_start_time = time.time()
        process_daily_directory(day_dir, global_stats, day_start_time)

    # Record the total processing time (file counts are accumulated per day)
    global_stats['total_time'] = time.time() - start_time
    return global_stats

def print_missing_statistics(result: Dict):
    """Print the missing-record statistics."""
    print("\n=== Final Statistics ===")

    print("=== StorageModel Missing Records ===")
    for ym, data in sorted(result['storage_model_missing'].items()):
        print(f"{ym}: {data['count']} files, {humanize.naturalsize(data['size'])}")
    total_count = sum(data['count'] for data in result['storage_model_missing'].values())
    total_size = sum(data['size'] for data in result['storage_model_missing'].values())
    print(f"TOTAL: {humanize.intcomma(total_count)} files, {humanize.naturalsize(total_size)}")

    print("\n=== VersionedItem Missing Records ===")
    for ym, data in sorted(result['version_item_missing'].items()):
        print(f"{ym}: {data['count']} files, {humanize.naturalsize(data['size'])}")
    total_count = sum(data['count'] for data in result['version_item_missing'].values())
    total_size = sum(data['size'] for data in result['version_item_missing'].values())
    print(f"TOTAL: {humanize.intcomma(total_count)} files, {humanize.naturalsize(total_size)}")

    print(f"\nTotal files processed: {humanize.intcomma(result['total_files_processed'])}")
    print(f"Total processing time: {humanize.naturaldelta(result['total_time'])}")

if __name__ == "__main__":
    base_directory = '/var/www/OceanXECM/data/DOCS/Site-1303174917'
    print("Starting optimized missing records check by day...")
    result = check_missing_records_by_day(base_directory)
    print_missing_statistics(result)
```

Key optimizations:

- **Per-day directory processing:** the new `list_daily_directories` function finds and validates date-named directories (such as `20250328`) and sorts them so that days are processed from oldest to newest.
- **Independent handling of each day:** the new `process_daily_directory` function deals with one daily directory at a time:
  - collecting all files under that day's directory
  - querying the related database records
  - counting that day's missing records
  - updating the global statistics
  - printing a detailed processing log
- **Improved statistics structure:** results are now grouped by year-month (`YYYYMM`), which is clearer and easier to read; see the example dict after this list.
- **Real-time progress feedback:** as soon as a daily directory finishes, its results are printed immediately: file count, missing-record counts, and elapsed time.
- **Memory optimization:** because processing happens one day at a time, the script no longer needs to load metadata for every file into memory at once.
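
To make the statistics structure concrete, this is the shape the aggregated `global_stats` dict takes after a run; the month keys, counts, and sizes below are made-up illustrative values, not real output:

```python
# Illustrative only: the months, counts, and sizes are invented to show the structure.
global_stats = {
    'storage_model_missing': {             # grouped by YYYYMM
        '202503': {'count': 12, 'size': 4_718_592},
        '202504': {'count': 3, 'size': 102_400},
    },
    'version_item_missing': {
        '202503': {'count': 1, 'size': 8_192},
    },
    'total_time': 137.4,                   # seconds, set once all days are done
    'total_files_processed': 48_210,
}
```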
This optimized version should cope much better with large numbers of files and gives clearer progress feedback. Because each daily directory is processed independently, you always know which dates have already been handled even if the run is interrupted partway; a minimal checkpointing sketch along those lines follows below.
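
Building on that, a natural extension (not part of the code above) is to record each finished day in a checkpoint file so that a rerun can skip days that were already handled. This is only a sketch; the `CHECKPOINT` path, `load_processed_days`, and `mark_day_processed` are hypothetical names assuming a plain-text checkpoint file is acceptable:

```python
import os

CHECKPOINT = '/var/www/OceanXECM/processed_days.txt'  # hypothetical checkpoint path

def load_processed_days() -> set:
    """Return the set of day names (YYYYMMDD) that have already been processed."""
    if not os.path.exists(CHECKPOINT):
        return set()
    with open(CHECKPOINT) as f:
        return {line.strip() for line in f if line.strip()}

def mark_day_processed(day_name: str) -> None:
    """Append a finished day to the checkpoint file."""
    with open(CHECKPOINT, 'a') as f:
        f.write(day_name + '\n')

# Inside check_missing_records_by_day one could then skip finished days:
#     done = load_processed_days()
#     for day_dir in daily_dirs:
#         if os.path.basename(day_dir) in done:
#             continue
#         process_daily_directory(day_dir, global_stats, time.time())
#         mark_day_processed(os.path.basename(day_dir))
```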