多级文件夹数据统计

最新推荐文章于 2023-08-31 09:02:58 发布

weixin_44343512

最新推荐文章于 2023-08-31 09:02:58 发布

阅读量1k

点赞数

文章标签：数据库 python batch命令

本文链接：https://blog.csdn.net/weixin_44343512/article/details/132383912

版权

写在前面：最近论文要用到的数据很多，涉及7对并置站从2010年到2023年的全部数据
今天鼓捣了一整天，想用快速方便的程序来整理整理，仅供参考
部分路径需要修改，取走用的时候注意看

首先不能随便动我的原始数据，dddd，好不容易下载完的
需求一：创建跟我的文件夹结构一致的新文件夹系列
这个很简单，用bat文件就好
分别创建了2010年到2023年，子文件夹分别包括7组站点，站点文件夹内部分别新建了示例文件，以便后续处理

@echo off
setlocal enabledelayedexpansion

rem Build an empty mirror of the data tree below the current directory:
rem one year_YYYY folder for 2010 to 2023, each holding the seven station
rem pair folders, each of those holding four empty placeholder files.
rem All paths are written relative to the starting directory and the
rem working directory is never changed, so a failed mkdir cannot shift
rem later iterations into the wrong folder. Existing folders are reused,
rem which makes the script safe to run again.
for /L %%Y in (2010,1,2023) do (
    rem Keep the last two digits of the year for the file extension
    set "y=%%Y"
    set "y=!y:~-2!"

    echo y value: !y!

    if not exist "year_%%Y\" mkdir "year_%%Y"

    for %%F in (BAKO_CIBG CHPI_CHPG KOKB_KOKV PTAG_PTGG YAR2_YARR YEL2_YELL THTI_THTG) do (
        echo Creating files in year_%%Y\%%F...
        if not exist "year_%%Y\%%F\" mkdir "year_%%Y\%%F"

        rem First station code is the first four characters of the folder name
        set "prefix=%%F"
        set "prefix=!prefix:~0,4!"

        rem Second station code is the last four characters of the folder name
        set "suffix=%%F"
        set "suffix=!suffix:~-4!"

        rem Empty placeholders for day 001 and day 365 of both stations
        copy nul "year_%%Y\%%F\!prefix!0010.!y!o" > nul
        copy nul "year_%%Y\%%F\!prefix!3650.!y!o" > nul
        copy nul "year_%%Y\%%F\!suffix!0010.!y!o" > nul
        copy nul "year_%%Y\%%F\!suffix!3650.!y!o" > nul
    )
)
echo Files created successfully!
pause

结果如图：
在这里插入图片描述

需求二：将下载完的数据分类整理
因为我之前弄过一部分，所以数据有点乱，这一段是我个人的数据调整，估计适用性不高，可以直接跳过。
主要目的是将站点文件夹内的下载数据再统一收到RINEX子文件夹中，方便下一步存入其他类型数据。

# 将所有文件归类存入RINEX文件夹中
import os
import shutil


def _merge_directory(source_dir, target_dir):
    """Move every entry of source_dir into target_dir without overwriting.

    Entries whose name already exists in target_dir are left where they are,
    and source_dir is removed only once it is empty, so nothing is discarded.
    """
    for name in os.listdir(source_dir):
        destination = os.path.join(target_dir, name)
        if not os.path.exists(destination):
            shutil.move(os.path.join(source_dir, name), destination)
    if not os.listdir(source_dir):
        os.rmdir(source_dir)


def process_folder(folder_path):
    """Collect a station folder's observation files into its ``RINEX`` sub-folder.

    Steps performed on ``folder_path``:

    1. A ``useful`` sub-folder becomes ``RINEX``. If ``RINEX`` already exists,
       the contents of ``useful`` are merged into it instead of renaming
       (a plain rename would fail in that situation).
    2. Otherwise ``RINEX`` is created if it is missing.
    3. Every entry whose name ends with ``<yy>o`` is moved into ``RINEX``,
       where ``<yy>`` is the last two characters of the parent folder's name
       (e.g. ``year_2010`` gives ``10o``).
    4. Every ``.txt`` file directly inside ``folder_path`` is deleted.

    Args:
        folder_path: Path of one station folder located inside a year folder.
    """
    # Drop a trailing separator so dirname() really yields the parent folder
    folder_path = os.path.normpath(folder_path)
    useful_folder_path = os.path.join(folder_path, 'useful')
    rinex_folder_path = os.path.join(folder_path, 'RINEX')

    if os.path.exists(useful_folder_path):
        if os.path.isdir(useful_folder_path) and os.path.isdir(rinex_folder_path):
            _merge_directory(useful_folder_path, rinex_folder_path)
        else:
            os.rename(useful_folder_path, rinex_folder_path)
    elif not os.path.exists(rinex_folder_path):
        os.mkdir(rinex_folder_path)

    # The parent (year) folder name supplies the two-digit year of the suffix
    parent_folder_name = os.path.basename(os.path.dirname(folder_path))
    desired_suffix = parent_folder_name[-2:] + 'o'

    # os.listdir returns a snapshot, so moving and deleting while looping is safe
    for name in os.listdir(folder_path):
        entry_path = os.path.join(folder_path, name)
        if name.endswith(desired_suffix):
            shutil.move(entry_path, os.path.join(rinex_folder_path, name))
        elif name.endswith('.txt') and os.path.isfile(entry_path):
            os.remove(entry_path)


def main(base_path="F:\\test"):
    """Run ``process_folder`` on every station folder found under ``base_path``.

    The tree is expected to contain ``year_2010`` .. ``year_2023`` folders,
    each with some or all of the seven station-pair folders. Combinations that
    do not exist as directories are skipped.

    Args:
        base_path: Directory containing the year folders. Defaults to the
            location used by the original script; pass ``os.getcwd()`` to work
            on the directory the script is started from.
    """
    station_pairs = ['BAKO_CIBG', 'CHPI_CHPG', 'KOKB_KOKV', 'PTAG_PTGG',
                     'YAR2_YARR', 'YEL2_YELL', 'THTI_THTG']

    for year in range(2010, 2024):
        year_folder = os.path.join(base_path, f'year_{year}')
        for subfolder in station_pairs:
            subfolder_path = os.path.join(year_folder, subfolder)
            # isdir (not exists) so a stray file with a station name is ignored
            if os.path.isdir(subfolder_path):
                process_folder(subfolder_path)


if __name__ == "__main__":
    main()

需求三：正题开始，统计子文件夹文件个数，比较两类文件日期相同的个数，结果输出至Excel表保存
直接上代码和结果

# 文件夹内各文件的数量统计

import os
from openpyxl import Workbook

# Root directory that holds one "year_YYYY" folder per year
base_directory = 'H:\\GPS_S4c\\data'

# Years to scan and the station-pair folders expected inside each year folder
years = [*range(2010, 2024)]
sites = [
    'YEL2_YELL', 'KOKB_KOKV', 'PTAG_PTGG',
    'BAKO_CIBG', 'THTI_THTG', 'CHPI_CHPG', 'YAR2_YARR',
]

# Name of the sub-folder, inside each station-pair folder, that is scanned
rinex_folder = 'RINEX'


def year_str(year):
    """Return the folder name used for *year*, e.g. ``year_2010``."""
    return "year_{}".format(year)

# Create the workbook; all rows go to its active sheet
wb = Workbook()
ws = wb.active

# Header row: corner label followed by one column per year
ws.append(['Site\\Year'] + [str(year) for year in years])

for site in sites:
    # A pair folder is named AAAA_BBBB; file names start with the lower-case code
    first_station, second_station = site[:4], site[5:]
    prefixes = {'start': first_station.lower(), 'end': second_station.lower()}

    # files_count[year][role] lists the file names of that station for that year;
    # a year whose folder does not exist contributes empty lists
    files_count = {}
    for year in years:
        site_folder = os.path.join(base_directory, year_str(year), site, rinex_folder)
        names = os.listdir(site_folder) if os.path.exists(site_folder) else []
        files_count[year] = {
            role: [name for name in names if name.startswith(prefix)]
            for role, prefix in prefixes.items()
        }

    # Three-character day field (positions 4-6 of the file name) per station and year
    start_chars = {year: [name[4:7] for name in files_count[year]['start']] for year in years}
    end_chars = {year: [name[4:7] for name in files_count[year]['end']] for year in years}

    # One row of file counts for each of the two stations
    ws.append([first_station] + [len(files_count[year]['start']) for year in years])
    ws.append([second_station] + [len(files_count[year]['end']) for year in years])

    # Number of distinct day fields present for both stations in the same year
    consistent_count = {
        year: len(set(start_chars[year]) & set(end_chars[year])) for year in years
    }
    ws.append(['Consistent Count'] + [consistent_count[year] for year in years])

    # Blank separator row between station pairs
    ws.append([])

# Write the workbook next to the year folders
excel_file_path = os.path.join(base_directory, 'summary.xlsx')
wb.save(excel_file_path)

需求四：最后将不匹配的日期单独摘出来放txt文件，后续需要二次下载筛选
我沿用了需求三的代码，有一些重复，但我懒得改了

# 缺失文件数量及具体日期统计，输出txt文件
import calendar
import os

from openpyxl import Workbook

# Root directory that holds one "year_YYYY" folder per year
base_directory = 'H:\\GPS_S4c\\data'

# Years to scan and the station-pair folders expected inside each year folder
years = list(range(2010, 2024))
sites = ['YEL2_YELL', 'KOKB_KOKV', 'PTAG_PTGG',
         'BAKO_CIBG', 'THTI_THTG', 'CHPI_CHPG', 'YAR2_YARR']

# Name of the sub-folder, inside each station-pair folder, that is scanned
rinex_folder = 'RINEX'


def year_str(year):
    """Return the folder name used for *year*, e.g. ``year_2010``."""
    return f"year_{year}"


def expected_days(year):
    """Return every day-of-year of *year* as a zero-padded 3-character string.

    Leap years yield ``001`` .. ``366``; all other years ``001`` .. ``365``.
    A fixed 365-day set would never report day 366 as missing.
    """
    last_day = 366 if calendar.isleap(year) else 365
    return {str(day).zfill(3) for day in range(1, last_day + 1)}


def observed_days(file_names, prefix):
    """Return the day field (characters 4-6) of each name starting with *prefix*."""
    return [name[4:7] for name in file_names if name.startswith(prefix)]


def write_missing_report(path, missing_by_year):
    """Write one ``<year> <count> <sorted day list>`` line per year to *path*.

    Args:
        path: Output text file; it is overwritten if it exists.
        missing_by_year: Mapping of year to the set of missing day strings,
            written in the mapping's iteration order.
    """
    with open(path, 'w') as f:
        for year, missing in missing_by_year.items():
            f.write(f"{year} {len(missing)} {sorted(missing)}\n")


# Create the "Missing" output folder next to the year folders
missing_folder = os.path.join(base_directory, 'Missing')
os.makedirs(missing_folder, exist_ok=True)

for site in sites:
    # A pair folder is named AAAA_BBBB; file names start with the lower-case code
    stations = (site[:4], site[5:])
    missing = {station: {} for station in stations}

    for year in years:
        site_folder = os.path.join(base_directory, year_str(year), site, rinex_folder)
        # A year without a folder counts as having no files at all
        file_names = os.listdir(site_folder) if os.path.exists(site_folder) else []
        for station in stations:
            present = observed_days(file_names, station.lower())
            missing[station][year] = expected_days(year).difference(present)

    # One report per station: <STATION>_missing.txt
    for station in stations:
        write_missing_report(
            os.path.join(missing_folder, f"{station}_missing.txt"),
            missing[station],
        )

在这里插入图片描述

上传也给自己保存一下，辛苦一天总算还是有结果的，要是能帮助到一些人，那就再好不过了！
补：后面的代码都是Python写的

weixin_44343512

关注

0
点赞
踩
0

收藏

觉得还不错? 一键收藏
0
评论
多级文件夹数据统计

分别创建了2010年到2023年，子文件夹分别包括7组站点，站点文件夹内部分别新建了示例文件，以便后续处理。因为我之前弄过一部分，所以数据有点乱，这一段是我个人的数据调整，估计适用型不高，可以直接跳过。需求三：正题开始，统计子文件夹文件个数，比较两类文件日期相同的个数，结果输出至Excel表保存。上传也给自己保存一下，辛苦一天总算还是有结果的，要是能帮助到一些人，那就再好不过了！需求四：最后将不匹配的日期单独摘出来放txt文件，后续需要二次下载筛选。我沿用了三的代码，有一点重复，但我懒得改了。
复制链接

扫一扫