生成文件路径表供文件关联分析读取

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import networkx 
import re
import json
import os
from tqdm import trange, tqdm
from time import sleep
from subprocess import getoutput

def filesize_float(filepath):
    '''
    Return the size of the file at ``filepath`` as a float number of MB.

    The byte count reported by ``os.path.getsize`` is divided by 1024 ** 2.
    '''
    size_in_bytes = os.path.getsize(filepath)
    bytes_per_mb = float(1024 ** 2)
    return size_in_bytes / bytes_per_mb

def filesize_to_MB(filepath):
    '''
    Return the size of the file at ``filepath`` as a string in MB.

    The value is the byte count divided by 1024 ** 2, rendered with ten
    decimal places and followed by " MB".
    '''
    megabytes = os.path.getsize(filepath) / float(1024 ** 2)
    return "%.10f MB" % megabytes

def filesize_to_GB(filepath):
    '''
    Return the size of the file at ``filepath`` as a string in GB.

    The value is the byte count divided by 1024 ** 3, rendered with ten
    decimal places and followed by " GB".
    '''
    gigabytes = os.path.getsize(filepath) / float(1024 ** 3)
    return "%.10f GB" % gigabytes

def filesize_to_TB(filepath):
    '''
    Return the size of the file at ``filepath`` as a string in TB.

    The value is the byte count divided by 1024 ** 4, rendered with ten
    decimal places and followed by " TB".
    '''
    terabytes = os.path.getsize(filepath) / float(1024 ** 4)
    return "%.10f TB" % terabytes

def getfilename(filepath):
    '''
    Return the second-to-last field of ``filepath`` after splitting it on
    any of the characters "/", "|" or ",".

    For a record shaped like "dir/name.ext,size" that field is "name.ext";
    for a bare "dir/name.ext" it is "dir". Raises ``IndexError`` when the
    split produces fewer than two fields.
    '''
    fields = re.split("[/|,]", filepath)
    return fields[-2]

def get_file_type_key(filename,types=['.py','txt','log','DS_Store','dp']):
    '''
    Guess a file-type key from ``filename``.

    The name returned by ``getfilename(filename)`` is examined: if it
    contains at least one ".", the text after the last "." is returned,
    otherwise the string "empty". The ``types`` argument is accepted but
    not used.
    '''
    _stem, dot, suffix = getfilename(filename).rpartition('.')
    if dot:
        return suffix
    return "empty"
    
def listfile_in_log_function(dirpath=os.getcwd(),save_path="filepaths.log"):
    '''
    Walk ``dirpath`` recursively and log every file path with its size.

    One line is written to ``save_path`` for every name that ``os.walk``
    reports as a file, in the form ``repr({"<path>,<size> MB"})``. A progress
    message is printed each time roughly another 1 GB of files has been
    accumulated.

    Parameters:
        dirpath: root directory to walk. The default is the working
            directory captured when this function was defined.
        save_path: log file to create (truncated if it already exists).

    Returns:
        A summary string "end <total> GB<total> MB" (also printed).
    '''
    print(getoutput("figlet Start searching for the path within the folder:"))
    total_mb = 0
    pending_gb = 0
    seen = 0
    # Size recorded for entries that are not regular files (e.g. broken
    # symlinks). Previously the size variable was unbound or stale here.
    zero_label = "%.10f MB" % 0.0
    # The context manager closes the log even if the walk raises.
    with open(save_path, "w+") as log:
        for dir_, _folders, files in tqdm(os.walk(dirpath)):
            for name in files:
                seen += 1
                path = os.path.join(dir_, name)
                if os.path.isfile(path):
                    size_mb = os.path.getsize(path) / float(1024 * 1024)
                    size_label = "%.10f MB" % size_mb
                    total_mb += size_mb
                    # Track growth since the last progress message directly
                    # instead of keeping a list of every running total.
                    pending_gb += size_mb / 1024
                    if pending_gb >= 1:
                        print("计算得出增加1GB","%.10f GB"%(pending_gb),"当前累计文件共计:","%.10f GB"%(total_mb/1024),"累计文件数量:","%.0f 个"%(seen))
                        pending_gb = 0
                else:
                    size_label = zero_label
                log.write(repr({path + "," + size_label}) + "\n")
    end = "{}{}{}".format("end ","%.10f GB"%(total_mb/1024),"%.10f MB"%(total_mb))
    print(getoutput("figlet Path storage end !"))
    print(end)
    return end

    
def _extract_logged_path(line):
    '''Return the path part of one "{'<path>,<size> MB'}" log line.'''
    import ast

    text = line.strip()
    try:
        # The log stores repr() of a one-element set, so literal_eval undoes
        # any quoting or escaping that repr applied to the path.
        record = ast.literal_eval(text)
    except (ValueError, SyntaxError):
        # Not a valid literal: fall back to peeling braces and quotes.
        record = text.strip("{}").strip("'\"")
    else:
        if isinstance(record, (set, frozenset, list, tuple)):
            record = next(iter(record), "")
    # The size suffix never contains a comma, so split on the LAST comma;
    # commas and spaces inside the path itself are preserved.
    return str(record).rsplit(",", 1)[0]


def loadfile_to_infotable(logpath="filepaths.log"
                          ,columnsnumber=40
                          ,save_path="filepath_table.csv"):
    '''
    Turn a path log into a fixed-width table and save it as CSV.

    Each line of ``logpath`` is expected to look like
    ``{'<path>,<size> MB'}``. For every line one row of ``columnsnumber``
    cells is produced, pre-filled with "0.0". When the path is an existing
    file the row holds, from the left, the path split on ``os.sep``
    followed by the "."-separated pieces of the file name except the last
    one; the last three cells hold the full path, the size string
    ("%.10f MB") and the last "."-separated piece of the file name. Lines
    whose path is not an existing file keep the all-"0.0" row.

    Parameters:
        logpath: log file to read.
        columnsnumber: number of cells per row.
        save_path: CSV file the table is written to.

    Returns:
        The ``pandas.DataFrame`` that was written.
    '''
    print(getoutput("figlet preprocess all file path data"))
    total_mb = 0
    pending_gb = 0
    lines_seen = 0
    files_seen = 0
    result = []
    # Cells left for path components once the three metadata cells at the
    # end are reserved; deeper paths are cut off instead of raising.
    component_limit = max(columnsnumber - 3, 0)
    with open(logpath, 'r') as log:
        for line in log:
            lines_seen += 1
            # dtype=object keeps strings at full length; a fixed-width
            # string array built from floats is <U32 and silently truncates.
            row = np.full(columnsnumber, "0.0", dtype=object)
            path = _extract_logged_path(line)
            is_file = os.path.isfile(path)
            if is_file:
                # Progress reporting.
                size_mb = os.path.getsize(path) / float(1024 ** 2)
                total_mb += size_mb
                pending_gb += size_mb / 1024
                if pending_gb >= 1:
                    print("计算得出增加1GB","%.10f GB"%(pending_gb),"当前累计文件共计:","%.10f GB"%(total_mb/1024),"累计文件数量:","%.0f 个"%(lines_seen))
                    pending_gb = 0
                # Table row.
                path_parts = path.split(os.sep)
                name_parts = path_parts[-1].split('.')
                components = path_parts + name_parts[:-1]
                for index, part in enumerate(components[:component_limit]):
                    row[index] = part
                row[columnsnumber-1] = name_parts[-1]
                row[columnsnumber-2] = "%.10f MB" % size_mb
                row[columnsnumber-3] = path
                # The file count is kept apart from the MB total so the
                # preview interval no longer corrupts the reported size.
                files_seen += 1
                if files_seen % 1000 == 0:
                    print(pd.DataFrame(row).T)
            result.append(row)
    table = pd.DataFrame(np.array(result))
    table.to_csv(save_path)
    print(getoutput("figlet preprocess is complete !"))
    return table

if __name__ == "__main__":
    # Step 1: write the path log. Step 2: build the table from it and show it.
    listfile_in_log_function()
    info_table = loadfile_to_infotable()
    print(info_table)
import os
import pandas as pd
import numpy as np
from tqdm import tqdm

def listfile_function(dirpath=os.getcwd(),n=1000):
    '''
    Walk ``dirpath`` recursively and build a table with one row per file.

    Every row has 5000 cells pre-filled with "0.0". For a regular file the
    row holds, from the left, the path split on ``os.sep`` followed by the
    "."-separated pieces of the file name except the last one; cell 4998
    holds the size string ("%.10f MB") and cell 4999 the last
    "."-separated piece of the file name. Names that are not regular files
    keep the all-"0.0" row. The table is written to "filepath_table.csv".

    Parameters:
        dirpath: root directory to walk. The default is the working
            directory captured when this function was defined.
        n: the current row is printed whenever the running file count is a
            multiple of ``n``. Previously ignored in favour of a hard-coded
            1000, which is still the default. A falsy ``n`` disables the
            preview.

    Returns:
        The ``pandas.DataFrame`` that was written.
    '''
    width = 5000
    counted = 0
    rows = []
    for dir_, _folders, files in tqdm(os.walk(dirpath)):
        for name in files:
            path = os.path.join(dir_, name)
            is_file = os.path.isfile(path)
            # A fresh row per file: sharing one array per directory made
            # every row of that directory alias the same data. dtype=object
            # avoids the 32-character truncation of a <U32 string array.
            row = np.full(width, "0.0", dtype=object)
            if is_file:
                path_parts = path.split(os.sep)
                name_parts = path_parts[-1].split('.')
                components = path_parts + name_parts[:-1]
                # Leave the two metadata cells at the end untouched.
                for index, part in enumerate(components[:width - 2]):
                    row[index] = part
                row[width - 1] = name_parts[-1]
                row[width - 2] = "%.10f MB"%(os.path.getsize(path)/float(1024 * 1024))
            rows.append(row)
            counted += is_file
            if n and counted % n == 0:
                print(pd.DataFrame(row).T)
    table = pd.DataFrame(np.array(rows))
    table.to_csv("filepath_table.csv")
    return table
            
# Module-level call with default arguments: runs as soon as this line executes.
listfile_function()

计算大小并持续写入(适用于大文件)

import os
import pandas as pd
import numpy as np
from tqdm import tqdm

def listfile_in_log_function(dirpath=os.getcwd()):
    '''
    Walk ``dirpath`` recursively and log every file path with its size.

    Each path is printed as it is visited, and one line of the form
    ``repr({"<path>,<size> MB"})`` is written to "filepaths.log" for every
    name that ``os.walk`` reports as a file. A progress message is printed
    each time roughly another 1 GB of files has been accumulated, and a
    final "end" line with the total in GB and MB is printed when the walk
    finishes.

    Parameters:
        dirpath: root directory to walk. The default is the working
            directory captured when this function was defined.
    '''
    total_mb = 0
    pending_gb = 0
    seen = 0
    # Size recorded for entries that are not regular files (e.g. broken
    # symlinks). Previously the size variable was unbound or stale here.
    zero_label = "%.10f MB" % 0.0
    # The context manager closes the log even if the walk raises.
    with open("filepaths.log", "w+") as log:
        for dir_, _folders, files in tqdm(os.walk(dirpath)):
            for name in files:
                seen += 1
                path = os.path.join(dir_, name)
                print(path)
                if os.path.isfile(path):
                    size_mb = os.path.getsize(path) / float(1024 * 1024)
                    size_label = "%.10f MB" % size_mb
                    total_mb += size_mb
                    # Track growth since the last progress message directly
                    # instead of keeping a list of every running total.
                    pending_gb += size_mb / 1024
                    if pending_gb >= 1:
                        print("计算得出增加1GB","%.10f GB"%(pending_gb),"当前累计文件共计:","%.10f GB"%(total_mb/1024),"累计文件数量:","%.0f 个"%(seen))
                        pending_gb = 0
                else:
                    size_label = zero_label
                log.write(repr({path + "," + size_label}) + "\n")
    print("end","%.10f GB"%(total_mb/1024),"%.10f MB"%(total_mb))
    
            
# Module-level call with default arguments: runs as soon as this line executes.
listfile_in_log_function()
  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值