读取及合并minio上csv文件，处理数据结构，保存至本地

最新推荐文章于 2024-04-23 11:13:38 发布

想要好好撸AI

最新推荐文章于 2024-04-23 11:13:38 发布

阅读量1.3k

点赞数

分类专栏：数据获取及处理　文章标签：python、pandas

本文链接：https://blog.csdn.net/weixin_37699342/article/details/128239827

版权

数据获取及处理专栏收录该内容

1 篇文章 0 订阅

订阅专栏

背景及目标

所有文件以 CSV 格式存储在 MinIO 上，需要实现以下功能：

- 在本地读取指定文件或所有文件
- 下载指定文件
- 合并所有文件为一个文件
- 将数据转换为 pandas.DataFrame 格式
- 将文件名中的日期作为新的一列加入到合并后的数据中
- 获取指定周期内的数据
- 以 CSV 格式将文件保存至本地

import os
import utils.minio_operation as minio_operation # 导入之前写好的minio操作库
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import datetime
import time
import schedule
from datetime import timedelta


def download_file(remote_file, local_file, bucket="(指定的bucket)"):
  """Download a MinIO file and save it locally.

  Thin wrapper that delegates to ``minio_operation.download_file``.

  Args:
    remote_file: Passed through as the helper's first argument.
    local_file: Passed through as the helper's second argument.
    bucket: Accepted but not forwarded to the helper.
      NOTE(review): the helper's signature is not visible from this file, so
      it is unclear whether it can take a bucket at all -- confirm.

  Returns:
    Whatever ``minio_operation.download_file`` returns.
  """
  outcome = minio_operation.download_file(remote_file, local_file)
  return outcome

def get_file_list(bucket="(指定的bucket)"):
  """Return the object names found under the CSV prefix of a MinIO bucket.

  The listing is recursive, and one line of details (bucket, name, last
  modified time, size) is printed for every object found.

  Args:
    bucket: Bucket to list. Previously this argument was ignored and a
      hard-coded bucket literal was always used; the default is that same
      literal, so calls without arguments behave as before.

  Returns:
    list[str]: Object names in the order the client yields them.
  """
  client = minio_operation.get_minio_client()
  objects = client.list_objects(bucket, prefix='(指定bucket下存放csv文件的路径)', recursive=True)
  obj_list = []
  for obj in objects:
    # The name is printed as UTF-8 bytes, exactly as the original log line did.
    print('***** remote file: ', obj.bucket_name, obj.object_name.encode('utf-8'), ' last modified=', obj.last_modified, ' size=', obj.size, 'B *****')
    # Encoding to UTF-8 and decoding straight back yields the same text, so
    # the name is stored directly.
    obj_list.append(obj.object_name)
  return obj_list

def get_file(remote_file, bucket="(指定的bucket)"):
  """Fetch one object from MinIO.

  Args:
    remote_file: Object name inside ``bucket``.
    bucket: Bucket that holds the object.

  Returns:
    The object returned by the client's ``get_object`` call (the original
    author noted it as ``urllib3.response.HTTPResponse``).
  """
  minio_client = minio_operation.get_minio_client()
  # Log which object is about to be fetched.
  remote_path = os.path.join(bucket, remote_file)
  print('***** get remote file:', remote_path, ' *****')
  return minio_client.get_object(bucket, remote_file)

def operate_file(file):
  """Load one remote CSV file into a DataFrame and normalise its layout.

  The body is fetched with ``get_file``, decoded as UTF-8, split into lines
  and then into comma-separated cells. The first line and the first column
  are discarded, rows with any missing cell are dropped, and the remaining
  columns are renamed.

  Args:
    file: Object name of the remote CSV file, as accepted by ``get_file``.

  Returns:
    pandas.DataFrame: All cells are still strings; the frame is empty when
    the file holds no complete data rows.
  """
  response = get_file(file)
  try:
    text = str(response.data, encoding = "utf-8")
  finally:
    # Close the response and hand the connection back to the pool once the
    # body has been read; the original never released it.
    response.close()
    response.release_conn()

  f = text.split('\n')
  f1 = pd.DataFrame(f)
  f1 = f1.iloc[:,0].str.split(',', expand = True)

  # Drop the first line (presumably the CSV header row -- confirm against the data).
  f1.drop([0], inplace = True)
  f1.reset_index(level = 0, drop = True, inplace = True)
  # Drop the first column (presumably the index column written by to_csv -- confirm).
  f1.drop([0], axis = 1, inplace = True)
  # Lines with fewer cells than the widest line are padded with missing
  # values by the split above (e.g. a trailing empty line), so they go here.
  f1 = f1.dropna()

  # Column label 0 was dropped above, so the 0:'index' entry has no effect.
  f1.rename(columns = {0:'index', 1:'(各字段名)', 2:'(各字段名)', 3:'(各字段名)', 4:'throughput'}, inplace = True)
  return f1

def concat_file(file):
  """Merge several remote CSV files into one DataFrame with a ``date`` column.

  Each file is loaded with ``operate_file``. Empty results are reported and
  skipped. For the others, characters ``[-14:-4]`` of the file name are
  inserted as a leading ``date`` column (this assumes names end in
  ``YYYY-MM-DD.csv`` -- confirm against the bucket contents).

  Args:
    file: Iterable of remote object names.

  Returns:
    pandas.DataFrame: The merged rows with a fresh 0..n-1 index, ``date``
    parsed to datetimes and the remaining columns cast to the configured
    dtypes.
  """
  # The seed frame fixes the column order of the merged result.
  seed = pd.DataFrame(columns = ['date', '(各字段名)', '(各字段名)', '(各字段名)', '(各字段名)'])
  frames = [seed]
  for f in file:
    new_file = operate_file(f)

    if new_file.empty:
      print('WARNING!',os.path.join(f),'is empty! \n')
      continue

    new_file.insert(0, 'date', f[-14:-4])
    print(new_file) # print each file
    frames.append(new_file)

  # Concatenate once: calling pd.concat inside the loop re-copies every
  # previously merged row on each iteration.
  f_merged = pd.concat(frames, sort=False)

  # To display all data, lift pandas' display limits:
  # pd.set_option('display.width', 1000)
  # pd.set_option('display.max_rows', None)

  f_merged.reset_index(level = 0, drop = True, inplace = True)

  f_merged['date'] = pd.to_datetime(f_merged['date'])
  f_merged = f_merged.astype({'(各字段名)':'str', '(各字段名)':'int', '(各字段名)':'float', '(各字段名)':'float'})
  # NOTE(review): replace() returns a new frame and the result is discarded,
  # so this line has no effect on what is returned. Assigning it would turn
  # zeros into NaN (and upcast the int column to float); left as-is until the
  # intended behaviour is confirmed.
  f_merged.replace(float(0), np.nan) # f_merged.fillna(0)
  return f_merged

def get_period_result(file, time_frequency = -7):
  """Aggregate the rows whose date falls inside a window ending today.

  Args:
    file: DataFrame with a datetime ``date`` column plus the grouping and
      value columns named below.
    time_frequency: Day offset added to today's date to get the window
      start; negative values look back (default: the last 7 days).

  Returns:
    pandas.DataFrame: One row per group, indexed by the grouping columns,
    holding the aggregated and renamed value columns.
  """
  end_date = datetime.date.today()
  start_date = end_date + timedelta(days = time_frequency)
  print('***** data from', start_date, 'to', end_date,' *****')

  # Both bounds are inclusive. A date converts to a midnight timestamp, so
  # rows stamped later than 00:00 today fall outside the window.
  in_window = (file['date'] >= pd.Timestamp(start_date)) & (file['date'] <= pd.Timestamp(end_date))
  cycle_file = file.loc[in_window]
  grouped_cycle_file = cycle_file.groupby(by = ['(要分组的字段名)', '(要分组的字段名)'])
  # Use the string names 'min'/'max' instead of np.min/np.max: newer pandas
  # warns about NumPy callables here and recommends the strings to keep the
  # current behaviour. The keys are template placeholders; use distinct real
  # column names, otherwise the later entry overrides the earlier one.
  best_perf_file = grouped_cycle_file.aggregate({'(分组后其它字段名)':'min', '(分组后其它字段名)':'max'})
  best_perf_file.rename(columns = {'(分组后其它字段名)':'(更改分组后其它字段名)', '(分组后其它字段名)':'(更改分组后其它字段名)'}, inplace = True)
  return best_perf_file

def task_trigger():
  """Print a notice that the scheduled task is running."""
  notice = 'scheduled task is running... ...'
  print(notice)


if __name__ == "__main__":
  # List every remote CSV file and merge them into one DataFrame.
  remote_files = get_file_list()
  merged_file = concat_file(remote_files)

  print('***** concat all tables at ', datetime.datetime.now(), ' *****')
  print(merged_file)

  # Optional: print dataset information.
  # print('***** dataset infos *****')
  # print('shape of dataset:', merged_file.shape)
  # print('datatype of variables:\n', merged_file.dtypes)
  # print('***** values by rows and cols *****')
  # print(merged_file.iloc[15:30, 0:2])

  # Optional: visualize the dataset.
  # print('***** visualize dataset *****')
  # img = merged_file.plot(x='date', y='latency', kind='line', grid=True) # line chart
  # img = merged_file.plot.bar(x='date', y='latency')
  # plt.show()
  # plt.savefig('./Pic2.png')

  # Save the merged data to a local CSV file.
  print('***** save all perf in one file as csv *****')
  merged_file.to_csv('merged_file.csv')

  # Aggregate the last 30 days and save the result to a local CSV file.
  # The file is written even when the result is empty.
  best_perf = get_period_result(merged_file, -30)
  if best_perf.empty:
    print('no data on Minio!')
  else:
    print(best_perf)
  print('***** save the best perf for specified period as csv *****')
  best_perf.to_csv('best_perf.csv')

  # Optional: download one specific remote file.
  # remote_file = "(remote path)/(remote file name).csv"
  # local_file = "(local path)/(local file name).csv"
  # download_file(remote_file, local_file)

  # Optional: run daily at 10:00.
  # schedule.every().day.at("10:00").do(task_trigger)
  # while True:
  #   schedule.run_pending()
  #   time.sleep(1)