Python | 基于PythonWebHDFS迁移HDFS数据到本地并压缩

先回顾下之前PythonWebHDFS的操作 : 基于WebHDFS REST API操作HDFS

记录下工作中写过的脚本,如下:

1、dateUtil.py: 主要选取需要迁移数据的时间区间。

import datetime
import sys
import os  

def dateRange(beginDate, endDate):
    """Return every date from beginDate to endDate (inclusive) as 'YYYY-MM-DD' strings.

    Comparison is done on the ISO-formatted strings, which orders the same
    as the dates themselves; an empty list is returned when beginDate > endDate.
    """
    collected = []
    cursor = datetime.datetime.strptime(beginDate, "%Y-%m-%d")
    one_day = datetime.timedelta(days=1)
    text = beginDate[:]
    while text <= endDate:
        collected.append(text)
        cursor = cursor + one_day
        text = cursor.strftime("%Y-%m-%d")
    return collected

# Migrate the data from five weeks ago up to three weeks ago to the NAS.
def getDateRegion():
    """Return the list of 'YYYY-MM-DD' dates from 35 days ago through 21 days ago."""
    window_start_days = 35
    window_end_days = 21

    today = datetime.date.today()
    beginDate = (today - datetime.timedelta(days=window_start_days)).strftime("%Y-%m-%d")
    endDate = (today - datetime.timedelta(days=window_end_days)).strftime("%Y-%m-%d")

    return dateRange(beginDate, endDate)

2、iterate_migrate_wzmetro.py : 基于时间段获取需要迁移的数据在HDFS上的具体路径

import datetime
import sys
import os  
import dateUtil

# Full Path : /datalog/wzmetro/20190601
sourceFilePath = "/datalog/wzmetro/"
destFilePath = "/data4/hdfs/wzmetro/"

if __name__ == '__main__':
    # For each day in the migration window, hand the day's HDFS directory and
    # its local destination to the per-day migration script.
    # NOTE: the original listing mixed tabs and spaces in this loop, which is
    # a TabError under Python 3; indentation is normalized to 4 spaces here.
    dates = dateUtil.getDateRegion()
    for date in dates:
        # date is 'YYYY-MM-DD'; HDFS uses a flat layout ('20190601'),
        # the local side a nested one ('2019/06/01/').
        year = date[0:4]
        month = date[5:7]
        day = date[8:10]

        datePath = year + month + day
        destDatePath = year + '/' + month + '/' + day + '/'
        source = sourceFilePath + datePath
        dest = destFilePath + destDatePath

        # NOTE(review): os.system builds a shell command by concatenation;
        # both paths are generated above (not user input), so injection risk
        # is low, but subprocess.run with an argument list would be safer.
        os.system("python migrate_nas_combine_gz.py " + source + " " + dest)

3、migrate_nas_combine_gz.py: 基于路径开始将HDFS数据迁移到本地

import os
import shutil
import json 
import sys 
import gzip
import zipfile
from hdfs.client import Client 
from hdfs import InsecureClient

import logging
from logging import handlers

# Root-logger setup: everything at INFO and above goes to a daily-rotating
# file under ./logs.
logger = logging.getLogger()
logger.setLevel(logging.INFO)

logFile = './logs/migrate_hdfs_to_nas.log'
# Create the log directory up front so handler construction does not fail
# with an ENOENT on a fresh deployment.
if not os.path.exists('./logs'):
    os.makedirs('./logs')

formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')

# The original also built a plain FileHandler on the same path but never
# attached it to the logger (dead code; attaching it would double-write every
# record), so only the rotating handler is kept.
timedRotatingFileHandler = handlers.TimedRotatingFileHandler(filename=logFile, when='D')
timedRotatingFileHandler.setLevel(logging.INFO)
timedRotatingFileHandler.setFormatter(formatter)

logger.addHandler(timedRotatingFileHandler)


# Download every file under an HDFS directory to a local (NAS) directory,
# gzip each file, then delete the HDFS source directory.
# Usage: python migrate_nas_combine_gz.py <hdfsPath> <localPath>
# (localPath is expected to end with '/'; the caller builds it that way.)
# NOTE: the original listing mixed tabs and spaces in this body, which is a
# TabError under Python 3; indentation is normalized to 4 spaces here.
hdfsPath = sys.argv[1]
localPath = sys.argv[2]

#client = Client("http://192.168.0.1:50070")
client = InsecureClient("http://192.168.0.1:50070", user='hadoop')

# strict=False makes status() return None (instead of raising) when the
# path does not exist.
check_path_status = client.status(hdfsPath, strict=False)
if check_path_status is not None:
    dirList = client.list(hdfsPath, status=False)

    # Ensure the local destination exists before any download (hoisted out
    # of the loop; the original re-checked this on every iteration).
    if not os.path.exists(localPath):
        os.makedirs(localPath)

    for name in dirList:
        hdfsLeafPath = hdfsPath + '/' + name
        localLeafPath = localPath + name

        # A leftover file from a previous (possibly failed) run would make
        # the download ambiguous, so remove it first.
        if os.path.exists(localLeafPath):
            os.remove(localLeafPath)
            logger.info('The File Is Exists , Remove OK ! ')

        client.download(hdfsLeafPath, localPath, overwrite=True)

        # BUG FIX: the original used zipfile.ZipFile to create "<name>.gz",
        # which actually writes a ZIP archive (containing the file under its
        # full path) misnamed as .gz -- standard gunzip cannot read it.
        # gzip + copyfileobj produces a genuine, streamed .gz file.
        gzPath = localPath + '/' + name + ".gz"
        with open(localLeafPath, 'rb') as srcFile, gzip.open(gzPath, 'wb') as gzFile:
            shutil.copyfileobj(srcFile, gzFile)

        # Drop the uncompressed copy once the .gz is fully written.
        os.remove(localLeafPath)

    logger.info('=====>>>>>The Current Local Path Folder: ' + localPath + ',And Delete Every File.')
    # Remove the migrated directory from HDFS.
    client.delete(hdfsPath, recursive=True)
    logger.info("===>>>Download HDFS To Local File ===>> Migrate Local File ===> NAS Server ===> GZip NAS File ===> Remove NAS Source File !!!")
else:
    logger.info("The HDFS Source Path: " + hdfsPath + " , And The Status==" + str(check_path_status))

 

  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值