First, a look back at the earlier post on Python + WebHDFS: operating HDFS via the WebHDFS REST API.
Here are the scripts I wrote for this job at work:
1. dateUtil.py: selects the date range of the data that needs to be migrated.
import datetime

def dateRange(beginDate, endDate):
    # Return every day from beginDate to endDate (inclusive) as 'YYYY-MM-DD'
    # strings. ISO date strings compare lexicographically in date order,
    # so plain string comparison is safe here.
    dates = []
    dt = datetime.datetime.strptime(beginDate, "%Y-%m-%d")
    date = beginDate[:]
    while date <= endDate:
        dates.append(date)
        dt = dt + datetime.timedelta(days=1)
        date = dt.strftime("%Y-%m-%d")
    return dates

# Migrate the data from 5 weeks ago up to 3 weeks ago to NAS
def getDateRegion():
    begin_max_days = 35
    end_min_days = 21
    current_date = datetime.date.today()
    delta_max = datetime.timedelta(days=begin_max_days)
    delta_min = datetime.timedelta(days=end_min_days)
    beginDate = (current_date - delta_max).strftime("%Y-%m-%d")
    endDate = (current_date - delta_min).strftime("%Y-%m-%d")
    return dateRange(beginDate, endDate)
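For illustration, a quick sanity check of what these helpers return; the dates below are hypothetical, assuming the script runs on 2019-07-01:

import dateUtil

dates = dateUtil.getDateRegion()
print(dates[0], dates[-1], len(dates))
# With today = 2019-07-01 this prints:
# 2019-05-27 2019-06-10 15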
2. iterate_migrate_wzmetro.py: derives, from that date range, the concrete HDFS paths of the data to migrate.
import os
import dateUtil

# Full path example: /datalog/wzmetro/20190601
sourceFilePath = "/datalog/wzmetro/"
destFilePath = "/data4/hdfs/wzmetro/"

if __name__ == '__main__':
    dates = dateUtil.getDateRegion()
    for date in dates:
        # 'YYYY-MM-DD' -> 'YYYYMMDD' on HDFS and 'YYYY/MM/DD/' on NAS
        year = date[0:4]
        month = date[5:7]
        day = date[8:10]
        datePath = year + month + day
        destDatePath = year + '/' + month + '/' + day + '/'
        source = sourceFilePath + datePath
        dest = destFilePath + destDatePath
        os.system("python migrate_nas_combine_gz.py " + source + " " + dest)
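The paths here come from trusted constants, so os.system works, but subprocess with an argument list sidesteps shell quoting altogether. A minimal sketch of the same dispatch (assuming Python 3.5+ and the same source/dest variables as above):

import subprocess

# No shell involved: arguments are passed as a list, so spaces or
# special characters in paths cannot break the command line.
subprocess.run(["python", "migrate_nas_combine_gz.py", source, dest], check=True)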
3. migrate_nas_combine_gz.py: given the source and destination paths, migrates the HDFS data to local (NAS) storage and gzips each file.
import os
import shutil
import sys
import gzip
import logging
from logging import handlers
from hdfs import InsecureClient

# Log to a daily-rotating file. (The original also configured a plain
# FileHandler but never attached it, so only the rotating handler is kept.)
logger = logging.getLogger()
logger.setLevel(logging.INFO)
logFile = './logs/migrate_hdfs_to_nas.log'
formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')
timedRotatingFileHandler = handlers.TimedRotatingFileHandler(filename=logFile, when='D')
timedRotatingFileHandler.setLevel(logging.INFO)
timedRotatingFileHandler.setFormatter(formatter)
logger.addHandler(timedRotatingFileHandler)

hdfsPath = sys.argv[1]
localPath = sys.argv[2]

# client = Client("http://192.168.0.1:50070")  # unauthenticated alternative
client = InsecureClient("http://192.168.0.1:50070", user='hadoop')

check_path_status = client.status(hdfsPath, strict=False)
if check_path_status is not None:
    if not os.path.exists(localPath):
        os.makedirs(localPath)
    dirList = client.list(hdfsPath, status=False)
    for i in dirList:
        hdfsLeafPath = hdfsPath + '/' + i
        localLeafPath = os.path.join(localPath, i)
        if os.path.exists(localLeafPath):
            os.remove(localLeafPath)
            logger.info('The file already exists locally, removed it first.')
        client.download(hdfsLeafPath, localPath, overwrite=True)
        # gzip each downloaded file, then drop the uncompressed copy.
        # (The original used zipfile with a .gz extension, but zipfile
        # produces a ZIP archive that gunzip cannot read; the gzip module
        # matches the .gz name.)
        with open(localLeafPath, 'rb') as f_in:
            with gzip.open(localLeafPath + '.gz', 'wb') as f_out:
                shutil.copyfileobj(f_in, f_out)
        os.remove(localLeafPath)
        logger.info('=====>>>>> Compressed ' + localLeafPath + '.gz and removed the raw file.')
    # Delete the migrated path on HDFS
    client.delete(hdfsPath, recursive=True)
    logger.info("===>>> Downloaded HDFS files to local ===> moved to NAS ===> gzipped NAS files ===> removed HDFS source path !!!")
else:
    logger.info("The HDFS source path " + hdfsPath + " does not exist, status==" + str(check_path_status))