from pyhdfs import HdfsClient
import time
from config import config
class OperaHdfs(object):
    """Thin convenience wrapper around ``pyhdfs.HdfsClient``.

    Provides existence checks, directory create/delete, and a text-file
    writer that understands Hive-style ``name=value`` partition paths.
    """

    def __init__(self):
        # HA namenode pair; pyhdfs fails over between the listed hosts.
        self.hosts = 'nn1.example.com:50070,nn2.example.com:50070'
        self.fs = HdfsClient(hosts=self.hosts, user_name='hdfs')

    def jug(self, file):
        """Return True if the given path/file exists on HDFS.

        :param file: absolute HDFS path (file or directory)
        :return: bool
        """
        return self.fs.exists(file)

    def mkpath(self, path):
        """Create a directory (recursively) if it does not already exist.

        :param path: absolute HDFS directory path
        :return: result of ``mkdirs``, or None when the path already exists
        """
        if not self.jug(path):
            return self.fs.mkdirs(path)

    def delpath(self, path):
        """Recursively delete a path if it exists.

        :param path: absolute HDFS path
        :return: result of ``delete``, or None when the path does not exist
        """
        if self.jug(file=path):
            return self.fs.delete(path, recursive=True)

    def get_file_path(self, hdfs_path, partition_dict):
        """Build (and create on HDFS) the directory that will hold the file.

        Appends Hive-style partition segments (``name=value``) to
        ``hdfs_path``, one per entry in ``partition_dict``.

        :param hdfs_path: base HDFS directory
        :param partition_dict: optional {partition_name: partition_value}
        :return: the final directory path (one level above the file)
        """
        if partition_dict:
            for partition_name, partition_value in partition_dict.items():
                hdfs_path += '/{}={}'.format(partition_name, partition_value)
        self.mkpath(path=hdfs_path)
        return hdfs_path

    def write_hdfs(self, info, hdfs_path, file_name, partition_dict=None, write_mode='append'):
        """Write a text payload to a file on HDFS.

        When the target file already exists, behavior depends on
        ``write_mode``:

        - ``'append'``: write a NEW sibling file whose name carries a
          millisecond timestamp suffix (the original file is untouched).
        - ``'nonConflict'``: raise.
        - ``'overwrite'``: delete the existing file, then recreate it.

        :param info: string payload to write (UTF-8 encoded)
        :param hdfs_path: base HDFS directory for the file
        :param file_name: target file name
        :param partition_dict: optional Hive partition mapping appended to
            the path as ``name=value`` segments
        :param write_mode: 'append' | 'nonConflict' | 'overwrite'
        :raises Exception: when the file exists and write_mode == 'nonConflict'
        :return: None
        """
        # BUG FIX: original called the non-existent self.get_path(...) with a
        # spurious file_name argument; the real helper is get_file_path.
        file_path = self.get_file_path(hdfs_path, partition_dict)
        file = '{}/{}'.format(file_path, file_name)
        if self.jug(file=file):
            if write_mode == 'append':
                timestamp = round(time.time() * 1000)
                if '.' in file_name:
                    # BUG FIX: rsplit keeps extra dots inside the base name
                    # intact; split('.') raised ValueError on "a.b.txt".
                    file_name_name, file_name_tail = file_name.rsplit('.', 1)
                    file_name_name = '{}_{}'.format(file_name_name, timestamp)
                    file_name = '{}.{}'.format(file_name_name, file_name_tail)
                else:
                    file_name = '{}_{}'.format(file_name, timestamp)
                # BUG FIX: original referenced undefined name 'file_apth'.
                file = '{}/{}'.format(file_path, file_name)
            elif write_mode == 'nonConflict':
                raise Exception('file {} is exists'.format(file))
            elif write_mode == 'overwrite':
                # BUG FIX: original deleted the whole containing directory
                # (wiping sibling files); delete only the target file.
                self.fs.delete(file, recursive=True)
        self.fs.create(file, info.encode('utf-8'))
opera_hdfs = OperaHdfs()
# Operating HDFS with Python
# (article residue — originally published 2024-03-19 15:01:36)