Implementing a Hadoop client with pycurl

I am currently testing a Hadoop feature that requires frequent interaction with the cluster. My first approach used Python's subprocess module to invoke the command-line tools that Hadoop ships with.
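A minimal sketch of that subprocess approach (the namenode address is the one from my test cluster; it assumes the hadoop binary is on the PATH):

import subprocess

def hadoop_ls(path):
    # Every invocation spawns a fresh JVM for the hadoop CLI.
    p = subprocess.Popen(["hadoop", "fs", "-ls", path],
                         stdout=subprocess.PIPE, stderr=subprocess.PIPE)
    out, err = p.communicate()
    if p.returncode != 0:
        raise RuntimeError(err)
    return out

print hadoop_ls("hdfs://192.168.0.112:50081/")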

1. The command-line interface Hadoop provides

The general form is hadoop fs [cmd]; the full usage synopsis is:

 

hadoop fs [-fs <local | file system URI>] [-conf <configuration file>]
[-D <property=value>] [-ls <path>] [-lsr <path>] [-du <path>]
[-dus <path>] [-mv <src> <dst>] [-cp <src> <dst>] [-rm [-skipTrash] <src>]
[-rmr [-skipTrash] <src>] [-put <localsrc> ... <dst>] [-copyFromLocal <localsrc> ... <dst>]
[-moveFromLocal <localsrc> ... <dst>] [-get [-ignoreCrc] [-crc] <src> <localdst>]
[-getmerge <src> <localdst> [addnl]] [-cat <src>]
[-copyToLocal [-ignoreCrc] [-crc] <src> <localdst>] [-moveToLocal <src> <localdst>]
[-mkdir <path>] [-report] [-setrep [-R] [-w] <rep> <path/file>]
[-touchz <path>] [-test -[ezd] <path>] [-stat [format] <path>]
[-tail [-f] <path>] [-text <path>]
[-chmod [-R] <MODE[,MODE]... | OCTALMODE> PATH...]
[-chown [-R] [OWNER][:[GROUP]] PATH...]
[-chgrp [-R] GROUP PATH...]
[-count[-q] <path>]
[-help [cmd]]

As the listing shows, the command set is quite powerful, covering all the usual operations on files and directories.
For example, to list the files under the Hadoop root directory:
#hadoop fs -ls hdfs://192.168.0.112:50081/
drwx---r-x   - test test   0 2013-03-08 11:20 /static
drwx---r-x   - test test   0 2013-02-19 15:40 /system
drwxrwxrwx   - test test   0 2013-01-22 18:42 /video
 
I will not walk through the other commands one by one; the help output explains them well enough.
The problem with this approach is that every command spins up a new JVM, which puts a heavy load on the machine running it. With many commands in flight, top shows the java processes hitting 99% CPU, which seriously affects the machine. That is what led to the approach below.
2. The web interface Hadoop provides
While looking through the official client APIs, I found that Hadoop exposes a web REST API (WebHDFS) that can be driven easily with curl. The official documentation is at http://hadoop.apache.org/docs/stable/webhdfs.html and explains the usage thoroughly.
With curl you can perform the basic operations on files and directories in Hadoop.
The operations the official site currently documents include:
1. Create and write to a file
2. Append to a file
3. Open and read a file
4. Make a directory
5. Rename a file or directory
6. Delete a file or directory
7. Get the status of a file or directory
8. List a directory
Below are a few concrete examples:
a. Get the status of a directory:
#curl -i http://192.168.0.112:50071/webhdfs/v1/?op=GETFILESTATUS
HTTP/1.1 200 OK
Content-Type: application/json
Transfer-Encoding: chunked
Server: Jetty(6.1.26)
 
{"FileStatus":{"accessTime":0,"blockSize":0,"group":"TEST","length":0,"modificationTime":1362812718704,"owner":"TEST","pathSuffix":"","permission":"705","replication":0,"type":"DIRECTORY"}}

 

b. Rename a directory (note the quotes: the URL contains an &, which the shell would otherwise interpret):

#curl -i -X PUT "http://192.168.0.112:50071/webhdfs/v1/test?op=RENAME&destination=/test1"

 

HTTP/1.1 200 OK
Content-Type: application/json
Transfer-Encoding: chunked
 
 
{"boolean":true}
 
I will not go through the remaining operations one by one; see the official documentation for the details.
 
3. What the curl approach suggested
My program runs in Python, and shelling out to the curl command line would again mean invoking an external command. Python has plenty of modules, so by using its curl binding, pycurl, I could operate on Hadoop files and directories from Python directly.
After some digging I wrote a basic WebHadoop class. The core functionality is roughly done; the rest can be added later.
The code is as follows:
 
 
  
#!/usr/bin/env python
# -*- encoding:utf-8 -*-
"""A library to access the Hadoop HTTP REST API (WebHDFS).

Make sure HTTP access is enabled on your Hadoop cluster.
"""
'''
author : liran
date   : 2013-03-11

Thanks to: xwu
           Wuhan Yunya Technology Co., Ltd.
'''
import json
import os
import re
import sys
import StringIO

import pycurl


class WebHadoop(object):

    def __init__(self, host, port, username, logger, prefix="/webhdfs/v1"):
        self.host = host
        self.port = port
        self.user = username
        self.logger = logger
        self.prefix = prefix
        self.status = None
        self.url = "http://%s:%s" % (host, port)
        self.url_path = self.url + self.prefix

    def checklink(self):
        """Check the HTTP link and return the number of live datanodes."""
        b = StringIO.StringIO()
        c = pycurl.Curl()
        checkurl = self.url + "/dfsnodelist.jsp?whatNodes=LIVE"
        try:
            c.setopt(pycurl.URL, checkurl)
            c.setopt(pycurl.HTTPHEADER, ["Accept:"])
            c.setopt(pycurl.WRITEFUNCTION, b.write)
            c.setopt(pycurl.FOLLOWLOCATION, 1)
            c.setopt(pycurl.MAXREDIRS, 5)
            c.perform()
            self.status = c.getinfo(c.HTTP_CODE)
            body = b.getvalue()
            self.Write_Debug_Log(self.status, checkurl)
            # Scrape the live-datanode count out of the status page.
            p = re.compile(r'''Live Datanodes :(.*)</a''')
            results = p.findall(body)
            if results[0] == "0":
                self.logger.error("Sorry, there are no live datanodes in the Hadoop cluster!!!")
                sys.exit(255)
            return results[0]
        except pycurl.error, e:
            self.logger.error("Sorry, cannot reach the Hadoop HTTP interface. Error: %s" % e)
            sys.exit(255)
        finally:
            c.close()
            b.close()

    def lsdir(self, path):
        """LISTSTATUS: return the list of FileStatus dicts for a directory."""
        body = None
        b = StringIO.StringIO()
        c = pycurl.Curl()
        lsdir_url = self.url_path + path + "?op=LISTSTATUS"
        try:
            c.setopt(pycurl.URL, lsdir_url)
            c.setopt(pycurl.HTTPHEADER, ["Accept:"])
            c.setopt(pycurl.WRITEFUNCTION, b.write)
            c.setopt(pycurl.FOLLOWLOCATION, 1)
            c.setopt(pycurl.MAXREDIRS, 5)
            c.perform()
            body = b.getvalue()
            self.status = c.getinfo(c.HTTP_CODE)
        except Exception, e:
            print e
        finally:
            c.close()
            b.close()

        if self.status == 200:
            data_dir = json.loads(body)
            return data_dir['FileStatuses']['FileStatus']
        else:
            self.logger.error("Sorry, cannot list the dir or file status!!!")
            self.Write_Debug_Log(self.status, lsdir_url)
            return False

    def lsfile(self, path):
        """GETFILESTATUS: return the FileStatus dict for a single file."""
        body = None
        b = StringIO.StringIO()
        c = pycurl.Curl()
        lsfile_url = self.url_path + path + "?op=GETFILESTATUS"
        try:
            c.setopt(pycurl.URL, lsfile_url)
            c.setopt(pycurl.HTTPHEADER, ["Accept:"])
            c.setopt(pycurl.WRITEFUNCTION, b.write)
            c.setopt(pycurl.FOLLOWLOCATION, 1)
            c.setopt(pycurl.MAXREDIRS, 5)
            c.perform()
            body = b.getvalue()
            self.status = c.getinfo(c.HTTP_CODE)
        except Exception, e:
            print e
        finally:
            c.close()
            b.close()

        if self.status == 200:
            data_dir = json.loads(body)
            if data_dir['FileStatus']['type'] == "DIRECTORY":
                self.logger.error("Sorry, %s is actually a directory!!!" % (path))
                return False
            else:
                return data_dir['FileStatus']
        else:
            self.logger.error("Sorry, cannot list the dir or file status!!!")
            self.Write_Debug_Log(self.status, lsfile_url)
            return False

    def mkdir(self, path, permission="755"):
        """MKDIRS: create a directory; all parameters go in the query string."""
        body = None
        b = StringIO.StringIO()
        c = pycurl.Curl()
        mkdir_url = "%s%s?op=MKDIRS&permission=%s" % (self.url_path, path, permission)
        try:
            c.setopt(pycurl.URL, mkdir_url)
            c.setopt(pycurl.CUSTOMREQUEST, "PUT")
            c.setopt(pycurl.POSTFIELDS, "")  # empty body; WebHDFS ignores it
            c.setopt(pycurl.WRITEFUNCTION, b.write)
            c.setopt(pycurl.FOLLOWLOCATION, 1)
            c.setopt(pycurl.MAXREDIRS, 5)
            c.perform()
            self.status = c.getinfo(c.HTTP_CODE)
            body = b.getvalue()
        except Exception, e:
            print e
        finally:
            c.close()
            b.close()

        if self.status == 200:
            # The namenode answers {"boolean":true} or {"boolean":false}.
            if "true" in body:
                self.logger.info("Great, successfully created dir %s in the Hadoop cluster!!" % (path))
                return True
            else:
                self.logger.info("Sorry, cannot create the dir %s in the Hadoop cluster!!" % (path))
                return False
        else:
            self.logger.error("Sorry, cannot create the dir %s in the Hadoop cluster!!" % (path))
            self.Write_Debug_Log(self.status, mkdir_url)
            return False

    def remove(self, path, recursive="true"):
        """DELETE: remove a file or directory."""
        body = None
        b = StringIO.StringIO()
        c = pycurl.Curl()
        remove_url = "%s%s?op=DELETE&recursive=%s" % (self.url_path, path, recursive)
        try:
            c.setopt(pycurl.URL, remove_url)
            c.setopt(pycurl.CUSTOMREQUEST, "DELETE")
            c.setopt(pycurl.WRITEFUNCTION, b.write)
            c.setopt(pycurl.FOLLOWLOCATION, 1)
            c.setopt(pycurl.MAXREDIRS, 5)
            c.perform()
            body = b.getvalue()
            self.status = c.getinfo(c.HTTP_CODE)
        except Exception, e:
            print e
        finally:
            c.close()
            b.close()

        if self.status == 200:
            if "true" in body:
                self.logger.info("Great, successfully deleted dir or file %s in the Hadoop cluster!!" % (path))
                return True
            else:
                self.logger.info("Sorry, cannot delete the dir or file; maybe it does not exist!!")
                return False
        else:
            self.logger.error("Sorry, cannot delete %s in the Hadoop cluster!!" % (path))
            self.Write_Debug_Log(self.status, remove_url)
            return False

    def rename(self, src, dst):
        """RENAME: rename (move) a file or directory."""
        body = None
        b = StringIO.StringIO()
        c = pycurl.Curl()
        rename_url = "%s%s?op=RENAME&destination=%s" % (self.url_path, src, dst)
        try:
            c.setopt(pycurl.URL, rename_url)
            c.setopt(pycurl.CUSTOMREQUEST, "PUT")
            c.setopt(pycurl.POSTFIELDS, "")
            c.setopt(pycurl.WRITEFUNCTION, b.write)
            c.setopt(pycurl.FOLLOWLOCATION, 1)
            c.setopt(pycurl.MAXREDIRS, 5)
            c.perform()
            body = b.getvalue()
            self.status = c.getinfo(c.HTTP_CODE)
        except Exception, e:
            print e
        finally:
            c.close()
            b.close()

        if self.status == 200:
            if "true" in body:
                self.logger.info("Great, successfully renamed %s in the Hadoop cluster!!" % (src))
                return True
            else:
                self.logger.info("Sorry, cannot rename the dir or file; maybe it does not exist!!")
                return False
        else:
            self.logger.error("Sorry, rename failed: %s!!" % (rename_url))
            self.Write_Debug_Log(self.status, rename_url)
            return False

    def put_file(self, local_path, hdfs_path, overwrite="true", permission="755", buffersize="128"):
        """CREATE: upload a local file. WebHDFS creates files in two steps:
        the namenode answers the first PUT with a 307 redirect to a datanode,
        and the file data is then PUT to that redirected URL."""
        if not os.path.isfile(local_path):
            self.logger.error("Sorry, %s does not exist or is not a file." % local_path)
            return False

        c = pycurl.Curl()
        b = StringIO.StringIO()
        header_str = StringIO.StringIO()
        f = open(local_path, 'rb')
        put_url = "%s%s?op=CREATE&overwrite=%s&permission=%s&buffersize=%s" % (
            self.url_path, hdfs_path, overwrite, permission, buffersize)
        try:
            # Step 1: ask the namenode where to write; follow the 307
            # redirect and remember the effective datanode URL.
            c.setopt(pycurl.URL, put_url)
            c.setopt(pycurl.CUSTOMREQUEST, "PUT")
            c.setopt(pycurl.POSTFIELDS, "")
            c.setopt(pycurl.HEADER, 1)
            c.setopt(pycurl.HEADERFUNCTION, header_str.write)
            c.setopt(pycurl.WRITEFUNCTION, b.write)
            c.setopt(pycurl.FOLLOWLOCATION, 1)
            c.setopt(pycurl.MAXREDIRS, 5)
            c.perform()
            redirect_url = c.getinfo(pycurl.EFFECTIVE_URL)

            # Step 2: PUT the actual file data to the datanode.
            filesize = os.path.getsize(local_path)
            c.setopt(pycurl.URL, redirect_url)
            c.setopt(pycurl.CUSTOMREQUEST, "PUT")
            c.setopt(pycurl.PUT, 1)
            c.setopt(pycurl.INFILE, f)
            c.setopt(pycurl.INFILESIZE, filesize)
            c.perform()
            self.status = c.getinfo(c.HTTP_CODE)
        except Exception, e:
            print e
        finally:
            c.close()
            b.close()
            header_str.close()
            f.close()

        if self.status != 201:
            self.Write_Debug_Log(self.status, put_url)
            return False
        else:
            self.logger.info("Great, successfully put the file into hdfs %s" % hdfs_path)
            return True

    def append(self, local_path, hdfs_path, buffersize=None):
        pass  # APPEND is not implemented yet

    def get_file(self, local_path, hdfs_path, buffersize="128"):
        """OPEN: download an hdfs file into local_path."""
        c = pycurl.Curl()
        f = open(local_path, 'wb')  # creates the local file if missing
        get_url = "%s%s?op=OPEN&buffersize=%s" % (self.url_path, hdfs_path, buffersize)
        try:
            c.setopt(pycurl.URL, get_url)
            c.setopt(pycurl.WRITEFUNCTION, f.write)
            c.setopt(pycurl.FOLLOWLOCATION, 1)
            c.setopt(pycurl.MAXREDIRS, 5)
            c.setopt(pycurl.CONNECTTIMEOUT, 60)
            c.setopt(pycurl.TIMEOUT, 300)
            c.perform()
            self.status = c.getinfo(pycurl.HTTP_CODE)
        except Exception, e:
            print e
        finally:
            c.close()
            f.close()

        if self.status != 200:
            self.Write_Debug_Log(self.status, get_url)
            return False
        else:
            self.logger.info("Great, successfully got the file %s from hdfs" % hdfs_path)
            return True

    def cat_file(self, hdfs_path, buffersize="128"):
        """OPEN: read an hdfs file and return its content."""
        body = None
        c = pycurl.Curl()
        b = StringIO.StringIO()
        cat_url = "%s%s?op=OPEN&buffersize=%s" % (self.url_path, hdfs_path, buffersize)
        try:
            c.setopt(pycurl.URL, cat_url)
            c.setopt(pycurl.WRITEFUNCTION, b.write)
            c.setopt(pycurl.FOLLOWLOCATION, 1)
            c.setopt(pycurl.MAXREDIRS, 5)
            c.perform()
            self.status = c.getinfo(pycurl.HTTP_CODE)
            body = b.getvalue()
        except Exception, e:
            print e
        finally:
            c.close()
            b.close()

        if self.status != 200:
            self.Write_Debug_Log(self.status, cat_url)
            return False
        else:
            return body

    def copy_in_hdfs(self, src, dst, overwrite="true", permission="755", buffersize="128"):
        """Copy inside hdfs by downloading to a temp file and re-uploading."""
        tmpfile = "/tmp/copy_inhdfs_tmpfile"
        self.get_file(tmpfile, src)
        if self.status == 200:
            self.put_file(tmpfile, dst, overwrite=overwrite)
            os.remove(tmpfile)
            return self.status == 201
        else:
            os.remove(tmpfile)
            return False

    def Write_Debug_Log(self, status, url):
        if status not in (200, 201):
            self.logger.error("Url : \"%s\" , Exit code : %s" % (url, status))
            self.logger.error("fetched an error, but not quitting")
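A quick sketch of how the class is meant to be driven (host, port, user, and paths are just my test values; adjust them for your cluster):

import logging

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger("webhadoop")

hadoop = WebHadoop("192.168.0.112", 50071, "test", logger)
print hadoop.checklink()              # number of live datanodes
for status in hadoop.lsdir("/"):      # FileStatus dicts for the root dir
    print status['pathSuffix'], status['type']
hadoop.mkdir("/test2")
hadoop.rename("/test2", "/test3")
hadoop.remove("/test3")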

 

Compared with the Java command-line tools, the curl-based approach still has a few shortcomings:

1. It does not support copying files inside Hadoop natively (copy_in_hdfs above works around this by downloading and re-uploading).

2. It does not support uploading or downloading whole directories.

3. In my tests, uploading via the shell reports an error if the file already exists, while uploading via curl only succeeds when overwrite=true is passed. I am not sure why; most likely the first CREATE request already creates an empty file at the destination when curl follows the 307 redirect, so the second request carrying the data has to overwrite it.

The one big win is execution time.

Running the same directory-listing command both ways:

#time hadoop fs -ls hdfs://192.168.0.112:50081/

real 0m10.916s
user 0m4.082s
sys 0m6.799s

#time curl -i http://192.168.0.112:50071/webhdfs/v1/?op=LISTSTATUS

real 0m0.005s
user 0m0.002s
sys 0m0.000s
 
Driving the same request from Python through pycurl, the run time comes out at around 0.01s.
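A rough way to check that from Python, reusing the hadoop object from the sketch above (the absolute number will of course depend on your cluster and network):

import time

start = time.time()
hadoop.lsdir("/")
print "lsdir took %.4f seconds" % (time.time() - start)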
 
 
A huge speedup. The class code is still being rounded out.
Onward!