This article shows how to use Hadoop's C API (libhdfs) to access HDFS: reading and writing HDFS files, listing directories, and so on. The Hadoop used here is a single-node pseudo-distributed cluster; installation and configuration are covered in an earlier article: http://blog.csdn.net/cjf_wei/article/details/75882977.
1. Obtaining an HDFS access handle
To operate on HDFS, the client first establishes a TCP connection to the HDFS NameNode based on the relevant environment configuration, obtains an HDFS access handle, and then creates file operation handles on top of it.
//hdfsConnect has been marked deprecated; hdfsBuilderConnect is recommended instead
The connection is set up as follows:
//Create an HDFS builder.
struct hdfsBuilder *pbld = hdfsNewBuilder();
//Set the Hadoop cluster address on the builder;
//this is the host part of fs.defaultFS in Hadoop's core-site.xml
hdfsBuilderSetNameNode(pbld,m_strNodeName.c_str());
//Set the Hadoop cluster port on the builder;
//this is the port part of fs.defaultFS in Hadoop's core-site.xml
hdfsBuilderSetNameNodePort(pbld,nPort);
//Use hdfsBuilderConfSetStr() to override or add configuration entries as key-value pairs, e.g.:
hdfsBuilderConfSetStr(pbld,"fs.hdfs.impl","org.apache.hadoop.hdfs.DistributedFileSystem");
Finally, hdfsBuilderConnect() creates an HDFS connection handle from the parameters set on the builder.
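For example (a minimal sketch using the builder configured above; note that hdfsBuilderConnect frees the builder whether or not the connection succeeds):
hdfsFS fs = hdfsBuilderConnect(pbld); //returns NULL on failure; the builder is freed either way
if (!fs)
{
//connection failed: check the NameNode address/port and the CLASSPATH setup described below
}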
2. File and directory operations
This section covers the common operations: creating files, reading and writing, and working with directories.
//Check whether a file or directory exists at the given path in the file system
int hdfsExists(hdfsFS fs, const char *path);
//Delete file
//The recursive flag indicates whether to delete recursively; if recursion is disabled and the directory to delete is non-empty, the operation fails
int hdfsDelete(hdfsFS fs, const char* path, int recursive);
//Open a hdfs file in given mode
/*
*fs: HDFS connection handle
*path: path of the file to open
*flags: open-mode flags such as O_RDONLY/O_WRONLY, O_CREAT, or a supported combination of them
*bufferSize: size of the buffer used for reads/writes; pass 0 to use the default
*replication: replication factor when opening a file for writing; 0 means the cluster default
*blocksize: block size of the file; 0 means the cluster default block size
*/
hdfsFile hdfsOpenFile(hdfsFS fs, const char* path, int flags, int bufferSize, short replication, tSize blocksize);
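For completeness, the read path looks like this (a minimal sketch; fs is assumed to be a connected handle and the path is illustrative):
hdfsFile rf = hdfsOpenFile(fs, "/datacenter/data/test.nb", O_RDONLY, 0, 0, 0);
if (rf)
{
char buf[4096];
tSize n = hdfsRead(fs, rf, buf, sizeof(buf)); //returns the number of bytes read, -1 on error
hdfsCloseFile(fs, rf);
}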
//Create a directory (missing parent directories are created as well)
int hdfsCreateDirectory(hdfsFS fs, const char* path);
//rename file
//the move fails if the target file already exists; a common workaround is sketched below
int hdfsRename(hdfsFS fs, const char* oldPath, const char* newPath);
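Because the rename fails when the target exists, one pattern is to probe and clear the target first (a sketch with illustrative paths):
if (0 == hdfsExists(fs, "/datacenter/data/new.nb")) //0 means the path exists
{
hdfsDelete(fs, "/datacenter/data/new.nb", 0); //non-recursive delete
}
if (0 != hdfsRename(fs, "/datacenter/data/old.nb", "/datacenter/data/new.nb"))
{
//rename failed
}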
Example code:
#include "hdfs.h"
#include <limits.h>
#include <stdio.h>
#include <stdlib.h>
#include <sys/types.h>
#include <iostream>
#include <string>
#include <ctime>
using namespace std;
bool ConnectToHdfs();
hdfsFS m_hdfsfs;
string m_strNodeName = "hdfs://localhost"; //Hadoop cluster address
string m_strMaster = "192.168.117.200:9000"; //Hadoop hot-standby master node (not used in this example)
int nPort = 9000;
int main(int argc, char **argv)
{
cout<<"Begin to connect hdfs,cur time:"<<time(NULL)<<endl;
if( !ConnectToHdfs() )
{
cout<<__LINE__<<":Connect to hdfs failed,cur time:"<<time(NULL)<<endl;
return -1;
}
cout<<__LINE__<<":Connect to hdfs success,cur time:"<<time(NULL)<<endl;
//Create a directory
if( 0 != hdfsCreateDirectory(m_hdfsfs, "/datacenter/data"))
{
cout<<"Create Directory failed"<<endl;
}
//Write to HDFS
string path = "/datacenter/test/tmp/test.nb" ;
//With O_WRONLY|O_CREAT the file is opened if it exists and created if it does not
hdfsFile hdfsfd = hdfsOpenFile(m_hdfsfs, path.c_str(), O_WRONLY|O_CREAT, 0, 0, 0);
string strDataBuf = "";
for(int i = 0 ;i < 1024;++i)
{
strDataBuf += ('A' + i%26);
}
strDataBuf += '\n';
tSize tNumBytes = hdfsWrite(m_hdfsfs, hdfsfd, strDataBuf.c_str(), strDataBuf.size());
cout<<"success write size:" << tNumBytes<<endl;
cout<<"hdfsFlush:"<<hdfsFlush(m_hdfsfs, hdfsfd )<<endl;
cout<<"hdfsCloseFile:"<<hdfsCloseFile(m_hdfsfs, hdfsfd)<<endl;
if ( 0 != hdfsRename(m_hdfsfs, "/datacenter/test/tmp/test.nb", "/datacenter/data/test.nb"))
{
cout<<"move file from /datacenter/test/tmp/test.nb to /datacenter/data/test.nb failed"<<endl;
}
int FileNum;
hdfsFileInfo *fileinfo;
/*
*Walk an HDFS directory and try to open each entry; if the open fails, try to delete the entry; if it succeeds, fetch the file size
*/
if (NULL != (fileinfo = hdfsListDirectory(m_hdfsfs, "/datacenter/data", &FileNum)))
{
for (int i = 0; i < FileNum; i++)
{
string mname = fileinfo[i].mName; //index instead of advancing the pointer, so fileinfo can be freed afterwards
cout<<"ID:"<<i << ",Name:" << mname <<endl;
hdfsFile hdfsTmpFile = hdfsOpenFile(m_hdfsfs, mname.c_str(), O_RDONLY, 0, 0, 0); //replication is only meaningful for writes, so pass 0
if (!hdfsTmpFile)
{
cout<<"Open file ERROR:" << mname<<endl;
if(0 != hdfsDelete(m_hdfsfs, mname.c_str(),0))
{
cout<<"File in tmp dir open failed, del failed:"<<mname<<endl;
}
continue;
}
//hdfsAvailable returns the number of bytes readable from the current offset; for a freshly opened file this equals its size
long long nTmpSize= hdfsAvailable(m_hdfsfs, hdfsTmpFile);
cout<<"filesize:"<<nTmpSize<<","<<mname<<endl;
hdfsCloseFile(m_hdfsfs, hdfsTmpFile); //close the handle opened for the size check
}
hdfsFreeFileInfo(fileinfo, FileNum); //release the array returned by hdfsListDirectory
}
if ( 0 != hdfsDisconnect(m_hdfsfs) )
{
cout<<__LINE__<<"hdfsDisconnect failed"<<endl;
}
return 0;
}
bool ConnectToHdfs( )
{
struct hdfsBuilder *pbld = hdfsNewBuilder(); //Create an HDFS builder.
hdfsBuilderSetNameNode(pbld,m_strNodeName.c_str()); //Set the Hadoop cluster address on the builder
hdfsBuilderSetNameNodePort(pbld,nPort); //Set the Hadoop cluster port on the builder
hdfsBuilderConfSetStr(pbld,"fs.hdfs.impl","org.apache.hadoop.hdfs.DistributedFileSystem");
//Establish the TCP connection
m_hdfsfs = hdfsBuilderConnect(pbld);
//Bail out before touching the handle if the connection failed
if (!m_hdfsfs)
{
return false;
}
//Probe the root directory to verify the connection is actually usable
return 0 == hdfsExists(m_hdfsfs, "/"); //hdfsExists returns 0 when the path exists
}
Compile & run:
Under the hood, libhdfs uses JNI to access HDFS, so compiling requires not only the HDFS library (shared or static) but also the JVM shared library (libjvm). At run time, the paths of the required jars must be configured in the environment so the JNI layer can load the Java side properly.
Here hdfs.h sits in the current directory and the library files are in ../lib:
ln -sf libhdfs.so.0.0.0 libhdfs.so  # create a symlink so -lhdfs resolves
g++ -g main.cpp -I ./ -L ../lib -lhdfs -L $JAVA_HOME/jre/lib/amd64/server -ljvm -o run
Before running, the environment variables must be configured and loaded; here they are written into a profile file and loaded with source profile.
Contents of the profile file (all the jars it lists can be found in the Hadoop installation):
libdir=/home/cpp/testhdfs/lib
export LD_LIBRARY_PATH=$libdir:$LD_LIBRARY_PATH:${JAVA_HOME}/jre/lib/amd64/server
export CLASSPATH=$CLASSPATH:$libdir/commons-cli-1.2.jar:$libdir/commons-codec-1.4.jar:$libdir/commons-collections-3.2.2.jar:$libdir/commons-configuration-1.6.jar:$libdir/commons-lang-2.6.jar:$libdir/commons-logging-1.1.3.jar:$libdir/guava-11.0.2.jar:$libdir/hadoop-auth-2.7.3.jar:$libdir/hadoop-common-2.7.3.jar:$libdir/hadoop-nfs-2.7.3.jar:$libdir/httpclient-4.2.5.jar:$libdir/httpcore-4.2.5.jar:$libdir/log4j-1.2.17.jar:$libdir/protobuf-java-2.5.0.jar:$libdir/slf4j-api-1.7.10.jar:$libdir/slf4j-log4j12-1.7.10.jar:$libdir/hadoop-hdfs-2.7.3.jar:$libdir/htrace-core-3.1.0-incubating.jar:$libdir/commons-io-2.4.jar
Run output:
Begin to connect hdfs,cur time:1500682597
log4j:WARN No appenders could be found for logger (org.apache.hadoop.metrics2.lib.MutableMetricsFactory).
log4j:WARN Please initialize the log4j system properly.
log4j:WARN See http://logging.apache.org/log4j/1.2/faq.html#noconfig for more info.
30:Connect to hdfs success,cur time:1500682600
success write size:1025
hdfsFlush:0
hdfsCloseFile:0
ID:0,Name:hdfs://localhost:9000/datacenter/data/test.nb
filesize:1025,hdfs://localhost:9000/datacenter/data/test.nb
Problems encountered:
1. If the default configuration fs.hdfs.impl is not overridden (the hdfsBuilderConfSetStr call shown above), connecting fails with:
hdfsBuilderConnect(forceNewInstance=0, nn=localhost, port=9000, kerbTicketCachePath=(NULL), userName=(NULL)) error:
java.io.IOException: No FileSystem for scheme: hdfs
at org.apache.hadoop.fs.FileSystem.getFileSystemClass(FileSystem.java:2660)
at org.apache.hadoop.fs.FileSystem.createFileSystem(FileSystem.java:2667)
at org.apache.hadoop.fs.FileSystem.access$200(FileSystem.java:94)
at org.apache.hadoop.fs.FileSystem$Cache.getInternal(FileSystem.java:2703)
at org.apache.hadoop.fs.FileSystem$Cache.get(FileSystem.java:2685)
2. Missing jars
If the required jars or environment variables are not configured correctly at run time, exceptions such as "class xxx not found" are thrown.