1. Using the C API to HDFS provided by Hadoop to write files into HDFS. Most of the time went into setting up the environment.
Official documentation: http://hadoop.apache.org/common/docs/r0.20.0/libhdfs.html
The available functions are declared in hdfs.h, which can be found in the extracted Hadoop distribution.
The beginner's example in any language is usually "Hello, World", so here is the code for writing a file to HDFS:
#include "hdfs.h"
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

int main(int argc, char **argv) {
    hdfsFS fs = hdfsConnect("192.168.1.8", 9000); // pass NULL as the host to connect to the local filesystem instead
    if (!fs) {
        fprintf(stderr, "Failed to connect to hdfs!\n");
        exit(-1);
    }
    const char* writePath = "/tmp/testfile.txt";
    hdfsFile writeFile = hdfsOpenFile(fs, writePath, O_WRONLY|O_CREAT, 0, 0, 0);
    if (!writeFile) {
        fprintf(stderr, "Failed to open %s for writing!\n", writePath);
        exit(-1);
    }
    char* buffer = "Hello, World!";
    tSize num_written_bytes = hdfsWrite(fs, writeFile, (void*)buffer, strlen(buffer)+1);
    if (hdfsFlush(fs, writeFile)) {
        fprintf(stderr, "Failed to 'flush' %s\n", writePath);
        exit(-1);
    }
    hdfsCloseFile(fs, writeFile);
    hdfsDisconnect(fs);
    return 0;
}
Then ssh to the 192.168.1.8 machine and check whether the file was written: bin/hadoop fs -ls /tmp, or copy it to the local filesystem for inspection: bin/hadoop fs -get /tmp/testfile.txt /home/test
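The same check can also be done from C. A minimal sketch, assuming an open hdfsFS handle named fs like the one in the examples here (hdfsListDirectory and hdfsFreeFileInfo are part of libhdfs):
// List /tmp through libhdfs to confirm the file landed there
int numEntries = 0;
hdfsFileInfo* entries = hdfsListDirectory(fs, "/tmp", &numEntries);
if (entries) {
    for (int i = 0; i < numEntries; ++i) {
        printf("%s\t%ld bytes\n", entries[i].mName, (long)entries[i].mSize);
    }
    hdfsFreeFileInfo(entries, numEntries);
}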
Below is the code for reading the HDFS file back:
#include "hdfs.h"
#include <stdio.h>
#include <stdlib.h>

int main(int argc, char **argv) {
    hdfsFS fs = hdfsConnect("192.168.1.8", 9000);
    if (!fs) {
        printf("Failed to connect to hdfs!\n");
        exit(-1);
    }
    const char* rfile = "/tmp/testfile.txt";
    // stat the file first so we know how large a buffer to allocate
    hdfsFileInfo* info = hdfsGetPathInfo(fs, rfile);
    if (!info) {
        printf("Failed to stat %s!\n", rfile);
        exit(-2);
    }
    tOffset fileSize = info->mSize;
    hdfsFreeFileInfo(info, 1);
    hdfsFile readFile = hdfsOpenFile(fs, rfile, O_RDONLY, 0, 0, 0);
    if (!readFile) {
        printf("Failed to open %s for reading!\n", rfile);
        exit(-2);
    }
    // allocate a buffer the size of the file, plus room for a terminating NUL
    char* buffer = malloc(fileSize + 1);
    if (buffer == NULL) {
        return -2;
    }
    // read the whole file into the buffer
    tSize curSize = hdfsRead(fs, readFile, (void*)buffer, (tSize)fileSize);
    if (curSize < 0) {
        printf("Failed to read %s!\n", rfile);
        exit(-2);
    }
    buffer[curSize] = '\0';
    printf("%s\n", buffer);
    free(buffer);
    hdfsCloseFile(fs, readFile);
    hdfsDisconnect(fs);
    return 0;
}
References:
http://www.itpub.net/thread-1423369-1-1.html#
2. Using the Java API to read HDFS files.
For details, see the post 利用JavaAPI访问HDFS的文件 (accessing files on HDFS with the Java API).
Either approach can read and write HDFS files; what follows is the requirement from the company where I interned.
After each write the reader is notified to go read the file. The contents only become readable after the file has been closed, so the file is opened in append mode for each batch and closed again afterwards.
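The code below relies on globals and helpers defined elsewhere in the project (fs, iucsWritePath, iucsWriteFile, BLOCKSIZE, requestqueue, PutRequestIntoQueue, IUCSCDRTYPE). A minimal sketch of what the notification record might look like, with hypothetical field names chosen only for illustration:
/* Hypothetical shape of a read request; the real project defines its own.
 * PutRequestIntoQueue(queue, path, offsetEnd, structLength, dataType, recordCount)
 * is assumed to fill in one of these and push it onto a thread-safe queue
 * that the reader thread polls. */
typedef struct {
    char filePath[256];  /* HDFS file that was just written */
    long offsetEnd;      /* size of the file after this write; read up to here */
    int  structLength;   /* size of one binary record */
    int  dataType;       /* e.g. IUCSCDRTYPE */
    int  recordCount;    /* number of records in this batch */
} ReadRequest;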
void WriteToHDFS(void* buffer, int readLength, int structLength)
{
    long dataSize = readLength * structLength; // number of structs taken from the queue times the struct size
    printf("WriteToHDFS : dataSize is %ld, structLength is %d\n", dataSize, structLength);
    iucsWriteFile = hdfsOpenFile(fs, iucsWritePath, O_WRONLY|O_APPEND, 0, 0, 0);
    int fileSize = hdfsTell(fs, iucsWriteFile);
    int num = (BLOCKSIZE - fileSize) / structLength; // how many more structs the currently open file can hold; BLOCKSIZE = 64 MB
    if (dataSize < BLOCKSIZE) {
        if (num > readLength) { // the current file can hold everything: append it all to the existing file
            hdfsWrite(fs, iucsWriteFile, (void*)buffer, dataSize);
            if (hdfsFlush(fs, iucsWriteFile))
            {
                printf("Failed to 'flush' %s\n", iucsWritePath);
                return;
            }
            hdfsCloseFile(fs, iucsWriteFile);
            PutRequestIntoQueue(requestqueue, iucsWritePath, fileSize + dataSize, structLength, IUCSCDRTYPE, readLength); // notify the read queue
        } else { // the current file cannot hold everything: first fill it with as much as fits
            // write old file
            hdfsWrite(fs, iucsWriteFile, (void*)buffer, num * structLength);
            if (hdfsFlush(fs, iucsWriteFile))
            {
                printf("Failed to 'flush' %s\n", iucsWritePath);
                return;
            }
            hdfsCloseFile(fs, iucsWriteFile);
            PutRequestIntoQueue(requestqueue, iucsWritePath, fileSize + num * structLength, structLength, IUCSCDRTYPE, num);
            // write new file
            if ((readLength - num) * structLength > BLOCKSIZE) {
                printf("--------iucs big data come--------\n");
            }
            memset(iucsWritePath, 0, sizeof(iucsWritePath) / sizeof(char));
            strcat(iucsWritePath, IUCS_FILEPATH);
            getCurrTime();
            strcat(iucsWritePath, currTime);
            strcat(iucsWritePath, ".dat");
            iucsWriteFile = hdfsOpenFile(fs, iucsWritePath, O_WRONLY|O_CREAT, 0, 0, 0);
            if (!iucsWriteFile)
            {
                printf("Failed to open %s for writing!\n", iucsWritePath);
                return;
            }
            // then write the remaining records into the new file
            hdfsWrite(fs, iucsWriteFile, (void*)((char*)buffer + num * structLength), (readLength - num) * structLength);
            if (hdfsFlush(fs, iucsWriteFile))
            {
                printf("Failed to 'flush' %s\n", iucsWritePath);
                return;
            }
            hdfsCloseFile(fs, iucsWriteFile);
            PutRequestIntoQueue(requestqueue, iucsWritePath, (readLength - num) * structLength, structLength, IUCSCDRTYPE, (readLength - num));
        }
    }
    else
    { // if the dequeued data itself exceeds 64 MB, only a warning is printed here; it could also be split recursively across smaller files
        printf("iucs big data come\n");
        //WriteBlockHDFS(buffer, readLength, structLength);
    }
}
For large payloads, the content is written out recursively, one block-sized file at a time:
// dataSize > BLOCKSIZE
void WriteBlockHDFS(void* buffer, int readLength, int structLength) {
    long dataSize = readLength * structLength;
    biccWriteFile = hdfsOpenFile(fs, biccWritePath, O_WRONLY|O_APPEND, 0, 0, 0);
    int fileSize = hdfsTell(fs, biccWriteFile);
    int num = (BLOCKSIZE - fileSize) / structLength;
    // write old file
    hdfsWrite(fs, biccWriteFile, (void*)buffer, num * structLength);
    if (hdfsFlush(fs, biccWriteFile))
    {
        printf("Failed to 'flush' %s\n", biccWritePath);
        return;
    }
    hdfsCloseFile(fs, biccWriteFile);
    // write new file
    cntFile++;
    memset(biccWritePath, 0, sizeof(biccWritePath) / sizeof(char));
    strcat(biccWritePath, BICC_FILEPATH);
    char str[5] = {0};
    sprintf(str, "%d", cntFile);
    strcat(biccWritePath, str);
    strcat(biccWritePath, ".txt");
    biccWriteFile = hdfsOpenFile(fs, biccWritePath, O_WRONLY|O_CREAT, 0, 0, 0);
    if (!biccWriteFile)
    {
        printf("Failed to open %s for writing!\n", biccWritePath);
        return;
    }
    if ((dataSize - num * structLength) > BLOCKSIZE) {
        // the remainder is still larger than one block: recurse on the unwritten part
        hdfsCloseFile(fs, biccWriteFile);
        WriteBlockHDFS((void*)((char*)buffer + num * structLength), readLength - num, structLength);
    }
    else
    {
        // the remainder fits: write it into the freshly created file
        hdfsWrite(fs, biccWriteFile, (void*)((char*)buffer + num * structLength), dataSize - num * structLength);
        if (hdfsFlush(fs, biccWriteFile))
        {
            printf("Failed to 'flush' %s\n", biccWritePath);
            return;
        }
        hdfsCloseFile(fs, biccWriteFile);
    }
}
Note: the struct data is stored in binary form, so it must also be read back as whole structs; otherwise the data read out will be wrong.
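As an illustration, here is a minimal sketch of the reader side. The record type IucsRecord and its fields are hypothetical placeholders whose size must match the structLength used when writing; hdfsPread is a real libhdfs call that reads from an absolute offset.
#include "hdfs.h"
#include <stdio.h>

/* Hypothetical record layout; must match the struct that was written. */
typedef struct {
    int  id;
    char payload[60];
} IucsRecord;

/* Read 'recordCount' records that end at byte offset 'offsetEnd' in 'path'
 * and return how many were actually read, or -1 on error. */
int ReadRecords(hdfsFS fs, const char* path, long offsetEnd,
                int recordCount, IucsRecord* out)
{
    hdfsFile readFile = hdfsOpenFile(fs, path, O_RDONLY, 0, 0, 0);
    if (!readFile) {
        printf("Failed to open %s for reading!\n", path);
        return -1;
    }
    long start = offsetEnd - (long)recordCount * sizeof(IucsRecord);
    /* hdfsPread reads from an absolute offset without moving the file position */
    tSize n = hdfsPread(fs, readFile, start, (void*)out,
                        recordCount * sizeof(IucsRecord));
    hdfsCloseFile(fs, readFile);
    return (n < 0) ? -1 : (int)(n / sizeof(IucsRecord));
}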
Files are named by the current time; here is the code that builds the timestamp string:
#include <stdio.h>
#include <string.h>
#include <time.h>

char currTime[20] = {0};

// append the decimal form of n to currTime
void intTochar(int n) {
    char tempTime[8] = {0}; // large enough for the 4-digit year plus the terminating NUL
    sprintf(tempTime, "%d", n);
    strcat(currTime, tempTime);
}

void getCurrTime() {
    time_t nowTime = time(NULL);
    struct tm* timeinfo = localtime(&nowTime);
    memset(currTime, 0, sizeof(currTime) / sizeof(char));
    intTochar(1900 + timeinfo->tm_year); // tm_year counts from 1900
    intTochar(1 + timeinfo->tm_mon);     // tm_mon counts from 0
    intTochar(timeinfo->tm_mday);
    intTochar(timeinfo->tm_hour);
    intTochar(timeinfo->tm_min);
    intTochar(timeinfo->tm_sec);
}
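A tiny test driver (not part of the original code) shows the resulting names; note that fields below 10 are not zero-padded, so 2012-06-05 09:03:07 produces 201265937:
int main(void) {
    getCurrTime();
    printf("%s.dat\n", currTime); /* e.g. 201265937.dat for 2012-06-05 09:03:07 */
    return 0;
}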