Using a custom InputFormat and OutputFormat with Hadoop Streaming

Hadoop Streaming has options for specifying the input and output format classes:

-inputformat TextInputFormat(default)|SequenceFileAsTextInputFormat|JavaClassName Optional.
-outputformat TextOutputFormat(default)|JavaClassName  Optional.
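
For example, a job that reads SequenceFiles as text and writes plain text output could be launched as follows (the input/output paths and the /bin/cat mapper/reducer are only placeholders for illustration):

bin/hadoop jar contrib/streaming/hadoop-streaming-1.0.1.jar \
-input input \
-output output \
-mapper /bin/cat \
-reducer /bin/cat \
-inputformat SequenceFileAsTextInputFormat \
-outputformat TextOutputFormat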


However, since release 0.14 Hadoop no longer supports passing multiple jar files to a job, so to use your own InputFormat or OutputFormat you have to add the corresponding class files to hadoop-streaming-1.0.1.jar, for example:

jar uf ../../contrib/streaming/hadoop-streaming-1.0.1.jar org/apache/hadoop/streaming/*.class


The class name can then be passed directly to -inputformat.
The following example walks through this by implementing a Map input <key,value> whose key is the file name and whose value is the entire content of the document:
1. Define your own InputFormat:
ContentRecordReder.java

package org.apache.hadoop.streaming;

import java.io.IOException;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.compress.CompressionCodecFactory;
import org.apache.hadoop.mapred.FileSplit;
import org.apache.hadoop.mapred.RecordReader;

public class ContentRecordReder implements RecordReader<Text,Text> {
	private static final Log LOG = LogFactory.getLog(ContentRecordReder.class.getName());  
    private CompressionCodecFactory compressionCodecs = null;  
    private long start;  
    private long pos;  
    private long end;  
    private byte[] buffer;  
    private String keyName;  
    private FSDataInputStream fileIn;  
      
    public ContentRecordReder(Configuration job,FileSplit split) throws IOException{  
        start = split.getStart(); // each file is handled as a single split, since isSplitable() below returns false
        end = split.getLength() + start;
        final Path path = split.getPath();
        keyName = path.toString();  
        LOG.info("filename in hdfs is : " + keyName);  
        System.out.println("filename in hdfs is : " + keyName);
        final FileSystem fs = path.getFileSystem(job);  
        fileIn = fs.open(path);  
        fileIn.seek(start);  
        buffer = new byte[(int)(end - start)];  
        this.pos = start;

    }  
  
    public Text createKey() {  
        return new Text();  
    }  
  
    public Text createValue() {  
        return new Text();  
    }  
  
    public long getPos() throws IOException{  
        return pos;  
    }  
  
    public float getProgress() {  
        if (start == end) {  
            return 0.0f;  
        } else {  
            return Math.min(1.0f, (pos - start) / (float)(end - start));  
        }  
    }  
  
    public boolean next(Text key, Text value) throws IOException{  
        while(pos < end) {  
            key.set(keyName);  
            value.clear();  
            fileIn.readFully(pos,buffer);  
            value.set(buffer);  
            LOG.info("---内容: " + value.toString());  
            System.out.println("---内容: " + value.toString());
            pos += buffer.length;  
            LOG.info("end is : " + end  + " pos is : " + pos);  
            return true;  
        }  
        return false;  
    }  
  
    public void close() throws IOException{  
        if(fileIn != null) {  
            fileIn.close();  
        }  
          
    }  
}

ContentInputFormat.java

package org.apache.hadoop.streaming;

import java.io.IOException;

import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.compress.CompressionCodecFactory;
import org.apache.hadoop.mapred.FileSplit;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.JobConfigurable;
import org.apache.hadoop.mapred.Reporter;
import org.apache.hadoop.mapred.InputSplit;
import org.apache.hadoop.mapred.RecordReader;
import org.apache.hadoop.mapred.FileInputFormat;

public class ContentInputFormat extends FileInputFormat<Text,Text>{
	private long mySplitSize = 1024*1024;
	private CompressionCodecFactory compressionCodecs = null;  
    public void configure(JobConf conf) {  
        compressionCodecs = new CompressionCodecFactory(conf);  
    }
    
    /**
     * @brief isSplitable: never split a file; each document must be processed as a whole
     *
     * @param fs
     * @param file
     *
     * @return false
     */
    protected boolean isSplitable(FileSystem fs, Path file) {  
        return false; 
    }  
  
    public RecordReader<Text,Text> getRecordReader(InputSplit genericSplit,  
                            JobConf job, Reporter reporter) throws IOException{  
        reporter.setStatus(genericSplit.toString());  
        return new ContentRecordReder(job, (FileSplit) genericSplit);
    }

}
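
After compiling (step 2), the class files have to end up under the package directory that the jar command in step 3 refers to, i.e. the layout should look like this:

org/apache/hadoop/streaming/ContentRecordReder.class
org/apache/hadoop/streaming/ContentInputFormat.class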

2. Compile

javac -classpath $HOME/hadoop-1.0.1/hadoop-core-1.0.1.jar:$HOME/hadoop-1.0.1/lib/*:./content-record-reader.jar -Xlint:deprecation -d . ./*.java

3. Add the compiled classes to the streaming jar

jar uf ../../contrib/streaming/hadoop-streaming-1.0.1.jar org/apache/hadoop/streaming/*.class
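
To double-check that the classes actually made it into the jar, something like the following can be used:

jar tf ../../contrib/streaming/hadoop-streaming-1.0.1.jar | grep Content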

4. Mapper.cpp

#include <iostream>
#include <string>
using namespace std;


int main()
{
	string key, value;
	char ch;
	// streaming feeds each record as "key \t value" on stdin;
	// with ContentInputFormat the key is the file name
	cin >> key;
	value = "";
	// note: operator>> skips whitespace, so spaces and line breaks in the
	// document body are dropped while the remaining characters are collected
	while (cin >> ch) {
		value.append(1, ch);
	}
	cout << key << "\t" << value << endl;

	return 0;
}

5. Reducer.cpp

#include <iostream>
#include <map>

using namespace std;

int main() {
    map<string,string> wordMap;
    map<string,string>::iterator it;
    string key;
    string value;

    while(cin >> key >> value) {
        // the document content (value) could be processed further here...
        wordMap[key] += value;
    }

    for(it = wordMap.begin(); it != wordMap.end(); it++) { // emit one line per file
        cout << it->first << "\t" << it->second << endl;
    }
    return 0;
}
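
Both programs have to be compiled into the Mapper and Reducer executables that the run command in step 6 points at, e.g.:

g++ -o Mapper Mapper.cpp
g++ -o Reducer Reducer.cpp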

6. Run

bin/hadoop jar contrib/streaming/hadoop-streaming-1.0.1.jar \
-mapper /home/guoguo/hadoop-1.0.1/cTest/C++/Mapper \
-file /home/guoguo/hadoop-1.0.1/cTest/C++/Mapper \
-inputformat ContentInputFormat \
-reducer /home/guoguo/hadoop-1.0.1/cTest/C++/Reducer \
-file /home/guoguo/hadoop-1.0.1/cTest/C++/Reducer \
-input input \
-output stream-output
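
Once the job finishes, the result can be inspected with something like the following (the part file name depends on the number of reduce tasks):

bin/hadoop fs -cat stream-output/part-00000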

7. Done.

8. Addendum: if the documents are in XML format, you can use StreamXmlRecordReader instead. This is done through the -inputreader option of hadoop-streaming.jar, e.g.:

-inputreader  "StreamXmlRecordReader,begin=<Store>,end=</Store>"

where "<Store>" and "</Store>" are the opening and closing tags that delimit each record in the XML file.
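
Plugged into a full job, the option sits alongside the usual streaming arguments, roughly like this (the input/output paths and the /bin/cat mapper/reducer are only placeholders):

bin/hadoop jar contrib/streaming/hadoop-streaming-1.0.1.jar \
-input xml-input \
-output xml-output \
-mapper /bin/cat \
-reducer /bin/cat \
-inputreader "StreamXmlRecordReader,begin=<Store>,end=</Store>"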

9. References

http://hadoop.apache.org/common/docs/r1.0.1/streaming.html

http://dongxicheng.org/mapreduce/hadoop-streaming-advanced-programming/

http://blog.csdn.net/j3smile/article/details/7371209

http://blog.csdn.net/anbo724/article/details/6955175


