Hadoop Streaming provides options for specifying the input and output formats:
- -inputformat TextInputFormat(default)|SequenceFileAsTextInputFormat|JavaClassName Optional.
- -outputformat TextOutputFormat(default)|JavaClassName Optional.
If you merge your own class files into the streaming jar, for example:
- jar uf ../../contrib/streaming/hadoop-streaming-1.0.1.jar org/apache/hadoop/streaming/*.class
then you can pass the class name directly after -inputformat.
The following example illustrates this by implementing a Map input <key,value> where the key is the file name and the value is the entire content of the document:
1. Define your own InputFormat:
ContentRecordReder.java
- package org.apache.hadoop.streaming;
- import java.io.IOException;
- import org.apache.commons.logging.Log;
- import org.apache.commons.logging.LogFactory;
- import org.apache.hadoop.conf.Configuration;
- import org.apache.hadoop.fs.FSDataInputStream;
- import org.apache.hadoop.fs.FileSystem;
- import org.apache.hadoop.fs.Path;
- import org.apache.hadoop.io.Text;
- import org.apache.hadoop.io.compress.CompressionCodecFactory;
- import org.apache.hadoop.mapred.FileSplit;
- import org.apache.hadoop.mapred.RecordReader;
- public class ContentRecordReder implements RecordReader<Text,Text> {
- private static final Log LOG = LogFactory.getLog(ContentRecordReder.class.getName());
- private CompressionCodecFactory compressionCodecs = null;
- private long start;
- private long pos;
- private long end;
- private byte[] buffer;
- private String keyName;
- private FSDataInputStream fileIn;
- public ContentRecordReder(Configuration job,FileSplit split) throws IOException{
- start = split.getStart(); // each file arrives as a single split, because isSplitable() below returns false
- end = split.getLength() + start;
- final Path path = split.getPath();
- keyName = path.toString();
- LOG.info("filename in hdfs is : " + keyName);
- System.out.println("filename in hdfs is : " + keyName);
- final FileSystem fs = path.getFileSystem(job);
- fileIn = fs.open(path);
- fileIn.seek(start);
- buffer = new byte[(int)(end - start)];
- this.pos = start;
- }
- public Text createKey() {
- return new Text();
- }
- public Text createValue() {
- return new Text();
- }
- public long getPos() throws IOException{
- return pos;
- }
- public float getProgress() {
- if (start == end) {
- return 0.0f;
- } else {
- return Math.min(1.0f, (pos - start) / (float)(end - start));
- }
- }
- public boolean next(Text key, Text value) throws IOException{
- while(pos < end) {
- key.set(keyName);
- value.clear();
- fileIn.readFully(pos,buffer);
- value.set(buffer);
- LOG.info("---内容: " + value.toString());
- System.out.println("---内容: " + value.toString());
- pos += buffer.length;
- LOG.info("end is : " + end + " pos is : " + pos);
- return true;
- }
- return false;
- }
- public void close() throws IOException{
- if(fileIn != null) {
- fileIn.close();
- }
- }
- }
ContentInputFormat.java
- package org.apache.hadoop.streaming;
- import java.io.IOException;
- import org.apache.hadoop.fs.FileSystem;
- import org.apache.hadoop.fs.Path;
- import org.apache.hadoop.io.Text;
- import org.apache.hadoop.io.compress.CompressionCodecFactory;
- import org.apache.hadoop.mapred.FileSplit;
- import org.apache.hadoop.mapred.JobConf;
- import org.apache.hadoop.mapred.JobConfigurable;
- import org.apache.hadoop.mapred.Reporter;
- import org.apache.hadoop.mapred.InputSplit;
- import org.apache.hadoop.mapred.RecordReader;
- import org.apache.hadoop.mapred.FileInputFormat;
- public class ContentInputFormat extends FileInputFormat<Text,Text> implements JobConfigurable { // must implement JobConfigurable, otherwise configure() is never called
- private long mySplitSize = 1024*1024;
- private CompressionCodecFactory compressionCodecs = null;
- public void configure(JobConf conf) {
- compressionCodecs = new CompressionCodecFactory(conf);
- }
- /**
- * @brief isSplitable Do not split the file; the whole file must be processed as one record
- *
- * @param fs
- * @param file
- *
- * @return false
- */
- protected boolean isSplitable(FileSystem fs, Path file) {
- return false;
- }
- public RecordReader<Text,Text> getRecordReader(InputSplit genericSplit,
- JobConf job, Reporter reporter) throws IOException{
- reporter.setStatus(genericSplit.toString());
- return new ContentRecordReder(job, (FileSplit)genericSplit);
- }
- }
2. Compile:
- javac -classpath ~/hadoop-1.0.1/hadoop-core-1.0.1.jar:~/hadoop-1.0.1/lib/*:./content-record-reader.jar -Xlint:deprecation ./*.java
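Note: step 3 below updates the jar from org/apache/hadoop/streaming/*.class, so the compiled classes must sit in that package directory. If the sources are compiled from a flat directory, adding -d . makes javac create the package layout (a sketch using the same classpath as above):
- javac -classpath ~/hadoop-1.0.1/hadoop-core-1.0.1.jar:~/hadoop-1.0.1/lib/*:./content-record-reader.jar -d . -Xlint:deprecation ./*.java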
3. Add the compiled classes to the streaming jar:
- jar uf ../../contrib/streaming/hadoop-streaming-1.0.1.jar org/apache/hadoop/streaming/*.class
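Before submitting the job it is worth confirming that the classes really landed in the jar; listing the archive should show them (a simple sanity check using the same jar path):
- jar tf ../../contrib/streaming/hadoop-streaming-1.0.1.jar | grep Content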
4. Mapper.cpp:
- #include <iostream>
- #include <string>
- using namespace std;
- int main()
- {
- string key,value;
- char ch;
- cin>>key; // the first whitespace-delimited token is the key (the file name)
- value = "";
- while(cin>>ch){ // operator>> already fails at EOF; it also skips whitespace, so the document body is re-joined without spaces or newlines
- value.append(1,ch);
- }
- cout<<key<<"\t"<<value<<endl;
- return 0;
- }
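The mapper can be compiled and smoke-tested locally by piping in a line shaped the way streaming delivers it, key, then a tab, then the value (a quick sketch; g++ and the sample line are my own, not part of the original setup):
- g++ -o Mapper Mapper.cpp
- printf 'doc1\thello world' | ./Mapper
This should print doc1, a tab, and helloworld; the space disappears because operator>> skips whitespace.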
5. Reducer.cpp:
- #include <iostream>
- #include <map>
- #include <string>
- using namespace std;
- int main() {
- map<string,string> wordMap;
- map<string,string>::iterator it;
- string key;
- string value;
- while(cin>>key>>value) {
- // the document content in value can be processed here...
- wordMap[key] +=value;
- }
- for(it=wordMap.begin();it != wordMap.end();it++) { // emit results
- cout<<it->first<<"\t"<<it->second<<endl;
- }
- return 0;
- }
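The reducer can be smoke-tested the same way; two lines sharing a key should come out concatenated (again a local sketch with made-up sample data):
- g++ -o Reducer Reducer.cpp
- printf 'doc1\tfoo\ndoc1\tbar\n' | ./Reducer
Expected output is doc1 followed by a tab and foobar.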
6. Run the streaming job:
- bin/hadoop jar contrib/streaming/hadoop-streaming-1.0.1.jar \
- -mapper /home/guoguo/hadoop-1.0.1/cTest/C++/Mapper \
- -file /home/guoguo/hadoop-1.0.1/cTest/C++/Mapper \
- -inputformat ContentInputFormat \
- -reducer /home/guoguo/hadoop-1.0.1/cTest/C++/Reducer \
- -file /home/guoguo/hadoop-1.0.1/cTest/C++/Reducer \
- -input input \
- -output stream-output
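This assumes the documents are already in the input directory on HDFS (uploaded with bin/hadoop fs -put, for instance). Once the job finishes, the result can be inspected with:
- bin/hadoop fs -cat stream-output/part-*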
7. Done!
8. Addendum: if the documents are in XML format, you can use the built-in StreamXmlRecordReader via hadoop-streaming.jar's -inputreader option, e.g.:
- -inputreader "StreamXmlRecordReader,begin=<Store>,end=</Store>"
where "<Store>" and "</Store>" are the start and end tags that delimit each record in the XML file.
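For reference, a complete command might look roughly like this (a sketch only: xmlMapper, xmlReducer, xml-input and xml-output are hypothetical names, and StreamXmlRecordReader ships inside the streaming jar, so nothing extra has to be added):
- bin/hadoop jar contrib/streaming/hadoop-streaming-1.0.1.jar \
- -inputreader "StreamXmlRecordReader,begin=<Store>,end=</Store>" \
- -mapper ./xmlMapper -file ./xmlMapper \
- -reducer ./xmlReducer -file ./xmlReducer \
- -input xml-input \
- -output xml-output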