环境:CentOS 6.3, Hadoop 1.1.2, JDK 1.6, Eclipse, Cascading
应用场景:需要把输入文件按某个字段的值分片输出,即每条记录根据该字段的值写入对应的输出目录。
直接上代码:
package core.ebay.subscription;
import cascading.flow.FlowConnector;
import cascading.flow.FlowDef;
import cascading.flow.hadoop.HadoopFlowConnector;
import cascading.pipe.Pipe;
import cascading.scheme.hadoop.TextDelimited;
import cascading.tap.SinkMode;
import cascading.tap.Tap;
import cascading.tap.hadoop.Hfs;
import cascading.tap.hadoop.TemplateTap;
import cascading.tuple.Fields;
/**
 * Cascading job that copies a delimited text file into per-value output
 * directories, partitioned on the {@code month} field.
 *
 * <p>Input lines are split on {@code ;} into the fields {@code year},
 * {@code month} and {@code entry}. Only {@code month} and {@code entry} are
 * written out, using the default delimiter of {@link TextDelimited}. A
 * {@link TemplateTap} places each record in a sub-directory of the output
 * path named after the record's {@code month} value.
 */
public class SequenceFileTest {

    /** Delimiter separating the fields of each input line. */
    private static final String INPUT_DELIMITER = ";";

    /**
     * Path template for the per-record output sub-directory; each {@code %s}
     * is filled from the path fields handed to the {@link TemplateTap}.
     * To partition on more than one field, use a template with one
     * placeholder per field (for example {@code "%s-%s"}) and pass the
     * matching fields to the tap.
     */
    private static final String PATH_TEMPLATE = "%s";

    /**
     * Builds and runs the flow.
     *
     * @param args {@code args[0]} is the input path, {@code args[1]} is the
     *             parent output path under which the per-month directories
     *             are created
     * @throws IllegalArgumentException if fewer than two paths are supplied
     */
    public static void main(String[] args) {
        // Fail fast with a usage hint instead of a bare
        // ArrayIndexOutOfBoundsException when a path is missing.
        if (args == null || args.length < 2) {
            throw new IllegalArgumentException(
                    "Usage: SequenceFileTest <input path> <output path>");
        }

        // Source: ';'-delimited text with three named fields.
        TextDelimited sourceScheme =
                new TextDelimited(new Fields("year", "month", "entry"), INPUT_DELIMITER);
        Hfs sourceTap = new Hfs(sourceScheme, args[0]);

        // Sink parent: only "month" and "entry" are written.
        TextDelimited sinkScheme = new TextDelimited(new Fields("month", "entry"));
        Hfs sinkParentTap = new Hfs(sinkScheme, args[1]);

        // Route every record to a sub-directory derived from its "month" value.
        Tap partitionedSink = new TemplateTap(
                sinkParentTap, PATH_TEMPLATE, new Fields("month"), SinkMode.REPLACE);

        // A single pass-through pipe connects the source to the partitioned sink.
        Pipe inputPipe = new Pipe("inputPipe");
        FlowDef flowDef = FlowDef.flowDef()
                .addSource(inputPipe, sourceTap)
                .addTailSink(inputPipe, partitionedSink);

        FlowConnector flowConnector = new HadoopFlowConnector();
        flowConnector.connect(flowDef).complete();
    }
}
inputFile:
2013;10;132131
2013;10;13213fsfsdfd
2013;10;13213fsfsdfdfsadf
2014;09;sfdsfd
outputFilePath(以 month 为 10 的目录为例):
10:
10 132131
10 13213fsfsdfd
10 13213fsfsdfdfsadf