假设文件test4.txt有这么两行数据:
1980080113312121212018
1985080113313131313023
规则是前8位为年月日,中间11位为手机号码,后3位表示的是年龄。
我们可以自定义一个加载函数(Load UDF)来加载这个文件:
package com.besttone.pig.udf.load;
import java.io.IOException;
import java.util.List;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.InputFormat;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.RecordReader;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.pig.LoadFunc;
import org.apache.pig.backend.executionengine.ExecException;
import org.apache.pig.backend.hadoop.executionengine.mapReduceLayer.PigSplit;
import org.apache.pig.data.DataByteArray;
import org.apache.pig.data.Tuple;
import org.apache.pig.data.TupleFactory;
/**
 * Pig load function that cuts every input line into fixed-position fields.
 *
 * <p>The constructor takes a cut pattern such as {@code "0-8,8-19,19-22"}, which is
 * parsed by {@link Range#parse(String)}. Each range yields one field of the tuple
 * returned by {@link #getNext()}, stored as a {@link DataByteArray}.
 */
public class CutLoadFunc extends LoadFunc {
    protected static final Log log = LogFactory.getLog(CutLoadFunc.class);

    /** Ranges to cut from each line, in tuple field order. */
    private final List<Range> ranges;
    /** Reader handed over by Pig in {@link #prepareToRead(RecordReader, PigSplit)}. */
    private RecordReader reader;
    private final TupleFactory tupleFactory = TupleFactory.getInstance();

    /**
     * Creates a loader for the given cut pattern.
     *
     * @param cutPattern comma-separated {@code start-end} ranges
     * @throws IOException if {@link Range#parse(String)} rejects the pattern
     */
    public CutLoadFunc(String cutPattern) throws IOException {
        ranges = Range.parse(cutPattern);
    }

    /**
     * Registers {@code location} as the input path of {@code job}.
     */
    @Override
    public void setLocation(String location, Job job) throws IOException {
        FileInputFormat.setInputPaths(job, location);
    }

    /**
     * Returns a new {@link TextInputFormat}; the value of each record is cast to
     * {@link Text} in {@link #getNext()}.
     */
    @Override
    public InputFormat getInputFormat() throws IOException {
        return new TextInputFormat();
    }

    /**
     * Stores the record reader that {@link #getNext()} pulls lines from.
     */
    @Override
    public void prepareToRead(RecordReader reader, PigSplit split)
            throws IOException {
        this.reader = reader;
    }

    /**
     * Reads the next line and cuts it into one field per configured range.
     *
     * <p>A range whose end lies beyond the end of the line is logged as a warning
     * and its field is never set on the tuple.
     *
     * @return the tuple for the next line, or {@code null} when the reader has no
     *         more records
     * @throws IOException if reading fails; an interruption is reported as an
     *         {@link ExecException} after the interrupt flag has been restored
     */
    @Override
    public Tuple getNext() throws IOException {
        try {
            if (!reader.nextKeyValue()) {
                return null;
            }
            Text value = (Text) reader.getCurrentValue();
            String line = value.toString();
            Tuple tuple = tupleFactory.newTuple(ranges.size());
            for (int i = 0; i < ranges.size(); i++) {
                Range range = ranges.get(i);
                if (range.getEnd() > line.length()) {
                    log.warn(String.format(
                            "Range end (%s) is longer than line length (%s)",
                            range.getEnd(), line.length()));
                    continue;
                }
                tuple.set(i, new DataByteArray(range.getSubstring(line)));
            }
            return tuple;
        } catch (InterruptedException e) {
            // Restore the interrupt flag so code further up the stack can still
            // observe that this thread was asked to stop.
            Thread.currentThread().interrupt();
            throw new ExecException(e);
        }
    }
}
package com.besttone.pig.udf.load;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
/**
 * A character range within a line of text, with {@code start} inclusive and
 * {@code end} exclusive, matching {@link String#substring(int, int)}.
 */
public class Range {
    protected static final Log log = LogFactory.getLog(Range.class);

    /** Index of the first character of the range (inclusive). */
    private int start;
    /** Index just past the last character of the range (exclusive). */
    private int end;

    public int getStart() {
        return start;
    }

    public void setStart(int start) {
        this.start = start;
    }

    public int getEnd() {
        return end;
    }

    public void setEnd(int end) {
        this.end = end;
    }

    /**
     * Parses a cut pattern such as {@code "0-8,8-19,19-22"} into a list of ranges.
     *
     * <p>Ranges are separated by commas and each range is written as
     * {@code start-end}. Whitespace around the numbers is ignored. A range is
     * rejected when it does not consist of exactly two integers or when its start
     * is greater than its end, so that a bad pattern fails here rather than on the
     * first line that is cut.
     *
     * @param cutPattern the comma-separated list of {@code start-end} ranges
     * @return the parsed ranges, in the order they appear in the pattern
     * @throws IOException if the pattern is {@code null}, contains no range, or
     *         contains a malformed range
     */
    public static List<Range> parse(String cutPattern) throws IOException {
        if (cutPattern == null) {
            throw new IOException("cutPattern参数不合法: null");
        }
        String[] rangelist = cutPattern.split(",");
        if (rangelist.length == 0) {
            throw new IOException("cutPattern参数不合法: " + cutPattern);
        }
        List<Range> list = new ArrayList<Range>(rangelist.length);
        for (String rangestr : rangelist) {
            String[] bounds = rangestr.split("-");
            if (bounds.length != 2) {
                throw new IOException("cutPattern参数不合法: " + rangestr);
            }
            Range range = new Range();
            try {
                range.setStart(Integer.parseInt(bounds[0].trim()));
                range.setEnd(Integer.parseInt(bounds[1].trim()));
            } catch (NumberFormatException e) {
                // Keep the cause so the offending number shows up in the trace.
                throw new IOException("cutPattern参数不合法: " + rangestr, e);
            }
            if (range.getStart() > range.getEnd()) {
                throw new IOException("cutPattern参数不合法: " + rangestr);
            }
            list.add(range);
        }
        return list;
    }

    /**
     * Returns the part of {@code line} covered by this range.
     *
     * @param line the text to cut
     * @return the characters from {@code start} (inclusive) to {@code end}
     *         (exclusive), or {@code null}, after logging a warning, when
     *         {@code end} lies beyond the end of the line
     * @throws StringIndexOutOfBoundsException if {@code start} is negative or
     *         greater than {@code end}
     */
    public String getSubstring(String line) {
        if (this.end > line.length()) {
            log.warn(String.format(
                    "Range end (%s) is longer than line length (%s)", this.end,
                    line.length()));
            return null;
        }
        return line.substring(this.start, this.end);
    }

    /**
     * Small manual check: prints characters 19-22 of a sample line.
     */
    public static void main(String[] args) {
        Range range = new Range();
        range.setStart(19);
        range.setEnd(22);
        System.out.println(range.getSubstring("1980080113312121212018"));
    }
}
将这两个类打成jar包,然后进入pig grunt,执行以下脚本:
-- Make the CutLoadFunc and Range classes available to Pig.
REGISTER besttonePigUDF.jar;
-- Cut every line of test4.txt at character ranges 0-8, 8-19 and 19-22.
a = LOAD 'test4.txt'
    USING com.besttone.pig.udf.load.CutLoadFunc('0-8,8-19,19-22')
    AS (date:chararray, phone:chararray, age:chararray);
DUMP a;
可以看到内容被成功地加载:
(19800801,13312121212,018)
(19850801,13313131313,023)