pig自定义load udf

假设文件test4.txt有这么两行数据:

1980080113312121212018
1985080113313131313023

规则是前8位为年月日,中间11位为手机号码,后3位表示的是年龄。

我们可以自定义一个加载UDF来加载这个文件。它的构造参数是一组用逗号分隔的“起始-结束”区间(下标从0开始,包含起始位置、不包含结束位置),每个区间截取出一个字段。


package com.besttone.pig.udf.load;

import java.io.IOException;
import java.util.List;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.InputFormat;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.RecordReader;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.pig.LoadFunc;
import org.apache.pig.backend.executionengine.ExecException;
import org.apache.pig.backend.hadoop.executionengine.mapReduceLayer.PigSplit;
import org.apache.pig.data.DataByteArray;
import org.apache.pig.data.Tuple;
import org.apache.pig.data.TupleFactory;

/**
 * Pig load function that cuts every input line into fixed-position fields.
 *
 * <p>The constructor argument is handed to {@link Range#parse(String)}; each
 * resulting range yields one field of the output tuple, in order. Lines are
 * read through {@link TextInputFormat}.
 */
public class CutLoadFunc extends LoadFunc {

	protected static final Log log = LogFactory.getLog(CutLoadFunc.class);

	/** Column ranges parsed from the constructor argument, one per output field. */
	private final List<Range> ranges;

	/** Reader handed over in {@link #prepareToRead(RecordReader, PigSplit)}. */
	private RecordReader reader;

	private final TupleFactory tupleFactory = TupleFactory.getInstance();

	/**
	 * Creates a loader for the given cut pattern.
	 *
	 * @param cutPattern pattern passed verbatim to {@link Range#parse(String)}
	 * @throws IOException if {@link Range#parse(String)} rejects the pattern
	 */
	public CutLoadFunc(String cutPattern) throws IOException {
		ranges = Range.parse(cutPattern);
	}

	/**
	 * Registers {@code location} as the input path(s) of the job.
	 *
	 * @param location input location given in the Pig {@code load} statement
	 * @param job      job whose input paths are set
	 */
	@Override
	public void setLocation(String location, Job job) throws IOException {
		FileInputFormat.setInputPaths(job, location);
	}

	/**
	 * @return a new {@link TextInputFormat}
	 */
	@Override
	public InputFormat getInputFormat() throws IOException {
		return new TextInputFormat();
	}

	/**
	 * Stores the record reader that {@link #getNext()} pulls lines from.
	 *
	 * @param reader reader to read from
	 * @param split  unused
	 */
	@Override
	public void prepareToRead(RecordReader reader, PigSplit split)
			throws IOException {
		this.reader = reader;
	}

	/**
	 * Reads the next line and cuts it into one field per configured range.
	 *
	 * <p>A range whose end lies beyond the end of the line is logged and its
	 * field is never set, so one short line does not abort the whole load.
	 *
	 * @return the tuple for the next line, or {@code null} when the reader has
	 *         no more records
	 * @throws IOException   if reading fails
	 * @throws ExecException if the thread is interrupted while reading; the
	 *                       interrupt flag is restored before throwing
	 */
	@Override
	public Tuple getNext() throws IOException {
		try {
			if (!reader.nextKeyValue()) {
				return null;
			}
			Text value = (Text) reader.getCurrentValue();
			String line = value.toString();

			Tuple tuple = tupleFactory.newTuple(ranges.size());
			for (int i = 0; i < ranges.size(); i++) {
				Range range = ranges.get(i);
				if (range.getEnd() > line.length()) {
					log.warn(String.format(
							"Range end (%s) is longer than line length (%s)",
							range.getEnd(), line.length()));
					continue;
				}
				tuple.set(i, new DataByteArray(range.getSubstring(line)));
			}
			return tuple;

		} catch (InterruptedException e) {
			// Catching InterruptedException clears the interrupt status;
			// restore it so code further up the stack can still see it.
			Thread.currentThread().interrupt();
			throw new ExecException(e);
		}
	}

}
package com.besttone.pig.udf.load;

import java.io.IOException;
import java.util.ArrayList;
import java.util.List;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;

/**
 * A character range inside a line, used to cut fixed-position fields.
 *
 * <p>{@code start} and {@code end} are passed directly to
 * {@link String#substring(int, int)}, so {@code start} is inclusive and
 * {@code end} is exclusive, both 0-based.
 */
public class Range {

	protected static final Log log = LogFactory.getLog(Range.class);

	/** Message used for every rejected cut pattern. */
	private static final String INVALID_PATTERN = "cutPattern参数不合法";

	private int start;
	private int end;

	/** @return the inclusive start offset */
	public int getStart() {
		return start;
	}

	/** @param start the inclusive start offset */
	public void setStart(int start) {
		this.start = start;
	}

	/** @return the exclusive end offset */
	public int getEnd() {
		return end;
	}

	/** @param end the exclusive end offset */
	public void setEnd(int end) {
		this.end = end;
	}

	/**
	 * Parses a comma-separated list of {@code start-end} pairs, for example
	 * {@code "0-8,8-19,19-22"}. Whitespace around the numbers is ignored.
	 *
	 * @param cutPattern the pattern to parse
	 * @return one {@code Range} per pair, in the order given
	 * @throws IOException if the pattern is {@code null}, a pair lacks a start
	 *                     or an end, a bound is not an integer, or a start is
	 *                     greater than its end
	 */
	public static List<Range> parse(String cutPattern) throws IOException {
		if (cutPattern == null) {
			throw new IOException(INVALID_PATTERN);
		}
		String[] rangelist = cutPattern.split(",");
		if (rangelist.length == 0) {
			throw new IOException(INVALID_PATTERN);
		}
		List<Range> list = new ArrayList<Range>(rangelist.length);
		for (String rangestr : rangelist) {
			// Split once and reuse both halves.
			String[] bounds = rangestr.split("-");
			if (bounds.length < 2) {
				throw new IOException(INVALID_PATTERN);
			}
			int parsedStart;
			int parsedEnd;
			try {
				parsedStart = Integer.parseInt(bounds[0].trim());
				parsedEnd = Integer.parseInt(bounds[1].trim());
			} catch (NumberFormatException e) {
				// Keep the cause so the offending token shows up in the trace.
				throw new IOException(INVALID_PATTERN, e);
			}
			// An inverted range would make substring() throw on every line
			// long enough to reach it; reject it here instead.
			if (parsedStart > parsedEnd) {
				throw new IOException(INVALID_PATTERN);
			}
			Range range = new Range();
			range.setStart(parsedStart);
			range.setEnd(parsedEnd);
			list.add(range);
		}
		return list;
	}

	/**
	 * Cuts this range out of {@code line}.
	 *
	 * @param line the line to cut
	 * @return {@code line.substring(start, end)}, or {@code null} (after
	 *         logging a warning) when {@code end} is greater than the line
	 *         length
	 */
	public String getSubstring(String line) {
		if (this.end > line.length()) {
			log.warn(String.format(
					"Range end (%s) is longer than line length (%s)", this.end,
					line.length()));
			return null;
		}
		return line.substring(this.start, this.end);
	}

	/** Small manual check: prints the last three characters of a sample line. */
	public static void main(String[] args) {
		Range range = new Range();
		range.setStart(19);
		range.setEnd(22);
		System.out.println(range.getSubstring("1980080113312121212018"));
	}
}


将这两个类打成jar包,然后进入pig grunt,执行以下脚本:

-- Make the jar that contains CutLoadFunc and Range available to Pig.
REGISTER besttonePigUDF.jar;

-- Cut every line of test4.txt at offsets 0-8, 8-19 and 19-22 and name the three fields.
records = LOAD 'test4.txt'
    USING com.besttone.pig.udf.load.CutLoadFunc('0-8,8-19,19-22')
    AS (date:chararray, phone:chararray, age:chararray);

-- Print the loaded tuples.
DUMP records;

可以看到内容被成功地加载:

(19800801,13312121212,018)
(19850801,13313131313,023)


  • 1
    点赞
  • 1
    收藏
    觉得还不错? 一键收藏
  • 0
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值