Hadoop Custom Data Types and Input Formats

  Hadoop provides many built-in data types, such as Text and IntWritable.

  To define your own data type, first implement the Writable interface, which consists mainly of two methods: readFields and write. If the new type will also be emitted as the key of the map function, the shuffle phase sorts records and therefore has to compare keys; in that case implement the WritableComparable interface instead, which adds one method, compareTo, on top of Writable.


  Custom data type:

  Note: a custom data type must have a no-argument constructor, and it should call the superclass constructor. It is also a good idea to initialize every field in that constructor: write serializes an instance and readFields deserializes it, and during deserialization an uninitialized field can cause a NullPointerException (primitive fields are safe; the exception only shows up when a field is a reference/composite type), so initialize all fields in the no-argument constructor. The toString method is what gets used when the type is written as reduce output.
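
  To see the round trip concretely, here is a minimal sketch (not from the original post; the class name RoundTripDemo is made up) that drives write and readFields by hand, the same way Hadoop does during the shuffle, using the IndexDoc class defined below:

import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.DataInputStream;
import java.io.DataOutputStream;

import pers.kefault.entity.IndexDoc;

public class RoundTripDemo {
	public static void main(String[] args) throws Exception {
		// Serialize: Hadoop calls write when it spills the key/value to disk.
		ByteArrayOutputStream buffer = new ByteArrayOutputStream();
		new IndexDoc(1, "www.kefault.com/page.html", "www.kefault.com")
				.write(new DataOutputStream(buffer));

		// Deserialize: Hadoop first invokes the no-argument constructor, then
		// readFields; this is why that constructor must exist and should
		// initialize every field.
		IndexDoc restored = new IndexDoc();
		restored.readFields(new DataInputStream(
				new ByteArrayInputStream(buffer.toByteArray())));
		System.out.println(restored); // prints docId,urlString,siteName via toString
	}
}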

package pers.kefault.entity;

import java.io.BufferedReader;
import java.io.DataInput;
import java.io.DataOutput;
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStreamReader;

import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.WritableComparable;


public class IndexDoc implements WritableComparable<IndexDoc> {
	
	private int docId;
	private String urlString;
	private String siteName;
	
	public IndexDoc() {
		super();
		// Initialize every field so deserialization never sees a null reference
		// (see the note above).
		this.docId = -1;
		this.urlString = "";
		this.siteName = "";
	}

	public IndexDoc(int docId, String urlString, String siteName) {
		super();
		this.docId = docId;
		this.urlString = urlString;
		this.siteName = siteName;
	}
	// Copy the object (tedious, but necessary because Hadoop reuses value instances)
	public IndexDoc clone()
	{
		return new IndexDoc(getDocId(),getUrlString(),getSiteName());
	}
	
	public void setDocId(int docId) {
		this.docId = docId;
	}

	public void setUrlString(String urlString) {
		this.urlString = urlString;
	}

	public void setSiteName(String siteName) {
		this.siteName = siteName;
	}

	public int getDocId() {
		return docId;
	}

	public String getSiteName() {
		if(siteName == null || siteName.isEmpty())
		{
			siteName = urlString.split("/")[0];
		}
		return siteName;
	}

	public String getUrlString() {
		return urlString;
	}
	
	// Read the file from disk and return its contents as a Text
	public Text text()
	{
		// Path of the file to read
		String fileName = "/home/monster/spider/" + urlString;
		File file = new File(fileName);
		BufferedReader reader = null;
		StringBuilder content = new StringBuilder();
		try
		{
			reader = new BufferedReader(new InputStreamReader(new FileInputStream(file), "GB2312"));
			String tempString = null;
			while((tempString = reader.readLine()) != null)
			{
				content.append(tempString);
			}
			reader.close();
		}catch (Exception e) {
			e.printStackTrace();
		}
		return new Text(content.toString());
	}

	@Override
	public void readFields(DataInput in) throws IOException {
		docId = in.readInt();
		urlString = in.readUTF();
		// siteName is not serialized; it is re-derived from the URL on read.
		siteName = urlString.split("/")[0];
	}

	@Override
	public void write(DataOutput out) throws IOException {
		// Only docId and urlString are written; siteName is rebuilt in readFields.
		out.writeInt(docId);
		out.writeUTF(urlString);
	}

	@Override
	public int compareTo(IndexDoc o) {
		// Sort ascending by docId.
		return Integer.compare(docId, o.docId);
	}
	
	@Override
	public String toString() {
		return docId + "," + urlString + "," + siteName;
	}
}
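
  Because compareTo orders instances by docId, map output keys of this type come out of the shuffle sorted by docId in ascending order. Registering the type with a job is then a one-liner (a hypothetical fragment; it assumes an org.apache.hadoop.mapreduce.Job instance named job):

		job.setMapOutputKeyClass(IndexDoc.class);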

  

  Custom input format:

package pers.kefault.format;

import java.io.IOException;

import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.InputSplit;
import org.apache.hadoop.mapreduce.RecordReader;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;

import pers.kefault.entity.IndexDoc;

public class IndexDocInputFormat extends FileInputFormat<Text, IndexDoc> {
	@Override
	public RecordReader<Text, IndexDoc> createRecordReader(InputSplit split, TaskAttemptContext context)
			throws IOException, InterruptedException {
		// Report the current split in the task's status string (visible in the job UI).
		context.setStatus(split.toString());
		IndexDocRecordReader reader = new IndexDocRecordReader(context.getConfiguration());
		return reader;
	}
}
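
  Wiring the format into a job driver can look like this (a hypothetical fragment; the class name IndexDocDriver, the job name, and the /input path are made up for illustration):

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;

import pers.kefault.format.IndexDocInputFormat;

public class IndexDocDriver {
	public static void main(String[] args) throws Exception {
		Configuration conf = new Configuration();
		// Optional: override the key/value separator used by the RecordReader
		// (it defaults to '\t').
		conf.set("mapreduce.input.keyvaluelinerecordreader.key.value.separator", "\t");

		Job job = Job.getInstance(conf, "index-doc-demo");
		job.setJarByClass(IndexDocDriver.class);
		job.setInputFormatClass(IndexDocInputFormat.class);
		FileInputFormat.addInputPath(job, new Path("/input"));
		// ... configure the mapper, reducer and output here, then:
		// System.exit(job.waitForCompletion(true) ? 0 : 1);
	}
}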

  The RecordReader for the custom input format:

package pers.kefault.format;

import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.InputSplit;
import org.apache.hadoop.mapreduce.RecordReader;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.hadoop.mapreduce.lib.input.LineRecordReader;

import pers.kefault.entity.IndexDoc;

public class IndexDocRecordReader extends RecordReader<Text, IndexDoc> {

	public static final String KEY_VALUE_SEPERATOR = "mapreduce.input.keyvaluelinerecordreader.key.value.separator";
	private final LineRecordReader lineRecordReader;
	private byte separator = (byte) '\t';
	private Text innerValue;
	private Text key;
	private IndexDoc value;
	
	public Class<Text> getKeyClass() { 
		return Text.class;
	}
	
	@Override
	public void close() throws IOException {
		lineRecordReader.close();
	}

	public IndexDocRecordReader(Configuration conf) {
		// Delegate the actual line reading to a standard LineRecordReader.
		lineRecordReader = new LineRecordReader();
		String sepStr = conf.get(KEY_VALUE_SEPERATOR,"\t");
		this.separator = (byte) sepStr.charAt(0);
	}

	// Return the index of the first occurrence of sep in utf[start, start+length), or -1.
	public static int findSeparator(byte[] utf, int start, int length, byte sep) {
		for (int i = start; i < (start + length); i++) {
			if (utf[i] == sep) {
				return i;
			}
		}
		return -1;
	}
	
	public static void setKeyValue(Text key, IndexDoc value, byte[] line, int lineLen, int pos) {
		if (pos == -1) {
			// No separator found: the whole line becomes the key and the
			// value is filled with placeholder fields.
			key.set(line, 0, lineLen);
			value.setDocId(-1);
			value.setSiteName("www.kefault.com");
			value.setUrlString("www.kefault.com/null");
		} else {
			key.set(line, 0, pos); // the key is everything from position 0 up to the separator
			Text text = new Text();
			text.set(line, pos + 1, lineLen - pos - 1);
			String[] str = text.toString().split(",");
			value.setDocId(Integer.parseInt(str[0]));
			value.setUrlString(str[1]);
			value.setSiteName(str[2]);
		}
	}
	
	@Override
	public Text getCurrentKey() throws IOException, InterruptedException {
		return key;
	}

	@Override
	public IndexDoc getCurrentValue() throws IOException, InterruptedException {
		return value;
	}

	@Override
	public float getProgress() throws IOException, InterruptedException {
		return lineRecordReader.getProgress();
	}

	@Override
	public void initialize(InputSplit genericSplit, TaskAttemptContext context) throws IOException, InterruptedException {
		lineRecordReader.initialize(genericSplit, context);
	}

	@Override
	public boolean nextKeyValue() throws IOException, InterruptedException {
		byte[] line = null;
		int lineLen = -1;
		
		if (lineRecordReader.nextKeyValue()) {
			innerValue = lineRecordReader.getCurrentValue();
			line = innerValue.getBytes();
			lineLen = innerValue.getLength();
		} else {
			return false;
		}
		if (line == null) {
			return false;
		}
		
		if (key == null) {
			key = new Text();
		}
		if (value == null) {
			value = new IndexDoc(); 
		}
		
		int pos = findSeparator(line, 0, lineLen, this.separator);
		setKeyValue(key, value, line, lineLen, pos);
		return true;
	}
}

  The heart of the whole program is the RecordReader. In short, it reads the input line by line and turns each line into a key-value pair: the raw line arrives as a Text, which is then parsed into the key and the value. Although the code looks long, little of it needs to change for other use cases. The Text is split on a separator character, '\t' here, but you can configure your own; the key object and the value object are then built from the resulting fields.
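
  To make the parsing concrete: given the input line below (key and value separated by a tab, value fields by commas)

doc1	3,www.kefault.com/a.html,www.kefault.com

  the reader returns key = Text("doc1") and value = IndexDoc(3, "www.kefault.com/a.html", "www.kefault.com"). A mapper consuming these pairs would be typed as follows (a hypothetical sketch; the class name IndexDocMapper is made up):

import java.io.IOException;

import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

import pers.kefault.entity.IndexDoc;

public class IndexDocMapper extends Mapper<Text, IndexDoc, Text, IndexDoc> {
	@Override
	protected void map(Text key, IndexDoc value, Context context)
			throws IOException, InterruptedException {
		// Writing serializes immediately, so passing value through is safe here.
		// If you buffer values across calls, copy them first with value.clone():
		// the RecordReader reuses the same IndexDoc instance for every record.
		context.write(key, value);
	}
}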


  If you use Eclipse, errors that occur while a MapReduce job runs are not shown in the console, so take a look at this link. Hopefully it will make debugging your code a little easier.

