MapReduce XML Processing: Custom InputFormat and Custom RecordReader

This post shows how to customize two components, InputFormat and RecordReader, using a MapReduce job that processes XML files as the running example. The example comes from Technique 12 of 《Hadoop硬实战》 (Hadoop in Practice); if anything here is unclear, that book covers the technique in more detail.
First, here is what the example is meant to accomplish. The input data is:
<configuration>
  <property>
    <name>hadoop.kms.authentication.type</name>
    <value>simple</value>
    <description>
      Authentication type for the KMS. Can be either &quot;simple&quot;
      or &quot;kerberos&quot;.
    </description>
  </property>
  <property>
    <name>hadoop.kms.authentication.kerberos.keytab</name>
    <value>${user.home}/kms.keytab</value>
    <description>
      Path to the keytab with credentials for the configured Kerberos principal.
    </description>
  </property>
  <property>
    <name>hadoop.kms.authentication.kerberos.principal</name>
    <value>HTTP/localhost</value>
    <description>
      The Kerberos principal to use for the HTTP endpoint.
      The principal must start with 'HTTP/' as per the Kerberos HTTP SPNEGO specification.
    </description>
  </property>
  <property>
    <name>hadoop.kms.authentication.kerberos.name.rules</name>
    <value>DEFAULT</value>
    <description>
      Rules used to resolve Kerberos principal names.
    </description>
  </property>
</configuration>
Expected result: the text inside each <name> tag becomes the key and the text inside the corresponding <value> tag becomes the value, producing the following key/value output:
hadoop.kms.authentication.kerberos.keytab       ${user.home}/kms.keytab
hadoop.kms.authentication.kerberos.name.rules   DEFAULT
hadoop.kms.authentication.kerberos.principal    HTTP/localhost
hadoop.kms.authentication.type  simple

Implementation steps:
1. Custom InputFormat: extend the InputFormat abstract class, or more commonly one of its subclasses such as FileInputFormat or TextInputFormat, and provide these two methods (a minimal sketch follows this list):
List<InputSplit> getSplits(): computes the input splits (InputSplit) from the input files, i.e. decides how the data is carved into splits.
RecordReader<K,V> createRecordReader(): creates the RecordReader that reads the records contained in an InputSplit.
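As a minimal sketch of step 1 (this class is not part of the example; the class name is made up and it reuses the built-in LineRecordReader purely for illustration), extending FileInputFormat means getSplits() is inherited and only createRecordReader() has to be written, which is roughly what TextInputFormat itself does:

import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.InputSplit;
import org.apache.hadoop.mapreduce.RecordReader;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.LineRecordReader;

public class MyInputFormat extends FileInputFormat<LongWritable, Text> {
	@Override
	public RecordReader<LongWritable, Text> createRecordReader(
			InputSplit split, TaskAttemptContext context) {
		// getSplits() is inherited from FileInputFormat; only the reader is supplied here.
		return new LineRecordReader();
	}
}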

2. Custom RecordReader: implement the RecordReader interface (old API) or extend the RecordReader class (new API). Taking the new API as the example, the following methods must be implemented:

  public abstract void initialize(InputSplit split, TaskAttemptContext context ) throws IOException, InterruptedException;
  public abstract boolean nextKeyValue() throws IOException, InterruptedException;
  public abstract KEYIN getCurrentKey() throws IOException, InterruptedException;
  public abstract VALUEIN getCurrentValue() throws IOException, InterruptedException;
  public abstract float getProgress() throws IOException, InterruptedException;
  public abstract void close() throws IOException;
The nextKeyValue(), getCurrentKey() and getCurrentValue() methods are called repeatedly while the Mapper runs, until the split assigned to that map task has been completely consumed. The Hadoop 1.2.1 source for Mapper.run() shows this:
public void run(Context context) throws IOException, InterruptedException {
    setup(context);
    try {
      // The map task drives the RecordReader through the context in this loop.
      while (context.nextKeyValue()) {
        // context.getCurrentKey()/getCurrentValue() are delegated by MapContext
        // to the underlying RecordReader:
        //   public KEYIN getCurrentKey() throws IOException, InterruptedException {
        //     return reader.getCurrentKey();
        //   }
        map(context.getCurrentKey(), context.getCurrentValue(), context);
      }
    } finally {
      cleanup(context);
    }
  }

The core of the work is the logic that iterates over the multi-line content: each call to nextKeyValue() decides whether another record can be read and sets the current key and value, which getCurrentKey() and getCurrentValue() then simply return. The comments in the implementation below explain the details.
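To make that call pattern concrete before getting to the XML-specific code, here is a bare-bones sketch (again not part of the example; the class name is made up) of a RecordReader that simply delegates to the built-in LineRecordReader. It shows how the five methods cooperate: initialize() opens the split, nextKeyValue() advances, and the two getters return whatever the last advance produced:

import java.io.IOException;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.InputSplit;
import org.apache.hadoop.mapreduce.RecordReader;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.hadoop.mapreduce.lib.input.LineRecordReader;

public class SkeletonRecordReader extends RecordReader<LongWritable, Text> {
	private final LineRecordReader delegate = new LineRecordReader();
	@Override
	public void initialize(InputSplit split, TaskAttemptContext context)
			throws IOException, InterruptedException {
		// Open the file for this split and seek to its starting offset.
		delegate.initialize(split, context);
	}
	@Override
	public boolean nextKeyValue() throws IOException, InterruptedException {
		// Advance to the next record; returning false ends the map loop.
		return delegate.nextKeyValue();
	}
	@Override
	public LongWritable getCurrentKey() throws IOException, InterruptedException {
		return delegate.getCurrentKey();
	}
	@Override
	public Text getCurrentValue() throws IOException, InterruptedException {
		return delegate.getCurrentValue();
	}
	@Override
	public float getProgress() throws IOException, InterruptedException {
		return delegate.getProgress();
	}
	@Override
	public void close() throws IOException {
		delegate.close();
	}
}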

Custom InputFormat:

import java.io.IOException;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.InputSplit;
import org.apache.hadoop.mapreduce.JobContext;
import org.apache.hadoop.mapreduce.RecordReader;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

public class XMLInputFormat extends TextInputFormat {
	private static final Logger log = LoggerFactory.getLogger(XMLInputFormat.class);
	@Override
	public RecordReader<LongWritable, Text> createRecordReader(
			InputSplit inputSplit, TaskAttemptContext context) {
		try {
			return new XMLRecordReader(inputSplit, context.getConfiguration());
		} catch (IOException e) {
			log.warn("Error while creating XmlRecordReader", e);
			return null;
		}
	}
	@Override
	protected boolean isSplitable(JobContext context, Path file) {
		// Keep the default behaviour inherited from TextInputFormat.
		return super.isSplitable(context, file);
	}
}
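If you are worried about a <property> element straddling a split boundary, one simple (though less parallel) variant, which the original example does not use, is to disable splitting so that each XML file is processed by a single mapper:

	@Override
	protected boolean isSplitable(JobContext context, Path file) {
		// Trade parallelism for simplicity: never split the XML file.
		return false;
	}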
Custom RecordReader (this is the key to processing the XML file):

import java.io.IOException;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.DataOutputBuffer;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.InputSplit;
import org.apache.hadoop.mapreduce.RecordReader;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.hadoop.mapreduce.lib.input.FileSplit;

public class XMLRecordReader extends RecordReader<LongWritable, Text> {
	private long start;
	private long end;
	private FSDataInputStream fsin;
	private DataOutputBuffer buffer = new DataOutputBuffer();
	private byte[] startTag;
	private byte[] endTag;
	private LongWritable currentKey;
	private Text currentValue;
	public static final String START_TAG_KEY = "xmlinput.start";
	public static final String END_TAG_KEY = "xmlinput.end";
	public XMLRecordReader() {
	}
	/**
	 * Opens the input and initializes the reader's state. This work could just as
	 * well be done in initialize() instead of in the constructor.
	 * @param inputSplit
	 * @param context
	 * @throws IOException
	 */
	public XMLRecordReader(InputSplit inputSplit, Configuration context) throws IOException {
		// Read the configured start and end tags.
		startTag = context.get(START_TAG_KEY).getBytes("UTF-8");
		endTag = context.get(END_TAG_KEY).getBytes("UTF-8");
		FileSplit fileSplit = (FileSplit) inputSplit;
		// Record where this split starts and ends.
		start = fileSplit.getStart();
		end = start + fileSplit.getLength();
		Path file = fileSplit.getPath();
		FileSystem fs = file.getFileSystem(context);
		// Open an HDFS input stream for the split's file ...
		fsin = fs.open(fileSplit.getPath());
		// ... and seek to the start of the split.
		fsin.seek(start);
	}
	@Override
	public void close() throws IOException {
		fsin.close();
	}
	@Override
	public LongWritable getCurrentKey() throws IOException, InterruptedException {
		return currentKey;
	}
	@Override
	public Text getCurrentValue() throws IOException, InterruptedException {
		return currentValue;
	}
	@Override
	public float getProgress() throws IOException, InterruptedException {
		// Fraction of this split that has been consumed so far.
		return (fsin.getPos() - start) / (float) (end - start);
	}
	@Override
	public void initialize(InputSplit inputSplit, TaskAttemptContext context)
			throws IOException, InterruptedException {
		// Intentionally empty: the setup is done in the constructor above.
		// It could equally well be moved here, for example:
		/*startTag = context.getConfiguration().get(START_TAG_KEY).getBytes("UTF-8");
		endTag = context.getConfiguration().get(END_TAG_KEY).getBytes("UTF-8");
		FileSplit fileSplit = (FileSplit) inputSplit;
		start = fileSplit.getStart();
		end = start + fileSplit.getLength();
		Path file = fileSplit.getPath();
		FileSystem fs = file.getFileSystem(context.getConfiguration());
		fsin = fs.open(fileSplit.getPath());
		fsin.seek(start);*/
	}
	@Override
	public boolean nextKeyValue() throws IOException, InterruptedException {
		currentKey = new LongWritable();
		currentValue = new Text();
		return next(currentKey, currentValue);
	}
	private boolean next(LongWritable key, Text value) throws IOException {
		/**
		 * readUntilMatch() scans the stream for the given tag. When its second
		 * argument is false the scanned bytes are not copied into the buffer;
		 * when it is true every scanned byte is also written to the buffer.
		 */
		if( fsin.getPos() < end && readUntilMatch(startTag, false)) {
			// A start tag was found; the stream position now sits just past its
			// last byte, so write the start tag itself into the buffer.
			buffer.write(startTag);
			try {
				// Scan for the end tag, copying everything into the buffer as we go.
				if(readUntilMatch(endTag, true)) {
					// Use the offset just past the end tag as the key and the
					// complete XML fragment collected in the buffer as the value.
					key.set(fsin.getPos());
					value.set(buffer.getData(), 0, buffer.getLength());
					return true;
				}
			} finally {
				buffer.reset();
			}
		}
		return false;
	}
	/**
	 * Scans the stream until the given tag has been matched.
	 * @param tag the byte sequence to look for
	 * @param isWrite whether the scanned bytes should be copied into the buffer
	 * @return true if the tag was found, false if end of stream or end of split was reached first
	 * @throws IOException
	 */
	private boolean readUntilMatch(byte[] tag, boolean isWrite) throws IOException {
		int i = 0;
		while(true) {
			// Read a single byte from the input.
			int b = fsin.read();
			// End of file: the tag was not found.
			if( b == -1) {
				return false;
			}
			// While searching for the start tag nothing is recorded; while
			// searching for the end tag every byte is written to the buffer.
			if(isWrite) {
				buffer.write(b);
			}
			// Track how many bytes of the tag have been matched so far.
			if(b == tag[i]) {
				i ++;
				if( i >= tag.length) {
					return true;
				}
			} else {
				i = 0;
			}
			// see if we've passed the stop point:
			if (!isWrite && i == 0 && fsin.getPos() >= end) {
				return false;
			}
		}
	}
}
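As a quick local sanity check (not part of the original example; the class name, file path and use of the local file system are assumptions), the reader can be driven by hand the same way Mapper.run() would drive it:

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapreduce.lib.input.FileSplit;

public class XMLRecordReaderLocalTest {
	public static void main(String[] args) throws Exception {
		Configuration conf = new Configuration();  // defaults to the local file system
		conf.set(XMLRecordReader.START_TAG_KEY, "<property>");
		conf.set(XMLRecordReader.END_TAG_KEY, "</property>");
		Path path = new Path("kms-site.xml");      // assumed local copy of the input file
		long length = path.getFileSystem(conf).getFileStatus(path).getLen();
		FileSplit split = new FileSplit(path, 0, length, new String[0]);
		XMLRecordReader reader = new XMLRecordReader(split, conf);
		// Drive the reader the same way Mapper.run() would.
		while (reader.nextKeyValue()) {
			System.out.println(reader.getCurrentKey() + "\t" + reader.getCurrentValue());
		}
		reader.close();
	}
}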
Map phase:

import static javax.xml.stream.XMLStreamConstants.CHARACTERS;
import static javax.xml.stream.XMLStreamConstants.START_ELEMENT;
import java.io.ByteArrayInputStream;
import java.io.IOException;
import javax.xml.stream.FactoryConfigurationError;
import javax.xml.stream.XMLInputFactory;
import javax.xml.stream.XMLStreamException;
import javax.xml.stream.XMLStreamReader;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

public class XMLMapper extends Mapper<LongWritable, Text, Text, Text>{
	/**
	 * As XMLRecordReader shows, the value passed in here is one complete
	 * fragment bounded by the configured start and end tags.
	 */
	@Override
	protected void map(LongWritable key, Text value, Context context)
			throws IOException, InterruptedException {
		String content = value.toString();
		System.out.println("--content--");  // debug output
		try {
			// Wrap the fragment in a stream so it can be parsed with StAX.
			XMLStreamReader reader = XMLInputFactory.newInstance().createXMLStreamReader(
					new ByteArrayInputStream(content.getBytes()));
			String propertyName = "";
			String propertyValue = "";
			String currentElement = "";
			/**
			 * Pull events off the fragment; the main() method below runs the same
			 * logic on a small hard-coded fragment so it can be tested locally.
			 */
			while (reader.hasNext()) {
				int code = reader.next();
				switch (code) {
				case START_ELEMENT:
					currentElement = reader.getLocalName();
					break;
				case CHARACTERS:
					if (currentElement.equalsIgnoreCase("name")) {
						propertyName += reader.getText();
					} else if (currentElement.equalsIgnoreCase("value")) {
						propertyValue += reader.getText();
					}
					break;
				}
			}
			reader.close();
			context.write(new Text(propertyName.trim()), new Text(propertyValue.trim()));
		} catch (XMLStreamException e) {
			e.printStackTrace();
		} catch (FactoryConfigurationError e) {
			e.printStackTrace();
		}
	}
	public static void main(String[] args) {
		String content = "<property><name>seven</name><value>24</value></property>";
		System.out.println("--content--");
		try {
			XMLStreamReader reader = XMLInputFactory.newInstance().createXMLStreamReader(
					new ByteArrayInputStream(content.getBytes()));
			String propertyName = "";
			String propertyValue = "";
			String currentElement = "";
			while (reader.hasNext()) {
				int code = reader.next();
				switch (code) {
				case START_ELEMENT:
					currentElement = reader.getLocalName();
					break;
				case CHARACTERS:
					if (currentElement.equalsIgnoreCase("name")) {
						propertyName += reader.getText();
					} else if (currentElement.equalsIgnoreCase("value")) {
						propertyValue += reader.getText();
					}
					break;
				}
			}
			reader.close();
			System.out.println(propertyName + " " + propertyValue);
		} catch (XMLStreamException e) {
			e.printStackTrace();
		} catch (FactoryConfigurationError e) {
			e.printStackTrace();
		}
	}
}
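Running the main() method above locally prints the --content-- marker followed by seven 24, which is an easy way to verify the StAX extraction logic before the job is ever submitted to the cluster.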
Reduce phase:

import java.io.IOException;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;

public class XMLReducer extends Reducer<Text, Text, Text, Text>{
	private Text val_ = new Text();
	@Override
	protected void reduce(Text key, Iterable<Text> value, Context context)
			throws IOException, InterruptedException {
		for(Text val: value) {
			val_.set(val.toString());
			context.write(key, val_);
		}
	}
}
Driver:

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

public class JobMain {
	public static void main(String[] args) throws Exception{
		Configuration configuration = new Configuration();
		configuration.set("key.value.separator.in.input.line", " ");
		configuration.set("xmlinput.start", "<property>");
		configuration.set("xmlinput.end", "</property>");
		Job job = new Job(configuration, "xmlread-job");
		job.setJarByClass(JobMain.class);
		job.setMapperClass(XMLMapper.class);
		job.setMapOutputKeyClass(Text.class);
		job.setMapOutputValueClass(Text.class);
		job.setInputFormatClass(XMLInputFormat.class);
		job.setNumReduceTasks(1);
		job.setReducerClass(XMLReducer.class);
		//job.setOutputFormatClass(XMLOutputFormat.class);
		FileInputFormat.addInputPath(job, new Path(args[0]));
		Path output = new Path(args[1]);
		FileOutputFormat.setOutputPath(job, output);
		output.getFileSystem(configuration).delete(output, true);
		System.exit(job.waitForCompletion(true) ? 0: 1);
	}
}
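Assuming the classes are packaged into a jar (the jar name and HDFS paths below are only placeholders), the job can be submitted like this; note that the driver deletes the output directory before running:

hadoop jar xml-inputformat.jar JobMain /user/hadoop/input/kms-site.xml /user/hadoop/output/xmlread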
Run result: the job's output is the set of key/value pairs listed at the beginning of this post.


Conclusion:

This post used MapReduce processing of an XML input file to show how to customize InputFormat and RecordReader. The next post will build on this example to show how to customize OutputFormat and RecordWriter, writing the final result out as an XML file; see 《MapReduce-XML处理-定制OutputFormat及定制RecordWriter》.
