所有基于文件(也就是基于HDFS)的InputFormat实现,其基类都是FileInputFormat。FileInputFormat实现了getSplits方法,而其后的具体实现类则各自实现自己的RecordReader,用于从分片中解析出键值对。
InputFormat抽象类源代码
<span style="font-size:14px;color:#000000;">public abstract class InputFormat<K, V> {
/**
* Logically split the set of input files for the job.
*
* <p>Each {@link InputSplit} is then assigned to an individual {@link Mapper}
* for processing.</p>
*
* <p><i>Note</i>: The split is a <i>logical</i> split of the inputs and the
* input files are not physically split into chunks. For e.g. a split could
* be <i><input-file-path, start, offset></i> tuple. The InputFormat
* also creates the {@link RecordReader} to read the {@link InputSplit}.
*
* @param context job configuration.
* @return an array of {@link InputSplit}s for the job.
*/
public abstract
List<InputSplit> getSplits(JobContext context
) throws IOException, InterruptedException;
/**
* Create a record reader for a given split. The framework will call
* {@link RecordReader#initialize(InputSplit, TaskAttemptContext)} before
* the split is used.
* @param split the split to be read
* @param context the information about the task
* @return a new record reader
* @throws IOException
* @throws InterruptedException
*/
public abstract
RecordReader<K,V> createRecordReader(InputSplit split,
TaskAttemptContext context
) throws IOException,
InterruptedException;
}</span>
FileInputFormat的getSplits方法解析
<span style="font-size:14px;color:#000000;">public List<InputSplit> getSplits(JobContext job
) throws IOException {
//计算文件切分的最小值,由配置参数mapred.min.split.size确定,默认是1字节,其设定的值必须比1大
long minSize = Math.max(getFormatMinSplitSize(), getMinSplitSize(job));
//计算文件切分的最大值,由配置参数mapred.max.split.size确定
long maxSize = getMaxSplitSize(job);
// generate splits 生成InputSplit
List<InputSplit> splits = new ArrayList<InputSplit>();
List<FileStatus>files = listStatus(job);
for (FileStatus file: files) {
Path path = file.getPath();
FileSystem fs = path.getFileSystem(job.getConfiguration());
long length = file.getLen();
BlockLocation[] blkLocations = fs.getFileBlockLocations(file, 0, length);
if ((length != 0) && isSplitable(job, path)) {
</span><pre name="code" class="java"><span style="font-size:14px;color:#000000;">//文件切分算法,用于确定InputSplit的个数和每个InputSplit对应的数据段.
//计算InputSplit的方法是 splitSize=max{minSize,min{maxSize,blockSize}}</span>
long blockSize = file.getBlockSize();
long splitSize = computeSplitSize(blockSize, minSize, maxSize);
//host选择算法,确定每个InputSplit的元数据信息,这通常由四部分组成<file,start,length,hosts>表示InputSplit
//所在的文件,起始位置,长度和host节点列表
long bytesRemaining = length;
while (((double) bytesRemaining)/splitSize > SPLIT_SLOP) {
//计算该分片所在块的块索引
int blkIndex = getBlockIndex(blkLocations, length-bytesRemaining);
splits.add(new FileSplit(path, length-bytesRemaining, splitSize,
blkLocations[blkIndex].getHosts()));
bytesRemaining -= splitSize;
}
if (bytesRemaining != 0) {
splits.add(new FileSplit(path, length-bytesRemaining, bytesRemaining,
blkLocations[blkLocations.length-1].getHosts()));
}
} else if (length != 0) {
splits.add(new FileSplit(path, 0, length, blkLocations[0].getHosts()));
} else {
//Create empty hosts array for zero length files
splits.add(new FileSplit(path, 0, length, new String[0]));
}
}
// Save the number of input files in the job-conf
job.getConfiguration().setLong(NUM_INPUT_FILES, files.size());
LOG.debug("Total # of splits: " + splits.size());
return splits;
}
在Host选择算法中,当InputSplit尺寸大于Block尺寸时,MapTask并不能实现完全的数据本地性,也就是说,总有一部分数据需要从远程节点获取
自定义InputFormat---ComputeIntensiveTextInputFormat实现
基于HDFS的InputFormat一般扩展FileInputFormat类,然后自己实现RecordReader,用于从分片数据中解析出KV键值对。
这个例子主要展示如何编写自定义的InputFormat。该InputFormat直接继承自已实现的TextInputFormat,主要覆盖getSplits方法,展示如何为计算密集型应用分配host:不考虑数据的本地性,而是考虑计算的均匀性,将map任务均衡地分配给各个节点。
<span style="font-size:14px;color:#000000;">public class ComputeIntensiveTextInputFormat extends TextInputFormat{
private static final double SPLIT_SLOP = 1.1;
static final String NUM_INPUT_FILES = "mapreduce.input.num.files";
@Override
public List<InputSplit> getSplits(JobContext arg0) throws IOException {
// TODO Auto-generated method stub
long minSize = Math.max(getFormatMinSplitSize(), getMinSplitSize(arg0));
long maxSize = getMaxSplitSize(arg0);
//这里获取集群可用服务器列表,然后为每个输入分片均匀分配服务器
String[] servers=getActiveServerList(arg0);
if(servers==null)
return null;
List<InputSplit> splits=new ArrayList<InputSplit>();
List<FileStatus> files=listStatus(arg0);
int currentServer=0;
for(FileStatus file:files){
Path path=file.getPath();
long length=file.getLen();
if(length!=0&&isSplitable(arg0, path)){
long blockSize=file.getBlockSize();
long splitSize=computeSplitSize(blockSize, minSize, maxSize);
long bytesRemaining=length;
while((double)bytesRemaining/splitSize>SPLIT_SLOP){
splits.add(new FileSplit(path, length-bytesRemaining, splitSize, new String[]{servers[currentServer]}));
currentServer=getNextServer(currentServer, servers.length);
bytesRemaining-=splitSize;
}
}else if(length!=0){
splits.add(new FileSplit(path, 0, length, new String[]{servers[currentServer]}));
currentServer=getNextServer(currentServer, servers.length);
}else{
splits.add(new FileSplit(path, 0, length, new String[0]));
}
}
arg0.getConfiguration().setLong(NUM_INPUT_FILES,splits.size());
return splits;
}
//获取集群中可用服务器的列表
private String[] getActiveServerList(JobContext context){
String[] servers=null;
try{
JobClient jc=new JobClient((JobConf) context.getConfiguration());
ClusterStatus status=jc.getClusterStatus(true);
Collection<String> atc=status.getActiveTrackerNames();
servers=new String[atc.size()];
int s=0;
for(String serverInfo:atc){
StringTokenizer st=new StringTokenizer(serverInfo,":");
String trackerName=st.nextToken();
StringTokenizer st1=new StringTokenizer(trackerName, "_");
st1.nextToken();
servers[s++]=st1.nextToken();
}
}catch (IOException e) {
// TODO: handle exception
e.printStackTrace();
}
return servers;
}
private static int getNextServer(int current,int max){
current++;
if(current>=max)
current=0;
return current;
}
}</span>
自定义InputFormat---FileListInputFormat实现
自定义InputSplit
/**
 * An {@link InputSplit} that carries a list of file paths rather than a
 * byte range of a single file.
 *
 * <p>Serialized layout (must stay symmetric between {@link #write} and
 * {@link #readFields}): length (long), host count (int), hosts (UTF each),
 * file count (int), files (UTF each).</p>
 */
public class MultiFileSplit extends InputSplit implements Writable {

    private long length;
    // Never null, so write() and getLocations() are safe on a default-constructed split.
    private String[] hosts = new String[0];
    // Never null, so addFile() works right after construction.
    private List<String> files = new ArrayList<String>();

    /** Creates an empty split; fields are expected to be filled by {@link #readFields}. */
    public MultiFileSplit() {
    }

    /**
     * @param length the length reported by {@link #getLength()}
     * @param hosts  the hosts reported by {@link #getLocations()};
     *               {@code null} is treated as "no hosts"
     */
    public MultiFileSplit(long length, String[] hosts) {
        super();
        this.length = length;
        this.hosts = (hosts != null) ? hosts : new String[0];
    }

    /** Restores this split from {@code arg0}, replacing any previous state. */
    public void readFields(DataInput arg0) throws IOException {
        length = arg0.readLong();

        String[] readHosts = new String[arg0.readInt()];
        for (int i = 0; i < readHosts.length; i++) {
            readHosts[i] = arg0.readUTF();
        }
        hosts = readHosts;

        // Distinct name: a local called "length" would shadow the field.
        int fileCount = arg0.readInt();
        List<String> readFiles = new ArrayList<String>();
        for (int i = 0; i < fileCount; i++) {
            readFiles.add(arg0.readUTF());
        }
        files = readFiles;
    }

    /** Writes this split to {@code arg0} in the layout described on the class. */
    public void write(DataOutput arg0) throws IOException {
        arg0.writeLong(length);

        arg0.writeInt(hosts.length);
        for (String host : hosts) {
            arg0.writeUTF(host);
        }

        // writeInt, not write: write(int) emits a single byte, while
        // readFields consumes a 4-byte int for the file count.
        arg0.writeInt(files.size());
        for (String file : files) {
            arg0.writeUTF(file);
        }
    }

    @Override
    public long getLength() throws IOException, InterruptedException {
        return length;
    }

    @Override
    public String[] getLocations() throws IOException, InterruptedException {
        return hosts;
    }

    /** Appends a file to this split, stored as {@code path.toString()}. */
    public void addFile(Path path) {
        files.add(path.toString());
    }

    /** Appends a file to this split, given directly as a path string. */
    public void addFile(String path) {
        files.add(path);
    }

    /** Returns the live list of file path strings held by this split. */
    public List<String> getFiles() {
        return files;
    }
}
自定义FileListInputFormat实现
<span style="font-size:14px;">public class FileListInputFormat extends FileInputFormat<Text, Text>{
private static final String MAPCOUNT="map.reduce.map.count";
private static final String INPUTPATH="mapred.input.dir";
@Override
public List<InputSplit> getSplits(JobContext arg0) throws IOException {
// TODO Auto-generated method stub
Configuration conf=arg0.getConfiguration();
String fileName=conf.get(INPUTPATH, "");
String[] hosts=getActiveServerList(arg0);
Path p=new Path(StringUtils.unEscapeString(fileName));
List<InputSplit> splits=new LinkedList<InputSplit>();
FileSystem fs=p.getFileSystem(conf);
int mappers=0;
mappers=conf.getInt(MAPCOUNT,0);
if(mappers==0)
throw new IOException("Number of mappers is not specified");
FileStatus[] files=fs.globStatus(p);
int nfiles=files.length;
if(nfiles<mappers)
mappers=nfiles;
for(int i=0;i<mappers;i++)
splits.add(new MultiFileSplit(0, hosts));
Iterator<InputSplit> siter=splits.iterator();
for(FileStatus f:files){
if(!siter.hasNext())
siter=splits.iterator();
((MultiFileSplit)(siter.next())).addFile(f.getPath().toUri().getPath());
}
return splits;
}
@Override
public RecordReader<Text, Text> createRecordReader(InputSplit arg0,
TaskAttemptContext arg1) throws IOException, InterruptedException {
// TODO Auto-generated method stub
return null;
}
@SuppressWarnings("unused")
private static void setMapCount(Job job,int mappers){
Configuration conf=job.getConfiguration();
conf.setInt(MAPCOUNT, mappers);
}
//获取集群中可用服务器的列表
private String[] getActiveServerList(JobContext context){
String[] servers=null;
try{
JobClient jc=new JobClient((JobConf) context.getConfiguration());
ClusterStatus status=jc.getClusterStatus(true);
Collection<String> atc=status.getActiveTrackerNames();
servers=new String[atc.size()];
int s=0;
for(String serverInfo:atc){
StringTokenizer st=new StringTokenizer(serverInfo,":");
String trackerName=st.nextToken();
StringTokenizer st1=new StringTokenizer(trackerName, "_");
st1.nextToken();
servers[s++]=st1.nextToken();
}
}catch (IOException e) {
// TODO: handle exception
e.printStackTrace();
}
return servers;
}
}</span>
参考书籍:
Hadoop技术内幕 深入理解MapReduce架构设计与实现原理
Hadoop高级编程---构建与实现大数据解决方案
Hadoop权威指南 第2版
Hadoop实战