现象
23/10/19 16:41:40 INFO mapreduce.JobSubmitter: Cleaning up the staging area /home/yarn/staging/xxxx/.staging/job_1693365764595_288273
23/10/19 16:41:40 INFO fs.TrashPolicyDefault: Moved: 'hdfsold://10.22*.xxx.xx:xxx/home/yarn/staging/xxxx/.staging/job_1693365764595_288273' to trash at: hdfsold://10.2*.xx.xx:xxxx/user/xxx/.Trash/Current/home/yarn/staging/xxxx/.staging/job_1693365764595_288273
Exception in thread "main" java.lang.ArrayIndexOutOfBoundsException: 0
at org.apache.hadoop.mapred.FileInputFormat.identifyHosts(FileInputFormat.java:836)
at org.apache.hadoop.mapred.FileInputFormat.getSplitHostsAndCachedHosts(FileInputFormat.java:803)
at org.apache.hadoop.mapred.FileInputFormat.getSplits(FileInputFormat.java:423)
at org.apache.hadoop.mapreduce.JobSubmitter.writeOldSplits(JobSubmitter.java:453)
at org.apache.hadoop.mapreduce.JobSubmitter.writeSplits(JobSubmitter.java:444)
at org.apache.hadoop.mapreduce.JobSubmitter.submitJobInternal(JobSubmitter.java:200)
at org.apache.hadoop.mapreduce.Job$12.run(Job.java:1599)
at org.apache.hadoop.mapreduce.Job$12.run(Job.java:1596)
at java.security.AccessController.doPrivileged(Native Method)
at javax.security.auth.Subject.doAs(Subject.java:422)
at org.apache.hadoop.security.UserGroupInformation.doAs(UserGroupInformation.java:1731)
at org.apache.hadoop.mapreduce.Job.submit(Job.java:1596)
at org.apache.hadoop.mapred.JobClient$1.run(JobClient.java:576)
at org.apache.hadoop.mapred.JobClient$1.run(JobClient.java:571)
at java.security.AccessController.doPrivileged(Native Method)
at javax.security.auth.Subject.doAs(Subject.java:422)
at org.apache.hadoop.security.UserGroupInformation.doAs(UserGroupInformation.java:1731)
at org.apache.hadoop.mapred.JobClient.submitJobInternal(JobClient.java:571)
at org.apache.hadoop.mapred.JobClient.submitJob(JobClient.java:562)
at org.apache.hadoop.streaming.StreamJob.submitAndMonitorJob(StreamJob.java:1021)
at org.apache.hadoop.streaming.StreamJob.run(StreamJob.java:135)
at org.apache.hadoop.util.ToolRunner.run(ToolRunner.java:76)
at org.apache.hadoop.util.ToolRunner.run(ToolRunner.java:90)
at org.apache.hadoop.streaming.HadoopStreaming.main(HadoopStreaming.java:50)
at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
at java.lang.reflect.Method.invoke(Method.java:498)
at org.apache.hadoop.util.RunJar.run(RunJar.java:323)
at org.apache.hadoop.util.RunJar.main(RunJar.java:236)
原因分析及解决方案:
由于存储丢块导致的 allTopos.length 为 0，直接导致数组越界。代码展示如下：
// Computes the pair {split hosts, cached hosts} for the block range
// [offset, offset + splitSize). Returns a 2-element array: index 0 is the
// host list derived from block topology, index 1 the cached-host list.
// NOTE(review): allTopos/racksMap are built in the elided portion of the
// body; when all replicas of a block are lost, allTopos.length is 0, which
// is what triggers the ArrayIndexOutOfBoundsException inside identifyHosts.
private String[][] getSplitHostsAndCachedHosts(BlockLocation[] blkLocations,
long offset, long splitSize, NetworkTopology clusterMap)
throws IOException {
// ... body elided in this excerpt ...
// We don't yet support cached hosts when bytesInThisBlock > splitSize
return new String[][] { identifyHosts(allTopos.length, racksMap),
new String[0]};
}
private String[] identifyHosts(int replicationFactor,
Map<Node,Node
Info> racksMap) {
/*
问题分析:
由于HBASE(存储)丢块导致数据异常,此时 replicationFactor 为0,导致下面申请一个长度为0的数组,后面往数组里添加元素的时候直接数组越界。
String [] retVal = new String[replicationFactor];
新加代码如下
*/
if (replicationFactor == 0) {
System.out.println("allTopos are invalid, because of BLOCK LOST, pls double check blkLocations");
System.exit(1);
}
String [] retVal = new String[replicationFactor];
List <NodeInfo> rackList = new LinkedList<NodeInfo>();
rackList.addAll(racksMap.values());
// Sort the racks based on their contribution to this split
sortInDescendingOrder(rackList);
boolean done = false;
int index = 0;
// Get the host list for all our aggregated items, sort
// them and return the top entries
for (NodeInfo ni: rackList) {
Set<NodeInfo> hostSet = ni.getLeaves();
List<NodeInfo>hostList = new LinkedList<NodeInfo>();
hostList.addAll(hostSet);
// Sort the hosts in this rack based on their contribution
sortInDescendingOrder(hostList);
for (NodeInfo host: hostList) {
// Strip out the port number from the host name
/*
上面如果数组的长度是0,此时index已经进行了自加,数组已经越界。即便不越界导致index == replicationFactor 永远不成立
还有一种方案就是把下面的代码调整顺序(这里没有考虑到replicationFactor 为0的情况):
if (index == replicationFactor) {
done = true;
break;
}
retVal[index++] = host.node.getName().split(":")[0];
*/
retVal[index++] = host.node.getName().split(":")[0];
if (index == replicationFactor) {
done = true;
break;
}
}
if (done == true) {
break;
}
}
return retVal;
}