Goal: convert the LongWritable key that the map stage receives by default into a line-based key, i.e. key1 (the map input key) indicates which line of the file the record came from.
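For reference, with the stock TextInputFormat the map input key is the starting byte offset of each line, not its line number. As a purely hypothetical illustration: if the first two lines of a file are each 6 bytes long, the default reader produces keys 0, 7, 14, ..., whereas the custom RecordReader below produces keys 1, 2, 3, ...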
1. The code is as follows:
package combiner;
import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.InputSplit;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.RecordReader;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.FileSplit;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
public class Main {
    public static void main(String[] args) throws Exception {
        Job job = Job.getInstance(new Configuration());
        job.setJarByClass(Main.class);
        // Load the custom input format class
        job.setInputFormatClass(MyInput.class);
        // Mapper
        job.setMapperClass(MyMapper.class);
        job.setOutputKeyClass(IntWritable.class);
        job.setOutputValueClass(Text.class);
        FileInputFormat.addInputPath(job, new Path(args[0]));
        FileOutputFormat.setOutputPath(job, new Path(args[1]));
        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
    // Mapper: simply passes the (line number, line text) pair through
    public static class MyMapper extends Mapper<IntWritable, Text, IntWritable, Text> {
        @Override
        public void map(IntWritable key, Text value, Context context)
                throws IOException, InterruptedException {
            context.write(key, value);
        }
    }
    // Custom input format that hands out the line-number RecordReader
    public static class MyInput extends FileInputFormat<IntWritable, Text> {
        @Override
        public RecordReader<IntWritable, Text> createRecordReader(InputSplit split, TaskAttemptContext context)
                throws IOException, InterruptedException {
            return new In();
        }
    }
    // RecordReader whose key is the current line number rather than a byte offset
    public static class In extends RecordReader<IntWritable, Text> {
        // reader used to pull the data line by line
        BufferedReader br;
        // the line read most recently
        String data;
        // total size of the input, in bytes
        int size;
        // current line number
        int line = 0;
        // bytes read so far, used for progress reporting
        float cur = 0;

        // Initialization: obtain the input stream and the total byte size from the split
        @Override
        public void initialize(InputSplit split, TaskAttemptContext context) throws IOException, InterruptedException {
            // cast the generic split to a FileSplit
            FileSplit fs = (FileSplit) split;
            // obtain the FileSystem through the split's path
            FileSystem fileSystem = fs.getPath().getFileSystem(context.getConfiguration());
            InputStream is = fileSystem.open(fs.getPath());
            // bytes available in the stream, i.e. the total file size at this point
            size = is.available();
            // wrap the byte stream in a character stream, then in a BufferedReader
            br = new BufferedReader(new InputStreamReader(is));
        }

        // Check whether there is another record to read
        @Override
        public boolean nextKeyValue() throws IOException, InterruptedException {
            // readLine() returns null once the data has been exhausted
            data = br.readLine();
            if (data != null) {
                line++;
                return true;
            }
            return false;
        }

        // Return the key of the current record: the line number
        // (with the default TextInputFormat this would be the byte offset of the line)
        @Override
        public IntWritable getCurrentKey() throws IOException, InterruptedException {
            return new IntWritable(line);
        }

        // Return the text of the current line and update the progress counter
        @Override
        public Text getCurrentValue() throws IOException, InterruptedException {
            cur += data.getBytes().length;
            return new Text(data);
        }

        // Report progress as the fraction of bytes read so far (cur/size)
        @Override
        public float getProgress() throws IOException, InterruptedException {
            return cur / size;
        }

        // Close the stream
        @Override
        public void close() throws IOException {
            if (br != null) {
                br.close();
            }
        }
    }
}
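Before submitting to the cluster, the RecordReader can be sanity-checked against a local file. The following is only a minimal sketch under assumed conditions: it uses a hypothetical local path /tmp/word.txt and fakes the task context with the Hadoop 2.x helper classes TaskAttemptContextImpl and TaskAttemptID; it is not part of the original code.
package combiner;

import java.io.File;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.hadoop.mapreduce.TaskAttemptID;
import org.apache.hadoop.mapreduce.lib.input.FileSplit;
import org.apache.hadoop.mapreduce.task.TaskAttemptContextImpl;

public class LocalReaderCheck {
    public static void main(String[] args) throws Exception {
        // Hypothetical local test file; point this at a real file on your machine
        File local = new File("/tmp/word.txt");
        Path path = new Path("file://" + local.getAbsolutePath());

        // Fake a split covering the whole file, plus a minimal task context
        FileSplit split = new FileSplit(path, 0, local.length(), new String[0]);
        TaskAttemptContext context = new TaskAttemptContextImpl(new Configuration(), new TaskAttemptID());

        // Drive the custom RecordReader by hand and print the pairs the mapper would receive
        Main.In reader = new Main.In();
        reader.initialize(split, context);
        while (reader.nextKeyValue()) {
            System.out.println(reader.getCurrentKey() + "\t" + reader.getCurrentValue());
        }
        reader.close();
    }
}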
2. Upload the jar to the cluster and run it; the output is as follows:
[root@wpy apps]# hadoop jar input.jar /word.txt /out0000
19/07/11 16:55:17 INFO client.RMProxy: Connecting to ResourceManager at /0.0.0.0:8032
19/07/11 16:55:18 WARN mapreduce.JobResourceUploader: Hadoop command-line option parsing not performed. Implement the Tool interface and execute your application with ToolRunner to remedy this.
19/07/11 16:55:19 INFO input.FileInputFormat: Total input paths to process : 1
19/07/11 16:55:19 INFO mapreduce.JobSubmitter: number of splits:1
19/07/11 16:55:19 INFO mapreduce.JobSubmitter: Submitting tokens for job: job_1562835141268_0001
19/07/11 16:55:19 INFO impl.YarnClientImpl: Submitted application application_1562835141268_0001
19/07/11 16:55:19 INFO mapreduce.Job: The url to track the job: http://wpy:8088/proxy/application_1562835141268_0001/
19/07/11 16:55:19 INFO mapreduce.Job: Running job: job_1562835141268_0001
19/07/11 16:55:29 INFO mapreduce.Job: Job job_1562835141268_0001 running in uber mode : false
19/07/11 16:55:29 INFO mapreduce.Job: map 0% reduce 0%
19/07/11 16:55:40 INFO mapreduce.Job: map 100% reduce 0%
19/07/11 16:55:52 INFO mapreduce.Job: map 100% reduce 100%
19/07/11 16:55:52 INFO mapreduce.Job: Job job_1562835141268_0001 completed successfully
19/07/11 16:55:52 INFO mapreduce.Job: Counters: 49
        File System Counters
                FILE: Number of bytes read=116
                FILE: Number of bytes written=212807
                FILE: Number of read operations=0
                FILE: Number of large read operations=0
                FILE: Number of write operations=0
                HDFS: Number of bytes read=165
                HDFS: Number of bytes written=82
                HDFS: Number of read operations=6
                HDFS: Number of large read operations=0
                HDFS: Number of write operations=2
        Job Counters
                Launched map tasks=1
                Launched reduce tasks=1
                Data-local map tasks=1
                Total time spent by all maps in occupied slots (ms)=9193
                Total time spent by all reduces in occupied slots (ms)=8278
                Total time spent by all map tasks (ms)=9193
                Total time spent by all reduce tasks (ms)=8278
                Total vcore-milliseconds taken by all map tasks=9193
                Total vcore-milliseconds taken by all reduce tasks=8278
                Total megabyte-milliseconds taken by all map tasks=9413632
                Total megabyte-milliseconds taken by all reduce tasks=8476672
        Map-Reduce Framework
                Map input records=7
                Map output records=7
                Map output bytes=96
                Map output materialized bytes=116
                Input split bytes=92
                Combine input records=0
                Combine output records=0
                Reduce input groups=7
                Reduce shuffle bytes=116
                Reduce input records=7
                Reduce output records=7
                Spilled Records=14
                Shuffled Maps =1
                Failed Shuffles=0
                Merged Map outputs=1
                GC time elapsed (ms)=167
                CPU time spent (ms)=1250
                Physical memory (bytes) snapshot=306790400
                Virtual memory (bytes) snapshot=1685078016
                Total committed heap usage (bytes)=136056832
        Shuffle Errors
                BAD_ID=0
                CONNECTION=0
                IO_ERROR=0
                WRONG_LENGTH=0
                WRONG_MAP=0
                WRONG_REDUCE=0
        File Input Format Counters
                Bytes Read=73
        File Output Format Counters
                Bytes Written=82
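Once the job finishes, the output can be checked from the command line, e.g. with hdfs dfs -cat /out0000/part-r-00000. Since the job uses the default TextOutputFormat, each output record should appear as the line-number key and the original line text separated by a tab; the actual contents depend on word.txt and are not reproduced here.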
Result:
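A side note on the WARN line in the job log above ("Implement the Tool interface and execute your application with ToolRunner"): the driver can be wrapped so that generic Hadoop options (-D, -files, ...) are parsed automatically. The following is only a minimal sketch of that suggestion, using a hypothetical class name MainTool and reusing the MyInput and MyMapper classes defined earlier; it is not part of the original code.
package combiner;

import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;

// Hypothetical alternative driver that implements Tool, which removes the
// "Hadoop command-line option parsing not performed" warning seen in the log
public class MainTool extends Configured implements Tool {
    @Override
    public int run(String[] args) throws Exception {
        Job job = Job.getInstance(getConf());
        job.setJarByClass(MainTool.class);
        job.setInputFormatClass(Main.MyInput.class);
        job.setMapperClass(Main.MyMapper.class);
        job.setOutputKeyClass(IntWritable.class);
        job.setOutputValueClass(Text.class);
        FileInputFormat.addInputPath(job, new Path(args[0]));
        FileOutputFormat.setOutputPath(job, new Path(args[1]));
        return job.waitForCompletion(true) ? 0 : 1;
    }

    public static void main(String[] args) throws Exception {
        // ToolRunner parses the generic options before handing the rest to run()
        System.exit(ToolRunner.run(new MainTool(), args));
    }
}
It would be launched the same way as before, e.g. hadoop jar input.jar combiner.MainTool /word.txt followed by a fresh (hypothetical) output path.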