Preface
This post is a set of study notes on MapReduce, recording what was covered while learning it.
Experiment environment:
1. Linux Ubuntu 16.04
2. Hadoop 3.0.0
3. Eclipse 4.5.1
1. Starting Hadoop
- Go to the Hadoop startup script directory
cd /apps/hadoop/sbin
- Start Hadoop
./start-all.sh
- Run jps; if startup succeeded, you should see processes such as NameNode, DataNode, SecondaryNameNode, ResourceManager, and NodeManager (plus Jps itself)
2. Environment Setup
- Open Eclipse -> Window -> Preferences;
- Select Hadoop Map/Reduce, set the Hadoop installation directory to /apps/hadoop, click Apply, then OK;
- Click Window -> Show View -> Other -> MapReduce Tools -> Map/Reduce Locations; the corresponding view tab then appears;
- Click icon 1 from the figure in step 3, enter myhadoop as the Location name, set Port to 8020 under DFS Master, and click Finish; the page on the right of the step 3 figure appears;
- Click icon 2 from the figure in step 3 and select the content shown there; the content on the left of the step 3 figure appears.
This completes the environment configuration.
3. Custom Format Experiment
- Create a new project named test and copy the Hadoop configuration files into its src folder:
cp /apps/hadoop/etc/hadoop/{core-site.xml,hdfs-site.xml,log4j.properties} /home/dolphin/workspace/test/src
then create a package named mr;
- Create Point3D.java with the following code:
package mr;
import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;
import org.apache.hadoop.io.WritableComparable;
// A 3D point usable as a MapReduce key or value: WritableComparable requires
// write()/readFields() for serialization and compareTo() for sorting.
public class Point3D implements WritableComparable<Point3D> {
public float x, y, z;
public Point3D(float fx, float fy, float fz) {
this.x = fx;
this.y = fy;
this.z = fz;
}
public Point3D() {
this(0.0f, 0.0f, 0.0f);
}
public void readFields(DataInput in) throws IOException {
x = in.readFloat();
y = in.readFloat();
z = in.readFloat();
}
@Override
public void write(DataOutput out) throws IOException {
out.writeFloat(x);
out.writeFloat(y);
out.writeFloat(z);
}
public String toString() {
return "X:"+Float.toString(x) + ", "
+ "Y:"+Float.toString(y) + ", "
+ "Z:"+Float.toString(z);
}
public float distanceFromOrigin() {
return (float) Math.sqrt( x*x + y*y +z*z);
}
public int compareTo(Point3D other) {
return Float.compare(
distanceFromOrigin(),
other.distanceFromOrigin());
}
public boolean equals(Object o) {
if( !(o instanceof Point3D)) {
return false;
}
Point3D other = (Point3D) o;
return this.x == other.x && this.y == other.y && this.z == other.z;
}
public int hashCode() {
return Float.floatToIntBits(x)
^ Float.floatToIntBits(y)
^ Float.floatToIntBits(z);
}
}
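Because Point3D implements WritableComparable, Hadoop serializes it with write() and rebuilds it with readFields(). As a quick sanity check, here is a minimal standalone sketch (the class name WritableDemo is made up and not part of the experiment) that round-trips a point through an in-memory byte stream:
package mr;
import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.DataInputStream;
import java.io.DataOutputStream;

public class WritableDemo {
    public static void main(String[] args) throws Exception {
        Point3D original = new Point3D(1.0f, 2.0f, 3.0f);
        // Serialize the point the same way Hadoop would when shuffling it.
        ByteArrayOutputStream bytes = new ByteArrayOutputStream();
        original.write(new DataOutputStream(bytes));
        // Deserialize into a fresh instance.
        Point3D copy = new Point3D();
        copy.readFields(new DataInputStream(new ByteArrayInputStream(bytes.toByteArray())));
        System.out.println(copy);                      // X:1.0, Y:2.0, Z:3.0
        System.out.println(original.compareTo(copy));  // 0 (same distance from origin)
    }
}
If write() and readFields() ever disagreed on field order, the copy would come back scrambled, which is why the two methods are kept symmetric.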
- Create Point3DDriver.java with the following code:
package mr;
import java.io.IOException;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.util.GenericOptionsParser;
public class Point3DDriver {
static int count=0;
public static class MyMapper extends Mapper<Text, Text, Text, Point3D> {
protected void map(Text key, Text value, Mapper<Text, Text, Text, Point3D>.Context context)
throws IOException, InterruptedException {
count++;
String[] vs = value.toString().split(",");
Point3D p = new Point3D(Float.parseFloat(vs[0].split(":")[1]), Float.parseFloat(vs[1].split(":")[1]), Float.parseFloat(vs[2].split(":")[1]) );
context.write(new Text(key), p);
System.out.println("map==========>"+count);
}
}
public static class MyReducer extends Reducer<Text, Point3D, Text, Point3D>{
@Override
protected void reduce(Text key, Iterable<Point3D> values,
Reducer<Text, Point3D, Text, Point3D>.Context context) throws IOException, InterruptedException {
// Emit each point unchanged; no aggregation is needed in this experiment.
for (Point3D value : values) {
context.write(key, value);
}
}
}
public static void main(String[] args) {
try {
Configuration conf = new Configuration();
String[] paths = new GenericOptionsParser(conf, args).getRemainingArgs();
if (paths.length < 2) {
throw new RuntimeException("usage <input> <output>");
}
Job job = Job.getInstance(conf, "Point3DDriver");
job.setJarByClass(Point3DDriver.class);
job.setMapperClass(MyMapper.class);
job.setReducerClass(MyReducer.class);
job.setInputFormatClass(MyInputFormat.class);
job.setMapOutputKeyClass(Text.class);
job.setMapOutputValueClass(Point3D.class);
job.setOutputFormatClass(MyOutputFormat.class);
//job.setOutputFormatClass(MyOutputFormat2.class);
FileInputFormat.addInputPaths(job, paths[0]);
FileOutputFormat.setOutputPath(job, new Path(paths[1] + System.currentTimeMillis()));
System.exit(job.waitForCompletion(true) ? 0 : 1);
} catch (IOException e) {
e.printStackTrace();
} catch (ClassNotFoundException e) {
e.printStackTrace();
} catch (InterruptedException e) {
e.printStackTrace();
}
}
}
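To see the mapper's work in isolation: MyRecordReader (defined below) hands the mapper a Text key such as "one" and a Text value such as "x:1,y:2,z:3", and MyMapper parses the value into a Point3D. A minimal standalone sketch of that parsing step (the class MapperParseDemo is made up for illustration):
package mr;

public class MapperParseDemo {
    public static void main(String[] args) {
        // MyRecordReader turns the line "one=x:1,y:2,z:3" into
        // key = "one" and value = "x:1,y:2,z:3"; the mapper then parses the value:
        String value = "x:1,y:2,z:3";
        String[] vs = value.split(",");
        Point3D p = new Point3D(
            Float.parseFloat(vs[0].split(":")[1]),
            Float.parseFloat(vs[1].split(":")[1]),
            Float.parseFloat(vs[2].split(":")[1]));
        System.out.println(p); // X:1.0, Y:2.0, Z:3.0
    }
}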
- Create MyInputFormat.java with the following code:
package mr;
import java.io.IOException;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.compress.CompressionCodec;
import org.apache.hadoop.io.compress.CompressionCodecFactory;
import org.apache.hadoop.io.compress.SplittableCompressionCodec;
import org.apache.hadoop.mapreduce.InputSplit;
import org.apache.hadoop.mapreduce.JobContext;
import org.apache.hadoop.mapreduce.RecordReader;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
public class MyInputFormat extends FileInputFormat<Text,Text> {
@Override
protected boolean isSplitable(JobContext context, Path file) {
final CompressionCodec codec =
new CompressionCodecFactory(context.getConfiguration()).getCodec(file);
if (null == codec) {
return true;
}
return codec instanceof SplittableCompressionCodec;
}
@Override
public RecordReader<Text, Text> createRecordReader(InputSplit genericSplit, TaskAttemptContext context)
throws IOException, InterruptedException {
context.setStatus(genericSplit.toString());
return new MyRecordReader(context.getConfiguration());
}
}
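MyInputFormat only decides whether a file may be split and which record reader to use. The isSplitable() check returns true for plain files and for splittable codecs such as bzip2, but false for gzip. A small sketch of that rule applied to a few file names (the class SplitableDemo is made up; it assumes Hadoop 3's default codec list):
package mr;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.compress.CompressionCodec;
import org.apache.hadoop.io.compress.CompressionCodecFactory;
import org.apache.hadoop.io.compress.SplittableCompressionCodec;

public class SplitableDemo {
    public static void main(String[] args) {
        CompressionCodecFactory factory = new CompressionCodecFactory(new Configuration());
        for (String name : new String[] {"text.txt", "text.txt.gz", "text.txt.bz2"}) {
            CompressionCodec codec = factory.getCodec(new Path(name));
            // Same rule as MyInputFormat.isSplitable(): no codec, or a splittable one.
            boolean splitable = codec == null || codec instanceof SplittableCompressionCodec;
            System.out.println(name + " -> " + splitable); // expected: true, false, true
        }
    }
}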
- Create MyOutputFormat.java with the following code:
package mr;
import java.io.DataOutputStream;
import java.io.IOException;
import java.io.UnsupportedEncodingException;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataOutputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.compress.CompressionCodec;
import org.apache.hadoop.io.compress.GzipCodec;
import org.apache.hadoop.mapreduce.JobContext;
import org.apache.hadoop.mapreduce.RecordWriter;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.util.ReflectionUtils;
public class MyOutputFormat<K, V> extends FileOutputFormat<K, V> {
public static String SEPERATOR = "mapreduce.output.textoutputformat.separator";
protected static class MyLineRecordWriter<K, V> extends RecordWriter<K, V> {
private static final String utf8 = "UTF-8";
private static final byte[] newline;
static {
try {
newline = "\n".getBytes(utf8);
} catch (UnsupportedEncodingException uee) {
throw new IllegalArgumentException("can't find " + utf8 + " encoding");
}
}
protected DataOutputStream out;
private final byte[] keyValueSeparator;
public MyLineRecordWriter(DataOutputStream out, String keyValueSeparator) {
this.out = out;
try {
this.keyValueSeparator = keyValueSeparator.getBytes(utf8);
} catch (UnsupportedEncodingException uee) {
throw new IllegalArgumentException("can't find " + utf8 + " encoding");
}
}
public MyLineRecordWriter(DataOutputStream out) {
this(out, "=========>");
}
/**
* Write the object to the byte stream, handling Text as a special case.
*
* @param o
* the object to print
* @throws IOException
* if the write throws, we pass it on
*/
private void writeObject(Object o) throws IOException {
if (o instanceof Text) {
Text to = (Text) o;
out.write(to.getBytes(), 0, to.getLength());
} else {
out.write(o.toString().getBytes(utf8));
}
}
public synchronized void write(K key, V value) throws IOException {
boolean nullKey = key == null || key instanceof NullWritable;
boolean nullValue = value == null || value instanceof NullWritable;
if (nullKey && nullValue) {
return;
}
if (!nullKey) {
writeObject(key);
}
if (!(nullKey || nullValue)) {
out.write(keyValueSeparator);
}
if (!nullValue) {
writeObject(value);
}
out.write(newline);
}
public synchronized void close(TaskAttemptContext context) throws IOException {
out.close();
}
}
public RecordWriter<K, V> getRecordWriter(TaskAttemptContext job) throws IOException, InterruptedException {
Configuration conf = job.getConfiguration();
boolean isCompressed = getCompressOutput(job);
String keyValueSeparator = conf.get(SEPERATOR, "=========>");
CompressionCodec codec = null;
String extension = "";
if (isCompressed) {
Class<? extends CompressionCodec> codecClass = getOutputCompressorClass(job, GzipCodec.class);
codec = (CompressionCodec) ReflectionUtils.newInstance(codecClass, conf);
extension = codec.getDefaultExtension();
}
Path file = getDefaultWorkFile(job, extension);
FileSystem fs = file.getFileSystem(conf);
if (!isCompressed) {
FSDataOutputStream fileOut = fs.create(file, false);
return new MyLineRecordWriter<K, V>(fileOut, keyValueSeparator);
} else {
FSDataOutputStream fileOut = fs.create(file, false);
return new MyLineRecordWriter<K, V>(new DataOutputStream(codec.createOutputStream(fileOut)),
keyValueSeparator);
}
}
}
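MyOutputFormat is essentially Hadoop's TextOutputFormat with the default key/value separator replaced by "=========>" (still configurable through mapreduce.output.textoutputformat.separator). A minimal local sketch of what one record looks like on disk (the class OutputFormatDemo is made up; it writes to an in-memory stream instead of HDFS):
package mr;
import java.io.ByteArrayOutputStream;
import java.io.DataOutputStream;
import org.apache.hadoop.io.Text;

public class OutputFormatDemo {
    public static void main(String[] args) throws Exception {
        ByteArrayOutputStream bytes = new ByteArrayOutputStream();
        // Use the line writer directly, outside of a job, with its default separator.
        MyOutputFormat.MyLineRecordWriter<Text, Point3D> writer =
            new MyOutputFormat.MyLineRecordWriter<>(new DataOutputStream(bytes));
        writer.write(new Text("one"), new Point3D(1.0f, 2.0f, 3.0f));
        writer.close(null); // only closes the underlying stream
        System.out.print(bytes.toString("UTF-8"));
        // Prints: one=========>X:1.0, Y:2.0, Z:3.0
    }
}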
- Create MyRecordReader.java with the following code:
package mr;
import java.io.IOException;
import org.apache.hadoop.classification.InterfaceAudience;
import org.apache.hadoop.classification.InterfaceStability;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.InputSplit;
import org.apache.hadoop.mapreduce.RecordReader;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.hadoop.mapreduce.lib.input.LineRecordReader;
@InterfaceAudience.Public
@InterfaceStability.Stable
public class MyRecordReader extends RecordReader<Text, Text> {
public static final String KEY_VALUE_SEPERATOR =
"mapreduce.input.mylinerecordreader.key.value.separator";
private final LineRecordReader lineRecordReader;
private byte separator = (byte) '=';
private Text innerValue;
private Text key;
private Text value;
public Class<Text> getKeyClass() { return Text.class; }
public MyRecordReader(Configuration conf)
throws IOException {
lineRecordReader = new LineRecordReader();
String sepStr = conf.get(KEY_VALUE_SEPERATOR, "=");
this.separator = (byte) sepStr.charAt(0);
}
public void initialize(InputSplit genericSplit,
TaskAttemptContext context) throws IOException {
lineRecordReader.initialize(genericSplit, context);
}
public static int findSeparator(byte[] utf, int start, int length,
byte sep) {
for (int i = start; i < (start + length); i++) {
if (utf[i] == sep) {
return i;
}
}
return -1;
}
public static void setKeyValue(Text key, Text value, byte[] line,
int lineLen, int pos) {
if (pos == -1) {
key.set(line, 0, lineLen);
value.set("");
} else {
key.set(line, 0, pos);
value.set(line, pos + 1, lineLen - pos - 1);
}
}
/** Read key/value pair in a line. */
public synchronized boolean nextKeyValue()
throws IOException {
byte[] line = null;
int lineLen = -1;
if (lineRecordReader.nextKeyValue()) {
innerValue = lineRecordReader.getCurrentValue();
line = innerValue.getBytes();
lineLen = innerValue.getLength();
} else {
return false;
}
if (line == null)
return false;
if (key == null) {
key = new Text();
}
if (value == null) {
value = new Text();
}
int pos = findSeparator(line, 0, lineLen, this.separator);
setKeyValue(key, value, line, lineLen, pos);
return true;
}
public Text getCurrentKey() {
return key;
}
public Text getCurrentValue() {
return value;
}
public float getProgress() throws IOException {
return lineRecordReader.getProgress();
}
public synchronized void close() throws IOException {
lineRecordReader.close();
}
}
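MyRecordReader wraps LineRecordReader and splits each line at the first separator byte ('=' by default, configurable through mapreduce.input.mylinerecordreader.key.value.separator) into a key and a value. A quick local check of that split (the class RecordReaderDemo is made up for illustration):
package mr;
import org.apache.hadoop.io.Text;

public class RecordReaderDemo {
    public static void main(String[] args) {
        byte[] line = "one=x:1,y:2,z:3".getBytes();
        Text key = new Text();
        Text value = new Text();
        // Locate the first '=' and split the line around it, as nextKeyValue() does.
        int pos = MyRecordReader.findSeparator(line, 0, line.length, (byte) '=');
        MyRecordReader.setKeyValue(key, value, line, line.length, pos);
        System.out.println(key + " / " + value); // one / x:1,y:2,z:3
    }
}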
- Create the data directories in HDFS, prepare the data file, and upload it:
// commands
hadoop fs -mkdir /test/
hadoop fs -mkdir /test/input/
// data (text.txt)
one=x:1,y:2,z:3
two=x:4,y:5,z:6
three=x:7,y:8,z:9
// upload the data to HDFS
hadoop fs -put /home/dolphin/Desktop/text.txt /test/input/
- In the Run Configurations for Point3DDriver.java (Java Application), open the (x)= Arguments tab and enter the program arguments hdfs://localhost:8020/test/input/text.txt hdfs://localhost:8020/test/output/ , then click Apply and Run. The driver appends a timestamp to the output path, so each run writes to a fresh directory under /test/output/; its part file should contain one line per point in the key=========>value form shown in the MyLineRecordWriter sketch above.
Summary
This experiment involves a fairly large amount of code; study carefully how the classes call one another and what each is responsible for, with the goal of mastering custom-format output.