Preface
This post is a set of study notes on MapReduce, recording what was covered while learning it.
Experiment environment:
1. Linux Ubuntu 16.04
2. Hadoop 3.0.0
3. Eclipse 4.5.1
1. Starting Hadoop
- Go to the Hadoop startup script directory
cd /apps/hadoop/sbin
- Start Hadoop
./start-all.sh
- Run jps; if startup succeeded, you should see processes such as NameNode, DataNode, SecondaryNameNode, ResourceManager, and NodeManager (plus Jps itself)
2. Environment Setup
- Open Eclipse -> Window -> Preferences;
- Select Hadoop Map/Reduce, set the Hadoop installation directory to /apps/hadoop, click Apply, then OK;
- Click Window -> Show View -> Other -> MapReduce Tools -> Map/Reduce Locations; the corresponding view tab then appears;
- Click icon 1 from the figure in step 3, enter myhadoop as the Location name, set Port to 8020 under DFS Master, and click Finish; the page on the right of the step 3 figure appears;
- Click icon 2 from the figure in step 3 and select the content shown there; the content on the left of the step 3 figure appears.
This completes the environment configuration.
3. Custom Format Experiment
- Create a new project named test and copy the Hadoop configuration files into its src folder:
cp /apps/hadoop/etc/hadoop/{core-site.xml,hdfs-site.xml,log4j.properties} /home/dolphin/workspace/test/src
then create a package named mr;
- Create Point3D.java with the following code:
package mr;
import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;
import org.apache.hadoop.io.WritableComparable;
// A 3D point usable as a MapReduce key or value: WritableComparable requires
// write()/readFields() for serialization and compareTo() for sorting.
public class Point3D implements WritableComparable<Point3D> {
public float x, y, z;
public Point3D(float fx, float fy, float fz) {
this.x = fx;
this.y = fy;
this.z = fz;
}
public Point3D() {
this(0.0f, 0.0f, 0.0f);
}
public void readFields(DataInput in) throws IOException {
x = in.readFloat();
y = in.readFloat();
z = in.readFloat();
}
@Override
public void write(DataOutput out) throws IOException {
out.writeFloat(x);
out.writeFloat(y);
out.writeFloat(z);
}
public String toString() {
return "X:"+Float.toString(x) + ", "
+ "Y:"+Float.toString(y) + ", "
+ "Z:"+Float.toString(z);
}
public float distanceFromOrigin() {
return (float) Math.sqrt( x*x + y*y +z*z);
}
public int compareTo(Point3D other) {
return Float.compare(
distanceFromOrigin(),
other.distanceFromOrigin());
}
public boolean equals(Object o) {
if( !(o instanceof Point3D)) {
return false;
}
Point3D other = (Point3D) o;
return this.x == other.x && this.y == other.y && this.z == other.z;
}
public int hashCode() {
return Float.floatToIntBits(x)
^ Float.floatToIntBits(y)
^ Float.floatToIntBits(z);
}
}
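Because Point3D implements WritableComparable, Hadoop serializes it with write() and rebuilds it with readFields(). As a quick sanity check, here is a minimal standalone sketch (the class name WritableDemo is made up and not part of the experiment) that round-trips a point through an in-memory byte stream:
package mr;
import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.DataInputStream;
import java.io.DataOutputStream;

public class WritableDemo {
    public static void main(String[] args) throws Exception {
        Point3D original = new Point3D(1.0f, 2.0f, 3.0f);
        // Serialize the point the same way Hadoop would when shuffling it.
        ByteArrayOutputStream bytes = new ByteArrayOutputStream();
        original.write(new DataOutputStream(bytes));
        // Deserialize into a fresh instance.
        Point3D copy = new Point3D();
        copy.readFields(new DataInputStream(new ByteArrayInputStream(bytes.toByteArray())));
        System.out.println(copy);                      // X:1.0, Y:2.0, Z:3.0
        System.out.println(original.compareTo(copy));  // 0 (same distance from origin)
    }
}
If write() and readFields() ever disagreed on field order, the copy would come back scrambled, which is why the two methods are kept symmetric.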
- Create Point3DDriver.java with the following code:
package mr;
import java.io.IOException;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.util.GenericOptionsParser;
public class Point3DDriver {
static int count=0;
public static class MyMapper extends Mapper<Text, Text, Text, Point3D> {
protected void map(Text key, Text value, Mapper<Text, Text, Text, Point3D>.Context context)
throws IOException, InterruptedException {
count++;
String[] vs = value.toString().split(",");
Point3D p = new Point3D(Float.parseFloat(vs[0].split(":")[1]), Float.parseFloat(vs[1].split(":")[1]), Float.parseFloat(vs[2].split(":")[1]) );
context.write(new Text(key), p);
System.out.println("map==========>"+count);
}
}
public static class MyReducer extends Reducer<Text, Point3D, Text, Point3D>{
@Override
protected void reduce(Text key, Iterable<Point3D> values,
Reducer<Text, Point3D, Text, Point3D>.Context context) throws IOException, InterruptedException {
// Emit each point unchanged; no aggregation is needed in this experiment.
for (Point3D value : values) {
context.write(key, value);
}
}
}
public static void main(String[] args) {
try {
Configuration conf = new Configuration();
String[] paths = new GenericOptionsParser(conf, args).getRemainingArgs();
if (paths.length < 2) {
throw new RuntimeException("usage <input> <output>");
}
Job job = Job.getInstance(conf, "Point3DDriver");
job.setJarByClass(Point3DDriver.class);
job.setMapperClass(MyMapper.class);
job.setReducerClass(MyReducer.class);
job.setInputFormatClass(MyInputFormat.class);
job.setMapOutputKeyClass(Text.class);
job.setMapOutputValueClass(Point3D.class);
job.setOutputFormatClass(MyOutputFormat.class);
//job.setOutputFormatClass(MyOutputFormat2.class);
FileInputFormat.addInputPaths(job, paths[0]);
FileOutputFormat.setOutputPath(job, new Path(paths[1] + System.currentTimeMillis()));
System.exit(job.waitForCompletion(true) ? 0 : 1);
} catch (IOException e) {
e.printStackTrace();
} catch (ClassNotFoundException e) {
e.printStackTrace();
} catch (InterruptedException e) {
e.printStackTrace();
}
}
}
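To see the mapper's work in isolation: MyRecordReader (defined below) hands the mapper a Text key such as "one" and a Text value such as "x:1,y:2,z:3", and MyMapper parses the value into a Point3D. A minimal standalone sketch of that parsing step (the class MapperParseDemo is made up for illustration):
package mr;

public class MapperParseDemo {
    public static void main(String[] args) {
        // MyRecordReader turns the line "one=x:1,y:2,z:3" into
        // key = "one" and value = "x:1,y:2,z:3"; the mapper then parses the value:
        String value = "x:1,y:2,z:3";
        String[] vs = value.split(",");
        Point3D p = new Point3D(
            Float.parseFloat(vs[0].split(":")[1]),
            Float.parseFloat(vs[1].split(":")[1]),
            Float.parseFloat(vs[2].split(":")[1]));
        System.out.println(p); // X:1.0, Y:2.0, Z:3.0
    }
}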
- Create MyInputFormat.java with the following code:
package mr;
import java.io.IOException;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.compress.CompressionCodec;
import org.apache.hadoop.io.compress.CompressionCodecFactory;
import org.apache.hadoop.io.compress.SplittableCompressionCodec;
import org.apache.hadoop.mapreduce.InputSplit;
import org.apache.hadoop.mapreduce.JobContext;
import org.apache.hadoop.mapreduce.RecordReader;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
public class MyInputFormat extends FileInputFormat<Text,Text> {
@Override
protected boolean isSplitable(JobContext context, Path file) {
final CompressionCodec codec =
new CompressionCodecFactory(context.getConfiguration()).getCodec(file);
if (null == codec) {
return true;
}
return codec instanceof SplittableCompressionCodec;
}
@Override
public RecordReader<Text, Text> createRecordReader(InputSplit genericSplit, TaskAttemptContext context)
throws IOException, InterruptedException {
context.setStatus(genericSplit.toString());
return new MyRecordReader(context.getConfiguration());
}
}
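MyInputFormat only decides whether a file may be split and which record reader to use. The isSplitable() check returns true for plain files and for splittable codecs such as bzip2, but false for gzip. A small sketch of that rule applied to a few file names (the class SplitableDemo is made up; it assumes Hadoop 3's default codec list):
package mr;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.compress.CompressionCodec;
import org.apache.hadoop.io.compress.CompressionCodecFactory;
import org.apache.hadoop.io.compress.SplittableCompressionCodec;

public class SplitableDemo {
    public static void main(String[] args) {
        CompressionCodecFactory factory = new CompressionCodecFactory(new Configuration());
        for (String name : new String[] {"text.txt", "text.txt.gz", "text.txt.bz2"}) {
            CompressionCodec codec = factory.getCodec(new Path(name));
            // Same rule as MyInputFormat.isSplitable(): no codec, or a splittable one.
            boolean splitable = codec == null || codec instanceof SplittableCompressionCodec;
            System.out.println(name + " -> " + splitable); // expected: true, false, true
        }
    }
}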
- Create MyOutputFormat.java with the following code:
package mr;
import java.io.DataOutputStream;
import java.io.IOException;
import java.io.UnsupportedEncodingException;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataOutputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.compress.CompressionCodec;
import org.apache.hadoop.io.compress.GzipCodec;
import org.apache.hadoop.mapreduce.JobContext;
import org.apache.hadoop.mapreduce.RecordWriter;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.util.ReflectionUtils;
public class MyOutputFormat<K, V> extends FileOutputFormat<K, V> {
public static String SEPERATOR = "mapreduce.output.textoutputformat.separator";
protected static class MyLineRecordWriter<K, V> extends RecordWriter<K, V> {
private static final String utf8 = "UTF-8";
private static final byte[] newline;
static {
try {
newline = "\n".getBytes(utf8);
} catch (UnsupportedEncodingException uee) {
throw new IllegalArgumentException("can't find " + utf8 + " encoding");
}
}
protected DataOutputStream out;
private final byte[] keyValueSeparator;
public MyLineRecordWriter(DataOutputStream out, String keyValueSeparator) {
this.out = out;
try {
this.keyValueSeparator = keyValueSeparator.getBytes(utf8);
} catch (UnsupportedEncodingException uee) {
throw new IllegalArgumentException("can't find " + utf8 + " encoding");
}
}
public MyLineRecordWriter(DataOutputStream out) {
this(out, "=========>");
}
/**
* Write the object to the byte stream, handling Text as a special case.
*
* @param o
* the object to print
* @throws IOException
* if the write throws, we pass it on
*/
private void writeObject(Object o) throws IOException {
if (o instanceof Text) {
Text to = (Text) o;
out.write(to.getBytes(), 0, to.getLength());
} else {
out.write(o.toString().getBytes(utf8));
}
}
public synchronized void write(K key, V value) throws IOException {
boolean nullKey = key == null || key instanceof NullWritable;
boolean nullValue = value == null || value instanceof NullWritable;
if (nullKey && nullValue) {
return;
}
if (!nullKey) {
writeObject(key);
}
if (!(nullKey || nullValue)) {
out.write(keyValueSeparator);
}
if (!nullValue) {
writeObject(value);
}
out.write(newline);
}
public synchronized void close(TaskAttemptContext context) throws IOException {
out.close();
}
}
public RecordWriter<K, V> getRecordWriter(TaskAttemptContext job) throws IOException, InterruptedException {
Configuration conf = job.getConfiguration();
boolean isCompressed = getCompressOutput(job);
String keyValueSeparator = conf.get(SEPERATOR, "=========>");
CompressionCodec codec = null;
String extension = "";
if (isCompressed) {
Class<? extends CompressionCodec> codecClass = getOutputCompressorClass(job, GzipCodec.class);
codec = (CompressionCodec) ReflectionUtils.newInstance(codecClass, conf);
extension = codec.getDefaultExtension();
}
Path file = getDefaultWorkFile(job, extension);
FileSystem fs = file.getFileSystem(conf);
if (!isCompressed) {
FSDataOutputStream fileOut = fs.create(file, false);
return new MyLineRecordWriter<K, V>(fileOut, keyValueSeparator);
} else {
FSDataOutputStream fileOut = fs.create(file, false);
return new MyLineRecordWriter<K, V>(new DataOutputStream(codec.createOutputStream(fileOut)),
keyValueSeparator);
}
}
}
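MyOutputFormat is essentially Hadoop's TextOutputFormat with the default key/value separator replaced by "=========>" (still configurable through mapreduce.output.textoutputformat.separator). A minimal local sketch of what one record looks like on disk (the class OutputFormatDemo is made up; it writes to an in-memory stream instead of HDFS):
package mr;
import java.io.ByteArrayOutputStream;
import java.io.DataOutputStream;
import org.apache.hadoop.io.Text;

public class OutputFormatDemo {
    public static void main(String[] args) throws Exception {
        ByteArrayOutputStream bytes = new ByteArrayOutputStream();
        // Use the line writer directly, outside of a job, with its default separator.
        MyOutputFormat.MyLineRecordWriter<Text, Point3D> writer =
            new MyOutputFormat.MyLineRecordWriter<>(new DataOutputStream(bytes));
        writer.write(new Text("one"), new Point3D(1.0f, 2.0f, 3.0f));
        writer.close(null); // only closes the underlying stream
        System.out.print(bytes.toString("UTF-8"));
        // Prints: one=========>X:1.0, Y:2.0, Z:3.0
    }
}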
- Create MyRecordReader.java with the following code:
package mr;
import java.io.IOException;
import org.apache.hadoop.classification.InterfaceAudience;
import org.apache.hadoop.classification.InterfaceStability;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.InputSplit;
import org.apache.hadoop.mapreduce.RecordReader;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.hadoop.mapreduce.lib.input.LineRecordReader;
@InterfaceAudience.Public
@InterfaceStability.Stable
public class MyRecordReader extends RecordReader<Text, Text> {
public static final String KEY_VALUE_SEPERATOR =
"mapreduce.input.mylinerecordreader.key.value.separator";
private final LineRecordReader lineRecordReader;
private byte separator = (byte) '=';
private Text innerValue;
private Text key;
private Text value;
public Class<Text> getKeyClass() { return Text.class; }
public MyRecordReader(Configuration conf)
throws IOException {
lineRecordReader = new LineRecordReader();
String sepStr = conf.get(KEY_VALUE_SEPERATOR, "=");
this.separator = (byte) sepStr.charAt(0);
}
public void initialize(InputSplit genericSplit,
TaskAttemptContext context) throws IOException {
lineRecordReader.initialize(genericSplit, context);
}
public static int findSeparator(byte[] utf, int start, int length,
byte sep) {
for (int i = start; i < (start + length); i++) {
if (utf[i] == sep) {
return i;
}
}
return -1;
}
public static void setKeyValue(Text key, Text value, byte[] line,
int lineLen, int pos) {
if (pos == -1) {
key.set(line, 0, lineLen);
value.set("");
} else {
key.set(line, 0, pos);
value.set(line, pos + 1, lineLen - pos - 1);
}
}
/** Read key/value pair in a line. */
public synchronized boolean nextKeyValue()
throws IOException {
byte[] line = null;
int lineLen = -1;
if (lineRecordReader.nextKeyValue()) {
innerValue = lineRecordReader.getCurrentValue();
line = innerValue.getBytes();
lineLen = innerValue.getLength();
} else {
return false;
}
if (line == null)
return false;
if (key == null) {
key = new Text();
}
if (value == null) {
value = new Text();
}
int pos = findSeparator(line, 0, lineLen, this.separator);
setKeyValue(key, value, line, lineLen, pos);
return true;
}
public Text getCurrentKey() {
return key;
}
public Text getCurrentValue() {
return value;
}
public float getProgress() throws IOException {
return lineRecordReader.getProgress();
}
public synchronized void close() throws IOException {
lineRecordReader.close();
}
}
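MyRecordReader wraps LineRecordReader and splits each line at the first separator byte ('=' by default, configurable through mapreduce.input.mylinerecordreader.key.value.separator) into a key and a value. A quick local check of that split (the class RecordReaderDemo is made up for illustration):
package mr;
import org.apache.hadoop.io.Text;

public class RecordReaderDemo {
    public static void main(String[] args) {
        byte[] line = "one=x:1,y:2,z:3".getBytes();
        Text key = new Text();
        Text value = new Text();
        // Locate the first '=' and split the line around it, as nextKeyValue() does.
        int pos = MyRecordReader.findSeparator(line, 0, line.length, (byte) '=');
        MyRecordReader.setKeyValue(key, value, line, line.length, pos);
        System.out.println(key + " / " + value); // one / x:1,y:2,z:3
    }
}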
- Create the data directories in HDFS, prepare the data file, and upload it:
// commands
hadoop fs -mkdir /test/
hadoop fs -mkdir /test/input/
// data (text.txt)
one=x:1,y:2,z:3
two=x:4,y:5,z:6
three=x:7,y:8,z:9
// upload the data to HDFS
hadoop fs -put /home/dolphin/Desktop/text.txt /test/input/
- In the Run Configurations for Point3DDriver.java (Java Application), open the (x)= Arguments tab and enter the program arguments hdfs://localhost:8020/test/input/text.txt hdfs://localhost:8020/test/output/ , then click Apply and Run. The driver appends a timestamp to the output path, so each run writes to a fresh directory under /test/output/; its part file should contain one line per point in the key=========>value form shown in the MyLineRecordWriter sketch above.
Summary
This experiment involves a fairly large amount of code; study carefully how the classes call one another and what each is responsible for, with the goal of mastering custom-format output.