1. Overall Project Architecture
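The project is a plain Maven project; assuming the standard Maven source layout (only the class names are given by the code below), the tree looks roughly like this:

SecondSort
├── pom.xml
└── src/main/java/org/example
    ├── MyKeyPair.java          // custom composite key
    ├── MyPartitioner.java      // custom partitioner
    ├── MyGroupComparator.java  // custom grouping comparator
    ├── MyMapper.java           // Mapper
    ├── MyReducer.java          // Reducer
    └── MySecondSort.java       // job driver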
2. The Project's pom.xml
<?xml version="1.0" encoding="UTF-8"?>
<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
  <modelVersion>4.0.0</modelVersion>

  <groupId>org.example</groupId>
  <artifactId>SecondSort</artifactId>
  <version>1.0-SNAPSHOT</version>

  <name>SecondSort</name>
  <!-- FIXME change it to the project's website -->
  <url>http://www.example.com</url>

  <properties>
    <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
    <maven.compiler.source>1.7</maven.compiler.source>
    <maven.compiler.target>1.7</maven.compiler.target>
  </properties>

  <dependencies>
    <dependency>
      <groupId>junit</groupId>
      <artifactId>junit</artifactId>
      <version>4.11</version>
      <scope>test</scope>
    </dependency>
    <dependency>
      <groupId>org.apache.hadoop</groupId>
      <artifactId>hadoop-client</artifactId>
      <version>2.10.0</version>
    </dependency>
  </dependencies>

  <build>
    <pluginManagement><!-- lock down plugins versions to avoid using Maven defaults (may be moved to parent pom) -->
      <plugins>
        <!-- clean lifecycle, see https://maven.apache.org/ref/current/maven-core/lifecycles.html#clean_Lifecycle -->
        <plugin>
          <artifactId>maven-clean-plugin</artifactId>
          <version>3.1.0</version>
        </plugin>
        <!-- default lifecycle, jar packaging: see https://maven.apache.org/ref/current/maven-core/default-bindings.html#Plugin_bindings_for_jar_packaging -->
        <plugin>
          <artifactId>maven-resources-plugin</artifactId>
          <version>3.0.2</version>
        </plugin>
        <plugin>
          <artifactId>maven-compiler-plugin</artifactId>
          <version>3.8.0</version>
        </plugin>
        <plugin>
          <artifactId>maven-surefire-plugin</artifactId>
          <version>2.22.1</version>
        </plugin>
        <plugin>
          <artifactId>maven-jar-plugin</artifactId>
          <version>3.0.2</version>
        </plugin>
        <plugin>
          <artifactId>maven-install-plugin</artifactId>
          <version>2.5.2</version>
        </plugin>
        <plugin>
          <artifactId>maven-deploy-plugin</artifactId>
          <version>2.8.2</version>
        </plugin>
        <!-- site lifecycle, see https://maven.apache.org/ref/current/maven-core/lifecycles.html#site_Lifecycle -->
        <plugin>
          <artifactId>maven-site-plugin</artifactId>
          <version>3.7.1</version>
        </plugin>
        <plugin>
          <artifactId>maven-project-info-reports-plugin</artifactId>
          <version>3.0.0</version>
        </plugin>
      </plugins>
    </pluginManagement>
  </build>
</project>
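A note on the dependencies: hadoop-client pulls in everything the job needs to compile and to run locally. When submitting to a cluster that already provides the Hadoop jars on its classpath, it is common to mark this dependency <scope>provided</scope> so it is not packaged into the artifact.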
3. The Code for Each Class
The custom composite key class
package org.example;

import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;
import org.apache.hadoop.io.WritableComparable;

public class MyKeyPair implements WritableComparable<MyKeyPair> {
    // The composite key's fields
    private String first;  // first sort field
    private int second;    // second sort field

    // Deserialization: read the fields back from the input stream
    @Override
    public void readFields(DataInput dataInput) throws IOException {
        this.first = dataInput.readUTF();
        this.second = dataInput.readInt();
    }

    // Serialization: write the fields to the output stream
    @Override
    public void write(DataOutput dataOutput) throws IOException {
        dataOutput.writeUTF(first);
        dataOutput.writeInt(second);
    }

    // Comparator: defines the sort order of the composite keys.
    // The first field sorts ascending (the default); the second field
    // sorts descending.
    @Override
    public int compareTo(MyKeyPair k) {
        int res = this.first.compareTo(k.first);
        if (res != 0) { // if the first fields differ, they decide the order
            return res;
        } else {
            // negate the ascending comparison of the second fields
            // to get a descending order
            return -Integer.compare(this.second, k.second);
        }
    }

    // Getters and setters for the fields
    public int getSecond() {
        return second;
    }
    public void setSecond(int second) {
        this.second = second;
    }
    public String getFirst() {
        return this.first;
    }
    public void setFirst(String first) {
        this.first = first;
    }
}
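To see the ordering the composite key produces, here is a minimal local check (not part of the job; the class name and sample values are made up for illustration):

package org.example;

import java.util.ArrayList;
import java.util.Collections;
import java.util.List;

// Sorts a few sample keys with MyKeyPair.compareTo to show the order:
// first field ascending, second field descending.
public class KeyOrderDemo {
    public static void main(String[] args) {
        List<MyKeyPair> keys = new ArrayList<>();
        keys.add(pair("a", 3));
        keys.add(pair("b", 1));
        keys.add(pair("a", 7));
        Collections.sort(keys);
        // Expected output: a,7  a,3  b,1
        for (MyKeyPair k : keys) {
            System.out.println(k.getFirst() + "," + k.getSecond());
        }
    }

    private static MyKeyPair pair(String first, int second) {
        MyKeyPair k = new MyKeyPair();
        k.setFirst(first);
        k.setSecond(second);
        return k;
    }
}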
The custom partitioner class
package org.example;

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.mapreduce.Partitioner;

public class MyPartitioner extends Partitioner<MyKeyPair, IntWritable> {
    // Custom partitioning:
    // myKeyPair is the map output key, intWritable the map output value,
    // and i the number of partitions (i.e. the number of reducers).
    // Returns the partition number for this key.
    @Override
    public int getPartition(MyKeyPair myKeyPair, IntWritable intWritable, int i) {
        // Partition on the first field only, so all records with the same
        // first field reach the same reducer. Masking with Integer.MAX_VALUE
        // clears the sign bit, keeping the result non-negative.
        return (myKeyPair.getFirst().hashCode() & Integer.MAX_VALUE) % i;
    }
}
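A quick way to convince yourself that records sharing a first field land in the same partition is a small local check (a hypothetical demo class; numPartitions = 3 is arbitrary):

package org.example;

import org.apache.hadoop.io.IntWritable;

// Calls getPartition directly: the two "a" keys print the same
// partition number even though their second fields differ.
public class PartitionDemo {
    public static void main(String[] args) {
        MyPartitioner partitioner = new MyPartitioner();
        IntWritable dummyValue = new IntWritable(0);
        for (String first : new String[]{"a", "a", "b"}) {
            MyKeyPair key = new MyKeyPair();
            key.setFirst(first);
            key.setSecond(42);
            System.out.println(first + " -> partition "
                    + partitioner.getPartition(key, dummyValue, 3));
        }
    }
}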
The custom grouping comparator class
package org.example;

import org.apache.hadoop.io.WritableComparable;
import org.apache.hadoop.io.WritableComparator;

public class MyGroupComparator extends WritableComparator {
    protected MyGroupComparator() {
        // Register the key type; true means instances of it will be created.
        // Without this, the comparator fails with a null-value error.
        super(MyKeyPair.class, true);
    }

    // Group on the first field only. Note the signature: the framework calls
    // compare(WritableComparable, WritableComparable), so this method must be
    // overridden; a compare(MyKeyPair, MyKeyPair) overload would never be
    // invoked and the grouping would silently not take effect.
    @Override
    public int compare(WritableComparable a, WritableComparable b) {
        MyKeyPair o1 = (MyKeyPair) a;
        MyKeyPair o2 = (MyKeyPair) b;
        return o1.getFirst().compareTo(o2.getFirst());
    }
}
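The effect is that two keys with the same first field compare as equal, so the framework feeds their values to a single reduce() call. A minimal sketch (demo class and sample values are made up):

package org.example;

// Two keys with the same first field but different second fields
// compare as equal under the grouping comparator.
public class GroupDemo {
    public static void main(String[] args) {
        MyGroupComparator comparator = new MyGroupComparator();
        MyKeyPair k1 = new MyKeyPair();
        k1.setFirst("a");
        k1.setSecond(3);
        MyKeyPair k2 = new MyKeyPair();
        k2.setFirst("a");
        k2.setSecond(7);
        System.out.println(comparator.compare(k1, k2)); // prints 0: same group
    }
}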
The Mapper class
package org.example;

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
import java.io.IOException;

public class MyMapper extends
        Mapper<LongWritable, Text, MyKeyPair, IntWritable> {
    @Override
    public void map(LongWritable key, Text value, Context context)
            throws IOException, InterruptedException {
        // Each input line holds two comma-separated fields
        String line = value.toString();
        String[] ss = line.split(",");
        // Build the MyKeyPair output key
        MyKeyPair outKey = new MyKeyPair();
        String first = ss[0];
        String second = ss[1];
        outKey.setFirst(first);
        outKey.setSecond(Integer.parseInt(second));
        // Build the IntWritable output value
        IntWritable outValue = new IntWritable();
        outValue.set(Integer.parseInt(second));
        // Emit the <key, value> pair
        context.write(outKey, outValue);
    }
}
The Reducer class
package org.example;

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;
import java.io.IOException;

public class MyReducer extends
        Reducer<MyKeyPair, IntWritable, Text, IntWritable> {
    @Override
    public void reduce(MyKeyPair key, Iterable<IntWritable> values, Context context)
            throws IOException, InterruptedException {
        // Each group holds all values for one first field, already sorted
        // descending by the second field; write one line per value.
        Text outKey = new Text();
        for (IntWritable value : values) {
            outKey.set(key.getFirst());
            context.write(outKey, value);
        }
    }
}
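One subtlety worth knowing: as the loop iterates over values, Hadoop advances the shared key object in step with them, so key.getSecond() changes on every iteration. key.getFirst() stays constant within the group precisely because the grouping comparator groups on it, which is why reading it inside the loop is safe here.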
The driver (main) class
package org.example;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;
import java.io.IOException;

public class MySecondSort {
    public static void main(String[] args) throws IOException, ClassNotFoundException, InterruptedException {
        // Initialize the Configuration
        Configuration conf = new Configuration();
        // Set the HDFS address (uncomment when running against a remote cluster)
        //conf.set("fs.default.name", "hdfs://192.168.170.133:9000");
        // Build the job
        Job myJob = Job.getInstance(conf, MySecondSort.class.getName());
        myJob.setJarByClass(MySecondSort.class);
        // Mapper class
        myJob.setMapperClass(MyMapper.class);
        // Custom partitioner
        myJob.setPartitionerClass(MyPartitioner.class);
        // Custom grouping comparator
        myJob.setGroupingComparatorClass(MyGroupComparator.class);
        // Reducer class
        myJob.setReducerClass(MyReducer.class);
        // Map output types; must match the map method's output types
        myJob.setMapOutputKeyClass(MyKeyPair.class);
        myJob.setMapOutputValueClass(IntWritable.class);
        // Reduce output types; must match the reduce method's output types
        myJob.setOutputKeyClass(Text.class);
        myJob.setOutputValueClass(IntWritable.class);
        // Input format: splits the input into line-oriented splits
        myJob.setInputFormatClass(TextInputFormat.class);
        // Output format: provides the RecordWriter that writes the results
        myJob.setOutputFormatClass(TextOutputFormat.class);
        // Input path
        FileInputFormat.addInputPath(myJob, new Path("/wc/input"));
        // Output path (must not already exist)
        FileOutputFormat.setOutputPath(myJob, new Path("/output"));
        // Submit the job to the Hadoop cluster and exit with its status
        System.exit(myJob.waitForCompletion(true) ? 0 : 1);
    }
}
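To try the job end to end: build the jar with mvn package, put a comma-separated input file under /wc/input on HDFS, and submit with hadoop jar target/SecondSort-1.0-SNAPSHOT.jar org.example.MySecondSort (the jar name follows the pom above; adjust the paths to your environment). With a made-up input file such as

a,3
b,1
a,7

the job writes the records grouped by the first field, with the second field in descending order (TextOutputFormat separates key and value with a tab):

a	7
a	3
b	1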