MapReduce
A MapReduce program is launched in parallel on many machines: the map tasks run first, and only after all of the map tasks have finished processing their data can the many reduce tasks be started. Coordinating this by hand is not realistic, so an automated scheduling platform is needed. For running distributed computation programs such as MapReduce, Hadoop provides exactly such a platform: YARN.
Installing a YARN cluster
A YARN cluster has two roles:
Master node: ResourceManager, 1 machine
Worker nodes: NodeManager, N machines
The ResourceManager is usually installed on a dedicated machine, while the NodeManagers should be co-located with the HDFS DataNodes.
**System environment file /etc/profile**
#java
export JAVA_HOME=/appdata/jdk
export CLASSPATH=.:$JAVA_HOME/jre/lib/rt.jar:$JAVA_HOME/lib/dt.jar:$JAVA_HOME/lib/tools.jar
export PATH=.:$PATH:$JAVA_HOME/bin
#hadoop
export HADOOP_HOME=/appdata/hadoop
export PATH=.:$PATH:$HADOOP_HOME/bin:$HADOOP_HOME/sbin
export HADOOP_COMMON_LIB_NATIVE_DIR=$HADOOP_HOME/lib/native
export HADOOP_OPTS="-Djava.library.path=$HADOOP_HOME/lib:$HADOOP_COMMON_LIB_NATIVE_DIR"
Edit the configuration file:
yarn-site.xml
<property>
<name>yarn.resourcemanager.hostname</name>
<value>n1</value>
</property>
<property>
<name>yarn.nodemanager.aux-services</name>
<value>mapreduce_shuffle</value>
</property>
<!-- memory (in MB) that each NodeManager can allocate -->
<property>
<name>yarn.nodemanager.resource.memory-mb</name>
<value>2048</value>
</property>
<!-- number of virtual CPU cores per node -->
<property>
<name>yarn.nodemanager.resource.cpu-vcores</name>
<value>2</value>
</property>
Then copy the file to every machine.
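For example, with scp (a sketch; the worker host names n2 and n3 are assumptions, and $HADOOP_HOME is /appdata/hadoop as set in /etc/profile above):
scp $HADOOP_HOME/etc/hadoop/yarn-site.xml n2:$HADOOP_HOME/etc/hadoop/
scp $HADOOP_HOME/etc/hadoop/yarn-site.xml n3:$HADOOP_HOME/etc/hadoop/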
Start the YARN cluster: sbin/start-yarn.sh
Stop it: sbin/stop-yarn.sh
Once startup is complete, you can open the ResourceManager web UI from a browser on the Windows machine:
http://n1:8088
and check whether the ResourceManager has recognized all of the NodeManager nodes.
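You can also check from the command line on the ResourceManager machine; the yarn CLI lists the registered NodeManagers:
yarn node -list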
The three modes of running a MapReduce program
Submitting the program from a local Windows client to run on the YARN cluster
Diagram of the overall execution flow of the wordcount program
Map phase: turn each line of text into <word, 1> key-value pairs.
Reduce phase: aggregate each group of kv pairs that share the same word by summing all of the values.
Note: in a MapReduce program, the input and output data of the map phase and of the reduce phase
must all use types that implement Hadoop's serialization framework,
for example:
String corresponds to Text
Integer corresponds to IntWritable
Long corresponds to LongWritable
**Mapper code**
package com.initialize;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
import java.io.IOException;
/**
* KEYIN: the type of the key that the map task reads: the starting byte offset of the line (Long)
* VALUEIN: the type of the value that the map task reads: the content of the line (String)
*
* KEYOUT: the key type of the kv results returned by the user-defined map method; in the wordcount logic this is the word (String)
* VALUEOUT: the value type of the kv results returned by the user-defined map method; in the wordcount logic this is an integer (Integer)
*
* However, the data produced by map has to be transferred to reduce, which requires serialization and deserialization. The JDK's native serialization produces rather bloated data, which would make transfers during a MapReduce job inefficient.
* Hadoop therefore designed its own serialization mechanism, so the data types transferred in MapReduce must implement Hadoop's serialization interface.
*
* For the common JDK basic types Long, String, Integer, Float, etc., Hadoop provides wrapper types that implement its serialization interface: LongWritable, Text, IntWritable, FloatWritable.
*
*/
public class WordcountMapper extends Mapper<LongWritable, Text, Text, IntWritable> {
@Override
protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
//split the line into words
String line = value.toString();
String[] words = line.split(" ");
for(String word : words){
context.write(new Text(word), new IntWritable(1));
}
}
}
**Reducer code**
package com.initialize;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;
import java.io.IOException;
import java.util.Iterator;
public class WordcountReducer extends Reducer<Text, IntWritable, Text, IntWritable> {
@Override
protected void reduce(Text key, Iterable<IntWritable> values, Context context) throws IOException, InterruptedException{
int count = 0;
Iterator<IntWritable> iterator = values.iterator();
while(iterator.hasNext()){
IntWritable value = iterator.next();
count += value.get();
}
context.write(key, new IntWritable(count));
}
}
**Client program**
package com.initialize;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import java.net.URI;
/**
* Client program used to submit a MapReduce job.
* Responsibilities:
* 1. Package up the parameters this job needs at runtime.
* 2. Interact with YARN to get the MapReduce program started and running.
*/
public class JobSubmitter {
public static void main(String[] args) throws Exception {
//set a JVM system property in code so the Job object knows which user identity to use when accessing HDFS
System.setProperty("HADOOP_USER_NAME", "lys");
Configuration conf = new Configuration();
//the default file system the job will access at runtime
conf.set("fs.defaultFS", "hdfs://n1:9000");
//where the job should be submitted to run
conf.set("mapreduce.framework.name", "yarn");
conf.set("yarn.resourcemanager.hostname", "n1");
//required when submitting this job from a Windows client: enable cross-platform submission
conf.set("mapreduce.app-submission.cross-platform", "true");
Job job = Job.getInstance(conf);
//1. parameter: the location of the jar
job.setJar("D:/wc.jar");//the programs in this jar are packaged into tasks and run on the YARN cluster
//job.setJarByClass(JobSubmitter.class);
//2. parameters: the Mapper and Reducer implementation classes this job will use
job.setMapperClass(WordcountMapper.class);
job.setReducerClass(WordcountReducer.class);
//3. parameters: the key/value types of the results produced by the job's Mapper and Reducer classes
job.setMapOutputKeyClass(Text.class);
job.setMapOutputValueClass(IntWritable.class);
job.setOutputKeyClass(Text.class);
job.setOutputValueClass(IntWritable.class);
Path output = new Path("/wordcount/output");
FileSystem fs = FileSystem.get(new URI("hdfs://n1:9000"), conf, "lys");
if(fs.exists(output)){
fs.delete(output, true);
}
//4. parameters: the path of the input data set to process, and the output path for the final results
FileInputFormat.setInputPaths(job, new Path("/wordcount/input"));
FileOutputFormat.setOutputPath(job, output); //note: the output path must not already exist
//5. parameter: the number of reduce tasks to launch
job.setNumReduceTasks(2);
//6. submit the job to YARN
boolean res = job.waitForCompletion(true);
System.exit(res?0:-1);
}
}
The run results can be viewed in the Hadoop HDFS cluster.
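For example, from any machine that can reach the cluster:
hadoop fs -ls /wordcount/output
hadoop fs -cat /wordcount/output/part-r-*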
Submitting the MapReduce program from a node of the YARN cluster
$ java -cp mapreduce24-1.0-SNAPSHOT.jar com.initialize.JobSubmitter2
//with only the application jar on the classpath, this fails with exceptions about classes that cannot be found.
$ hadoop jar mapreduce24-1.0-SNAPSHOT.jar com.initialize.JobSubmitter2
//the hadoop jar command adds the local Hadoop jars to the classpath.
//this runs in local mode: the default configuration (mapred-default.xml) sets mapreduce.framework.name=local.
$ hadoop jar mapreduce24-1.0-SNAPSHOT.jar com.initialize.JobSubmitter2
//after configuring mapreduce.framework.name=yarn in mapred-site.xml, the program is submitted to the cluster.
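For reference, the mapred-site.xml entry that switches job submission to YARN looks like this:
<property>
<name>mapreduce.framework.name</name>
<value>yarn</value>
</property>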
The WordcountMapper and WordcountReducer classes are exactly the same as the ones shown earlier; only the job-submission client changes:
package com.initialize;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
/**
* If this job-submission client is started on one of the machines of the Hadoop cluster,
* conf does not need to specify fs.defaultFS or mapreduce.framework.name,
*
* because when the client's main method is launched with hadoop jar xxx.jar com.initialize.JobSubmitter2,
* the hadoop jar command puts the jars and configuration files of the local Hadoop installation on the runtime classpath.
*
* The new Configuration() statement in the client's main method then loads those configuration files from the classpath, so
* fs.defaultFS, mapreduce.framework.name and yarn.resourcemanager.hostname are already configured.
*/
public class JobSubmitter2 {
public static void main(String[] args) throws Exception {
//no default file system specified
//no target specified for where the MapReduce job should run
Configuration conf = new Configuration();
Job job = Job.getInstance(conf);
job.setJarByClass(JobSubmitter2.class);
job.setMapperClass(WordcountMapper.class);
job.setReducerClass(WordcountReducer.class);
job.setMapOutputKeyClass(Text.class);
job.setMapOutputValueClass(IntWritable.class);
job.setOutputKeyClass(Text.class);
job.setOutputValueClass(IntWritable.class);
FileInputFormat.setInputPaths(job, new Path("/wordcount/input"));
FileOutputFormat.setOutputPath(job, new Path("/wordcount/output"));
job.setNumReduceTasks(3);
boolean res = job.waitForCompletion(true);
System.exit(res?0:1);
}
}
Run it.
**Screenshot of the run results**
Running in local mode
Local mode is well suited for debugging.
package com.initialize;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
public class JobSubmitterWindowLocal {
public static void main(String[] args) throws Exception {
Configuration conf = new Configuration();
//no default file system specified
//conf.set("fs.defaultFS", "file:///");//default value
//conf.set("mapreduce.framework.name", "local");//default value
Job job = Job.getInstance(conf);
job.setJarByClass(JobSubmitterWindowLocal.class);
job.setMapperClass(WordcountMapper.class);
job.setReducerClass(WordcountReducer.class);
job.setMapOutputKeyClass(Text.class);
job.setMapOutputValueClass(IntWritable.class);
job.setOutputKeyClass(Text.class);
job.setOutputValueClass(IntWritable.class);
FileInputFormat.setInputPaths(job, new Path("D:/input"));
FileOutputFormat.setOutputPath(job, new Path("D:/output"));
job.setNumReduceTasks(3);
boolean res = job.waitForCompletion(true);
System.exit(res?0:1);
}
}
Run results
Conceptual relationship between the MapReduce programming model and a concrete framework implementation
Examples
Traffic statistics
From the log data, compute the total amount of traffic used by each phone number.
package com.initialize.flow;
import org.apache.hadoop.io.Writable;
import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;
/**
* This case demonstrates how a custom data type implements Hadoop's serialization interface.
* 1. The class must keep a no-argument constructor.
* 2. The order in which write() outputs the fields' binary data must match the order in which readFields() reads them back.
*
*/
public class FlowBean implements Writable {
private int upFlow;
private int dFlow;
private String phone;
private int amountFlow;
public FlowBean(){}
public FlowBean(String phone, int upFlow, int dFlow){
this.phone = phone;
this.upFlow = upFlow;
this.dFlow = dFlow;
this.amountFlow = upFlow + dFlow;
}
/**
* Called by Hadoop when it serializes an object of this class.
* @param out
* @throws IOException
*/
@Override
public void write(DataOutput out) throws IOException {
out.writeInt(upFlow);
out.writeUTF(phone);
out.writeInt(dFlow);
out.writeInt(amountFlow);
}
/**
* Called by Hadoop when it deserializes an object of this class.
* @throws IOException
*/
@Override
public void readFields(DataInput in) throws IOException {
this.upFlow = in.readInt();
this.phone = in.readUTF();
this.dFlow = in.readInt();
this.amountFlow = in.readInt();
}
@Override
public String toString() {
return "FlowBean{" +
"upFlow=" + upFlow +
", dFlow=" + dFlow +
", phone='" + phone + '\'' +
", amountFlow=" + amountFlow +
'}';
}
public int getUpFlow() {
return upFlow;
}
public void setUpFlow(int upFlow) {
this.upFlow = upFlow;
}
public int getdFlow() {
return dFlow;
}
public void setdFlow(int dFlow) {
this.dFlow = dFlow;
}
public String getPhone() {
return phone;
}
public void setPhone(String phone) {
this.phone = phone;
}
public int getAmountFlow() {
return amountFlow;
}
public void setAmountFlow(int amountFlow) {
this.amountFlow = amountFlow;
}
}
package com.initialize.flow;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
import java.io.IOException;
public class FlowCountMapper extends Mapper<LongWritable, Text, Text, FlowBean> {
@Override
protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
String line = value.toString();
String[] fields = line.split("\t");
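//field 1 of each record is the phone number; the up/down flow columns are taken relative to the end of the line,
//because the URL and site-category columns in the middle are missing for some records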
String phone = fields[1];
int upFlow = Integer.parseInt(fields[fields.length-3]);
int dFlow = Integer.parseInt(fields[fields.length-2]);
context.write(new Text(phone), new FlowBean(phone, upFlow, dFlow));
}
}
package com.initialize.flow;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;
import java.io.IOException;
public class FlowCountReducer extends Reducer<Text, FlowBean, Text, FlowBean> {
/**
* key: a phone number
* values: the flow data from all of the access records produced by that phone number
*
* <135,flowBean1><135,flowBean2><135,flowBean3><135,flowBean4>
*/
@Override
protected void reduce(Text key, Iterable<FlowBean> values, Context context) throws IOException, InterruptedException {
int upSum = 0;
int dSum = 0;
for(FlowBean value:values){
upSum += value.getUpFlow();
dSum += value.getdFlow();
}
context.write(key, new FlowBean(key.toString(), upSum, dSum));
}
}
package com.initialize.flow;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
public class JobSubmitter {
public static void main(String[] args) throws Exception {
Configuration conf = new Configuration();
Job job = Job.getInstance(conf);
job.setJarByClass(JobSubmitter.class);
job.setMapperClass(FlowCountMapper.class);
job.setReducerClass(FlowCountReducer.class);
job.setMapOutputKeyClass(Text.class);
job.setMapOutputValueClass(FlowBean.class);
job.setOutputKeyClass(Text.class);
job.setOutputValueClass(FlowBean.class);
FileInputFormat.setInputPaths(job, new Path("C:\\Users\\Desktop\\input"));
FileOutputFormat.setOutputPath(job, new Path("C:\\Users\\Desktop\\output"));
job.waitForCompletion(true);
}
}
Input data:
1363157985066 13726230503 00-FD-07-A4-72-B8:CMCC 120.196.100.82 i02.c.aliimg.com 24 27 2481 24681 200
1363157995052 13826544101 5C-0E-8B-C7-F1-E0:CMCC 120.197.40.4 4 0 264 0 200
1363157991076 13926435656 20-10-7A-28-CC-0A:CMCC 120.196.100.99 2 4 132 1512 200
1363154400022 13926251106 5C-0E-8B-8B-B1-50:CMCC 120.197.40.4 4 0 240 0 200
1363157993044 18211575961 94-71-AC-CD-E6-18:CMCC-EASY 120.196.100.99 iface.qiyi.com 视频网站 15 12 1527 2106 200
1363157995074 84138413 5C-0E-8B-8C-E8-20:7DaysInn 120.197.40.4 122.72.52.12 20 16 4116 1432 200
1363157993055 13560439658 C4-17-FE-BA-DE-D9:CMCC 120.196.100.99 18 15 1116 954 200
1363157995033 15920133257 5C-0E-8B-C7-BA-20:CMCC 120.197.40.4 sug.so.360.cn 信息安全 20 20 3156 2936 200
1363157983019 13719199419 68-A1-B7-03-07-B1:CMCC-EASY 120.196.100.82 4 0 240 0 200
1363157984041 13660577991 5C-0E-8B-92-5C-20:CMCC-EASY 120.197.40.4 s19.cnzz.com 站点统计 24 9 6960 690 200
1363157973098 15013685858 5C-0E-8B-C7-F7-90:CMCC 120.197.40.4 rank.ie.sogou.com 搜索引擎 28 27 3659 3538 200
1363157986029 15989002119 E8-99-C4-4E-93-E0:CMCC-EASY 120.196.100.99 www.umeng.com 站点统计 3 3 1938 180 200
1363157992093 13560439658 C4-17-FE-BA-DE-D9:CMCC 120.196.100.99 15 9 918 4938 200
1363157986041 13480253104 5C-0E-8B-C7-FC-80:CMCC-EASY 120.197.40.4 3 3 180 180 200
1363157984040 13602846565 5C-0E-8B-8B-B6-00:CMCC 120.197.40.4 2052.flash2-http.qq.com 综合门户 15 12 1938 2910 200
1363157995093 13922314466 00-FD-07-A2-EC-BA:CMCC 120.196.100.82 img.qfc.cn 12 12 3008 3720 200
1363157982040 13502468823 5C-0A-5B-6A-0B-D4:CMCC-EASY 120.196.100.99 y0.ifengimg.com 综合门户 57 102 7335 110349 200
1363157986072 18320173382 84-25-DB-4F-10-1A:CMCC-EASY 120.196.100.99 input.shouji.sogou.com 搜索引擎 21 18 9531 2412 200
1363157990043 13925057413 00-1F-64-E1-E6-9A:CMCC 120.196.100.55 t3.baidu.com 搜索引擎 69 63 11058 48243 200
1363157988072 13760778710 00-FD-07-A4-7B-08:CMCC 120.196.100.82 2 2 120 120 200
1363157985066 13726238888 00-FD-07-A4-72-B8:CMCC 120.196.100.82 i02.c.aliimg.com 24 27 2481 24681 200
1363157993055 13560436666 C4-17-FE-BA-DE-D9:CMCC 120.196.100.99 18 15 1116 954 200
Results:
Custom partitioner
package com.initialize.flow;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Partitioner;
import java.util.HashMap;
public class ProvincePartitioner extends Partitioner<Text, FlowBean> {
static HashMap<String, Integer> codeMap = new HashMap<>();
static{
codeMap.put("135", 0);
codeMap.put("136", 1);
codeMap.put("137", 2);
codeMap.put("138", 3);
codeMap.put("139", 4);
}
@Override
public int getPartition(Text key, FlowBean value, int numPartitions) {
Integer code = codeMap.get(key.toString().substring(0, 3));
return code == null?5:code;
}
}
package com.initialize.flow;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
public class JobSubmitter {
public static void main(String[] args) throws Exception {
Configuration conf = new Configuration();
Job job = Job.getInstance(conf);
job.setJarByClass(JobSubmitter.class);
job.setMapperClass(FlowCountMapper.class);
job.setReducerClass(FlowCountReducer.class);
//parameter: which partitioner class the map tasks use when partitioning data (if not set, HashPartitioner is the default)
job.setPartitionerClass(ProvincePartitioner.class);
//our ProvincePartitioner can produce 6 different partition numbers, so 6 reduce tasks are needed to receive them
job.setNumReduceTasks(6);
job.setMapOutputKeyClass(Text.class);
job.setMapOutputValueClass(FlowBean.class);
job.setOutputKeyClass(Text.class);
job.setOutputValueClass(FlowBean.class);
FileInputFormat.setInputPaths(job, new Path("C:\\Users\\刘元帅\\Desktop\\input"));
FileOutputFormat.setOutputPath(job, new Path("C:\\Users\\刘元帅\\Desktop\\output"));
job.waitForCompletion(true);
}
}
Result: six partition result files are produced.
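For comparison, the default HashPartitioner mentioned in the comment above simply spreads keys by their hash code. A minimal sketch of the same logic (the class name HashLikePartitioner is invented for illustration):
package com.initialize.flow;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Partitioner;
public class HashLikePartitioner extends Partitioner<Text, FlowBean> {
    @Override
    public int getPartition(Text key, FlowBean value, int numPartitions) {
        //mask off the sign bit so the result is non-negative, then take it modulo the number of reduce tasks
        return (key.hashCode() & Integer.MAX_VALUE) % numPartitions;
    }
}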
The DataOutputStream write() and writeUTF() methods
package com.initialize.flow;
import java.io.DataOutputStream;
import java.io.FileNotFoundException;
import java.io.FileOutputStream;
public class DataOutputstreamTest {
public static void main(String[] args) throws Exception {
DataOutputStream dos = new DataOutputStream(new FileOutputStream("D:/a.dat"));
dos.write("我爱你".getBytes("utf-8"));
dos.close();
DataOutputStream dos2 = new DataOutputStream(new FileOutputStream("D:/b.dat"));
dos2.writeUTF("我爱你");
dos2.close();
}
}
**Explanation**: when writeUTF saves the data, it prepends two bytes that record how long the string is.
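As a quick check, the string can be read back with the matching readUTF call. A minimal sketch (the class name DataInputstreamTest is invented; it reads the D:/b.dat file written above):
package com.initialize.flow;
import java.io.DataInputStream;
import java.io.FileInputStream;
public class DataInputstreamTest {
    public static void main(String[] args) throws Exception {
        DataInputStream dis = new DataInputStream(new FileInputStream("D:/b.dat"));
        //readUTF first reads the 2-byte length prefix, then decodes that many bytes back into a String
        String s = dis.readUTF();
        dis.close();
        System.out.println(s);
    }
}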
TreeMap
package com.initialize.flow;
import java.util.Comparator;
import java.util.Map;
import java.util.Set;
import java.util.TreeMap;
public class TreeMapTest {
public static void main(String[] args){
TreeMap<FlowBean, String> tm1 = new TreeMap<FlowBean, String>(new Comparator<FlowBean>() {
@Override
public int compare(FlowBean o1, FlowBean o2) {
if(o2.getAmountFlow() - o1.getAmountFlow() == 0){
return o1.getPhone().compareTo(o2.getPhone());
}
return o2.getAmountFlow() -o1.getAmountFlow();
}
});
/*tm1.put("a", 2);
tm1.put("b", 1);
tm1.put("aa", 11);
tm1.put("ab", 1);
*/
FlowBean b1 = new FlowBean("1367788", 500, 300);
FlowBean b2 = new FlowBean("1367766", 400, 200);
FlowBean b3 = new FlowBean("1367755", 600, 400);
FlowBean b4 = new FlowBean("1367744", 300, 500);
tm1.put(b1, null);
tm1.put(b2, null);
tm1.put(b3, null);
tm1.put(b4, null);
Set<Map.Entry<FlowBean, String>> entrySet = tm1.entrySet();
for(Map.Entry<FlowBean, String> entry : entrySet){
System.out.println(entry.getKey() + ", " + entry.getValue());
}
}
}
Result
Page visit counts
**Mapper**
package com.initialize.page.topn;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
import java.io.IOException;
public class PageTopnMapper extends Mapper<LongWritable, Text, Text, IntWritable> {
@Override
protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
String line = value.toString();
String[] split = line.split(" ");
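//assumption: the second space-separated field of each line in request.dat is the requested page/URL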
context.write(new Text(split[1]), new IntWritable(1));
}
}
**Reducer**
package com.initialize.page.topn;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;
import java.io.IOException;
import java.util.Map;
import java.util.Set;
import java.util.TreeMap;
public class PageTopnReducer extends Reducer<Text, IntWritable, Text, IntWritable> {
TreeMap<PageCount, Object> treeMap = new TreeMap<>();
@Override
protected void reduce(Text key, Iterable<IntWritable> values, Context context) throws IOException, InterruptedException {
int count = 0;
for(IntWritable value : values){
count += value.get();
}
PageCount pageCount = new PageCount();
pageCount.set(key.toString(), count);
treeMap.put(pageCount, null);
}
/**
* This method is called once per task, after the task has finished processing all of its key groups.
* @param context
* @throws IOException
* @throws InterruptedException
*/
@Override
protected void cleanup(Context context) throws IOException, InterruptedException {
Configuration conf = context.getConfiguration();
//if top.n is not configured, fall back to a default of 5
int topn = conf.getInt("top.n", 5);
Set<Map.Entry<PageCount, Object>> entrySet = treeMap.entrySet();
int i=0;
for(Map.Entry<PageCount, Object> entry :entrySet){
context.write(new Text(entry.getKey().getPage()), new IntWritable(entry.getKey().getCount()));
i++;
if(i==topn) return;
}
}
}
**JobSubmitter**
package com.initialize.page.topn;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import java.io.IOException;
import java.util.Properties;
public class JobSubmitter {
public static void main(String[] args) throws IOException, ClassNotFoundException, InterruptedException {
/**
* Parse parameters by loading the *-site.xml files from the classpath
*/
Configuration conf = new Configuration();//by default loads the Hadoop config files from the classpath: core-site.xml, hdfs-site.xml, yarn-site.xml, mapred-site.xml
conf.addResource("xx-oo.xml");
/**
* Set parameters directly in code
*/
//conf.setInt("top.n", 3);
//conf.setInt("top.n", Integer.parseInt(args[0]));
/**
* Read parameters from a properties file
*/
/*Properties props = new Properties();
props.load(JobSubmitter.class.getClassLoader().getResourceAsStream("topn.properties"));
conf.setInt("top.n", Integer.parseInt(props.getProperty("top.n")));*/
Job job = Job.getInstance(conf);
job.setJarByClass(JobSubmitter.class);
job.setMapperClass(PageTopnMapper.class);
job.setReducerClass(PageTopnReducer.class);
job.setMapOutputKeyClass(Text.class);
job.setMapOutputValueClass(IntWritable.class);
job.setOutputKeyClass(Text.class);
job.setOutputValueClass(IntWritable.class);
FileInputFormat.setInputPaths(job, new Path("C:\\Users\\刘元帅\\Desktop\\request.dat"));
FileOutputFormat.setOutputPath(job, new Path("C:\\Users\\刘元帅\\Desktop\\output"));
job.waitForCompletion(true);
}
}
**PageCount**
package com.initialize.page.topn;
public class PageCount implements Comparable<PageCount>{
private String page;
private int count;
public void set(String page, int count){
this.page = page;
this.count = count;
}
public String getPage() {
return page;
}
public void setPage(String page) {
this.page = page;
}
public int getCount() {
return count;
}
public void setCount(int count) {
this.count = count;
}
@Override
public int compareTo(PageCount o) {
return o.getCount() - this.count == 0?this.page.compareTo(o.getPage()):o.getCount()-this.count;
}
}
Location of the related configuration files:
**oo-xx.xml**
<configuration>
<property>
<name>top.n</name>
<value>6</value>
</property>
<property>
<name>mygirlfriend</name>
<value>angelababy</value>
</property>
</configuration>
**topn.properties**
top.n=5
Using the default sorting before reduce
Use the framework's default key sorting (which happens before the reduce stage) to list page visit counts from highest to lowest. This takes two MapReduce jobs.
package com.initialize.page.conut.sort;
import org.apache.hadoop.io.WritableComparable;
import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;
public class PageCount implements WritableComparable<PageCount> {
private String page;
private int count;
public void set(String page, int count){
this.page = page;
this.count = count;
}
@Override
public String toString() {
return this.page + "," + this.count;
}
public String getPage() {
return page;
}
public void setPage(String page) {
this.page = page;
}
public int getCount() {
return count;
}
public void setCount(int count) {
this.count = count;
}
@Override
public int compareTo(PageCount o) {
return o.getCount()-this.count == 0?this.page.compareTo(o.page):o.getCount()-this.getCount();
}
@Override
public void write(DataOutput out) throws IOException {
out.writeUTF(this.page);
out.writeInt(this.count);
}
@Override
public void readFields(DataInput in) throws IOException {
this.page = in.readUTF();
this.count = in.readInt();
}
}
package com.initialize.page.conut.sort;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import java.io.IOException;
public class PageCountStep1 {
public static class PageCountStep1Mapper extends Mapper<LongWritable, Text, Text, IntWritable>{
@Override
protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
String line = value.toString();
String[] split = line.split(" ");
context.write(new Text(split[1]), new IntWritable(1));
}
}
public static class PageCountStep1Reducer extends Reducer<Text, IntWritable, Text, IntWritable> {
@Override
protected void reduce(Text key, Iterable<IntWritable> values, Context context) throws IOException, InterruptedException {
int count = 0;
for(IntWritable v : values){
count += v.get();
}
context.write(key, new IntWritable(count));
}
}
public static void main(String[] args) throws IOException, ClassNotFoundException, InterruptedException {
Configuration conf = new Configuration();
Job job = Job.getInstance(conf);
job.setJarByClass(PageCountStep1.class);
job.setMapperClass(PageCountStep1Mapper.class);
job.setReducerClass(PageCountStep1Reducer.class);
job.setMapOutputKeyClass(Text.class);
job.setMapOutputValueClass(IntWritable.class);
job.setOutputKeyClass(Text.class);
job.setOutputValueClass(IntWritable.class);
FileInputFormat.setInputPaths(job, new Path("C:\\Users\\刘元帅\\Desktop\\request.dat"));
FileOutputFormat.setOutputPath(job, new Path("C:\\Users\\刘元帅\\Desktop\\output"));
job.setNumReduceTasks(3);
job.waitForCompletion(true);
}
}
Result of the first job: three partition files are produced (one per reduce task).
package com.initialize.page.conut.sort;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import java.io.IOException;
public class PageCountStep2 {
public static class PageCountStep2Mapper extends Mapper<LongWritable, Text, PageCount, NullWritable>{
@Override
protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
String[] split = value.toString().split("\t");
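//each line of the step-1 output has the form "page<TAB>count" (the tab is TextOutputFormat's default key/value separator)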
PageCount pageCount = new PageCount();
pageCount.set(split[0], Integer.parseInt(split[1]));
context.write(pageCount, NullWritable.get());
}
}
public static class PageCountStep2Reducer extends Reducer<PageCount, NullWritable, PageCount, NullWritable> {
@Override
protected void reduce(PageCount key, Iterable<NullWritable> values, Context context) throws IOException, InterruptedException {
context.write(key, NullWritable.get());
}
}
public static void main(String[] args) throws IOException, ClassNotFoundException, InterruptedException {
Configuration conf = new Configuration();
Job job = Job.getInstance(conf);
job.setJarByClass(PageCountStep2.class);
job.setMapperClass(PageCountStep2Mapper.class);
job.setReducerClass(PageCountStep2Reducer.class);
job.setMapOutputKeyClass(PageCount.class);
job.setMapOutputValueClass(NullWritable.class);
job.setOutputKeyClass(PageCount.class);
job.setOutputValueClass(NullWritable.class);
FileInputFormat.setInputPaths(job, new Path("C:\\Users\\刘元帅\\Desktop\\output"));
FileOutputFormat.setOutputPath(job, new Path("C:\\Users\\刘元帅\\Desktop\\sort_put"));
job.setNumReduceTasks(1);
job.waitForCompletion(true);
}
}
The output of the first run is used as the input of the second run.
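A minimal sketch of a driver that runs the two steps back to back (the class name PageCountDriver is invented; it simply calls the two main methods above, and a more careful version would check each job's result before continuing):
package com.initialize.page.conut.sort;
public class PageCountDriver {
    public static void main(String[] args) throws Exception {
        //step 1: count visits per page and write the intermediate output
        PageCountStep1.main(args);
        //step 2: read the step-1 output and write the final result sorted by count
        PageCountStep2.main(args);
    }
}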
Result of the second run: