Project requirement: for each order, find the three transactions with the largest amount.
In essence: grouped top-N.
# Raw order data: a.txt
----------------------------------------
order id,user id,product name,unit price,quantity
order001,u001,Phone 14,7999.8,2
order001,u001,Phone 13,6999.8,2
order001,u001,Milk,79.2,1
order001,u001,Rice,59.8,3
order002,u002,Rice,69.9,1
order002,u002,Perfume,559.9,1
order002,u002,Apple,199.6,3
order002,u002,Rice,59.8,3
order002,u002,Fridge,3959.8,1
--------------------------------------
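The last column of the output below is the transaction amount fee = unit price × quantity; for example, the Phone 14 line of order001 gives 7999.8 × 2 = 15999.6.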
Approach:
map: read each line, split the fields, and pack them into a bean that is emitted as the key; the key must compare by transaction amount (fee) so that the shuffle sorts the records for us.
reduce: a custom Partitioner routes all records of one order to the same reducer, a custom GroupingComparator groups them by order id, and the reduce method then simply outputs the first N records of each group. A minimal sketch of the required sort order follows.
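Below is a small stand-alone Java sketch (hypothetical class and hand-computed fees, not part of the project) of the ordering the shuffle must produce: order id ascending, fee descending within an order.

import java.util.*;

public class SortOrderDemo {
    static class Row {
        final String orderId; final float fee;
        Row(String orderId, float fee) { this.orderId = orderId; this.fee = fee; }
        public String toString() { return orderId + "," + fee; }
    }
    public static void main(String[] args) {
        List<Row> rows = new ArrayList<>(Arrays.asList(
                new Row("order002", 598.8f), new Row("order001", 15999.6f),
                new Row("order002", 3959.8f), new Row("order001", 179.4f)));
        // Same contract as OrderBean.compareTo(): order id ascending, fee descending.
        rows.sort(Comparator.comparing((Row r) -> r.orderId)
                .thenComparing(Comparator.comparingDouble((Row r) -> r.fee).reversed()));
        rows.forEach(System.out::println);
        // order001 rows (15999.6, then 179.4) print before order002 rows (3959.8, then 598.8)
    }
}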
Output:
----------------------------------------
order002,u002,Fridge,3959.8,1,3959.8
order002,u002,Apple,199.6,3,598.8
order002,u002,Perfume,559.9,1,559.9
----------------------------------------
order001,u001,Phone 14,7999.8,2,15999.6
order001,u001,Phone 13,6999.8,2,13999.6
order001,u001,Rice,59.8,3,179.4
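Each block corresponds to one of the two reducer output files (part-r-00000 and part-r-00001): the job runs with two reduce tasks, and the custom Partitioner assigns each order id to exactly one of them.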
1. Project structure
2. Project source
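The project consists of four classes: OrderBean (the composite key), OrderIdGroupingComparator, OrderTopnPartitioner, and the OrderTopn driver, which contains the Mapper and Reducer as inner classes.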
package com.bucket.h3.bean;

import org.apache.hadoop.io.WritableComparable;

import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;

public class OrderBean implements WritableComparable<OrderBean> {
    private String orderId;
    private String userId;
    private String pdtName;
    private float price;
    private int num;
    private float fee;

    public void set(String orderId, String userId, String pdtName, float price, int num) {
        this.orderId = orderId;
        this.userId = userId;
        this.pdtName = pdtName;
        this.price = price;
        this.num = num;
        this.fee = price * num;
    }

    public String getOrderId() {
        return orderId;
    }

    public void setOrderId(String orderId) {
        this.orderId = orderId;
    }

    public String getUserId() {
        return userId;
    }

    public void setUserId(String userId) {
        this.userId = userId;
    }

    public String getPdtName() {
        return pdtName;
    }

    public void setPdtName(String pdtName) {
        this.pdtName = pdtName;
    }

    public float getPrice() {
        return price;
    }

    public void setPrice(float price) {
        this.price = price;
    }

    public int getNum() {
        return num;
    }

    public void setNum(int num) {
        this.num = num;
    }

    public float getFee() {
        return fee;
    }

    public void setFee(float fee) {
        this.fee = fee;
    }

    @Override
    public int compareTo(OrderBean o) {
        // Sort by order id first; within the same order, sort by fee descending.
        int byOrder = this.orderId.compareTo(o.getOrderId());
        return byOrder != 0 ? byOrder : Float.compare(o.getFee(), this.fee);
    }

    @Override
    public void write(DataOutput dataOutput) throws IOException {
        // fee is derived from price * num, so it is not serialized.
        dataOutput.writeUTF(this.orderId);
        dataOutput.writeUTF(this.userId);
        dataOutput.writeUTF(this.pdtName);
        dataOutput.writeFloat(this.price);
        dataOutput.writeInt(this.num);
    }

    @Override
    public void readFields(DataInput dataInput) throws IOException {
        this.orderId = dataInput.readUTF();
        this.userId = dataInput.readUTF();
        this.pdtName = dataInput.readUTF();
        this.price = dataInput.readFloat();
        this.num = dataInput.readInt();
        this.fee = this.price * this.num; // recompute the derived fee
    }

    @Override
    public String toString() {
        return this.orderId + ',' + this.userId + ',' + this.pdtName + ',' + this.price + ',' + this.num + ',' + this.fee;
    }
}
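Since fee is derived, write() skips it and readFields() recomputes it. A quick stand-alone round-trip check (hypothetical, not part of the project) makes this visible:

import com.bucket.h3.bean.OrderBean;

import java.io.*;

public class OrderBeanRoundTrip {
    public static void main(String[] args) throws IOException {
        OrderBean in = new OrderBean();
        in.set("order001", "u001", "Phone 14", 7999.8f, 2);
        ByteArrayOutputStream buf = new ByteArrayOutputStream();
        in.write(new DataOutputStream(buf)); // serializes orderId, userId, pdtName, price, num
        OrderBean out = new OrderBean();
        out.readFields(new DataInputStream(new ByteArrayInputStream(buf.toByteArray())));
        System.out.println(out); // prints order001,u001,Phone 14,7999.8,2,15999.6
    }
}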
package com.bucket.h3.group;

import com.bucket.h3.bean.OrderBean;
import org.apache.hadoop.io.WritableComparable;
import org.apache.hadoop.io.WritableComparator;

public class OrderIdGroupingComparator extends WritableComparator {
    public OrderIdGroupingComparator() {
        // 'true' makes WritableComparator allocate OrderBean instances,
        // so compare() is handed deserialized objects.
        super(OrderBean.class, true);
    }

    @Override
    public int compare(WritableComparable a, WritableComparable b) {
        // Group by order id only: all records of one order form a single reduce call.
        OrderBean a1 = (OrderBean) a, b1 = (OrderBean) b;
        return a1.getOrderId().compareTo(b1.getOrderId());
    }
}
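Note that the GroupingComparator only decides where one reduce call ends and the next begins; the ordering of the records inside a group still comes from OrderBean.compareTo().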
package com.bucket.h3.group;

import com.bucket.h3.bean.OrderBean;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.mapreduce.Partitioner;

public class OrderTopnPartitioner extends Partitioner<OrderBean, NullWritable> {
    @Override
    public int getPartition(OrderBean key, NullWritable nullWritable, int numPartitions) {
        // Partition by order id so that all records of the same order go to the
        // same reducer; the bitmask keeps the hash code non-negative.
        return (key.getOrderId().hashCode() & Integer.MAX_VALUE) % numPartitions;
    }
}
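As a sanity check, the partitioner's formula can be mirrored in a tiny stand-alone snippet (hypothetical, not part of the project) to see which of the two reducers each sample order id lands on:

public class PartitionCheck {
    public static void main(String[] args) {
        int numPartitions = 2; // the job below runs with two reduce tasks
        for (String orderId : new String[]{"order001", "order002"}) {
            // Same formula as OrderTopnPartitioner.getPartition()
            int partition = (orderId.hashCode() & Integer.MAX_VALUE) % numPartitions;
            System.out.println(orderId + " -> partition " + partition);
        }
    }
}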
package com.bucket.h3.mapreduce;

import com.bucket.h3.bean.OrderBean;
import com.bucket.h3.group.OrderIdGroupingComparator;
import com.bucket.h3.group.OrderTopnPartitioner;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

import java.io.IOException;

public class OrderTopn {
    public static class OrderTopnMapper extends Mapper<LongWritable, Text, OrderBean, NullWritable> {
        // Reusing one bean is safe: context.write() serializes it immediately.
        OrderBean bean = new OrderBean();
        NullWritable v = NullWritable.get();

        @Override
        protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
            String[] s = value.toString().split(",");
            if (s.length < 5) return; // skip blank or malformed lines
            try {
                bean.set(s[0], s[1], s[2], Float.parseFloat(s[3]), Integer.parseInt(s[4]));
            } catch (NumberFormatException e) {
                return; // skip the header row and any records with non-numeric fields
            }
            context.write(bean, v);
        }
    }

    public static class OrderTopnReduce extends Reducer<OrderBean, NullWritable, OrderBean, NullWritable> {
        @Override
        protected void reduce(OrderBean key, Iterable<NullWritable> values, Context context) throws IOException, InterruptedException {
            // Thanks to the GroupingComparator, one reduce call covers one whole order.
            // Hadoop reuses the key object: each step of the values iterator
            // deserializes the next record into 'key', already sorted by fee descending.
            int i = 0;
            for (NullWritable v : values) {
                context.write(key, v);
                if (++i == 3) return; // emit only the top 3 records per order
            }
        }
    }

    public static void main(String[] args) {
        try {
            Configuration conf = new Configuration();
            Job job = Job.getInstance(conf);
            job.setJarByClass(OrderTopn.class);
            job.setMapperClass(OrderTopnMapper.class);
            job.setReducerClass(OrderTopnReduce.class);
            job.setPartitionerClass(OrderTopnPartitioner.class);
            job.setGroupingComparatorClass(OrderIdGroupingComparator.class);
            job.setNumReduceTasks(2);
            job.setMapOutputKeyClass(OrderBean.class);
            job.setMapOutputValueClass(NullWritable.class);
            job.setOutputKeyClass(OrderBean.class);
            job.setOutputValueClass(NullWritable.class);
            FileInputFormat.setInputPaths(job, new Path("/order/topn/in"));
            FileOutputFormat.setOutputPath(job, new Path("/order/topn/out"));
            System.exit(job.waitForCompletion(true) ? 0 : 1);
        } catch (Exception e) {
            e.printStackTrace();
        }
    }
}
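Three pieces cooperate here: OrderBean.compareTo() makes the shuffle sort each order's records by fee descending, OrderTopnPartitioner keeps every order on a single reducer, and OrderIdGroupingComparator turns each order into one reduce call, so emitting the first three records of each call is enough.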
3. Package and deploy
# hadoop fs -rm -r /order/topn/out/    (clear the output directory left by a previous run)
Create the input directory:
[root@node1 package]# hadoop fs -mkdir -p /order/topn/in
Prepare the source file a.txt and upload it to HDFS:
[root@node1 package]# hadoop fs -put a.txt /order/topn/in
# make a copy of the file
4. Run
[root@node1 package]# hadoop jar /app/package/bucket.jar com.bucket.h3.mapreduce.OrderTopn
5. Results
The job failed on the first run; the log showed:
2022-10-08 22:19:07,726 INFO conf.Configuration: resource-types.xml not found
2022-10-08 22:19:07,726 INFO resource.ResourceUtils: Unable to find 'resource-types.xml'.
Fix:
Hadoop 3 is configured much like Hadoop 2, but mapred-site.xml additionally needs the entries below; without them the MapReduce ApplicationMaster cannot locate the framework classes (the resource-types.xml lines above are themselves only harmless INFO messages):
<property>
  <name>yarn.app.mapreduce.am.env</name>
  <value>HADOOP_MAPRED_HOME=${HADOOP_HOME}</value>
</property>
<property>
  <name>mapreduce.map.env</name>
  <value>HADOOP_MAPRED_HOME=${HADOOP_HOME}</value>
</property>
<property>
  <name>mapreduce.reduce.env</name>
  <value>HADOOP_MAPRED_HOME=${HADOOP_HOME}</value>
</property>
<property>
  <name>mapreduce.application.classpath</name>
  <value>
    ${HADOOP_HOME}/etc/hadoop,
    ${HADOOP_HOME}/share/hadoop/common/*,
    ${HADOOP_HOME}/share/hadoop/common/lib/*,
    ${HADOOP_HOME}/share/hadoop/hdfs/*,
    ${HADOOP_HOME}/share/hadoop/hdfs/lib/*,
    ${HADOOP_HOME}/share/hadoop/mapreduce/*,
    ${HADOOP_HOME}/share/hadoop/mapreduce/lib/*,
    ${HADOOP_HOME}/share/hadoop/yarn/*,
    ${HADOOP_HOME}/share/hadoop/yarn/lib/*
  </value>
</property>
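After editing mapred-site.xml, remember to distribute the updated file to every node in the cluster before resubmitting the job.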
2022-10-08 23:44:28,895 INFO mapreduce.Job: Job job_1665241250854_0002 completed successfully
2022-10-08 23:44:28,970 INFO mapreduce.Job: Counters: 54
[root@node1 package]# hadoop fs -ls /order/topn/out
Found 3 items
-rw-r--r-- 3 root supergroup 0 2022-10-08 23:44 /order/topn/out/_SUCCESS
-rw-r--r-- 3 root supergroup 111 2022-10-08 23:44 /order/topn/out/part-r-00000
-rw-r--r-- 3 root supergroup 111 2022-10-08 23:44 /order/topn/out/part-r-00001
Fetch the result files to the local filesystem:
[root@node1 package]# hadoop fs -get /order/topn/out/part-r-00000 0000.txt
[root@node1 package]# hadoop fs -get /order/topn/out/part-r-00001 0001.txt
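The result files can also be inspected in place, without downloading them:
[root@node1 package]# hadoop fs -cat /order/topn/out/part-r-00000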