Table of Contents
MR Programming Examples (Part 1)
Diagram: the relationship between map tasks and input splits
Note: by the time a map task is dispatched to a DataNode to run, the source of the data it will process (HDFS, MySQL, ...) has already been determined.
Inverted Index Example (1)
Goal: count how many times each word appears in each file.
Test data
hello tom
hello jim
hello kitty
hello rose
hello jerry
hello jim
hello kitty
hello jack
hello jerry
hello java
hello c++
hello c++
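Because the final index has to group by word across files, the work is split into two MR passes: the first pass counts occurrences of each <word-fileName> pair, and the second pass regroups that output by word to list each word's count per file.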
First MR pass
package com.initialize.index;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.FileSplit;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import java.io.IOException;
public class IndexStepOne {
public static class IndexStepOneMapper extends Mapper<LongWritable, Text, Text, IntWritable>{
//produce <hello-fileName, 1> pairs
@Override
protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
//get the name of the file that the current line belongs to from the input split info
FileSplit inputSplit = (FileSplit)context.getInputSplit();
String fileName = inputSplit.getPath().getName();
String[] words = value.toString().split(" ");
for(String w : words){
//emit "word-fileName" as the key and 1 as the value
context.write(new Text(w + "-" + fileName), new IntWritable(1));
}
}
}
public static class IndexStepOneReducer extends Reducer<Text, IntWritable, Text, IntWritable>{
@Override
protected void reduce(Text key, Iterable<IntWritable> values, Context context) throws IOException, InterruptedException {
int count = 0;
for(IntWritable value : values){
count += value.get();
}
context.write(key, new IntWritable(count));
}
}
public static void main(String[] args) throws IOException, ClassNotFoundException, InterruptedException {
//by default only core-default.xml and core-site.xml are loaded
Configuration conf = new Configuration();
Job job = Job.getInstance(conf);
job.setJarByClass(IndexStepOne.class);
job.setMapperClass(IndexStepOneMapper.class);
job.setReducerClass(IndexStepOneReducer.class);
job.setNumReduceTasks(3);
job.setMapOutputKeyClass(Text.class);
job.setMapOutputValueClass(IntWritable.class);
job.setOutputKeyClass(Text.class);
job.setOutputValueClass(IntWritable.class);
FileInputFormat.setInputPaths(job, new Path("C:\\Users\\Desktop\\input"));
FileOutputFormat.setOutputPath(job, new Path("C:\\Users\\Desktop\\output1"));
job.waitForCompletion(true);
}
}
Resulting output files:
Second MR pass
package com.initialize.index;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import java.io.IOException;
public class IndexStepTwo {
public static class IndexStepTwoMapper extends Mapper<LongWritable, Text, Text, Text>{
//<hello-a.txt 4><hello-b.txt 4>
@Override
protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
String[] split = value.toString().split("-");
context.write(new Text(split[0]), new Text(split[1].replaceAll("\t", "-->")));
}
}
public static class IndexStepTwoReducer extends Reducer<Text, Text, Text, Text>{
//one group of data: <hello,a.txt-->4> <hello,b.txt-->4> <hello,c.txt-->4>
@Override
protected void reduce(Text key, Iterable<Text> values, Context context) throws IOException, InterruptedException {
//StringBuffer is thread-safe and StringBuilder is not; when thread safety is not a concern, StringBuilder is faster
StringBuilder sb = new StringBuilder();
for(Text value : values){
sb.append(value.toString()).append("\t");
}
context.write(key, new Text(sb.toString()));
}
}
public static void main(String[] args) throws IOException, ClassNotFoundException, InterruptedException {
Configuration conf = new Configuration();
Job job = Job.getInstance(conf);
job.setJarByClass(IndexStepTwo.class);
job.setMapperClass(IndexStepTwoMapper.class);
job.setReducerClass(IndexStepTwoReducer.class);
job.setNumReduceTasks(1);
job.setMapOutputKeyClass(Text.class);
job.setMapOutputValueClass(Text.class);
job.setOutputKeyClass(Text.class);
job.setOutputValueClass(Text.class);
FileInputFormat.setInputPaths(job, new Path("C:\\Users\\Desktop\\output1"));
FileOutputFormat.setOutputPath(job, new Path("C:\\Users\\Desktop\\output2"));
job.waitForCompletion(true);
}
}
Resulting output files:
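Each line of the final index follows the format built by the reducer above: a word, then tab-separated fileName-->count entries, e.g. (file names here are illustrative): hello	a.txt-->4	b.txt-->4	c.txt-->4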
Order Top-N (2)
Goal: for each order, find the line items with the largest total amounts.
Test data
order001,u001,小米6,1999.9,2
order001,u001,雀巢咖啡,99.0,2
order001,u001,安慕希,250.0,2
order001,u001,经典红双喜,200.0,4
order001,u001,防水电脑包,400.0,2
order002,u002,小米手环,199.0,3
order002,u002,榴莲,15.0,10
order002,u002,苹果,4.5,20
order002,u002,肥皂,10.0,40
order003,u001,小米6,1999.9,2
order003,u001,雀巢咖啡,99.0,2
order003,u001,安慕希,250.0,2
order003,u001,经典红双喜,200.0,4
order003,u001,防水电脑包,400.0,2
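Here the amount of a line item is price × quantity (the amountFee field below). For order001, for example, the largest amounts are 1999.9×2=3999.8 for 小米6, followed by 200.0×4=800.0 and 400.0×2=800.0 (that tie is broken by product name).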
package com.initialize.order.topn;
import org.apache.hadoop.io.WritableComparable;
import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;
public class OrderBean implements WritableComparable<OrderBean> {
private String orderId;
private String userId;
private String pdtName;
private float price;
private int number;
private float amountFee;
@Override
public String toString() {
return orderId + "," + userId + "," + pdtName + "," + price + "," + number + "," + amountFee;
}
public String getOrderId() {
return orderId;
}
public void setOrderId(String orderId) {
this.orderId = orderId;
}
public String getUserId() {
return userId;
}
public void setUserId(String userId) {
this.userId = userId;
}
public String getPdtName() {
return pdtName;
}
public void setPdtName(String pdtName) {
this.pdtName = pdtName;
}
public float getPrice() {
return price;
}
public void setPrice(float price) {
this.price = price;
}
public int getNumber() {
return number;
}
public void setNumber(int number) {
this.number = number;
}
public float getAmountFee() {
return amountFee;
}
public void setAmountFee(float amountFee) {
this.amountFee = amountFee;
}
public void set(String orderId, String userId, String pdtName, float price, int number) {
this.orderId = orderId;
this.userId = userId;
this.pdtName = pdtName;
this.price = price;
this.number = number;
this.amountFee = price * number;
}
/**
* Comparison rule: compare by total amount in descending order first; if the amounts are equal, compare by product name.
* @param o
* @return
*/
@Override
public int compareTo(OrderBean o) {
int c = Float.compare(o.getAmountFee(), this.getAmountFee()); //descending by total amount
return c != 0 ? c : this.getPdtName().compareTo(o.getPdtName());
}
@Override
public void write(DataOutput out) throws IOException {
out.writeUTF(this.orderId);
out.writeUTF(this.userId);
out.writeUTF(this.pdtName);
out.writeFloat(this.price);
out.writeInt(this.number);
}
@Override
public void readFields(DataInput in) throws IOException {
this.orderId = in.readUTF();
this.userId = in.readUTF();
this.pdtName = in.readUTF();
this.price = in.readFloat();
this.number = in.readInt();
this.amountFee = this.price * this.number;
}
}
package com.initialize.order.topn;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Collections;
public class OrderTopn {
public static class OrderTopnMapper extends Mapper<LongWritable, Text, Text, OrderBean>{
OrderBean orderBean = new OrderBean();
Text k = new Text();
@Override
protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
String[] fields = value.toString().split(",");
orderBean.set(fields[0], fields[1], fields[2], Float.parseFloat(fields[3]), Integer.parseInt(fields[4]));
k.set(fields[0]);
//the kv objects handed to the map task here are serialized and stored by the map task, so there is no need to worry about them being overwritten on the next call.
context.write(k, orderBean);
}
}
public static class OrderTopnReducer extends Reducer<Text, OrderBean, OrderBean, NullWritable>{
@Override
protected void reduce(Text key, Iterable<OrderBean> values, Context context) throws IOException, InterruptedException {
//get the top-n parameter
int topn = context.getConfiguration().getInt("order.top.n", 3);
ArrayList<OrderBean> beanList = new ArrayList<>();
//the values iterator provided by the reduce task returns the same object on every iteration; only its fields are set to different values
for(OrderBean orderBean : values){
//construct a new object to hold the values of this iteration
OrderBean newBean = new OrderBean();
newBean.set(orderBean.getOrderId(), orderBean.getUserId(), orderBean.getPdtName(), orderBean.getPrice(), orderBean.getNumber());
beanList.add(newBean);
}
//sort the orderBean objects in beanList (by total amount in descending order; if the amounts are equal, by product name)
Collections.sort(beanList);
for(int i=0;i<topn && i<beanList.size();i++){
context.write(beanList.get(i), NullWritable.get());
}
}
}
public static void main(String[] args) throws IOException, ClassNotFoundException, InterruptedException {
Configuration conf = new Configuration();
conf.setInt("order.top.n", 2);
Job job = Job.getInstance(conf);
job.setJarByClass(OrderBean.class);
job.setMapperClass(OrderTopnMapper.class);
job.setReducerClass(OrderTopnReducer.class);
job.setNumReduceTasks(2);
job.setMapOutputKeyClass(Text.class);
job.setMapOutputValueClass(OrderBean.class);
job.setOutputKeyClass(OrderBean.class);
job.setOutputValueClass(NullWritable.class);
FileInputFormat.setInputPaths(job, new Path("C:\\Users\\刘元帅\\Desktop\\aaa.txt"));
FileOutputFormat.setOutputPath(job, new Path("C:\\Users\\刘元帅\\Desktop\\output"));
job.waitForCompletion(true);
}
}
View the results:
GroupingComparator example: grouped Top-N (3)
package com.initialize.order.topn.grouping;
import org.apache.hadoop.io.WritableComparable;
import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;
public class OrderBean implements WritableComparable<OrderBean> {
private String orderId;
private String userId;
private String pdtName;
private float price;
private int number;
private float amountFee;
@Override
public String toString() {
return orderId + "," + userId + "," + pdtName + "," + price + "," + number + "," + amountFee;
}
public String getOrderId() {
return orderId;
}
public void setOrderId(String orderId) {
this.orderId = orderId;
}
public String getUserId() {
return userId;
}
public void setUserId(String userId) {
this.userId = userId;
}
public String getPdtName() {
return pdtName;
}
public void setPdtName(String pdtName) {
this.pdtName = pdtName;
}
public float getPrice() {
return price;
}
public void setPrice(float price) {
this.price = price;
}
public int getNumber() {
return number;
}
public void setNumber(int number) {
this.number = number;
}
public float getAmountFee() {
return amountFee;
}
public void setAmountFee(float amountFee) {
this.amountFee = amountFee;
}
public void set(String orderId, String userId, String pdtName, float price, int number) {
this.orderId = orderId;
this.userId = userId;
this.pdtName = pdtName;
this.price = price;
this.number = number;
this.amountFee = price * number;
}
/**
* Comparison rule: sort by orderId first; within the same orderId, sort by total amount in descending order.
* @param o
* @return
*/
@Override
public int compareTo(OrderBean o) {
int c = this.orderId.compareTo(o.getOrderId());
return c != 0 ? c : Float.compare(o.getAmountFee(), this.getAmountFee());
}
@Override
public void write(DataOutput out) throws IOException {
out.writeUTF(this.orderId);
out.writeUTF(this.userId);
out.writeUTF(this.pdtName);
out.writeFloat(this.price);
out.writeInt(this.number);
}
@Override
public void readFields(DataInput in) throws IOException {
this.orderId = in.readUTF();
this.userId = in.readUTF();
this.pdtName = in.readUTF();
this.price = in.readFloat();
this.number = in.readInt();
this.amountFee = this.price * this.number;
}
}
package com.initialize.order.topn.grouping;
import org.apache.hadoop.io.WritableComparable;
import org.apache.hadoop.io.WritableComparator;
/**
* The reduce task calls this comparator to decide which records belong to the same group:
* records with the same orderId are treated as one group.
*/
public class OrderIdGroupingComparator extends WritableComparator {
public OrderIdGroupingComparator(){
super(OrderBean.class, true);
}
@Override
public int compare(WritableComparable a, WritableComparable b) {
OrderBean o1 = (OrderBean)a;
OrderBean o2 = (OrderBean)b;
return o1.getOrderId().compareTo(o2.getOrderId());
}
}
package com.initialize.order.topn.grouping;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.mapreduce.Partitioner;
/**
* The map task calls this method to send records with the same orderId to the same partition.
*/
public class OrderIdPartitioner extends Partitioner<OrderBean, NullWritable> {
@Override
public int getPartition(OrderBean key, NullWritable value, int numPartitions) {
return (key.getOrderId().hashCode() & Integer.MAX_VALUE) % numPartitions;
}
}
package com.initialize.order.topn.grouping;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import java.io.IOException;
public class OrderTopn {
public static class OrderTopnMapper extends Mapper<LongWritable, Text, OrderBean, NullWritable>{
OrderBean orderBean = new OrderBean();
NullWritable v = NullWritable.get();
@Override
protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
String[] fields = value.toString().split(",");
orderBean.set(fields[0], fields[1], fields[2], Float.parseFloat(fields[3]), Integer.parseInt(fields[4]));
//the kv objects handed to the map task here are serialized and stored by the map task, so there is no need to worry about them being overwritten on the next call.
context.write(orderBean, v);
}
}
public static class OrderTopnReducer extends Reducer<OrderBean, NullWritable, OrderBean, NullWritable>{
/**
* Although the reduce method receives only one key parameter, each time the values iterator advances, the contents of that key object change as well.
*/
@Override
protected void reduce(OrderBean key, Iterable<NullWritable> values, Context context) throws IOException, InterruptedException {
//get the top-n parameter
int topn = context.getConfiguration().getInt("order.top.n", 3);
int i=0;
for(NullWritable v : values){
context.write(key, v);
if(++i == topn) return;
}
}
}
public static void main(String[] args) throws IOException, ClassNotFoundException, InterruptedException {
Configuration conf = new Configuration();
conf.setInt("order.top.n", 2);
Job job = Job.getInstance(conf);
job.setJarByClass(OrderBean.class);
job.setMapperClass(OrderTopnMapper.class);
job.setReducerClass(OrderTopnReducer.class);
job.setPartitionerClass(OrderIdPartitioner.class);
job.setGroupingComparatorClass(OrderIdGroupingComparator.class);
job.setNumReduceTasks(2);
job.setMapOutputKeyClass(OrderBean.class);
job.setMapOutputValueClass(NullWritable.class);
job.setOutputKeyClass(OrderBean.class);
job.setOutputValueClass(NullWritable.class);
FileInputFormat.setInputPaths(job, new Path("C:\\Users\\Desktop\\aaa.txt"));
FileOutputFormat.setOutputPath(job, new Path("C:\\Users\\Desktop\\output"));
job.waitForCompletion(true);
}
}
Common Friends (4)
Goal: find which pairs of people have common friends, and who those common friends are.
Test data:
A:B,C,D,F,E,O
B:A,C,E,K
C:F,A,D,I
D:A,E,F,L
E:B,C,D,M,L
F:A,B,C,D,E,O,M
G:A,C,D,E,F
H:A,C,D,E,O
I:A,O
J:B,O
K:A,C,D
L:D,E,F
M:E,F,G
O:A,H,I,J
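For example, in the data above A's friend list is B,C,D,F,E,O and B's is A,C,E,K, so the pair A-B should come out with common friends C and E.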
package com.initialize.friend;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Collections;
/**
* Find the common friends of every pair of people
*/
public class CommonFriendsOne {
public static class CommonFriendOneMapper extends Mapper<LongWritable, Text, Text, Text>{
Text k = new Text();
Text v = new Text();
//A:B,C,D,F,E,O
//output: B->A C->A D->A
@Override
protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
String[] userAndFriends = value.toString().split(":");
String user = userAndFriends[0];
String[] friends = userAndFriends[1].split(",");
v.set(user);
for(String f : friends){
k.set(f);
context.write(k, v);
}
}
}
public static class CommonFriendOneReducer extends Reducer<Text, Text, Text, Text>{
//one group of data: B --> A E F J
//another group: C --> B F E J...
//the key person is a common friend of all the people in the values
@Override
protected void reduce(Text friend, Iterable<Text> values, Context context) throws IOException, InterruptedException {
ArrayList<String> userList = new ArrayList<>();
for(Text user : values){
userList.add(user.toString());
}
Collections.sort(userList);
for(int i=0;i<userList.size()-1;i++){
for(int j=i+1;j<userList.size();j++){
context.write(new Text(userList.get(i)+"-"+userList.get(j)),friend);
}
}
}
}
public static void main(String[] args) throws IOException, ClassNotFoundException, InterruptedException {
Configuration conf = new Configuration();
Job job = Job.getInstance(conf);
job.setJarByClass(CommonFriendsOne.class);
job.setMapperClass(CommonFriendOneMapper.class);
job.setReducerClass(CommonFriendOneReducer.class);
job.setMapOutputKeyClass(Text.class);
job.setMapOutputValueClass(Text.class);
job.setOutputKeyClass(Text.class);
job.setOutputValueClass(Text.class);
FileInputFormat.setInputPaths(job, new Path("C:\\Users\\刘元帅\\Desktop\\xx.txt"));
FileOutputFormat.setOutputPath(job, new Path("C:\\Users\\刘元帅\\Desktop\\output"));
job.waitForCompletion(true);
}
}
package com.initialize.friend;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Collections;
/**
* Find the common friends of every pair of people
*/
public class CommonFriendsTwo {
public static class CommonFriendsTwoMapper extends Mapper<LongWritable, Text, Text, Text>{
Text k = new Text();
Text v = new Text();
//B-C A
// B-D A
// B-F A
// B-G A
// B-H A
// B-I A
// B-K A
@Override
protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
String[] current = value.toString().split("\t");
k.set(current[0]);
v.set(current[1]);
context.write(k, v);
}
}
public static class CommonFriendsTwoReducer extends Reducer<Text, Text, Text, Text>{
//one group of data: B-G  A C D E
@Override
protected void reduce(Text friend, Iterable<Text> values, Context context) throws IOException, InterruptedException {
StringBuilder friends = new StringBuilder();
for(Text user : values){
friends.append("-->" + user + " ");
}
context.write(friend, new Text(friends.toString()));
}
}
public static void main(String[] args) throws IOException, ClassNotFoundException, InterruptedException {
Configuration conf = new Configuration();
Job job = Job.getInstance(conf);
job.setJarByClass(CommonFriendsTwo.class);
job.setMapperClass(CommonFriendsTwoMapper.class);
job.setReducerClass(CommonFriendsTwoReducer.class);
job.setMapOutputKeyClass(Text.class);
job.setMapOutputValueClass(Text.class);
job.setOutputKeyClass(Text.class);
job.setOutputValueClass(Text.class);
FileInputFormat.setInputPaths(job, new Path("C:\\Users\\刘元帅\\Desktop\\output"));
FileOutputFormat.setOutputPath(job, new Path("C:\\Users\\刘元帅\\Desktop\\output2"));
job.waitForCompletion(true);
}
}
Run results:
Replacing the default text input/output formats with SequenceFile input/output formats (5)
package com.initialize.index.sequence;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.FileSplit;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.SequenceFileOutputFormat;
import java.io.IOException;
public class IndexStepOne {
public static class IndexStepOneMapper extends Mapper<LongWritable, Text, Text, IntWritable>{
//produce <hello-fileName, 1> pairs
@Override
protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
//get the name of the file that the current line belongs to from the input split info
FileSplit inputSplit = (FileSplit)context.getInputSplit();
String fileName = inputSplit.getPath().getName();
String[] words = value.toString().split(" ");
for(String w : words){
//emit "word-fileName" as the key and 1 as the value
context.write(new Text(w + "-" + fileName), new IntWritable(1));
}
}
}
public static class IndexStepOneReducer extends Reducer<Text, IntWritable, Text, IntWritable>{
@Override
protected void reduce(Text key, Iterable<IntWritable> values, Context context) throws IOException, InterruptedException {
int count = 0;
for(IntWritable value : values){
count += value.get();
}
context.write(key, new IntWritable(count));
}
}
public static void main(String[] args) throws IOException, ClassNotFoundException, InterruptedException {
//by default only core-default.xml and core-site.xml are loaded
Configuration conf = new Configuration();
Job job = Job.getInstance(conf);
job.setJarByClass(IndexStepOne.class);
job.setMapperClass(IndexStepOneMapper.class);
job.setReducerClass(IndexStepOneReducer.class);
job.setNumReduceTasks(3);
job.setMapOutputKeyClass(Text.class);
job.setMapOutputValueClass(IntWritable.class);
job.setOutputKeyClass(Text.class);
job.setOutputValueClass(IntWritable.class);
// job.setInputFormatClass(TextInputFormat.class); //the default input format
job.setOutputFormatClass(SequenceFileOutputFormat.class);
FileInputFormat.setInputPaths(job, new Path("C:\\Users\\Desktop\\input"));
FileOutputFormat.setOutputPath(job, new Path("C:\\Users\\Desktop\\output1"));
job.waitForCompletion(true);
}
}
Intermediate results:
The SequenceFile data structure is as shown in the figure above.
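To peek inside the intermediate SequenceFile output, a minimal reader sketch can be used (not part of the original post; the part file path below is an assumption based on the output directory and the three reduce tasks configured above):
package com.initialize.index.sequence;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.SequenceFile;
import org.apache.hadoop.io.Text;
public class SeqFilePeek {
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        //with three reduce tasks, the job above writes part-r-00000 .. part-r-00002
        Path path = new Path("C:\\Users\\Desktop\\output1\\part-r-00000");
        try (SequenceFile.Reader reader =
                new SequenceFile.Reader(conf, SequenceFile.Reader.file(path))) {
            Text key = new Text();
            IntWritable value = new IntWritable();
            //each record is a <word-fileName, count> pair written by IndexStepOneReducer
            while (reader.next(key, value)) {
                System.out.println(key + "\t" + value);
            }
        }
    }
}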
package com.initialize.index.sequence;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.SequenceFileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import java.io.IOException;
public class IndexStepTwo {
public static class IndexStepTwoMapper extends Mapper<Text, IntWritable, Text, Text>{
@Override
protected void map(Text key, IntWritable value, Context context) throws IOException, InterruptedException {
String[] split = key.toString().split("-");
context.write(new Text(split[0]), new Text(split[1] + "-->" + value));
}
}
public static class IndexStepTwoReducer extends Reducer<Text, Text, Text, Text>{
//one group of data: <hello,a.txt-->4> <hello,b.txt-->4> <hello,c.txt-->4>
@Override
protected void reduce(Text key, Iterable<Text> values, Context context) throws IOException, InterruptedException {
//StringBuffer is thread-safe and StringBuilder is not; when thread safety is not a concern, StringBuilder is faster
StringBuilder sb = new StringBuilder();
for(Text value : values){
sb.append(value.toString()).append("\t");
}
context.write(key, new Text(sb.toString()));
}
}
public static void main(String[] args) throws IOException, ClassNotFoundException, InterruptedException {
Configuration conf = new Configuration();
Job job = Job.getInstance(conf);
job.setJarByClass(IndexStepTwo.class);
job.setMapperClass(IndexStepTwoMapper.class);
job.setReducerClass(IndexStepTwoReducer.class);
job.setNumReduceTasks(1);
job.setMapOutputKeyClass(Text.class);
job.setMapOutputValueClass(Text.class);
job.setOutputKeyClass(Text.class);
job.setOutputValueClass(Text.class);
// job.setInputFormatClass(TextInputFormat.class); //the default input format
job.setInputFormatClass(SequenceFileInputFormat.class);
FileInputFormat.setInputPaths(job, new Path("C:\\Users\\Desktop\\output1"));
FileOutputFormat.setOutputPath(job, new Path("C:\\Users\\Desktop\\output2"));
job.waitForCompletion(true);
}
}
Result: correct.
How MapReduce works internally (core workflow)
The concrete implementation in Hadoop
MR Programming Examples (Part 2)
Code implementation of the join algorithm (1)
Implements: select a.*, b.* from a join b on a.uid = b.uid;
Test data:
order.txt
order001,u001
order002,u001
order003,u005
order004,u002
order005,u003
order006,u004
user.txt
u001,senge,18,angelababy
u002,laozhao,48,ruhua
u003,xiaoxu,16,chunge
u004,laoyang,28,zengge
u005,nana,14,huangbo
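For example, joining order001,u001 with the user record u001,senge,18,angelababy should produce the line order001,u001,18,senge,angelababy (following the field order of JoinBean.toString() below).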
package com.initialize.join;
import org.apache.hadoop.io.Writable;
import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;
public class JoinBean implements Writable {
private String orderId;
private String userId;
private String userName;
private int userAge;
private String userFriend;
private String tableName;
public void set(String orderId, String userId, String userName, int userAge, String userFriend, String tableName) {
this.orderId = orderId;
this.userId = userId;
this.userName = userName;
this.userAge = userAge;
this.userFriend = userFriend;
this.tableName = tableName;
}
public String getOrderId() {
return orderId;
}
public void setOrderId(String orderId) {
this.orderId = orderId;
}
public String getUserId() {
return userId;
}
public void setUserId(String userId) {
this.userId = userId;
}
public String getUserName() {
return userName;
}
public void setUserName(String userName) {
this.userName = userName;
}
public int getUserAge() {
return userAge;
}
public void setUserAge(int userAge) {
this.userAge = userAge;
}
public String getUserFriend() {
return userFriend;
}
public void setUserFriend(String userFriend) {
this.userFriend = userFriend;
}
public String getTableName() {
return tableName;
}
public void setTableName(String tableName) {
this.tableName = tableName;
}
@Override
public String toString() {
return this.orderId + "," + this.userId + "," + this.userAge + "," + this.userName + "," + this.userFriend;
}
@Override
public void write(DataOutput out) throws IOException {
out.writeUTF(this.orderId);
out.writeUTF(this.userId);
out.writeUTF(this.userName);
out.writeInt(this.userAge);
out.writeUTF(this.userFriend);
out.writeUTF(this.tableName);
}
@Override
public void readFields(DataInput in) throws IOException {
this.orderId = in.readUTF();
this.userId = in.readUTF();
this.userName = in.readUTF();
this.userAge = in.readInt();
this.userFriend = in.readUTF();
this.tableName = in.readUTF();
}
}
package com.initialize.join;
import org.apache.commons.beanutils.BeanUtils;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.FileSplit;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import java.io.IOException;
import java.lang.reflect.InvocationTargetException;
import java.util.ArrayList;
/**
* This example is the least efficient way to implement the join.
*
* A more efficient version can be built with a Partitioner + compareTo + GroupingComparator combination (a sketch of that ordering idea follows this class).
*/
public class ReduceSideJoin {
public static class ReduceSideJoinMapper extends Mapper<LongWritable, Text, Text, JoinBean>{
String fileName = null;
JoinBean bean = new JoinBean();
Text k = new Text();
/**
* Before processing any data, the map task first calls setup() once; only then does it call map() repeatedly, once per input line.
*/
@Override
protected void setup(Context context) throws IOException, InterruptedException {
FileSplit inputSplit = (FileSplit)context.getInputSplit();
fileName = inputSplit.getPath().getName();
}
@Override
protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
String[] fields = value.toString().split(",");
if(fileName.startsWith("order")){
bean.set(fields[0], fields[1], "NULL", -1, "NULL", "order");
}else{
bean.set("NULL", fields[0], fields[1], Integer.parseInt(fields[2]), fields[3], "user");
}
k.set(bean.getUserId());
context.write(k, bean);
}
}
public static class ReduceSideJoinReducer extends Reducer<Text, JoinBean, JoinBean, NullWritable>{
@Override
protected void reduce(Text key, Iterable<JoinBean> beans, Context context) throws IOException, InterruptedException {
//cache for the order records
ArrayList<JoinBean> orderList = new ArrayList<>();
JoinBean userBean = null; //cache for the user record
//Keeping this much of the files' data in memory consumes a lot of memory.
//If the first record out of the iterator were the user record, there would be no need to waste memory caching all the order records.
//As long as the sort order is controlled so that the user record comes first in each group, the order records never need to be cached.
try {
//separate the two kinds of records
for(JoinBean bean : beans){
if("order".equals(bean.getTableName())){
JoinBean newBean = new JoinBean();
BeanUtils.copyProperties(newBean, bean);
orderList.add(newBean);
}else{
userBean = new JoinBean();
BeanUtils.copyProperties(userBean, bean);
}
}
//join the records and write them out
for(JoinBean bean : orderList){
bean.setUserName(userBean.getUserName());
bean.setUserAge(userBean.getUserAge());
bean.setUserFriend(userBean.getUserFriend());
context.write(bean, NullWritable.get());
}
} catch (IllegalAccessException e) {
e.printStackTrace();
} catch (InvocationTargetException e) {
e.printStackTrace();
}
}
}
public static void main(String[] args) throws IOException, ClassNotFoundException, InterruptedException {
Configuration conf = new Configuration();
Job job = Job.getInstance(conf);
job.setJarByClass(ReduceSideJoin.class);
job.setMapperClass(ReduceSideJoinMapper.class);
job.setReducerClass(ReduceSideJoinReducer.class);
job.setNumReduceTasks(2);
job.setMapOutputKeyClass(Text.class);
job.setMapOutputValueClass(JoinBean.class);
job.setOutputKeyClass(JoinBean.class);
job.setOutputValueClass(NullWritable.class);
FileInputFormat.setInputPaths(job, new Path("C:\\Users\\Desktop\\input"));
FileOutputFormat.setOutputPath(job, new Path("C:\\Users\\Desktop\\output"));
job.waitForCompletion(true);
}
}
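As the class comment notes, the reduce-side caching can be avoided with a Partitioner + compareTo + GroupingComparator combination. Below is a minimal sketch of the ordering idea only (the JoinKey class, its fields, and the flag convention are hypothetical, not part of the original code):
package com.initialize.join;
import org.apache.hadoop.io.WritableComparable;
import org.apache.hadoop.io.WritableComparator;
import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;
public class JoinKey implements WritableComparable<JoinKey> {
    private String userId;
    private int tableFlag; //0 = user record, 1 = order record (assumption for this sketch)
    public void set(String userId, int tableFlag) {
        this.userId = userId;
        this.tableFlag = tableFlag;
    }
    public String getUserId() {
        return userId;
    }
    @Override
    public int compareTo(JoinKey o) {
        int c = this.userId.compareTo(o.userId);
        //same userId: the single user record (flag 0) sorts ahead of all its order records (flag 1)
        return c != 0 ? c : Integer.compare(this.tableFlag, o.tableFlag);
    }
    @Override
    public void write(DataOutput out) throws IOException {
        out.writeUTF(userId);
        out.writeInt(tableFlag);
    }
    @Override
    public void readFields(DataInput in) throws IOException {
        userId = in.readUTF();
        tableFlag = in.readInt();
    }
    //group by userId only, so the user record and its order records arrive in one reduce() call
    public static class UserIdGroupingComparator extends WritableComparator {
        public UserIdGroupingComparator() {
            super(JoinKey.class, true);
        }
        @Override
        public int compare(WritableComparable a, WritableComparable b) {
            return ((JoinKey) a).getUserId().compareTo(((JoinKey) b).getUserId());
        }
    }
}
A matching Partitioner would partition on getUserId() alone (in the same spirit as OrderIdPartitioner above), so all records for one userId reach the same reducer, the user record is seen first, and its fields can be joined onto the following order records in a single pass with no caching.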
MR data skew: using the Combiner component (2)
Local aggregation in the map task to mitigate data skew
Use the Combiner component to aggregate locally on the map side, reducing the amount of data sent over the network and thus easing data skew. A Combiner is essentially a Reducer and must extend the Reducer class.
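Note that a Combiner may run zero, one, or several times over the map output, so it is only safe for aggregations such as summing counts, where applying the reduce logic partially and repeatedly still gives the same final result.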
package com.initialize.wc.skew;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import java.io.IOException;
import java.util.Random;
public class SkewWordCount {
public static class SkewWordCountMapper extends Mapper<LongWritable, Text, Text, IntWritable>{
Random random = new Random();
Text k = new Text();
IntWritable v = new IntWritable(1);
int numReduceTasks = 0;
@Override
protected void setup(Context context) throws IOException, InterruptedException {
numReduceTasks = context.getNumReduceTasks();
}
@Override
protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
String[] words = value.toString().split(" ");
for(String w : words){
k.set(w + "\001" + random.nextInt(numReduceTasks));
context.write(k, v);
}
}
}
public static class SkewWordCountReducer extends Reducer<Text, IntWritable, Text, IntWritable>{
IntWritable v = new IntWritable();
@Override
protected void reduce(Text key, Iterable<IntWritable> values, Context context) throws IOException, InterruptedException {
int count = 0;
for(IntWritable value : values){
count += value.get();
}
v.set(count);
context.write(key, v);
}
}
public static void main(String[] args) throws IOException, ClassNotFoundException, InterruptedException {
Configuration conf = new Configuration();
Job job = Job.getInstance(conf);
job.setJarByClass(SkewWordCount.class);
job.setMapperClass(SkewWordCountMapper.class);
job.setReducerClass(SkewWordCountReducer.class);
job.setMapOutputKeyClass(Text.class);
job.setMapOutputValueClass(IntWritable.class);
//set the class containing the map-side local aggregation (combiner) logic
//the map task calls this class to aggregate locally; the specified class must extend the Reducer class
job.setCombinerClass(SkewWordCountReducer.class);
job.setOutputKeyClass(Text.class);
job.setOutputValueClass(IntWritable.class);
FileInputFormat.setInputPaths(job, new Path("C:\\Users\\Desktop\\input"));
FileOutputFormat.setOutputPath(job, new Path("C:\\Users\\Desktop\\output1"));
job.setNumReduceTasks(3);
boolean res = job.waitForCompletion(true);
System.exit(res?0:1);
}
}
A general solution to data skew: scattering the skewed keys
After the first MR pass produces its output, run a second MR pass over that result.
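For example, the first pass appends a random reduce-task number to each word (separated by the '\001' character), so a hot word like hello is spread over several keys with partial counts; the second pass below splits on '\001', drops the suffix, and sums the partial counts back into one total per word.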
package com.initialize.wc.skew;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import java.io.IOException;
public class SkewWordCount2 {
public static class SkewWordCountMapper extends Mapper<LongWritable, Text, Text, IntWritable>{
Text k = new Text();
IntWritable v = new IntWritable(1);
@Override
protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
String[] wordAndCount = value.toString().split("\t");
v.set(Integer.parseInt(wordAndCount[1]));
k.set(wordAndCount[0].split("\001")[0]);
context.write(k, v);
}
}
public static class SkewWordCountReducer extends Reducer<Text, IntWritable, Text, IntWritable>{
IntWritable v = new IntWritable();
@Override
protected void reduce(Text key, Iterable<IntWritable> values, Context context) throws IOException, InterruptedException {
int count = 0;
for(IntWritable value : values){
count += value.get();
}
v.set(count);
context.write(key, v);
}
}
public static void main(String[] args) throws IOException, ClassNotFoundException, InterruptedException {
Configuration conf = new Configuration();
Job job = Job.getInstance(conf);
job.setJarByClass(SkewWordCount2.class);
job.setMapperClass(SkewWordCountMapper.class);
job.setReducerClass(SkewWordCountReducer.class);
job.setMapOutputKeyClass(Text.class);
job.setMapOutputValueClass(IntWritable.class);
job.setCombinerClass(SkewWordCountReducer.class);
job.setOutputKeyClass(Text.class);
job.setOutputValueClass(IntWritable.class);
FileInputFormat.setInputPaths(job, new Path("C:\\Users\\刘元帅\\Desktop\\output1"));
FileOutputFormat.setOutputPath(job, new Path("C:\\Users\\刘元帅\\Desktop\\output2"));
job.setNumReduceTasks(3);
boolean res = job.waitForCompletion(true);
System.exit(res?0:1);
}
}
How a MapReduce program is launched and run on YARN
Package the MR program into a jar and run it on Linux (an example submit command follows the steps below).
1. RunJar starts.
2. MRAppMaster starts (launched on one of the worker/DataNode nodes).
3. YarnChild processes start (launched on multiple worker nodes, running the map tasks).
4. After all the map-task YarnChild processes finish, YarnChild processes for the reduce tasks start (launched on multiple worker nodes).
5. After the reduce-task YarnChild processes finish, the MRAppMaster shuts down.
6. RunJar exits.
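A hedged example of the submit command (the jar name is illustrative; it assumes the driver's input/output paths point to HDFS and the cluster's mapred-site.xml sets mapreduce.framework.name to yarn):
hadoop jar mr-examples.jar com.initialize.wc.skew.SkewWordCount
The hadoop jar command is what starts the RunJar JVM in step 1; RunJar unpacks the jar and invokes the driver's main(), which submits the job and then blocks in waitForCompletion(true) until it finishes.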