说明:该需求可以自己造数据;在条件允许的范围内可模拟数据倾斜——先给 KEY 加随机后缀打散分布做局部聚合,再去掉随机后缀做最终聚合。
数据:
pk,ccj,ccj
zcl,pxj,zmj
xwc,jpeson,wxc
pxj,zmj,lzh
wxc,pk,zmj
zmj,pk,zcl
lzh,zcl,zcl
zmj,jpeson,lzh
pxj,lzh,zcl
wfy,pxj,pxj
ccj,wlp,wxc
pk,xwc,ccj
zcl,jpeson,xwc
pk,ccj,jpeson
wxc,zmj,wfy
wxc,ccj,pxj
jpeson,zcl,xwc
ccj,pk,jpeson
xwc,ccj,wfy
wlp,pxj,xwc
pk,zcl,lzh
lzh,pxj,jpeson
pxj,ccj,jpeson
pk,pk,pxj
zmj,zmj,xwc
pk,wlp,zcl
zmj,pxj,pxj
lzh,xwc,wxc
xwc,zmj,zcl
wxc,zcl,zmj
zcl,jpeson,zcl
pxj,ccj,zcl
zmj,pk,jpeson
lzh,zmj,xwc
xwc,wfy,pk
pk,zmj,zcl
pk,pxj,ccj
jpeson,zmj,jpeson
wlp,xwc,lzh
ccj,ccj,ccj
jpeson,jpeson,jpeson
wxc,xwc,xwc
wxc,zmj,pxj
jpeson,xwc,xwc
pk,xwc,pk
wfy,ccj,xwc
zcl,lzh,wfy
wxc,ccj,lzh
zcl,wxc,wlp
wfy,zmj,pxj
jpeson,pk,wlp
lzh,wxc,zcl
pxj,zmj,wlp
pxj,ccj,zmj
wlp,lzh,zmj
wfy,zcl,pk
zcl,ccj,wlp
zmj,wxc,zcl
zmj,jpeson,wlp
pk,jpeson,wxc
zmj,ccj,ccj
pk,jpeson,wfy
zcl,zcl,pxj
wxc,wxc,lzh
pk,xwc,pxj
wxc,jpeson,wfy
zcl,pk,wfy
wfy,wxc,ccj
zmj,wlp,lzh
wfy,lzh,wlp
wxc,wlp,ccj
jpeson,pk,wxc
wlp,xwc,pxj
lzh,zmj,ccj
zmj,zcl,wfy
ccj,jpeson,ccj
pxj,pk,ccj
wfy,wxc,zmj
zmj,zcl,wfy
pk,lzh,xwc
jpeson,wfy,wxc
zmj,zcl,wlp
ccj,xwc,zcl
zcl,wfy,lzh
pxj,zmj,wxc
xwc,jpeson,pxj
lzh,wfy,ccj
wfy,lzh,xwc
lzh,wlp,ccj
zcl,lzh,zcl
xwc,xwc,wfy
pxj,xwc,pxj
lzh,wfy,jpeson
wlp,zmj,wxc
xwc,pxj,ccj
jpeson,xwc,pxj
zcl,xwc,xwc
pk,xwc,jpeson
wlp,ccj,lzh
zcl,lzh,wlp
第一种实现:一次性统计 Word Count(不加随机数,直接聚合)
<?xml version="1.0" encoding="UTF-8"?>
<project xmlns="http://maven.apache.org/POM/4.0.0"
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
<modelVersion>4.0.0</modelVersion>
<groupId>com.cc.pxj.wfy</groupId>
<artifactId>phoneWcRuoZe</artifactId>
<version>1.0-SNAPSHOT</version>
<properties>
<project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
<maven.compiler.source>1.8</maven.compiler.source>
<maven.compiler.target>1.8</maven.compiler.target>
<hadoop.version>2.6.0-cdh5.16.2</hadoop.version>
</properties>
<repositories>
<repository>
<id>cloudera</id>
<url>https://repository.cloudera.com/artifactory/cloudera-repos/</url>
</repository>
</repositories>
<dependencies>
<!-- 添加Hadoop依赖 -->
<!-- https://mvnrepository.com/artifact/org.apache.hadoop/hadoop-client -->
<dependency>
<groupId>org.apache.hadoop</groupId>
<artifactId>hadoop-client</artifactId>
<version>${hadoop.version}</version>
</dependency>
<!-- https://mvnrepository.com/artifact/junit/junit -->
<dependency>
<groupId>junit</groupId>
<artifactId>junit</artifactId>
<version>4.11</version>
<scope>test</scope>
</dependency>
<dependency>
<groupId>mysql</groupId>
<artifactId>mysql-connector-java</artifactId>
<version>5.1.17</version>
</dependency>
</dependencies>
<build>
<pluginManagement><!-- lock down plugins versions to avoid using Maven defaults (may be moved to parent pom) -->
<plugins>
<!-- clean lifecycle, see https://maven.apache.org/ref/current/maven-core/lifecycles.html#clean_Lifecycle -->
<plugin>
<artifactId>maven-clean-plugin</artifactId>
<version>3.1.0</version>
</plugin>
<!-- default lifecycle, jar packaging: see https://maven.apache.org/ref/current/maven-core/default-bindings.html#Plugin_bindings_for_jar_packaging -->
<plugin>
<artifactId>maven-resources-plugin</artifactId>
<version>3.0.2</version>
</plugin>
<plugin>
<artifactId>maven-compiler-plugin</artifactId>
<version>3.8.0</version>
</plugin>
<plugin>
<artifactId>maven-surefire-plugin</artifactId>
<version>2.22.1</version>
</plugin>
<plugin>
<artifactId>maven-jar-plugin</artifactId>
<version>3.0.2</version>
</plugin>
<plugin>
<artifactId>maven-install-plugin</artifactId>
<version>2.5.2</version>
</plugin>
<plugin>
<artifactId>maven-deploy-plugin</artifactId>
<version>2.8.2</version>
</plugin>
<!-- site lifecycle, see https://maven.apache.org/ref/current/maven-core/lifecycles.html#site_Lifecycle -->
<plugin>
<artifactId>maven-site-plugin</artifactId>
<version>3.7.1</version>
</plugin>
<plugin>
<artifactId>maven-project-info-reports-plugin</artifactId>
<version>3.0.0</version>
</plugin>
</plugins>
</pluginManagement>
</build>
</project>
mapper
package com.ccj.pxj.homework.two.wc.one;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
import java.io.IOException;
public class WcMapper extends Mapper<LongWritable, Text,Text, IntWritable> {
IntWritable one=new IntWritable(1);
@Override
protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
//获取行
String line = value.toString();
String[] splits = line.split(",");
for (String split : splits) {
context.write(new Text(split),one);
}
}
}
reducer
package com.ccj.pxj.homework.two.wc.one;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;
import java.io.IOException;
public class WcReducer extends Reducer<Text, IntWritable,Text,IntWritable> {
@Override
protected void reduce(Text key, Iterable<IntWritable> values, Context context) throws IOException, InterruptedException {
int sum=0;
for (IntWritable value : values) {
sum+=value.get();
}
context.write(key,new IntWritable(sum));
}
}
driver
package com.ccj.pxj.homework.two.wc.one;
import com.ccj.pxj.phone.utils.FileUtils;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;
public class WcDriver implements Tool {
private Configuration conf;
@Override
public int run(String[] args) throws Exception {
String OutPath="wc/one/out";
String InputPath="data/wc.txt";
// 1. 获得 Job 对象
Job job = Job.getInstance(getConf());
// 2. 设置主类
job.setJarByClass(WcDriver.class);
// 3. 设置 Mapper 类
job.setMapperClass(WcMapper.class);
// 4. 不需要reduce
job.setReducerClass(WcReducer.class);
// 5. 设置 Map key-value类型
job.setMapOutputKeyClass(Text.class);
job.setMapOutputValueClass(IntWritable.class);
job.setOutputKeyClass(Text.class);
job.setOutputValueClass(IntWritable.class);
// 6. 设置输入路径
FileUtils.deleteOutput(conf,OutPath);
FileInputFormat.setInputPaths(job, new Path(InputPath));
// 7 设置输出路径
FileOutputFormat.setOutputPath(job,new Path(OutPath));
return job.waitForCompletion(true) ? 0 : 1;
}
@Override
public void setConf(Configuration conf) {
this.conf=conf;
}
@Override
public Configuration getConf() {
return this.conf;
}
public static void main(String[] args) throws Exception {
int resultCode = ToolRunner.run(new WcDriver(), args);
if (resultCode == 0) {
System.out.println("执行成功!");
} else {
System.out.println("执行失败!");
}
}
}
统计结果:
ccj 31
jpeson 28
lzh 26
pk 26
pxj 28
wfy 22
wlp 19
wxc 24
xwc 32
zcl 33
zmj 31
加随机数实现(第一个作业:给 KEY 加随机后缀,打散倾斜的 KEY 做局部聚合)
mapper
package com.ccj.pxj.homework.two.wc.random;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
import java.io.IOException;
import java.util.Random;
public class WcMapperAddRandom extends Mapper<LongWritable,Text, Text, IntWritable> {
IntWritable one=new IntWritable(1);
@Override
protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
Random random = new Random();
String line = value.toString();
String[] words = line.split(",");
for (String word : words) {
// String keys=word+"_"+random.nextInt(100);
context.write(new Text(word+"_"+random.nextInt(100)),one);
}
}
}
reducer
package com.ccj.pxj.homework.two.wc.random;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;
import java.io.IOException;
public class WcReducerAddRandom extends Reducer<Text, IntWritable,Text,IntWritable> {
@Override
protected void reduce(Text key, Iterable<IntWritable> values, Context context) throws IOException, InterruptedException {
int sum=0;
for (IntWritable value : values) {
sum+=value.get();
}
context.write(key,new IntWritable(sum));
}
}
driver
package com.ccj.pxj.homework.two.wc.random;
import com.ccj.pxj.homework.two.wc.one.WcMapper;
import com.ccj.pxj.homework.two.wc.one.WcReducer;
import com.ccj.pxj.phone.utils.FileUtils;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;
public class WcDriverAdd implements Tool {
private Configuration conf;
@Override
public int run(String[] args) throws Exception {
String OutPath="wc/random/out";
String InputPath="data/wc.txt";
// 1. 获得 Job 对象
Job job = Job.getInstance(getConf());
// 2. 设置主类
job.setJarByClass(WcDriverAdd.class);
// 3. 设置 Mapper 类
job.setMapperClass(WcMapperAddRandom.class);
// 4. 不需要reduce
job.setReducerClass(WcReducerAddRandom.class);
// 5. 设置 Map key-value类型
job.setMapOutputKeyClass(Text.class);
job.setMapOutputValueClass(IntWritable.class);
job.setOutputKeyClass(Text.class);
job.setOutputValueClass(IntWritable.class);
// 6. 设置输入路径
FileUtils.deleteOutput(conf,OutPath);
FileInputFormat.setInputPaths(job, new Path(InputPath));
// 7 设置输出路径
FileOutputFormat.setOutputPath(job,new Path(OutPath));
return job.waitForCompletion(true) ? 0 : 1;
}
@Override
public void setConf(Configuration conf) {
this.conf=conf;
}
@Override
public Configuration getConf() {
return this.conf;
}
public static void main(String[] args) throws Exception {
int resultCode = ToolRunner.run(new WcDriverAdd(), args);
if (resultCode == 0) {
System.out.println("执行成功!");
} else {
System.out.println("执行失败!");
}
}
}
第一次落地数据
ccj_0 1
ccj_1 1
ccj_13 1
ccj_18 1
ccj_19 1
ccj_20 1
ccj_22 1
ccj_23 1
ccj_24 1
ccj_30 1
ccj_31 1
ccj_35 1
ccj_4 1
ccj_40 2
ccj_45 1
ccj_54 1
ccj_57 2
ccj_7 1
ccj_71 1
ccj_75 3
ccj_78 1
ccj_80 1
ccj_83 1
ccj_88 1
ccj_93 1
ccj_97 1
ccj_98 1
jpeson_1 1
jpeson_10 1
jpeson_13 1
jpeson_17 2
jpeson_28 1
jpeson_29 2
jpeson_36 1
jpeson_39 2
jpeson_4 1
jpeson_40 1
jpeson_53 1
jpeson_55 1
jpeson_57 1
jpeson_60 1
jpeson_61 1
jpeson_64 1
jpeson_70 1
jpeson_73 1
jpeson_87 1
jpeson_88 1
jpeson_89 2
jpeson_91 1
jpeson_93 1
jpeson_94 1
lzh_0 2
lzh_15 1
lzh_17 1
lzh_18 1
lzh_21 1
lzh_24 2
lzh_29 1
lzh_31 1
lzh_34 1
lzh_35 1
lzh_39 1
lzh_43 1
lzh_46 1
lzh_52 2
lzh_59 2
lzh_7 1
lzh_71 1
lzh_73 1
lzh_79 1
lzh_87 1
lzh_99 2
pk_1 1
pk_15 1
pk_20 1
pk_21 1
pk_23 1
pk_24 1
pk_25 2
pk_31 1
pk_32 1
pk_40 1
pk_48 2
pk_5 1
pk_53 1
pk_63 1
pk_64 1
pk_65 1
pk_66 2
pk_72 1
pk_79 1
pk_8 1
pk_82 2
pk_88 1
pxj_0 1
pxj_12 1
pxj_14 1
pxj_20 2
pxj_21 1
pxj_22 1
pxj_25 2
pxj_26 1
pxj_28 3
pxj_33 1
pxj_35 2
pxj_39 1
pxj_40 1
pxj_5 1
pxj_50 1
pxj_56 1
pxj_60 1
pxj_62 1
pxj_74 1
pxj_77 1
pxj_80 1
pxj_86 1
pxj_90 1
wfy_10 1
wfy_12 1
wfy_18 1
wfy_20 1
wfy_21 1
wfy_23 1
wfy_3 2
wfy_39 1
wfy_42 1
wfy_45 1
wfy_57 1
wfy_58 2
wfy_69 1
wfy_70 1
wfy_77 1
wfy_82 1
wfy_83 1
wfy_84 1
wfy_86 1
wfy_89 1
wlp_0 1
wlp_11 1
wlp_19 1
wlp_26 1
wlp_28 1
wlp_31 1
wlp_33 1
wlp_42 1
wlp_46 1
wlp_47 1
wlp_53 2
wlp_56 1
wlp_58 1
wlp_60 1
wlp_62 1
wlp_75 1
wlp_78 1
wlp_92 1
wxc_12 1
wxc_28 1
wxc_39 1
wxc_4 1
wxc_50 1
wxc_59 1
wxc_6 1
wxc_60 1
wxc_61 1
wxc_73 1
wxc_74 1
wxc_76 1
wxc_77 1
wxc_80 1
wxc_82 1
wxc_84 1
wxc_89 2
wxc_90 2
wxc_91 1
wxc_93 1
wxc_96 1
wxc_97 1
xwc_13 1
xwc_21 1
xwc_25 1
xwc_28 1
xwc_29 1
xwc_31 1
xwc_34 1
xwc_35 1
xwc_38 1
xwc_39 3
xwc_45 2
xwc_46 1
xwc_53 1
xwc_58 1
xwc_7 1
xwc_72 1
xwc_74 1
xwc_76 1
xwc_78 1
xwc_81 1
xwc_86 1
xwc_89 1
xwc_90 2
xwc_92 2
xwc_96 1
xwc_97 2
zcl_1 1
zcl_12 2
zcl_13 1
zcl_14 1
zcl_18 1
zcl_19 1
zcl_2 1
zcl_24 1
zcl_27 1
zcl_29 1
zcl_34 1
zcl_4 1
zcl_40 1
zcl_41 1
zcl_43 2
zcl_48 1
zcl_51 1
zcl_53 1
zcl_55 1
zcl_56 1
zcl_58 1
zcl_60 1
zcl_65 1
zcl_67 1
zcl_77 1
zcl_79 1
zcl_87 1
zcl_95 2
zcl_98 2
zmj_1 1
zmj_12 1
zmj_13 2
zmj_16 1
zmj_23 1
zmj_33 1
zmj_34 1
zmj_47 1
zmj_50 1
zmj_52 1
zmj_54 1
zmj_65 1
zmj_68 1
zmj_71 1
zmj_72 1
zmj_73 1
zmj_74 2
zmj_75 1
zmj_81 2
zmj_82 1
zmj_84 2
zmj_87 1
zmj_91 1
zmj_92 1
zmj_93 1
zmj_97 1
zmj_98 1
去随机数(第二个作业:读取上一步落地结果,去掉随机后缀后做最终聚合)
package com.ccj.pxj.homework.two.wc.random.sub;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
import java.io.IOException;
import java.util.Random;
public class WcMapperSubRandom extends Mapper<LongWritable,Text, Text, IntWritable> {
// IntWritable one=new IntWritable();
@Override
protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
Random random = new Random();
String line = value.toString();
String[] words = line.split("\t");
String word= words[0];
int num = Integer.parseInt(words[1]);
context.write(new Text(word.substring(0,word.indexOf("_"))),new IntWritable(num));
}
}
package com.ccj.pxj.homework.two.wc.random.sub;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;
import java.io.IOException;
public class WcReducerSubRandom extends Reducer<Text, IntWritable,Text,IntWritable> {
@Override
protected void reduce(Text key, Iterable<IntWritable> values, Context context) throws IOException, InterruptedException {
int sum=0;
for (IntWritable value : values) {
sum+=value.get();
}
context.write(key,new IntWritable(sum));
}
}
package com.ccj.pxj.homework.two.wc.random.sub;
import com.ccj.pxj.phone.utils.FileUtils;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;
public class WcDriverSub implements Tool {
private Configuration conf;
@Override
public int run(String[] args) throws Exception {
String OutPath="wc/subrandom/out";
String InputPath="wc/random/out/part-r-00000";
// 1. 获得 Job 对象
Job job = Job.getInstance(getConf());
// 2. 设置主类
job.setJarByClass(WcDriverSub.class);
// 3. 设置 Mapper 类
job.setMapperClass(WcMapperSubRandom.class);
// 4. 不需要reduce
job.setReducerClass(WcReducerSubRandom.class);
// 5. 设置 Map key-value类型
job.setMapOutputKeyClass(Text.class);
job.setMapOutputValueClass(IntWritable.class);
job.setOutputKeyClass(Text.class);
job.setOutputValueClass(IntWritable.class);
// 6. 设置输入路径
FileUtils.deleteOutput(conf,OutPath);
FileInputFormat.setInputPaths(job, new Path(InputPath));
// 7 设置输出路径
FileOutputFormat.setOutputPath(job,new Path(OutPath));
return job.waitForCompletion(true) ? 0 : 1;
}
@Override
public void setConf(Configuration conf) {
this.conf=conf;
}
@Override
public Configuration getConf() {
return this.conf;
}
public static void main(String[] args) throws Exception {
int resultCode = ToolRunner.run(new WcDriverSub(), args);
if (resultCode == 0) {
System.out.println("执行成功!");
} else {
System.out.println("执行失败!");
}
}
}
结果
ccj 31
jpeson 28
lzh 26
pk 26
pxj 28
wfy 22
wlp 19
wxc 24
xwc 32
zcl 33
zmj 31