MapReduce的WordCount案例
案例一:统计单词个数
首先准备一个word.txt文件上传到Linux(内容随意,一行只有一个单词就行)
然后
hdfs dfs -put 本地txt文件路径 hdfs目标存放目录
例:hdfs dfs -put /usr/local/data/word.txt /word
package com.liu.hadoop;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import java.io.IOException;
/**
* @Author : ld
* @Description :
* @ClassName : WordCountDemo1
* @Date : 2021/9/22 18:59
* @Version : 1.0
*/
// 用来统计文件中单词个数
// 重写 覆盖mapreduce框架中map() 和reduce()方法
public class WordCountDemo1 {
// map类
// 第一对kv,是决定数据输入的格式
// 第二队kv 是决定数据输出的格式
public static class WCMapper extends Mapper<LongWritable, Text,Text,LongWritable> {
/*
map阶段数据是一行一行过来的
每一行数据都需要执行代码
*/
protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
String line = value.toString();
//通过Context输出 Text(一整行数据),1
context.write(new Text(line),new LongWritable(1));
}
}
//reduce类
// 用来接收map端输出的数据
public static class WCReduce extends Reducer<Text,LongWritable,Text,LongWritable>{
/**
* reduce 聚合程序 每一个k都会调用一次
* 默认是一个节点
* key:每一个单词
* values:map端 当前k所对应的所有的v
*/
protected void reduce(Text key,Iterable<LongWritable> values,Context context) throws IOException, InterruptedException {
long sum= 0L;
for (LongWritable value : values) {
sum+=value.get();
}
// 把计算结果输出到hdfs
context.write(key,new LongWritable(sum));
}
}
/**
* 是当前mapreduce程序入口
* 用来构建mapreduce程序
*/
public static void main(String[] args) throws Exception {
Job job = Job.getInstance();//创建一个job任务
job.setJobName("mapreduce单词统计"); //指定job名称
//构建mr
//指定当前main所在类名(识别具体的类)
job.setJarByClass(WordCountDemo1.class);
job.setMapperClass(WCMapper.class);//指定map端口
job.setMapOutputKeyClass(Text.class);// 指定map输出的kv类型
job.setMapOutputValueClass(LongWritable.class);
//指定reduce端类
//指定reduce端输出的kv类型
job.setReducerClass(WCReduce.class);
job.setOutputKeyClass(Text.class);
job.setOutputValueClass(LongWritable.class);
// 指定输入路径
Path in = new Path("/word");
FileInputFormat.addInputPath(job, in);
//指定输出
Path out = new Path("/output");
//如果路径存在 删除
FileSystem fs = FileSystem.get(new Configuration());
if(fs.exists(out)){
fs.delete(out,true);
}
FileOutputFormat.setOutputPath(job,out);
//启动
job.waitForCompletion(true);
/**
* 提交任务
* 1.通过maven中package将项目打包上传服务器然后执行
* 2.执行任务 hadoop jar hadoop-1.0-SNAPSHOT.jar com.liu.hadoop.WordCountDemo1 /word /output
*
*/
System.out.println("mr正在执行");
}
}
#运行:
hadoop jar hadoop-1.0-SNAPSHOT.jar com.liu.hadoop.WordCountDemo1 /word /output
案例二:按班级统计年龄总和
首先把准备好的students.txt文件上传到hdfs的data目录下
点击下载students.txt
(如果没有下载积分,随便发布一篇文章即可获得积分,然后就能下载了)
package com.liu.hadoop;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import java.io.IOException;
/**
* @Author : ld
* @Description :
* @ClassName : WordCountDemo3
* @Date : 2021/9/22 18:59
* @Version : 1.0
*/
public class WordCountDemo3 {
public static class SumMapper extends Mapper<LongWritable,Text, Text, LongWritable> {
protected void map(LongWritable key,Text value,Context context) throws IOException, InterruptedException {
String line = value.toString();
String[] split = line.split(",");
LongWritable age = new LongWritable(Integer.valueOf(split[2]));
String clazz=split[4];
context.write(new Text(clazz),age);
}
}
public static class SumReduce extends Reducer<Text,LongWritable,Text,LongWritable> {
protected void reduce(Text key,Iterable<LongWritable> values ,Context context) throws IOException, InterruptedException {
long sum= 0L;
for (LongWritable value : values) {
sum+=value.get();
}
context.write(key,new LongWritable(sum));
}
}
public static void main(String[] args) throws Exception{
//创建一个job任务
Job job = Job.getInstance();
//指定job名称
job.setJobName("第三个mr程序,年龄统计");
//构建mr
//指定当前main所在类名(识别具体的类)
job.setJarByClass(WordCountDemo3.class);
//指定map端类
job.setMapperClass(SumMapper.class);
// 指定map输出的kv类型
job.setMapOutputKeyClass(Text.class);
job.setMapOutputValueClass(LongWritable.class);
//指定reduce端类
//指定reduce端输出的kv类型
job.setReducerClass(SumReduce.class);
job.setOutputKeyClass(Text.class);
job.setOutputValueClass(LongWritable.class);
// 指定输入路径
Path in = new Path("/data");
FileInputFormat.addInputPath(job,in);
//指定输出
Path out = new Path("/output");
//如果路径存在 删除
FileSystem fs = FileSystem.get(new Configuration());
if(fs.exists(out)){
fs.delete(out,true);
}
FileOutputFormat.setOutputPath(job,out);
//启动任务
job.waitForCompletion(true);
System.out.println("mr3正在执行");
}
}
Linux中运行:(先切换到放jar包的目录下)
hadoop jar hadoop-1.0-SNAPSHOT.jar com.liu.hadoop.WordCountDemo3
别问为什么后面没加路径,上面的代码中设置过了
还有确保hdfs的data目录下只有students.txt这一个文件