package org;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import java.io.File;
import java.io.IOException;
import java.net.URI;
/**
 * Classic Hadoop MapReduce word-count driver: the mapper tokenizes each input
 * line and emits (word, 1) pairs, the reducer (also reused as a combiner) sums
 * the counts per word, and {@code main} configures and submits the job.
 *
 * <p>Usage: {@code hadoop jar app.jar org.WordCountApp <input path> <output path>}
 */
public class WordCountApp {

    // NOTE(review): nested class names mymapper/myreduce don't follow Java
    // UpperCamelCase convention, but they are public members and are kept
    // as-is so any external references continue to compile.

    /** Mapper: splits each line on whitespace and emits (word, 1) per word. */
    public static class mymapper
            extends Mapper<LongWritable, Text, Text, LongWritable> {

        // Writable instances reused across map() calls to avoid allocating
        // one object per output record.
        private final LongWritable one = new LongWritable(1);
        private final Text word = new Text();

        /**
         * @param key     byte offset of the line within the split (unused)
         * @param value   one line of input text
         * @param context sink for (word, 1) output records
         */
        @Override
        protected void map(LongWritable key, Text value, Context context)
                throws IOException, InterruptedException {
            // Split on runs of whitespace. A plain split(" ") would produce
            // empty tokens for consecutive spaces (and never split on tabs),
            // causing the empty string "" to be counted as a word.
            for (String token : value.toString().split("\\s+")) {
                if (!token.isEmpty()) { // leading whitespace yields one empty token
                    word.set(token);
                    context.write(word, one);
                }
            }
        }
    }

    /** Reducer (and combiner): sums all counts seen for a given word. */
    public static class myreduce
            extends Reducer<Text, LongWritable, Text, LongWritable> {

        /**
         * @param key     the word
         * @param values  all partial counts for that word
         * @param context sink for the (word, total) output record
         */
        @Override
        protected void reduce(Text key, Iterable<LongWritable> values, Context context)
                throws IOException, InterruptedException {
            // Accumulate every partial count for this key.
            long sum = 0;
            for (LongWritable value : values) {
                sum += value.get();
            }
            context.write(key, new LongWritable(sum));
        }
    }

    /**
     * Job driver: wires up mapper, combiner, reducer, and I/O paths, then
     * submits the job and blocks until it finishes.
     *
     * @param args args[0] = HDFS input path, args[1] = HDFS output path
     */
    public static void main(String[] args)
            throws IOException, ClassNotFoundException, InterruptedException {
        // Fail fast with a usage message instead of an
        // ArrayIndexOutOfBoundsException when paths are missing.
        if (args.length < 2) {
            System.err.println("Usage: WordCountApp <input path> <output path>");
            System.exit(2);
        }

        Configuration configuration = new Configuration();

        // MapReduce refuses to run if the output directory already exists,
        // so remove any leftover from a previous run.
        Path outputPath = new Path(args[1]);
        FileSystem fileSystem = FileSystem.get(configuration);
        if (fileSystem.exists(outputPath)) {
            fileSystem.delete(outputPath, true);
            System.out.println("输出目录存在,但是已经被删除了!");
        }

        Job job = Job.getInstance(configuration, "wordcount");
        // Tell Hadoop which jar to ship by naming a class it contains.
        job.setJarByClass(WordCountApp.class);

        FileInputFormat.setInputPaths(job, new Path(args[0]));

        // Mapper and its intermediate key/value types.
        job.setMapperClass(mymapper.class);
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(LongWritable.class);

        // Reducer and final output key/value types.
        job.setReducerClass(myreduce.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(LongWritable.class);

        // Summation is associative and commutative, so the reducer doubles
        // as a combiner to cut shuffle traffic.
        job.setCombinerClass(myreduce.class);

        FileOutputFormat.setOutputPath(job, outputPath);

        // Exit 0 on success, 1 on failure.
        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}
/*
 * 大数据处理思路---java
 * 最新推荐文章于 2024-06-28 08:15:00 发布
 * 该博客介绍了一个使用Hadoop MapReduce实现的WordCount应用程序。Mapper类将文本输入拆分为单词，Reducer类则对单词计数并输出。
 * 程序在主方法中配置并提交Job，处理输入文件并生成结果到指定的输出路径。
 * 摘要由CSDN通过智能技术生成
 */