准备工作
1、在Linux系统上准备wordcount.txt
vi wordcount.txt
准备点数据进行计算
2、把wordcount.txt上传到hdfs系统上面,hdfs dfs -put /home/hadoop/wordcount.txt /wordcount
,路径自己对应好!
步骤图:
3、编写java代码进行计算
创建一个map类WordCountMapper
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
import java.io.IOException;
/*
 * Mapper generics:
 *   keyin  (k1): LongWritable — byte offset of the line within the input file
 *   valuein (v1): Text        — one line of text
 *   keyout (k2): Text         — a single word
 *   valueout (v2): LongWritable — the constant count 1
 */
public class WordCountMapper extends Mapper<LongWritable, Text, Text, LongWritable> {
    /*
     * Example transformation performed by map() (k1,v1 -> k2,v2):
     *   0   hello,world        hello  1
     *   11  hello,hadoop  -->  world  1
     *                          hello  1
     *                          hadoop 1
     */

    // Reused across map() calls — avoids allocating a new Text per word.
    private final Text word = new Text();
    // The value is always 1 and never mutated, so a single shared instance is safe.
    private static final LongWritable ONE = new LongWritable(1);

    /**
     * Splits each comma-separated input line into words and emits (word, 1)
     * for every token.
     *
     * @param key     byte offset of the line (unused)
     * @param value   one line of input text
     * @param context MapReduce context used to emit (k2, v2) pairs
     */
    @Override
    protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
        // Split the line on commas; matches the tutorial's sample data format.
        String[] tokens = value.toString().split(",");
        for (String token : tokens) {
            word.set(token);
            context.write(word, ONE);
        }
    }
}
再创建一个reducer类WordCountReducer
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;
import java.io.IOException;
/*
 * Reducer generics:
 *   keyin  (k2): Text         — a single word
 *   valuein (v2): LongWritable — element type of the grouped counts, e.g. <1,1>
 *   keyout (k3): Text         — a single word
 *   valueout (v3): LongWritable — total occurrences of that word
 */
public class WordCountReducer extends Reducer<Text, LongWritable, Text, LongWritable> {
    /*
     * Example transformation performed by reduce() (k2,v2 -> k3,v3):
     *   hello  <1,1>         hello  2
     *   word   <1,1,1>  -->  word   3
     *   hadoop <1,1>         hadoop 2
     */

    // Reused across reduce() calls — avoids allocating a new LongWritable per key.
    private final LongWritable result = new LongWritable();

    /**
     * Sums the grouped counts for one word and emits (word, total).
     *
     * @param key     the word (k2)
     * @param values  all counts emitted by the mappers for this word
     * @param context MapReduce context used to emit (k3, v3) pairs
     */
    @Override
    protected void reduce(Text key, Iterable<LongWritable> values, Context context) throws IOException, InterruptedException {
        long sum = 0;
        // Accumulate every partial count for this word.
        for (LongWritable value : values) {
            sum += value.get();
        }
        result.set(sum);
        context.write(key, result);
    }
}
最后创建一个job类JobMain
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;
/**
 * Driver class: configures and submits the WordCount MapReduce job.
 *
 * Usage: hadoop jar wordcount.jar JobMain [inputPath] [outputPath]
 * If no paths are given on the command line, the tutorial defaults are used.
 */
public class JobMain extends Configured implements Tool {
    // Defaults preserved from the original tutorial setup.
    private static final String DEFAULT_INPUT = "hdfs://master:9000/wordcount";
    private static final String DEFAULT_OUTPUT = "hdfs://master:9000/out";

    /**
     * Builds and runs the job.
     *
     * @param args optional [0] input path, [1] output path; ToolRunner has
     *             already stripped generic Hadoop options
     * @return 0 on success, 1 on failure
     */
    @Override
    public int run(String[] args) throws Exception {
        // Allow paths to be overridden from the command line; fall back to the
        // original hard-coded locations for backward compatibility.
        String inputPath = args.length > 0 ? args[0] : DEFAULT_INPUT;
        String outputPath = args.length > 1 ? args[1] : DEFAULT_OUTPUT;

        // super.getConf() returns the Configuration injected by ToolRunner.
        Job job = Job.getInstance(super.getConf(), "wordcount");
        // Required when the jar is submitted to a cluster so Hadoop can locate it.
        job.setJarByClass(JobMain.class);

        // Step 1: input format (how to read) and input path (where to read).
        job.setInputFormatClass(TextInputFormat.class);
        TextInputFormat.addInputPath(job, new Path(inputPath));

        // Step 2: mapper class and its output types (k2, v2).
        job.setMapperClass(WordCountMapper.class);
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(LongWritable.class);

        // Steps 3-6 (partition, sort, combine, group) use the framework defaults.

        // Step 7: reducer class and the final output types (k3, v3).
        job.setReducerClass(WordCountReducer.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(LongWritable.class);

        // Step 8: output format and output path. NOTE(review): the job fails if
        // the output directory already exists — delete it first or pass a new path.
        job.setOutputFormatClass(TextOutputFormat.class);
        TextOutputFormat.setOutputPath(job, new Path(outputPath));

        // Block until completion; 0 = success, 1 = failure.
        return job.waitForCompletion(true) ? 0 : 1;
    }

    public static void main(String[] args) throws Exception {
        // ToolRunner parses generic options (-D, -conf, ...) into the Configuration
        // and passes the remaining args to run().
        int exitCode = ToolRunner.run(new Configuration(), new JobMain(), args);
        System.exit(exitCode);
    }
}
可以在java中运行也可以编译到Linux系统上运行!