Hadoop mainly solves the two big-data problems of storage and computation: storage is delegated to HDFS, while computation is handled by MapReduce.
The MapReduce framework:
MapReduce abstracts the complex parallel computation that runs on a large-scale cluster into just two functions: Map and Reduce.
MapReduce follows a "divide and conquer" strategy: a large dataset stored in the distributed file system is divided into many independent splits, which can then be processed in parallel by multiple Map tasks. MapReduce uses a Master/Slave architecture with one Master and several Slaves. The Master runs the JobTracker, which is responsible for job scheduling; each Slave runs a TaskTracker, which executes the actual tasks.
JobTracker
Responsible for job scheduling and resource monitoring.
Monitors the health of jobs and TaskTrackers; if one fails, the affected tasks are migrated to other nodes.
Tracks task progress and reports it to the scheduler, which assigns suitable tasks when resources become free.
TaskTracker
Periodically reports task progress to the JobTracker via "heartbeat" messages and accepts new tasks in return.
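Note: the JobTracker/TaskTracker pair describes the classic MapReduce 1 runtime. In Hadoop 2.x (the version targeted by the dependencies below), their scheduling and resource-management duties were taken over by YARN's ResourceManager, NodeManagers, and a per-job ApplicationMaster, although the Map/Reduce programming model itself is unchanged.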
Hands-on project: Word Count
Requirement: count how many times each word occurs in the file aa.txt.
Contents of aa.txt
hello java
hello china
hello mysql
hello python
Expected output (word,count)
hello,4
java,1
china,1
mysql,1
python,1
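How that output arises: the map phase turns each input line into (word, 1) pairs, for example (hello,1) (java,1) for the first line; the framework's shuffle phase then sorts and groups these pairs by key, so the reduce phase receives entries such as (hello, [1,1,1,1]) and (java, [1]) and simply sums each list of values.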
Maven dependencies
<!-- https://mvnrepository.com/artifact/org.apache.hadoop/hadoop-common -->
<dependency>
    <groupId>org.apache.hadoop</groupId>
    <artifactId>hadoop-common</artifactId>
    <version>2.6.0</version>
</dependency>
<!-- https://mvnrepository.com/artifact/org.apache.hadoop/hadoop-client -->
<dependency>
    <groupId>org.apache.hadoop</groupId>
    <artifactId>hadoop-client</artifactId>
    <version>2.6.0</version>
</dependency>
<!-- https://mvnrepository.com/artifact/org.apache.hadoop/hadoop-hdfs -->
<dependency>
    <groupId>org.apache.hadoop</groupId>
    <artifactId>hadoop-hdfs</artifactId>
    <version>2.6.0</version>
</dependency>
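The version above should match the Hadoop version of the cluster you run against. Also note that hadoop-client is an aggregator artifact that already pulls in hadoop-common, hadoop-hdfs, and the MapReduce client libraries, so on many setups the single hadoop-client dependency is sufficient.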
The Mapper class
package org.example.test;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
import java.io.IOException;
// Map phase
// KEYIN:    the input key type; input is read line by line, and the key is the
//           byte offset of the current line within the file (LongWritable)
// VALUEIN:  the input value type; the content of the current line (Text)
// KEYOUT:   the output key type; here, a single word (Text)
// VALUEOUT: the output value type; here, the count 1 for each word (IntWritable)
public class MyMapper extends Mapper<LongWritable, Text, Text, IntWritable> {
    private final Text k = new Text();
    // Every word is emitted with the constant count 1
    private final IntWritable v = new IntWritable(1);

    @Override
    protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
        // Input lines, e.g.:
        // hello java
        // hello china
        // hello mysql
        // hello python
        // 1. Get one line
        String values = value.toString();
        // 2. Split the line into words
        String[] words = values.split(" ");
        // 3. Emit (word, 1) for every word
        for (String word : words) {
            // e.g. ("hello", 1)
            k.set(word);
            context.write(k, v);
        }
    }
}
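One design detail worth calling out in MyMapper: k and v are fields reused across calls rather than objects allocated inside the loop. map can be invoked once per input line, potentially millions of times per task, so reusing Writable instances keeps garbage-collection overhead down; context.write serializes the current contents immediately, so overwriting them on the next call is safe.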
The Reducer class
package org.example.test;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;
import java.io.IOException;
// KEYIN and VALUEIN must match the Mapper's output types (Text, IntWritable)
// KEYIN:    a word
// VALUEIN:  the per-word counts emitted by the map phase
// KEYOUT:   a word
// VALUEOUT: the total count for that word
public class MyReduce extends Reducer<Text, IntWritable, Text, LongWritable> {
    private final LongWritable v = new LongWritable();

    @Override
    protected void reduce(Text key, Iterable<IntWritable> values, Context context) throws IOException, InterruptedException {
        // After the shuffle, the map output
        //   hello,1  java,1  hello,1  china,1  hello,1  mysql,1  hello,1  python,1
        // arrives grouped by key, e.g. ("hello", [1, 1, 1, 1])
        // 1. Sum the counts for this word
        int sum = 0;
        for (IntWritable value : values) {
            sum += value.get();
        }
        // 2. Write out the result:
        //    hello,4
        //    java,1
        //    china,1
        //    mysql,1
        //    python,1
        v.set(sum);
        context.write(key, v);
    }
}
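Because summing counts is associative and commutative, word count can also use a combiner: a mini-reduce that pre-aggregates (word, 1) pairs on the map side before they cross the network. Below is a minimal sketch, assuming a new class MyCombiner that is not part of the original project; a combiner's input and output types must both match the map output (Text, IntWritable), which is why MyReduce (which emits LongWritable) cannot be reused here.
package org.example.test;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;
import java.io.IOException;
// Hypothetical map-side pre-aggregation for word count
public class MyCombiner extends Reducer<Text, IntWritable, Text, IntWritable> {
    private final IntWritable v = new IntWritable();

    @Override
    protected void reduce(Text key, Iterable<IntWritable> values, Context context) throws IOException, InterruptedException {
        // Partially sum the counts seen on this map task
        int sum = 0;
        for (IntWritable value : values) {
            sum += value.get();
        }
        v.set(sum);
        context.write(key, v);
    }
}
It would be registered in the driver with job.setCombinerClass(MyCombiner.class); before the job is submitted.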
The Driver class
package org.example.test;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import java.io.IOException;

public class MyDriver {
    public static void main(String[] args) throws IOException, ClassNotFoundException, InterruptedException {
        // 1. Get the Job object
        Configuration conf = new Configuration();
        Job job = Job.getInstance(conf);
        // 2. Set the jar to ship with the job, located via reflection from this class
        job.setJarByClass(MyDriver.class);
        // 3. Wire up the Mapper and Reducer
        job.setMapperClass(MyMapper.class);
        job.setReducerClass(MyReduce.class);
        // 4. Declare the key/value types of the map output
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(IntWritable.class);
        // 5. Declare the key/value types of the final (reduce) output;
        //    these must match MyReduce, which emits (Text, LongWritable)
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(LongWritable.class);
        // 6. Set the input and output paths; the output path is created as a
        //    directory by the job and must not exist beforehand
        FileInputFormat.setInputPaths(job, new Path("d:/zhangqi/aa.txt"));
        FileOutputFormat.setOutputPath(job, new Path("d:/zhangqi/bb.txt"));
        // 7. Submit the job
        // job.submit();
        // Passing true makes waitForCompletion print progress to the console;
        // false suppresses it
        job.waitForCompletion(true);
    }
}
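To run the job, either launch MyDriver directly from the IDE or package the project (mvn package) and start it with the hadoop jar command, for example hadoop jar wordcount.jar org.example.test.MyDriver (the jar name here is just an assumption; use whatever your build produces). The result lands in a file named part-r-00000 inside the output directory; note that keys come out sorted and that the default TextOutputFormat separates key and value with a tab rather than the comma used in the specification above, so the file should look like:
china	1
hello	4
java	1
mysql	1
python	1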