The core idea of MapReduce
Split a large dataset into many small blocks and process those blocks in parallel. The processing is divided into two phases: a Map phase and a Reduce phase.
Map phase
The input is divided into key-value pairs; each pair is processed by a user-defined Map function, which emits intermediate results.
Reduce phase
The intermediate results are grouped by key, and all values sharing the same key are passed to a user-defined Reduce function, which produces the final output.
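A minimal word-count sketch makes the two phases concrete (the class and variable names here are illustrative, not from the original example):

```java
import java.io.IOException;

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;

// Map phase: split each input line into words and emit (word, 1).
class WordCountMapper extends Mapper<LongWritable, Text, Text, IntWritable> {
    private static final IntWritable ONE = new IntWritable(1);
    private final Text word = new Text();

    @Override
    protected void map(LongWritable key, Text value, Context context)
            throws IOException, InterruptedException {
        for (String token : value.toString().split("\\s+")) {
            word.set(token);
            context.write(word, ONE);
        }
    }
}

// Reduce phase: all counts for the same word arrive together; sum them.
class WordCountReducer extends Reducer<Text, IntWritable, Text, IntWritable> {
    @Override
    protected void reduce(Text key, Iterable<IntWritable> values, Context context)
            throws IOException, InterruptedException {
        int sum = 0;
        for (IntWritable v : values) {
            sum += v.get();
        }
        context.write(key, new IntWritable(sum));
    }
}
```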
Basic data types
| Hadoop type | Java type |
| --- | --- |
| Text | String |
| IntWritable | int |
| LongWritable | long |
| ByteWritable | byte |
| BytesWritable | byte[] |
| DoubleWritable | double |
| FloatWritable | float |
| NullWritable | null |
| ShortWritable | short |
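Each Writable wraps its Java counterpart and is constructed and unwrapped explicitly; a quick sketch:

```java
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;

public class WritableDemo {
    public static void main(String[] args) {
        // Wrap plain Java values in their Hadoop counterparts.
        Text name = new Text("u_001");
        IntWritable age = new IntWritable(18);

        // Unwrap them back to plain Java types.
        String nameValue = name.toString();
        int ageValue = age.get();

        // NullWritable is a singleton placeholder for "no value".
        NullWritable nothing = NullWritable.get();

        System.out.println(nameValue + ", " + ageValue + ", " + nothing);
    }
}
```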
Custom data types
A custom type implements the Writable interface and overrides its write() and readFields() methods.
Note: write() serializes the object when the map side emits it, and readFields() deserializes it when the reduce side reads it back, so the fields must be read in exactly the same order they were written.
```java
import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;

import org.apache.hadoop.io.Writable;

public class CombineBean implements Writable {
    private String orderId;
    private String orderCreateDate;
    private String userId;
    private String sex;
    private int age;
    private String phone;
    private int type; // 1 marks a user record, 0 an order record

    // Serialize: called when the bean is written out.
    @Override
    public void write(DataOutput dataOutput) throws IOException {
        dataOutput.writeUTF(this.orderId);
        dataOutput.writeUTF(this.orderCreateDate);
        dataOutput.writeUTF(this.userId);
        dataOutput.writeUTF(this.sex);
        dataOutput.writeInt(this.age);
        dataOutput.writeUTF(this.phone);
        dataOutput.writeInt(this.type);
    }

    // Deserialize: fields must be read in the same order they were written.
    @Override
    public void readFields(DataInput dataInput) throws IOException {
        this.orderId = dataInput.readUTF();
        this.orderCreateDate = dataInput.readUTF();
        this.userId = dataInput.readUTF();
        this.sex = dataInput.readUTF();
        this.age = dataInput.readInt();
        this.phone = dataInput.readUTF();
        this.type = dataInput.readInt();
    }
}
```
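A mismatch between write() and readFields() silently corrupts every field after the first misread, so a cheap sanity check is to round-trip the bean through an in-memory byte stream, which mirrors what Hadoop does with the serialized bytes. A minimal generic helper sketch (WritableRoundTrip is illustrative, not part of the original code):

```java
import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.DataInputStream;
import java.io.DataOutputStream;
import java.io.IOException;

import org.apache.hadoop.io.Writable;

public class WritableRoundTrip {
    // Serialize a Writable to bytes and deserialize into a fresh instance,
    // just as Hadoop does between emitting and reading a record. If the
    // read order diverges from the write order, the returned bean's fields
    // will not match the original.
    public static <T extends Writable> T roundTrip(T in, T out) throws IOException {
        ByteArrayOutputStream bytes = new ByteArrayOutputStream();
        in.write(new DataOutputStream(bytes));
        out.readFields(new DataInputStream(new ByteArrayInputStream(bytes.toByteArray())));
        return out;
    }
}
```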
MapReduce example: map-side join
This example does all of its work in the mapper (a map-side join); no reducer is used.
Mapper
```java
import java.io.IOException;
import java.nio.charset.StandardCharsets;
import java.util.HashMap;
import java.util.List;
import java.util.Map;

import org.apache.commons.io.IOUtils;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.lib.input.FileSplit;

public class MapJoinMapper extends Mapper<LongWritable, Text, CombineBean, CombineBean> {

    private Map<String, CombineBean> userCache = null;
    private String fileName;

    // Load the small dataset (user.data) into an in-memory map before any
    // map() calls: step 1 -> FileSystem, step 2 -> Path.
    @Override
    protected void setup(Context context) throws IOException, InterruptedException {
        // Get the split being processed and the name of its file.
        FileSplit inputSplit = (FileSplit) context.getInputSplit();
        Path path = inputSplit.getPath();
        fileName = path.getName();

        // Only mappers reading an order file need the user cache.
        if (!fileName.contains("order")) {
            return;
        }

        // Load user.data into the cache, keyed by userId.
        FileSystem fs = FileSystem.get(context.getConfiguration());
        Path userInputPath = new Path("D:/ideaProjects/hadoop_pro/mapJoin/input/user.data");
        FSDataInputStream in = fs.open(userInputPath);
        List<String> lines = IOUtils.readLines(in, StandardCharsets.UTF_8);
        in.close();

        userCache = new HashMap<String, CombineBean>();
        for (String line : lines) {
            String[] split = line.split(",");
            // user.data: userId,sex,age,phone -> type 1 marks a user record
            userCache.put(split[0],
                    new CombineBean(split[0], split[1], Integer.parseInt(split[2]), split[3], 1));
        }
    }

    @Override
    protected void map(LongWritable key, Text value, Context context)
            throws IOException, InterruptedException {
        if (fileName.contains("order")) {
            String[] split = value.toString().split(",");
            String userId = split[2];
            // order.data: orderId,orderCreateDate,userId -> type 0 marks an order record
            CombineBean order = new CombineBean(split[0], split[1], split[2], 0);
            // Join: emit (user, order) when the order's userId hits the cache.
            if (userCache.containsKey(userId)) {
                context.write(userCache.get(userId), order);
            }
        }
    }

    @Override // nothing to release here; kept for symmetry with setup()
    protected void cleanup(Context context) throws IOException, InterruptedException {
        super.cleanup(context);
    }
}
```
Launch
```java
import java.io.IOException;

import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.log4j.BasicConfigurator;

public class MapJoinLaunch {
    public static void main(String[] args)
            throws IOException, InterruptedException, ClassNotFoundException {
        // Initialize log4j console logging.
        BasicConfigurator.configure();

        // Create and name the job, and set the jar entry class.
        Job job = Job.getInstance();
        job.setJobName("mapJoin");
        job.setJarByClass(MapJoinLaunch.class);

        // Map-only job: configure the mapper and its output types. The join
        // emits CombineBean keys and values, so both output classes must be
        // CombineBean; no reducer is configured.
        job.setMapperClass(MapJoinMapper.class);
        job.setOutputKeyClass(CombineBean.class);
        job.setOutputValueClass(CombineBean.class);

        // Delete the output directory if it already exists.
        FileSystem fs = FileSystem.get(job.getConfiguration());
        Path out = new Path("D:/ideaProjects/hadoop_pro/mapJoin/output");
        if (fs.exists(out)) {
            fs.delete(out, true);
        }

        // Set the input and output paths.
        FileInputFormat.addInputPath(job, new Path("D:/ideaProjects/hadoop_pro/mapJoin/input"));
        FileOutputFormat.setOutputPath(job, out);

        // No reduce phase: map output is written directly.
        job.setNumReduceTasks(0);

        // Submit the job and wait for it to finish.
        job.waitForCompletion(true);
    }
}
```
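One design note: the hard-coded local path in setup() only works when the job runs in local mode. On a real cluster, the usual approach is to ship the small file with the job via the distributed cache. A sketch of the two fragments involved (the HDFS path is hypothetical and error handling is elided; this is not part of the original example):

```java
// In MapJoinLaunch.main(), after creating the job: ship the small file
// with the job (hypothetical HDFS path; new URI(...) can throw
// URISyntaxException, which main would need to declare or handle).
job.addCacheFile(new URI("hdfs:///data/mapJoin/user.data"));

// In MapJoinMapper.setup(): read the cached copy instead of a fixed local path.
URI[] cacheFiles = context.getCacheFiles();
FileSystem fs = FileSystem.get(context.getConfiguration());
FSDataInputStream in = fs.open(new Path(cacheFiles[0].getPath()));
```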
Input
```
order.data
o_001,2023-11-11,u_001
o_002,2023-11-11,u_002

user.data
u_001,女,18,18629xxxxxx
u_002,男,18,18629370xxx
```
Output
```
CombineBean{orderId='', orderCreateDate='', userId='u_001', sex='女', age=18, phone='18629xxxxxx', type=1}	CombineBean{orderId='o_001', orderCreateDate='2023-11-11', userId='u_001', sex='', age=0, phone='', type=0}
CombineBean{orderId='', orderCreateDate='', userId='u_002', sex='男', age=18, phone='18629370xxx', type=1}	CombineBean{orderId='o_002', orderCreateDate='2023-11-11', userId='u_002', sex='', age=0, phone='', type=0}
```