Hadoop project in practice:
Data processing flow:
Data processing architecture:
Hands-on WordCount example:
package com.lc.hadoop;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

import java.io.IOException;

/**
 * WordCount implemented with the MapReduce programming model.
 */
public class mapReduceTest {

    // HDFS connection settings (not used below; the job reads its paths from the command-line arguments)
    private Configuration configuration = null;
    private FileSystem fileSystem = null;
    private String sysPath = "hdfs://www.imooc.com:8020";

    // Map phase
    public static class myMapper extends Mapper<LongWritable, Text, Text, LongWritable> {

        private LongWritable longWritable = new LongWritable(1);

        // Called once per input record (one line of text per call)
        @Override
        public void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
            // Convert the line to a String
            String keyString = value.toString();
            // Split the line into words on spaces
            String[] keyStrings = keyString.split(" ");
            // Emit (word, 1) for every word
            for (String word : keyStrings) {
                context.write(new Text(word), longWritable);
            }
        }
    }

    // Reduce phase
    public static class myReduce extends Reducer<Text, LongWritable, Text, LongWritable> {

        // Called once per key, with all of that key's values
        @Override
        public void reduce(Text key, Iterable<LongWritable> values, Context context) throws IOException, InterruptedException {
            // Sum the counts for this word
            long num = 0;
            for (LongWritable longWritable : values) {
                num += longWritable.get();
            }
            context.write(key, new LongWritable(num));
        }
    }

    // Job driver
    public static void main(String[] args) throws Exception {
        Configuration configuration = new Configuration();

        // MapReduce refuses to write to an output path that already exists,
        // so delete it before re-running the same job
        FileSystem fileSystem = FileSystem.get(configuration);
        Path outputPath = new Path(args[1]);
        if (fileSystem.exists(outputPath)) {
            fileSystem.delete(outputPath, true);
            System.out.println("Output path already existed and has been deleted");
        }

        // Create the job
        Job job = Job.getInstance(configuration, "wordCount");
        // Set the class that carries the job's jar
        job.setJarByClass(mapReduceTest.class);
        // Set the job's input path
        FileInputFormat.setInputPaths(job, args[0]);
        // Set the mapper class and its output key/value types
        job.setMapperClass(myMapper.class);
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(LongWritable.class);
        // Set the reducer class and the final output key/value types
        job.setReducerClass(myReduce.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(LongWritable.class);
        // Set a combiner: a local reduce run over each mapper's output
        job.setCombinerClass(myReduce.class);
        // Set the job's output path
        FileOutputFormat.setOutputPath(job, outputPath);

        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}
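To run the job, package the class into a jar and submit it with the input and output HDFS paths as the two arguments. A minimal sketch; the jar name wordcount.jar and the two paths are placeholders, not from the original:

hadoop jar wordcount.jar com.lc.hadoop.mapReduceTest /input/hello.txt /output/wc

As a worked example of the data flow: for an input file containing the lines "hello world" and "hello hadoop", the mapper emits (hello,1), (world,1), (hello,1), (hadoop,1); after the shuffle the reducer receives hello -> [1,1], world -> [1], hadoop -> [1] and writes hadoop 1, hello 2, world 1 to a part-r-00000 file under the output path. Because the combiner is the same reducer class, the per-word sums may already be partially aggregated on the map side before the shuffle.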