Big Data (Hadoop MapReduce Case Walkthrough)

package com.vip;

import java.io.IOException;
import java.util.StringTokenizer;

import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;


public class MapReduceCaseAvg extends Configured implements Tool{

	public static class AvgMapper extends Mapper<Object, Text, Text, IntWritable>{
		@Override
		protected void map(Object key, Text value, Context context)
				throws IOException, InterruptedException {
			// Read the line of input
			String content = value.toString() ;
			// Tokenize the line on whitespace
			StringTokenizer st = new StringTokenizer(content) ;
			while(st.hasMoreTokens()){
				String strName = st.nextToken() ;	// student name
				String strScore = st.nextToken() ;	// student score
				// Emit <name, score>
				context.write(new Text(strName),  new IntWritable(Integer.parseInt(strScore)));
			}
		}
	}
	
	public static class AvgReducer extends Reducer<Text, IntWritable, Text, IntWritable>{
		// The reducer receives each name with all of that student's scores, e.g. <Zhangsan, {98, 89, 79}>
		@Override
		protected void reduce(Text key, Iterable<IntWritable> values,
				Context context) throws IOException, InterruptedException {
			// The average is the sum of all scores divided by the number of subjects
			int sum = 0 ;	// total score
			int num = 0 ;	// number of subjects
			for (IntWritable score : values) {
				sum += score.get() ;	// accumulate each course's score
				num ++ ;
			}
			context.write(key,  new IntWritable(sum / num));
		}
	}
	
	
	@Override
	public int run(String[] args) throws Exception {
		// Create the job from the configuration
		Job job = Job.getInstance(getConf(), "avg mr") ;
		job.setJarByClass(MapReduceCaseAvg.class);
		
		/* Set the Mapper and Reducer classes */
		job.setMapperClass(AvgMapper.class);
		job.setReducerClass(AvgReducer.class);
		
		/* Set the output key and value types */
		job.setOutputKeyClass(Text.class);
		job.setOutputValueClass(IntWritable.class);
		
		/* Set the input and output paths */
		FileInputFormat.addInputPath(job, new Path(args[0]));
		FileOutputFormat.setOutputPath(job, new Path(args[1]));
		
		/* Submit the job to the cluster and wait for it to finish */
		boolean isSuccess = job.waitForCompletion(true);
		
		return isSuccess ? 0 : 1 ;
	}
	
	public static void main(String[] args) throws Exception {
		int res = ToolRunner.run(new MapReduceCaseAvg(), args) ;
		System.exit(res);
	}
}
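For reference, here is a minimal sketch of how this averaging job might be run. The jar name, HDFS paths and file contents below are assumptions for illustration only; the original post does not specify them. Each input line is expected to hold a student name followed by one score.

Hypothetical input (one name/score pair per line):
    zhangsan 98
    zhangsan 89
    lisi 79
    lisi 85

Submit the packaged jar (paths are placeholders):
    hadoop jar mr-cases.jar com.vip.MapReduceCaseAvg /input/scores /output/avg

Expected output (integer average per student, tab-separated):
    lisi    82
    zhangsan    93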

 

package com.vip;

import java.io.IOException;

import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;

public class MapReduceCaseFilte extends Configured implements Tool {
	
	public static class FilterMapper extends Mapper<Object, Text, NullWritable, Text>{
		@Override
		protected void map(Object key, Text value, Context context)
				throws IOException, InterruptedException {
			// Split the line on spaces
			String[] strSrc = value.toString().split(" ");
			// Keep only fields 0, 1, 2 and 6 and join them back with spaces
			String strDst = strSrc[0] + " " + strSrc[1] + " " + strSrc[2] + " " + strSrc[6] ;
			context.write(NullWritable.get(), new Text(strDst));
		}
	}
	
	
	@Override
	public int run(String[] args) throws Exception {
		Job job = Job.getInstance(getConf(), "mrfilter") ;
		job.setJarByClass(MapReduceCaseFilte.class);
		
		/* Set the Mapper class; this is a map-only job, so disable the reduce phase */
		job.setMapperClass(FilterMapper.class);
		job.setNumReduceTasks(0);
		
		/* Set the output key and value types */
		job.setOutputKeyClass(NullWritable.class);
		job.setOutputValueClass(Text.class);
		
		/* Set the input and output paths */
		FileInputFormat.addInputPath(job, new Path(args[0]));
		FileOutputFormat.setOutputPath(job, new Path(args[1]));
		
		/* Submit the job to the cluster and wait for it to finish */
		boolean isSuccess = job.waitForCompletion(true);
		
		return isSuccess ? 0 : 1 ;
	}

	public static void main(String[] args) throws Exception {
		int res = ToolRunner.run(new MapReduceCaseFilte(), args) ;
		System.exit(res);
	}
}
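A sketch of the data this filter expects, under the assumption that the input consists of space-delimited records with at least seven fields (for example, access-log style lines); the sample values and paths below are illustrative, not from the original post.

Hypothetical input line:
    192.168.1.10 user1 2019-06-01 GET /index.html 200 1024

The mapper keeps fields 0, 1, 2 and 6, so the emitted line would be:
    192.168.1.10 user1 2019-06-01 1024

Submission follows the same pattern as the other jobs (paths are placeholders):
    hadoop jar mr-cases.jar com.vip.MapReduceCaseFilte /input/logs /output/filtered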

// Notes for the next case (grouping anagrams):
//   cat -> key "act"
//   tar -> key "art"
// Sorting each word's letters produces the key, so the reducer receives
// groups such as <act, {cat, tac, cta}>.
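As a standalone illustration of this idea (not part of the original post), the snippet below sorts a word's characters to form the shared key of its anagram group, which is exactly the trick used by WordsMapper further down.

import java.util.Arrays;

public class AnagramKeyDemo {
	public static void main(String[] args) {
		for (String word : new String[]{"cat", "tac", "cta", "tar"}) {
			char[] chars = word.toCharArray();
			Arrays.sort(chars);	// sort the letters
			System.out.println(word + " -> " + new String(chars));	// e.g. cat -> act
		}
	}
}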





 

package com.vip;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;


public class MapReduceCaseWords extends Configured implements Tool{

	
	@Override
	public int run(String[] args) throws Exception {
		Configuration conf = new Configuration() ;
		// Delete the output directory if it already exists
		Path mypath = new Path(args[1]) ;
		FileSystem hdfs = mypath.getFileSystem(conf);
		if(hdfs.isDirectory(mypath)){
			hdfs.delete(mypath, true) ;
		}
		
		// Set up the job
		Job job = Job.getInstance(conf, "words mr") ;
		job.setJarByClass(MapReduceCaseWords.class);
		
		/* Set the Mapper and Reducer classes */
		job.setMapperClass(WordsMapper.class);
		
		job.setReducerClass(WordsReducer.class);
		
		/* Set the output key and value types */
		job.setOutputKeyClass(Text.class);
		job.setOutputValueClass(Text.class);
		
		/* Set the input and output paths */
		FileInputFormat.addInputPath(job, new Path(args[0]));
		FileOutputFormat.setOutputPath(job, new Path(args[1]));
		
		/* Submit the job to the cluster and wait for it to finish */
		boolean isSuccess = job.waitForCompletion(true);
		
		return isSuccess ? 0 : 1 ;
	}
	
	public static void main(String[] args) throws Exception {
		// Input/output paths are hard-coded for this example; command-line args are ignored
		String[] args0 = {"hdfs://192.168.153.111:9000/input5",
				"hdfs://192.168.153.111:9000/output12"} ;
		int res = ToolRunner.run(new MapReduceCaseWords(), args0) ;
		System.exit(res);
	}

}

 

package com.vip;

import java.io.IOException;
import java.util.Arrays;

import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

public class WordsMapper extends Mapper<Object, Text, Text, Text>{
	private Text keyText = new Text() ;
	private Text valueText = new Text() ;
	
	@Override
	protected void map(Object key, Text value, Context context)
			throws IOException, InterruptedException {
		String word = value.toString() ;
		char[] wordChars = word.toCharArray();	// convert the word to a char array
		Arrays.sort(wordChars); 				// sort the characters
		String sword = new String(wordChars) ;	// turn the sorted chars back into a string
		keyText.set(sword);              		// the sorted letters become the output key
		valueText.set(word);  					// the original word is the output value
		context.write(keyText, valueText);		// emit from the map
	}
}

 

package com.vip;

import java.io.IOException;
import java.util.StringTokenizer;

import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;

public class WordsReducer extends Reducer<Text, Text, Text, Text>{
	private Text outputKey = new Text() ;	// output key
	private Text outputValue = new Text() ;	// output value
	
	@Override
	protected void reduce(Text key, Iterable<Text> values, Context context)
			throws IOException, InterruptedException {
		String output = "" ;
		// Join all words made of the same letters, separated by "~"
		for (Text word : values) {
			if(!output.equals("")){
				output = output + "~" ;
			}
			output = output + word.toString() ;
		}
		// Only emit groups that contain at least two words
		StringTokenizer outputTokenize = new StringTokenizer(output, "~") ;
		if(outputTokenize.countTokens() >= 2){
			output = output.replaceAll("~", ",") ;
			outputKey.set(key.toString()); 			// set the output key
			outputValue.set(output);				// set the output value
			context.write(outputKey, outputValue);	// emit the result
		}
	}
	
	
}
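To make the end-to-end behaviour concrete, here is a hedged sketch of a run of MapReduceCaseWords, assuming the hard-coded HDFS input directory contains one word per line; the sample words are illustrative only.

Hypothetical contents of the input directory (one word per line):
    cat
    tac
    cta
    tar
    dog

Only keys shared by two or more words survive the reducer, so the output would be:
    act    cat,tac,cta

(The order of words within a group is not guaranteed.) "tar" and "dog" are dropped because their sorted-letter keys each match only a single word.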

 

 

Reposted from: https://my.oschina.net/u/3728166/blog/3056219
