MapReduce: Reading Text and Sorting in Descending Order

Table of Contents

1. Import the hadoop-client dependency with Maven
2. core-site.xml configuration
3. log4j.properties configuration
4. Top5.java (main code)
5. Test data
6. Results


1. Import the hadoop-client dependency with Maven

Add the following dependency to the project's pom.xml:

		<dependency>
			<groupId>org.apache.hadoop</groupId>
			<artifactId>hadoop-client</artifactId>
			<version>2.7.3</version>
		</dependency>

2. core-site.xml configuration

Place this file on the classpath (typically src/main/resources in a Maven project) so the job picks it up automatically:

<?xml version="1.0" encoding="UTF-8"?>
<?xml-stylesheet type="text/xsl" href="configuration.xsl"?>
<configuration xmlns:xi="http://www.w3.org/2001/XInclude">
	<!--
	<property>
		<name>fs.defaultFS</name>
		<value>hdfs://localhost:8020</value>
		<description>Point the job at HDFS; if this property is left unset, the local (Windows) filesystem is used by default</description>
	</property>
	-->
	<property>
		<name>fs.defaultFS</name>
		<value>file:///</value>
		<description>Use the local Windows filesystem</description>
	</property>

</configuration>
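
As a quick sanity check (not part of the original post), the effective default filesystem can be printed from a small throwaway class; this assumes core-site.xml is on the classpath, e.g. under src/main/resources, and the class name is made up for illustration.

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;

// Hypothetical helper: new Configuration() loads core-site.xml from the classpath,
// so with the setting above this should print file:/// and a LocalFileSystem URI.
public class CheckDefaultFs {
	public static void main(String[] args) throws Exception {
		Configuration conf = new Configuration();
		System.out.println("fs.defaultFS = " + conf.get("fs.defaultFS"));
		FileSystem fs = FileSystem.get(conf);
		System.out.println("FileSystem URI = " + fs.getUri());
	}
}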

3. log4j.properties configuration

This file also goes on the classpath next to core-site.xml:

hadoop.root.logger=INFO,console
hadoop.log.dir=.
hadoop.log.file=hadoop.log

log4j.threshold=ALL

log4j.appender.NullAppender=org.apache.log4j.varia.NullAppender

hadoop.log.maxfilesize=256MB
hadoop.log.maxbackupindex=20
log4j.appender.RFA=org.apache.log4j.RollingFileAppender
log4j.appender.RFA.File=${hadoop.log.dir}/${hadoop.log.file}

log4j.appender.RFA.MaxFileSize=${hadoop.log.maxfilesize}
log4j.appender.RFA.MaxBackupIndex=${hadoop.log.maxbackupindex}

log4j.appender.RFA.layout=org.apache.log4j.PatternLayout
log4j.appender.RFA.layout.ConversionPattern=%d{ISO8601} %p %c: %m%n

log4j.appender.DRFA=org.apache.log4j.DailyRollingFileAppender
log4j.appender.DRFA.File=${hadoop.log.dir}/${hadoop.log.file}

log4j.appender.DRFA.DatePattern=.yyyy-MM-dd

log4j.appender.DRFA.layout=org.apache.log4j.PatternLayout

log4j.appender.DRFA.layout.ConversionPattern=%d{ISO8601} %p %c: %m%n

log4j.logger.org.apache.hadoop.conf.Configuration=ERROR

log4j.appender.console=org.apache.log4j.ConsoleAppender
log4j.appender.console.target=System.err
log4j.appender.console.layout=org.apache.log4j.PatternLayout
log4j.appender.console.layout.ConversionPattern=%d{yy/MM/dd HH:mm:ss} %p %c{2}: %m%n

#Default values
hadoop.tasklog.taskid=null
hadoop.tasklog.iscleanup=false
hadoop.tasklog.noKeepSplits=4
hadoop.tasklog.totalLogFileSize=100
hadoop.tasklog.purgeLogSplits=true
hadoop.tasklog.logsRetainHours=12

log4j.appender.TLA=org.apache.hadoop.mapred.TaskLogAppender
log4j.appender.TLA.taskId=${hadoop.tasklog.taskid}
log4j.appender.TLA.isCleanup=${hadoop.tasklog.iscleanup}
log4j.appender.TLA.totalLogFileSize=${hadoop.tasklog.totalLogFileSize}

log4j.appender.TLA.layout=org.apache.log4j.PatternLayout
log4j.appender.TLA.layout.ConversionPattern=%d{ISO8601} %p %c: %m%n

hadoop.security.logger=INFO,NullAppender
hadoop.security.log.maxfilesize=256MB
hadoop.security.log.maxbackupindex=20
log4j.category.SecurityLogger=${hadoop.security.logger}
hadoop.security.log.file=SecurityAuth-${user.name}.audit
log4j.appender.RFAS=org.apache.log4j.RollingFileAppender 
log4j.appender.RFAS.File=${hadoop.log.dir}/${hadoop.security.log.file}
log4j.appender.RFAS.layout=org.apache.log4j.PatternLayout
log4j.appender.RFAS.layout.ConversionPattern=%d{ISO8601} %p %c: %m%n
log4j.appender.RFAS.MaxFileSize=${hadoop.security.log.maxfilesize}
log4j.appender.RFAS.MaxBackupIndex=${hadoop.security.log.maxbackupindex}

log4j.appender.DRFAS=org.apache.log4j.DailyRollingFileAppender 
log4j.appender.DRFAS.File=${hadoop.log.dir}/${hadoop.security.log.file}
log4j.appender.DRFAS.layout=org.apache.log4j.PatternLayout
log4j.appender.DRFAS.layout.ConversionPattern=%d{ISO8601} %p %c: %m%n
log4j.appender.DRFAS.DatePattern=.yyyy-MM-dd

hdfs.audit.logger=INFO,NullAppender
hdfs.audit.log.maxfilesize=256MB
hdfs.audit.log.maxbackupindex=20
log4j.logger.org.apache.hadoop.hdfs.server.namenode.FSNamesystem.audit=${hdfs.audit.logger}
log4j.additivity.org.apache.hadoop.hdfs.server.namenode.FSNamesystem.audit=false
log4j.appender.RFAAUDIT=org.apache.log4j.RollingFileAppender
log4j.appender.RFAAUDIT.File=${hadoop.log.dir}/hdfs-audit.log
log4j.appender.RFAAUDIT.layout=org.apache.log4j.PatternLayout
log4j.appender.RFAAUDIT.layout.ConversionPattern=%d{ISO8601} %p %c{2}: %m%n
log4j.appender.RFAAUDIT.MaxFileSize=${hdfs.audit.log.maxfilesize}
log4j.appender.RFAAUDIT.MaxBackupIndex=${hdfs.audit.log.maxbackupindex}

mapred.audit.logger=INFO,NullAppender
mapred.audit.log.maxfilesize=256MB
mapred.audit.log.maxbackupindex=20
log4j.logger.org.apache.hadoop.mapred.AuditLogger=${mapred.audit.logger}
log4j.additivity.org.apache.hadoop.mapred.AuditLogger=false
log4j.appender.MRAUDIT=org.apache.log4j.RollingFileAppender
log4j.appender.MRAUDIT.File=${hadoop.log.dir}/mapred-audit.log
log4j.appender.MRAUDIT.layout=org.apache.log4j.PatternLayout
log4j.appender.MRAUDIT.layout.ConversionPattern=%d{ISO8601} %p %c{2}: %m%n
log4j.appender.MRAUDIT.MaxFileSize=${mapred.audit.log.maxfilesize}
log4j.appender.MRAUDIT.MaxBackupIndex=${mapred.audit.log.maxbackupindex}

hadoop.mapreduce.jobsummary.logger=${hadoop.root.logger}
hadoop.mapreduce.jobsummary.log.file=hadoop-mapreduce.jobsummary.log
hadoop.mapreduce.jobsummary.log.maxfilesize=256MB
hadoop.mapreduce.jobsummary.log.maxbackupindex=20
log4j.appender.JSA=org.apache.log4j.RollingFileAppender
log4j.appender.JSA.File=${hadoop.log.dir}/${hadoop.mapreduce.jobsummary.log.file}
log4j.appender.JSA.MaxFileSize=${hadoop.mapreduce.jobsummary.log.maxfilesize}
log4j.appender.JSA.MaxBackupIndex=${hadoop.mapreduce.jobsummary.log.maxbackupindex}
log4j.appender.JSA.layout=org.apache.log4j.PatternLayout
log4j.appender.JSA.layout.ConversionPattern=%d{yy/MM/dd HH:mm:ss} %p %c{2}: %m%n
log4j.logger.org.apache.hadoop.mapred.JobInProgress$JobSummary=${hadoop.mapreduce.jobsummary.logger}
log4j.additivity.org.apache.hadoop.mapred.JobInProgress$JobSummary=false

yarn.server.resourcemanager.appsummary.logger=${hadoop.root.logger}
yarn.server.resourcemanager.appsummary.log.file=rm-appsummary.log
log4j.logger.org.apache.hadoop.yarn.server.resourcemanager.RMAppManager$ApplicationSummary=${yarn.server.resourcemanager.appsummary.logger}
log4j.additivity.org.apache.hadoop.yarn.server.resourcemanager.RMAppManager$ApplicationSummary=false
log4j.appender.RMSUMMARY=org.apache.log4j.RollingFileAppender
log4j.appender.RMSUMMARY.File=${hadoop.log.dir}/${yarn.server.resourcemanager.appsummary.log.file}
log4j.appender.RMSUMMARY.MaxFileSize=256MB
log4j.appender.RMSUMMARY.MaxBackupIndex=20
log4j.appender.RMSUMMARY.layout=org.apache.log4j.PatternLayout
log4j.appender.RMSUMMARY.layout.ConversionPattern=%d{ISO8601} %p %c{2}: %m%n

4. Top5.java (main code)

package com.gxwz.mapreduce;

import java.io.IOException;
import java.util.Arrays;
import java.util.Collections;
import java.util.Comparator;
import java.util.HashMap;
import java.util.LinkedList;
import java.util.List;
import java.util.Map;
import java.util.Map.Entry;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;

/**
 * MapReduce: read a text file and sort the word counts in descending order
 * @author   com
 * @date     2019-09-28
 */
public class Top5 extends Configured implements Tool {

	public static class MyMapper extends Mapper<LongWritable, Text, Text, IntWritable> {
		
		Text outkey = new Text();
		IntWritable outval = new IntWritable(1);
		String [] line = null;
		@Override
		protected void map(LongWritable key, Text value, Mapper<LongWritable, Text, Text, IntWritable>.Context context)
				throws IOException, InterruptedException {
			// Split the line on tabs; the length check on Arrays.toString(line) skips
			// blank lines, whose only token is the empty string
			line = value.toString().split("\t");
			if(null != line && line.length > 0 && Arrays.toString(line).length() > 2) {
				for (String s : line) {
					outkey.set(s);
					context.write(outkey, outval);
				}
			}
		}
	}
	
	public static class MyReduce extends Reducer<Text, IntWritable, Text, LongWritable> {
		
		Text outkey = new Text();
		LongWritable outval = new LongWritable();
		Integer sum = 0;	// running count for the current key; small autoboxed values come from the runtime's cache, whereas new Integer()/new Long() always allocates a separate heap object
		Map<String, Long> map = new HashMap<String, Long>();
		@Override
		protected void reduce(Text key, Iterable<IntWritable> values,
				Reducer<Text, IntWritable, Text, LongWritable>.Context context) throws IOException, InterruptedException {
			sum = 0;
			for (IntWritable value : values) {
				sum += value.get();
			}
			map.put(key.toString(), (long)sum);
		}
		
		@Override
		protected void cleanup(Reducer<Text, IntWritable, Text, LongWritable>.Context context)
				throws IOException, InterruptedException {
			List<Map.Entry<String, Long>> list = new LinkedList<Map.Entry<String,Long>>(map.entrySet());
			// Sort the (word, count) entries by count in descending order;
			// compare the Long values directly instead of subtracting, which could overflow
			Collections.sort(list, new Comparator<Map.Entry<String,Long>>() {
				@Override
				public int compare(Entry<String, Long> o1, Entry<String, Long> o2) {
					return o2.getValue().compareTo(o1.getValue());
				}
			});
			for (Entry<String, Long> entry : list) {
				System.out.println(entry.getKey()+":"+entry.getValue());
				outkey.set(entry.getKey());
				outval.set(entry.getValue());
				context.write(outkey, outval);
			}
		}
	}
	
	@Override
	public int run(String[] args) throws Exception {
		
		// 1. Get the configuration
		Configuration conf = this.getConf();
		// 2. Get the FileSystem for this configuration
		FileSystem fs = FileSystem.get(conf);
		// 3. Define the job's input and output paths
		Path inpath = new Path(args[0]);
		Path outpath = new Path(args[1]);
		// 4. If the output path already exists, delete it so the job can be rerun
		if(fs.exists(outpath)) {
			fs.delete(outpath, true);
			System.out.println("The old path has been deleted!");
		}
		// 5. Get a Job instance, passing in the Configuration so core-site.xml and ToolRunner options take effect
		Job job = Job.getInstance(conf);
		// 6. Set the jar class so Hadoop can locate this job's classes
		job.setJarByClass(Top5.class);
		// 7. Set the Mapper and Reducer classes
		job.setMapperClass(MyMapper.class);
		job.setReducerClass(MyReduce.class);
		// 8. Set the job's input and output formats
		job.setInputFormatClass(TextInputFormat.class);
		job.setOutputFormatClass(TextOutputFormat.class);
		// 9. The Mapper's output types differ from the Reducer's, so the map output key/value classes must be set explicitly
		job.setMapOutputKeyClass(Text.class);
		job.setMapOutputValueClass(IntWritable.class);
		// Also declare the final (reducer) output types, matching MyReduce<Text, IntWritable, Text, LongWritable>
		job.setOutputKeyClass(Text.class);
		job.setOutputValueClass(LongWritable.class);
		// 10. Set the job's input and output paths
		FileInputFormat.addInputPath(job, inpath);
		FileOutputFormat.setOutputPath(job, outpath);
		// 11. Submit the job and wait for completion
		int result = job.waitForCompletion(true) ? 0 : 1;
		return result;
	}
	
	// Example arguments: C:\Users\com\Desktop\mr\top10\ C:\Users\com\Desktop\mr\top10\output\
	public static void main(String[] args) {
		String [] path = new String[2];
		path[0] = "C:\\Users\\com\\Desktop\\mr\\top10";			// input path
		path[1] = "C:\\Users\\com\\Desktop\\mr\\top10\\output"; // output path
		try {
			int result = ToolRunner.run(new Top5(), path);
			String msg = result==0 ? "job finished!" : "job failed!";
			System.out.println(msg);
			System.exit(result);
		} catch (Exception e) {
			e.printStackTrace();
		}
	}

}
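
The descending order is produced entirely in cleanup(): the reducer accumulates every (word, total) pair in an in-memory HashMap, then copies the entries into a list, sorts them by value, and writes them out. This keeps all distinct words in a single reducer's memory, so it only suits small vocabularies. The stand-alone sketch below (class name and sample data invented for illustration) shows the same sort-by-value step outside Hadoop, using compareTo rather than subtraction so large counts cannot overflow.

import java.util.Collections;
import java.util.Comparator;
import java.util.HashMap;
import java.util.LinkedList;
import java.util.List;
import java.util.Map;
import java.util.Map.Entry;

public class DescendingSortDemo {
	public static void main(String[] args) {
		// Hypothetical counts, mirroring what MyReduce collects in its HashMap
		Map<String, Long> counts = new HashMap<String, Long>();
		counts.put("小蓝", 8L);
		counts.put("小红", 7L);
		counts.put("小明", 1L);

		// Copy the entries into a list and sort by value, largest first
		List<Entry<String, Long>> list = new LinkedList<Entry<String, Long>>(counts.entrySet());
		Collections.sort(list, new Comparator<Entry<String, Long>>() {
			@Override
			public int compare(Entry<String, Long> o1, Entry<String, Long> o2) {
				return o2.getValue().compareTo(o1.getValue());
			}
		});

		for (Entry<String, Long> e : list) {
			System.out.println(e.getKey() + "\t" + e.getValue());
		}
	}
}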

5. Test data (tab-separated names, one record per line)

小明	小绿	小黑
小红	小红	小白
小蓝	小蓝	小蓝
小黑	小白	小黑
小红	小红	小黄
小黑	小白	小绿
小红	小蓝	小蓝
小红	小红	小黄
小绿	小蓝	小蓝
小黑	小白	小蓝

6. Results

小蓝	8
小红	7
小黑	5
小白	4
小绿	3
小黄	2
小明	1
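
These counts can be cross-checked without Hadoop by tallying the tab-separated tokens directly. The sketch below is only a verification aid; the file name data.txt is an assumption, since the post only gives the input directory.

import java.nio.file.Files;
import java.nio.file.Paths;
import java.util.List;
import java.util.Map;
import java.util.TreeMap;

public class LocalCountCheck {
	public static void main(String[] args) throws Exception {
		// Assumed input file; adjust the path/name to your own test data
		List<String> lines = Files.readAllLines(Paths.get("C:\\Users\\com\\Desktop\\mr\\top10\\data.txt"));
		Map<String, Integer> counts = new TreeMap<String, Integer>();
		for (String line : lines) {
			for (String token : line.split("\t")) {
				if (!token.isEmpty()) {
					counts.merge(token, 1, Integer::sum);
				}
			}
		}
		counts.entrySet().stream()
			.sorted(Map.Entry.<String, Integer>comparingByValue().reversed())
			.forEach(e -> System.out.println(e.getKey() + "\t" + e.getValue()));
	}
}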

 

