Hadoop Types
Hadoop's serializable types all live in the org.apache.hadoop.io package. The table below shows how Java types map to Hadoop types.
| Java | Hadoop | Notes |
| --- | --- | --- |
| long | org.apache.hadoop.io.LongWritable | |
| int | org.apache.hadoop.io.IntWritable | |
| byte | org.apache.hadoop.io.ByteWritable | |
| boolean | org.apache.hadoop.io.BooleanWritable | |
| double | org.apache.hadoop.io.DoubleWritable | |
| float | org.apache.hadoop.io.FloatWritable | |
| String | org.apache.hadoop.io.Text | |
| null | org.apache.hadoop.io.NullWritable | obtain the singleton with NullWritable.get() |
| Set, Map, List | org.apache.hadoop.io.ArrayWritable | |
| byte[] | org.apache.hadoop.io.BytesWritable | raw binary data, e.g. audio/video |
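As a quick illustration, the wrapper types are built from Java values and unwrapped with get()/toString(); NullWritable has no public constructor and is obtained via NullWritable.get(). The class below is a minimal sketch of my own, not part of the original example code:
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;

public class WritableDemo {
    public static void main(String[] args) {
        IntWritable count = new IntWritable(42);    // wraps a Java int
        int n = count.get();                        // unwrap back to int
        Text word = new Text("hadoop");             // wraps a Java String
        String s = word.toString();                 // unwrap back to String
        NullWritable nothing = NullWritable.get();  // singleton instance
        System.out.println(n + " " + s + " " + nothing);
    }
}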
WordCount Example
1: Write the code
package mapreduce;
import java.io.IOException;
import java.net.URI;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
/**
 * The file hello on HDFS contains:
 * tiger pig
 * pig cat dog
 * dog bird cat
 * tiger house
 * bus bike bus car
 * @author think
 */
public class WordCount {
public static void main(String[] args) throws Exception {
String inPath = args[0];
Path outPath = new Path(args[1]);
//1: HDFS configuration -- get the FileSystem object
Configuration conf = new Configuration();
URI uri = new URI("/");// URI uri = new URI("hdfs://192.168.79.128:9000/");
FileSystem fileSystem = FileSystem.get(uri, conf);
if (fileSystem.exists(outPath)) {
fileSystem.delete(outPath, true);
}
// 2: create the Job object
String jobName = WordCount.class.getName();
Job job = Job.getInstance(conf, jobName);
job.setJarByClass(WordCount.class);
// 3: input path
FileInputFormat.setInputPaths(job, inPath);
// 4: specify the InputFormat subclass (optional; defaults to TextInputFormat)
job.setInputFormatClass(TextInputFormat.class);
// 5: specify the mapper class and its output <k2,v2> types
job.setMapperClass(MapTask.class);
job.setMapOutputKeyClass(Text.class);
job.setMapOutputValueClass(LongWritable.class);
// 6: specify the reducer class and its output <k3,v3> types
job.setReducerClass(ReduceTask.class);
job.setOutputKeyClass(Text.class);
job.setOutputValueClass(LongWritable.class);
// 7: specify the output path
FileOutputFormat.setOutputPath(job, outPath);
// 8: specify the OutputFormat subclass
job.setOutputFormatClass(TextOutputFormat.class);
// 9: submit to YARN and wait for completion
job.waitForCompletion(true);
}
/**
 * Map task
 * The four type parameters LongWritable, Text, Text, LongWritable are, in order, the map task's
 * input pair <k1,v1> and output pair <k2,v2>
 * @author think
 */
public static class MapTask extends Mapper<LongWritable, Text, Text, LongWritable>
{
Logger logger = LoggerFactory.getLogger(WordCount.class);
Text k2 = new Text();
LongWritable v2 = new LongWritable();
/**
 * Override the map method.
 * Context is an inner class of Mapper.
 */
@Override
protected void map(LongWritable key, Text value,
Mapper<LongWritable, Text, Text, LongWritable>.Context context)
throws IOException, InterruptedException {
//1: key is the byte offset of this line in the file, value is the line content
String content = value.toString();
System.out.println("内容:" + key.get() + " ," + content);
logger.info("内容:" + key.get() + " ," + content);
String[] arrs = content.split(",");
for(String word : arrs)
{
k2.set(word);
v2.set(1);
context.write(k2, v2);
logger.info("map:" + k2.toString() + "," + v2);
}
}
}
/**
 * Reduce task
 * The four type parameters Text, LongWritable, Text, LongWritable are, in order, the reduce task's
 * input pair <k2,v2s> and output pair <k3,v3>
 * @author think
 */
public static class ReduceTask extends Reducer<Text, LongWritable, Text, LongWritable>
{
LongWritable v3 = new LongWritable();
@Override
protected void reduce(Text k2, Iterable<LongWritable> v2s,
Reducer<Text, LongWritable, Text, LongWritable>.Context context)
throws IOException, InterruptedException {
System.out.println("k2:" + k2.toString());
long sum = 0;
for(LongWritable v2 : v2s)
{
System.out.println("v2:" + v2);
sum += v2.get();
}
v3.set(sum);
context.write(k2, v3);
System.out.println("k3,v3:" + k2.toString() + "," + v3);
}
}
}
2: Package the jar and upload it to Linux
Right-click the Java class -> Export -> JAR File to export the jar package; the figures below show two settings that need attention.
3: Create a file named word on Linux, then upload it to HDFS
hadoop fs -put ./word /word
hadoop fs -text /word/word    (the file word now sits under the /word directory)
hadoop fs -cp /word/word /word/word2    (optionally copy it a few more times to create extra input files)
4: Run hadoop jar, then check the result
hadoop jar wordCount.jar /word /out
A result file such as /out/part-r-00000 is generated automatically in the /out directory; view its contents with
hadoop fs -text /out/part-r-00000
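Assuming /word holds a single copy of the sample hello text shown in the code comment above, the result should look roughly like this (TextOutputFormat separates key and value with a tab):
bike	1
bird	1
bus	2
car	1
cat	2
dog	2
house	1
pig	2
tiger	2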
5: View the cluster's map and reduce task output at http://shb01:8088
The information to focus on there is in Counters, e.g. how much data was read locally.
6: To be able to view the logs, add the following to yarn-site.xml
<property>
<name>yarn.log-aggregation-enable</name>
<value>true</value>
</property>
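Once log aggregation is enabled (and YARN restarted), the logs of a finished job can be pulled with the yarn CLI; the application ID below is just a placeholder for the one shown in the 8088 web UI:
yarn logs -applicationId application_1400000000000_0001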
The ArrayWritable Type
Hadoop handles collections with ArrayWritable. You must write your own subclass of it: ArrayWritable holds a Writable[] values field, so you must pass the values in before use, and Hadoop iterates over that field when serializing.
Below is an example that uses ArrayWritable to tally mobile data traffic.
There is at least one input file; such files imitate a phone traffic log.
Each line has the following structure:
Field 1: 1363157993044 is a timestamp
Field 2: 13610002000 is the phone number
Field 6 is the upstream packet count, field 7 the downstream packet count, field 8 the total upstream traffic, field 9 the total downstream traffic, and field 10 is the status code (200 means success).
1363157993044 13610002000 94-71-AC-CD-E6-18:CMCC-EASY 120.196.100.99 iface.qiyi.com 视频网站 15 12 1527 2106 200
Assume there are many such lines and each phone number appears several times; we want to aggregate, per phone number, the up/down packet counts and the up/down traffic totals. The code follows.
package mapreduce;
import java.io.IOException;
import java.net.URI;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.ArrayWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.Writable;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
public class FlowCount {
/**
* @param args
* @throws Exception
*/
public static void main(String[] args) throws Exception {
String inputPaths = args[0];
Path outPath = new Path(args[1]);
//1: get a FileSystem object to operate on HDFS data
Configuration conf = new Configuration();
URI uri = new URI("hdfs://192.168.79.139:9000/");
FileSystem fileSystem = FileSystem.get(uri, conf);
if(fileSystem.exists(outPath))
{
fileSystem.delete(outPath, true);
}
//2: get the Job object
Job job = Job.getInstance(conf, FlowCount.class.getName());
job.setJarByClass(FlowCount.class);
//3: specify the input path
FileInputFormat.setInputPaths(job, inputPaths);
//4: specify the InputFormat subclass
job.setInputFormatClass(TextInputFormat.class);
//5: specify the mapper class and its output types
job.setMapperClass(MapTask.class);
job.setMapOutputKeyClass(Text.class);
job.setMapOutputValueClass(FlowWritable.class);
//6: specify the reducer class and its output types
job.setReducerClass(ReduceTask.class);
job.setOutputKeyClass(Text.class);
job.setOutputValueClass(Text.class);
//7: specify the OutputFormat subclass
job.setOutputFormatClass(TextOutputFormat.class);
//8: specify the output path
FileOutputFormat.setOutputPath(job, outPath);
//9: submit to YARN and wait for completion
job.waitForCompletion(true);
}
/**
 * Map task
 * The four type parameters LongWritable, Text, Text, FlowWritable correspond to the map input
 * <k1,v1> = <byte offset of the line, line content> and the map output
 * <k2,v2> = <phone number, FlowWritable holding the up/down packets and traffic>
 * @author think
 */
public static class MapTask extends Mapper<LongWritable, Text, Text, FlowWritable>
{
Logger logger = LoggerFactory.getLogger(MapTask.class);
Text k2 = new Text();
@Override
protected void map(LongWritable key, Text value,
Mapper<LongWritable, Text, Text, FlowWritable>.Context context)
throws IOException, InterruptedException {
String[] values = value.toString().split("\t");
k2.set(values[1]);
FlowWritable flow = new FlowWritable();
flow.set(values[5], values[6], values[7], values[8]);
context.write(k2, flow);
logger.info("MapTask[" + k2.toString() + ":" + flow + "]");
}
}
/**
 * Reduce task
 * The four type parameters Text, FlowWritable, Text, Text correspond to the reduce input
 * <k2,v2s> = <phone number, FlowWritables holding the up/down packets and traffic>
 * and the reduce output <k3,v3> = <phone number, traffic summary>
 * @author think
 */
public static class ReduceTask extends Reducer<Text, FlowWritable, Text, Text>
{
Logger logger = LoggerFactory.getLogger(ReduceTask.class);
Text k3 = new Text();
Text v3 = new Text();
@Override
protected void reduce(Text k2, Iterable<FlowWritable> v2s,
Reducer<Text, FlowWritable, Text, Text>.Context context)
throws IOException, InterruptedException {
long six = 0;   // field 6: upstream packets
long seven = 0; // field 7: downstream packets
long eight = 0; // field 8: upstream traffic
long nine = 0;  // field 9: downstream traffic
for(FlowWritable v2 : v2s)
{
long[] flowArrs = v2.getLongArrs();
six += flowArrs[0];
seven += flowArrs[1];
eight += flowArrs[2];
nine += flowArrs[3];
}
k3.set(k2);
String flowString = "up package[" + six + "];down package[" + seven + "];up flow[" + eight + "];down flow[" + nine +"]";
v3.set(flowString);
context.write(k3, v3);
}
}
/**
 * FlowWritable stores the packet and traffic values from the file, i.e. fields 6 to 9
 * @author think
 */
public static class FlowWritable extends ArrayWritable
{
//the no-arg constructor must call super with the element type; Hadoop also invokes it when deserializing values
public FlowWritable() {
super(LongWritable.class);
}
/**
 * Stores the four values into the values field of ArrayWritable
 * @param six upstream packet count (field 6)
 * @param seven downstream packet count (field 7)
 * @param eight total upstream traffic (field 8)
 * @param nine total downstream traffic (field 9)
 */
public void set(String six, String seven, String eight, String nine)
{
Writable[] values = new Writable[4];
//System.out.println("-" + six + "-" + seven + "-" + eight + "-" + nine);
values[0] = new LongWritable(Long.valueOf(six));
values[1] = new LongWritable(Long.valueOf(seven));
values[2] = new LongWritable(Long.valueOf(eight));
values[3] = new LongWritable(Long.valueOf(nine));
super.set(values);
}
/**
 * Reads the values back out of ArrayWritable's values field
 * @return the four values as a long[], in field order 6 to 9
 */
public long[] getLongArrs()
{
LongWritable[] values = (LongWritable[])super.toArray();
if(null != values)
{
long[] valueArrs = new long[values.length];
for(int i = 0; i < values.length; i++)
{
valueArrs[i] = values[i].get();
}
return valueArrs;
}
else
{
return null;
}
}
}
}
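FlowCount is packaged and run the same way as WordCount; the jar name and HDFS paths below are only placeholders for your own:
hadoop jar flowCount.jar /flow /flowout
hadoop fs -text /flowout/part-r-00000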