MapReduce Notes 4: Hadoop Types and an MR Example Operating on HDFS Data

Hadoop Types

Hadoop's serializable types all live in the org.apache.hadoop.io package. The table below shows how Java types map to Hadoop types.

Java type       Hadoop type                              Notes
long            org.apache.hadoop.io.LongWritable
int             org.apache.hadoop.io.IntWritable
byte            org.apache.hadoop.io.ByteWritable
boolean         org.apache.hadoop.io.BooleanWritable
double          org.apache.hadoop.io.DoubleWritable
float           org.apache.hadoop.io.FloatWritable
String          org.apache.hadoop.io.Text
null            org.apache.hadoop.io.NullWritable        NullWritable.get() returns the singleton instance
set/map/list    org.apache.hadoop.io.ArrayWritable
byte[]          org.apache.hadoop.io.BytesWritable       for binary data such as audio and video
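All of these wrappers implement org.apache.hadoop.io.Writable and are mutable, so one instance can be reused across records. A quick sketch of how they are used (the class name WritableDemo is illustrative, not from the original post):

package mapreduce;

import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;

public class WritableDemo {
	public static void main(String[] args) {
		// Wrapper types are mutable: set() replaces the held value in place,
		// which is why MR code typically allocates them once and reuses them.
		LongWritable count = new LongWritable();
		count.set(42L);
		Text word = new Text();
		word.set("hadoop");
		// NullWritable carries no state; get() returns the shared singleton.
		NullWritable nothing = NullWritable.get();
		System.out.println(word + " = " + count + ", " + nothing);
	}
}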

WordCount Example

1: Write the code

package mapreduce;

import java.io.IOException;
import java.net.URI;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/**
 * The hello file on HDFS contains:
tiger pig
pig cat dog
dog bird cat
tiger house
bus bike bus car

 * @author think
 *
 */
public class WordCount {

	public static void main(String[] args) throws Exception {
		String inPath = args[0];
		Path outPath = new Path(args[1]);

		// 1: HDFS configuration; get the FileSystem object
		Configuration conf = new Configuration();
		URI uri = new URI("/");// URI uri = new URI("hdfs://192.168.79.128:9000/");
		FileSystem fileSystem = FileSystem.get(uri, conf);

		if (fileSystem.exists(outPath)) {
			fileSystem.delete(outPath, true);
		}

		// 2:job object
		String jobName = WordCount.class.getName();
		Job job = Job.getInstance(conf, jobName);
		job.setJarByClass(WordCount.class);

		// 3: input path
		FileInputFormat.setInputPaths(job, inPath);

		// 4: specify the InputFormat subclass; optional, defaults to TextInputFormat
		job.setInputFormatClass(TextInputFormat.class);

		// 5: specify the mapper class and its output <k2,v2> types
		job.setMapperClass(MapTask.class);
		job.setMapOutputKeyClass(Text.class);
		job.setMapOutputValueClass(LongWritable.class);

		// 6: specify the reducer class and its output <k3,v3> types
		job.setReducerClass(ReduceTask.class);
		job.setOutputKeyClass(Text.class);
		job.setOutputValueClass(LongWritable.class);

		// 7: output path
		FileOutputFormat.setOutputPath(job, outPath);

		// 8: specify the OutputFormat subclass
		job.setOutputFormatClass(TextOutputFormat.class);

		// 9: submit to YARN and wait for completion
		job.waitForCompletion(true);
	}
	
	/**
	 * Map task
	 * @author think
	 * The four type parameters LongWritable, Text, Text, LongWritable are, in order,
	 * the map task's input pair <k1,v1> and output pair <k2,v2>
	 */
	public static class MapTask extends Mapper<LongWritable, Text, Text, LongWritable>
	{
		Logger logger = LoggerFactory.getLogger(WordCount.class);
		
		Text k2 = new Text();

		LongWritable v2 = new LongWritable();
		
		/**
		 * Override the map method.
		 * Context is an inner class of Mapper.
		 */
		@Override
		protected void map(LongWritable key, Text value,
				Mapper<LongWritable, Text, Text, LongWritable>.Context context)
				throws IOException, InterruptedException {
			// key is the byte offset of the line within the file; value is the line content
			String content = value.toString();
			System.out.println("line: " + key.get() + ", " + content);
			logger.info("line: " + key.get() + ", " + content);
			
			// the sample file is space-separated, so split on spaces
			// (the original split(",") would never match and would count whole lines)
			String[] arrs = content.split(" ");
			for(String word : arrs)
			{
				k2.set(word);
				v2.set(1);
				context.write(k2, v2);
				logger.info("map:" + k2.toString() + "," + v2);
			}
		}
	}
	
	/**
	 * Reduce task
	 * @author think
	 * The four type parameters Text, LongWritable, Text, LongWritable are, in order,
	 * the reduce task's input pair <k2,v2s> and output pair <k3,v3>
	 */
	public static class ReduceTask extends Reducer<Text, LongWritable, Text, LongWritable>
	{
		LongWritable v3 = new LongWritable();
		
		@Override
		protected void reduce(Text k2, Iterable<LongWritable> v2s,
				Reducer<Text, LongWritable, Text, LongWritable>.Context context)
				throws IOException, InterruptedException {
			System.out.println("k2:" + k2.toString());
			long sum = 0;
			for(LongWritable v2 : v2s)
			{
				System.out.println("v2:" + v2);
				sum += v2.get();
			}
			v3.set(sum);
			context.write(k2, v3);
			System.out.println("k3,v3:" + k2.toString() + "," + v3);
		}
	}
	
	
}

2: Package the jar and upload it to Linux

In Eclipse, right-click the Java class and choose Export -> JAR File to export the jar. The original post's screenshots flagged two settings to watch in the export wizard, presumably the export destination and the main-class selection for the manifest, since the hadoop jar command in step 4 omits the class name.
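If you prefer the command line to the Eclipse wizard, the compiled classes can be packaged with the jar tool instead (the bin/ output directory and jar name are illustrative):

jar cf wordCount.jar -C bin/ mapreduce

A jar built this way has no Main-Class in its manifest, so the hadoop jar command in step 4 would then need the fully qualified class name: hadoop jar wordCount.jar mapreduce.WordCount /word /out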





3: Create a file named word on Linux, then upload it to HDFS

hadoop fs -put ./word /word

hadoop fs -text /word/word      (the file word sits under the /word directory)

hadoop fs -cp /word/word /word/word2      (optionally copy it a few more times)

 

4: Run hadoop jar, then check the result

hadoop jar wordCount.jar /word /out

The result files are generated automatically under /out, e.g. /out/part-r-00000; view the result with:

hadoop fs -text /out/part-r-00000
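Assuming the jar was built from the code above (which splits each line on spaces), the sample hello input should produce output along these lines:

bike	1
bird	1
bus	2
car	1
cat	2
dog	2
house	1
pig	2
tiger	2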



5: View the cluster's map and reduce task output at http://shb01:8088

The information under Counters deserves the most attention, e.g. data-local reads.



6: To be able to view the logs, add the following to yarn-site.xml:

<property>
    <name>yarn.log-aggregation-enable</name>
    <value>true</value>
</property>
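Once log aggregation is enabled (restart YARN for the change to take effect), the logs of a finished application can be pulled from the command line, for example:

yarn logs -applicationId application_1400000000000_0001

The application id is the one shown in the 8088 web UI; the id above is only illustrative.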

 

The ArrayWritable Type

In Hadoop, ArrayWritable is used to handle collections of values. You must write your own class that extends it; ArrayWritable holds a Writable[] array field, and you must populate that field with values, which the framework then walks when serializing.
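A minimal sketch of this subclass pattern (the class name LongArrayWritable is illustrative, not part of Hadoop):

import org.apache.hadoop.io.ArrayWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Writable;

// The no-arg constructor is mandatory: Hadoop instantiates the class
// reflectively during deserialization, and super(LongWritable.class)
// tells ArrayWritable which element type to create in readFields().
public class LongArrayWritable extends ArrayWritable {
	public LongArrayWritable() {
		super(LongWritable.class);
	}

	public LongArrayWritable(long[] longs) {
		this();
		Writable[] values = new Writable[longs.length];
		for (int i = 0; i < longs.length; i++) {
			values[i] = new LongWritable(longs[i]);
		}
		set(values); // populate the internal Writable[] field
	}
}

The FlowWritable class in the full example below follows exactly this pattern.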

Below is an example that uses ArrayWritable to aggregate traffic statistics.

Assume one or more input files that mimic mobile-phone traffic logs.

Each line has the following structure:

Field 1 (1363157993044) is a timestamp.

Field 2 (13610002000) is the phone number.

Field 6 is the upstream packet count, field 7 the downstream packet count, field 8 the total upstream traffic, field 9 the total downstream traffic, and field 10 the status code (200 means success).

1363157993044      13610002000  94-71-AC-CD-E6-18:CMCC-EASY     120.196.100.99        iface.qiyi.com  视频网站         15     12         1527         2106         200

 

Assume there are many lines and each phone number appears multiple times; we want, per phone number, the summed upstream/downstream packet counts and the upstream/downstream traffic totals. The code follows.

package mapreduce;

import java.io.IOException;
import java.net.URI;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.ArrayWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.Writable;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

public class FlowCount {

	/**
	 * @param args
	 * @throws Exception
	 */
	public static void main(String[] args) throws Exception {
		String inputPaths = args[0];
		Path outPath = new Path(args[1]);
		
		// 1: get the FileSystem object to manipulate HDFS data
		Configuration conf = new Configuration();
		URI uri = new URI("hdfs://192.168.79.139:9000/");
		FileSystem fileSystem = FileSystem.get(uri, conf);
		if(fileSystem.exists(outPath))
		{
			fileSystem.delete(outPath, true);
		}
		
		// 2: create the job object
		Job job = Job.getInstance(conf, FlowCount.class.getName());
		job.setJarByClass(FlowCount.class);
		
		// 3: input path
		FileInputFormat.setInputPaths(job, inputPaths);
		
		// 4: specify the InputFormat subclass
		job.setInputFormatClass(TextInputFormat.class);
		
		// 5: specify the mapper class and its output types
		job.setMapperClass(MapTask.class);
		job.setMapOutputKeyClass(Text.class);
		job.setMapOutputValueClass(FlowWritable.class);
		
		// 6: specify the reducer class and its output types
		job.setReducerClass(ReduceTask.class);
		job.setOutputKeyClass(Text.class);
		job.setOutputValueClass(Text.class);
		
		// 7: specify the OutputFormat subclass
		job.setOutputFormatClass(TextOutputFormat.class);
		
		// 8: output path
		FileOutputFormat.setOutputPath(job, outPath);
		
		// 9: submit to YARN and wait for completion
		job.waitForCompletion(true);
	}

	/**
	 * Map task.
	 * The four type parameters LongWritable, Text, Text, FlowWritable correspond to the map input
	 * <k1,v1> = <byte offset of the line, line content> and the map output
	 * <k2,v2> = <phone number, FlowWritable holding up/down packet counts and up/down traffic>.
	 * @author think
	 *
	 */
	public static class MapTask extends Mapper<LongWritable, Text, Text, FlowWritable>
	{
		Logger logger = LoggerFactory.getLogger(MapTask.class);
		
		Text k2 = new Text();
		
		@Override
		protected void map(LongWritable key, Text value,
				Mapper<LongWritable, Text, Text, FlowWritable>.Context context)
				throws IOException, InterruptedException {
			
			String[] values = value.toString().split("\t");
			k2.set(values[1]);

			FlowWritable flow = new FlowWritable();
			flow.set(values[5], values[6], values[7], values[8]);
			
			context.write(k2, flow);
			logger.info("MapTask[" + k2.toString() + ":" + flow + "]");
		}
	}
	
	/**
	 * Reduce task.
	 * The four type parameters Text, FlowWritable, Text, Text correspond to the reduce input
	 * <k2,v2s> = <phone number, FlowWritables for that phone> and the reduce output
	 * <k3,v3> = <phone number, traffic summary string>.
	 * @author think
	 *
	 */
	public static class ReduceTask extends Reducer<Text, FlowWritable, Text, Text>
	{
		Logger logger = LoggerFactory.getLogger(ReduceTask.class);
		
		Text k3 = new Text();
		
		Text v3 = new Text();
		
		@Override
		protected void reduce(Text k2, Iterable<FlowWritable> v2s,
				Reducer<Text, FlowWritable, Text, Text>.Context context)
				throws IOException, InterruptedException {
			long six = 0;
			long seven = 0;
			long eight = 0;
			long nine = 0;
			for(FlowWritable v2 : v2s)
			{
				long[] flowArrs = v2.getLongArrs();
				six += flowArrs[0];
				seven += flowArrs[1];
				eight += flowArrs[2];
				nine += flowArrs[3];
			}
			
			k3.set(k2);
			String flowString = "up package[" + six + "];down package[" + seven + "];up flow[" + eight + "];down flow[" + nine +"]";
			v3.set(flowString);
			context.write(k3, v3);
			
		}
	}
	
	/**
	 * FlowWritable stores the packet and traffic values from the log file, corresponding to fields 6-9.
	 * @author think
	 *
	 */
	public static class FlowWritable extends ArrayWritable
	{
		// the no-arg constructor must call super() with the element type
		public FlowWritable() {
			super(LongWritable.class);
		}
		
		/**
		 * Assign the four values to the values field inherited from ArrayWritable
		 * @param six
		 * @param seven
		 * @param eight
		 * @param nine
		 */
		public void set(String six, String seven, String eight, String nine)
		{
			Writable[] values = new Writable[4];
			//System.out.println("-" + six + "-" + seven + "-" + eight + "-" + nine);
			values[0] = new LongWritable(Long.valueOf(six));
			values[1] = new LongWritable(Long.valueOf(seven));
			values[2] = new LongWritable(Long.valueOf(eight));
			values[3] = new LongWritable(Long.valueOf(nine));
			super.set(values);
		}
		
		/**
		 * Read the values back out of ArrayWritable's values field
		 * @return the four counters as a long[], or null
		 */
		public long[] getLongArrs()
		{
			LongWritable[] values = (LongWritable[])super.toArray();
			if(null != values)
			{
				long[] valueArrs = new long[values.length];
				for(int i = 0; i < values.length; i++)
				{
					valueArrs[i] = values[i].get();
				}
				return valueArrs;
			}
			else
			{
				return null;
			}
		}
		
	}
}
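Packaging and running work the same way as for WordCount. For example (the jar name and HDFS paths are illustrative):

hadoop fs -put ./flow.log /flow/flow.log
hadoop jar flowCount.jar mapreduce.FlowCount /flow /flowOut
hadoop fs -text /flowOut/part-r-00000

Each output line then has the form <phone number> followed by the summary string built in the reducer, i.e. up package[...];down package[...];up flow[...];down flow[...].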




