Hadoop Types
Hadoop's serializable types all live in the org.apache.hadoop.io package. The table below shows how Java types map to Hadoop types.
| Java | Hadoop | Notes |
| --- | --- | --- |
| long | org.apache.hadoop.io.LongWritable | |
| int | org.apache.hadoop.io.IntWritable | |
| byte | org.apache.hadoop.io.ByteWritable | |
| boolean | org.apache.hadoop.io.BooleanWritable | |
| double | org.apache.hadoop.io.DoubleWritable | |
| float | org.apache.hadoop.io.FloatWritable | |
| String | org.apache.hadoop.io.Text | |
| null | org.apache.hadoop.io.NullWritable | obtain the singleton with NullWritable.get() |
| Set, Map, List | org.apache.hadoop.io.ArrayWritable | |
| byte[] | org.apache.hadoop.io.BytesWritable | raw binary data, e.g. audio/video |
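As a quick illustration, the wrapper types are built from Java values and unwrapped with get()/toString(); NullWritable has no public constructor and is obtained via NullWritable.get(). The class below is a minimal sketch of my own, not part of the original example code:
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;

public class WritableDemo {
    public static void main(String[] args) {
        IntWritable count = new IntWritable(42);    // wraps a Java int
        int n = count.get();                        // unwrap back to int
        Text word = new Text("hadoop");             // wraps a Java String
        String s = word.toString();                 // unwrap back to String
        NullWritable nothing = NullWritable.get();  // singleton instance
        System.out.println(n + " " + s + " " + nothing);
    }
}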
WordCount Example
1: Write the code
package mapreduce;
import java.io.IOException;
import java.net.URI;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
/**
 * The file hello on HDFS contains:
 * tiger pig
 * pig cat dog
 * dog bird cat
 * tiger house
 * bus bike bus car
 * @author think
 */
public class WordCount {
public static void main(String[] args) throws Exception {
String inPath = args[0];
Path outPath = new Path(args[1]);
//1: HDFS configuration -- get the FileSystem object
Configuration conf = new Configuration();
URI uri = new URI("/");// URI uri = new URI("hdfs://192.168.79.128:9000/");
FileSystem fileSystem = FileSystem.get(uri, conf);
if (fileSystem.exists(outPath)) {
fileSystem.delete(outPath, true);
}
// 2: create the Job object
String jobName = WordCount.class.getName();
Job job = Job.getInstance(conf, jobName);
job.setJarByClass(WordCount.class);
// 3: input path
FileInputFormat.setInputPaths(job, inPath);
// 4: specify the InputFormat subclass (optional; defaults to TextInputFormat)
job.setInputFormatClass(TextInputFormat.class);
// 5: specify the mapper class and its output <k2,v2> types
job.setMapperClass(MapTask.class);
job.setMapOutputKeyClass(Text.class);
job.setMapOutputValueClass(LongWritable.class);
// 6: specify the reducer class and its output <k3,v3> types
job.setReducerClass(ReduceTask.class);
job.setOutputKeyClass(Text.class);
job.setOutputValueClass(LongWritable.class);
// 7: specify the output path
FileOutputFormat.setOutputPath(job, outPath);
// 8: specify the OutputFormat subclass
job.setOutputFormatClass(TextOutputFormat.class);
// 9: submit to YARN and wait for completion
job.waitForCompletion(true);
}
/**
 * Map task
 * The four type parameters LongWritable, Text, Text, LongWritable are, in order, the map task's
 * input pair <k1,v1> and output pair <k2,v2>
 * @author think
 */
public static class MapTask extends Mapper<LongWritable, Text, Text, LongWritable>
{
Logger logger = LoggerFactory.getLogger(WordCount.class);
Text k2 = new Text();
LongWritable v2 = new LongWritable();
/**
 * Override the map method.
 * Context is an inner class of Mapper.
 */
@Override
protected void map(LongWritable key, Text value,
Mapper<LongWritable, Text, Text, LongWritable>.Context context)
throws IOException, InterruptedException {
//1: key is the byte offset of this line in the file, value is the line content
String content = value.toString();
System.out.println("内容:" + key.get() + " ," + content);
logger.info("内容:" + key.get() + " ," + content);
String[] arrs = content.split(",");
for(String word : arrs)
{
k2.set(word);
v2.set(1);
context.write(k2, v2);
logger.info("map:" + k2.toString() + "," + v2);
}
}
}
/**
 * Reduce task
 * The four type parameters Text, LongWritable, Text, LongWritable are, in order, the reduce task's
 * input pair <k2,v2s> and output pair <k3,v3>
 * @author think
 */
public static class ReduceTask extends Reducer<Text, LongWritable, Text, LongWritable>
{
LongWritable v3 = new LongWritable();
@Override
protected void reduce(Text k2, Iterable<LongWritable> v2s,
Reducer<Text, LongWritable, Text, LongWritable>.Context context)
throws IOException, InterruptedException {
System.out.println("k2:" + k2.toString());
long sum = 0;
for(LongWritable v2 : v2s)
{
System.out.println("v2:" + v2);
sum += v2.get();
}
v3.set(sum);
context.write(k2, v3);
System.out.println("k3,v3:" + k2.toString() + "," + v3);
}
}
}
2: Package the jar and upload it to Linux
Right-click the Java class -> Export -> JAR File to export the jar package; the figures below show two settings that need attention.
3: Create a file named word on Linux, then upload it to HDFS
hadoop fs -put ./word /word
hadoop fs -text /word/word    (the file word now sits under the /word directory)
hadoop fs -cp /word/word /word/word2    (optionally copy it a few more times to create extra input files)
4: Run hadoop jar, then check the result
hadoop jar wordCount.jar /word /out
A result file such as /out/part-r-00000 is generated automatically in the /out directory; view its contents with
hadoop fs -text /out/part-r-00000
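Assuming /word holds a single copy of the sample hello text shown in the code comment above, the result should look roughly like this (TextOutputFormat separates key and value with a tab):
bike	1
bird	1
bus	2
car	1
cat	2
dog	2
house	1
pig	2
tiger	2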
5: View the cluster's map and reduce task output at http://shb01:8088
The information to focus on there is in Counters, e.g. how much data was read locally.
6: To be able to view the logs, add the following to yarn-site.xml
<property>
<name>yarn.log-aggregation-enable</name>
<value>true</value>
</property>
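Once log aggregation is enabled (and YARN restarted), the logs of a finished job can be pulled with the yarn CLI; the application ID below is just a placeholder for the one shown in the 8088 web UI:
yarn logs -applicationId application_1400000000000_0001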
The ArrayWritable Type
Hadoop handles collections with ArrayWritable. You must write your own subclass of it: ArrayWritable holds a Writable[] values field, so you must pass the values in before use, and Hadoop iterates over that field when serializing.
Below is an example that uses ArrayWritable to tally mobile data traffic.
There is at least one input file; such files imitate a phone traffic log.
Each line has the following structure:
Field 1: 1363157993044 is a timestamp
Field 2: 13610002000 is the phone number
Field 6 is the upstream packet count, field 7 the downstream packet count, field 8 the total upstream traffic, field 9 the total downstream traffic, and field 10 is the status code (200 means success).
1363157993044 13610002000 94-71-AC-CD-E6-18:CMCC-EASY 120.196.100.99 iface.qiyi.com 视频网站 15 12 1527 2106 200
Assume there are many such lines and each phone number appears several times; we want to aggregate, per phone number, the up/down packet counts and the up/down traffic totals. The code follows.
package mapreduce;
import java.io.IOException;
import java.net.URI;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.ArrayWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.Writable;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
public class FlowCount {
/**
* @param args
* @throws Exception
*/
public static void main(String[] args) throws Exception {
String inputPaths = args[0];
Path outPath = new Path(args[1]);
//1: get a FileSystem object to operate on HDFS data
Configuration conf = new Configuration();
URI uri = new URI("hdfs://192.168.79.139:9000/");
FileSystem fileSystem = FileSystem.get(uri, conf);
if(fileSystem.exists(outPath))
{
fileSystem.delete(outPath, true);
}
//2: get the Job object
Job job = Job.getInstance(conf, FlowCount.class.getName());
job.setJarByClass(FlowCount.class);
//3: specify the input path
FileInputFormat.setInputPaths(job, inputPaths);
//4: specify the InputFormat subclass
job.setInputFormatClass(TextInputFormat.class);
//5: specify the mapper class and its output types
job.setMapperClass(MapTask.class);
job.setMapOutputKeyClass(Text.class);
job.setMapOutputValueClass(FlowWritable.class);
//6: specify the reducer class and its output types
job.setReducerClass(ReduceTask.class);
job.setOutputKeyClass(Text.class);
job.setOutputValueClass(Text.class);
//7: specify the OutputFormat subclass
job.setOutputFormatClass(TextOutputFormat.class);
//8: specify the output path
FileOutputFormat.setOutputPath(job, outPath);
//9: submit to YARN and wait for completion
job.waitForCompletion(true);
}
/**
 * Map task
 * The four type parameters LongWritable, Text, Text, FlowWritable correspond to the map input
 * <k1,v1> = <byte offset of the line, line content> and the map output
 * <k2,v2> = <phone number, FlowWritable holding the up/down packets and traffic>
 * @author think
 */
public static class MapTask extends Mapper<LongWritable, Text, Text, FlowWritable>
{
Logger logger = LoggerFactory.getLogger(MapTask.class);
Text k2 = new Text();
@Override
protected void map(LongWritable key, Text value,
Mapper<LongWritable, Text, Text, FlowWritable>.Context context)
throws IOException, InterruptedException {
String[] values = value.toString().split("\t");
k2.set(values[1]);
FlowWritable flow = new FlowWritable();
flow.set(values[5], values[6], values[7], values[8]);
context.write(k2, flow);
logger.info("MapTask[" + k2.toString() + ":" + flow + "]");
}
}
/**
 * Reduce task
 * The four type parameters Text, FlowWritable, Text, Text correspond to the reduce input
 * <k2,v2s> = <phone number, FlowWritables holding the up/down packets and traffic>
 * and the reduce output <k3,v3> = <phone number, traffic summary>
 * @author think
 */
public static class ReduceTask extends Reducer<Text, FlowWritable, Text, Text>
{
Logger logger = LoggerFactory.getLogger(ReduceTask.class);
Text k3 = new Text();
Text v3 = new Text();
@Override
protected void reduce(Text k2, Iterable<FlowWritable> v2s,
Reducer<Text, FlowWritable, Text, Text>.Context context)
throws IOException, InterruptedException {
long six = 0;   // field 6: upstream packets
long seven = 0; // field 7: downstream packets
long eight = 0; // field 8: upstream traffic
long nine = 0;  // field 9: downstream traffic
for(FlowWritable v2 : v2s)
{
long[] flowArrs = v2.getLongArrs();
six += flowArrs[0];
seven += flowArrs[1];
eight += flowArrs[2];
nine += flowArrs[3];
}
k3.set(k2);
String flowString = "up package[" + six + "];down package[" + seven + "];up flow[" + eight + "];down flow[" + nine +"]";
v3.set(flowString);
context.write(k3, v3);
}
}
/**
 * FlowWritable stores the packet and traffic values from the file, i.e. fields 6 to 9
 * @author think
 */
public static class FlowWritable extends ArrayWritable
{
//the no-arg constructor must call super with the element type; Hadoop also invokes it when deserializing values
public FlowWritable() {
super(LongWritable.class);
}
/**
 * Stores the four values into the values field of ArrayWritable
 * @param six upstream packet count (field 6)
 * @param seven downstream packet count (field 7)
 * @param eight total upstream traffic (field 8)
 * @param nine total downstream traffic (field 9)
 */
public void set(String six, String seven, String eight, String nine)
{
Writable[] values = new Writable[4];
//System.out.println("-" + six + "-" + seven + "-" + eight + "-" + nine);
values[0] = new LongWritable(Long.valueOf(six));
values[1] = new LongWritable(Long.valueOf(seven));
values[2] = new LongWritable(Long.valueOf(eight));
values[3] = new LongWritable(Long.valueOf(nine));
super.set(values);
}
/**
 * Reads the values back out of ArrayWritable's values field
 * @return the four values as a long[], in field order 6 to 9
 */
public long[] getLongArrs()
{
LongWritable[] values = (LongWritable[])super.toArray();
if(null != values)
{
long[] valueArrs = new long[values.length];
for(int i = 0; i < values.length; i++)
{
valueArrs[i] = values[i].get();
}
return valueArrs;
}
else
{
return null;
}
}
}
}
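FlowCount is packaged and run the same way as WordCount; the jar name and HDFS paths below are only placeholders for your own:
hadoop jar flowCount.jar /flow /flowout
hadoop fs -text /flowout/part-r-00000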