I. Supplementary Notes
1. How to implement custom serialization
1) Implement the Writable interface  2) Implement the WritableComparable interface
If the custom data type is used as a Key, it must be sortable, so it needs to implement the WritableComparable interface; if it is only used as a Value, implementing Writable is enough.
In the phone-flow example, the phone number is the Key and PhoneFlowWritable is the Value.
In an MR program, only sortable data types can be used as a Key, because Keys have to be sorted.
Many of the outputs on the Map side are ordered. Ordered by what? By the Key.
public interface WritableComparable<T> extends Writable, Comparable<T> {
}
package com.bigdata.day12;
import org.apache.hadoop.io.WritableComparable;
import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;
/**
 * @Author laoyan
 * @Description TODO
 * @Date 2022/8/2 9:39
 * @Version 1.0
 *
 * This data type can be used as a Key because it can be sorted; of course it can also be used as a Value.
 * When defining a custom type, when do you implement Writable,
 * and when do you implement WritableComparable?
 * It simply depends on whether the custom type will be used as a Key.
 */
public class AdminWritable implements WritableComparable<AdminWritable> {
private String name;
private int age;
public String getName() {
return name;
}
public void setName(String name) {
this.name = name;
}
public int getAge() {
return age;
}
public void setAge(int age) {
this.age = age;
}
@Override
public int compareTo(AdminWritable admin) {
// sort by age in ascending order
return this.age - admin.getAge();
}
@Override
public void write(DataOutput out) throws IOException {
// serialize the fields in a fixed order
out.writeUTF(name);
out.writeInt(age);
}
@Override
public void readFields(DataInput in) throws IOException {
this.name = in.readUTF();
this.age = in.readInt();
}
}
2. Optimizing a piece of code
Problem encountered: the first version of the phone-flow job allocated a brand-new Text and PhoneFlowWritable object for every input record, which creates a lot of unnecessary garbage on large inputs.
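A minimal sketch of the unoptimized map() being described, assuming it lives in the same PhoneFlowMapper class shown below:

// Hypothetical "before" version: a new Text and a new PhoneFlowWritable are allocated on every call
@Override
protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
    String[] arr = value.toString().split("\\s+");
    Text outKey = new Text(arr[1]);                        // new object per record
    PhoneFlowWritable outValue = new PhoneFlowWritable();  // new object per record
    outValue.setPhone(arr[1]);
    outValue.setUpFlow(Integer.parseInt(arr[arr.length - 3]));
    outValue.setDownFlow(Integer.parseInt(arr[arr.length - 2]));
    context.write(outKey, outValue);
}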

Solution:
Simply reuse a single new Text() and a single new PhoneFlowWritable() object across all map() calls.
The optimized code:
package com.bigdata.phoneflow;
import com.bigdata.WordCountMapper;
import com.bigdata.WordCountPartitioner;
import com.bigdata.WordCountReducer;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import java.io.IOException;
import java.util.Iterator;
/**
* @Author laoyan
* @Description TODO
* @Date 2022/8/1 14:33
* @Version 1.0
*/
class PhoneFlowMapper extends Mapper<LongWritable, Text,Text,PhoneFlowWritable> {
// setup() runs only once per task, so it is the usual place for initialization work
Text text = null;
PhoneFlowWritable phoneFlowWritable = null;
@Override
protected void setup(Mapper<LongWritable, Text, Text, PhoneFlowWritable>.Context context) throws IOException, InterruptedException {
text = new Text();
phoneFlowWritable = new PhoneFlowWritable();
}
// Turn each input line into: phone number --> PhoneFlowWritable object
@Override
protected void map(LongWritable key, Text value, Mapper<LongWritable, Text, Text, PhoneFlowWritable>.Context context) throws IOException, InterruptedException {
//1363157986072 18320173382 84-25-DB-4F-10-1A:CMCC-EASY 120.196.100.99 input.shouji.sogou.com 搜索引擎 21 18 9531 2412 200
String line = value.toString();
String[] arr = line.split("\\s+");// \s matches one whitespace character, + means one or more occurrences
String phone = arr[1];
int upFlow = Integer.parseInt(arr[arr.length-3]);
int downFlow = Integer.parseInt(arr[arr.length-2]);
text.set(phone);
phoneFlowWritable.setPhone(phone);
phoneFlowWritable.setUpFlow(upFlow);
phoneFlowWritable.setDownFlow(downFlow);
context.write(text ,phoneFlowWritable);
}
}
// Input: phone number --> PhoneFlowWritable traffic records; Output: phone number --> aggregated result
class PhoneFlowReducer extends Reducer<Text,PhoneFlowWritable,Text,Text> {
Text value = null;
@Override
protected void setup(Reducer<Text, PhoneFlowWritable, Text, Text>.Context context) throws IOException, InterruptedException {
value =new Text();
}
// reduce groups identical keys together and collects their values into one iterator
@Override
protected void reduce(Text key, Iterable<PhoneFlowWritable> values, Reducer<Text, PhoneFlowWritable, Text, Text>.Context context) throws IOException, InterruptedException {
int upFlowNum = 0;
int downFlowNum = 0;
Iterator<PhoneFlowWritable> iterator = values.iterator();
while(iterator.hasNext()){
PhoneFlowWritable phoneFlowWritable = iterator.next();
upFlowNum += phoneFlowWritable.getUpFlow();
downFlowNum += phoneFlowWritable.getDownFlow();
}
StringBuffer sb = new StringBuffer();
sb.append("Phone "+key+" traffic summary: ");
sb.append("upstream traffic: "+upFlowNum);
sb.append(", downstream traffic: "+downFlowNum);
sb.append(", total traffic: "+(upFlowNum + downFlowNum));
value.set(sb.toString());
context.write(key,value);
}
}
public class PhoneFlowDriver {
public static void main(String[] args) throws Exception{
Configuration configuration = new Configuration();
// Use the local file system instead of HDFS
configuration.set("fs.defaultFS","file:///");
// Use local resources (CPU, memory, etc.); the job could also be run on YARN
configuration.set("mapreduce.framework.name","local");
Job job = Job.getInstance(configuration, "PhoneFlowStatistics");
// Mapper settings
job.setMapperClass(PhoneFlowMapper.class);
job.setMapOutputKeyClass(Text.class);
job.setMapOutputValueClass(PhoneFlowWritable.class);
// Reducer settings
job.setReducerClass(PhoneFlowReducer.class);
job.setOutputKeyClass(Text.class);
job.setOutputValueClass(Text.class);
// Input path of the data to be processed, and the output path for the results
FileInputFormat.setInputPaths(job,new Path("mr01/phoneFlow/input"));
// The output folder must not exist beforehand, otherwise the job fails
FileOutputFormat.setOutputPath(job,new Path("mr01/phoneFlow/output"));
// Wait for the job to finish
boolean b = job.waitForCompletion(true);
// Exit the JVM here with an appropriate status code
System.exit(b ? 0:-1);
}
}
The boolean parameter verbose of waitForCompletion: true means the job's progress is printed while it runs; false means the call just waits for the job to finish without printing any log output.
II. Finding the Maximum Temperature
0188010010999992000010100004+70930-008670FM-12+0009ENJA V0202101N002110021019N0025001N1+00101+00031098181ADDAA106004191AY181061AY251061GF108991081061002501999999MA1999999098061MD1510071+9999MW1501REMSYN088AAXX 01004 01001 11325 82104 10010 20003 39806 49818 55007 60041 75085 886// 333 91119;
Requirement: find the maximum temperature of each year.
Notes on the data format (0-based character positions):
1. Characters [15,18] of each line hold the year.
2. Character 87 holds the sign of the temperature.
3. Characters [88,91] hold the temperature; 9999 means the reading is invalid.
4. Character 92 is the quality code; the values 0, 1, 4, 5 and 9 mean the temperature is valid.
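Applying these offsets to the sample record above gives (matching the substring calls in the Mapper below):

line.substring(15, 19) -> "2000"   // the year
line.substring(87, 92) -> "+0010"  // signed temperature, parsed as 10
line.substring(92, 93) -> "1"      // quality code, valid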
Approach
Map side: Key = year, Value = temperature.
Reduce side: what does the reducer receive? key [3243, 3434, 34343, 34343]
Find the maximum of the values in the iterator and write it out to disk.
package com.bigdata.temp;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import java.io.IOException;
class TempMaxMapper extends Mapper<LongWritable, Text,Text, IntWritable>{
Text year = null;
IntWritable temp = null;
@Override
protected void setup(Context context) throws IOException, InterruptedException {
year = new Text();
temp = new IntWritable();
}
@Override
protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
// desired output: year -> temperature
String line = value.toString();
// extract the year
String _year = line.substring(15, 19);
// extract the temperature together with its sign
int _temp = Integer.parseInt(line.substring(87, 92));
String validateCode = line.substring(92, 93);// extract the quality code
if(_temp == 9999 || validateCode.matches("[^01459]")){
return ; // skip invalid records
}
year.set(_year);
temp.set(_temp);
context.write(year,temp);
}
}
class TempMaxReducer extends Reducer<Text, IntWritable,Text, Text>{
Text text = null;
@Override
protected void setup(Context context) throws IOException, InterruptedException {
text = new Text();
}
@Override
protected void reduce(Text key, Iterable<IntWritable> values, Context context) throws IOException, InterruptedException {
// e.g. 1901 [xxx,xxx,xxx]
int maxTemp = Integer.MIN_VALUE; // any concrete starting value could be wrong, so start from the smallest int
for (IntWritable temp:values) {
// keep the larger of the two values
maxTemp = Integer.max(maxTemp,temp.get());
}
text.set("The maximum temperature of this year is "+maxTemp);
context.write(key,text);
}
}
public class TempMaxDriver {
public static void main(String[] args) throws Exception {
Configuration configuration = new Configuration();
// Use the local file system instead of HDFS
configuration.set("fs.defaultFS","file:///");
// Use local resources (CPU, memory, etc.); the job could also be run on YARN
configuration.set("mapreduce.framework.name","local");
Job job = Job.getInstance(configuration, "MaxTemperature");
// specify the map side
job.setMapperClass(TempMaxMapper.class);
// map output: (year, temperature)
job.setMapOutputKeyClass(Text.class);
job.setMapOutputValueClass(IntWritable.class);
// specify the reduce side
job.setReducerClass(TempMaxReducer.class);
// reduce output: (year, summary text)
job.setOutputKeyClass(Text.class);
job.setOutputValueClass(Text.class);
// an absolute path would also work here
FileInputFormat.setInputPaths(job,"../WordCount/mr03/input/");
FileOutputFormat.setOutputPath(job,new Path("../WordCount/mr03/output"));
// true prints the running log to the console, false just runs silently
boolean result = job.waitForCompletion(true);
// a true result means the job succeeded, so exit normally; otherwise exit abnormally
System.exit(result?0:-1);
}
}
III. TopN Case
For each user, find the five movies they rated highest.
1. There is a data format called JSON (from JavaScript)
JSON: JavaScript Object Notation
H5: used to build pages
Java: used to write back-end code
H5 pages and Java back ends have to exchange data, and that data can be sent in several formats, such as XML and JSON.
2. How JSON is written
JSON syntax is derived from JavaScript object-literal syntax:
Data comes in name/value pairs: "name":"laoyan"
Pairs are separated by commas: "name":"laoyan","age":30
Curly braces hold objects: {"name":"laoyan","age":30}
Square brackets hold arrays: [{"name":"laoyan","age":30},{"name":"laoyan2","age":10}]

rating.json
{"movie":"1193","rate":"5","timeStamp":"978300760","uid":"1"}
{"movie":"661","rate":"3","timeStamp":"978302109","uid":"1"}
{"movie":"914","rate":"3","timeStamp":"978301968","uid":"1"}
{"movie":"3408","rate":"4","timeStamp":"978300275","uid":"1"}
{"movie":"2355","rate":"5","timeStamp":"978824291","uid":"1"}
{"movie":"1197","rate":"3","timeStamp":"978302268","uid":"1"}
{"movie":"1287","rate":"5","timeStamp":"978302039","uid":"1"}
{"movie":"2804","rate":"5","timeStamp":"978300719","uid":"1"}
{"movie":"594","rate":"4","timeStamp":"978302268","uid":"1"}
3. Importing the library
Because we want to convert the JSON data into Java objects, we use Jackson.
<!-- import the Jackson libraries -->
<dependency>
<groupId>com.fasterxml.jackson.core</groupId>
<artifactId>jackson-core</artifactId>
<version>2.9.5</version>
</dependency>
<dependency>
<groupId>com.fasterxml.jackson.core</groupId>
<artifactId>jackson-annotations</artifactId>
<version>2.9.5</version>
</dependency>
<dependency>
<groupId>com.fasterxml.jackson.core</groupId>
<artifactId>jackson-databind</artifactId>
<version>2.9.5</version>
</dependency>
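As a quick sanity check that the dependency works, a minimal sketch of parsing one line of rating.json with Jackson might look like this (Rating here is just a hypothetical throw-away POJO, not the Writable used later):

import com.fasterxml.jackson.databind.ObjectMapper;

public class JsonParseDemo {
    // plain POJO whose public fields match the keys in rating.json
    public static class Rating {
        public String movie;
        public String rate;
        public String timeStamp;
        public String uid;
    }

    public static void main(String[] args) throws Exception {
        String json = "{\"movie\":\"1193\",\"rate\":\"5\",\"timeStamp\":\"978300760\",\"uid\":\"1\"}";
        ObjectMapper objectMapper = new ObjectMapper();
        // readValue maps each JSON key onto the field with the same name
        Rating rating = objectMapper.readValue(json, Rating.class);
        System.out.println(rating.movie + " was rated " + rating.rate + " by user " + rating.uid);
    }
}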
A small Maven tip: declare the version number once as a property, then reference it where needed:
<?xml version="1.0" encoding="UTF-8"?>
<project xmlns="http://maven.apache.org/POM/4.0.0"
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
<modelVersion>4.0.0</modelVersion>
<groupId>com.bigdata</groupId>
<artifactId>HadoopDay11</artifactId>
<version>1.0-SNAPSHOT</version>
<!-- Set the packaging type to jar.
Three values are possible here:
jar: a plain Java project, packaged as a jar
war: a web project, packaged as a war
pom: the parent module of a Maven multi-module (parent/child) project
-->
<packaging>jar</packaging>
<properties>
<maven.compiler.source>8</maven.compiler.source>
<maven.compiler.target>8</maven.compiler.target>
<!-- declare a property (variable) -->
<jackson.version>2.9.5</jackson.version>
<hadoop.version>3.3.1</hadoop.version>
</properties>
<dependencies>
<dependency>
<groupId>org.apache.hadoop</groupId>
<artifactId>hadoop-common</artifactId>
<version>${hadoop.version}</version>
</dependency>
<!-- https://mvnrepository.com/artifact/org.apache.hadoop/hadoop-client -->
<dependency>
<groupId>org.apache.hadoop</groupId>
<artifactId>hadoop-client</artifactId>
<version>${hadoop.version}</version>
</dependency>
<!-- https://mvnrepository.com/artifact/org.apache.hadoop/hadoop-hdfs -->
<dependency>
<groupId>org.apache.hadoop</groupId>
<artifactId>hadoop-hdfs</artifactId>
<version>${hadoop.version}</version>
</dependency>
<dependency>
<groupId>ch.qos.logback</groupId>
<artifactId>logback-classic</artifactId>
<version>1.0.6</version>
</dependency>
<!-- import the Jackson libraries -->
<dependency>
<groupId>com.fasterxml.jackson.core</groupId>
<artifactId>jackson-core</artifactId>
<version>${jackson.version}</version>
</dependency>
<dependency>
<groupId>com.fasterxml.jackson.core</groupId>
<artifactId>jackson-annotations</artifactId>
<version>${jackson.version}</version>
</dependency>
<dependency>
<groupId>com.fasterxml.jackson.core</groupId>
<artifactId>jackson-databind</artifactId>
<version>${jackson.version}</version>
</dependency>
</dependencies>
</project>
4. Write an entity class that maps the JSON data
package com.bigdata.day12.topn;
import org.apache.hadoop.io.Writable;
import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;
/**
* @Author laoyan
* @Description TODO
* @Date 2022/8/2 14:26
* @Version 1.0
*/
public class RatingWritable implements Writable {
private String movie;
private int rate;
private String timeStamp;
private String uid;
public RatingWritable() {
}
public RatingWritable(String movie, int rate, String timeStamp, String uid) {
this.movie = movie;
this.rate = rate;
this.timeStamp = timeStamp;
this.uid = uid;
}
public String getMovie() {
return movie;
}
public void setMovie(String movie) {
this.movie = movie;
}
public int getRate() {
return rate;
}
public void setRate(int rate) {
this.rate = rate;
}
public String getTimeStamp() {
return timeStamp;
}
public void setTimeStamp(String timeStamp) {
this.timeStamp = timeStamp;
}
public String getUid() {
return uid;
}
public void setUid(String uid) {
this.uid = uid;
}
@Override
public String toString() {
return "RatingWritable{" +
"movie='" + movie + '\'' +
", rate=" + rate +
", timeStamp='" + timeStamp + '\'' +
", uid='" + uid + '\'' +
'}';
}
@Override
public void write(DataOutput out) throws IOException {
out.writeUTF(movie);
out.writeInt(rate);
out.writeUTF(timeStamp);
out.writeUTF(uid);
}
@Override
public void readFields(DataInput in) throws IOException {
movie = in.readUTF();// ctrl + d
rate = in.readInt();
timeStamp = in.readUTF();
uid = in.readUTF();
}
}


For the TopN job, the entity above is adjusted slightly and renamed MovieWritable, with uid stored as an int so the mapper can emit it as an IntWritable key.
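A plausible sketch of MovieWritable, assuming it otherwise mirrors RatingWritable (the exact class is not reproduced in the text):

import org.apache.hadoop.io.Writable;
import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;

public class MovieWritable implements Writable {
    private String movie;
    private int rate;
    private String timeStamp;
    private int uid;          // int, so the mapper can emit new IntWritable(getUid())

    public MovieWritable() {
    }
    public String getMovie() { return movie; }
    public void setMovie(String movie) { this.movie = movie; }
    public int getRate() { return rate; }
    public void setRate(int rate) { this.rate = rate; }
    public String getTimeStamp() { return timeStamp; }
    public void setTimeStamp(String timeStamp) { this.timeStamp = timeStamp; }
    public int getUid() { return uid; }
    public void setUid(int uid) { this.uid = uid; }

    @Override
    public void write(DataOutput out) throws IOException {
        out.writeUTF(movie);
        out.writeInt(rate);
        out.writeUTF(timeStamp);
        out.writeInt(uid);
    }

    @Override
    public void readFields(DataInput in) throws IOException {
        movie = in.readUTF();
        rate = in.readInt();
        timeStamp = in.readUTF();
        uid = in.readInt();
    }

    @Override
    public String toString() {
        return "MovieWritable{movie='" + movie + "', rate=" + rate + ", uid=" + uid + "}";
    }
}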

The complete code:
package com.bigdata.topN;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import com.fasterxml.jackson.databind.ObjectMapper; // use the Jackson (com.fasterxml) artifacts declared in the pom above
import java.io.IOException;
import java.util.ArrayList;
import java.util.Collections;
import java.util.Comparator;
import java.util.List;
class TopNMapper extends Mapper<LongWritable, Text, IntWritable,MovieWritable>{
@Override
protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
/**
* Read one line, which is one JSON record; parse it and extract the uid, the movie and the rating.
*/
String jsonStr = value.toString();
// How do we parse JSON? With a library such as Jackson, Fastjson or Gson.
ObjectMapper objectMapper = new ObjectMapper();
MovieWritable movie = objectMapper.readValue(jsonStr, MovieWritable.class);
System.out.println("mapper-side record: "+movie);
context.write(new IntWritable(movie.getUid()),movie);
}
}
class TopNReducer extends Reducer<IntWritable,MovieWritable,Text, NullWritable>{
@Override
protected void reduce(IntWritable key, Iterable<MovieWritable> values, Context context) throws IOException, InterruptedException {
// key = uid, values = all the movies this user rated
// sort them and keep the top five
List<MovieWritable> list = new ArrayList<MovieWritable>();
for (MovieWritable movie:values) {
// Do not add 'movie' directly here: Hadoop reuses the same value object, so the list would end up full of duplicates of the last record
// list.add(movie);
MovieWritable rate = new MovieWritable();
rate.setRate(movie.getRate());
rate.setMovie(movie.getMovie());
rate.setTimeStamp(movie.getTimeStamp());
rate.setUid(movie.getUid());
list.add(rate);
}
System.out.println(list);
// sort by rating in descending order
Collections.sort(list, new Comparator<MovieWritable>() {
@Override
public int compare(MovieWritable m1, MovieWritable m2) {
return m2.getRate() - m1.getRate();
}
});
System.out.println(list);
// take the first five (or fewer, if the user rated fewer movies)
int length = Math.min(5,list.size());
// StringBuffer vs StringBuilder: StringBuffer is thread-safe
StringBuffer sb =new StringBuffer("The five favourite movies of user "+key.get()+" are:\n");
for (int i = 0; i < length; i++) {
MovieWritable movie = list.get(i);
sb.append(movie.getMovie()+", score: "+movie.getRate()+"\n");
}
context.write(new Text(sb.toString()),NullWritable.get());
}
}
public class TopNDriver {
public static void main(String[] args) throws Exception{
Configuration configuration = new Configuration();
// Use the local file system instead of HDFS
configuration.set("fs.defaultFS","file:///");
// Use local resources (CPU, memory, etc.); the job could also be run on YARN
configuration.set("mapreduce.framework.name","local");
Job job = Job.getInstance(configuration, "MovieTopN");
// specify the map side
job.setMapperClass(TopNMapper.class);
// map output: (uid, MovieWritable)
job.setMapOutputKeyClass(IntWritable.class);
job.setMapOutputValueClass(MovieWritable.class);
job.setReducerClass(TopNReducer.class);
job.setOutputKeyClass(Text.class);
job.setOutputValueClass(NullWritable.class);
// an absolute path would also work here
FileInputFormat.setInputPaths(job,"../WordCount/mr05/input/");
FileOutputFormat.setOutputPath(job,new Path("../WordCount/mr05/output"));
boolean result = job.waitForCompletion(true);
// a true result means the job succeeded, so exit normally; otherwise exit abnormally
System.exit(result?0:-1);
}
}
Result:

IV. Yarn
1. Concept
The three core pieces of Hadoop: HDFS, MapReduce and Yarn.
Yarn behaves much like an operating system for the cluster.
Yarn is the environment MapReduce runs in; it manages the things a program needs in order to run (memory, CPU, bandwidth and other resources).

Yarn was born inside Hadoop, but it has grown into a general-purpose resource-management platform that engines other than MapReduce can also run on.
2. The components of Yarn


Yarn really consists of two main daemons, plus two runtime concepts. You must be clear about all of them:
1. ResourceManager (the "boss"): exactly one.
It manages the whole Yarn platform and contains the resource scheduler.
2. NodeManager (the supervisor on each machine): many.
It takes orders from the ResourceManager and acts as the housekeeper of its own machine.
3. Container
Each NodeManager hosts one or more containers. A container is a bundle of resources (CPU, memory, disk, ...), conceptually similar to a small virtual machine.
4. AppMaster (the "project manager")
Every submitted MapReduce job gets its own AppMaster, whose main role is to drive the execution of that whole job.
3. How to configure and set up Yarn
In the folder /opt/installs/hadoop/etc/hadoop:
mapred-site.xml
<configuration>
<property>
<name>mapreduce.framework.name</name>
<value>yarn</value>
</property>
</configuration>
This sets yarn as the execution framework for MapReduce.
yarn-site.xml
<!-- run the ResourceManager on the first server -->
<property>
<name>yarn.resourcemanager.hostname</name>
<value>bigdata01</value>
</property>
<!-- configure Yarn's shuffle auxiliary service -->
<property>
<name>yarn.nodemanager.aux-services</name>
<value>mapreduce_shuffle</value>
</property>
Check that the following user settings are present in hadoop-env.sh:
export YARN_RESOURCEMANAGER_USER=root
export YARN_NODEMANAGER_USER=root
Continue configuring: to avoid AppMaster classpath errors, add the following to
yarn-site.xml
<property>
<name>yarn.application.classpath</name>
<value>/opt/installs/hadoop/etc/hadoop:/opt/installs/hadoop/share/hadoop/common/lib/*:/opt/installs/hadoop/share/hadoop/common/*:/opt/installs/hadoop/share/hadoop/hdfs:/opt/installs/hadoop/share/hadoop/hdfs/lib/*:/opt/installs/hadoop/share/hadoop/hdfs/*:/opt/installs/hadoop/share/hadoop/mapreduce/*:/opt/installs/hadoop/share/hadoop/yarn:/opt/installs/hadoop/share/hadoop/yarn/lib/*:/opt/installs/hadoop/share/hadoop/yarn/*</value>
</property>
Getting the value for the classpath (see the command below):
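The value is presumably obtained by running the hadoop classpath command on one of the nodes and pasting its output into the property above:

hadoop classpath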

Distribute mapred-site.xml and yarn-site.xml to the other two machines:
cd /opt/installs/hadoop/etc/hadoop/
xsync.sh mapred-site.xml yarn-site.xml
Starting and stopping the Yarn platform:
Start: start-yarn.sh
Stop: stop-yarn.sh

You can also check it through the web UI.
The address uses the IP of the machine running the ResourceManager:
http://192.168.233.128:8088

4. About the start and stop commands (see the summary below)
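A short recap of the usual commands, assuming a standard Hadoop 3.x installation like the one configured above:

start-dfs.sh / stop-dfs.sh      # start / stop HDFS (NameNode, DataNodes, SecondaryNameNode)
start-yarn.sh / stop-yarn.sh    # start / stop Yarn (ResourceManager, NodeManagers)
start-all.sh / stop-all.sh      # start / stop both HDFS and Yarn
hdfs --daemon start namenode    # start or stop a single HDFS daemon on the current machine
yarn --daemon stop nodemanager  # start or stop a single Yarn daemon on the current machine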

5. Running a WordCount job on Yarn
Upload a wc.txt file to HDFS, then run the computation on the Yarn platform.

Data:
hadoop spark hello hadoop
spark hello flink world
scala python python scala
Run the WordCount example that ships with Hadoop:
hadoop jar /opt/installs/hadoop/share/hadoop/mapreduce/hadoop-mapreduce-examples-3.3.1.jar wordcount /home/wc.txt /home/output

We can also watch the job's progress in the Yarn web UI.

This example runs on Yarn: the input data must be on HDFS and Yarn must be running. Although the jar sits on the local machine, it is uploaded to HDFS as part of job submission.
V. The three run/development modes of a MapReduce job
1. Local mode
The data is local, the code is local, and the MR job runs with your own machine's resources.

Both the input and output paths are local paths, and the resources consumed are local as well.
2. Local mode 2
The data is on HDFS, the code is local, and the MR job still runs with your own machine's resources.
System.setProperty("HADOOP_USER_NAME","root");
Configuration configuration = new Configuration();
configuration.set("fs.defaultFS","hdfs://192.168.32.128:9820");
// Use local resources (CPU, memory, etc.); the job could also be run on YARN
configuration.set("mapreduce.framework.name","local");
In this mode the input and output paths refer to paths on HDFS.
3. Yarn mode
The data is on HDFS and the code runs on Yarn.
System.setProperty("HADOOP_USER_NAME","root");
Configuration configuration = new Configuration();
configuration.set("fs.defaultFS","hdfs://192.168.32.128:9820");
configuration.set("mapreduce.framework.name","yarn");
// enable cross-platform job submission
configuration.set("mapreduce.app-submission.cross-platform", "true");
Case study: run our own WordCount on Yarn.
Modify the code as follows:
package com.bigdata.day12.workcount;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
/**
* @Author laoyan
* @Description TODO
* @Date 2022/8/1 10:02
* @Version 1.0
*/
public class WordCountDriver2 {
public static void main(String[] args) throws Exception {
System.setProperty("HADOOP_USER_NAME","root");
Configuration configuration = new Configuration();
// Use HDFS as the file system (not the local one)
configuration.set("fs.defaultFS","hdfs://192.168.32.128:9820");
// Run the job on the Yarn platform instead of locally
configuration.set("mapreduce.framework.name","yarn");
// To submit from a Windows client while the MapTasks and ReduceTasks run on the Linux cluster, this cross-platform option must be enabled
configuration.set("mapreduce.app-submission.cross-platform", "true");
Job job = Job.getInstance(configuration, "laoyan WordCount on yarn");
// Mapper settings
// do not forget this line
job.setJarByClass(WordCountDriver2.class);
job.setMapperClass(WordCountMapper2.class);
job.setMapOutputKeyClass(Text.class);
job.setMapOutputValueClass(IntWritable.class);
// specify which Partitioner class to use
job.setPartitionerClass(WordCountPartitioner2.class);
// the number of reducers also matters, because each reducer produces one result file
// Reducer settings
job.setReducerClass(WordCountReducer2.class);
job.setOutputKeyClass(Text.class);
job.setOutputValueClass(IntWritable.class);
// Input path of the data to be processed, and the output path for the results
FileInputFormat.setInputPaths(job,new Path(args[0]));
// The output folder must not exist beforehand, otherwise the job fails
FileOutputFormat.setOutputPath(job,new Path(args[1]));
// Wait for the job to finish
boolean b = job.waitForCompletion(true);
// Exit the JVM here with an appropriate status code
System.exit(b ? 0:-1);
}
}
Package the program into a jar:


Upload the built jar to the Linux server and run it:
hadoop jar WC.jar com.bigdata.day12.workcount.WordCountDriver2 /input /oottpp2
Keep the jar name short.
com.bigdata.day12.workcount.WordCountDriver2 is the fully qualified name of the class containing the main method.
/input is the input path on HDFS.
/oottpp2 is the output path on HDFS for the statistics.



If an error appears when browsing the files from the web UI:

remember to add the following to hdfs-site.xml and restart the cluster:
<property>
<name>dfs.webhdfs.enabled</name>
<value>true</value>
</property>
Then add the host mappings to the local Windows hosts file,
i.e. configure the hosts file of the system the browser runs on.
Windows:
Append the mappings between the Hadoop cluster nodes' IPs and hostnames to the end of C:\Windows\System32\drivers\etc\hosts.
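For example (hypothetical addresses: bigdata01 is the hostname used earlier in yarn-site.xml; the other two entries are placeholders for the remaining nodes):

192.168.233.128 bigdata01
192.168.233.129 bigdata02
192.168.233.130 bigdata03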

VI. Exercises
1. Data cleaning
# What data cleaning means
The raw data files that reach a big-data platform usually contain a lot of invalid and incomplete records, so the very first step is to clean them and produce data whose content and format fit the downstream processing.
# Requirement
From the raw phone-traffic data, remove the records whose phone number is "null" and the records that are incomplete.
Data format:
# Source data
id  phone  mac  ip  up_packets  down_packets  up_flow  down_flow  http_status
1363157985066 13726230503 00-FD-07-A4-72-B8:CMCC 120.196.100.82 24 27 2481 24681 200
1363157995052 13826544101 5C-0E-8B-C7-F1-E0:CMCC 120.197.40.4 4 0 264 0 200
1363157991076 13926435656 20-10-7A-28-CC-0A:CMCC 120.196.100.99 2 4 132 1512 200
1363154400022 13926251106 5C-0E-8B-8B-B1-50:CMCC 120.197.40.4 4 0 240 0 200
1363157985066 13726230503 00-FD-07-A4-72-B8:CMCC 120.196.100.82 24 27 2481 24681 200
1363157995052 13826544101 5C-0E-8B-C7-F1-E0:CMCC 120.197.40.4 4 0 264 0 200
1363157991076 13926435656 20-10-7A-28-CC-0A:CMCC 120.196.100.99 2 4 132 1512 200
1363154400022 13926251106 5C-0E-8B-8B-B1-50:CMCC 120.197.40.4 4 0 240 0 200
1363157995052 13826544109 5C-0E-8B-C7-F1-E0:CMCC 120.197.40.4 4 0
1363157995052 null 5C-0E-8B-C7-F1-E0:CMCC 120.197.40.4 4 0 240 0 200
1363157991076 13926435659 20-10-7A-28-CC-0A:CMCC 120.196.100.99 2 4 null null null
# Expected result (remove records whose phone number is invalid or whose upload/download traffic is missing, and keep only the phone number, upload traffic and download traffic)
13726230503 2481 24681
13826544101 264 0
13926435656 132 1512
13926251106 240 0
13726230503 2481 24681
13826544101 264 0
13926435656 132 1512
13926251106 240 0
Coding hints:
# Key point:
The reduce phase can be removed from the MapReduce pipeline entirely; the map output is then written directly to the HDFS output files as the final result.
# How to code it
1. Remove the reducer-related settings from the job: the reducer class and the reducer output key/value types.
2. Manually set the number of reduce tasks to 0:
job.setNumReduceTasks(0);// disable the reducer
package com.bigdata;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import java.io.IOException;
class CleanLogMapper extends Mapper<LongWritable, Text,Text,Text>{
Text text = null;
Text text2 = null;
@Override
protected void setup(Context context) throws IOException, InterruptedException {
text = new Text();
text2 = new Text();
}
@Override
protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
String line = value.toString();
String[] split = line.split("\\s+");
String phone = split[1];
String upFlow = split[split.length-3];
String downFlow = split[split.length-2];
if(phone.equals("null") || upFlow.equals("null") || downFlow.equals("null") || split.length != 9){
return ;
}
text.set(phone);
text2.set(upFlow+" "+downFlow);
context.write(text,text2);
}
}
public class CleanLogDriver {
public static void main(String[] args) throws Exception{
Configuration conf = new Configuration();
conf.set("fs.defaultFS","file:///");
conf.set("mapreduce.framework.name","local");
Job job = Job.getInstance(conf, "CleanLogData");
job.setMapOutputKeyClass(Text.class);
job.setMapOutputValueClass(Text.class);
// Why this line: we did not write a reducer, but a default one would still run, so the number of reduce tasks must be set to 0
job.setNumReduceTasks(0);
job.setMapperClass(CleanLogMapper.class);
// Usually optional, but required when running on Yarn: point it at the driver class
job.setJarByClass(CleanLogDriver.class);
// addInputPath works just as well as the setInputPaths used earlier
FileInputFormat.addInputPath(job,new Path("../WordCount/mr06/input"));
FileOutputFormat.setOutputPath(job,new Path("../WordCount/mr06/output"));
// true: keep printing the running log so the progress can be followed
boolean result = job.waitForCompletion(true);
System.exit(result ? 0 : -1);
}
}
2. Custom partitioning case
Case: student score analysis
# Sort the student scores in descending order within each subject, and output each subject's scores separately.
The data is as follows:
# Custom partition
Partition the data below (科目 values: 语文 = Chinese, 数学 = Math, 英语 = English):
Name  Subject  Score
张三 语文 10
李四 数学 30
王五 语文 20
赵6 英语 40
张三 数学 50
李四 语文 10
张三 英语 70
李四 英语 80
王五 英语 45
王五 数学 10
赵6 数学 10
赵6 语文 100
package com.bigdata.score;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.WritableComparable;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Partitioner;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Collections;
import java.util.List;
class ScorePartitonerMapper extends Mapper<LongWritable, Text, Text,Student>{
Text text = null; // the subject
// First group records of the same subject together, then sort by score and output in descending order
Student student = null;
@Override
protected void setup(Context context) throws IOException, InterruptedException {
// Scores must be sorted in descending order, and the default IntWritable sorts ascending, so a custom type is used
text = new Text();
student = new Student();
}
@Override
protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
// This skips the header line (byte offset 0); to skip the second line as well you would also need to know the length of the first line
if(key.get() == 0){
return ;
}
String line = value.toString();
String[] split = line.split("\\s+");
String subject = split[1];
String name = split[0];
int score = Integer.valueOf(split[2]);
text.set(subject);
student.setName(name);
student.setScore(score);
student.setSubject(subject);
context.write(text,student);
}
}
class ScorePartitoner extends Partitioner<Text,Student>{
@Override
public int getPartition(Text subjectText, Student student, int i) {
String subjectName = subjectText.toString();
switch (subjectName) {
case "语文":
return 0;
case "数学":
return 1;
case "英语":
return 2;
default:
return 3;
}
}
}
class Student implements WritableComparable<Student>{
private String name;
private int score;
private String subject;
public String getName() {
return name;
}
public void setName(String name) {
this.name = name;
}
public int getScore() {
return score;
}
public void setScore(int score) {
this.score = score;
}
public String getSubject() {
return subject;
}
public void setSubject(String subject) {
this.subject = subject;
}
@Override
public int compareTo(Student o) {
return o.getScore() -this.score;
}
@Override
public void write(DataOutput out) throws IOException {
out.writeUTF(name);
out.writeInt(score);
out.writeUTF(subject);
}
@Override
public void readFields(DataInput in) throws IOException {
name = in.readUTF();
score = in.readInt();
subject = in.readUTF();
}
@Override
public String toString() {
return name + " " +subject + " " + score ;
}
}
class ScoreReduer extends Reducer<Text,Student,Text, NullWritable>{
@Override
protected void reduce(Text key, Iterable<Student> values, Context context) throws IOException, InterruptedException {
List<Student> list = new ArrayList<>();
for (Student stu:values) {
Student student = new Student();
student.setSubject(stu.getSubject());
student.setScore(stu.getScore());
student.setName(stu.getName());
list.add(student);
}
Collections.sort(list);
for (Student s:list) {
Text text = new Text(s.toString());
context.write(text,NullWritable.get());
}
}
}
public class ScorePartitionerDriver {
public static void main(String[] args) throws Exception{
Configuration conf = new Configuration();
conf.set("fs.defaultFS","file:///");
conf.set("mapreduce.framework.name","local");
Job job = Job.getInstance(conf, "ScorePartition");
job.setMapOutputKeyClass(Text.class);
job.setMapOutputValueClass(Student.class);
job.setMapperClass(ScorePartitonerMapper.class);
job.setPartitionerClass(ScorePartitoner.class);
job.setReducerClass(ScoreReduer.class);
job.setOutputKeyClass(Text.class);
job.setOutputValueClass(NullWritable.class);
// The partitioner produces three partitions for 语文/数学/英语, so run three reduce tasks (one output file per reducer)
job.setNumReduceTasks(3);
// Usually optional, but required when running on Yarn: point it at the driver class
job.setJarByClass(ScorePartitionerDriver.class);
// addInputPath works just as well as the setInputPaths used earlier
FileInputFormat.addInputPath(job,new Path("../WordCount/mr07/input"));
FileOutputFormat.setOutputPath(job,new Path("../WordCount/mr07/output"));
// true: keep printing the running log so the progress can be followed
boolean result = job.waitForCompletion(true);
System.exit(result ? 0 : -1);
}
}
3. Sorting
Default sorting:
Case: sort the Douyu streamer log data by audience count in ascending order
# Case data
user id  audience count
团团 300
小黑 200
哦吼 400
卢本伟 100
八戒 250
悟空 100
唐僧 100
# Expected result
卢本伟 100
悟空 100
唐僧 100
小黑 200
八戒 250
团团 300
哦吼 400
Hints:
● Default sorting rules
1. The default sort calls compareTo on the mapper's output key to compare records; that defines the sort order.
2. The default order is ascending.
So for this exercise, which field should be the Key and which the Value?
Reference answer:
package com.bigdata.paixu01;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import java.io.IOException;
class PaiXuMapper extends Mapper<LongWritable, Text, IntWritable,Text>{
@Override
protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
String s = value.toString();
String[] split = s.split("\\s+");
int visitNum = Integer.valueOf(split[1]);
String name = split[0];
context.write(new IntWritable(visitNum),new Text(name));
}
}
public class PaiXu01 {
public static void main(String[] args) throws Exception {
Configuration configuration = new Configuration();
// Use the local file system instead of HDFS
configuration.set("fs.defaultFS","file:///");
// Use local resources (CPU, memory, etc.); the job could also be run on YARN
configuration.set("mapreduce.framework.name","local");
Job job = Job.getInstance(configuration, "Sort01");
// specify the map side
job.setMapperClass(PaiXuMapper.class);
// map output: (audience count, name)
job.setMapOutputKeyClass(IntWritable.class);
job.setMapOutputValueClass(Text.class);
// an absolute path would also work here
FileInputFormat.setInputPaths(job,"../WordCount/mr04/input/");
FileOutputFormat.setOutputPath(job,new Path("../WordCount/mr04/output"));
boolean result = job.waitForCompletion(true);
// a true result means the job succeeded, so exit normally; otherwise exit abnormally
System.exit(result?0:-1);
}
}
Case: what about sorting the Douyu streamer log data by audience count in descending order?
# Custom sorting
# Case data
团团 300
小黑 200
哦吼 400
卢本伟 100
八戒 250
悟空 100
唐僧 100
# Expected result
哦吼 400
团团 300
八戒 250
小黑 200
卢本伟 100
悟空 100
唐僧 100
Key code hints:
# 1. Define a custom type for the mapper's output key that implements the WritableComparable interface.
# 2. Implement the compareTo method.
# 3. Fill in the write and readFields serialization methods.
package com.bigdata.paixu01;
import org.apache.hadoop.io.WritableComparable;
import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;
public class IntWritable2 implements WritableComparable<IntWritable2> {
private int value;
public IntWritable2() {}
public IntWritable2(int value) { set(value); }
/** Set the value of this IntWritable. */
public void set(int value) { this.value = value; }
/** Return the value of this IntWritable. */
public int get() { return value; }
// This method is called for you by the framework: wherever sorting happens, compareTo is what gets invoked.
// Basic Java: the same thing happens with sorting utilities such as Collections.sort(list)
@Override
public int compareTo(IntWritable2 o) {
int thisValue = this.value;
int thatValue = o.get();
return (thisValue > thatValue ? -1 : (thisValue==thatValue ? 0 : 1));
}
@Override
public void write(DataOutput out) throws IOException {
out.writeInt(value);
}
@Override
public void readFields(DataInput in) throws IOException {
value = in.readInt();
}
@Override
public String toString() {
return value+"";
}
}
Test code:
package com.bigdata.paixu01;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import java.io.IOException;
class PaiXuMapper extends Mapper<LongWritable, Text, IntWritable,Text>{
@Override
protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
String s = value.toString();
String[] split = s.split("\\s+");
int visitNum = Integer.valueOf(split[1]);
String name = split[0];
context.write(new IntWritable(visitNum),new Text(name));
}
}
class PaiXuMapper2 extends Mapper<LongWritable, Text, IntWritable2,Text>{
@Override
protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
String s = value.toString();
String[] split = s.split("\\s+");
int visitNum = Integer.valueOf(split[1]);
String name = split[0];
context.write(new IntWritable2(visitNum),new Text(name));
}
}
public class PaiXu01 {
public static void main(String[] args) throws Exception {
Configuration configuration = new Configuration();
// Use the local file system instead of HDFS
configuration.set("fs.defaultFS","file:///");
// Use local resources (CPU, memory, etc.); the job could also be run on YARN
configuration.set("mapreduce.framework.name","local");
Job job = Job.getInstance(configuration, "Sort01");
// specify the map side
job.setMapperClass(PaiXuMapper2.class);
// map output: (IntWritable2 audience count, name)
job.setMapOutputKeyClass(IntWritable2.class);
job.setMapOutputValueClass(Text.class);
// an absolute path would also work here
FileInputFormat.setInputPaths(job,"../WordCount/mr04/input/");
FileOutputFormat.setOutputPath(job,new Path("../WordCount/mr04/output2"));
boolean result = job.waitForCompletion(true);
// a true result means the job succeeded, so exit normally; otherwise exit abnormally
System.exit(result?0:-1);
}
}
public class WatcherWritable implements WritableComparable<WatcherWritable> {
private int watcher;
/**
 * Called whenever sorting happens.
 */
public int compareTo(WatcherWritable o) {
// Variant 2: descending
return o.watcher - this.watcher;
// Variant 1: descending
/*
if(this.watcher>o.watcher){
return -1;
}else if (this.watcher == o.watcher){
return 0;
}else{
return 1;
}*/
}
...
// constructor, getters/setters, serialization methods and toString omitted
}
A further refinement:
Our output puts the number first and the name second, which is slightly different from what was asked for.
Idea: the output format of the file is determined by the Key and the Value.
We can make the Key carry all of the output and emit nothing for the Value by using NullWritable in its place.
package com.bigdata.paixu01;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import java.io.IOException;
class PaiXuMapper extends Mapper<LongWritable, Text, IntWritable,Text>{
@Override
protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
String s = value.toString();
String[] split = s.split("\\s+");
int visitNum = Integer.valueOf(split[1]);
String name = split[0];
context.write(new IntWritable(visitNum),new Text(name));
}
}
class PaiXuMapper2 extends Mapper<LongWritable, Text, IntWritable2,NullWritable>{
@Override
protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
String s = value.toString();
String[] split = s.split("\\s+");
int visitNum = Integer.valueOf(split[1]);
String name = split[0];
context.write(new IntWritable2(visitNum,name), NullWritable.get());
}
}
public class PaiXu01 {
public static void main(String[] args) throws Exception {
Configuration configuration = new Configuration();
// Use the local file system instead of HDFS
configuration.set("fs.defaultFS","file:///");
// Use local resources (CPU, memory, etc.); the job could also be run on YARN
configuration.set("mapreduce.framework.name","local");
Job job = Job.getInstance(configuration, "Sort01");
// specify the map side
job.setMapperClass(PaiXuMapper2.class);
// map output: (IntWritable2 key carrying count and name, NullWritable)
job.setMapOutputKeyClass(IntWritable2.class);
job.setMapOutputValueClass(NullWritable.class);
// an absolute path would also work here
FileInputFormat.setInputPaths(job,"../WordCount/mr04/input/");
FileOutputFormat.setOutputPath(job,new Path("../WordCount/mr04/output2"));
boolean result = job.waitForCompletion(true);
// a true result means the job succeeded, so exit normally; otherwise exit abnormally
System.exit(result?0:-1);
}
}
package com.bigdata.paixu01;
import org.apache.hadoop.io.WritableComparable;
import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;
public class IntWritable2 implements WritableComparable<IntWritable2> {
private int value;
private String name;
public IntWritable2() {}
public IntWritable2(int value,String name) { set(value,name); }
/** Set the value of this IntWritable. */
public void set(int value,String name ) {
this.value = value;
this.name = name;
}
/** Return the value of this IntWritable. */
public int get() { return value; }
// This method is called for you by the framework: wherever sorting happens, compareTo is what gets invoked.
// Basic Java: the same thing happens with sorting utilities such as Collections.sort(list)
@Override
public int compareTo(IntWritable2 o) {
int thisValue = this.value;
int thatValue = o.get();
return (thisValue > thatValue ? -1 : (thisValue==thatValue ? 0 : 1));
}
@Override
public void write(DataOutput out) throws IOException {
out.writeInt(value);
out.writeUTF(name);
}
@Override
public void readFields(DataInput in) throws IOException {
value = in.readInt();
name = in.readUTF();
}
@Override
public String toString() {
return name+" "+value;
}
}
Case: sort the streamer data by audience count in descending order; when audience counts are equal, sort by streaming duration in descending order
# Case data
user id  audience count  streaming duration
团团 300 1000
小黑 200 2000
哦吼 400 7000
卢本伟 100 6000
八戒 250 5000
悟空 100 4000
唐僧 100 3000
# Expected result
哦吼 400 7000
团团 300 1000
八戒 250 5000
小黑 200 2000
卢本伟 100 6000
悟空 100 4000
唐僧 100 3000
Key code:
public class PlayWritable implements WritableComparable<PlayWritable> {
private int viewer;
private int length;
/**
 * Sort by viewer in descending order; when viewer is equal, sort by length in descending order.
 * @param o
 * @return
 */
public int compareTo(PlayWritable o) {
if(this.viewer != o.viewer){
return o.viewer - this.viewer;
}else{
return o.length - this.length;
}
}
// constructor, getters/setters, serialization methods and toString omitted
...
}
Rules of thumb:
1) Sorting is always done on the Key.
2) A Mapper can be used on its own; if there is only a Mapper, set the number of reducers to 0.
3) Whether it comes from the mapper or the reducer, each output line is the Key's output followed by the Value's output, each formatted by that class's toString method. If you only want to output the key, make the value NullWritable; if you only want to output the value, make the key NullWritable.
4) If you use a partitioner, its type parameters must match the mapper's output types.
Example code:
package com.bigdata.paixu01;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
import java.io.IOException;
class PaiXuMapper3 extends Mapper<LongWritable, Text, IntWritable2,NullWritable>{
@Override
protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
String s = value.toString();
String[] split = s.split("\\s+");
int visitNum = Integer.valueOf(split[1]);
int shiChang = Integer.valueOf(split[2]);
String name = split[0];
context.write(new IntWritable2(visitNum,name,shiChang), NullWritable.get());
}
}
package com.bigdata.paixu01;
import org.apache.hadoop.io.WritableComparable;
import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;
public class IntWritable2 implements WritableComparable<IntWritable2> {
private int value;
private String name;
private int shiChang;
public IntWritable2() {}
public IntWritable2(int value,String name) { set(value,name); }
public IntWritable2(int value,String name,int shiChang) { set(value,name,shiChang); }
// method overloading
public void set(int value) {
this.value = value;
}
/** Set the value of this IntWritable. */
public void set(int value,String name ) {
this.value = value;
this.name = name;
}
public void set(int value,String name,int shiChang ) {
this.value = value;
this.name = name;
this.shiChang = shiChang;
}
/** Return the value of this IntWritable. */
public int get() { return value; }
// This method is called for you by the framework: wherever sorting happens, compareTo is what gets invoked.
// Basic Java: the same thing happens with sorting utilities such as Collections.sort(list)
@Override
public int compareTo(IntWritable2 o) {
int thisValue = this.value;
int thatValue = o.get();
if(thisValue != thatValue){
return thatValue - thisValue;
}else {
return o.shiChang - this.shiChang;
}
}
@Override
public void write(DataOutput out) throws IOException {
out.writeInt(value);
out.writeUTF(name);
out.writeInt(shiChang);
}
@Override
public void readFields(DataInput in) throws IOException {
value = in.readInt();
name = in.readUTF();
shiChang = in.readInt();
}
@Override
public String toString() {
return name+" "+value+" "+shiChang;
}
}
Driver:
public class PaiXu01 {
public static void main(String[] args) throws Exception {
Configuration configuration = new Configuration();
// Use the local file system instead of HDFS
configuration.set("fs.defaultFS","file:///");
// Use local resources (CPU, memory, etc.); the job could also be run on YARN
configuration.set("mapreduce.framework.name","local");
Job job = Job.getInstance(configuration, "Sort01");
// specify the map side
job.setMapperClass(PaiXuMapper3.class);
// map output: (IntWritable2 key carrying count, name and duration, NullWritable)
job.setMapOutputKeyClass(IntWritable2.class);
job.setMapOutputValueClass(NullWritable.class);
job.setNumReduceTasks(0);
// an absolute path would also work here
FileInputFormat.setInputPaths(job,"../WordCount/mr08/input/");
FileOutputFormat.setOutputPath(job,new Path("../WordCount/mr08/output3"));
boolean result = job.waitForCompletion(true);
// a true result means the job succeeded, so exit normally; otherwise exit abnormally
System.exit(result?0:-1);
}
}
VII. How to pass arguments to the main method
Learning driven by questions:
Once you join a company, how do you keep learning on your own? Learn whatever technology your company is actually using; don't chase a huge new technology you don't need yet.


package com.bigdata;
public class TestMain {
/**
* args is a String array; who can assign values to it?
* @param args
*/
public static void main(String[] args) {
System.out.println("Printing arguments - start");
for (String str:args) {
System.out.println(str);
}
System.out.println("Printing arguments - end");
}
}
Question: if this code is not running inside IDEA, how do you pass the arguments?
Any Java code can be packaged into a jar. How do you run a class that sits inside a jar?
java -cp xxxx.jar <fully-qualified class name> <arguments, if any, appended at the end>
java -cp hello.jar com.bigdata.TestMain 10 20 30
(With java -jar the main class is taken from the jar's manifest, so only the arguments follow the jar name.)
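Assuming the jar was built from the class above, running that command should print something like:

Printing arguments - start
10
20
30
Printing arguments - end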