| Java primitive | Writable class | Serialized size (bytes) |
| --- | --- | --- |
| boolean | BooleanWritable | 1 |
| byte | ByteWritable | 1 |
| int | IntWritable | 4 |
| int | VIntWritable | 1–5 |
| float | FloatWritable | 4 |
| long | LongWritable | 8 |
| long | VLongWritable | 1–9 |
| double | DoubleWritable | 8 |
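The variable-length types pay off when most values are small: a VIntWritable holding 1 takes a single byte, while an IntWritable always takes four. A quick way to check the sizes in the table is to serialize a Writable into a byte buffer and count the bytes. This is only a minimal sketch assuming the Hadoop client jars are on the classpath; the class and method names are illustrative, not part of the job below.

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.VIntWritable;
import org.apache.hadoop.io.Writable;

import java.io.ByteArrayOutputStream;
import java.io.DataOutputStream;
import java.io.IOException;

public class WritableSizeDemo {
    // Serialize a Writable and return how many bytes it produced
    private static int serializedSize(Writable w) throws IOException {
        ByteArrayOutputStream bytes = new ByteArrayOutputStream();
        w.write(new DataOutputStream(bytes));
        return bytes.size();
    }

    public static void main(String[] args) throws IOException {
        System.out.println(serializedSize(new IntWritable(1)));                  // 4 bytes, always
        System.out.println(serializedSize(new VIntWritable(1)));                 // 1 byte for small values
        System.out.println(serializedSize(new VIntWritable(Integer.MAX_VALUE))); // up to 5 bytes
    }
}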
[root@master ~]# rz -E
rz waiting to receive.
[root@master ~]# ls
ac.sh words.txt 文档
anaconda-ks.cfg 公共 下载
dump.rdb 模板 音乐
initial-setup-ks.cfg 视频 桌面
mysql57-community-release-el7-10.noarch.rpm 图片
[root@master ~]# cat words.txt
java,c,c++,sql,scale,python
mysql,springboot,redis
hadoop,hive,hbase,spark,flink
kafka,sqoop,flume,datax,kettle,flink
phoenix
mysql,springboot,redis
hadoop,hive,hbase,spark,flink
kafka,sqoop,flume,datax,kettle,flink
phoenix
mysql,springboot,redis
hadoop,hive,hbase,spark,flink
kafka,sqoop,flume,datax,kettle,flink
phoenix
mysql,springboot,redis
hadoop,hive,hbase,spark,flink
kafka,sqoop,flume,datax,kettle,flink
phoenix
[root@master ~]# mv words.txt /usr/local/soft/data/
[root@master ~]# cd /usr/local/soft/data/
[root@master data]# ls
new_db.sql student.sql theZen.txt words.txt
score.sql theZenOfPython.txt wordcount
[root@master data]# cat words.txt
java,c,c++,sql,scale,python
mysql,springboot,redis
hadoop,hive,hbase,spark,flink
kafka,sqoop,flume,datax,kettle,flink
phoenix
mysql,springboot,redis
hadoop,hive,hbase,spark,flink
kafka,sqoop,flume,datax,kettle,flink
phoenix
mysql,springboot,redis
hadoop,hive,hbase,spark,flink
kafka,sqoop,flume,datax,kettle,flink
phoenix
mysql,springboot,redis
hadoop,hive,hbase,spark,flink
kafka,sqoop,flume,datax,kettle,flink
[root@master data]# hdfs dfs -put words.txt /data/wc/input/
package com.shujia.MapReduce;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import java.io.IOException;
public class Demo01WordCount {
    // Map task
    /**
     * Extend Mapper, specify the input K-V types and the output K-V types,
     * then override the map method.
     */
    public static class MyMapper extends Mapper<LongWritable, Text, Text, IntWritable> {
        /**
         * key     the byte offset of the current line within the file
         * value   one line of data
         * context the context the MR job runs in; it gives access to the current
         *         configuration and other runtime information, and is mainly used
         *         to emit the K-V pairs built on the map side
         */
        @Override
        protected void map(LongWritable key, Text value, Mapper<LongWritable, Text, Text, IntWritable>.Context context) throws IOException, InterruptedException {
            // Custom map logic:
            // split the line on the delimiter to get each word
            String[] splits = value.toString().split(",");
            for (String word : splits) {
                // Use the word as the key and 1 as the value to build the output K-V,
                // then emit it through the context.
                /**
                 * java,c,c++,sql,scale,python
                 * emits: java,1
                 *        c,1
                 *        c++,1
                 *        sql,1
                 *        scale,1
                 *        python,1
                 */
                Text outPutKey = new Text(word);
                IntWritable outputValue = new IntWritable(1);
                context.write(outPutKey, outputValue);
            }
        }
    }
    // Reduce task
    /**
     * Extend Reducer, specify the input K-V types (they must match the map-side
     * output K-V types) and the output K-V types, then override the reduce method.
     */
    public static class MyReducer extends Reducer<Text, IntWritable, Text, IntWritable> {
        /**
         * key     after the shuffle phase, records with the same key go to the same reduce
         * values  the "collection" of values that share the same key
         * context mainly used to write the output to HDFS
         */
        @Override
        protected void reduce(Text key, Iterable<IntWritable> values, Reducer<Text, IntWritable, Text, IntWritable>.Context context) throws IOException, InterruptedException {
            int sum = 0; // holds the final count for this key
            for (IntWritable value : values) {
                sum += value.get();
            }
            // Write out the final result
            context.write(key, new IntWritable(sum));
        }
    }
    // Driver program: configures and submits the job
    public static void main(String[] args) throws IOException, InterruptedException, ClassNotFoundException {
        Configuration conf = new Configuration();
        conf.set("fs.defaultFS", "hdfs://master:9000");
        // Create a MapReduce job
        Job job = Job.getInstance(conf);
        // Configure the job
        job.setJobName("Demo01WordCount");
        // Set the class the job runs
        job.setJarByClass(Demo01WordCount.class);
        // Set the number of reduces (default is 1); the number of output files equals the number of reduces
        job.setNumReduceTasks(3);

        // Configure the map side
        // Specify the class that runs as the mapper
        job.setMapperClass(MyMapper.class);
        // Map output key type
        job.setMapOutputKeyClass(Text.class);
        // Map output value type
        job.setMapOutputValueClass(IntWritable.class);

        // Configure the reduce side
        // Specify the class that runs as the reducer
        job.setReducerClass(MyReducer.class);
        // Reduce output key type
        job.setOutputKeyClass(Text.class);
        // Reduce output value type
        job.setOutputValueClass(IntWritable.class);

        // Configure the input and output paths
        /**
         * hdfs dfs -mkdir /data/wc/input
         * hdfs dfs -put words.txt /data/wc/input
         */
        FileInputFormat.addInputPath(job, new Path("/data/wc/input"));
        Path path = new Path("/data/wc/output");
        FileSystem fs = FileSystem.get(conf);
        // If the output path already exists, delete it first;
        // otherwise the job fails because the output path exists
        if (fs.exists(path)) {
            fs.delete(path, true);
        }
        FileOutputFormat.setOutputPath(job, path);
        // Wait for the job to finish
        job.waitForCompletion(true);
    }
    /**
     * 1. Upload words.txt to the VM, then put it into HDFS:
     *    hdfs dfs -mkdir /data/wc/input
     *    hdfs dfs -put words.txt /data/wc/input
     * 2. Package the code into a jar with Maven (package) and upload it to the VM.
     * 3. Submit the job:
     *    hadoop jar hadoop-1.0-SNAPSHOT.jar com.shujia.MapReduce.Demo01WordCount
     *    View the logs: an applicationId is generated when the job runs
     *    yarn logs -applicationId application_1647858149677_0004
     *    You can also use the historyserver: because the tasks actually run on the NodeManagers,
     *    the logs are scattered; the historyserver collects them from the NodeManagers onto the
     *    master so they are easier to view.
     *    Start the historyserver (on the master only):
     *    mr-jobhistory-daemon.sh start historyserver
     *    http://master:19888
     */
}
[root@master data]# cd ..
[root@master soft]# ls
0?? hadoop-2.7.6 redis shell01 zookeeper-3.4.6
A?? jdk1.8.0_171 redis-6.2.6 show
data packages shell test.txt
[root@master soft]# mkdir jars
[root@master soft]# mv data/hadoop-1.0-SNAPSHOT.jar ./jars/
[root@master soft]# ls
0?? hadoop-2.7.6 packages shell test.txt
A?? jars redis shell01 zookeeper-3.4.6
data jdk1.8.0_171 redis-6.2.6 show
[root@master soft]# cd jars/
[root@master jars]# ls
hadoop-1.0-SNAPSHOT.jar
[root@master jars]# pwd
/usr/local/soft/jars
[root@master jars]# hadoop jar hadoop-1.0-SNAPSHOT.jar com.shujia.MapReduce.Demo01WordCount
[root@master jars]# hdfs dfs -cat /data/wc/output/part-r-00000
c++ 1
datax 4
hbase 4
hive 4
kettle 4
mysql 4
spark 4
package com.shujia;

public class HashCode {
    // Checks how the words printed from part-r-00000 would be spread across
    // 3 reducers: hash each key and take the remainder modulo the number of reduce tasks.
    public static void main(String[] args) {
        String k1 = "spark";
        String k2 = "c++";
        String k3 = "datax";
        String k4 = "hbase";
        String k5 = "hive";
        String k6 = "kettle";
        String k7 = "mysql";
        int numReduceTasks = 3;
        System.out.println(k1.hashCode() % numReduceTasks);
        System.out.println(k2.hashCode() % numReduceTasks);
        System.out.println(k3.hashCode() % numReduceTasks);
        System.out.println(k4.hashCode() % numReduceTasks);
        System.out.println(k5.hashCode() % numReduceTasks);
        System.out.println(k6.hashCode() % numReduceTasks);
        System.out.println(k7.hashCode() % numReduceTasks);
    }
}
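One caveat about the snippet above: it hashes Java Strings, but the map output keys in the job are Text objects, and Hadoop's default HashPartitioner computes the partition as (key.hashCode() & Integer.MAX_VALUE) % numReduceTasks on that Text key. Text hashes its UTF-8 bytes, so the value differs from String.hashCode(), and the partition numbers printed by HashCode may not match what the job actually did. Below is a minimal sketch closer to what the framework does; the class name TextPartitionDemo is only illustrative.

package com.shujia;

import org.apache.hadoop.io.Text;

public class TextPartitionDemo {
    public static void main(String[] args) {
        int numReduceTasks = 3;
        String[] words = {"spark", "c++", "datax", "hbase", "hive", "kettle", "mysql"};
        for (String word : words) {
            // Same formula as the default HashPartitioner, applied to the Text key
            int partition = (new Text(word).hashCode() & Integer.MAX_VALUE) % numReduceTasks;
            System.out.println(word + " -> partition " + partition);
        }
    }
}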
[root@master ~]# mr-jobhistory-daemon.sh start historyserver
starting historyserver, logging to /usr/local/soft/hadoop-2.7.6/logs/mapred-root-historyserver-master.out
[root@master ~]# yarn logs -applicationId application_1647858149677_0004