HDFS Installation (Pseudo-Distributed Cluster)
- Prepare one virtual machine
OS: CentOS 7
- Configure the network
# If you need a second NIC, add the NIC in the VM hardware settings first
vi /etc/sysconfig/network-scripts/ifcfg-ens33
1. Change the IP assignment method to static: BOOTPROTO=static
2. Add IPADDR=<static IP address>
3. Add NETMASK=255.255.255.0
4. Bring the NIC up at boot: ONBOOT=yes
- Disable the firewall
[root@bogon ~]# systemctl stop firewalld
[root@bogon ~]# systemctl disable firewalld
- Set the hostname
[root@bogon ~]# vi /etc/hostname
HadoopNode00
- Configure the hosts mapping
[root@HadoopNode00 ~]# vi /etc/hosts
127.0.0.1 localhost localhost.localdomain localhost4 localhost4.localdomain4
::1 localhost localhost.localdomain localhost6 localhost6.localdomain6
192.168.191.100 HadoopNode00
[root@HadoopNode00 ~]# ping HadoopNode00
- Passwordless SSH login
[root@HadoopNode00 ~]# ssh-keygen -t rsa # generate a key pair
[root@HadoopNode00 ~]# ssh-copy-id HadoopNode00
- Install JDK 8
[root@HadoopNode00 ~]# rpm -ivh jdk-8u171-linux-x64.rpm
Preparing... ################################# [100%]
Updating / installing...
1:jdk1.8-2000:1.8.0_171-fcs ################################# [100%]
Unpacking JAR files...
tools.jar...
plugin.jar...
javaws.jar...
deploy.jar...
rt.jar...
jsse.jar...
charsets.jar...
localedata.jar...
- Extract Hadoop
[root@HadoopNode00 ~]# tar -zxf hadoop-2.9.2.tar.gz -C /usr
- Edit the HDFS configuration files
[root@HadoopNode00 hadoop-2.9.2]# vim /usr/hadoop-2.9.2/etc/hadoop/core-site.xml
<!-- NameNode access endpoint -->
<property>
<name>fs.defaultFS</name>
<value>hdfs://HadoopNode00:9000</value>
</property>
<!-- Base working directory for HDFS data -->
<property>
<name>hadoop.tmp.dir</name>
<value>/usr/hadoop-2.9.2/hadoop-${user.name}</value>
</property>
[root@HadoopNode00 hadoop-2.9.2]# vim /usr/hadoop-2.9.2/etc/hadoop/hdfs-site.xml
<!-- Block replication factor -->
<property>
<name>dfs.replication</name>
<value>1</value>
</property>
<!-- Host that runs the Secondary NameNode -->
<property>
<name>dfs.namenode.secondary.http-address</name>
<value>HadoopNode00:50090</value>
</property>
<!-- Maximum number of concurrent file transfers on a DataNode ("xcievers" is the property's historical spelling) -->
<property>
<name>dfs.datanode.max.xcievers</name>
<value>4096</value>
</property>
<!-- DataNode handler (server thread) count -->
<property>
<name>dfs.datanode.handler.count</name>
<value>6</value>
</property>
[root@HadoopNode00 hadoop-2.9.2]# vim /usr/hadoop-2.9.2/etc/hadoop/slaves
HadoopNode00
- Configure the JDK and Hadoop environment variables
[root@HadoopNode00 hadoop-2.9.2]# vim /root/.bashrc
export JAVA_HOME=/usr/java/latest # the Oracle JDK RPM installs under /usr/java
export HADOOP_HOME=/usr/hadoop-2.9.2
export PATH=$PATH:$HADOOP_HOME/bin:$HADOOP_HOME/sbin
[root@HadoopNode00 hadoop-2.9.2]# source /root/.bashrc
- Format the NameNode
[root@HadoopNode00 ~]# hdfs namenode -format
- Start / stop
start-dfs.sh # start HDFS
stop-dfs.sh # stop HDFS
- Web UI
http://<hostname>:50070
HDFS Shell
- Upload a file
[root@HadoopNode00 ~]# hadoop fs -put /root/install.log /1.txt
- List files
[root@HadoopNode00 ~]# hadoop fs -ls /
Found 1 items
-rw-r--r-- 1 root supergroup 8901 2019-09-17 23:28 /1.txt
- Download a file
[root@HadoopNode00 ~]# hadoop fs -get /1.txt /root/user.txt
- Delete a file
[root@HadoopNode00 ~]# hadoop fs -rm /2.txt
- View a file
[root@HadoopNode00 ~]# hadoop fs -cat /1.txt
- Create a directory
[root@HadoopNode00 ~]# hadoop fs -mkdir /user
- Copy a file
[root@HadoopNode00 ~]# hadoop fs -cp /1.txt /user/
- Enable the trash mechanism
core-site.xml
<property>
<name>fs.trash.interval</name>
<!-- retention time in minutes; 0 disables the trash -->
<value>1</value>
</property>
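With the trash enabled, hadoop fs -rm moves files into /user/<name>/.Trash instead of deleting them outright. The same mechanism is reachable from the Java client; a minimal sketch (an addition to these notes, path names illustrative):
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.fs.Trash;
public class TrashDemo {
public static void main(String[] args) throws Exception {
Configuration conf = new Configuration();
conf.addResource("core-site.xml"); // picks up fs.trash.interval
FileSystem fs = FileSystem.get(conf);
// moves /1.txt into the current user's .Trash instead of deleting it
Trash.moveToAppropriateTrash(fs, new Path("/1.txt"), conf);
fs.close();
}
}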
Java API
Dependencies
<!-- https://mvnrepository.com/artifact/org.apache.hadoop/hadoop-common -->
<dependency>
<groupId>org.apache.hadoop</groupId>
<artifactId>hadoop-common</artifactId>
<version>2.9.2</version>
</dependency>
<!-- https://mvnrepository.com/artifact/org.apache.hadoop/hadoop-hdfs -->
<dependency>
<groupId>org.apache.hadoop</groupId>
<artifactId>hadoop-hdfs</artifactId>
<version>2.9.2</version>
</dependency>
Configuring the Hadoop Environment on Windows
- Extract Hadoop to a directory
- Copy hadoop.dll and winutils.exe into the hadoop/bin directory
- Set the Hadoop environment variables (HADOOP_HOME, and add %HADOOP_HOME%\bin to PATH)
- Map the hostname to its IP (C:\Windows\System32\drivers\etc\hosts)
Fixing insufficient-permission errors
- Pass the user as a JVM option:
-DHADOOP_USER_NAME=root
- Set it in code:
System.setProperty("HADOOP_USER_NAME", "root");
- Disable HDFS permission checks in hdfs-site.xml:
<property>
<name>dfs.permissions.enabled</name>
<value>false</value>
</property>
Common operations
public class App {
private Configuration configuration;
private FileSystem fileSystem;
@Before
public void getClient() throws Exception {
System.setProperty("HADOOP_USER_NAME", "root");
/*
* Prepare the configuration object
* */
configuration = new Configuration();
/*
* Add the corresponding configuration files (loaded from the classpath) */
configuration.addResource("core-site.xml");
configuration.addResource("hdfs-site.xml");
/*
* Obtain the client object via FileSystem.newInstance */
fileSystem = FileSystem.newInstance(configuration);
}
@Test
public void testUpload01() throws Exception {
/*
* copyFromLocalFile(source Path, destination Path)
* */
fileSystem.copyFromLocalFile(new Path("G:\\A.docx"), new Path("/user/2.docx"));
}
@Test
public void testUpload02() throws Exception {
/*
* Prepare the local input stream
* */
FileInputStream inputStream = new FileInputStream("G:\\A.docx");
/*
* Prepare the HDFS output stream
* */
FSDataOutputStream outputStream = fileSystem.create(new Path("/user/3.docx"));
/*
* Copy with the utility class (the final argument closes both streams)
* */
IOUtils.copyBytes(inputStream, outputStream, 1024, true);
}
@Test
public void testDownload01() throws Exception {
fileSystem.copyToLocalFile(false, new Path("/1.txt"), new Path("G:\\3.txt"), true); // delSrc=false; useRawLocalFileSystem=true avoids .crc checksum files
}
@Test
public void testDownload02() throws Exception {
FileOutputStream outputStream = new FileOutputStream("G:\\4.txt");
FSDataInputStream inputStream = fileSystem.open(new Path("/1.txt"));
IOUtils.copyBytes(inputStream, outputStream, 1024, true);
}
@Test
public void test011() throws IOException {
RemoteIterator<LocatedFileStatus> list = fileSystem.listFiles(new Path("/"), true);
while (list.hasNext()) {
LocatedFileStatus locatedFileStatus = list.next();
Path path = locatedFileStatus.getPath();
System.out.println(path.toString());
}
}
@Test
public void test02() throws Exception{
fileSystem.delete(new Path("/user"),false); // second argument recursive=false: fails if the directory is not empty
}
@Test
public void test03() throws Exception{
boolean exists = fileSystem.exists(new Path("/1.txt"));
if (exists){
System.out.println("file exists");
}else {
System.out.println("file does not exist");
}
}
@Test
public void test04() throws Exception{
fileSystem.mkdirs(new Path("/user1"));
}
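// A small addition (not in the original notes): close the FileSystem client
// after each test so the connection is released. Requires org.junit.After.
@After
public void closeClient() throws Exception {
if (fileSystem != null) {
fileSystem.close();
}
}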
}
YARN Environment Setup
Configure YARN
etc/hadoop/yarn-site.xml
<property>
<name>yarn.nodemanager.aux-services</name>
<value>mapreduce_shuffle</value>
</property>
<property>
<description>The hostname of the RM.</description>
<name>yarn.resourcemanager.hostname</name>
<value>HadoopNode00</value>
</property>
etc/hadoop/mapred-site.xml
etc/hadoop/ does not contain this file by default, but it does contain mapred-site.xml.template; rename (or copy) that file to mapred-site.xml
<property>
<name>mapreduce.framework.name</name>
<value>yarn</value>
</property>
Start YARN
[root@HadoopNode00 ~]# start-yarn.sh
MR
Dependencies
<dependency>
<groupId>org.apache.hadoop</groupId>
<artifactId>hadoop-common</artifactId>
<version>2.9.2</version>
</dependency>
<dependency>
<groupId>org.apache.hadoop</groupId>
<artifactId>hadoop-hdfs</artifactId>
<version>2.9.2</version>
</dependency>
<dependency>
<groupId>org.apache.hadoop</groupId>
<artifactId>hadoop-mapreduce-client-jobclient</artifactId>
<version>2.9.2</version>
</dependency>
<dependency>
<groupId>org.apache.hadoop</groupId>
<artifactId>hadoop-mapreduce-client-core</artifactId>
<version>2.9.2</version>
</dependency>
Mapper
/*
* keyIn LongWritable (Long) byte offset of the line in the input file
* valueIn Text (String) the line of text
* keyOut Text (String)
* valueOut IntWritable (Int)
* */
public class WCMapper extends Mapper<LongWritable, Text,Text, IntWritable> {
@Override
protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
String[] names = value.toString().split(" ");
for (String name : names) {
context.write(new Text(name),new IntWritable(1));
}
}
}
Reducer
/*
*keyIn Text matches the mapper's keyOut type
*valueIn IntWritable matches the mapper's valueOut type
* KeyOut
* valueOut
* */
public class WCReducer extends Reducer<Text, IntWritable, Text, IntWritable> {
@Override
protected void reduce(Text key, Iterable<IntWritable> values, Context context) throws IOException, InterruptedException {
int sum = 0;
for (IntWritable value : values) {
sum += value.get();
}
context.write(key, new IntWritable(sum));
}
}
Job
public class JobRunner {
public static void main(String[] args) throws Exception {
/*
* Get the configuration object
* */
Configuration conf = new Configuration();
/*
* Get the Job object
* */
Job job = Job.getInstance(conf);
/*
* Set the input/output format components
* */
job.setInputFormatClass(TextInputFormat.class);
job.setOutputFormatClass(TextOutputFormat.class);
/*
* Set the input/output paths
* */
TextInputFormat.setInputPaths(job, new Path("/wordcount.txt"));
/*
* Note: the output path must not already exist (see the sketch after this class)
* */
TextOutputFormat.setOutputPath(job, new Path("/test/out1"));
/*
* Set the map and reduce logic
* */
job.setMapperClass(WCMapper.class);
job.setReducerClass(WCReducer.class);
/*
* Set the output key/value types for the map and reduce tasks
* */
job.setMapOutputKeyClass(Text.class);
job.setMapOutputValueClass(IntWritable.class);
job.setOutputKeyClass(Text.class);
job.setOutputValueClass(IntWritable.class);
// Submit and wait for completion
//job.submit();
job.waitForCompletion(true);
}
}
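Because the job fails when the output path already exists, it is common to delete it before submitting; a minimal sketch (an addition to these notes, reusing conf from above; requires org.apache.hadoop.fs.FileSystem):
// Delete the output path up front so re-runs don't fail (use with care)
FileSystem fs = FileSystem.get(conf);
Path out = new Path("/test/out1");
if (fs.exists(out)) {
fs.delete(out, true); // true = recursive
}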
Deployment and Execution
Remote jar deployment
job.setJarByClass(JobRunner.class);
Package the jar
Run: hadoop jar <jar name> <main class>
[root@HadoopNode00 ~]# hadoop jar Hadoop_Test-1.0-SNAPSHOT.jar com.zzj.mr.test01.JobRunner
Local simulation
<dependency>
<groupId>log4j</groupId>
<artifactId>log4j</artifactId>
<version>1.2.17</version>
</dependency>
log4j.properties
### root logger ###
log4j.rootLogger = info,console
### console appender ###
log4j.appender.console = org.apache.log4j.ConsoleAppender
log4j.appender.console.Target = System.out
log4j.appender.console.layout = org.apache.log4j.PatternLayout
log4j.appender.console.layout.ConversionPattern = %p %d{yyyy-MM-dd HH:mm:ss} %c %m%n
Cross-platform submission
Copy the relevant configuration files into the resources directory:
- core-site.xml
- hdfs-site.xml
- yarn-site.xml
- mapred-site.xml
System.setProperty("HADOOP_USER_NAME", "root");
conf.addResource("conf2/core-site.xml");
conf.addResource("conf2/hdfs-site.xml");
conf.addResource("conf2/mapred-site.xml");
conf.addResource("conf2/yarn-site.xml");
conf.set(MRJobConfig.JAR, "G:\\IDEA_WorkSpace\\BigData\\Hadoop_Test\\target\\Hadoop_Test-1.0-SNAPSHOT.jar");
Configure mapred-site.xml:
<property>
<name>mapreduce.app-submission.cross-platform</name>
<value>true</value>
</property>
Or set it in code:
conf.set("mapreduce.app-submission.cross-platform", "true");
Custom Bean Object (Writable)
import org.apache.hadoop.io.Writable;
import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;
public class FlowBean implements Writable {
private String phone;
private Long upFlow;
private Long downFlow;
private Long sumFlow;
public FlowBean() {
}
public FlowBean(String phone, Long upFlow, Long downFlow, Long sumFlow) {
this.phone = phone;
this.upFlow = upFlow;
this.downFlow = downFlow;
this.sumFlow = sumFlow;
}
public String getPhone() {
return phone;
}
public void setPhone(String phone) {
this.phone = phone;
}
public Long getUpFlow() {
return upFlow;
}
public void setUpFlow(Long upFlow) {
this.upFlow = upFlow;
}
public Long getDownFlow() {
return downFlow;
}
public void setDownFlow(Long downFlow) {
this.downFlow = downFlow;
}
public Long getSumFlow() {
return sumFlow;
}
public void setSumFlow(Long sumFlow) {
this.sumFlow = sumFlow;
}
@Override
public String toString() {
return "" +
"phone='" + phone + '\'' +
" upFlow=" + upFlow +
" downFlow=" + downFlow +
" sumFlow=" + sumFlow ;
}
/*
* Serialization: encode
* */
public void write(DataOutput dataOutput) throws IOException {
dataOutput.writeUTF(this.phone);
dataOutput.writeLong(this.upFlow);
dataOutput.writeLong(this.downFlow);
dataOutput.writeLong(this.sumFlow);
}
/*
* Deserialization: decode (fields must be read in the same order they were written)
* */
public void readFields(DataInput dataInput) throws IOException {
this.phone = dataInput.readUTF();
this.upFlow = dataInput.readLong();
this.downFlow = dataInput.readLong();
this.sumFlow = dataInput.readLong();
}
}
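Since readFields() must read fields in exactly the order write() wrote them, a quick in-memory round trip is a handy sanity check; a small sketch (an addition to these notes, assuming FlowBean is on the classpath):
import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.DataInputStream;
import java.io.DataOutputStream;
public class FlowBeanCheck {
public static void main(String[] args) throws Exception {
FlowBean in = new FlowBean("18611781163", 700000L, 10000L, 710000L);
ByteArrayOutputStream buffer = new ByteArrayOutputStream();
in.write(new DataOutputStream(buffer)); // encode
FlowBean out = new FlowBean();
out.readFields(new DataInputStream(new ByteArrayInputStream(buffer.toByteArray()))); // decode
System.out.println(out); // should match the original bean
}
}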
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
import java.io.IOException;
public class FlowMapper extends Mapper<LongWritable, Text, Text, FlowBean> {
/*
* Sample input lines (phone upFlow downFlow):
* 18611781163 700000 10000
* 18611781163 700000 10000
* 18611781163 700000 10000
* */
@Override
protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
String line = value.toString();
String[] data = line.split(" ");
/*
* key: the phone number (data[0])
* */
context.write(new Text(data[0]), new FlowBean(data[0], Long.valueOf(data[1]), Long.valueOf(data[2]), (Long.valueOf(data[1]) + Long.valueOf(data[2]))));
}
}
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;
import java.io.IOException;
/*
* reduce input: 18611781163 -> [FlowBean, FlowBean, ...]
* */
public class FlowReducer extends Reducer<Text, FlowBean, NullWritable, FlowBean> {
@Override
protected void reduce(Text key, Iterable<FlowBean> values, Context context) throws IOException, InterruptedException {
Long up = 0L;
Long down = 0L;
Long sum = 0L;
for (FlowBean flowBean : values) {
up += flowBean.getUpFlow();
down += flowBean.getDownFlow();
sum += flowBean.getSumFlow();
}
context.write(NullWritable.get(), new FlowBean(key.toString(), up, down, sum));
}
}
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.MRJobConfig;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;
public class FlowRunner {
public static void main(String[] args) throws Exception {
System.setProperty("HADOOP_USER_NAME", "root");
Configuration conf = new Configuration();
conf.addResource("conf2/core-site.xml");
conf.addResource("conf2/hdfs-site.xml");
conf.addResource("conf2/mapred-site.xml");
conf.addResource("conf2/yarn-site.xml");
conf.set(MRJobConfig.JAR, "G:\\IDEA_WorkSpace\\BigData\\Hadoop_Test\\target\\Hadoop_Test-1.0-SNAPSHOT.jar");
conf.set("mapreduce.app-submission.cross-platform", "true");
Job job = Job.getInstance(conf);
job.setJarByClass(FlowRunner.class);
job.setInputFormatClass(TextInputFormat.class);
job.setOutputFormatClass(TextOutputFormat.class);
TextInputFormat.setInputPaths(job, new Path("/flow.dat"));
TextOutputFormat.setOutputPath(job, new Path("/test/out333"));
job.setMapperClass(FlowMapper.class);
job.setReducerClass(FlowReducer.class);
job.setMapOutputKeyClass(Text.class);
job.setMapOutputValueClass(FlowBean.class);
job.setOutputKeyClass(NullWritable.class);
job.setOutputValueClass(FlowBean.class);
job.waitForCompletion(true);
}
}
MR Components
InputFormat
Classification
- FileInputFormat
  - TextInputFormat
    - key: LongWritable, byte offset of the line
    - value: Text, the line of text
    - Splits: files are the unit of splitting, so there are at least as many splits as input files
  - NLineInputFormat
    - key: LongWritable, byte offset of the line
    - value: Text, the line of text
    - Splits: every N lines form one split (default: 1 line); configurable via
      conf.set("mapreduce.input.lineinputformat.linespermap", "10")
      or NLineInputFormat.setNumLinesPerSplit(job, 10);
  - CombineTextInputFormat
    - key: LongWritable, byte offset of the line
    - value: Text, the line of text
    - Splits: cut by split size, so one split may cover multiple blocks (and files)
      CombineTextInputFormat.setMaxInputSplitSize(job, maxSize);
      CombineTextInputFormat.setMinInputSplitSize(job, minSize);
  - SequenceFileInputFormat
    - key: the file name
    - value: the file data
- DBInputFormat (databases)
- TableInputFormat (HBase)
NLineInputFormat
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.NLineInputFormat;
public class JobRunner {
public static void main(String[] args) throws Exception {
// ... omitted (same setup as the word-count runner above)
job.setInputFormatClass(NLineInputFormat.class);
NLineInputFormat.setNumLinesPerSplit(job, 3); // e.g. 3 lines per split
// ... omitted
}
}
WCMapper and WCReducer are unchanged from the word-count example above.
CombineTextInputFormat
Optimizes computation over many small files.
public class JobRunner {
public static void main(String[] args) throws Exception {
Configuration conf = new Configuration();
Job job = Job.getInstance(conf);
job.setJarByClass(JobRunner.class);
/*
* Set the input/output format components
* */
job.setInputFormatClass(CombineTextInputFormat.class);
job.setOutputFormatClass(TextOutputFormat.class);
CombineTextInputFormat.setMinInputSplitSize(job, 1048576);
CombineTextInputFormat.setInputPaths(job, new Path("G:\\Note\\Hadoop\\数据文件\\data"));
TextOutputFormat.setOutputPath(job, new Path("G:\\Note\\Hadoop\\数据文件\\out111122"));
job.setMapperClass(WCMapper.class);
job.setReducerClass(WCReducer.class);
job.setMapOutputKeyClass(Text.class);
job.setMapOutputValueClass(IntWritable.class);
job.setOutputKeyClass(Text.class);
job.setOutputValueClass(IntWritable.class);
job.waitForCompletion(true);
}
}
DBInputFormat
public class JobRunner {
public static void main(String[] args) throws Exception {
Configuration conf = new Configuration();
DBConfiguration.configureDB(conf, "com.mysql.jdbc.Driver", "jdbc:mysql://hadoopnode00:3306/hadoop", "root", "1234");
Job job = Job.getInstance(conf);
job.setJarByClass(JobRunner.class);
job.setInputFormatClass(DBInputFormat.class);
job.setOutputFormatClass(TextOutputFormat.class);
DBInputFormat.setInput(job, User.class, "select id,name from user", "select count(1) from user");
FileOutputFormat.setOutputPath(job, new Path("G:\\IDEA_WorkSpace\\BigData\\Hadoop_Test\\src\\main\\java\\com\\zzj\\DBInputFormat\\out1"));
job.setMapperClass(DBMapper.class);
job.setReducerClass(DBReducer.class);
job.setMapOutputKeyClass(LongWritable.class);
job.setMapOutputValueClass(Text.class);
job.setOutputKeyClass(NullWritable.class);
job.setOutputValueClass(Text.class);
job.waitForCompletion(true);
}
}
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
import java.io.IOException;
public class DBMapper extends Mapper<LongWritable, User, LongWritable, Text> {
@Override
protected void map(LongWritable key, User value, Context context) throws IOException, InterruptedException {
context.write(key, new Text(value.toString()));
}
}
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;
import java.io.IOException;
public class DBReducer extends Reducer<LongWritable, Text, NullWritable, Text> {
@Override
protected void reduce(LongWritable key, Iterable<Text> values, Context context) throws IOException, InterruptedException {
for (Text value : values) {
context.write(NullWritable.get(), value);
}
}
}
import org.apache.hadoop.io.Writable;
import org.apache.hadoop.mapred.lib.db.DBWritable;
import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;
import java.sql.PreparedStatement;
import java.sql.ResultSet;
import java.sql.SQLException;
public class User implements Writable, DBWritable {
int id;
String name;
public User() {
}
public User(int id, String name) {
this.id = id;
this.name = name;
}
public int getId() {
return id;
}
public void setId(int id) {
this.id = id;
}
public String getName() {
return name;
}
public void setName(String name) {
this.name = name;
}
@Override
public String toString() {
return "User{" +
"id=" + id +
", name='" + name + '\'' +
'}';
}
public void write(DataOutput dataOutput) throws IOException {
dataOutput.writeInt(this.id);
dataOutput.writeUTF(this.name);
}
public void readFields(DataInput dataInput) throws IOException {
this.id = dataInput.readInt();
this.name = dataInput.readUTF();
}
public void write(PreparedStatement preparedStatement) throws SQLException {
preparedStatement.setInt(1, this.id);
preparedStatement.setString(2, this.name);
}
public void readFields(ResultSet resultSet) throws SQLException {
this.id = resultSet.getInt(1);
this.name = resultSet.getString(2);
}
}
- To run locally, add the MySQL driver dependency:
<dependency>
<groupId>mysql</groupId>
<artifactId>mysql-connector-java</artifactId>
<version>5.1.38</version>
</dependency>
- To run on YARN, the MySQL driver jar must be on Hadoop's classpath on hadoopNode00: copy it into /usr/hadoop-2.9.2/share/hadoop/yarn/
- For remote submission:
System.setProperty("HADOOP_USER_NAME", "root");
conf.addResource("conf2/core-site.xml");
conf.addResource("conf2/hdfs-site.xml");
conf.addResource("conf2/mapred-site.xml");
conf.addResource("conf2/yarn-site.xml");
conf.set(MRJobConfig.JAR, "G:\\IDEA_WorkSpace\\BigData\\Hadoop_Test\\target\\Hadoop_Test-1.0-SNAPSHOT.jar");
conf.set("mapreduce.app-submission.cross-platform", "true");
Custom InputFormat
Solves the small-file storage problem by packing many small files into a single SequenceFile (Hadoop's binary key-value file format). Each file is stored with its path as the key and its contents as the value. The custom InputFormat itself is sketched after the RecordReader below.
public class JobRunner {
public static void main(String[] args) throws Exception {
Configuration conf = new Configuration();
Job job = Job.getInstance(conf);
job.setJarByClass(JobRunner.class);
job.setInputFormatClass(OwnInputFormat.class);
job.setOutputFormatClass(SequenceFileOutputFormat.class);
OwnInputFormat.setInputPaths(job, new Path("G:\\Note\\Hadoop\\数据文件\\data"));
SequenceFileOutputFormat.setOutputPath(job, new Path("G:\\Note\\Hadoop\\数据文件\\out12313"));
job.setMapperClass(FileMapper.class);
job.setReducerClass(FileReducer.class);
job.setMapOutputKeyClass(Text.class);
job.setMapOutputValueClass(BytesWritable.class);
job.setOutputKeyClass(Text.class);
job.setOutputValueClass(BytesWritable.class);
job.waitForCompletion(true);
}
}
public class OwnRecordReader extends RecordReader<Text, BytesWritable> {
FileSplit fileSplit;
Configuration conf;
BytesWritable value = new BytesWritable();
Text key = new Text();
boolean isProgress = true;
public void initialize(InputSplit inputSplit, TaskAttemptContext taskAttemptContext) throws IOException, InterruptedException {
fileSplit = (FileSplit) inputSplit;
conf = taskAttemptContext.getConfiguration();
}
public boolean nextKeyValue() throws IOException, InterruptedException {
if (isProgress) {
byte[] bytes = new byte[(int) fileSplit.getLength()];
// Get the FileSystem object
/*
* Path of the current file
* */
Path path = fileSplit.getPath();
FileSystem fileSystem = path.getFileSystem(conf);
/*
* Open an input stream to the file
* */
FSDataInputStream inputStream = fileSystem.open(path);
IOUtils.readFully(inputStream, bytes, 0, bytes.length);
/*
* Fill in the value
* */
value.set(bytes, 0, bytes.length);
key.set(path.toString());
IOUtils.closeStream(inputStream);
isProgress = false;
return true;
}
return false;
}
public Text getCurrentKey() throws IOException, InterruptedException {
return this.key;
}
public BytesWritable getCurrentValue() throws IOException, InterruptedException {
return this.value;
}
public float getProgress() throws IOException, InterruptedException {
return 0;
}
public void close() throws IOException {
}
}
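The runner above references OwnInputFormat, which the original notes do not show; a minimal sketch consistent with the RecordReader above (FileMapper and FileReducer would simply forward the path/bytes pairs):
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.BytesWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.InputSplit;
import org.apache.hadoop.mapreduce.JobContext;
import org.apache.hadoop.mapreduce.RecordReader;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
public class OwnInputFormat extends FileInputFormat<Text, BytesWritable> {
@Override
protected boolean isSplitable(JobContext context, Path filename) {
return false; // each small file is read whole, as a single record
}
public RecordReader<Text, BytesWritable> createRecordReader(InputSplit split, TaskAttemptContext context) {
return new OwnRecordReader(); // the framework calls initialize() on it
}
}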
Partitioner
import org.apache.hadoop.mapreduce.Partitioner;
import java.util.HashMap;
public class OwnPartitioner<KEY, VALUE> extends Partitioner<KEY, VALUE> {
private static HashMap<String, Integer> areaMap = new HashMap<String, Integer>();
static {
areaMap.put("hn", 0);
areaMap.put("henna", 0);
areaMap.put("bj", 1);
areaMap.put("tj", 2);
areaMap.put("hb", 3);
}
public int getPartition(KEY key, VALUE value, int numPartitions) {
// The returned partition number must be less than the job's numReduceTasks,
// so unknown keys go to partition 4 and the runner uses 5 reduce tasks
Integer partition = areaMap.get(key.toString());
return partition == null ? 4 : partition;
}
}
package com.baizhi.partitioner;
import org.apache.hadoop.io.Writable;
import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;
public class FlowBean implements Writable {
private String phone;
private Long upFlow;
private Long downFlow;
private Long sumFlow;
// getters/setters omitted
/*
* Serialization: encode
* */
public void write(DataOutput dataOutput) throws IOException {
dataOutput.writeUTF(this.phone);
dataOutput.writeLong(this.upFlow);
dataOutput.writeLong(this.downFlow);
dataOutput.writeLong(this.sumFlow);
}
public void readFields(DataInput dataInput) throws IOException {
this.phone = dataInput.readUTF();
this.upFlow = dataInput.readLong();
this.downFlow = dataInput.readLong();
this.sumFlow = dataInput.readLong();
}
}
public class FlowMapper extends Mapper<LongWritable, Text, Text, FlowBean> {
@Override
protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
String line = value.toString();
String[] data = line.split(" ");
context.write(new Text(data[3]), new FlowBean(data[0], Long.valueOf(data[1]), Long.valueOf(data[2]), (Long.valueOf(data[1]) + Long.valueOf(data[2]))));
}
}
public class FlowReducer extends Reducer<Text, FlowBean, Text, FlowBean> {
@Override
protected void reduce(Text key, Iterable<FlowBean> values, Context context) throws IOException, InterruptedException {
Long up = 0L;
Long down = 0L;
Long sum = 0L;
String phone = "";
for (FlowBean flowBean : values) {
up += flowBean.getUpFlow();
down += flowBean.getDownFlow();
sum += flowBean.getSumFlow();
phone = flowBean.getPhone();
}
context.write(key, new FlowBean(phone, up, down, sum));
}
}
public class FlowRunner {
public static void main(String[] args) throws Exception {
// ... omitted (same setup as the FlowRunner above)
job.setPartitionerClass(OwnPartitioner.class);
// 5 partitions: 0-3 from areaMap plus partition 4 for unknown keys
job.setNumReduceTasks(5);
job.waitForCompletion(true);
}
}
OutputFormat
Custom output
public class FileMapper extends Mapper<LongWritable, Text, NullWritable, Text> {
@Override
protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
context.write(NullWritable.get(), value);
}
}
public class FileReducer extends Reducer<NullWritable, Text, NullWritable, Text> {
@Override
protected void reduce(NullWritable key, Iterable<Text> values, Context context) throws IOException, InterruptedException {
for (Text value : values) {
context.write(NullWritable.get(), value);
}
}
}
class Job{
// ... omitted
job.setOutputFormatClass(OwnOutputFormat.class);
// ... omitted
}
public class OwnOutputFormat extends FileOutputFormat<NullWritable, Text> {
public RecordWriter<NullWritable, Text> getRecordWriter(TaskAttemptContext taskAttemptContext) throws IOException, InterruptedException {
try {
return new OwnRecordWriter(taskAttemptContext);
} catch (Exception e) {
e.printStackTrace();
}
return null;
}
}
public class OwnRecordWriter extends RecordWriter<NullWritable, Text> {
FileSystem fileSystem;
FSDataOutputStream outputStream;
public OwnRecordWriter(TaskAttemptContext taskAttemptContext) throws Exception {
fileSystem = FileSystem.get(taskAttemptContext.getConfiguration());
outputStream = fileSystem.create(new Path("G:\\Note\\Hadoop\\数据文件\\testoutputforamt.txt")); // simplified: one fixed output file (fine with a single reduce task)
}
public void write(NullWritable nullWritable, Text text) throws IOException, InterruptedException {
// Text.getBytes() returns the backing array, which may be longer than the
// valid data, so write only getLength() bytes
outputStream.write(text.getBytes(), 0, text.getLength());
}
public void close(TaskAttemptContext taskAttemptContext) throws IOException, InterruptedException {
IOUtils.closeStream(outputStream);
fileSystem.close();
}
}
Combiner
- A Combiner is an MR component distinct from the Mapper and the Reducer
- A Combiner's parent class is Reducer
- A Combiner differs from a Reducer in where it runs:
the Combiner runs on each MapTask's node (local aggregation)
the Reducer receives the output of all Mappers (global aggregation)
- The point of a Combiner is to pre-aggregate each MapTask's output locally, reducing the amount of data shuffled over the network
- A Combiner may only be used when it cannot change the final result (pure accumulation is safe), and its output KV types must match the Reducer's input KV types
Use cases
Only suitable where local accumulation does not affect the result
Usage (a sketch follows this list)
- Create a Combiner class extending Reducer and register it with job.setCombinerClass();
- Or reuse the Reducer class directly as the combiner:
job.setCombinerClass(WCReducer.class);
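A sketch of the first option, assuming the word-count job above (functionally identical to WCReducer here, but it runs on each MapTask's local output before the shuffle):
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;
import java.io.IOException;
public class WCCombiner extends Reducer<Text, IntWritable, Text, IntWritable> {
@Override
protected void reduce(Text key, Iterable<IntWritable> values, Context context) throws IOException, InterruptedException {
int sum = 0;
for (IntWritable value : values) {
sum += value.get(); // partial count for this MapTask only
}
context.write(key, new IntWritable(sum));
}
}
// in the runner: job.setCombinerClass(WCCombiner.class);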
Hadoop High-Availability Cluster Setup
| HadoopNode01 | HadoopNode02 | HadoopNode03 |
| --- | --- | --- |
| nn1 | nn2 | |
| journal node | journal node | journal node |
| zkfc | zkfc | |
| datanode | datanode | datanode |
| zk01 | zk02 | zk03 |
| rm1 | rm2 | |
| nodemanager | nodemanager | nodemanager |
1. ZooKeeper Cluster Setup
Add the configuration file
[root@HadoopNode0X ~]# tar -zxvf zookeeper-3.4.6.tar.gz -C /usr/
[root@HadoopNode0X zookeeper-3.4.6]# mkdir data
[root@HadoopNode0X zookeeper-3.4.6]# cp conf/zoo_sample.cfg conf/zoo.cfg
[root@HadoopNode0X zookeeper-3.4.6]# vim conf/zoo.cfg
tickTime=2000 # basic time unit (ms); session timeouts are multiples of this
initLimit=10
syncLimit=5
dataDir=/usr/zookeeper-3.4.6/data/ # ZK data directory
clientPort=2181 # client service port
server.1=HadoopNode01:2887:3887
server.2=HadoopNode02:2888:3888
server.3=HadoopNode03:2889:3889
Create the myid file
In the data directory configured above, create a file named myid: write just the digit 1 in node 1's myid, the digit 2 in node 2's, and so on, e.g.:
[root@HadoopNode01 zookeeper-3.4.6]# echo 1 > data/myid
Copy the configuration to the other nodes, then start ZooKeeper on each:
[root@HadoopNode0X zookeeper-3.4.6]# bin/zkServer.sh start
2. Hadoop Cluster Setup
Prepare the base environment
- Map hostnames to IPs
[root@HadoopNodeX ~]# vi /etc/hosts
192.168.191.21 hadoopNode00
192.168.191.22 hadoopNode01
192.168.191.23 hadoopNode02
192.168.191.24 hadoopNode03
192.168.191.31 ZK01
192.168.191.32 ZK02
192.168.191.33 ZK03
- Disable the firewall
- Synchronize the clocks
[root@HadoopNodeX ~]# yum -y install ntpdate
[root@HadoopNodeX ~]# ntpdate -u ntp.api.bz
25 Sep 11:19:26 ntpdate[1749]: step time server 114.118.7.163 offset 201181.363384 sec
[root@HadoopNodeX ~]# date
Wed Sep 25 11:19:52 CST 2019
- Configure passwordless SSH
[root@HadoopNodeX ~]# ssh-keygen -t rsa # run this on all three machines first, then run the commands below
[root@HadoopNodeX ~]# ssh-copy-id hadoopNode01
[root@HadoopNodeX ~]# ssh-copy-id hadoopNode02
[root@HadoopNodeX ~]# ssh-copy-id hadoopNode03
- Java environment
export JAVA_HOME=/home/java/jdk1.8.0_181
export PATH=$PATH:$JAVA_HOME/bin
- Start ZK
Install Hadoop
- Extract it and configure the environment variables
export HADOOP_HOME=/usr/hadoop-2.9.2
export PATH=$PATH:$HADOOP_HOME/bin:$HADOOP_HOME/sbin
- Configure core-site.xml
<property>
<name>fs.defaultFS</name>
<value>hdfs://mycluster</value>
</property>
<property>
<name>hadoop.tmp.dir</name>
<value>/usr/hadoop-2.9.2/hadoop-${user.name}</value>
</property>
<property>
<name>fs.trash.interval</name>
<value>30</value>
</property>
<property>
<name>net.topology.script.file.name</name>
<value>/usr/hadoop-2.9.2/etc/hadoop/rack.sh</value>
</property>
- Create the rack-awareness script
In /usr/hadoop-2.9.2/etc/hadoop/, create a file named rack.sh and paste in the content below.
#!/bin/bash
while [ $# -gt 0 ] ; do
nodeArg=$1
exec</usr/hadoop-2.9.2/etc/hadoop/topology.data
result=""
while read line ; do
ar=( $line )
if [ "${ar[0]}" = "$nodeArg" ] ; then
result="${ar[1]}"
fi
done
shift
if [ -z "$result" ] ; then
echo -n "/default-rack"
else
echo -n "$result "
fi
done
Also make the script executable: chmod u+x /usr/hadoop-2.9.2/etc/hadoop/rack.sh
- Create the rack mapping file
Create /usr/hadoop-2.9.2/etc/hadoop/topology.data:
192.168.191.22 /rack1
192.168.191.23 /rack1
192.168.191.24 /rack2
- Configure hdfs-site.xml
<configuration>
<property>
<name>dfs.replication</name>
<value>3</value>
</property>
<property>
<name>dfs.ha.automatic-failover.enabled</name>
<value>true</value>
</property>
<property>
<name>ha.zookeeper.quorum</name>
<value>zk01:2181,zk02:2181,zk03:2181</value>
</property>
<property>
<name>dfs.nameservices</name>
<value>mycluster</value>
</property>
<property>
<name>dfs.ha.namenodes.mycluster</name>
<value>nn1,nn2</value>
</property>
<property>
<name>dfs.namenode.rpc-address.mycluster.nn1</name>
<value>hadoopNode01:9000</value>
</property>
<property>
<name>dfs.namenode.rpc-address.mycluster.nn2</name>
<value>hadoopNode02:9000</value>
</property>
<property>
<name>dfs.namenode.shared.edits.dir</name>
<value>qjournal://hadoopNode01:8485;hadoopNode02:8485;hadoopNode03:8485/mycluster</value>
</property>
<property>
<name>dfs.client.failover.proxy.provider.mycluster</name>
<value>org.apache.hadoop.hdfs.server.namenode.ha.ConfiguredFailoverProxyProvider</value>
</property>
<property>
<name>dfs.ha.fencing.methods</name>
<value>sshfence</value>
</property>
<property>
<name>dfs.ha.fencing.ssh.private-key-files</name>
<value>/root/.ssh/id_rsa</value>
</property>
</configuration>
- Edit the slaves file
hadoopNode01
hadoopNode02
hadoopNode03
Start
[root@HadoopNodeX ~]# rm -rf /usr/hadoop-2.9.2/hadoop-root/*
[root@HadoopNodeX ~]# hadoop-daemons.sh start journalnode # run from any one node; the script starts journalnode on every host listed in slaves
[root@HadoopNode01 ~]# hdfs namenode -format
[root@HadoopNode01 ~]# hadoop-daemon.sh start namenode
[root@HadoopNode02 ~]# hdfs namenode -bootstrapStandby # copy the primary NameNode's metadata
[root@HadoopNode02 ~]# hadoop-daemon.sh start namenode
[root@HadoopNode01|02 ~]# hdfs zkfc -formatZK # format the failover state in ZK; run on either node 01 or 02
[root@HadoopNode01 ~]# hadoop-daemon.sh start zkfc
[root@HadoopNode02 ~]# hadoop-daemon.sh start zkfc
[root@HadoopNodeX ~]# hadoop-daemon.sh start datanode
[root@HadoopNode01 ~]# jps
2324 JournalNode
2661 DFSZKFailoverController
2823 Jps
2457 NameNode
2746 DataNode
[root@HadoopNode02 ~]# jps
2595 DataNode
2521 DFSZKFailoverController
2681 Jps
2378 NameNode
2142 JournalNode
[root@HadoopNode03 .ssh]# jps
2304 Jps
2146 JournalNode
2229 DataNode
Routine maintenance
[root@HadoopNode01 ~]# stop-dfs.sh
[root@HadoopNode01 ~]# start-dfs.sh
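Clients reach the HA cluster through the nameservice rather than a fixed NameNode. A quick sanity check from Java (a sketch added to these notes; it assumes the cluster's core-site.xml and hdfs-site.xml are on the classpath so hdfs://mycluster resolves via the failover proxy provider):
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
public class HACheck {
public static void main(String[] args) throws Exception {
Configuration conf = new Configuration();
conf.addResource("core-site.xml"); // fs.defaultFS = hdfs://mycluster
conf.addResource("hdfs-site.xml"); // nameservice + failover proxy settings
FileSystem fs = FileSystem.get(conf);
System.out.println(fs.exists(new Path("/"))); // true while either NameNode is active
fs.close();
}
}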
3. YARN High Availability
yarn-site.xml
<property>
<name>yarn.nodemanager.aux-services</name>
<value>mapreduce_shuffle</value>
</property>
<property>
<name>yarn.resourcemanager.ha.enabled</name>
<value>true</value>
</property>
<property>
<name>yarn.resourcemanager.zk-address</name>
<value>zk01:2181,zk02:2181,zk03:2181</value>
</property>
<property>
<name>yarn.resourcemanager.cluster-id</name>
<value>rmcluster01</value>
</property>
<property>
<name>yarn.resourcemanager.ha.rm-ids</name>
<value>rm1,rm2</value>
</property>
<property>
<name>yarn.resourcemanager.hostname.rm1</name>
<value>hadoopNode02</value>
</property>
<property>
<name>yarn.resourcemanager.hostname.rm2</name>
<value>hadoopNode03</value>
</property>
mapred-site.xml
<property>
<name>mapreduce.framework.name</name>
<value>yarn</value>
</property>
Start
[root@HadoopNode02 ~]# yarn-daemon.sh start resourcemanager
[root@HadoopNode03 ~]# yarn-daemon.sh start resourcemanager
[root@HadoopNodeX ~]# yarn-daemon.sh start nodemanager