Big Data Component Installation Guide

Installation and Deployment

Environment Initialization

Configure the Network

vi /etc/sysconfig/network-scripts/ifcfg-ens33
# Modify
DEVICE=ens33
TYPE=Ethernet
ONBOOT=yes
BOOTPROTO=static
NAME="ens33"
PREFIX=24
IPADDR=192.168.88.120
GATEWAY=192.168.88.2
DNS1=114.114.114.114
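
The new static IP only takes effect once the network service is restarted. A quick check might look like the following (assuming the CentOS 7 network service and the ens33 interface configured above):

# Restart the network service so the static IP takes effect
systemctl restart network
# Verify the address and default route
ip addr show ens33
ip route
# Confirm outbound connectivity through the gateway
ping -c 3 114.114.114.114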

Set the Hostname

hostnamectl --static set-hostname hadoop102

Disable the Firewall

systemctl stop firewalld
systemctl disable firewalld

Create a User

vim /etc/sudoers
# Add
atguigu   ALL=(ALL)     NOPASSWD: ALL
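
The sudoers entry above assumes the atguigu user already exists. A minimal sketch for creating it first (the username and password below follow this guide and are otherwise arbitrary):

# Create the atguigu user and set its password (CentOS: passwd accepts --stdin)
useradd atguigu
echo "atguigu" | passwd --stdin atguigu
# Verify that passwordless sudo works for the new user
su - atguigu -c "sudo whoami"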

Create Directories

  • These directories hold the component tarballs and the installed components
mkdir /opt/module /opt/software

Disable SELinux

vi /etc/sysconfig/selinux
# Modify
SELINUX=disabled
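
The change in the file only applies after a reboot; to check the current mode and stop enforcement immediately, something like this can be used:

# Show the current SELinux mode
getenforce
# Switch the running system to permissive; the config file change covers reboots
setenforce 0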

Clone the Cluster


Change the Worker Nodes' IP and Hostname

vi /etc/sysconfig/network-scripts/ifcfg-ens33
# Give each node a different IP
IPADDR=192.168.88.121
# Give each node its own hostname
hostnamectl --static set-hostname hadoop103

Configure SSH

# Press Enter through every prompt
ssh-keygen -t rsa 
# Copy the public key to each node
ssh-copy-id 192.168.88.120
ssh-copy-id 192.168.88.121
ssh-copy-id 192.168.88.122

Configure the Hosts Mapping File

vi /etc/hosts
# Add
192.168.88.120  hadoop102
192.168.88.121  hadoop103
192.168.88.122  hadoop104
# Distribute to the other nodes
scp /etc/hosts root@hadoop103:/etc
scp /etc/hosts root@hadoop104:/etc

Configure the yum Repository

cd /etc/yum.repos.d
yum install -y wget
wget -O /etc/yum.repos.d/CentOS-Base.repo http://mirrors.aliyun.com/repo/Centos-7.repo
# Clear the yum cache
yum clean all
# Rebuild the cache from the Aliyun mirror
yum makecache

Install the JDK

Extract

tar -zxvf /opt/software/jdk-8u212-linux-x64.tar.gz -C /opt/module/

Configure Environment Variables

vim /etc/profile.d/my_env.sh
# Java
export JAVA_HOME=/opt/module/jdk1.8.0_212
export PATH=$PATH:$JAVA_HOME/bin

Test

source /etc/profile.d/my_env.sh
java -version
java version "1.8.0_212"
Java(TM) SE Runtime Environment (build 1.8.0_212-b10)
Java HotSpot(TM) 64-Bit Server VM (build 25.212-b10, mixed mode)

Distribute

xsync /opt/module/jdk1.8.0_212/
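
The xsync helper used throughout this guide is never listed; a minimal sketch of such a script (assuming rsync is installed on every node and the hadoop102-104 hostnames above) might look like this:

#!/bin/bash
# xsync: rsync the given paths to every other node in the cluster
if [ $# -lt 1 ]; then
    echo "Usage: xsync <path>..."
    exit 1
fi
for host in hadoop102 hadoop103 hadoop104; do
    # Skip the local machine
    [ "$host" = "$(hostname)" ] && continue
    for path in "$@"; do
        dir=$(cd -P "$(dirname "$path")" && pwd)
        name=$(basename "$path")
        ssh "$host" "mkdir -p $dir"
        rsync -av "$dir/$name" "$host:$dir"
    done
done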

Zookeeper Setup

Extract

tar -zxvf /opt/software/apache-zookeeper-3.7.1-bin.tar.gz -C /opt/module/

Modify the Configuration File

mv /opt/module/apache-zookeeper-3.7.1-bin/conf/zoo_sample.cfg /opt/module/apache-zookeeper-3.7.1-bin/conf/zoo.cfg 
vim /opt/module/apache-zookeeper-3.7.1-bin/conf/zoo.cfg
# Modify
dataDir=/opt/module/apache-zookeeper-3.7.1-bin/zkData
# Add
server.1=hadoop102:2888:3888
server.2=hadoop103:2888:3888
server.3=hadoop104:2888:3888

Create the Data Directory

mkdir /opt/module/apache-zookeeper-3.7.1-bin/zkData

Create the myid File

vim /opt/module/apache-zookeeper-3.7.1-bin/zkData/myid
# Unique ID within the cluster; must match the N of the corresponding server.N entry in zoo.cfg
1

Configure Environment Variables

vim /etc/profile.d/my_env.sh
# Zookeeper
export ZOOKEEPER_HOME=/opt/module/apache-zookeeper-3.7.1-bin
export PATH=$PATH:$ZOOKEEPER_HOME/bin

Distribute

xsync /opt/module/apache-zookeeper-3.7.1-bin/
xsync /etc/profile.d/my_env.sh

Modify myid on Each Node

vim /opt/module/apache-zookeeper-3.7.1-bin/zkData/myid

Start/Stop on Each Node

# Start
/opt/module/apache-zookeeper-3.7.1-bin/bin/zkServer.sh start
# Stop
/opt/module/apache-zookeeper-3.7.1-bin/bin/zkServer.sh stop
# Check status
/opt/module/apache-zookeeper-3.7.1-bin/bin/zkServer.sh status
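
The Hadoop HA initialization below starts Zookeeper with a MyZk.sh helper that is never listed; a rough sketch of such a cluster start/stop script (assuming passwordless SSH and the install path above) could be:

#!/bin/bash
# MyZk.sh: run zkServer.sh start|stop|status on every Zookeeper node
ZK_HOME=/opt/module/apache-zookeeper-3.7.1-bin
case $1 in
start|stop|status)
    for host in hadoop102 hadoop103 hadoop104; do
        echo "---------- zookeeper $1 on $host ----------"
        ssh "$host" "$ZK_HOME/bin/zkServer.sh $1"
    done
    ;;
*)
    echo "Usage: MyZk.sh {start|stop|status}"
    ;;
esac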

Hadoop HA Setup

Cluster Plan

  • NameNode (hadoop102, hadoop103): manages the metadata and handles client requests, handing work off to the DataNodes
  • DataNode (hadoop102, hadoop103, hadoop104): stores data blocks and serves the requests sent by the NameNode
  • ResourceManager (hadoop103, hadoop104): schedules and allocates resources across the whole Hadoop cluster
  • NodeManager (hadoop102, hadoop103, hadoop104): manages each node's resources, accepts tasks from the RM and launches containers to run them
  • JournalNode (hadoop102, hadoop103, hadoop104): stores the critical edit-log metadata so the NameNodes stay reliable and consistent
  • DFSZKFailoverController (hadoop102, hadoop103): manages NameNode failover; when the Active NameNode stops responding (because of network issues or anything else), it is forcibly shut down to prevent split-brain, regardless of whether it could later come back

Extract

tar -zxvf /opt/software/hadoop-3.3.4.tar.gz -C /opt/module/

Configure Environment Variables

vim /etc/profile.d/my_env.sh
source /etc/profile.d/my_env.sh
#HADOOP_HOME
export HADOOP_HOME=/opt/module/hadoop-3.3.4
export PATH=$PATH:$HADOOP_HOME/bin
export PATH=$PATH:$HADOOP_HOME/sbin
export HADOOP_CLASSPATH=`/opt/module/hadoop-3.3.4/bin/hadoop classpath`

Modify the Configuration Files

core-site.xml
  • mycluster: the cluster name; it must not contain underscores
vim /opt/module/hadoop-3.3.4/etc/hadoop/core-site.xml
<configuration>
    	<!-- 指定hdfs的nameservice为cluster1 -->
    	<property>
        	<name>fs.defaultFS</name>
        	<value>hdfs://mycluster</value>
    	</property>

    	<!-- 指定hadoop临时目录 -->
    	<property>
        	<name>hadoop.tmp.dir</name>
        	<value>/opt/module/hadoop-3.3.4/data/tmp</value>
    	</property>

    	<!-- 指定zookeeper地址 -->
    	<property>
        	<name>ha.zookeeper.quorum</name>
        	<value>hadoop102:2181,hadoop103:2181,hadoop104:2181</value>
    	</property>

    	<!-- hadoop链接zookeeper的超时时长设置 -->
    	<property>
        	<name>ha.zookeeper.session-timeout.ms</name>
        	<value>10000</value>
        	<description>ms</description>
    	</property>
	<!-- 指定哪些主机可以使用root用户作为代理执行Hadoop操作 -->
	<property>
		<name>hadoop.proxyuser.root.hosts</name>
		<value>*</value>
	</property>
	<!-- 指定哪些用户组可以使用root用户作为代理执行Hadoop操作 -->
	<property>
		<name>hadoop.proxyuser.root.groups</name>
		<value>*</value>
	</property>
    <!-- 设置用户 -->
	<property>
  		<name>hadoop.http.staticuser.user</name>
  		<value>root</value>
	</property>
</configuration>
hdfs-site.xml
vim /opt/module/hadoop-3.3.4/etc/hadoop/hdfs-site.xml
<configuration>
	<!-- 指定副本数 -->
    <property>
        <name>dfs.replication</name>
        <value>2</value>
    </property>
    <!-- 配置namenode和datanode的工作目录-数据存储目录 -->
    <!-- namenode一般保存元数据 -->
    <property>
        <name>dfs.namenode.name.dir</name>
        <value>/opt/module/hadoop-3.3.4/data/hdfs/namenode</value>
    </property>
    <!-- datanode一般保存数据块 -->
    <property>
        <name>dfs.datanode.data.dir</name>
        <value>/opt/module/hadoop-3.3.4/data/hdfs/datanode</value>
    </property>
    <!-- 启用webhdfs:开启可以在WebUI中对文件进行创建删除操作 -->
    <property>
        <name>dfs.webhdfs.enabled</name>
        <value>true</value>
    </property>
	<!-- Hadoop中的逻辑名称 -->
    <property>
        <name>dfs.nameservices</name>
        <value>mycluster</value>
    </property>
    <!-- 多个NameNode的节点地址,最少2个 -->
    <property>
        <name>dfs.ha.namenodes.mycluster</name>
        <value>nn1,nn2</value>
    </property>
    <!-- nn1的RPC通信地址 -->
    <property>
        <name>dfs.namenode.rpc-address.mycluster.nn1</name>
        <value>hadoop102:9000</value>
    </property>
    <!-- nn1的http通信地址 -->
    <property>
        <name>dfs.namenode.http-address.mycluster.nn1</name>
        <value>hadoop102:9870</value>
    </property>
    <!-- nn2的RPC通信地址 -->
    <property>
        <name>dfs.namenode.rpc-address.mycluster.nn2</name>
        <value>hadoop103:9000</value>
    </property>
    <!-- nn2的http通信地址 -->
    <property>
        <name>dfs.namenode.http-address.mycluster.nn2</name>
        <value>hadoop103:9870</value>
    </property>
    
    <!-- 共享编辑日志,最少3台,实现数据同步 -->
    <property>
        <name>dfs.namenode.shared.edits.dir</name>
        <value>qjournal://hadoop102:8485;hadoop103:8485;hadoop104:8485/mycluster</value>
    </property>
    <!-- 指定JournalNode在本地磁盘的位置 -->
    <property>
        <name>dfs.journalnode.edits.dir</name>
        <value>/opt/module/hadoop-3.3.4/data/journalnode</value>
    </property>

    <!-- 开启NameNode失败自动切换 -->
    <property>
        <name>dfs.ha.automatic-failover.enabled</name>
        <value>true</value>
    </property>
    <!-- 配置失败自动切换实现方式 -->
    <property>
        <name>dfs.client.failover.proxy.provider.mycluster</name>
        <value>org.apache.hadoop.hdfs.server.namenode.ha.ConfiguredFailoverProxyProvider</value>
    </property>

    <!-- 配置隔离机制方法,当发生故障时强制关闭无法无法访问的NameNode,解决脑裂问题 -->
    <property>
        <name>dfs.ha.fencing.methods</name>
        <value>sshfence</value>
    </property>
    <!-- 使用sshfence隔离机制时需要ssh免登陆 -->
    <property>
        <name>dfs.ha.fencing.ssh.private-key-files</name>
        <value>/root/.ssh/id_rsa</value>
    </property>
    <!-- 配置sshfence隔离机制超时时间 -->
    <property>
        <name>dfs.ha.fencing.ssh.connect-timeout</name>
        <value>30000</value>
    </property>
	<!-- 定义了在进行故障切换控制器 CLI 检查时的 RPC 超时时间 -->
    <property>
        <name>ha.failover-controller.cli-check.rpc-timeout.ms</name>
        <value>60000</value>
    </property>
</configuration>
mapred-site.xml
vim /opt/module/hadoop-3.3.4/etc/hadoop/mapred-site.xml
<configuration>
    <!-- 指定mr框架为yarn方式 -->
    <property>
        <name>mapreduce.framework.name</name>
        <value>yarn</value>
    </property>
    <!-- 指定mapreduce jobhistory地址 -->
    <property>
        <name>mapreduce.jobhistory.address</name>
        <value>hadoop103:10020</value>
    </property>
    <!-- 任务历史服务器的web地址 -->
    <property>
        <name>mapreduce.jobhistory.webapp.address</name>
        <value>hadoop103:19888</value>
    </property>
</configuration>
yarn-site.xml
vim /opt/module/hadoop-3.3.4/etc/hadoop/yarn-site.xml
<configuration>
    <!-- 开启RM高可用 -->
    <property>
        <name>yarn.resourcemanager.ha.enabled</name>
        <value>true</value>
    </property>
    <!-- 指定RM的cluster id -->
    <property>
        <name>yarn.resourcemanager.cluster-id</name>
        <value>yrc</value>
    </property>

    <!-- 指定RM的名字 -->
    <property>
        <name>yarn.resourcemanager.ha.rm-ids</name>
        <value>rm1,rm2</value>
    </property>

    <!-- 分别指定RM的地址 -->
    <property>
        <name>yarn.resourcemanager.hostname.rm1</name>
        <value>hadoop103</value>
    </property>
    <property>
        <name>yarn.resourcemanager.hostname.rm2</name>
        <value>hadoop104</value>
    </property>

    <!-- 指定zk集群地址 -->
    <property>
        <name>yarn.resourcemanager.zk-address</name>
        <value>hadoop102:2181,hadoop103:2181,hadoop104:2181</value>
    </property>
	
    <property>
        <name>yarn.nodemanager.aux-services</name>
        <value>mapreduce_shuffle</value>
    </property>
    
	<!-- 开启日志聚合,开启后会将各个应用程序和容器的日志收集到指定位置 -->
    <property>
        <name>yarn.log-aggregation-enable</name>
        <value>true</value>
    </property>
	<!-- 设置日志保留的时间(秒),这里保存7天 -->
    <property>
        <name>yarn.log-aggregation.retain-seconds</name>
        <value>604800</value>
    </property>
    <!-- 日志聚合在HDFS中保存的位置 -->
    <property>
        <name>yarn.nodemanager.remote-app-log-dir</name>
        <value>/user/logs</value>
    </property>
    
    <!-- 启用自动恢复 -->
    <property>
        <name>yarn.resourcemanager.recovery.enabled</name>
        <value>true</value>
    </property>
    <!-- 制定resourcemanager的状态信息存储在zookeeper集群上 -->
    <property>
        <name>yarn.resourcemanager.store.class</name>
        <value>org.apache.hadoop.yarn.server.resourcemanager.recovery.ZKRMStateStore</value>
    </property>
    <!-- 环境变量的继承 -->
    <property>
        <name>yarn.nodemanager.env-whitelist</name>
        <value>JAVA_HOME,HADOOP_COMMON_HOME,HADOOP_HDFS_HOME,HADOOP_CONF_DIR,CLASSPATH_PREPEND_DISTCACHE,HADOOP_YARN_HOME,HADOOP_MAPRED_HOME</value>
    </property>
    
    <!--yarn单个容器允许分配的最大最小内存 -->
    <property>
        <name>yarn.scheduler.minimum-allocation-mb</name>
        <value>512</value>
    </property>
    <property>
        <name>yarn.scheduler.maximum-allocation-mb</name>
        <value>4096</value>
    </property>
    
    <!-- yarn容器允许管理的物理内存大小 -->
    <property>
        <name>yarn.nodemanager.resource.memory-mb</name>
        <value>4096</value>
    </property>
    
    <!-- 关闭yarn对物理内存和虚拟内存的限制检查 -->
    <property>
        <name>yarn.nodemanager.pmem-check-enabled</name>
        <value>true</value>
    </property>
    <property>
        <name>yarn.nodemanager.vmem-check-enabled</name>
        <value>false</value>
    </property>
</configuration>
hadoop-env.sh
vim /opt/module/hadoop-3.3.4/etc/hadoop/hadoop-env.sh
export HDFS_NAMENODE_USER=root
export HDFS_DATANODE_USER=root
export HDFS_SECONDARYNAMENODE_USER=root
export YARN_RESOURCEMANAGER_USER=root
export YARN_NODEMANAGER_USER=root
export JAVA_HOME=/opt/module/jdk1.8.0_212
workers
vim /opt/module/hadoop-3.3.4/etc/hadoop/workers
hadoop102
hadoop103
hadoop104

Distribute

xsync /opt/module/hadoop-3.3.4/
xsync /etc/profile.d/my_env.sh

Initialization

Start ZK
MyZk.sh start
Start the JournalNodes
  • Start on each of the three machines
  • After initialization, later startups do not require starting the journalnode process manually
hdfs --daemon start journalnode
Format the NameNode
hdfs namenode -format
Start the NameNode
  • On hadoop102
hdfs --daemon start namenode
Sync the NameNode Data
  • Run on hadoop103
hdfs namenode -bootstrapStandby
Format ZKFC
  • Whichever machine this is run on becomes the first Active NameNode
hdfs zkfc -formatZK
Start Everything
start-all.sh

Active/Standby Switchover

Check node status
# nn1/nn2 come from the dfs.ha.namenodes.mycluster parameter in hdfs-site.xml
hdfs haadmin -getServiceState nn1
Switch
# Switch nn2 to Standby
hdfs haadmin -transitionToStandby --forcemanual nn2
# Switch nn1 to Active
hdfs haadmin -transitionToActive --forcemanual nn1

Data Balancing

Balance data across nodes
# 10: node utilization may differ by at most 10%
start-balancer.sh -threshold 10
# Stop balancing
stop-balancer.sh
Balance data across disks
Generate a balancing plan
hdfs diskbalancer -plan hadoop103
Execute the balancing plan
hdfs diskbalancer -execute hadoop103.plan.json
Check the progress of the current balancing task
hdfs diskbalancer -query hadoop103
Cancel the balancing task
hdfs diskbalancer -cancel hadoop103.plan.json

Yarn Tuning

Increase the ApplicationMaster Resource Ratio

  • This parameter sets the maximum share of each Capacity Scheduler queue's resources that may be occupied by ApplicationMasters
  • In effect it caps the total memory available to ApplicationMasters, and therefore how many applications can run concurrently
vim /opt/module/hadoop-3.3.4/etc/hadoop/capacity-scheduler.xml
# Modify
<property>
    <name>yarn.scheduler.capacity.maximum-am-resource-percent</name>
    <value>0.8</value>
</property>
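
Capacity Scheduler changes can be applied without restarting YARN; after editing the file, the standard refresh step is:

# Reload capacity-scheduler.xml into the running ResourceManager
yarn rmadmin -refreshQueues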

Kafka Setup

Extract

tar -zxvf /opt/software/kafka_2.12-3.3.1.tgz -C /opt/module/

Configure Environment Variables

vim /etc/profile.d/my_env.sh
source /etc/profile.d/my_env.sh
# Kafka
export KAFKA_HOME=/opt/module/kafka_2.12-3.3.1
export PATH=$PATH:$KAFKA_HOME/bin

Modify the Configuration File

vim /opt/module/kafka_2.12-3.3.1/config/server.properties
# Modify; keep this value consistent with the server.N number in Zookeeper's zoo.cfg
broker.id=1
log.dirs=/opt/module/kafka_2.12-3.3.1/data
zookeeper.connect=hadoop102:2181,hadoop103:2181,hadoop104:2181/kafka
num.partitions=4
# Add
listeners=PLAINTEXT://hadoop102:9092
# Comment out the following
log.retention.hours=168
log.retention.check.interval.ms=300000

Distribute

xsync /opt/module/kafka_2.12-3.3.1
xsync /etc/profile.d/my_env.sh

Modify the Configuration on Each Node

vim /opt/module/kafka_2.12-3.3.1/config/server.properties
listeners=PLAINTEXT://<local host IP>:9092
broker.id=<the matching server.N number from Zookeeper's zoo.cfg>

Start

  • Prerequisite: Zookeeper is running
  • Start on each node
nohup kafka-server-start.sh /opt/module/kafka_2.12-3.3.1/config/server.properties >> /opt/module/kafka_2.12-3.3.1/logs/kafka.log 2>&1 &

Test

  • If no error is reported, the cluster is working
kafka-topics.sh --bootstrap-server hadoop102:9092 --list
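
Beyond listing topics, an end-to-end check can create a throwaway topic and push a message through it (the topic name test is arbitrary):

# Create a test topic with 3 partitions and 2 replicas
kafka-topics.sh --bootstrap-server hadoop102:9092 --create --topic test --partitions 3 --replication-factor 2
# Produce a message (type a line, then Ctrl+C to exit)
kafka-console-producer.sh --bootstrap-server hadoop102:9092 --topic test
# Consume it from the beginning in another terminal
kafka-console-consumer.sh --bootstrap-server hadoop102:9092 --topic test --from-beginning
# Clean up
kafka-topics.sh --bootstrap-server hadoop102:9092 --delete --topic test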

Flume Setup

Extract

tar -zxvf /opt/software/apache-flume-1.10.1-bin.tar.gz -C /opt/module/

Configure Environment Variables

vim /etc/profile.d/my_env.sh
source /etc/profile.d/my_env.sh
# Flume
export FLUME_HOME=/opt/module/apache-flume-1.10.1-bin
export PATH=$PATH:$FLUME_HOME/bin

Modify log4j2.xml

vim /opt/module/apache-flume-1.10.1-bin/conf/log4j2.xml
# Modify
<Property name="LOG_DIR">/opt/module/apache-flume-1.10.1-bin/logs</Property>
# Add inside the Loggers tag
<Root level="INFO">
	<AppenderRef ref="LogFile" />
	<AppenderRef ref="Console" />
</Root>

Modify flume-env.sh

mv /opt/module/apache-flume-1.10.1-bin/conf/flume-env.sh.template /opt/module/apache-flume-1.10.1-bin/conf/flume-env.sh
vim /opt/module/apache-flume-1.10.1-bin/conf/flume-env.sh
# Add
export JAVA_HOME=/opt/module/jdk1.8.0_212

Resolve Jar Conflicts

cp /opt/module/hadoop-3.3.4/share/hadoop/common/lib/guava-27.0-jre.jar /opt/module/apache-flume-1.10.1-bin/lib/
rm -f /opt/module/apache-flume-1.10.1-bin/lib/guava-11.0.2.jar
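
A throwaway netcat-to-logger agent is a quick way to confirm the installation works; the file name and agent name below are arbitrary:

# Write a minimal agent definition (netcat source -> memory channel -> logger sink)
cat > /opt/module/apache-flume-1.10.1-bin/job-netcat.conf << 'EOF'
a1.sources = r1
a1.channels = c1
a1.sinks = k1
a1.sources.r1.type = netcat
a1.sources.r1.bind = localhost
a1.sources.r1.port = 44444
a1.channels.c1.type = memory
a1.sinks.k1.type = logger
a1.sources.r1.channels = c1
a1.sinks.k1.channel = c1
EOF
# Start the agent; events sent with "nc localhost 44444" should appear in the console
flume-ng agent -n a1 -c /opt/module/apache-flume-1.10.1-bin/conf -f /opt/module/apache-flume-1.10.1-bin/job-netcat.conf -Dflume.root.logger=INFO,console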

MySQL Installation

#!/bin/bash
set -x
[ "$(whoami)" = "root" ] || exit 1
[ "$(ls *.rpm | wc -l)" = "7" ] || exit 1
test -f mysql-community-client-8.0.31-1.el7.x86_64.rpm && \
test -f mysql-community-client-plugins-8.0.31-1.el7.x86_64.rpm && \
test -f mysql-community-common-8.0.31-1.el7.x86_64.rpm && \
test -f mysql-community-icu-data-files-8.0.31-1.el7.x86_64.rpm && \
test -f mysql-community-libs-8.0.31-1.el7.x86_64.rpm && \
test -f mysql-community-libs-compat-8.0.31-1.el7.x86_64.rpm && \
test -f mysql-community-server-8.0.31-1.el7.x86_64.rpm || exit 1

# 卸载MySQL
systemctl stop mysql mysqld 2>/dev/null
rpm -qa | grep -i 'mysql\|mariadb' | xargs -n1 rpm -e --nodeps 2>/dev/null
rm -rf /var/lib/mysql /var/log/mysqld.log /usr/lib64/mysql /etc/my.cnf /usr/my.cnf

set -e
# 安装并启动MySQL
yum install -y *.rpm >/dev/null 2>&1
systemctl start mysqld

#更改密码级别并重启MySQL
sed -i '/\[mysqld\]/avalidate_password.length=4\nvalidate_password.policy=0' /etc/my.cnf
systemctl restart mysqld

# 更改MySQL配置
tpass=$(cat /var/log/mysqld.log | grep "temporary password" | awk '{print $NF}')
cat << EOF | mysql -uroot -p"${tpass}" --connect-expired-password >/dev/null 2>&1
set password='000000';
update mysql.user set host='%' where user='root';
alter user 'root'@'%' identified with mysql_native_password by '000000';
flush privileges;
EOF
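
If the script finishes without errors, the root password is 000000 and remote access is enabled; a quick sanity check might be:

# Confirm the server is running and the root account accepts the new password
systemctl status mysqld --no-pager
mysql -uroot -p000000 -e "select version(); select user, host from mysql.user where user='root';"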

Maxwell Installation

Extract

tar -zxvf /opt/software/maxwell-1.29.2.tar.gz -C /opt/module/

Configure MySQL

Modify my.cnf
vim /etc/my.cnf
  • Add
# server id
server-id = 1
## enable binlog; this value is used as the binlog file-name prefix
log-bin=mysql-bin
## binlog format; Maxwell requires row
binlog_format=row
## databases to write to the binlog; adjust to your actual databases
binlog-do-db= financial_lease
Restart MySQL
sudo systemctl restart mysqld

Create the Maxwell Database and User

Log in to MySQL
mysql -uroot -p000000
Create the database
create database maxwell;
Create the user
# Create the maxwell user with password maxwell
CREATE USER 'maxwell'@'%' IDENTIFIED BY 'maxwell';
# Grant maxwell full privileges on the maxwell database
GRANT ALL ON maxwell.* TO 'maxwell'@'%';
# Grant the replication-related privileges on all other databases
GRANT SELECT, REPLICATION CLIENT, REPLICATION SLAVE ON *.* TO 'maxwell'@'%';

Configure Maxwell

cp /opt/module/maxwell-1.29.2/config.properties.example /opt/module/maxwell-1.29.2/config.properties
vim /opt/module/maxwell-1.29.2/config.properties
log_level=info

producer=kafka
kafka.bootstrap.servers=hadoop102:9092,hadoop103:9092,hadoop104:9092
kafka_topic=topic_db
# Partition records by primary key so data spreads across Kafka partitions and skew is avoided
producer_partition_by=primary_key

# mysql login info
host=hadoop102
user=maxwell
password=maxwell
jdbc_options=useSSL=false&serverTimezone=Asia/Shanghai&allowPublicKeyRetrieval=true

Start/Stop

  • Start
/opt/module/maxwell-1.29.2/bin/maxwell --config /opt/module/maxwell-1.29.2/config.properties --daemon
  • Stop
ps -ef | grep com.zendesk.maxwell.Maxwell | grep -v grep | awk '{print $2}' | xargs kill -9
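
To confirm that change events actually reach Kafka, one option is to watch the topic_db topic while inserting or updating a row in the monitored database:

# Changes to the financial_lease tables should appear here as JSON while Maxwell is running
kafka-console-consumer.sh --bootstrap-server hadoop102:9092 --topic topic_db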

DataX Installation

Extract

  • This build is Shangguigu's (atguigu) modified version; the stock build reads MySQL versions below 5.7 by default
  • If you use the official build against MySQL 5.7+, replace the MySQL driver jar under /opt/module/datax/plugin/reader/mysqlreader/libs
  • Do the same for the writer plugin, and specify the Driver explicitly when writing job configuration files
tar -zxvf /opt/software/datax.tar.gz -C /opt/module/
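
DataX ships with a stream-to-stream sample job, which makes a convenient smoke test (assuming Python is available on the node):

# Run the bundled self-check job; it prints sample rows and a job summary on success
python /opt/module/datax/bin/datax.py /opt/module/datax/job/job.json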

Hive Installation

Extract

tar -zxvf /opt/software/hive-3.1.3.tar.gz -C /opt/module

Configure Environment Variables

vim /etc/profile.d/my_env.sh
source /etc/profile.d/my_env.sh
# Hive
export HIVE_HOME=/opt/module/apache-hive-3.1.3-bin
export PATH=$PATH:$HIVE_HOME/bin

Resolve Jar Conflicts

mv /opt/module/apache-hive-3.1.3-bin/lib/log4j-slf4j-impl-2.17.1.jar /opt/module/apache-hive-3.1.3-bin/lib/log4j-slf4j-impl-2.17.1.jar.bak

Add hive-site.xml

vim /opt/module/apache-hive-3.1.3-bin/conf/hive-site.xml
<?xml version="1.0"?>
<?xml-stylesheet type="text/xsl" href="configuration.xsl"?>
<configuration>
    <!-- 配置Hive保存元数据信息所需的 MySQL URL地址 -->
    <property>
        <name>javax.jdo.option.ConnectionURL</name>
        <value>jdbc:mysql://hadoop102:3306/metastore?useSSL=false&amp;useUnicode=true&amp;characterEncoding=UTF-8&amp;allowPublicKeyRetrieval=true</value>
    </property>

    <!-- 配置Hive连接MySQL的驱动全类名 -->
    <property>
        <name>javax.jdo.option.ConnectionDriverName</name>
        <value>com.mysql.cj.jdbc.Driver</value>
    </property>

    <!-- 配置Hive连接MySQL的用户名 -->
    <property>
        <name>javax.jdo.option.ConnectionUserName</name>
        <value>hive</value>
    </property>

    <!-- 配置Hive连接MySQL的密码 -->
    <property>
        <name>javax.jdo.option.ConnectionPassword</name>
        <value>hive</value>
    </property>

    <!-- Hive数据仓库的存储路径 -->
    <property>
        <name>hive.metastore.warehouse.dir</name>
        <value>/user/hive/warehouse</value>
    </property>

    <!-- 禁用Hive元数据架构验证 -->
    <property>
        <name>hive.metastore.schema.verification</name>
        <value>false</value>
    </property>

    <!-- Hive Server2的Thrift服务端口 -->
    <property>
        <name>hive.server2.thrift.port</name>
        <value>10000</value>
    </property>

    <!-- Hive Server2绑定的主机地址 -->
    <property>
        <name>hive.server2.thrift.bind.host</name>
        <value>hadoop102</value>
    </property>

    <!-- 禁用Hive元数据事件通知的API认证 -->
    <property>
        <name>hive.metastore.event.db.notification.api.auth</name>
        <value>false</value>
    </property>

    <!-- 在Hive CLI中打印查询结果的列头 -->
    <property>
        <name>hive.cli.print.header</name>
        <value>true</value>
    </property>

    <!-- 在Hive CLI中打印当前数据库名称 -->
    <property>
        <name>hive.cli.print.current.db</name>
        <value>true</value>
    </property>

    <!-- 指定Hive Metastore的Thrift URI -->
    <property>
        <name>hive.metastore.uris</name>
        <value>thrift://hadoop102:9083</value>
    </property>

    <!-- 指定Hadoop文件系统的默认名称 -->
    <property>
        <name>fs.defaultFS</name>
        <value>hdfs://mycluster</value>
    </property>
</configuration>

Link the Hadoop Configuration into Hive

ln -s /opt/module/hadoop-3.3.4/etc/hadoop/core-site.xml /opt/module/apache-hive-3.1.3-bin/conf/core-site.xml
ln -s /opt/module/hadoop-3.3.4/etc/hadoop/hdfs-site.xml /opt/module/apache-hive-3.1.3-bin/conf/hdfs-site.xml
ln -s /opt/module/hadoop-3.3.4/etc/hadoop/mapred-site.xml /opt/module/apache-hive-3.1.3-bin/conf/mapred-site.xml
ln -s /opt/module/hadoop-3.3.4/etc/hadoop/yarn-site.xml /opt/module/apache-hive-3.1.3-bin/conf/yarn-site.xml

Create the Database and User

# Create the metastore database
create database metastore;
# Create the user
create user 'hive'@'hadoop102' identified by 'hive';
# Grant privileges
grant all privileges on metastore.* to 'hive'@'hadoop102';

Add the MySQL Connector Jar

cp /opt/software/mysql/mysql-connector-j-8.0.31.jar /opt/module/apache-hive-3.1.3-bin/lib/

Initialize the Metastore Schema

schematool -initSchema -dbType mysql -verbose

Change the Character Set

alter table metastore.COLUMNS_V2 modify column COMMENT varchar(256) character set utf8;
alter table metastore.TABLE_PARAMS modify column PARAM_VALUE mediumtext character set utf8;
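
hive-site.xml above points clients at a standalone metastore (thrift://hadoop102:9083) and at HiveServer2 on port 10000, so both services must be running before hive or beeline can connect. A rough way to start them in the background:

# Start the metastore and HiveServer2 in the background, keeping their logs
mkdir -p /opt/module/apache-hive-3.1.3-bin/logs
nohup hive --service metastore > /opt/module/apache-hive-3.1.3-bin/logs/metastore.log 2>&1 &
nohup hive --service hiveserver2 > /opt/module/apache-hive-3.1.3-bin/logs/hiveserver2.log 2>&1 &
# Once HiveServer2 is up (this can take a minute), connect with beeline
beeline -u jdbc:hive2://hadoop102:10000 -n root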

Spark Installation

Extract

tar -zxvf /opt/software/spark-3.3.1-bin-without-hadoop.tgz -C /opt/module/

Modify spark-env.sh

mv /opt/module/spark-3.3.1-bin-without-hadoop/conf/spark-env.sh.template /opt/module/spark-3.3.1-bin-without-hadoop/conf/spark-env.sh
vim /opt/module/spark-3.3.1-bin-without-hadoop/conf/spark-env.sh
# Add
export SPARK_DIST_CLASSPATH=$(hadoop classpath)

Configure Environment Variables

vim /etc/profile.d/my_env.sh
source /etc/profile.d/my_env.sh
# Spark
export SPARK_HOME=/opt/module/spark-3.3.1-bin-without-hadoop
export PATH=$PATH:$SPARK_HOME/bin

Hive on Spark

Create the Spark Configuration File for Hive

vim /opt/module/apache-hive-3.1.3-bin/conf/spark-defaults.conf
spark.master            yarn
spark.eventLog.enabled  true
spark.eventLog.dir      hdfs://mycluster/spark-history
spark.executor.memory   1g
spark.driver.memory     1g

Create the HDFS Directory for History Logs

hadoop fs -mkdir /spark-history

Upload the Spark (without-Hadoop) Jars

hadoop fs -mkdir /spark-jars
hadoop fs -put /opt/module/spark-3.3.1-bin-without-hadoop/jars/* /spark-jars

Modify hive-site.xml

vim /opt/module/apache-hive-3.1.3-bin/conf/hive-site.xml
<!--Spark依赖位置-->
<property>
        <name>spark.yarn.jars</name>
        <value>hdfs://mycluster/spark-jars/*</value>
</property>
<!--Hive执行引擎-->
<property>
        <name>hive.execution.engine</name>
        <value>spark</value>
</property>
<!--提交任务超时时间-->
<property>
        <name>hive.spark.client.connect.timeout</name>
        <value>5000</value>
</property>
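
A simple insert forces a Spark job and exercises the whole chain (YARN, the uploaded jars, and the engine setting); the table name below is a throwaway example:

# Run a query that triggers a Spark job on YARN
hive -e "
create table if not exists hos_test(id int, name string);
insert into hos_test values (1, 'spark');
select count(*) from hos_test;
drop table hos_test;
"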

DolphinScheduler Deployment

Extract

tar -zxvf /opt/software/apache-dolphinscheduler-2.0.5-bin.tar.gz -C /opt/module/

Create the Metadata Database and User

CREATE DATABASE dolphinscheduler DEFAULT CHARACTER SET utf8 DEFAULT COLLATE utf8_general_ci;
CREATE USER 'dolphinscheduler'@'%' IDENTIFIED BY 'dolphinscheduler';
GRANT ALL PRIVILEGES ON dolphinscheduler.* TO 'dolphinscheduler'@'%';
flush privileges;

Configure the One-Click Deployment Script

vim /opt/module/apache-dolphinscheduler-2.0.5-bin/conf/config/install_config.conf
# 要部署的集群地址
ips="hadoop102,hadoop103,hadoop104"
# master地址:负责 DAG 任务切分、任务提交、任务监控
masters="hadoop102"
# WorkerServer地址:负责任务的执行和提供日志服务
workers="hadoop102:default,hadoop103:default,hadoop104:default"
# alertServer地址:提供告警相关服务
alertServer="hadoop103"
# API接口层:主要负责处理前端UI层的请求
apiServers="hadoop104"
# 注释
# pythonGatewayServers="ds1"
# DS安装地址,不存在会自动创建
installPath="/opt/module/dolphinscheduler"
# 部署用户
deployUser="root"
# DS本地数据存储目录
dataBasedirPath="/opt/module/dolphinscheduler/data"
# JAVA路径
javaHome="/opt/module/jdk1.8.0_212"
# 指定元数据库类型
DATABASE_TYPE=${DATABASE_TYPE:-"mysql"}
# 数据库URL
SPRING_DATASOURCE_URL=${SPRING_DATASOURCE_URL:-"jdbc:mysql://hadoop102:3306/dolphinscheduler?useUnicode=true&allowPublicKeyRetrieval=true&characterEncoding=UTF-8"}
# 数据库用户名
SPRING_DATASOURCE_USERNAME=${SPRING_DATASOURCE_USERNAME:-"dolphinscheduler"}
# 数据库密码
SPRING_DATASOURCE_PASSWORD=${SPRING_DATASOURCE_PASSWORD:-"dolphinscheduler"}
# 注册中心插件
registryPluginName="zookeeper"
# ZK集群地址
registryServers="hadoop102:2181,hadoop103:2181,hadoop104:2181"
# ZK中节点名
registryNamespace="dolphinscheduler"
# 资源存储类型
resourceStorageType="HDFS"
# 资源上传地址
resourceUploadPath="/dolphinscheduler"
# HDFS访问地址
defaultFS="hdfs://mycluster"
# 如果配置了YARN HA就配置这个,配置YARN的配置的地址
yarnHaIps="hadoop103,hadoop104"
# 访问HDFS用户
hdfsRootUser="root"

After Hadoop HA Is Running, Copy hdfs-site.xml

cp /opt/module/hadoop-3.3.4/etc/hadoop/hdfs-site.xml /opt/module/apache-dolphinscheduler-2.0.5-bin/conf/

Initialize the Database

cp /opt/software/mysql/mysql-connector-j-8.0.31.jar /opt/module/apache-dolphinscheduler-2.0.5-bin/lib/
cd /opt/module/apache-dolphinscheduler-2.0.5-bin/
script/create-dolphinscheduler.sh

Deploy DolphinScheduler

  • Prerequisite: Zookeeper is running
cd /opt/module/apache-dolphinscheduler-2.0.5-bin/
./install.sh 

Start/Stop

# Start
/opt/module/dolphinscheduler/bin/start-all.sh 
# Stop
/opt/module/dolphinscheduler/bin/stop-all.sh

WebUI

  • Username: admin
  • Password: dolphinscheduler123
# Served by the API server
http://hadoop104:12345/dolphinscheduler

Superset Deployment

Install Miniconda

cd /opt/software/
bash Miniconda3-latest-Linux-x86_64.sh
# Keep pressing Enter until you are asked to accept the license, then type yes
yes
# After yes, enter the install location, then answer yes to the remaining prompts
/opt/module/miniconda3

Configure Environment Variables

vim /etc/profile.d/my_env.sh
# miniconda
export CONDA_HOME=/opt/module/miniconda3
export PATH=$PATH:$CONDA_HOME/bin
source ~/.bashrc

Disable Auto-Activation of the base Environment

  • Run in a new terminal
conda config --set auto_activate_base false

Create the Python Environment

conda config --add channels https://mirrors.tuna.tsinghua.edu.cn/anaconda/pkgs/free
conda config --add channels https://mirrors.tuna.tsinghua.edu.cn/anaconda/pkgs/main
conda config --set show_channel_urls yes
conda config --remove-key channels
conda create --name superset python=3.8.16
conda activate superset

Install Superset

sudo yum install -y gcc gcc-c++ libffi-devel python-devel python-pip python-wheel python-setuptools openssl-devel cyrus-sasl-devel openldap-devel
pip install virtualenv
pip install --upgrade pip -i https://pypi.douban.com/simple/
pip install apache-superset==2.0.0 -i https://pypi.tuna.tsinghua.edu.cn/simple -r /opt/software/base.txt

Configure the Metadata Database

  • Use a new terminal
CREATE DATABASE superset DEFAULT CHARACTER SET utf8 DEFAULT COLLATE utf8_general_ci;
create user superset@'%' identified WITH mysql_native_password BY 'superset';
grant all privileges on *.* to superset@'%' with grant option;
flush privileges;

Modify the Configuration File

  • Use a new terminal
vim /opt/module/miniconda3/envs/superset/lib/python3.8/site-packages/superset/config.py
SQLALCHEMY_DATABASE_URI = 'mysql://superset:superset@hadoop102:3306/superset?charset=utf8'
conda install mysqlclient

Initialize

export FLASK_APP=superset
superset db upgrade
# Create the admin account; press Enter to accept the defaults
superset fab create-admin
superset init
pip install gunicorn -i https://pypi.douban.com/simple/

Start

# Start
gunicorn --workers 5 --timeout 120 --bind hadoop102:8787  "superset.app:create_app()" --daemon 
# Stop
ps -ef | awk '/superset/ && !/awk/{print $2}' | xargs kill -9
http://hadoop102:8787
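
If a single start/stop command is preferred, a small wrapper around the two commands above might look like this (paths and the superset conda environment as configured earlier):

#!/bin/bash
# superset.sh: start or stop the Superset gunicorn workers
export FLASK_APP=superset
case $1 in
start)
    source /opt/module/miniconda3/bin/activate superset
    gunicorn --workers 5 --timeout 120 --bind hadoop102:8787 "superset.app:create_app()" --daemon
    ;;
stop)
    ps -ef | awk '/superset/ && !/awk/{print $2}' | xargs kill -9
    ;;
*)
    echo "Usage: superset.sh {start|stop}"
    ;;
esac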

Flink Setup

Extract

tar -zxvf /opt/software/flink-1.17.1-bin-scala_2.12.tgz -C /opt/module/

Modify the Cluster Configuration

vim /opt/module/flink-1.17.1/conf/flink-conf.yaml
# JobManager address
jobmanager.rpc.address: hadoop102
# Bind address for external access
jobmanager.bind-host: 0.0.0.0
# TaskManager bind address for external access
taskmanager.bind-host: 0.0.0.0
# TaskManager address; set to the local host on each node
taskmanager.host: hadoop102
# Address clients use to connect
rest.address: hadoop102
rest.bind-address: 0.0.0.0
# Number of slots per TaskManager; here, the number of CPU cores of the Linux host
taskmanager.numberOfTaskSlots: 4
vim /opt/module/flink-1.17.1/conf/workers
hadoop102
hadoop103
hadoop104
vim /opt/module/flink-1.17.1/conf/masters
hadoop102:8081

Configure Environment Variables

vim /etc/profile.d/my_env.sh
# Flink
export FLINK_HOME=/opt/module/flink-1.17.1
export PATH=$PATH:$FLINK_HOME/bin

Distribute

xsync /opt/module/flink-1.17.1/

Modify flink-conf.yaml

  • On the worker nodes
# TaskManager address; set to the local host
taskmanager.host: hadoop103

Start

# Start
/opt/module/flink-1.17.1/bin/start-cluster.sh
# Stop
/opt/module/flink-1.17.1/bin/stop-cluster.sh
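
To verify the standalone cluster, the bundled streaming WordCount example can be submitted and then checked in the web UI at http://hadoop102:8081:

# Submit the example job to the running cluster
flink run /opt/module/flink-1.17.1/examples/streaming/WordCount.jar
# List running and finished jobs from the CLI
flink list -a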

HBase Setup

Extract

tar -zxvf /opt/software/hbase-2.4.11-bin.tar.gz -C /opt/module/

Configure Environment Variables

vim /etc/profile.d/my_env.sh
# HBase
export HBASE_HOME=/opt/module/hbase-2.4.11
export PATH=$PATH:$HBASE_HOME/bin

Modify the Configuration Files

hbase-env.sh
vim /opt/module/hbase-2.4.11/conf/hbase-env.sh
# Add
export HBASE_MANAGES_ZK=false
hbase-site.xml
vim /opt/module/hbase-2.4.11/conf/hbase-site.xml
<?xml version="1.0"?>
<?xml-stylesheet type="text/xsl" href="configuration.xsl"?>

<configuration>
 	<!-- 指定Zookeeper集群地址 -->  
 	<property>
		<name>hbase.zookeeper.quorum</name>
		<value>hadoop102,hadoop103,hadoop104</value>
	</property>
	<!-- Zookeeper 端口,默认2181 -->
  	<property>
    		<name>hbase.zookeeper.property.clientPort</name>
    		<value>2181</value>
  	</property>
  	<!-- Zookeeper元数据的存储目录,需要和Zookeeper的zoo.cfg 配置的一致 -->
  	<property>
    		<name>hbase.zookeeper.property.dataDir</name>
    		<value>/opt/module/apache-zookeeper-3.7.1-bin/zkData</value>
  	</property>
	<!-- 指定hbase在HDFS上存储的路径 -->
	<property>
		<name>hbase.rootdir</name>
		<value>hdfs://mycluster/hbase</value>
	</property>
	<!-- 指定hbase是分布式的 -->
	<property>
		<name>hbase.cluster.distributed</name>
		<value>true</value>
	</property>
	<!-- 指定写操作写入文件系统 -->
	<property>
		<name>hbase.wal.provider</name>
		<value>filesystem</value>
	</property>
	<!-- 解决启动HMaster无法初始化WAL的问题 -->
  	<property>
    		<name>hbase.unsafe.stream.capability.enforce</name>
    		<value>false</value>
  	</property>
  	<!-- 指定HBase RegionServer Web页面访问端口,默认端口号16030 -->
  	<property>
    		<name>hbase.regionserver.info.port</name>
    		<value>16030</value>
  	</property>
  	<!-- 指定HBase Master Web页面访问端口,默认端口号16010 -->
  	<property>
    		<name>hbase.master.info.port</name>
    		<value>16010</value>
  	</property>
</configuration>
regionservers
vim /opt/module/hbase-2.4.11/conf/regionservers
hadoop102
hadoop103
hadoop104
Configure symlinks to the Hadoop configuration
ln -sf /opt/module/hadoop-3.3.4/etc/hadoop/hdfs-site.xml /opt/module/hbase-2.4.11/conf/
ln -s /opt/module/hadoop-3.3.4/etc/hadoop/core-site.xml /opt/module/hbase-2.4.11/conf/

Resolve the log4j Compatibility Issue

mv /opt/module/hbase-2.4.11/lib/client-facing-thirdparty/slf4j-reload4j-1.7.33.jar  /opt/module/hbase-2.4.11/lib/client-facing-thirdparty/slf4j-reload4j-1.7.33.jar.bak

Distribute

xsync /opt/module/hbase-2.4.11
xsync /etc/profile.d/my_env.sh

Start/Stop

start-hbase.sh
stop-hbase.sh
http://hadoop102:16010/
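
A short non-interactive hbase shell session confirms that the Master, the RegionServers, and the HDFS root directory are all working (the smoke table is a throwaway name):

# Create a table, write a cell, read it back, then drop the table
echo "
create 'smoke', 'cf'
put 'smoke', 'r1', 'cf:c1', 'hello'
scan 'smoke'
disable 'smoke'
drop 'smoke'
" | hbase shell -n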

Redis Installation

  • The tarball provided by Shangguigu (atguigu) was broken, so an arbitrary 6.2.6 release is used instead

Extract

tar -zxvf /opt/software/redis-6.2.6.tar.gz -C /opt/module/

Install the C++ Compiler

yum install gcc-c++ -y
# Test
gcc -v

Build and Install

cd /opt/module/redis-6.2.6/
make
make PREFIX=/opt/module/redis-6.2.6 install

Modify the Configuration File

vim /opt/module/redis-6.2.6/redis.conf
# Run Redis as a daemon
daemonize yes
# Disable protected mode
protected-mode no
# Which networks to listen on
bind 0.0.0.0

Configure Environment Variables

vim /etc/profile.d/my_env.sh
# Redis
export REDIS_HOME=/opt/module/redis-6.2.6
export PATH=$PATH:$REDIS_HOME/bin

Start/Stop

# Start; because daemon mode is enabled it will not stay in the foreground
/opt/module/redis-6.2.6/bin/redis-server /opt/module/redis-6.2.6/redis.conf
# Stop
/opt/module/redis-6.2.6/bin/redis-cli shutdown
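
A quick round trip through redis-cli confirms the server accepts connections on the bound address (assuming Redis was installed on hadoop102 as above):

# PING should return PONG; then write and read back a key
/opt/module/redis-6.2.6/bin/redis-cli -h hadoop102 ping
/opt/module/redis-6.2.6/bin/redis-cli -h hadoop102 set smoke_test ok
/opt/module/redis-6.2.6/bin/redis-cli -h hadoop102 get smoke_test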

Doris Installation

Configure System Limits

vim /etc/security/limits.conf
# Add
*           soft  nproc  409600
*           hard  nproc  409600
*           soft  stack  409600
*           hard  stack  409600
*           soft  nofile  409600
*           hard  nofile  409600
vim /etc/sysctl.conf
# Add
vm.max_map_count=2000000
xsync /etc/security/limits.conf
xsync /etc/sysctl.conf
reboot
Extract
mkdir -p /opt/module/doris
tar -xvf /opt/software/apache-doris-fe-1.2.4.1-bin-x86_64.tar.xz -C /opt/module/doris/
tar -xvf /opt/software/apache-doris-be-1.2.4.1-bin-x86_64.tar.xz -C /opt/module/doris/
tar -xvf /opt/software/apache-doris-dependencies-1.2.4.1-bin-x86_64.tar.xz -C /opt/module/doris/
cp /opt/module/doris/apache-doris-dependencies-1.2.4.1-bin-x86_64/java-udf-jar-with-dependencies.jar /opt/module/doris/apache-doris-be-1.2.4.1-bin-x86_64/lib/

Configure FE

  • FE (Frontend): stores and maintains the metadata; receives and parses query requests, plans and schedules query execution, and returns the results
Modify fe.conf
vim /opt/module/doris/apache-doris-fe-1.2.4.1-bin-x86_64/conf/fe.conf
# Web UI port
http_port = 7030
# Metadata path; defaults to the FE root directory, so it can be left unset
# meta_dir = /opt/module/doris/fe/doris-meta
# Restrict which network the FE binds to
priority_networks = 192.168.0.0/16
Start
# Start
/opt/module/doris/apache-doris-fe-1.2.4.1-bin-x86_64/bin/start_fe.sh --daemon
Web UI
  • Username: root
  • Password: (none)
http://hadoop102:7030/login

Configure BE

  • BE (Backend): stores and computes on the physical data, executing the physical plan produced by the FE in a distributed fashion
Modify be.conf
vim /opt/module/doris/apache-doris-be-1.2.4.1-bin-x86_64/conf/be.conf
# If no storage directory is configured, the default is used
# storage_root_path = /opt/module/doris-1.1.1/doris-storage1;/opt/module/doris-1.1.1/doris-storage2.SSD,10
priority_networks = 192.168.0.0/16
webserver_port = 7040 
Distribute
xsync /opt/module/doris/apache-doris-be-1.2.4.1-bin-x86_64/
Add the BEs via MySQL
# No password by default
mysql -hhadoop102 -P9030 -uroot
ALTER SYSTEM ADD BACKEND "hadoop102:9050";
ALTER SYSTEM ADD BACKEND "hadoop103:9050";
ALTER SYSTEM ADD BACKEND "hadoop104:9050";
Check status
SHOW PROC '/backends'\G
Start
  • Start each of the three nodes
/opt/module/doris/apache-doris-be-1.2.4.1-bin-x86_64/bin/start_be.sh --daemon
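
Once the BEs are started, the same MySQL connection can confirm they have registered; each backend should report Alive: true after the first heartbeats arrive (this may take around 30 seconds):

# Check that all three backends are alive
mysql -hhadoop102 -P9030 -uroot -e "SHOW PROC '/backends'\G" | grep Alive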
