Version alignment with EMR 7.2.0
HDFS: Hadoop 3.3.6
Hive: Hive 3.1.3
Hue: Hue 4.11.0
Spark: Spark 3.5.1
Flink: 1.18.1
I. Prepare the dependency environment
1. JDK: 1.8.0_351
mkdir -p /data/software
cd /data/software
wget https://tools.qihangxingchen.com/download/jdk-8u351-linux-x64.tar.gz
tar -zxf jdk-8u351-linux-x64.tar.gz -C /usr/local/
cd /usr/local/jdk1.8.0_351
cat >> ~/.bashrc <<'EOF'
export JAVA_HOME=/usr/local/jdk1.8.0_351
export JRE_HOME=${JAVA_HOME}/jre
export CLASSPATH=.:${JAVA_HOME}/lib:${JRE_HOME}/lib
export PATH=${JAVA_HOME}/bin:$PATH
EOF
source ~/.bashrc
cat >> /etc/profile <<'EOF'
export JAVA_HOME=/usr/local/jdk1.8.0_351
export JRE_HOME=${JAVA_HOME}/jre
export CLASSPATH=.:${JAVA_HOME}/lib:${JRE_HOME}/lib
export PATH=${JAVA_HOME}/bin:$PATH
EOF
source /etc/profile
ln -s /usr/local/jdk1.8.0_351/bin/java /bin/java
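A quick sanity check that the JDK is on PATH (the version string should match the 1.8.0_351 tarball above):
java -version
javac -version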
2. MySQL: 8.0.33
cd /data/software
wget https://tools.qihangxingchen.com/download/mysql-8.0.33-el7-x86_64.tar.gz
tar -zxvf mysql-8.0.33-el7-x86_64.tar.gz
mv mysql-8.0.33-el7-x86_64 /usr/local/mysql8
ln -s /usr/local/mysql8/bin/mysql /usr/bin/mysql
ln -s /usr/local/mysql8/bin/mysqldump /usr/bin/mysqldump
ln -s /usr/local/mysql8/bin/mysqld /usr/bin/mysqld
## Create directories
mkdir -p /data/storage/mysql8/data
mkdir -p /data/storage/mysql8/logs
chown -R mysql:mysql /data/storage/mysql8/
## Configure my.cnf
cat >/etc/my.cnf <<EOF
[mysqld]
basedir=/usr/local/mysql8
datadir=/data/storage/mysql8/data
port = 3306
socket=/tmp/mysql8.sock
mysqlx_socket=/tmp/mysqlx8.sock
default_authentication_plugin=mysql_native_password
log-error=/data/storage/mysql8/logs/mysqld.log
pid-file=/tmp/mysqld8.pid
default-storage-engine=INNODB
log-bin=mysql-bin
binlog-format=ROW
relay_log_recovery = 1
master_info_repository =table
relay_log_info_repository =table
gtid_mode = on
enforce_gtid_consistency = on
binlog_cache_size=65536
server_id=1
max_connections=2000
max_connect_errors = 800
max_user_connections = 4000
innodb_flush_log_at_trx_commit = 0
innodb_buffer_pool_size = 2048M
back_log = 2048
lock_wait_timeout =50
skip_name_resolve =1
slow_query_log =1
long_query_time=1
innodb_file_per_table=1
sql_mode = STRICT_TRANS_TABLES,NO_ZERO_IN_DATE,NO_ZERO_DATE,ERROR_FOR_DIVISION_BY_ZERO,NO_ENGINE_SUBSTITUTION
sync_binlog = 0
character-set-server=utf8mb4
log_timestamps=SYSTEM
explicit_defaults_for_timestamp=0
skip_ssl
[client]
socket=/tmp/mysql8.sock
EOF
## Initialize and start the MySQL service
mysqld --defaults-file=/etc/my.cnf --datadir=/data/storage/mysql8/data --initialize-insecure --user=mysql
cp -a /usr/local/mysql8/support-files/mysql.server /etc/init.d/mysql
update-rc.d mysql defaults
systemctl enable mysql
systemctl restart mysql
## Create the root user and grant privileges
mysql -uroot -e "create user root@'%' identified by 'test@012';grant all on *.* to root@'%'; flush privileges;"
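A quick connectivity check against the account just created (assuming the server is reachable on the host IP listed in /etc/hosts below; a plain socket login matches root@localhost instead):
mysql -h10.20.12.41 -uroot -p'test@012' -e "select version();"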
3. /etc/hosts
10.20.12.41 office-ops-lzl-01
10.20.12.42 office-ops-lzl-02
4. Set up passwordless SSH login
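The original gives no commands for this step; a minimal sketch between the two hosts listed above, run as the account that will start the Hadoop daemons:
ssh-keygen -t rsa -b 4096 -N '' -f ~/.ssh/id_rsa
ssh-copy-id office-ops-lzl-01
ssh-copy-id office-ops-lzl-02
## Verify: should log in without a password prompt
ssh office-ops-lzl-02 hostname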
5. Maven installation
wget https://dlcdn.apache.org/maven/maven-3/3.9.9/binaries/apache-maven-3.9.9-bin.tar.gz
tar -zxvf apache-maven-3.9.9-bin.tar.gz -C /usr/local/
vim /usr/local/apache-maven-3.9.9/conf/settings.xml
## Add the following mirrors
<mirror>
<id>nexus-aliyun</id>
<mirrorOf>central</mirrorOf>
<name>Nexus aliyun</name>
<url>http://maven.aliyun.com/nexus/content/groups/public</url>
</mirror>
<mirror>
<id>aliyunmaven</id>
<mirrorOf>*</mirrorOf>
<name>spring-plugin</name>
<url>http://maven.aliyun.com/repository/spring-plugin</url>
</mirror>
<mirror>
<id>repo2</id>
<mirrorOf>central</mirrorOf>
<name>Mirror from Maven Repo2</name>
<url>http://repo.spring.io/plugins-release/</url>
</mirror>
vim /etc/profile
export MAVEN_HOME=/usr/local/apache-maven-3.9.9
export PATH=$HADOOP_HOME/bin:$HADOOP_HOME/sbin:$HIVE_HOME/bin:$SPARK_HOME/bin:$MAVEN_HOME/bin:$PATH
source /etc/profile
6. Scala installation
cd /data/software
wget https://downloads.lightbend.com/scala/2.13.14/scala-2.13.14.tgz
tar -zxvf scala-2.13.14.tgz
mv scala-2.13.14 /usr/local/scala
vim /etc/profile
export SCALA_HOME=/usr/local/scala
export PATH=$HADOOP_HOME/bin:$HADOOP_HOME/sbin:$HIVE_HOME/bin:$SPARK_HOME/bin:$MAVEN_HOME/bin:$SCALA_HOME/bin:$PATH
source /etc/profile
chown -R work.work /usr/local/scala
II. Hadoop deployment
1. Single-node installation
cd /data/software
wget https://downloads.apache.org/hadoop/common/hadoop-3.3.6/hadoop-3.3.6.tar.gz
tar -zxvf hadoop-3.3.6.tar.gz
mv hadoop-3.3.6 /usr/local/hadoop
## Configure environment variables
vim /etc/profile
#hadoop environment
export HADOOP_HOME=/usr/local/hadoop
export PATH=$HADOOP_HOME/bin:$HADOOP_HOME/sbin:$PATH
## Apply to the current shell
source /etc/profile
## Verify Hadoop
root@office-ops-lzl-01:/data/software# hadoop version
Hadoop 3.3.6
Source code repository https://github.com/apache/hadoop.git -r 1be78238728da9266a4f88195058f08fd012bf9c
Compiled by ubuntu on 2023-06-18T08:22Z
Compiled on platform linux-x86_64
Compiled with protoc 3.7.1
From source with checksum 5652179ad55f76cb287d9c633bb53bbd
This command was run using /usr/local/hadoop/share/hadoop/common/hadoop-common-3.3.6.jar
## Configure hadoop-env.sh
vim $HADOOP_HOME/etc/hadoop/hadoop-env.sh
# Add the following:
export JAVA_HOME=/usr/local/jdk1.8.0_351
export HADOOP_ROOT_LOGGER=WARN,console
## Create the directories HDFS needs
mkdir -p /usr/local/hadoop/data/dfs/name
chown -R work.work /usr/local/hadoop/data
2. Configuration files
- core-site.xml
cd $HADOOP_HOME/etc/hadoop
vim core-site.xml
<configuration>
<!-- Scheme, IP, and port of the distributed filesystem; the default port is 8020 -->
<property>
<name>fs.defaultFS</name>
<value>hdfs://10.20.12.41:8020/</value>
</property>
<!-- Hadoop data storage directory -->
<property>
<name>hadoop.tmp.dir</name>
<value>/usr/local/hadoop/data</value>
</property>
<!-- Static user for HDFS web UI logins -->
<property>
<name>hadoop.http.staticuser.user</name>
<value>lzl</value>
</property>
</configuration>
- hdfs-site.xml
<configuration>
<!-- Replication factor; note that pseudo-distributed mode can only use 1. -->
<property>
<name>dfs.replication</name>
<value>1</value>
</property>
</configuration>
- mapred-site.xml
<configuration>
<!-- Run MapReduce jobs on YARN -->
<property>
<name>mapreduce.framework.name</name>
<value>yarn</value>
</property>
</configuration>
- yarn-site.xml
<configuration>
<!-- Site specific YARN configuration properties -->
<!-- Route MR shuffle through the mapreduce_shuffle aux service -->
<property>
<name>yarn.nodemanager.aux-services</name>
<value>mapreduce_shuffle</value>
</property>
<!-- ResourceManager address -->
<property>
<name>yarn.resourcemanager.hostname</name>
<value>0.0.0.0</value>
</property>
<!-- Environment variable inheritance -->
<property>
<name>yarn.nodemanager.env-whitelist</name>
<value>JAVA_HOME,HADOOP_COMMON_HOME,HADOOP_HDFS_HOME,HADOOP_CONF_DIR,CLASSPATH_PREPEND_DISTCACHE,HADOOP_YARN_HOME,HADOOP_MAPRED_HOME</value>
</property>
</configuration>
3. Format the NameNode
hdfs namenode -format
4. Start HDFS
# To make these permanent, add them to a global profile
export HDFS_NAMENODE_USER='work'
export HDFS_DATANODE_USER='work'
export HDFS_SECONDARYNAMENODE_USER='work'
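start-all.sh also brings up YARN; when it is run as root (as in the jps output below), the YARN daemons need their own user variables as well or they refuse to start. Assuming the same work account:
export YARN_RESOURCEMANAGER_USER='work'
export YARN_NODEMANAGER_USER='work'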
# Start
start-all.sh
# Check processes with jps
root@office-ops-lzl-01:/usr/local/hadoop# jps
19732 SecondaryNameNode
21092 Jps
19531 DataNode
20671 NameNode
29321 ResourceManager
5. Web UI on port 9870
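A quick smoke test before moving on to Hive (standard HDFS commands; the UI address assumes the fs.defaultFS host configured above):
hdfs dfsadmin -report | head -n 20
hadoop fs -mkdir -p /tmp
hadoop fs -ls /
## NameNode web UI should return HTTP 200
curl -s -o /dev/null -w "%{http_code}\n" http://10.20.12.41:9870/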
III. Install Hive
## Installation
cd /data/software
wget https://downloads.apache.org/hive/hive-3.1.3/apache-hive-3.1.3-bin.tar.gz ## This apache-hive-3.1.3-bin.tar.gz has been rebuilt against the target Spark version; the build process is documented separately at: https://confluence.hkbge-inc.com/pages/resumedraft.action?draftId=110367513&draftShareId=b194bf7c-61a5-40dc-a4f0-38a2bdd1187d&
tar -zxvf apache-hive-3.1.3-bin.tar.gz
mv apache-hive-3.1.3-bin /usr/local/hive
# Adjust the global environment variables as follows:
vim /etc/profile
#hadoop environment
export HADOOP_HOME=/usr/local/hadoop
export HIVE_HOME=/usr/local/hive
export PATH=$HADOOP_HOME/bin:$HADOOP_HOME/sbin:$HIVE_HOME/bin:$PATH
source /etc/profile
## Configuration
cd /usr/local/hive/conf
cp hive-default.xml.template hive-site.xml
vim hive-site.xml
<?xml version="1.0" encoding="UTF-8" standalone="no"?>
<?xml-stylesheet type="text/xsl" href="configuration.xsl"?><!--
Licensed to the Apache Software Foundation (ASF) under one or more
contributor license agreements. See the NOTICE file distributed with
this work for additional information regarding copyright ownership.
The ASF licenses this file to You under the Apache License, Version 2.0
(the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
--><configuration>
<!-- JDBC connection URL -->
<property>
<name>javax.jdo.option.ConnectionURL</name>
<value>jdbc:mysql://10.20.12.41:3306/hive?useSSL=false</value>
</property>
<!-- JDBC driver -->
<property>
<name>javax.jdo.option.ConnectionDriverName</name>
<value>com.mysql.cj.jdbc.Driver</value>
</property>
<!-- JDBC username -->
<property>
<name>javax.jdo.option.ConnectionUserName</name>
<value>root</value>
</property>
<!-- JDBC password -->
<property>
<name>javax.jdo.option.ConnectionPassword</name>
<value>qhxc@021</value>
</property>
<!-- Hive metastore schema version verification -->
<property>
<name>hive.metastore.schema.verification</name>
<value>false</value>
</property>
<!-- Metastore authorization -->
<property>
<name>hive.metastore.event.db.notification.api.auth</name>
<value>false</value>
</property>
<!-- Hive's default working directory on HDFS -->
<property>
<name>hive.metastore.warehouse.dir</name>
<value>/user/hive/warehouse</value>
</property>
<!-- Metastore service address -->
<property>
<name>hive.metastore.uris</name>
<value>thrift://10.20.12.41:9083</value>
</property>
<!-- Host for HiveServer2 connections -->
<property>
<name>hive.server2.thrift.bind.host</name>
<value>0.0.0.0</value>
</property>
<!-- Port for HiveServer2 connections -->
<property>
<name>hive.server2.thrift.port</name>
<value>10000</value>
</property>
<property>
<name>hive.server2.enable.doAs</name>
<value>false</value>
<description>
Setting this property to true will have HiveServer2 execute
Hive operations as the user making the calls to it.
</description>
</property>
</configuration>
## Download the MySQL JDBC driver
## Official MySQL archive: https://downloads.mysql.com/archives/c-j/
mysql-connector-j-8.0.33.jar (place it in /data/software)
cp mysql-connector-j-8.0.33.jar $HIVE_HOME/lib/
## Initialize the metastore database (note: the Hadoop cluster must be running)
schematool -dbType mysql -initSchema -verbose
## Start Hive and test it
su - work
work@office-ops-lzl-01:~$ hive
SLF4J: Class path contains multiple SLF4J bindings.
SLF4J: Found binding in [jar:file:/usr/local/hive/lib/log4j-slf4j-impl-2.17.1.jar!/org/slf4j/impl/StaticLoggerBinder.class]
SLF4J: Found binding in [jar:file:/usr/local/hadoop/share/hadoop/common/lib/slf4j-reload4j-1.7.36.jar!/org/slf4j/impl/StaticLoggerBinder.class]
SLF4J: See http://www.slf4j.org/codes.html#multiple_bindings for an explanation.
SLF4J: Actual binding is of type [org.apache.logging.slf4j.Log4jLoggerFactory]
Hive Session ID = d9044cb9-f55e-4798-ab17-351cfb574f2b
Logging initialized using configuration in jar:file:/usr/local/hive/lib/hive-common-3.1.3.jar!/hive-log4j2.properties Async: true
Hive-on-MR is deprecated in Hive 2 and may not be available in the future versions. Consider using a different execution engine (i.e. spark, tez) or using Hive 1.X releases.
Hive Session ID = 5cf8e845-9245-46b0-8018-a281ebdac64d
hive> show databases;
OK
default
Time taken: 0.992 seconds, Fetched: 1 row(s)
hive> show tables;
OK
Time taken: 0.047 seconds
## Start the HiveServer2 service
## 1. Foreground
hiveserver2    # or: hive --service hiveserver2
## 2. Background
#### Write logs to files
mkdir -p /data/logs/hive
chown -R work.work /data/logs
su - work
nohup hive --service metastore 1>/data/logs/hive/hivemetastore.log 2>/data/logs/hive/hivemetastore.err &
nohup hiveserver2 1>/data/logs/hive/hiveserver.log 2>/data/logs/hive/hiveserver.err &
work@office-ops-lzl-01:~$ netstat -nlpt | grep 9083
(Not all processes could be identified, non-owned process info
will not be shown, you would have to be root to see it all.)
tcp6 0 0 :::9083 :::* LISTEN 48097/java
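With port 10000 listening as well, a beeline connection is a simple end-to-end check (connecting as the work user, which works because hive.server2.enable.doAs is set to false above):
beeline -u jdbc:hive2://10.20.12.41:10000 -n work -e "show databases;"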
IV. Spark installation and integration
## Install Spark
cd /data/software
wget https://archive.apache.org/dist/spark/spark-3.5.1/spark-3.5.1-bin-hadoop3.tgz ## spark-3.5.1-bin-hadoop3.tgz
tar -zxvf spark-3.5.1-bin-hadoop3.tgz
mv spark-3.5.1-bin-hadoop3 /usr/local/spark
chown -R work.work /usr/local/spark
## Add environment variables
vim /etc/profile
export SPARK_HOME=/usr/local/spark
export PATH=$HADOOP_HOME/bin:$HADOOP_HOME/sbin:$HIVE_HOME/bin:$SPARK_HOME/bin:$PATH
source /etc/profile
## Edit the configuration files
cd /usr/local/spark/
cd conf/
cp spark-env.sh.template spark-env.sh
vim spark-env.sh
HADOOP_CONF_DIR=/usr/local/hadoop/etc/hadoop
YARN_CONF_DIR=/usr/local/hadoop/etc/hadoop
export SPARK_DIST_CLASSPATH=$(hadoop classpath) ## This is important! Without it, Hive SQL jobs keep failing
## Copy the jar packages
cd /usr/local/spark/jars
### Create the HDFS path for Spark history logs
hadoop fs -mkdir /spark-history
### Create the HDFS path for the Spark jars
hadoop fs -mkdir /spark-jars
### Upload the stock Spark jars to HDFS
hadoop fs -put /usr/local/spark/jars/* /spark-jars
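Before wiring Spark into Hive, it is worth confirming Spark can run on YARN by itself. A minimal sketch using the bundled SparkPi example (the examples jar name assumes the stock spark-3.5.1-bin-hadoop3 layout):
spark-submit \
  --class org.apache.spark.examples.SparkPi \
  --master yarn \
  --deploy-mode client \
  $SPARK_HOME/examples/jars/spark-examples_2.12-3.5.1.jar 10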
## Configure the Hive integration
su - work
vim /usr/local/hive/conf/hive-site.xml
<!-- ################## Spark integration ###################### -->
<property>
<name>spark.yarn.jars</name>
<value>hdfs://bj-qhxc-office-ops-lzl-01:8020/spark-jars/*</value>
</property>
<property>
<name>hive.execution.engine</name>
<value>spark</value>
</property>
<!-- Hive/Spark client connection timeout -->
<property>
<name>hive.spark.client.connect.timeout</name>
<value>60000ms</value>
</property>
cd /usr/local/hive/conf
vim spark-defaults.conf
spark.master yarn
spark.eventLog.enabled true
spark.eventLog.dir hdfs://0.0.0.0:8020/spark-history
spark.executor.memory 1g
spark.driver.memory
## Restart Hive
ps -ef | grep hive
kill -9 xxpidxx
nohup hive --service metastore 1>/data/logs/hive/hivemetastore.log 2>/data/logs/hive/hivemetastore.err &
nohup hiveserver2 1>/data/logs/hive/hiveserver.log 2>/data/logs/hive/hiveserver.err &
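A small end-to-end test that actually forces a Spark job through Hive (hypothetical table name; the INSERT is what triggers a YARN/Spark job):
hive -e "create table if not exists spark_smoke_test(id int); insert into spark_smoke_test values (1); select count(*) from spark_smoke_test;"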
V. Flink deployment
cd /data/software
wget https://dlcdn.apache.org/flink/flink-1.18.1/flink-1.18.1-bin-scala_2.12.tgz
## To run Flink, all you need is Java 11 installed in advance. You can check whether Java is installed correctly with the following command.
java -version
## Download release 1.18.1 and extract it.
tar -xzf flink-1.18.1-bin-scala_2.12.tgz
cd flink-1.18.1
## Start the cluster
cd flink-1.18.1
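The start command itself is missing here; per the standard Flink quickstart this section follows, it is:
./bin/start-cluster.sh
## The JobManager and TaskManager processes should then appear in jps; the Flink web UI listens on port 8081 by default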
## Submit a job
Flink releases ship with many example jobs. Pick any one of them and deploy it to the running cluster.
$ ./bin/flink run examples/streaming/WordCount.jar
$ tail log/flink-*-taskexecutor-*.out
(nymph,1)
(in,3)
(thy,1)
(orisons,1)
(be,4)
(all,2)
(my,1)
(sins,1)
(remember,1)
(d,4)
## Stop the cluster
./bin/stop-cluster.sh
VI. Hue
wget https://github.com/cloudera/hue/archive/refs/tags/release-4.11.0.tar.gz
tar -zxvf release-4.11.0.tar.gz
mv hue-release-4.11.0 /usr/local/hue
chown -R work.work /usr/local/hue/
apt install python2.7-dev python-pip libxml2-dev libxslt1-dev libsqlite3-dev libldap2-dev libsasl2-dev npm
apt-get install libmysqlclient-dev
cp /usr/local/mysql8/include/my_config.h /usr/include/mysql/
ln -s /usr/bin/pip2 /usr/bin/pip
cd /usr/local/hue
pip install setuptools-scm
pip install future
pip install python-daemon
vim desktop/core/src/desktop/supervisor.py
change `from daemon.pidlockfile` to `from daemon.pidfile`
pip install cryptography
## Create a user; here I use username/password: admin / admin
## Create the hue database in MySQL
create database hue;
create user 'hue'@'%' identified By 'test@012';
grant all on hue.* TO 'hue'@'%';
flush privileges;
##### ## Database migration
##### ./build/env/bin/hue migrate
make apps
groupadd hue
useradd -g hue hue
## Start Hue
./build/env/bin/supervisor
/usr/local/hue/build/env/local/lib/python2.7/site-packages/requests_kerberos-0.12.0-py2.7.egg/requests_kerberos/kerberos_.py:11: CryptographyDeprecationWarning: Python 2 is no longer supported by the Python core team. Support for it is now deprecated in cryptography, and will be removed in the next release.
from cryptography import x509
/usr/local/hue/build/env/local/lib/python2.7/site-packages/requests_kerberos-0.12.0-py2.7.egg/requests_kerberos/kerberos_.py:11: CryptographyDeprecationWarning: Python 2 is no longer supported by the Python core team. Support for it is now deprecated in cryptography, and will be removed in the next release.
from cryptography import x509
[11/Sep/2024 19:31:23 +0000] settings DEBUG DESKTOP_DB_TEST_NAME SET: /usr/local/hue/desktop/desktop-test.db
[11/Sep/2024 19:31:23 +0000] settings DEBUG DESKTOP_DB_TEST_USER SET: hue_test
[11/Sep/2024 19:31:23 +0000] settings DEBUG DESKTOP_DB_TEST_NAME SET: /usr/local/hue/desktop/desktop-test.db
[11/Sep/2024 19:31:23 +0000] settings DEBUG DESKTOP_DB_TEST_USER SET: hue_test
[11/Sep/2024 04:31:23 +0000] sslcompat DEBUG ipaddress module is available
[11/Sep/2024 04:31:23 +0000] sslcompat WARNING backports.ssl_match_hostname is unavailable
[11/Sep/2024 04:31:23 +0000] sslcompat DEBUG ssl.match_hostname is available
[11/Sep/2024 04:31:23 +0000] sslcompat DEBUG ipaddress module is available
[11/Sep/2024 04:31:23 +0000] sslcompat WARNING backports.ssl_match_hostname is unavailable
[11/Sep/2024 04:31:23 +0000] sslcompat DEBUG ssl.match_hostname is available
[11/Sep/2024 04:31:24 +0000] decorators INFO AXES: BEGIN LOG
[11/Sep/2024 04:31:24 +0000] decorators INFO AXES: Using django-axes 4.5.4
[11/Sep/2024 04:31:24 +0000] decorators INFO AXES: blocking by IP only.
[11/Sep/2024 04:31:24 +0000] decorators INFO AXES: BEGIN LOG
[11/Sep/2024 04:31:24 +0000] decorators INFO AXES: Using django-axes 4.5.4
[11/Sep/2024 04:31:24 +0000] decorators INFO AXES: blocking by IP only.
[11/Sep/2024 04:31:24 +0000] __init__ INFO Couldn't import snappy. Support for snappy compression disabled.
[11/Sep/2024 04:31:24 +0000] urls WARNING djangosaml2 module not found
[11/Sep/2024 04:31:24 +0000] kt_renewer INFO Keytab renewer not starting, no keytab configured
[11/Sep/2024 04:31:24 +0000] __init__ INFO Couldn't import snappy. Support for snappy compression disabled.
[11/Sep/2024 04:31:25 +0000] urls WARNING djangosaml2 module not found
/usr/local/hue/build/env/local/lib/python2.7/site-packages/requests_kerberos-0.12.0-py2.7.egg/requests_kerberos/kerberos_.py:11: CryptographyDeprecationWarning: Python 2 is no longer supported by the Python core team. Support for it is now deprecated in cryptography, and will be removed in the next release.
from cryptography import x509
[11/Sep/2024 04:31:25 +0000] settings INFO Welcome to Hue 4.11.0
[11/Sep/2024 04:31:25 +0000] settings DEBUG Installed Django modules: DesktopModule(aws: aws),DesktopModule(azure: azure),DesktopModule(hadoop: hadoop),DesktopModule(libanalyze: libanalyze),DesktopModule(liboauth: liboauth),DesktopModule(liboozie: liboozie),DesktopModule(librdbms: librdbms),DesktopModule(libsaml: libsaml),DesktopModule(libsentry: libsentry),DesktopModule(libsolr: libsolr),DesktopModule(libzookeeper: libzookeeper),DesktopModule(Hue: desktop),DesktopModule(About: about),DesktopModule(Hive: beeswax),DesktopModule(File Browser: filebrowser),DesktopModule(HBase Browser: hbase),DesktopModule(Help: help),DesktopModule(hive: hive),DesktopModule(Impala: impala),DesktopModule(Job Browser: jobbrowser),DesktopModule(Job Designer: jobsub),DesktopModule(Table Browser: metastore),DesktopModule(Oozie Editor/Dashboard: oozie),DesktopModule(Pig Editor: pig),DesktopModule(Proxy: proxy),DesktopModule(RDBMS UI: rdbms),DesktopModule(Solr Search: search),DesktopModule(Hadoop Security: security),DesktopModule(Spark: spark),DesktopModule(Sqoop: sqoop),DesktopModule(User Admin: useradmin),DesktopModule(ZooKeeper Browser: zookeeper),DesktopModule(Data Importer: indexer),DesktopModule(Metadata: metadata),DesktopModule(Notebook: notebook),DesktopModule(Analytics Dashboards: dashboard),DesktopModule(Kafka: kafka)
[11/Sep/2024 04:31:25 +0000] settings DEBUG DESKTOP_DB_TEST_NAME SET: /usr/local/hue/desktop/desktop-test.db
[11/Sep/2024 04:31:25 +0000] settings DEBUG DESKTOP_DB_TEST_USER SET: hue_test
[11/Sep/2024 04:31:25 +0000] sslcompat DEBUG ipaddress module is available
[11/Sep/2024 04:31:25 +0000] sslcompat WARNING backports.ssl_match_hostname is unavailable
[11/Sep/2024 04:31:25 +0000] sslcompat DEBUG ssl.match_hostname is available
[11/Sep/2024 04:31:26 +0000] decorators INFO AXES: BEGIN LOG
[11/Sep/2024 04:31:26 +0000] decorators INFO AXES: Using django-axes 4.5.4
[11/Sep/2024 04:31:26 +0000] decorators INFO AXES: blocking by IP only.
[11/Sep/2024 04:31:26 +0000] __init__ INFO Couldn't import snappy. Support for snappy compression disabled.
[11/Sep/2024 04:31:27 +0000] urls WARNING djangosaml2 module not found
[11/Sep/2024 04:31:27 +0000] runcherrypyserver INFO Starting server with options:
{'daemonize': False,
'host': '0.0.0.0',
'pidfile': None,
'port': 8000,
'server_group': 'hue',
'server_name': 'localhost',
'server_user': 'hue',
'ssl_certificate': None,
'ssl_certificate_chain': None,
'ssl_cipher_list': 'ECDHE-RSA-AES128-GCM-SHA256:ECDHE-ECDSA-AES128-GCM-SHA256:ECDHE-RSA-AES256-GCM-SHA384:ECDHE-ECDSA-AES256-GCM-SHA384:DHE-RSA-AES128-GCM-SHA256:ECDHE-ECDSA-CHACHA20-POLY1305:ECDHE-RSA-CHACHA20-POLY1305:DHE-RSA-AES256-GCM-SHA384:!aNULL:!eNULL:!EXPORT:!DES:!RC4:!MD5:!PSK:!aECDH:!EDH-DSS-DES-CBC3-SHA:!EDH-RSA-DES-CBC3-SHA:!KRB5-DES-CBC3-SHA',
'ssl_no_renegotiation': False,
'ssl_private_key': None,
'threads': 50,
'workdir': None}
[11/Sep/2024 04:31:27 +0000] middleware INFO Unloading HueRemoteUserMiddleware
[11/Sep/2024 04:31:27 +0000] middleware INFO Unloading SpnegoMiddleware
[11/Sep/2024 04:31:27 +0000] middleware INFO Unloading ProxyMiddleware
## Edit the main configuration file
vim desktop/conf/pseudo-distributed.ini
[desktop]
http_host=0.0.0.0
http_port=8888
[notebook]
[[[mysql]]]
name = MySQL
interface=sqlalchemy
options='{"url": "mysql://hue:test@012@10.20.12.41:3306/hue"}'
[[[hive]]]
name=Hive
interface=hiveserver2
[dashboard]
[hadoop]
[beeswax]
hive_server_host=10.20.12.41
hive_server_port=10000
hive_metastore_host=office-ops-lzl-01
hive_metastore_port=9083
hive_conf_dir=/usr/local/hive/conf
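The hue database and user created earlier are not wired up in the snippet above. If Hue should keep its own metadata in MySQL rather than the default SQLite, a [[database]] block goes under the existing [desktop] section, roughly as follows (a sketch assuming the hue credentials created above; the commented-out ./build/env/bin/hue migrate step would then be required):
[[database]]
engine=mysql
host=10.20.12.41
port=3306
user=hue
password=test@012
name=hue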
## Restart the Hue service
ps -ef | grep hue
kill -9 xxx
./build/env/bin/supervisor
## Open http://ip:8888 in a browser ## the default port is 8000
VII. DataX
Reference: DataX/userGuid.md at master · alibaba/DataX · GitHub
cd /data/software
wget https://datax-opensource.oss-cn-hangzhou.aliyuncs.com/202309/datax.tar.gz
tar -zxvf datax.tar.gz
mv datax /usr/local/
chown -R work.work /usr/local/datax
## Self-check script
cd /usr/local/datax/bin/
python datax.py ../job/job.json
DataX (DATAX-OPENSOURCE-3.0), From Alibaba !
Copyright (C) 2010-2017, Alibaba Group. All Rights Reserved.
2024-09-11 16:47:41.082 [main] INFO MessageSource - JVM TimeZone: GMT+08:00, Locale: zh_CN
2024-09-11 16:47:41.084 [main] INFO MessageSource - use Locale: zh_CN timeZone: sun.util.calendar.ZoneInfo[id="GMT+08:00",offset=28800000,dstSavings=0,useDaylight=false,transitions=0,lastRule=null]
2024-09-11 16:47:41.183 [main] INFO VMInfo - VMInfo# operatingSystem class => sun.management.OperatingSystemImpl
2024-09-11 16:47:41.188 [main] INFO Engine - the machine info =>
osInfo: Linux amd64 5.15.0-119-generic
jvmInfo: Oracle Corporation 1.8 25.351-b10
cpu num: 4
totalPhysicalMemory: -0.00G
freePhysicalMemory: -0.00G
maxFileDescriptorCount: -1
currentOpenFileDescriptorCount: -1
GC Names [PS MarkSweep, PS Scavenge]
MEMORY_NAME | allocation_size | init_size
PS Eden Space | 256.00MB | 256.00MB
Code Cache | 240.00MB | 2.44MB
Compressed Class Space | 1,024.00MB | 0.00MB
PS Survivor Space | 42.50MB | 42.50MB
PS Old Gen | 683.00MB | 683.00MB
Metaspace | -0.00MB | 0.00MB
2024-09-11 16:47:41.198 [main] INFO Engine -
{
"setting":{
"speed":{
"channel":1
},
"errorLimit":{
"record":0,
"percentage":0.02
}
},
"content":[
{
"reader":{
"name":"streamreader",
"parameter":{
"column":[
{
"value":"DataX",
"type":"string"
},
{
"value":19890604,
"type":"long"
},
{
"value":"1989-06-04 00:00:00",
"type":"date"
},
{
"value":true,
"type":"bool"
},
{
"value":"test",
"type":"bytes"
}
],
"sliceRecordCount":100000
}
},
"writer":{
"name":"streamwriter",
"parameter":{
"print":false,
"encoding":"UTF-8"
}
}
}
]
}
2024-09-11 16:47:41.216 [main] INFO PerfTrace - PerfTrace traceId=job_-1, isEnable=false
2024-09-11 16:47:41.217 [main] INFO JobContainer - DataX jobContainer starts job.
2024-09-11 16:47:41.217 [main] INFO JobContainer - Set jobId = 0
2024-09-11 16:47:41.273 [job-0] INFO JobContainer - jobContainer starts to do prepare ...
2024-09-11 16:47:41.274 [job-0] INFO JobContainer - DataX Reader.Job [streamreader] do prepare work .
2024-09-11 16:47:41.274 [job-0] INFO JobContainer - DataX Writer.Job [streamwriter] do prepare work .
2024-09-11 16:47:41.274 [job-0] INFO JobContainer - jobContainer starts to do split ...
2024-09-11 16:47:41.274 [job-0] INFO JobContainer - Job set Channel-Number to 1 channels.
2024-09-11 16:47:41.275 [job-0] INFO JobContainer - DataX Reader.Job [streamreader] splits to [1] tasks.
2024-09-11 16:47:41.275 [job-0] INFO JobContainer - DataX Writer.Job [streamwriter] splits to [1] tasks.
2024-09-11 16:47:41.296 [job-0] INFO JobContainer - jobContainer starts to do schedule ...
2024-09-11 16:47:41.299 [job-0] INFO JobContainer - Scheduler starts [1] taskGroups.
2024-09-11 16:47:41.301 [job-0] INFO JobContainer - Running by standalone Mode.
2024-09-11 16:47:41.308 [taskGroup-0] INFO TaskGroupContainer - taskGroupId=[0] start [1] channels for [1] tasks.
2024-09-11 16:47:41.315 [taskGroup-0] INFO Channel - Channel set byte_speed_limit to -1, No bps activated.
2024-09-11 16:47:41.316 [taskGroup-0] INFO Channel - Channel set record_speed_limit to -1, No tps activated.
2024-09-11 16:47:41.325 [taskGroup-0] INFO TaskGroupContainer - taskGroup[0] taskId[0] attemptCount[1] is started
2024-09-11 16:47:41.627 [taskGroup-0] INFO TaskGroupContainer - taskGroup[0] taskId[0] is successed, used[302]ms
2024-09-11 16:47:41.628 [taskGroup-0] INFO TaskGroupContainer - taskGroup[0] completed it's tasks.
2024-09-11 16:47:51.317 [job-0] INFO StandAloneJobContainerCommunicator - Total 100000 records, 2600000 bytes | Speed 253.91KB/s, 10000 records/s | Error 0 records, 0 bytes | All Task WaitWriterTime 0.030s | All Task WaitReaderTime 0.038s | Percentage 100.00%
2024-09-11 16:47:51.318 [job-0] INFO AbstractScheduler - Scheduler accomplished all tasks.
2024-09-11 16:47:51.319 [job-0] INFO JobContainer - DataX Writer.Job [streamwriter] do post work.
2024-09-11 16:47:51.319 [job-0] INFO JobContainer - DataX Reader.Job [streamreader] do post work.
2024-09-11 16:47:51.319 [job-0] INFO JobContainer - DataX jobId [0] completed successfully.
2024-09-11 16:47:51.320 [job-0] INFO HookInvoker - No hook invoked, because base dir not exists or is a file: /usr/local/datax/hook
2024-09-11 16:47:51.321 [job-0] INFO JobContainer -
[total cpu info] =>
averageCpu | maxDeltaCpu | minDeltaCpu
-1.00% | -1.00% | -1.00%
[total gc info] =>
NAME | totalGCCount | maxDeltaGCCount | minDeltaGCCount | totalGCTime | maxDeltaGCTime | minDeltaGCTime
PS MarkSweep | 0 | 0 | 0 | 0.000s | 0.000s | 0.000s
PS Scavenge | 0 | 0 | 0 | 0.000s | 0.000s | 0.000s
2024-09-11 16:47:51.322 [job-0] INFO JobContainer - PerfTrace not enable!
2024-09-11 16:47:51.322 [job-0] INFO StandAloneJobContainerCommunicator - Total 100000 records, 2600000 bytes | Speed 253.91KB/s, 10000 records/s | Error 0 records, 0 bytes | All Task WaitWriterTime 0.030s | All Task WaitReaderTime 0.038s | Percentage 100.00%
2024-09-11 16:47:51.324 [job-0] INFO JobContainer -
Job start time            : 2024-09-11 16:47:41
Job end time              : 2024-09-11 16:47:51
Total elapsed time        : 10s
Average throughput        : 253.91KB/s
Record write speed        : 10000rec/s
Total records read        : 100000
Total read/write failures : 0
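The self-check only exercises the built-in stream plugins. For a real synchronization, a job JSON pairs a reader with a writer; a sketch of a mysqlreader job that prints rows to stdout (hypothetical database, table, and column names, following the DataX plugin parameter layout):
cat > /usr/local/datax/job/mysql2stream.json <<'EOF'
{
  "job": {
    "setting": { "speed": { "channel": 1 } },
    "content": [
      {
        "reader": {
          "name": "mysqlreader",
          "parameter": {
            "username": "root",
            "password": "test@012",
            "column": ["id", "name"],
            "connection": [
              { "jdbcUrl": ["jdbc:mysql://10.20.12.41:3306/testdb"], "table": ["t_demo"] }
            ]
          }
        },
        "writer": {
          "name": "streamwriter",
          "parameter": { "print": true }
        }
      }
    ]
  }
}
EOF
python /usr/local/datax/bin/datax.py /usr/local/datax/job/mysql2stream.json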
VIII. Azkaban
cd /data/software
wget https://github.com/shengminjie/Azkaban-complete/archive/refs/heads/master.zip
unzip master.zip
cd Azkaban-complete-master/3.90.0/
mkdir /usr/local/azkaban
unzip azkaban-db-0.1.0-SNAPSHOT.zip
unzip azkaban-exec-server-0.1.0-SNAPSHOT.zip
unzip azkaban-web-server-0.1.0-SNAPSHOT.zip
mv azkaban-db-0.1.0-SNAPSHOT /usr/local/azkaban/azkaban-db
mv azkaban-exec-server-0.1.0-SNAPSHOT /usr/local/azkaban/azkaban-exec-server
mv azkaban-web-server-0.1.0-SNAPSHOT /usr/local/azkaban/azkaban-web-server
### Database setup
mysql -uroot -p
> create database azkaban default character set latin1;
# Note: latin1 is recommended when creating the azkaban database; some of its indexes are too long for utf8, which only supports key lengths up to 1000
use azkaban;
source /data/software/Azkaban-complete-master/3.90.0/create-all-sql-0.1.0-SNAPSHOT.sql;
## MySQL 5
##grant all on azkaban.* TO 'azkaban'@'%' identified By 'qhxc@021';
## MySQL 8
create user 'azkaban'@'%' identified By 'test@012';
grant all on azkaban.* TO 'azkaban'@'%';
flush privileges;
### Edit the configuration files
## Azkaban web server configuration
cd /usr/local/azkaban/azkaban-web-server/conf/
vim azkaban.properties ## edit the database-related settings
# Azkaban Personalization Settings
azkaban.name=Test
azkaban.label=My Local Azkaban
azkaban.color=#FF3601
azkaban.default.servlet.path=/index
web.resource.dir=web/
default.timezone.id=America/Los_Angeles
# Azkaban UserManager class
user.manager.class=azkaban.user.XmlUserManager
user.manager.xml.file=conf/azkaban-users.xml
# Loader for projects
executor.global.properties=conf/global.properties
azkaban.project.dir=projects
# Velocity dev mode
velocity.dev.mode=false
# Azkaban Jetty server properties.
jetty.use.ssl=false
jetty.maxThreads=25
jetty.port=8081
# Azkaban Executor settings
# mail settings
mail.sender=
mail.host=
# User facing web server configurations used to construct the user facing server URLs. They are useful when there is a reverse proxy between Azkaban web servers and users.
# enduser -> myazkabanhost:443 -> proxy -> localhost:8081
# when this parameters set then these parameters are used to generate email links.
# if these parameters are not set then jetty.hostname, and jetty.port(if ssl configured jetty.ssl.port) are used.
# azkaban.webserver.external_hostname=myazkabanhost.com
# azkaban.webserver.external_ssl_port=443
# azkaban.webserver.external_port=8081
job.failure.email=
job.success.email=
lockdown.create.projects=false
cache.directory=cache
# JMX stats
jetty.connector.stats=true
executor.connector.stats=true
# Azkaban mysql settings by default. Users should configure their own username and password.
database.type=mysql
mysql.port=3306
mysql.host=10.20.12.41
mysql.database=azkaban
mysql.user=azkaban
mysql.password=test@012
mysql.numconnections=100
#Multiple Executor
azkaban.use.multiple.executors=true
azkaban.executorselector.filters=StaticRemainingFlowSize,MinimumFreeMemory,CpuStatus
azkaban.executorselector.comparator.NumberOfAssignedFlowComparator=1
azkaban.executorselector.comparator.Memory=1
azkaban.executorselector.comparator.LastDispatched=1
azkaban.executorselector.comparator.CpuUsage=1
## Azkaban executor server configuration
cd ../../azkaban-exec-server/conf/
vim azkaban.properties ## edit the database-related settings and add the last two lines
# Azkaban Personalization Settings
azkaban.name=Test
azkaban.label=My Local Azkaban
azkaban.color=#FF3601
azkaban.default.servlet.path=/index
web.resource.dir=web/
default.timezone.id=America/Los_Angeles
# Azkaban UserManager class
user.manager.class=azkaban.user.XmlUserManager
user.manager.xml.file=conf/azkaban-users.xml
# Loader for projects
executor.global.properties=conf/global.properties
azkaban.project.dir=projects
# Velocity dev mode
velocity.dev.mode=false
# Azkaban Jetty server properties.
jetty.use.ssl=false
jetty.maxThreads=25
jetty.port=8081
# Where the Azkaban web server is located
azkaban.webserver.url=http://localhost:8081
# mail settings
mail.sender=
mail.host=
# User facing web server configurations used to construct the user facing server URLs. They are useful when there is a reverse proxy between Azkaban web servers and users.
# enduser -> myazkabanhost:443 -> proxy -> localhost:8081
# when this parameters set then these parameters are used to generate email links.
# if these parameters are not set then jetty.hostname, and jetty.port(if ssl configured jetty.ssl.port) are used.
# azkaban.webserver.external_hostname=myazkabanhost.com
# azkaban.webserver.external_ssl_port=443
# azkaban.webserver.external_port=8081
job.failure.email=
job.success.email=
lockdown.create.projects=false
cache.directory=cache
# JMX stats
jetty.connector.stats=true
executor.connector.stats=true
# Azkaban plugin settings
azkaban.jobtype.plugin.dir=plugins/jobtypes
# Azkaban mysql settings by default. Users should configure their own username and password.
database.type=mysql
mysql.port=3306
mysql.host=10.20.12.41
mysql.database=azkaban
mysql.user=azkaban
mysql.password=test@012
mysql.numconnections=100
# Azkaban Executor settings
executor.maxThreads=50
executor.flow.threads=30
# If no port is specified here, the executor comes up on a random port, which then has to be found for the activation step later
executor.port=12321
flow.num.job.threads=11
cp /data/software/mysql-connector-j-8.0.33.jar /usr/local/azkaban/azkaban-exec-server/lib/
cp /data/software/mysql-connector-j-8.0.33.jar /usr/local/azkaban/azkaban-web-server/lib/
### Start the services
## 1. Start AzkabanExecutorServer
cd /usr/local/azkaban/azkaban-exec-server/bin/
./start-exec.sh
# jps showing an AzkabanExecutorServer process means startup succeeded
root@office-ops-lzl-01:/usr/local/azkaban/azkaban-exec-server/bin# jps
494149 AzkabanExecutorServer
494197 Jps
409893 SecondaryNameNode
413203 RunJar
413105 RunJar
410238 NodeManager
409500 NameNode
410093 ResourceManager
409642 DataNode
486889 GradleDaemon
## 2. Activate AzkabanExecutorServer
## curl http://localhost:12321/executor?action=activate
root@office-ops-lzl-01:/data/software/Azkaban-complete-master/3.90.0# curl http://localhost:12321/executor?action=activate
{"status":"success"}root@office-ops-lzl-01:/data/software/Azkaban-complete-master/3.90.0#
## 3. Start AzkabanWebServer
cd /usr/local/azkaban/azkaban-web-server/
## This must be run from this directory; running it from any other directory fails
./bin/start-web.sh
###### Azkaban deployment is now complete ###############
Open http://ip:8081
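A minimal flow to confirm the setup, using Azkaban's classic .job format (hypothetical file and project names; the zip is uploaded through the web UI at the address above):
mkdir -p /tmp/azkaban-demo && cd /tmp/azkaban-demo
cat > hello.job <<'EOF'
type=command
command=echo "hello from azkaban"
EOF
zip hello-flow.zip hello.job
## In the web UI: Create Project -> Upload hello-flow.zip -> Execute Flow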
########### Stop the services ###########
cd /usr/local/azkaban/azkaban-web-server/
./bin/shutdown-web.sh
cd ../azkaban-exec-server/bin/
./shutdown-exec.sh
##############################
########### Start the services ###########
cd /usr/local/azkaban/azkaban-exec-server/bin/
./start-exec.sh
sleep 3
curl http://localhost:12321/executor?action=activate
cd /usr/local/azkaban/azkaban-web-server/
./bin/start-web.sh
##############################