Hadoop+Hive+Hue+Spark

Benchmarked against EMR 7.2.0

HDFS: Hadoop 3.3.6
Hive: Hive 3.1.3
Hue: Hue 4.11.0
Spark: Spark 3.5.1
Flink: 1.18.1

I. Prepare the dependency environment

1. JDK 1.8.0_351

mkdir -p /data/software
cd /data/software
wget https://tools.qihangxingchen.com/download/jdk-8u351-linux-x64.tar.gz
tar -zxf jdk-8u351-linux-x64.tar.gz -C /usr/local/
cd /usr/local/jdk1.8.0_351
cat >>  ~/.bashrc <<'EOF'
export JAVA_HOME=/usr/local/jdk1.8.0_351
export JRE_HOME=${JAVA_HOME}/jre
export CLASSPATH=.:${JAVA_HOME}/lib:${JRE_HOME}/lib
export PATH=${JAVA_HOME}/bin:$PATH
EOF
source  ~/.bashrc
cat >>  /etc/profile <<'EOF'
export JAVA_HOME=/usr/local/jdk1.8.0_351
export JRE_HOME=${JAVA_HOME}/jre
export CLASSPATH=.:${JAVA_HOME}/lib:${JRE_HOME}/lib
export PATH=${JAVA_HOME}/bin:$PATH
EOF
source /etc/profile
ln -s /usr/local/jdk1.8.0_351/bin/java /bin/java
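A quick sanity check that the JDK is on the PATH:
java -version   ## expected: java version "1.8.0_351"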

2. MySQL 8.0.33

cd /data/software
wget https://tools.qihangxingchen.com/download/mysql-8.0.33-el7-x86_64.tar.gz
tar -zxvf mysql-8.0.33-el7-x86_64.tar.gz
mv mysql-8.0.33-el7-x86_64 /usr/local/mysql8
ln -s /usr/local/mysql8/bin/mysql /usr/bin/mysql
ln -s /usr/local/mysql8/bin/mysqldump /usr/bin/mysqldump
ln -s /usr/local/mysql8/bin/mysqld /usr/bin/mysqld
  
## Create the mysql user/group (if not already present) and the data/log directories
groupadd mysql
useradd -r -g mysql -s /bin/false mysql
mkdir -p /data/storage/mysql8/data
mkdir -p /data/storage/mysql8/logs
chown -R mysql:mysql /data/storage/mysql8/
 
## Configure my.cnf
cat  >/etc/my.cnf  <<EOF
[mysqld]
basedir=/usr/local/mysql8
datadir=/data/storage/mysql8/data
port = 3306
socket=/tmp/mysql8.sock
mysqlx_socket=/tmp/mysqlx8.sock
default_authentication_plugin=mysql_native_password
  
log-error=/data/storage/mysql8/logs/mysqld.log
pid-file=/tmp/mysqld8.pid
default-storage-engine=INNODB
log-bin=mysql-bin
binlog-format=ROW
  
relay_log_recovery = 1
master_info_repository  =table
relay_log_info_repository =table
gtid_mode = on
enforce_gtid_consistency = on
  
binlog_cache_size=65536
server_id=1
max_connections=2000
max_connect_errors = 800
max_user_connections = 4000
innodb_flush_log_at_trx_commit = 0
innodb_buffer_pool_size = 2048M
back_log        = 2048
lock_wait_timeout =50
skip_name_resolve =1
slow_query_log =1
long_query_time=1
innodb_file_per_table=1
sql_mode        = STRICT_TRANS_TABLES,NO_ZERO_IN_DATE,NO_ZERO_DATE,ERROR_FOR_DIVISION_BY_ZERO,NO_ENGINE_SUBSTITUTION
sync_binlog = 0
character-set-server=utf8mb4
log_timestamps=SYSTEM
explicit_defaults_for_timestamp=0
skip_ssl
 
[client]
socket=/tmp/mysql8.sock
EOF
 
## Initialize and start the MySQL service
mysqld --defaults-file=/etc/my.cnf --datadir=/data/storage/mysql8/data --initialize-insecure --user=mysql
cp -a /usr/local/mysql8/support-files/mysql.server /etc/init.d/mysql
update-rc.d mysql defaults
systemctl enable mysql
systemctl restart mysql
 
## Create a remote root user and grant privileges
mysql -uroot -e "create user root@'%' identified by 'test@012';grant all on *.* to root@'%';  flush privileges;"
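A quick sanity check once the service is up (the local socket login works without a password after --initialize-insecure):
mysql -uroot -e "select version(); select user,host from mysql.user;"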

3. /etc/hosts

10.20.12.41 office-ops-lzl-01
10.20.12.42 office-ops-lzl-02
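
These entries must be present on both nodes; a minimal sketch of appending them (skip any lines that already exist):

cat >> /etc/hosts <<'EOF'
10.20.12.41 office-ops-lzl-01
10.20.12.42 office-ops-lzl-02
EOF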

4. Set up passwordless SSH login
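
No commands were recorded for this step; a minimal sketch, assuming the work user runs the Hadoop services and the two hosts from /etc/hosts above:

su - work
ssh-keygen -t rsa -N '' -f ~/.ssh/id_rsa
ssh-copy-id work@office-ops-lzl-01
ssh-copy-id work@office-ops-lzl-02
ssh office-ops-lzl-02 hostname   ## should return without a password prompt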

5. Install Maven

wget https://dlcdn.apache.org/maven/maven-3/3.9.9/binaries/apache-maven-3.9.9-bin.tar.gz
tar -zxvf apache-maven-3.9.9-bin.tar.gz -C /usr/local/
vim /usr/local/apache-maven-3.9.9/conf/settings.xml
## Add the following mirrors
    <mirror>
      <id>nexus-aliyun</id>
      <mirrorOf>central</mirrorOf>
      <name>Nexus aliyun</name>
      <url>http://maven.aliyun.com/nexus/content/groups/public</url>
    </mirror>
    <mirror>
      <id>aliyunmaven</id>
      <mirrorOf>*</mirrorOf>
      <name>spring-plugin</name>
      <url>http://maven.aliyun.com/repository/spring-plugin</url>
    </mirror>
    <mirror>
      <id>repo2</id>
      <mirrorOf>central</mirrorOf>
      <name>Mirror from Maven Repo2</name>
      <url>http://repo.spring.io/plugins-release/</url>
    </mirror>
vim /etc/profile
export MAVEN_HOME=/usr/local/apache-maven-3.9.9
export PATH=$HADOOP_HOME/bin:$HADOOP_HOME/sbin:$HIVE_HOME/bin:$SPARK_HOME/bin:$MAVEN_HOME/bin:$PATH
 
source /etc/profile

6. Install Scala

cd /data/software
wget https://downloads.lightbend.com/scala/2.13.14/scala-2.13.14.tgz
tar -zxvf scala-2.13.14.tgz
mv scala-2.13.14 /usr/local/scala
vim /etc/profile
export SCALA_HOME=/usr/local/scala
export PATH=$HADOOP_HOME/bin:$HADOOP_HOME/sbin:$HIVE_HOME/bin:$SPARK_HOME/bin:$MAVEN_HOME/bin:$SCALA_HOME/bin:$PATH
 
source /etc/profile
chown -R work.work /usr/local/scala
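Verify the installation:
scala -version   ## expected: Scala code runner version 2.13.14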

II. Hadoop deployment

1. Single-node installation

cd /data/software
wget https://downloads.apache.org/hadoop/common/hadoop-3.3.6/hadoop-3.3.6.tar.gz
tar -zxvf hadoop-3.3.6.tar.gz
mv hadoop-3.3.6 /usr/local/hadoop
## Configure environment variables
vim /etc/profile
#hadoop environment
export HADOOP_HOME=/usr/local/hadoop
export PATH=$HADOOP_HOME/bin:$HADOOP_HOME/sbin:$PATH
## Apply to the current shell
source /etc/profile
## Verify the Hadoop installation
root@office-ops-lzl-01:/data/software# hadoop version
Hadoop 3.3.6
Source code repository https://github.com/apache/hadoop.git -r 1be78238728da9266a4f88195058f08fd012bf9c
Compiled by ubuntu on 2023-06-18T08:22Z
Compiled on platform linux-x86_64
Compiled with protoc 3.7.1
From source with checksum 5652179ad55f76cb287d9c633bb53bbd
This command was run using /usr/local/hadoop/share/hadoop/common/hadoop-common-3.3.6.jar
## Configure hadoop-env.sh
vim $HADOOP_HOME/etc/hadoop/hadoop-env.sh
# Add the following:
export JAVA_HOME=/usr/local/jdk1.8.0_351
export HADOOP_ROOT_LOGGER=WARN,console
 
## Create the directories HDFS needs
mkdir -p /usr/local/hadoop/data/dfs/name
chown -R work.work /usr/local/hadoop/data

2. Configuration files

  • core-site.xml configuration
cd $HADOOP_HOME/etc/hadoop
vim core-site.xml
<configuration>
<!-- Scheme, IP and port of the distributed filesystem; the default port is 8020 -->
        <property>
                <name>fs.defaultFS</name>
                <value>hdfs://10.20.12.41:8020/</value>
        </property>
        <!-- Hadoop data storage directory -->
        <property>
                <name>hadoop.tmp.dir</name>
                <value>/usr/local/hadoop/data</value>
        </property>
        <!-- Static user for the HDFS web UI -->
        <property>
                <name>hadoop.http.staticuser.user</name>
                <value>lzl</value>
        </property>
</configuration>
  • hdfs-site.xml configuration
<configuration>
        <!-- Replication factor; must be 1 in pseudo-distributed mode -->
        <property>
                <name>dfs.replication</name>
                <value>1</value>
        </property>
</configuration>
  • mapred-site.xml configuration
<configuration>
        <!-- Run MapReduce jobs on YARN -->
        <property>
                <name>mapreduce.framework.name</name>
                <value>yarn</value>
        </property>
</configuration>
  • yarn-site.xml configuration
<configuration>
 
<!-- Site specific YARN configuration properties -->
        <!-- Use the MapReduce shuffle service -->
        <property>
                <name>yarn.nodemanager.aux-services</name>
                <value>mapreduce_shuffle</value>
        </property>
        <!-- ResourceManager address -->
        <property>
                <name>yarn.resourcemanager.hostname</name>
                <value>0.0.0.0</value>
        </property>
        <!-- Environment variable inheritance -->
        <property>
                <name>yarn.nodemanager.env-whitelist</name>
                <value>JAVA_HOME,HADOOP_COMMON_HOME,HADOOP_HDFS_HOME,HADOOP_CONF_DIR,CLASSPATH_PREPEND_DISTCACHE,HADOOP_YARN_HOME,HADOOP_MAPRED_HOME</value>
        </property>
 
</configuration>

3. Format the NameNode

hdfs namenode -format

4. Start HDFS


# For the settings to persist, add them to a global profile
export HDFS_NAMENODE_USER='work'
export HDFS_DATANODE_USER='work'
export HDFS_SECONDARYNAMENODE_USER='work'
export YARN_RESOURCEMANAGER_USER='work'
export YARN_NODEMANAGER_USER='work'
# Start HDFS and YARN
start-all.sh
# Check the processes with jps
root@office-ops-lzl-01:/usr/local/hadoop# jps
19732 SecondaryNameNode
21092 Jps
19531 DataNode
20671 NameNode
29321 ResourceManager

5. Web UI (port 9870)
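
Once HDFS and YARN are up, the NameNode web UI listens on port 9870 and the ResourceManager UI on port 8088 (Hadoop 3 defaults). A quick check from the shell:

curl -I http://10.20.12.41:9870
curl -I http://10.20.12.41:8088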

III. Install Hive

## Installation
cd /data/software
wget https://downloads.apache.org/hive/hive-3.1.3/apache-hive-3.1.3-bin.tar.gz  ## this apache-hive-3.1.3-bin.tar.gz has to be recompiled against the target Spark version; the build process is documented separately: https://confluence.hkbge-inc.com/pages/resumedraft.action?draftId=110367513&draftShareId=b194bf7c-61a5-40dc-a4f0-38a2bdd1187d&
tar -zxvf apache-hive-3.1.3-bin.tar.gz
mv apache-hive-3.1.3-bin /usr/local/hive
 
# Adjust the global environment variables as follows:
vim /etc/profile
#hadoop environment
export HADOOP_HOME=/usr/local/hadoop
export HIVE_HOME=/usr/local/hive
export PATH=$HADOOP_HOME/bin:$HADOOP_HOME/sbin:$HIVE_HOME/bin:$PATH
 
source /etc/profile
## Configuration
cd /usr/local/hive/conf
cp hive-default.xml.template hive-site.xml
vim hive-site.xml
<?xml version="1.0" encoding="UTF-8" standalone="no"?>
<?xml-stylesheet type="text/xsl" href="configuration.xsl"?><!--
   Licensed to the Apache Software Foundation (ASF) under one or more
   contributor license agreements.  See the NOTICE file distributed with
   this work for additional information regarding copyright ownership.
   The ASF licenses this file to You under the Apache License, Version 2.0
   (the "License"); you may not use this file except in compliance with
   the License.  You may obtain a copy of the License at
 
       http://www.apache.org/licenses/LICENSE-2.0
 
   Unless required by applicable law or agreed to in writing, software
   distributed under the License is distributed on an "AS IS" BASIS,
   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
   See the License for the specific language governing permissions and
   limitations under the License.
--><configuration>
        <!-- JDBC connection URL -->
        <property>
                <name>javax.jdo.option.ConnectionURL</name>
                <value>jdbc:mysql://10.20.12.41:3306/hive?useSSL=false</value>
        </property>
        <!-- JDBC connection driver -->
        <property>
                <name>javax.jdo.option.ConnectionDriverName</name>
                <value>com.mysql.cj.jdbc.Driver</value>
        </property>
        <!-- JDBC connection username -->
        <property>
                <name>javax.jdo.option.ConnectionUserName</name>
                <value>root</value>
        </property>
        <!-- JDBC connection password -->
        <property>
                <name>javax.jdo.option.ConnectionPassword</name>
                <value>qhxc@021</value>
        </property>
        <!-- Hive metastore schema version verification -->
        <property>
                <name>hive.metastore.schema.verification</name>
                <value>false</value>
        </property>
        <!-- Metastore event DB notification API authorization -->
        <property>
                <name>hive.metastore.event.db.notification.api.auth</name>
                <value>false</value>
        </property>
        <!-- Hive default working directory on HDFS -->
        <property>
                <name>hive.metastore.warehouse.dir</name>
                <value>/user/hive/warehouse</value>
        </property>
        <!-- Metastore service address -->
        <property>
                <name>hive.metastore.uris</name>
                <value>thrift://10.20.12.41:9083</value>
        </property>
        <!-- Host that hiveserver2 binds to -->
        <property>
                <name>hive.server2.thrift.bind.host</name>
                <value>0.0.0.0</value>
        </property>
        <!-- Port that hiveserver2 listens on -->
        <property>
                <name>hive.server2.thrift.port</name>
                <value>10000</value>
        </property>
        <property>
            <name>hive.server2.enable.doAs</name>
            <value>false</value>
            <description>
              Setting this property to true will have HiveServer2 execute
              Hive operations as the user making the calls to it.
            </description>
         </property>
</configuration>
## Download the MySQL JDBC driver
## Hosted by MySQL: https://downloads.mysql.com/archives/c-j/
 
mysql-connector-j-8.0.33.jar (place it in /data/software)
cp mysql-connector-j-8.0.33.jar $HIVE_HOME/lib/
 
## Initialize the metastore database (note: the Hadoop cluster must be running)
schematool -dbType mysql -initSchema -verbose
 
## Start Hive and test it
su - work
work@office-ops-lzl-01:~$ hive
SLF4J: Class path contains multiple SLF4J bindings.
SLF4J: Found binding in [jar:file:/usr/local/hive/lib/log4j-slf4j-impl-2.17.1.jar!/org/slf4j/impl/StaticLoggerBinder.class]
SLF4J: Found binding in [jar:file:/usr/local/hadoop/share/hadoop/common/lib/slf4j-reload4j-1.7.36.jar!/org/slf4j/impl/StaticLoggerBinder.class]
SLF4J: See http://www.slf4j.org/codes.html#multiple_bindings for an explanation.
SLF4J: Actual binding is of type [org.apache.logging.slf4j.Log4jLoggerFactory]
Hive Session ID = d9044cb9-f55e-4798-ab17-351cfb574f2b
 
Logging initialized using configuration in jar:file:/usr/local/hive/lib/hive-common-3.1.3.jar!/hive-log4j2.properties Async: true
Hive-on-MR is deprecated in Hive 2 and may not be available in the future versions. Consider using a different execution engine (i.e. spark, tez) or using Hive 1.X releases.
Hive Session ID = 5cf8e845-9245-46b0-8018-a281ebdac64d
hive> show databases;
OK
default
Time taken: 0.992 seconds, Fetched: 1 row(s)
hive> show tables;
OK
Time taken: 0.047 seconds
 
## Start the hiveserver2 service
## 1. Foreground start
hiveserver2   ## or: hive --service hiveserver2
 
## 2. Background start
#### Log to files
mkdir -p /data/logs/hive
chown -R work.work /data/logs
su - work
nohup hive --service metastore 1>/data/logs/hive/hivemetastore.log 2>/data/logs/hive/hivemetastore.err &
nohup hiveserver2 1>/data/logs/hive/hiveserver.log 2>/data/logs/hive/hiveserver.err &
 
work@office-ops-lzl-01:~$ netstat -nlpt | grep 9083
(Not all processes could be identified, non-owned process info
 will not be shown, you would have to be root to see it all.)
tcp6       0      0 :::9083                 :::*                    LISTEN      48097/java
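
A quick connectivity check against hiveserver2 once port 10000 is also listening, using beeline (ships with Hive; no authentication is configured here, so the user name is only informational):

beeline -u jdbc:hive2://10.20.12.41:10000 -n work -e "show databases;"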

IV. Install Spark and integrate with Hive

## Install Spark
cd /data/software
wget https://archive.apache.org/dist/spark/spark-3.5.1/spark-3.5.1-bin-hadoop3.tgz   ## spark-3.5.1-bin-hadoop3.tgz
tar -zxvf spark-3.5.1-bin-hadoop3.tgz
mv spark-3.5.1-bin-hadoop3 /usr/local/spark
chown -R work.work /usr/local/spark
## Add environment variables
vim /etc/profile
export SPARK_HOME=/usr/local/spark
export PATH=$HADOOP_HOME/bin:$HADOOP_HOME/sbin:$HIVE_HOME/bin:$SPARK_HOME/bin:$PATH
 
source /etc/profile
## Edit the configuration files
cd /usr/local/spark/
cd conf/
cp spark-env.sh.template spark-env.sh
vim spark-env.sh
HADOOP_CONF_DIR=/usr/local/hadoop/etc/hadoop
YARN_CONF_DIR=/usr/local/hadoop/etc/hadoop
export SPARK_DIST_CLASSPATH=$(hadoop classpath)  ## Important: without this, Hive SQL jobs on Spark keep failing
 
## Copy the jar files to HDFS
cd /usr/local/spark/jars
### Create the HDFS path for Spark history logs
hadoop fs -mkdir /spark-history
### Create the HDFS path for the Spark jars
hadoop fs -mkdir /spark-jars
### Upload the vanilla Spark jars to HDFS
hadoop fs -put /usr/local/spark/jars/* /spark-jars
 
## Configure the Hive-Spark integration
su - work
vim /usr/local/hive/conf/hive-site.xml
        <!-- ################## Spark integration ###################### -->
        <property>
                <name>spark.yarn.jars</name>
                <value>hdfs://bj-qhxc-office-ops-lzl-01:8020/spark-jars/*</value>
        </property>
        <property>
          <name>hive.execution.engine</name>
          <value>spark</value>
        </property>
        <!-- Hive / Spark client connect timeout -->
        <property>
            <name>hive.spark.client.connect.timeout</name>
            <value>60000ms</value>
        </property>
 
cd /usr/local/hive/conf
vim spark-defaults.conf
spark.master                               yarn
spark.eventLog.enabled                   true
spark.eventLog.dir                        hdfs://0.0.0.0:8020/spark-history
spark.executor.memory                    1g
spark.driver.memory
 
## Restart Hive
ps -ef | grep hive
kill -9 xxpidxx
nohup hive --service metastore 1>/data/logs/hive/hivemetastore.log 2>/data/logs/hive/hivemetastore.err &
nohup hiveserver2 1>/data/logs/hive/hiveserver.log 2>/data/logs/hive/hiveserver.err &
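
A small smoke test of the Hive-on-Spark path (the table name below is just a placeholder); the INSERT should launch a Spark application that is visible in the YARN web UI:

su - work
hive -e "create table if not exists spark_smoke(id int); insert into spark_smoke values (1); select * from spark_smoke;"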

V. Flink deployment

cd /data/software
wget https://dlcdn.apache.org/flink/flink-1.18.1/flink-1.18.1-bin-scala_2.12.tgz
## To run Flink you only need Java installed beforehand (the Flink docs call for Java 11; the JDK 8 installed above still works with Flink 1.18). Check that Java is installed correctly with the following command.
java -version
## Download release 1.18.1 and extract it.
tar -xzf flink-1.18.1-bin-scala_2.12.tgz
cd flink-1.18.1   ## the archive extracts to flink-1.18.1, not flink-1.18.1-bin-scala_2.12
## Start the cluster
./bin/start-cluster.sh
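
To verify the standalone cluster came up (the process names and the REST port 8081 are the Flink defaults; note that 8081 is also the Azkaban web port used later in this document, so avoid running both on the same host at the same time):

jps    ## should show StandaloneSessionClusterEntrypoint and TaskManagerRunner
curl http://localhost:8081/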
## Submit a job
Flink releases ship with a number of example jobs. Pick any one and deploy it quickly to the already-running cluster.
 
$ ./bin/flink run examples/streaming/WordCount.jar
$ tail log/flink-*-taskexecutor-*.out
  (nymph,1)
  (in,3)
  (thy,1)
  (orisons,1)
  (be,4)
  (all,2)
  (my,1)
  (sins,1)
  (remember,1)
  (d,4)
 
## Stop the cluster
./bin/stop-cluster.sh

VI. Hue

wget https://github.com/cloudera/hue/archive/refs/tags/release-4.11.0.tar.gz
tar -zxvf release-4.11.0.tar.gz
mv hue-release-4.11.0 /usr/local/hue   ## the GitHub tag archive extracts to hue-release-4.11.0
chown -R work.work /usr/local/hue/
apt install python2.7-dev python-pip libxml2-dev libxslt1-dev libsqlite3-dev libldap2-dev libsasl2-dev npm
apt-get install libmysqlclient-dev
cp /usr/local/mysql8/include/my_config.h /usr/include/mysql/
ln -s /usr/bin/pip2 /usr/bin/pip
cd /usr/local/hue
 
pip install setuptools-scm
pip install future
pip install python-daemon
vim desktop/core/src/desktop/supervisor.py
## change "from daemon.pidlockfile" to "from daemon.pidfile"
pip install cryptography
 
## Create a user; here I use username/password: admin / admin
 
## Create the hue database in MySQL
create database hue;
create user 'hue'@'%' identified By 'test@012';
grant all on hue.* TO 'hue'@'%';
flush privileges;
 
##### ## Database migration
##### ./build/env/bin/hue migrate
 
make apps
groupadd hue
useradd -g hue hue
## Start Hue
./build/env/bin/supervisor
/usr/local/hue/build/env/local/lib/python2.7/site-packages/requests_kerberos-0.12.0-py2.7.egg/requests_kerberos/kerberos_.py:11: CryptographyDeprecationWarning: Python 2 is no longer supported by the Python core team. Support for it is now deprecated in cryptography, and will be removed in the next release.
  from cryptography import x509
/usr/local/hue/build/env/local/lib/python2.7/site-packages/requests_kerberos-0.12.0-py2.7.egg/requests_kerberos/kerberos_.py:11: CryptographyDeprecationWarning: Python 2 is no longer supported by the Python core team. Support for it is now deprecated in cryptography, and will be removed in the next release.
  from cryptography import x509
[11/Sep/2024 19:31:23 +0000] settings     DEBUG    DESKTOP_DB_TEST_NAME SET: /usr/local/hue/desktop/desktop-test.db
[11/Sep/2024 19:31:23 +0000] settings     DEBUG    DESKTOP_DB_TEST_USER SET: hue_test
[11/Sep/2024 19:31:23 +0000] settings     DEBUG    DESKTOP_DB_TEST_NAME SET: /usr/local/hue/desktop/desktop-test.db
[11/Sep/2024 19:31:23 +0000] settings     DEBUG    DESKTOP_DB_TEST_USER SET: hue_test
[11/Sep/2024 04:31:23 +0000] sslcompat    DEBUG    ipaddress module is available
[11/Sep/2024 04:31:23 +0000] sslcompat    WARNING  backports.ssl_match_hostname is unavailable
[11/Sep/2024 04:31:23 +0000] sslcompat    DEBUG    ssl.match_hostname is available
[11/Sep/2024 04:31:23 +0000] sslcompat    DEBUG    ipaddress module is available
[11/Sep/2024 04:31:23 +0000] sslcompat    WARNING  backports.ssl_match_hostname is unavailable
[11/Sep/2024 04:31:23 +0000] sslcompat    DEBUG    ssl.match_hostname is available
[11/Sep/2024 04:31:24 +0000] decorators   INFO     AXES: BEGIN LOG
[11/Sep/2024 04:31:24 +0000] decorators   INFO     AXES: Using django-axes 4.5.4
[11/Sep/2024 04:31:24 +0000] decorators   INFO     AXES: blocking by IP only.
[11/Sep/2024 04:31:24 +0000] decorators   INFO     AXES: BEGIN LOG
[11/Sep/2024 04:31:24 +0000] decorators   INFO     AXES: Using django-axes 4.5.4
[11/Sep/2024 04:31:24 +0000] decorators   INFO     AXES: blocking by IP only.
[11/Sep/2024 04:31:24 +0000] __init__     INFO     Couldn't import snappy. Support for snappy compression disabled.
[11/Sep/2024 04:31:24 +0000] urls         WARNING  djangosaml2 module not found
[11/Sep/2024 04:31:24 +0000] kt_renewer   INFO     Keytab renewer not starting, no keytab configured
[11/Sep/2024 04:31:24 +0000] __init__     INFO     Couldn't import snappy. Support for snappy compression disabled.
[11/Sep/2024 04:31:25 +0000] urls         WARNING  djangosaml2 module not found
/usr/local/hue/build/env/local/lib/python2.7/site-packages/requests_kerberos-0.12.0-py2.7.egg/requests_kerberos/kerberos_.py:11: CryptographyDeprecationWarning: Python 2 is no longer supported by the Python core team. Support for it is now deprecated in cryptography, and will be removed in the next release.
  from cryptography import x509
[11/Sep/2024 04:31:25 +0000] settings     INFO     Welcome to Hue 4.11.0
[11/Sep/2024 04:31:25 +0000] settings     DEBUG    Installed Django modules: DesktopModule(aws: aws),DesktopModule(azure: azure),DesktopModule(hadoop: hadoop),DesktopModule(libanalyze: libanalyze),DesktopModule(liboauth: liboauth),DesktopModule(liboozie: liboozie),DesktopModule(librdbms: librdbms),DesktopModule(libsaml: libsaml),DesktopModule(libsentry: libsentry),DesktopModule(libsolr: libsolr),DesktopModule(libzookeeper: libzookeeper),DesktopModule(Hue: desktop),DesktopModule(About: about),DesktopModule(Hive: beeswax),DesktopModule(File Browser: filebrowser),DesktopModule(HBase Browser: hbase),DesktopModule(Help: help),DesktopModule(hive: hive),DesktopModule(Impala: impala),DesktopModule(Job Browser: jobbrowser),DesktopModule(Job Designer: jobsub),DesktopModule(Table Browser: metastore),DesktopModule(Oozie Editor/Dashboard: oozie),DesktopModule(Pig Editor: pig),DesktopModule(Proxy: proxy),DesktopModule(RDBMS UI: rdbms),DesktopModule(Solr Search: search),DesktopModule(Hadoop Security: security),DesktopModule(Spark: spark),DesktopModule(Sqoop: sqoop),DesktopModule(User Admin: useradmin),DesktopModule(ZooKeeper Browser: zookeeper),DesktopModule(Data Importer: indexer),DesktopModule(Metadata: metadata),DesktopModule(Notebook: notebook),DesktopModule(Analytics Dashboards: dashboard),DesktopModule(Kafka: kafka)
[11/Sep/2024 04:31:25 +0000] settings     DEBUG    DESKTOP_DB_TEST_NAME SET: /usr/local/hue/desktop/desktop-test.db
[11/Sep/2024 04:31:25 +0000] settings     DEBUG    DESKTOP_DB_TEST_USER SET: hue_test
[11/Sep/2024 04:31:25 +0000] sslcompat    DEBUG    ipaddress module is available
[11/Sep/2024 04:31:25 +0000] sslcompat    WARNING  backports.ssl_match_hostname is unavailable
[11/Sep/2024 04:31:25 +0000] sslcompat    DEBUG    ssl.match_hostname is available
[11/Sep/2024 04:31:26 +0000] decorators   INFO     AXES: BEGIN LOG
[11/Sep/2024 04:31:26 +0000] decorators   INFO     AXES: Using django-axes 4.5.4
[11/Sep/2024 04:31:26 +0000] decorators   INFO     AXES: blocking by IP only.
[11/Sep/2024 04:31:26 +0000] __init__     INFO     Couldn't import snappy. Support for snappy compression disabled.
[11/Sep/2024 04:31:27 +0000] urls         WARNING  djangosaml2 module not found
[11/Sep/2024 04:31:27 +0000] runcherrypyserver INFO     Starting server with options:
{'daemonize': False,
 'host': '0.0.0.0',
 'pidfile': None,
 'port': 8000,
 'server_group': 'hue',
 'server_name': 'localhost',
 'server_user': 'hue',
 'ssl_certificate': None,
 'ssl_certificate_chain': None,
 'ssl_cipher_list': 'ECDHE-RSA-AES128-GCM-SHA256:ECDHE-ECDSA-AES128-GCM-SHA256:ECDHE-RSA-AES256-GCM-SHA384:ECDHE-ECDSA-AES256-GCM-SHA384:DHE-RSA-AES128-GCM-SHA256:ECDHE-ECDSA-CHACHA20-POLY1305:ECDHE-RSA-CHACHA20-POLY1305:DHE-RSA-AES256-GCM-SHA384:!aNULL:!eNULL:!EXPORT:!DES:!RC4:!MD5:!PSK:!aECDH:!EDH-DSS-DES-CBC3-SHA:!EDH-RSA-DES-CBC3-SHA:!KRB5-DES-CBC3-SHA',
 'ssl_no_renegotiation': False,
 'ssl_private_key': None,
 'threads': 50,
 'workdir': None}
[11/Sep/2024 04:31:27 +0000] middleware   INFO     Unloading HueRemoteUserMiddleware
[11/Sep/2024 04:31:27 +0000] middleware   INFO     Unloading SpnegoMiddleware
[11/Sep/2024 04:31:27 +0000] middleware   INFO     Unloading ProxyMiddleware
 
## Edit the main configuration file
vim desktop/conf/pseudo-distributed.ini
[desktop]
  http_host=0.0.0.0
  http_port=8888
 
[notebook]
[[[mysql]]]
name = MySQL
interface=sqlalchemy
options='{"url": "mysql://hue:test@012@10.20.12.41:3306/hue"}'
[[[hive]]]
name=Hive
interface=hiveserver2
[dashboard]
[hadoop]
[beeswax]
hive_server_host=10.20.12.41
hive_server_port=10000
hive_metastore_host=office-ops-lzl-01
hive_metastore_port=9083
hive_conf_dir=/usr/local/hive/conf
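
## The hue database created in MySQL above is only used if Hue's own metadata store points at it; out of the box Hue stays on SQLite. A sketch of the [[database]] block under [desktop] for the MySQL backend (then re-run the migrate step mentioned above):
[desktop]
  [[database]]
    engine=mysql
    host=10.20.12.41
    port=3306
    user=hue
    password=test@012
    name=hue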
 
## Restart the Hue service
ps -ef | grep hue
kill -9 xxx
./build/env/bin/supervisor
 
## Open http://ip:8888 in a browser   ## the default port is 8000

VII. DataX

Reference: DataX/userGuid.md at master · alibaba/DataX · GitHub

cd /data/software
wget https://datax-opensource.oss-cn-hangzhou.aliyuncs.com/202309/datax.tar.gz
tar -zxvf datax.tar.gz
mv datax /usr/local/
chown -R work.work /usr/local/datax
## Self-check job
cd /usr/local/datax/bin/
python datax.py ../job/job.json
 
DataX (DATAX-OPENSOURCE-3.0), From Alibaba !
Copyright (C) 2010-2017, Alibaba Group. All Rights Reserved.
 
 
2024-09-11 16:47:41.082 [main] INFO  MessageSource - JVM TimeZone: GMT+08:00, Locale: zh_CN
2024-09-11 16:47:41.084 [main] INFO  MessageSource - use Locale: zh_CN timeZone: sun.util.calendar.ZoneInfo[id="GMT+08:00",offset=28800000,dstSavings=0,useDaylight=false,transitions=0,lastRule=null]
2024-09-11 16:47:41.183 [main] INFO  VMInfo - VMInfo# operatingSystem class => sun.management.OperatingSystemImpl
2024-09-11 16:47:41.188 [main] INFO  Engine - the machine info  =>
 
    osInfo: Linux amd64 5.15.0-119-generic
    jvmInfo:    Oracle Corporation 1.8 25.351-b10
    cpu num:    4
 
    totalPhysicalMemory:    -0.00G
    freePhysicalMemory: -0.00G
    maxFileDescriptorCount: -1
    currentOpenFileDescriptorCount: -1
 
    GC Names    [PS MarkSweep, PS Scavenge]
 
    MEMORY_NAME                    | allocation_size                | init_size                     
    PS Eden Space                  | 256.00MB                       | 256.00MB                      
    Code Cache                     | 240.00MB                       | 2.44MB                        
    Compressed Class Space         | 1,024.00MB                     | 0.00MB                        
    PS Survivor Space              | 42.50MB                        | 42.50MB                       
    PS Old Gen                     | 683.00MB                       | 683.00MB                      
    Metaspace                      | -0.00MB                        | 0.00MB                        
 
 
2024-09-11 16:47:41.198 [main] INFO  Engine -
{
    "setting":{
        "speed":{
            "channel":1
        },
        "errorLimit":{
            "record":0,
            "percentage":0.02
        }
    },
    "content":[
        {
            "reader":{
                "name":"streamreader",
                "parameter":{
                    "column":[
                        {
                            "value":"DataX",
                            "type":"string"
                        },
                        {
                            "value":19890604,
                            "type":"long"
                        },
                        {
                            "value":"1989-06-04 00:00:00",
                            "type":"date"
                        },
                        {
                            "value":true,
                            "type":"bool"
                        },
                        {
                            "value":"test",
                            "type":"bytes"
                        }
                    ],
                    "sliceRecordCount":100000
                }
            },
            "writer":{
                "name":"streamwriter",
                "parameter":{
                    "print":false,
                    "encoding":"UTF-8"
                }
            }
        }
    ]
}
 
2024-09-11 16:47:41.216 [main] INFO  PerfTrace - PerfTrace traceId=job_-1, isEnable=false
2024-09-11 16:47:41.217 [main] INFO  JobContainer - DataX jobContainer starts job.
2024-09-11 16:47:41.217 [main] INFO  JobContainer - Set jobId = 0
2024-09-11 16:47:41.273 [job-0] INFO  JobContainer - jobContainer starts to do prepare ...
2024-09-11 16:47:41.274 [job-0] INFO  JobContainer - DataX Reader.Job [streamreader] do prepare work .
2024-09-11 16:47:41.274 [job-0] INFO  JobContainer - DataX Writer.Job [streamwriter] do prepare work .
2024-09-11 16:47:41.274 [job-0] INFO  JobContainer - jobContainer starts to do split ...
2024-09-11 16:47:41.274 [job-0] INFO  JobContainer - Job set Channel-Number to 1 channels.
2024-09-11 16:47:41.275 [job-0] INFO  JobContainer - DataX Reader.Job [streamreader] splits to [1] tasks.
2024-09-11 16:47:41.275 [job-0] INFO  JobContainer - DataX Writer.Job [streamwriter] splits to [1] tasks.
2024-09-11 16:47:41.296 [job-0] INFO  JobContainer - jobContainer starts to do schedule ...
2024-09-11 16:47:41.299 [job-0] INFO  JobContainer - Scheduler starts [1] taskGroups.
2024-09-11 16:47:41.301 [job-0] INFO  JobContainer - Running by standalone Mode.
2024-09-11 16:47:41.308 [taskGroup-0] INFO  TaskGroupContainer - taskGroupId=[0] start [1] channels for [1] tasks.
2024-09-11 16:47:41.315 [taskGroup-0] INFO  Channel - Channel set byte_speed_limit to -1, No bps activated.
2024-09-11 16:47:41.316 [taskGroup-0] INFO  Channel - Channel set record_speed_limit to -1, No tps activated.
2024-09-11 16:47:41.325 [taskGroup-0] INFO  TaskGroupContainer - taskGroup[0] taskId[0] attemptCount[1] is started
2024-09-11 16:47:41.627 [taskGroup-0] INFO  TaskGroupContainer - taskGroup[0] taskId[0] is successed, used[302]ms
2024-09-11 16:47:41.628 [taskGroup-0] INFO  TaskGroupContainer - taskGroup[0] completed it's tasks.
2024-09-11 16:47:51.317 [job-0] INFO  StandAloneJobContainerCommunicator - Total 100000 records, 2600000 bytes | Speed 253.91KB/s, 10000 records/s | Error 0 records, 0 bytes |  All Task WaitWriterTime 0.030s |  All Task WaitReaderTime 0.038s | Percentage 100.00%
2024-09-11 16:47:51.318 [job-0] INFO  AbstractScheduler - Scheduler accomplished all tasks.
2024-09-11 16:47:51.319 [job-0] INFO  JobContainer - DataX Writer.Job [streamwriter] do post work.
2024-09-11 16:47:51.319 [job-0] INFO  JobContainer - DataX Reader.Job [streamreader] do post work.
2024-09-11 16:47:51.319 [job-0] INFO  JobContainer - DataX jobId [0] completed successfully.
2024-09-11 16:47:51.320 [job-0] INFO  HookInvoker - No hook invoked, because base dir not exists or is a file: /usr/local/datax/hook
2024-09-11 16:47:51.321 [job-0] INFO  JobContainer -
     [total cpu info] =>
        averageCpu                     | maxDeltaCpu                    | minDeltaCpu                   
        -1.00%                         | -1.00%                         | -1.00%
                         
 
     [total gc info] =>
         NAME                 | totalGCCount       | maxDeltaGCCount    | minDeltaGCCount    | totalGCTime        | maxDeltaGCTime     | minDeltaGCTime    
         PS MarkSweep         | 0                  | 0                  | 0                  | 0.000s             | 0.000s             | 0.000s            
         PS Scavenge          | 0                  | 0                  | 0                  | 0.000s             | 0.000s             | 0.000s            
 
2024-09-11 16:47:51.322 [job-0] INFO  JobContainer - PerfTrace not enable!
2024-09-11 16:47:51.322 [job-0] INFO  StandAloneJobContainerCommunicator - Total 100000 records, 2600000 bytes | Speed 253.91KB/s, 10000 records/s | Error 0 records, 0 bytes |  All Task WaitWriterTime 0.030s |  All Task WaitReaderTime 0.038s | Percentage 100.00%
2024-09-11 16:47:51.324 [job-0] INFO  JobContainer -
任务启动时刻                    : 2024-09-11 16:47:41
任务结束时刻                    : 2024-09-11 16:47:51
任务总计耗时                    :                 10s
任务平均流量                    :          253.91KB/s
记录写入速度                    :          10000rec/s
读出记录总数                    :              100000
读写失败总数                    :                   0
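
To build a real job, the userGuide referenced above describes generating a starter configuration from a reader/writer pair, for example MySQL to HDFS:

cd /usr/local/datax/bin/
python datax.py -r mysqlreader -w hdfswriter   ## prints a job template with the parameters each plugin expects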

VIII. Azkaban

cd /data/software
wget https://github.com/shengminjie/Azkaban-complete/archive/refs/heads/master.zip
unzip master.zip
cd Azkaban-complete-master/3.90.0/
mkdir /usr/local/azkaban
unzip azkaban-db-0.1.0-SNAPSHOT.zip
unzip azkaban-exec-server-0.1.0-SNAPSHOT.zip
unzip azkaban-web-server-0.1.0-SNAPSHOT.zip
mv azkaban-db-0.1.0-SNAPSHOT /usr/local/azkaban/azkaban-db
mv azkaban-exec-server-0.1.0-SNAPSHOT /usr/local/azkaban/azkaban-exec-server
mv azkaban-web-server-0.1.0-SNAPSHOT /usr/local/azkaban/azkaban-web-server
 
### Database configuration
mysql -uroot -p
> create database azkaban default character set latin1;
# Note: create the azkaban database with latin1; some index keys are too long for utf8 (maximum supported key length is 1000)
use azkaban;
source /data/software/Azkaban-complete-master/3.90.0/create-all-sql-0.1.0-SNAPSHOT.sql;
## MySQL 5:
##grant all on azkaban.* TO 'azkaban'@'%' identified By 'qhxc@021';
## MySQL 8:
create user 'azkaban'@'%' identified By 'test@012';
grant all on azkaban.* TO 'azkaban'@'%';
flush privileges;
 
### Configuration changes
## Azkaban web server configuration
cd /usr/local/azkaban/azkaban-web-server/conf/
vim azkaban.properties  ## adjust the database-related settings
 
# Azkaban Personalization Settings
azkaban.name=Test
azkaban.label=My Local Azkaban
azkaban.color=#FF3601
azkaban.default.servlet.path=/index
web.resource.dir=web/
default.timezone.id=America/Los_Angeles
# Azkaban UserManager class
user.manager.class=azkaban.user.XmlUserManager
user.manager.xml.file=conf/azkaban-users.xml
# Loader for projects
executor.global.properties=conf/global.properties
azkaban.project.dir=projects
# Velocity dev mode
velocity.dev.mode=false
# Azkaban Jetty server properties.
jetty.use.ssl=false
jetty.maxThreads=25
jetty.port=8081
# Azkaban Executor settings
# mail settings
mail.sender=
mail.host=
# User facing web server configurations used to construct the user facing server URLs. They are useful when there is a reverse proxy between Azkaban web servers and users.
# enduser -> myazkabanhost:443 -> proxy -> localhost:8081
# when this parameters set then these parameters are used to generate email links.
# if these parameters are not set then jetty.hostname, and jetty.port(if ssl configured jetty.ssl.port) are used.
# azkaban.webserver.external_hostname=myazkabanhost.com
# azkaban.webserver.external_ssl_port=443
# azkaban.webserver.external_port=8081
job.failure.email=
job.success.email=
lockdown.create.projects=false
cache.directory=cache
# JMX stats
jetty.connector.stats=true
executor.connector.stats=true
# Azkaban mysql settings by default. Users should configure their own username and password.
database.type=mysql
mysql.port=3306
mysql.host=10.20.12.41
mysql.database=azkaban
mysql.user=azkaban
mysql.password=test@012
mysql.numconnections=100
#Multiple Executor
azkaban.use.multiple.executors=true
azkaban.executorselector.filters=StaticRemainingFlowSize,MinimumFreeMemory,CpuStatus
azkaban.executorselector.comparator.NumberOfAssignedFlowComparator=1
azkaban.executorselector.comparator.Memory=1
azkaban.executorselector.comparator.LastDispatched=1
azkaban.executorselector.comparator.CpuUsage=1
 
## Azkaban executor server configuration
cd ../../azkaban-exec-server/conf/
vim azkaban.properties  ## adjust the database-related settings and add the last two lines
 
# Azkaban Personalization Settings
azkaban.name=Test
azkaban.label=My Local Azkaban
azkaban.color=#FF3601
azkaban.default.servlet.path=/index
web.resource.dir=web/
default.timezone.id=America/Los_Angeles
# Azkaban UserManager class
user.manager.class=azkaban.user.XmlUserManager
user.manager.xml.file=conf/azkaban-users.xml
# Loader for projects
executor.global.properties=conf/global.properties
azkaban.project.dir=projects
# Velocity dev mode
velocity.dev.mode=false
# Azkaban Jetty server properties.
jetty.use.ssl=false
jetty.maxThreads=25
jetty.port=8081
# Where the Azkaban web server is located
azkaban.webserver.url=http://localhost:8081
# mail settings
mail.sender=
mail.host=
# User facing web server configurations used to construct the user facing server URLs. They are useful when there is a reverse proxy between Azkaban web servers and users.
# enduser -> myazkabanhost:443 -> proxy -> localhost:8081
# when this parameters set then these parameters are used to generate email links.
# if these parameters are not set then jetty.hostname, and jetty.port(if ssl configured jetty.ssl.port) are used.
# azkaban.webserver.external_hostname=myazkabanhost.com
# azkaban.webserver.external_ssl_port=443
# azkaban.webserver.external_port=8081
job.failure.email=
job.success.email=
lockdown.create.projects=false
cache.directory=cache
# JMX stats
jetty.connector.stats=true
executor.connector.stats=true
# Azkaban plugin settings
azkaban.jobtype.plugin.dir=plugins/jobtypes
# Azkaban mysql settings by default. Users should configure their own username and password.
database.type=mysql
mysql.port=3306
mysql.host=10.20.12.41
mysql.database=azkaban
mysql.user=azkaban
mysql.password=test@012
mysql.numconnections=100
# Azkaban Executor settings
executor.maxThreads=50
executor.flow.threads=30
# If the port is not pinned here, the executor starts on a random port that you then have to look up for the activation step
executor.port=12321
flow.num.job.threads=11
 
cp /data/software/mysql-connector-j-8.0.33.jar /usr/local/azkaban/azkaban-exec-server/lib/
cp /data/software/mysql-connector-j-8.0.33.jar /usr/local/azkaban/azkaban-web-server/lib/
 
### Start the services
## 1. Start the AzkabanExecutorServer
cd /usr/local/azkaban/azkaban-exec-server/bin/
./start-exec.sh
# If AzkabanExecutorServer shows up in jps, the start succeeded
root@office-ops-lzl-01:/usr/local/azkaban/azkaban-exec-server/bin# jps
494149 AzkabanExecutorServer
494197 Jps
409893 SecondaryNameNode
413203 RunJar
413105 RunJar
410238 NodeManager
409500 NameNode
410093 ResourceManager
409642 DataNode
486889 GradleDaemon
 
## 2. Activate the AzkabanExecutorServer
## curl http://localhost:12321/executor?action=activate
root@office-ops-lzl-01:/data/software/Azkaban-complete-master/3.90.0# curl http://localhost:12321/executor?action=activate
{"status":"success"}root@office-ops-lzl-01:/data/software/Azkaban-complete-master/3.90.0#
 
## 3. Start the AzkabanWebServer
cd /usr/local/azkaban/azkaban-web-server/
## This must be run from this directory; starting it from any other directory does not work
./bin/start-web.sh
 
###### Azkaban deployment is complete ###############
Browse to http://ip:8081
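
Login uses the XmlUserManager configured above; the stock conf/azkaban-users.xml normally ships an azkaban/azkaban admin account. To add your own account, the entry looks roughly like this (username/password below are placeholders):
<user username="work" password="test@012" roles="admin"/>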
 
 
 
###########  Stop the services ###########
cd /usr/local/azkaban/azkaban-web-server/
./bin/shutdown-web.sh
cd ../azkaban-exec-server/bin/
./shutdown-exec.sh
##############################
 
###########  Start the services ###########
cd /usr/local/azkaban/azkaban-exec-server/bin/
./start-exec.sh 
sleep 3
curl http://localhost:12321/executor?action=activate
cd /usr/local/azkaban/azkaban-web-server/
./bin/start-web.sh
##############################
