源码编译Tez基于Hadoop3.3.1
热身运动
- 搭建java环境
# Install the JDK and register it in the login environment.
mkdir -p /usr/java
# tar xf jdk-8u261-linux-x64.tar.gz -C /opt/
ln -s /opt/jdk1.8.0_261 /usr/java/default
# NOTE: the variable references must be escaped (\$) so they are written
# literally into java.sh and expanded at login time. Unescaped (as before),
# JAVA_HOME expanded to empty while the heredoc was being written and PATH
# was frozen to its value at install time.
cat > /etc/profile.d/java.sh <<-EOF
#!/bin/bash
export JAVA_HOME=/usr/java/default
export PATH=\$PATH:\$JAVA_HOME/bin
EOF
source /etc/profile
java -version   # fixed typo: was "java -verison"
- 安装maven
#!/bin/bash
# Download Apache Maven 3.6.3, unpack it under /usr/local, and register it
# on PATH via /etc/profile.d/maven.sh. Aborts early if the download or the
# extraction fails instead of writing a profile entry for a broken install.
install_maven(){
  local source_url='https://mirrors.tuna.tsinghua.edu.cn/apache/maven/maven-3/3.6.3/binaries/apache-maven-3.6.3-bin.tar.gz'
  cd ~ || return 1
  wget "${source_url}" -O apache-maven-3.6.3-bin.tar.gz || return 1
  tar xf apache-maven-3.6.3-bin.tar.gz -C /usr/local/ || return 1
  if [ -d /usr/local/apache-maven-3.6.3 ]; then
    # \$ keeps PATH/M2_HOME literal so they are expanded at login time,
    # not while this heredoc is written.
    cat > /etc/profile.d/maven.sh <<-EOF
#!/bin/bash
export M2_HOME=/usr/local/apache-maven-3.6.3
export PATH=\$PATH:\$M2_HOME/bin
EOF
    chmod 0744 /etc/profile.d/maven.sh
    source /etc/profile.d/maven.sh
    rm -f ~/apache-maven-3.6.3-bin.tar.gz
  fi
}
install_maven
# Replace Maven's global settings with a China-friendly configuration:
# local repository under /root/.m2 and Aliyun mirrors for the common repos.
# The heredoc body contains no '$', so the unquoted EOF delimiter is safe
# here (nothing can be accidentally expanded by the shell).
# NOTE(review): mirrorOf values "repo"/"repo1".."repo5" only mirror
# repositories declared with those exact ids — confirm that is intended.
cat > /usr/local/apache-maven-3.6.3/conf/settings.xml <<-EOF
<?xml version="1.0" encoding="UTF-8"?>
<settings xmlns="http://maven.apache.org/SETTINGS/1.0.0"
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
xsi:schemaLocation="http://maven.apache.org/SETTINGS/1.0.0 http://maven.apache.org/xsd/settings-1.0.0.xsd">
<localRepository>/root/.m2/repository</localRepository>
<pluginGroups>
</pluginGroups>
<proxies>
</proxies>
<servers>
</servers>
<mirrors>
<mirror>
<id>aliyunmaven</id>
<mirrorOf>*</mirrorOf>
<name>阿里云公共仓库</name>
<url>https://maven.aliyun.com/repository/public</url>
</mirror>
<mirror>
<id>aliyun-central</id>
<mirrorOf>central</mirrorOf>
<name>阿里云central仓库</name>
<url>https://maven.aliyun.com/repository/central</url>
</mirror>
<mirror>
<id>aliyun-google</id>
<mirrorOf>repo</mirrorOf>
<name>阿里云google仓库</name>
<url>
https://maven.aliyun.com/repository/google
</url>
</mirror>
<mirror>
<id>aliyun-gradle</id>
<mirrorOf>repo1</mirrorOf>
<name>阿里云gradle仓库</name>
<url>
https://maven.aliyun.com/repository/gradle-plugin
</url>
</mirror>
<mirror>
<id>aliyun-spring</id>
<mirrorOf>repo2</mirrorOf>
<name>阿里云spring仓库</name>
<url>
https://maven.aliyun.com/repository/spring
</url>
</mirror>
<mirror>
<id>aliyun-spring-plugin</id>
<mirrorOf>repo3</mirrorOf>
<name>阿里云spring-plugin仓库</name>
<url>
https://maven.aliyun.com/repository/spring-plugin
</url>
</mirror>
<mirror>
<id>aliyun-grails-core</id>
<mirrorOf>repo4</mirrorOf>
<name>阿里云grails-core仓库</name>
<url>
https://maven.aliyun.com/repository/grails-core
</url>
</mirror>
<mirror>
<id>aliyun-apache-snapshots</id>
<mirrorOf>repo5</mirrorOf>
<name>阿里云apache-snapshots仓库</name>
<url>
https://maven.aliyun.com/repository/apache-snapshots
</url>
</mirror>
</mirrors>
<profiles>
</profiles>
</settings>
EOF
- 安装protoc 2.5.0
安装编译protobuf
# Build protobuf from source — Tez 0.10.1 requires exactly protoc 2.5.0.
wget https://github.com/protocolbuffers/protobuf/releases/download/v2.5.0/protobuf-2.5.0.tar.gz
tar xf protobuf-2.5.0.tar.gz
cd protobuf-2.5.0   # fixed: the original ran ./configure without entering the extracted source dir
./configure
make -j 8
make install
protoc --version    # expected output: libprotoc 2.5.0
- 编译
# 方式一: 下载官方的二进制包
wget https://dlcdn.apache.org/tez/0.10.1/apache-tez-0.10.1-bin.tar.gz
# 方式二: 源码编译
下载包
https://github.com/apache/tez/archive/refs/tags/rel/release-0.10.1.tar.gz
在源码目录中的pom.xml中将tez-ui移除,太多坑。。。
<slf4j.version>1.7.25</slf4j.version> 跟hadoop-3.3.1 一致
mvn install -Dhadoop.version=3.3.1 -DskipTests -Dmaven.javadoc.skip=true
编译安装完成后,会在源码目录下的 tez-dist/target/ 中找到编译好的 Tez
有两个版本
tez-0.10.1-minimal.tar.gz
tez-0.10.1.tar.gz
上传 Tez 的 jar 包到 HDFS
# Option A: official binary package — built against hadoop 3.1.3, so not
# ideal for a hadoop 3.3.1 cluster.
wget https://dlcdn.apache.org/tez/0.10.1/apache-tez-0.10.1-bin.tar.gz
tar xf apache-tez-0.10.1-bin.tar.gz
mkdir -p ~/tez-0.10.1
# NOTE(review): share/tez.tar.gz is a path relative to the current directory;
# if the archive unpacks into apache-tez-0.10.1-bin/, cd there first — verify
# the tarball layout before running.
tar xf share/tez.tar.gz -C ~/tez-0.10.1/
hadoop fs -mkdir -p /libs/tez        # -p: do not fail when /libs does not exist yet
hadoop fs -put ~/tez-0.10.1 /libs/tez
# Option B: the package compiled above against hadoop 3.3.1.
mkdir -p tez-0.10.1-hadoop-3.3.1
tar xf tez-0.10.1.tar.gz -C tez-0.10.1-hadoop-3.3.1/
hadoop fs -mkdir -p /libs/tez
hadoop fs -put tez-0.10.1-hadoop-3.3.1 /libs/tez
配置 Hive
注: 可将配置文件按服务分开 依次为 hivemetastore-site.xml(metastore的配置文件) hiveserver2-site.xml(hiveserver2的配置文件) 和 hive-site.xml(全局配置文件)
这样就可以在每一个集群主机都包含这些文件
# 在hive的配置目录增加tez-site.xml文件,也要分发到客户端
# Write tez-site.xml into Hive's conf dir (distribute to clients too).
# The delimiter is quoted ('EOF') so that ${fs.defaultFS} is written
# literally for Tez to substitute at runtime. With the unquoted delimiter
# used before, bash aborts with "bad substitution" on ${fs.defaultFS}
# ("fs.defaultFS" is not a valid shell parameter name).
cat > tez-site.xml <<-'EOF'
<?xml version="1.0" encoding="UTF-8"?>
<?xml-stylesheet type="text/xsl" href="configuration.xsl"?>
<configuration>
<property>
<name>tez.lib.uris</name>
<value>${fs.defaultFS}/libs/tez/tez-0.10.1-hadoop-3.3.1,${fs.defaultFS}/libs/tez/tez-0.10.1-hadoop-3.3.1/lib</value>
<description>指定 Hive 运行依赖的 tez 包</description>
</property>
<property>
<name>tez.lib.uris.classpath</name>
<value>${fs.defaultFS}/libs/tez/tez-0.10.1-hadoop-3.3.1,${fs.defaultFS}/libs/tez/tez-0.10.1-hadoop-3.3.1/lib</value>
<description>指定 Hadoop 依赖的相关包</description>
</property>
<property>
<name>tez.use.cluster.hadoop-libs</name>
<value>true</value>
<description>是否使用 Hadoop 自身的 lib 包</description>
</property>
<property>
<name>tez.am.resource.memory.mb</name>
<value>4096</value>
<description>Tez AM 容器内存大小(MB)</description>
</property>
<property>
<name>tez.am.resource.cpu.vcores</name>
<value>4</value>
<description>cpu核数限制</description>
</property>
</configuration>
EOF
# 如果想之后默认使用tez引擎,到客户端
修改hive-site.xml
<property>
<name>hive.execution.engine</name>
<value>tez</value>
</property>
<property>
<name>hive.tez.container.size</name>
<value>4096</value>
</property>
部署 Tez 本地程序
[god@node01 opt]$ mkdir /opt/bigdata/tez/tez-0.10.1 -p
[god@node01 opt]$ tar xf tez-0.10.1-minimal.tar.gz -C /opt/bigdata/tez/tez-0.10.1/
[god@node01 opt]$ cd /opt/bigdata/tez
[god@node01 tez]$ ln -s tez-0.10.1 current
Tez 包的引入
cat /opt/bigdata/hive/current/conf/hive-env.sh
export HADOOP_HOME=/opt/bigdata/hadoop/current
export HIVE_CONF_DIR=/opt/bigdata/hive/current/conf
export HIVE_HOME=/opt/bigdata/hive/current
export TEZ_HOME=/opt/bigdata/tez/current
复制tez包到其他机器
# 先将tez整个目录tar打包,用scp传送,然后再解压。这样可以保持软链接
[god@node01 ~]$ cd /opt/bigdata
# 打包
[god@node01 bigdata]$ tar czf tez.gz tez
# 传送
[god@node01 bigdata]$ for i in {2..5}; do scp -p tez.gz node0${i}:`pwd` ;done
# 解包,删除压缩包
[god@node01 bigdata]$ for i in {2..5};do ssh node0${i} "cd /opt/bigdata && tar xf tez.gz && rm -f tez.gz ";done
# 复制hive-env.sh tez-site.xml
[god@node01 tez]$ cd /opt/bigdata/hive/current/conf/ # = cd $HIVE_HOME/conf
[god@node01 conf]$ for i in {2..5};do scp -p hive-env.sh tez-site.xml node0${i}:`pwd` ;done
# 在client主机中加入hive.execution.engine=tez
# Write a minimal client-side hive-site.xml: default to the Tez engine,
# 4 GiB Tez containers, and point clients at the metastore on node01.
# The heredoc body contains no '$', so the unquoted EOF delimiter cannot
# trigger any shell expansion here.
cat > hive-site.xml <<-EOF
<?xml version="1.0" encoding="UTF-8" standalone="no"?>
<?xml-stylesheet type="text/xsl" href="configuration.xsl"?>
<configuration>
<property>
<name>hive.execution.engine</name>
<value>tez</value>
</property>
<property>
<name>hive.tez.container.size</name>
<value>4096</value>
</property>
<property>
<name>hive.metastore.uris</name>
<value>thrift://node01:9083</value>
<description>metastore地址</description>
</property>
</configuration>
EOF
[god@node02 conf]$ for i in {3..5};do scp -p hive-site.xml node0${i}:`pwd` ;done
测试功能
# 准备数据
create external table my_tez
(
id int,
name string,
likes array<string>,
address map<string,string>
)
partitioned by(age int)
row format delimited
fields terminated by ','
collection items terminated by '-'
map keys terminated by ':'
location '/chauncy';
# Generate 100 sample rows. Fixed: the district suffix used ${1} (the
# script's first positional parameter — empty here) instead of ${i}.
for i in {1..100}; do echo "${i},小明${i},抽烟-喝酒-烫头,上海:静安${i}" >> data.txt ;done
hdfs dfs -mkdir -p /chauncy/age=10
hadoop fs -copyFromLocal data.txt /chauncy/age=10
# 启动 Metastore 服务,在hive master主机上
[god@node01 conf]$ nohup hive --service metastore > /opt/bigdata/hive/current/metastore.log 2>&1 &
# 让 HiveServer2 服务在后台启动,可选其余几台中的任意一台机器
[god@node02 ~]$ nohup hiveserver2 --hiveconf hive.server2.thrift.port=10000 > /opt/bigdata/hive/current/hiveserver2.log 2>&1 &
# node04连接
beeline -u jdbc:hive2://node02:10000/default -n god
0: jdbc:hive2://node02:10000/default> set hive.execution.engine; # 变成了tez
0: jdbc:hive2://node02:10000/default> msck repair table my_tez; # 修复分区表,hdfs数据加载到表
0: jdbc:hive2://node02:10000/default> select count(*) from my_tez; # 查询数据,快了很多
----------------------------------------------------------------------------------------------
VERTICES MODE STATUS TOTAL COMPLETED RUNNING PENDING FAILED KILLED
----------------------------------------------------------------------------------------------
Map 1 .......... container SUCCEEDED 1 1 0 0 0 0
Reducer 2 ...... container SUCCEEDED 1 1 0 0 0 0
----------------------------------------------------------------------------------------------
VERTICES: 02/02 [==========================>>] 100% ELAPSED TIME: 4.46 s
----------------------------------------------------------------------------------------------
INFO : Completed executing command(queryId=god_20211024194720_d805887c-5379-4a89-bc9a-0e1b0354492a); Time taken: 9.535 seconds
INFO : OK
INFO : Concurrency mode is disabled, not creating a lock manager
+------+
| _c0 |
+------+
| 100 |
+------+
1 row selected (9.737 seconds)
# 切换引擎
0: jdbc:hive2://node02:10000/default> set hive.execution.engine=mr;
No rows affected (0.008 seconds)
0: jdbc:hive2://node02:10000/default> select count(*) from my_tez;
# 加餐: 生产环境提交hive任务的方式
beeline -u jdbc:hive2://node02:10000/data_dev -n god -e "show tables;"