pyhive
Check HiveServer2
- Start HiveServer2
$HIVE_HOME/bin/hiveserver2
- Test the client connection with beeline
$HIVE_HOME/bin/beeline
!connect jdbc:hive2://localhost:10000
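The connection can also be made in one line, passing the user explicitly (root here matches the proxy-user configuration below):
$HIVE_HOME/bin/beeline -u jdbc:hive2://localhost:10000 -n root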
If the connection fails with the error "User: xxx is not allowed to impersonate anonymous", add the following configuration
- Hadoop configuration (in core-site.xml)
<!--Fix the "not allowed to impersonate" error when beeline connects to Hive-->
<!--Allow the root user to proxy from any host/group-->
<property>
    <name>hadoop.proxyuser.root.hosts</name>
    <value>*</value>
</property>
<property>
    <name>hadoop.proxyuser.root.groups</name>
    <value>*</value>
</property>
<!--Allow the spark user to proxy from any host/group-->
<property>
    <name>hadoop.proxyuser.spark.hosts</name>
    <value>*</value>
</property>
<property>
    <name>hadoop.proxyuser.spark.groups</name>
    <value>*</value>
</property>
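Hadoop only rereads these proxy-user rules on startup; instead of a full restart, they can usually be refreshed in place:
hdfs dfsadmin -refreshSuperUserGroupsConfiguration
yarn rmadmin -refreshSuperUserGroupsConfiguration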
Install pyhive
- Install the dependencies
pip install sasl
pip install thrift
pip install thrift-sasl
pip install pyhive
- Installing sasl may fail; on Debian/Ubuntu, install the SASL development headers first
sudo apt-get install libsasl2-dev
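On Red Hat/CentOS systems, the equivalent development package is usually cyrus-sasl-devel:
sudo yum install cyrus-sasl-devel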
Connect with pyhive
from pyhive import hive

# auth="CUSTOM" assumes hive.server2.authentication is set to CUSTOM in
# hive-site.xml; with the default NONE authentication, use auth="NONE" instead
conn = hive.Connection(host='127.0.0.1',
                       port=10000,
                       auth="CUSTOM",
                       username='root',
                       password='hive')
cursor = conn.cursor()
cursor.execute('select * from t limit 10')
for result in cursor.fetchall():
    print(result)
cursor.close()
conn.close()
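For analysis work it is often more convenient to pull the result set straight into a DataFrame; a minimal sketch, assuming pandas is installed and reusing the connection parameters and table t from above:
import pandas as pd
from pyhive import hive

conn = hive.Connection(host='127.0.0.1', port=10000,
                       auth="CUSTOM", username='root', password='hive')
# pandas accepts any DB-API connection object
df = pd.read_sql('select * from t limit 10', conn)
print(df.head())
conn.close()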
pyspark
- Configure the Python environment via the environment variables below (a pyspark sanity check follows the exports)
#Java Environment
export JAVA_HOME=/opt/java
export JRE_HOME=${JAVA_HOME}/jre
export CLASSPATH=.:${JAVA_HOME}/lib:${JRE_HOME}/lib
export PATH=${JAVA_HOME}/bin:$PATH
#Hadoop Environment
export HADOOP_HOME=/opt/hadoop
export CLASSPATH=$($HADOOP_HOME/bin/hadoop classpath):$CLASSPATH
export HADOOP_COMMON_LIB_NATIVE_DIR=$HADOOP_HOME/lib/native
export PATH=$PATH:$HADOOP_HOME/bin:$HADOOP_HOME/sbin
#Hive Environment
export HIVE_HOME=/opt/hive
export PATH=$PATH:$HIVE_HOME/bin
#Scala Environment
export SCALA_HOME=/opt/scala
export PATH=${SCALA_HOME}/bin:$PATH
#Spark Environment
export SPARK_HOME=/opt/spark
export PATH=${SPARK_HOME}/bin:$PATH
#Config so pyspark can run directly under Jupyter
export PYTHONPATH=$SPARK_HOME/python:$SPARK_HOME/python/lib/py4j-0.10.9-src.zip:$PYTHONPATH
export PYSPARK_PYTHON=/home/spark/envs/py3/bin/python3
export PYSPARK_DRIVER_PYTHON=/home/spark/envs/py3/bin/python3
#sbt Environment
export PATH=/opt/sbt/:$PATH
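With those variables exported, pyspark can be sanity-checked from a plain Python (or Jupyter) session; a minimal sketch, reusing the table t from the pyhive example:
from pyspark.sql import SparkSession

# enableHiveSupport() lets Spark SQL read tables registered in the Hive metastore
spark = (SparkSession.builder
         .appName('pyspark-hive-check')
         .enableHiveSupport()
         .getOrCreate())
spark.sql('select * from t limit 10').show()
spark.stop()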