一: Spark code
package com.qf.bigdata.spark.sql.day2

import com.qf.bigdata.spark.core.day3.SparkUtils
import org.apache.spark.sql.SparkSession

/**
 * SparkSQL integrated with Hive
 */
object Demo11_SparkSQL_Hive {
  def main(args: Array[String]): Unit = {
    // 1. Verify that exactly two arguments follow the jar on the command line
    if (args == null || args.length != 2) {
      println(
        """
          |Parameters error!!! | usage : xxx.jar <basic path> <info path>
          |""".stripMargin)
      System.exit(-1) // abnormal exit
    }
    val Array(basicPath, infoPath) = args

    // 2. Obtain the SparkSession (the third argument enables Hive support)
    val spark: SparkSession = SparkUtils.getSparkSession("yarn", "spark_hive", true)
    import spark.implicits._

    // 3. DDL and data loading
    // 3.1 Create the database
    spark.sql(
      """
        |create database if not exists spark_sql
        |""".stripMargin)

    // 3.2 Switch to the database
    spark.sql(
      """
        |use spark_sql
        |""".stripMargin)

    // 3.3 Create the tables
    // teacher_basic
    spark.sql(
      """
        |create table if not exists spark_sql.teacher_basic(
        |name String,
        |age Int,
        |married String,
        |classes Int
        |)
        |row format delimited fields terminated by ','
        |""".stripMargin)

    // teacher_info
    spark.sql(
      """
        |create table if not exists spark_sql.teacher_info(
        |name String,
        |height Int,
        |weight Int
        |)
        |row format delimited fields terminated by ','
        |""".stripMargin)

    // 3.4 Load the data files (HDFS paths passed as program arguments)
    // teacher_basic
    spark.sql(
      s"""
         |load data inpath '${basicPath}' into table spark_sql.teacher_basic
         |""".stripMargin)

    // teacher_info
    spark.sql(
      s"""
         |load data inpath '${infoPath}' into table spark_sql.teacher_info
         |""".stripMargin)

    // 3.5 Join the two tables and materialize the result as a new table
    spark.sql(
      """
        |create table if not exists spark_sql.teacher
        |as
        |select
        |b.name,
        |b.age,
        |b.married,
        |b.classes,
        |i.height,
        |i.weight
        |from spark_sql.teacher_basic b left join spark_sql.teacher_info i
        |on b.name = i.name
        |""".stripMargin)

    // 4. Release resources
    SparkUtils.close(spark)
  }
}
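The code above relies on a project-specific helper, com.qf.bigdata.spark.core.day3.SparkUtils, whose source is not included in this note. A minimal sketch of what it plausibly looks like, assuming the third parameter of getSparkSession toggles Hive support:

package com.qf.bigdata.spark.core.day3

import org.apache.spark.sql.SparkSession

// Hypothetical reconstruction of the helper used by Demo11_SparkSQL_Hive;
// the real implementation is not shown in this note.
object SparkUtils {

  // Builds a SparkSession; when isHive is true, Hive support is enabled so that
  // spark.sql(...) DDL/DML goes through the Hive metastore.
  def getSparkSession(master: String, appName: String, isHive: Boolean = false): SparkSession = {
    val builder = SparkSession.builder()
      .master(master)
      .appName(appName)
    if (isHive) builder.enableHiveSupport()
    builder.getOrCreate()
  }

  // Releases the session (stop() is a no-op if it has already been stopped).
  def close(spark: SparkSession): Unit = {
    if (spark != null) spark.stop()
  }
}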
二: Package and run on the server
##0. Copy hive-site.xml / yarn-site.xml / core-site.xml into the project's resources directory
##1. Build the jar
##2. Start the Hadoop cluster
##3. Start the Hive services
##4. spark-submit
[root@hadoop scripts]# vim spark_hive.sh
#!/bin/bash
SPARK_HOME=/opt/apps/spark-3.1.2
HADOOP_CONF_DIR=/opt/apps/hadoop-3.2.1/etc/hadoop
${SPARK_HOME}/bin/spark-submit \
--master yarn \
--deploy-mode cluster \
--class com.qf.bigdata.spark.sql.day2.Demo11_SparkSQL_Hive \
--executor-memory 600M \
--executor-cores 1 \
--driver-cores 1 \
--driver-memory 600M \
/data/jar/spark.jar \
/input/teacher/teacher_basic.txt \
/input/teacher/teacher_info.txt
##5. Run the script
[root@hadoop scripts]# sh spark_hive.sh
##6. Problems encountered:
##6.1 The jar runs to completion, but the resulting tables contain no data. On closer inspection, a database created from the Hive CLI gets a database directory in HDFS, yet a database created by the Spark job has no corresponding directory in HDFS.
##6.2 Checking the Hive metastore database in MySQL shows that the database created by the Spark job is mapped not to an HDFS location but to a local file:/// path.
##6.3 From these two observations: Hive itself is fine, since databases created through the Hive CLI land in HDFS; databases created through the Spark job land on the local file system, so Spark simply does not know that database directories should go to HDFS by default. Conclusion: once Spark knows which HDFS directory the warehouse lives in, the databases it creates will be stored in HDFS.
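Before touching any configuration, the diagnosis can be confirmed from Spark itself. A small check (a sketch; spark is the SparkSession of the job above, or a spark-shell session started with Hive support):

// The Location column shows whether the database maps to hdfs:// or file:///
spark.sql("describe database spark_sql").show(truncate = false)
// Same information through the catalog API
println(spark.catalog.getDatabase("spark_sql").locationUri)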
##7. Fixing the problem
##7.1 Modify hive-site.xml
<property>
<name>hive.metastore.warehouse.dir</name>
<value>hdfs://hadoop:9000/user/hive/warehouse</value>
</property>
##7.2 Copy hive-site.xml into $SPARK_HOME/conf
##7.3 Restart the Hive metastore service
##7.4 Rebuild the jar and upload it again
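If copying hive-site.xml around is inconvenient, another option is to point the SparkSession at the metastore and the HDFS warehouse directly when it is built. This is only a sketch; the hdfs://bigdata:9820 and thrift://bigdata:9083 addresses below mirror the hive-site.xml shown in the next part and must match your own cluster:

import org.apache.spark.sql.SparkSession

// Hive endpoints set programmatically instead of via hive-site.xml on the classpath
val spark = SparkSession.builder()
  .appName("spark_hive")
  .config("spark.sql.warehouse.dir", "hdfs://bigdata:9820/user/hive/warehouse")
  .config("hive.metastore.uris", "thrift://bigdata:9083")
  .enableHiveSupport()
  .getOrCreate()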
三: Required configuration files
core-site.xml
<?xml version="1.0" encoding="UTF-8"?>
<?xml-stylesheet type="text/xsl" href="configuration.xsl"?>
<!--
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. See accompanying LICENSE file.
-->
<!-- Put site-specific property overrides in this file. -->
<configuration>
<!-- NameNode address -->
<!-- Note: the default NameNode RPC port was 9000 in Hadoop 1.x and 8020 in Hadoop 2.x; this setup binds it explicitly to 9820 -->
<property>
<name>fs.defaultFS</name>
<value>hdfs://bigdata:9820</value>
</property>
<!-- Base directory that other Hadoop properties derive their paths from -->
<property>
<name>hadoop.tmp.dir</name>
<value>/usr/local/hadoop/tmp</value>
</property>
<!-- Static user for the Hadoop web UIs, set to root -->
<property>
<name>hadoop.http.staticuser.user</name>
<value>root</value>
</property>
<!-- Disable HDFS permission checking -->
<property>
<name>dfs.permissions.enabled</name>
<value>false</value>
</property>
<property>
<name>hadoop.proxyuser.root.hosts</name>
<value>*</value>
</property>
<property>
<name>hadoop.proxyuser.root.groups</name>
<value>*</value>
</property>
</configuration>
hive-site.xml
<?xml version="1.0" encoding="UTF-8" standalone="no"?>
<?xml-stylesheet type="text/xsl" href="configuration.xsl"?><!--
Licensed to the Apache Software Foundation (ASF) under one or more
contributor license agreements. See the NOTICE file distributed with
this work for additional information regarding copyright ownership.
The ASF licenses this file to You under the Apache License, Version 2.0
(the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
-->
<configuration>
<!-- Hive warehouse directory; the default is /user/hive/warehouse -->
<property>
<name>hive.metastore.warehouse.dir</name>
<value>hdfs://bigdata:9820/user/hive/warehouse</value>
</property>
<!-- Print column headers in Hive CLI query output -->
<property>
<name>hive.cli.print.header</name>
<value>true</value>
</property>
<!-- Show the current database name in the Hive CLI prompt -->
<property>
<name>hive.cli.print.current.db</name>
<value>true</value>
</property>
<!-- Enable automatic local mode; default is false -->
<property>
<name>hive.exec.mode.local.auto</name>
<value>true</value>
</property>
<!-- URI for connecting to the remote metastore service -->
<property>
<name>hive.metastore.uris</name>
<value>thrift://bigdata:9083</value>
</property>
<!-- Metastore backed by MySQL -->
<property>
<name>javax.jdo.option.ConnectionURL</name>
<value>jdbc:mysql://192.168.0.170:3306/hive?createDatabaseIfNotExist=true&amp;characterEncoding=utf8&amp;useSSL=false&amp;serverTimezone=UTC&amp;allowPublicKeyRetrieval=true</value>
</property>
<property>
<name>javax.jdo.option.ConnectionUserName</name>
<value>root</value>
</property>
<property>
<name>javax.jdo.option.ConnectionPassword</name>
<value>123456</value>
</property>
<property>
<name>javax.jdo.option.ConnectionDriverName</name>
<value>com.mysql.cj.jdbc.Driver</value>
</property>
</configuration>
yarn-site.xml
<?xml version="1.0"?>
<!--
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. See accompanying LICENSE file.
-->
<configuration>
<property>
<name>yarn.resourcemanager.hostname</name>
<value>bigdata</value>
</property>
<property>
<name>yarn.nodemanager.aux-services</name>
<value>mapreduce_shuffle</value>
</property>
<property>
<name>yarn.nodemanager.vmem-check-enabled</name>
<value>false</value>
</property>
<property>
<name>yarn.resourcemanager.scheduler.class</name>
<value>org.apache.hadoop.yarn.server.resourcemanager.scheduler.fair.FairScheduler</value>
</property>
<property>
<name>yarn.scheduler.fair.preemption</name>
<value>true</value>
</property>
<property>
<name>yarn.scheduler.fair.preemption.cluster-utilization-threshold</name>
<value>1.0</value>
</property>
</configuration>
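With the configuration above in place and the job rerun, a quick sanity check from a spark-shell started with Hive support (a sketch; spark_sql.teacher is the table produced by the join step in the Spark code above):

// Tables registered in the metastore for the spark_sql database
spark.catalog.listTables("spark_sql").show(truncate = false)
// The joined result should now contain rows and live under the HDFS warehouse directory
spark.table("spark_sql.teacher").show(10, truncate = false)
println(spark.table("spark_sql.teacher").count())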