There are plenty of tutorials online for importing Hive data into Elasticsearch, but for the reverse direction, Elasticsearch into Hive, my searching turned up two approaches:
1. Create a Hive table mapped to Elasticsearch, then use an INSERT INTO or INSERT OVERWRITE statement to copy the data into an ordinary Hive table (a sketch of this is shown below).
2. Do it in code.
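For reference, approach 1 usually looks like the HiveQL sketch below. This is only a sketch: it assumes the elasticsearch-hadoop jar has been added to Hive, and the jar path, mapping table name, and target table t_dmp_user_hive_tbl are made up for illustration.
-- Run in Hive; requires the elasticsearch-hadoop jar (path here is an assumption).
ADD JAR /path/to/elasticsearch-hadoop.jar;
-- External table mapped onto the ES index.
CREATE EXTERNAL TABLE es_user_mapping (
  ifa      STRING,
  bundles  ARRAY<STRING>,
  countrys ARRAY<STRING>
)
STORED BY 'org.elasticsearch.hadoop.hive.EsStorageHandler'
TBLPROPERTIES (
  'es.resource' = 't_dmp_idfa_bundle_country_array_tbl_0',
  'es.nodes'    = 'xxx.xxx.xxx.xxx:9200'
);
-- Copy from the mapping table into an ordinary Hive table.
INSERT OVERWRITE TABLE t_dmp_user_hive_tbl
SELECT ifa, bundles, countrys FROM es_user_mapping;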
This post walks through the second approach, using Spark to move data from Elasticsearch into Hive. Straight to the code.
The pom.xml is as follows:
<?xml version="1.0" encoding="UTF-8"?>
<project xmlns="http://maven.apache.org/POM/4.0.0"
         xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
    <modelVersion>4.0.0</modelVersion>
    <groupId>com.taobao.ym_dmp</groupId>
    <artifactId>dmp_tags</artifactId>
    <version>1.0-SNAPSHOT</version>
    <packaging>jar</packaging>

    <properties>
        <scala.version>2.11.0</scala.version>
        <jdk.version>1.8</jdk.version>
    </properties>

    <dependencies>
        <dependency>
            <groupId>net.minidev</groupId>
            <artifactId>json-smart</artifactId>
            <version>2.3</version>
        </dependency>
        <dependency>
            <groupId>junit</groupId>
            <artifactId>junit</artifactId>
            <version>4.4</version>
            <scope>test</scope>
        </dependency>
        <dependency>
            <groupId>org.apache.spark</groupId>
            <artifactId>spark-core_2.11</artifactId>
            <version>2.2.0</version>
        </dependency>
        <dependency>
            <groupId>org.apache.spark</groupId>
            <artifactId>spark-sql_2.11</artifactId>
            <version>2.2.0</version>
        </dependency>
        <dependency>
            <groupId>org.scala-lang</groupId>
            <artifactId>scala-library</artifactId>
            <version>${scala.version}</version>
        </dependency>
        <dependency>
            <groupId>org.apache.httpcomponents</groupId>
            <artifactId>httpclient</artifactId>
            <version>4.5.2</version>
        </dependency>
        <dependency>
            <groupId>org.mongodb.scala</groupId>
            <artifactId>mongo-scala-driver_2.11</artifactId>
            <version>2.1.0</version>
        </dependency>
        <dependency>
            <groupId>org.mongodb.spark</groupId>
            <artifactId>mongo-spark-connector_2.11</artifactId>
            <version>2.2.0</version>
        </dependency>
        <!-- user agent parsing -->
        <dependency>
            <groupId>cz.mallat.uasparser</groupId>
            <artifactId>uasparser</artifactId>
            <version>0.6.2</version>
        </dependency>
        <dependency>
            <groupId>joda-time</groupId>
            <artifactId>joda-time</artifactId>
            <version>2.10.1</version>
        </dependency>
        <dependency>
            <groupId>com.alibaba</groupId>
            <artifactId>fastjson</artifactId>
            <version>1.2.58</version>
        </dependency>
        <dependency>
            <groupId>mysql</groupId>
            <artifactId>mysql-connector-java</artifactId>
            <version>8.0.13</version>
        </dependency>
        <dependency>
            <groupId>com.amazonaws</groupId>
            <artifactId>aws-java-sdk-s3</artifactId>
            <version>1.11.588</version>
        </dependency>
        <dependency>
            <groupId>com.aerospike</groupId>
            <artifactId>aerospike-client</artifactId>
            <version>4.2.0</version>
        </dependency>
        <!-- the es-hadoop Spark connector that provides esRDD below -->
        <dependency>
            <groupId>org.elasticsearch</groupId>
            <artifactId>elasticsearch-spark-20_2.11</artifactId>
            <version>7.0.0</version>
        </dependency>
        <!--
        <dependency>
            <groupId>com.github.housepower</groupId>
            <artifactId>clickhouse-native-jdbc</artifactId>
            <version>1.6-stable</version>
        </dependency>
        <dependency>
            <groupId>com.amazonaws</groupId>
            <artifactId>aws-java-sdk-dynamodb</artifactId>
            <version>1.11.534</version>
        </dependency>
        <dependency>
            <groupId>com.amazonaws</groupId>
            <artifactId>amazon-dax-client</artifactId>
            <version>1.0.202289.0</version>
        </dependency>
        -->
        <dependency>
            <groupId>org.elasticsearch.client</groupId>
            <artifactId>elasticsearch-rest-high-level-client</artifactId>
            <version>6.2.4</version>
        </dependency>
    </dependencies>

    <build>
        <sourceDirectory>src/main/scala</sourceDirectory>
        <testSourceDirectory>src/test/scala</testSourceDirectory>
        <plugins>
            <!-- register additional source directories -->
            <plugin>
                <groupId>org.codehaus.mojo</groupId>
                <artifactId>build-helper-maven-plugin</artifactId>
                <version>1.9.1</version>
                <executions>
                    <execution>
                        <id>add-source</id>
                        <phase>generate-sources</phase>
                        <goals>
                            <goal>add-source</goal>
                        </goals>
                        <configuration>
                            <sources>
                                <source>src/main/scala</source>
                                <source>src/main/java</source>
                            </sources>
                        </configuration>
                    </execution>
                </executions>
            </plugin>
            <plugin>
                <groupId>org.apache.maven.plugins</groupId>
                <artifactId>maven-compiler-plugin</artifactId>
                <version>3.1</version>
                <configuration>
                    <source>${jdk.version}</source>
                    <target>${jdk.version}</target>
                </configuration>
            </plugin>
            <plugin>
                <groupId>org.scala-tools</groupId>
                <artifactId>maven-scala-plugin</artifactId>
                <executions>
                    <execution>
                        <goals>
                            <goal>compile</goal>
                            <goal>testCompile</goal>
                        </goals>
                    </execution>
                </executions>
                <configuration>
                    <scalaVersion>${scala.version}</scalaVersion>
                    <args>
                        <!-- Scala 2.11 no longer supports -target:jvm-1.5; target 1.8 to match jdk.version -->
                        <arg>-target:jvm-1.8</arg>
                    </args>
                </configuration>
            </plugin>
            <!-- shade plugin: build a fat jar, stripping jar signature files -->
            <plugin>
                <groupId>org.apache.maven.plugins</groupId>
                <artifactId>maven-shade-plugin</artifactId>
                <version>2.4.1</version>
                <executions>
                    <execution>
                        <phase>package</phase>
                        <goals>
                            <goal>shade</goal>
                        </goals>
                        <configuration>
                            <filters>
                                <filter>
                                    <artifact>*:*</artifact>
                                    <excludes>
                                        <exclude>META-INF/*.SF</exclude>
                                        <exclude>META-INF/*.DSA</exclude>
                                        <exclude>META-INF/*.RSA</exclude>
                                    </excludes>
                                </filter>
                            </filters>
                        </configuration>
                    </execution>
                </executions>
            </plugin>
        </plugins>
    </build>
</project>
The Spark code is as follows:
package com.taobao.dmp.impl

import java.net.URI

import org.apache.hadoop.fs.{FileSystem, Path}
import org.apache.spark.SparkConf
import org.apache.spark.sql.SparkSession
import org.elasticsearch.spark._

// One ES document maps to one row: the device id plus its bundle and country arrays.
case class IfaClass(ifa: String, bundles: Array[String], countrys: Array[String])
object Es2Hive {

  def run(): Unit = {
    try {
      // Build the SparkSession. The es.* settings configure the elasticsearch-hadoop
      // connector: batch and scroll sizes control how much each scroll request pulls.
      val conf = new SparkConf()
      conf.set("es.nodes", "xxx.xxx.xxx.xxx") // Elasticsearch node IP goes here
      conf.set("es.port", "9200")
      conf.set("es.index.auto.create", "true")
      conf.set("spark.es.nodes.wan.only", "false")
      conf.set("spark.default.parallelism", "750")
      conf.set("es.batch.size.bytes", "50mb")
      conf.set("es.batch.size.entries", "10000")
      conf.set("es.scroll.size", "10000")
      val ss = SparkSession.builder().config(conf).enableHiveSupport().getOrCreate()

      generateUserTagBySpark(ss)

      // Stop Spark.
      ss.close()
    } catch {
      case e: Exception =>
        e.printStackTrace()
        throw new Exception("generate audience failed", e)
    }
  }
  def generateUserTagBySpark(ss: SparkSession): Unit = {
    val queryDsl =
      s"""
         |{
         |  "query": {
         |    "match_all": {}
         |  }
         |}
       """.stripMargin
    for (index <- 0 to 7) {
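      // esRDD yields (documentId, fieldMap) tuples; Map.get wraps each field in an
      // Option, which is why the values stringify as "Some(Buffer(...))" below.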
      val rdd = ss.sparkContext.esRDD(s"t_dmp_idfa_bundle_country_array_tbl_$index", queryDsl)
        .map(line => IfaClass(
          line._1,
          assemArr(line._2.get("bundles").toString),
          assemArr(line._2.get("countrys").toString)))
      import ss.implicits._
      val ifaBundleCountryResult = rdd.toDF()
      println(s"generate final t_dmp_idfa_bundle_country_array_tbl_$index start")

      // Delete the historical data for this index before writing the new snapshot.
      val MediaFilePath = s"s3://www.taobao.com/hive_dataware/dmp/t_dmp_idfa_bundle_country_array_tbl_$index"
      FileSystem.get(new URI("s3://www.taobao.com"), ss.sparkContext.hadoopConfiguration)
        .delete(new Path(MediaFilePath), true)

      ifaBundleCountryResult.repartition(500).write.format("orc").save(MediaFilePath)
      println(s"write to t_dmp_idfa_bundle_country_array_tbl_$index success")
      ifaBundleCountryResult.unpersist(true)
    }
  }
  // Parse a stringified field value of the form "Some(Buffer(a, b, c))" into
  // Array("a", "b", "c"); the trim strips the spaces left behind after the commas.
  def assemArr(assSr: String): Array[String] = {
    assSr.replace("Some(Buffer(", "").replace(")", "").split(",").map(_.trim)
  }
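  // Note (assumption): if bundles/countrys are indexed as arrays in ES, setting
  //   conf.set("es.read.field.as.array.include", "bundles,countrys")
  // would make the connector return real arrays, and this string parsing could go away.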
  def main(args: Array[String]): Unit = {
    run()
  }
}
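One last note: the job above only writes ORC files to S3; Hive still needs a table over each output path before the data is queryable. A minimal HiveQL sketch, assuming a hypothetical dmp database and the schema used above (repeat, or script, for index suffixes 0 through 7):
CREATE EXTERNAL TABLE IF NOT EXISTS dmp.t_dmp_idfa_bundle_country_array_tbl_0 (
  ifa      STRING,
  bundles  ARRAY<STRING>,
  countrys ARRAY<STRING>
)
STORED AS ORC
LOCATION 's3://www.taobao.com/hive_dataware/dmp/t_dmp_idfa_bundle_country_array_tbl_0';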