Importing Elasticsearch Data into Hive

There are plenty of tutorials online for importing Hive data into Elasticsearch, but for the reverse direction, Elasticsearch into Hive, I have only found two approaches:

1. Create a Hive table mapped onto the Elasticsearch index, then copy the data into an ordinary Hive table with an INSERT INTO or INSERT OVERWRITE statement (a sketch follows below).

2. Implement the transfer in code.
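For reference, here is a minimal sketch of the first approach as HiveQL run from the Hive CLI. It assumes the elasticsearch-hadoop jar has been added to Hive; the table names, index name, and node address are placeholders for illustration, not values from this project.

-- Mapping table whose rows are served directly from an Elasticsearch index
CREATE EXTERNAL TABLE es_mapped_tbl (ifa STRING, bundles ARRAY<STRING>)
STORED BY 'org.elasticsearch.hadoop.hive.EsStorageHandler'
TBLPROPERTIES ('es.resource' = 'my_index', 'es.nodes' = 'xxx.xxx.xxx.xxx:9200');

-- Copy from the mapping table into an ordinary Hive table
INSERT OVERWRITE TABLE hive_target_tbl
SELECT ifa, bundles FROM es_mapped_tbl;

This post covers the second approach, using Spark to move the data from Elasticsearch to Hive. Without further ado, straight to the code.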
The pom.xml is as follows:

<?xml version="1.0" encoding="UTF-8"?>
<project xmlns="http://maven.apache.org/POM/4.0.0"
         xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
    <modelVersion>4.0.0</modelVersion>
    <groupId>com.taobao.ym_dmp</groupId>
    <artifactId>dmp_tags</artifactId>
    <version>1.0-SNAPSHOT</version>
    <packaging>jar</packaging>

    <properties>
        <scala.version>2.11.0</scala.version>
        <jdk.version>1.8</jdk.version>
    </properties>

    <dependencies>
        <dependency>
            <groupId>net.minidev</groupId>
            <artifactId>json-smart</artifactId>
            <version>2.3</version>
        </dependency>
        <dependency>
            <groupId>junit</groupId>
            <artifactId>junit</artifactId>
            <version>4.4</version>
            <scope>test</scope>
        </dependency>
        <dependency>
            <groupId>org.apache.spark</groupId>
            <artifactId>spark-core_2.11</artifactId>
            <version>2.2.0</version>
        </dependency>
        <dependency>
            <groupId>org.apache.spark</groupId>
            <artifactId>spark-sql_2.11</artifactId>
            <version>2.2.0</version>
        </dependency>
        <dependency>
            <groupId>org.scala-lang</groupId>
            <artifactId>scala-library</artifactId>
            <version>${scala.version}</version>
        </dependency>
        <dependency>
            <groupId>org.apache.httpcomponents</groupId>
            <artifactId>httpclient</artifactId>
            <version>4.5.2</version>
        </dependency>
        <dependency>
            <groupId>org.mongodb.scala</groupId>
            <artifactId>mongo-scala-driver_2.11</artifactId>
            <version>2.1.0</version>
        </dependency>
        <dependency>
            <groupId>org.mongodb.spark</groupId>
            <artifactId>mongo-spark-connector_2.11</artifactId>
            <version>2.2.0</version>
        </dependency>
        <!-- user agent parsing -->
        <dependency>
            <groupId>cz.mallat.uasparser</groupId>
            <artifactId>uasparser</artifactId>
            <version>0.6.2</version>
        </dependency>
        <dependency>
            <groupId>joda-time</groupId>
            <artifactId>joda-time</artifactId>
            <version>2.10.1</version>
        </dependency>
        <dependency>
            <groupId>com.alibaba</groupId>
            <artifactId>fastjson</artifactId>
            <version>1.2.58</version>
        </dependency>
        <dependency>
            <groupId>mysql</groupId>
            <artifactId>mysql-connector-java</artifactId>
            <version>8.0.13</version>
        </dependency>
        <dependency>
            <groupId>com.amazonaws</groupId>
            <artifactId>aws-java-sdk-s3</artifactId>
            <version>1.11.588</version>
        </dependency>
        <dependency>
            <groupId>com.aerospike</groupId>
            <artifactId>aerospike-client</artifactId>
            <version>4.2.0</version>
        </dependency>
        <dependency>
            <groupId>org.elasticsearch</groupId>
            <artifactId>elasticsearch-spark-20_2.11</artifactId>
            <version>7.0.0</version>
        </dependency>
        <!--
        <dependency>
            <groupId>com.github.housepower</groupId>
            <artifactId>clickhouse-native-jdbc</artifactId>
            <version>1.6-stable</version>
        </dependency>

        <dependency>
            <groupId>com.amazonaws</groupId>
            <artifactId>aws-java-sdk-dynamodb</artifactId>
            <version>1.11.534</version>
        </dependency>
        <dependency>
            <groupId>com.amazonaws</groupId>
            <artifactId>amazon-dax-client</artifactId>
            <version>1.0.202289.0</version>
        </dependency>
        -->
        <dependency>
            <groupId>org.elasticsearch.client</groupId>
            <artifactId>elasticsearch-rest-high-level-client</artifactId>
            <version>6.2.4</version>
        </dependency>
    </dependencies>


    <build>
        <sourceDirectory>src/main/scala</sourceDirectory>
        <testSourceDirectory>src/test/scala</testSourceDirectory>
        <plugins>
            <!-- register additional source and resource directories -->
            <plugin>
                <groupId>org.codehaus.mojo</groupId>
                <artifactId>build-helper-maven-plugin</artifactId>
                <version>1.9.1</version>
                <executions>
                    <execution>
                        <id>add-source</id>
                        <phase>generate-sources</phase>
                        <goals>
                            <goal>add-source</goal>
                        </goals>
                        <configuration>
                            <sources>
                                <source>src/main/scala</source>
                                <source>src/main/java</source>
                            </sources>
                        </configuration>
                    </execution>
                </executions>
            </plugin>
            <plugin>
                <groupId>org.apache.maven.plugins</groupId>
                <artifactId>maven-compiler-plugin</artifactId>
                <version>3.1</version>
                <configuration>
                    <source>${jdk.version}</source>
                    <target>${jdk.version}</target>
                </configuration>
            </plugin>
            <plugin>
                <groupId>org.scala-tools</groupId>
                <artifactId>maven-scala-plugin</artifactId>
                <executions>
                    <execution>
                        <goals>
                            <goal>compile</goal>
                            <goal>testCompile</goal>
                        </goals>
                    </execution>
                </executions>
                <configuration>
                    <scalaVersion>${scala.version}</scalaVersion>
                    <args>
                        <arg>-target:jvm-1.8</arg>
                    </args>
                </configuration>
            </plugin>
            <plugin>
                <groupId>org.apache.maven.plugins</groupId>
                <artifactId>maven-shade-plugin</artifactId>
                <version>2.4.1</version>
                <executions>
                    <execution>
                        <phase>package</phase>
                        <goals>
                            <goal>shade</goal>
                        </goals>
                        <configuration>
                            <filters>
                                <filter>
                                    <artifact>*:*</artifact>
                                    <excludes>
                                        <exclude>META-INF/*.SF</exclude>
                                        <exclude>META-INF/*.DSA</exclude>
                                        <exclude>META-INF/*.RSA</exclude>
                                    </excludes>
                                </filter>
                            </filters>
                        </configuration>
                    </execution>
                </executions>
            </plugin>
        </plugins>
    </build>
</project>
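With the shade plugin configured above, mvn clean package should produce a single fat jar (the filter strips the META-INF signature files that would otherwise invalidate a shaded jar), which can then be submitted to the cluster with spark-submit using com.taobao.dmp.impl.Es2Hive as the main class.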

The Spark code is as follows:

package com.taobao.dmp.impl

import java.net.URI

import org.apache.hadoop.fs.{FileSystem, Path}
import org.apache.spark.SparkConf
import org.apache.spark.sql.SparkSession
import org.elasticsearch.spark._

// One Elasticsearch document becomes one row: the ifa plus its bundle and country arrays
case class IfaClass(ifa: String, bundles: Array[String], countrys: Array[String])
object Es2Hive {

 def run(): Unit = {
  try {
   // Build the SparkConf with the Elasticsearch connector settings
   val conf = new SparkConf()

   conf.set("es.nodes", "xxx.xxx.xxx.xxx") // Elasticsearch node IP goes here
   conf.set("es.port", "9200")
   conf.set("es.index.auto.create", "true")
   conf.set("spark.es.nodes.wan.only", "false")
   conf.set("spark.default.parallelism", "750")
   conf.set("es.batch.size.bytes", "50mb")
   conf.set("es.batch.size.entries", "10000")
   conf.set("es.scroll.size", "10000")
   val ss = SparkSession.builder().config(conf).enableHiveSupport().getOrCreate()

   generateUserTagBySpark(ss)

   // Stop Spark
   ss.close()
  } catch {
   case e: Exception =>
    e.printStackTrace()
    throw new Exception("generate audience failed", e)
  }
 }


 def generateUserTagBySpark(ss: SparkSession) = {
  // Query DSL that matches every document in the index
  val queryDsl =
   s"""
      |{
      |  "query": {
      |    "match_all": {}
      |  }
      |}
    """.stripMargin

  // The source data is sharded across eight indices, suffixed _0 through _7
  for (index <- 0 to 7) {
   // Each hit becomes an IfaClass: the document id is the ifa, and the
   // multi-valued bundles/countrys fields are parsed out of their Option wrappers
   val rdd = ss.sparkContext.esRDD(s"t_dmp_idfa_bundle_country_array_tbl_$index", queryDsl)
    .map(line => IfaClass(line._1, assemArr(line._2.get("bundles").toString), assemArr(line._2.get("countrys").toString)))
   import ss.implicits._
   val ifaBundleCountryResult = rdd.toDF()

   println(s"generate final t_dmp_idfa_bundle_country_array_tbl_$index start")
   // Delete the historical output for this index first
   val MediaFilePath = s"s3://www.taobao.com/hive_dataware/dmp/t_dmp_idfa_bundle_country_array_tbl_$index"
   FileSystem.get(new URI("s3://www.taobao.com"), ss.sparkContext.hadoopConfiguration).delete(new Path(MediaFilePath), true)

   // Repartition and write the shard out as ORC files
   ifaBundleCountryResult.repartition(500).write.format("orc").save(MediaFilePath)
   println(s"write to t_dmp_idfa_bundle_country_array_tbl_$index success")
   // Blocking unpersist (a no-op unless the DataFrame was cached)
   ifaBundleCountryResult.unpersist(true)
  }
 }

 def assemArr(assSr: String): Array[String] = {
  // The connector returns multi-valued fields as Option[Buffer]; its toString looks
  // like "Some(Buffer(a, b, c))". Strip the wrappers, split on commas, and trim the
  // leading spaces left by the Buffer's ", " separator. Note this string parsing
  // breaks if a value itself contains ',' or ')'.
  assSr.replace("Some(Buffer(", "").replace(")", "").split(",").map(_.trim)
 }

 def main(args: Array[String]): Unit = {

  run()
 }
}
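Note that this job only writes ORC files to S3; to complete the Elasticsearch-to-Hive import, the files still need to be registered in Hive. A minimal sketch of that final step for shard 0, assuming column types matching the IfaClass case class (the DDL is illustrative, not from the original project):

CREATE EXTERNAL TABLE IF NOT EXISTS t_dmp_idfa_bundle_country_array_tbl_0 (
  ifa STRING,
  bundles ARRAY<STRING>,
  countrys ARRAY<STRING>
)
STORED AS ORC
LOCATION 's3://www.taobao.com/hive_dataware/dmp/t_dmp_idfa_bundle_country_array_tbl_0';

The same statement, with suffixes _1 through _7, covers the remaining output paths.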