There are plenty of tutorials online for importing Hive data into Elasticsearch, but for the reverse direction, Elasticsearch into Hive, my searching turned up two approaches:
1. Create a Hive table mapped to Elasticsearch, then use an INSERT INTO or INSERT OVERWRITE statement to copy the data into an ordinary Hive table (a sketch of this is shown below).
2. Do it in code.
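For reference, approach 1 usually looks like the HiveQL sketch below. This is only a sketch: it assumes the elasticsearch-hadoop jar has been added to Hive, and the jar path, mapping table name, and target table t_dmp_user_hive_tbl are made up for illustration.
-- Run in Hive; requires the elasticsearch-hadoop jar (path here is an assumption).
ADD JAR /path/to/elasticsearch-hadoop.jar;
-- External table mapped onto the ES index.
CREATE EXTERNAL TABLE es_user_mapping (
  ifa      STRING,
  bundles  ARRAY<STRING>,
  countrys ARRAY<STRING>
)
STORED BY 'org.elasticsearch.hadoop.hive.EsStorageHandler'
TBLPROPERTIES (
  'es.resource' = 't_dmp_idfa_bundle_country_array_tbl_0',
  'es.nodes'    = 'xxx.xxx.xxx.xxx:9200'
);
-- Copy from the mapping table into an ordinary Hive table.
INSERT OVERWRITE TABLE t_dmp_user_hive_tbl
SELECT ifa, bundles, countrys FROM es_user_mapping;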
This post walks through the second approach, using Spark to move data from Elasticsearch into Hive. Straight to the code.
The pom.xml is as follows:
<?xml version="1.0" encoding="UTF-8"?>
<project xmlns="http://maven.apache.org/POM/4.0.0"
         xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
    <modelVersion>4.0.0</modelVersion>
    <groupId>com.taobao.ym_dmp</groupId>
    <artifactId>dmp_tags</artifactId>
    <version>1.0-SNAPSHOT</version>
    <packaging>jar</packaging>

    <properties>
        <scala.version>2.11.0</scala.version>
        <jdk.version>1.8</jdk.version>
    </properties>

    <dependencies>
        <dependency>
            <groupId>net.minidev</groupId>
            <artifactId>json-smart</artifactId>
            <version>2.3</version>
        </dependency>
        <dependency>
            <groupId>junit</groupId>
            <artifactId>junit</artifactId>
            <version>4.4</version>
            <scope>test</scope>
        </dependency>
        <dependency>
            <groupId>org.apache.spark</groupId>
            <artifactId>spark-core_2.11</artifactId>
            <version>2.2.0</version>
        </dependency>
        <dependency>
            <groupId>org.apache.spark</groupId>
            <artifactId>spark-sql_2.11</artifactId>
            <version>2.2.0</version>
        </dependency>
        <dependency>
            <groupId>org.scala-lang</groupId>
            <artifactId>scala-library</artifactId>
            <version>${scala.version}</version>
        </dependency>
        <dependency>
            <groupId>org.apache.httpcomponents</groupId>
            <artifactId>httpclient</artifactId>
            <version>4.5.2</version>
        </dependency>
        <dependency>
            <groupId>org.mongodb.scala</groupId>
            <artifactId>mongo-scala-driver_2.11</artifactId>
            <version>2.1.0</version>
        </dependency>
        <dependency>
            <groupId>org.mongodb.spark</groupId>
            <artifactId>mongo-spark-connector_2.11</artifactId>
            <version>2.2.0</version>
        </dependency>
        <!-- user agent parsing -->
        <dependency>
            <groupId>cz.mallat.uasparser</groupId>
            <artifactId>uasparser</artifactId>
            <version>0.6.2</version>
        </dependency>
        <dependency>
            <groupId>joda-time</groupId>
            <artifactId>joda-time</artifactId>
            <version>2.10.1</version>
        </dependency>
        <dependency>
            <groupId>com.alibaba</groupId>
            <artifactId>fastjson</artifactId>
            <version>1.2.58</version>
        </dependency>
        <dependency>
            <groupId>mysql</groupId>
            <artifactId>mysql-connector-java</artifactId>
            <version>8.0.13</version>
        </dependency>
        <dependency>
            <groupId>com.amazonaws</groupId>
            <artifactId>aws-java-sdk-s3</artifactId>
            <version>1.11.588</version>
        </dependency>
        <dependency>
            <groupId>com.aerospike</groupId>
            <artifactId>aerospike-client</artifactId>
            <version>4.2.0</version>
        </dependency>
        <!-- the es-hadoop Spark connector that provides esRDD below -->
        <dependency>
            <groupId>org.elasticsearch</groupId>
            <artifactId>elasticsearch-spark-20_2.11</artifactId>
            <version>7.0.0</version>
        </dependency>
        <!--
        <dependency>
            <groupId>com.github.housepower</groupId>
            <artifactId>clickhouse-native-jdbc</artifactId>
            <version>1.6-stable</version>
        </dependency>
        <dependency>
            <groupId>com.amazonaws</groupId>
            <artifactId>aws-java-sdk-dynamodb</artifactId>
            <version>1.11.534</version>
        </dependency>
        <dependency>
            <groupId>com.amazonaws</groupId>
            <artifactId>amazon-dax-client</artifactId>
            <version>1.0.202289.0</version>
        </dependency>
        -->
        <dependency>
            <groupId>org.elasticsearch.client</groupId>
            <artifactId>elasticsearch-rest-high-level-client</artifactId>
            <version>6.2.4</version>
        </dependency>
    </dependencies>

    <build>
        <sourceDirectory>src/main/scala</sourceDirectory>
        <testSourceDirectory>src/test/scala</testSourceDirectory>
        <plugins>
            <!-- register additional source directories -->
            <plugin>
                <groupId>org.codehaus.mojo</groupId>
                <artifactId>build-helper-maven-plugin</artifactId>
                <version>1.9.1</version>
                <executions>
                    <execution>
                        <id>add-source</id>
                        <phase>generate-sources</phase>
                        <goals>
                            <goal>add-source</goal>
                        </goals>
                        <configuration>
                            <sources>
                                <source>src/main/scala</source>
                                <source>src/main/java</source>
                            </sources>
                        </configuration>
                    </execution>
                </executions>
            </plugin>
            <plugin>
                <groupId>org.apache.maven.plugins</groupId>
                <artifactId>maven-compiler-plugin</artifactId>
                <version>3.1</version>
                <configuration>
                    <source>${jdk.version}</source>
                    <target>${jdk.version}</target>
                </configuration>
            </plugin>
            <plugin>
                <groupId>org.scala-tools</groupId>
                <artifactId>maven-scala-plugin</artifactId>
                <executions>
                    <execution>
                        <goals>
                            <goal>compile</goal>
                            <goal>testCompile</goal>
                        </goals>
                    </execution>
                </executions>
                <configuration>
                    <scalaVersion>${scala.version}</scalaVersion>
                    <args>
                        <!-- Scala 2.11 no longer supports -target:jvm-1.5; target 1.8 to match jdk.version -->
                        <arg>-target:jvm-1.8</arg>
                    </args>
                </configuration>
            </plugin>
            <!-- shade plugin: build a fat jar, stripping jar signature files -->
            <plugin>
                <groupId>org.apache.maven.plugins</groupId>
                <artifactId>maven-shade-plugin</artifactId>
                <version>2.4.1</version>
                <executions>
                    <execution>
                        <phase>package</phase>
                        <goals>
                            <goal>shade</goal>
                        </goals>
                        <configuration>
                            <filters>
                                <filter>
                                    <artifact>*:*</artifact>
                                    <excludes>
                                        <exclude>META-INF/*.SF</exclude>
                                        <exclude>META-INF/*.DSA</exclude>
                                        <exclude>META-INF/*.RSA</exclude>
                                    </excludes>
                                </filter>
                            </filters>
                        </configuration>
                    </execution>
                </executions>
            </plugin>
        </plugins>
    </build>
</project>
The Spark code is as follows:
package com.taobao.dmp.impl

import java.net.URI

import org.apache.hadoop.fs.{FileSystem, Path}
import org.apache.spark.SparkConf
import org.apache.spark.sql.SparkSession
import org.elasticsearch.spark._

// One ES document maps to one row: the device id plus its bundle and country arrays.
case class IfaClass(ifa: String, bundles: Array[String], countrys: Array[String])
object Es2Hive {

  def run(): Unit = {
    try {
      // Build the SparkSession. The es.* settings configure the elasticsearch-hadoop
      // connector: batch and scroll sizes control how much each scroll request pulls.
      val conf = new SparkConf()
      conf.set("es.nodes", "xxx.xxx.xxx.xxx") // Elasticsearch node IP goes here
      conf.set("es.port", "9200")
      conf.set("es.index.auto.create", "true")
      conf.set("spark.es.nodes.wan.only", "false")
      conf.set("spark.default.parallelism", "750")
      conf.set("es.batch.size.bytes", "50mb")
      conf.set("es.batch.size.entries", "10000")
      conf.set("es.scroll.size", "10000")
      val ss = SparkSession.builder().config(conf).enableHiveSupport().getOrCreate()

      generateUserTagBySpark(ss)

      // Stop Spark.
      ss.close()
    } catch {
      case e: Exception =>
        e.printStackTrace()
        throw new Exception("generate audience failed", e)
    }
  }
  def generateUserTagBySpark(ss: SparkSession): Unit = {
    val queryDsl =
      s"""
         |{
         |  "query": {
         |    "match_all": {}
         |  }
         |}
       """.stripMargin
    for (index <- 0 to 7) {
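      // esRDD yields (documentId, fieldMap) tuples; Map.get wraps each field in an
      // Option, which is why the values stringify as "Some(Buffer(...))" below.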
      val rdd = ss.sparkContext.esRDD(s"t_dmp_idfa_bundle_country_array_tbl_$index", queryDsl)
        .map(line => IfaClass(
          line._1,
          assemArr(line._2.get("bundles").toString),
          assemArr(line._2.get("countrys").toString)))
      import ss.implicits._
      val ifaBundleCountryResult = rdd.toDF()
      println(s"generate final t_dmp_idfa_bundle_country_array_tbl_$index start")

      // Delete the historical data for this index before writing the new snapshot.
      val MediaFilePath = s"s3://www.taobao.com/hive_dataware/dmp/t_dmp_idfa_bundle_country_array_tbl_$index"
      FileSystem.get(new URI("s3://www.taobao.com"), ss.sparkContext.hadoopConfiguration)
        .delete(new Path(MediaFilePath), true)

      ifaBundleCountryResult.repartition(500).write.format("orc").save(MediaFilePath)
      println(s"write to t_dmp_idfa_bundle_country_array_tbl_$index success")
      ifaBundleCountryResult.unpersist(true)
    }
  }
  // Parse a stringified field value of the form "Some(Buffer(a, b, c))" into
  // Array("a", "b", "c"); the trim strips the spaces left behind after the commas.
  def assemArr(assSr: String): Array[String] = {
    assSr.replace("Some(Buffer(", "").replace(")", "").split(",").map(_.trim)
  }
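  // Note (assumption): if bundles/countrys are indexed as arrays in ES, setting
  //   conf.set("es.read.field.as.array.include", "bundles,countrys")
  // would make the connector return real arrays, and this string parsing could go away.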
  def main(args: Array[String]): Unit = {
    run()
  }
}
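One last note: the job above only writes ORC files to S3; Hive still needs a table over each output path before the data is queryable. A minimal HiveQL sketch, assuming a hypothetical dmp database and the schema used above (repeat, or script, for index suffixes 0 through 7):
CREATE EXTERNAL TABLE IF NOT EXISTS dmp.t_dmp_idfa_bundle_country_array_tbl_0 (
  ifa      STRING,
  bundles  ARRAY<STRING>,
  countrys ARRAY<STRING>
)
STORED AS ORC
LOCATION 's3://www.taobao.com/hive_dataware/dmp/t_dmp_idfa_bundle_country_array_tbl_0';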