sparkSql写入starRocks

官方示例默认按逗号切割字段。但如果字段的值本身包含逗号,就会导致字段错位,多出来的值还会使导入报错。

故这里不再使用官方默认的逗号切割,改为用 mkString 以自定义分隔符(如“***********”)拼接各字段。

以下是完整代码。

package com.qucheng.bigdata.util
import java.io.{BufferedReader, InputStreamReader}
import java.nio.charset.{Charset, StandardCharsets}

import org.apache.commons.net.util.Base64
import org.apache.http.HttpHeaders
import org.apache.http.client.config.RequestConfig
import org.apache.http.client.methods.{CloseableHttpResponse, HttpPut}
import org.apache.http.entity.StringEntity
import org.apache.http.impl.client.{CloseableHttpClient, DefaultConnectionKeepAliveStrategy, DefaultRedirectStrategy, HttpClientBuilder}
import org.joda.time.DateTime
import sun.nio.cs.ext.GBK

/**
  * Static helpers for sending a text payload to a Stream Load HTTP endpoint
  * with an HTTP PUT.
  *
  * The credentials passed to [[clientGen]] are stored in the mutable fields
  * `userName` / `password` of this object and read back by [[put]], so all
  * callers inside one JVM share the most recently supplied credentials.
  */
object PutUtil {
  var CHARSET = "UTF-8"
  //  var CHARSET = "GBK"
  val BINARY_CT = "application/octet-stream"
  var CONTENT_TYPE = s"text/plain; charset=${CHARSET}"
  // Used for the connect, connection-request and socket timeouts (milliseconds).
  var TIMEOUT = 30000
  var userName = "root"
  var password = ""

  /**
    * Builds the value of an HTTP Basic `Authorization` header.
    *
    * @param username account name
    * @param password account password (may be empty)
    * @return `"Basic "` followed by the Base64 encoding of `username:password`
    */
  def basicAuthHeader(username: String, password: String): String = {
    val credentials = (username + ":" + password).getBytes(StandardCharsets.UTF_8)
    // The JDK encoder emits unchunked standard Base64, so no third-party helper is needed.
    "Basic " + java.util.Base64.getEncoder.encodeToString(credentials)
  }

  /**
    * Remembers the credentials for later [[put]] calls and creates a new HTTP
    * client that follows redirects for every HTTP method.
    *
    * @param user   account name stored in `userName`
    * @param passwd account password stored in `password`
    * @return a new client; the caller is responsible for closing it
    */
  def clientGen(user: String = "root", passwd: String = ""): CloseableHttpClient = {
    this.userName = user
    this.password = passwd
    HttpClientBuilder.create()
      .setKeepAliveStrategy(new DefaultConnectionKeepAliveStrategy)
      .setRedirectStrategy(new DefaultRedirectStrategy() { // refer to linux cmd: curl --location-trusted
        // Redirect every method, PUT included.
        override def isRedirectable(method: String): Boolean = true
      })
      .build()
  }

  /**
    * Reads the whole response body as text, trimming each line and joining
    * the lines without a separator.
    *
    * The body is decoded as UTF-8 rather than with the JVM default charset so
    * the result does not depend on the host locale. A response without an
    * entity yields an empty string.
    */
  private def readBody(response: CloseableHttpResponse): String = {
    val entity = response.getEntity
    if (entity == null) ""
    else {
      val reader = new BufferedReader(
        new InputStreamReader(entity.getContent(), StandardCharsets.UTF_8))
      try {
        Iterator.continually(reader.readLine()).takeWhile(_ != null).map(_.trim).mkString
      } finally {
        reader.close()
      }
    }
  }

  /**
    * HTTP put
    *
    * @param httpclient   client used to execute the request; it is not closed here
    * @param payload      data to send(string)
    * @param api          Stream load API
    * @param contentType  json/ binary type to PUT
    * @param headers      extra request headers; `null` or empty means none
    * @param debug        when true, prints the status line, the request headers and
    *                     the response body (this consumes the response entity)
    * @param showPayLoad  when true (and `debug` is true), also prints the payload
    * @return `(status, httpclient, response)`. `status` is false only when an
    *         exception was thrown; the HTTP status code and the response body
    *         are not inspected. `response` is null when the request failed
    *         before a response was received. The caller closes `response`.
    */
  def put(httpclient: CloseableHttpClient,
          payload: String,
          api: String,
          contentType: String = CONTENT_TYPE,
          headers: Map[String, String] = null,
          debug: Boolean = true,
          showPayLoad: Boolean = false): (Boolean, CloseableHttpClient, CloseableHttpResponse) = {
    var response: CloseableHttpResponse = null
    var status = true
    try {
      val httpPut = new HttpPut(api)
      val requestConfig = RequestConfig.custom()
        .setAuthenticationEnabled(true)
        .setCircularRedirectsAllowed(true)
        .setRedirectsEnabled(true)
        .setRelativeRedirectsAllowed(true)
        .setExpectContinueEnabled(true)
        .setConnectTimeout(TIMEOUT).setConnectionRequestTimeout(TIMEOUT)
        .setSocketTimeout(TIMEOUT).build()
      httpPut.setConfig(requestConfig)
      httpPut.setHeader(HttpHeaders.EXPECT, "100-continue") // .setExpectContinueEnabled(true)
      httpPut.setHeader(HttpHeaders.AUTHORIZATION, basicAuthHeader(this.userName, this.password)) // Authorization: Basic cm9vdDo=

      // `headers` defaults to null, so guard before iterating.
      Option(headers).foreach(_.foreach { case (name, value) => httpPut.setHeader(name, value) })

      val content = new StringEntity(payload, Charset.forName(CHARSET))
      content.setContentType(contentType)
      // NOTE(review): this sets the Content-Encoding header to the charset name;
      // kept as in the original request, confirm the server relies on it.
      content.setContentEncoding(CHARSET)
      httpPut.setEntity(content)
      response = httpclient.execute(httpPut)

      if (debug) {
        println("### Debug: " + new DateTime())
        println(response.getStatusLine())
        val body = readBody(response)
        httpPut.getAllHeaders.foreach(println)
        println(body)
        println("### payload: ")
        if (showPayLoad) println(payload)
      }
    } catch {
      case ex: Exception =>
        println(s"### post err: @ ${new DateTime().toLocalDateTime()}")
        ex.printStackTrace()
        status = false
    }
    (status, httpclient, response)
  }

  /**
    * Sends one demo payload with a fresh client and releases both the
    * response and the client afterwards, even when the request failed.
    */
  private def runDemo(api: String, payload: String, headers: Map[String, String]): Unit = {
    val client = clientGen()
    try {
      val response = put(client, payload, api, this.CONTENT_TYPE, headers, true)._3
      // `put` returns a null response when the request could not be executed.
      if (response != null) response.close()
    } finally {
      client.close()
    }
  }

  /**
    * PutUtil Object mainly handles static funcs for Spark
    *
    * PutUtil.main() runs locally can help to test putting payload into stream load API
    *    Args for this demo:
    *    - hostname: master1
    *    - fe http port: 8030
    *    - database name: starrocks_demo
    *    - table names: demo1_dup_tb1 and demo1_agg_tb2
    *    - TODO customize above args to fit your environment.
    */
  def main(args: Array[String]): Unit = {
    // duplicate table1
    // cols: date, hour, minute, name , metric
    val api = "http://master1:8030/api/starrocks_demo/demo1_dup_tb1/_stream_load"
    val payload = "20190903_11_1_tom_130\n20190903_11_2_jerry_838"
    val headers = Map(
      //"label"->"label123"
      "max_filter_ratio" -> "0.2",
      "columns" -> "date,hour,minute,username,visit",
      "column_separator" -> "_"
    )
    runDemo(api, payload, headers)

    // agg replace col table2
    // cols: id, name , metric
    val api2 = "http://master1:8030/api/starrocks_demo/demo1_agg_tb2/_stream_load"
    val payload2 = "1_tom_313\n1_tom_318"
    val headers2 = Map(
      //"label"->"label123"
      "max_filter_ratio" -> "0.2",
      "columns" -> "siteid,username,visit",
      "column_separator" -> "_"
    )
    runDemo(api2, payload2, headers2)
  }
}
/** Separator constants shared by the loading code. */
object Consts {
  // Not referenced in the visible code; presumably joins date parts — TODO confirm.
  val dateSep: String = "-"

  // Sent as the Stream Load `column_separator` and used to join row fields.
  val starrocksSep: String = "\t"
}

package com.qucheng.bigdata.util

import org.apache.http.client.methods.CloseableHttpResponse
import org.apache.http.impl.client.CloseableHttpClient

/**
  * Sends text payloads to one table's Stream Load endpoint through [[PutUtil]].
  *
  * @param headers     extra HTTP headers passed on every request
  * @param dbName      target database, used to build the endpoint URL
  * @param userName    account name handed to `PutUtil.clientGen`
  * @param password    account password handed to `PutUtil.clientGen`
  * @param tblName     target table, used to build the endpoint URL
  * @param hostName    host part of the endpoint URL
  * @param port        port part of the endpoint URL
  * @param debug       forwarded to `PutUtil.put`
  * @param showPayLoad forwarded to `PutUtil.put`
  */
class MySrSink(headers:Map[String,String],
               dbName:String,
               userName:String,
               password:String,
               tblName:String,
               hostName:String,
               port:Int = 18030,
               debug:Boolean = true, showPayLoad: Boolean = false) extends Serializable {
  val CHARSET = "UTF-8"
  val BINARY_CT = "application/octet-stream"
  val CONTENT_TYPE = "text/plain"
  var TIMEOUT = 30000

  var api = s"http://${hostName}:${port}/api/${dbName}/${tblName}/_stream_load"
  // The HTTP objects are not serializable; keep them out of the serialized form
  // of this Serializable class.
  @transient var httpClient: CloseableHttpClient = _
  @transient var response: CloseableHttpResponse = _
  var status: Boolean = true

  /**
    * Closes a resource if it is set, reporting (not propagating) any failure
    * so that one failed close never prevents the next one.
    */
  private def closeQuietly(resource: java.io.Closeable): Unit = {
    if (resource != null) {
      try {
        resource.close()
      } catch {
        case ex: Exception =>
          println("### http close ERROR:")
          ex.printStackTrace()
      }
    }
  }

  /**
    * Sends `value` as one Stream Load request with a freshly created client,
    * records the outcome in `status` / `response`, and always releases the
    * response and the client afterwards.
    *
    * @param value payload to send; rows and columns must already be joined
    *              with the separators the endpoint expects
    */
  def invoke(value: String): Unit = {
    // Forget the previous call's response so it is not closed or reported again.
    response = null
    httpClient = PutUtil.clientGen(userName, password)
    try {
      val (ok, client, resp) =
        PutUtil.put(httpClient, value, api, CONTENT_TYPE, headers, debug, showPayLoad)
      status = ok
      httpClient = client
      response = resp
    } catch {
      case ex: Exception =>
        // Do not leave a stale success flag from an earlier call.
        status = false
        println("### invoke ERROR:")
        ex.printStackTrace()
    } finally {
      // Release the response before the client that owns its connection;
      // `response` is null when the request failed before a reply arrived.
      closeQuietly(response)
      closeQuietly(httpClient)
    }
  }
}

这里修改官方原来切割字段的代码

/**
  * Writes a DataFrame to a StarRocks table through Stream Load, sending one
  * HTTP request per non-empty partition.
  *
  * Every row is rendered as its fields joined with `Consts.starrocksSep`, and
  * the rows of a partition are joined with "\n". Fields are joined directly
  * with the final separator: going through a temporary "***********" token
  * and replacing it afterwards shifts characters between columns whenever a
  * value starts or ends with '*' or contains the token.
  *
  * NOTE(review): a value that itself contains the column separator or a line
  * break will still break the row layout, and `Row.mkString` renders null
  * fields as the text "null" — confirm both are acceptable for the target table.
  *
  * @param spark     session whose implicits provide the String encoder for `map`
  * @param sourceDF  data to load
  * @param Datebase  target in the form "database.table"
  * @param partition number of partitions to repartition into before loading
  * @throws IllegalArgumentException if `Datebase` is not of the form "database.table"
  */
def dfWriteStarrocks(spark :SparkSession,sourceDF: DataFrame, Datebase: String,partition:Int): Unit = {

  val dbAndTable: Array[String] = Datebase.split("\\.")
  // Fail on the driver with a clear message instead of an index error on the executors.
  require(
    dbAndTable.length >= 2 && dbAndTable(0).nonEmpty && dbAndTable(1).nonEmpty,
    s"Expected target in the form 'database.table' but got: $Datebase")
  val dbName = dbAndTable(0)
  val tblName = dbAndTable(1)

  // Needed for the implicit Encoder[String] used by map.
  import spark.implicits._
  sourceDF
    .map(row => row.mkString(Consts.starrocksSep))
    .repartition(partition)
    // The explicit parameter type selects the Scala-function overload of foreachPartition.
    .foreachPartition((rows: Iterator[String]) => {
      if (rows.hasNext) {
        val sink = new MySrSink(
          Map(
            "max_filter_ratio" -> "1",
            "column_separator" -> Consts.starrocksSep),
          dbName,
          starRocksUser,
          starRocksPass,
          tblName,
          "starRocksFE地址",
          18030,
          true,
          false)
        sink.invoke(rows.mkString("\n"))
      }
    })
}

参考官方github https://github.com/StarRocks/demo/tree/master/SparkDemo/src/main

  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值