External Data Sources: Integrating a Pluggable Data Source into Spark / the Spark Codebase

Part 1: Concepts

A custom external data source is integrated into Spark as a pluggable component, the same way the built-in sources live in the Spark codebase.
The point: when a file is read through the external data source API, the plain values it contains become a DataFrame with a concrete schema attached.
The external data source API encapsulates most of the plumbing; you only implement a handful of interfaces (their signatures are sketched after this list).

BaseRelation
Defines the schema of the data.

TableScan
Full table scan: read the table and build an RDD[Row].
Equivalent to: select a,b,c,d,e from student;

PrunedScan
Column pruning.
Equivalent to: select a,b,c from student;

PrunedFilteredScan
Column pruning plus row filtering.
Equivalent to: select a,b,c from student where id>10;
The rows are already filtered at the data source, so this gives the best performance.

InsertableRelation
Supports inserts: data can be written back to the source.

RelationProvider
The relation provider: the entry point Spark calls to create a relation for this data source.

Example table: student: a,b,c,d,
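
For reference, a simplified sketch of the corresponding traits in org.apache.spark.sql.sources (Data Source API V1); consult the Spark source for the full, authoritative definitions:

abstract class BaseRelation {
  def sqlContext: SQLContext
  def schema: StructType                                   // the schema this relation exposes
}

trait TableScan {
  def buildScan(): RDD[Row]                                // every column of every row
}

trait PrunedScan {
  def buildScan(requiredColumns: Array[String]): RDD[Row]  // only the requested columns
}

trait PrunedFilteredScan {
  def buildScan(requiredColumns: Array[String], filters: Array[Filter]): RDD[Row]
}

trait InsertableRelation {
  def insert(data: DataFrame, overwrite: Boolean): Unit    // write a DataFrame back into the source
}

trait RelationProvider {
  def createRelation(sqlContext: SQLContext, parameters: Map[String, String]): BaseRelation
}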

Part 2: Implementation

File 1: Utils

package com.ruozedata.bigdata.sql04.text

import org.apache.spark.sql.types.{DataType, LongType, StringType}

object Utils {

  // Cast a raw String read from the file into the JVM value matching the Spark SQL DataType.
  def castTo(value: String, dataType: DataType): Any = {
    dataType match {
      case _: LongType   => value.toLong
      case _: StringType => value
      case _             => throw new IllegalArgumentException(s"Unsupported data type: $dataType")
    }
  }

}
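
A quick way to sanity-check the helper, e.g. from spark-shell (illustrative only, not part of the original post):

Utils.castTo("10000", LongType)    // returns the Long 10000
Utils.castTo("ruoze", StringType)  // returns the String "ruoze" unchanged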

File 2: TextDatasourceRelation

package com.ruozedata.bigdata.sql04.text

import org.apache.spark.internal.Logging
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.{DataFrame, Row, SQLContext, SaveMode}
import org.apache.spark.sql.sources._
import org.apache.spark.sql.types.{LongType, StringType, StructField, StructType}

class TextDatasourceRelation(override val sqlContext:SQLContext,
                             path:String,
                             userSchema:StructType)
  extends BaseRelation
    with TableScan
    with PrunedScan
    with PrunedFilteredScan
    with InsertableRelation
    with Logging{
  // Use the user-supplied schema if present; otherwise fall back to a default schema.
  override def schema: StructType = {
    if (userSchema != null) {
      userSchema
    } else {
      StructType(
        StructField("id", LongType, false) ::
        StructField("name", StringType, false) ::
        StructField("gender", StringType, false) ::
        StructField("salary", LongType, false) ::
        StructField("comm", LongType, false) :: Nil
      )
    }
  }

  // TableScan: full scan, returns every column of every row.
  override def buildScan(): RDD[Row] = {

    logWarning("this is ruozedata custom buildScan()...")

    val rdd = sqlContext.sparkContext.wholeTextFiles(path).map(x => x._2)
    val schemaFields = schema.fields

    // Apply the schema to the raw text: split each record into fields, then cast every
    // field to the data type declared at the same position in the schema
    // (e.g. "10000" -> Long, "ruoze" -> String).
    val rows = rdd.map(fileContent => {
      val lines = fileContent.split("\n")
      val data = lines.map(line => line.split(",").map(_.trim).toSeq)

      val typedValues = data.map(values => values.zipWithIndex.map {
        case (value, index) =>
          val colName = schemaFields(index).name

          // Decode the gender flag into a readable value before casting.
          Utils.castTo(if (colName.equalsIgnoreCase("gender")) {
            if (value == "0") {
              "男"
            } else if (value == "1") {
              "女"
            } else {
              "未知"
            }
          } else {
            value
          }, schemaFields(index).dataType)
      })

      typedValues.map(x => Row.fromSeq(x))
    })
    rows.flatMap(x => x)
  }

  // PrunedScan: Spark asks only for the columns it needs; the Rows we return must
  // contain exactly those columns, in the same order as requiredColumns.
  override def buildScan(requiredColumns: Array[String]): RDD[Row] = {
    logWarning("this is ruozedata custom buildScan(requiredColumns)...")

    val rdd = sqlContext.sparkContext.wholeTextFiles(path).map(x => x._2)
    val schemaFields = schema.fields

    rdd.flatMap(fileContent => {
      val lines = fileContent.split("\n")
      val data = lines.map(line => line.split(",").map(_.trim).toSeq)

      data.map(values => {
        // Cast every field according to its schema type and index it by column name.
        val typedByName = values.zipWithIndex.map {
          case (value, index) =>
            val field = schemaFields(index)
            val normalized = if (field.name.equalsIgnoreCase("gender")) {
              if (value == "0") "男" else if (value == "1") "女" else "未知"
            } else {
              value
            }
            field.name -> Utils.castTo(normalized, field.dataType)
        }.toMap

        // Keep only the requested columns, in the requested order.
        Row.fromSeq(requiredColumns.map(typedByName))
      })
    })
  }

  // PrunedFilteredScan: Spark passes down both the required columns and the filters from
  // the WHERE clause. Here the filters are only logged, not applied; that is still correct
  // because Spark re-applies them on top of the returned rows, but applying them at the
  // source would read and cast less data.
  override def buildScan(requiredColumns: Array[String],
                         filters: Array[Filter]): RDD[Row] = {
    logWarning("this is ruozedata custom buildScan(requiredColumns,filters)...")

    logWarning("Filter:")
    filters.foreach(x => logWarning(x.toString))

    // Column pruning works exactly as in the PrunedScan variant above.
    buildScan(requiredColumns)
  }

  // InsertableRelation: invoked when data is inserted into this relation
  // (e.g. via INSERT INTO on a view backed by this source).
  // Note: save(path) without an explicit .format(...) writes with Spark's default
  // source (parquet), not with this text format.
  override def insert(data: DataFrame, overwrite: Boolean): Unit = {
    data.write
      .mode(if (overwrite) SaveMode.Overwrite else SaveMode.Append)
      .save(path)
  }
}
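
The relation expects a plain comma-separated text file whose columns follow the schema order (id, name, gender, salary, comm). A made-up example of what ruoze-extds.data could look like (illustrative only, not from the original post):

10000,ruoze,0,100000,200000
10001,jepson,1,99999,199999
10002,xingxing,2,0,0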

File 3: DefaultSource. The class must be named DefaultSource: when .format("com.ruozedata.bigdata.sql04.text") is used, Spark resolves the data source by looking for a class called DefaultSource inside that package.

package com.ruozedata.bigdata.sql04.text

import org.apache.spark.sql.SQLContext
import org.apache.spark.sql.sources.{BaseRelation, RelationProvider, SchemaRelationProvider}
import org.apache.spark.sql.types.StructType

class DefaultSource extends RelationProvider with SchemaRelationProvider {

  // Called when the user supplies a schema explicitly.
  override def createRelation(
                      sqlContext: SQLContext,
                      parameters: Map[String, String],
                      schema: StructType): BaseRelation = {
    // The path option is required: nothing can be read without it.
    val path = parameters.get("path")

    path match {
      case Some(p) => new TextDatasourceRelation(sqlContext, p, schema)
      case _ => throw new IllegalArgumentException("Path is required for custom-text-datasource-api")
    }
  }

  // Called when no schema is supplied; fall back to the relation's default schema.
  override def createRelation(sqlContext: SQLContext, parameters: Map[String, String]): BaseRelation = {
    createRelation(sqlContext, parameters, null)
  }
}
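
Optionally, the provider can also expose a short alias by mixing in DataSourceRegister, so callers can write .format("ruozedata-text") instead of the full package name. A minimal sketch (not in the original post; the alias and class name are made up), which additionally requires a META-INF/services/org.apache.spark.sql.sources.DataSourceRegister file on the classpath listing this class:

package com.ruozedata.bigdata.sql04.text

import org.apache.spark.sql.sources.DataSourceRegister

// Reuses the DefaultSource above and only adds the short name.
class ShortNameSource extends DefaultSource with DataSourceRegister {
  override def shortName(): String = "ruozedata-text"
}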

File 4: CustomTextApp

package com.ruozedata.bigdata.sql04.app

import org.apache.spark.sql.SparkSession

/**
  * Created by ruozedata on 2019/1/3.
  */
object CustomTextApp {

  def main(args: Array[String]) {
    val sparkSession = SparkSession.builder()
      .appName("CustomTextApp")
      .master("local[2]")
      .getOrCreate()

    import sparkSession.implicits._

    val people = sparkSession.read.format("com.ruozedata.bigdata.sql04.text").option("path","d://ruoze-extds.data").load()
//    people.select("id","name").filter('id>2).filter('salary>2000).show()

    // Note: with Data Source API V1, write.format(...).save(...) expects the provider to be a
    // CreatableRelationProvider (or a FileFormat); the InsertableRelation implemented above is
    // only invoked for INSERT INTO against an existing relation.
    people.select("id","name").write.format("com.ruozedata.bigdata.sql04.text").save("d://extds-output/")

//    val people = sparkSession.read.text("file:///d:/ruoze-extds.data")
//    people.show()
    sparkSession.stop()
  }

}
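
The loaded DataFrame can also be queried through SQL; the logWarning calls in TextDatasourceRelation then show in the driver log which buildScan overload Spark picked for each query. An illustrative sketch (not in the original post), to be placed inside main() before sparkSession.stop():

    // Register the DataFrame as a temporary view and query it with SQL.
    people.createOrReplaceTempView("student")

    sparkSession.sql("select id, name, gender from student").show()
    sparkSession.sql("select id, name from student where id > 10").show()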

Part 3: Production Usage

Build the project into a jar from IDEA; in production only the jar is used.
Package it, upload it to the server (removing the old version first), then start spark-shell with --jars <path-to-jar>.
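For example (the jar path below is made up for illustration):

spark-shell --jars /home/hadoop/lib/custom-text-datasource.jar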

val people = spark.read.format("com.ruozedata.bigdata.sql04.text").option("path","file:///home/hadoop/data/ext").load()

people.show
Run like this, it throws an error.

people.select("id","name","gender").show

Part 4: Key Points

Once this kind of plugin has been written against the Spark external data source API, you no longer have to hand-write parsing code every time you read a log; you simply pull the plugin in, which keeps the project lean and compact.
Remember what each of these interfaces is responsible for.
