External Data Sources: Integrating a Pluggable Data Source into Spark / the Spark Codebase

Part 1: Concepts

A custom external data source is integrated into Spark as a pluggable component, the same way the built-in sources live in the Spark codebase.
The point: when a file is read through the external data source API, the plain values it contains become a DataFrame with a concrete schema attached.
The external data source API encapsulates most of the plumbing; you only implement a handful of interfaces (their signatures are sketched after this list).

BaseRelation
Defines the schema of the data.

TableScan
Full table scan: read the table and build an RDD[Row].
Equivalent to: select a,b,c,d,e from student;

PrunedScan
Column pruning.
Equivalent to: select a,b,c from student;

PrunedFilteredScan
Column pruning plus row filtering.
Equivalent to: select a,b,c from student where id>10;
The rows are already filtered at the data source, so this gives the best performance.

InsertableRelation
Supports inserts: data can be written back to the source.

RelationProvider
The relation provider: the entry point Spark calls to create a relation for this data source.

Example table: student: a,b,c,d,
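
For reference, a simplified sketch of the corresponding traits in org.apache.spark.sql.sources (Data Source API V1); consult the Spark source for the full, authoritative definitions:

abstract class BaseRelation {
  def sqlContext: SQLContext
  def schema: StructType                                   // the schema this relation exposes
}

trait TableScan {
  def buildScan(): RDD[Row]                                // every column of every row
}

trait PrunedScan {
  def buildScan(requiredColumns: Array[String]): RDD[Row]  // only the requested columns
}

trait PrunedFilteredScan {
  def buildScan(requiredColumns: Array[String], filters: Array[Filter]): RDD[Row]
}

trait InsertableRelation {
  def insert(data: DataFrame, overwrite: Boolean): Unit    // write a DataFrame back into the source
}

trait RelationProvider {
  def createRelation(sqlContext: SQLContext, parameters: Map[String, String]): BaseRelation
}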

Part 2: Implementation

File 1: Utils

package com.ruozedata.bigdata.sql04.text

import org.apache.spark.sql.types.{DataType, LongType, StringType}

object Utils {

  // Cast a raw String read from the file into the JVM value matching the Spark SQL DataType.
  def castTo(value: String, dataType: DataType): Any = {
    dataType match {
      case _: LongType   => value.toLong
      case _: StringType => value
      case _             => throw new IllegalArgumentException(s"Unsupported data type: $dataType")
    }
  }

}
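
A quick way to sanity-check the helper, e.g. from spark-shell (illustrative only, not part of the original post):

Utils.castTo("10000", LongType)    // returns the Long 10000
Utils.castTo("ruoze", StringType)  // returns the String "ruoze" unchanged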

File 2: TextDatasourceRelation

package com.ruozedata.bigdata.sql04.text

import org.apache.spark.internal.Logging
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.{DataFrame, Row, SQLContext, SaveMode}
import org.apache.spark.sql.sources._
import org.apache.spark.sql.types.{LongType, StringType, StructField, StructType}

class TextDatasourceRelation(override val sqlContext:SQLContext,
                             path:String,
                             userSchema:StructType)
  extends BaseRelation
    with TableScan
    with PrunedScan
    with PrunedFilteredScan
    with InsertableRelation
    with Logging{
  // Use the user-supplied schema if present; otherwise fall back to a default schema.
  override def schema: StructType = {
    if (userSchema != null) {
      userSchema
    } else {
      StructType(
        StructField("id", LongType, false) ::
        StructField("name", StringType, false) ::
        StructField("gender", StringType, false) ::
        StructField("salary", LongType, false) ::
        StructField("comm", LongType, false) :: Nil
      )
    }
  }

  // TableScan: full scan, returns every column of every row.
  override def buildScan(): RDD[Row] = {

    logWarning("this is ruozedata custom buildScan()...")

    val rdd = sqlContext.sparkContext.wholeTextFiles(path).map(x => x._2)
    val schemaFields = schema.fields

    // Apply the schema to the raw text: split each record into fields, then cast every
    // field to the data type declared at the same position in the schema
    // (e.g. "10000" -> Long, "ruoze" -> String).
    val rows = rdd.map(fileContent => {
      val lines = fileContent.split("\n")
      val data = lines.map(line => line.split(",").map(_.trim).toSeq)

      val typedValues = data.map(values => values.zipWithIndex.map {
        case (value, index) =>
          val colName = schemaFields(index).name

          // Decode the gender flag into a readable value before casting.
          Utils.castTo(if (colName.equalsIgnoreCase("gender")) {
            if (value == "0") {
              "男"
            } else if (value == "1") {
              "女"
            } else {
              "未知"
            }
          } else {
            value
          }, schemaFields(index).dataType)
      })

      typedValues.map(x => Row.fromSeq(x))
    })
    rows.flatMap(x => x)
  }

  // PrunedScan: Spark asks only for the columns it needs; the Rows we return must
  // contain exactly those columns, in the same order as requiredColumns.
  override def buildScan(requiredColumns: Array[String]): RDD[Row] = {
    logWarning("this is ruozedata custom buildScan(requiredColumns)...")

    val rdd = sqlContext.sparkContext.wholeTextFiles(path).map(x => x._2)
    val schemaFields = schema.fields

    rdd.flatMap(fileContent => {
      val lines = fileContent.split("\n")
      val data = lines.map(line => line.split(",").map(_.trim).toSeq)

      data.map(values => {
        // Cast every field according to its schema type and index it by column name.
        val typedByName = values.zipWithIndex.map {
          case (value, index) =>
            val field = schemaFields(index)
            val normalized = if (field.name.equalsIgnoreCase("gender")) {
              if (value == "0") "男" else if (value == "1") "女" else "未知"
            } else {
              value
            }
            field.name -> Utils.castTo(normalized, field.dataType)
        }.toMap

        // Keep only the requested columns, in the requested order.
        Row.fromSeq(requiredColumns.map(typedByName))
      })
    })
  }

  // PrunedFilteredScan: Spark passes down both the required columns and the filters from
  // the WHERE clause. Here the filters are only logged, not applied; that is still correct
  // because Spark re-applies them on top of the returned rows, but applying them at the
  // source would read and cast less data.
  override def buildScan(requiredColumns: Array[String],
                         filters: Array[Filter]): RDD[Row] = {
    logWarning("this is ruozedata custom buildScan(requiredColumns,filters)...")

    logWarning("Filter:")
    filters.foreach(x => logWarning(x.toString))

    // Column pruning works exactly as in the PrunedScan variant above.
    buildScan(requiredColumns)
  }

  // InsertableRelation: invoked when data is inserted into this relation
  // (e.g. via INSERT INTO on a view backed by this source).
  // Note: save(path) without an explicit .format(...) writes with Spark's default
  // source (parquet), not with this text format.
  override def insert(data: DataFrame, overwrite: Boolean): Unit = {
    data.write
      .mode(if (overwrite) SaveMode.Overwrite else SaveMode.Append)
      .save(path)
  }
}
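
The relation expects a plain comma-separated text file whose columns follow the schema order (id, name, gender, salary, comm). A made-up example of what ruoze-extds.data could look like (illustrative only, not from the original post):

10000,ruoze,0,100000,200000
10001,jepson,1,99999,199999
10002,xingxing,2,0,0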

File 3: DefaultSource. The class must be named DefaultSource: when .format("com.ruozedata.bigdata.sql04.text") is used, Spark resolves the data source by looking for a class called DefaultSource inside that package.

package com.ruozedata.bigdata.sql04.text

import org.apache.spark.sql.SQLContext
import org.apache.spark.sql.sources.{BaseRelation, RelationProvider, SchemaRelationProvider}
import org.apache.spark.sql.types.StructType

class DefaultSource extends RelationProvider with SchemaRelationProvider {

  // Called when the user supplies a schema explicitly.
  override def createRelation(
                      sqlContext: SQLContext,
                      parameters: Map[String, String],
                      schema: StructType): BaseRelation = {
    // The path option is required: nothing can be read without it.
    val path = parameters.get("path")

    path match {
      case Some(p) => new TextDatasourceRelation(sqlContext, p, schema)
      case _ => throw new IllegalArgumentException("Path is required for custom-text-datasource-api")
    }
  }

  // Called when no schema is supplied; fall back to the relation's default schema.
  override def createRelation(sqlContext: SQLContext, parameters: Map[String, String]): BaseRelation = {
    createRelation(sqlContext, parameters, null)
  }
}
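
Optionally, the provider can also expose a short alias by mixing in DataSourceRegister, so callers can write .format("ruozedata-text") instead of the full package name. A minimal sketch (not in the original post; the alias and class name are made up), which additionally requires a META-INF/services/org.apache.spark.sql.sources.DataSourceRegister file on the classpath listing this class:

package com.ruozedata.bigdata.sql04.text

import org.apache.spark.sql.sources.DataSourceRegister

// Reuses the DefaultSource above and only adds the short name.
class ShortNameSource extends DefaultSource with DataSourceRegister {
  override def shortName(): String = "ruozedata-text"
}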

File 4: CustomTextApp

package com.ruozedata.bigdata.sql04.app

import org.apache.spark.sql.SparkSession

/**
  * Created by ruozedata on 2019/1/3.
  */
object CustomTextApp {

  def main(args: Array[String]) {
    val sparkSession = SparkSession.builder()
      .appName("CustomTextApp")
      .master("local[2]")
      .getOrCreate()

    import sparkSession.implicits._

    val people = sparkSession.read.format("com.ruozedata.bigdata.sql04.text").option("path","d://ruoze-extds.data").load()
//    people.select("id","name").filter('id>2).filter('salary>2000).show()

    // Note: with Data Source API V1, write.format(...).save(...) expects the provider to be a
    // CreatableRelationProvider (or a FileFormat); the InsertableRelation implemented above is
    // only invoked for INSERT INTO against an existing relation.
    people.select("id","name").write.format("com.ruozedata.bigdata.sql04.text").save("d://extds-output/")

//    val people = sparkSession.read.text("file:///d:/ruoze-extds.data")
//    people.show()
    sparkSession.stop()
  }

}
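
The loaded DataFrame can also be queried through SQL; the logWarning calls in TextDatasourceRelation then show in the driver log which buildScan overload Spark picked for each query. An illustrative sketch (not in the original post), to be placed inside main() before sparkSession.stop():

    // Register the DataFrame as a temporary view and query it with SQL.
    people.createOrReplaceTempView("student")

    sparkSession.sql("select id, name, gender from student").show()
    sparkSession.sql("select id, name from student where id > 10").show()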

Part 3: Production Usage

Build the project into a jar from IDEA; in production only the jar is used.
Package it, upload it to the server (removing the old version first), then start spark-shell with --jars <path-to-jar>.
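For example (the jar path below is made up for illustration):

spark-shell --jars /home/hadoop/lib/custom-text-datasource.jar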

val people = spark.read.format("com.ruozedata.bigdata.sql04.text").option("path","file:///home/hadoop/data/ext").load()

people.show
Run like this, it throws an error.

people.select("id","name","gender").show

Part 4: Key Points

Once this kind of plugin has been written against the Spark external data source API, you no longer have to hand-write parsing code every time you read a log; you simply pull the plugin in, which keeps the project lean and compact.
Remember what each of these interfaces is responsible for.
