spark业务开发-列过滤(filter)

spark业务开发-列过滤(filter)

输入数据
order_number,order_date,purchaser,quantity,product_id,remark
10001,2016-01-16,1001,1,102,机q器w记e录r
10003,2016-01-17,1002,2,105,人工记录
10002,2016-01-19,1002,3,106,人工补录
10004,2016-02-21,1003,4,107,自然交易
10001,2016-01-16,1001,1,102,机器记录
输出数据
+------------+-------------------+---------+--------+----------+--------+
|order_number|         order_date|purchaser|quantity|product_id|  remark|
+------------+-------------------+---------+--------+----------+--------+
|       10002|2016-01-19 00:00:00|     1002|       3|       106|人工补录|
+------------+-------------------+---------+--------+----------+--------+
程序代码
package com.cch.bigdata.spark.process.filter

import com.cch.bigdata.spark.process.AbstractTransform
import com.cch.bigdata.spark.process.filter.model.FilterColumn
import org.apache.spark.sql.{DataFrame, SparkSession}

import scala.collection.mutable.ListBuffer

class Filter extends AbstractTransform {

  /** One filter condition: column name, operator keyword and its operand(s).
    *
    * NOTE(review): this local case class shadows the imported
    * com.cch.bigdata.spark.process.filter.model.FilterColumn — confirm which
    * of the two is intended and delete the other.
    */
  case class FilterColumn(name: String, expr: String, leftValue: Any, rightValue: Any)

  val spark: SparkSession = SparkSession.builder().appName("Filter").master("local[1]").getOrCreate()

  // Allowed ways to combine the individual column predicates.
  private val filter_mode_set = Set("and", "or")
  private val filter_mode = "and"

  // Columns to filter on; the three arrays below are index-aligned with this one.
  private val columns = Array("quantity", "remark")

  // Operator keyword per column: eq/ne/gt/lt/ge/le/bt.
  private val op_symbol = Array("bt", "eq")

  // Primary comparison value per column.
  private val left_value = Array("2", "人工补录")

  // Upper bound per column; only consumed by "bt", "" is a placeholder otherwise.
  private val right_value = Array("4", "")

  private val path = "src/main/resources/csv/orders.csv"

  /** Loads the CSV, builds one SQL filter expression from the configured
    * columns/operators/values, applies it and shows the surviving rows.
    *
    * @throws RuntimeException when the configuration is missing or inconsistent
    */
  override def process(): Unit = {

    if (filter_mode.isEmpty) {
      throw new RuntimeException("过滤模式未配置")
    }
    if (!filter_mode_set.contains(filter_mode)) {
      throw new RuntimeException("过滤模式必须是以下方式:[and/or]")
    }

    if (columns.isEmpty) {
      throw new RuntimeException("过滤列未配置")
    }

    if (op_symbol.isEmpty) {
      throw new RuntimeException("过滤列操作符未配置")
    }

    if (left_value.isEmpty) {
      throw new RuntimeException("过滤列匹配值未配置")
    }

    // The four arrays must stay index-aligned; previously a mismatch surfaced
    // as an ArrayIndexOutOfBoundsException deep inside the loop.
    if (op_symbol.length < columns.length
        || left_value.length < columns.length
        || right_value.length < columns.length) {
      throw new RuntimeException("过滤列配置的数组长度不一致")
    }

    // Build the condition list by index instead of mutating an external counter.
    val filterColumns: Seq[FilterColumn] = columns.indices.map { i =>
      FilterColumn(columns(i), op_symbol(i), left_value(i), right_value(i))
    }

    // 获取上游数据集 — load the upstream dataset.
    val df: DataFrame = loadCsv(path, spark)

    // Render each condition as a SQL fragment and join them with the mode.
    val predicate = filterColumns.map(buildPredicate).mkString(s" $filter_mode ")

    df.filter(predicate).show()
  }

  /** Renders a single FilterColumn as a Spark SQL boolean expression,
    * e.g. {{{(quantity >= '2' and quantity <= '4')}}}.
    *
    * @throws RuntimeException when the condition is incompletely configured
    */
  private def buildPredicate(filterColumn: FilterColumn): String = {
    if (filterColumn.name.isEmpty) {
      throw new RuntimeException("过滤列未配置")
    }
    if (filterColumn.expr.isEmpty) {
      throw new RuntimeException("过滤列对应的表达式未配置")
    }
    if (filterColumn.leftValue == null) {
      throw new RuntimeException("过滤值未配置")
    }

    if (filterColumn.expr == "bt") {
      // "between" needs both bounds; an empty upper bound would silently
      // generate `col <= ''` and match nothing.
      if (filterColumn.rightValue == null || filterColumn.rightValue.toString.isEmpty) {
        throw new RuntimeException("介于过滤方式对应的值不完整")
      }
      s"(${filterColumn.name} >= '${filterColumn.leftValue}' and ${filterColumn.name} <= '${filterColumn.rightValue}')"
    } else {
      s"(${filterColumn.name} ${parseExpr(filterColumn.expr)} '${filterColumn.leftValue}')"
    }
  }

  /** Maps an operator keyword to its SQL comparison symbol.
    *
    * @throws RuntimeException for an unsupported keyword (previously this
    *                          surfaced as an opaque MatchError)
    */
  def parseExpr(expr: String): String = expr match {
    case "eq"  => "=="
    case "ne"  => "!="
    case "gt"  => ">"
    case "lt"  => "<"
    case "ge"  => ">="
    case "le"  => "<="
    case other => throw new RuntimeException(s"不支持的过滤表达式:$other")
  }
}

package com.cch.bigdata.spark.process

import com.cch.bigdata.spark.process.filter.Filter

object FilterTest {

  /** Entry point: runs the column-filter demo end to end. */
  def main(args: Array[String]): Unit = new Filter().process()
}

参数解释
  • filter_mode:过滤模式(字符串),只能选择and或者or

  • columns:需要过滤的列名(字符串数组)

  • op_symbol:每个列需要过滤时的表达式(字符串数组),索引内容和列名对应

  • left_value:需要过滤的值(字符串数组),索引内容和列名对应

  • right_value:如果过滤表达式使用了bt(介于),这里需要有明确的值,否则,需要用逗号做索引占用

  • op_symbol中的表达式

    • eq代表相等过滤
    • ne代表不相等过滤
    • gt代表大于过滤
    • lt代表小于过滤
    • ge代表大于等于过滤
    • le代表小于等于过滤
    • bt代表介于过滤,例如:age 介于18到60之间,此时需要配置列的rightValue属性
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值