spark业务开发-列过滤(filter)
输入数据
order_number,order_date,purchaser,quantity,product_id,remark
10001,2016-01-16,1001,1,102,机q器w记e录r
10003,2016-01-17,1002,2,105,人工记录
10002,2016-01-19,1002,3,106,人工补录
10004,2016-02-21,1003,4,107,自然交易
10001,2016-01-16,1001,1,102,机器记录
输出数据
+------------+-------------------+---------+--------+----------+--------+
|order_number| order_date|purchaser|quantity|product_id| remark|
+------------+-------------------+---------+--------+----------+--------+
| 10002|2016-01-19 00:00:00| 1002| 3| 106|人工补录|
+------------+-------------------+---------+--------+----------+--------+
程序代码
package com.cch.bigdata.spark.process.filter
import com.cch.bigdata.spark.process.AbstractTransform
import com.cch.bigdata.spark.process.filter.model.FilterColumn
import org.apache.spark.sql.{DataFrame, SparkSession}
import scala.collection.mutable.ListBuffer
class Filter extends AbstractTransform{
case class FilterColumn(name:String,expr:String,leftValue:Any,rightValue:Any)
val spark: SparkSession = SparkSession.builder().appName("Filter").master("local[1]").getOrCreate()
//过滤模式
private val filter_mode_set = Set("and", "or")
private val filter_mode = "and"
//需要过滤的列名,数组
private val columns = Array("quantity", "remark")
//操作符
private val op_symbol = Array("bt", "eq")
//left value
private val left_value = Array("2", "人工补录")
//right value
private val right_value = Array("4", "")
private val path = "src/main/resources/csv/orders.csv"
override def process(): Unit = {
if (filter_mode.isEmpty) {
throw new RuntimeException("过滤模式未配置")
} else {
if (!filter_mode_set.contains(filter_mode)) {
throw new RuntimeException("过滤模式必须是以下方式:[and/or]")
}
}
if (columns.isEmpty || columns.length == 0) {
throw new RuntimeException("过滤列未配置")
}
if (op_symbol.isEmpty) {
throw new RuntimeException("过滤列操作符未配置")
}
if (left_value.isEmpty) {
throw new RuntimeException("过滤列匹配值未配置")
}
val list: ListBuffer[FilterColumn] = ListBuffer()
var index = 0
columns.foreach(c => {
list.append(FilterColumn(c, op_symbol(index), left_value(index), right_value(index)))
index += 1
})
//获取上游数据集
val df: DataFrame = loadCsv(path,spark)
index = 0;
val filterStringBuffer = new StringBuilder
list.foreach(filterColumn => {
if (filterColumn.name.isEmpty) {
throw new RuntimeException("过滤列未配置")
}
if (filterColumn.expr.isEmpty) {
throw new RuntimeException("过滤列对应的表达式未配置")
}
if (filterColumn.leftValue == null) {
throw new RuntimeException("过滤值未配置")
}
if (index > 0) {
filterStringBuffer.append(" ").append(filter_mode).append(" ")
}
if (filterColumn.expr.equals("bt")) {
if (filterColumn.rightValue == null) {
throw new RuntimeException("介于过滤方式对应的值不完整")
}
filterStringBuffer.append("(")
filterStringBuffer.append(filterColumn.name)
filterStringBuffer.append(" >= ").append("'").append(filterColumn.leftValue).append("'")
filterStringBuffer.append(" and ")
filterStringBuffer.append(filterColumn.name)
filterStringBuffer.append(" <= ").append("'").append(filterColumn.rightValue).append("'")
filterStringBuffer.append(") ")
} else {
filterStringBuffer.append("(").append(filterColumn.name)
filterStringBuffer.append(" ")
filterStringBuffer.append(parseExpr(filterColumn.expr))
filterStringBuffer.append(" ")
filterStringBuffer.append("'").append(filterColumn.leftValue).append("'")
filterStringBuffer.append(")")
}
index += 1
})
df.filter(filterStringBuffer.toString()).show()
}
def parseExpr(expr: String): String = {
var result = ""
expr match {
case "eq" => {
result = "=="
}
case "ne" => {
result = "!="
}
case "gt" => {
//大于
result = ">"
}
case "lt" => {
result = "<"
}
case "ge" => {
result = ">="
}
case "le" => {
result = "<="
}
}
result
}
}
package com.cch.bigdata.spark.process
import com.cch.bigdata.spark.process.filter.Filter
object FilterTest {
def main(args: Array[String]): Unit = {
val filter = new Filter()
filter.process()
}
}
参数解释
-
filter_mode:过滤模式(字符串),只能选择and或者or
-
columns:需要过滤的列名(字符串数组)
-
op_symbol:每个列需要过滤时的表达式(字符串数组),索引内容和列名对应
-
left_value:需要过滤的值(字符串数组),索引内容和列名对应
-
right_value:如果过滤表达式使用了bt(介于),这里需要有明确的值,否则,需要用逗号做索引占用
-
op_symbol中的表达式
- eq代表相等过滤
- ne代表不相等过滤
- gt代表大于过滤
- lt代表小于过滤
- ge代表大于等于过滤
- le代表小于等于过滤
- bt代表介于过滤,例如:age 介于18到60之间,此时需要配置列的rightValue属性