最近使用sparksql,需求是需要对一些非结构化的数据进行处理,具体的需求是:
2 类似 {"field":"value"} 的 JSON 数据,合并此类数据:如果有相同的 field,则把 value 累加;没有则把该 field 加入
3 类似 "a,b,c" 的逗号分隔字符串,需要聚合后去重,统计字母出现的个数
3 string 类型的数据 实现累加
package com.dianyou

import com.dianyou.utl.JsonUtil
import org.apache.log4j.{Level, Logger}
import org.apache.spark.sql.{Row, SparkSession}
import org.apache.spark.sql.expressions.{MutableAggregationBuffer, UserDefinedAggregateFunction}
import org.apache.spark.sql.types._
import org.json.JSONObject

/**
 * Demonstrates three Spark SQL aggregations over unstructured string data:
 *   1. [[jsonSum]]     — merge JSON objects, summing values of fields present in both.
 *   2. [[distinctSum]] — count distinct tokens of comma-separated strings across rows.
 *   3. a plain UDF (`string2Int`) to sum numeric values stored as strings.
 *
 * Created by Administrator on 2018/6/7.
 */
object UdafTest {

  /*val log = Logger.getLogger(this.getClass)
  Logger.getLogger("org").setLevel(Level.ERROR)*/

  def main(args: Array[String]): Unit = {
    // FIX: the original source was missing the closing parenthesis after
    // appName("udaf-test"), which made the file fail to compile.
    val sqlContex = SparkSession.builder().master("local[2]")
      .appName("udaf-test")
      .config("spark.testing.memory", "2147480000")
      .getOrCreate()

    /* Case 2: distinct-count of comma-separated tokens per id.
    val tempDataFrame1 = sqlContex.createDataFrame(Seq(
      ("li", "a,b,c"),
      ("liu", "d,e,f"),
      ("li", "g,h,g")
      //("li", "a,b,g")
    )).toDF("id", "content").repartition(1)
    // register the UDAF
    tempDataFrame1.createOrReplaceTempView("temp")
    sqlContex.udf.register("sumdevice", new distinctSum)
    sqlContex.sql("select id, sumdevice(content) as num from temp group by id").show()*/

    /* Case 1: merge JSON maps per id, summing values of shared fields.
    val tempDataFrame2 = sqlContex.createDataFrame(Seq(
      ("li", "{\"GZ\":\"5\",\"SZ\":\"6\"}"),
      ("liu", "{\"BJ\":\"10\"}"),
      ("li", "{\"GZ\":\"10\",\"SH\":\"8\"}")
      //("li", "a,b,g")
    )).toDF("id", "city").repartition(1)
    tempDataFrame2.createOrReplaceTempView("temp2")
    sqlContex.udf.register("jsonSumdevice", new jsonSum)
    sqlContex.sql("select id, jsonSumdevice(city) as num from temp2 group by id").show()*/

    // Case 3: sum numeric values that arrive as strings.
    val tempDataFrame3 = sqlContex.createDataFrame(Seq(
      ("li", "5"),
      ("liu", "6"),
      ("li", "7")
      //("li", "a,b,g")
    )).toDF("id", "num").repartition(1)
    tempDataFrame3.createOrReplaceTempView("temp3")
    sqlContex.udf.register("string2Int", (str: String) => Integer.parseInt(str))
    // FIX: the original registered string2Int but never used it, silently
    // relying on Spark's implicit string-to-numeric cast in sum(num).
    // Convert explicitly so malformed values fail loudly instead of becoming null.
    sqlContex.sql("select id, sum(string2Int(num)) as num from temp3 group by id").show()
  }

  /**
   * UDAF that merges JSON-object strings: fields present in both sides have
   * their (numeric) values summed; fields present in only one side are copied.
   * The aggregation buffer holds the running merged JSON as a string; an empty
   * string means "nothing accumulated yet".
   */
  class jsonSum extends UserDefinedAggregateFunction {
    override def inputSchema: StructType =
      StructType(Array(StructField("input", StringType, true)))

    override def bufferSchema: StructType =
      StructType(Array(StructField("CITY", StringType, true)))

    override def dataType: DataType = StringType

    override def deterministic: Boolean = true

    override def initialize(buffer: MutableAggregationBuffer): Unit = {
      buffer(0) = ""
    }

    override def update(buffer: MutableAggregationBuffer, input: Row): Unit = {
      buffer(0) =
        if (buffer.getString(0).trim.length == 0) input.getString(0)
        else mergeCityJson(buffer.getString(0), input.getString(0))
    }

    override def merge(buffer1: MutableAggregationBuffer, buffer2: Row): Unit = {
      buffer1(0) =
        if (buffer1.getString(0).trim.length == 0) buffer2.getString(0)
        else mergeCityJson(buffer1.getString(0), buffer2.getString(0))
    }

    override def evaluate(buffer: Row): Any = buffer.getString(0)
  }

  /**
   * UDAF that concatenates comma-separated token strings and returns the
   * number of distinct tokens seen across all rows of a group.
   *
   * FIX: the original initialized the buffer to null and appended the whole
   * input Row (producing strings like "null,[a,b,c]"), then scrubbed the
   * artifacts in evaluate with replace("[","")/filter(_ != "null"). The buffer
   * now starts as "" and only the string column is appended, so evaluate just
   * splits, drops empties, and counts distinct tokens — same results, no hacks.
   */
  class distinctSum extends UserDefinedAggregateFunction {
    override def inputSchema: StructType =
      StructType(Array(StructField("input", StringType, true)))

    override def bufferSchema: StructType =
      StructType(Array(StructField("cont", StringType, true)))

    override def dataType: DataType = IntegerType

    override def deterministic: Boolean = true

    override def initialize(buffer: MutableAggregationBuffer): Unit = {
      buffer(0) = ""
    }

    override def update(buffer: MutableAggregationBuffer, input: Row): Unit = {
      buffer(0) = buffer.getString(0) + "," + input.getString(0)
    }

    override def merge(buffer1: MutableAggregationBuffer, buffer2: Row): Unit = {
      buffer1(0) = buffer1.getString(0) + "," + buffer2.getString(0)
    }

    override def evaluate(buffer: Row): Any =
      buffer.getString(0).split(",").filter(_.nonEmpty).distinct.length
  }

  /**
   * Merges two JSON-object strings: for every key of `json2`, adds its numeric
   * value onto the matching key of `json1`, or inserts the key if absent.
   * Returns the merged object serialized back to a string.
   *
   * NOTE(review): assumes every shared field holds an integer-parseable value;
   * a non-numeric value on a shared key will throw — confirm against the data.
   */
  def mergeCityJson(json1: String, json2: String): String = {
    // FIX: was a `var` and contained a stray no-op `cityJson1` expression
    // inside the loop; the object is mutated in place, so `val` suffices.
    val cityJson1 = new JSONObject(json1)
    val cityMap2 = JsonUtil.json2Map(json2)
    for (key <- cityMap2.keySet) {
      if (cityJson1.has(key)) {
        cityJson1.put(key, cityJson1.getInt(key) + Integer.parseInt(cityMap2.getOrElse(key, "0").toString))
      } else {
        cityJson1.put(key, cityMap2.getOrElse(key, null))
      }
    }
    cityJson1.toString()
  }
}