package com.bw.spark.udf

import java.util

import org.apache.hadoop.hive.ql.exec.{UDFArgumentException, UDFArgumentLengthException}
import org.apache.hadoop.hive.ql.udf.generic.GenericUDTF
import org.apache.hadoop.hive.serde2.objectinspector.{ObjectInspector, ObjectInspectorFactory, StructObjectInspector}
import org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory
import org.apache.spark.sql.SparkSession

class MyUDTF extends GenericUDTF {
  // Called once after all input rows have been processed; nothing to clean up here.
  override def close(): Unit = {}
  // This method does two things: 1. validates the input arguments; 2. defines the
  // output columns. More than one column may be declared, so a UDTF can generate
  // multiple rows and multiple columns (see the two-column sketch after this class).
  override def initialize(args: Array[ObjectInspector]): StructObjectInspector = {
    if (args.length != 1)
      throw new UDFArgumentLengthException("ExplodeMap takes only one argument")
    if (args(0).getCategory != ObjectInspector.Category.PRIMITIVE)
      throw new UDFArgumentException("ExplodeMap takes string as a parameter")
    val fieldNames = new util.ArrayList[String]
    val fieldOIs = new util.ArrayList[ObjectInspector]
    // Default field name of the output column
    fieldNames.add("col1")
    // Field type of the output column
    fieldOIs.add(PrimitiveObjectInspectorFactory.javaStringObjectInspector)
    ObjectInspectorFactory.getStandardStructObjectInspector(fieldNames, fieldOIs)
  }
  // This method processes the data. The argument array holds a single input row,
  // i.e. each call to process handles exactly one row.
  override def process(args: Array[AnyRef]): Unit = {
    // Split the comma-separated string into individual values
    val strLst = args(0).toString.split(",")
    for (str <- strLst) {
      // forward emits one output row; it must be passed an array,
      // even when there is only one output column
      forward(Array(str))
    }
  }
}
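
// A minimal sketch (not part of the original example) of a UDTF that emits
// two output columns, to illustrate the multi-column case mentioned in
// initialize above. The class name MyPairUDTF and the "key:value" input
// format are assumptions made for this illustration.
class MyPairUDTF extends GenericUDTF {
  override def close(): Unit = {}

  override def initialize(args: Array[ObjectInspector]): StructObjectInspector = {
    if (args.length != 1)
      throw new UDFArgumentLengthException("MyPairUDTF takes only one argument")
    val fieldNames = new util.ArrayList[String]
    val fieldOIs = new util.ArrayList[ObjectInspector]
    // Two output columns, "key" and "value", both strings
    fieldNames.add("key")
    fieldOIs.add(PrimitiveObjectInspectorFactory.javaStringObjectInspector)
    fieldNames.add("value")
    fieldOIs.add(PrimitiveObjectInspectorFactory.javaStringObjectInspector)
    ObjectInspectorFactory.getStandardStructObjectInspector(fieldNames, fieldOIs)
  }

  // For input like "a:1,b:2", emits the rows ("a", "1") and ("b", "2")
  override def process(args: Array[AnyRef]): Unit = {
    for (pair <- args(0).toString.split(",")) {
      val kv = pair.split(":", 2)
      forward(Array(kv(0), if (kv.length > 1) kv(1) else ""))
    }
  }
}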
object UDTFDemo {
  System.setProperty("hadoop.home.dir", "d:/software/hadoop-2.9.2")

  def main(args: Array[String]): Unit = {
    // Local path of the test data
    val file = "file:///d:/data/ads.txt"
    // Create the SparkSession
    val spark = SparkSession
      .builder
      .master("local")
      .appName("ads")
      .config("fs.defaultFS", "hdfs://hadoop-senior.test.com")
      .enableHiveSupport() // enable Hive support, required for CREATE TEMPORARY FUNCTION below
      .getOrCreate()
    // Read the CSV file directly with the SparkSession; the delimiter is set via the "sep" option
    val df = spark.read
      .option("header", "true")
      .option("sep", "\t")
      .csv(file)
    df.printSchema()
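    // Assuming ads.txt holds tab-separated pageid and adids columns with a header
    // row, printSchema() should report both columns as nullable strings:
    // root
    //  |-- pageid: string (nullable = true)
    //  |-- adids: string (nullable = true)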
    // Register the DataFrame as a temporary view so it can be queried with HQL
    df.createOrReplaceTempView("page_ads")
    // Register the UDTF. sparkSession.udf.register() cannot be used for a UDTF;
    // it must be registered through Hive's CREATE TEMPORARY FUNCTION syntax.
    spark.sql("create temporary function ads_explode as 'com.bw.spark.udf.MyUDTF'")
    // Apply the UDTF to the page_ads view. A bare call would return only the
    // generated column:
    // val udtfDF = spark.sql("select ads_explode(adids) from page_ads")
    // LATERAL VIEW joins each generated row back to its source row, keeping
    // pageid alongside the exploded adid
    val udtfDF = spark.sql(
      "SELECT pageid, adid FROM page_ads LATERAL VIEW ads_explode(adids) adTable AS adid"
    )
    udtfDF.show()
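    // With a sample row such as (pageid = "front_page", adids = "1,2,3")
    // (assumed data, for illustration only), the query yields one row per ad id:
    //   front_page  1
    //   front_page  2
    //   front_page  3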
    spark.close()
  }
}