74. Spark SQL: UDFs (User-Defined Functions) in Practice

UDFs in practice

UDF: User-Defined Function. A UDF is a custom function that you register with the SQL engine under a name and then call from SQL statements just like a built-in function; it is applied once per input row.

Java version

import java.util.ArrayList;
import java.util.List;

import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.Function;
import org.apache.spark.api.java.function.VoidFunction;
import org.apache.spark.sql.DataFrame;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.RowFactory;
import org.apache.spark.sql.SQLContext;
import org.apache.spark.sql.api.java.UDF1;
import org.apache.spark.sql.types.DataTypes;
import org.apache.spark.sql.types.StructField;
import org.apache.spark.sql.types.StructType;

public class UDF {
    public static void main(String[] args) {
        SparkConf conf = new SparkConf().setAppName("UDFJava").setMaster("local");
        JavaSparkContext sparkContext = new JavaSparkContext(conf);
        SQLContext sqlContext = new SQLContext(sparkContext);

        // Build an RDD of names, then wrap each String in a Row.
        List<String> stringList = new ArrayList<String>();
        stringList.add("Feng Xiangbin");
        stringList.add("Zhao Jun");
        stringList.add("Spark");
        stringList.add("Hadoop");
        JavaRDD<String> rdd = sparkContext.parallelize(stringList);
        JavaRDD<Row> nameRDD = rdd.map(new Function<String, Row>() {
            @Override
            public Row call(String v1) throws Exception {
                return RowFactory.create(v1);
            }
        });

        // Describe the schema: a single nullable string column called "name".
        List<StructField> fieldList = new ArrayList<StructField>();
        fieldList.add(DataTypes.createStructField("name", DataTypes.StringType, true));
        StructType structType = DataTypes.createStructType(fieldList);
        DataFrame dataFrame = sqlContext.createDataFrame(nameRDD, structType);

        // Register the DataFrame as a temporary table so SQL can query it.
        dataFrame.registerTempTable("name");

        // Register the UDF: strLen takes one String and returns its length.
        // In the Java API the result type (IntegerType) must be declared explicitly.
        sqlContext.udf().register("strLen", new UDF1<String, Integer>() {
            @Override
            public Integer call(String s) throws Exception {
                return s.length();
            }
        }, DataTypes.IntegerType);

        // Call the UDF by name inside SQL, just like a built-in function.
        sqlContext.sql("select name, strLen(name) from name").javaRDD().foreach(new VoidFunction<Row>() {
            @Override
            public void call(Row row) throws Exception {
                System.out.println(row);
            }
        });

        sparkContext.close();
    }
}
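Run locally, the foreach prints one Row per name together with its computed length, e.g. [Spark,5] and [Feng Xiangbin,13]; with more than one partition the order of the printed rows may vary.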

Scala version

import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.sql.{Row, SQLContext}
import org.apache.spark.sql.types.{StringType, StructField, StructType}

object UDF {
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().setAppName("UDFScala").setMaster("local")
    val sparkContext = new SparkContext(conf)
    val sqlContext = new SQLContext(sparkContext)

    val names = Array("Feng Xiangbin", "Zhao Jun", "Spark", "Hadoop")

    // Build an RDD[Row] and pair it with a one-column schema.
    val nameRDD = sparkContext.parallelize(names)
    val nameRowRDD = nameRDD.map(s => Row(s))
    val structType = StructType(Array(StructField("name", StringType, true)))

    val df = sqlContext.createDataFrame(nameRowRDD, structType)
    df.registerTempTable("name")

    // In Scala the return type is inferred from the function literal,
    // so no explicit DataType is needed when registering the UDF.
    sqlContext.udf.register("strLen", (str: String) => str.length)
    sqlContext.sql("select name, strLen(name) from name").rdd.foreach(row => println(row))

    sparkContext.stop()
  }
}
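One caveat with both versions: the schema declares the name column as nullable, yet strLen calls s.length unguarded, so a null value would throw a NullPointerException inside the UDF. Below is a minimal self-contained sketch of a null-safe variant (the names UDFNullSafe and strLenSafe are made up for illustration), together with the equivalent DataFrame-API form via org.apache.spark.sql.functions.udf, which needs no SQL registration at all:

import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.sql.{Row, SQLContext}
import org.apache.spark.sql.functions.{col, udf}
import org.apache.spark.sql.types.{StringType, StructField, StructType}

object UDFNullSafe {
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().setAppName("UDFNullSafe").setMaster("local")
    val sparkContext = new SparkContext(conf)
    val sqlContext = new SQLContext(sparkContext)

    // Include a null name to exercise the guard.
    val rows = sparkContext.parallelize(Seq(Row("Spark"), Row(null)))
    val schema = StructType(Array(StructField("name", StringType, true)))
    val df = sqlContext.createDataFrame(rows, schema)
    df.registerTempTable("name")

    // SQL registration as above, but returning 0 for null input.
    sqlContext.udf.register("strLenSafe", (s: String) => if (s == null) 0 else s.length)
    sqlContext.sql("select name, strLenSafe(name) from name").show()

    // The same logic as a DataFrame-API UDF: no SQL registration needed.
    val strLen = udf((s: String) => if (s == null) 0 else s.length)
    df.select(col("name"), strLen(col("name"))).show()

    sparkContext.stop()
  }
}

On Spark 2.x and later the same pattern goes through SparkSession instead: spark.udf.register(...) plus createOrReplaceTempView replace the SQLContext and registerTempTable calls shown here.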