UDF自定义函数实战
UDF(User Defined Function),即用户自定义函数:用户可以注册自己编写的函数,并在 Spark SQL 语句中像内置函数一样调用。
Java版本
/**
 * Spark SQL UDF example (Java API).
 *
 * Registers a user-defined function named "strLen" and calls it from a SQL
 * query over a small in-memory table called "name".
 */
public class UDF {

    /**
     * Builds a single-column DataFrame of names, registers it as the temp
     * table "name", registers the "strLen" UDF and prints every row of
     * {@code select name,strLen(name) from name}.
     *
     * @param args command-line arguments (unused)
     */
    public static void main(String[] args) {
        SparkConf conf = new SparkConf().setAppName("UDFJava").setMaster("local");
        JavaSparkContext sparkContext = new JavaSparkContext(conf);
        try {
            SQLContext sqlContext = new SQLContext(sparkContext);

            // Sample data: one string per row.
            List<String> stringList = new ArrayList<String>();
            stringList.add("Feng Xiangbin");
            stringList.add("Zhao Jun");
            stringList.add("Spark");
            stringList.add("Hadoop");
            JavaRDD<String> rdd = sparkContext.parallelize(stringList);

            // Wrap every string in a Row so a schema can be attached to the RDD.
            JavaRDD<Row> nameRDD = rdd.map(new Function<String, Row>() {
                @Override
                public Row call(String v1) throws Exception {
                    return RowFactory.create(v1);
                }
            });

            // Schema: a single nullable string column called "name".
            List<StructField> fieldList = new ArrayList<StructField>();
            fieldList.add(DataTypes.createStructField("name", DataTypes.StringType, true));
            StructType structType = DataTypes.createStructType(fieldList);

            DataFrame dataFrame = sqlContext.createDataFrame(nameRDD, structType);
            dataFrame.registerTempTable("name");

            // Register the UDF under the SQL name "strLen"; the last argument
            // declares the SQL return type of the function.
            sqlContext.udf().register("strLen", new UDF1<String, Integer>() {
                @Override
                public Integer call(String s) throws Exception {
                    // The "name" column is declared nullable, so map a NULL
                    // input to a NULL result instead of throwing an NPE.
                    return s == null ? null : Integer.valueOf(s.length());
                }
            }, DataTypes.IntegerType);

            sqlContext.sql("select name,strLen(name) from name").javaRDD().foreach(new VoidFunction<Row>() {
                @Override
                public void call(Row row) throws Exception {
                    System.out.println(row);
                }
            });
        } finally {
            // Always release the Spark context, even when the job fails.
            sparkContext.stop();
        }
    }
}
Scala版本
/**
 * Spark SQL UDF example (Scala API).
 *
 * Registers a user-defined function named "strLen" and calls it from a SQL
 * query over a small in-memory table called "name".
 */
object UDF {

  /**
   * Builds a single-column DataFrame of names, registers it as the temp
   * table "name", registers the "strLen" UDF and prints every row of
   * `select name, strLen(name) from name`.
   *
   * @param args command-line arguments (unused)
   */
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().setAppName("UDFScala").setMaster("local")
    val sparkContext = new SparkContext(conf)
    try {
      val sqlContext = new SQLContext(sparkContext)

      // Sample data: one string per row, wrapped in a Row so that a schema
      // can be attached to the RDD.
      val name = Array("Feng Xiangbin", "Zhao Jun", "Spark", "Hadoop")
      val nameRDD = sparkContext.parallelize(name)
      val nameRowRDD = nameRDD.map(s => Row(s))

      // Schema: a single nullable string column called "name".
      val structType = StructType(Array(StructField("name", StringType, true)))
      val df = sqlContext.createDataFrame(nameRowRDD, structType)
      df.registerTempTable("name")

      // The "name" column is declared nullable, so the UDF maps a NULL input
      // to a NULL result instead of throwing a NullPointerException. The boxed
      // java.lang.Integer return type is what allows a SQL NULL to be returned.
      val strLen: String => java.lang.Integer =
        str => Option(str).map(s => Integer.valueOf(s.length)).orNull
      sqlContext.udf.register("strLen", strLen)

      sqlContext.sql("select name, strLen(name) from name").rdd.foreach(row => println(row))
    } finally {
      // Always release the Spark context, even when the job fails.
      sparkContext.stop()
    }
  }
}