Problem Code
SparkSessionJavaTest.java
package sparkSQL.apachedemo;

import org.apache.log4j.Level;
import org.apache.log4j.Logger;
import org.apache.spark.SparkConf;
import org.apache.spark.sql.*;
import org.apache.spark.sql.expressions.Aggregator;

import java.io.Serializable;

/**
 * @Classname SparkSessionJavaTest
 * @Date 2019/11/14 7:00 PM
 * @Author hadoop
 * @Description: SparkSQL, Java version
 */
public class SparkSessionJavaTest {

    public static void main(String[] args) {
        Logger.getLogger("org").setLevel(Level.INFO);
        SparkConf conf = new SparkConf()
                .setAppName("SparkSessionJavaTest")
                .setMaster("local[2]");
        SparkSession spark = SparkSession
                .builder()
                .config(conf)
                // .enableHiveSupport()
                .getOrCreate();

        String filePath = "file:/usr/local/spark/examples/src/main/resources/";
        typeSafeUserDefinedAggregateFunction(spark, filePath);

        spark.stop();
    }

    /**
     * Aggregation with a type-safe user-defined aggregate function.
     *
     * @param spark
     * @param filePath
     */
    private static void typeSafeUserDefinedAggregateFunction(SparkSession spark, String filePath) {
        Encoder<Employee> employeeEncoder = Encoders.bean(Employee.class);
        String path = filePath + "employees.json";
        Dataset<Employee> ds = spark.read().json(path).as(employeeEncoder);
        ds.show();

        MyAverage2 myAverage = new MyAverage2();
        // Convert the function to a `TypedColumn` and give it a name
        TypedColumn<Employee, Double> averageSalary = myAverage.toColumn().name("average_salary");
        Dataset<Double> result = ds.select(averageSalary);
        result.show();
    }

    /**
     * Employee inner class
     */
    public static class Employee implements Serializable {
        private String name;
        private long salary;

        public Employee(String name, long salary) {
            this.name = name;
            this.salary = salary;
        }

        public String getName() { return name; }
        public void setName(String name) { this.name = name; }
        public long getSalary() { return salary; }
        public void setSalary(long salary) { this.salary = salary; }
    }

    /**
     * Average inner class
     */
    public static class Average implements Serializable {
        private long sum;
        private long count;

        public Average(long sum, long count) {
            this.sum = sum;
            this.count = count;
        }

        public long getSum() { return sum; }
        public void setSum(long sum) { this.sum = sum; }
        public long getCount() { return count; }
        public void setCount(long count) { this.count = count; }
    }

    public static class MyAverage2 extends Aggregator<Employee, Average, Double> {

        // A zero value for this aggregation. Should satisfy the property that any b + zero = b
        public Average zero() {
            return new Average(0L, 0L);
        }

        // Combine two values to produce a new value. For performance, the function may modify `buffer`
        // and return it instead of constructing a new object
        public Average reduce(Average buffer, Employee employee) {
            long newSum = buffer.getSum() + employee.getSalary();
            long newCount = buffer.getCount() + 1;
            buffer.setSum(newSum);
            buffer.setCount(newCount);
            return buffer;
        }

        // Merge two intermediate values
        public Average merge(Average b1, Average b2) {
            long mergeSum = b1.getSum() + b2.getSum();
            long mergeCount = b1.getCount() + b2.getCount();
            b1.setSum(mergeSum);
            b1.setCount(mergeCount);
            return b1;
        }

        // Transform the output of the reduction
        public Double finish(Average reduction) {
            return ((double) reduction.getSum()) / reduction.getCount();
        }

        // Specifies the Encoder for the intermediate value type
        public Encoder<Average> bufferEncoder() {
            return Encoders.bean(Average.class);
        }

        // Specifies the Encoder for the final output value type
        public Encoder<Double> outputEncoder() {
            return Encoders.DOUBLE();
        }
    }
}
The Problem:
While working through the Type-Safe User-Defined Aggregate Functions example from the Spark SQL documentation, I ran into the following error:
19/11/15 14:26:36 ERROR CodeGenerator: failed to compile: org.codehaus.commons.compiler.CompileException: File 'generated.java', Line 37, Column 85: No applicable constructor/method found for zero actual parameters; candidates are: "sparkSQL.apachedemo.SparkSessionJavaTest$Employee(java.lang.String, long)"
org.codehaus.commons.compiler.CompileException: File 'generated.java', Line 37, Column 85: No applicable constructor/method found for zero actual parameters; candidates are: "sparkSQL.apachedemo.SparkSessionJavaTest$Employee(java.lang.String, long)"
.......(omitted)
19/11/15 14:26:36 INFO CodeGenerator:
/* 001 */ public java.lang.Object generate(Object[] references) {
/* 002 */ return new SpecificSafeProjection(references);
/* 003 */ }
/* 004 */
/* 005 */ class SpecificSafeProjection extends org.apache.spark.sql.catalyst.expressions.codegen.BaseProjection {
/* 006 */
/* 007 */ private Object[] references;
/* 008 */ private InternalRow mutableRow;
/* 009 */
/* 010 */
/* 011 */ public SpecificSafeProjection(Object[] references) {
/* 012 */ this.references = references;
/* 013 */ mutableRow = (InternalRow) references[references.length - 1];
/* 014 */
/* 015 */ }
/* 016 */
/* 017 */ public void initialize(int partitionIndex) {
/* 018 */
/* 019 */ }
/* 020 */
/* 021 */ public java.lang.Object apply(java.lang.Object _i) {
/* 022 */ InternalRow i = (InternalRow) _i;
/* 023 */
/* 024 */ sparkSQL.apachedemo.SparkSessionJavaTest$Employee value_6 = InitializeJavaBean_0(i);
/* 025 */ if (false) {
/* 026 */ mutableRow.setNullAt(0);
/* 027 */ } else {
/* 028 */
/* 029 */ mutableRow.update(0, value_6);
/* 030 */ }
/* 031 */
/* 032 */ return mutableRow;
/* 033 */ }
/* 034 */
/* 035 */
/* 036 */ private sparkSQL.apachedemo.SparkSessionJavaTest$Employee InitializeJavaBean_0(InternalRow i) {
/* 037 */ final sparkSQL.apachedemo.SparkSessionJavaTest$Employee value_1 = false ? null : new sparkSQL.apachedemo.SparkSessionJavaTest$Employee();
/* 038 */ sparkSQL.apachedemo.SparkSessionJavaTest$Employee javaBean_0 = value_1;
/* 039 */ if (!false) {
/* 040 */
/* 041 */
/* 042 */ boolean isNull_3 = i.isNullAt(0);
/* 043 */ UTF8String value_3 = isNull_3 ? null : (i.getUTF8String(0));
/* 044 */ boolean isNull_2 = true;
/* 045 */ java.lang.String value_2 = null;
/* 046 */ if (!isNull_3) {
/* 047 */
/* 048 */ isNull_2 = false;
/* 049 */ if (!isNull_2) {
/* 050 */
/* 051 */ Object funcResult_0 = null;
/* 052 */ funcResult_0 = value_3.toString();
/* 053 */
/* 054 */ if (funcResult_0 != null) {
/* 055 */ value_2 = (java.lang.String) funcResult_0;
/* 056 */ } else {
/* 057 */ isNull_2 = true;
/* 058 */ }
/* 059 */
/* 060 */
/* 061 */ }
/* 062 */ }
/* 063 */ javaBean_0.setName(value_2);
/* 064 */
/* 065 */
/* 066 */ boolean isNull_5 = i.isNullAt(1);
/* 067 */ long value_5 = isNull_5 ? -1L : (i.getLong(1));
/* 068 */
/* 069 */ if (isNull_5) {
/* 070 */ throw new NullPointerException(((java.lang.String) references[0] /* errMsg */));
/* 071 */ }
/* 072 */ javaBean_0.setSalary(value_5);
/* 073 */
/* 074 */ }
/* 075 */
/* 076 */ return value_1;
/* 077 */ }
/* 078 */
/* 079 */ }
19/11/15 14:26:37 ERROR Executor: Exception in task 0.0 in stage 2.0 (TID 2)
java.util.concurrent.ExecutionException: org.codehaus.commons.compiler.CompileException: File 'generated.java', Line 37, Column 85: failed to compile: org.codehaus.commons.compiler.CompileException: File 'generated.java', Line 37, Column 85: No applicable constructor/method found for zero actual parameters; candidates are:...(omitted)
Locating the Problem
File 'generated.java', Line 37, Column 85: failed to compile
/* 037 */ final sparkSQL.apachedemo.SparkSessionJavaTest$Employee value_1 = false ? null : new sparkSQL.apachedemo.SparkSessionJavaTest$Employee();
As the generated code shows, Spark calls a no-argument constructor of the Employee class at runtime, but the class only defines the parameterized constructor:

public Employee(String name, long salary) {
    this.name = name;
    this.salary = salary;
}

Comment out this constructor so that the compiler-supplied default constructor is used, and the error disappears.
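Incidentally, deleting the parameterized constructor is not the only way out. Since the bean encoder only needs a public no-argument constructor plus getters and setters, keeping the convenience constructor and adding an explicit no-arg one next to it should also work. A sketch of that variant (my own alternative, not the fix used below):

public static class Employee implements Serializable {
    private String name;
    private long salary;

    // Explicit no-arg constructor: once any constructor is declared, the
    // compiler no longer generates the default one, so the bean encoder's
    // generated `new Employee()` needs this.
    public Employee() {
    }

    // The convenience constructor can coexist with the no-arg one.
    public Employee(String name, long salary) {
        this.name = name;
        this.salary = salary;
    }

    public String getName() { return name; }
    public void setName(String name) { this.name = name; }
    public long getSalary() { return salary; }
    public void setSalary(long salary) { this.salary = salary; }
}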
Not done yet?
But the Average class also defines a parameterized constructor. Will it hit the same problem? Checking it also serves to verify whether the analysis above is correct.
(Aha, the original exception is indeed gone, but a new one has been thrown.)
19/11/15 14:50:06 ERROR CodeGenerator: failed to compile: org.codehaus.commons.compiler.CompileException: File 'generated.java', Line 24, Column 84: No applicable constructor/method found for zero actual parameters; candidates are: "sparkSQL.apachedemo.SparkSessionJavaTest$Average(long, long)"
org.codehaus.commons.compiler.CompileException: File 'generated.java', Line 24, Column 84: No applicable constructor/method found for zero actual parameters; candidates are: "sparkSQL.apachedemo.SparkSessionJavaTest$Average(long, long)"
.......(omitted)
19/11/15 14:50:06 INFO CodeGenerator:
/* 001 */ public java.lang.Object generate(Object[] references) {
/* 002 */ return new SpecificSafeProjection(references);
/* 003 */ }
/* 004 */
/* 005 */ class SpecificSafeProjection extends org.apache.spark.sql.catalyst.expressions.codegen.BaseProjection {
/* 006 */
/* 007 */ private Object[] references;
/* 008 */ private InternalRow mutableRow;
/* 009 */
/* 010 */
/* 011 */ public SpecificSafeProjection(Object[] references) {
/* 012 */ this.references = references;
/* 013 */ mutableRow = (InternalRow) references[references.length - 1];
/* 014 */
/* 015 */ }
/* 016 */
/* 017 */ public void initialize(int partitionIndex) {
/* 018 */
/* 019 */ }
/* 020 */
/* 021 */ public java.lang.Object apply(java.lang.Object _i) {
/* 022 */ InternalRow i = (InternalRow) _i;
/* 023 */
/* 024 */ final sparkSQL.apachedemo.SparkSessionJavaTest$Average value_1 = false ? null : new sparkSQL.apachedemo.SparkSessionJavaTest$Average();
/* 025 */ sparkSQL.apachedemo.SparkSessionJavaTest$Average javaBean_0 = value_1;
/* 026 */ if (!false) {
/* 027 */
/* 028 */
/* 029 */ long value_3 = i.getLong(0);
/* 030 */
/* 031 */ if (false) {
/* 032 */ throw new NullPointerException(((java.lang.String) references[0] /* errMsg */));
/* 033 */ }
/* 034 */ javaBean_0.setCount(value_3);
/* 035 */
/* 036 */
/* 037 */ long value_5 = i.getLong(1);
/* 038 */
/* 039 */ if (false) {
/* 040 */ throw new NullPointerException(((java.lang.String) references[1] /* errMsg */));
/* 041 */ }
/* 042 */ javaBean_0.setSum(value_5);
/* 043 */
/* 044 */ }
/* 045 */ if (false) {
/* 046 */ mutableRow.setNullAt(0);
/* 047 */ } else {
/* 048 */
/* 049 */ mutableRow.update(0, value_1);
/* 050 */ }
/* 051 */
/* 052 */ return mutableRow;
/* 053 */ }
/* 054 */
/* 055 */
/* 056 */ }
19/11/15 14:50:06 ERROR Executor: Exception in task 0.0 in stage 3.0 (TID 3)
java.util.concurrent.ExecutionException: org.codehaus.commons.compiler.CompileException: File 'generated.java', Line 24, Column 84: failed to compile: org.codehaus.commons.compiler.CompileException: File 'generated.java', Line 24, Column 84: No applicable constructor/method found for zero actual parameters; candidates are: "sparkSQL.apachedemo.SparkSessionJavaTest$Average(long, long)"...(omitted)
Locating the Problem
File 'generated.java', Line 24, Column 84: failed to compile
/* 024 */ final sparkSQL.apachedemo.SparkSessionJavaTest$Average value_1 = false ? null : new sparkSQL.apachedemo.SparkSessionJavaTest$Average();
Sure enough, the generated code is compiled against a no-argument constructor while the class only declares a parameterized one, which is what causes the exception.
Comment out the parameterized constructor below and fall back to the default constructor:

public Average(long sum, long count) {
    this.sum = sum;
    this.count = count;
}

Then modify the zero function accordingly:

public Average zero() {
    Average average = new Average();
    average.setSum(0L);
    average.setCount(0L);
    return average;
}
With that, the problem is completely resolved.
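The generated SpecificSafeProjection above makes the root cause visible: it first instantiates the bean with new Average() and only then populates it through setters. Spark compiles that generated code directly rather than going through java.lang.reflect, but the requirement is the same: the bean class must expose a no-argument constructor. A small standalone sketch of the same pattern (an illustration with hypothetical demo classes, not Spark's actual code):

import java.lang.reflect.Constructor;

public class NoArgCtorDemo {
    // Shaped like the original Average: only a parameterized constructor is
    // declared, so no implicit no-arg constructor exists.
    static class OnlyArgsBean {
        private long sum;
        OnlyArgsBean(long sum) { this.sum = sum; }
    }

    // Shaped like the fixed Average: no constructor is declared, so the
    // compiler supplies a default no-arg one.
    static class DefaultCtorBean {
        private long sum;
        public void setSum(long sum) { this.sum = sum; }
        public long getSum() { return sum; }
    }

    public static void main(String[] args) throws Exception {
        // Same shape as the generated code: instantiate first, then call setters.
        DefaultCtorBean ok = DefaultCtorBean.class.getDeclaredConstructor().newInstance();
        ok.setSum(0L);
        System.out.println("Instantiated and populated: sum = " + ok.getSum());

        // With only OnlyArgsBean(long) declared, there is no zero-argument
        // constructor to resolve, which is exactly what the codegen error reports.
        try {
            OnlyArgsBean bad = OnlyArgsBean.class.getDeclaredConstructor().newInstance();
        } catch (NoSuchMethodException e) {
            System.out.println("No no-arg constructor: " + e.getMessage());
        }
    }
}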
Corrected Code
SparkSessionJavaTest.java
package sparkSQL.apachedemo;

import org.apache.log4j.Level;
import org.apache.log4j.Logger;
import org.apache.spark.SparkConf;
import org.apache.spark.sql.*;
import org.apache.spark.sql.expressions.Aggregator;

import java.io.Serializable;

/**
 * @Classname SparkSessionJavaTest
 * @Date 2019/11/14 7:00 PM
 * @Author hadoop
 * @Description: SparkSQL, Java version
 */
public class SparkSessionJavaTest {

    public static void main(String[] args) {
        Logger.getLogger("org").setLevel(Level.INFO);
        SparkConf conf = new SparkConf()
                .setAppName("SparkSessionJavaTest")
                .setMaster("local[2]");
        SparkSession spark = SparkSession
                .builder()
                .config(conf)
                // .enableHiveSupport()
                .getOrCreate();

        String filePath = "file:/usr/local/spark/examples/src/main/resources/";
        typeSafeUserDefinedAggregateFunction(spark, filePath);

        spark.stop();
    }

    /**
     * Aggregation with a type-safe user-defined aggregate function.
     *
     * @param spark
     * @param filePath
     */
    private static void typeSafeUserDefinedAggregateFunction(SparkSession spark, String filePath) {
        Encoder<Employee> employeeEncoder = Encoders.bean(Employee.class);
        String path = filePath + "employees.json";
        Dataset<Employee> ds = spark.read().json(path).as(employeeEncoder);
        ds.show();

        MyAverage2 myAverage = new MyAverage2();
        // Convert the function to a `TypedColumn` and give it a name
        TypedColumn<Employee, Double> averageSalary = myAverage.toColumn().name("average_salary");
        Dataset<Double> result = ds.select(averageSalary);
        result.show();
    }

    /**
     * Employee inner class
     */
    public static class Employee implements Serializable {
        private String name;
        private long salary;

        // public Employee(String name, long salary) {
        //     this.name = name;
        //     this.salary = salary;
        // }

        public String getName() { return name; }
        public void setName(String name) { this.name = name; }
        public long getSalary() { return salary; }
        public void setSalary(long salary) { this.salary = salary; }
    }

    /**
     * Average inner class
     */
    public static class Average implements Serializable {
        private long sum;
        private long count;

        // public Average(long sum, long count) {
        //     this.sum = sum;
        //     this.count = count;
        // }

        public long getSum() { return sum; }
        public void setSum(long sum) { this.sum = sum; }
        public long getCount() { return count; }
        public void setCount(long count) { this.count = count; }
    }

    public static class MyAverage2 extends Aggregator<Employee, Average, Double> {

        // A zero value for this aggregation. Should satisfy the property that any b + zero = b
        public Average zero() {
            Average average = new Average();
            average.setSum(0L);
            average.setCount(0L);
            return average;
        }

        // Combine two values to produce a new value. For performance, the function may modify `buffer`
        // and return it instead of constructing a new object
        public Average reduce(Average buffer, Employee employee) {
            long newSum = buffer.getSum() + employee.getSalary();
            long newCount = buffer.getCount() + 1;
            buffer.setSum(newSum);
            buffer.setCount(newCount);
            return buffer;
        }

        // Merge two intermediate values
        public Average merge(Average b1, Average b2) {
            long mergeSum = b1.getSum() + b2.getSum();
            long mergeCount = b1.getCount() + b2.getCount();
            b1.setSum(mergeSum);
            b1.setCount(mergeCount);
            return b1;
        }

        // Transform the output of the reduction
        public Double finish(Average reduction) {
            return ((double) reduction.getSum()) / reduction.getCount();
        }

        // Specifies the Encoder for the intermediate value type
        public Encoder<Average> bufferEncoder() {
            return Encoders.bean(Average.class);
        }

        // Specifies the Encoder for the final output value type
        public Encoder<Double> outputEncoder() {
            return Encoders.DOUBLE();
        }
    }
}
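For reference, the employees.json that ships under Spark's examples/src/main/resources contains the records below, so a successful run should report an average_salary of 3750.0, matching the output shown in the official example:

{"name":"Michael", "salary":3000}
{"name":"Andy", "salary":4500}
{"name":"Justin", "salary":3500}
{"name":"Berta", "salary":4000}

+--------------+
|average_salary|
+--------------+
|        3750.0|
+--------------+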
Summary
This was the first time I had run into a problem like this, and I only pinpointed it after a lot of slow exploration. Why such a long detour? Because I had set the log level to Logger.getLogger("org").setLevel(Level.ERROR), the logs never revealed where the problem actually was. After changing it to Logger.getLogger("org").setLevel(Level.INFO), the full logs became visible and the problem was easy to find. That is worth keeping in mind in the future.