Problem Code
SparkSessionJavaTest.java
package sparkSQL.apachedemo;

import org.apache.log4j.Level;
import org.apache.log4j.Logger;
import org.apache.spark.SparkConf;
import org.apache.spark.sql.*;
import org.apache.spark.sql.expressions.Aggregator;

import java.io.Serializable;

/**
 * @Classname SparkSessionJavaTest
 * @Date 2019/11/14 7:00 PM
 * @Author hadoop
 * @Description: SparkSQL, Java version
 */
public class SparkSessionJavaTest {

    public static void main(String[] args) {
        Logger.getLogger("org").setLevel(Level.INFO);
        SparkConf conf = new SparkConf()
                .setAppName("SparkSessionJavaTest")
                .setMaster("local[2]");
        SparkSession spark = SparkSession
                .builder()
                .config(conf)
                // .enableHiveSupport()
                .getOrCreate();

        String filePath = "file:/usr/local/spark/examples/src/main/resources/";
        typeSafeUserDefinedAggregateFunction(spark, filePath);

        spark.stop();
    }

    /**
     * Aggregation with a type-safe user-defined aggregate function.
     *
     * @param spark
     * @param filePath
     */
    private static void typeSafeUserDefinedAggregateFunction(SparkSession spark, String filePath) {
        Encoder<Employee> employeeEncoder = Encoders.bean(Employee.class);
        String path = filePath + "employees.json";
        Dataset<Employee> ds = spark.read().json(path).as(employeeEncoder);
        ds.show();

        MyAverage2 myAverage = new MyAverage2();
        // Convert the function to a `TypedColumn` and give it a name
        TypedColumn<Employee, Double> averageSalary = myAverage.toColumn().name("average_salary");
        Dataset<Double> result = ds.select(averageSalary);
        result.show();
    }

    /**
     * Employee inner class
     */
    public static class Employee implements Serializable {
        private String name;
        private long salary;

        public Employee(String name, long salary) {
            this.name = name;
            this.salary = salary;
        }

        public String getName() { return name; }
        public void setName(String name) { this.name = name; }
        public long getSalary() { return salary; }
        public void setSalary(long salary) { this.salary = salary; }
    }

    /**
     * Average inner class
     */
    public static class Average implements Serializable {
        private long sum;
        private long count;

        public Average(long sum, long count) {
            this.sum = sum;
            this.count = count;
        }

        public long getSum() { return sum; }
        public void setSum(long sum) { this.sum = sum; }
        public long getCount() { return count; }
        public void setCount(long count) { this.count = count; }
    }

    public static class MyAverage2 extends Aggregator<Employee, Average, Double> {

        // A zero value for this aggregation. Should satisfy the property that any b + zero = b
        public Average zero() {
            return new Average(0L, 0L);
        }

        // Combine two values to produce a new value. For performance, the function may modify `buffer`
        // and return it instead of constructing a new object
        public Average reduce(Average buffer, Employee employee) {
            long newSum = buffer.getSum() + employee.getSalary();
            long newCount = buffer.getCount() + 1;
            buffer.setSum(newSum);
            buffer.setCount(newCount);
            return buffer;
        }

        // Merge two intermediate values
        public Average merge(Average b1, Average b2) {
            long mergeSum = b1.getSum() + b2.getSum();
            long mergeCount = b1.getCount() + b2.getCount();
            b1.setSum(mergeSum);
            b1.setCount(mergeCount);
            return b1;
        }

        // Transform the output of the reduction
        public Double finish(Average reduction) {
            return ((double) reduction.getSum()) / reduction.getCount();
        }

        // Specifies the Encoder for the intermediate value type
        public Encoder<Average> bufferEncoder() {
            return Encoders.bean(Average.class);
        }

        // Specifies the Encoder for the final output value type
        public Encoder<Double> outputEncoder() {
            return Encoders.DOUBLE();
        }
    }
}
The Problem:
While working through the Type-Safe User-Defined Aggregate Functions example from the Spark SQL documentation, I ran into the following error:
19/11/15 14:26:36 ERROR CodeGenerator: failed to compile: org.codehaus.commons.compiler.CompileException: File 'generated.java', Line 37, Column 85: No applicable constructor/method found for zero actual parameters; candidates are: "sparkSQL.apachedemo.SparkSessionJavaTest$Employee(java.lang.String, long)"
org.codehaus.commons.compiler.CompileException: File 'generated.java', Line 37, Column 85: No applicable constructor/method found for zero actual parameters; candidates are: "sparkSQL.apachedemo.SparkSessionJavaTest$Employee(java.lang.String, long)"
.......(omitted)
19/11/15 14:26:36 INFO CodeGenerator:
/* 001 */ public java.lang.Object generate(Object[] references) {
/* 002 */ return new SpecificSafeProjection(references);
/* 003 */ }
/* 004 */
/* 005 */ class SpecificSafeProjection extends org.apache.spark.sql.catalyst.expressions.codegen.BaseProjection {
/* 006 */
/* 007 */ private Object[] references;
/* 008 */ private InternalRow mutableRow;
/* 009 */
/* 010 */
/* 011 */ public SpecificSafeProjection(Object[] references) {
/* 012 */ this.references = references;
/* 013 */ mutableRow = (InternalRow) references[references.length - 1];
/* 014 */
/* 015 */ }
/* 016 */
/* 017 */ public void initialize(int partitionIndex) {
/* 018 */
/* 019 */ }
/* 020 */
/* 021 */ public java.lang.Object apply(java.lang.Object _i) {
/* 022 */ InternalRow i = (InternalRow) _i;
/* 023 */
/* 024 */ sparkSQL.apachedemo.SparkSessionJavaTest$Employee value_6 = InitializeJavaBean_0(i);
/* 025 */ if (false) {
/* 026 */ mutableRow.setNullAt(0);
/* 027 */ } else {
/* 028 */
/* 029 */ mutableRow.update(0, value_6);
/* 030 */ }
/* 031 */
/* 032 */ return mutableRow;
/* 033 */ }
/* 034 */
/* 035 */
/* 036 */ private sparkSQL.apachedemo.SparkSessionJavaTest$Employee InitializeJavaBean_0(InternalRow i) {
/* 037 */ final sparkSQL.apachedemo.SparkSessionJavaTest$Employee value_1 = false ? null : new sparkSQL.apachedemo.SparkSessionJavaTest$Employee();
/* 038 */ sparkSQL.apachedemo.SparkSessionJavaTest$Employee javaBean_0 = value_1;
/* 039 */ if (!false) {
/* 040 */
/* 041 */
/* 042 */ boolean isNull_3 = i.isNullAt(0);
/* 043 */ UTF8String value_3 = isNull_3 ? null : (i.getUTF8String(0));
/* 044 */ boolean isNull_2 = true;
/* 045 */ java.lang.String value_2 = null;
/* 046 */ if (!isNull_3) {
/* 047 */
/* 048 */ isNull_2 = false;
/* 049 */ if (!isNull_2) {
/* 050 */
/* 051 */ Object funcResult_0 = null;
/* 052 */ funcResult_0 = value_3.toString();
/* 053 */
/* 054 */ if (funcResult_0 != null) {
/* 055 */ value_2 = (java.lang.String) funcResult_0;
/* 056 */ } else {
/* 057 */ isNull_2 = true;
/* 058 */ }
/* 059 */
/* 060 */
/* 061 */ }
/* 062 */ }
/* 063 */ javaBean_0.setName(value_2);
/* 064 */
/* 065 */
/* 066 */ boolean isNull_5 = i.isNullAt(1);
/* 067 */ long value_5 = isNull_5 ? -1L : (i.getLong(1));
/* 068 */
/* 069 */ if (isNull_5) {
/* 070 */ throw new NullPointerException(((java.lang.String) references[0] /* errMsg */));
/* 071 */ }
/* 072 */ javaBean_0.setSalary(value_5);
/* 073 */
/* 074 */ }
/* 075 */
/* 076 */ return value_1;
/* 077 */ }
/* 078 */
/* 079 */ }
19/11/15 14:26:37 ERROR Executor: Exception in task 0.0 in stage 2.0 (TID 2)
java.util.concurrent.ExecutionException: org.codehaus.commons.compiler.CompileException: File 'generated.java', Line 37, Column 85: failed to compile: org.codehaus.commons.compiler.CompileException: File 'generated.java', Line 37, Column 85: No applicable constructor/method found for zero actual parameters; candidates are:...(omitted)
Locating the Problem
File 'generated.java', Line 37, Column 85: failed to compile
/* 037 */ final sparkSQL.apachedemo.SparkSessionJavaTest$Employee value_1 = false ? null : new sparkSQL.apachedemo.SparkSessionJavaTest$Employee();
As the generated code shows, Spark calls a no-argument constructor of the Employee class at runtime, but the class only defines the parameterized constructor:

public Employee(String name, long salary) {
    this.name = name;
    this.salary = salary;
}

Comment out this constructor so that the compiler-supplied default constructor is used, and the error disappears.
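Incidentally, deleting the parameterized constructor is not the only way out. Since the bean encoder only needs a public no-argument constructor plus getters and setters, keeping the convenience constructor and adding an explicit no-arg one next to it should also work. A sketch of that variant (my own alternative, not the fix used below):

public static class Employee implements Serializable {
    private String name;
    private long salary;

    // Explicit no-arg constructor: once any constructor is declared, the
    // compiler no longer generates the default one, so the bean encoder's
    // generated `new Employee()` needs this.
    public Employee() {
    }

    // The convenience constructor can coexist with the no-arg one.
    public Employee(String name, long salary) {
        this.name = name;
        this.salary = salary;
    }

    public String getName() { return name; }
    public void setName(String name) { this.name = name; }
    public long getSalary() { return salary; }
    public void setSalary(long salary) { this.salary = salary; }
}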
Not done yet?
But the Average class also defines a parameterized constructor. Will it hit the same problem? Checking it also serves to verify whether the analysis above is correct.
(Aha, the original exception is indeed gone, but a new one has been thrown.)
19/11/15 14:50:06 ERROR CodeGenerator: failed to compile: org.codehaus.commons.compiler.CompileException: File 'generated.java', Line 24, Column 84: No applicable constructor/method found for zero actual parameters; candidates are: "sparkSQL.apachedemo.SparkSessionJavaTest$Average(long, long)"
org.codehaus.commons.compiler.CompileException: File 'generated.java', Line 24, Column 84: No applicable constructor/method found for zero actual parameters; candidates are: "sparkSQL.apachedemo.SparkSessionJavaTest$Average(long, long)"
.......(omitted)
19/11/15 14:50:06 INFO CodeGenerator:
/* 001 */ public java.lang.Object generate(Object[] references) {
/* 002 */ return new SpecificSafeProjection(references);
/* 003 */ }
/* 004 */
/* 005 */ class SpecificSafeProjection extends org.apache.spark.sql.catalyst.expressions.codegen.BaseProjection {
/* 006 */
/* 007 */ private Object[] references;
/* 008 */ private InternalRow mutableRow;
/* 009 */
/* 010 */
/* 011 */ public SpecificSafeProjection(Object[] references) {
/* 012 */ this.references = references;
/* 013 */ mutableRow = (InternalRow) references[references.length - 1];
/* 014 */
/* 015 */ }
/* 016 */
/* 017 */ public void initialize(int partitionIndex) {
/* 018 */
/* 019 */ }
/* 020 */
/* 021 */ public java.lang.Object apply(java.lang.Object _i) {
/* 022 */ InternalRow i = (InternalRow) _i;
/* 023 */
/* 024 */ final sparkSQL.apachedemo.SparkSessionJavaTest$Average value_1 = false ? null : new sparkSQL.apachedemo.SparkSessionJavaTest$Average();
/* 025 */ sparkSQL.apachedemo.SparkSessionJavaTest$Average javaBean_0 = value_1;
/* 026 */ if (!false) {
/* 027 */
/* 028 */
/* 029 */ long value_3 = i.getLong(0);
/* 030 */
/* 031 */ if (false) {
/* 032 */ throw new NullPointerException(((java.lang.String) references[0] /* errMsg */));
/* 033 */ }
/* 034 */ javaBean_0.setCount(value_3);
/* 035 */
/* 036 */
/* 037 */ long value_5 = i.getLong(1);
/* 038 */
/* 039 */ if (false) {
/* 040 */ throw new NullPointerException(((java.lang.String) references[1] /* errMsg */));
/* 041 */ }
/* 042 */ javaBean_0.setSum(value_5);
/* 043 */
/* 044 */ }
/* 045 */ if (false) {
/* 046 */ mutableRow.setNullAt(0);
/* 047 */ } else {
/* 048 */
/* 049 */ mutableRow.update(0, value_1);
/* 050 */ }
/* 051 */
/* 052 */ return mutableRow;
/* 053 */ }
/* 054 */
/* 055 */
/* 056 */ }
19/11/15 14:50:06 ERROR Executor: Exception in task 0.0 in stage 3.0 (TID 3)
java.util.concurrent.ExecutionException: org.codehaus.commons.compiler.CompileException: File 'generated.java', Line 24, Column 84: failed to compile: org.codehaus.commons.compiler.CompileException: File 'generated.java', Line 24, Column 84: No applicable constructor/method found for zero actual parameters; candidates are: "sparkSQL.apachedemo.SparkSessionJavaTest$Average(long, long)"...(omitted)
Locating the Problem
File 'generated.java', Line 24, Column 84: failed to compile
/* 024 */ final sparkSQL.apachedemo.SparkSessionJavaTest$Average value_1 = false ? null : new sparkSQL.apachedemo.SparkSessionJavaTest$Average();
Sure enough, the generated code is compiled against a no-argument constructor while the class only declares a parameterized one, which is what causes the exception.
Comment out the parameterized constructor below and fall back to the default constructor:

public Average(long sum, long count) {
    this.sum = sum;
    this.count = count;
}

Then modify the zero function accordingly:

public Average zero() {
    Average average = new Average();
    average.setSum(0L);
    average.setCount(0L);
    return average;
}
With that, the problem is completely resolved.
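The generated SpecificSafeProjection above makes the root cause visible: it first instantiates the bean with new Average() and only then populates it through setters. Spark compiles that generated code directly rather than going through java.lang.reflect, but the requirement is the same: the bean class must expose a no-argument constructor. A small standalone sketch of the same pattern (an illustration with hypothetical demo classes, not Spark's actual code):

import java.lang.reflect.Constructor;

public class NoArgCtorDemo {
    // Shaped like the original Average: only a parameterized constructor is
    // declared, so no implicit no-arg constructor exists.
    static class OnlyArgsBean {
        private long sum;
        OnlyArgsBean(long sum) { this.sum = sum; }
    }

    // Shaped like the fixed Average: no constructor is declared, so the
    // compiler supplies a default no-arg one.
    static class DefaultCtorBean {
        private long sum;
        public void setSum(long sum) { this.sum = sum; }
        public long getSum() { return sum; }
    }

    public static void main(String[] args) throws Exception {
        // Same shape as the generated code: instantiate first, then call setters.
        DefaultCtorBean ok = DefaultCtorBean.class.getDeclaredConstructor().newInstance();
        ok.setSum(0L);
        System.out.println("Instantiated and populated: sum = " + ok.getSum());

        // With only OnlyArgsBean(long) declared, there is no zero-argument
        // constructor to resolve, which is exactly what the codegen error reports.
        try {
            OnlyArgsBean bad = OnlyArgsBean.class.getDeclaredConstructor().newInstance();
        } catch (NoSuchMethodException e) {
            System.out.println("No no-arg constructor: " + e.getMessage());
        }
    }
}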
Corrected Code
SparkSessionJavaTest.java
package sparkSQL.apachedemo;

import org.apache.log4j.Level;
import org.apache.log4j.Logger;
import org.apache.spark.SparkConf;
import org.apache.spark.sql.*;
import org.apache.spark.sql.expressions.Aggregator;

import java.io.Serializable;

/**
 * @Classname SparkSessionJavaTest
 * @Date 2019/11/14 7:00 PM
 * @Author hadoop
 * @Description: SparkSQL, Java version
 */
public class SparkSessionJavaTest {

    public static void main(String[] args) {
        Logger.getLogger("org").setLevel(Level.INFO);
        SparkConf conf = new SparkConf()
                .setAppName("SparkSessionJavaTest")
                .setMaster("local[2]");
        SparkSession spark = SparkSession
                .builder()
                .config(conf)
                // .enableHiveSupport()
                .getOrCreate();

        String filePath = "file:/usr/local/spark/examples/src/main/resources/";
        typeSafeUserDefinedAggregateFunction(spark, filePath);

        spark.stop();
    }

    /**
     * Aggregation with a type-safe user-defined aggregate function.
     *
     * @param spark
     * @param filePath
     */
    private static void typeSafeUserDefinedAggregateFunction(SparkSession spark, String filePath) {
        Encoder<Employee> employeeEncoder = Encoders.bean(Employee.class);
        String path = filePath + "employees.json";
        Dataset<Employee> ds = spark.read().json(path).as(employeeEncoder);
        ds.show();

        MyAverage2 myAverage = new MyAverage2();
        // Convert the function to a `TypedColumn` and give it a name
        TypedColumn<Employee, Double> averageSalary = myAverage.toColumn().name("average_salary");
        Dataset<Double> result = ds.select(averageSalary);
        result.show();
    }

    /**
     * Employee inner class
     */
    public static class Employee implements Serializable {
        private String name;
        private long salary;

        // public Employee(String name, long salary) {
        //     this.name = name;
        //     this.salary = salary;
        // }

        public String getName() { return name; }
        public void setName(String name) { this.name = name; }
        public long getSalary() { return salary; }
        public void setSalary(long salary) { this.salary = salary; }
    }

    /**
     * Average inner class
     */
    public static class Average implements Serializable {
        private long sum;
        private long count;

        // public Average(long sum, long count) {
        //     this.sum = sum;
        //     this.count = count;
        // }

        public long getSum() { return sum; }
        public void setSum(long sum) { this.sum = sum; }
        public long getCount() { return count; }
        public void setCount(long count) { this.count = count; }
    }

    public static class MyAverage2 extends Aggregator<Employee, Average, Double> {

        // A zero value for this aggregation. Should satisfy the property that any b + zero = b
        public Average zero() {
            Average average = new Average();
            average.setSum(0L);
            average.setCount(0L);
            return average;
        }

        // Combine two values to produce a new value. For performance, the function may modify `buffer`
        // and return it instead of constructing a new object
        public Average reduce(Average buffer, Employee employee) {
            long newSum = buffer.getSum() + employee.getSalary();
            long newCount = buffer.getCount() + 1;
            buffer.setSum(newSum);
            buffer.setCount(newCount);
            return buffer;
        }

        // Merge two intermediate values
        public Average merge(Average b1, Average b2) {
            long mergeSum = b1.getSum() + b2.getSum();
            long mergeCount = b1.getCount() + b2.getCount();
            b1.setSum(mergeSum);
            b1.setCount(mergeCount);
            return b1;
        }

        // Transform the output of the reduction
        public Double finish(Average reduction) {
            return ((double) reduction.getSum()) / reduction.getCount();
        }

        // Specifies the Encoder for the intermediate value type
        public Encoder<Average> bufferEncoder() {
            return Encoders.bean(Average.class);
        }

        // Specifies the Encoder for the final output value type
        public Encoder<Double> outputEncoder() {
            return Encoders.DOUBLE();
        }
    }
}
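For reference, the employees.json that ships under Spark's examples/src/main/resources contains the records below, so a successful run should report an average_salary of 3750.0, matching the output shown in the official example:

{"name":"Michael", "salary":3000}
{"name":"Andy", "salary":4500}
{"name":"Justin", "salary":3500}
{"name":"Berta", "salary":4000}

+--------------+
|average_salary|
+--------------+
|        3750.0|
+--------------+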
Summary
This was the first time I had run into a problem like this, and I only pinpointed it after a lot of slow exploration. Why such a long detour? Because I had set the log level to Logger.getLogger("org").setLevel(Level.ERROR), the logs never revealed where the problem actually was. After changing it to Logger.getLogger("org").setLevel(Level.INFO), the full logs became visible and the problem was easy to find. That is worth keeping in mind in the future.