Read side: Spark 3.2.4, Hudi 0.12.3
Real-time table sync (write side): Flink 1.15.2, Hive 3.1.2, Hudi 0.12.3
Bug
Reading the Hudi table from Spark fails in the merge-on-read base-file reader with:
Caused by: java.lang.IllegalArgumentException: For input string: "null"
at scala.collection.immutable.StringLike.parseBoolean(StringLike.scala:330)
at scala.collection.immutable.StringLike.toBoolean(StringLike.scala:289)
at scala.collection.immutable.StringLike.toBoolean$(StringLike.scala:289)
at scala.collection.immutable.StringOps.toBoolean(StringOps.scala:33)
at org.apache.spark.sql.execution.datasources.parquet.ParquetToSparkSchemaConverter.<init>(ParquetSchemaConverter.scala:61)
at org.apache.spark.sql.execution.datasources.parquet.HoodieParquetFileFormatHelper$.buildImplicitSchemaChangeInfo(HoodieParquetFileFormatHelper.scala:30)
at org.apache.spark.sql.execution.datasources.parquet.Spark32PlusHoodieParquetFileFormat.$anonfun$buildReaderWithPartitionValues$2(Spark32PlusHoodieParquetFileFormat.scala:231)
at org.apache.hudi.HoodieDataSourceHelper$.$anonfun$buildHoodieParquetReader$1(HoodieDataSourceHelper.scala:67)
at org.apache.hudi.HoodieBaseRelation.$anonfun$createBaseFileReader$1(HoodieBaseRelation.scala:598)
at org.apache.hudi.HoodieBaseRelation$BaseFileReader.apply(HoodieBaseRelation.scala:651)
at org.apache.hudi.HoodieMergeOnReadRDD.compute(HoodieMergeOnReadRDD.scala:121)
at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:373)
at org.apache.spark.rdd.RDD.iterator(RDD.scala:337)
at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52)
at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:373)
at org.apache.spark.rdd.RDD.iterator(RDD.scala:337)
at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52)
at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:373)
at org.apache.spark.rdd.RDD.iterator(RDD.scala:337)
at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:90)
at org.apache.spark.scheduler.Task.run(Task.scala:131)
at org.apache.spark.executor.Executor$TaskRunner.$anonfun$run$3(Executor.scala:506)
at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:1491)
at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:509)
at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
at java.lang.Thread.run(Thread.java:748)
23/10/12 11:25:37 INFO SparkContext: Invoking stop() from shutdown hook
Solution
The trace points at ParquetToSparkSchemaConverter parsing its settings out of the Hadoop Configuration. Spark 3.2.4 added the spark.sql.legacy.parquet.nanosAsLong key, and the Hadoop Configuration that Hudi 0.12.3's Spark32PlusHoodieParquetFileFormat hands to the converter does not contain it; calling Scala's toBoolean on the resulting null value throws exactly this IllegalArgumentException. The workaround is to pin all four keys the converter reads into the Hadoop Configuration via the spark.hadoop. prefix:
--conf 'spark.hadoop.spark.sql.legacy.parquet.nanosAsLong=false'
--conf 'spark.hadoop.spark.sql.parquet.binaryAsString=false'
--conf 'spark.hadoop.spark.sql.parquet.int96AsTimestamp=true'
--conf 'spark.hadoop.spark.sql.caseSensitive=false'
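To confirm the keys actually land in the Hadoop Configuration where the converter reads them, you can dump them from a running session. A minimal sketch for spark-shell (where the SparkSession is predefined as spark); any key printing null here would reproduce the failure:
// Keys passed with the spark.hadoop. prefix are copied into the
// Hadoop Configuration under the bare key name and must parse as booleans.
val hadoopConf = spark.sparkContext.hadoopConfiguration
Seq(
  "spark.sql.legacy.parquet.nanosAsLong",
  "spark.sql.parquet.binaryAsString",
  "spark.sql.parquet.int96AsTimestamp",
  "spark.sql.caseSensitive"
).foreach { key =>
  println(s"$key -> ${hadoopConf.get(key)}") // null means the key is missing
}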
Scala Spark example
import org.apache.spark.SparkConf
import org.apache.spark.sql.SparkSession

object TestHoodie {
  def main(args: Array[String]): Unit = {
    // Run as the HDFS user that owns the warehouse directory
    System.setProperty("HADOOP_USER_NAME", "root")
    val conf = new SparkConf()
      .set("spark.serializer", "org.apache.spark.serializer.KryoSerializer")
      .set("spark.sql.catalog.spark_catalog", "org.apache.spark.sql.hudi.catalog.HoodieCatalog")
      .set("spark.sql.extensions", "org.apache.spark.sql.hudi.HoodieSparkSessionExtension")
      // Pin the Parquet settings in the Hadoop Configuration so the
      // schema converter never reads a missing (null) key
      .set("spark.hadoop.spark.sql.legacy.parquet.nanosAsLong", "false")
      .set("spark.hadoop.spark.sql.parquet.binaryAsString", "false")
      .set("spark.hadoop.spark.sql.parquet.int96AsTimestamp", "true")
      .set("spark.hadoop.spark.sql.caseSensitive", "false")
      .setMaster("local")
    val sparkSession = SparkSession.builder()
      .appName("spark demo")
      .config(conf)
      .getOrCreate()
    // Read the Hudi table directly from its HDFS base path
    val df = sparkSession.read.format("org.apache.hudi")
      .load("hdfs://ks9p-cs-hadoop01:9000/data/hive/warehouse/ods.db/student")
    df.createOrReplaceTempView("student")
    val sqlDf = sparkSession.sql("select * from student where dt='2023-10-11'")
    sqlDf.show()
    sparkSession.stop()
  }
}
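Since the Flink job also syncs the table into the Hive metastore, the same data can be queried through the catalog instead of the raw base path. A minimal sketch, assuming the table was synced as ods.student (a hypothetical name inferred from the warehouse path) and hive-site.xml is on the classpath:
import org.apache.spark.sql.SparkSession

object TestHoodieHiveCatalog {
  def main(args: Array[String]): Unit = {
    System.setProperty("HADOOP_USER_NAME", "root")
    val spark = SparkSession.builder()
      .appName("hudi hive catalog demo")
      .master("local")
      .config("spark.serializer", "org.apache.spark.serializer.KryoSerializer")
      .config("spark.sql.extensions", "org.apache.spark.sql.hudi.HoodieSparkSessionExtension")
      .config("spark.sql.catalog.spark_catalog", "org.apache.spark.sql.hudi.catalog.HoodieCatalog")
      .config("spark.hadoop.spark.sql.legacy.parquet.nanosAsLong", "false")
      .config("spark.hadoop.spark.sql.parquet.binaryAsString", "false")
      .config("spark.hadoop.spark.sql.parquet.int96AsTimestamp", "true")
      .config("spark.hadoop.spark.sql.caseSensitive", "false")
      .enableHiveSupport() // needs hive-site.xml on the classpath
      .getOrCreate()
    // ods.student is the assumed Hive-synced table name
    spark.sql("select * from ods.student where dt='2023-10-11'").show()
    spark.stop()
  }
}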
Spark Thrift Server
./start-thriftserver.sh \
--jars $(echo /data/spark/jars/custom-jars/*.jar | tr ' ' ',') \
--conf spark.driver.extraClassPath=$(echo /data/spark/jars/custom-jars/*.jar | tr ' ' ':') \
--conf spark.sql.hive.convertMetastoreParquet=false \
--conf spark.sql.metadataCacheTTLSeconds=1 \
--hiveconf hive.server2.thrift.port=10003 \
--hiveconf hive.server2.thrift.bind.host=KS9P-CS-HADOOP07 \
--conf 'spark.serializer=org.apache.spark.serializer.KryoSerializer' \
--conf 'spark.sql.extensions=org.apache.spark.sql.hudi.HoodieSparkSessionExtension' \
--conf 'spark.sql.catalog.spark_catalog=org.apache.spark.sql.hudi.catalog.HoodieCatalog' \
--conf 'spark.hadoop.spark.sql.legacy.parquet.nanosAsLong=false' \
--conf 'spark.hadoop.spark.sql.parquet.binaryAsString=false' \
--conf 'spark.hadoop.spark.sql.parquet.int96AsTimestamp=true' \
--conf 'spark.hadoop.spark.sql.caseSensitive=false' \
--master yarn \
--deploy-mode client \
--driver-memory 16G \
--driver-cores 1 \
--num-executors 10 \
--executor-memory 4G \
--executor-cores 1 &
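Once the Thrift Server is up, the fix can be verified over JDBC. A minimal sketch, assuming hive-jdbc is on the client classpath and the table was synced as ods.student (a hypothetical name inferred from the warehouse path); host, port, and user come from the settings above:
import java.sql.DriverManager

object ThriftServerCheck {
  def main(args: Array[String]): Unit = {
    // Explicit registration; newer hive-jdbc versions auto-register the driver
    Class.forName("org.apache.hive.jdbc.HiveDriver")
    // Host and port match the --hiveconf flags; user matches HADOOP_USER_NAME
    val conn = DriverManager.getConnection("jdbc:hive2://KS9P-CS-HADOOP07:10003/default", "root", "")
    try {
      val stmt = conn.createStatement()
      // ods.student is the assumed Hive-synced table name
      val rs = stmt.executeQuery("select * from ods.student where dt='2023-10-11' limit 10")
      val cols = rs.getMetaData.getColumnCount
      while (rs.next()) {
        println((1 to cols).map(i => rs.getString(i)).mkString("\t"))
      }
      rs.close()
      stmt.close()
    } finally {
      conn.close()
    }
  }
}
If the query returns rows instead of the IllegalArgumentException above, the configuration took effect.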