一、描述
使用spark mysql-connector-java-5.1.8.jar向mysql、tidb的r_day_stat_survey_detail_new表写数据,r_day_stat_survey_detail_new定义如下
CREATE TABLE `r_day_stat_survey_detail_new` (
......
`submit_time` datetime(6) DEFAULT NULL
) ;
写代码如下
var res = surveyDF.select($"test",$"hashedOperaId",$"advertisingId",$"ad_id",$"mid",$"question_index",$"question_type",(when($"cType"==="LEADS",map_first_value_to_json($"detail")).otherwise(list_to_json($"cIndex"))).name("detail"),from_unixtime($"timeMs").as("submit_time"))
报如下错误
20/04/23 11:38:27 ERROR Executor: Exception in task 0.0 in stage 0.0 (TID 0)
java.sql.BatchUpdateException: Data truncation: invalid time format: '{52277 7 7 3 13 49 0}'
at com.mysql.jdbc.PreparedStatement.executeBatchedInserts(PreparedStatement.java:1809)
at com.mysql.jdbc.PreparedStatement.executeBatch(PreparedStatement.java:1441)
at org.apache.spark.sql.execution.datasources.jdbc.JdbcUtils$.savePartition(JdbcUtils.scala:222)
at org.apache.spark.sql.execution.datasources.jdbc.JdbcUtils$$anonfun$saveTable$1.apply(JdbcUtils.scala:300)
at org.apache.spark.sql.execution.datasources.jdbc.JdbcUtils$$anonfun$saveTable$1.apply(JdbcUtils.scala:299)
at org.apache.spark.rdd.RDD$$anonfun$foreachPartition$1$$anonfun$apply$28.apply(RDD.scala:902)
at org.apache.spark.rdd.RDD$$anonfun$foreachPartition$1$$anonfun$apply$28.apply(RDD.scala:902)
at org.apache.spark.SparkContext$$anonfun$runJob$5.apply(SparkContext.scala:1899)
at org.apache.spark.SparkContext$$anonfun$runJob$5.apply(SparkContext.scala:1899)
at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:70)
at org.apache.spark.scheduler.Task.run(Task.scala:86)
at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:274)
at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1142)
at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:617)
二、分析
1、只向mysql写,还是报错
2、强转成TimestampType,如下
import org.apache.spark.sql.types.{TimestampType}
......
var res = surveyDF.select($"test",$"hashedOperaId",$"advertisingId",$"ad_id",$"mid",$"question_index",$"question_type",(when($"cType"==="LEADS",map_first_value_to_json($"detail")).otherwise(list_to_json($"cIndex"))).name("detail"),$"timeMs".cast(TimestampType).as("submit_time"))
也不行
3、使用from_unixtime(MM/dd/yyyy HH:mm:ss.SSS)函数如下
from_unixtime($"timeMs","MM/dd/yyyy HH:mm:ss.SSS").as("submit_time")
又报如下错误
20/04/24 02:24:38 ERROR Executor: Exception in task 0.0 in stage 0.0 (TID 0)
java.sql.BatchUpdateException: Data truncation: invalid time format: '53'
at com.mysql.jdbc.PreparedStatement.executeBatchedInserts(PreparedStatement.java:1809)
at com.mysql.jdbc.PreparedStatement.executeBatch(PreparedStatement.java:1441)
at org.apache.spark.sql.execution.datasources.jdbc.JdbcUtils$.savePartition(JdbcUtils.scala:222)
at org.apache.spark.sql.execution.datasources.jdbc.JdbcUtils$$anonfun$saveTable$1.apply(JdbcUtils.scala:300)
at org.apache.spark.sql.execution.datasources.jdbc.JdbcUtils$$anonfun$saveTable$1.apply(JdbcUtils.scala:299)
at org.apache.spark.rdd.RDD$$anonfun$foreachPartition$1$$anonfun$apply$28.apply(RDD.scala:902)
at org.apache.spark.rdd.RDD$$anonfun$foreachPartition$1$$anonfun$apply$28.apply(RDD.scala:902)
at org.apache.spark.SparkContext$$anonfun$runJob$5.apply(SparkContext.scala:1899)
at org.apache.spark.SparkContext$$anonfun$runJob$5.apply(SparkContext.scala:1899)
at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:70)
at org.apache.spark.scheduler.Task.run(Task.scala:86)
at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:274)
at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1142)
at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:617)
at java.lang.Thread.run(Thread.java:745)
使用from_unixtime($"timeMs","yyyy-MM-dd HH:mm:ss.SSS"),还是报
20/04/24 02:38:15 ERROR Executor: Exception in task 0.2 in stage 0.0 (TID 2)
java.sql.BatchUpdateException: Data truncation: invalid time format: '{52277 7 7 3 13 49 0}'
at com.mysql.jdbc.PreparedStatement.executeBatchedInserts(PreparedStatement.java:1809)
at com.mysql.jdbc.PreparedStatement.executeBatch(PreparedStatement.java:1441)
at org.apache.spark.sql.execution.datasources.jdbc.JdbcUtils$.savePartition(JdbcUtils.scala:222)
at org.apache.spark.sql.execution.datasources.jdbc.JdbcUtils$$anonfun$saveTable$1.apply(JdbcUtils.scala:300)
at org.apache.spark.sql.execution.datasources.jdbc.JdbcUtils$$anonfun$saveTable$1.apply(JdbcUtils.scala:299)
at org.apache.spark.rdd.RDD$$anonfun$foreachPartition$1$$anonfun$apply$28.apply(RDD.scala:902)
at org.apache.spark.rdd.RDD$$anonfun$foreachPartition$1$$anonfun$apply$28.apply(RDD.scala:902)
at org.apache.spark.SparkContext$$anonfun$runJob$5.apply(SparkContext.scala:1899)
at org.apache.spark.SparkContext$$anonfun$runJob$5.apply(SparkContext.scala:1899)
at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:70)
at org.apache.spark.scheduler.Task.run(Task.scala:86)
at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:274)
at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1142)
at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:617)
at java.lang.Thread.run(Thread.java:745)
使用from_unixtime($"timeMs","yyyy/MM/dd HH:mm:ss.SSS").as("submit_time"),还是报
20/04/24 02:44:55 ERROR Executor: Exception in task 0.1 in stage 0.0 (TID 1)
java.sql.BatchUpdateException: Data truncation: invalid time format: '{52277 7 7 3 13 49 0}'
at com.mysql.jdbc.PreparedStatement.executeBatchedInserts(PreparedStatement.java:1809)
at com.mysql.jdbc.PreparedStatement.executeBatch(PreparedStatement.java:1441)
at org.apache.spark.sql.execution.datasources.jdbc.JdbcUtils$.savePartition(JdbcUtils.scala:222)
at org.apache.spark.sql.execution.datasources.jdbc.JdbcUtils$$anonfun$saveTable$1.apply(JdbcUtils.scala:300)
at org.apache.spark.sql.execution.datasources.jdbc.JdbcUtils$$anonfun$saveTable$1.apply(JdbcUtils.scala:299)
at org.apache.spark.rdd.RDD$$anonfun$foreachPartition$1$$anonfun$apply$28.apply(RDD.scala:902)
at org.apache.spark.rdd.RDD$$anonfun$foreachPartition$1$$anonfun$apply$28.apply(RDD.scala:902)
at org.apache.spark.SparkContext$$anonfun$runJob$5.apply(SparkContext.scala:1899)
at org.apache.spark.SparkContext$$anonfun$runJob$5.apply(SparkContext.scala:1899)
at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:70)
at org.apache.spark.scheduler.Task.run(Task.scala:86)
at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:274)
at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1142)
at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:617)
at java.lang.Thread.run(Thread.java:745)
4、定义udf函数
import java.text.SimpleDateFormat
import java.util.Date

// UDF: converts a millisecond-precision epoch timestamp (Long) into a
// "yyyy-MM-dd HH:mm:ss.SSS" string, so the JDBC writer sends a plain
// formatted string that MySQL/TiDB can parse into datetime(6).
// Note: from_unixtime() interprets its argument as *seconds*, which is why
// feeding it timeMs (milliseconds) produced garbage — this UDF avoids that.
def getDateTime = udf {
  (l: Long) => {
    // SimpleDateFormat is not thread-safe; creating it per invocation is
    // deliberate (each executor task/row gets its own instance).
    val f = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss.SSS")
    f.format(new Date(l))
  }
}
调用udf,如下
var res = surveyDF.select($"test",$"hashedOperaId",$"advertisingId",$"ad_id",$"mid",$"question_index",$"question_type",(when($"cType"==="LEADS",map_first_value_to_json($"detail")).otherwise(list_to_json($"cIndex"))).name("detail"),getDateTime($"timeMs").as("submit_time"))
再次执行,毫秒正常入库
三、解决方法
根本原因:from_unixtime 的参数单位是“秒”,而 timeMs 是毫秒时间戳,直接传入会得到错误的日期值;且 Spark 传给 JDBC 的类型与 MySQL datetime(6) 的毫秒精度不匹配,导致 "invalid time format" 报错。
解决办法:自定义 udf 函数(见上文第 4 步),用 SimpleDateFormat 将毫秒时间戳格式化为 "yyyy-MM-dd HH:mm:ss.SSS" 字符串后再写入,毫秒即可正常入库。
注意:本文归作者所有,未经作者允许,不得转载