Project environment:
<scala.version>2.12.14</scala.version>
<spark.version>3.0.3</spark.version>
<hadoop.version>3.1.3</hadoop.version>
<hive.version>3.1.2</hive.version>
The problem:
A demo that accesses the external Hive had previously been tested and ran without problems.
The demo is as follows:
package myspark.sql

import org.apache.spark.SparkConf
import org.apache.spark.sql.SparkSession

object SparkSQLTest {
  def main(args: Array[String]): Unit = {
    // System.setProperty("hadoop.home.dir", "D:\\main\\hadoop-3.1.3")
    // Impersonate the hadoop user so HDFS permission checks pass.
    System.setProperty("HADOOP_USER_NAME", "hadoop")

    val sparkConf = new SparkConf().setMaster("local[*]").setAppName("SparkSQLTest")
    val spark = SparkSession
      .builder()
      .enableHiveSupport()
      .config(sparkConf)
      // .config("spark.sql.warehouse.dir", "hdfs://hadoop102:8020/user/hive/warehouse")
      .getOrCreate()

    spark.sql("show databases").show()
    spark.stop()
  }
}
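One detail the demo relies on but does not show: for enableHiveSupport() to reach the external Hive, the cluster's hive-site.xml (plus core-site.xml and hdfs-site.xml for HDFS access) normally needs to be on the classpath, e.g. under src/main/resources; without it, Spark silently falls back to a local embedded metastore.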
Then I temporarily needed to read data from Excel/CSV files for testing.
Demo B is as follows:
package myspark.sql

import org.apache.spark.SparkConf
import org.apache.spark.sql.SparkSession

object ReadCSV {
  def main(args: Array[String]): Unit = {
    // System.setProperty("hadoop.home.dir", "D:\\main\\hadoop-3.1.3")
    System.setProperty("HADOOP_USER_NAME", "hadoop")

    val sparkConf = new SparkConf().setMaster("local[*]").setAppName("SparkSQLTest")
    val spark = SparkSession
      .builder()
      // .enableHiveSupport()
      .config(sparkConf)
      // .config("spark.sql.warehouse.dir", "hdfs://hadoop102:8020/user/hive/warehouse")
      .getOrCreate()

    val path = "file:/D:\\programming\\460000.csv"
    val df = spark.read
      .option("header", "false") // the file has no header row; columns are named via toDF
      .option("encoding", "GBK") // the source file is GBK-encoded
      .csv(path)
      .toDF("plate_number", "plate_color", "province_code", "transport_field_code",
        "city_code", "customer_code", "position_code", "time_terminal", "time_remote",
        "longitude", "latitude", "speed_pos", "speed_odo", "mileage", "direction",
        "altitude", "status_code", "alarm_code")
  }
}
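A quick sanity check at the end of main (my addition, not in the original demo) confirms the columns line up once the dependency conflict below is resolved:

df.printSchema()             // all 18 columns arrive as string unless inferSchema is enabled
df.show(5, truncate = false) // eyeball a few rows to verify the GBK decoding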
Running it failed with the following error:
Log output:
21/08/17 13:52:08 INFO FileSourceStrategy: Post-Scan Filters: (length(trim(value#0, None)) > 0)
21/08/17 13:52:08 INFO FileSourceStrategy: Output Data Schema: struct<value: string>
Exception in thread "main" java.lang.NoClassDefFoundError: com/fasterxml/jackson/core/exc/InputCoercionException
at com.fasterxml.jackson.module.scala.deser.NumberDeserializers$.<init>(ScalaNumberDeserializersModule.scala:48)
at com.fasterxml.jackson.module.scala.deser.NumberDeserializers$.<clinit>(ScalaNumberDeserializersModule.scala)
at com.fasterxml.jackson.module.scala.deser.ScalaNumberDeserializersModule.$init$(ScalaNumberDeserializersModule.scala:60)
at com.fasterxml.jackson.module.scala.DefaultScalaModule.<init>(DefaultScalaModule.scala:18)
at com.fasterxml.jackson.module.scala.DefaultScalaModule$.<init>(DefaultScalaModule.scala:36)
at com.fasterxml.jackson.module.scala.DefaultScalaModule$.<clinit>(DefaultScalaModule.scala)
at org.apache.spark.rdd.RDDOperationScope$.<init>(RDDOperationScope.scala:82)
at org.apache.spark.rdd.RDDOperationScope$.<clinit>(RDDOperationScope.scala)
at org.apache.spark.sql.execution.SparkPlan.executeQuery(SparkPlan.scala:215)
at org.apache.spark.sql.execution.SparkPlan.execute(SparkPlan.scala:176)
at org.apache.spark.sql.execution.SparkPlan.getByteArrayRdd(SparkPlan.scala:321)
at org.apache.spark.sql.execution.SparkPlan.executeTake(SparkPlan.scala:439)
at org.apache.spark.sql.execution.SparkPlan.executeTake(SparkPlan.scala:425)
at org.apache.spark.sql.execution.CollectLimitExec.executeCollect(limit.scala:47)
at org.apache.spark.sql.Dataset.collectFromPlan(Dataset.scala:3627)
at org.apache.spark.sql.Dataset.$anonfun$head$1(Dataset.scala:2697)
at org.apache.spark.sql.Dataset.$anonfun$withAction$1(Dataset.scala:3618)
at org.apache.spark.sql.execution.SQLExecution$.$anonfun$withNewExecutionId$5(SQLExecution.scala:100)
at org.apache.spark.sql.execution.SQLExecution$.withSQLConfPropagated(SQLExecution.scala:160)
at org.apache.spark.sql.execution.SQLExecution$.$anonfun$withNewExecutionId$1(SQLExecution.scala:87)
at org.apache.spark.sql.SparkSession.withActive(SparkSession.scala:767)
at org.apache.spark.sql.execution.SQLExecution$.withNewExecutionId(SQLExecution.scala:64)
at org.apache.spark.sql.Dataset.withAction(Dataset.scala:3616)
at org.apache.spark.sql.Dataset.head(Dataset.scala:2697)
at org.apache.spark.sql.Dataset.take(Dataset.scala:2904)
at org.apache.spark.sql.execution.datasources.csv.TextInputCSVDataSource$.infer(CSVDataSource.scala:114)
at org.apache.spark.sql.execution.datasources.csv.CSVDataSource.inferSchema(CSVDataSource.scala:67)
at org.apache.spark.sql.execution.datasources.csv.CSVFileFormat.inferSchema(CSVFileFormat.scala:62)
at org.apache.spark.sql.execution.datasources.DataSource.$anonfun$getOrInferFileFormatSchema$11(DataSource.scala:208)
at scala.Option.orElse(Option.scala:447)
at org.apache.spark.sql.execution.datasources.DataSource.getOrInferFileFormatSchema(DataSource.scala:205)
at org.apache.spark.sql.execution.datasources.DataSource.resolveRelation(DataSource.scala:418)
at org.apache.spark.sql.DataFrameReader.loadV1Source(DataFrameReader.scala:297)
at org.apache.spark.sql.DataFrameReader.$anonfun$load$2(DataFrameReader.scala:286)
at scala.Option.getOrElse(Option.scala:189)
at org.apache.spark.sql.DataFrameReader.load(DataFrameReader.scala:286)
at org.apache.spark.sql.DataFrameReader.csv(DataFrameReader.scala:726)
at org.apache.spark.sql.DataFrameReader.csv(DataFrameReader.scala:553)
at myspark.sql.ReadCSV$.main(ReadCSV.scala:25)
at myspark.sql.ReadCSV.main(ReadCSV.scala)
Caused by: java.lang.ClassNotFoundException: com.fasterxml.jackson.core.exc.InputCoercionException
at java.net.URLClassLoader.findClass(URLClassLoader.java:382)
at java.lang.ClassLoader.loadClass(ClassLoader.java:418)
at sun.misc.Launcher$AppClassLoader.loadClass(Launcher.java:355)
at java.lang.ClassLoader.loadClass(ClassLoader.java:351)
... 40 more
21/08/17 13:52:08 INFO SparkContext: Invoking stop() from shutdown hook
21/08/17 13:52:08 INFO SparkUI: Stopped Spark web UI at http://MSI:4040
21/08/17 13:52:08 INFO MapOutputTrackerMasterEndpoint: MapOutputTrackerMasterEndpoint stopped!
21/08/17 13:52:08 INFO MemoryStore: MemoryStore cleared
21/08/17 13:52:08 INFO BlockManager: BlockManager stopped
21/08/17 13:52:08 INFO BlockManagerMaster: BlockManagerMaster stopped
21/08/17 13:52:08 INFO OutputCommitCoordinator$OutputCommitCoordinatorEndpoint: OutputCommitCoordinator stopped!
21/08/17 13:52:08 INFO SparkContext: Successfully stopped SparkContext
21/08/17 13:52:08 INFO ShutdownHookManager: Shutdown hook called
21/08/17 13:52:08 INFO ShutdownHookManager: Deleting directory C:\Users\pc\AppData\Local\Temp\spark-7b14b08b-1c4a-42d3-a0e7-33570ce3bd50
Process finished with exit code 1
After some digging I realized this is a version conflict on com.fasterxml.jackson: the missing class, com/fasterxml/jackson/core/exc/InputCoercionException, was only introduced in jackson-core 2.10, so an older jackson-core somewhere on the classpath is shadowing the version Spark 3.0.3 expects.
Resolving Jackson version conflicts in Spark applications:
https://segmentfault.com/a/1190000016412887
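To find out which module drags in the stale jackson-core, the maven-dependency-plugin's tree goal is the quickest check (in this setup the old version most likely arrives transitively through the hadoop/hive artifacts):

mvn dependency:tree -Dincludes=com.fasterxml.jackson.core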
A version conflict on com.google.guava shows up the same way; a mismatched com.google.common.base.Preconditions.checkArgument produces the following problem:
Log output:
21/08/17 13:50:15 INFO ResourceUtils: ==============================================================
21/08/17 13:50:15 INFO SparkContext: Submitted application: SparkSQLTest
21/08/17 13:50:15 INFO SecurityManager: Changing view acls to: pc
21/08/17 13:50:15 INFO SecurityManager: Changing modify acls to: pc
21/08/17 13:50:15 INFO SecurityManager: Changing view acls groups to:
21/08/17 13:50:15 INFO SecurityManager: Changing modify acls groups to:
21/08/17 13:50:15 INFO SecurityManager: SecurityManager: authentication disabled; ui acls disabled; users with view permissions: Set(pc); groups with view permissions: Set(); users with modify permissions: Set(pc); groups with modify permissions: Set()
Exception in thread "main" java.lang.NoSuchMethodError: com.google.common.base.Preconditions.checkArgument(ZLjava/lang/String;Ljava/lang/Object;)V
at org.apache.hadoop.conf.Configuration.set(Configuration.java:1357)
at org.apache.hadoop.conf.Configuration.set(Configuration.java:1338)
at org.apache.spark.deploy.SparkHadoopUtil$.$anonfun$appendHiveConfigs$1(SparkHadoopUtil.scala:474)
at org.apache.spark.deploy.SparkHadoopUtil$.$anonfun$appendHiveConfigs$1$adapted(SparkHadoopUtil.scala:473)
at scala.collection.immutable.Stream.foreach(Stream.scala:533)
at org.apache.spark.deploy.SparkHadoopUtil$.appendHiveConfigs(SparkHadoopUtil.scala:473)
at org.apache.spark.deploy.SparkHadoopUtil$.org$apache$spark$deploy$SparkHadoopUtil$$appendS3AndSparkHadoopHiveConfigurations(SparkHadoopUtil.scala:453)
at org.apache.spark.deploy.SparkHadoopUtil$.newConfiguration(SparkHadoopUtil.scala:427)
at org.apache.spark.deploy.SparkHadoopUtil.newConfiguration(SparkHadoopUtil.scala:122)
at org.apache.spark.deploy.SparkHadoopUtil.<init>(SparkHadoopUtil.scala:49)
at org.apache.spark.deploy.SparkHadoopUtil$.instance$lzycompute(SparkHadoopUtil.scala:397)
at org.apache.spark.deploy.SparkHadoopUtil$.instance(SparkHadoopUtil.scala:397)
at org.apache.spark.deploy.SparkHadoopUtil$.get(SparkHadoopUtil.scala:418)
at org.apache.spark.SecurityManager.<init>(SecurityManager.scala:95)
at org.apache.spark.SparkEnv$.create(SparkEnv.scala:252)
at org.apache.spark.SparkEnv$.createDriverEnv(SparkEnv.scala:189)
at org.apache.spark.SparkContext.createSparkEnv(SparkContext.scala:272)
at org.apache.spark.SparkContext.<init>(SparkContext.scala:448)
at org.apache.spark.SparkContext$.getOrCreate(SparkContext.scala:2589)
at org.apache.spark.sql.SparkSession$Builder.$anonfun$getOrCreate$2(SparkSession.scala:937)
at scala.Option.getOrElse(Option.scala:189)
at org.apache.spark.sql.SparkSession$Builder.getOrCreate(SparkSession.scala:931)
at myspark.sql.ReadCSV$.main(ReadCSV.scala:19)
at myspark.sql.ReadCSV.main(ReadCSV.scala)
Process finished with exit code 1
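Before touching the pom, it helps to confirm which jar each conflicting class is actually loaded from. A minimal diagnostic sketch (a hypothetical helper of mine, not part of the original demos):

package myspark.sql

// Hypothetical diagnostic: print the jar each conflicting class is loaded from,
// which shows exactly which version wins on the current classpath.
object WhichJar {
  def main(args: Array[String]): Unit = {
    val suspects = Seq(
      "com.google.common.base.Preconditions",
      "com.fasterxml.jackson.core.JsonFactory"
    )
    suspects.foreach { name =>
      val source = Class.forName(name).getProtectionDomain.getCodeSource
      // getCodeSource is null for bootstrap classes; both suspects come from jars.
      val location = Option(source).map(_.getLocation.toString).getOrElse("unknown")
      println(s"$name -> $location")
    }
  }
}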
So I applied the approach described in the article above:
use maven-shade-plugin to relocate the conflicting packages, and declare the wanted versions directly.
Add the following to pom.xml:
<dependencies>
    <!-- https://mvnrepository.com/artifact/com.google.guava/guava -->
    <dependency>
        <groupId>com.google.guava</groupId>
        <artifactId>guava</artifactId>
        <version>27.1-jre</version>
    </dependency>
    <dependency>
        <groupId>com.fasterxml.jackson.core</groupId>
        <artifactId>jackson-core</artifactId>
        <version>2.11.4</version>
    </dependency>
    <dependency>
        <groupId>com.fasterxml.jackson.core</groupId>
        <artifactId>jackson-databind</artifactId>
        <version>2.11.4</version>
    </dependency>
    <dependency>
        <groupId>com.fasterxml.jackson.core</groupId>
        <artifactId>jackson-annotations</artifactId>
        <version>2.11.4</version>
    </dependency>
    <dependency>
        <groupId>com.fasterxml.jackson.module</groupId>
        <artifactId>jackson-module-scala_2.12</artifactId>
        <version>2.11.4</version>
    </dependency>
</dependencies>
<build>
    <plugins>
        <plugin>
            <groupId>org.apache.maven.plugins</groupId>
            <artifactId>maven-shade-plugin</artifactId>
            <version>3.1.0</version>
            <executions>
                <execution>
                    <phase>package</phase>
                    <goals>
                        <goal>shade</goal>
                    </goals>
                    <configuration>
                        <relocations>
                            <relocation>
                                <pattern>com.fasterxml.jackson</pattern>
                                <shadedPattern>noc.com.fasterxml.jackson</shadedPattern>
                            </relocation>
                            <relocation>
                                <!-- Guava's classes live under com.google.common, not
                                     com.google.guava, so that prefix is what the
                                     relocation must match. -->
                                <pattern>com.google.common</pattern>
                                <shadedPattern>noc.com.google.common</shadedPattern>
                            </relocation>
                        </relocations>
                    </configuration>
                </execution>
            </executions>
        </plugin>
    </plugins>
</build>
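Worth noting: the relocation only takes effect in the shaded jar produced by mvn clean package. When the demos are run straight from the IDE, it is the directly declared guava/jackson versions, which win Maven's nearest-wins dependency resolution, that actually resolve the conflict.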
After refreshing the dependencies and rerunning the demos, they produced output normally.
PS: if Maven fails to download the plugin, the following mirrors can be added to Maven's settings.xml (Maven only uses the first mirror whose mirrorOf matches, so the first entry wins):
<mirror>
    <id>alimaven</id>
    <name>aliyun maven</name>
    <url>http://maven.aliyun.com/nexus/content/groups/public/</url>
    <mirrorOf>central</mirrorOf>
</mirror>
<mirror>
    <id>alimaven-central</id>
    <name>aliyun maven</name>
    <url>http://maven.aliyun.com/nexus/content/repositories/central/</url>
    <mirrorOf>central</mirrorOf>
</mirror>
<mirror>
    <id>jcenter</id>
    <name>jcenter address</name>
    <url>http://jcenter.bintray.com/</url>
    <mirrorOf>central</mirrorOf>
</mirror>
<mirror>
    <id>maven-central</id>
    <name>maven central</name>
    <url>https://repo1.maven.org/maven2</url>
    <mirrorOf>central</mirrorOf>
</mirror>