idea远程提交spark on yarn出现问题
代码
package dsy.read_hdfs
import org.apache.spark.SparkConf
import org.apache.spark.sql.{DataFrame, SparkSession}
object read_hdfs {

  /**
   * Entry point: submits a Spark job to YARN (yarn-client mode) from a local
   * IDE and reads a CSV file from HDFS, printing its rows, count and schema.
   *
   * @param args unused command-line arguments
   */
  def main(args: Array[String]): Unit = {
    // Impersonate root so HDFS accepts the local Windows user's requests.
    System.setProperty("HADOOP_USER_NAME", "root")
    System.setProperty("user.name", "root")

    // Force-load the YARN cluster manager so master "yarn" resolves when
    // submitting from the IDE (works around "Could not parse Master URL: 'yarn'").
    this.getClass.getClassLoader
      .loadClass("org.apache.spark.scheduler.cluster.YarnClusterManager")

    val spark: SparkSession = {
      val conf: SparkConf = new SparkConf()
        // Submit in yarn-client mode.
        .setMaster("yarn")
        // Application name (strip the trailing "$" from the Scala object's class name).
        .set("spark.app.name", this.getClass.getSimpleName.stripSuffix("$"))
        // ResourceManager host. Hadoop-side properties must carry the
        // "spark.hadoop." prefix to be forwarded into the Hadoop Configuration;
        // a bare "yarn.resourcemanager.hostname" key is silently ignored by Spark.
        .set("spark.hadoop.yarn.resourcemanager.hostname", "dsy")
        // Number of executors. NOTE: the correct key is "spark.executor.instances"
        // (plural); the misspelled "spark.executor.instance" was a no-op.
        .set("spark.executor.instances", "2")
        // Executor memory.
        .set("spark.executor.memory", "1024M")
        // YARN queue to submit the job to.
        .set("spark.yarn.queue", "spark")
        // Driver host (the local machine running the IDE).
        .set("spark.driver.host", "localhost")
        // Path to the application jar; extra dependency jars may be appended,
        // comma-separated.
        .set("spark.yarn.jars", "C:\\Users\\han\\Desktop\\test\\dns_project\\target\\dns_project.jar")
        // Use Kryo serialization.
        .set("spark.serializer", "org.apache.spark.serializer.KryoSerializer")
      SparkSession
        .builder()
        .config(conf)
        .getOrCreate()
    }

    // Read the CSV from HDFS (header row, multi-line fields, UTF-8).
    val data: DataFrame = spark
      .read
      .format("csv")
      .option("header", "true")
      .option("multiLine", "true")
      .option("encoding", "utf-8")
      // Alternative paths used during testing:
      //   "D:\\data\\dns_data_test.csv"
      //   "/soft/data/DNS_DATA/dns_data_test.csv"
      .load("hdfs://dsy:9000/dns_data/dns_data_test.csv")

    data.show(1000, truncate = false)
    println(data.count())
    data.printSchema()

    spark.stop()
  }
}
报错信息
23/02/22 10:14:10 INFO DFSClient: Exception in createBlockOutputStream
java.net.ConnectException: Connection timed out: no further information
at sun.nio.ch.SocketChannelImpl.checkConnect(Native Method)
at sun.nio.ch.SocketChannelImpl.finishConnect(SocketChannelImpl.java:715)
at org.apache.hadoop.net.SocketIOWithTimeout.connect(SocketIOWithTimeout.java:206)
at org.apache.hadoop.net.NetUtils.connect(NetUtils.java:531)
at org.apache.hadoop.hdfs.DFSOutputStream.createSocketForPipeline(DFSOutputStream.java:1717)
at org.apache.hadoop.hdfs.DFSOutputStream$DataStreamer.createBlockOutputStream(DFSOutputStream.java:1447)
at org.apache.hadoop.hdfs.DFSOutputStream$DataStreamer.nextBlockOutputStream(DFSOutputStream.java:1400)
at org.apache.hadoop.hdfs.DFSOutputStream$DataStreamer.run(DFSOutputStream.java:554)
连接超时。防火墙已关闭、端口已开放,重启 HDFS 后问题依旧,排查了很久都没有解决。
在百度后再次查看日志在后面的报错中找到这么一串信息
Abandoning BP-341750442-172.20.46.198-1676892752466:blk_1073745146_4322
连接datanode使用的是私网ip,难怪连接不到,配置外网访问datanode就可以了
解决
云主机hosts文件配内网ip,本地windows电脑配置主机名对应公网ip。
在 hdfs-site.xml 中配置以下属性,放入项目的 resources 资源目录,重新打包运行:
<property>
<name>dfs.client.use.datanode.hostname</name>
<value>true</value>
</property>
<property>
<name>dfs.datanode.use.datanode.hostname</name>
<value>true</value>
</property>
再次运行
![](https://i-blog.csdnimg.cn/blog_migrate/9a67fc245547a6493a68d9b2938e8d6f.png)
成功!