Deleting existing HDFS data and writing new HDFS data
import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.{FileSystem, Path}
import org.apache.spark.{SparkConf, SparkContext}

val conf = new SparkConf().setMaster("yarn-cluster")
conf.set("spark.sql.shuffle.partitions", shufflePart)  // shufflePart: shuffle parallelism, defined elsewhere in the job
val hadoopConf: Configuration = sc.hadoopConfiguration
val sngfs: FileSystem = new Path("hdfs://xxx/").getFileSystem(hadoopConf)
val train_path = "hdfs://xxx/" + endTime + "/"
val test_path = "hdfs://xxx/" + endTime + "/"
// Remove the old output directories if they exist. The second argument to
// delete() must be true (recursive): delete(path, false) fails on non-empty directories.
if (sngfs.exists(new Path(train_path))) {
  sngfs.delete(new Path(train_path), true)
}
if (sngfs.exists(new Path(test_path))) {
  sngfs.delete(new Path(test_path), true)
}
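With the old directories cleared, the new datasets can be written to the same paths. A minimal sketch of the writing half, assuming hypothetical DataFrames trainDF and testDF were prepared earlier in the job:

// trainDF and testDF are assumed DataFrames built earlier (names are illustrative).
// Writing to the freshly deleted paths recreates the directories.
trainDF.write.parquet(train_path)
testDF.write.parquet(test_path)

Note that for DataFrame output specifically, .mode("overwrite") would make the manual exists/delete step above unnecessary; the explicit deletion matters for APIs such as RDD.saveAsTextFile, which fail outright when the target directory already exists.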
Getting the latest HDFS partition paths
def getLastHdfsPath(sc: SparkContext, path: String, date_num: Int): Array[String] = {
  /*
   * Returns the most recent time-partition directories under an HDFS path.
   * path:     HDFS path containing date-named partition directories
   * date_num: number of most recent partitions to return
   * returns:  full paths of the date_num most recent partitions
   */
  val fs = new Path(path).getFileSystem(sc.hadoopConfiguration)
  val status = fs.listStatus(new Path(path))
  // Sort directory names in reverse lexicographic order, which is reverse
  // chronological for zero-padded date names (e.g. 20240102 > 20240101),
  // then keep the date_num most recent ones.
  val last_path = status
    .sortBy(_.getPath.getName)(Ordering[String].reverse)
    .map(_.getPath.toString)
    .take(date_num)
  last_path
}
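A usage sketch, assuming a hypothetical base directory whose partition folders are named by date (e.g. 20240101/) and an available SparkSession named spark:

// Fetch the three most recent date partitions under an assumed base path
val recentPartitions = getLastHdfsPath(sc, "hdfs://xxx/daily/", 3)
// Read all of them in one go; parquet() accepts multiple paths
val recentDF = spark.read.parquet(recentPartitions: _*)

Because the ordering is purely lexicographic, this only selects the truly newest partitions when the directory names are zero-padded dates of uniform length.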