scala遍历hdfs上的文件，传给sparksql写入到hive的分区表

最新推荐文章于 2023-02-01 18:35:16 发布

dbbigdata

最新推荐文章于 2023-02-01 18:35:16 发布

阅读量759

点赞数 1

文章标签： spark hdfs 大数据 hive

本文链接：https://blog.csdn.net/dbbigdata/article/details/107660259

版权

scala遍历hdfs上的文件，传给sparksql写入到hive的分区表

需求：用spark解析某个目录下的大量小文件后写入到hive表中，hive是分区表，分区的内容是目录下文件的名称。需要先用scala获取到这个目录下的各个文件名，然后传给spark解析，并把文件名变量传给hive的分区，当作hive的分区名。
比如：/data/temp/2020-07-28ttt.csv /data/temp/2020-07-29ttt.csv /data/temp/2020-07-30ttt.csv
csv数据里没有时间字段，因此获取不了hive分区的内容。需要先遍历文件名，截取时间字段当作hive的分区。
代码如下:

def main(args: Array[String]): Unit = {
    // Entry point: walks /data/temp/ on HDFS, and for every file whose name starts
    // with a yyyy-MM-dd date, loads it as CSV and writes the `name` and `id` columns
    // into the Hive table ods.test, in the partition day=<yyyyMMdd taken from the name>.
    val spark: SparkSession = SparkSession.builder().appName("boke")
      .enableHiveSupport().getOrCreate()

    val paths = "/data/temp/"

    // The partition value comes from the file name, not from the CSV content.
    // Matching the date with a pattern (instead of fixed character offsets) keeps
    // this correct no matter how long the rest of the file name is,
    // e.g. 2020-07-28ttt.csv -> 20200728.
    val datePrefix = """^(\d{4})-(\d{2})-(\d{2})""".r

    // A single FileSystem handle for the whole run. newInstance() hands back a
    // non-cached instance, so this method owns it and closes it in the finally below.
    val fs: FileSystem = FileSystem.newInstance(URI.create(paths), new Configuration())

    // Recursively collects every regular file below `dir`. The FileStatus returned
    // by listStatus already says whether an entry is a file, so no extra lookup
    // per entry is needed.
    def getAllFiles(dir: Path): Seq[Path] =
      fs.listStatus(dir).toSeq.flatMap { status =>
        if (status.isFile) Seq(status.getPath) else getAllFiles(status.getPath)
      }

    try {
      for (file <- getAllFiles(new Path(paths))) {
        datePrefix.findFirstMatchIn(file.getName) match {
          case Some(m) =>
            val day: String = m.group(1) + m.group(2) + m.group(3)

            // Load the path exactly as it was listed, so files found in
            // sub-directories are read from where they actually live.
            spark.read.format("csv")
              .option("header", false)
              .option("multiLine", true)
              .schema(tableSchemas())
              .load(file.toString)
              .createOrReplaceTempView("t1")
            // NOTE: `insert overwrite` replaces the whole partition, so if two files
            // carry the same date the one processed last wins.
            spark.sql(s"""
                         |insert overwrite table ods.test partition(day='$day')
                         |select name,id from t1""".stripMargin)

          case None =>
            // Without a date there is no partition value to derive; skip the file
            // rather than write it into a garbage partition.
            System.err.println(s"Skipping $file: file name does not start with a yyyy-MM-dd date")
        }
      }
    } finally {
      fs.close()
    }
  }

  /** Builds the schema used to read the CSV files: the columns `name`, `id` and
    * `score`, in that order, each a nullable string.
    *
    * @return the struct type describing one CSV row
    */
  def tableSchemas() : StructType = {
    val columnNames: Array[String] = "name,id,score".split(",")
    val columns: Array[StructField] = columnNames.map { columnName =>
      DataTypes.createStructField(columnName, DataTypes.StringType, true)
    }
    DataTypes.createStructType(columns)
  }