file:///home/text1.txt reads from the local filesystem
hdfs://clusterA/direct1/text1.txt reads from the cluster
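Both schemes can be passed straight to textFile; the path prefix decides which FileSystem implementation serves the read. A minimal sketch using the two example paths above:

import org.apache.spark.sql.SparkSession

val spark = SparkSession.builder().enableHiveSupport().getOrCreate()
val localLines = spark.sparkContext.textFile("file:///home/text1.txt")            // local filesystem
val hdfsLines  = spark.sparkContext.textFile("hdfs://clusterA/direct1/text1.txt") // cluster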
import java.io.OutputStreamWriter
import org.apache.hadoop.fs.{FSDataOutputStream, FileSystem, Path}
import org.apache.spark.sql.{DataFrame, SaveMode, SparkSession}
import org.apache.spark.sql.functions.lit
import org.apache.spark.sql.hive.HiveContext
/**
 * Read an HDFS file
 *
 * @param aPath path of the file to read, e.g. hdfs://clusterA/direct1/text1.txt
 * @return the lines of the file
 */
def readHdfsFile(aPath: String): Array[String] = {
  val spark = SparkSession.builder().enableHiveSupport().getOrCreate()
  val pathArr = aPath.split("//")
  val uri = pathArr.head + "//" + pathArr(1).split("/").head // e.g. hdfs://clusterA
  val path = new Path(aPath)
  val hdfs = org.apache.hadoop.fs.FileSystem.get(
    new java.net.URI(uri),
    new org.apache.hadoop.conf.Configuration())
  if (hdfs.exists(path))
    spark.sparkContext.textFile(aPath).collect()
  else
    Array.empty[String]
}
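A quick usage sketch (the sample path is an assumption):

val lines = readHdfsFile("hdfs://clusterA/direct1/text1.txt") // hypothetical path
lines.take(5).foreach(println)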
/**
 * Delete an HDFS directory
 *
 * @param aPath path to delete, e.g. hdfs://clusterA/direct1
 * @return true if the path existed and was deleted
 */
def deleteHdfsPath(aPath: String): Boolean = {
  val pathArr = aPath.split("//")
  val uri = pathArr.head + "//" + pathArr(1).split("/").head
  val path = new Path(aPath)
  val hdfs = org.apache.hadoop.fs.FileSystem.get(
    new java.net.URI(uri),
    new org.apache.hadoop.conf.Configuration())
  if (hdfs.exists(path))
    hdfs.delete(path, true) // recursive delete
  else false
}
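The URI parsing and FileSystem.get boilerplate above repeats in every helper; a minimal sketch factoring it out (fsFor is a hypothetical name, not in the original):

// Hypothetical helper: derive the FileSystem from a fully qualified path
def fsFor(aPath: String): FileSystem = {
  val pathArr = aPath.split("//")
  val uri = pathArr.head + "//" + pathArr(1).split("/").head
  FileSystem.get(new java.net.URI(uri), new org.apache.hadoop.conf.Configuration())
}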
/**
 * Get a file's last-modified time
 *
 * @param aPath file to inspect, e.g. hdfs://clusterA/direct1/file.txt
 * @return the modification time as a millisecond timestamp, if the file exists
 */
def getMdfTime(aPath: String): Option[Long] = {
  val pathArr = aPath.split("//")
  val uri = pathArr.head + "//" + pathArr(1).split("/").head
  val path = new Path(aPath)
  val hdfs = org.apache.hadoop.fs.FileSystem.get(
    new java.net.URI(uri),
    new org.apache.hadoop.conf.Configuration())
  if (hdfs.exists(path)) {
    val fileSt = hdfs.getFileStatus(path)
    Some(fileSt.getModificationTime) // millisecond timestamp, Long
  } else None
}
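To render the millisecond timestamp as a readable date, something like this works (the format string is an assumption):

import java.text.SimpleDateFormat

getMdfTime("hdfs://clusterA/direct1/file.txt").foreach { ms =>
  val fmt = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss")
  println(fmt.format(new java.util.Date(ms)))
}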
/**
 * Write an HDFS file with Chinese-character support - overwrite mode
 *
 * @param aPath   path of the file to write, e.g. hdfs://clusterA/direct1/file.txt
 * @param content lines to write
 * @return
 */
def writeOverwrite(aPath: String, content: Iterator[String], hdfs: FileSystem) = {
  // Explicit UTF-8 so Chinese text survives regardless of the JVM default charset
  val path = new Path(aPath)
  val out = new OutputStreamWriter(hdfs.create(path, true), "UTF-8") // true = overwrite; with false an existing path throws
  content.foreach(str => out.write(str + "\n"))
  out.flush()
  out.close()
}
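A usage sketch (path and lines are assumptions):

val fs = fsFor("hdfs://clusterA/direct1/file.txt") // helper sketched above
writeOverwrite("hdfs://clusterA/direct1/file.txt", Iterator("第一行", "第二行"), fs)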
/**
 * Write an HDFS file with Chinese-character support - append mode
 *
 * @param filename path of the file to write, e.g. hdfs://clusterA/direct1/file.txt
 * @param content  lines to write
 * @return
 */
def writeAppend(filename: String, content: Iterator[String], hdfs: FileSystem) = {
  // Explicit UTF-8 so Chinese text survives regardless of the JVM default charset
  val path = new Path(filename)
  var fileOutputStream: FSDataOutputStream = null
  try {
    fileOutputStream =
      if (hdfs.exists(path)) hdfs.append(path) // append to the existing file
      else hdfs.create(path)                   // first write: create the file
    content.foreach(x => fileOutputStream.write((x + "\n").getBytes("UTF-8")))
  } finally {
    if (fileOutputStream != null) fileOutputStream.close()
  }
}
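Appending also requires the cluster to permit it (dfs.support.append on older Hadoop releases). A usage sketch under the same assumptions as above:

val fs2 = fsFor("hdfs://clusterA/direct1/file.txt") // helper sketched above
writeAppend("hdfs://clusterA/direct1/file.txt", Iterator("追加的一行"), fs2)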
/**
 * Save a DataFrame into a partition of the given Hive table
 *
 * @param sqlContext    HiveContext for running the DDL
 * @param tableNme      target table name
 * @param hdfsPath      table location on HDFS
 * @param DF            DataFrame to save
 * @param day           pt_d partition value
 * @param hour          pt_h partition value
 * @param numPartitions number of output files
 * @return
 */
def saveDF(sqlContext: HiveContext, tableNme: String, hdfsPath: String, DF: DataFrame, day: String, hour: String, numPartitions: Int): Unit = {
  // Method 1: write ORC files under the partition directory, then register the partition
  val savePath = hdfsPath + "/pt_d=" + day + "/pt_h=" + hour
  val sqlcode = "alter table " + tableNme + " add if not exists partition (pt_d='" + day + "', pt_h='" + hour + "')"
  // Overwrite mode
  deleteHdfsPath(savePath) // remove any existing files under the path first
  DF.repartition(numPartitions).write.format("orc").save(savePath)
  // Append mode: DF.repartition(numPartitions).write.mode("append").format("orc").save(savePath)
  sqlContext.sql("use biads")
  sqlContext.sql(sqlcode)

  // Method 2: insert via a temp view (tempViewName, logTableName, timeD, timeH, pt_channel are placeholders)
  val spark = SparkSession.builder().enableHiveSupport().getOrCreate()
  DF.createOrReplaceTempView(tempViewName)
  spark.sql(s"INSERT OVERWRITE TABLE biads.${logTableName} PARTITION (pt_d='${timeD}',pt_h='${timeH}',pt_channel='${pt_channel}') select * from ${tempViewName}") // to append instead, change OVERWRITE to INTO

  // Method 3: convert an RDD to a DataFrame, write ORC, then register the partition (rdd, path, date, hour, minute, table are placeholders)
  val DATASCHEMA = Array("c1", "c2") // must match the column names of the Hive table
  val savePath3 = path + "/pt_d=" + date + "/pt_h=" + hour + "/pt_min=" + minute
  rdd.toDF(DATASCHEMA: _*)
    .coalesce(1)
    .write.mode("append").format("orc").save(savePath3) // .mode("append") can be changed to .mode("overwrite")
  val sqlCode = "alter table " + table + " add if not exists partition (pt_d='" + date + "', pt_h='" + hour + "', pt_min='" + minute + "')"
  sqlContext.sql(sqlCode)

  // Method 4: dynamic partitioning (df and ptm are placeholders)
  val dfWithPt = df.withColumn("pt_m", lit(ptm)) // add pt_m as the partition column
  dfWithPt.write.mode(SaveMode.Overwrite).insertInto(tableName)
  // Note: dynamic partitioning requires the following two settings:
  // hive.exec.dynamic.partition = true
  // hive.exec.dynamic.partition.mode = nonstrict
  // i.e.:
  val sparkDyn = SparkSession.builder()
    .config("hive.exec.dynamic.partition", "true")
    .config("hive.exec.dynamic.partition.mode", "nonstrict")
    .config("spark.sql.parquet.writeLegacyFormat", "true")
    .config("spark.sql.sources.partitionOverwriteMode", "dynamic")
    .enableHiveSupport().getOrCreate()
}
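A call sketch for method 1, with hypothetical table and path names (someDF stands in for a real DataFrame):

saveDF(sqlContext, "my_table", "hdfs://clusterA/warehouse/my_table", someDF, "20240101", "08", 10)
sqlContext.sql("show partitions biads.my_table").show() // verify the partition was registered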
Write an RDD out as a single file:
yourRDD.coalesce(1).saveAsTextFile("outputPath")
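coalesce(1) funnels all the data through a single task, which can be slow for large RDDs. On Hadoop 2.x, FileUtil.copyMerge (removed in Hadoop 3) can merge the part files after a parallel write instead; a sketch with assumed paths:

import org.apache.hadoop.fs.{FileUtil, Path}

yourRDD.saveAsTextFile("hdfs://clusterA/tmp/parts") // parallel write, many part files
val conf = new org.apache.hadoop.conf.Configuration()
val fs = org.apache.hadoop.fs.FileSystem.get(new java.net.URI("hdfs://clusterA"), conf)
FileUtil.copyMerge(fs, new Path("hdfs://clusterA/tmp/parts"),
  fs, new Path("hdfs://clusterA/direct1/merged.txt"),
  true, conf, null) // true = delete the part directory after merging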