HDFS file operations and Hive table operations in Spark

file:///home/text1.txt reads from the local filesystem

hdfs://clusterA/direct1/text1.txt reads from the cluster (HDFS)
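
A minimal sketch, assuming an existing SparkSession named spark and that both example files exist — the URI scheme alone decides where the data is read from:

// The scheme in the path decides the source filesystem.
val localLines = spark.sparkContext.textFile("file:///home/text1.txt")
val hdfsLines  = spark.sparkContext.textFile("hdfs://clusterA/direct1/text1.txt")
println(s"local: ${localLines.count()} lines, hdfs: ${hdfsLines.count()} lines")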

import java.io.OutputStreamWriter
import org.apache.hadoop.fs.{FSDataOutputStream, FileSystem, Path}
import org.apache.spark.sql.{DataFrame, SaveMode, SparkSession}
import org.apache.spark.sql.functions.lit
import org.apache.spark.sql.hive.HiveContext

/**
  * Read an HDFS file
  *
  * @param aPath path of the file to read, e.g. hdfs://clusterA/direct1/text1.txt
  * @return the file content as an Array[String], empty if the path does not exist
  */

def readHdfsFile(aPath: String): Array[String] = {
    val sc = SparkSession.builder().enableHiveSupport().getOrCreate()
    val pathArr = aPath.split("//")
    val uri = pathArr.head + "//" + pathArr(1).split("/").head
    val path = new Path(aPath)
    val hdfs = org.apache.hadoop.fs.FileSystem.get(
        new java.net.URI(uri),
        new org.apache.hadoop.conf.Configuration())
    if (hdfs.exists(path)) sc.sparkContext.textFile(aPath).collect()
    else Array.empty[String]
}
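
A quick usage sketch of readHdfsFile as defined above (the path is the example path and may not exist in your cluster):

// Hypothetical call; prints the first few lines of the file.
val lines = readHdfsFile("hdfs://clusterA/direct1/text1.txt")
lines.take(5).foreach(println)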


/**
  * Delete an HDFS directory
  *
  * @param aPath path to delete, e.g. hdfs://clusterA/direct1
  * @return
  */
def deleteHdfsPath(aPath: String) = {
    val pathArr = aPath.split("//")
    val uri = pathArr.head + "//" + pathArr(1).split("/").head
    val path = new Path(aPath)
    val hdfs = org.apache.hadoop.fs.FileSystem.get(
        new java.net.URI(uri),
        new org.apache.hadoop.conf.Configuration())
    if (hdfs.exists(path))
        hdfs.delete(path, true) // second argument true = delete recursively
}
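
Typical usage is clearing an output directory before it is rewritten, as the saveDF example further below does — a sketch using the example path:

// Hypothetical: recursively remove the old output before a fresh write.
deleteHdfsPath("hdfs://clusterA/direct1")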

/**
  * Get the modification time of a file
  *
  * @param aPath path of the file, e.g. hdfs://clusterA/direct1/file.txt
  * @return the modification time as epoch milliseconds (Long), None if the path does not exist
  */
def getMdfTime(aPath: String): Option[Long] = {
    val pathArr = aPath.split("//")
    val uri = pathArr.head + "//" + pathArr(1).split("/").head
    val path = new Path(aPath)
    val hdfs = org.apache.hadoop.fs.FileSystem.get(
        new java.net.URI(uri),
        new org.apache.hadoop.conf.Configuration())
    if (hdfs.exists(path)) {
        val fileSt = hdfs.getFileStatus(path)
        Some(fileSt.getModificationTime) // millisecond timestamp, Long
    } else None
}
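
The value returned is epoch milliseconds; a small sketch that formats it as a readable date (the path is the example above):

// Hypothetical: print the modification time of the example file.
getMdfTime("hdfs://clusterA/direct1/file.txt").foreach { ms =>
    val fmt = new java.text.SimpleDateFormat("yyyy-MM-dd HH:mm:ss")
    println(fmt.format(new java.util.Date(ms)))
}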

/**
  * HDFS file write with Chinese-character support - overwrite mode
  *
  * @param aPath path of the file to write, e.g. hdfs://clusterA/direct1/file.txt
  * @param content lines of text to write
  * @return
  */

  def writeOverwrite(aPath: String, content: Iterator[String], hdfs: FileSystem) = {
    // UTF-8 writer so multi-byte (e.g. Chinese) characters are preserved
    val path = new Path(aPath)
    val out = new OutputStreamWriter(hdfs.create(path, true), "UTF-8") // true = overwrite; with false an existing path throws an exception
    content.foreach(str => out.write(str + "\n"))
    out.flush()
    out.close()
  }
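
A usage sketch: obtain a FileSystem handle the same way the helpers above do, then pass an Iterator of lines (path and content are assumptions):

// Hypothetical: overwrite the example file with two lines, one containing Chinese text.
val fs = org.apache.hadoop.fs.FileSystem.get(
    new java.net.URI("hdfs://clusterA"),
    new org.apache.hadoop.conf.Configuration())
writeOverwrite("hdfs://clusterA/direct1/file.txt", Iterator("第一行", "second line"), fs)
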
/**
  * HDFS file write with Chinese-character support - append mode
  *
  * @param filename path of the file to write, e.g. hdfs://clusterA/direct1/file.txt
  * @param content lines of text to write
  * @return
  */
   
 def writeAppend(filename: String, content: Iterator[String], hdfs: FileSystem) = {
      // getBytes("UTF-8") so multi-byte (e.g. Chinese) characters are preserved
      val path = new Path(filename)
      var fileOutputStream: FSDataOutputStream = null
      try {
        fileOutputStream =
          if (hdfs.exists(path)) hdfs.append(path) // append to the existing file
          else hdfs.create(path)                   // otherwise create it first
        content.foreach(x => fileOutputStream.write((x + "\n").getBytes("UTF-8")))
      } finally {
        if (fileOutputStream != null) fileOutputStream.close()
      }
    }
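
Append-mode usage mirrors the overwrite sketch and reuses the same FileSystem handle (fs from the previous sketch; the path is an assumption):

// Hypothetical: append two lines; the file is created on the first call if it does not exist.
writeAppend("hdfs://clusterA/direct1/file.txt", Iterator("追加的一行", "appended line"), fs)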


/**
  * Save a DataFrame into the specified table
  *
  * @param tableNme target Hive table
  * @param hdfsPath HDFS path backing the table's partitions
  * @return
  */
 def saveDF(sqlContext: HiveContext, tableNme: String, hdfsPath: String, DF: DataFrame,  day: String, hour: String, numPartitions: Int): Unit = {

      // Method 1
        val savePath = hdfsPath + "/pt_d=" + day + "/pt_h=" + hour
        val sqlcode = "alter table " + tableNme + " add if not exists partition (pt_d='" + day + "', pt_h='" + hour + "')"
        // Overwrite mode
        deleteHdfsPath(savePath) // delete the files under the path first
        DF.repartition(numPartitions).write.format("orc").save(savePath)
        // Append mode: //DF.repartition(numPartitions).write.mode("append").format("orc").save(savePath)
        sqlContext.sql("use biads")
        sqlContext.sql(sqlcode)

 // Method 2
    val sc = SparkSession.builder().enableHiveSupport().getOrCreate()
    DF.createOrReplaceTempView(tempViewName)
    sc.sql(s"INSERT OVERWRITE TABLE biads.${logTableName} PARTITION (pt_d='${timeD}',pt_h='${timeH}',pt_channel='${pt_channel}') select * from ${tempViewName}") // to append to the table instead, change OVERWRITE to INTO

// Method 3
val DATASCHEMA = Array("c1", "c2") // must match the column names of the Hive table

val savePath = path + "/pt_d=" + date + "/pt_h=" + hour + "/pt_min=" + minute
          rdd.toDF(DATASCHEMA: _*)
            .coalesce(1)
            .write.mode("append").format("orc").save(savePath) // .mode("append") can be changed to .mode("overwrite")
          val sqlCode = "alter table " + table + " add if not exists partition (pt_d='" + date + "', pt_h='" + hour + "', pt_min='" + minute + "')"
          sqlContext.sql(sqlCode)


// Method 4 - dynamic partitioning
df = df.withColumn("pt_m", lit(ptm)) // add pt_m as the partition column
df.write.mode(SaveMode.Overwrite).insertInto(tableName)

// Note: dynamic partitioning requires the following two settings to be enabled:
// hive.exec.dynamic.partition = true
// hive.exec.dynamic.partition.mode = nonstrict


That is:
sc = SparkSession.builder()
      .config("hive.exec.dynamic.partition", "true")
      .config("hive.exec.dynamic.partition.mode", "nonstrict")
      .config("spark.sql.parquet.writeLegacyFormat", "true")
      .config("spark.sql.sources.partitionOverwriteMode", "dynamic")
      .enableHiveSupport().getOrCreate()


}
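
Putting method 4 together as one self-contained function — a sketch, with tableName and ptm as assumed inputs; note that insertInto matches columns by position, so the data columns must come first and the partition column (pt_m) last, matching the Hive table definition:

// Assumed sketch of the dynamic-partition write described above.
def writeDynamicPartition(df: DataFrame, tableName: String, ptm: String): Unit = {
  val spark = SparkSession.builder()
    .config("hive.exec.dynamic.partition", "true")
    .config("hive.exec.dynamic.partition.mode", "nonstrict")
    .config("spark.sql.sources.partitionOverwriteMode", "dynamic")
    .enableHiveSupport().getOrCreate()
  df.withColumn("pt_m", lit(ptm))          // add the partition column
    .write.mode(SaveMode.Overwrite)
    .insertInto(tableName)                 // position-based column matching
}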











Output an RDD to a single file

yourRDD.coalesce(1).saveAsTextFile("outputPath")
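
The same single-file idea for a DataFrame, assuming a DataFrame named yourDF; coalesce(1) funnels everything through one task, so it only suits small outputs:

// Hypothetical: write a DataFrame out as a single ORC part file.
yourDF.coalesce(1).write.mode("overwrite").format("orc").save("outputPath")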
