spark利用scala操作hdfs

14 篇文章 1 订阅

1、读取 HDFS 目录:FileSystem.listStatus 的作用相当于命令行 hadoop dfs -ls path

//checkpoint目录是:/user/dmspark/accumulate/checkpoint
//e.g. /user/dmspark/accumulate/checkpoint/0519936a-5bff-4ecf-a6f0-3854e5952ec9/rdd-689/part-00099

/**
 * Locates the most recent usable RDD checkpoint directory under `checkpointDir`.
 *
 * The lookup descends two levels: the most recently modified entry of
 * `checkpointDir`, then the most recently modified entry inside it
 * (e.g. `<checkpointDir>/<uuid>/rdd-689`).
 *
 * @param checkpointDir root checkpoint directory
 * @return the chosen directory as a string, or `None` when no candidate exists
 */
private def getLatestCheckpoint(checkpointDir: String): Option[String] = {
    val fs = FileSystem.get(new Configuration())
    val latestCheckpointDir = findLatestSubDir(new Path(checkpointDir))
    // NOTE(review): the "+ 1" presumably accounts for one extra non-partition
    // file written next to the part-* files -- confirm against the writer.
    val expectedEntryCount = NumPartitionsOfReducedRDD + 1

    // findLatestSubDir signals "nothing found" with null; wrap it so an empty or
    // missing checkpoint tree yields None instead of passing null to listStatus.
    Option(findLatestSubDir(latestCheckpointDir))
      .flatMap { newest =>
        if (fs.listStatus(newest).length == expectedEntryCount) Some(newest)
        // The newest checkpoint is incomplete: fall back to the previous one
        // (the checkpoint data from 5 minutes earlier, per the original note).
        else Option(findLatestSubDir(latestCheckpointDir, 1))
      }
      .map(_.toString)
}

/**
 * Returns the `index`-th most recently modified entry directly under `path`
 * (0 = newest), ordered by modification time.
 *
 * Relies on a `FileSystem` named `fs` being available in the enclosing scope.
 *
 * @param path  directory to inspect; may be null
 * @param index zero-based rank among the entries, newest first
 * @return the entry's path, or null when `path` is null, does not exist,
 *         or holds no more than `index` entries
 */
def findLatestSubDir(path: Path, index: Int = 0): Path = {
   val usable = path != null && fs.exists(path)
   if (!usable) null
   else {
     // Ascending sort followed by reverse puts the newest entry first.
     val newestFirst = fs.listStatus(path).sortBy(_.getModificationTime).reverse
     if (index < newestFirst.length) newestFirst(index).getPath else null
   }
}

2、将 HDFS 文件按行读取到内存中的 List

  /**
   * Reads an HDFS text file line by line into an in-memory list.
   *
   * The file is opened through the file system at `hdfs://localhost:8021` and
   * decoded as UTF-8.
   *
   * @param file path of the file to read
   * @return every line of the file, in order, without line terminators
   */
  def readToList(file: Path): List[String] = {
    val uRI = "hdfs://localhost:8021"
    val hdfs: FileSystem = FileSystem.get(URI.create(uRI), new Configuration())
    val reader = new BufferedReader(new InputStreamReader(hdfs.open(file), "UTF8"))
    try {
      // readLine returns null at end of stream. Assignment is Unit-typed in
      // Scala, so the Java-style `(line = readLine) != null` loop test is
      // always true; iterate until the null sentinel instead.
      Iterator.continually(reader.readLine()).takeWhile(_ != null).toList
    } finally {
      // Closing the reader also closes the wrapped HDFS input stream. The
      // FileSystem handle is left open because FileSystem.get may return a
      // shared, cached instance.
      reader.close()
    }
  }
 

  /**
   * Entry point: loads a fixed HDFS file through `readToList` and prints the
   * number of lines it contains.
   *
   * @param args command-line arguments (unused)
   */
  def main(args: Array[String]): Unit = {
    val catecodeFile = new Path("/user/dmspark/product3_source_conf/catecode_info.txt")
    val lines = readToList(catecodeFile)
    println(lines.size)
  }

 

  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值