Scala操作hdfs文件

最新推荐文章于 2021-02-04 20:03:05 发布

跌跌撞撞创斯你

最新推荐文章于 2021-02-04 20:03:05 发布

阅读量1.4k

点赞数

分类专栏： scala hadoop

hadoop 同时被 2 个专栏收录

4 篇文章 0 订阅

订阅专栏

scala

0 篇文章 0 订阅

订阅专栏

package scala_test_lyh

import java.io._
import java.net.URI
import java.util._
import org.apache.commons.lang3.StringUtils
import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs._
import org.apache.zookeeper.common.IOUtils

/**
  * Small helper object for common HDFS operations (mkdir, list, upload,
  * create, read, append, delete).
  *
  * Every public method takes a path relative to [[hdfsUrl]] (for example
  * '/tmp/testdir') and treats a null/blank path as "nothing to do".
  * I/O failures raised by the Hadoop client are propagated to the caller.
  */
object HDFSUtil {
  /** Base URI of the namenode. The value is a placeholder and must be replaced with the real host and port. */
  val hdfsUrl = "hdfs://namenode的ip地址:端口"

  /**
    * Full URI of the most recently requested path.
    * Kept (and still updated) only for backward compatibility; the methods
    * below no longer read it, so concurrent calls cannot corrupt each other.
    */
  var realUrl = ""

  /** Charset used for every String <-> bytes conversion in this object. */
  private val Utf8 = java.nio.charset.StandardCharsets.UTF_8

  /**
    * Builds the full URI for an HDFS path and records it in [[realUrl]].
    */
  private def resolve(hdfsPath : String) : String = {
    val url = hdfsUrl + hdfsPath
    realUrl = url
    url
  }

  /**
    * Runs `body` with a private FileSystem handle and always closes it.
    *
    * FileSystem.newInstance is used instead of FileSystem.get: the latter
    * hands out a shared cached instance, and closing that instance would
    * break every other user of it in the same JVM.
    */
  private def withFileSystem[A](url : String, config : Configuration = new Configuration())
                               (body : FileSystem => A) : A = {
    val fs = FileSystem.newInstance(URI.create(url), config)
    try body(fs) finally fs.close()
  }

  /**
    * Recursively deletes the given path and reports what FileSystem.delete returned.
    */
  private def deleteRecursively(hdfsPath : String) : Boolean = {
    if (StringUtils.isNotBlank(hdfsPath)) {
      val url = resolve(hdfsPath)
      withFileSystem(url) { fs => fs.delete(new Path(url), true) }
    } else {
      false
    }
  }

  /**
    * make a new dir in the hdfs
    *
    * @param dir the dir may like '/tmp/testdir'
    * @return boolean true when the path already exists or was created,
    *         false when dir is blank or mkdirs reported failure
    */
  def mkdir(dir : String) : Boolean = {
    if (StringUtils.isNotBlank(dir)) {
      val url = resolve(dir)
      withFileSystem(url) { fs =>
        val path = new Path(url)
        fs.exists(path) || fs.mkdirs(path)
      }
    } else {
      false
    }
  }

  /**
    * delete a dir (recursively) in the hdfs.
    *
    * @param dir the dir may like '/tmp/testdir'
    * @return boolean the result of FileSystem.delete; false when dir is blank
    *
    */
  def deleteDir(dir : String) : Boolean = deleteRecursively(dir)

  /**
    * list files/directories/links names under a directory, not include embed
    * objects
    *
    * @param dir a folder path may like '/tmp/testdir'
    * @return java.util.List of the full path strings of the direct children;
    *         empty when dir is blank
    */
  def listAll(dir : String) : List[String] = {
    val names : List[String] = new ArrayList[String]()
    if (StringUtils.isNotBlank(dir)) {
      val url = resolve(dir)
      withFileSystem(url) { fs =>
        // Files, directories and symlinks are all reported the same way.
        fs.listStatus(new Path(url)).foreach(status => names.add(status.getPath.toString))
      }
    }
    names
  }

  /**
    * upload the local file to the hdfs,
    * notice that the path is full like /tmp/test.txt
    *
    * @param localFile local file path, may like F:/test.txt or /usr/local/test.txt
    *
    * @param hdfsFile hdfs file path, may like /tmp/dir
    * @return boolean true once the copy finished, false when either argument is blank
    *
    **/
  def uploadLocalFile2HDFS(localFile : String, hdfsFile : String) : Boolean = {
    if (StringUtils.isNotBlank(localFile) && StringUtils.isNotBlank(hdfsFile)) {
      val url = resolve(hdfsFile)
      withFileSystem(hdfsUrl) { fs =>
        fs.copyFromLocalFile(new Path(localFile), new Path(url))
      }
      true
    } else {
      false
    }
  }

  /**
    * create a new file in the hdfs and write the content to it, encoded as UTF-8.
    * notice that newFile is the full path.
    *
    * @param newFile new file path, a full path name, may like '/tmp/test.txt'
    * @param content file content
    * @return boolean true once the content was written,
    *         false when newFile is blank or content is null
    **/
  def createNewHDFSFile(newFile : String, content : String) : Boolean = {
    if (StringUtils.isNotBlank(newFile) && null != content) {
      val url = resolve(newFile)
      withFileSystem(url) { fs =>
        val out = fs.create(new Path(url))
        try out.write(content.getBytes(Utf8)) finally out.close()
      }
      true
    } else {
      false
    }
  }

  /**
    * delete the hdfs file
    *
    * @param hdfsFile a full path name, may like '/tmp/test.txt'
    * @return boolean the result of FileSystem.delete; false when hdfsFile is blank
    */
  def deleteHDFSFile(hdfsFile : String) : Boolean = deleteRecursively(hdfsFile)

  /**
    * read the hdfs file content
    *
    * @param hdfsFile a full path name, may like '/tmp/test.txt'
    * @return byte[] file content; an empty array when hdfsFile is blank or
    *         the file does not exist
    * @throws IOException when the file is too large to fit into a single byte array
    */
  def readHDFSFile(hdfsFile : String) : Array[Byte] = {
    if (StringUtils.isNotBlank(hdfsFile)) {
      val url = resolve(hdfsFile)
      withFileSystem(url) { fs =>
        val path = new Path(url)
        if (fs.exists(path)) {
          val length = fs.getFileStatus(path).getLen
          // A JVM array is indexed by Int; refuse instead of silently truncating.
          if (length > Int.MaxValue) {
            throw new IOException(s"File $url is too large to read into memory: $length bytes")
          }
          val buffer = new Array[Byte](length.toInt)
          val input = fs.open(path)
          try input.readFully(buffer) finally input.close()
          buffer
        } else {
          new Array[Byte](0)
        }
      }
    } else {
      new Array[Byte](0)
    }
  }

  /**
    * append something to file dst, encoded as UTF-8 (the same encoding
    * createNewHDFSFile uses). If the file does not exist it is created
    * with the given content instead.
    *
    * @param hdfsFile a full path name, may like '/tmp/test.txt'
    * @param content string
    * @return boolean true-success, false when hdfsFile is blank or content is null
    */
  def append(hdfsFile : String, content : String) : Boolean = {
    if (StringUtils.isNotBlank(hdfsFile) && null != content) {
      val url = resolve(hdfsFile)
      val config = new Configuration()
      config.set("dfs.client.block.write.replace-datanode-on-failure.policy", "NEVER")
      config.set("dfs.client.block.write.replace-datanode-on-failure.enable", "true")
      val appended = withFileSystem(url, config) { fs =>
        val path = new Path(url)
        if (fs.exists(path)) {
          val out = fs.append(path)
          try out.write(content.getBytes(Utf8)) finally out.close()
          true
        } else {
          false
        }
      }
      // The append handle is already closed here, so creating the file cannot conflict with it.
      appended || createNewHDFSFile(hdfsFile, content)
    } else {
      false
    }
  }
}

简单测试代码

package scala_test_lyh


/**
  * Manual smoke test for HDFSUtil: reads '/input/manifest.json' from HDFS and
  * prints it. The commented-out lines show how to call the other operations.
  */
object testHDFSUtil {
  def main(args: Array[String]): Unit = {
    // Act as the "hdfs" user; otherwise the calls fail with a permission error.
    System.setProperty("HADOOP_USER_NAME", "hdfs")
    //val result=HDFSUtil.mkdir("/input1")
    //val result=HDFSUtil.deleteDir("/input1")
    //val result=HDFSUtil.listAll("/input")
    //val result=HDFSUtil.uploadLocalFile2HDFS("C:\\Users\\admin\\Desktop\\lyh.zip","/input")
    //val result=HDFSUtil.createNewHDFSFile("/input/test1.txt","hello world")
    //HDFSUtil.deleteHDFSFile("/input/lyh.zip")
    //HDFSUtil.deleteHDFSFile("/input/test1.txt")
    val arr=HDFSUtil.readHDFSFile("/input/manifest.json")
    // Decode explicitly as UTF-8 instead of relying on the platform default charset.
    val str=new String(arr, java.nio.charset.StandardCharsets.UTF_8)
    println(str)

  }
}

测试spark2环境的简单wordCount代码：

package scala_test_lyh

import java.net.URI
import com.alibaba.fastjson.JSON
import org.apache.commons.lang3.StringUtils
import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.{FileSystem, Path}
import org.apache.spark.{SparkConf, SparkContext}

/**
  * Spark smoke test: prints a word count of one HDFS text file and two fields
  * ("lastUpdated", "parcels") of one HDFS JSON file.
  *
  * Arguments: args(0) = HDFS path of the text file to count,
  *            args(1) = HDFS path of the JSON file to parse.
  */
object wordCount {
  /** Base URI of the namenode. The value is a placeholder and must be replaced with the real host and port. */
  val hdfsUrl = "hdfs://namenode的ip地址:端口"

  /** Full URI of the most recently read file; kept and updated for backward compatibility only. */
  var realUrl = ""

  def main(args: Array[String]): Unit = {
    require(args.length >= 2,
      "usage: wordCount <hdfs path of text file> <hdfs path of json file>")
    System.setProperty("HADOOP_USER_NAME", "hdfs")
    // Only fall back to local mode when no master was supplied. A hard-coded
    // setMaster("local") would override `spark-submit --master yarn`.
    val conf=new SparkConf().setAppName("wordCount").setIfMissing("spark.master", "local")

    val sc=new SparkContext(conf)
    try {
      val rdd=sc.textFile(hdfsUrl+args(0))

      rdd.filter(_.trim.length>0)
        .flatMap(_.split(" "))
        .map((_,1)) // pair every word with a count of 1
        .reduceByKey(_+_) // sum the counts of identical words
        .collect.foreach(println)

      val arr=readHDFSFile(args(1))
      val manifest_json=new String(arr, java.nio.charset.StandardCharsets.UTF_8)
      // parseObject yields null for empty input (missing file); report that instead of failing with an NPE.
      Option(JSON.parseObject(manifest_json)) match {
        case Some(json) =>
          println(json.get("lastUpdated"))
          println(json.getJSONArray("parcels"))
        case None =>
          System.err.println(s"JSON file ${args(1)} is missing or empty")
      }
    } finally {
      sc.stop()
    }
  }

  /**
    * read the hdfs file content
    *
    * @param hdfsFile a full path name, may like '/tmp/test.txt'
    * @return file content; an empty array when hdfsFile is blank or the file does not exist
    * @throws java.io.IOException when the file is too large to fit into a single byte array
    */
  def readHDFSFile(hdfsFile : String) : Array[Byte] = {
    if (StringUtils.isNotBlank(hdfsFile)) {
      val url = hdfsUrl + hdfsFile
      realUrl = url
      // newInstance returns a private handle. Closing the cached instance from
      // FileSystem.get would also close the one Spark itself is using.
      val hdfs = FileSystem.newInstance(URI.create(url), new Configuration())
      try {
        val path = new Path(url)
        if (hdfs.exists(path)) {
          val length = hdfs.getFileStatus(path).getLen
          // A JVM array is indexed by Int; refuse instead of silently truncating.
          if (length > Int.MaxValue) {
            throw new java.io.IOException(s"File $url is too large to read into memory: $length bytes")
          }
          val buffer = new Array[Byte](length.toInt)
          val inputStream = hdfs.open(path)
          try inputStream.readFully(buffer) finally inputStream.close()
          buffer
        } else {
          new Array[Byte](0)
        }
      } finally {
        hdfs.close()
      }
    } else {
      new Array[Byte](0)
    }
  }
}

把上述spark测试代码打成jar包，和依赖第三方jar包fastjson-1.2.47.jar上传到集群环境

需要2个参数：（1）计算wordCount的文件在hdfs上的地址；（2）解析json的文件在hdfs上的地址。

本地模式运行命令：

spark-submit --jars fastjson-1.2.47.jar --class scala_test_lyh.wordCount test.jar /input/wordCount.txt /input/manifest.json

集群spark on yarn模式：

spark-submit --master yarn --jars fastjson-1.2.47.jar --class scala_test_lyh.wordCount test.jar /input/wordCount.txt /input/manifest.json

跌跌撞撞创斯你

关注

0
点赞
踩
2

收藏

觉得还不错? 一键收藏
1
评论
Scala操作hdfs文件

package scala_test_lyhimport java.io._import java.net.URIimport java.util._import org.apache.commons.lang3.StringUtilsimport org.apache.hadoop.conf.Configurationimport org.apache.hadoop.fs._im...
复制链接

扫一扫