package scala_test_lyh
import java.io._
import java.net.URI
import java.util._
import org.apache.commons.lang3.StringUtils
import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs._
import org.apache.zookeeper.common.IOUtils
/**
 * Small collection of blocking helpers for common HDFS operations
 * (mkdir, delete, list, upload, create, read, append).
 *
 * Every helper takes a path relative to [[hdfsUrl]] (for example '/tmp/testdir'),
 * returns a "nothing done" value (false / empty) when the path argument is blank,
 * and lets any exception raised by the Hadoop client propagate to the caller.
 */
object HDFSUtil {

  /** Base URL of the namenode; the literal below is a placeholder that must be replaced. */
  val hdfsUrl = "hdfs://namenode的ip地址:端口"

  /**
   * Fully qualified URL used by the most recent call.
   * Kept (and still updated) for backward compatibility only; the helpers
   * themselves work on a local value so they do not depend on this shared state.
   */
  var realUrl = ""

  /**
   * Runs `body` with a private FileSystem handle and always closes it afterwards.
   *
   * FileSystem.newInstance is used instead of FileSystem.get because get hands out
   * a JVM-wide cached instance; closing that one would break every other user of
   * the same URI (for example a running SparkContext).
   */
  private def withFileSystem[T](url: String, config: Configuration = new Configuration())
                               (body: FileSystem => T): T = {
    val fs = FileSystem.newInstance(URI.create(url), config)
    try body(fs) finally fs.close()
  }

  /** Builds the full URL for `relativePath` and records it in [[realUrl]]. */
  private def resolve(relativePath: String): String = {
    val url = hdfsUrl + relativePath
    realUrl = url
    url
  }

  /**
   * Creates a directory (including missing parents) if it does not exist yet.
   *
   * @param dir the dir may like '/tmp/testdir'
   * @return true if the path already exists or mkdirs reported success;
   *         false if dir is blank or mkdirs reported failure
   */
  def mkdir(dir: String): Boolean = {
    if (StringUtils.isBlank(dir)) false
    else {
      val url = resolve(dir)
      withFileSystem(url) { fs =>
        val path = new Path(url)
        fs.exists(path) || fs.mkdirs(path)
      }
    }
  }

  /**
   * Recursively deletes a directory.
   *
   * @param dir the dir may like '/tmp/testdir'
   * @return the value reported by FileSystem.delete; false if dir is blank
   */
  def deleteDir(dir: String): Boolean = {
    if (StringUtils.isBlank(dir)) false
    else {
      val url = resolve(dir)
      withFileSystem(url)(fs => fs.delete(new Path(url), true))
    }
  }

  /**
   * Lists the direct children (files, directories, links) of a directory;
   * nested entries are not included.
   *
   * @param dir a folder path may like '/tmp/testdir'
   * @return a java.util.List holding the full path string of each child;
   *         empty if dir is blank
   */
  def listAll(dir: String): List[String] = {
    val names: List[String] = new ArrayList[String]()
    if (StringUtils.isNotBlank(dir)) {
      val url = resolve(dir)
      withFileSystem(url) { fs =>
        fs.listStatus(new Path(url)).foreach(status => names.add(status.getPath.toString))
      }
    }
    names
  }

  /**
   * Copies a local file into HDFS.
   *
   * @param localFile local file path, may like F:/test.txt or /usr/local/test.txt
   * @param hdfsFile  hdfs destination path, may like /tmp/dir
   * @return true once the copy finished; false if either argument is blank
   */
  def uploadLocalFile2HDFS(localFile: String, hdfsFile: String): Boolean = {
    if (StringUtils.isBlank(localFile) || StringUtils.isBlank(hdfsFile)) false
    else {
      val url = resolve(hdfsFile)
      withFileSystem(hdfsUrl) { fs =>
        fs.copyFromLocalFile(new Path(localFile), new Path(url))
        true
      }
    }
  }

  /**
   * Creates a file in HDFS and writes `content` to it, encoded as UTF-8.
   *
   * @param newFile new file path, a full path name, may like '/tmp/test.txt'
   * @param content file content
   * @return true once the content was written; false if newFile is blank or content is null
   */
  def createNewHDFSFile(newFile: String, content: String): Boolean = {
    if (StringUtils.isBlank(newFile) || content == null) false
    else {
      val url = resolve(newFile)
      withFileSystem(url) { fs =>
        val out = fs.create(new Path(url))
        // Close the stream even when the write fails.
        try out.write(content.getBytes("UTF-8")) finally out.close()
        true
      }
    }
  }

  /**
   * Deletes an HDFS file.
   *
   * @param hdfsFile a full path name, may like '/tmp/test.txt'
   * @return the value reported by FileSystem.delete; false if hdfsFile is blank
   */
  def deleteHDFSFile(hdfsFile: String): Boolean = {
    if (StringUtils.isBlank(hdfsFile)) false
    else {
      val url = resolve(hdfsFile)
      withFileSystem(url)(fs => fs.delete(new Path(url), true))
    }
  }

  /**
   * Reads a whole HDFS file into memory.
   *
   * @param hdfsFile a full path name, may like '/tmp/test.txt'
   * @return the file content; an empty array if hdfsFile is blank or the file does not exist
   * @throws IOException if the file is too large to fit into a single byte array
   */
  def readHDFSFile(hdfsFile: String): Array[Byte] = {
    if (StringUtils.isBlank(hdfsFile)) new Array[Byte](0)
    else {
      val url = resolve(hdfsFile)
      withFileSystem(url) { fs =>
        val path = new Path(url)
        if (!fs.exists(path)) new Array[Byte](0)
        else {
          val length = fs.getFileStatus(path).getLen
          // A JVM array is indexed by Int; refuse instead of silently truncating the length.
          if (length > Int.MaxValue) {
            throw new IOException(s"File too large to read into memory: $url ($length bytes)")
          }
          val buffer = new Array[Byte](length.toInt)
          val in = fs.open(path)
          try in.readFully(buffer) finally in.close()
          buffer
        }
      }
    }
  }

  /**
   * Appends `content` (encoded as UTF-8, like createNewHDFSFile) to an HDFS file.
   * If the file does not exist it is created with `content` instead.
   *
   * @param hdfsFile a full path name, may like '/tmp/test.txt'
   * @param content string
   * @return true once the content was written; false if hdfsFile is blank or content is null
   */
  def append(hdfsFile: String, content: String): Boolean = {
    if (StringUtils.isBlank(hdfsFile) || content == null) false
    else {
      val url = resolve(hdfsFile)
      val config = new Configuration()
      config.set("dfs.client.block.write.replace-datanode-on-failure.policy", "NEVER")
      config.set("dfs.client.block.write.replace-datanode-on-failure.enable", "true")
      withFileSystem(url, config) { fs =>
        val path = new Path(url)
        if (fs.exists(path)) {
          val out = fs.append(path)
          try out.write(content.getBytes("UTF-8")) finally out.close()
          true
        } else {
          createNewHDFSFile(hdfsFile, content)
        }
      }
    }
  }
}
简单测试代码
package scala_test_lyh
/**
 * Manual smoke test for HDFSUtil: reads '/input/manifest.json' from HDFS and
 * prints its content. The commented-out calls exercise the other helpers.
 */
object testHDFSUtil {
  def main(args: Array[String]): Unit = {
    // Act as the "hdfs" user; without it the calls fail with a permission error.
    System.setProperty("HADOOP_USER_NAME", "hdfs")
    //val result=HDFSUtil.mkdir("/input1")
    //val result=HDFSUtil.deleteDir("/input1")
    //val result=HDFSUtil.listAll("/input")
    //val result=HDFSUtil.uploadLocalFile2HDFS("C:\\Users\\admin\\Desktop\\lyh.zip","/input")
    //val result=HDFSUtil.createNewHDFSFile("/input/test1.txt","hello world")
    //HDFSUtil.deleteHDFSFile("/input/lyh.zip")
    //HDFSUtil.deleteHDFSFile("/input/test1.txt")
    val arr = HDFSUtil.readHDFSFile("/input/manifest.json")
    // Decode explicitly as UTF-8 instead of relying on the platform default charset.
    val str = new String(arr, "UTF-8")
    println(str)
  }
}
测试spark2环境的简单wordCount代码:
package scala_test_lyh
import java.net.URI
import com.alibaba.fastjson.JSON
import org.apache.commons.lang3.StringUtils
import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.{FileSystem, Path}
import org.apache.spark.{SparkConf, SparkContext}
/**
 * Spark smoke test: prints the word counts of a text file stored in HDFS and
 * then prints two fields of a JSON manifest that is also stored in HDFS.
 *
 * Arguments: args(0) = HDFS path of the text file, args(1) = HDFS path of the JSON file.
 */
object wordCount {

  /** Base URL of the namenode; the literal below is a placeholder that must be replaced. */
  val hdfsUrl = "hdfs://namenode的ip地址:端口"

  /** Fully qualified URL used by the most recent readHDFSFile call (kept for compatibility). */
  var realUrl = ""

  def main(args: Array[String]): Unit = {
    require(args.length >= 2,
      "usage: wordCount <hdfs path of text file> <hdfs path of json file>")
    System.setProperty("HADOOP_USER_NAME", "hdfs")

    val conf = new SparkConf().setAppName("wordCount")
    // Values set on SparkConf in code win over spark-submit flags, so only fall back
    // to "local" when no master was supplied (e.g. when started from an IDE).
    if (!conf.contains("spark.master")) {
      conf.setMaster("local")
    }

    val sc = new SparkContext(conf)
    try {
      sc.textFile(hdfsUrl + args(0))
        .filter(_.trim.length > 0)
        .flatMap(_.split(" "))
        .map((_, 1))        // turn every word into a (word, 1) pair
        .reduceByKey(_ + _) // sum the counts of identical words
        .collect()
        .foreach(println)

      val manifestJson = new String(readHDFSFile(args(1)), "UTF-8")
      val json = JSON.parseObject(manifestJson)
      println(json.get("lastUpdated"))
      println(json.getJSONArray("parcels"))
    } finally {
      sc.stop()
    }
  }

  /**
   * Reads a whole HDFS file into memory.
   *
   * @param hdfsFile a full path name, may like '/tmp/test.txt'
   * @return the file content; an empty array if hdfsFile is blank or the file does not exist
   * @throws java.io.IOException if the file is too large to fit into a single byte array
   */
  def readHDFSFile(hdfsFile: String): Array[Byte] = {
    if (StringUtils.isBlank(hdfsFile)) new Array[Byte](0)
    else {
      val url = hdfsUrl + hdfsFile
      realUrl = url
      // newInstance gives a private handle: closing the cached instance returned by
      // FileSystem.get would also close it for the SparkContext that shares it.
      val fs = FileSystem.newInstance(URI.create(url), new Configuration())
      try {
        val path = new Path(url)
        if (!fs.exists(path)) new Array[Byte](0)
        else {
          val length = fs.getFileStatus(path).getLen
          // A JVM array is indexed by Int; refuse instead of silently truncating the length.
          if (length > Int.MaxValue) {
            throw new java.io.IOException(
              s"File too large to read into memory: $url ($length bytes)")
          }
          val buffer = new Array[Byte](length.toInt)
          val in = fs.open(path)
          try in.readFully(buffer) finally in.close()
          buffer
        }
      } finally {
        fs.close()
      }
    }
  }
}
把上述spark测试代码打成jar包,并与其依赖的第三方jar包fastjson-1.2.47.jar一起上传到集群环境。
运行时需要2个参数:(1)用于计算wordCount的文件在hdfs上的地址;(2)待解析的json文件在hdfs上的地址。
本地模式运行命令:
spark-submit --jars fastjson-1.2.47.jar --class scala_test_lyh.wordCount test.jar /input/wordCount.txt /input/manifest.json
集群spark on yarn模式:
spark-submit --master yarn --jars fastjson-1.2.47.jar --class scala_test_lyh.wordCount test.jar /input/wordCount.txt /input/manifest.json