Merging small Spark files across multi-level partitions

package spark10.access

import java.text.SimpleDateFormat
import java.util.Date

import org.apache.hadoop.fs.{FileSystem, Path}
import org.apache.log4j.Logger
import org.apache.spark.sql.hive.HiveContext
import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.sql.{DataFrame, SQLContext, SaveMode}

/**
 * Created by hadoop at 10:01 PM.
 */

object AccesslogETL {

  def main(args: Array[String]): Unit = {
    val sc = new SparkContext(new SparkConf().setAppName("access").setMaster("local[3]"))
    val sqlContext = new HiveContext(sc)
    // doJob(sqlContext, fileSystem, batchID, inputPath, outputPath,
    //   dataTime, accessTable, ifRefreshPartition, tryTime)
    // (see the wiring sketch after the listing for how these arguments could be built)
  }

  /**
   * Cleans one batch of access logs, merging small files as it goes.
   *
   * @param sqlContext         SQL context used to build and union the DataFrames
   * @param fileSystem         HDFS file system handle
   * @param batchID            batch ID, format: 201803221505-g1
   * @param inputPath          root directory of the raw input
   * @param outputPath         root directory of the cleaned output
   * @param dataTime           end of the processing window, format: yyyy-MM-dd HH:mm:ss
   * @param accessTable        external table for the cleaned data
   * @param ifRefreshPartition whether to refresh partitions: 0 - no, 1 - yes
   * @param tryTime            number of retries
   * @return a status message for this batch
   */

  @throws(classOf[Exception])
  def doJob(sqlContext: SQLContext, fileSystem: FileSystem, batchID: String,
            inputPath: String, outputPath: String, dataTime: String,
            accessTable: String, ifRefreshPartition: String, tryTime: Int): String = {
    try {
      var result = "batchID:" + batchID
      val logger = Logger.getLogger("org")
      val begin = new Date().getTime // start timestamp (elapsed-time logging elided)
      val inputLocation = inputPath + "/" + batchID
      val inputDoingLocation = inputPath + "/" + batchID + "_doing"
      val dirExists = fileSystem.exists(new Path(inputLocation))
      if (!dirExists && tryTime == 0) {
        logger.info(s"$inputLocation does not exist")
        result = result + s" $inputLocation does not exist"
        return result
      } else if (!dirExists && tryTime > 0) {
        // on a retry the directory may already have been renamed to *_doing
        if (!fileSystem.exists(new Path(inputDoingLocation))) {
          logger.info(s"$inputDoingLocation does not exist")
          result = result + s" $inputDoingLocation does not exist"
          return result
        }
      } else {
        val isDoingRenamed = renameHDFSDir(fileSystem, inputLocation, inputDoingLocation)
        if (!isDoingRenamed) {
          logger.info(s"$inputLocation move to $inputDoingLocation failed")
          result = result + s" $inputLocation move to $inputDoingLocation failed"
          return result
        }
        logger.info(s"$inputLocation move to $inputDoingLocation succeeded")
      }

      val sdf = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss")
      val endTime = sdf.parse(dataTime)
      // window boundaries: logs in [endTime - 2h, endTime - 1h) are the "previous
      // hour", logs from endTime - 1h onward are the most recent hour
      val beginTime = sdf.format(new Date(endTime.getTime - 2 * 60 * 60 * 1000L))
      val curHourTime = sdf.format(new Date(endTime.getTime - 1 * 60 * 60 * 1000L))
      var resultDF: DataFrame = null

      // iterate over all house subdirectories under the batch directory
      fileSystem.globStatus(new Path(inputDoingLocation + "/*")).foreach(p => {
        val hLoc = p.getPath.toString
        // compute the partition count for this house directory
        val getPartitionNum = (houseLoc: String) => {
          // sizing logic elided in the original post
          1
        }

        val partitionNum = getPartitionNum(hLoc)
        logger.info("hLoc:" + hLoc + ", partitionNum:" + partitionNum)
        // build a DataFrame from this house directory (process elided in the original)
        val hDF: DataFrame = null // placeholder for the real DataFrame-building logic
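        // A hedged sketch of what the elided step might look like (assumptions:
        // Spark 1.6+, logs are tab-separated text with acctime as the first field;
        // the column names here are illustrative only):
        // val hDF: DataFrame = sqlContext.read.text(hLoc)
        //   .selectExpr("split(value, '\t')[0] as acctime", "value as line")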

        // logs from the most recent hour
        val curHourDF = hDF.filter(s"acctime>='$curHourTime'")
        // logs from the hour before that
        val preHourDF = hDF.filter(s"acctime>'$beginTime' and acctime<'$curHourTime'")
        // the older hour is small, so give it roughly a third of the partitions
        val preHourPartNum = if (partitionNum / 3 == 0) 1 else partitionNum / 3
        val newDF = curHourDF.coalesce(partitionNum).unionAll(preHourDF.coalesce(preHourPartNum))
        if (resultDF != null) {
          resultDF = resultDF.unionAll(newDF)
        } else {
          resultDF = newDF
        }
      })

      // save the DataFrame to HDFS (code elided in the original post)
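      // A minimal sketch of the elided save (assumptions: the cleaned table is
      // partitioned by dayid/hourid and stored as Parquet; the partition column
      // names and the format are illustrative, not from the original post).
      if (resultDF != null) {
        resultDF.write.mode(SaveMode.Overwrite)
          .partitionBy("dayid", "hourid")
          .parquet(outputPath + "/" + batchID)
      }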

""

    } catch {
      case e: Exception =>
        e.printStackTrace()
        e.getMessage
    }
  }

  /** Renames (moves) an HDFS directory; returns true on success. */
  def renameHDFSDir(fileSystem: FileSystem, srcLocation: String, destLocation: String): Boolean = {
    val srcPath = new Path(srcLocation)
    val destPath = new Path(destLocation)
    fileSystem.rename(srcPath, destPath)
  }
}
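The main method above never actually builds the FileSystem handle that the commented-out doJob call needs. Below is a minimal sketch of how the job could be wired up, reusing sc.hadoopConfiguration for the HDFS handle; the paths, batch ID, and table name are illustrative assumptions, not values from the original post.

// inside main, after sc and sqlContext are created:
val fileSystem = FileSystem.get(sc.hadoopConfiguration)
val status = doJob(sqlContext, fileSystem,
  batchID = "201803221505-g1",        // format taken from the scaladoc above
  inputPath = "/data/access/input",   // illustrative
  outputPath = "/data/access/output", // illustrative
  dataTime = "2018-03-22 15:00:00",
  accessTable = "access_log_cleaned", // illustrative
  ifRefreshPartition = "1",
  tryTime = 0)
println(status)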
