Merging small Spark files across multi-level partitions

package spark10.access

import java.text.SimpleDateFormat
import java.util.Date

import org.apache.hadoop.fs.{FileSystem, Path}
import org.apache.log4j.Logger
import org.apache.spark.sql.hive.HiveContext
import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.sql.{DataFrame, SQLContext, SaveMode}

/**
 * Created by hadoop at 10:01 PM.
 */

object AccesslogETL {

  def main(args: Array[String]): Unit = {
    val sc = new SparkContext(new SparkConf().setAppName("access").setMaster("local[3]"))
    val sqlContext = new HiveContext(sc)
    // doJob(sqlContext, fileSystem, batchID, inputPath, outputPath,
    //   dataTime, accessTable, ifRefreshPartition, tryTime)
    // (see the wiring sketch after the listing for how these arguments could be built)
  }

  /**
   * Cleans one batch of access logs, merging small files as it goes.
   *
   * @param sqlContext         SQL context used to build and union the DataFrames
   * @param fileSystem         HDFS file system handle
   * @param batchID            batch ID, format: 201803221505-g1
   * @param inputPath          root directory of the raw input
   * @param outputPath         root directory of the cleaned output
   * @param dataTime           end of the processing window, format: yyyy-MM-dd HH:mm:ss
   * @param accessTable        external table for the cleaned data
   * @param ifRefreshPartition whether to refresh partitions: 0 - no, 1 - yes
   * @param tryTime            number of retries
   * @return a status message for this batch
   */

  @throws(classOf[Exception])
  def doJob(sqlContext: SQLContext, fileSystem: FileSystem, batchID: String,
            inputPath: String, outputPath: String, dataTime: String,
            accessTable: String, ifRefreshPartition: String, tryTime: Int): String = {
    try {
      var result = "batchID:" + batchID
      val logger = Logger.getLogger("org")
      val begin = new Date().getTime // start timestamp (elapsed-time logging elided)
      val inputLocation = inputPath + "/" + batchID
      val inputDoingLocation = inputPath + "/" + batchID + "_doing"
      val dirExists = fileSystem.exists(new Path(inputLocation))
      if (!dirExists && tryTime == 0) {
        logger.info(s"$inputLocation does not exist")
        result = result + s" $inputLocation does not exist"
        return result
      } else if (!dirExists && tryTime > 0) {
        // on a retry the directory may already have been renamed to *_doing
        if (!fileSystem.exists(new Path(inputDoingLocation))) {
          logger.info(s"$inputDoingLocation does not exist")
          result = result + s" $inputDoingLocation does not exist"
          return result
        }
      } else {
        val isDoingRenamed = renameHDFSDir(fileSystem, inputLocation, inputDoingLocation)
        if (!isDoingRenamed) {
          logger.info(s"$inputLocation move to $inputDoingLocation failed")
          result = result + s" $inputLocation move to $inputDoingLocation failed"
          return result
        }
        logger.info(s"$inputLocation move to $inputDoingLocation succeeded")
      }

      val sdf = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss")
      val endTime = sdf.parse(dataTime)
      // window boundaries: logs in [endTime - 2h, endTime - 1h) are the "previous
      // hour", logs from endTime - 1h onward are the most recent hour
      val beginTime = sdf.format(new Date(endTime.getTime - 2 * 60 * 60 * 1000L))
      val curHourTime = sdf.format(new Date(endTime.getTime - 1 * 60 * 60 * 1000L))
      var resultDF: DataFrame = null

      // iterate over all house subdirectories under the batch directory
      fileSystem.globStatus(new Path(inputDoingLocation + "/*")).foreach(p => {
        val hLoc = p.getPath.toString
        // compute the partition count for this house directory
        val getPartitionNum = (houseLoc: String) => {
          // sizing logic elided in the original post
          1
        }

        val partitionNum = getPartitionNum(hLoc)
        logger.info("hLoc:" + hLoc + ", partitionNum:" + partitionNum)
        // build a DataFrame from this house directory (process elided in the original)
        val hDF: DataFrame = null // placeholder for the real DataFrame-building logic
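        // A hedged sketch of what the elided step might look like (assumptions:
        // Spark 1.6+, logs are tab-separated text with acctime as the first field;
        // the column names here are illustrative only):
        // val hDF: DataFrame = sqlContext.read.text(hLoc)
        //   .selectExpr("split(value, '\t')[0] as acctime", "value as line")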

        // logs from the most recent hour
        val curHourDF = hDF.filter(s"acctime>='$curHourTime'")
        // logs from the hour before that
        val preHourDF = hDF.filter(s"acctime>'$beginTime' and acctime<'$curHourTime'")
        // the older hour is small, so give it roughly a third of the partitions
        val preHourPartNum = if (partitionNum / 3 == 0) 1 else partitionNum / 3
        val newDF = curHourDF.coalesce(partitionNum).unionAll(preHourDF.coalesce(preHourPartNum))
        if (resultDF != null) {
          resultDF = resultDF.unionAll(newDF)
        } else {
          resultDF = newDF
        }
      })

      // save the DataFrame to HDFS (code elided in the original post)
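      // A minimal sketch of the elided save (assumptions: the cleaned table is
      // partitioned by dayid/hourid and stored as Parquet; the partition column
      // names and the format are illustrative, not from the original post).
      if (resultDF != null) {
        resultDF.write.mode(SaveMode.Overwrite)
          .partitionBy("dayid", "hourid")
          .parquet(outputPath + "/" + batchID)
      }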

""

    } catch {
      case e: Exception =>
        e.printStackTrace()
        e.getMessage
    }
  }

  /** Renames (moves) an HDFS directory; returns true on success. */
  def renameHDFSDir(fileSystem: FileSystem, srcLocation: String, destLocation: String): Boolean = {
    val srcPath = new Path(srcLocation)
    val destPath = new Path(destLocation)
    fileSystem.rename(srcPath, destPath)
  }
}
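The main method above never actually builds the FileSystem handle that the commented-out doJob call needs. Below is a minimal sketch of how the job could be wired up, reusing sc.hadoopConfiguration for the HDFS handle; the paths, batch ID, and table name are illustrative assumptions, not values from the original post.

// inside main, after sc and sqlContext are created:
val fileSystem = FileSystem.get(sc.hadoopConfiguration)
val status = doJob(sqlContext, fileSystem,
  batchID = "201803221505-g1",        // format taken from the scaladoc above
  inputPath = "/data/access/input",   // illustrative
  outputPath = "/data/access/output", // illustrative
  dataTime = "2018-03-22 15:00:00",
  accessTable = "access_log_cleaned", // illustrative
  ifRefreshPartition = "1",
  tryTime = 0)
println(status)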
