Task: delete databases and tables on HDFS that have lost their mapping to Hive.
Notes: ① Some databases and tables under /user/hive/warehouse no longer correspond to anything in Hive; the data is unimportant, so deleting it is safe and frees HDFS space.
② There are far too many orphaned databases and tables to remove by hand.
③ Under a given database, external-table directories can remain on HDFS even though the tables are no longer visible in Hive; that data is also useless.
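The core of both jobs below is a set difference: list the directory names actually present on HDFS, list the names Hive knows about, and treat everything HDFS has that Hive lacks as a deletion candidate. A minimal sketch of that idea (the names here are made up for illustration; the real jobs fill the two buffers from `show databases` and from an HDFS directory listing):

import scala.collection.mutable.ListBuffer

object DiffSketch {
  def main(args: Array[String]): Unit = {
    // Made-up inputs: directory names found on HDFS vs. databases Hive reports.
    val hdfsBuffer = ListBuffer("sales.db", "logs.db", "orphan.db")
    val hiveBuffer = ListBuffer("sales.db", "logs.db")
    // Call diff on the HDFS side: everything HDFS has that Hive lacks is a candidate.
    val diff = hdfsBuffer.diff(hiveBuffer)
    println(diff) // prints ListBuffer(orphan.db)
  }
}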
pom.xml:
<?xml version="1.0" encoding="UTF-8"?>
<project xmlns="http://maven.apache.org/POM/4.0.0"
         xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
    <modelVersion>4.0.0</modelVersion>

    <groupId>cn.spark.wgy</groupId>
    <artifactId>SparkDome</artifactId>
    <version>1.0-SNAPSHOT</version>

    <dependencies>
        <dependency>
            <groupId>org.apache.spark</groupId>
            <artifactId>spark-core_2.11</artifactId>
            <version>2.4.0</version>
        </dependency>
        <dependency>
            <groupId>org.apache.spark</groupId>
            <artifactId>spark-sql_2.11</artifactId>
            <version>2.4.0</version>
        </dependency>
        <!-- All Spark artifacts share the same 2.4.0 version; mixing Spark versions in one build can fail at runtime. -->
        <dependency>
            <groupId>org.apache.spark</groupId>
            <artifactId>spark-streaming_2.11</artifactId>
            <version>2.4.0</version>
        </dependency>
        <dependency>
            <groupId>mysql</groupId>
            <artifactId>mysql-connector-java</artifactId>
            <version>5.1.27</version>
        </dependency>
        <dependency>
            <groupId>org.apache.spark</groupId>
            <artifactId>spark-hive_2.11</artifactId>
            <version>2.4.0</version>
        </dependency>
    </dependencies>
</project>
Code:
Databases:
package hdfs
import java.net.URI
import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.{FileSystem, FileUtil, Path}
import org.apache.spark.sql.{DataFrame, Row, SparkSession}
import scala.collection.mutable.ListBuffer
object SparkListHdfsFile {
  def main(args: Array[String]): Unit = {
    // Build the Spark session.
    val spark: SparkSession = getSpark
    // Only print logs at WARN level and above.
    spark.sparkContext.setLogLevel("WARN")
    // Ask Hive for its databases.
    val frame: DataFrame = spark.sql("show databases")
    val buffer: Array[Row] = frame.collect()
    // Holds the database names Hive knows about.
    val hiveBuffer = ListBuffer[String]()
    println("=============hive in database not have default.db=================")
    for (row <- buffer) {
      // Column 0 of `show databases` is the database name; skip Hive's built-in default database.
      val name = row.getString(0)
      if (name != "default")
        // Append ".db" so the names match the directory names on HDFS for the diff below.
        hiveBuffer += name + ".db"
    }
    for (elem <- hiveBuffer) {
      println("hive databasename:" + elem)
    }
    println("First-level files and directories under the HDFS warehouse path:")
    val hdfsBuffer = ListBuffer[String]()
    val paths: Array[Path] = getFilesAndDirs("hdfs://hadoop101:9000", "/user/hive/warehouse")
    for (elem <- paths) {
      // Keep only the last path component, e.g. "mydb.db".
      hdfsBuffer += elem.getName
    }
    for (elem <- hdfsBuffer) {
      println("hdfs databasename:" + elem)
    }
    // Database directories that exist on HDFS but not in Hive. The order matters:
    // call diff on hdfsBuffer, since HDFS is the side that may hold orphans.
    val diff: ListBuffer[String] = hdfsBuffer.diff(hiveBuffer)
    println("===========hdfs and hive different databasename not have default.db")
    for (elem <- diff) {
      println("hdfs databasename:" + elem)
    }
    val system: FileSystem = getHdfs("hdfs://hadoop101:9000")
    if (diff.isEmpty) {
      println("don't have different database")
    } else {
      for (elem <- diff) {
        val path = "/user/hive/warehouse/" + elem
        // Recursively delete the orphaned database directory.
        val bool: Boolean = system.delete(new Path(path), true)
        println("delete different database " + elem + " result is " + bool)
      }
    }
  }

  // Create a FileSystem handle for the given namenode URI.
  def getHdfs(path: String): FileSystem = {
    val conf = new Configuration()
    FileSystem.newInstance(URI.create(path), conf)
  }

  // List the first-level files and directories under path2.
  def getFilesAndDirs(path: String, path2: String): Array[Path] = {
    val fs = getHdfs(path).listStatus(new Path(path2))
    FileUtil.stat2Paths(fs)
  }

  // Build the Spark session with Hive support.
  def getSpark: SparkSession = {
    SparkSession.builder()
      .config("spark.sql.warehouse.dir", "hdfs://hadoop101:9000/user/hive/warehouse")
      .master("local")
      .enableHiveSupport()
      .getOrCreate()
  }
}
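Because system.delete(path, true) removes data irrecoverably, a more cautious variant first moves the candidates into a scratch directory and deletes them later, once the diff has been verified. A self-contained sketch, not part of the original job: the /tmp/warehouse_trash location and the orphans list are made-up placeholders standing in for the diff computed above.

import java.net.URI
import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.{FileSystem, Path}

object MoveOrphansToTrash {
  def main(args: Array[String]): Unit = {
    val fs: FileSystem = FileSystem.newInstance(URI.create("hdfs://hadoop101:9000"), new Configuration())
    // Assumed scratch directory; any HDFS path with enough quota works.
    val trash = new Path("/tmp/warehouse_trash")
    if (!fs.exists(trash)) fs.mkdirs(trash)
    // Placeholder for the `diff` result computed by SparkListHdfsFile.
    val orphans = Seq("orphan.db")
    for (name <- orphans) {
      val src = new Path("/user/hive/warehouse/" + name)
      if (fs.exists(src)) {
        // rename is a metadata-only move within one HDFS cluster, so it is fast and reversible.
        val ok = fs.rename(src, new Path(trash, name))
        println("moved " + src + " to " + trash + "/" + name + " result is " + ok)
      }
    }
    fs.close()
  }
}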
Tables:
package hdfs
import java.net.URI
import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.{FileSystem, FileUtil, Path}
import org.apache.spark.sql.{DataFrame, Row, SparkSession}
import scala.collection.mutable.ListBuffer
object SparkDropDifferentTable {
  def main(args: Array[String]): Unit = {
    // Build the Spark session.
    val spark: SparkSession = getSpark
    // Only print logs at WARN level and above.
    spark.sparkContext.setLogLevel("WARN")
    // Work inside the target database.
    spark.sql("use datasourcedb")
    val frame: DataFrame = spark.sql("show tables")
    val buffer: Array[Row] = frame.collect()
    // Holds the table names Hive knows about in datasourcedb.
    val hiveBuffer = ListBuffer[String]()
    println("=============hive in table =================")
    for (row <- buffer) {
      // `show tables` returns (database, tableName, isTemporary); column 1 is the table name.
      hiveBuffer += row.getString(1)
    }
    for (elem <- hiveBuffer) {
      println("hive tablename:" + elem)
    }
    println("First-level files and directories under the database directory on HDFS:")
    val hdfsBuffer = ListBuffer[String]()
    val paths: Array[Path] = getFilesAndDirs("hdfs://hadoop101:9000", "/user/hive/warehouse/datasourcedb.db")
    for (elem <- paths) {
      // Keep only the last path component, i.e. the table directory name.
      hdfsBuffer += elem.getName
    }
    for (elem <- hdfsBuffer) {
      println("hdfs tablename:" + elem)
    }
    // Table directories that exist on HDFS but not in Hive. The order matters:
    // call diff on hdfsBuffer, since HDFS is the side that may hold orphans.
    val diff: ListBuffer[String] = hdfsBuffer.diff(hiveBuffer)
    println("===========hdfs and hive different tablename")
    for (elem <- diff) {
      println("hdfs tablename:" + elem)
    }
    val system: FileSystem = getHdfs("hdfs://hadoop101:9000")
    if (diff.isEmpty) {
      println("don't have different table")
    } else {
      for (elem <- diff) {
        val path = "/user/hive/warehouse/datasourcedb.db/" + elem
        // Recursively delete the orphaned table directory.
        val bool: Boolean = system.delete(new Path(path), true)
        println("delete different table " + elem + " result is " + bool)
      }
    }
  }

  // Create a FileSystem handle for the given namenode URI.
  def getHdfs(path: String): FileSystem = {
    val conf = new Configuration()
    FileSystem.newInstance(URI.create(path), conf)
  }

  // List the first-level files and directories under path2.
  def getFilesAndDirs(path: String, path2: String): Array[Path] = {
    val fs = getHdfs(path).listStatus(new Path(path2))
    FileUtil.stat2Paths(fs)
  }

  // Build the Spark session with Hive support.
  def getSpark: SparkSession = {
    SparkSession.builder()
      .config("spark.sql.warehouse.dir", "hdfs://hadoop101:9000/user/hive/warehouse")
      .master("local")
      .enableHiveSupport()
      .getOrCreate()
  }
}
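Note ② above says there are too many orphans to handle manually, yet SparkDropDifferentTable still hard-codes a single database (datasourcedb). A sketch of how the same table-level diff could run over every database in one pass; SparkDropOrphanTablesAllDbs is a made-up name and this is not part of the original jobs:

import java.net.URI
import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.{FileSystem, Path}
import org.apache.spark.sql.SparkSession

object SparkDropOrphanTablesAllDbs {
  def main(args: Array[String]): Unit = {
    val spark: SparkSession = SparkSession.builder()
      .config("spark.sql.warehouse.dir", "hdfs://hadoop101:9000/user/hive/warehouse")
      .master("local")
      .enableHiveSupport()
      .getOrCreate()
    spark.sparkContext.setLogLevel("WARN")
    val fs: FileSystem = FileSystem.newInstance(URI.create("hdfs://hadoop101:9000"), new Configuration())
    // Every database Hive knows about, except the built-in default.
    val dbs = spark.sql("show databases").collect().map(_.getString(0)).filter(_ != "default")
    for (db <- dbs) {
      // Column 1 of `show tables` is the table name.
      val hiveTables = spark.sql("show tables in " + db).collect().map(_.getString(1)).toSet
      val dbDir = new Path("/user/hive/warehouse/" + db + ".db")
      if (fs.exists(dbDir)) {
        for (status <- fs.listStatus(dbDir)) {
          val name = status.getPath.getName
          // Anything on HDFS that Hive does not list is treated as an orphan.
          if (!hiveTables.contains(name)) {
            val ok = fs.delete(status.getPath, true)
            println("delete orphan table dir " + db + "/" + name + " result is " + ok)
          }
        }
      }
    }
    fs.close()
    spark.stop()
  }
}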