Task: delete databases and tables on HDFS that have lost their mapping to Hive.
Notes: ① Some databases and tables under /user/hive/warehouse no longer correspond to anything in Hive; the data is unimportant, so deleting it is safe and frees HDFS space.
② There are far too many orphaned databases and tables to remove by hand.
③ Under a given database, external-table directories can remain on HDFS even though the tables are no longer visible in Hive; that data is also useless.
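The core of both jobs below is a set difference: list the directory names actually present on HDFS, list the names Hive knows about, and treat everything HDFS has that Hive lacks as a deletion candidate. A minimal sketch of that idea (the names here are made up for illustration; the real jobs fill the two buffers from `show databases` and from an HDFS directory listing):

import scala.collection.mutable.ListBuffer

object DiffSketch {
  def main(args: Array[String]): Unit = {
    // Made-up inputs: directory names found on HDFS vs. databases Hive reports.
    val hdfsBuffer = ListBuffer("sales.db", "logs.db", "orphan.db")
    val hiveBuffer = ListBuffer("sales.db", "logs.db")
    // Call diff on the HDFS side: everything HDFS has that Hive lacks is a candidate.
    val diff = hdfsBuffer.diff(hiveBuffer)
    println(diff) // prints ListBuffer(orphan.db)
  }
}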
pom.xml:
<?xml version="1.0" encoding="UTF-8"?>
<project xmlns="http://maven.apache.org/POM/4.0.0"
         xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
    <modelVersion>4.0.0</modelVersion>

    <groupId>cn.spark.wgy</groupId>
    <artifactId>SparkDome</artifactId>
    <version>1.0-SNAPSHOT</version>

    <dependencies>
        <dependency>
            <groupId>org.apache.spark</groupId>
            <artifactId>spark-core_2.11</artifactId>
            <version>2.4.0</version>
        </dependency>
        <dependency>
            <groupId>org.apache.spark</groupId>
            <artifactId>spark-sql_2.11</artifactId>
            <version>2.4.0</version>
        </dependency>
        <!-- All Spark artifacts share the same 2.4.0 version; mixing Spark versions in one build can fail at runtime. -->
        <dependency>
            <groupId>org.apache.spark</groupId>
            <artifactId>spark-streaming_2.11</artifactId>
            <version>2.4.0</version>
        </dependency>
        <dependency>
            <groupId>mysql</groupId>
            <artifactId>mysql-connector-java</artifactId>
            <version>5.1.27</version>
        </dependency>
        <dependency>
            <groupId>org.apache.spark</groupId>
            <artifactId>spark-hive_2.11</artifactId>
            <version>2.4.0</version>
        </dependency>
    </dependencies>
</project>
Code:
Databases:
package hdfs
import java.net.URI
import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.{FileSystem, FileUtil, Path}
import org.apache.spark.sql.{DataFrame, Row, SparkSession}
import scala.collection.mutable.ListBuffer
object SparkListHdfsFile {
  def main(args: Array[String]): Unit = {
    // Build the Spark session.
    val spark: SparkSession = getSpark
    // Only print logs at WARN level and above.
    spark.sparkContext.setLogLevel("WARN")
    // Ask Hive for its databases.
    val frame: DataFrame = spark.sql("show databases")
    val buffer: Array[Row] = frame.collect()
    // Holds the database names Hive knows about.
    val hiveBuffer = ListBuffer[String]()
    println("=============hive in database not have default.db=================")
    for (row <- buffer) {
      // Column 0 of `show databases` is the database name; skip Hive's built-in default database.
      val name = row.getString(0)
      if (name != "default")
        // Append ".db" so the names match the directory names on HDFS for the diff below.
        hiveBuffer += name + ".db"
    }
    for (elem <- hiveBuffer) {
      println("hive databasename:" + elem)
    }
    println("First-level files and directories under the HDFS warehouse path:")
    val hdfsBuffer = ListBuffer[String]()
    val paths: Array[Path] = getFilesAndDirs("hdfs://hadoop101:9000", "/user/hive/warehouse")
    for (elem <- paths) {
      // Keep only the last path component, e.g. "mydb.db".
      hdfsBuffer += elem.getName
    }
    for (elem <- hdfsBuffer) {
      println("hdfs databasename:" + elem)
    }
    // Database directories that exist on HDFS but not in Hive. The order matters:
    // call diff on hdfsBuffer, since HDFS is the side that may hold orphans.
    val diff: ListBuffer[String] = hdfsBuffer.diff(hiveBuffer)
    println("===========hdfs and hive different databasename not have default.db")
    for (elem <- diff) {
      println("hdfs databasename:" + elem)
    }
    val system: FileSystem = getHdfs("hdfs://hadoop101:9000")
    if (diff.isEmpty) {
      println("don't have different database")
    } else {
      for (elem <- diff) {
        val path = "/user/hive/warehouse/" + elem
        // Recursively delete the orphaned database directory.
        val bool: Boolean = system.delete(new Path(path), true)
        println("delete different database " + elem + " result is " + bool)
      }
    }
  }

  // Create a FileSystem handle for the given namenode URI.
  def getHdfs(path: String): FileSystem = {
    val conf = new Configuration()
    FileSystem.newInstance(URI.create(path), conf)
  }

  // List the first-level files and directories under path2.
  def getFilesAndDirs(path: String, path2: String): Array[Path] = {
    val fs = getHdfs(path).listStatus(new Path(path2))
    FileUtil.stat2Paths(fs)
  }

  // Build the Spark session with Hive support.
  def getSpark: SparkSession = {
    SparkSession.builder()
      .config("spark.sql.warehouse.dir", "hdfs://hadoop101:9000/user/hive/warehouse")
      .master("local")
      .enableHiveSupport()
      .getOrCreate()
  }
}
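Because system.delete(path, true) removes data irrecoverably, a more cautious variant first moves the candidates into a scratch directory and deletes them later, once the diff has been verified. A self-contained sketch, not part of the original job: the /tmp/warehouse_trash location and the orphans list are made-up placeholders standing in for the diff computed above.

import java.net.URI
import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.{FileSystem, Path}

object MoveOrphansToTrash {
  def main(args: Array[String]): Unit = {
    val fs: FileSystem = FileSystem.newInstance(URI.create("hdfs://hadoop101:9000"), new Configuration())
    // Assumed scratch directory; any HDFS path with enough quota works.
    val trash = new Path("/tmp/warehouse_trash")
    if (!fs.exists(trash)) fs.mkdirs(trash)
    // Placeholder for the `diff` result computed by SparkListHdfsFile.
    val orphans = Seq("orphan.db")
    for (name <- orphans) {
      val src = new Path("/user/hive/warehouse/" + name)
      if (fs.exists(src)) {
        // rename is a metadata-only move within one HDFS cluster, so it is fast and reversible.
        val ok = fs.rename(src, new Path(trash, name))
        println("moved " + src + " to " + trash + "/" + name + " result is " + ok)
      }
    }
    fs.close()
  }
}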
Tables:
package hdfs
import java.net.URI
import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.{FileSystem, FileUtil, Path}
import org.apache.spark.sql.{DataFrame, Row, SparkSession}
import scala.collection.mutable.ListBuffer
object SparkDropDifferentTable {
  def main(args: Array[String]): Unit = {
    // Build the Spark session.
    val spark: SparkSession = getSpark
    // Only print logs at WARN level and above.
    spark.sparkContext.setLogLevel("WARN")
    // Work inside the target database.
    spark.sql("use datasourcedb")
    val frame: DataFrame = spark.sql("show tables")
    val buffer: Array[Row] = frame.collect()
    // Holds the table names Hive knows about in datasourcedb.
    val hiveBuffer = ListBuffer[String]()
    println("=============hive in table =================")
    for (row <- buffer) {
      // `show tables` returns (database, tableName, isTemporary); column 1 is the table name.
      hiveBuffer += row.getString(1)
    }
    for (elem <- hiveBuffer) {
      println("hive tablename:" + elem)
    }
    println("First-level files and directories under the database directory on HDFS:")
    val hdfsBuffer = ListBuffer[String]()
    val paths: Array[Path] = getFilesAndDirs("hdfs://hadoop101:9000", "/user/hive/warehouse/datasourcedb.db")
    for (elem <- paths) {
      // Keep only the last path component, i.e. the table directory name.
      hdfsBuffer += elem.getName
    }
    for (elem <- hdfsBuffer) {
      println("hdfs tablename:" + elem)
    }
    // Table directories that exist on HDFS but not in Hive. The order matters:
    // call diff on hdfsBuffer, since HDFS is the side that may hold orphans.
    val diff: ListBuffer[String] = hdfsBuffer.diff(hiveBuffer)
    println("===========hdfs and hive different tablename")
    for (elem <- diff) {
      println("hdfs tablename:" + elem)
    }
    val system: FileSystem = getHdfs("hdfs://hadoop101:9000")
    if (diff.isEmpty) {
      println("don't have different table")
    } else {
      for (elem <- diff) {
        val path = "/user/hive/warehouse/datasourcedb.db/" + elem
        // Recursively delete the orphaned table directory.
        val bool: Boolean = system.delete(new Path(path), true)
        println("delete different table " + elem + " result is " + bool)
      }
    }
  }

  // Create a FileSystem handle for the given namenode URI.
  def getHdfs(path: String): FileSystem = {
    val conf = new Configuration()
    FileSystem.newInstance(URI.create(path), conf)
  }

  // List the first-level files and directories under path2.
  def getFilesAndDirs(path: String, path2: String): Array[Path] = {
    val fs = getHdfs(path).listStatus(new Path(path2))
    FileUtil.stat2Paths(fs)
  }

  // Build the Spark session with Hive support.
  def getSpark: SparkSession = {
    SparkSession.builder()
      .config("spark.sql.warehouse.dir", "hdfs://hadoop101:9000/user/hive/warehouse")
      .master("local")
      .enableHiveSupport()
      .getOrCreate()
  }
}
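Note ② above says there are too many orphans to handle manually, yet SparkDropDifferentTable still hard-codes a single database (datasourcedb). A sketch of how the same table-level diff could run over every database in one pass; SparkDropOrphanTablesAllDbs is a made-up name and this is not part of the original jobs:

import java.net.URI
import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.{FileSystem, Path}
import org.apache.spark.sql.SparkSession

object SparkDropOrphanTablesAllDbs {
  def main(args: Array[String]): Unit = {
    val spark: SparkSession = SparkSession.builder()
      .config("spark.sql.warehouse.dir", "hdfs://hadoop101:9000/user/hive/warehouse")
      .master("local")
      .enableHiveSupport()
      .getOrCreate()
    spark.sparkContext.setLogLevel("WARN")
    val fs: FileSystem = FileSystem.newInstance(URI.create("hdfs://hadoop101:9000"), new Configuration())
    // Every database Hive knows about, except the built-in default.
    val dbs = spark.sql("show databases").collect().map(_.getString(0)).filter(_ != "default")
    for (db <- dbs) {
      // Column 1 of `show tables` is the table name.
      val hiveTables = spark.sql("show tables in " + db).collect().map(_.getString(1)).toSet
      val dbDir = new Path("/user/hive/warehouse/" + db + ".db")
      if (fs.exists(dbDir)) {
        for (status <- fs.listStatus(dbDir)) {
          val name = status.getPath.getName
          // Anything on HDFS that Hive does not list is treated as an orphan.
          if (!hiveTables.contains(name)) {
            val ok = fs.delete(status.getPath, true)
            println("delete orphan table dir " + db + "/" + name + " result is " + ok)
          }
        }
      }
    }
    fs.close()
    spark.stop()
  }
}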