Deleting HDFS databases and tables that have lost their mapping to Hive

Task: delete databases or tables on HDFS that have lost their mapping to Hive.

Notes: ① Some database directories under /user/hive/warehouse on HDFS no longer correspond to any database or table in Hive; the data is unimportant, so deleting it is safe and frees HDFS space.

② There are far too many of these unrelated database/table directories to delete by hand.

③ External tables under a given database may still have data on HDFS even though they are no longer visible in Hive, and that data is useless.

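For context on point ③: dropping an external table only removes its entry from the Hive metastore; the files under its HDFS location are left behind, which is exactly how these orphan directories accumulate. A minimal sketch of that behavior (the table name demo_ext and its LOCATION are made up for illustration, and `spark` is a Hive-enabled SparkSession like the one built by getSpark in the code below):

//Sketch only: demo_ext and its LOCATION are hypothetical names.
spark.sql(
  """CREATE EXTERNAL TABLE IF NOT EXISTS demo_ext (id INT)
    |LOCATION '/user/hive/warehouse/datasourcedb.db/demo_ext'""".stripMargin)
spark.sql("DROP TABLE demo_ext")
//The metastore entry is gone, but the demo_ext directory still exists on HDFS
//and can only be removed through the HDFS API, which is what the programs below do.
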
pom.xml:

<?xml version="1.0" encoding="UTF-8"?>
<project xmlns="http://maven.apache.org/POM/4.0.0"
         xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
    <modelVersion>4.0.0</modelVersion>

    <groupId>cn.spark.wgy</groupId>
    <artifactId>SparkDome</artifactId>
    <version>1.0-SNAPSHOT</version>

    <dependencies>
        <dependency>
            <groupId>org.apache.spark</groupId>
            <artifactId>spark-core_2.11</artifactId>
            <version>2.4.0</version>
        </dependency>

        <dependency>
            <groupId>org.apache.spark</groupId>
            <artifactId>spark-sql_2.11</artifactId>
            <version>2.4.0</version>
        </dependency>

        <dependency>
            <groupId>org.apache.spark</groupId>
            <artifactId>spark-streaming_2.11</artifactId>
            <version>2.4.0</version>
        </dependency>

        <dependency>
            <groupId>mysql</groupId>
            <artifactId>mysql-connector-java</artifactId>
            <version>5.1.27</version>
        </dependency>

        <dependency>
            <groupId>org.apache.spark</groupId>
            <artifactId>spark-hive_2.11</artifactId>
            <version>2.4.0</version>
        </dependency>
    </dependencies>
</project>
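
The spark-hive module is what makes enableHiveSupport() work in the programs below; it normally picks up the cluster's hive-site.xml from the classpath. If that file is not available, the metastore address can also be passed directly on the builder. A minimal sketch, assuming a metastore thrift service at hadoop101:9083 (this URI is an assumption, adjust it to your cluster):

import org.apache.spark.sql.SparkSession

//Sketch only: hive.metastore.uris below is an assumed value, not taken from the original setup.
val spark = SparkSession.builder()
  .master("local")
  .config("hive.metastore.uris", "thrift://hadoop101:9083")
  .config("spark.sql.warehouse.dir", "hdfs://hadoop101:9000/user/hive/warehouse")
  .enableHiveSupport()
  .getOrCreate()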

The code:

Databases:

package hdfs

import java.net.URI

import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.{FileSystem, FileUtil, Path}
import org.apache.spark.sql.{DataFrame, Row, SparkSession}


import scala.collection.mutable.ListBuffer

object SparkListHdfsFile {
  def main(args: Array[String]): Unit = {
    //build the Spark session
    val spark: SparkSession = getSpark
    //set the log level
    spark.sparkContext.setLogLevel("WARN")
    //list the Hive databases
    val frame: DataFrame = spark.sql("show databases")
    //collect the DataFrame into an Array[Row]
    val buffer: Array[Row] = frame.collect()
    //holds the Hive database names
    val hiveBuffer = ListBuffer[String]()
    println("=============databases in hive (default excluded)=================")
    for (row <- buffer) {
      //read the database name straight from the Row instead of parsing its toString output
      val dbName = row.getString(0)
      //skip Hive's built-in default database
      if (dbName != "default")
        //append ".db" so the name matches the HDFS directory name when the diff is taken later
        hiveBuffer += dbName + ".db"
    }
    for (elem <- hiveBuffer) {
      println("hive databasename:"+elem)
    }

    println("Listing the first-level files and directories under the HDFS warehouse path")
    val hdfsBuffer = ListBuffer[String]()
    val paths: Array[Path] = getFilesAndDirs("hdfs://hadoop101:9000","/user/hive/warehouse")
    for (elem <- paths) {
      //keep only the final path component (e.g. mydb.db) instead of slicing the full URI by index
      hdfsBuffer += elem.getName
    }
    for (elem <- hdfsBuffer) {
      println("hdfs databasename:"+elem)
    }
    //directories on HDFS with no matching Hive database: take the difference, and note the order matters, hdfsBuffer must come first
    val diff: ListBuffer[String] = hdfsBuffer.diff(hiveBuffer)

    println("===========databases on hdfs but not in hive (default.db excluded)===========")
    for (elem <- diff) {
      println("hdfs databasename:"+elem)
    }
    val system: FileSystem = getHdfs("hdfs://hadoop101:9000")
    if (diff.isEmpty) {
      println("no unmatched databases found")
    } else {
      for (elem <- diff) {
        val path = "/user/hive/warehouse/" + elem
        val bool: Boolean = system.delete(new Path(path), true)
        println("deleted unmatched database " + elem + ", result: " + bool)
      }
    }
  }
  //build a FileSystem handle
  def getHdfs(path: String): FileSystem = {
    val conf = new Configuration()
    FileSystem.newInstance(URI.create(path), conf)
  }
  //list the first-level files and directories under a path
  def getFilesAndDirs(path: String,path2:String): Array[Path] = {
    val fs = getHdfs(path).listStatus(new Path(path2))
    FileUtil.stat2Paths(fs)
  }
  //build the SparkSession with Hive support
  def getSpark: SparkSession = {
    val sparkSession: SparkSession = SparkSession.builder()
      .config("spark.sql.warehouse.dir","hdfs://hadoop101:9000/user/hive/warehouse")
      .master("local")
      .enableHiveSupport()
      .getOrCreate()
    sparkSession
  }
}
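
FileSystem.delete(path, true) above removes a directory permanently. If the cluster has the HDFS trash enabled (fs.trash.interval > 0), a safer variant is to move the directory into the trash so it can still be recovered for a while. A small sketch of that alternative, which could stand in for the system.delete call in the loop above:

import java.net.URI

import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.{FileSystem, Path, Trash}

object TrashDelete {
  //Moves a directory to the HDFS trash instead of deleting it outright.
  //Returns false when the trash is disabled on the cluster, in which case
  //nothing is removed and the caller can fall back to a hard delete.
  def moveToTrash(fsUri: String, dir: String): Boolean = {
    val conf = new Configuration()
    val fs = FileSystem.newInstance(URI.create(fsUri), conf)
    Trash.moveToAppropriateTrash(fs, new Path(dir), conf)
  }
}

In the deletion loop, system.delete(new Path(path), true) would then become TrashDelete.moveToTrash("hdfs://hadoop101:9000", path).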

Tables:

package hdfs

import java.net.URI

import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.{FileSystem, FileUtil, Path}
import org.apache.spark.sql.{DataFrame, Row, SparkSession}

import scala.collection.mutable.ListBuffer

object SparkDropDifferentTable {
  def main(args: Array[String]): Unit = {
    //build the Spark session
    val spark: SparkSession = getSpark
    //set the log level
    spark.sparkContext.setLogLevel("WARN")
    //switch to the target database
    spark.sql("use datasourcedb")
    val frame: DataFrame = spark.sql("show tables")
    //collect the DataFrame into an Array[Row]
    val buffer: Array[Row] = frame.collect()
    //holds the table names that exist in datasourcedb in Hive
    val hiveBuffer = ListBuffer[String]()
    println("=============tables in hive=================")
    for (row <- buffer) {
      //take the tableName column directly instead of slicing the Row's toString output by index
      hiveBuffer += row.getString(1)
    }
    for (elem <- hiveBuffer) {
      println("hive tablename:"+elem)
    }

    println("Listing the first-level files and directories under the database directory on HDFS")
    val hdfsBuffer = ListBuffer[String]()
    val paths: Array[Path] = getFilesAndDirs("hdfs://hadoop101:9000","/user/hive/warehouse/datasourcedb.db")
    for (elem <- paths) {
      //keep only the final path component (the table directory name) instead of slicing the full URI by index
      hdfsBuffer += elem.getName
    }
    for (elem <- hdfsBuffer) {
      println("hdfs tablename:"+elem)
    }
    //table directories on HDFS with no matching Hive table: take the difference, and note the order matters, hdfsBuffer must come first
    val diff: ListBuffer[String] = hdfsBuffer.diff(hiveBuffer)

    println("===========tables on hdfs but not in hive===========")
    for (elem <- diff) {
      println("hdfs  tablename:"+elem)
    }
    val system: FileSystem = getHdfs("hdfs://hadoop101:9000")
    if (diff.isEmpty) {
      println("no unmatched tables found")
    } else {
      for (elem <- diff) {
        val path = "/user/hive/warehouse/datasourcedb.db/" + elem
        val bool: Boolean = system.delete(new Path(path), true)
        println("deleted unmatched table " + elem + ", result: " + bool)
      }
    }
  }
  //build a FileSystem handle
  def getHdfs(path: String): FileSystem = {
    val conf = new Configuration()
    FileSystem.newInstance(URI.create(path), conf)
  }
  //list the first-level files and directories under a path
  def getFilesAndDirs(path: String,path2:String): Array[Path] = {
    val fs = getHdfs(path).listStatus(new Path(path2))
    FileUtil.stat2Paths(fs)
  }
  //build the SparkSession with Hive support
  def getSpark: SparkSession = {
    val sparkSession: SparkSession = SparkSession.builder()
      .config("spark.sql.warehouse.dir","hdfs://hadoop101:9000/user/hive/warehouse")
      .master("local")
      .enableHiveSupport()
      .getOrCreate()
    sparkSession
  }
}
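
The table-level job above is hard-coded to datasourcedb. When many databases need checking (point ② in the notes), the same diff can be run per database; a rough sketch that reuses the helpers defined on SparkDropDifferentTable and assumes every database directory follows the <name>.db convention under the warehouse path:

package hdfs

import org.apache.hadoop.fs.Path

//Sketch only: runs the HDFS-vs-Hive table diff for every database instead of a single
//hard-coded one, reusing getSpark/getHdfs/getFilesAndDirs from SparkDropDifferentTable.
object SparkDropAllDifferentTables {
  def main(args: Array[String]): Unit = {
    val spark = SparkDropDifferentTable.getSpark
    spark.sparkContext.setLogLevel("WARN")
    val fs = SparkDropDifferentTable.getHdfs("hdfs://hadoop101:9000")
    //keep dryRun = true until the printed list has been reviewed
    val dryRun = true

    //every Hive database except default, assuming each one maps to <name>.db on HDFS
    val databases = spark.sql("show databases").collect().map(_.getString(0)).filter(_ != "default")
    for (db <- databases) {
      spark.sql(s"use $db")
      val hiveTables = spark.sql("show tables").collect().map(_.getString(1))
      val dbDir = s"/user/hive/warehouse/$db.db"
      val hdfsTables = SparkDropDifferentTable.getFilesAndDirs("hdfs://hadoop101:9000", dbDir).map(_.getName)
      //directories present on HDFS with no matching Hive table
      for (orphan <- hdfsTables.diff(hiveTables)) {
        if (dryRun) println(s"would delete $dbDir/$orphan")
        else fs.delete(new Path(s"$dbDir/$orphan"), true)
      }
    }
  }
}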