Optimize and Vacuum Delta Tables

Optimize

Delta Lake on Databricks can improve the speed of read queries from a table. One way to do this is to coalesce small files into larger ones, which is what the OPTIMIZE command does.
Z-ordering

Z-ordering is a technique to co-locate related information in the same set of files. This co-locality is used automatically by the Delta Lake on Databricks data-skipping algorithms, dramatically reducing the amount of data that needs to be read.
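
As a minimal sketch (the table name db_dw.fact_events and the Z-order columns event_type and event_date are hypothetical), both operations can be issued through Spark SQL:

# Coalesce small files into larger ones (bin-packing compaction).
spark.sql("OPTIMIZE db_dw.fact_events")

# Compact and co-locate rows on the chosen columns so data skipping can
# prune files when queries filter on event_type or event_date.
spark.sql("OPTIMIZE db_dw.fact_events ZORDER BY (event_type, event_date)")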

Vacuum

You can remove data files that are no longer referenced by a Delta table and that are older than the retention threshold by running the VACUUM command on the table, as sketched after the list below. Running VACUUM regularly is important for cost and compliance because of the following considerations:

  • Deleting unused data files reduces cloud storage costs.
  • Data files removed by vacuum might contain records that have been modified or deleted.
    Permanently removing these files from cloud storage ensures these records are no longer accessible.
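
A minimal sketch, again with the hypothetical table name; DRY RUN lists the files that would be deleted without removing them:

# Delete files no longer referenced by the table and older than the
# retention threshold (168 hours = 7 days, the Delta default).
spark.sql("VACUUM db_dw.fact_events RETAIN 168 HOURS")

# Preview the deletions without performing them.
spark.sql("VACUUM db_dw.fact_events RETAIN 168 HOURS DRY RUN")

The script below wires these commands into a maintenance job: it discovers the tables in a database, runs OPTIMIZE (with Z-ordering on selected fact tables) and VACUUM on parallel threads, and finishes by recomputing table statistics.
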
import queue
import threading
from datetime import datetime

from CreateSpark import Spark  # project-specific base class; assumed to create self.spark from app_name

class OptimizeVacuumTables(Spark):
    def __init__(self, database='db_dw', retain_hours=168):
        super().__init__(app_name='OptimizeTables')
        self.database = database
        self.retain_hours = retain_hours
        print(f'VACUUM tables in {database}, retaining {retain_hours} hours of history')
        # Fact tables to Z-order, mapped to their Z-order columns.
        self.zorder_tables = {"database.table_name1": "a, d",
                              "database.table_name2": "a, b",
                              "database.table_name3": "b, d"}
        # Allow VACUUM with a retention shorter than the default 168-hour safety threshold.
        self.spark.conf.set("spark.databricks.delta.retentionDurationCheck.enabled", "false")

    def exec_sql(self, sqlstr: str, result_queue: queue.Queue = None):
        """Run one SQL statement; report the outcome via the queue (if given) and the return value."""
        status = "Succeeded"
        error_info = ""
        dt_start = datetime.now().strftime("%Y/%m/%d %H:%M:%S")
        try:
            self.spark.sql(sqlstr.strip())
        except Exception as err:
            status = "Failed"
            error_info = str(err)
        finally:
            dt_finish = datetime.now().strftime("%Y/%m/%d %H:%M:%S")
            result = {"load_item": sqlstr, "dt_start": dt_start, "dt_finish": dt_finish, "status": status,
                      "error_info": error_info}
            if result_queue is not None:
                result_queue.put(result)
            print(result)
        return result

    def exec_sql_parallelisation(self, sqlstr_list: list):
        """Run each SQL statement on its own thread; return "Failed" if any statement failed."""
        q = queue.Queue()
        threads = []
        status = "Succeeded"
        for sqlstr in sqlstr_list:
            t = threading.Thread(target=self.exec_sql, args=[sqlstr, q])
            t.start()
            threads.append(t)
        for t in threads:
            t.join()
        while not q.empty():
            result = q.get()
            if result["status"] == "Failed":
                status = "Failed"
        return status

    def optimize_vacuum_dim_tables(self):
        # Dimension and lookup tables: plain OPTIMIZE, then VACUUM.
        sdf = self.spark.sql(f"SHOW TABLES IN {self.database}").where(
            "isTemporary = False AND (LOWER(tableName) LIKE 'dim_%' OR LOWER(tableName) LIKE 'lkp_%')")
        tables = sdf.collect()
        optimise_table_list = [f"OPTIMIZE {self.database}.{table.tableName}" for table in tables]
        self.exec_sql_parallelisation(sqlstr_list=optimise_table_list)
        vacuum_table_list = [f"VACUUM {self.database}.{table.tableName} RETAIN {self.retain_hours} HOURS"
                             for table in tables]
        self.exec_sql_parallelisation(sqlstr_list=vacuum_table_list)

    def optimize_vacuum_fact_tables(self):
        # Fact tables: OPTIMIZE (with ZORDER BY where configured), then VACUUM.
        sdf = self.spark.sql(f"SHOW TABLES IN {self.database}").where(
            "isTemporary = False AND LOWER(tableName) LIKE 'fact_%'")
        tables = sdf.collect()
        optimise_table_list = []
        for table in tables:
            tb_name = f'{self.database}.{table.tableName}'.lower()
            if tb_name in self.zorder_tables:
                zorder_table_cols = self.zorder_tables[tb_name]
                sqlstr = f"OPTIMIZE {tb_name} ZORDER BY ({zorder_table_cols})"
            else:
                sqlstr = f"OPTIMIZE {tb_name}"
            optimise_table_list.append(sqlstr)
        self.exec_sql_parallelisation(sqlstr_list=optimise_table_list)
        vacuum_table_list = [f"VACUUM {self.database}.{table.tableName} RETAIN {self.retain_hours} HOURS"
                             for table in tables]
        self.exec_sql_parallelisation(sqlstr_list=vacuum_table_list)

    def analyze_tables(self):
        # Recompute table-level statistics so the query optimizer has fresh estimates.
        sdf = self.spark.sql(f"SHOW TABLES IN {self.database}").where("isTemporary = False")
        analyze_table_list = [f'ANALYZE TABLE {self.database}.{table.tableName} COMPUTE STATISTICS'
                              for table in sdf.collect()]
        self.exec_sql_parallelisation(sqlstr_list=analyze_table_list)


if __name__ == '__main__':
    # retain_hours=0 removes every unreferenced file immediately; this only works
    # because retentionDurationCheck was disabled in __init__. Use with care.
    OVT = OptimizeVacuumTables(database="dw", retain_hours=0)
    OVT.optimize_vacuum_dim_tables()
    OVT.optimize_vacuum_fact_tables()
    OVT.analyze_tables()