Optimize and Vacuum Delta Tables

Optimize

Delta Lake on Databricks can improve the speed of read queries from a table. One way to do this is to coalesce small files into larger ones, which is what the OPTIMIZE command does.
Z-ordering

Z-ordering is a technique to co-locate related information in the same set of files. This co-locality is used automatically by the Delta Lake on Databricks data-skipping algorithms, dramatically reducing the amount of data that needs to be read.
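
As a minimal sketch (the table name db_dw.fact_events and the Z-order columns event_type and event_date are hypothetical), both operations can be issued through Spark SQL:

# Coalesce small files into larger ones (bin-packing compaction).
spark.sql("OPTIMIZE db_dw.fact_events")

# Compact and co-locate rows on the chosen columns so data skipping can
# prune files when queries filter on event_type or event_date.
spark.sql("OPTIMIZE db_dw.fact_events ZORDER BY (event_type, event_date)")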

Vacuum

You can remove data files that are no longer referenced by a Delta table and that are older than the retention threshold by running the VACUUM command on the table, as sketched after the list below. Running VACUUM regularly is important for cost and compliance because of the following considerations:

  • Deleting unused data files reduces cloud storage costs.
  • Data files removed by vacuum might contain records that have been modified or deleted.
    Permanently removing these files from cloud storage ensures these records are no longer accessible.
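
A minimal sketch, again with the hypothetical table name; DRY RUN lists the files that would be deleted without removing them:

# Delete files no longer referenced by the table and older than the
# retention threshold (168 hours = 7 days, the Delta default).
spark.sql("VACUUM db_dw.fact_events RETAIN 168 HOURS")

# Preview the deletions without performing them.
spark.sql("VACUUM db_dw.fact_events RETAIN 168 HOURS DRY RUN")

The script below wires these commands into a maintenance job: it discovers the tables in a database, runs OPTIMIZE (with Z-ordering on selected fact tables) and VACUUM on parallel threads, and finishes by recomputing table statistics.
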
import queue
import threading
from datetime import datetime

from CreateSpark import Spark  # project-specific base class; assumed to create self.spark from app_name

class OptimizeVacuumTables(Spark):
    def __init__(self, database='db_dw', retain_hours=168):
        super().__init__(app_name='OptimizeTables')
        self.database = database
        self.retain_hours = retain_hours
        print(f'VACUUM tables in {database}, retaining {retain_hours} hours of history')
        # Fact tables to Z-order, mapped to their Z-order columns.
        self.zorder_tables = {"database.table_name1": "a, d",
                              "database.table_name2": "a, b",
                              "database.table_name3": "b, d"}
        # Allow VACUUM with a retention shorter than the default 168-hour safety threshold.
        self.spark.conf.set("spark.databricks.delta.retentionDurationCheck.enabled", "false")

    def exec_sql(self, sqlstr: str, result_queue: queue.Queue = None):
        """Run one SQL statement; report the outcome via the queue (if given) and the return value."""
        status = "Succeeded"
        error_info = ""
        dt_start = datetime.now().strftime("%Y/%m/%d %H:%M:%S")
        try:
            self.spark.sql(sqlstr.strip())
        except Exception as err:
            status = "Failed"
            error_info = str(err)
        finally:
            dt_finish = datetime.now().strftime("%Y/%m/%d %H:%M:%S")
            result = {"load_item": sqlstr, "dt_start": dt_start, "dt_finish": dt_finish, "status": status,
                      "error_info": error_info}
            if result_queue is not None:
                result_queue.put(result)
            print(result)
        return result

    def exec_sql_parallelisation(self, sqlstr_list: list):
        """Run each SQL statement on its own thread; return "Failed" if any statement failed."""
        q = queue.Queue()
        threads = []
        status = "Succeeded"
        for sqlstr in sqlstr_list:
            t = threading.Thread(target=self.exec_sql, args=[sqlstr, q])
            t.start()
            threads.append(t)
        for t in threads:
            t.join()
        while not q.empty():
            result = q.get()
            if result["status"] == "Failed":
                status = "Failed"
        return status

    def optimize_vacuum_dim_tables(self):
        # Dimension and lookup tables: plain OPTIMIZE, then VACUUM.
        sdf = self.spark.sql(f"SHOW TABLES IN {self.database}").where(
            "isTemporary = False AND (LOWER(tableName) LIKE 'dim_%' OR LOWER(tableName) LIKE 'lkp_%')")
        tables = sdf.collect()
        optimise_table_list = [f"OPTIMIZE {self.database}.{table.tableName}" for table in tables]
        self.exec_sql_parallelisation(sqlstr_list=optimise_table_list)
        vacuum_table_list = [f"VACUUM {self.database}.{table.tableName} RETAIN {self.retain_hours} HOURS"
                             for table in tables]
        self.exec_sql_parallelisation(sqlstr_list=vacuum_table_list)

    def optimize_vacuum_fact_tables(self):
        # Fact tables: OPTIMIZE (with ZORDER BY where configured), then VACUUM.
        sdf = self.spark.sql(f"SHOW TABLES IN {self.database}").where(
            "isTemporary = False AND LOWER(tableName) LIKE 'fact_%'")
        tables = sdf.collect()
        optimise_table_list = []
        for table in tables:
            tb_name = f'{self.database}.{table.tableName}'.lower()
            if tb_name in self.zorder_tables:
                zorder_table_cols = self.zorder_tables[tb_name]
                sqlstr = f"OPTIMIZE {tb_name} ZORDER BY ({zorder_table_cols})"
            else:
                sqlstr = f"OPTIMIZE {tb_name}"
            optimise_table_list.append(sqlstr)
        self.exec_sql_parallelisation(sqlstr_list=optimise_table_list)
        vacuum_table_list = [f"VACUUM {self.database}.{table.tableName} RETAIN {self.retain_hours} HOURS"
                             for table in tables]
        self.exec_sql_parallelisation(sqlstr_list=vacuum_table_list)

    def analyze_tables(self):
        # Recompute table-level statistics so the query optimizer has fresh estimates.
        sdf = self.spark.sql(f"SHOW TABLES IN {self.database}").where("isTemporary = False")
        analyze_table_list = [f'ANALYZE TABLE {self.database}.{table.tableName} COMPUTE STATISTICS'
                              for table in sdf.collect()]
        self.exec_sql_parallelisation(sqlstr_list=analyze_table_list)


if __name__ == '__main__':
    # retain_hours=0 removes every unreferenced file immediately; this only works
    # because retentionDurationCheck was disabled in __init__. Use with care.
    OVT = OptimizeVacuumTables(database="dw", retain_hours=0)
    OVT.optimize_vacuum_dim_tables()
    OVT.optimize_vacuum_fact_tables()
    OVT.analyze_tables()