python 将mysql转为csv、json导入到Doris数据库

上一篇文章是将mysql导出成csv文件,适合csv不是很大的情况,以下对脚本进行了优化,采用分块读取csv,降低了内存的使用率,提高了传输速度。

from pydoris.doris_client import DorisClient
import requests
import pandas as pd
import numpy as np

fe_host = ''
username = ''
passwd = ''
fe_http_port = ""
fe_query_port = ""

doris_client = DorisClient(fe_host=fe_host,
                           fe_query_port=fe_query_port,
                           fe_http_port=fe_http_port,
                           username=username,
                           password=passwd,
                           db='zst_cep_model')

# 由于导入的csv文件过大,都是五六G以上,所以使用chunksize 分块获取数据进行操作。
for key, chunk in enumerate(pd.read_csv('xxxx.csv', chunksize=10000, dtype=str)):
    # 这一步也是很迷,需要读csv,然后转为字符串类型
    csv = chunk.to_csv(header=False, index=False, sep=',').encode('utf-8')

    # table_name 必须的是database.table的形式
    success = doris_client.write('xxx.xxx', csv)

    if success:
        print("数据写入成功!")
    else:
        print("数据写入失败。", key)
        break
    del csv

后面需要进行增量更新,使用csv太麻烦,想着使用json传输可能会好点,并直接从mysql中获取数据直接传输到doris,以下是脚本。

import pandas as pd
from pydoris.doris_client import *

from tools import *

# 配置
config = {
    // Mysql的配置
    'mysql_config': {
        'host': '',
        'port': ,
        'user': '',
        'password': '',
        'database': ''
    },
    // Doris的配置
    'doris_db_config': {
        'fe_host': '',
        'username': '',
        'passwd': '',
        'fe_http_port': "",
        'fe_query_port': "",
        'db': ''
    },
    
    # 要传输的mysql的目标表
    'mysql_table': '',
    # 要接收的Doris的目标表
    'doris_table': '',
}

# MysqlDataBaseClass 是自己编写的Mysql工具类,返回的是Mysql连接对象
yp_apidb = MysqlDataBaseClass(host=config['mysql_config']['host'], port=config['mysql_config']['port'], user=config['mysql_config']['user'],
                              password=config['mysql_config']['password'], database=config['mysql_config']['database'])

doris_client = DorisClient(fe_host=config['doris_db_config']['fe_host'],
                           fe_query_port=config['doris_db_config']['fe_query_port'],
                           fe_http_port=config['doris_db_config']['fe_http_port'],
                           username=config['doris_db_config']['username'],
                           password=config['doris_db_config']['passwd'],
                           db=config['doris_db_config']['db'])


def get_data_from_mysql(page=1, end_page=None, total_row_num=None, page_size=10000, limit_date='2024-08-12'):
    result = {'total_page': 0, 'total': 0, 'now_page': page, 'data': [], 'code': False, 'msg': ''}

    if total_row_num is None:
        select_res = yp_apidb.ExecuteSQL_Select(sql=f'''SELECT count(1) as total_num FROM `{config["mysql_table"]}` where collect_batch_date >= '{limit_date}';''')
        totalRowsNum = int(select_res[0]['total_num'])
    else:
        totalRowsNum = total_row_num

    if (totalRowsNum % page_size) == 0:
        totalPages = int(totalRowsNum / page_size)
    else:
        totalPages = int((totalRowsNum / page_size) + 1)

    result['total_page'] = totalPages
    result['total'] = totalRowsNum

    if end_page and page > end_page:
        result['msg'] = '已经达到设置的最后一页'
        return result

    if page > totalPages:
        result['msg'] = '已经是最后一页'
        return result

    start_num = int((page - 1) * page_size)
    limit = f'{start_num}, {page_size}'

    sql = f'''SELECT * FROM `{config["mysql_table"]}` where collect_batch_date >= '{limit_date}' limit {limit};'''

    data_list = yp_apidb.ExecuteSQL_Select(sql)
    result['data'] = data_list
    result['code'] = True
    result['msg'] = '获取成功'

    return result


def insert_to_doris(data_list):
    if len(data_list) > 0:
        df = pd.DataFrame(data_list)
        json_data = df.to_json(orient='records')

        options = WriteOptions()
        options.set_json_format()
        options.set_option("strip_outer_array", "true")
        success = doris_client.write(f"{config['doris_table']}", json_data, options=options)

        if success:
            return True
        else:
            print("数据写入失败。")
            return False


if __name__ == '__main__':
    page = 1
    total_row_num = None
    limit_date = '2024-08-01'
    
    # 循环获取下一页,从而达到自动翻页的功能
    while True:
        res = get_data_from_mysql(page=page, total_row_num=total_row_num, limit_date=limit_date)
        print(res['msg'], res['total_page'], res['total'], res['now_page'])
        total_row_num = res['total']
        if res['code']:

            data_list = res['data']
            flage = insert_to_doris(data_list)

            if flage is False:
                break

            page += 1
        else:
            print(res['msg'], page)
            break

以上脚本仅供学习参考,仅为实现临时功能而编写,还有优化的空间。

  • 1
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值