上一篇文章是将mysql导出成csv文件,适合csv不是很大的情况,以下对脚本进行了优化,采用分块读取csv,降低了内存的使用率,提高了传输速度。
from pydoris.doris_client import DorisClient
import requests
import pandas as pd
import numpy as np

# Doris FE connection settings — fill in before running.
fe_host = ''
username = ''
passwd = ''
fe_http_port = ""
fe_query_port = ""

doris_client = DorisClient(fe_host=fe_host,
                           fe_query_port=fe_query_port,
                           fe_http_port=fe_http_port,
                           username=username,
                           password=passwd,
                           db='zst_cep_model')

# The source csv files are large (5-6 GB and up), so read them in chunks
# via `chunksize` to keep memory usage low; dtype=str avoids pandas type
# inference mangling the values before upload.
for key, chunk in enumerate(pd.read_csv('xxxx.csv', chunksize=10000, dtype=str)):
    # pydoris expects a bytes payload, so serialize the chunk back to csv
    # text (no header/index) and utf-8 encode it.
    payload = chunk.to_csv(header=False, index=False, sep=',').encode('utf-8')
    # The table name must be given as "database.table".
    success = doris_client.write('xxx.xxx', payload)
    if success:
        print("数据写入成功!")
    else:
        print("数据写入失败。", key)
        break
    # Drop the encoded payload before reading the next chunk to keep peak
    # memory down.
    del payload
后面需要进行增量更新,使用csv太麻烦,想着使用json传输可能会好点,并直接从mysql中获取数据传输到doris,以下是脚本。
import pandas as pd
from pydoris.doris_client import *
from tools import *
# Runtime configuration — fill in the blanks before running.
# (Original used `//` comments and left the port value empty, both of which
# are syntax errors in Python.)
config = {
    # MySQL connection settings
    'mysql_config': {
        'host': '',
        'port': 3306,  # standard MySQL port; adjust if your server differs
        'user': '',
        'password': '',
        'database': ''
    },
    # Doris connection settings
    'doris_db_config': {
        'fe_host': '',
        'username': '',
        'passwd': '',
        'fe_http_port': "",
        'fe_query_port': "",
        'db': ''
    },
    # Source MySQL table to read from
    'mysql_table': '',
    # Target Doris table to write into
    'doris_table': '',
}
# MysqlDataBaseClass is a self-written MySQL helper class; it returns a
# MySQL connection object.
_mysql_cfg = config['mysql_config']
yp_apidb = MysqlDataBaseClass(host=_mysql_cfg['host'],
                              port=_mysql_cfg['port'],
                              user=_mysql_cfg['user'],
                              password=_mysql_cfg['password'],
                              database=_mysql_cfg['database'])

_doris_cfg = config['doris_db_config']
doris_client = DorisClient(fe_host=_doris_cfg['fe_host'],
                           fe_query_port=_doris_cfg['fe_query_port'],
                           fe_http_port=_doris_cfg['fe_http_port'],
                           username=_doris_cfg['username'],
                           password=_doris_cfg['passwd'],
                           db=_doris_cfg['db'])
def get_data_from_mysql(page=1, end_page=None, total_row_num=None, page_size=10000, limit_date='2024-08-12'):
    """Fetch one page of rows from the configured MySQL table.

    Rows are filtered to ``collect_batch_date >= limit_date`` and paged with
    ``LIMIT offset, size`` so the caller can walk the table page by page.

    Args:
        page: 1-based page number to fetch.
        end_page: optional hard stop; pages beyond it are not fetched.
        total_row_num: known total row count — pass a previous call's
            ``result['total']`` here to skip the COUNT(1) query.
        page_size: number of rows per page.
        limit_date: inclusive lower bound for ``collect_batch_date``.

    Returns:
        dict with keys ``total_page``, ``total``, ``now_page``, ``data``,
        ``code``, ``msg``; ``code`` is True only when a page was fetched.
    """
    result = {'total_page': 0, 'total': 0, 'now_page': page, 'data': [], 'code': False, 'msg': ''}
    if total_row_num is None:
        # NOTE(review): limit_date and the table name are interpolated into
        # SQL directly; this is only safe while both come from trusted code.
        # Switch to bound parameters if either ever comes from user input.
        select_res = yp_apidb.ExecuteSQL_Select(sql=f'''SELECT count(1) as total_num FROM `{config["mysql_table"]}` where collect_batch_date >= '{limit_date}';''')
        total_rows = int(select_res[0]['total_num'])
    else:
        total_rows = total_row_num
    # Exact ceiling division — the original int(total/size + 1) relied on
    # float arithmetic, which can misround for very large row counts.
    total_pages = (total_rows + page_size - 1) // page_size
    result['total_page'] = total_pages
    result['total'] = total_rows
    if end_page and page > end_page:
        result['msg'] = '已经达到设置的最后一页'
        return result
    if page > total_pages:
        result['msg'] = '已经是最后一页'
        return result
    start_num = (page - 1) * page_size
    sql = f'''SELECT * FROM `{config["mysql_table"]}` where collect_batch_date >= '{limit_date}' limit {start_num}, {page_size};'''
    result['data'] = yp_apidb.ExecuteSQL_Select(sql)
    result['code'] = True
    result['msg'] = '获取成功'
    return result
def insert_to_doris(data_list):
    """Write a list of row dicts into the configured Doris table.

    The rows are serialized to a single JSON array and sent with
    ``strip_outer_array`` enabled so Doris unwraps each array element into
    one row.

    Args:
        data_list: rows, e.g. ``get_data_from_mysql()['data']``.

    Returns:
        True on success (or when there is nothing to write), False on a
        failed write.
    """
    if not data_list:
        # Nothing to write is not a failure. The original fell through and
        # returned None here; callers only test `is False`, so returning
        # True keeps them working while making the contract explicit.
        return True
    json_data = pd.DataFrame(data_list).to_json(orient='records')
    options = WriteOptions()
    options.set_json_format()
    # The payload is one JSON array; tell Doris to unwrap it into rows.
    options.set_option("strip_outer_array", "true")
    success = doris_client.write(f"{config['doris_table']}", json_data, options=options)
    if not success:
        print("数据写入失败。")
        return False
    return True
if __name__ == '__main__':
    page = 1
    total_row_num = None
    limit_date = '2024-08-01'
    # Keep requesting the next page until the source is exhausted (code is
    # False) or a write fails — a simple auto-pagination loop.
    while True:
        res = get_data_from_mysql(page=page, total_row_num=total_row_num, limit_date=limit_date)
        print(res['msg'], res['total_page'], res['total'], res['now_page'])
        # Cache the total so subsequent pages skip the COUNT(1) query.
        total_row_num = res['total']
        if not res['code']:
            print(res['msg'], page)
            break
        if insert_to_doris(res['data']) is False:
            break
        page += 1
以上脚本仅供学习参考,仅为实现临时功能而编写,还有优化的空间。