# Updating ~5 million rows with ALTER TABLE ... UPDATE kept failing in various
# ways, and looping over row-by-row updates was far too slow. This script
# instead creates a new table and achieves the "update" by re-inserting the
# transformed rows in batches.
from clickhouse_driver import Client
import re
from tqdm import tqdm
# Matches everything up to (but not including) the first opening parenthesis,
# half-width "(" or full-width "（", with any whitespace before it trimmed.
# Compiled once at module level so the paging loop below doesn't recompile it.
_PAREN_SUFFIX = re.compile(r'(.+?)\s*[（(]')


def extract_spname_without_parentheses(spname):
    """Return *spname* with any trailing parenthesized suffix removed.

    E.g. "计算机科学(中外合作办学)" -> "计算机科学". If the string contains
    no opening parenthesis, it is returned unchanged.

    Note: the original pattern here was r'(.+?)\s*(.*)', which contains no
    literal parenthesis and therefore always returned just the first
    character of the input — fixed to match a real '('/'（' delimiter.
    """
    match = _PAREN_SUFFIX.match(spname)
    if match:
        return match.group(1)
    return spname
# --- ClickHouse connection settings ---
host = 'xxx.xxx.xxx.xxx'
port = '9000'
# Target database
database = 'default'
# Credentials (replace with your own)
user = 'xxxx'  # replace with your username
password = 'xxxx'  # replace with your password

client = Client(host=host, port=port, database=database, user=user, password=password)

# Total row count of the source table, used to size the progress bar and
# to know when the paging loop is done.
count = client.execute("SELECT count(*) FROM school_score_plan_bak2")  # replace with your table
total_count = count[0][0]

# Number of rows fetched and inserted per round trip.
batch_size = 10000

# Parameterized INSERT: rows are passed as Python tuples and serialized by
# clickhouse_driver itself. The previous version concatenated tuple repr()s
# into the SQL string, which breaks on None values, embedded quotes and
# dates, and is open to SQL injection.
INSERT_QUERY = (
    "INSERT INTO `default`.school_score_plan "
    "(id, first_km, `length`, level2_name, local_batch_name, local_type_name, "
    "name, num, province_name, school_id, sg_fxk, sg_info, sg_name, sg_sxk, "
    "sg_type, sp_fxk, sp_info, sp_sxk, sp_type, sp_xuanke, spcode, "
    "special_group, spname, tuition, `year`, local_province_id, spn) VALUES"
)

# Page through the source table in id order and re-insert each batch with
# the two derived columns (cleaned spname, parenthesis-free spn) rewritten.
with tqdm(total=total_count) as pbar:
    offset = 0
    while offset < total_count:
        query = (
            "SELECT * FROM school_score_plan_bak2 "
            f"ORDER BY id ASC LIMIT {batch_size} OFFSET {offset}"
        )  # replace with your table
        results = client.execute(query)
        batch_insert_data = []
        for result in results:
            result_list = list(result)
            # Column 22 is spname: strip single quotes (kept from the original
            # script — it also changes the stored value, not just escaping).
            spname = result_list[22].replace("'", "")
            result_list[22] = spname
            # Column 26 is spn: spname with any parenthesized suffix removed.
            result_list[26] = extract_spname_without_parentheses(spname)
            batch_insert_data.append(tuple(result_list))
            pbar.update(1)
        if batch_insert_data:
            client.execute(INSERT_QUERY, batch_insert_data)
        offset += batch_size