[Python]动态生成百万sql语句

有一天,leader需要我造一个月大约300万的某演示数据,且要求每天的统计数据有规律,比如高峰低峰时段,工作日和周末数据差异等等,一天之内就要完成,开完这个会上午基本就过去了,吃完午饭的时候还没头绪咋弄呢。

一条一条往数据库里面手动插入是不可能的,一条数据字段就几十个了,真干起来不得累死,还干不完,于是就想着写个插入的sql语句就好了撒。写个一条完整的插入语句会的,但是要动态变化字段这就难为我了,毕竟还没那么了解数据库语句的使用。

数据处理不是python的拿手好戏么,正好前个周六的时候没事在家看了本python的书《Python全栈工程师养成攻略》,跟着前面章节用python统计了一下西游记里面字的个数,并进行了读取和写入的操作。嘻嘻,正好可以整个百万insert语句的文件嘛。

说干就干,分为几步:
1、读取目标文件
2、生成一条insert语句
3、写入目标文件
4、循环2、3步
5、关闭文件

得到的sql文件往数据库一跑,数据就哗哗的进去了。

生成百万数据的核心代码示例:

import date_util
import random
import time

# 类型
types = [5, 6, 12, 16, 17, 21]

# 删除原因
pre_delete_reasons = ['E0203', 'E0301', 'E0408', 'E0503', 'E0305']

frmt='%Y-%m-%d %H:%M:%S'

def generate_sql(id, count, start_time, end_time, file_name):
    fw = open(file_name, 'a', encoding='utf-8')

    while id < count:
        index = random.randint(0, len(illegal_types) - 1)
        # 随机类型
        illegal_type = illegal_types[index]
        # 随机状态
        pre_status = random.randint(1, 2)
        # 审核状态
        audit_status = 0
        if pre_status == 2:
            audit_status = 9
        # 处理时间
        pre_handle_time = date_util.random_timestamp(start_time, end_time) * 1000
        # 通过时间
        pass_time = "'" + time.strftime(frmt, time.localtime(pre_handle_time / 1000)) + ".001'"
        # 删除原因
        index = random.randint(0, len(pre_delete_reasons) - 1)
        pre_delete_reason = "'" + pre_delete_reasons[index] + "'"
        
        # 拼接sql语句
        sql = f"""INSERT INTO i.info VALUES ({id}, {pass_time}, NULL, NULL, NULL, {pre_handle_time}, {type}, 1, {audit_status}, NULL, 'B528', 5, 1, '50010600051211000001', '50010600051191000001', '1', 27, 'K33', NULL, 2, '["/images/dn@dn127/2/25-2631f8b0-1dfd8"]', NULL, NULL, 80.0000, NULL, NULL, NULL, NULL, NULL, 'PASS', NULL, {type}, 'B528', 5, 1, NULL, NULL, NULL, NULL, {pre_status}, {pre_delete_reason}, {type}, 'B528', 5, {pre_handle_time}, 44, 1, NULL, 0, 0, 0, 0, '1B2B3B', 0, NULL, 0, NULL, 0, NULL, NULL);"""
        id += 1
        fw.write(sql + '\n')

    fw.close()


def generate():
    count = 1

    # 生成半夜数据
    # id = count
    # count = id + 5000 + random.randint(0, 3000)
    # start_time = '2020-01-05 00:00:00'
    # end_time = '2020-01-05 6:59:59'
    # print(f"id: {id}, count: {count}, start_time: {start_time}, end_time: {end_time}")
    # generate_sql(id, count, start_time, end_time, '1_half_night.sql')

    # # 生成早高峰数据1
    # id = count
    # count = id + 5000 + random.randint(0, 200)
    # start_time = '2020-01-05 07:00:00'
    # end_time = '2020-01-05 07:59:59'
    # print(f"id: {id}, count: {count}, start_time: {start_time}, end_time: {end_time}")
    # generate_sql(id, count, start_time, end_time, '2_morning_top.sql')
    #
    # # 生成早高峰数据2
    # id = count
    # count = id + 8000 + random.randint(0, 200)
    # start_time = '2020-01-05 08:00:00'
    # end_time = '2020-01-05 08:59:59'
    # print(f"id: {id}, count: {count}, start_time: {start_time}, end_time: {end_time}")
    # generate_sql(id, count, start_time, end_time, '2_morning_top.sql')
    #
    # # 生成早高峰数据3
    # id = count
    # count = id + 7200 + random.randint(0, 200)
    # start_time = '2020-01-05 09:00:00'
    # end_time = '2020-01-05 09:59:59'
    # print(f"id: {id}, count: {count}, start_time: {start_time}, end_time: {end_time}")
    # generate_sql(id, count, start_time, end_time, '2_morning_top.sql')
    #
    #  生成白天日常数据
    id = count
    count = id + 3000 + random.randint(0, 500)
    start_time = '2020-01-05 10:00:00'
    end_time = '2020-01-05 16:59:59'
    print(f"id: {id}, count: {count}, start_time: {start_time}, end_time: {end_time}")
    generate_sql(id, count, start_time, end_time, '3_day.sql')
    #
    # # 生成晚高峰数据1
    # id = count
    # count = id + 7500 + random.randint(0, 200)
    # start_time = '2020-01-05 17:00:00'
    # end_time = '2020-01-05 17:59:59'
    # print(f"id: {id}, count: {count}, start_time: {start_time}, end_time: {end_time}")
    # generate_sql(id, count, start_time, end_time, '4_night_top.sql')
    #
    # # 生成晚高峰数据2
    # id = count
    # count = id + 10000 + random.randint(0, 200)
    # start_time = '2020-01-05 18:00:00'
    # end_time = '2020-01-05 18:59:59'
    # print(f"id: {id}, count: {count}, start_time: {start_time}, end_time: {end_time}")
    # generate_sql(id, count, start_time, end_time, '4_night_top.sql')
    #
    # # 生成晚高峰数据3
    # id = count
    # count = id + 6500 + random.randint(0, 200)
    # start_time = '2020-01-05 19:00:00'
    # end_time = '2020-01-05 19:59:59'
    # print(f"id: {id}, count: {count}, start_time: {start_time}, end_time: {end_time}")
    # generate_sql(id, count, start_time, end_time, '4_night_top.sql')
    #
    # # 生成晚上数据
    # id = count
    # count = id + 15000 + random.randint(0, 200)
    # start_time = '2020-01-05 20:00:00'
    # end_time = '2020-01-05 23:59:59'
    # print(f"id: {id}, count: {count}, start_time: {start_time}, end_time: {end_time}")
    # generate_sql(id, count, start_time, end_time, '5_night.sql')

    print(f"count: {count}")

generate()

本来想使用一个sql文件来承载这么多数据,没想到不得行,大概是内存不够,导致数据写不完,虽然程序正常执行完,检查数据的时候发现id到一定数之后的数据都没在里面,于是分了好几个文件来写不同时段的数据,一个文件几十万数据还是得行的。
还遇到的一个坑是,Insert语句values后面的字段要跟数据库里面的表字段顺序一致才行,不同数据库,同一张表可能顺序还不一样,语句就得调整,还好一条语句能用,n条都能用,字段太多的时候,眼睛都看花了。

回顾一下看还是觉得写得很挫,以后再改进吧。

  • 1
    点赞
  • 1
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值