python读取excel，转换为JSON/INSERT SQL语句

菜鸟001号

已于 2024-08-20 14:57:19 修改

阅读量171

点赞数 2

文章标签： python sql

于 2024-08-19 18:54:19 首次发布

本文链接：https://blog.csdn.net/qq_37189286/article/details/141330878

版权

背景：

业务方提供一个几十列、十万w+的excel，需要每月多次写入数据库。每次都需要手动处理数据转换为json或者insert语句。所以想到通过py直接转换为json文件和insert语句。

1. 引入需求数据包:

import pandas as pd
import json
from datetime import datetime
import time

2. 定义表头（第一列）和英文字段（key）的对应关系

# 定义中文列名到英文列名的映射
chinese_to_english_mapping = {
    '业务类型': 'business_type',
    '一级维度': 'one_dim',
    '二级维度': 'two_dim',
    '三级维度': 'three_dim',
    '四级维度': 'four_dim'
}

# 读取Excel文件的列名
file_path = '111.xlsx'
df_columns = pd.read_excel(file_path, nrows=0, engine='openpyxl')

# 获取中文列名并去除空格
chinese_columns = [col.strip() for col in df_columns.columns.tolist()]

# 根据中文列名映射为英文列名
english_columns = [chinese_to_english_mapping[col] for col in chinese_columns]

# 读取Excel文件的数据
df = pd.read_excel(file_path, names=english_columns, engine='openpyxl')

# 去除所有字符串类型数据两边的空格
df = df.applymap(lambda x: x.strip() if isinstance(x, str) else x)

3. 将nan类型转换为None;

# 将混合数据类型的列转换为对象类型，这两种处理方式重复选一个就好，但是这里无法处理float('Nan')类型
# 严格来说下面列表处理方案可以兼容这里的处理

for col in df.columns:
    if len(df[col].apply(type).unique()) > 1:
        df[col] = df[col].astype(object)

# 将 NaN 值替换为 None
df = df.where(pd.notna(df), None)

df = df.applymap(lambda x: x if pd.notna(x) else None)

4.定义写入文件名称，定义初始SQL

output_file_path = 'output.json'


str1 = '''insert into db.tb(unique_id,business_type,mreport_one_dim,mreport_two_dim,
        mreport_three_dim,mreport_four_dim) values
        '''
str2 = str1 + ""
i = 0
row_count = df.shape[0] #总行数

5.生成json/sql写入文件

with open(output_file_path, 'w', encoding='utf-8') as f:
        for index, row in df.iterrows():
        json_object = row.to_dict()


        unique_id = index+1
        json_object['unique_id'] = unique_id

        
        json_str = json.dumps(json_object, ensure_ascii=False)
        
        #json写入文件
        f.write(json_str + '\n')


        key_order =["unique_id","business_type","mreport_one_dim","mreport_two_dim","mreport_three_dim","mreport_four_dim"]

        ## 按制定顺序生成list。
        ordered_values = [json_object[key] for key in key_order]

        ## 解决数据中存在float('Nan')，更多解决方案，可以看https://mp.csdn.net/mp_blog/creation/editor/141329337
        ordered_values_new = [None if pd.isna(item) else item for item in ordered_values]


        i += 1
        if i<=5000:
            values = ', '.join([
                f"'{value}'" if isinstance(value, str) and value != 'NULL' else
                'NULL' if value is None or value == 'NULL' or value =='nan' else
                str(value)
                for value in ordered_values_new
            ])

            str2 = str2 + "(" + values + "),\n"
            ## 判断是否遍历完所有
            if row_count - i == 0:
                with open("sql", "a+", encoding="utf-8") as f1:
                    f1.write(str2)
        ## 满5000 生产新的INSERT into 语句
        else :
            # print(str2)
            with open("sql", "a+", encoding="utf-8") as f1:
                f1.write(str2)
            row_count -= i
            i = 0
            str2 = str1 + ""