一、背景
在mysql表中有一个字符串类型的字段,里面储存了JSON格式的数组。
由于mysql单个字段的长度是有限的,若JSON数组太长,容易出现长度溢出的异常,所以要将该字段转换为一张独立的mysql表。
此文档适用于已知JSON数组最大长度的场景,一般情况下可以通过字符串的长度换算得到数组的最大长度。
二、基本知识
MySQL 从 5.7 开始增加了 JSON 函数,支持对字符串进行 JSON 格式转换。这里主要用到两个函数:JSON_EXTRACT、JSON_UNQUOTE。
-
JSON_EXTRACT(json_doc, path[, path] ...)
从 json_doc 中解析JSON文档,返回 path 参数指定的数据。如果任一参数为 NULL,那么返回值也是 NULL。如果 json_doc 不是合法的JSON数据,或者 path 不是合法的路径表达式,都会抛出异常。
如果提供了多个 path 参数,返回结果会自动封装为数组,并按照参数的顺序排列数据。如果只有一个 path 参数(且不含通配符),返回结果就只有一个数据。示例:
mysql> SELECT JSON_EXTRACT('[10, 20, [30, 40]]', '$[1]');
+--------------------------------------------+
| JSON_EXTRACT('[10, 20, [30, 40]]', '$[1]') |
+--------------------------------------------+
| 20                                         |
+--------------------------------------------+
mysql> SELECT JSON_EXTRACT('[10, 20, [30, 40]]', '$[1]', '$[0]');
+----------------------------------------------------+
| JSON_EXTRACT('[10, 20, [30, 40]]', '$[1]', '$[0]') |
+----------------------------------------------------+
| [20, 10]                                           |
+----------------------------------------------------+
mysql> SELECT JSON_EXTRACT('[10, 20, [30, 40]]', '$[2][*]');
+-----------------------------------------------+
| JSON_EXTRACT('[10, 20, [30, 40]]', '$[2][*]') |
+-----------------------------------------------+
| [30, 40]                                      |
+-----------------------------------------------+
-
JSON_UNQUOTE(json_val)
去除JSON值两端的引号(unquote),并以 utf8mb4 编码的字符串形式返回结果。如果参数为 NULL,返回值也是 NULL。针对普通字符串,该函数相当于去掉字符串两端的双引号。针对包含转义序列的特殊字符串,转换结果取决于 sql_mode 是否启用了 NO_BACKSLASH_ESCAPES。此文档不作详细介绍。示例:
- 普通字符串
mysql> SET @j = '"abc"';
mysql> SELECT @j, JSON_UNQUOTE(@j);
+-------+------------------+
| @j    | JSON_UNQUOTE(@j) |
+-------+------------------+
| "abc" | abc              |
+-------+------------------+
mysql> SET @j = '[1, 2, 3]';
mysql> SELECT @j, JSON_UNQUOTE(@j);
+-----------+------------------+
| @j        | JSON_UNQUOTE(@j) |
+-----------+------------------+
| [1, 2, 3] | [1, 2, 3]        |
+-----------+------------------+
- 特殊字符串
mysql> SELECT @@sql_mode;
+------------+
| @@sql_mode |
+------------+
|            |
+------------+
mysql> SELECT JSON_UNQUOTE('"\\t\\u0032"');
+------------------------------+
| JSON_UNQUOTE('"\\t\\u0032"') |
+------------------------------+
|       2                           |
+------------------------------+
mysql> SET @@sql_mode = 'NO_BACKSLASH_ESCAPES';
mysql> SELECT JSON_UNQUOTE('"\\t\\u0032"');
+------------------------------+
| JSON_UNQUOTE('"\\t\\u0032"') |
+------------------------------+
| \t\u0032                     |
+------------------------------+
mysql> SELECT JSON_UNQUOTE('"\t\u0032"');
+----------------------------+
| JSON_UNQUOTE('"\t\u0032"') |
+----------------------------+
|       2                         |
+----------------------------+
三、实现原理
为了将JSON数组转换为行,需要遍历数组的所有元素。
- 通过枚举下标的方式,将下标表与JSON数组所在的表进行关联查询(JOIN),获得所有数组元素。
- 过滤所有空数据
3.1 数据准备
-- Source table: one row per application; `query_id_str` holds a JSON array
-- of engine-side job ids serialized as a string (at most 1024 characters).
CREATE TABLE `application_info` (
`id` bigint(20) NOT NULL AUTO_INCREMENT COMMENT '主键',
`application_id` varchar(100) NOT NULL COMMENT '在执行引擎上的任务ID,如Presto作业ID,YARN的applicationId',
`query_id_str` VARCHAR(1024) COMMENT 'JSON数组' ,
-- No trailing comma after the last table element: MySQL rejects it.
PRIMARY KEY (`id`)
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COMMENT='作业信息表';
-- Insert one sample row. The column list is explicit so the auto-increment
-- `id` is generated by MySQL and the value count matches the column count.
-- The JSON text uses plain double quotes inside the single-quoted literal,
-- so the stored value does not depend on the NO_BACKSLASH_ESCAPES sql_mode.
INSERT INTO `application_info` (`application_id`, `query_id_str`)
VALUES ('application_01', '["20200520_072820_00012_syrpv","20200520_072820_00013_syrpv"]');
-- Split table: one row per element of the JSON array stored in
-- application_info.query_id_str.
--   application_info_id : id of the originating application_info row
--   application_id      : redundant copy of application_info.application_id
--   job_id              : a single array element (engine-side job id)
CREATE TABLE `application_job_id_of_engine` (
`id` bigint(20) NOT NULL AUTO_INCREMENT COMMENT '自增主键',
`application_info_id` bigint(20) NOT NULL COMMENT '任务主键',
`application_id` varchar(100) NOT NULL COMMENT '任务ID。该字段是冗余字段,方便排查问题时,快速查看DS的任务ID',
`job_id` varchar(100) NOT NULL COMMENT '任务在执行引擎中的唯一标识。Presto - query_id;YARN - application_id',
PRIMARY KEY (`id`)
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COMMENT='任务在执行引擎中的唯一标识。Presto - query_id;YARN - application_id';
3.2 迁移数据
-- Explode the JSON array in application_info.query_id_str into one row per
-- element of application_job_id_of_engine.
--
-- How it works: every source row is paired with every candidate array index
-- (CROSS JOIN against an inline index table); JSON_EXTRACT returns SQL NULL
-- for an index that does not exist in the array, and those pairs are dropped
-- by the WHERE clause. JSON_UNQUOTE strips the surrounding double quotes of
-- the extracted JSON string.
INSERT INTO application_job_id_of_engine (application_info_id, application_id, job_id)
SELECT
    ai.id,
    ai.application_id,
    JSON_UNQUOTE(JSON_EXTRACT(ai.query_id_str, CONCAT('$[', indexes.idx, ']'))) AS job_id
FROM application_info AS ai
-- Inline index table generating the JSON array subscripts.
-- UNION ALL is used because the literals are already distinct, so the
-- implicit de-duplication of plain UNION is wasted work.
CROSS JOIN (
    SELECT 0 AS idx UNION ALL
    SELECT 1 AS idx UNION ALL
    SELECT 2 AS idx UNION ALL
    SELECT 3 AS idx UNION ALL
    SELECT 4 AS idx UNION ALL
    SELECT 5 AS idx UNION ALL
    SELECT 6 AS idx UNION ALL
    SELECT 7 AS idx UNION ALL
    SELECT 8 AS idx UNION ALL
    SELECT 9 AS idx UNION ALL
    SELECT 10 AS idx UNION ALL
    SELECT 11 AS idx UNION ALL
    SELECT 12 AS idx UNION ALL
    SELECT 13 AS idx UNION ALL
    SELECT 14 AS idx UNION ALL
    SELECT 15 AS idx UNION ALL
    SELECT 16 AS idx UNION ALL
    SELECT 17 AS idx UNION ALL
    SELECT 18 AS idx UNION ALL
    SELECT 19 AS idx UNION ALL
    SELECT 20 AS idx UNION ALL
    SELECT 21 AS idx UNION ALL
    SELECT 22 AS idx UNION ALL
    SELECT 23 AS idx UNION ALL
    SELECT 24 AS idx UNION ALL
    SELECT 25 AS idx UNION ALL
    SELECT 26 AS idx UNION ALL
    SELECT 27 AS idx UNION ALL
    SELECT 28 AS idx UNION ALL
    SELECT 29 AS idx UNION ALL
    SELECT 30 AS idx UNION ALL
    SELECT 31 AS idx UNION ALL
    SELECT 32 AS idx UNION ALL
    SELECT 33 AS idx
    -- Upper bound: query_id_str is VARCHAR(1024). A compactly serialized
    -- array of 27-character ids (the length of the sample ids) needs
    -- 27 + 2 quotes + 1 comma = 30 characters per element, so up to 34
    -- elements (indexes 0..33) can fit.
    -- NOTE(review): if shorter elements can occur, more indexes are needed;
    -- elements beyond the last listed index are silently skipped.
) AS indexes
-- Drop (row, index) pairs whose index is past the end of the array, and rows
-- whose query_id_str is NULL.
WHERE JSON_EXTRACT(ai.query_id_str, CONCAT('$[', indexes.idx, ']')) IS NOT NULL
-- Order by array position as well, so the generated auto-increment ids follow
-- the original element order within each source row.
ORDER BY ai.id, indexes.idx;