hive中get_json_object函数不支持解析json中文key

问题

今天在 Hive 中 get_json_object 函数解析 json 串的时候,发现函数不支持解析 json 中文 key。
例如:

select get_json_object('{ "姓名":"张三" , "年龄":"18" }', '$.姓名');

我们希望的结果是得到姓名对应的值张三,而运行之后的结果为 NULL 值。

select get_json_object('{ "abc姓名":"张三" , "abc":"18" }', '$.abc姓名');

我们希望的结果是得到姓名对应的值张三,而运行之后的结果为 18

产生问题的原因

是什么原因导致的呢?我们查看 Hive 官网中 get_json_object 函数的介绍,可以发现 get_json_object 函数不能解析 json 里面中文的 key,如下图所示:
在这里插入图片描述
json 路径只能包含字符 [0-9a-z_],即不能包含 大写或特殊字符 。此外,键不能以数字开头。

那为什么 json 路径只能包含字符 [0-9a-z_] 呢?

通过查看源码我们发现 get_json_object 对应的 UDF 类的源码如下:

import java.util.ArrayList;
import java.util.Iterator;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.Map;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import com.fasterxml.jackson.core.json.JsonReadFeature;
import com.fasterxml.jackson.databind.JavaType;
import com.fasterxml.jackson.databind.ObjectMapper;
import com.google.common.collect.Iterators;
import org.apache.hadoop.hive.ql.exec.Description;
import org.apache.hadoop.hive.ql.exec.UDF;
import org.apache.hadoop.io.Text;

/**
 * UDFJson.
 */
@Description(name = "get_json_object",
        value = "_FUNC_(json_txt, path) - Extract a json object from path ",
        extended = "Extract json object from a json string based on json path "
                + "specified, and return json string of the extracted json object. It "
                + "will return null if the input json string is invalid.\n"
                + "A limited version of JSONPath supported:\n"
                + "  $   : Root object\n"
                + "  .   : Child operator\n"
                + "  []  : Subscript operator for array\n"
                + "  *   : Wildcard for []\n"
                + "Syntax not supported that's worth noticing:\n"
                + "  ''  : Zero length string as key\n"
                + "  ..  : Recursive descent\n"
                + "  @   : Current object/element\n"
                + "  ()  : Script expression\n"
                + "  ?() : Filter (script) expression.\n"
                + "  [,] : Union operator\n"
                + "  [start:end:step] : array slice operator\n")

//定义了一个名为UDFJson的类,继承自UDF类。
public class UDFGetJsonObjectCN extends UDF {
    //定义一个静态正则表达式模式,用于匹配JSON路径中的键。
    //匹配英文key:匹配一个或多个大写字母、小写字母、数字、下划线、连字符、冒号或空格。
    private static final Pattern patternKey = Pattern.compile("^([a-zA-Z0-9_\\-\\:\\s]+).*");
    //定义一个静态正则表达式模式,用于匹配JSON路径中的索引。
    private static final Pattern patternIndex = Pattern.compile("\\[([0-9]+|\\*)\\]");

    //创建一个ObjectMapper对象,用于解析JSON字符串。
    private static final ObjectMapper objectMapper = new ObjectMapper();
    //创建一个JavaType对象,用于表示Map类型。
    private static final JavaType MAP_TYPE = objectMapper.getTypeFactory().constructType(Map.class);
    //创建一个JavaType对象,用于表示List类型。
    private static final JavaType LIST_TYPE = objectMapper.getTypeFactory().constructType(List.class);


    //静态代码块,用于配置ObjectMapper的一些特性。
    static {
        // Allows for unescaped ASCII control characters in JSON values
        objectMapper.enable(JsonReadFeature.ALLOW_UNESCAPED_CONTROL_CHARS.mappedFeature());
        // Enabled to accept quoting of all character backslash qooting mechanism
        objectMapper.enable(JsonReadFeature.ALLOW_BACKSLASH_ESCAPING_ANY_CHARACTER.mappedFeature());
    }

    // An LRU cache using a linked hash map
    //定义了一个静态内部类HashCache,用作LRU缓存。
    static class HashCache<K, V> extends LinkedHashMap<K, V> {

        private static final int CACHE_SIZE = 16;
        private static final int INIT_SIZE = 32;
        private static final float LOAD_FACTOR = 0.6f;

        HashCache() {
            super(INIT_SIZE, LOAD_FACTOR);
        }

        private static final long serialVersionUID = 1;

        @Override
        protected boolean removeEldestEntry(Map.Entry<K, V> eldest) {
            return size() > CACHE_SIZE;
        }

    }

    //声明了一个名为extractObjectCache的HashMap对象,用于缓存已提取的JSON对象。
    Map<String, Object> extractObjectCache = new HashCache<String, Object>();
    //声明了一个名为pathExprCache的HashMap对象,用于缓存已解析的JSON路径表达式。
    Map<String, String[]> pathExprCache = new HashCache<String, String[]>();
    //声明了一个名为indexListCache的HashMap对象,用于缓存已解析的JSON路径中的索引列表。
    Map<String, ArrayList<String>> indexListCache =
            new HashCache<String, ArrayList<String>>();
    //声明了一个名为mKeyGroup1Cache的HashMap对象,用于缓存JSON路径中的键。
    Map<String, String> mKeyGroup1Cache = new HashCache<String, String>();
    //声明了一个名为mKeyMatchesCache的HashMap对象,用于缓存JSON路径中的键是否匹配的结果。
    Map<String, Boolean> mKeyMatchesCache = new HashCache<String, Boolean>();

    //构造函数,没有参数。
    public UDFGetJsonObjectCN() {
    }

    /**
     * Extract json object from a json string based on json path specified, and
     * return json string of the extracted json object. It will return null if the
     * input json string is invalid.
     *
     * A limited version of JSONPath supported: $ : Root object . : Child operator
     * [] : Subscript operator for array * : Wildcard for []
     *
     * Syntax not supported that's worth noticing: '' : Zero length string as key
     * .. : Recursive descent &amp;#064; : Current object/element () : Script
     * expression ?() : Filter (script) expression. [,] : Union operator
     * [start:end:step] : array slice operator
     *
     * @param jsonString
     *          the json string.
     * @param pathString
     *          the json path expression.
     * @return json string or null when an error happens.
     */
    //evaluate方法,用于提取指定路径的JSON对象并返回JSON字符串。
    public Text evaluate(String jsonString, String pathString) {
        if (jsonString == null || jsonString.isEmpty() || pathString == null
                || pathString.isEmpty() || pathString.charAt(0) != '$') {
            return null;
        }

        int pathExprStart = 1;
        boolean unknownType = pathString.equals("$");
        boolean isRootArray = false;

        if (pathString.length() > 1) {
            if (pathString.charAt(1) == '[') {
                pathExprStart = 0;
                isRootArray = true;
            } else if (pathString.charAt(1) == '.') {
                isRootArray = pathString.length() > 2 && pathString.charAt(2) == '[';
            } else {
                return null;
            }
        }

        // Cache pathExpr
        String[] pathExpr = pathExprCache.get(pathString);
        if (pathExpr == null) {
            pathExpr = pathString.split("\\.", -1);
            pathExprCache.put(pathString, pathExpr);
        }

        // Cache extractObject
        Object extractObject = extractObjectCache.get(jsonString);
        if (extractObject == null) {
            if (unknownType) {
                try {
                    extractObject = objectMapper.readValue(jsonString, LIST_TYPE);
                } catch (Exception e) {
                    // Ignore exception
                }
                if (extractObject == null) {
                    try {
                        extractObject = objectMapper.readValue(jsonString, MAP_TYPE);
                    } catch (Exception e) {
                        return null;
                    }
                }
            } else {
                JavaType javaType = isRootArray ? LIST_TYPE : MAP_TYPE;
                try {
                    extractObject = objectMapper.readValue(jsonString, javaType);
                } catch (Exception e) {
                    return null;
                }
            }
            extractObjectCache.put(jsonString, extractObject);
        }

        for (int i = pathExprStart; i < pathExpr.length; i++) {
            if (extractObject == null) {
                return null;
            }
            extractObject = extract(extractObject, pathExpr[i], i == pathExprStart && isRootArray);
        }

        Text result = new Text();
        if (extractObject instanceof Map || extractObject instanceof List) {
            try {
                result.set(objectMapper.writeValueAsString(extractObject));
            } catch (Exception e) {
                return null;
            }
        } else if (extractObject != null) {
            result.set(extractObject.toString());
        } else {
            return null;
        }
        return result;
    }

    //extract方法,递归地提取JSON对象。
    private Object extract(Object json, String path, boolean skipMapProc) {
        // skip MAP processing for the first path element if root is array
        if (!skipMapProc) {
            // Cache patternkey.matcher(path).matches()
            Matcher mKey = null;
            Boolean mKeyMatches = mKeyMatchesCache.get(path);
            if (mKeyMatches == null) {
                mKey = patternKey.matcher(path);
                mKeyMatches = mKey.matches() ? Boolean.TRUE : Boolean.FALSE;
                mKeyMatchesCache.put(path, mKeyMatches);
            }
            if (!mKeyMatches.booleanValue()) {
                return null;
            }

            // Cache mkey.group(1)
            String mKeyGroup1 = mKeyGroup1Cache.get(path);
            if (mKeyGroup1 == null) {
                if (mKey == null) {
                    mKey = patternKey.matcher(path);
                    mKeyMatches = mKey.matches() ? Boolean.TRUE : Boolean.FALSE;
                    mKeyMatchesCache.put(path, mKeyMatches);
                    if (!mKeyMatches.booleanValue()) {
                        return null;
                    }
                }
                mKeyGroup1 = mKey.group(1);
                mKeyGroup1Cache.put(path, mKeyGroup1);
            }
            json = extract_json_withkey(json, mKeyGroup1);
        }
        // Cache indexList
        ArrayList<String> indexList = indexListCache.get(path);
        if (indexList == null) {
            Matcher mIndex = patternIndex.matcher(path);
            indexList = new ArrayList<String>();
            while (mIndex.find()) {
                indexList.add(mIndex.group(1));
            }
            indexListCache.put(path, indexList);
        }

        if (indexList.size() > 0) {
            json = extract_json_withindex(json, indexList);
        }

        return json;
    }

    //创建一个名为jsonList的AddingList对象,用于存储提取出来的JSON对象。
    private transient AddingList jsonList = new AddingList();

    //定义了一个静态内部类AddingList,继承自ArrayList<Object>,用于添加JSON对象到jsonList中。
    private static class AddingList extends ArrayList<Object> {
        private static final long serialVersionUID = 1L;

        @Override
        public Iterator<Object> iterator() {
            return Iterators.forArray(toArray());
        }
        @Override
        public void removeRange(int fromIndex, int toIndex) {
            super.removeRange(fromIndex, toIndex);
        }
    };

    //extract_json_withindex方法,根据JSON路径中的索引提取JSON对象。
    @SuppressWarnings("unchecked")
    private Object extract_json_withindex(Object json, ArrayList<String> indexList) {
        jsonList.clear();
        jsonList.add(json);
        for (String index : indexList) {
            int targets = jsonList.size();
            if (index.equalsIgnoreCase("*")) {
                for (Object array : jsonList) {
                    if (array instanceof List) {
                        for (int j = 0; j < ((List<Object>)array).size(); j++) {
                            jsonList.add(((List<Object>)array).get(j));
                        }
                    }
                }
            } else {
                for (Object array : jsonList) {
                    int indexValue = Integer.parseInt(index);
                    if (!(array instanceof List)) {
                        continue;
                    }
                    List<Object> list = (List<Object>) array;
                    if (indexValue >= list.size()) {
                        continue;
                    }
                    jsonList.add(list.get(indexValue));
                }
            }
            if (jsonList.size() == targets) {
                return null;
            }
            jsonList.removeRange(0, targets);
        }
        if (jsonList.isEmpty()) {
            return null;
        }
        return (jsonList.size() > 1) ? new ArrayList<Object>(jsonList) : jsonList.get(0);
    }

    //extract_json_withkey方法,根据JSON路径中的键提取JSON对象。
    @SuppressWarnings("unchecked")
    private Object extract_json_withkey(Object json, String path) {
        if (json instanceof List) {
            List<Object> jsonArray = new ArrayList<Object>();
            for (int i = 0; i < ((List<Object>) json).size(); i++) {
                Object json_elem = ((List<Object>) json).get(i);
                Object json_obj = null;
                if (json_elem instanceof Map) {
                    json_obj = ((Map<String, Object>) json_elem).get(path);
                } else {
                    continue;
                }
                if (json_obj instanceof List) {
                    for (int j = 0; j < ((List<Object>) json_obj).size(); j++) {
                        jsonArray.add(((List<Object>) json_obj).get(j));
                    }
                } else if (json_obj != null) {
                    jsonArray.add(json_obj);
                }
            }
            return (jsonArray.size() == 0) ? null : jsonArray;
        } else if (json instanceof Map) {
            return ((Map<String, Object>) json).get(path);
        } else {
            return null;
        }
    }
}

代码做了一些注释,我们可以发现 private final Pattern patternKey = Pattern.compile("^([a-zA-Z0-9_\\-\\:\\s]+).*"); 这个就是匹配 key 的模式串,它的意思是匹配以数字、字母、_、-、:、空格 为开头的字符串。那么这个匹配模式串就决定了,get_json_object 函数无法匹配出 key 中带中文的键值对,即select get_json_object('{ "姓名":"张三" , "年龄":"18" }', '$.姓名'); 结果为 null;而 select get_json_object('{ "abc姓名":"张三" , "abc":"18" }', '$.abc姓名'); 中只能匹配以数字、字母、_、-、:、空格 为开头的字符串,所以将 abc姓名 中的 abc 当作 key 去取 value 值,所以得到的值为18

解决办法

知道问题的原因了,那么我们怎么解决这个问题呢,其实很简单,我们只需要修改代码中匹配 key 的正则表达式就可以了。
其实我们可以将 get_json_object 函数的源码拿出来重新写一个 UDF 函数就可以了。

Hive-2.1.1 版本

需要注意自己 Hive 的版本,我们以 Hive-2.1.1 版本为例,代码如下:

package com.yan.hive.udf;

import java.util.ArrayList;
import java.util.Iterator;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.Map;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import com.google.common.collect.Iterators;
import org.apache.hadoop.hive.ql.exec.Description;
import org.apache.hadoop.hive.ql.exec.UDF;
import org.apache.hadoop.io.Text;
import org.codehaus.jackson.JsonFactory;
import org.codehaus.jackson.JsonParser.Feature;
import org.codehaus.jackson.map.ObjectMapper;
import org.codehaus.jackson.map.type.TypeFactory;
import org.codehaus.jackson.type.JavaType;

/**
 * UDFJson.
 *
 */
@Description(name = "get_json_object_cn",
        value = "_FUNC_(json_txt, path) - Extract a json object from path ",
        extended = "Extract json object from a json string based on json path "
                + "specified, and return json string of the extracted json object. It "
                + "will return null if the input json string is invalid.\n"
                + "A limited version of JSONPath supported:\n"
                + "  $   : Root object\n"
                + "  .   : Child operator\n"
                + "  []  : Subscript operator for array\n"
                + "  *   : Wildcard for []\n"
                + "Syntax not supported that's worth noticing:\n"
                + "  ''  : Zero length string as key\n"
                + "  ..  : Recursive descent\n"
                + "  &amp;#064;   : Current object/element\n"
                + "  ()  : Script expression\n"
                + "  ?() : Filter (script) expression.\n"
                + "  [,] : Union operator\n"
                + "  [start:end:step] : array slice operator\n")
public class UDFGetJsonObjectCN extends UDF {
    //private final Pattern patternKey = Pattern.compile("^([a-zA-Z0-9_\\-\\:\\s]+).*");
    private final Pattern patternKey = Pattern.compile("^([^\\[\\]]+).*");
    private final Pattern patternIndex = Pattern.compile("\\[([0-9]+|\\*)\\]");

    private static final JsonFactory JSON_FACTORY = new JsonFactory();
    static {
        // Allows for unescaped ASCII control characters in JSON values
        JSON_FACTORY.enable(Feature.ALLOW_UNQUOTED_CONTROL_CHARS);
        // Enabled to accept quoting of all character backslash qooting mechanism
        JSON_FACTORY.enable(Feature.ALLOW_BACKSLASH_ESCAPING_ANY_CHARACTER);
    }
    private static final ObjectMapper MAPPER = new ObjectMapper(JSON_FACTORY);
    private static final JavaType MAP_TYPE = TypeFactory.fromClass(Map.class);
    private static final JavaType LIST_TYPE = TypeFactory.fromClass(List.class);

    // An LRU cache using a linked hash map
    static class HashCache<K, V> extends LinkedHashMap<K, V> {

        private static final int CACHE_SIZE = 16;
        private static final int INIT_SIZE = 32;
        private static final float LOAD_FACTOR = 0.6f;

        HashCache() {
            super(INIT_SIZE, LOAD_FACTOR);
        }

        private static final long serialVersionUID = 1;

        @Override
        protected boolean removeEldestEntry(Map.Entry<K, V> eldest) {
            return size() > CACHE_SIZE;
        }

    }

    static Map<String, Object> extractObjectCache = new HashCache<String, Object>();
    static Map<String, String[]> pathExprCache = new HashCache<String, String[]>();
    static Map<String, ArrayList<String>> indexListCache =
            new HashCache<String, ArrayList<String>>();
    static Map<String, String> mKeyGroup1Cache = new HashCache<String, String>();
    static Map<String, Boolean> mKeyMatchesCache = new HashCache<String, Boolean>();

    Text result = new Text();

    public UDFGetJsonObjectCN() {
    }

    /**
     * Extract json object from a json string based on json path specified, and
     * return json string of the extracted json object. It will return null if the
     * input json string is invalid.
     *
     * A limited version of JSONPath supported: $ : Root object . : Child operator
     * [] : Subscript operator for array * : Wildcard for []
     *
     * Syntax not supported that's worth noticing: '' : Zero length string as key
     * .. : Recursive descent &amp;#064; : Current object/element () : Script
     * expression ?() : Filter (script) expression. [,] : Union operator
     * [start:end:step] : array slice operator
     *
     * @param jsonString
     *          the json string.
     * @param pathString
     *          the json path expression.
     * @return json string or null when an error happens.
     */
    public Text evaluate(String jsonString, String pathString) {

        if (jsonString == null || jsonString.isEmpty() || pathString == null
                || pathString.isEmpty() || pathString.charAt(0) != '$') {
            return null;
        }

        int pathExprStart = 1;
        boolean isRootArray = false;

        if (pathString.length() > 1) {
            if (pathString.charAt(1) == '[') {
                pathExprStart = 0;
                isRootArray = true;
            } else if (pathString.charAt(1) == '.') {
                isRootArray = pathString.length() > 2 && pathString.charAt(2) == '[';
            } else {
                return null;
            }
        }

        // Cache pathExpr
        String[] pathExpr = pathExprCache.get(pathString);
        if (pathExpr == null) {
            pathExpr = pathString.split("\\.", -1);
            pathExprCache.put(pathString, pathExpr);
        }

        // Cache extractObject
        Object extractObject = extractObjectCache.get(jsonString);
        if (extractObject == null) {
            JavaType javaType = isRootArray ? LIST_TYPE : MAP_TYPE;
            try {
                extractObject = MAPPER.readValue(jsonString, javaType);
            } catch (Exception e) {
                return null;
            }
            extractObjectCache.put(jsonString, extractObject);
        }
        for (int i = pathExprStart; i < pathExpr.length; i++) {
            if (extractObject == null) {
                return null;
            }
            extractObject = extract(extractObject, pathExpr[i], i == pathExprStart && isRootArray);
        }
        if (extractObject instanceof Map || extractObject instanceof List) {
            try {
                result.set(MAPPER.writeValueAsString(extractObject));
            } catch (Exception e) {
                return null;
            }
        } else if (extractObject != null) {
            result.set(extractObject.toString());
        } else {
            return null;
        }
        return result;
    }

    private Object extract(Object json, String path, boolean skipMapProc) {
        // skip MAP processing for the first path element if root is array
        if (!skipMapProc) {
            // Cache patternkey.matcher(path).matches()
            Matcher mKey = null;
            Boolean mKeyMatches = mKeyMatchesCache.get(path);
            if (mKeyMatches == null) {
                mKey = patternKey.matcher(path);
                mKeyMatches = mKey.matches() ? Boolean.TRUE : Boolean.FALSE;
                mKeyMatchesCache.put(path, mKeyMatches);
            }
            if (!mKeyMatches.booleanValue()) {
                return null;
            }

            // Cache mkey.group(1)
            String mKeyGroup1 = mKeyGroup1Cache.get(path);
            if (mKeyGroup1 == null) {
                if (mKey == null) {
                    mKey = patternKey.matcher(path);
                    mKeyMatches = mKey.matches() ? Boolean.TRUE : Boolean.FALSE;
                    mKeyMatchesCache.put(path, mKeyMatches);
                    if (!mKeyMatches.booleanValue()) {
                        return null;
                    }
                }
                mKeyGroup1 = mKey.group(1);
                mKeyGroup1Cache.put(path, mKeyGroup1);
            }
            json = extract_json_withkey(json, mKeyGroup1);
        }
        // Cache indexList
        ArrayList<String> indexList = indexListCache.get(path);
        if (indexList == null) {
            Matcher mIndex = patternIndex.matcher(path);
            indexList = new ArrayList<String>();
            while (mIndex.find()) {
                indexList.add(mIndex.group(1));
            }
            indexListCache.put(path, indexList);
        }

        if (indexList.size() > 0) {
            json = extract_json_withindex(json, indexList);
        }

        return json;
    }

    private transient AddingList jsonList = new AddingList();

    private static class AddingList extends ArrayList<Object> {
        @Override
        public Iterator<Object> iterator() {
            return Iterators.forArray(toArray());
        }
        @Override
        public void removeRange(int fromIndex, int toIndex) {
            super.removeRange(fromIndex, toIndex);
        }
    };

    @SuppressWarnings("unchecked")
    private Object extract_json_withindex(Object json, ArrayList<String> indexList) {

        jsonList.clear();
        jsonList.add(json);
        for (String index : indexList) {
            int targets = jsonList.size();
            if (index.equalsIgnoreCase("*")) {
                for (Object array : jsonList) {
                    if (array instanceof List) {
                        for (int j = 0; j < ((List<Object>)array).size(); j++) {
                            jsonList.add(((List<Object>)array).get(j));
                        }
                    }
                }
            } else {
                for (Object array : jsonList) {
                    int indexValue = Integer.parseInt(index);
                    if (!(array instanceof List)) {
                        continue;
                    }
                    List<Object> list = (List<Object>) array;
                    if (indexValue >= list.size()) {
                        continue;
                    }
                    jsonList.add(list.get(indexValue));
                }
            }
            if (jsonList.size() == targets) {
                return null;
            }
            jsonList.removeRange(0, targets);
        }
        if (jsonList.isEmpty()) {
            return null;
        }
        return (jsonList.size() > 1) ? new ArrayList<Object>(jsonList) : jsonList.get(0);
    }

    @SuppressWarnings("unchecked")
    private Object extract_json_withkey(Object json, String path) {
        if (json instanceof List) {
            List<Object> jsonArray = new ArrayList<Object>();
            for (int i = 0; i < ((List<Object>) json).size(); i++) {
                Object json_elem = ((List<Object>) json).get(i);
                Object json_obj = null;
                if (json_elem instanceof Map) {
                    json_obj = ((Map<String, Object>) json_elem).get(path);
                } else {
                    continue;
                }
                if (json_obj instanceof List) {
                    for (int j = 0; j < ((List<Object>) json_obj).size(); j++) {
                        jsonArray.add(((List<Object>) json_obj).get(j));
                    }
                } else if (json_obj != null) {
                    jsonArray.add(json_obj);
                }
            }
            return (jsonArray.size() == 0) ? null : jsonArray;
        } else if (json instanceof Map) {
            return ((Map<String, Object>) json).get(path);
        } else {
            return null;
        }
    }
}

需要导入的依赖,要和自己集群的版本契合,Hadoop 的版本及 Hive 的版本。

<?xml version="1.0" encoding="UTF-8"?>
<project xmlns="http://maven.apache.org/POM/4.0.0"
         xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
    <modelVersion>4.0.0</modelVersion>

    <groupId>com.atguigu.hive</groupId>
    <artifactId>hivetest</artifactId>
    <version>1.0-SNAPSHOT</version>

    <properties>
        <hadoop.version>3.0.0</hadoop.version>
        <hive.version>2.1.1</hive.version>
        <jackson.version>1.9.2</jackson.version>
        <guava.version>14.0.1</guava.version>
    </properties>

    <dependencies>
        <dependency>
            <groupId>org.apache.hive</groupId>
            <artifactId>hive-exec</artifactId>
            <version>${hive.version}</version>
        </dependency>

        <!-- https://mvnrepository.com/artifact/org.apache.hive/hive-jdbc -->
        <dependency>
            <groupId>org.apache.hive</groupId>
            <artifactId>hive-jdbc</artifactId>
            <version>${hive.version}</version>
        </dependency>


        <dependency>
            <groupId>org.apache.hadoop</groupId>
            <artifactId>hadoop-common</artifactId>
            <version>${hadoop.version}</version>
        </dependency>

        <dependency>
            <groupId>com.google.guava</groupId>
            <artifactId>guava</artifactId>
            <version>${guava.version}</version>
        </dependency>

        <dependency>
            <groupId>org.codehaus.jackson</groupId>
            <artifactId>jackson-core-asl</artifactId>
            <version>${jackson.version}</version>
        </dependency>
        <dependency>
            <groupId>org.codehaus.jackson</groupId>
            <artifactId>jackson-mapper-asl</artifactId>
            <version>${jackson.version}</version>
        </dependency>
        <dependency>
            <groupId>org.codehaus.jackson</groupId>
            <artifactId>jackson-jaxrs</artifactId>
            <version>${jackson.version}</version>
        </dependency>
        <dependency>
            <groupId>org.codehaus.jackson</groupId>
            <artifactId>jackson-xc</artifactId>
            <version>${jackson.version}</version>
        </dependency>


    </dependencies>
    
</project>

注意: 因为上述UDF中也用到了 com.google.guavaorg.codehaus.jackson,所以这两个依赖要和 hive 版本中所用的依赖版本一致。

Hive-4.0.0 版本

package com.yan.hive.udf;

import java.util.ArrayList;
import java.util.Iterator;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.Map;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import com.fasterxml.jackson.core.json.JsonReadFeature;
import com.fasterxml.jackson.databind.JavaType;
import com.fasterxml.jackson.databind.ObjectMapper;
import com.google.common.collect.Iterators;
import org.apache.hadoop.hive.ql.exec.Description;
import org.apache.hadoop.hive.ql.exec.UDF;
import org.apache.hadoop.io.Text;

/**
 * @author Yan
 * @create 2023-08-05 22:21
 * hive解析json中文key
 */
@Description(name = "get_json_object_cn",
        value = "_FUNC_(json_txt, path) - Extract a json object from path ",
        extended = "Extract json object from a json string based on json path "
                + "specified, and return json string of the extracted json object. It "
                + "will return null if the input json string is invalid.\n"
                + "A limited version of JSONPath supported:\n"
                + "  $   : Root object\n"
                + "  .   : Child operator\n"
                + "  []  : Subscript operator for array\n"
                + "  *   : Wildcard for []\n"
                + "Syntax not supported that's worth noticing:\n"
                + "  ''  : Zero length string as key\n"
                + "  ..  : Recursive descent\n"
                + "  &amp;#064;   : Current object/element\n"
                + "  ()  : Script expression\n"
                + "  ?() : Filter (script) expression.\n"
                + "  [,] : Union operator\n"
                + "  [start:end:step] : array slice operator\n")

//定义了一个名为UDFJson的类,继承自UDF类。
public class UDFGetJsonObjectCN extends UDF {
    //定义一个静态正则表达式模式,用于匹配JSON路径中的键。
    //匹配英文key:匹配一个或多个大写字母、小写字母、数字、下划线、连字符、冒号或空格。
    //private static final Pattern patternKey = Pattern.compile("^([a-zA-Z0-9_\\-\\:\\s]+).*");
    //可以匹配中文,\\p{L}来匹配任意Unicode字母字符,包括中文字符:英文、数字、下划线、连字符、冒号、空格和中文字符。
    //private static final Pattern patternKey = Pattern.compile("^([a-zA-Z0-9_\\-\\:\\s\\p{L}]+).*");
    //可以匹配中文,\\p{L}来匹配任意Unicode字母字符,包括中文字符,但不包含特殊字符,特殊字符需自己添加
    //private static final Pattern patternKey = Pattern.compile("^([a-zA-Z0-9_\\-\\:\\s?%*+\\p{L}]+).*");
    //可以匹配中文,包含特殊字符,但不包含英文下的点(.);还有就是匹配不到路径中的索引了
    //private static final Pattern patternKey = Pattern.compile("^(.+).*");
    //可以匹配中文,包含特殊字符,不包中括号"[]",但不包含英文下的点(.);这样就可以匹配路径中的索引了
    private static final Pattern patternKey = Pattern.compile("^([^\\[\\]]+).*");
    //定义一个静态正则表达式模式,用于匹配JSON路径中的索引。
    private static final Pattern patternIndex = Pattern.compile("\\[([0-9]+|\\*)\\]");

    //创建一个ObjectMapper对象,用于解析JSON字符串。
    private static final ObjectMapper objectMapper = new ObjectMapper();
    //创建一个JavaType对象,用于表示Map类型。
    private static final JavaType MAP_TYPE = objectMapper.getTypeFactory().constructType(Map.class);
    //创建一个JavaType对象,用于表示List类型。
    private static final JavaType LIST_TYPE = objectMapper.getTypeFactory().constructType(List.class);


    //静态代码块,用于配置ObjectMapper的一些特性。
    static {
        // Allows for unescaped ASCII control characters in JSON values
        objectMapper.enable(JsonReadFeature.ALLOW_UNESCAPED_CONTROL_CHARS.mappedFeature());
        // Enabled to accept quoting of all character backslash qooting mechanism
        objectMapper.enable(JsonReadFeature.ALLOW_BACKSLASH_ESCAPING_ANY_CHARACTER.mappedFeature());
    }

    // An LRU cache using a linked hash map
    //定义了一个静态内部类HashCache,用作LRU缓存。
    static class HashCache<K, V> extends LinkedHashMap<K, V> {

        private static final int CACHE_SIZE = 16;
        private static final int INIT_SIZE = 32;
        private static final float LOAD_FACTOR = 0.6f;

        HashCache() {
            super(INIT_SIZE, LOAD_FACTOR);
        }

        private static final long serialVersionUID = 1;

        @Override
        protected boolean removeEldestEntry(Map.Entry<K, V> eldest) {
            return size() > CACHE_SIZE;
        }

    }

    //声明了一个名为extractObjectCache的HashMap对象,用于缓存已提取的JSON对象。
    Map<String, Object> extractObjectCache = new HashCache<String, Object>();
    //声明了一个名为pathExprCache的HashMap对象,用于缓存已解析的JSON路径表达式。
    Map<String, String[]> pathExprCache = new HashCache<String, String[]>();
    //声明了一个名为indexListCache的HashMap对象,用于缓存已解析的JSON路径中的索引列表。
    Map<String, ArrayList<String>> indexListCache =
            new HashCache<String, ArrayList<String>>();
    //声明了一个名为mKeyGroup1Cache的HashMap对象,用于缓存JSON路径中的键。
    Map<String, String> mKeyGroup1Cache = new HashCache<String, String>();
    //声明了一个名为mKeyMatchesCache的HashMap对象,用于缓存JSON路径中的键是否匹配的结果。
    Map<String, Boolean> mKeyMatchesCache = new HashCache<String, Boolean>();

    //构造函数,没有参数。
    public UDFGetJsonObjectCN() {
    }

    /**
     * Extract json object from a json string based on json path specified, and
     * return json string of the extracted json object. It will return null if the
     * input json string is invalid.
     *
     * A limited version of JSONPath supported: $ : Root object . : Child operator
     * [] : Subscript operator for array * : Wildcard for []
     *
     * Syntax not supported that's worth noticing: '' : Zero length string as key
     * .. : Recursive descent &amp;#064; : Current object/element () : Script
     * expression ?() : Filter (script) expression. [,] : Union operator
     * [start:end:step] : array slice operator
     *
     * @param jsonString
     *          the json string.
     * @param pathString
     *          the json path expression.
     * @return json string or null when an error happens.
     */
    //evaluate方法,用于提取指定路径的JSON对象并返回JSON字符串。
    public Text evaluate(String jsonString, String pathString) {
        if (jsonString == null || jsonString.isEmpty() || pathString == null
                || pathString.isEmpty() || pathString.charAt(0) != '$') {
            return null;
        }

        int pathExprStart = 1;
        boolean unknownType = pathString.equals("$");
        boolean isRootArray = false;

        if (pathString.length() > 1) {
            if (pathString.charAt(1) == '[') {
                pathExprStart = 0;
                isRootArray = true;
            } else if (pathString.charAt(1) == '.') {
                isRootArray = pathString.length() > 2 && pathString.charAt(2) == '[';
            } else {
                return null;
            }
        }

        // Cache pathExpr
        String[] pathExpr = pathExprCache.get(pathString);
        if (pathExpr == null) {
            pathExpr = pathString.split("\\.", -1);
            pathExprCache.put(pathString, pathExpr);
        }

        // Cache extractObject
        Object extractObject = extractObjectCache.get(jsonString);
        if (extractObject == null) {
            if (unknownType) {
                try {
                    extractObject = objectMapper.readValue(jsonString, LIST_TYPE);
                } catch (Exception e) {
                    // Ignore exception
                }
                if (extractObject == null) {
                    try {
                        extractObject = objectMapper.readValue(jsonString, MAP_TYPE);
                    } catch (Exception e) {
                        return null;
                    }
                }
            } else {
                JavaType javaType = isRootArray ? LIST_TYPE : MAP_TYPE;
                try {
                    extractObject = objectMapper.readValue(jsonString, javaType);
                } catch (Exception e) {
                    return null;
                }
            }
            extractObjectCache.put(jsonString, extractObject);
        }

        for (int i = pathExprStart; i < pathExpr.length; i++) {
            if (extractObject == null) {
                return null;
            }
            extractObject = extract(extractObject, pathExpr[i], i == pathExprStart && isRootArray);
        }

        Text result = new Text();
        if (extractObject instanceof Map || extractObject instanceof List) {
            try {
                result.set(objectMapper.writeValueAsString(extractObject));
            } catch (Exception e) {
                return null;
            }
        } else if (extractObject != null) {
            result.set(extractObject.toString());
        } else {
            return null;
        }
        return result;
    }

    //extract方法,递归地提取JSON对象。
    private Object extract(Object json, String path, boolean skipMapProc) {
        // skip MAP processing for the first path element if root is array
        if (!skipMapProc) {
            // Cache patternkey.matcher(path).matches()
            Matcher mKey = null;
            Boolean mKeyMatches = mKeyMatchesCache.get(path);
            if (mKeyMatches == null) {
                mKey = patternKey.matcher(path);
                mKeyMatches = mKey.matches() ? Boolean.TRUE : Boolean.FALSE;
                mKeyMatchesCache.put(path, mKeyMatches);
            }
            if (!mKeyMatches.booleanValue()) {
                return null;
            }

            // Cache mkey.group(1)
            String mKeyGroup1 = mKeyGroup1Cache.get(path);
            if (mKeyGroup1 == null) {
                if (mKey == null) {
                    mKey = patternKey.matcher(path);
                    mKeyMatches = mKey.matches() ? Boolean.TRUE : Boolean.FALSE;
                    mKeyMatchesCache.put(path, mKeyMatches);
                    if (!mKeyMatches.booleanValue()) {
                        return null;
                    }
                }
                mKeyGroup1 = mKey.group(1);
                mKeyGroup1Cache.put(path, mKeyGroup1);
            }
            json = extract_json_withkey(json, mKeyGroup1);
        }
        // Cache indexList
        ArrayList<String> indexList = indexListCache.get(path);
        if (indexList == null) {
            Matcher mIndex = patternIndex.matcher(path);
            indexList = new ArrayList<String>();
            while (mIndex.find()) {
                indexList.add(mIndex.group(1));
            }
            indexListCache.put(path, indexList);
        }

        if (indexList.size() > 0) {
            json = extract_json_withindex(json, indexList);
        }

        return json;
    }

    //创建一个名为jsonList的AddingList对象,用于存储提取出来的JSON对象。
    private transient AddingList jsonList = new AddingList();

    //定义了一个静态内部类AddingList,继承自ArrayList<Object>,用于添加JSON对象到jsonList中。
    private static class AddingList extends ArrayList<Object> {
        private static final long serialVersionUID = 1L;

        @Override
        public Iterator<Object> iterator() {
            return Iterators.forArray(toArray());
        }
        @Override
        public void removeRange(int fromIndex, int toIndex) {
            super.removeRange(fromIndex, toIndex);
        }
    };

    //extract_json_withindex方法,根据JSON路径中的索引提取JSON对象。
    @SuppressWarnings("unchecked")
    private Object extract_json_withindex(Object json, ArrayList<String> indexList) {
        jsonList.clear();
        jsonList.add(json);
        for (String index : indexList) {
            int targets = jsonList.size();
            if (index.equalsIgnoreCase("*")) {
                for (Object array : jsonList) {
                    if (array instanceof List) {
                        for (int j = 0; j < ((List<Object>)array).size(); j++) {
                            jsonList.add(((List<Object>)array).get(j));
                        }
                    }
                }
            } else {
                for (Object array : jsonList) {
                    int indexValue = Integer.parseInt(index);
                    if (!(array instanceof List)) {
                        continue;
                    }
                    List<Object> list = (List<Object>) array;
                    if (indexValue >= list.size()) {
                        continue;
                    }
                    jsonList.add(list.get(indexValue));
                }
            }
            if (jsonList.size() == targets) {
                return null;
            }
            jsonList.removeRange(0, targets);
        }
        if (jsonList.isEmpty()) {
            return null;
        }
        return (jsonList.size() > 1) ? new ArrayList<Object>(jsonList) : jsonList.get(0);
    }

    //extract_json_withkey方法,根据JSON路径中的键提取JSON对象。
    @SuppressWarnings("unchecked")
    private Object extract_json_withkey(Object json, String path) {
        if (json instanceof List) {
            List<Object> jsonArray = new ArrayList<Object>();
            for (int i = 0; i < ((List<Object>) json).size(); i++) {
                Object json_elem = ((List<Object>) json).get(i);
                Object json_obj = null;
                if (json_elem instanceof Map) {
                    json_obj = ((Map<String, Object>) json_elem).get(path);
                } else {
                    continue;
                }
                if (json_obj instanceof List) {
                    for (int j = 0; j < ((List<Object>) json_obj).size(); j++) {
                        jsonArray.add(((List<Object>) json_obj).get(j));
                    }
                } else if (json_obj != null) {
                    jsonArray.add(json_obj);
                }
            }
            return (jsonArray.size() == 0) ? null : jsonArray;
        } else if (json instanceof Map) {
            return ((Map<String, Object>) json).get(path);
        } else {
            return null;
        }
    }
}

依赖

<?xml version="1.0" encoding="UTF-8"?>
<project xmlns="http://maven.apache.org/POM/4.0.0"
         xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
    <modelVersion>4.0.0</modelVersion>

    <groupId>com.atguigu.hive</groupId>
    <artifactId>hivetest</artifactId>
    <version>1.0-SNAPSHOT</version>

    <properties>
        <hadoop.version>3.3.1</hadoop.version>
        <hive.version>4.0.0</hive.version>
		<jackson.version>2.13.5</jackson.version>
		<guava.version>22.0</guava.version>
    </properties>

    <dependencies>
    
      	<dependency>
        	<groupId>com.fasterxml.jackson</groupId>
        	<artifactId>jackson-bom</artifactId>
        	<version>${jackson.version}</version>
        	<type>pom</type>
        	<scope>import</scope>
      	</dependency>
      
      	<dependency>
        	<groupId>com.google.guava</groupId>
        	<artifactId>guava</artifactId>
        	<version>${guava.version}</version>
      	</dependency>

        <dependency>
            <groupId>com.fasterxml.jackson.core</groupId>
            <artifactId>jackson-core</artifactId>
            <version>${jackson.version}</version>
        </dependency>

        <dependency>
            <groupId>com.fasterxml.jackson.core</groupId>
            <artifactId>jackson-databind</artifactId>
            <version>${jackson.version}</version>
        </dependency>


        <dependency>
            <groupId>org.apache.hive</groupId>
            <artifactId>hive-jdbc</artifactId>
            <version>${hive.version}</version>
        </dependency>

    </dependencies>
    
</project>

Hive 版本不同,之间的依赖可能就有些许差距,如果不注意的话可能会报依赖错误。

参考文章:

get_json_object不能解析json里面中文的key

get_json_object源码

impala&hive自定义UDF解析json中文key

  • 1
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值