Parsing HiveServer2 Logs in Java: Analyzing Hive SQL to Count Table Usage (Popularity)
First, read the hiveserver2 log line by line.
In the log, every SQL statement is preceded by the keyword Executing command,
so start by matching on that keyword:
if (str.contains("Executing command"))
Each matching line begins with a fixed header, which can be stripped with a regular expression:
String sqlRegex = ".*Executing command\\(queryId=[\\w-]*\\):";
String sql = str.replaceAll(sqlRegex, " ").trim();
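Putting the two steps together, a minimal reading loop might look like the sketch below. The class name and log path are placeholders, and the sketch assumes each statement fits on a single log line (multi-line statements would need extra handling):

import java.io.BufferedReader;
import java.io.IOException;
import java.nio.file.Files;
import java.nio.file.Paths;
import java.util.ArrayList;
import java.util.List;

public class LogSqlExtractor {
    private static final String SQL_REGEX = ".*Executing command\\(queryId=[\\w-]*\\):";

    // Collects every SQL statement logged behind an "Executing command" marker.
    public static List<String> extractSqls(String logPath) throws IOException {
        List<String> sqls = new ArrayList<>();
        try (BufferedReader reader = Files.newBufferedReader(Paths.get(logPath))) {
            String str;
            while ((str = reader.readLine()) != null) {
                if (str.contains("Executing command")) {
                    // Strip the fixed line header so only the SQL text remains.
                    sqls.add(str.replaceAll(SQL_REGEX, " ").trim());
                }
            }
        }
        return sqls;
    }
}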
Below is the utility class that parses the SQL out of the log. First, import the dependencies:
<properties>
    <hive.version>1.1.0</hive.version>
    <hadoop.version>2.6.0</hadoop.version>
    <poi.version>4.1.2</poi.version>
</properties>
<dependencies>
    <dependency>
        <groupId>org.apache.poi</groupId>
        <artifactId>poi</artifactId>
        <version>${poi.version}</version>
    </dependency>
    <dependency>
        <groupId>org.apache.poi</groupId>
        <artifactId>poi-ooxml</artifactId>
        <version>${poi.version}</version>
    </dependency>
    <dependency>
        <groupId>org.apache.poi</groupId>
        <artifactId>poi-ooxml-schemas</artifactId>
        <version>${poi.version}</version>
    </dependency>
    <dependency>
        <groupId>org.apache.poi</groupId>
        <artifactId>poi-scratchpad</artifactId>
        <version>${poi.version}</version>
    </dependency>
    <dependency>
        <groupId>junit</groupId>
        <artifactId>junit</artifactId>
        <version>4.12</version>
    </dependency>
    <dependency>
        <groupId>org.apache.hive</groupId>
        <artifactId>hive-exec</artifactId>
        <version>${hive.version}</version>
        <scope>compile</scope>
    </dependency>
    <dependency>
        <groupId>org.apache.hadoop</groupId>
        <artifactId>hadoop-common</artifactId>
        <version>${hadoop.version}</version>
    </dependency>
    <dependency>
        <groupId>org.apache.hadoop</groupId>
        <artifactId>hadoop-mapreduce-client-core</artifactId>
        <version>${hadoop.version}</version>
    </dependency>
    <!-- https://mvnrepository.com/artifact/com.alibaba/fastjson -->
    <dependency>
        <groupId>com.alibaba</groupId>
        <artifactId>fastjson</artifactId>
        <version>1.2.58</version>
    </dependency>
    <dependency>
        <groupId>commons-httpclient</groupId>
        <artifactId>commons-httpclient</artifactId>
        <version>3.1</version>
    </dependency>
    <dependency>
        <groupId>mysql</groupId>
        <artifactId>mysql-connector-java</artifactId>
        <version>5.1.36</version>
    </dependency>
</dependencies>
Dependencies such as the MySQL connector can be dropped if you do not need them; a few are only included because later posts in this series will use them, so I am listing them all here at once.
Below is the code for the Hive log parsing utility class:
import org.apache.hadoop.hive.ql.parse.*;
import java.util.*;
/**
* Purpose: extract the tables and columns in the AST, together with the operation applied to them, e.g. SELECT, INSERT.
* Focus: table and column handling for SELECT operations; other operations are only resolved down to the table level.
* Approach: depth-first traversal of the AST. When an operation token is met, record the current operation;
* when TOK_TAB or TOK_TABREF is met, resolve the table the current operation applies to; when a subquery is met,
* push the current state onto a stack, process the subquery, then pop the state back.
*
*/
public class HiveParseUtils {
private static final String UNKNOWN = "UNKNOWN";
private static Map<String, String> alias = new HashMap<>();
private static Map<String, String> cols = new TreeMap<>();
private static Map<String, String> colAlais = new TreeMap<>();
public static List<String> tables = new ArrayList<>();
private static Stack<String> tableNameStack = new Stack<>();
private static Stack<Oper> operStack = new Stack<>();
private static String nowQueryTable = "";// loosely defined: more than one table may be under processing at once; a per-query/FROM-node table set would be cleaner
private static Oper oper;
private static boolean joinClause = false;
private enum Oper {
SELECT, INSERT, DROP, TRUNCATE, LOAD, CREATETABLE, ALTER
}
public static Set<String> parseIteral(ASTNode ast) {
Set<String> set = new HashSet<>();// tables referenced by the current query
prepareToParseCurrentNodeAndChilds(ast);
set.addAll(parseChildNodes(ast));
set.addAll(parseCurrentNode(ast ,set));
endParseCurrentNode(ast);
return set;
}
private static void endParseCurrentNode(ASTNode ast){
if (ast.getToken() != null) {
switch (ast.getToken().getType()) {// a join clause ends here: clear the join flag
case HiveParser.TOK_RIGHTOUTERJOIN:
case HiveParser.TOK_LEFTOUTERJOIN:
case HiveParser.TOK_JOIN:
joinClause = false;
break;
case HiveParser.TOK_QUERY:
break;
case HiveParser.TOK_INSERT:
case HiveParser.TOK_SELECT:
nowQueryTable = tableNameStack.pop();
oper = operStack.pop();
break;
}
}
}
private static Set<String> parseCurrentNode(ASTNode ast, Set<String> set){
if (ast.getToken() != null) {
switch (ast.getToken().getType()) {
case HiveParser.TOK_TABLE_PARTITION:
if (ast.getChildCount() != 2) {
String table = BaseSemanticAnalyzer
.getUnescapedName((ASTNode) ast.getChild(0));
if (oper == Oper.SELECT) {
nowQueryTable = table;
}
tables.add(table + " " + oper);
}
break;
case HiveParser.TOK_TAB:// outputTable
String tableTab = BaseSemanticAnalyzer
.getUnescapedName((ASTNode) ast.getChild(0));
if (oper == Oper.SELECT) {
nowQueryTable = tableTab;
}
tables.add(tableTab + " " + oper);
break;
case HiveParser.TOK_TABREF:// inputTable
ASTNode tabTree = (ASTNode) ast.getChild(0);
String tableName = (tabTree.getChildCount() == 1) ? BaseSemanticAnalyzer
.getUnescapedName((ASTNode) tabTree.getChild(0))
: BaseSemanticAnalyzer
.getUnescapedName((ASTNode) tabTree.getChild(0))
+ "." + tabTree.getChild(1);
if (oper == Oper.SELECT) {
if(joinClause && !"".equals(nowQueryTable) ){
nowQueryTable += "&"+tableName;//
}else{
nowQueryTable = tableName;
}
set.add(tableName);
}
tables.add(tableName + " " + oper);
if (ast.getChild(1) != null) {
String alia = ast.getChild(1).getText().toLowerCase();
alias.put(alia, tableName);
}
break;
case HiveParser.TOK_TABLE_OR_COL:
if (ast.getParent().getType() != HiveParser.DOT) {
String col = ast.getChild(0).getText().toLowerCase();
if (alias.get(col) == null
&& colAlais.get(nowQueryTable + "." + col) == null) {
if(nowQueryTable.indexOf("&") > 0){//sql23
cols.put(UNKNOWN + "." + col, "");
}else{
cols.put(nowQueryTable + "." + col, "");
}
}
}
break;
case HiveParser.TOK_ALLCOLREF:
cols.put(nowQueryTable + ".*", "");
break;
case HiveParser.TOK_SUBQUERY:
if (ast.getChildCount() == 2) {
String tableAlias = unescapeIdentifier(ast.getChild(1)
.getText());
String aliaReal = "";
for(String table : set){
aliaReal+=table+"&";
}
if(aliaReal.length() !=0){
aliaReal = aliaReal.substring(0, aliaReal.length()-1);
}
alias.put(tableAlias, aliaReal);
}
break;
case HiveParser.TOK_SELEXPR:
if (ast.getChild(0).getType() == HiveParser.TOK_TABLE_OR_COL) {
String column = ast.getChild(0).getChild(0).getText()
.toLowerCase();
if(nowQueryTable.indexOf("&") > 0){
cols.put(UNKNOWN + "." + column, "");
}else if (colAlais.get(nowQueryTable + "." + column) == null) {
cols.put(nowQueryTable + "." + column, "");
}
} else if (ast.getChild(1) != null) {
String columnAlia = ast.getChild(1).getText().toLowerCase();
colAlais.put(nowQueryTable + "." + columnAlia, "");
}
break;
case HiveParser.DOT:
if (ast.getType() == HiveParser.DOT) {
if (ast.getChildCount() == 2) {
if (ast.getChild(0).getType() == HiveParser.TOK_TABLE_OR_COL
&& ast.getChild(0).getChildCount() == 1
&& ast.getChild(1).getType() == HiveParser.Identifier) {
String alia = BaseSemanticAnalyzer
.unescapeIdentifier(ast.getChild(0)
.getChild(0).getText()
.toLowerCase());
String column = BaseSemanticAnalyzer
.unescapeIdentifier(ast.getChild(1)
.getText().toLowerCase());
String realTable = null;
if (!tables.contains(alia + " " + oper)
&& alias.get(alia) == null) {// e.g. tables may hold entries like [b SELECT, a SELECT]
alias.put(alia, nowQueryTable);
}
if (tables.contains(alia + " " + oper)) {
realTable = alia;
} else if (alias.get(alia) != null) {
realTable = alias.get(alia);
}
if (realTable == null || realTable.length() == 0 || realTable.indexOf("&") > 0) {
realTable = UNKNOWN;
}
cols.put(realTable + "." + column, "");
}
}
}
break;
case HiveParser.TOK_ALTERTABLE_ADDPARTS:
case HiveParser.TOK_ALTERTABLE_RENAME:
case HiveParser.TOK_ALTERTABLE_ADDCOLS:
ASTNode alterTableName = (ASTNode) ast.getChild(0);
tables.add(alterTableName.getText() + " " + oper);// use the "name oper" format like the other branches so split(" ") works downstream
break;
}
}
return set;
}
private static Set<String> parseChildNodes(ASTNode ast){
Set<String> set= new HashSet<String>();
int numCh = ast.getChildCount();
if (numCh > 0) {
for (int num = 0; num < numCh; num++) {
ASTNode child = (ASTNode) ast.getChild(num);
set.addAll(parseIteral(child));
}
}
return set;
}
private static void prepareToParseCurrentNodeAndChilds(ASTNode ast){
if (ast.getToken() != null) {
switch (ast.getToken().getType()) {// a join clause starts here: set the join flag
case HiveParser.TOK_RIGHTOUTERJOIN:
case HiveParser.TOK_LEFTOUTERJOIN:
case HiveParser.TOK_JOIN:
joinClause = true;
break;
case HiveParser.TOK_QUERY:
tableNameStack.push(nowQueryTable);
operStack.push(oper);
nowQueryTable = "";//sql22
oper = Oper.SELECT;
break;
case HiveParser.TOK_INSERT:
tableNameStack.push(nowQueryTable);
operStack.push(oper);
oper = Oper.INSERT;
break;
case HiveParser.TOK_SELECT:
tableNameStack.push(nowQueryTable);
operStack.push(oper);
oper = Oper.SELECT;
break;
case HiveParser.TOK_DROPTABLE:
oper = Oper.DROP;
break;
case HiveParser.TOK_TRUNCATETABLE:
oper = Oper.TRUNCATE;
break;
case HiveParser.TOK_LOAD:
oper = Oper.LOAD;
break;
case HiveParser.TOK_CREATETABLE:
oper = Oper.CREATETABLE;
break;
}
if (ast.getToken() != null
&& ast.getToken().getType() >= HiveParser.TOK_ALTERDATABASE_PROPERTIES
&& ast.getToken().getType() <= HiveParser.TOK_ALTERVIEW_RENAME) {
oper = Oper.ALTER;
}
}
}
public static String unescapeIdentifier(String val) {
if (val == null) {
return null;
}
if (val.charAt(0) == '`' && val.charAt(val.length() - 1) == '`') {
val = val.substring(1, val.length() - 1);
}
return val;
}
//print the collected columns and their aliases
private static void output(Map<String, String> map) {
Iterator<String> it = map.keySet().iterator();
while (it.hasNext()) {
String key = it.next();
System.out.println(key + " " + map.get(key));
}
}
public static List<String> splitSql(String sql) {
sql = sql.replaceAll("^\\s*|\\s*$", "");
char[] cs = sql.toCharArray();
int quotTimes = 0;
List<Integer> marksSplit = new ArrayList<>();
for (int i = 0; i < cs.length; i++) {
char c = cs[i];
if (c == '\'') {// track single quotes so semicolons inside string literals are ignored
quotTimes++;
}
if (c == ';' && quotTimes % 2 == 0) {// a semicolon outside quotes marks a statement boundary
marksSplit.add(i);
}
if (i == cs.length - 1 && c != ';') {// treat the end of input as a boundary when there is no trailing semicolon
marksSplit.add(i + 1);
}
}
List<String> sqls = new ArrayList<>();
if (!marksSplit.isEmpty()) {
for (int i = 0; i < marksSplit.size(); i++) {
if (i == 0) {
sqls.add(sql.substring(0, marksSplit.get(i)));
} else {
sqls.add(sql.substring(marksSplit.get(i - 1) + 1, marksSplit.get(i)));
}
}
} else {
sqls.add(sql);
}
return sqls;
}
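// Example (hypothetical input): splitSql("select 1; select ';' from t")
// returns ["select 1", " select ';' from t"]: the first semicolon splits
// the statements, while the quoted semicolon is ignored.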
public static Map<String, Hashtable<String, Integer>> sqlParse(String sql) throws ParseException {
ParseDriver pd = new ParseDriver();
List<String> sqlList = splitSql(sql);
Map<String, Hashtable<String, Integer>> map = new HashMap<>();
for(String s : sqlList) {
ASTNode ast = pd.parse(s);
parseIteral(ast);// fills the static "tables" list; reset HiveParseUtils.tables before parsing an unrelated statement
Hashtable<String, Integer> hashtable = new Hashtable<>();
for (String table : tables) {
String[] split = table.split(" ");
String tableName = split[0];
if(hashtable.containsKey(tableName)) {
hashtable.put(tableName, hashtable.get(tableName)+1);
}else {
hashtable.put(tableName, 1);
}
}
map.put("tables", hashtable);
}
return map;
}
/**
* Parse the sql and keep only the tables whose operation matches the given type
* @param sql sql text, possibly containing several semicolon-separated statements
* @param type the operation to filter on, e.g. "SELECT" or "INSERT"
* @return a map from the key "tables" to each matching table's usage count
* @throws ParseException
*/
public static Map<String, Hashtable<String, Integer>> sqlParseAssignType(String sql,String type) throws Exception {
ParseDriver pd = new ParseDriver();
List<String> sqlList = splitSql(sql);
Map<String, Hashtable<String, Integer>> map = new HashMap<>();
for(String s : sqlList) {
ASTNode ast = pd.parse(s);
parseIteral(ast);
Hashtable<String, Integer> hashtable = new Hashtable<>();
for (String table : tables) {
String[] split = table.split(" ");
String tableName = split[0];
String tableType = split[1];
if(tableType.equalsIgnoreCase(type)) { //only tables whose operation matches the given type are kept
if(hashtable.containsKey(tableName)) {
hashtable.put(tableName, hashtable.get(tableName)+1);
}else {
hashtable.put(tableName, 1);
}
}
}
map.put("tables", hashtable);
}
return map;
}
public static Map<String, Hashtable<String, Integer>> sqlParseAssignType(String sql) throws Exception {
ParseDriver pd = new ParseDriver();
List<String> sqlList = splitSql(sql);
Map<String, Hashtable<String, Integer>> map = new HashMap<>();
for(String s : sqlList) {
ASTNode ast = pd.parse(s);
parseIteral(ast);
Hashtable<String, Integer> hashtable = new Hashtable<>();
for (String table : tables) {
String[] split = table.split(" ");
String tableName = split[0];
if(hashtable.containsKey(tableName)) {
hashtable.put(tableName, hashtable.get(tableName)+1);
}else {
hashtable.put(tableName, 1);
}
}
map.put("tables", hashtable);
}
return map;
}
}
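Before wiring the class into the log reader, a quick sanity check on a single statement helps confirm it behaves as expected. The SQL below is made up for illustration, and a Hashtable's printed order is not guaranteed:

HiveParseUtils.tables = new ArrayList<>();
Map<String, Hashtable<String, Integer>> parsed = HiveParseUtils.sqlParseAssignType(
        "insert into table dst select a.id from src a join dim b on a.id = b.id", "SELECT");
System.out.println(parsed.get("tables"));// expected to contain src=1 and dim=1; dst is filtered out as an INSERT target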
Define a map to collect the parse results:
Map<String, Hashtable<String, Integer>> sqlParse=new HashMap<>();
Parse the sql. HiveParseUtils.tables is static, so reset it first to keep counts from earlier lines from leaking in:
HiveParseUtils.tables = new ArrayList<>();
sqlParse = HiveParseUtils.sqlParseAssignType(line);
Compute the popularity:
//extract the parsed counts
Map<String,Integer> result=new HashMap<>();
Hashtable<String, Integer> hashtable = sqlParse.get("tables");
//iterate: each key is a table name, each value is that table's usage count
for (Iterator itr = hashtable.keySet().iterator(); itr.hasNext(); ) {
String key = (String) itr.next();
String table = key.trim().toLowerCase();
if (key.contains(".")) {
//keep only the table name and drop the database name; adapt as needed
table = key.split("\\.")[1];
}
int value = hashtable.get(key);
//accumulate into result so counts for the same table add up across log lines
if (result.containsKey(table)) {
result.put(table, result.get(table) + value);
} else {
result.put(table, value);
}
}
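Finally, for reference, here is a minimal end-to-end sketch tying all the steps together; the log path is a placeholder, and counting only SELECT usage is one possible choice:

import java.io.BufferedReader;
import java.nio.file.Files;
import java.nio.file.Paths;
import java.util.*;

public class TableHeat {
    public static void main(String[] args) throws Exception {
        String sqlRegex = ".*Executing command\\(queryId=[\\w-]*\\):";
        Map<String, Integer> result = new HashMap<>();
        try (BufferedReader reader = Files.newBufferedReader(Paths.get("/var/log/hive/hiveserver2.log"))) {
            String str;
            while ((str = reader.readLine()) != null) {
                if (!str.contains("Executing command")) {
                    continue;
                }
                String line = str.replaceAll(sqlRegex, " ").trim();
                HiveParseUtils.tables = new ArrayList<>();// reset the static accumulator per statement
                Map<String, Hashtable<String, Integer>> sqlParse;
                try {
                    sqlParse = HiveParseUtils.sqlParseAssignType(line, "SELECT");// count read usage only
                } catch (Exception e) {
                    continue;// skip statements the Hive parser cannot handle
                }
                Hashtable<String, Integer> hashtable = sqlParse.get("tables");
                for (String key : hashtable.keySet()) {
                    String table = key.contains(".") ? key.split("\\.")[1] : key.trim().toLowerCase();
                    int value = hashtable.get(key);
                    result.put(table, result.containsKey(table) ? result.get(table) + value : value);
                }
            }
        }
        //print each table with its usage count, i.e. its popularity
        for (Map.Entry<String, Integer> entry : result.entrySet()) {
            System.out.println(entry.getKey() + " " + entry.getValue());
        }
    }
}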
Done!!!