HiveParse 进行sql语句解析
解析sql中表名,查询字段名,where条件字段名
调用工具处理类
import com.alibaba.fastjson.JSONObject;
import org.apache.hadoop.hive.ql.parse.*;
import java.io.IOException;
import java.util.*;
/**
* 目的:获取AST中的表,列,以及对其所做的操作,如SELECT,INSERT
* 重点:获取SELECT操作中的表和列的相关操作。其他操作这判断到表级别。
* 实现思路:对AST深度优先遍历,遇到操作的token则判断当前的操作,
* 遇到TOK_TAB或TOK_TABREF则判断出当前操作的表,遇到子句则压栈当前处理,处理子句。
* 子句处理完,栈弹出。
*
*/
public class HiveParse {
private static final String UNKNOWN = "UNKNOWN";
private Map<String, String> alias = new HashMap<String, String>();
private Map<String, String> cols = new TreeMap<String, String>();
private Map<String, String> wherecols = new TreeMap<String, String>();
private Map<String, String> colAlais = new TreeMap<String, String>();
private Set<String> tables = new HashSet<String>();
private Stack<String> tableNameStack = new Stack<String>();
private Stack<Oper> operStack = new Stack<Oper>();
private String nowQueryTable = "";//定义及处理不清晰,修改为query或from节点对应的table集合或许好点。目前正在查询处理的表可能不止一个。
private Oper oper ;
private boolean joinClause = false;
private enum Oper {
SELECT, INSERT, DROP, TRUNCATE, LOAD, CREATETABLE, ALTER
}
public Set<String> parseIteral(ASTNode ast) {
Set<String> set= new HashSet<String>();//当前查询所对应到的表集合
prepareToParseCurrentNodeAndChilds(ast);
set.addAll(parseChildNodes(ast));
set.addAll(parseCurrentNode(ast ,set));
endParseCurrentNode(ast);
return set;
}
private void endParseCurrentNode(ASTNode ast){
if (ast.getToken() != null) {
switch (ast.getToken().getType()) {//join 从句结束,跳出join
case HiveParser.TOK_RIGHTOUTERJOIN:
case HiveParser.TOK_LEFTOUTERJOIN:
case HiveParser.TOK_JOIN:
joinClause = false;
break;
case HiveParser.TOK_QUERY:
break;
case HiveParser.TOK_INSERT:
case HiveParser.TOK_SELECT:
nowQueryTable = tableNameStack.pop();
oper = operStack.pop();
break;
}
}
}
private Set<String> parseCurrentNode(ASTNode ast, Set<String> set){
if (ast.getToken() != null) {
switch (ast.getToken().getType()) {
case HiveParser.TOK_TABLE_PARTITION:
// case HiveParser.TOK_TABNAME:
if (ast.getChildCount() != 2) {
String table = BaseSemanticAnalyzer
.getUnescapedName((ASTNode) ast.getChild(0));
if (oper == Oper.SELECT) {
nowQueryTable = table;
}
tables.add(table + "\t" + oper);
}
break;
case HiveParser.TOK_TAB:// outputTable
String tableTab = BaseSemanticAnalyzer
.getUnescapedName((ASTNode) ast.getChild(0));
if (oper == Oper.SELECT) {
nowQueryTable = tableTab;
}
tables.add(tableTab + "\t" + oper);
break;
case HiveParser.TOK_TABREF:// inputTable
ASTNode tabTree = (ASTNode) ast.getChild(0);
String tableName = (tabTree.getChildCount() == 1) ? BaseSemanticAnalyzer
.getUnescapedName((ASTNode) tabTree.getChild(0))
: BaseSemanticAnalyzer
.getUnescapedName((ASTNode) tabTree.getChild(0))
+ "." + tabTree.getChild(1);
if (oper == Oper.SELECT) {
if(joinClause && !"".equals(nowQueryTable) ){
nowQueryTable += "&"+tableName;//
}else{
nowQueryTable = tableName;
}
set.add(tableName);
}
tables.add(tableName + "\t" + oper);
if (ast.getChild(1) != null) {
String alia = ast.getChild(1).getText().toLowerCase();
alias.put(alia, tableName);//sql6 p别名在tabref只对应为一个表的别名。
}
break;
case HiveParser.TOK_TABLE_OR_COL:
if (ast.getParent().getType() != HiveParser.DOT) {
String col = ast.getChild(0).getText().toLowerCase();
if (alias.get(col) == null
&& colAlais.get(nowQueryTable + "." + col) == null) {
if(nowQueryTable.indexOf("&") > 0){//sql23
cols.put(UNKNOWN + "." + col, "");
}else{
cols.put(nowQueryTable + "." + col, "");
}
}
}
break;
case HiveParser.TOK_ALLCOLREF:
cols.put(nowQueryTable + ".*", "");
break;
case HiveParser.TOK_SUBQUERY:
if (ast.getChildCount() == 2) {
String tableAlias = unescapeIdentifier(ast.getChild(1)
.getText());
String aliaReal = "";
for(String table : set){
aliaReal+=table+"&";
}
if(aliaReal.length() !=0){
aliaReal = aliaReal.substring(0, aliaReal.length()-1);
}
// alias.put(tableAlias, nowQueryTable);//sql22
alias.put(tableAlias, aliaReal);//sql6
// alias.put(tableAlias, "");// just store alias
}
break;
case HiveParser.TOK_SELEXPR:
if (ast.getChild(0).getType() == HiveParser.TOK_TABLE_OR_COL) {
String column = ast.getChild(0).getChild(0).getText()
.toLowerCase();
if(nowQueryTable.indexOf("&") > 0){
cols.put(UNKNOWN + "." + column, "");
}else if (colAlais.get(nowQueryTable + "." + column) == null) {
cols.put(nowQueryTable + "." + column, "");
}
} else if (ast.getChild(1) != null) {// TOK_SELEXPR (+
// (TOK_TABLE_OR_COL id)
// 1) dd
String columnAlia = ast.getChild(1).getText().toLowerCase();
colAlais.put(nowQueryTable + "." + columnAlia, "");
}
break;
case HiveParser.DOT:
if (ast.getType() == HiveParser.DOT) {
if (ast.getChildCount() == 2) {
if (ast.getChild(0).getType() == HiveParser.TOK_TABLE_OR_COL
&& ast.getChild(0).getChildCount() == 1
&& ast.getChild(1).getType() == HiveParser.Identifier) {
String alia = BaseSemanticAnalyzer
.unescapeIdentifier(ast.getChild(0)
.getChild(0).getText()
.toLowerCase());
String column = BaseSemanticAnalyzer
.unescapeIdentifier(ast.getChild(1)
.getText().toLowerCase());
String realTable = null;
if (!tables.contains(alia + "\t" + oper)
&& alias.get(alia) == null) {// [b SELECT, a
// SELECT]
alias.put(alia, nowQueryTable);
}
if (tables.contains(alia + "\t" + oper)) {
realTable = alia;
} else if (alias.get(alia) != null) {
realTable = alias.get(alia);
}
if (realTable == null || realTable.length() == 0 || realTable.indexOf("&") > 0) {
realTable = UNKNOWN;
}
cols.put(realTable + "." + column, "");
}
}
}
break;
case HiveParser.TOK_WHERE:
int a = ast.getChildCount();
List<ASTNode> set1 =new ArrayList<>();
List<ASTNode> set0 =new ArrayList<>();
if(a==1){
set0= parseChildNodes1((ASTNode)ast.getChild(0),set0);
}else if(a==2){
set0= parseChildNodes1((ASTNode)ast.getChild(0),set0);
set1= parseChildNodes1((ASTNode)ast.getChild(1),set1);
}
for (int i = 0; i < set0.size(); i++) {
if (set0.get(i).getChild(0).getChild(0).getType() == HiveParser.TOK_TABLE_OR_COL
&& set0.get(i).getChild(0).getChild(0).getChildCount() == 1) {
String alia = BaseSemanticAnalyzer
.unescapeIdentifier(set0.get(i).getChild(0).getChild(0)
.getChild(0).getText()
.toLowerCase());
String column = BaseSemanticAnalyzer
.unescapeIdentifier(set0.get(i).getChild(0).getChild(1)
.getText().toLowerCase());
String realTable = null;
if (!tables.contains(alia + "\t" + oper)
&& alias.get(alia) == null) {// [b SELECT, a
// SELECT]
alias.put(alia, nowQueryTable);
}
if (tables.contains(alia + "\t" + oper)) {
realTable = alia;
} else if (alias.get(alia) != null) {
realTable = alias.get(alia);
}
if (realTable == null || realTable.length() == 0 || realTable.indexOf("&") > 0) {
realTable = UNKNOWN;
}
wherecols.put(realTable + "." + column, "");
}else if (set0.get(i).getChild(0).getType() == HiveParser.TOK_TABLE_OR_COL
&& set0.get(i).getChild(0).getChildCount() == 1) {
String column = BaseSemanticAnalyzer
.unescapeIdentifier(set0.get(i).getChild(0)
.getChild(0).getText()
.toLowerCase());
wherecols.put(nowQueryTable + "." + column, "");
}
}
break;
case HiveParser.TOK_ALTERTABLE_ADDPARTS:
case HiveParser.TOK_ALTERTABLE_RENAME:
case HiveParser.TOK_ALTERTABLE_ADDCOLS:
ASTNode alterTableName = (ASTNode) ast.getChild(0);
tables.add(alterTableName.getText() + "\t" + oper);
break;
}
}
return set;
}
private Set<String> parseChildNodes(ASTNode ast){
Set<String> set= new HashSet<String>();
int numCh = ast.getChildCount();
if (numCh > 0) {
for (int num = 0; num < numCh; num++) {
ASTNode child = (ASTNode) ast.getChild(num);
set.addAll(parseIteral(child));
}
}
return set;
}
private List<ASTNode> parseChildNodes1(ASTNode ast,List<ASTNode> set){
if (ast.getType() == HiveParser.KW_OR
||ast.getType() == HiveParser.KW_AND) {
int numCh = ast.getChildCount();
for (int num = 0; num < numCh; num++) {
parseChildNodes1((ASTNode) ast.getChild(num),set);
}
}else if (ast.getType() == HiveParser.NOTEQUAL //判断条件 > < like in
|| ast.getType() == HiveParser.EQUAL
|| ast.getType() == HiveParser.LESSTHAN
|| ast.getType() == HiveParser.LESSTHANOREQUALTO
|| ast.getType() == HiveParser.GREATERTHAN
|| ast.getType() == HiveParser.GREATERTHANOREQUALTO
|| ast.getType() == HiveParser.KW_LIKE
|| ast.getType() == HiveParser.DIVIDE
|| ast.getType() == HiveParser.PLUS
|| ast.getType() == HiveParser.MINUS
|| ast.getType() == HiveParser.STAR
|| ast.getType() == HiveParser.MOD
|| ast.getType() == HiveParser.AMPERSAND
|| ast.getType() == HiveParser.TILDE
|| ast.getType() == HiveParser.BITWISEOR
|| ast.getType() == HiveParser.BITWISEXOR) {
set.add(ast);
}
return set;
}
private void prepareToParseCurrentNodeAndChilds(ASTNode ast){
if (ast.getToken() != null) {
switch (ast.getToken().getType()) {//join 从句开始
case HiveParser.TOK_RIGHTOUTERJOIN:
case HiveParser.TOK_LEFTOUTERJOIN:
case HiveParser.TOK_JOIN:
joinClause = true;
break;
case HiveParser.TOK_QUERY:
tableNameStack.push(nowQueryTable);
operStack.push(oper);
nowQueryTable = "";//sql22
oper = Oper.SELECT;
break;
case HiveParser.TOK_INSERT:
tableNameStack.push(nowQueryTable);
operStack.push(oper);
oper = Oper.INSERT;
break;
case HiveParser.TOK_SELECT:
tableNameStack.push(nowQueryTable);
operStack.push(oper);
// nowQueryTable = nowQueryTable
// nowQueryTable = "";//语法树join
// 注释语法树sql9, 语法树join对应的设置为""的注释逻辑不符
oper = Oper.SELECT;
break;
case HiveParser.TOK_DROPTABLE:
oper = Oper.DROP;
break;
case HiveParser.TOK_TRUNCATETABLE:
oper = Oper.TRUNCATE;
break;
case HiveParser.TOK_LOAD:
oper = Oper.LOAD;
break;
case HiveParser.TOK_CREATETABLE:
oper = Oper.CREATETABLE;
break;
}
if (ast.getToken() != null
&& ast.getToken().getType() >= HiveParser.TOK_ALTERDATABASE_PROPERTIES
&& ast.getToken().getType() <= HiveParser.TOK_ALTERVIEW_RENAME) {
oper = Oper.ALTER;
}
}
}
public static String unescapeIdentifier(String val) {
if (val == null) {
return null;
}
if (val.charAt(0) == '`' && val.charAt(val.length() - 1) == '`') {
val = val.substring(1, val.length() - 1);
}
return val;
}
private void output(Map<String, String> map) {
Iterator<String> it = map.keySet().iterator();
while (it.hasNext()) {
String key = it.next();
System.out.println(key + "\t" + map.get(key));
}
}
public JSONObject parse(ASTNode ast) {
parseIteral(ast);
System.out.println("***************表***************");
for (String table : tables) {
System.out.println(table);
}
System.out.println("***************列***************");
output(cols);
System.out.println("***************where列***************");
output(wherecols);
System.out.println("***************别名***************");
output(alias);
JSONObject json=new JSONObject();
json.put("cols",cols);
json.put("wherecols",wherecols);
json.put("tables",tables);
json.put("alias",alias);
return json;
}
}
调用
ParseDriver pd = new ParseDriver();
HiveParse hp = new HiveParse();
System.out.println(sql);
ASTNode ast = pd.parse(sql);
System.out.println(ast.toStringTree());
jsonsql = hp.parse(ast);
Map<String, Set<String>> tablewherecols= new HashMap<>();
Set<String> settable= (Set<String>) jsonsql.get("tables");
Map<String, String> wherecols= (Map<String, String>) jsonsql.get("wherecols");
for (String table : settable) {
String [] biao = table.split("\t");
String table1 = biao[0];
Set<String> tabziduan= new HashSet<>();
for (String key : wherecols.keySet()) {
System.out.println(key);
String [] ziduan = key.split("\\.");
if (key.contains(table1)&&ziduan.length==2){
tabziduan.add(ziduan[1]);
tablewherecols.put(table1,tabziduan);
}
}
}
System.out.println(JSONObject.toJSONString(tablewherecols));```