HiveSql Parsing (AST-based)
2019-11-08 16:05:48, source: 博客园
Our project has to exchange data with the big-data team, so I needed to extract the tables a Hive query uses and its outermost columns; down the road we may also need the lineage between tables and fields. Searching the web turned up essentially one answer, copied around from the same template, and it didn't really fit what I needed, so I wrote my own parser to pull out exactly that. It is just barely adequate, but I'm recording it here.
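Hive ships its own parser, so there is no grammar to write: org.apache.hadoop.hive.ql.parse.ParseDriver turns a statement into a tree of ASTNode objects whose token types (TOK_QUERY, TOK_FROM, TOK_INSERT, TOK_SELEXPR, ...) are the HiveParser constants matched on below. A minimal probe to see the shape of that tree, assuming hive-exec and its dependencies are on the classpath (the class name AstProbe is just for illustration):

import org.apache.hadoop.hive.ql.parse.ASTNode;
import org.apache.hadoop.hive.ql.parse.ParseDriver;

public class AstProbe {
    public static void main(String[] args) throws Exception {
        ParseDriver pd = new ParseDriver();
        ASTNode ast = pd.parse("select a.id from ods.arc_user a where a.dt = '20191023'");
        // Prints a LISP-style tree: (TOK_QUERY (TOK_FROM (TOK_TABREF ...)) (TOK_INSERT ...))
        System.out.println(ast.toStringTree());
    }
}

Note that depending on the Hive version, the root returned by ParseDriver.parse may be a synthetic nil node wrapping the TOK_QUERY node and an EOF token; the parser below simply walks whatever root it gets. The full listing is one recursive traversal that dispatches on token type: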
import com.google.common.base.Joiner;
import com.google.common.collect.Lists;
import com.google.common.collect.Maps;
import lombok.extern.slf4j.Slf4j;
import org.apache.commons.collections.CollectionUtils;
import org.apache.hadoop.hive.ql.lib.Node;
import org.apache.hadoop.hive.ql.parse.ASTNode;
import org.apache.hadoop.hive.ql.parse.ParseDriver;

import java.util.*;
import java.util.stream.Collectors;

import static org.apache.hadoop.hive.ql.parse.HiveParser.*;

/**
 * @author chentiefeng
 * @date 2019/10/21 13:51
 */
@Slf4j
public class HiveSqlParse {

    private ParseDriver pd = new ParseDriver();

    /** Source tables as (table name, alias) pairs */
    private List<String[]> sourceTable = Lists.newArrayList();

    /** Insert target tables */
    private List<String> insertTables = Lists.newArrayList();

    /** Outermost (top-level) columns */
    private List<String> outermostColumns = Lists.newArrayList();

    /** Insert partition info (partition column -> partition value) */
    private Map<String, String> partitionMap = Maps.newHashMap();

    /** Outermost select node */
    private ASTNode outermostSelNode = null;

    /** Outermost insert node */
    private ASTNode outermostInsertNode = null;

    /** Stacks of tables currently being resolved */
    private Stack<HiveTableParseInfo> tableParseInfoSelStack = new Stack<>();
    private Stack<HiveTableParseInfo> tableParseInfoFromStack = new Stack<>();

    /** Table relationship info, excluding source tables */
    private HiveTableParseInfo tableParseInfo = null;

    public HiveSqlParse() {
    }

    public HiveSqlParse(String sql) {
        parse(sql);
    }

    /**
     * Parse the given HiveQL statement.
     *
     * @param sql the statement to analyze
     */
    public void parse(String sql) {
        try {
            ASTNode ast = pd.parse(sql);
            log.info("hiveSql={},astTree={}", sql, ast.toStringTree());
            parseNode(ast);
            insert(outermostInsertNode);
            outermostColumns(outermostSelNode);
            // The INSERT target is also picked up as a TOK_TABNAME during the walk,
            // so drop it from the source list. Guard the get(0): a plain SELECT
            // (destination TOK_DIR/TOK_TMP_FILE) has no insert table at all.
            if (!insertTables.isEmpty()) {
                sourceTable.removeIf(arr -> arr[0].equals(insertTables.get(0)));
            }
        } catch (Exception e) {
            log.error(e.getMessage(), e);
            throw new RuntimeException(e);
        }
    }

    private void parseNode(ASTNode ast) {
        if (CollectionUtils.isNotEmpty(ast.getChildren())) {
            for (Node child : ast.getChildren()) {
                ASTNode cc = (ASTNode) child;
                switch (cc.getToken().getType()) {
                    case TOK_INSERT:
                        // Subqueries in the FROM clause are visited first, so the last
                        // TOK_INSERT assigned here is the outermost one.
                        outermostInsertNode = cc;
                        break;
                    case TOK_TABNAME:
                        String tableName = Joiner.on(".").join(cc.getChildren().stream()
                                .map(n -> ((ASTNode) n).getText()).collect(Collectors.toList()));
                        // The alias, if present, is the last child of the parent node;
                        // if that last child is the TOK_TABNAME itself, there is no alias.
                        ASTNode ccChild = (ASTNode) cc.getParent().getChild(cc.getParent().getChildCount() - 1);
                        HiveTableParseInfo sourceTableParseInfo = new HiveTableParseInfo();
                        if (ccChild.getToken().getType() == TOK_TABNAME) {
                            sourceTable.add(new String[]{tableName, ""});
                            sourceTableParseInfo.setAlias("");
                        } else {
                            sourceTable.add(new String[]{tableName, ccChild.getText()});
                            sourceTableParseInfo.setAlias(ccChild.getText());
                        }
                        sourceTableParseInfo.setName(tableName);
                        if (!tableParseInfoFromStack.empty()) {
                            tableParseInfoFromStack.pop().getTables().add(sourceTableParseInfo);
                        }
                        break;
                    case TOK_QUERY:
                        ASTNode ccc = (ASTNode) cc.getParent().getChild(cc.getParent().getChildCount() - 1);
                        if (ccc.getToken().getType() != TOK_QUERY) {
                            // An aliased subquery: start collecting a new table.
                            HiveTableParseInfo table = new HiveTableParseInfo();
                            table.setAlias(ccc.getText());
                            tableParseInfoSelStack.push(table);
                            tableParseInfoFromStack.push(table);
                        }
                        break;
                    case TOK_SELECT:
                    case TOK_SELECTDI:
                        // Close the table on top of the stack; attach it to its parent
                        // if one is still open, otherwise it is the outermost result.
                        HiveTableParseInfo pop = tableParseInfoSelStack.pop();
                        if (!tableParseInfoSelStack.empty()) {
                            HiveTableParseInfo father = tableParseInfoSelStack.peek();
                            if (Objects.nonNull(father)) {
                                father.getTables().add(pop);
                            }
                        } else {
                            tableParseInfo = pop;
                        }
                        parseColumns(cc, pop);
                        // Columns are handled above; skip the generic recursion.
                        continue;
                    default:
                }
                parseNode(cc);
            }
        }
    }

    private void insert(ASTNode cn) {
        if (CollectionUtils.isEmpty(cn.getChildren())) {
            return;
        }
        for (Node child : cn.getChildren()) {
            ASTNode cc = (ASTNode) child;
            switch (cc.getToken().getType()) {
                case TOK_INSERT_INTO:
                case TOK_DESTINATION:
                    // Scan this insert clause for the target table and partition spec.
                    insertTable(cn);
                    continue;
                case TOK_SELECT:
                    outermostSelNode = cn;
                    continue;
                default:
            }
            insert(cc);
        }
    }

    private void parseColumns(ASTNode cc, HiveTableParseInfo table) {
        for (Node node : cc.getChildren()) {
            ASTNode tokSelExpr = (ASTNode) node;
            HiveTableParseInfo.HiveTableColumnParseInfo column = new HiveTableParseInfo.HiveTableColumnParseInfo();
            String alias = getSelExprAlias(tokSelExpr);
            column.setName(alias);
            parseColumn(tokSelExpr, column);
            table.getColumns().add(column);
        }
    }

    private void parseColumn(ASTNode tokSelExpr, HiveTableParseInfo.HiveTableColumnParseInfo column) {
        if (CollectionUtils.isEmpty(tokSelExpr.getChildren())) {
            return;
        }
        for (Node child : tokSelExpr.getChildren()) {
            ASTNode cc = (ASTNode) child;
            if (cc.getToken().getType() == TOK_TABLE_OR_COL) {
                ASTNode ccc = (ASTNode) cc.getParent().getChild(cc.getParent().getChildCount() - 1);
                String[] item;
                if (ccc.getToken().getType() == TOK_TABLE_OR_COL) {
                    // Unqualified reference: just the column name.
                    item = new String[]{cc.getChild(0).getText(), ""};
                } else {
                    // Qualified reference (e.g. t.ops): column name plus qualifier.
                    item = new String[]{ccc.getText(), cc.getChild(0).getText()};
                }
                // Deduplicate source references before recording.
                Optional<String[]> any = column.getSourceList().stream().filter(s -> Arrays.equals(item, s)).findAny();
                if (!any.isPresent()) {
                    column.getSourceList().add(item);
                }
                continue;
            }
            parseColumn(cc, column);
        }
    }

    /**
     * Collect the insert target table and its partition spec.
     *
     * @param cn node to scan
     */
    private void insertTable(ASTNode cn) {
        if (CollectionUtils.isEmpty(cn.getChildren())) {
            return;
        }
        for (Node child : cn.getChildren()) {
            ASTNode cc = (ASTNode) child;
            switch (cc.getToken().getType()) {
                case TOK_TABNAME:
                    String tableName = Joiner.on(".").join(cc.getChildren().stream()
                            .map(n -> ((ASTNode) n).getText()).collect(Collectors.toList()));
                    insertTables.add(tableName);
                    break;
                case TOK_PARTVAL:
                    if (cc.getChildCount() == 2) {
                        // Static partition: column = value.
                        partitionMap.put(cc.getChild(0).getText(), cc.getChild(1).getText());
                    } else {
                        // Dynamic partition: column only, no value.
                        partitionMap.put(cc.getChild(0).getText(), null);
                    }
                    break;
                default:
            }
            insertTable(cc);
        }
    }

    /**
     * Collect the outermost columns.
     *
     * @param cn node to scan
     */
    private void outermostColumns(ASTNode cn) {
        if (CollectionUtils.isEmpty(cn.getChildren())) {
            return;
        }
        for (Node cnChild : cn.getChildren()) {
            ASTNode cc = (ASTNode) cnChild;
            if (cc.getToken().getType() == TOK_SELEXPR) {
                String alias = getSelExprAlias(cc);
                outermostColumns.add(alias);
                continue;
            }
            outermostColumns(cc);
        }
    }

    /**
     * Resolve the output name of a select expression.
     *
     * @param cc a TOK_SELEXPR node
     * @return the explicit alias if present, otherwise the column name
     */
    private String getSelExprAlias(ASTNode cc) {
        ASTNode child = (ASTNode) cc.getChild(cc.getChildCount() - 1);
        if (child.getToken().getType() == TOK_TABLE_OR_COL || child.getToken().getType() == DOT) {
            return child.getChild(child.getChildCount() - 1).getText();
        } else {
            return child.getText();
        }
    }

    public List<String> getOutermostColumns() {
        return outermostColumns;
    }

    public List<String> getSourceTables() {
        return sourceTable.stream().map(t -> t[0]).distinct().collect(Collectors.toList());
    }

    public String getInsertTable() {
        return CollectionUtils.isNotEmpty(insertTables) ? insertTables.get(0) : null;
    }

    public Map<String, String> getPartition() {
        return partitionMap;
    }

    public HiveTableParseInfo getTableParseInfo() {
        return tableParseInfo;
    }

    public static void main(String[] args) {
        String sql23 = "insert overwrite table risk_event partition(year='2019',dt) select t.ops as order_no,t.id_no ,concat(t.consumer_no,'aa') dd,aadx from (select concat(a.opt_id,b.opt_id) as ops,b.id_no from ods.arc_event a left outer join ods.arc_user b on a.consumer_no = b.consumer_no) t left outer join (select order_no from arc_verify where dt = '20191023') t1 on t.consumer_no = t1.consumer_no";
        // String sql23 = "insert overwrite table riskt_eventpartition select opt_id from arc_event a inner join arc_user b";
        // String sql23 = "insert overwrite table riskt_eventpartition select opt_id from arc_event";
        // String sql23 = "SELECT SUM(CASE when rcw.eventid=2 and rcw.method = 'sendevent' then 1 else 0 END) as successCnt," +
        //         " SUM(CASE when rcw.eventid=4 and rcw.method = 'risklevel' then 1 else 0 END) as payCnt," +
        //         " SUM(CASE when rcw.eventid=2 and rcw.method = 'sendevent' then 1 else 0 END)/SUM(CASE when rcw.eventid=4 and rcw.method = 'risklevel' then 1 else 0 END) as rate" +
        //         " FROM (\n" +
        //         "    SELECT DISTINCT payorderid," +
        //         "        eventid," +
        //         "        method" +
        //         "    FROM log.pay_rc_warden_event_basic" +
        //         "    WHERE dt = '20180715'" +
        //         " ) rcw";
        HiveSqlParse hiveSqlParse = new HiveSqlParse(sql23);
        System.out.println(hiveSqlParse.getSourceTables());
        System.out.println(hiveSqlParse.getOutermostColumns());
        System.out.println(hiveSqlParse.getInsertTable());
        System.out.println(hiveSqlParse.getPartition());
        System.out.println(hiveSqlParse.getTableParseInfo());
    }
}
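The listing depends on a HiveTableParseInfo bean that the original post does not include. Its shape can be inferred from the calls above: a name and alias, a list of nested tables, and columns that each carry a sourceList of String[] pairs. A minimal Lombok-based sketch reconstructed from that usage (hypothetical, not the author's original class) could be:

import com.google.common.collect.Lists;
import lombok.Data;

import java.util.List;

/**
 * Hypothetical reconstruction of the helper bean used by HiveSqlParse;
 * fields are inferred from the getters and setters called above.
 */
@Data
public class HiveTableParseInfo {
    /** Table name (unset for subqueries). */
    private String name;
    /** Table or subquery alias. */
    private String alias;
    /** Tables this table is built from. */
    private List<HiveTableParseInfo> tables = Lists.newArrayList();
    /** Output columns of this table. */
    private List<HiveTableColumnParseInfo> columns = Lists.newArrayList();

    @Data
    public static class HiveTableColumnParseInfo {
        /** Output column name or alias. */
        private String name;
        /** Source references this column derives from, as String[] pairs. */
        private List<String[]> sourceList = Lists.newArrayList();
    }
}

With that in place, running main against the sample statement should report ods.arc_event, ods.arc_user, and arc_verify as source tables, order_no, id_no, dd, and aadx as the outermost columns, risk_event as the insert target, and year and dt as the partition columns.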
Original post: https://www.cnblogs.com/dongfangqiezi/p/11820384.html
For questions, please contact the original author.