1 | /** |
---|
2 | * This software is copyright (c) 2013-2022 by |
---|
3 | * - Leibniz-Institut fuer Deutsche Sprache (http://www.ids-mannheim.de) |
---|
4 | * This is free software. You can redistribute it |
---|
5 | * and/or modify it under the terms described in |
---|
6 | * the GNU General Public License v3 of which you |
---|
7 | * should have received a copy. Otherwise you can download |
---|
8 | * it from |
---|
9 | * |
---|
10 | * http://www.gnu.org/licenses/gpl-3.0.txt |
---|
11 | * |
---|
12 | * @copyright Leibniz-Institut fuer Deutsche Sprache (http://www.ids-mannheim.de) |
---|
13 | * |
---|
14 | * @license http://www.gnu.org/licenses/gpl-3.0.txt |
---|
15 | * GNU General Public License v3 |
---|
16 | */ |
---|
17 | package eu.clarin.sru.server.fcs.parser; |
---|
18 | |
---|
19 | import java.text.Normalizer; |
---|
20 | import java.util.ArrayDeque; |
---|
21 | import java.util.ArrayList; |
---|
22 | import java.util.Collections; |
---|
23 | import java.util.Deque; |
---|
24 | import java.util.HashSet; |
---|
25 | import java.util.List; |
---|
26 | import java.util.Set; |
---|
27 | |
---|
28 | import org.antlr.v4.runtime.BaseErrorListener; |
---|
29 | import org.antlr.v4.runtime.CharStream; |
---|
30 | import org.antlr.v4.runtime.CharStreams; |
---|
31 | import org.antlr.v4.runtime.CommonTokenStream; |
---|
32 | import org.antlr.v4.runtime.ParserRuleContext; |
---|
33 | import org.antlr.v4.runtime.RecognitionException; |
---|
34 | import org.antlr.v4.runtime.Recognizer; |
---|
35 | import org.antlr.v4.runtime.Token; |
---|
36 | import org.antlr.v4.runtime.tree.ParseTree; |
---|
37 | import org.antlr.v4.runtime.tree.TerminalNode; |
---|
38 | import org.slf4j.Logger; |
---|
39 | import org.slf4j.LoggerFactory; |
---|
40 | |
---|
41 | import eu.clarin.sru.fcs.qlparser.FCSLexer; |
---|
42 | import eu.clarin.sru.fcs.qlparser.FCSParser; |
---|
43 | import eu.clarin.sru.fcs.qlparser.FCSParserBaseVisitor; |
---|
44 | import eu.clarin.sru.fcs.qlparser.FCSParser.AttributeContext; |
---|
45 | import eu.clarin.sru.fcs.qlparser.FCSParser.Expression_andContext; |
---|
46 | import eu.clarin.sru.fcs.qlparser.FCSParser.Expression_basicContext; |
---|
47 | import eu.clarin.sru.fcs.qlparser.FCSParser.Expression_groupContext; |
---|
48 | import eu.clarin.sru.fcs.qlparser.FCSParser.Expression_notContext; |
---|
49 | import eu.clarin.sru.fcs.qlparser.FCSParser.Expression_orContext; |
---|
50 | import eu.clarin.sru.fcs.qlparser.FCSParser.IdentifierContext; |
---|
51 | import eu.clarin.sru.fcs.qlparser.FCSParser.Main_queryContext; |
---|
52 | import eu.clarin.sru.fcs.qlparser.FCSParser.QualifierContext; |
---|
53 | import eu.clarin.sru.fcs.qlparser.FCSParser.QuantifierContext; |
---|
54 | import eu.clarin.sru.fcs.qlparser.FCSParser.QueryContext; |
---|
55 | import eu.clarin.sru.fcs.qlparser.FCSParser.Query_disjunctionContext; |
---|
56 | import eu.clarin.sru.fcs.qlparser.FCSParser.Query_groupContext; |
---|
57 | import eu.clarin.sru.fcs.qlparser.FCSParser.Query_implicitContext; |
---|
58 | import eu.clarin.sru.fcs.qlparser.FCSParser.Query_segmentContext; |
---|
59 | import eu.clarin.sru.fcs.qlparser.FCSParser.Query_sequenceContext; |
---|
60 | import eu.clarin.sru.fcs.qlparser.FCSParser.Query_simpleContext; |
---|
61 | import eu.clarin.sru.fcs.qlparser.FCSParser.RegexpContext; |
---|
62 | import eu.clarin.sru.fcs.qlparser.FCSParser.Regexp_flagContext; |
---|
63 | import eu.clarin.sru.fcs.qlparser.FCSParser.Regexp_patternContext; |
---|
64 | import eu.clarin.sru.fcs.qlparser.FCSParser.Within_partContext; |
---|
65 | import eu.clarin.sru.fcs.qlparser.FCSParser.Within_part_simpleContext; |
---|
66 | |
---|
67 | |
---|
68 | /** |
---|
69 | * A FCS-QL query parser that produces FCS-QL expression trees. |
---|
70 | */ |
---|
71 | public class QueryParser { |
---|
72 | private static final int[] REP_ZERO_OR_MORE = |
---|
73 | new int[] { 0, QueryNode.OCCURS_UNBOUNDED }; |
---|
74 | private static final int[] REP_ONE_OR_MORE = |
---|
75 | new int[] { 1, QueryNode.OCCURS_UNBOUNDED }; |
---|
76 | private static final int[] REP_ZERO_OR_ONE = |
---|
77 | new int[] { 0, 1 }; |
---|
78 | private static final String EMPTY_STRING = ""; |
---|
79 | private static final int DEFAULT_INITIAL_STACK_SIZE = 32; |
---|
80 | private static final Logger logger = |
---|
81 | LoggerFactory.getLogger(ExpressionTreeBuilder.class); |
---|
82 | private static final String DEFAULT_IDENTIFIER = "text"; |
---|
83 | private static final Operator DEFAULT_OPERATOR = Operator.EQUALS; |
---|
84 | private static final Normalizer.Form DEFAULT_UNICODE_NORMALIZATION_FORM = |
---|
85 | Normalizer.Form.NFC; |
---|
86 | private final String defaultIdentifier; |
---|
87 | private final Operator defaultOperator; |
---|
88 | private final Normalizer.Form unicodeNormalizationForm; |
---|
89 | |
---|
90 | |
---|
91 | /** |
---|
92 | * Constructor. |
---|
93 | */ |
---|
94 | public QueryParser() { |
---|
95 | this(DEFAULT_IDENTIFIER, DEFAULT_UNICODE_NORMALIZATION_FORM); |
---|
96 | } |
---|
97 | |
---|
98 | |
---|
99 | /** |
---|
100 | * Constructor. |
---|
101 | * |
---|
102 | * @param defaultIdentifier |
---|
103 | * the default identifier to be used for simple expressions |
---|
104 | */ |
---|
105 | public QueryParser(String defaultIdentifier) { |
---|
106 | this.defaultIdentifier = defaultIdentifier; |
---|
107 | this.defaultOperator = DEFAULT_OPERATOR; |
---|
108 | this.unicodeNormalizationForm = DEFAULT_UNICODE_NORMALIZATION_FORM; |
---|
109 | } |
---|
110 | |
---|
111 | |
---|
112 | /** |
---|
113 | * Constructor. |
---|
114 | * |
---|
115 | * @param defaultIdentifier |
---|
116 | * the default identifier to be used for simple expressions |
---|
117 | * @param unicodeNormaliztionForm |
---|
118 | * the Unicode normalization form to be used or <code>null</code> |
---|
119 | * to not perform normalization |
---|
120 | */ |
---|
121 | public QueryParser(String defaultIdentifier, |
---|
122 | Normalizer.Form unicodeNormaliztionForm) { |
---|
123 | this.defaultIdentifier = defaultIdentifier; |
---|
124 | this.defaultOperator = DEFAULT_OPERATOR; |
---|
125 | this.unicodeNormalizationForm = unicodeNormaliztionForm; |
---|
126 | } |
---|
127 | |
---|
128 | |
---|
129 | /** |
---|
130 | * Parse query. |
---|
131 | * |
---|
132 | * @param query |
---|
133 | * the FCS-QL query |
---|
134 | * @return a FCS-QL expression tree |
---|
135 | * @throws QueryParserException |
---|
136 | * if an error occurred |
---|
137 | */ |
---|
138 | public QueryNode parse(String query) throws QueryParserException { |
---|
139 | final ErrorListener errorListener = new ErrorListener(query); |
---|
140 | try { |
---|
141 | CharStream input = CharStreams.fromString(query); |
---|
142 | FCSLexer lexer = new FCSLexer(input); |
---|
143 | CommonTokenStream tokens = new CommonTokenStream(lexer); |
---|
144 | FCSParser parser = new FCSParser(tokens); |
---|
145 | /* |
---|
146 | * clear (possible) default error listeners and set our own! |
---|
147 | */ |
---|
148 | lexer.removeErrorListeners(); |
---|
149 | lexer.addErrorListener(errorListener); |
---|
150 | parser.removeErrorListeners(); |
---|
151 | parser.addErrorListener(errorListener); |
---|
152 | |
---|
153 | /* |
---|
154 | * commence parsing ... |
---|
155 | */ |
---|
156 | ParseTree tree = parser.query(); |
---|
157 | if (parser.isMatchedEOF() && |
---|
158 | !errorListener.hasErrors() && |
---|
159 | (parser.getNumberOfSyntaxErrors() == 0)) { |
---|
160 | if (logger.isTraceEnabled()) { |
---|
161 | logger.trace("ANTLR parse tree: {}", |
---|
162 | tree.toStringTree(parser)); |
---|
163 | } |
---|
164 | ExpressionTreeBuilder v = |
---|
165 | new ExpressionTreeBuilder(DEFAULT_INITIAL_STACK_SIZE); |
---|
166 | v.visit(tree); |
---|
167 | return (QueryNode) v.stack.pop(); |
---|
168 | } else { |
---|
169 | if (logger.isTraceEnabled()) { |
---|
170 | for (String msg : errorListener.getErrors()) { |
---|
171 | logger.trace("ERROR: {}", msg); |
---|
172 | } |
---|
173 | } |
---|
174 | |
---|
175 | /* |
---|
176 | * FIXME: (include additional error information) |
---|
177 | */ |
---|
178 | String message = errorListener.hasErrors() |
---|
179 | ? errorListener.getErrors().get(0) |
---|
180 | : "unspecified error"; |
---|
181 | throw new QueryParserException(message); |
---|
182 | } |
---|
183 | } catch (ExpressionTreeBuilderException e) { |
---|
184 | throw new QueryParserException(e.getMessage(), e.getCause()); |
---|
185 | } catch (QueryParserException e) { |
---|
186 | throw e; |
---|
187 | } catch (Throwable t) { |
---|
188 | throw new QueryParserException( |
---|
189 | "an unexpected exception occured while parsing", t); |
---|
190 | } |
---|
191 | } |
---|
192 | |
---|
193 | |
---|
194 | /* |
---|
195 | * hide the expression tree builder implementation in nested private class ... |
---|
196 | */ |
---|
197 | private final class ExpressionTreeBuilder |
---|
198 | extends FCSParserBaseVisitor<Void> { |
---|
199 | private final Deque<Object> stack; |
---|
200 | /* |
---|
201 | * pre-allocate buffer to store chars returned by Character.toChars in |
---|
202 | * unescapeString() |
---|
203 | */ |
---|
204 | private final char[] buf = new char[2]; |
---|
205 | |
---|
206 | |
---|
207 | private ExpressionTreeBuilder(int initStackSize) { |
---|
208 | if (initStackSize < 1) { |
---|
209 | throw new IllegalArgumentException("initStackSize < 1"); |
---|
210 | } |
---|
211 | this.stack = new ArrayDeque<Object>(initStackSize); |
---|
212 | } |
---|
213 | |
---|
214 | |
---|
215 | @Override |
---|
216 | public Void visitQuery(QueryContext ctx) { |
---|
217 | if (logger.isTraceEnabled()) { |
---|
218 | logger.trace("visitQuery/enter: children={} / cnt={} / text={}", |
---|
219 | ctx.children, |
---|
220 | ctx.getChildCount(), |
---|
221 | ctx.getText()); |
---|
222 | } |
---|
223 | super.visitQuery(ctx); |
---|
224 | |
---|
225 | Within_partContext w_ctx = ctx.getChild(Within_partContext.class, 0); |
---|
226 | if (w_ctx != null) { |
---|
227 | QueryNode within = (QueryNode) stack.pop(); |
---|
228 | QueryNode query = (QueryNode) stack.pop();; |
---|
229 | stack.push(new QueryWithWithin(query, within)); |
---|
230 | } |
---|
231 | if (logger.isTraceEnabled()) { |
---|
232 | logger.trace("visitQuery/exit: stack={}", stack); |
---|
233 | } |
---|
234 | return null; |
---|
235 | } |
---|
236 | |
---|
237 | |
---|
238 | @Override |
---|
239 | public Void visitMain_query(Main_queryContext ctx) { |
---|
240 | if (logger.isTraceEnabled()) { |
---|
241 | logger.trace("visitMain_query/enter: children={} / cnt={} / text={}", |
---|
242 | ctx.children, |
---|
243 | ctx.getChildCount(), |
---|
244 | ctx.getText()); |
---|
245 | } |
---|
246 | super.visitMain_query(ctx); |
---|
247 | if (logger.isTraceEnabled()) { |
---|
248 | logger.trace("visitMain_query/exit: stack={}", stack); |
---|
249 | } |
---|
250 | return null; |
---|
251 | } |
---|
252 | |
---|
253 | |
---|
254 | @Override |
---|
255 | public Void visitQuery_disjunction(Query_disjunctionContext ctx) { |
---|
256 | if (logger.isTraceEnabled()) { |
---|
257 | logger.trace("visitQuery_disjunction/enter: children={} / cnt={} / text={}", |
---|
258 | ctx.children, |
---|
259 | ctx.getChildCount(), |
---|
260 | ctx.getText()); |
---|
261 | } |
---|
262 | final int pos = stack.size(); |
---|
263 | super.visitQuery_disjunction(ctx); |
---|
264 | if (stack.size() > pos) { |
---|
265 | List<QueryNode> items = new ArrayList<QueryNode>(); |
---|
266 | while (stack.size() > pos) { |
---|
267 | items.add(0, (QueryNode) stack.pop()); |
---|
268 | } |
---|
269 | stack.push(new QueryDisjunction(items)); |
---|
270 | } else { |
---|
271 | throw new ExpressionTreeBuilderException( |
---|
272 | "visitQuery_disjunction is empty!"); |
---|
273 | } |
---|
274 | if (logger.isTraceEnabled()) { |
---|
275 | logger.trace("visitQuery_disjunction/exit: stack={}", stack); |
---|
276 | } |
---|
277 | return null; |
---|
278 | } |
---|
279 | |
---|
280 | |
---|
281 | |
---|
282 | @Override |
---|
283 | public Void visitQuery_sequence(Query_sequenceContext ctx) { |
---|
284 | if (logger.isTraceEnabled()) { |
---|
285 | logger.trace("visitQuery_sequence/enter: children={} / cnt={} / text={}", |
---|
286 | ctx.children, |
---|
287 | ctx.getChildCount(), |
---|
288 | ctx.getText()); |
---|
289 | } |
---|
290 | final int pos = stack.size(); |
---|
291 | super.visitQuery_sequence(ctx); |
---|
292 | if (stack.size() > pos) { |
---|
293 | List<QueryNode> items = new ArrayList<QueryNode>(); |
---|
294 | while (stack.size() > pos) { |
---|
295 | items.add(0, (QueryNode) stack.pop()); |
---|
296 | } |
---|
297 | stack.push(new QuerySequence(items)); |
---|
298 | } else { |
---|
299 | throw new ExpressionTreeBuilderException( |
---|
300 | "visitQuery_sequence is empty!"); |
---|
301 | } |
---|
302 | if (logger.isTraceEnabled()) { |
---|
303 | logger.trace("visitQuery_sequence/exit: stack={}", stack); |
---|
304 | } |
---|
305 | return null; |
---|
306 | } |
---|
307 | |
---|
308 | |
---|
309 | @Override |
---|
310 | public Void visitQuery_group(Query_groupContext ctx) { |
---|
311 | if (logger.isTraceEnabled()) { |
---|
312 | logger.trace("visitQuery_group/enter: children={} / cnt={} / text={}", |
---|
313 | ctx.children, |
---|
314 | ctx.getChildCount(), |
---|
315 | ctx.getText()); |
---|
316 | } |
---|
317 | super.visitQuery_group(ctx); |
---|
318 | |
---|
319 | /* |
---|
320 | * handle repetition (if any) |
---|
321 | */ |
---|
322 | int min = 1; |
---|
323 | int max = 1; |
---|
324 | |
---|
325 | // fetch *first* child of type QuantifierContext, therefore idx=0 |
---|
326 | final QuantifierContext q_ctx = |
---|
327 | ctx.getChild(QuantifierContext.class, 0); |
---|
328 | if (q_ctx != null) { |
---|
329 | int[] r = processRepetition(q_ctx); |
---|
330 | min = r[0]; |
---|
331 | max = r[1]; |
---|
332 | } |
---|
333 | |
---|
334 | QueryNode content = (QueryNode) stack.pop(); |
---|
335 | stack.push(new QueryGroup(content, min, max)); |
---|
336 | if (logger.isTraceEnabled()) { |
---|
337 | logger.trace("visitQuery_group/exit: stack={}", stack); |
---|
338 | } |
---|
339 | return null; |
---|
340 | } |
---|
341 | |
---|
342 | |
---|
343 | @Override |
---|
344 | public Void visitQuery_simple(Query_simpleContext ctx) { |
---|
345 | if (logger.isTraceEnabled()) { |
---|
346 | logger.trace("visitQuery_simple/enter: children={} / cnt={} / text={}", |
---|
347 | ctx.children, |
---|
348 | ctx.getChildCount(), |
---|
349 | ctx.getText()); |
---|
350 | } |
---|
351 | super.visitQuery_simple(ctx); |
---|
352 | |
---|
353 | /* |
---|
354 | * handle repetition (if any) |
---|
355 | */ |
---|
356 | int min = 1; |
---|
357 | int max = 1; |
---|
358 | |
---|
359 | // fetch *first* child of type QuantifierContext, therefore idx=0 |
---|
360 | final QuantifierContext q_ctx = |
---|
361 | ctx.getChild(QuantifierContext.class, 0); |
---|
362 | if (q_ctx != null) { |
---|
363 | int[] r = processRepetition(q_ctx); |
---|
364 | min = r[0]; |
---|
365 | max = r[1]; |
---|
366 | } |
---|
367 | |
---|
368 | QueryNode expression = (QueryNode) stack.pop(); |
---|
369 | stack.push(new QuerySegment(expression, min, max)); |
---|
370 | if (logger.isTraceEnabled()) { |
---|
371 | logger.trace("visitQuery_simple/exit: stack={}", stack); |
---|
372 | } |
---|
373 | return null; |
---|
374 | } |
---|
375 | |
---|
376 | |
---|
377 | @Override |
---|
378 | public Void visitQuery_implicit(Query_implicitContext ctx) { |
---|
379 | if (logger.isTraceEnabled()) { |
---|
380 | logger.trace("visitQuery_implicit/enter: children={} / cnt={} / text={}", |
---|
381 | ctx.children, |
---|
382 | ctx.getChildCount(), |
---|
383 | ctx.getText()); |
---|
384 | } |
---|
385 | stack.push(defaultOperator); |
---|
386 | stack.push(defaultIdentifier); |
---|
387 | stack.push(EMPTY_STRING); |
---|
388 | super.visitQuery_implicit(ctx); |
---|
389 | |
---|
390 | @SuppressWarnings("unchecked") |
---|
391 | Set<RegexFlag> regex_flags = (Set<RegexFlag>) stack.pop(); |
---|
392 | String regex_value = (String) stack.pop(); |
---|
393 | String qualifier = (String) stack.pop(); |
---|
394 | String identifier = (String) stack.pop(); |
---|
395 | Operator operator = (Operator) stack.pop(); |
---|
396 | |
---|
397 | Expression exp = new Expression(qualifier, identifier, operator, |
---|
398 | regex_value, regex_flags); |
---|
399 | stack.push(exp); |
---|
400 | if (logger.isTraceEnabled()) { |
---|
401 | logger.trace("visitQuery_implicit/exit: stack={}", stack); |
---|
402 | } |
---|
403 | return null; |
---|
404 | } |
---|
405 | |
---|
406 | |
---|
407 | @Override |
---|
408 | public Void visitQuery_segment(Query_segmentContext ctx) { |
---|
409 | if (logger.isTraceEnabled()) { |
---|
410 | logger.trace("visitQuery_segment/enter: children={} / cnt={} / text={}", |
---|
411 | ctx.children, |
---|
412 | ctx.getChildCount(), |
---|
413 | ctx.getText()); |
---|
414 | } |
---|
415 | |
---|
416 | /* |
---|
417 | * if the context contains only two children, they must be |
---|
418 | * '[' and ']' thus we are dealing with a wildcard segment |
---|
419 | */ |
---|
420 | if (ctx.getChildCount() == 2) { |
---|
421 | stack.push(new ExpressionWildcard()); |
---|
422 | } else { |
---|
423 | super.visitQuery_segment(ctx); |
---|
424 | } |
---|
425 | if (logger.isTraceEnabled()) { |
---|
426 | logger.trace("visitQuery_segment/exit: stack={}", stack); |
---|
427 | } |
---|
428 | return null; |
---|
429 | } |
---|
430 | |
---|
431 | |
---|
432 | @Override |
---|
433 | public Void visitExpression_basic(Expression_basicContext ctx) { |
---|
434 | if (logger.isTraceEnabled()) { |
---|
435 | logger.trace("visitExpression_basic/enter: children={} / cnt={} / text={}", |
---|
436 | ctx.children, |
---|
437 | ctx.getChildCount(), |
---|
438 | ctx.getText()); |
---|
439 | } |
---|
440 | final Token tok_op = ((TerminalNode) ctx.getChild(1)).getSymbol(); |
---|
441 | switch (tok_op.getType()) { |
---|
442 | case FCSLexer.OPERATOR_EQ: |
---|
443 | stack.push(Operator.EQUALS); |
---|
444 | break; |
---|
445 | case FCSLexer.OPERATOR_NE: |
---|
446 | stack.push(Operator.NOT_EQUALS); |
---|
447 | break; |
---|
448 | default: |
---|
449 | throw new ExpressionTreeBuilderException( |
---|
450 | "invalid operator type: " + tok_op.getText()); |
---|
451 | } |
---|
452 | super.visitExpression_basic(ctx); |
---|
453 | |
---|
454 | @SuppressWarnings("unchecked") |
---|
455 | Set<RegexFlag> regex_flags = (Set<RegexFlag>) stack.pop(); |
---|
456 | String regex_value = (String) stack.pop(); |
---|
457 | String qualifier = (String) stack.pop(); |
---|
458 | String identifer = (String) stack.pop(); |
---|
459 | Operator operator = (Operator) stack.pop(); |
---|
460 | |
---|
461 | Expression exp = new Expression(qualifier, identifer, operator, |
---|
462 | regex_value, regex_flags); |
---|
463 | stack.push(exp); |
---|
464 | if (logger.isTraceEnabled()) { |
---|
465 | logger.trace("visitExpression_basic/exit: stack={}", stack); |
---|
466 | } |
---|
467 | return null; |
---|
468 | } |
---|
469 | |
---|
470 | |
---|
471 | @Override |
---|
472 | public Void visitExpression_not(Expression_notContext ctx) { |
---|
473 | if (logger.isTraceEnabled()) { |
---|
474 | logger.trace("visitExpression_not/enter: children={} / cnt={} / text={}", |
---|
475 | ctx.children, |
---|
476 | ctx.getChildCount(), |
---|
477 | ctx.getText()); |
---|
478 | } |
---|
479 | super.visitExpression_not(ctx); |
---|
480 | |
---|
481 | QueryNode expression = (QueryNode) stack.pop(); |
---|
482 | stack.push(new ExpressionNot(expression)); |
---|
483 | if (logger.isTraceEnabled()) { |
---|
484 | logger.trace("visitExpression_not/exit: stack={}", stack); |
---|
485 | } |
---|
486 | return null; |
---|
487 | } |
---|
488 | |
---|
489 | |
---|
490 | @Override |
---|
491 | public Void visitExpression_group(Expression_groupContext ctx) { |
---|
492 | if (logger.isTraceEnabled()) { |
---|
493 | logger.trace("visitExpression_group/enter: children={} / cnt={} / text={}", |
---|
494 | ctx.children, |
---|
495 | ctx.getChildCount(), |
---|
496 | ctx.getText()); |
---|
497 | } |
---|
498 | super.visitExpression_group(ctx); |
---|
499 | |
---|
500 | QueryNode expression = (QueryNode) stack.pop(); |
---|
501 | stack.push( new ExpressionGroup(expression)); |
---|
502 | if (logger.isTraceEnabled()) { |
---|
503 | logger.trace("visitExpression_group/exit: stack={}", stack); |
---|
504 | } |
---|
505 | return null; |
---|
506 | } |
---|
507 | |
---|
508 | |
---|
509 | @Override |
---|
510 | public Void visitExpression_or(Expression_orContext ctx) { |
---|
511 | if (logger.isTraceEnabled()) { |
---|
512 | logger.trace("visitExpression_or/enter: children={} / cnt={} / text={}", |
---|
513 | ctx.children, |
---|
514 | ctx.getChildCount(), |
---|
515 | ctx.getText()); |
---|
516 | } |
---|
517 | final int pos = stack.size(); |
---|
518 | super.visitExpression_or(ctx); |
---|
519 | if (stack.size() > pos) { |
---|
520 | final List<QueryNode> children = new ArrayList<QueryNode>(); |
---|
521 | while (stack.size() > pos) { |
---|
522 | children.add(0, (QueryNode) stack.pop()); |
---|
523 | } |
---|
524 | stack.push( new ExpressionOr(children)); |
---|
525 | } else { |
---|
526 | throw new ExpressionTreeBuilderException( |
---|
527 | "visitExpression_or is empty!"); |
---|
528 | } |
---|
529 | if (logger.isTraceEnabled()) { |
---|
530 | logger.trace("visitExpression_or/exit: stack={}", stack); |
---|
531 | } |
---|
532 | return null; |
---|
533 | } |
---|
534 | |
---|
535 | |
---|
536 | @Override |
---|
537 | public Void visitExpression_and(Expression_andContext ctx) { |
---|
538 | if (logger.isTraceEnabled()) { |
---|
539 | logger.trace("visitExpression_and/enter: children={} / cnt={} / text={}", |
---|
540 | ctx.children, |
---|
541 | ctx.getChildCount(), |
---|
542 | ctx.getText()); |
---|
543 | } |
---|
544 | final int pos = stack.size(); |
---|
545 | super.visitExpression_and(ctx); |
---|
546 | if (stack.size() > pos) { |
---|
547 | final List<QueryNode> children = new ArrayList<QueryNode>(); |
---|
548 | while (stack.size() > pos) { |
---|
549 | children.add(0, (QueryNode) stack.pop()); |
---|
550 | } |
---|
551 | stack.push(new ExpressionAnd(children)); |
---|
552 | } else { |
---|
553 | throw new ExpressionTreeBuilderException( |
---|
554 | "visitExpression_and is empty!"); |
---|
555 | } |
---|
556 | if (logger.isTraceEnabled()) { |
---|
557 | logger.trace("visitExpression_and/exit: stack={}", stack); |
---|
558 | } |
---|
559 | return null; |
---|
560 | } |
---|
561 | |
---|
562 | |
---|
563 | @Override |
---|
564 | public Void visitAttribute(AttributeContext ctx) { |
---|
565 | if (logger.isTraceEnabled()) { |
---|
566 | logger.trace("visitAttribute/enter: children={} / cnt={} / text={}", |
---|
567 | ctx.children, |
---|
568 | ctx.getChildCount(), |
---|
569 | ctx.getText()); |
---|
570 | } |
---|
571 | |
---|
572 | // handle optional qualifier |
---|
573 | QualifierContext q_ctx = ctx.getChild(QualifierContext.class, 0); |
---|
574 | String qualifier = (q_ctx != null) ? q_ctx.getText() : EMPTY_STRING; |
---|
575 | |
---|
576 | stack.push(ctx.getChild(IdentifierContext.class, 0).getText()); |
---|
577 | stack.push(qualifier); |
---|
578 | if (logger.isTraceEnabled()) { |
---|
579 | logger.trace("visitAttribute/exit: stack={}", stack); |
---|
580 | } |
---|
581 | return null; |
---|
582 | } |
---|
583 | |
---|
584 | |
---|
585 | @Override |
---|
586 | public Void visitRegexp(RegexpContext ctx) { |
---|
587 | if (logger.isTraceEnabled()) { |
---|
588 | logger.trace("visitRegexp/enter: children={} / cnt={} / text={}", |
---|
589 | ctx.children, |
---|
590 | ctx.getChildCount(), |
---|
591 | ctx.getText()); |
---|
592 | } |
---|
593 | |
---|
594 | final Regexp_patternContext p_ctx = |
---|
595 | ctx.getChild(Regexp_patternContext.class, 0); |
---|
596 | String regex = stripQuotes(p_ctx.getText()); |
---|
597 | |
---|
598 | /* process escape sequences, if present */ |
---|
599 | if (regex.indexOf('\\') != -1) { |
---|
600 | regex = unescapeString(regex, buf); |
---|
601 | } |
---|
602 | |
---|
603 | /* perform unicode normalization, if requested */ |
---|
604 | if (unicodeNormalizationForm != null) { |
---|
605 | regex = Normalizer.normalize(regex, unicodeNormalizationForm); |
---|
606 | } |
---|
607 | |
---|
608 | // FIXME: validate regex? |
---|
609 | stack.push(regex); |
---|
610 | |
---|
611 | // handle regex flags, if any |
---|
612 | final Regexp_flagContext f_ctx = |
---|
613 | ctx.getChild(Regexp_flagContext.class, 0); |
---|
614 | if (f_ctx != null) { |
---|
615 | final String s = f_ctx.getText(); |
---|
616 | |
---|
617 | Set<RegexFlag> flags = new HashSet<RegexFlag>(); |
---|
618 | for (int i = 0; i < s.length(); i++) { |
---|
619 | switch (s.charAt(i)) { |
---|
620 | case 'i': |
---|
621 | /* $FALL-THROUGH$ */ |
---|
622 | case 'c': |
---|
623 | flags.add(RegexFlag.CASE_INSENSITIVE); |
---|
624 | break; |
---|
625 | case 'I': |
---|
626 | /* $FALL-THROUGH$ */ |
---|
627 | case 'C': |
---|
628 | flags.add(RegexFlag.CASE_SENSITIVE); |
---|
629 | break; |
---|
630 | case 'l': |
---|
631 | flags.add(RegexFlag.LITERAL_MATCHING); |
---|
632 | break; |
---|
633 | case 'd': |
---|
634 | flags.add(RegexFlag.IGNORE_DIACRITICS); |
---|
635 | break; |
---|
636 | default: |
---|
637 | throw new ExpressionTreeBuilderException( |
---|
638 | "unknown regex modifier flag: " + s.charAt(i)); |
---|
639 | } // switch |
---|
640 | } |
---|
641 | |
---|
642 | // validate regex flags |
---|
643 | if (flags.contains(RegexFlag.CASE_SENSITIVE) && |
---|
644 | flags.contains(RegexFlag.CASE_INSENSITIVE)) { |
---|
645 | throw new ExpressionTreeBuilderException( |
---|
646 | "invalid combination of regex modifier flags: " + |
---|
647 | "'i' or 'c' and 'I' or 'C' are mutually exclusive"); |
---|
648 | } |
---|
649 | if (flags.contains(RegexFlag.LITERAL_MATCHING) && |
---|
650 | (flags.contains(RegexFlag.CASE_SENSITIVE) || |
---|
651 | flags.contains(RegexFlag.CASE_INSENSITIVE) || |
---|
652 | flags.contains(RegexFlag.IGNORE_DIACRITICS))) { |
---|
653 | throw new ExpressionTreeBuilderException( |
---|
654 | "invalid combination of regex modifier flags: 'l' " + |
---|
655 | "is mutually exclusive with 'i', 'c', 'I', 'C' or 'd'"); |
---|
656 | } |
---|
657 | |
---|
658 | stack.push(flags); |
---|
659 | } else { |
---|
660 | // regex without flags, so push 'empty' flags on stack |
---|
661 | stack.push(Collections.emptySet()); |
---|
662 | } |
---|
663 | if (logger.isTraceEnabled()) { |
---|
664 | logger.trace("visitRegexp/exit: stack={}", stack); |
---|
665 | } |
---|
666 | return null; |
---|
667 | } |
---|
668 | |
---|
669 | |
---|
670 | @Override |
---|
671 | public Void visitWithin_part_simple(Within_part_simpleContext ctx) { |
---|
672 | if (logger.isTraceEnabled()) { |
---|
673 | logger.trace("Within_part_simpleContext/enter: children={} / cnt={} / text={}", |
---|
674 | ctx.children, |
---|
675 | ctx.getChildCount(), |
---|
676 | ctx.getText()); |
---|
677 | } |
---|
678 | |
---|
679 | SimpleWithin.Scope scope = null; |
---|
680 | String s = ctx.getChild(0).getText(); |
---|
681 | if ("sentence".equals(s) || "s".equals(s)) { |
---|
682 | scope = SimpleWithin.Scope.SENTENCE; |
---|
683 | } else if ("utterance".equals(s) || "u".equals(s)) { |
---|
684 | scope = SimpleWithin.Scope.UTTERANCE; |
---|
685 | } else if ("paragraph".equals(s) || "p".equals(s)) { |
---|
686 | scope = SimpleWithin.Scope.PARAGRAPH; |
---|
687 | } else if ("turn".equals(s)|| "t".equals(s)) { |
---|
688 | scope = SimpleWithin.Scope.TURN; |
---|
689 | } else if ("text".equals(s)) { |
---|
690 | scope = SimpleWithin.Scope.TEXT; |
---|
691 | } else if ("session".equals(s)) { |
---|
692 | scope = SimpleWithin.Scope.SESSION; |
---|
693 | } else { |
---|
694 | throw new ExpressionTreeBuilderException( |
---|
695 | "invalid scope for simple 'within' clause: " + s); |
---|
696 | } |
---|
697 | stack.push(new SimpleWithin(scope)); |
---|
698 | if (logger.isTraceEnabled()) { |
---|
699 | logger.trace("Within_part_simpleContext/exit: stack={}", stack); |
---|
700 | } |
---|
701 | return null; |
---|
702 | } |
---|
703 | } |
---|
704 | |
---|
705 | |
---|
706 | private static int[] processRepetition(QuantifierContext ctx) { |
---|
707 | final Token tok = |
---|
708 | ctx.getChild(TerminalNode.class, 0).getSymbol(); |
---|
709 | switch (tok.getType()) { |
---|
710 | case FCSParser.Q_ZERO_OR_MORE: /* "*" */ |
---|
711 | return REP_ZERO_OR_MORE; |
---|
712 | case FCSParser.Q_ONE_OR_MORE: /* "+" */ |
---|
713 | return REP_ONE_OR_MORE; |
---|
714 | case FCSParser.Q_ZERO_OR_ONE: /* "?" */ |
---|
715 | return REP_ZERO_OR_ONE; |
---|
716 | case FCSParser.L_CURLY_BRACKET: /* "{x, y}" variants */ |
---|
717 | return processRepetition2(ctx); |
---|
718 | default: |
---|
719 | throw new ExpressionTreeBuilderException( |
---|
720 | "unexpected symbol in repetition quantifier: " + |
---|
721 | tok.getText()); |
---|
722 | } // switch |
---|
723 | } |
---|
724 | |
---|
725 | |
---|
726 | private static int[] processRepetition2(QuantifierContext ctx) { |
---|
727 | int commaIdx = getChildIndex(ctx, FCSParser.Q_COMMA, 0); |
---|
728 | int int1Idx = getChildIndex(ctx, FCSParser.INTEGER, 0); |
---|
729 | int int2Idx = getChildIndex(ctx, FCSParser.INTEGER, int1Idx + 1); |
---|
730 | int min = 0; |
---|
731 | int max = QueryNode.OCCURS_UNBOUNDED; |
---|
732 | if (commaIdx != -1) { |
---|
733 | if (int1Idx < commaIdx) { |
---|
734 | min = parseInteger(ctx.getChild(int1Idx).getText()); |
---|
735 | } |
---|
736 | if (commaIdx < int1Idx) { |
---|
737 | max = parseInteger(ctx.getChild(int1Idx).getText()); |
---|
738 | } else if (commaIdx < int2Idx) { |
---|
739 | max = parseInteger(ctx.getChild(int2Idx).getText()); |
---|
740 | } |
---|
741 | } else { |
---|
742 | if (int1Idx == -1) { |
---|
743 | throw new ExpressionTreeBuilderException("int1Idx == -1"); |
---|
744 | } |
---|
745 | min = parseInteger(ctx.getChild(int1Idx).getText()); |
---|
746 | max = min; |
---|
747 | } |
---|
748 | if ((max != QueryNode.OCCURS_UNBOUNDED) && (min > max)) { |
---|
749 | throw new ExpressionTreeBuilderException( |
---|
750 | "bad qualifier: min > max (" + min + " > " + max + ")"); |
---|
751 | } |
---|
752 | return new int[] { min, max }; |
---|
753 | } |
---|
754 | |
---|
755 | |
---|
756 | private static int getChildIndex(ParserRuleContext ctx, |
---|
757 | int tokenType, int start) { |
---|
758 | if ((start >= 0) && (start < ctx.getChildCount())) { |
---|
759 | for (int i = start; i < ctx.getChildCount(); i++) { |
---|
760 | final ParseTree o = ctx.getChild(i); |
---|
761 | if (o instanceof TerminalNode) { |
---|
762 | if (((TerminalNode) o).getSymbol().getType() == tokenType) { |
---|
763 | return i; |
---|
764 | } |
---|
765 | } |
---|
766 | } |
---|
767 | } |
---|
768 | return -1; |
---|
769 | } |
---|
770 | |
---|
771 | |
---|
772 | private static int parseInteger(String s) { |
---|
773 | try { |
---|
774 | return Integer.parseInt(s); |
---|
775 | } catch (NumberFormatException e) { |
---|
776 | throw new ExpressionTreeBuilderException( |
---|
777 | "invalid integer: " + s, e); |
---|
778 | } |
---|
779 | } |
---|
780 | |
---|
781 | |
---|
782 | private static String stripQuotes(String s) { |
---|
783 | if (s.startsWith("\"")) { |
---|
784 | if (s.endsWith("\"")) { |
---|
785 | s = s.substring(1, s.length() - 1); |
---|
786 | } else { |
---|
787 | throw new ExpressionTreeBuilderException( |
---|
788 | "value not properly quoted; invalid closing quote"); |
---|
789 | } |
---|
790 | } else if (s.startsWith("'")) { |
---|
791 | if (s.endsWith("'")) { |
---|
792 | s = s.substring(1, s.length() - 1); |
---|
793 | } else { |
---|
794 | throw new ExpressionTreeBuilderException( |
---|
795 | "value not properly quoted; invalid closing quote"); |
---|
796 | } |
---|
797 | } else { |
---|
798 | throw new ExpressionTreeBuilderException( |
---|
799 | "value not properly quoted; expected \" (double quote) " + |
---|
800 | "or ' (single qoute) character"); |
---|
801 | } |
---|
802 | return s; |
---|
803 | } |
---|
804 | |
---|
805 | |
---|
806 | private static String unescapeString(String s, char[] buf) { |
---|
807 | final StringBuilder sb = new StringBuilder(); |
---|
808 | for (int i = 0; i < s.length(); i++) { |
---|
809 | int cp = s.codePointAt(i); |
---|
810 | if (cp == '\\') { |
---|
811 | i++; // skip slash |
---|
812 | cp = s.codePointAt(i); |
---|
813 | |
---|
814 | switch (cp) { |
---|
815 | case '\\': /* slash */ |
---|
816 | sb.append("\\"); |
---|
817 | break; |
---|
818 | case '"': /* double quote */ |
---|
819 | sb.append("\""); |
---|
820 | break; |
---|
821 | case '\'': /* single quote */ |
---|
822 | sb.append("'"); |
---|
823 | break; |
---|
824 | case 'n': /* new line */ |
---|
825 | sb.append("\n"); |
---|
826 | break; |
---|
827 | case 't': /* tabulator */ |
---|
828 | sb.append("\t"); |
---|
829 | break; |
---|
830 | case '.': /* regex: dot */ |
---|
831 | sb.append("\\."); |
---|
832 | break; |
---|
833 | case '^': /* regex: caret */ |
---|
834 | sb.append("\\^"); |
---|
835 | break; |
---|
836 | case '$': /* regex: dollar */ |
---|
837 | sb.append("\\$"); |
---|
838 | break; |
---|
839 | case '*': /* regex: asterisk */ |
---|
840 | sb.append("\\*"); |
---|
841 | break; |
---|
842 | case '+': /* regex: plus */ |
---|
843 | sb.append("\\+"); |
---|
844 | break; |
---|
845 | case '?': /* regex: question mark */ |
---|
846 | sb.append("\\?"); |
---|
847 | break; |
---|
848 | case '(': /* regex: opening parenthesis */ |
---|
849 | sb.append("\\("); |
---|
850 | break; |
---|
851 | case ')': /* regex: closing parenthesis */ |
---|
852 | sb.append("\\)"); |
---|
853 | break; |
---|
854 | case '{': /* regex: opening curly brace */ |
---|
855 | sb.append("\\{"); |
---|
856 | break; |
---|
857 | case '[': /* regex: opening square bracket */ |
---|
858 | sb.append("\\["); |
---|
859 | break; |
---|
860 | case '|': /* regex: vertical bar */ |
---|
861 | sb.append("\\|"); |
---|
862 | break; |
---|
863 | case 'x': /* x HEX HEX */ |
---|
864 | i = unescapeUnicode(s, i, 2, sb, buf); |
---|
865 | break; |
---|
866 | case 'u': /* u HEX HEX HEX HEX */ |
---|
867 | i = unescapeUnicode(s, i, 4, sb, buf); |
---|
868 | break; |
---|
869 | case 'U': /* U HEX HEX HEX HEX HEX HEX HEX HEX */ |
---|
870 | i = unescapeUnicode(s, i, 8, sb, buf); |
---|
871 | break; |
---|
872 | default: |
---|
873 | throw new ExpressionTreeBuilderException( |
---|
874 | "invalid escape sequence: \\" + |
---|
875 | new String(Character.toChars(cp))); |
---|
876 | } |
---|
877 | } else { |
---|
878 | try { |
---|
879 | final int len = Character.toChars(cp, buf, 0); |
---|
880 | sb.append(buf, 0, len); |
---|
881 | } catch (IllegalArgumentException e) { |
---|
882 | throw new ExpressionTreeBuilderException( |
---|
883 | "invalid codepoint: U+" + |
---|
884 | Integer.toHexString(cp).toUpperCase()); |
---|
885 | } |
---|
886 | } |
---|
887 | } |
---|
888 | return sb.toString(); |
---|
889 | } |
---|
890 | |
---|
891 | |
---|
892 | private static final int unescapeUnicode(String s, int i, int size, |
---|
893 | StringBuilder sb, char[] buf) { |
---|
894 | if ((s.length() - i - 1) >= size) { |
---|
895 | int cp = 0; |
---|
896 | for (int j = 0; j < size; j++) { |
---|
897 | i++; |
---|
898 | if (j > 0) { |
---|
899 | cp = cp << 4; |
---|
900 | } |
---|
901 | cp |= parseHexString(s.codePointAt(i)); |
---|
902 | } |
---|
903 | try { |
---|
904 | final int len = Character.toChars(cp, buf, 0); |
---|
905 | sb.append(buf, 0, len); |
---|
906 | } catch (IllegalArgumentException e) { |
---|
907 | throw new ExpressionTreeBuilderException( |
---|
908 | "invalid codepoint: U+" + |
---|
909 | Integer.toHexString(cp).toUpperCase()); |
---|
910 | } |
---|
911 | return i; |
---|
912 | } else { |
---|
913 | throw new ExpressionTreeBuilderException( |
---|
914 | "truncated escape sequence: \\" + s.substring(i)); |
---|
915 | } |
---|
916 | } |
---|
917 | |
---|
918 | |
---|
919 | private static final int parseHexString(int c) { |
---|
920 | switch (c) { |
---|
921 | case '0': |
---|
922 | return 0; |
---|
923 | case '1': |
---|
924 | return 1; |
---|
925 | case '2': |
---|
926 | return 2; |
---|
927 | case '3': |
---|
928 | return 3; |
---|
929 | case '4': |
---|
930 | return 4; |
---|
931 | case '5': |
---|
932 | return 5; |
---|
933 | case '6': |
---|
934 | return 6; |
---|
935 | case '7': |
---|
936 | return 7; |
---|
937 | case '8': |
---|
938 | return 8; |
---|
939 | case '9': |
---|
940 | return 9; |
---|
941 | case 'a': |
---|
942 | /* FALL-THROUGH */ |
---|
943 | case 'A': |
---|
944 | return 10; |
---|
945 | case 'b': |
---|
946 | /* FALL-THROUGH */ |
---|
947 | case 'B': |
---|
948 | return 11; |
---|
949 | case 'c': |
---|
950 | /* FALL-THROUGH */ |
---|
951 | case 'C': |
---|
952 | return 12; |
---|
953 | case 'd': |
---|
954 | /* FALL-THROUGH */ |
---|
955 | case 'D': |
---|
956 | return 13; |
---|
957 | case 'e': |
---|
958 | /* FALL-THROUGH */ |
---|
959 | case 'E': |
---|
960 | return 14; |
---|
961 | case 'f': |
---|
962 | /* FALL-THROUGH */ |
---|
963 | case 'F': |
---|
964 | return 15; |
---|
965 | default: |
---|
966 | /* |
---|
967 | * actually, this should never happen, as ANTLR's lexer should catch |
---|
968 | * illegal HEX characters |
---|
969 | */ |
---|
970 | throw new ExpressionTreeBuilderException( |
---|
971 | "invalud hex character: " + |
---|
972 | new String(Character.toChars(c))); |
---|
973 | } |
---|
974 | } |
---|
975 | |
---|
976 | |
---|
977 | private static final class ErrorListener extends BaseErrorListener { |
---|
978 | private final String query; |
---|
979 | private List<String> errors = null; |
---|
980 | |
---|
981 | |
---|
982 | private ErrorListener(String query) { |
---|
983 | this.query = query; |
---|
984 | } |
---|
985 | |
---|
986 | |
---|
987 | @Override |
---|
988 | public void syntaxError(Recognizer<?, ?> recognizer, |
---|
989 | Object offendingSymbol, int line, int charPositionInLine, |
---|
990 | String msg, RecognitionException e) { |
---|
991 | if (errors == null) { |
---|
992 | errors = new ArrayList<String>(); |
---|
993 | } |
---|
994 | |
---|
995 | /* |
---|
996 | * FIXME: additional information of error should not be logged |
---|
997 | * but added to the list of errors; that list probably needs |
---|
998 | * to be enhanced to store supplementary information |
---|
999 | * Furthermore, a sophisticated errorlist implementation could |
---|
1000 | * also be used by the QueryVistor to add addition query error |
---|
1001 | * information |
---|
1002 | */ |
---|
1003 | if (logger.isDebugEnabled()) { |
---|
1004 | if (offendingSymbol instanceof Token) { |
---|
1005 | final Token t = (Token) offendingSymbol; |
---|
1006 | int pos = t.getStartIndex(); |
---|
1007 | if (pos != -1) { |
---|
1008 | StringBuilder x = new StringBuilder(); |
---|
1009 | while (pos-- > 0) { |
---|
1010 | x.append(" "); |
---|
1011 | } |
---|
1012 | x.append("^- ").append(msg); |
---|
1013 | logger.debug("query: {}", query); |
---|
1014 | logger.debug(" {}", x.toString()); |
---|
1015 | } |
---|
1016 | } |
---|
1017 | } |
---|
1018 | |
---|
1019 | errors.add(msg); |
---|
1020 | } |
---|
1021 | |
---|
1022 | |
---|
1023 | public boolean hasErrors() { |
---|
1024 | return (errors != null) && !errors.isEmpty(); |
---|
1025 | } |
---|
1026 | |
---|
1027 | |
---|
1028 | public List<String> getErrors() { |
---|
1029 | if (errors != null) { |
---|
1030 | return errors; |
---|
1031 | } else { |
---|
1032 | return Collections.emptyList(); |
---|
1033 | } |
---|
1034 | } |
---|
1035 | } |
---|
1036 | |
---|
1037 | |
---|
1038 | @SuppressWarnings("serial") |
---|
1039 | private static final class ExpressionTreeBuilderException |
---|
1040 | extends RuntimeException { |
---|
1041 | private ExpressionTreeBuilderException(String message, |
---|
1042 | Throwable cause) { |
---|
1043 | super(message, cause); |
---|
1044 | } |
---|
1045 | |
---|
1046 | |
---|
1047 | private ExpressionTreeBuilderException(String message) { |
---|
1048 | this(message, null); |
---|
1049 | } |
---|
1050 | } |
---|
1051 | |
---|
1052 | } // class QueryParser |
---|