Taskforces/FCS/FCS-QL: FCS_QL_2.g4

File FCS_QL_2.g4, 3.7 KB (added by peter.beinema@mpi.nl, 9 years ago)

antlr (version 4.5) grammar for FCS-QL query

Line 
// 2015-08-03 PFB sample
// Define a grammar for Federated Content Search Query language (cf. https://trac.clarin.eu/wiki/Taskforces/FCS/FCS-QL)
//
// 0. Install Java (version 1.6 or higher)
//
// 1. Download antlr version 4.5:
//      $ cd /usr/local/lib
//      $ curl -O http://www.antlr.org/download/antlr-4.5-complete.jar
//    Or just download in browser from website:
//      http://www.antlr.org/download.html
//      and put it somewhere rational like /usr/local/lib.
//
// 2. Add antlr-4.5-complete.jar to your CLASSPATH:
//      $ export CLASSPATH=".:/usr/local/lib/antlr-4.5-complete.jar:$CLASSPATH"
//      It's also a good idea to put this in your .bash_profile or whatever your startup script is.
//
// 3. Create aliases for the ANTLR Tool, and TestRig.
//      $ alias antlr4='java -Xmx500M -cp "/usr/local/lib/antlr-4.5-complete.jar:$CLASSPATH" org.antlr.v4.Tool'
//      $ alias grun='java org.antlr.v4.runtime.misc.TestRig'
//
// 4. Compile your grammar (the grammar name FCS_QL_2 must match the file name FCS_QL_2.g4):
//      antlr4 FCS_QL_2.g4
//      javac FCS_QL_2*.java
//
// 5. Run your parser:  // -gui => interactive tree diagram output; -tree => bracketed textual output
//      grun FCS_QL_2 query -gui < test_00004.in
//
// TO DO: handle full unicode (rules [27] [29] [30])
// TO DO: test alternatives
30
// Combined lexer/parser grammar; the name matches the file name FCS_QL_2.g4.
grammar FCS_QL_2;
32
//--- [1]
// Start rule (the one passed to grun): a main query, optionally followed
// by a 'within' restriction.
// The explicit EOF forces the parser to consume the whole input; without
// it the parser may stop after the longest prefix that forms a query and
// leave trailing, unparseable text unreported.
query :
      main_query within_part? EOF
    ;
37
//--- [2v2] no left recursion or ambiguity
// A simple_query, optionally quantified, optionally continued by either
//   '|' main_query   ('or'), or
//   main_query       (sequence).
// The quantifier is accepted directly after the simple_query it applies
// to, so quantified items may also appear in the middle of a sequence or
// on the left of '|' (e.g.  [..]+ "x"   or   "a"* | "b"), not only as the
// very last element. Every input accepted by the former four-alternative
// form is still accepted and yields the same child sequence.
// The continuation is right-recursive, so chains nest to the right.
main_query :
      simple_query quantifier?
      (
        '|' main_query                  // 'or'
      | main_query                      // sequence
      )?
    ;
45
//--- [3v2]
// Smallest query unit: a parenthesised main_query, an implicit query
// (a bare flagged regexp), or a bracketed segment query.
simple_query
    : '(' main_query ')'
    | implicit_query
    | segment_query
    ;
52
//--- [4]
// A query written as just a (possibly flagged) regular expression,
// without surrounding brackets.
implicit_query : flagged_regexp ;
57
//--- [5v2] expression is no longer optional
// A bracketed expression; empty brackets '[' ']' are not accepted by
// this rule.
segment_query : '[' expression ']' ;
62
//--- [6]
// Optional trailing part of a query; currently only the simple form exists.
within_part : simple_within_part ;
67
//--- [7]
// The keyword 'within' followed by one of the scope names of rule [8].
simple_within_part : 'within' simple_within_scope ;
72
//--- [8]
// Scope names allowed after 'within'. Most scopes have a long and a
// one-letter form; 'text' and 'session' have only the long form.
// (The paragraph literal was previously misspelled 'pararaph', so the
// word "paragraph" itself was not recognised as a scope.)
simple_within_scope :
      'sentence'
    | 's'
    | 'utterance'
    | 'u'
    | 'paragraph'
    | 'p'
    | 'turn'
    | 't'
    | 'text'
    | 'session'
    ;
86
87//--- [9]..[10]
88// not present in spec
89
//--- [11v2]
// One basic_expression, optionally combined with a further expression by
// '|' (or) or '&' (and). The recursion is on the right-hand operand, so
// chains nest to the right and both operators share one precedence level.
expression
    : basic_expression
      ( '|' expression      // or
      | '&' expression      // and
      )?
    ;
96
//--- [12v2]
// Operand of an expression:
//   - a parenthesised expression (grouping),
//   - a negation; note that '!' is followed by a full expression, so it
//     applies to everything up to the end of the enclosing expression,
//   - a single test: attribute, operator, flagged regexp.
basic_expression
    : '(' expression ')'
    | '!' expression
    | attribute operator flagged_regexp
    ;
103
//--- [13]
// Comparison operator of an attribute test:
// ISOP ('=', equals) or ISNTOP ('!=', not equals).
operator
    : ISOP
    | ISNTOP
    ;
109
//--- 13A: terminal, split off from 13
// Token for the 'equals' operator.
ISOP : '=' ;
114
//--- 13B: terminal, split off from 13
// Token for the 'not equals' operator.
ISNTOP : '!=' ;
119
//--- [14]
// Repetition suffix for a query.
// The three brace forms with a comma are written so that each input
// matches exactly one alternative: the former pair
//   '{' integer? ',' integer '}'  and  '{' integer ',' integer? '}'
// both matched '{m,n}', which made the rule ambiguous. The accepted
// language and the resulting child sequence are unchanged ('{,}' is
// still rejected).
quantifier :
      '+'                               // one or more
    | '*'                               // zero or more
    | '?'                               // zero or one
    | '{' integer '}'                   // exactly 'integer' times
    | '{' integer ',' integer? '}'      // min - optional max
    | '{' ',' integer '}'               // max only (no min given)
    ;
129
//--- [15]
// A regexp, optionally followed by '/' and one or more REGEXP_FLAG tokens.
// Whitespace is skipped by the lexer, so the flag tokens may be written
// separated by blanks.
flagged_regexp
    : regexp ( '/' REGEXP_FLAG+ )?
    ;
135
//--- [16v2]
// A run of one or more flag letters, each one of: C I c d l i.
// This lexer rule is defined before ATTRIBUTE, so a word made up only of
// these letters is tokenized as REGEXP_FLAG (equal-length matches go to
// the rule that is defined first).
REGEXP_FLAG
    : ( 'C' | 'I' | 'c' | 'd' | 'l' | 'i' )+
    ;
140
//--- [17]
// A regular expression is written as a quoted string; its contents are
// not analysed further by this grammar.
regexp : quoted_string ;
145
//--- [18]
// Left-hand side of an attribute test: either a plain name or a
// qualifier-prefixed name (name ':' name).
attribute
    : simple_attribute
    | qualified_attribute
    ;
151
//--- [19]
// A plain attribute name.
// Besides ATTRIBUTE, a reserved_name is accepted: the lexer never emits
// ATTRIBUTE for words that also match REGEXP_FLAG (defined earlier, e.g.
// "id", "c") or a keyword literal used in a parser rule (e.g. "text",
// "s"), so without this alternative such attribute names are rejected.
simple_attribute :
      ATTRIBUTE
    | reserved_name
    ;

// Identifier token: a letter followed by letters, digits or underscores.
ATTRIBUTE :
      [A-Za-z][A-Za-z0-9_]*
    ;

//--- [20]
// Qualified attribute name: qualifier ':' name. Either part may also be a
// reserved_name, for the same reason as in rule [19].
qualified_attribute :
      ( ATTRIBUTE | reserved_name ) ':' ( ATTRIBUTE | reserved_name )
    ;

//--- helper for [19] and [20] (not in spec)
// Words that are valid identifiers but reach the parser as another token
// type: flag-letter runs and the scope keywords of rule [8].
// NOTE(review): the keywords 'within' and 'qstr' are likewise not
// available as attribute names; add them here if that is required.
reserved_name :
      REGEXP_FLAG
    | simple_within_scope
    ;
164
165//--- [21]
166// identifier : replaced by ATTRIBUTE
167
168//--- [22]
169// identifier_char : replaced by ATTRIBUTE
170
171//--- [23]
172// not available in spec
173
//--- [24]
// An unsigned integer, i.e. a single DIGITS token (used by quantifier).
integer
    : DIGITS
    ;
177
178//--- [25]
179// not available in spec
180
//--- [26]
// A quoted string is either the literal word qstr or a STRING_LITERAL.
// NOTE(review): 'qstr' looks like a leftover test placeholder - confirm
// whether it is still needed.
quoted_string
    : 'qstr'
    | STRING_LITERAL
    ;
186
//--- [26A] adapted to something antlr will process correctly
// Double-quoted string on a single line. Inside the quotes a backslash
// may only be followed by '"' or another backslash.
STRING_LITERAL
    : '"'
      ( ~( '"' | '\\' | '\r' | '\n' )   // any character except quote, backslash, CR, LF
      | '\\' ( '"' | '\\' )             // escaped quote or escaped backslash
      )*
      '"'
    ;
189
// One or more decimal digits.
DIGITS : ( '0' .. '9' )+ ;
192
//--- [28]
// Whitespace (blank, tab, newline, carriage return) separates tokens and
// is discarded by the lexer.
WS
    : ( ' ' | '\t' | '\n' | '\r' )+ -> skip
    ;