Context Navigation

FCS-Specification-Draft

Timestamp:: 11/03/15 09:02:53 (9 years ago)
Author:: Leif-Jöran
Comment:: FCS-QL EBNF added to normative appendix

Legend:

: Unmodified
: Added
: Removed
: Modified

Taskforces/FCS/FCS-Specification-Draft

-                      v35
+                      v36
 || `http://clarin.eu/fcs/diagnostic/14` || General processing hint. || E.g. "No matches, because layer 'XY' is not available in your selection of resources" || non-fatal || Endpoints `MUST` use this diagnostic only if the Client performed an Advanced Search request. ||
+== CLARIN FCS-QL Grammar Specification #fcsQLEBNF
+The version of the CLARIN FCS-QL is tied to the FCS Core version starting with version 2.0.
+=== FCS-QL EBNF ===
+{{{#!comment
+  Please keep the EBNF nicely formatted. Thanks!
+}}}
+{{{
+ [1] query                ::= main-query within-part?
+ [2.11] main-query        ::= simple-query
+                            | simple-query "|" main-query     /* or */
+                            | simple-query main-query         /* sequence */
+                            | simple-query quantifier         /* quantification */
+ [3.11] simple-query      ::= '(' main-query ')'              /* grouping */
+                            | implicit-query
+                            | segment-query
+ [4] implicit-query       ::= flagged-regexp
+ [5] segment-query        ::= "[" expression? "]"
+ [6] within-part          ::= simple-within-part
+ [7] simple-within-part   ::= "within" simple-within-scope
+ [8] simple-within-scope  ::= "sentence"
+                            | "s"
+                            | "utterance"
+                            | "u"
+                            | "paragraph"
+                            | "p"
+                            | "turn"
+                            | "t"
+                            | "text"
+                            | "session"
+[11.11] expression        ::= basic-expression
+                            | expression "|" expression     /* or */
+                            | expression "&" expression     /* and */
+[12.11] basic-expression  ::= '(' expression ')'         /* grouping */
+                            | "!" expression                /* not */
+                            | attribute operator flagged-regexp
+[13] operator             ::= "="                           /* equals */
+                            | "!="                          /* non-equals */
+[14] quantifier           ::= "+"                           /* one-or-more */
+                            | "*"                           /* zero-or-more */
+                            | "?"                           /* zero-or-one */
+                            | "{" integer "}"               /* exactly n-times */
+                            | "{" integer? "," integer "}"  /* at most n-times, optional minimum */
+                            | "{" integer "," integer? "}"  /* at least n-times, optional maximum */
+[15] flagged-regexp       ::= regexp
+                            | regexp "/" regexp-flag+
+[16] regexp-flag          ::= "i"  /* case-insensitive; Poliqarp/Perl compat */
+                            | "I"  /* case-sensitive; Poliqarp compat */
+                            | "c"  /* case-insensitive, CQP compat */
+                            | "C"  /* case-sensitive */
+                            | "l"  /* literal matching, CQP compat */
+                            | "d"  /* diacritic agnostic matching, CQP compat */
+[17] regexp               ::= quoted-string
+[18] attribute            ::= simple-attribute
+                            | qualified-attribute
+[19] simple-attribute     ::= identifier
+[20] qualified-attribute  ::= identifier ":" identifier
+[21.11] identifier        ::= identifier-first-char identifier-char*
+[21.12] identifier-first-char      ::= [a-zA-Z]
+[22] identifier-char      ::= [a-zA-Z0-9\-]
+[24] integer              ::= [0-9]+
+[26] quoted-string        ::= "'" (char | ws)* "'"  /* single-quotes */
+                            | """ (char | ws)* """  /* double-quotes */
+[27] char                 ::= <any unicode codepoint excluding whitespace codepoints>
+                            | "\" escaped-char
+[28] ws                   ::= <any whitespace codepoint>
+[29] escaped-char         ::= "\"                                  /* backslash (\) */
+                            | "'"                                  /* single quote (') */
+                            | """                                  /* double quote (") */
+                            | "n"                                  /* generic newline, i.e. "\n", "\r", etc. */
+                            | "t"                                  /* character tabulation (U+0009) */
+                            | "x" hex hex                          /* Unicode codepoint with hex value hh */
+                            | "u" hex hex hex hex                  /* Unicode codepoint with hex value hhhh */
+                            | "U" hex hex hex hex hex hex hex hex  /* Unicode codepoint with hex value hhhhhhhh */
+[30] hex                  ::= [0-9a-fA-F]
+}}}
+=== Notes ===
+ * based on Poliqarp with inspiration from others
+ * "attribute": the annotation layer to be used, e.g. "word", "lemma", "pos" or qualified "pos:stts"; the supported values for this construct are beyond the scope of the grammar and are defined in supplementary documents
+ * "simple-within-scope": possible values for scope
+   * "sentence", "s", "utterance", "u": denote a matching scope of something like a sentence or utterance. Provides compatibility with FCS 1.0 ("Generic Hits", "Each hit SHOULD be presented within the context of a complete sentence.")
+   * "paragraph" | "p" | "turn" | "t": denote the next larger unit, e.g. something like a paragraph
+   * "text" | "session": something like a whole document
+ * {{{[27]}}} and {{{[28]}}} "any $SOMETHING codepoint" are a pain to get easily done in at least ANTLR and JavaCC. Especially in combination with {{{[29]}}} :/
+ * regex are not defined/guarded by this grammar :/
+ * non-contiguous rule numbers are currently intentional; we've already removed some. Rules will be renumbered when the grammar is fixed.
+ * Integrated Peter B's suggestion {{{[2v2]}}} and {{{[3v2]}}} together with {{{[11v2]}}} and {{{[12v2]}}} for resolving structural ambiguity, even though ANTLR handles this perfectly fine.
+ * Changed "identifier" {{{[21]}}} so that it may only start with a letter, i.e. not with a digit or - (hyphen), to more closely resemble XML names.
 = Non-normative Appendix
 {{{