Changes between Version 12 and Version 13 of Taskforces/FCS/FCS-QL


Ignore:
Timestamp:
06/13/17 15:42:46 (7 years ago)
Author:
Leif-Jöran
Comment:

Refer to the spec

Legend:

Unmodified
Added
Removed
Modified
  • Taskforces/FCS/FCS-QL

    v12 v13  
    11= CLARIN Federated Content Search Query Language =
    2 A working draft for the CQP flavor for CLARIN Federated Content Search (FCS).
    3 == EBNF ==
    4 {{{#!comment
    5   Please keep the EBNF nicely formatted. Thanks!
    6 }}}
    7 {{{
    8  [1] query                ::= main-query within-part?
    9          
    10  [2] main-query           ::= simple-query
    11                             | "(" main-query ")"            /* grouping */
    12                             | main-query "|" main-query     /* or */
    13                             | main-query main-query         /* sequence */
    14                             | main-query quantifier         /* quantification */        
    15 
    16  [2.11] main-query        ::= simple-query
    17                             | simple-query "|" main-query     /* or */
    18                             | simple-query main-query         /* sequence */
    19                             | simple-query quantifier         /* quantification */
    20        
    21  [3] simple-query         ::= implicit-query
    22                             | segment-query     
    23  
    24  [3.11] simple-query      ::= '(' main-query ')'              /* grouping */
    25                             | implicit-query
    26                             | segment-query     
    27  
    28  [4] implicit-query       ::= flagged-regexp     
    29  
    30  [5] segment-query        ::= "[" expression? "]"       
    31 
    32  [6] within-part          ::= simple-within-part
    33 
    34  [7] simple-within-part   ::= "within" simple-within-scope
    35 
    36  [8] simple-within-scope  ::= "sentence"
    37                             | "s"
    38                             | "utterance"
    39                             | "u"
    40                             | "paragraph"
    41                             | "p"
    42                             | "turn"
    43                             | "t"
    44                             | "text"
    45                             | "session"         
    46 
    47 [11] expression           ::= basic-expression
    48                             | expression "|" expression     /* or */
    49                             | expression "&" expression     /* and */
    50                             | "(" expression ")"            /* grouping */
    51                             | "!" expression                /* not */   
    52 
    53 [11.11] expression        ::= basic-expression
    54                             | expression "|" expression     /* or */
    55                             | expression "&" expression     /* and */
    56                                
    57 
    58 [12] basic-expression     ::= attribute operator flagged-regexp         
    59 
    60 [12.11] basic-expression  ::= '(' expression ')'         /* grouping */
    61                             | "!" expression                /* not */
    62                             | attribute operator flagged-regexp
    63 
    64 [13] operator             ::= "="                           /* equals */
    65                             | "!="                          /* non-equals */
    66 
    67 [14] quantifier           ::= "+"                           /* one-or-more */
    68                             | "*"                           /* zero-or-more */
    69                             | "?"                           /* zero-or-one */
    70                             | "{" integer "}"               /* exactly n-times */
    71                             | "{" integer? "," integer "}"  /* at most */
    72                             | "{" integer "," integer? "}"  /* min-max */       
    73 
    74 [15] flagged-regexp       ::= regexp
    75                             | regexp "/" regexp-flag+   
    76 
    77 [16] regexp-flag          ::= "i"  /* case-insensitive; Poliqarp/Perl compat */
    78                             | "I"  /* case-sensitive; Poliqarp compat */
    79                             | "c"  /* case-insensitive, CQP compat */
    80                             | "C"  /* case-sensitive */
    81                             | "l"  /* literal matching, CQP compat*/
    82                             | "d"  /* diacritic agnostic matching, CQP compat */
    83        
    84 [17] regexp               ::= quoted-string
    85 
    86 [18] attribute            ::= simple-attribute
    87                             | qualified-attribute
    88 
    89 [19] simple-attribute     ::= identifier
    90 
    91 [20] qualified-attribute  ::= identifier ":" identifier 
    92 
    93 [21] identifier           ::= identifier-char identifier-char*
    94 
    95 [21.11] identifier        ::= identifier-first-char identifier-char*
    96 
    97 [21.12] identifier-first-char      ::= [a-zA-Z]
    98 
    99 [22] identifier-char      ::= [a-zA-Z0-9\-]
    100 
    101 [24] integer              ::= [0-9]+
    102 
    103 [26] quoted-string        ::= "'" (char | ws)* "'"  /* single-quotes */
    104                             | """ (char | ws)* """  /* double-quotes */
    105 
    106 [27] char                 ::= <any unicode codepoint excluding whitespace codepoints>
    107                             | "\" escaped-char
    108 
    109 [28] ws                   ::= <any whitespace codepoint>
    110 
    111 [29] escaped-char         ::= "\"                                  /* backslash (\) */
    112                             | "'"                                  /* single quote (') */
    113                             | """                                  /* double quote (") */
    114                             | "n"                                  /* generic newline, i.e "\n", "\r", etc */
    115                             | "t"                                  /* character tabulation (U+0009) */
    116                             | "x" hex hex                          /* Unicode codepoint with hex value hh */
    117                             | "u" hex hex hex hex                  /* Unicode codepoint with hex value hhhh */
    118                             | "U" hex hex hex hex hex hex hex hex  /* Unicode codepoint with hex value hhhhhhhh */
    119 
    120 [30] hex                  ::= [0-9a-fA-F]
    121 }}}
    122 == Notes ==
    123  * based on Poliqarp with inspiration from others
    124  * "attribute": the annotation layer to be used, e.g. "word", "lemma", "pos" or qualified "pos:stts"; the supported values for this construct are beyond the scope of the grammar and need to be defined in supplementary documents
    125  * "simple-within-scope": possible values for scope
    126    *  "sentence", "s", "utterance", "u": denote a matching scope of something like a sentence or utterance. provides compatibility with FCS 1.0 ("Generic Hits", "Each hit SHOULD be presented within the context of a complete sentence.")
    127    * "paragraph" | "p" | "turn" | "t": denote the next larger unit, e.g. something like a paragraph
    128    * "text" | "session": something like a whole document
    129  * {{{[27]}}} and {{{[28]}}} "any $SOMETHING codepoint" are a pain to get easily done in at least ANTLR and JavaCC. Especially in combination with {{{[29]}}} :/
    130  * regex are not defined/guarded by this grammar :/
    131  * non-continuous rule numbers are currently intended; we've already removed some. Rules will be renumbered, when grammar is fixed.
    132  * Integrated Peter B's suggestion {{{[2v2]}}} and {{{[3v2]}}} together with {{{[11v2]}}} and {{{[12v2]}}} for resolving structural ambiguity, even though ANTLR handles this perfectly fine.
    133  * Changed "identifier" {{{[21]}}} to only be allowed to start with a letter, i.e. not with a digit or - (hyphen), to more closely resemble XML names.
    134 
    135 == Discussion ==
    136 Peter Beinema, MPI, proposes some minor changes to the grammar:
    137  * main query {{{[2]}}} / simple query {{{[3]}}}: the above definitions generate structural ambiguity. Not a problem for ANTLR (which selects the right-recursive solution), but some other parser generators generate all solutions, and the number of solutions is exponential in the number of main queries. I propose to use the alternative rules given below.
    138  * the above rule {{{[2]}}} can generate an unbounded sequence of quantifiers: {{{"word" +*{23}{,17}{2,}?+}}} would be legal.
    139  * rule {{{[5]}}}: option marker '?' makes "[]" a valid query. Propose to remove question mark.
    140  * expression {{{[11]}}} / basic expression {{{[12]}}}: structural ambiguity. See proposed alternative below.
    141  * attached file contains an antlr4 grammar, including comments on how to use it on mac / unix / linux platform
    142 == EBNF ==
    143 {{{
    144 [2 v2]   /* in combination with [3 v2] no more left recursion or ambiguity, max 1 quantifier per simple-query */
    145 main-query ::=
    146       simple-query
    147     | simple-query '|' main-query       /* 'or' */
    148     | simple-query main-query           /* sequence */
    149     | simple-query quantifier           /* quantification */
    150 
    151 [3 v2]
    152 simple-query ::=
    153       '(' main-query ')'  /* embedding moved from main-query to simple-query */
    154     | implicit-query
    155     | segment-query
    156 
    157 [5 v2]  /* expression no longer optional */
    158 segment-query :
    159       '[' expression ']'
    160 
    161 [11 v2]  /* in combination with [12 v2] no longer left recursive or ambiguous */
    162 expression :
    163       basic-expression
    164     | basic-expression '|' expression   // or
    165     | basic-expression '&' expression   // and
    166 
    167 [12 v2]
    168 basic-expression :
    169       '(' expression ')'        // grouping
    170     | '!' expression            // not
    171     | attribute operator flagged-regexp
    172 }}}
    173 
     2The FCS-QL is integrated in the CLARIN-FCS Core 2.0 specification. See section [https://trac.clarin.eu/wiki/FCS/Specification-Draft#FCS-QL].