Context Navigation

← Previous Changeset
Next Changeset →

Changeset 7056

Timestamp:

11/10/16 13:27:45 (8 years ago)

Author:

Oliver Schonefeld

Message:

implement unescaping of escape sequences
perform unicode normalization on regex values
escape certain Java escape sequences when printing fcs query object tree

Location:

FCSSimpleEndpoint/trunk/src/main/java/eu/clarin/sru/server/fcs/parser

Files:

: 2 edited

Expression.java (modified) (1 diff)
QueryParser.java (modified) (8 diffs)

Legend:

: Unmodified
: Added
: Removed

FCSSimpleEndpoint/trunk/src/main/java/eu/clarin/sru/server/fcs/parser/Expression.java

-                      r7055
+                      r7056
         sb.append(operator.toDisplayString());
         sb.append(" \"");
-        sb.append(regex);
+        for (int i = 0; i < regex.length(); i++) {
+            char ch = regex.charAt(i);
+            switch (ch) {
+            case '\n':
+                sb.append("\\n");
+                break;
+            case '\r':
+                sb.append("\\r");
+                break;
+            case '\t':
+                sb.append("\\t");
+                break;
+            default:
+                sb.append(ch);
+            }
+        }
         sb.append("\"");
+        sb.append(" (").append(regex.length()).append(")");
         if (regex_flags != null) {
             sb.append("/");

FCSSimpleEndpoint/trunk/src/main/java/eu/clarin/sru/server/fcs/parser/QueryParser.java

-                      r7055
+                      r7056
 package eu.clarin.sru.server.fcs.parser;
+import java.text.Normalizer;
 import java.util.ArrayDeque;
 import java.util.ArrayList;
 …
 import java.util.List;
 import java.util.Set;
 import org.antlr.v4.runtime.ANTLRInputStream;
 import org.antlr.v4.runtime.BaseErrorListener;
 …
     private static final String DEFAULT_IDENTIFIER = "text";
     private static final Operator DEFAULT_OPERATOR = Operator.EQUALS;
+    private static final Normalizer.Form DEFAULT_UNICODE_NORMALIZATION_FORM =
+            Normalizer.Form.NFC;
     private final String defaultIdentifier;
     private final Operator defaultOperator;
+    private final Normalizer.Form unicodeNormalizationForm;
 …
      */
     public QueryParser() {
-        this(DEFAULT_IDENTIFIER);
+        this(DEFAULT_IDENTIFIER, DEFAULT_UNICODE_NORMALIZATION_FORM);
+    }
 …
         this.defaultIdentifier = defaultIdentifier;
         this.defaultOperator = DEFAULT_OPERATOR;
+        this.unicodeNormalizationForm = DEFAULT_UNICODE_NORMALIZATION_FORM;
+    }
+    /**
+     * Constructor.
+     *
+     * @param defaultIdentifier
+     *            the default identifier to be used for simple expressions
+     * @param unicodeNormaliztionForm
+     *            the Unicode normalization form to be used or <code>null</code>
+     *            to not perform normalization
+     */
+    public QueryParser(String defaultIdentifier,
+            Normalizer.Form unicodeNormaliztionForm) {
+        this.defaultIdentifier = defaultIdentifier;
+        this.defaultOperator = DEFAULT_OPERATOR;
+        this.unicodeNormalizationForm = unicodeNormaliztionForm;
+    }
 …
+            }
             // check for optional qualifier
+            // handle optional qualifier
             QualifierContext q_ctx = ctx.getChild(QualifierContext.class, 0);
             String qualifier = (q_ctx != null) ? q_ctx.getText() : EMPTY_STRING;
 …
                     ctx.getChild(Regexp_patternContext.class, 0);
             String regex = stripQuotes(p_ctx.getText());
+            /* process escape sequences, if present */
+            if (regex.indexOf('\\') != -1) {
+                regex = unescapeString(regex);
+            }
+            /* perform unicode normalization, if requested */
+            if (unicodeNormalizationForm != null) {
+                regex = Normalizer.normalize(regex, unicodeNormalizationForm);
+            }
             // FIXME: validate regex?
-            // FIXME: translate unicode escape sequences! (if they ever work)
             stack.push(regex);
 …
+    /**
+     * Replace the backslash escape sequences in a string by the characters
+     * they denote.
+     * <p>
+     * Recognized escapes are backslash, double quote, single quote,
+     * <code>n</code> (new line), <code>t</code> (tabulator) and the
+     * hexadecimal escapes <code>x</code> (2 digits), <code>u</code>
+     * (4 digits) and <code>U</code> (8 digits). Any other escaped character
+     * is copied to the result unchanged, i.e. including its backslash, so
+     * that no input is silently lost.
+     * </p>
+     *
+     * @param s
+     *            the string to process
+     * @return the string with all recognized escape sequences replaced
+     * @throws ExpressionTreeBuilderException
+     *             if the string ends with a single backslash or if a
+     *             hexadecimal escape sequence is malformed
+     */
+    private static String unescapeString(String s) {
+        final int length = s.length();
+        final StringBuilder sb = new StringBuilder(length);
+        int i = 0;
+        while (i < length) {
+            int cp = s.codePointAt(i);
+            /* advance by code point, not by char, to keep surrogate pairs intact */
+            i += Character.charCount(cp);
+            if (cp != '\\') {
+                sb.appendCodePoint(cp);
+                continue;
+            }
+
+            /* a backslash must be followed by at least one more character */
+            if (i >= length) {
+                throw new ExpressionTreeBuilderException(
+                        "truncated escape sequence: \\");
+            }
+
+            /* i now points to the character that selects the escape */
+            cp = s.codePointAt(i);
+            switch (cp) {
+            case '\\': /* slash */
+                sb.append('\\');
+                break;
+            case '"': /* double quote */
+                sb.append('"');
+                break;
+            case '\'': /* single-quote */
+                sb.append('\'');
+                break;
+            case 'n': /* new line */
+                sb.append('\n');
+                break;
+            case 't': /* tabulator */
+                sb.append('\t');
+                break;
+            case 'x':
+                /* helper returns the index of the last character it consumed */
+                i = unescapeUnicode(s, i, 2, sb);
+                break;
+            case 'u':
+                i = unescapeUnicode(s, i, 4, sb);
+                break;
+            case 'U':
+                i = unescapeUnicode(s, i, 8, sb);
+                break;
+            default:
+                /* unknown escape: keep it verbatim instead of dropping it */
+                sb.append('\\').appendCodePoint(cp);
+                i += Character.charCount(cp) - 1;
+                break;
+            }
+            i++; // step over the last character of the escape sequence
+        }
+        return sb.toString();
+    }
+    /**
+     * Decode a fixed-width hexadecimal escape sequence and append the
+     * resulting code point to the supplied buffer.
+     *
+     * @param s
+     *            the string containing the escape sequence
+     * @param i
+     *            the index of the character that selects the escape, i.e. the
+     *            character directly in front of the first hexadecimal digit
+     * @param size
+     *            the number of hexadecimal digits to read
+     * @param sb
+     *            the buffer the decoded code point is appended to
+     * @return the index of the last hexadecimal digit that was consumed
+     * @throws ExpressionTreeBuilderException
+     *             if fewer than <code>size</code> characters follow, if one
+     *             of them is not a hexadecimal digit or if the decoded value
+     *             is not a valid code point
+     */
+    private static final int unescapeUnicode(String s, int i, int size,
+            StringBuilder sb) {
+        if ((s.length() - i - 1) < size) {
+            throw new ExpressionTreeBuilderException(
+                    "truncated escape sequence: \\" + s.substring(i));
+        }
+
+        /* read exactly 'size' digits, most significant digit first */
+        int cp = 0;
+        for (int j = 0; j < size; j++) {
+            i++;
+            cp = (cp << 4) | parseHexString(s.charAt(i));
+        }
+
+        try {
+            sb.append(Character.toChars(cp));
+        } catch (IllegalArgumentException e) {
+            throw new ExpressionTreeBuilderException(
+                    "invalid codepoint: 0x" + Integer.toHexString(cp));
+        }
+        return i;
+    }
+    /**
+     * Convert a single hexadecimal digit to its numeric value.
+     * <p>
+     * Only the ASCII characters <code>0-9</code>, <code>a-f</code> and
+     * <code>A-F</code> are accepted.
+     * </p>
+     *
+     * @param c
+     *            the character to convert
+     * @return the value of the digit in the range from 0 to 15
+     * @throws ExpressionTreeBuilderException
+     *             if the character is not a hexadecimal digit
+     */
+    private static final int parseHexString(char c) {
+        if ((c >= '0') && (c <= '9')) {
+            return c - '0';
+        }
+        if ((c >= 'a') && (c <= 'f')) {
+            return 10 + (c - 'a');
+        }
+        if ((c >= 'A') && (c <= 'F')) {
+            return 10 + (c - 'A');
+        }
+        throw new ExpressionTreeBuilderException(
+                "invalid hex character: " + c);
+    }
     private static final class ErrorListener extends BaseErrorListener {
         private final String query;

Note: See TracChangeset for help on using the changeset viewer.

Context Navigation

Changeset 7056

Legend:

FCSSimpleEndpoint/trunk/src/main/java/eu/clarin/sru/server/fcs/parser/Expression.java

FCSSimpleEndpoint/trunk/src/main/java/eu/clarin/sru/server/fcs/parser/QueryParser.java

Download in other formats: