Changeset 7056
- Timestamp:
- 11/10/16 13:27:45 (8 years ago)
- Location:
- FCSSimpleEndpoint/trunk/src/main/java/eu/clarin/sru/server/fcs/parser
- Files:
-
- 2 edited
Legend:
- Unmodified
- Added
- Removed
-
FCSSimpleEndpoint/trunk/src/main/java/eu/clarin/sru/server/fcs/parser/Expression.java
r7055 r7056 128 128 sb.append(operator.toDisplayString()); 129 129 sb.append(" \""); 130 sb.append(regex); 130 for (int i = 0; i < regex.length(); i++) { 131 char ch = regex.charAt(i); 132 switch (ch) { 133 case '\n': 134 sb.append("\\n"); 135 break; 136 case '\r': 137 sb.append("\\r"); 138 break; 139 case '\t': 140 sb.append("\\t"); 141 break; 142 default: 143 sb.append(ch); 144 } 145 } 131 146 sb.append("\""); 147 sb.append(" (").append(regex.length()).append(")"); 132 148 if (regex_flags != null) { 133 149 sb.append("/"); -
FCSSimpleEndpoint/trunk/src/main/java/eu/clarin/sru/server/fcs/parser/QueryParser.java
r7055 r7056 17 17 package eu.clarin.sru.server.fcs.parser; 18 18 19 import java.text.Normalizer; 19 20 import java.util.ArrayDeque; 20 21 import java.util.ArrayList; … … 24 25 import java.util.List; 25 26 import java.util.Set; 27 26 28 import org.antlr.v4.runtime.ANTLRInputStream; 27 29 import org.antlr.v4.runtime.BaseErrorListener; … … 79 81 private static final String DEFAULT_IDENTIFIER = "text"; 80 82 private static final Operator DEFAULT_OPERATOR = Operator.EQUALS; 83 private static final Normalizer.Form DEFAULT_UNICODE_NORMALIZATION_FORM = 84 Normalizer.Form.NFC; 81 85 private final String defaultIdentifier; 82 86 private final Operator defaultOperator; 87 private final Normalizer.Form unicodeNormalizationForm; 83 88 84 89 … … 87 92 */ 88 93 public QueryParser() { 89 this(DEFAULT_IDENTIFIER); 94 this(DEFAULT_IDENTIFIER, DEFAULT_UNICODE_NORMALIZATION_FORM); 90 95 } 91 96 … … 100 105 this.defaultIdentifier = defaultIdentifier; 101 106 this.defaultOperator = DEFAULT_OPERATOR; 107 this.unicodeNormalizationForm = DEFAULT_UNICODE_NORMALIZATION_FORM; 108 } 109 110 111 /** 112 * Constructor. 113 * 114 * @param defaultIdentifier 115 * the default identifier to be used for simple expressions 116 * @param unicodeNormaliztionForm 117 * the unicode normalization form to be used or <code>null</code> 118 * to not perform normalization 119 */ 120 public QueryParser(String defaultIdentifier, 121 Normalizer.Form unicodeNormaliztionForm) { 122 this.defaultIdentifier = defaultIdentifier; 123 this.defaultOperator = DEFAULT_OPERATOR; 124 this.unicodeNormalizationForm = unicodeNormaliztionForm; 102 125 } 103 126 … … 539 562 } 540 563 541 // check for optional qualifier 564 // handle optional qualifier 542 565 QualifierContext q_ctx = ctx.getChild(QualifierContext.class, 0); 543 566 String qualifier = (q_ctx != null) ? 
q_ctx.getText() : EMPTY_STRING; … … 564 587 ctx.getChild(Regexp_patternContext.class, 0); 565 588 String regex = stripQuotes(p_ctx.getText()); 589 590 /* process escape sequences, if present */ 591 if (regex.indexOf('\\') != -1) { 592 regex = unescapeString(regex); 593 } 594 595 /* perform unicode normalization, if requested */ 596 if (unicodeNormalizationForm != null) { 597 regex = Normalizer.normalize(regex, unicodeNormalizationForm); 598 } 599 566 600 // FIXME: validate regex? 567 // FIXME: translate unicode escape sequences! (if they ever work)568 601 stack.push(regex); 569 602 … … 763 796 764 797 798 private static String unescapeString(String s) { 799 StringBuilder sb = new StringBuilder(); 800 for (int i = 0; i < s.length(); i++) { 801 int cp = s.codePointAt(i); 802 if (cp == '\\') { 803 i++; // skip slash 804 cp = s.codePointAt(i); 805 806 switch (cp) { 807 case '\\': /* slash */ 808 sb.append("\\"); 809 break; 810 case '"': /* double quote */ 811 sb.append("\""); 812 break; 813 case '\'': /* single-quote */ 814 sb.append("'"); 815 break; 816 case 'n': /* new line */ 817 sb.append("\n"); 818 break; 819 case 't': /* tabulator */ 820 sb.append("\t"); 821 break; 822 case 'x': 823 i = unescapeUnicode(s, i, 2, sb); 824 break; 825 case 'u': 826 i = unescapeUnicode(s, i, 4, sb); 827 break; 828 case 'U': 829 i = unescapeUnicode(s, i, 8, sb); 830 break; 831 } 832 } else { 833 try { 834 sb.append(Character.toChars(cp)); 835 } catch (IllegalArgumentException e) { 836 throw new ExpressionTreeBuilderException( 837 "invalid codepoint: 0x" + Integer.toHexString(cp)); 838 } 839 } 840 } 841 return sb.toString(); 842 } 843 844 845 private static final int unescapeUnicode(String s, int i, int size, 846 StringBuilder sb) { 847 if ((s.length() - i - 1) >= size) { 848 int cp = 0; 849 for (int j = 0; j < 4; j++) { 850 i++; 851 if (j > 0) { 852 cp = cp << 4; 853 } 854 cp |= parseHexString(s.charAt(i)); 855 } 856 try { 857 sb.append(Character.toChars(cp)); 858 } catch 
(IllegalArgumentException e) { 859 throw new ExpressionTreeBuilderException( 860 "invalid codepoint: 0x" + Integer.toHexString(cp)); 861 } 862 return i; 863 } else { 864 throw new ExpressionTreeBuilderException( 865 "truncated escape sequence: \\" + s.substring(i)); 866 } 867 } 868 869 870 private static final int parseHexString(char c) { 871 switch (c) { 872 case '0': 873 return 0; 874 case '1': 875 return 1; 876 case '2': 877 return 2; 878 case '3': 879 return 3; 880 case '4': 881 return 4; 882 case '5': 883 return 5; 884 case '6': 885 return 6; 886 case '7': 887 return 7; 888 case '8': 889 return 8; 890 case '9': 891 return 9; 892 case 'a': 893 /* FALL-THROUGH */ 894 case 'A': 895 return 10; 896 case 'b': 897 /* FALL-THROUGH */ 898 case 'B': 899 return 11; 900 case 'c': 901 /* FALL-THROUGH */ 902 case 'C': 903 return 12; 904 case 'd': 905 /* FALL-THROUGH */ 906 case 'D': 907 return 13; 908 case 'e': 909 /* FALL-THROUGH */ 910 case 'E': 911 return 14; 912 case 'f': 913 /* FALL-THROUGH */ 914 case 'F': 915 return 15; 916 default: 917 throw new ExpressionTreeBuilderException( 918 "invalud hex character: " + c); 919 } 920 } 921 922 765 923 private static final class ErrorListener extends BaseErrorListener { 766 924 private final String query;
Note: See TracChangeset
for help on using the changeset viewer.