Changeset 6065 for SRUAggregator
- Timestamp: 02/26/15 16:17:04 (9 years ago)
- Location: SRUAggregator/trunk
- Files: 3 deleted, 11 edited, 1 copied
Legend:
- Unmodified
- Added
- Removed
-
SRUAggregator/trunk/aggregator.yml
r6057 r6065 1 1 aggregatorParams: 2 #CENTER_REGISTRY_URL: http://centres.clarin.eu/restxml/2 CENTER_REGISTRY_URL: http://centres.clarin.eu/restxml/ 3 3 additionalCQLEndpoints: 4 4 - https://clarin.ids-mannheim.de/digibibsru-new 5 - https://lux17.mpi.nl/ds/cqlsearch5 # - https://lux17.mpi.nl/ds/cqlsearch 6 6 # - http://cqlservlet.mpi.nl/ 7 7 # - http://dspin.dwds.de:8088/ddc-sru/dingler/ … … 28 28 EXECUTOR_SHUTDOWN_TIMEOUT_MS: 1000 29 29 30 WEBLICHT_URL: https://weblicht.sfs.uni-tuebingen.de/WebLicht-4/?input= 30 weblichtConfig: 31 url: https://weblicht.sfs.uni-tuebingen.de/WebLicht-4/?input= 32 acceptedTcfLanguages: 33 - en 34 - de 35 - nl 36 - fr 37 - it 38 - sp 39 - pl 31 40 32 41 # use the simple server factory, run on a single port -
SRUAggregator/trunk/build.sh
r5976 r6065 32 32 node_modules/react-tools/bin/jsx --no-cache-dir $JSDIR $JSDIR 33 33 34 # mvn -q clean package 34 if [ "$1" == "--jar" ] 35 then 36 mvn -q clean package 37 fi 35 38 36 39 # Run in production: 37 #java -jar target/Aggregator2-2.0.0 -alpha-6.jar server aggregator.yml40 #java -jar target/Aggregator2-2.0.0.jar server aggregator.yml 38 41 39 42 # Run for development: 40 # java -cp src/main/resources:target/Aggregator2-2.0.0 -alpha-10.jar eu.clarin.sru.fcs.aggregator.app.Aggregator server aggregator_development.yml43 # java -cp src/main/resources:target/Aggregator2-2.0.0.jar eu.clarin.sru.fcs.aggregator.app.Aggregator server aggregator_development.yml -
SRUAggregator/trunk/pom.xml
r6057 r6065 8 8 <groupId>eu.clarin.sru.fcs</groupId> 9 9 <artifactId>Aggregator2</artifactId> 10 <version>2.0.0-alpha-2 4</version>10 <version>2.0.0-alpha-25</version> 11 11 <name>FCS Aggregator</name> 12 12 … … 90 90 91 91 <dependency> 92 <groupId>org.apache.opennlp</groupId>93 <artifactId>opennlp-tools</artifactId>94 <version>1.5.3</version>95 </dependency>96 <dependency>97 92 <groupId>com.optimaize.languagedetector</groupId> 98 93 <artifactId>language-detector</artifactId> … … 149 144 </configuration> 150 145 </plugin> 151 <!--<plugin>152 <groupId>org.apache.maven.plugins</groupId>153 <artifactId>maven-source-plugin</artifactId>154 <version>2.2.1</version>155 <executions>156 <execution>157 <id>attach-sources</id>158 <goals>159 <goal>jar</goal>160 </goals>161 </execution>162 </executions>163 </plugin>-->164 146 <plugin> 165 147 <groupId>org.apache.maven.plugins</groupId> -
SRUAggregator/trunk/src/main/java/eu/clarin/sru/fcs/aggregator/app/Aggregator.java
r6057 r6065 18 18 import eu.clarin.sru.fcs.aggregator.rest.RestService; 19 19 import eu.clarin.sru.fcs.aggregator.scan.Statistics; 20 import eu.clarin.sru.fcs.aggregator. lang.LanguagesISO693_3;20 import eu.clarin.sru.fcs.aggregator.util.LanguagesISO693; 21 21 import io.dropwizard.Application; 22 22 import io.dropwizard.assets.AssetsBundle; … … 25 25 import java.io.File; 26 26 import java.io.IOException; 27 import java.io.InputStream;28 27 import java.util.ArrayList; 29 28 import java.util.Collections; … … 35 34 import java.util.concurrent.ScheduledExecutorService; 36 35 import java.util.concurrent.atomic.AtomicReference; 37 import opennlp.tools.tokenize.TokenizerModel;38 36 import org.slf4j.LoggerFactory; 39 37 … … 88 86 * @author edima 89 87 * 88 * TODO: ?use weblicht only to show up in zoomed mode 89 * - send only tcf with only a text layer and language (from the list in params) 90 * 91 * TODO: add the modes described above (except live) 92 * 90 93 * TODO: zoom into the results from a corpus, allow functionality only for 91 94 * the view (search for next set of results) … … 95 98 * Twan (they did a test, it worked) 96 99 * 100 * TODO: add PiWik support, tracking the following: 101 * - visits, searches, search per corpus 102 * 103 * TODO: BUG: language detection is immediate, in UI; export implications 104 * 97 105 * TODO: websockets 98 106 * … … 122 130 private AtomicReference<Statistics> searchStatsAtom = new AtomicReference<Statistics>(new Statistics()); 123 131 124 private TokenizerModel tokenizerModel;125 132 private LanguageDetector languageDetector; 126 133 private TextObjectFactory textObjectFactory; … … 235 242 } 236 243 237 LanguagesISO693_3.getInstance(); // force init 238 initTokenizer(); 244 LanguagesISO693.getInstance(); // force init 239 245 initLanguageDetector(); 240 246 … … 293 299 } 294 300 295 public TokenizerModel getTokenizerModel() {296 return tokenizerModel;297 }298 299 301 private static void 
shutdownAndAwaitTermination(AggregatorConfiguration.Params params, 300 302 ThrottledClient sruClient, ExecutorService scheduler) { … … 313 315 } 314 316 315 private void initTokenizer() {316 TokenizerModel model = null;317 try {318 try (InputStream tokenizerModelDeAsIS = Thread.currentThread().getContextClassLoader().getResourceAsStream(DE_TOK_MODEL)) {319 model = new TokenizerModel(tokenizerModelDeAsIS);320 }321 } catch (IOException ex) {322 log.error("Failed to load tokenizer model", ex);323 }324 tokenizerModel = model;325 }326 327 317 public void initLanguageDetector() throws IOException { 328 318 List<LanguageProfile> languageProfiles = new LanguageProfileReader().readAll(); -
SRUAggregator/trunk/src/main/java/eu/clarin/sru/fcs/aggregator/app/AggregatorConfiguration.java
r6057 r6065 14 14 public static class Params { 15 15 16 @NotEmpty17 16 @JsonProperty 18 17 String CENTER_REGISTRY_URL; 19 18 20 @NotEmpty21 19 @JsonProperty 22 String WEBLICHT_URL;20 List<URL> additionalCQLEndpoints; 23 21 24 22 @NotEmpty … … 66 64 long EXECUTOR_SHUTDOWN_TIMEOUT_MS; 67 65 66 public static class WeblichtConfig { 67 @JsonProperty 68 String url; 69 70 @JsonProperty 71 List<String> acceptedTcfLanguages; 72 73 @JsonIgnore 74 public String getUrl() { 75 return url; 76 } 77 78 @JsonIgnore 79 public List<String> getAcceptedTcfLanguages() { 80 return acceptedTcfLanguages; 81 } 82 } 83 84 @NotEmpty 68 85 @JsonProperty 69 List<URL> additionalCQLEndpoints;86 WeblichtConfig weblichtConfig; 70 87 71 88 @JsonIgnore … … 95 112 96 113 @JsonIgnore 97 public String getWEBLICHT_URL() {98 return WEBLICHT_URL;114 public WeblichtConfig getWeblichtConfig() { 115 return weblichtConfig; 99 116 } 100 117 } -
SRUAggregator/trunk/src/main/java/eu/clarin/sru/fcs/aggregator/rest/RestService.java
r6057 r6065 7 7 import eu.clarin.sru.fcs.aggregator.app.Aggregator; 8 8 import eu.clarin.sru.fcs.aggregator.app.AggregatorConfiguration; 9 import eu.clarin.sru.fcs.aggregator.app.AggregatorConfiguration.Params.WeblichtConfig; 9 10 import eu.clarin.sru.fcs.aggregator.scan.Corpus; 10 11 import eu.clarin.sru.fcs.aggregator.scan.Statistics; … … 12 13 import eu.clarin.sru.fcs.aggregator.search.Result; 13 14 import eu.clarin.sru.fcs.aggregator.search.Search; 14 import eu.clarin.sru.fcs.aggregator.lang.LanguagesISO693_3; 15 import eu.clarin.sru.fcs.aggregator.search.ExportException; 15 import eu.clarin.sru.fcs.aggregator.util.LanguagesISO693; 16 16 import eu.clarin.sru.fcs.aggregator.search.Exports; 17 17 import java.io.IOException; … … 22 22 import java.util.Map; 23 23 import java.util.Set; 24 import java.util.logging.Level;25 import java.util.logging.Logger;26 24 import javax.servlet.ServletContext; 27 25 import javax.servlet.http.HttpServletRequest; … … 36 34 import javax.ws.rs.core.MediaType; 37 35 import javax.ws.rs.core.Response; 38 import opennlp.tools.tokenize.TokenizerModel;39 36 import org.slf4j.LoggerFactory; 40 37 … … 113 110 log.info("get language codes", codes); 114 111 for (String code : codes) { 115 String name = LanguagesISO693 _3.getInstance().nameForCode_3(code);112 String name = LanguagesISO693.getInstance().nameForCode(code); 116 113 languages.put(code, name != null ? 
name : code); 117 114 } … … 192 189 } 193 190 194 if (format == null || format.trim().isEmpty() || format.trim().equals("text")) {191 if (format == null || format.trim().isEmpty() || format.trim().equals("text")) { 195 192 String text = Exports.getExportText(search.getResults()); 196 193 return download(text, MediaType.TEXT_PLAIN, search.getQuery() + ".txt"); 197 194 } else if (format.equals("tcf")) { 198 byte[] bytes = Exports.getExportTokenizedTCF( 199 search.getResults(), search.getSearchLanguage(), 200 Aggregator.getInstance().getTokenizerModel()); 195 byte[] bytes = Exports.getExportTCF( 196 search.getResults(), search.getSearchLanguage()); 201 197 return download(bytes, TCF_MEDIA_TYPE, search.getQuery() + ".xml"); 202 198 } else if (format.equals("excel")) { … … 240 236 url = DataTransfer.uploadToDropOff(bytes, "text/plan", ".txt"); 241 237 } 242 } else if (format.equals("tokens")) { 243 byte[] bytes = Exports.getExportTokenizedTCF( 244 search.getResults(), search.getSearchLanguage(), 245 Aggregator.getInstance().getTokenizerModel()); 238 } else if (format.equals("tcf")) { 239 byte[] bytes = Exports.getExportTCF( 240 search.getResults(), search.getSearchLanguage()); 246 241 if (bytes != null) { 247 242 url = DataTransfer.uploadToDropOff(bytes, "text/tcf+xml", ".tcf"); … … 251 246 } 252 247 253 URI weblichtUri = new URI(Aggregator.getInstance().getParams().getWEBLICHT_URL()254 248 WeblichtConfig weblicht = Aggregator.getInstance().getParams().getWeblichtConfig(); 249 URI weblichtUri = new URI(weblicht.getUrl() + url); 255 250 return url == null 256 251 ? Response.status(503).entity("error while exporting to weblicht").build() -
SRUAggregator/trunk/src/main/java/eu/clarin/sru/fcs/aggregator/scan/Corpus.java
r6043 r6065 1 1 package eu.clarin.sru.fcs.aggregator.scan; 2 2 3 import eu.clarin.sru.fcs.aggregator. lang.LanguagesISO693_3;3 import eu.clarin.sru.fcs.aggregator.util.LanguagesISO693; 4 4 import java.util.ArrayList; 5 5 import java.util.Collections; … … 102 102 103 103 public void addLanguage(String language) { 104 if (LanguagesISO693 _3.getInstance().getCodes_3().contains(language)) {104 if (LanguagesISO693.getInstance().isCode(language)) { 105 105 this.languages.add(language); 106 106 } else { 107 String code = LanguagesISO693 _3.getInstance().code_3ForName(language);107 String code = LanguagesISO693.getInstance().code_3ForName(language); 108 108 this.languages.add(code == null ? language : code); 109 109 } -
SRUAggregator/trunk/src/main/java/eu/clarin/sru/fcs/aggregator/search/Exports.java
r6043 r6065 1 1 package eu.clarin.sru.fcs.aggregator.search; 2 2 3 import eu.clarin.sru.fcs.aggregator.lang.LanguagesISO693_2; 4 import eu.clarin.sru.fcs.aggregator.lang.LanguagesISO693_3; 3 import eu.clarin.sru.fcs.aggregator.util.LanguagesISO693; 5 4 import eu.clarin.weblicht.wlfxb.io.WLDObjector; 6 5 import eu.clarin.weblicht.wlfxb.io.WLFormatException; 7 6 import eu.clarin.weblicht.wlfxb.md.xb.MetaData; 8 import eu.clarin.weblicht.wlfxb.tc.api.MatchedCorpus;9 import eu.clarin.weblicht.wlfxb.tc.api.Token;10 7 import eu.clarin.weblicht.wlfxb.tc.xb.TextCorpusStored; 11 8 import eu.clarin.weblicht.wlfxb.xb.WLData; 12 9 import java.io.ByteArrayOutputStream; 13 10 import java.io.IOException; 14 import java.util.ArrayList;15 import java.util.Collections;16 import java.util.HashSet;17 11 import java.util.List; 18 import java.util.Set;19 12 import java.util.logging.Level; 20 13 import java.util.logging.Logger; 21 import opennlp.tools.tokenize.TokenizerME;22 import opennlp.tools.tokenize.TokenizerModel;23 14 import org.apache.poi.ss.usermodel.Cell; 24 15 import org.apache.poi.ss.usermodel.CellStyle; … … 30 21 /** 31 22 * Utility for representing SearchResult data in different formats. 
32 * 23 * 33 24 * @author Yana Panchenko 34 25 */ … … 39 30 public static String getExportCSV(List<Result> resultsProcessed, String separator) { 40 31 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 32 boolean noResult = true; 33 StringBuilder csv = new StringBuilder(); 34 if (resultsProcessed != null && !resultsProcessed.isEmpty()) { 35 String[] headers = new String[]{ 36 "LEFT CONTEXT", "KEYWORD", "RIGHT CONTEXT", "PID", "REFERENCE"}; 37 for (String header : headers) { 38 csv.append("\""); 39 csv.append(header); 40 csv.append("\""); 41 csv.append(separator); 42 } 43 csv.append("\n"); 44 45 for (Result result : resultsProcessed) { 46 for (Kwic kwic : result.getKwics()) { 47 csv.append("\""); 48 csv.append(escapeQuotes(kwic.getLeft())); 49 csv.append("\""); 50 csv.append(separator); 51 csv.append("\""); 52 csv.append(escapeQuotes(kwic.getKeyword())); 53 csv.append("\""); 54 csv.append(separator); 55 csv.append("\""); 56 csv.append(escapeQuotes(kwic.getRight())); 57 csv.append("\""); 58 csv.append(separator); 59 csv.append("\""); 60 if (kwic.getPid() != null) { 61 csv.append(escapeQuotes(kwic.getPid())); 62 } 63 csv.append("\""); 64 csv.append(separator); 65 csv.append("\""); 66 if (kwic.getReference() != null) { 67 csv.append(escapeQuotes(kwic.getReference())); 68 } 69 csv.append("\""); 70 csv.append("\n"); 71 noResult = false; 72 } 73 } 74 } 75 if (noResult) { 76 return null; 77 } else { 78 return csv.toString(); 79 } 80 } 90 81 91 82 private static CharSequence escapeQuotes(String text) { 92 93 94 95 96 97 98 99 100 101 83 StringBuilder sb = new StringBuilder(); 84 for (int i = 0; i < text.length(); i++) { 85 char ch = text.charAt(i); 86 if (ch == '"') { 87 sb.append('"'); 88 } 89 sb.append(ch); 90 } 91 return sb; 92 } 102 93 103 94 public static byte[] getExportExcel(List<Result> resultsProcessed) throws ExportException { 104 95 105 106 107 108 109 110 111 112 113 
114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 96 boolean noResult = true; 97 SXSSFWorkbook workbook = null; 98 ByteArrayOutputStream excelStream = new ByteArrayOutputStream(); 99 if (resultsProcessed != null && !resultsProcessed.isEmpty()) { 100 try { 101 String[] headers = new String[]{ 102 "LEFT CONTEXT", "KEYWORD", "RIGHT CONTEXT", "PID", "REFERENCE"}; 103 104 workbook = new SXSSFWorkbook(); 105 Sheet sheet = workbook.createSheet(); 106 107 Font boldFont = workbook.createFont(); 108 boldFont.setBoldweight(Font.BOLDWEIGHT_BOLD); 109 110 // Header 111 CellStyle headerStyle = workbook.createCellStyle(); 112 headerStyle.setFont(boldFont); 113 114 Row row = sheet.createRow(0); 115 116 for (int j = 0; j < headers.length; ++j) { 117 Cell cell = row.createCell(j, Cell.CELL_TYPE_STRING); 118 cell.setCellValue(headers[j]); 119 cell.setCellStyle(headerStyle); 120 } 121 122 // Body 123 Cell cell; 124 for (int k = 0; k < resultsProcessed.size(); k++) { 125 Result result = resultsProcessed.get(k); 126 List<Kwic> kwics = result.getKwics(); 127 for (int i = 0; i < kwics.size(); i++) { 128 Kwic kwic = kwics.get(i); 129 row = sheet.createRow(k + i + 1); 130 cell = row.createCell(0, Cell.CELL_TYPE_STRING); 131 cell.setCellValue(kwic.getLeft()); 132 cell = row.createCell(1, Cell.CELL_TYPE_STRING); 133 cell.setCellValue(kwic.getKeyword()); 134 cell = row.createCell(2, Cell.CELL_TYPE_STRING); 135 cell.setCellValue(kwic.getRight()); 136 if (kwic.getPid() != null) { 137 cell = row.createCell(3, Cell.CELL_TYPE_STRING); 138 cell.setCellValue(kwic.getPid()); 139 } 140 if (kwic.getReference() != null) { 141 cell = row.createCell(3, Cell.CELL_TYPE_STRING); 142 cell.setCellValue(kwic.getReference()); 143 } 144 noResult = false; 145 } 146 } 147 workbook.write(excelStream); 148 } catch (IOException ex) { 158 149 LOGGER.log(Level.SEVERE, null, ex); 159 150 throw new 
ExportException("Exception exporting Excel", ex); 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 p rivatestatic byte[] getExportTCF(List<Result> resultsProcessed,151 } finally { 152 if (workbook != null) { 153 workbook.dispose(); 154 } 155 } 156 } 157 if (noResult) { 158 return null; 159 } else { 160 return excelStream.toByteArray(); 161 } 162 163 } 164 165 public static byte[] getExportTCF(List<Result> resultsProcessed, 175 166 String searchLanguage) throws ExportException { 176 StringBuilder text = new StringBuilder(); 177 Set<String> resultsLangs = new HashSet<String>(); 178 if (resultsProcessed != null && !resultsProcessed.isEmpty()) { 179 for (Result result : resultsProcessed) { 180 resultsLangs.addAll(result.getCorpus().getLanguages()); 181 for (Kwic kwic : result.getKwics()) { 182 text.append(kwic.getLeft()); 183 text.append(" "); 184 text.append(kwic.getKeyword()); 185 text.append(" "); 186 text.append(kwic.getRight()); 187 text.append("\n"); 188 } 189 } 190 191 } 192 if (text.length() == 0) { 193 return null; 194 } else { 195 WLData data; 196 MetaData md = new MetaData(); 197 String resultsLang = "unknown"; 198 if (resultsLangs.size() == 1) { 199 resultsLang = resultsLangs.iterator().next(); 200 String code2 = LanguagesISO693_2.getInstance().langForCode(resultsLang).getCode_639_1(); 201 if (code2 != null) { 202 resultsLang = code2; 203 } 204 } else if (!searchLanguage.equals("anylang")) { 205 String code2 = LanguagesISO693_2.getInstance().langForCode(resultsLang).getCode_639_1(); 206 if (code2 == null) { 207 resultsLang = searchLanguage; 208 } else { 209 resultsLang = code2; 210 } 211 } 212 TextCorpusStored tc = new TextCorpusStored(resultsLang); 213 tc.createTextLayer().addText(text.toString()); 214 data = new WLData(md, tc); 167 String text = getExportText(resultsProcessed); 168 if (text == null || text.isEmpty()) { 169 return null; 170 } else { 171 WLData data; 172 MetaData md = new MetaData(); 173 String languageCode = 
LanguagesISO693.getInstance().code_1ForCode_3(searchLanguage); 174 TextCorpusStored tc = new TextCorpusStored(languageCode); 175 tc.createTextLayer().addText(text); 176 data = new WLData(md, tc); 215 177 ByteArrayOutputStream os = new ByteArrayOutputStream(); 216 217 218 219 178 try { 179 WLDObjector.write(data, os); 180 } catch (WLFormatException ex) { 181 LOGGER.log(Level.SEVERE, "Error exporting TCF {0} {1}", new String[]{ex.getClass().getName(), ex.getMessage()}); 220 182 throw new ExportException("Error exporting TCF", ex); 221 183 } 222 184 return os.toByteArray(); 223 } 224 } 225 226 public static byte[] getExportTokenizedTCF(List<Result> resultsProcessed, 227 String searchLanguage, TokenizerModel tokenizerModel) throws ExportException { 228 StringBuilder text = new StringBuilder(); 229 if (resultsProcessed != null && !resultsProcessed.isEmpty()) { 185 } 186 } 187 188 public static String getExportText(List<Result> resultsProcessed) { 189 StringBuilder text = new StringBuilder(); 190 if (resultsProcessed != null && !resultsProcessed.isEmpty()) { 230 191 for (Result result : resultsProcessed) { 231 192 for (Kwic kwic : result.getKwics()) { … … 239 200 } 240 201 text.append("\n"); 241 } 242 } 243 244 } 245 if (text.length() == 0) { 246 return null; 247 } else { 248 WLData data; 249 MetaData md = new MetaData(); 250 String languageCode = LanguagesISO693_3.getInstance().code_1ForCode_3(searchLanguage); 251 TextCorpusStored tc = new TextCorpusStored(languageCode); 252 tc.createTextLayer().addText(text.toString()); 253 addTokensSentencesMatches(resultsProcessed, tc, tokenizerModel); 254 data = new WLData(md, tc); 255 ByteArrayOutputStream os = new ByteArrayOutputStream(); 256 try { 257 WLDObjector.write(data, os); 258 } catch (WLFormatException ex) { 259 LOGGER.log(Level.SEVERE, "Error exporting TCF {0} {1}", new String[]{ex.getClass().getName(), ex.getMessage()}); 260 throw new ExportException("Error exporting TCF", ex); 261 } 262 return os.toByteArray(); 263 } 
264 } 265 266 private static void addTokensSentencesMatches(List<Result> resultsProcessed, TextCorpusStored tc, TokenizerModel model) { 267 if (model == null || !"de".equals(tc.getLanguage())) { 268 return; 269 } 270 TokenizerME tokenizer = new TokenizerME(model); 271 272 if (resultsProcessed != null && !resultsProcessed.isEmpty()) { 273 tc.createTokensLayer(); 274 tc.createSentencesLayer(); 275 tc.createMatchesLayer("FCS", resultsProcessed.get(0).getSearchString()); 276 for (Result result : resultsProcessed) { 277 MatchedCorpus mCorpus = tc.getMatchesLayer().addCorpus(result.getCorpus().getTitle(), result.getCorpus().getHandle()); 278 for (Kwic kwic : result.getKwics()) { 279 List<Token> tokens = new ArrayList<Token>(); 280 addToTcfTokens(tokens, tc, tokenizer.tokenize(kwic.getLeft())); 281 String[] target = tokenizer.tokenize(kwic.getKeyword()); 282 List<Token> targetTokens = addToTcfTokens(tokens, tc, target); 283 addToTcfTokens(tokens, tc, tokenizer.tokenize(kwic.getRight())); 284 tc.getSentencesLayer().addSentence(tokens); 285 List<String> pidAndRef = new ArrayList<String>(); 286 if (kwic.getPid() != null) { 287 pidAndRef.add(kwic.getPid()); 288 } 289 if (kwic.getReference() != null) { 290 pidAndRef.add(kwic.getReference()); 291 } 292 tc.getMatchesLayer().addItem(mCorpus, targetTokens, pidAndRef); 293 } 294 } 295 } 296 } 297 298 private static List<Token> addToTcfTokens(List<Token> tokens, TextCorpusStored tc, String[] tokenStrings) { 299 List<Token> addedTokens = new ArrayList<Token>(tokenStrings.length); 300 for (String tokenString : tokenStrings) { 301 Token token = tc.getTokensLayer().addToken(tokenString); 302 addedTokens.add(token); 303 tokens.add(token); 304 } 305 return addedTokens; 306 } 307 308 public static String getExportText(List<Result> resultsProcessed) { 309 StringBuilder text = new StringBuilder(); 310 if (resultsProcessed != null && !resultsProcessed.isEmpty()) { 311 for (Result result : resultsProcessed) { 312 for (Kwic kwic : 
result.getKwics()) { 313 text.append(kwic.getLeft()); 314 text.append(" "); 315 text.append(kwic.getKeyword()); 316 text.append(" "); 317 text.append(kwic.getRight()); 318 text.append("\n"); 319 } 320 } 321 322 } 323 if (text.length() == 0) { 324 return null; 325 } else { 326 return text.toString(); 327 } 328 } 202 } 203 } 204 } 205 if (text.length() == 0) { 206 return null; 207 } else { 208 return text.toString(); 209 } 210 } 329 211 } -
SRUAggregator/trunk/src/main/java/eu/clarin/sru/fcs/aggregator/search/Kwic.java
r6043 r6065 3 3 import eu.clarin.sru.client.fcs.DataViewHits; 4 4 import eu.clarin.sru.fcs.aggregator.app.Aggregator; 5 import eu.clarin.sru.fcs.aggregator. lang.LanguagesISO693_3;5 import eu.clarin.sru.fcs.aggregator.util.LanguagesISO693; 6 6 import java.util.ArrayList; 7 7 import java.util.List; 8 import org.apache.commons.lang.StringEscapeUtils; 8 9 9 10 /** … … 48 49 this.reference = reference; 49 50 50 String text = hits.getText(); 51 // warning: the client library doesn't unescape the xml 52 // so the text can still contains < and & codes 53 String str = hits.getText(); 54 51 55 int lastOffset = 0; 52 56 for (int i = 0; i < hits.getHitCount(); i++) { 53 57 int[] offsets = hits.getHitOffsets(i); 54 58 if (lastOffset < offsets[0]) { 55 fragments.add(new TextFragment(text.substring(lastOffset, offsets[0]), false)); 59 String text = StringEscapeUtils.unescapeXml(str.substring(lastOffset, offsets[0])); 60 fragments.add(new TextFragment(text, false)); 56 61 } 57 62 if (offsets[0] < offsets[1]) { 58 fragments.add(new TextFragment(text.substring(offsets[0], offsets[1]), true)); 63 String text = StringEscapeUtils.unescapeXml(str.substring(offsets[0], offsets[1])); 64 fragments.add(new TextFragment(text, true)); 59 65 } 60 66 lastOffset = offsets[1]; 61 67 } 62 if (lastOffset < text.length()) { 63 fragments.add(new TextFragment(text.substring(lastOffset, text.length()), false)); 68 if (lastOffset < str.length()) { 69 String text = StringEscapeUtils.unescapeXml(str.substring(lastOffset, str.length())); 70 fragments.add(new TextFragment(text, false)); 64 71 } 65 72 66 String code_iso639_1 = Aggregator.getInstance().detectLanguage( hits.getText());73 String code_iso639_1 = Aggregator.getInstance().detectLanguage(str); 67 74 language = code_iso639_1 == null ? 
null 68 : LanguagesISO693 _3.getInstance().code_3ForCode_1(code_iso639_1);75 : LanguagesISO693.getInstance().code_3ForCode(code_iso639_1); 69 76 } 70 77 … … 87 94 @Deprecated 88 95 public String getLeft() { 96 StringBuilder sb = new StringBuilder(); 89 97 for (TextFragment tf : fragments) { 90 if ( !tf.isHit) {91 return tf.text;98 if (tf.isHit) { 99 break; 92 100 } 101 sb.append(tf.text); 93 102 } 94 return "";103 return sb.toString(); 95 104 } 96 105 -
SRUAggregator/trunk/src/main/java/eu/clarin/sru/fcs/aggregator/util/LanguagesISO693.java
r6044 r6065 1 package eu.clarin.sru.fcs.aggregator. lang;1 package eu.clarin.sru.fcs.aggregator.util; 2 2 3 3 import com.fasterxml.jackson.core.JsonProcessingException; … … 10 10 import java.util.HashMap; 11 11 import java.util.Map; 12 import java.util.Set;13 12 import org.slf4j.LoggerFactory; 14 13 … … 18 17 * @author Yana Panchenko 19 18 */ 20 public class LanguagesISO693 _3{19 public class LanguagesISO693 { 21 20 22 private static final org.slf4j.Logger log = LoggerFactory.getLogger(LanguagesISO693 _3.class);21 private static final org.slf4j.Logger log = LoggerFactory.getLogger(LanguagesISO693.class); 23 22 public static final String LANGUAGES_FILE_PATH = "/lang/iso-639-3_20140320.tab"; 24 23 public static final String LANGUAGES_FILE_ENCODING = "UTF-8"; 25 24 26 private static LanguagesISO693 _3instance = null;25 private static LanguagesISO693 instance = null; 27 26 28 27 public static class Language { 29 28 30 29 // code is ISO-639-3 (3 letters) while code_2 is ISO-639-1 (2 letters) 31 String code_3, code_1, name;30 private String code_3, code_1, name; 32 31 33 32 public Language(String code_3, String code_1, String name) { … … 38 37 } 39 38 40 private Map<String, Language> code _3ToLang = new HashMap<String, Language>();39 private Map<String, Language> codeToLang = new HashMap<String, Language>(); 41 40 private Map<String, Language> nameToLang = new HashMap<String, Language>(); 42 private Map<String, Language> code_1ToLang = new HashMap<String, Language>();43 41 44 private LanguagesISO693 _3() {45 InputStream is = LanguagesISO693 _3.class.getResourceAsStream(LANGUAGES_FILE_PATH);42 private LanguagesISO693() { 43 InputStream is = LanguagesISO693.class.getResourceAsStream(LANGUAGES_FILE_PATH); 46 44 try (BufferedReader br = new BufferedReader(new InputStreamReader(is, LANGUAGES_FILE_ENCODING))) { 47 String line = br.readLine(); // ignore first line 45 br.readLine(); // ignore first line (header) 46 String line; 48 47 while ((line = br.readLine()) != null) { 49 
48 if (line.length() > 0) { … … 60 59 String name = toks[6].trim(); 61 60 Language l = new Language(code_3, code_1, name); 62 code _3ToLang.put(code_3, l);61 codeToLang.put(code_3, l); 63 62 if (code_1 != null) { 64 code _1ToLang.put(code_1, l);63 codeToLang.put(code_1, l); 65 64 } 66 65 nameToLang.put(name, l); … … 73 72 ObjectWriter ow = new ObjectMapper().writerWithDefaultPrettyPrinter(); 74 73 try { 75 System.out.println(ow.writeValueAsString(code _3ToLang));74 System.out.println(ow.writeValueAsString(codeToLang)); 76 75 } catch (JsonProcessingException ex) { 77 76 } 78 77 } 79 78 80 public static LanguagesISO693 _3getInstance() {79 public static LanguagesISO693 getInstance() { 81 80 if (instance == null) { 82 instance = new LanguagesISO693 _3();81 instance = new LanguagesISO693(); 83 82 } 84 83 return instance; 85 84 } 86 85 87 public Set<String> getCodes_3() {88 return code _3ToLang.keySet();86 public boolean isCode(String code) { 87 return codeToLang.containsKey(code); 89 88 } 90 89 91 public String code_3ForCode _1(String code639_1) {90 public String code_3ForCode(String code639_1) { 92 91 if (code639_1 == null) { 93 92 return null; 94 93 } 95 Language l = code _1ToLang.get(code639_1);94 Language l = codeToLang.get(code639_1); 96 95 if (l == null) { 97 96 log.error("Unknown ISO-639-1 code: " + code639_1); … … 105 104 return null; 106 105 } 107 Language l = code _3ToLang.get(code639_3);106 Language l = codeToLang.get(code639_3); 108 107 if (l == null) { 109 108 log.error("Unknown ISO-639-3 code: " + code639_3); … … 122 121 } 123 122 124 public String nameForCode _3(String code) {125 Language l = code _3ToLang.get(code);123 public String nameForCode(String code) { 124 Language l = codeToLang.get(code); 126 125 if (l == null) { 127 126 log.error("Unknown language code: " + code); … … 130 129 return l.name; 131 130 } 132 133 131 } -
SRUAggregator/trunk/src/main/resources/assets/js/main.js
r6057 r6065 3 3 "use strict"; 4 4 5 var VERSION = "VERSION 2.0.0.α2 4";5 var VERSION = "VERSION 2.0.0.α25"; 6 6 var URLROOT = "/Aggregator-testing"; 7 7 … … 150 150 React.createElement("span", {className: "icon-bar"}) 151 151 ), 152 React.createElement("a", {className: "navbar-brand", href: "#", tabIndex: "-1"}, React.createElement("header", null, "Federated Content Search"))152 React.createElement("a", {className: "navbar-brand", href: URLROOT, tabIndex: "-1"}, React.createElement("header", null, "Federated Content Search")) 153 153 ), 154 154 this.renderCollapsible() -
SRUAggregator/trunk/src/main/resources/assets/js/main.jsx
r6057 r6065 3 3 "use strict"; 4 4 5 var VERSION = "VERSION 2.0.0.α2 4";5 var VERSION = "VERSION 2.0.0.α25"; 6 6 var URLROOT = "/Aggregator-testing"; 7 7 … … 150 150 <span className="icon-bar"></span> 151 151 </button> 152 <a className="navbar-brand" href= "#"tabIndex="-1"><header>Federated Content Search</header></a>152 <a className="navbar-brand" href={URLROOT} tabIndex="-1"><header>Federated Content Search</header></a> 153 153 </div> 154 154 {this.renderCollapsible()}
Note: See TracChangeset for help on using the changeset viewer.