Changeset 6065 for SRUAggregator/trunk/src/main/java/eu/clarin/sru/fcs/aggregator/search/Exports.java
- Timestamp:
- 02/26/15 16:17:04 (9 years ago)
- File:
-
- 1 edited
Legend:
- Unmodified
- Added
- Removed
-
SRUAggregator/trunk/src/main/java/eu/clarin/sru/fcs/aggregator/search/Exports.java
package eu.clarin.sru.fcs.aggregator.search;

import eu.clarin.sru.fcs.aggregator.util.LanguagesISO693;
import eu.clarin.weblicht.wlfxb.io.WLDObjector;
import eu.clarin.weblicht.wlfxb.io.WLFormatException;
import eu.clarin.weblicht.wlfxb.md.xb.MetaData;
import eu.clarin.weblicht.wlfxb.tc.xb.TextCorpusStored;
import eu.clarin.weblicht.wlfxb.xb.WLData;
import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.util.List;
import java.util.logging.Level;
import java.util.logging.Logger;
import org.apache.poi.ss.usermodel.Cell;
import org.apache.poi.ss.usermodel.CellStyle;
// NOTE(review): the changeset elides the unchanged import lines here; the POI
// Font/Row/Sheet/SXSSFWorkbook imports below are reconstructed from the code
// that uses them — confirm against the repository.
import org.apache.poi.ss.usermodel.Font;
import org.apache.poi.ss.usermodel.Row;
import org.apache.poi.ss.usermodel.Sheet;
import org.apache.poi.xssf.streaming.SXSSFWorkbook;

/**
 * Utility for representing SearchResult data in different formats.
 *
 * @author Yana Panchenko
 */
public class Exports {

    // NOTE(review): field elided as unchanged context in the changeset;
    // reconstructed from the LOGGER.log(...) call sites — confirm.
    private static final Logger LOGGER = Logger.getLogger(Exports.class.getName());

    /**
     * Renders the processed results as CSV text with the given field separator.
     * Columns: LEFT CONTEXT, KEYWORD, RIGHT CONTEXT, PID, REFERENCE. Every field
     * is double-quoted; embedded double quotes are doubled (RFC 4180 style).
     *
     * @param resultsProcessed results to export; may be {@code null} or empty
     * @param separator field separator, e.g. {@code ","} or {@code ";"}
     * @return the CSV text, or {@code null} when there is nothing to export
     */
    public static String getExportCSV(List<Result> resultsProcessed, String separator) {
        boolean noResult = true;
        StringBuilder csv = new StringBuilder();
        if (resultsProcessed != null && !resultsProcessed.isEmpty()) {
            String[] headers = new String[]{
                "LEFT CONTEXT", "KEYWORD", "RIGHT CONTEXT", "PID", "REFERENCE"};
            for (String header : headers) {
                csv.append("\"");
                csv.append(header);
                csv.append("\"");
                csv.append(separator);
            }
            csv.append("\n");

            for (Result result : resultsProcessed) {
                for (Kwic kwic : result.getKwics()) {
                    csv.append("\"");
                    csv.append(escapeQuotes(kwic.getLeft()));
                    csv.append("\"");
                    csv.append(separator);
                    csv.append("\"");
                    csv.append(escapeQuotes(kwic.getKeyword()));
                    csv.append("\"");
                    csv.append(separator);
                    csv.append("\"");
                    csv.append(escapeQuotes(kwic.getRight()));
                    csv.append("\"");
                    csv.append(separator);
                    csv.append("\"");
                    // PID and reference are optional; emit an empty quoted field when absent
                    if (kwic.getPid() != null) {
                        csv.append(escapeQuotes(kwic.getPid()));
                    }
                    csv.append("\"");
                    csv.append(separator);
                    csv.append("\"");
                    if (kwic.getReference() != null) {
                        csv.append(escapeQuotes(kwic.getReference()));
                    }
                    csv.append("\"");
                    csv.append("\n");
                    noResult = false;
                }
            }
        }
        if (noResult) {
            return null;
        } else {
            return csv.toString();
        }
    }

    /**
     * Escapes a CSV field value by doubling every embedded double-quote
     * character; all other characters pass through unchanged.
     *
     * @param text field value to escape (must not be {@code null})
     * @return the escaped character sequence
     */
    private static CharSequence escapeQuotes(String text) {
        StringBuilder sb = new StringBuilder();
        for (int i = 0; i < text.length(); i++) {
            char ch = text.charAt(i);
            if (ch == '"') {
                sb.append('"');
            }
            sb.append(ch);
        }
        return sb;
    }

    /**
     * Renders the processed results as an Excel (XLSX) workbook with one bold
     * header row followed by one row per KWIC hit.
     *
     * @param resultsProcessed results to export; may be {@code null} or empty
     * @return the workbook bytes, or {@code null} when there is nothing to export
     * @throws ExportException if writing the workbook fails
     */
    public static byte[] getExportExcel(List<Result> resultsProcessed) throws ExportException {

        boolean noResult = true;
        SXSSFWorkbook workbook = null;
        ByteArrayOutputStream excelStream = new ByteArrayOutputStream();
        if (resultsProcessed != null && !resultsProcessed.isEmpty()) {
            try {
                String[] headers = new String[]{
                    "LEFT CONTEXT", "KEYWORD", "RIGHT CONTEXT", "PID", "REFERENCE"};

                workbook = new SXSSFWorkbook();
                Sheet sheet = workbook.createSheet();

                Font boldFont = workbook.createFont();
                boldFont.setBoldweight(Font.BOLDWEIGHT_BOLD);

                // Header
                CellStyle headerStyle = workbook.createCellStyle();
                headerStyle.setFont(boldFont);

                Row row = sheet.createRow(0);

                for (int j = 0; j < headers.length; ++j) {
                    Cell cell = row.createCell(j, Cell.CELL_TYPE_STRING);
                    cell.setCellValue(headers[j]);
                    cell.setCellStyle(headerStyle);
                }

                // Body: use a single running row index. The previous code used
                // "resultIndex + kwicIndex + 1", which maps different (result, kwic)
                // pairs to the same row number and silently overwrites rows.
                int rowIndex = 1;
                Cell cell;
                for (Result result : resultsProcessed) {
                    for (Kwic kwic : result.getKwics()) {
                        row = sheet.createRow(rowIndex++);
                        cell = row.createCell(0, Cell.CELL_TYPE_STRING);
                        cell.setCellValue(kwic.getLeft());
                        cell = row.createCell(1, Cell.CELL_TYPE_STRING);
                        cell.setCellValue(kwic.getKeyword());
                        cell = row.createCell(2, Cell.CELL_TYPE_STRING);
                        cell.setCellValue(kwic.getRight());
                        if (kwic.getPid() != null) {
                            cell = row.createCell(3, Cell.CELL_TYPE_STRING);
                            cell.setCellValue(kwic.getPid());
                        }
                        if (kwic.getReference() != null) {
                            // BUG FIX: the reference belongs in column 4 (REFERENCE);
                            // it was previously written to column 3, clobbering the PID.
                            cell = row.createCell(4, Cell.CELL_TYPE_STRING);
                            cell.setCellValue(kwic.getReference());
                        }
                        noResult = false;
                    }
                }
                workbook.write(excelStream);
            } catch (IOException ex) {
                LOGGER.log(Level.SEVERE, null, ex);
                throw new ExportException("Exception exporting Excel", ex);
            } finally {
                // SXSSF keeps temp files on disk; always release them
                if (workbook != null) {
                    workbook.dispose();
                }
            }
        }
        if (noResult) {
            return null;
        } else {
            return excelStream.toByteArray();
        }

    }

    /**
     * Renders the processed results as a TCF (WebLicht Text Corpus Format)
     * document containing a single text layer.
     *
     * @param resultsProcessed results to export; may be {@code null} or empty
     * @param searchLanguage ISO 639-3 code of the search language, mapped to a
     *        two-letter code for the corpus
     *        (NOTE(review): code_1ForCode_3 may return null for codes without a
     *        639-1 equivalent, e.g. "anylang" — confirm upstream handling)
     * @return the TCF bytes, or {@code null} when there is nothing to export
     * @throws ExportException if TCF serialization fails
     */
    public static byte[] getExportTCF(List<Result> resultsProcessed,
            String searchLanguage) throws ExportException {
        String text = getExportText(resultsProcessed);
        if (text == null || text.isEmpty()) {
            return null;
        } else {
            WLData data;
            MetaData md = new MetaData();
            String languageCode = LanguagesISO693.getInstance().code_1ForCode_3(searchLanguage);
            TextCorpusStored tc = new TextCorpusStored(languageCode);
            tc.createTextLayer().addText(text);
            data = new WLData(md, tc);
            ByteArrayOutputStream os = new ByteArrayOutputStream();
            try {
                WLDObjector.write(data, os);
            } catch (WLFormatException ex) {
                LOGGER.log(Level.SEVERE, "Error exporting TCF {0} {1}",
                        new String[]{ex.getClass().getName(), ex.getMessage()});
                throw new ExportException("Error exporting TCF", ex);
            }
            return os.toByteArray();
        }
    }

    /**
     * Renders the processed results as plain text: one line per KWIC hit in the
     * form "left keyword right".
     *
     * @param resultsProcessed results to export; may be {@code null} or empty
     * @return the text, or {@code null} when there is nothing to export
     */
    public static String getExportText(List<Result> resultsProcessed) {
        StringBuilder text = new StringBuilder();
        if (resultsProcessed != null && !resultsProcessed.isEmpty()) {
            for (Result result : resultsProcessed) {
                for (Kwic kwic : result.getKwics()) {
                    // NOTE(review): the loop body is elided as unchanged context in
                    // the changeset; reconstructed from the identical pre-change
                    // version visible in the diff — confirm against the repository.
                    text.append(kwic.getLeft());
                    text.append(" ");
                    text.append(kwic.getKeyword());
                    text.append(" ");
                    text.append(kwic.getRight());
                    text.append("\n");
                }
            }
        }
        if (text.length() == 0) {
            return null;
        } else {
            return text.toString();
        }
    }
}
Note: See TracChangeset
for help on using the changeset viewer.