Changeset 4747
- Timestamp:
- 03/18/14 15:26:03 (11 years ago)
- File:
-
- 1 edited
Legend:
- Unmodified
- Added
- Removed
-
vlo/branches/vlo-3.0/vlo-importer/src/main/java/eu/clarin/cmdi/vlo/importer/MetadataImporter.java
r4612 r4747 35 35 import org.slf4j.LoggerFactory; 36 36 37 38 37 /** 39 38 * The main metadataImporter class. Also contains the main function. … … 44 43 * and so on. 45 44 */ 46 47 45 @SuppressWarnings({"serial"}) 48 46 public class MetadataImporter { 49 47 50 48 /** 51 * Defines which files to try and parse. 52 * In this case all files ending in"xml" or "cmdi".53 */ 54 private static final String[] VALID_CMDI_EXTENSIONS = new String[] { "xml", "cmdi"};49 * Defines which files to try and parse. In this case all files ending in 50 * "xml" or "cmdi". 51 */ 52 private static final String[] VALID_CMDI_EXTENSIONS = new String[]{"xml", "cmdi"}; 55 53 56 54 /** … … 73 71 */ 74 72 final static Map<String, PostProcessor> POST_PROCESSORS = new HashMap<String, PostProcessor>(); 73 75 74 static { 76 75 POST_PROCESSORS.put(FacetConstants.FIELD_ID, new IdPostProcessor()); 77 76 POST_PROCESSORS.put(FacetConstants.FIELD_CONTINENT, new ContinentNamePostProcessor()); 78 77 POST_PROCESSORS.put(FacetConstants.FIELD_COUNTRY, new CountryNamePostProcessor()); 79 78 POST_PROCESSORS.put(FacetConstants.FIELD_LANGUAGE, new LanguageCodePostProcessor()); … … 83 82 POST_PROCESSORS.put(FacetConstants.FIELD_CLARIN_PROFILE, new CMDIComponentProfileNamePostProcessor()); 84 83 } 85 84 86 85 /** 87 86 * Constructor 88 * 87 * 89 88 * @param 90 89 */ 91 public MetadataImporter (){ 92 } 93 94 /** 95 * Contains MDSelflinks (usually). 96 * Just to know what we have already done. 90 public MetadataImporter() { 91 } 92 93 /** 94 * Contains MDSelflinks (usually). Just to know what we have already done. 97 95 */ 98 96 protected final Set<String> processedIds = new HashSet<String>(); … … 129 127 LOG.info("Deleting original data done."); 130 128 } 131 129 132 130 // Import the specified data roots 133 131 for (DataRoot dataRoot : dataRoots) { … … 141 139 List<File> files = getFilesFromDataRoot(dataRoot.getRootFile()); 142 140 for (File file : files) { 143 if (config.getMaxFileSize() > 0 &&144 file.length() > config.getMaxFileSize()) {141 if (config.getMaxFileSize() > 0 142 && file.length() > config.getMaxFileSize()) { 145 143 LOG.info("Skipping " + file.getAbsolutePath() + " because it is too large."); 146 144 } else { … … 154 152 LOG.info("End of processing: " + dataRoot.getOriginName()); 155 153 } 156 154 157 155 // delete outdated entries (based on maxDaysInSolr parameter) 158 if (config.getMaxDaysInSolr() > 0 && config.getDeleteAllFirst() == false) {159 LOG.info("Deleting old files that were not seen for more than " +config.getMaxDaysInSolr()+" days...");160 solrServer.deleteByQuery(FacetConstants.FIELD_LAST_SEEN +":[* TO NOW-"+config.getMaxDaysInSolr()+"DAYS]");156 if (config.getMaxDaysInSolr() > 0 && config.getDeleteAllFirst() == false) { 157 LOG.info("Deleting old files that were not seen for more than " + config.getMaxDaysInSolr() + " days..."); 158 solrServer.deleteByQuery(FacetConstants.FIELD_LAST_SEEN + ":[* TO NOW-" + config.getMaxDaysInSolr() + "DAYS]"); 161 159 LOG.info("Deleting old files done."); 162 160 } … … 171 169 solrServer.commit(); 172 170 buildSuggesterIndex(); 173 } 171 } 174 172 } catch (SolrServerException e) { 175 173 LOG.error("cannot commit:\n", e); … … 223 221 224 222 /** 225 * Create an interface to the SOLR server. 226 * 223 * Create an interface to the SOLR server. 224 * 227 225 * After the interface has been created the importer can send documents to 228 226 * the server. Sending documents involves a queue. The importer adds … … 235 233 String solrUrl = config.getSolrUrl(); 236 234 LOG.info("Initializing Solr Server on " + solrUrl); 237 235 238 236 /* Specify the number of documents in the queue that will trigger the 239 237 * threads, two of them, emptying it. 240 238 */ 241 solrServer = new ConcurrentUpdateSolrServer(solrUrl, 239 solrServer = new ConcurrentUpdateSolrServer(solrUrl, 242 240 config.getMinDocsInSolrQueue(), 2) { 243 241 /* … … 246 244 * serverError variable. 247 245 */ 248 @Override249 public void handleError(Throwable exception) {250 super.handleError(exception);251 serverError = exception;252 }253 };246 @Override 247 public void handleError(Throwable exception) { 248 super.handleError(exception); 249 serverError = exception; 250 } 251 }; 254 252 } 255 253 … … 280 278 if (solrDocument != null) { 281 279 if (!cmdiData.getDataResources().isEmpty() || !cmdiData.getLandingPageResources().isEmpty() 282 283 || cmdiData.getMetadataResources().isEmpty()) {280 || !cmdiData.getSearchResources().isEmpty() || !cmdiData.getSearchPageResources().isEmpty() 281 || cmdiData.getMetadataResources().isEmpty()) { 284 282 // We only add metadata files that have 285 283 // 1) data resources or 286 287 288 284 // 2) a landing page or 285 // 3) a search service (like SRU/CQL) or 286 // 4) a search page or 289 287 // 5) that have none of the above but also lack any metadata links (e.g. olac files describing a corpus with a link to the original archive). 290 288 // Other files will have only metadata resources and are considered 'collection' metadata files they … … 343 341 solrDocument.addField(FacetConstants.FIELD_LANDINGPAGE, resource.getResourceName()); 344 342 } 345 343 346 344 // add search page resource 347 345 for (Resource resource : cmdiData.getSearchPageResources()) { 348 346 solrDocument.addField(FacetConstants.FIELD_SEARCHPAGE, resource.getResourceName()); 349 347 } 350 348 351 349 // add timestamp 352 350 Date dt = new Date(); 353 351 SimpleDateFormat df = new SimpleDateFormat("yyyy-MM-dd'T'HH:mm:ss'Z'"); 354 352 solrDocument.addField(FacetConstants.FIELD_LAST_SEEN, df.format(dt)); 355 353 356 354 // add resource proxys 357 355 addResourceData(solrDocument, cmdiData); … … 367 365 * overwritten by some more specific xpath (as in the LRT cmdi files). So if 368 366 * a type is overwritten and already in the solrDocument we take that type. 369 * 370 * TODO evaluate odd connection between FIELD_FORMAT and ResourceProxy-Mimetypes 367 * 368 * TODO evaluate odd connection between FIELD_FORMAT and 369 * ResourceProxy-Mimetypes 371 370 */ 372 371 protected void addResourceData(SolrInputDocument solrDocument, CMDIData cmdiData) { … … 390 389 format = CommonUtils.normalizeMimeType(mimeType); 391 390 } 392 391 393 392 FormatPostProcessor processor = new FormatPostProcessor(); 394 393 mimeType = processor.process(mimeType); 395 394 396 395 // TODO check should probably be moved into Solr (by using some minimum length filter) 397 if(!mimeType.equals("")) 398 solrDocument.addField(FacetConstants.FIELD_FORMAT, mimeType); 396 if (!mimeType.equals("")) { 397 solrDocument.addField(FacetConstants.FIELD_FORMAT, mimeType); 398 } 399 399 solrDocument.addField(FacetConstants.FIELD_RESOURCE, mimeType + FacetConstants.FIELD_RESOURCE_SPLIT_CHAR 400 400 + resource.getResourceName()); 401 401 } 402 402 } 403 403 404 404 /** 405 405 * Send current list of SolrImputDocuments to SolrServer and clears list … … 418 418 docs = new ArrayList<SolrInputDocument>(); 419 419 } 420 420 421 421 /** 422 422 * Builds suggester index for autocompletion … … 426 426 */ 427 427 private void buildSuggesterIndex() throws SolrServerException, MalformedURLException { 428 429 HashMap<String,String> paramMap = new HashMap<String, String>();430 431 432 433 428 LOG.info("Building index for autocompletion."); 429 HashMap<String, String> paramMap = new HashMap<String, String>(); 430 paramMap.put("qt", "/suggest"); 431 paramMap.put("spellcheck.build", "true"); 432 SolrParams params = new MapSolrParams(paramMap); 433 solrServer.query(params); 434 434 } 435 435 436 436 public static VloConfig config; 437 437 438 438 /** 439 439 * @param args 440 440 * @throws IOException 441 441 */ 442 public static void main(String[] args) throws MalformedURLException, IOException { 443 444 442 public static void main(String[] args) throws MalformedURLException, IOException { 443 445 444 // path to the configuration file 446 445 String configFile = null; 447 446 448 447 // use the Apache cli framework for getting command line parameters 449 448 Options options = new Options(); … … 461 460 CommandLine cmd = parser.parse(options, args); 462 461 if (cmd.hasOption("c")) { 463 462 464 463 // the "c" option was specified, now get its value 465 464 configFile = cmd.getOptionValue("c"); … … 467 466 468 467 } catch (org.apache.commons.cli.ParseException ex) { 469 468 470 469 /** 471 470 * Caught an exception caused by command line parsing. Try to get … … 473 472 * property. 474 473 */ 475 476 474 String message = "Command line parsing failed. " + ex.getMessage(); 477 475 LOG.error(message); 478 476 System.err.println(message); 479 477 } 480 481 if (configFile == null) {478 479 if (configFile == null) { 482 480 483 481 String message; … … 485 483 message = "Could not get config file name via the command line, trying the system properties."; 486 484 LOG.info(message); 487 485 488 486 String key; 489 487 … … 493 491 494 492 if (configFile == null) { 495 493 496 494 String message; 497 495 498 496 message = "Could not get filename as system property either - stopping."; 499 497 LOG.error(message); 500 498 } else { 501 499 // read the configuration from the externally supplied file 502 XmlVloConfigFactory configFactory = new XmlVloConfigFactory(new URL(configFile)); 500 final URL configUrl; 501 if (configFile.startsWith("file:")) { 502 configUrl = new URL(configFile); 503 } else { 504 configUrl = new File(configFile).toURI().toURL(); 505 } 506 System.out.println("Reading configuration from " + configUrl.toString()); 507 final XmlVloConfigFactory configFactory = new XmlVloConfigFactory(configUrl); 503 508 MetadataImporter.config = configFactory.newConfig(); 504 509 505 510 // optionally, modify the configuration here 506 507 511 // create and start the importer 508 512 MetadataImporter importer = new MetadataImporter(); … … 510 514 511 515 // finished importing 512 513 516 if (MetadataImporter.config.printMapping()) { 514 517 File file = new File("xsdMapping.txt");
Note: See TracChangeset
for help on using the changeset viewer.