Changeset 5197
- Timestamp: 05/14/14 14:04:26 (10 years ago)
- Location: vlo/trunk/vlo-importer/src
- Files: 3 edited
Legend:
- Unmodified
- Added
- Removed
-
vlo/trunk/vlo-importer/src/main/java/eu/clarin/cmdi/vlo/importer/FacetMappingFactory.java
r4029 r5197 19 19 20 20 /** 21 * Creates facet-mappings (xpaths) from a configuration. 22 * As they say "this is where the magic happens". 23 * Also does some caching. 21 * Creates facet-mappings (xpaths) from a configuration. As they say "this is 22 * where the magic happens". Also does some caching. 24 23 */ 25 24 public class FacetMappingFactory { … … 42 41 43 42 /** 44 * Get facet concept mapping. 45 * 46 * Get facet mapping used to map meta data based on a facet concepts 47 * fileand url to cmdi meta data profile.48 43 * Get facet concept mapping. 44 * 45 * Get facet mapping used to map meta data based on a facet concepts file 46 * and url to cmdi meta data profile. 47 * 49 48 * @param facetConcepts name of the facet concepts file 50 49 * @param xsd url of xml schema of cmdi profile 51 * 50 * 52 51 * @return facet concept mapping 53 52 */ … … 64 63 /** 65 64 * Create facet concept mapping. 66 * 65 * 67 66 * Create facet mapping used to map meta data based on a facet concept 68 67 * mapping file and url to cmdi meta data profile. 
69 * 68 * 70 69 * @param facetConcepts name of the facet concepts file 71 70 * @param xsd url of xml schema of cmdi profile 72 * 71 * 73 72 * @return the facet mapping used to map meta data to facets 74 73 */ … … 96 95 pathConceptLinkMapping = new HashMap<String, String>(); 97 96 for (String c : conceptLinkPathMapping.keySet()) { 98 for (String p : conceptLinkPathMapping.get(c)) 99 pathConceptLinkMapping.put(p,c); 97 for (String p : conceptLinkPathMapping.get(c)) { 98 pathConceptLinkMapping.put(p, c); 99 } 100 100 } 101 101 } … … 107 107 if (context == null && acceptableContext.includeEmpty()) { 108 108 // no context is accepted 109 LOG.debug("facet[ "+facetConcept.getName()+"] path["+path+"] context["+context+"](empty) is accepted");109 LOG.debug("facet[{}] path[{}] context[{}](empty) is accepted", facetConcept.getName(), path, context); 110 110 xpaths.add(path); 111 111 handled = true; 112 112 } else if (acceptableContext.getConcepts().contains(context)) { 113 113 // a specific context is accepted 114 LOG.debug("facet[ "+facetConcept.getName()+"] path["+path+"] context["+context+"] is accepted");114 LOG.debug("facet[{}] path[{}] context[{}] is accepted", facetConcept.getName(), path, context); 115 115 xpaths.add(path); 116 116 handled = true; … … 122 122 if (context == null && rejectableContext.includeEmpty()) { 123 123 // no context is rejected 124 LOG.debug("facet[ "+facetConcept.getName()+"] path["+path+"] context["+context+"](empty) is rejected");124 LOG.debug("facet[{}] path[{}] context[{}](empty) is rejected", facetConcept.getName(), path, context); 125 125 handled = true; 126 126 } else if (rejectableContext.getConcepts().contains(context)) { 127 127 // a specific context is rejected 128 LOG.debug("facet[ "+facetConcept.getName()+"] path["+path+"] context["+context+"] is rejected");128 LOG.debug("facet[{}] path[{}] context[{}] is rejected", facetConcept.getName(), path, context); 129 129 handled = true; 130 130 } else if (rejectableContext.includeAny()) { 
131 131 // any context is rejected 132 LOG.debug("facet[ "+facetConcept.getName()+"] path["+path+"] context["+context+"](any) is rejected");132 LOG.debug("facet[{}] path[{}] context[{}](any) is rejected", facetConcept.getName(), path, context); 133 133 handled = true; 134 134 } 135 135 } 136 if (!handled && context !=null && facetConcept.hasAcceptableContext() && facetConcept.getAcceptableContext().includeAny()) {136 if (!handled && context != null && facetConcept.hasAcceptableContext() && facetConcept.getAcceptableContext().includeAny()) { 137 137 // any, not rejected context, is accepted 138 LOG.debug("facet[ "+facetConcept.getName()+"] path["+path+"] context["+context+"](any) is accepted");138 LOG.debug("facet[{}] path[{}] context[{}](any) is accepted", facetConcept.getName(), path, context); 139 139 xpaths.add(path); 140 140 } 141 141 } 142 } else 142 } else { 143 143 xpaths.addAll(paths); 144 } 144 145 } 145 146 } 146 147 147 148 //add hardcoded patterns only when there is no xpath generated from conceptlink 148 149 if (xpaths.isEmpty()) { 149 150 xpaths.addAll(facetConcept.getPatterns()); 150 151 } 151 152 152 153 // pattern-based blacklisting: remove all XPath expressions that contain a blacklisted substring; 153 154 // this is basically a hack to enhance the quality of the visualised information in the VLO; 154 155 // should be replaced by a more intelligent approach in the future 155 for (String blacklistPattern : facetConcept.getBlacklistPatterns()) {156 157 while(xpathIterator.hasNext()) {158 159 if(xpath.contains(blacklistPattern)) {160 LOG.debug("Rejecting "+xpath+" because of blacklisted substring "+blacklistPattern);161 162 163 164 } 165 156 for (String blacklistPattern : facetConcept.getBlacklistPatterns()) { 157 Iterator<String> xpathIterator = xpaths.iterator(); 158 while (xpathIterator.hasNext()) { 159 String xpath = xpathIterator.next(); 160 if (xpath.contains(blacklistPattern)) { 161 LOG.debug("Rejecting {} because of blacklisted substring {}", 
xpath, blacklistPattern); 162 xpathIterator.remove(); 163 } 164 } 165 } 166 166 167 config.setCaseInsensitive(facetConcept.isCaseInsensitive()); 167 168 config.setAllowMultipleValues(facetConcept.isAllowMultipleValues()); … … 173 174 } 174 175 } catch (NavException e) { 175 LOG.error("Error creating facetMapping from xsd: " + xsd + " ", e);176 LOG.error("Error creating facetMapping from xsd: {}", xsd, e); 176 177 } 177 178 return result; 178 179 } 179 180 /** 181 * Look if there is a contextual (container) data category associated with an ancestor by walking back. 182 */ 183 private String getContext(String path, Map<String,String> pathConceptLinkMapping) { 180 181 /** 182 * Look if there is a contextual (container) data category associated with 183 * an ancestor by walking back. 184 */ 185 private String getContext(String path, Map<String, String> pathConceptLinkMapping) { 184 186 String context = null; 185 187 String cpath = path; 186 while (context==null && !cpath.equals("/text()")) {187 cpath = cpath.replaceAll("/[^/]*/text\\(\\)", "/text()");188 while (context == null && !cpath.equals("/text()")) { 189 cpath = cpath.replaceAll("/[^/]*/text\\(\\)", "/text()"); 188 190 context = pathConceptLinkMapping.get(cpath); 189 191 } … … 192 194 193 195 /** 194 * The id facet is special case and patterns must be added first. 195 * The standard pattern to get the id out of the header is the most reliable and it should fall back on concept matching if nothing matches. 196 * (Note this is the exact opposite of other facets where the concept match is probably better then the 'hardcoded' pattern). 196 * The id facet is special case and patterns must be added first. The 197 * standard pattern to get the id out of the header is the most reliable and 198 * it should fall back on concept matching if nothing matches. (Note this is 199 * the exact opposite of other facets where the concept match is probably 200 * better then the 'hardcoded' pattern). 
197 201 */ 198 202 private void handleId(List<String> xpaths, FacetConcept facetConcept) { … … 203 207 204 208 /** 205 * "this is where the magic happens". 206 * Finds paths in the xsd to all concepts (isocat data catagories). 209 * "this is where the magic happens". Finds paths in the xsd to all concepts 210 * (isocat data catagories). 211 * 207 212 * @param xsd URL of XML Schema of some CMDI profile 208 * @return Map (Data Category -> List of XPath expressions linked to the key data category which can be found in CMDI files with this schema) 213 * @return Map (Data Category -> List of XPath expressions linked to the key 214 * data category which can be found in CMDI files with this schema) 209 215 * @throws NavException 210 216 */ … … 214 220 boolean parseSuccess = vg.parseHttpUrl(xsd, true); 215 221 if (!parseSuccess) { 216 LOG.error("Cannot create ConceptLink Map from xsd (xsd is probably not reachable): " +xsd+". All metadata instances that use this xsd will not be imported correctly.");222 LOG.error("Cannot create ConceptLink Map from xsd (xsd is probably not reachable): " + xsd + ". All metadata instances that use this xsd will not be imported correctly."); 217 223 return result; //return empty map, so the incorrect xsd is not tried for all metadata instances that specify it. 218 224 } … … 243 249 244 250 /** 245 * Goal is to get the "datcat" attribute. Tries a number of different favors that were found in the xsd's. 251 * Goal is to get the "datcat" attribute. Tries a number of different favors 252 * that were found in the xsd's. 253 * 246 254 * @return -1 if index is not found. 247 255 */ … … 260 268 /** 261 269 * Given an xml-token path thingy create an xpath. 270 * 262 271 * @param elementPath 263 272 * @return … … 273 282 /** 274 283 * does some updating after a step. To keep the path proper and path-y. 284 * 275 285 * @param vn 276 286 * @param elementPath … … 292 302 293 303 class Token { 304 294 305 final String name; 295 306 final int depth; -
vlo/trunk/vlo-importer/src/main/java/eu/clarin/cmdi/vlo/importer/MetadataImporter.java
r5143 r5197 71 71 */ 72 72 final static Map<String, PostProcessor> POST_PROCESSORS = new HashMap<String, PostProcessor>(); 73 73 74 74 static { 75 75 POST_PROCESSORS.put(FacetConstants.FIELD_ID, new IdPostProcessor()); … … 116 116 */ 117 117 void startImport() throws MalformedURLException { 118 118 119 119 initSolrServer(); 120 120 List<DataRoot> dataRoots = checkDataRoots(); … … 144 144 LOG.info("Skipping " + file.getAbsolutePath() + " because it is too large."); 145 145 } else { 146 LOG.debug("PROCESSING FILE: " +file.getAbsolutePath());146 LOG.debug("PROCESSING FILE: {}", file.getAbsolutePath()); 147 147 processCmdi(file, dataRoot, processor); 148 148 } … … 273 273 } 274 274 } catch (Exception e) { 275 LOG.error("error in file: " + file + " Exception", e);275 LOG.error("error in file: {}", file, e); 276 276 nrOfFilesWithError++; 277 277 } 278 if (cmdiData != null && processedIds.add(cmdiData.getId())) { 279 SolrInputDocument solrDocument = cmdiData.getSolrDocument(); 280 if (solrDocument != null) { 281 if (!cmdiData.getDataResources().isEmpty() || !cmdiData.getLandingPageResources().isEmpty() 282 || !cmdiData.getSearchResources().isEmpty() || !cmdiData.getSearchPageResources().isEmpty() 283 || cmdiData.getMetadataResources().isEmpty()) { 284 // We only add metadata files that have 285 // 1) data resources or 286 // 2) a landing page or 287 // 3) a search service (like SRU/CQL) or 288 // 4) a search page or 289 // 5) that have none of the above but also lack any metadata links (e.g. olac files describing a corpus with a link to the original archive). 290 // Other files will have only metadata resources and are considered 'collection' metadata files they 291 // are usually not very interesting (think imdi corpus files) and will not be included. 
292 updateDocument(solrDocument, cmdiData, file, dataOrigin); 293 } else { 294 nrOfIgnoredFiles++; 278 if (cmdiData != null) { 279 if (processedIds.add(cmdiData.getId())) { 280 SolrInputDocument solrDocument = cmdiData.getSolrDocument(); 281 if (solrDocument != null) { 282 if (!cmdiData.getDataResources().isEmpty() || !cmdiData.getLandingPageResources().isEmpty() 283 || !cmdiData.getSearchResources().isEmpty() || !cmdiData.getSearchPageResources().isEmpty() 284 || cmdiData.getMetadataResources().isEmpty()) { 285 // We only add metadata files that have 286 // 1) data resources or 287 // 2) a landing page or 288 // 3) a search service (like SRU/CQL) or 289 // 4) a search page or 290 // 5) that have none of the above but also lack any metadata links (e.g. olac files describing a corpus with a link to the original archive). 291 // Other files will have only metadata resources and are considered 'collection' metadata files they 292 // are usually not very interesting (think imdi corpus files) and will not be included. 
293 updateDocument(solrDocument, cmdiData, file, dataOrigin); 294 } else { 295 nrOfIgnoredFiles++; 296 } 295 297 } 298 } else { 299 LOG.warn("Skipping {}, already processed id: {}", file, cmdiData.getId()); 296 300 } 297 301 } … … 328 332 solrDocument.addField(FacetConstants.FIELD_ID, cmdiData.getId()); 329 333 solrDocument.addField(FacetConstants.FIELD_FILENAME, file.getAbsolutePath()); 330 334 331 335 String metadataSourceUrl = dataOrigin.getPrefix(); 332 336 metadataSourceUrl += file.getAbsolutePath().substring(dataOrigin.getToStrip().length()); 333 337 334 338 solrDocument.addField(FacetConstants.FIELD_COMPLETE_METADATA, metadataSourceUrl); 335 339 … … 356 360 // add resource proxys 357 361 addResourceData(solrDocument, cmdiData); 362 363 LOG.debug("Adding document for submission to SOLR: {}", file); 358 364 docs.add(solrDocument); 359 365 if (docs.size() == config.getMaxDocsInList()) { … … 391 397 format = CommonUtils.normalizeMimeType(mimeType); 392 398 } 393 399 394 400 FormatPostProcessor processor = new FormatPostProcessor(); 395 401 mimeType = processor.process(mimeType); … … 435 441 solrServer.query(params); 436 442 } 437 443 438 444 public static VloConfig config; 439 445 … … 455 461 */ 456 462 options.addOption("c", true, "-c <file> : use parameters specified in <file>"); 457 463 458 464 CommandLineParser parser = new PosixParser(); 459 465 460 466 try { 461 467 // parse the command line arguments … … 466 472 configFile = cmd.getOptionValue("c"); 467 473 } 468 474 469 475 } catch (org.apache.commons.cli.ParseException ex) { 470 476 … … 478 484 System.err.println(message); 479 485 } 480 486 481 487 if (configFile == null) { 482 488 483 489 String message; 484 490 485 491 message = "Could not get config file name via the command line, trying the system properties."; 486 492 LOG.info(message); 487 493 488 494 String key; 489 495 490 496 key = "configFile"; 491 497 configFile = System.getProperty(key); 492 498 } 493 499 494 500 if (configFile == null) { 
495 501 496 502 String message; 497 503 498 504 message = "Could not get filename as system property either - stopping."; 499 505 LOG.error(message); -
vlo/trunk/vlo-importer/src/test/java/eu/clarin/cmdi/vlo/importer/MetadataImporterTest.java
r4611 r5197 281 281 " because it is too large."); 282 282 } else { 283 LOG.debug("PROCESSING FILE: " + 284 file.getAbsolutePath()); 283 LOG.debug("PROCESSING FILE: {}", file.getAbsolutePath()); 285 284 /* 286 285 * Anticipate on the solr exception that will
Note: See TracChangeset for help on using the changeset viewer.