- Timestamp:
- 10/30/15 17:19:52 (9 years ago)
- Location:
- vlo/branches/vlo-3.3-oeaw/vlo-importer/src/main/java/eu/clarin/cmdi/vlo/importer
- Files:
- 1 added, 8 edited
Legend:
- Unmodified
- Added
- Removed
-
vlo/branches/vlo-3.3-oeaw/vlo-importer/src/main/java/eu/clarin/cmdi/vlo/importer/AvailabilityPostProcessor.java
r6379 r6715 10 10 */ 11 11 public class AvailabilityPostProcessor extends PostProcessorsWithVocabularyMap { 12 13 private static Map<String, String> availabilityMap; 14 12 15 13 private static final Integer MAX_LENGTH = 20; 16 14 private static final String OTHER_VALUE = "Other"; … … 19 17 @Override 20 18 public List<String> process(final String value) { 21 String result = value;22 19 List<String> resultList = new ArrayList<String>(); 23 24 if (getVocabularyMap().containsKey(value)) { 25 resultList.add(getVocabularyMap().get(value)); 26 } else { 27 if (result.length() > MAX_LENGTH) { 28 resultList.add(OTHER_VALUE); 29 } else { 30 resultList.add(result.trim()); 31 } 32 } 20 21 resultList.add(normalize(value, value.length() > MAX_LENGTH? OTHER_VALUE : value.trim())); 33 22 34 23 return resultList; 35 24 } 36 37 public Map<String, String> getVocabularyMap(){ 38 if(availabilityMap == null){ 39 availabilityMap = createControlledVocabularyMap(MetadataImporter.config.getLicenseAvailabilityMapUrl()); 40 } 41 42 return availabilityMap; 43 } 25 26 27 @Override 28 public String getNormalizationMapURL() { 29 return MetadataImporter.config.getLicenseAvailabilityMapUrl(); 30 } 44 31 } -
vlo/branches/vlo-3.3-oeaw/vlo-importer/src/main/java/eu/clarin/cmdi/vlo/importer/CMDIParserVTDXML.java
r6413 r6715 1 1 package eu.clarin.cmdi.vlo.importer; 2 3 import java.io.File; 4 import java.io.FileInputStream; 5 import java.io.IOException; 6 import java.net.URI; 7 import java.util.ArrayList; 8 import java.util.Arrays; 9 import java.util.HashMap; 10 import java.util.LinkedList; 11 import java.util.List; 12 import java.util.Map; 13 import java.util.Map.Entry; 14 import java.util.regex.Matcher; 15 import java.util.regex.Pattern; 16 17 import org.apache.commons.io.IOUtils; 18 import org.slf4j.Logger; 19 import org.slf4j.LoggerFactory; 2 20 3 21 import com.ximpleware.AutoPilot; … … 8 26 import com.ximpleware.XPathEvalException; 9 27 import com.ximpleware.XPathParseException; 28 10 29 import eu.clarin.cmdi.vlo.FacetConstants; 11 import java.io.File;12 import java.io.FileInputStream;13 import java.io.IOException;14 import java.net.URI;15 import java.util.ArrayList;16 import java.util.List;17 import java.util.Map;18 import java.util.regex.Matcher;19 import java.util.regex.Pattern;20 import org.apache.commons.io.IOUtils;21 import org.slf4j.Logger;22 import org.slf4j.LoggerFactory;23 30 24 31 public class CMDIParserVTDXML implements CMDIDataProcessor { … … 231 238 */ 232 239 private void processFacets(CMDIData cmdiData, VTDNav nav, FacetMapping facetMapping) throws VTDException { 233 List<FacetConfiguration> facetList = facetMapping.getFacets(); 240 241 List<FacetConfiguration> facetList = facetMapping.getFacets(); 234 242 for (FacetConfiguration config : facetList) { 235 243 boolean matchedPattern = false; … … 289 297 continue; 290 298 } 291 299 292 300 final List<String> values = postProcess(config.getName(), value); 293 insertFacetValues(config.getName(), values, cmdiData, languageCode, allowMultipleValues, config.isCaseInsensitive()); 301 302 insertFacetValues(config.getName(), values, cmdiData, languageCode, allowMultipleValues, config.isCaseInsensitive(), true); 303 304 crossMap(config, value, cmdiData, languageCode); 294 305 295 306 //add also non curated resource 
type … … 304 315 derivedValues.addAll(postProcess(derivedFacet, postProcessedValue)); 305 316 } 306 insertFacetValues(derivedFacet, derivedValues, cmdiData, languageCode, allowMultipleValues, config.isCaseInsensitive() );317 insertFacetValues(derivedFacet, derivedValues, cmdiData, languageCode, allowMultipleValues, config.isCaseInsensitive(), true); 307 318 } 308 319 … … 330 341 } 331 342 332 private void insertFacetValues(String name, List<String> valueList, CMDIData cmdiData, String languageCode, boolean allowMultipleValues, boolean caseInsensitive) { 343 344 /* 345 * Add values to facet either they come from MD fields either from cross mapping 346 * Advantage is given to the values from MD fields. They will be always at the begging of the list and in case 347 * when facet doesn't allow multiple values and we already had value from cross mapping this value will be overridden 348 * 349 */ 350 private void insertFacetValues(String name, List<String> valueList, CMDIData cmdiData, String languageCode, boolean allowMultipleValues, boolean caseInsensitive, boolean comesFromConceptMapping) { 351 352 //keep only values from original concepts, not from cross mappings 353 if(comesFromConceptMapping && !allowMultipleValues && cmdiData.getSolrDocument().containsKey(name)){ 354 cmdiData.getSolrDocument().remove(name); 355 } 356 357 if(!comesFromConceptMapping && !allowMultipleValues && cmdiData.getSolrDocument().containsKey(name)) 358 return; 359 333 360 for (int i = 0; i < valueList.size(); i++) { 334 361 if (!allowMultipleValues && i > 0) { … … 361 388 return resultList; 362 389 } 390 391 private void crossMap(FacetConfiguration config, String extractedValue, CMDIData cmdiData, String languageCode){ 392 393 if (postProcessors.containsKey(config.getName())){ 394 PostProcessor processor = postProcessors.get(config.getName()); 395 if(processor instanceof PostProcessorsWithVocabularyMap){ 396 397 List<String> facetNames = MetadataImporter.config.getAllFacetFields(); 398 399 
Map<String, String> crossMap = ((PostProcessorsWithVocabularyMap) processor).getCrossMappings(extractedValue); 400 for(Entry e: crossMap.entrySet()){ 401 String toFacet = (String) e.getKey(); 402 String value = (String) e.getValue(); 403 for(String facetName: facetNames){ 404 if(toFacet.toLowerCase().equals(facetName.toLowerCase())){//normalize facet name, map can contain it in any case 405 insertFacetValues(facetName, Arrays.asList(value), cmdiData, languageCode, config.getAllowMultipleValues(), config.isCaseInsensitive(), false); 406 } 407 } 408 } 409 } 410 } 411 } 412 363 413 } -
vlo/branches/vlo-3.3-oeaw/vlo-importer/src/main/java/eu/clarin/cmdi/vlo/importer/LanguageCodePostProcessor.java
r6379 r6715 43 43 } 44 44 45 @Override 46 protected Map<String, String> getVocabularyMap() { 47 if(languageNameVariantsMap == null){ 48 languageNameVariantsMap = createControlledVocabularyMap(MetadataImporter.config.getLanguageNameVariantsUrl()); 49 } 50 return languageNameVariantsMap; 45 @Override 46 public String getNormalizationMapURL() { 47 return MetadataImporter.config.getLanguageNameVariantsUrl(); 51 48 } 52 53 49 54 50 protected String extractLanguageCode(String value) { … … 59 55 60 56 // map known language name variants to their offical name 61 if(getVocabularyMap().containsKey(result)) 62 result = getVocabularyMap().get(result); 57 result = normalize(result); 63 58 64 59 // input is already ISO 639-3? … … 90 85 return result; 91 86 } 87 92 88 } -
vlo/branches/vlo-3.3-oeaw/vlo-importer/src/main/java/eu/clarin/cmdi/vlo/importer/MetadataImporter.java
r6502 r6715 89 89 POST_PROCESSORS.put(FacetConstants.FIELD_CLARIN_PROFILE, new CMDIComponentProfileNamePostProcessor()); 90 90 POST_PROCESSORS.put(FacetConstants.FIELD_RESOURCE_CLASS, new ResourceClassPostProcessor()); 91 POST_PROCESSORS.put(FacetConstants.FIELD_PROFILE, new ProfileNamePostProcessor()); 91 92 } 92 93 -
vlo/branches/vlo-3.3-oeaw/vlo-importer/src/main/java/eu/clarin/cmdi/vlo/importer/NationalProjectPostProcessor.java
r6509 r6715 27 27 private final static Logger LOG = LoggerFactory.getLogger(NationalProjectPostProcessor.class); 28 28 29 private static Map<String, String> nationalProjectMap = null;30 private static Map<Pattern, String> nationalProjectRegExpMap = null;31 32 29 /** 33 30 * Returns the national project based on the mapping in … … 39 36 @Override 40 37 public List<String> process(String value) { 41 String input = value.trim();42 38 List<String> resultList = new ArrayList<String>(); 43 44 if (input != null && getVocabularyMap().containsKey(input)) { 45 resultList.add(getVocabularyMap().get(input)); 46 return resultList; 47 } 48 49 for (Pattern pattern : getRegExpMapping().keySet()) { 50 Matcher matcher = pattern.matcher(input); 51 if (matcher.find()) { 52 resultList.add(getRegExpMapping().get(pattern)); 53 return resultList; 54 } 55 } 56 57 resultList.add(""); 39 resultList.add(normalize(value.trim(), "")); 40 58 41 return resultList; 59 42 } 60 43 61 @Override62 protected Map<String, String> getVocabularyMap() {63 if(nationalProjectMap == null){64 createControlledVocabularyMap(getMappingFileUrl());65 }66 67 return nationalProjectMap;68 }69 70 private Map<Pattern, String> getRegExpMapping() {71 if (nationalProjectRegExpMap == null) {72 createControlledVocabularyMap(getMappingFileUrl());73 }74 return nationalProjectRegExpMap;75 }76 77 78 protected String getMappingFileUrl() {79 String projectsMappingFile = MetadataImporter.config.getNationalProjectMapping();80 44 81 if (projectsMappingFile.length() == 0) {82 // use the packaged project mapping file83 projectsMappingFile = "/nationalProjectsMapping.xml";84 }85 86 return projectsMappingFile;87 }88 89 45 @Override 90 protected Map<String, String> createControlledVocabularyMap(String mapUrl) { 91 if(nationalProjectMap == null){ 92 nationalProjectMap = new TreeMap<String, String>(String.CASE_INSENSITIVE_ORDER); 93 } 94 95 if(nationalProjectRegExpMap == null){ 96 nationalProjectRegExpMap = new HashMap<Pattern, String>(); 97 
} 98 99 VariantsMap map = getMappingFromFile(mapUrl); 100 101 for(Entry<String, List<Variant>> entry: map.getMap().entrySet()){ 102 for(Variant variant: entry.getValue()){ 103 if(variant.isRegExp()){ 104 nationalProjectRegExpMap.put(Pattern.compile(variant.getValue()), entry.getKey()); 105 }else{ 106 nationalProjectMap.put(variant.getValue(), entry.getKey()); 107 } 108 } 109 } 110 111 return nationalProjectMap; 46 public String getNormalizationMapURL() { 47 return MetadataImporter.config.getNationalProjectMapping(); 112 48 } 113 49 -
vlo/branches/vlo-3.3-oeaw/vlo-importer/src/main/java/eu/clarin/cmdi/vlo/importer/OrganisationPostProcessor.java
r6379 r6715 23 23 String[] splitArray = normalizeInputString(value).split(";"); 24 24 for (int i = 0; i < splitArray.length; i++) { 25 String orgaName = splitArray[i]; 26 if (getVocabularyMap().containsKey(normalizeVariant(orgaName))) { 27 splitArray[i] = getVocabularyMap().get(normalizeVariant(orgaName)); 28 } 25 String normalizedVal = normalize(splitArray[i], null); 26 if(normalizedVal != null) 27 splitArray[i] = normalizeVariant(normalizedVal); 29 28 } 30 29 … … 32 31 } 33 32 34 @Override 35 protected Map<String, String> getVocabularyMap() { 36 if(organisationNamesMap == null){ 37 organisationNamesMap = createControlledVocabularyMap(MetadataImporter.config.getOrganisationNamesUrl()); 38 } 39 return organisationNamesMap; 33 @Override 34 public String getNormalizationMapURL() { 35 return MetadataImporter.config.getOrganisationNamesUrl(); 40 36 } 41 37 … … 48 44 } 49 45 46 50 47 51 48 -
vlo/branches/vlo-3.3-oeaw/vlo-importer/src/main/java/eu/clarin/cmdi/vlo/importer/PostProcessorsWithVocabularyMap.java
r6616 r6715 2 2 3 3 import java.io.InputStream; 4 import java.util.List; 4 5 import java.util.Map; 5 6 import java.util.Map.Entry; … … 9 10 10 11 import eu.clarin.cmdi.vlo.pojo.VariantsMap; 12 import eu.clarin.cmdi.vlo.pojo.VocabularyEntry; 11 13 import eu.clarin.cmdi.vlo.transformers.VariantsMapMarshaller; 12 14 … … 23 25 public abstract class PostProcessorsWithVocabularyMap implements PostProcessor{ 24 26 25 private final static Logger LOG = LoggerFactory.getLogger(PostProcessorsWithVocabularyMap.class); 26 27 private final static Logger _logger = LoggerFactory.getLogger(PostProcessorsWithVocabularyMap.class); 28 29 private List<VocabularyEntry> map; 30 31 public PostProcessorsWithVocabularyMap(){ 32 VariantsMap varinatsRawMap = getMappingFromFile(getNormalizationMapURL()); 33 map = varinatsRawMap.getMap(); 34 } 35 36 37 /* 38 * This method is used for normalization of facet values based on normalization maps. 39 * In case that value is not in the map, the input value is returned. 40 * 41 * @param value - original value from record 42 * @return normalized value if there is a match otherwise original value 43 * 44 */ 45 public String normalize(String value){ 46 return normalize(value, value); 47 } 48 27 49 28 50 /* 29 * returns specific static map and should call createControlledVocabularyMap 51 * This method is used for normalization of facet values based on normalization maps. 52 * With second parameter user can specify what to return in case when value is not in normalization map (special case for NationalProject facet). 53 * 54 * @param value - original value from record 55 * @param fallBackValue - value to be returned in case of no match 56 * @return normalized value if there is a match otherwise returns what is specified with 2nd parameter 57 * 30 58 */ 31 protected abstract Map<String, String> getVocabularyMap(); 59 public String normalize(String value, String fallBackValue){ 60 int ind = map.indexOf(value); 61 return (ind != -1)? 
map.get(ind).getNormalizedValue() : fallBackValue; 62 } 63 64 public Map<String, String> getCrossMappings(String value){ 65 int ind = map.indexOf(value); 66 return (ind != -1)? map.get(ind).getCrossMap() : null; 67 } 68 69 public abstract String getNormalizationMapURL(); 32 70 33 71 … … 40 78 try { 41 79 42 LOG.info("Reading vocabulary file from: {}", mapUrl);80 _logger.info("Reading vocabulary file from: {}", mapUrl); 43 81 // load records from file 44 82 // in the future this should be loaded from CLAVAS directly and the file only used as fallback … … 52 90 } 53 91 54 // for debug purposes 92 93 94 // for debug 55 95 public void printMap(){ 56 LOG.info("map contains {} entries", getVocabularyMap().size()); 57 for(Entry<String, String> e: getVocabularyMap().entrySet()){ 58 byte bytes[] = e.getKey().getBytes(); 59 StringBuilder sb = new StringBuilder(); 60 for (byte b : bytes) 61 sb.append(String.format("%02X ", b)); 62 63 LOG.info("Key <{} {}> will be mapped to <{}>", e.getKey(), sb.toString(), e.getValue()); 64 } 65 66 String s = "TÃŒbingen Curated Resource"; 67 byte bytes[] = s.getBytes(); 68 StringBuilder sb = new StringBuilder(); 69 for (byte b : bytes) 70 sb.append(String.format("%02X ", b)); 71 LOG.info("{} {}", s, sb.toString()); 72 73 96 _logger.info("map contains {} entries", map.size()); 97 for(VocabularyEntry entry: map) 98 _logger.info(entry.toString()); 99 74 100 } 75 76 77 78 79 101 } -
vlo/branches/vlo-3.3-oeaw/vlo-importer/src/main/java/eu/clarin/cmdi/vlo/importer/ResourceClassPostProcessor.java
r6400 r6715 10 10 11 11 public class ResourceClassPostProcessor extends PostProcessorsWithVocabularyMap { 12 private static Map<String, String> resourceTypeMap;13 12 14 13 /** … … 31 30 } 32 31 33 if (getVocabularyMap().containsKey(value)) 34 result = getVocabularyMap().get(value); 32 result = normalize(value); 35 33 36 34 return Arrays.asList(result); … … 38 36 39 37 @Override 40 protected Map<String, String> getVocabularyMap() { 41 if(resourceTypeMap == null){ 42 resourceTypeMap = createControlledVocabularyMap(MetadataImporter.config.getResourceClassMapUrl()); 43 } 44 45 return resourceTypeMap; 38 public String getNormalizationMapURL() { 39 return MetadataImporter.config.getResourceClassMapUrl(); 46 40 } 47 41 }
Note: See TracChangeset for help on using the changeset viewer.