Changeset 5985


Ignore:
Timestamp:
02/17/15 10:56:15 (9 years ago)
Author:
teckart@informatik.uni-leipzig.de
Message:

Modification of the extraction process: fallback patterns are now used when conceptlink-based XPaths fail (until now, fallback patterns were never used when there were conceptlink-based XPaths for a facet in a CMDI profile) (#668)

Location:
vlo/trunk/vlo-importer/src
Files:
4 edited

Legend:

Unmodified
Added
Removed
  • vlo/trunk/vlo-importer/src/main/java/eu/clarin/cmdi/vlo/importer/CMDIParserVTDXML.java

    r5979 r5985  
    193193        List<FacetConfiguration> facetList = facetMapping.getFacets();
    194194        for (FacetConfiguration config : facetList) {
     195            boolean matchedPattern = false;
    195196            List<String> patterns = config.getPatterns();
    196197            for (String pattern : patterns) {
    197                 boolean matchedPattern = matchPattern(cmdiData, nav, config, pattern, config.getAllowMultipleValues());
     198                matchedPattern = matchPattern(cmdiData, nav, config, pattern, config.getAllowMultipleValues());
    198199                if (matchedPattern && !config.getAllowMultipleValues()) {
    199200                    break;
     201                }
     202            }
     203           
     204            // using fallback patterns if extraction failed
     205            if (matchedPattern == false) {
     206                for (String pattern : config.getFallbackPatterns()) {
     207                    matchedPattern = matchPattern(cmdiData, nav, config, pattern, config.getAllowMultipleValues());
     208                    if (matchedPattern && !config.getAllowMultipleValues()) {
     209                        break;
     210                    }
    200211                }
    201212            }
  • vlo/trunk/vlo-importer/src/main/java/eu/clarin/cmdi/vlo/importer/FacetConfiguration.java

    r2768 r5985  
    1515    private boolean caseInsensitive= false;
    1616    private List<String> patterns = new ArrayList<String>();
     17    private List<String> fallbackPatterns = new ArrayList<String>();
    1718    private boolean allowMultipleValues = true;
    1819
     
    2829        this.patterns = patterns;
    2930    }
     31   
     32    public void setFallbackPatterns(List<String> fallbackPatterns) {
     33        this.fallbackPatterns = fallbackPatterns;
     34    }
    3035
    3136    public void setPattern(String pattern) {
    3237        this.patterns = Collections.singletonList(pattern);
     38    }
     39   
     40    public void setFallbackPattern(String fallbackPattern) {
     41        this.fallbackPatterns = Collections.singletonList(fallbackPattern);
    3342    }
    3443
     
    3847    public List<String> getPatterns() {
    3948        return patterns;
     49    }
     50   
     51    public List<String> getFallbackPatterns() {
     52        return fallbackPatterns;
    4053    }
    4154
  • vlo/trunk/vlo-importer/src/main/java/eu/clarin/cmdi/vlo/importer/FacetMappingFactory.java

    r5979 r5985  
    148148                }
    149149
    150                 //add hardcoded patterns only when there is no xpath generated from conceptlink
    151                 if (xpaths.isEmpty()) {
    152                     xpaths.addAll(facetConcept.getPatterns());
    153                 }
    154 
    155150                // pattern-based blacklisting: remove all XPath expressions that contain a blacklisted substring;
    156151                // this is basically a hack to enhance the quality of the visualised information in the VLO;
     
    169164                config.setCaseInsensitive(facetConcept.isCaseInsensitive());
    170165                config.setAllowMultipleValues(facetConcept.isAllowMultipleValues());
     166                config.setName(facetConcept.getName());
     167
    171168                config.setPatterns(xpaths);
    172                 config.setName(facetConcept.getName());
    173                 if (!config.getPatterns().isEmpty()) {
     169                config.setFallbackPatterns(facetConcept.getPatterns());
     170
     171                if (!config.getPatterns().isEmpty() || !config.getFallbackPatterns().isEmpty()) {
    174172                    result.addFacet(config);
    175173                }
  • vlo/trunk/vlo-importer/src/test/java/eu/clarin/cmdi/vlo/importer/FacetMappingFactoryTest.java

    r5979 r5985  
    4242       
    4343        assertEquals(FacetConstants.FIELD_SELF_LINK, mapping.getName());
    44         assertEquals(1, mapping.getPatterns().size());
    45         assertEquals("/c:CMD/c:Header/c:MdSelfLink/text()",
    46                 mapping.getPatterns().get(0));
     44        assertEquals(0, mapping.getPatterns().size());
     45        assertEquals(1, mapping.getFallbackPatterns().size());
     46        assertEquals("/c:CMD/c:Header/c:MdSelfLink/text()",
     47                mapping.getFallbackPatterns().get(0));
    4748        mapping = facets.get(index++);
    4849       
    4950        assertEquals(FacetConstants.FIELD_COLLECTION, mapping.getName());
    50         assertEquals(1, mapping.getPatterns().size());
     51        assertEquals(1, mapping.getFallbackPatterns().size());
    5152        mapping = facets.get(index++);
    5253       
     
    144145       
    145146        assertEquals(FacetConstants.FIELD_RESOURCE_CLASS, mapping.getName());
    146         assertEquals(3, mapping.getPatterns().size());
     147        assertEquals(3, mapping.getFallbackPatterns().size());
    147148        mapping = facets.get(index++);
    148149       
     
    152153       
    153154        assertEquals("/c:CMD/c:Header/c:MdCollectionDisplayName/text()",
    154                 mapping.getPatterns().get(0));
    155         assertEquals(1, mapping.getPatterns().size());
    156         mapping = facets.get(index++);
    157        
    158         assertEquals("/c:CMD/c:Header//text()", mapping.getPatterns().get(0));
    159         assertEquals("/c:CMD/c:Components//text()", mapping.getPatterns().get(1));
    160         assertEquals(2, mapping.getPatterns().size());
    161         mapping = facets.get(index++);
    162        
    163         assertEquals("/c:CMD/c:Header/c:MdProfile/text()", mapping.getPatterns().get(0));
    164         assertEquals(1, mapping.getPatterns().size());
     155                mapping.getFallbackPatterns().get(0));
     156        assertEquals(1, mapping.getFallbackPatterns().size());
     157        mapping = facets.get(index++);
     158       
     159        assertEquals("/c:CMD/c:Header//text()", mapping.getFallbackPatterns().get(0));
     160        assertEquals("/c:CMD/c:Components//text()", mapping.getFallbackPatterns().get(1));
     161        assertEquals(2, mapping.getFallbackPatterns().size());
     162        mapping = facets.get(index++);
     163       
     164        assertEquals("/c:CMD/c:Header/c:MdProfile/text()", mapping.getFallbackPatterns().get(0));
     165        assertEquals(1, mapping.getFallbackPatterns().size());
    165166        mapping = facets.get(index++);
    166167
     
    168169        assertEquals(FacetConstants.FIELD_KEYWORDS, mapping.getName());
    169170        assertEquals("/c:CMD/c:Components/c:mods/c:classification/text()",
    170                 mapping.getPatterns().get(0));
    171         assertEquals(3, mapping.getPatterns().size());
     171                mapping.getFallbackPatterns().get(0));
     172        assertEquals(3, mapping.getFallbackPatterns().size());
    172173
    173174        assertEquals("check to see we tested them all", facets.size(), index);
     
    192193       
    193194        assertEquals(FacetConstants.FIELD_SELF_LINK, mapping.getName());
    194         assertEquals(1, mapping.getPatterns().size());
    195         assertEquals("/c:CMD/c:Header/c:MdSelfLink/text()",
    196                 mapping.getPatterns().get(0));
     195        assertEquals(0, mapping.getPatterns().size());
     196        assertEquals(1, mapping.getFallbackPatterns().size());
     197        assertEquals("/c:CMD/c:Header/c:MdSelfLink/text()",
     198                mapping.getFallbackPatterns().get(0));
    197199        mapping = facets.get(index++);
    198200       
    199201        assertEquals(FacetConstants.FIELD_COLLECTION, mapping.getName());
    200         assertEquals(1, mapping.getPatterns().size());
     202        assertEquals(1, mapping.getFallbackPatterns().size());
    201203        assertEquals("/c:CMD/c:Header/c:MdCollectionDisplayName/text()",
    202                 mapping.getPatterns().get(0));
     204                mapping.getFallbackPatterns().get(0));
    203205        mapping = facets.get(index++);
    204206       
    205207        assertEquals(FacetConstants.FIELD_PROJECT_NAME, mapping.getName());
    206         assertEquals(3, mapping.getPatterns().size());
     208        assertEquals(3, mapping.getFallbackPatterns().size());
    207209        assertEquals("/c:CMD/c:Components/c:teiHeader/c:fileDesc/c:publicationStmt/c:publisher/c:orgName/c:orgName[@role=\"project\"]/text()",
    208                 mapping.getPatterns().get(0));
     210                mapping.getFallbackPatterns().get(0));
    209211        mapping = facets.get(index++);
    210212       
     
    222224       
    223225        assertEquals(FacetConstants.FIELD_COUNTRY, mapping.getName());
    224         assertEquals(2, mapping.getPatterns().size());
     226        assertEquals(0, mapping.getPatterns().size());
     227        assertEquals(2, mapping.getFallbackPatterns().size());
    225228        assertEquals("/c:CMD/c:Components/c:OLAC-DcmiTerms/c:spatial[@dcterms-type=\"ISO3166\"]/text()",
    226229                mapping
    227                 .getPatterns().get(0));
     230                .getFallbackPatterns().get(0));
    228231        assertEquals("/c:CMD/c:Components/c:OLAC-DcmiTerms/c:coverage[@dcterms-type=\"ISO3166\"]/text()",
    229232                mapping
    230                 .getPatterns().get(1));
     233                .getFallbackPatterns().get(1));
    231234        mapping = facets.get(index++);
    232235       
     
    235238        assertEquals("/c:CMD/c:Components/c:OLAC-DcmiTerms/c:language/text()",
    236239                mapping.getPatterns().get(0));
     240        assertEquals("/c:CMD/c:Components//c:OLAC-DcmiTerms/c:language/@olac-language", mapping.getFallbackPatterns().get(0));
    237241        mapping = facets.get(index++);
    238242       
     
    250254
    251255        assertEquals(FacetConstants.FIELD_GENRE, mapping.getName());
    252         assertEquals(4, mapping.getPatterns().size());
     256        assertEquals(4, mapping.getFallbackPatterns().size());
    253257        assertEquals("/c:CMD/c:Components/c:OLAC-DcmiTerms/c:type/@olac-linguistic-type",
    254                 mapping.getPatterns().get(0));
     258                mapping.getFallbackPatterns().get(0));
    255259        assertEquals("/c:CMD/c:Components/c:mods/c:genre/text()",
    256                 mapping.getPatterns().get(1));
     260                mapping.getFallbackPatterns().get(1));
    257261        mapping = facets.get(index++);
    258262
     
    281285       
    282286        assertEquals(FacetConstants.FIELD_NATIONAL_PROJECT, mapping.getName());
    283         assertEquals(1, mapping.getPatterns().size());
    284         mapping = facets.get(index++);
    285        
    286 
    287         assertEquals("/c:CMD/c:Header//text()", mapping.getPatterns().get(0));
    288         assertEquals("/c:CMD/c:Components//text()", mapping.getPatterns().get(1));
    289         assertEquals(2, mapping.getPatterns().size());
    290         mapping = facets.get(index++);
    291        
    292        
    293         assertEquals("/c:CMD/c:Header/c:MdProfile/text()", mapping.getPatterns().get(0));
    294         assertEquals(1, mapping.getPatterns().size());
     287        assertEquals(1, mapping.getFallbackPatterns().size());
     288        mapping = facets.get(index++);
     289       
     290
     291        assertEquals("/c:CMD/c:Header//text()", mapping.getFallbackPatterns().get(0));
     292        assertEquals("/c:CMD/c:Components//text()", mapping.getFallbackPatterns().get(1));
     293        assertEquals(2, mapping.getFallbackPatterns().size());
     294        mapping = facets.get(index++);
     295       
     296       
     297        assertEquals("/c:CMD/c:Header/c:MdProfile/text()", mapping.getFallbackPatterns().get(0));
     298        assertEquals(1, mapping.getFallbackPatterns().size());
    295299        mapping = facets.get(index++);
    296300
    297301        assertEquals(FacetConstants.FIELD_KEYWORDS, mapping.getName());
    298302        assertEquals("/c:CMD/c:Components/c:mods/c:classification/text()",
    299                 mapping.getPatterns().get(0));
    300         assertEquals(3, mapping.getPatterns().size());
     303                mapping.getFallbackPatterns().get(0));
     304        assertEquals(3, mapping.getFallbackPatterns().size());
    301305
    302306        assertEquals("check to see we tested them all", facets.size(), index);
     
    322326       
    323327        assertEquals(FacetConstants.FIELD_SELF_LINK, mapping.getName());
    324         assertEquals(1, mapping.getPatterns().size());
    325         assertEquals("/c:CMD/c:Header/c:MdSelfLink/text()",
    326                 mapping.getPatterns().get(0));
     328        assertEquals(0, mapping.getPatterns().size());
     329        assertEquals(1, mapping.getFallbackPatterns().size());
     330        assertEquals("/c:CMD/c:Header/c:MdSelfLink/text()",
     331                mapping.getFallbackPatterns().get(0));
    327332        mapping = facets.get(index++);
    328333       
    329334        assertEquals(FacetConstants.FIELD_COLLECTION, mapping.getName());
    330         assertEquals(1, mapping.getPatterns().size());
     335        assertEquals(1, mapping.getFallbackPatterns().size());
    331336        mapping = facets.get(index++);
    332337       
    333338        assertEquals(FacetConstants.FIELD_PROJECT_NAME, mapping.getName());
    334         assertEquals(3, mapping.getPatterns().size());
    335         assertEquals("/c:CMD/c:Components/c:media-session-profile/c:media-session/c:Corpus", mapping.getPatterns().get(1));
     339        assertEquals(3, mapping.getFallbackPatterns().size());
     340        assertEquals("/c:CMD/c:Components/c:media-session-profile/c:media-session/c:Corpus", mapping.getFallbackPatterns().get(1));
    336341        mapping = facets.get(index++);
    337342       
    338343        assertEquals(FacetConstants.FIELD_NAME, mapping.getName());
    339         assertEquals(6, mapping.getPatterns().size());
     344        assertEquals(6, mapping.getFallbackPatterns().size());
    340345        assertEquals("/c:CMD/c:Components/c:LrtInventoryResource/c:LrtCommon/c:ResourceName/text()",
    341                 mapping.getPatterns().get(0));
     346                mapping.getFallbackPatterns().get(0));
    342347        assertEquals("/c:CMD/c:Components/c:mods/c:titleInfo/title/text()",
    343                 mapping.getPatterns().get(1));
     348                mapping.getFallbackPatterns().get(1));
    344349        mapping = facets.get(index++);
    345350       
     
    388393       
    389394        assertEquals(FacetConstants.FIELD_GENRE, mapping.getName());
    390         assertEquals(4, mapping.getPatterns().size());
     395        assertEquals(4, mapping.getFallbackPatterns().size());
    391396        mapping = facets.get(index++);
    392397       
    393398        assertEquals(FacetConstants.FIELD_SUBJECT, mapping.getName());
    394         assertEquals(8, mapping.getPatterns().size());
     399        assertEquals(8, mapping.getFallbackPatterns().size());
    395400        mapping = facets.get(index++);
    396401       
    397402        assertEquals(FacetConstants.FIELD_DESCRIPTION, mapping.getName());
    398403        assertEquals(2, mapping.getPatterns().size());
     404        assertEquals(5, mapping.getFallbackPatterns().size());
    399405        assertEquals("/c:CMD/c:Components/c:LrtInventoryResource/c:LrtCommon/c:Description/text()",
    400406                mapping.getPatterns().get(0));
    401         assertEquals("/c:CMD/c:Components/c:LrtInventoryResource/c:LrtIPR/c:Description/text()",
    402                 mapping.getPatterns().get(1));
     407        assertEquals("/c:CMD/c:Components/c:mods/c:abstract/text()", mapping.getFallbackPatterns().get(0));
    403408        mapping = facets.get(index++);
    404409       
    405410        assertEquals(FacetConstants.FIELD_RESOURCE_CLASS, mapping.getName());
    406411        assertEquals(1, mapping.getPatterns().size());
     412        assertEquals(3, mapping.getFallbackPatterns().size());
    407413        assertEquals("/c:CMD/c:Components/c:LrtInventoryResource/c:LrtCommon/c:ResourceType/text()",
    408                 mapping.getPatterns().get(0));
     414                mapping.getFallbackPatterns().get(0));
    409415        mapping = facets.get(index++);
    410416       
    411417        assertEquals("/c:CMD/c:Header/c:MdCollectionDisplayName/text()",
    412                 mapping.getPatterns().get(0));
    413         assertEquals(1, mapping.getPatterns().size());
    414         mapping = facets.get(index++);
    415        
    416         assertEquals("/c:CMD/c:Header//text()", mapping.getPatterns().get(0));
    417         assertEquals("/c:CMD/c:Components//text()", mapping.getPatterns().get(1));
    418         assertEquals(2, mapping.getPatterns().size());
    419         mapping = facets.get(index++);
    420        
    421         assertEquals("/c:CMD/c:Header/c:MdProfile/text()", mapping.getPatterns().get(0));
    422         assertEquals(1, mapping.getPatterns().size());
     418                mapping.getFallbackPatterns().get(0));
     419        assertEquals(1, mapping.getFallbackPatterns().size());
     420        mapping = facets.get(index++);
     421       
     422        assertEquals("/c:CMD/c:Header//text()", mapping.getFallbackPatterns().get(0));
     423        assertEquals("/c:CMD/c:Components//text()", mapping.getFallbackPatterns().get(1));
     424        assertEquals(2, mapping.getFallbackPatterns().size());
     425        mapping = facets.get(index++);
     426       
     427        assertEquals("/c:CMD/c:Header/c:MdProfile/text()", mapping.getFallbackPatterns().get(0));
     428        assertEquals(1, mapping.getFallbackPatterns().size());
    423429        mapping = facets.get(index++);
    424430       
    425431        assertEquals(FacetConstants.FIELD_KEYWORDS, mapping.getName());
    426432        assertEquals(1, mapping.getPatterns().size());
     433        assertEquals(3, mapping.getFallbackPatterns().size());
    427434        assertEquals("/c:CMD/c:Components/c:LrtInventoryResource/c:tags/c:tag/text()",
    428435                mapping.getPatterns().get(0));
     436        assertEquals("/c:CMD/c:Components/c:mods/c:classification/text()",
     437                mapping.getFallbackPatterns().get(0));
    429438        assertEquals("check to see we tested them all", facets.size(), index);
    430439    }
Note: See TracChangeset for help on using the changeset viewer.