Changeset 5979


Ignore:
Timestamp:
02/16/15 13:42:07 (9 years ago)
Author:
teckart@informatik.uni-leipzig.de
Message:

Added support for using local XML schema files instead of the component registry (#522). Also added a stricter check when extracting the profile ID from a CMDI instance file.

Location:
vlo/trunk/vlo-importer/src
Files:
7 edited

Legend:

Unmodified
Added
Removed
  • vlo/trunk/vlo-importer/src/main/java/eu/clarin/cmdi/vlo/importer/CMDIParserVTDXML.java

    r5849 r5979  
    1515import java.util.List;
    1616import java.util.Map;
     17import java.util.regex.Matcher;
     18import java.util.regex.Pattern;
    1719import org.apache.commons.io.IOUtils;
    1820import org.slf4j.Logger;
     
    2123public class CMDIParserVTDXML implements CMDIDataProcessor {
    2224    private final Map<String, PostProcessor> postProcessors;
     25    private final Boolean useLocalXSDCache;
     26    private static final Pattern PROFILE_ID_PATTERN = Pattern.compile(".*(clarin.eu:cr1:p_[0-9]+).*");
    2327    private final static Logger LOG = LoggerFactory.getLogger(CMDIParserVTDXML.class);
    2428   
    2529    private static final String DEFAULT_LANGUAGE = "und";
    2630
    27     public CMDIParserVTDXML(Map<String, PostProcessor> postProcessors) {
     31    public CMDIParserVTDXML(Map<String, PostProcessor> postProcessors, Boolean useLocalXSDCache) {
    2832        this.postProcessors = postProcessors;
     33        this.useLocalXSDCache = useLocalXSDCache;
    2934    }
    3035
     
    3944       
    4045        VTDNav nav = vg.getNav();
    41         FacetMapping facetMapping = getFacetMapping(nav.cloneNav(), file.getAbsolutePath());
     46        FacetMapping facetMapping = getFacetMapping(nav.cloneNav());
    4247
    4348        if(facetMapping.getFacets().isEmpty()){
     
    6267     * Extracts valid XML patterns for all facet definitions
    6368     * @param nav VTD Navigator
    64      * @param cmdiFilePath Absolute path of the XML file for which nav was created
    6569     * @return the facet mapping used to map meta data to facets
    6670     * @throws VTDException
    6771     */
    68     private FacetMapping getFacetMapping(VTDNav nav, String cmdiFilePath) throws VTDException {
    69         String xsd = extractXsd(nav);
    70         if (xsd == null) {
     72    private FacetMapping getFacetMapping(VTDNav nav) throws VTDException {
     73        String profileId = extractXsd(nav);
     74        if (profileId == null) {
    7175            throw new RuntimeException("Cannot get xsd schema so cannot get a proper mapping. Parse failed!");
    72         }
    73         if (xsd.indexOf("http") != xsd.lastIndexOf("http")){
    74             LOG.info("No valid CMDI schema URL was extracted. This is an indication of a broken CMDI file (like false content in //MdProfile element). {}", cmdiFilePath);
    7576        }
    7677        String facetConceptsFile = MetadataImporter.config.getFacetConceptsFile();
     
    7980            facetConceptsFile = "/facetConcepts.xml";
    8081        }
    81         return FacetMappingFactory.getFacetMapping(facetConceptsFile, xsd);
     82        return FacetMappingFactory.getFacetMapping(facetConceptsFile, profileId, useLocalXSDCache);
    8283    }
    8384
     
    8586     * Try two approaches to extract the XSD schema information from the CMDI file
    8687     * @param nav VTD Navigator
    87      * @return URL of CMDI schema, or null if neither the CMDI header nor the XMLSchema-instance's attributes contained the information
     88     * @return ID of CMDI schema, or null if neither the CMDI header nor the XMLSchema-instance's attributes contained the information
    8889     * @throws VTDException
    8990     */
    9091    String extractXsd(VTDNav nav) throws VTDException {
    91         String xsd = getXsdFromHeader(nav);
    92         if (xsd == null) {
    93             xsd = getXsdFromSchemaLocation(nav);
    94         }
    95         return xsd;
     92        String profileID = getProfileIdFromHeader(nav);
     93        if (profileID == null) {
     94            profileID = getProfileIdFromSchemaLocation(nav);
     95        }
     96        return profileID;
    9697    }
    9798
     
    99100     * Extract XSD schema information from CMDI header (using element //Header/MdProfile)
    100101     * @param nav VTD Navigator
    101      * @return URL to CMDI schema, or null if content of //Header/MdProfile element could not be read
     102     * @return ID of CMDI schema, or null if content of //Header/MdProfile element could not be read
    102103     * @throws XPathParseException
    103104     * @throws XPathEvalException
    104105     * @throws NavException
    105106     */
    106     private String getXsdFromHeader(VTDNav nav) throws XPathParseException, XPathEvalException, NavException {
    107         String result = null;
     107    private String getProfileIdFromHeader(VTDNav nav) throws XPathParseException, XPathEvalException, NavException {
    108108        nav.toElement(VTDNav.ROOT);
    109109        AutoPilot ap = new AutoPilot(nav);
     
    111111        ap.selectXPath("/c:CMD/c:Header/c:MdProfile/text()");
    112112        int index = ap.evalXPath();
     113        String profileId = null;
    113114        if (index != -1) {
    114             String profileId = nav.toString(index).trim();
    115             result = MetadataImporter.config.getComponentRegistryProfileSchema(profileId);
    116         }
    117         return result;
     115            profileId = nav.toString(index).trim();
     116        }
     117        return profileId;
    118118    }
    119119
     
    121121     * Extract XSD schema information from schemaLocation or noNamespaceSchemaLocation attributes
    122122     * @param nav VTD Navigator
    123      * @return URL to CMDI schema, or null if attributes don't exist
     123     * @return ID of CMDI schema, or null if attributes don't exist
    124124     * @throws NavException
    125125     */
    126     private String getXsdFromSchemaLocation(VTDNav nav) throws NavException {
     126    private String getProfileIdFromSchemaLocation(VTDNav nav) throws NavException {
    127127        String result = null;
    128128        nav.toElement(VTDNav.ROOT);
     
    137137            }
    138138        }
    139         return result;
     139       
     140        // extract profile ID
     141        if(result != null) {
     142        Matcher m = PROFILE_ID_PATTERN.matcher(result);
     143        if(m.find())
     144            return m.group(1);
     145        }
     146        return null;
    140147    }
    141148   
  • vlo/trunk/vlo-importer/src/main/java/eu/clarin/cmdi/vlo/importer/FacetMappingFactory.java

    r5197 r5979  
    3535    private FacetMappingFactory() {
    3636    }
    37 
    38     public static FacetMapping getFacetMapping(String facetConceptsFile, String xsd) {
    39         return INSTANCE.getOrCreateMapping(facetConceptsFile, xsd);
     37   
     38    public static FacetMapping getFacetMapping(String facetConceptsFile, String xsd, Boolean useLocalXSDCache) {
     39        return INSTANCE.getOrCreateMapping(facetConceptsFile, xsd, useLocalXSDCache);
    4040    }
    4141
     
    4848     * @param facetConcepts name of the facet concepts file
    4949     * @param xsd url of xml schema of cmdi profile
     50     * @param useLocalXSDCache use local XML schema files instead of accessing the component registry
    5051     *
    5152     * @return facet concept mapping
    5253     */
    53     private FacetMapping getOrCreateMapping(String facetConcepts, String xsd) {
     54    private FacetMapping getOrCreateMapping(String facetConcepts, String xsd, Boolean useLocalXSDCache) {
    5455        // check if concept mapping has already been created
    5556        FacetMapping result = mapping.get(xsd);
    5657        if (result == null) {
    57             result = createMapping(facetConcepts, xsd);
     58            result = createMapping(facetConcepts, xsd, useLocalXSDCache);
    5859            mapping.put(xsd, result);
    5960        }
     
    6970     * @param facetConcepts name of the facet concepts file
    7071     * @param xsd url of xml schema of cmdi profile
     72     * @param useLocalXSDCache use local XML schema files instead of accessing the component registry
    7173     *
    7274     * @return the facet mapping used to map meta data to facets
    7375     */
    74     private FacetMapping createMapping(String facetConcepts, String xsd) {
     76    private FacetMapping createMapping(String facetConcepts, String xsd, Boolean useLocalXSDCache) {
    7577
    7678        FacetMapping result = new FacetMapping();
     
    7981        try {
    8082            //The magic
    81             Map<String, List<String>> conceptLinkPathMapping = createConceptLinkPathMapping(xsd);
     83            Map<String, List<String>> conceptLinkPathMapping = createConceptLinkPathMapping(xsd, useLocalXSDCache);
    8284            Map<String, String> pathConceptLinkMapping = null;
    8385            // Below we put the stuff we found into the configuration class.
     
    211213     *
    212214     * @param xsd URL of XML Schema of some CMDI profile
     215     * @param useLocalXSDCache use local XML schema files instead of accessing the component registry
    213216     * @return Map (Data Category -> List of XPath expressions linked to the key
    214217     * data category which can be found in CMDI files with this schema)
    215218     * @throws NavException
    216219     */
    217     private Map<String, List<String>> createConceptLinkPathMapping(String xsd) throws NavException {
     220    private Map<String, List<String>> createConceptLinkPathMapping(String xsd, Boolean useLocalXSDCache) throws NavException {
    218221        Map<String, List<String>> result = new HashMap<String, List<String>>();
    219222        VTDGen vg = new VTDGen();
    220         boolean parseSuccess = vg.parseHttpUrl(xsd, true);
     223        boolean parseSuccess;
     224        if(useLocalXSDCache) {
     225            parseSuccess = vg.parseFile(Thread.currentThread().getContextClassLoader().getResource("testProfiles/"+xsd+".xsd").getPath(), true);
     226        } else {
     227            parseSuccess = vg.parseHttpUrl(MetadataImporter.config.getComponentRegistryProfileSchema(xsd), true);
     228        }
     229           
    221230        if (!parseSuccess) {
    222231            LOG.error("Cannot create ConceptLink Map from xsd (xsd is probably not reachable): " + xsd + ". All metadata instances that use this xsd will not be imported correctly.");
  • vlo/trunk/vlo-importer/src/main/java/eu/clarin/cmdi/vlo/importer/MetadataImporter.java

    r5896 r5979  
    137137                    LOG.info("Deleting data of provider done.");
    138138                }
    139                 CMDIDataProcessor processor = new CMDIParserVTDXML(POST_PROCESSORS);
     139                CMDIDataProcessor processor = new CMDIParserVTDXML(POST_PROCESSORS, false);
    140140                List<File> files = getFilesFromDataRoot(dataRoot.getRootFile());
    141141                for (File file : files) {
  • vlo/trunk/vlo-importer/src/test/java/eu/clarin/cmdi/vlo/importer/CMDIDataProcessorTest.java

    r5887 r5979  
    1717
    1818    private CMDIDataProcessor getDataParser() {
    19         return new CMDIParserVTDXML(MetadataImporter.POST_PROCESSORS);
     19        return new CMDIParserVTDXML(MetadataImporter.POST_PROCESSORS, true);
    2020    }
    2121   
  • vlo/trunk/vlo-importer/src/test/java/eu/clarin/cmdi/vlo/importer/CMDIParserVTDXMLTest.java

    r5347 r5979  
    1919        content += "</CMD>\n";
    2020        String xsd = getXsd(content);
    21         assertEquals("http://catalog.clarin.eu/ds/ComponentRegistry/rest/registry/profiles/clarin.eu:cr1:p_1288172614026/xsd", xsd);
     21        assertEquals("clarin.eu:cr1:p_1288172614026", xsd);
    2222    }
    2323
     
    3030        content += "</CMD>\n";
    3131        String xsd = getXsd(content);
    32         assertEquals("http://catalog.clarin.eu/ds/ComponentRegistry/rest/registry/profiles/clarin.eu:cr1:p_1288172614026/xsd", xsd);
     32        assertEquals("clarin.eu:cr1:p_1288172614026", xsd);
    3333    }
    3434
     
    4141        content += "</CMD>\n";
    4242        String xsd = getXsd(content);
    43         assertEquals("http://www.meertens.knaw.nl/oai/cmdi/diddd_sub_location_profile.xsd", xsd);
     43        assertEquals(null, xsd);
    4444    }
    4545
     
    5959        vg.parse(true);
    6060        VTDNav nav = vg.getNav();
    61         CMDIParserVTDXML parser = new CMDIParserVTDXML(null);
     61        CMDIParserVTDXML parser = new CMDIParserVTDXML(null, true);
    6262        String xsd = parser.extractXsd(nav);
    6363        return xsd;
  • vlo/trunk/vlo-importer/src/test/java/eu/clarin/cmdi/vlo/importer/FacetMappingFactoryTest.java

    r5143 r5979  
    1616    private final static String FACETCONCEPTS_FILENAME = "/facetConceptsTest.xml";
    1717   
    18     private final static String IMDI_PROFILE_URL =
    19             "http://catalog.clarin.eu/ds/ComponentRegistry/rest/registry/profiles/clarin.eu:cr1:p_1271859438204/xsd";
    20     private final static String OLAC_PROFILE_URL =
    21             "http://catalog.clarin.eu/ds/ComponentRegistry/rest/registry/profiles/clarin.eu:cr1:p_1288172614026/xsd";
    22     private final static String LRT_PROFILE_URL =
    23             "http://catalog.clarin.eu/ds/ComponentRegistry/rest/registry/profiles/clarin.eu:cr1:p_1289827960126/xsd";
    24     private final static String ID_PROFILE_URL =
    25             "http://catalog.clarin.eu/ds/ComponentRegistry/rest/registry/profiles/clarin.eu:cr1:p_1290431694629/xsd";
    26     private final static String TEXTCORPUSPROFILE_PROFILE_URL =
    27             "http://catalog.clarin.eu/ds/ComponentRegistry/rest/registry/profiles/clarin.eu:cr1:p_1290431694580/xsd";
     18    private final static String IMDI_PROFILE_ID = "clarin.eu:cr1:p_1271859438204";
     19    private final static String OLAC_PROFILE_ID = "clarin.eu:cr1:p_1288172614026";
     20    private final static String LRT_PROFILE_ID = "clarin.eu:cr1:p_1289827960126";
     21    private final static String ID_PROFILE_ID = "clarin.eu:cr1:p_1290431694629";
     22    private final static String TEXTCORPUSPROFILE_PROFILE_ID = "clarin.eu:cr1:p_1290431694580";
    2823
    2924    private final static Logger LOG = LoggerFactory.getLogger(FacetMappingFactoryTest.class);
     
    3227    public void testGetImdiMapping() {
    3328        FacetMapping facetMapping = FacetMappingFactory
    34                 .getFacetMapping(FACETCONCEPTS_FILENAME, IMDI_PROFILE_URL);
     29                .getFacetMapping(FACETCONCEPTS_FILENAME, IMDI_PROFILE_ID, true);
    3530       
    3631        List<FacetConfiguration> facets = facetMapping.getFacets();
     
    182177    public void testGetOlacMapping() {
    183178        FacetMapping facetMapping = FacetMappingFactory
    184                 .getFacetMapping(FACETCONCEPTS_FILENAME, OLAC_PROFILE_URL);
     179                .getFacetMapping(FACETCONCEPTS_FILENAME, OLAC_PROFILE_ID, true);
    185180       
    186181        List<FacetConfiguration> facets = facetMapping.getFacets();
     
    311306    public void testGetLrtMapping() {
    312307        FacetMapping facetMapping = FacetMappingFactory
    313                 .getFacetMapping(FACETCONCEPTS_FILENAME, LRT_PROFILE_URL);
     308                .getFacetMapping(FACETCONCEPTS_FILENAME, LRT_PROFILE_ID, true);
    314309
    315310        List<FacetConfiguration> facets = facetMapping.getFacets();
     
    439434
    440435        FacetMapping facetMapping = FacetMappingFactory
    441                 .getFacetMapping(FACETCONCEPTS_FILENAME, ID_PROFILE_URL);
     436                .getFacetMapping(FACETCONCEPTS_FILENAME, ID_PROFILE_ID, true);
    442437
    443438        List<FacetConfiguration> facets = facetMapping.getFacets();
     
    468463    public void testStringBasedBlacklisting() {
    469464        FacetMapping facetMapping = FacetMappingFactory
    470                 .getFacetMapping(FACETCONCEPTS_FILENAME, TEXTCORPUSPROFILE_PROFILE_URL);
     465                .getFacetMapping(FACETCONCEPTS_FILENAME, TEXTCORPUSPROFILE_PROFILE_ID, true);
    471466        List<FacetConfiguration> facets = facetMapping.getFacets();
    472467       
  • vlo/trunk/vlo-importer/src/test/java/eu/clarin/cmdi/vlo/importer/MetadataImporterTest.java

    r5197 r5979  
    271271                                dataRoot.getOriginName());
    272272                        CMDIDataProcessor processor = new
    273                                 CMDIParserVTDXML(POST_PROCESSORS);
     273                                CMDIParserVTDXML(POST_PROCESSORS, true);
    274274                        List<File> files =
    275275                                getFilesFromDataRoot(dataRoot.getRootFile());
Note: See TracChangeset for help on using the changeset viewer.