Changeset 5197


Ignore:
Timestamp:
05/14/14 14:04:26 (10 years ago)
Author:
Twan Goosen
Message:

logging improvements

Location:
vlo/trunk/vlo-importer/src
Files:
3 edited

Legend:

Unmodified
Added
Removed
  • vlo/trunk/vlo-importer/src/main/java/eu/clarin/cmdi/vlo/importer/FacetMappingFactory.java

    r4029 r5197  
    1919
    2020/**
    21  * Creates facet-mappings (xpaths) from a configuration.
    22  * As they say "this is where the magic happens".
    23  * Also does some caching.
     21 * Creates facet-mappings (xpaths) from a configuration. As they say "this is
     22 * where the magic happens". Also does some caching.
    2423 */
    2524public class FacetMappingFactory {
     
    4241
    4342    /**
    44      * Get facet concept mapping. 
    45      * 
    46      * Get facet mapping used to map meta data based on a facet concepts
    47      * file and url to cmdi meta data profile.
    48 
     43     * Get facet concept mapping.
     44     *
     45     * Get facet mapping used to map meta data based on a facet concepts file
     46     * and url to cmdi meta data profile.
     47     *
    4948     * @param facetConcepts name of the facet concepts file
    5049     * @param xsd url of xml schema of cmdi profile
    51      * 
     50     *
    5251     * @return facet concept mapping
    5352     */
     
    6463    /**
    6564     * Create facet concept mapping.
    66      * 
     65     *
    6766     * Create facet mapping used to map meta data based on a facet concept
    6867     * mapping file and url to cmdi meta data profile.
    69      * 
     68     *
    7069     * @param facetConcepts name of the facet concepts file
    7170     * @param xsd url of xml schema of cmdi profile
    72      * 
     71     *
    7372     * @return the facet mapping used to map meta data to facets
    7473     */
     
    9695                                    pathConceptLinkMapping = new HashMap<String, String>();
    9796                                    for (String c : conceptLinkPathMapping.keySet()) {
    98                                         for (String p : conceptLinkPathMapping.get(c))
    99                                             pathConceptLinkMapping.put(p,c);
     97                                        for (String p : conceptLinkPathMapping.get(c)) {
     98                                            pathConceptLinkMapping.put(p, c);
     99                                        }
    100100                                    }
    101101                                }
     
    107107                                    if (context == null && acceptableContext.includeEmpty()) {
    108108                                        // no context is accepted
    109                                         LOG.debug("facet["+facetConcept.getName()+"] path["+path+"] context["+context+"](empty) is accepted");
     109                                        LOG.debug("facet[{}] path[{}] context[{}](empty) is accepted", facetConcept.getName(), path, context);
    110110                                        xpaths.add(path);
    111111                                        handled = true;
    112112                                    } else if (acceptableContext.getConcepts().contains(context)) {
    113113                                        // a specific context is accepted
    114                                         LOG.debug("facet["+facetConcept.getName()+"] path["+path+"] context["+context+"] is accepted");
     114                                        LOG.debug("facet[{}] path[{}] context[{}] is accepted", facetConcept.getName(), path, context);
    115115                                        xpaths.add(path);
    116116                                        handled = true;
     
    122122                                    if (context == null && rejectableContext.includeEmpty()) {
    123123                                        // no context is rejected
    124                                         LOG.debug("facet["+facetConcept.getName()+"] path["+path+"] context["+context+"](empty) is rejected");
     124                                        LOG.debug("facet[{}] path[{}] context[{}](empty) is rejected", facetConcept.getName(), path, context);
    125125                                        handled = true;
    126126                                    } else if (rejectableContext.getConcepts().contains(context)) {
    127127                                        // a specific context is rejected
    128                                         LOG.debug("facet["+facetConcept.getName()+"] path["+path+"] context["+context+"] is rejected");
     128                                        LOG.debug("facet[{}] path[{}] context[{}] is rejected", facetConcept.getName(), path, context);
    129129                                        handled = true;
    130130                                    } else if (rejectableContext.includeAny()) {
    131131                                        // any context is rejected
    132                                         LOG.debug("facet["+facetConcept.getName()+"] path["+path+"] context["+context+"](any) is rejected");
     132                                        LOG.debug("facet[{}] path[{}] context[{}](any) is rejected", facetConcept.getName(), path, context);
    133133                                        handled = true;
    134134                                    }
    135135                                }
    136                                 if (!handled && context!=null && facetConcept.hasAcceptableContext() && facetConcept.getAcceptableContext().includeAny()) {
     136                                if (!handled && context != null && facetConcept.hasAcceptableContext() && facetConcept.getAcceptableContext().includeAny()) {
    137137                                    // any, not rejected context, is accepted
    138                                     LOG.debug("facet["+facetConcept.getName()+"] path["+path+"] context["+context+"](any) is accepted");
     138                                    LOG.debug("facet[{}] path[{}] context[{}](any) is accepted", facetConcept.getName(), path, context);
    139139                                    xpaths.add(path);
    140140                                }
    141141                            }
    142                         } else
     142                        } else {
    143143                            xpaths.addAll(paths);
     144                        }
    144145                    }
    145146                }
    146                
     147
    147148                //add hardcoded patterns only when there is no xpath generated from conceptlink
    148149                if (xpaths.isEmpty()) {
    149150                    xpaths.addAll(facetConcept.getPatterns());
    150151                }
    151                
     152
    152153                // pattern-based blacklisting: remove all XPath expressions that contain a blacklisted substring;
    153154                // this is basically a hack to enhance the quality of the visualised information in the VLO;
    154155                // should be replaced by a more intelligent approach in the future
    155                 for(String blacklistPattern : facetConcept.getBlacklistPatterns()) {
    156                         Iterator<String> xpathIterator = xpaths.iterator();
    157                         while(xpathIterator.hasNext()) {
    158                                 String xpath = xpathIterator.next();
    159                                 if(xpath.contains(blacklistPattern)) {
    160                                         LOG.debug("Rejecting "+xpath+" because of blacklisted substring "+blacklistPattern);
    161                                         xpathIterator.remove();
    162                                 }
    163                         }
    164                 }               
    165                
     156                for (String blacklistPattern : facetConcept.getBlacklistPatterns()) {
     157                    Iterator<String> xpathIterator = xpaths.iterator();
     158                    while (xpathIterator.hasNext()) {
     159                        String xpath = xpathIterator.next();
     160                        if (xpath.contains(blacklistPattern)) {
     161                            LOG.debug("Rejecting {} because of blacklisted substring {}", xpath, blacklistPattern);
     162                            xpathIterator.remove();
     163                        }
     164                    }
     165                }
     166
    166167                config.setCaseInsensitive(facetConcept.isCaseInsensitive());
    167168                config.setAllowMultipleValues(facetConcept.isAllowMultipleValues());
     
    173174            }
    174175        } catch (NavException e) {
    175             LOG.error("Error creating facetMapping from xsd: " + xsd + " ", e);
     176            LOG.error("Error creating facetMapping from xsd: {}", xsd, e);
    176177        }
    177178        return result;
    178179    }
    179    
    180     /**
    181      * Look if there is a contextual (container) data category associated with an ancestor by walking back.
    182      */
    183     private String getContext(String path, Map<String,String> pathConceptLinkMapping) {
     180
     181    /**
     182     * Look if there is a contextual (container) data category associated with
     183     * an ancestor by walking back.
     184     */
     185    private String getContext(String path, Map<String, String> pathConceptLinkMapping) {
    184186        String context = null;
    185187        String cpath = path;
    186         while(context==null && !cpath.equals("/text()")) {
    187             cpath = cpath.replaceAll("/[^/]*/text\\(\\)","/text()");
     188        while (context == null && !cpath.equals("/text()")) {
     189            cpath = cpath.replaceAll("/[^/]*/text\\(\\)", "/text()");
    188190            context = pathConceptLinkMapping.get(cpath);
    189191        }
     
    192194
    193195    /**
    194      * The id facet is a special case and patterns must be added first.
    195      * The standard pattern to get the id out of the header is the most reliable and it should fall back on concept matching if nothing matches.
    196      * (Note this is the exact opposite of other facets where the concept match is probably better than the 'hardcoded' pattern).
     196     * The id facet is a special case and patterns must be added first. The
     197     * standard pattern to get the id out of the header is the most reliable and
     198     * it should fall back on concept matching if nothing matches. (Note this is
     199     * the exact opposite of other facets where the concept match is probably
     200     * better than the 'hardcoded' pattern).
    197201     */
    198202    private void handleId(List<String> xpaths, FacetConcept facetConcept) {
     
    203207
    204208    /**
    205      * "this is where the magic happens".
    206      * Finds paths in the xsd to all concepts (isocat data categories).
     209     * "this is where the magic happens". Finds paths in the xsd to all concepts
     210     * (isocat data categories).
     211     *
    207212     * @param xsd URL of XML Schema of some CMDI profile
    208      * @return Map (Data Category -> List of XPath expressions linked to the key data category which can be found in CMDI files with this schema)
     213     * @return Map (Data Category -> List of XPath expressions linked to the key
     214     * data category which can be found in CMDI files with this schema)
    209215     * @throws NavException
    210216     */
     
    214220        boolean parseSuccess = vg.parseHttpUrl(xsd, true);
    215221        if (!parseSuccess) {
    216             LOG.error("Cannot create ConceptLink Map from xsd (xsd is probably not reachable): "+xsd+". All metadata instances that use this xsd will not be imported correctly.");
     222            LOG.error("Cannot create ConceptLink Map from xsd (xsd is probably not reachable): " + xsd + ". All metadata instances that use this xsd will not be imported correctly.");
    217223            return result; //return empty map, so the incorrect xsd is not tried for all metadata instances that specify it.
    218224        }
     
    243249
    244250    /**
    245      * Goal is to get the "datcat" attribute. Tries a number of different flavors that were found in the xsd's.
     251     * Goal is to get the "datcat" attribute. Tries a number of different flavors
     252     * that were found in the xsd's.
     253     *
    246254     * @return -1 if index is not found.
    247255     */
     
    260268    /**
    261269     * Given an xml-token path thingy create an xpath.
     270     *
    262271     * @param elementPath
    263272     * @return
     
    273282    /**
    274283     * does some updating after a step. To keep the path proper and path-y.
     284     *
    275285     * @param vn
    276286     * @param elementPath
     
    292302
    293303    class Token {
     304
    294305        final String name;
    295306        final int depth;
  • vlo/trunk/vlo-importer/src/main/java/eu/clarin/cmdi/vlo/importer/MetadataImporter.java

    r5143 r5197  
    7171     */
    7272    final static Map<String, PostProcessor> POST_PROCESSORS = new HashMap<String, PostProcessor>();
    73 
     73   
    7474    static {
    7575        POST_PROCESSORS.put(FacetConstants.FIELD_ID, new IdPostProcessor());
     
    116116     */
    117117    void startImport() throws MalformedURLException {
    118 
     118       
    119119        initSolrServer();
    120120        List<DataRoot> dataRoots = checkDataRoots();
     
    144144                        LOG.info("Skipping " + file.getAbsolutePath() + " because it is too large.");
    145145                    } else {
    146                         LOG.debug("PROCESSING FILE: " + file.getAbsolutePath());
     146                        LOG.debug("PROCESSING FILE: {}", file.getAbsolutePath());
    147147                        processCmdi(file, dataRoot, processor);
    148148                    }
     
    273273            }
    274274        } catch (Exception e) {
    275             LOG.error("error in file: " + file + " Exception", e);
     275            LOG.error("error in file: {}", file, e);
    276276            nrOfFilesWithError++;
    277277        }
    278         if (cmdiData != null && processedIds.add(cmdiData.getId())) {
    279             SolrInputDocument solrDocument = cmdiData.getSolrDocument();
    280             if (solrDocument != null) {
    281                 if (!cmdiData.getDataResources().isEmpty() || !cmdiData.getLandingPageResources().isEmpty()
    282                         || !cmdiData.getSearchResources().isEmpty() || !cmdiData.getSearchPageResources().isEmpty()
    283                         || cmdiData.getMetadataResources().isEmpty()) {
    284                     // We only add metadata files that have
    285                     //  1) data resources or
    286                     //  2) a landing page or
    287                     //  3) a search service (like SRU/CQL) or
    288                     //  4) a search page or
    289                     //  5) that have none of the above but also lack any metadata links (e.g. olac files describing a corpus with a link to the original archive).
    290                     // Other files will have only metadata resources and are considered 'collection' metadata files they
    291                     // are usually not very interesting (think imdi corpus files) and will not be included.
    292                     updateDocument(solrDocument, cmdiData, file, dataOrigin);
    293                 } else {
    294                     nrOfIgnoredFiles++;
     278        if (cmdiData != null) {
     279            if (processedIds.add(cmdiData.getId())) {
     280                SolrInputDocument solrDocument = cmdiData.getSolrDocument();
     281                if (solrDocument != null) {
     282                    if (!cmdiData.getDataResources().isEmpty() || !cmdiData.getLandingPageResources().isEmpty()
     283                            || !cmdiData.getSearchResources().isEmpty() || !cmdiData.getSearchPageResources().isEmpty()
     284                            || cmdiData.getMetadataResources().isEmpty()) {
     285                        // We only add metadata files that have
     286                        //  1) data resources or
     287                        //      2) a landing page or
     288                        //      3) a search service (like SRU/CQL) or
     289                        //      4) a search page or
     290                        //  5) that have none of the above but also lack any metadata links (e.g. olac files describing a corpus with a link to the original archive).
     291                        // Other files will have only metadata resources and are considered 'collection' metadata files they
     292                        // are usually not very interesting (think imdi corpus files) and will not be included.
     293                        updateDocument(solrDocument, cmdiData, file, dataOrigin);
     294                    } else {
     295                        nrOfIgnoredFiles++;
     296                    }
    295297                }
     298            } else {
     299                LOG.warn("Skipping {}, already processed id: {}", file, cmdiData.getId());
    296300            }
    297301        }
     
    328332        solrDocument.addField(FacetConstants.FIELD_ID, cmdiData.getId());
    329333        solrDocument.addField(FacetConstants.FIELD_FILENAME, file.getAbsolutePath());
    330 
     334       
    331335        String metadataSourceUrl = dataOrigin.getPrefix();
    332336        metadataSourceUrl += file.getAbsolutePath().substring(dataOrigin.getToStrip().length());
    333 
     337       
    334338        solrDocument.addField(FacetConstants.FIELD_COMPLETE_METADATA, metadataSourceUrl);
    335339
     
    356360        // add resource proxies
    357361        addResourceData(solrDocument, cmdiData);
     362       
     363        LOG.debug("Adding document for submission to SOLR: {}", file);
    358364        docs.add(solrDocument);
    359365        if (docs.size() == config.getMaxDocsInList()) {
     
    391397                format = CommonUtils.normalizeMimeType(mimeType);
    392398            }
    393 
     399           
    394400            FormatPostProcessor processor = new FormatPostProcessor();
    395401            mimeType = processor.process(mimeType);
     
    435441        solrServer.query(params);
    436442    }
    437 
     443   
    438444    public static VloConfig config;
    439445
     
    455461         */
    456462        options.addOption("c", true, "-c <file> : use parameters specified in <file>");
    457 
     463       
    458464        CommandLineParser parser = new PosixParser();
    459 
     465       
    460466        try {
    461467            // parse the command line arguments
     
    466472                configFile = cmd.getOptionValue("c");
    467473            }
    468 
     474           
    469475        } catch (org.apache.commons.cli.ParseException ex) {
    470476
     
    478484            System.err.println(message);
    479485        }
    480 
     486       
    481487        if (configFile == null) {
    482 
     488           
    483489            String message;
    484 
     490           
    485491            message = "Could not get config file name via the command line, trying the system properties.";
    486492            LOG.info(message);
    487 
     493           
    488494            String key;
    489 
     495           
    490496            key = "configFile";
    491497            configFile = System.getProperty(key);
    492498        }
    493 
     499       
    494500        if (configFile == null) {
    495 
     501           
    496502            String message;
    497 
     503           
    498504            message = "Could not get filename as system property either - stopping.";
    499505            LOG.error(message);
  • vlo/trunk/vlo-importer/src/test/java/eu/clarin/cmdi/vlo/importer/MetadataImporterTest.java

    r4611 r5197  
    281281                                        " because it is too large.");
    282282                            } else {
    283                                 LOG.debug("PROCESSING FILE: " +
    284                                         file.getAbsolutePath());               
     283                                LOG.debug("PROCESSING FILE: {}", file.getAbsolutePath());               
    285284                                /*
    286285                                 * Anticipate on the solr exception that will
Note: See TracChangeset for help on using the changeset viewer.