Changeset 4747


Ignore:
Timestamp:
03/18/14 15:26:03 (11 years ago)
Author:
Twan Goosen
Message:

Accept a relative file path as the config location in the importer.

File:
1 edited

Legend:

Unmodified
Added
Removed
  • vlo/branches/vlo-3.0/vlo-importer/src/main/java/eu/clarin/cmdi/vlo/importer/MetadataImporter.java

    r4612 r4747  
    3535import org.slf4j.LoggerFactory;
    3636
    37 
    3837/**
    3938 * The main metadataImporter class. Also contains the main function.
     
    4443 * and so on.
    4544 */
    46 
    4745@SuppressWarnings({"serial"})
    4846public class MetadataImporter {
    4947
    5048    /**
    51      * Defines which files to try and parse.
    52      * In this case all files ending in "xml" or "cmdi".
    53      */
    54     private static final String[] VALID_CMDI_EXTENSIONS = new String[] { "xml", "cmdi" };
     49     * Defines which files to try and parse. In this case all files ending in
     50     * "xml" or "cmdi".
     51     */
     52    private static final String[] VALID_CMDI_EXTENSIONS = new String[]{"xml", "cmdi"};
    5553
    5654    /**
     
    7371     */
    7472    final static Map<String, PostProcessor> POST_PROCESSORS = new HashMap<String, PostProcessor>();
     73
    7574    static {
    7675        POST_PROCESSORS.put(FacetConstants.FIELD_ID, new IdPostProcessor());
    77                 POST_PROCESSORS.put(FacetConstants.FIELD_CONTINENT, new ContinentNamePostProcessor());
     76        POST_PROCESSORS.put(FacetConstants.FIELD_CONTINENT, new ContinentNamePostProcessor());
    7877        POST_PROCESSORS.put(FacetConstants.FIELD_COUNTRY, new CountryNamePostProcessor());
    7978        POST_PROCESSORS.put(FacetConstants.FIELD_LANGUAGE, new LanguageCodePostProcessor());
     
    8382        POST_PROCESSORS.put(FacetConstants.FIELD_CLARIN_PROFILE, new CMDIComponentProfileNamePostProcessor());
    8483    }
    85    
     84
    8685    /**
    8786     * Constructor
    88      * 
     87     *
    8988     * @param
    9089     */
    91     public MetadataImporter (){
    92     }
    93 
    94     /**
    95      * Contains MDSelflinks (usually).
    96      * Just to know what we have already done.
     90    public MetadataImporter() {
     91    }
     92
     93    /**
     94     * Contains MDSelflinks (usually). Just to know what we have already done.
    9795     */
    9896    protected final Set<String> processedIds = new HashSet<String>();
     
    129127                LOG.info("Deleting original data done.");
    130128            }
    131            
     129
    132130            // Import the specified data roots
    133131            for (DataRoot dataRoot : dataRoots) {
     
    141139                List<File> files = getFilesFromDataRoot(dataRoot.getRootFile());
    142140                for (File file : files) {
    143                     if (config.getMaxFileSize() > 0 &&
    144                             file.length() > config.getMaxFileSize()) {
     141                    if (config.getMaxFileSize() > 0
     142                            && file.length() > config.getMaxFileSize()) {
    145143                        LOG.info("Skipping " + file.getAbsolutePath() + " because it is too large.");
    146144                    } else {
     
    154152                LOG.info("End of processing: " + dataRoot.getOriginName());
    155153            }
    156            
     154
    157155            // delete outdated entries (based on maxDaysInSolr parameter)
    158             if(config.getMaxDaysInSolr() > 0 && config.getDeleteAllFirst() == false) {
    159                 LOG.info("Deleting old files that were not seen for more than "+config.getMaxDaysInSolr()+" days...");
    160                 solrServer.deleteByQuery(FacetConstants.FIELD_LAST_SEEN+":[* TO NOW-"+config.getMaxDaysInSolr()+"DAYS]");
     156            if (config.getMaxDaysInSolr() > 0 && config.getDeleteAllFirst() == false) {
     157                LOG.info("Deleting old files that were not seen for more than " + config.getMaxDaysInSolr() + " days...");
     158                solrServer.deleteByQuery(FacetConstants.FIELD_LAST_SEEN + ":[* TO NOW-" + config.getMaxDaysInSolr() + "DAYS]");
    161159                LOG.info("Deleting old files done.");
    162160            }
     
    171169                    solrServer.commit();
    172170                    buildSuggesterIndex();
    173                 }               
     171                }
    174172            } catch (SolrServerException e) {
    175173                LOG.error("cannot commit:\n", e);
     
    223221
    224222    /**
    225      * Create an interface to the SOLR server. 
    226      * 
     223     * Create an interface to the SOLR server.
     224     *
    227225     * After the interface has been created the importer can send documents to
    228226     * the server. Sending documents involves a queue. The importer adds
     
    235233        String solrUrl = config.getSolrUrl();
    236234        LOG.info("Initializing Solr Server on " + solrUrl);
    237        
     235
    238236        /* Specify the number of documents in the queue that will trigger the
    239237         * threads, two of them, emptying it.
    240238         */
    241         solrServer = new ConcurrentUpdateSolrServer(solrUrl, 
     239        solrServer = new ConcurrentUpdateSolrServer(solrUrl,
    242240                config.getMinDocsInSolrQueue(), 2) {
    243241                    /*
     
    246244                     * serverError variable.
    247245                     */
    248             @Override
    249             public void handleError(Throwable exception) {
    250                 super.handleError(exception);
    251                 serverError = exception;
    252             }
    253         };
     246                    @Override
     247                    public void handleError(Throwable exception) {
     248                        super.handleError(exception);
     249                        serverError = exception;
     250                    }
     251                };
    254252    }
    255253
     
    280278            if (solrDocument != null) {
    281279                if (!cmdiData.getDataResources().isEmpty() || !cmdiData.getLandingPageResources().isEmpty()
    282                                 || !cmdiData.getSearchResources().isEmpty() || !cmdiData.getSearchPageResources().isEmpty()
    283                                 || cmdiData.getMetadataResources().isEmpty() ) {
     280                        || !cmdiData.getSearchResources().isEmpty() || !cmdiData.getSearchPageResources().isEmpty()
     281                        || cmdiData.getMetadataResources().isEmpty()) {
    284282                    // We only add metadata files that have
    285283                    //  1) data resources or
    286                         //      2) a landing page or
    287                         //      3) a search service (like SRU/CQL) or
    288                         //      4) a search page or
     284                    //  2) a landing page or
     285                    //  3) a search service (like SRU/CQL) or
     286                    //  4) a search page or
    289287                    //  5) that have none of the above but also lack any metadata links (e.g. olac files describing a corpus with a link to the original archive).
    290288                    // Other files will have only metadata resources and are considered 'collection' metadata files they
     
    343341            solrDocument.addField(FacetConstants.FIELD_LANDINGPAGE, resource.getResourceName());
    344342        }
    345        
     343
    346344        // add search page resource
    347345        for (Resource resource : cmdiData.getSearchPageResources()) {
    348346            solrDocument.addField(FacetConstants.FIELD_SEARCHPAGE, resource.getResourceName());
    349347        }
    350        
     348
    351349        // add timestamp
    352350        Date dt = new Date();
    353351        SimpleDateFormat df = new SimpleDateFormat("yyyy-MM-dd'T'HH:mm:ss'Z'");
    354352        solrDocument.addField(FacetConstants.FIELD_LAST_SEEN, df.format(dt));
    355        
     353
    356354        // add resource proxies
    357355        addResourceData(solrDocument, cmdiData);
     
    367365     * overwritten by some more specific xpath (as in the LRT cmdi files). So if
    368366     * a type is overwritten and already in the solrDocument we take that type.
    369      *
    370      * TODO evaluate odd connection between FIELD_FORMAT and ResourceProxy-Mimetypes
     367     *
     368     * TODO evaluate odd connection between FIELD_FORMAT and
     369     * ResourceProxy-Mimetypes
    371370     */
    372371    protected void addResourceData(SolrInputDocument solrDocument, CMDIData cmdiData) {
     
    390389                format = CommonUtils.normalizeMimeType(mimeType);
    391390            }
    392            
     391
    393392            FormatPostProcessor processor = new FormatPostProcessor();
    394393            mimeType = processor.process(mimeType);
    395            
     394
    396395            // TODO check should probably be moved into Solr (by using some minimum length filter)
    397             if(!mimeType.equals(""))
    398                 solrDocument.addField(FacetConstants.FIELD_FORMAT, mimeType);
     396            if (!mimeType.equals("")) {
     397                solrDocument.addField(FacetConstants.FIELD_FORMAT, mimeType);
     398            }
    399399            solrDocument.addField(FacetConstants.FIELD_RESOURCE, mimeType + FacetConstants.FIELD_RESOURCE_SPLIT_CHAR
    400400                    + resource.getResourceName());
    401401        }
    402402    }
    403    
     403
    404404    /**
    405405     * Send current list of SolrInputDocuments to SolrServer and clear the list
     
    418418        docs = new ArrayList<SolrInputDocument>();
    419419    }
    420    
     420
    421421    /**
    422422     * Builds suggester index for autocompletion
     
    426426     */
    427427    private void buildSuggesterIndex() throws SolrServerException, MalformedURLException {
    428         LOG.info("Building index for autocompletion.");
    429         HashMap<String,String> paramMap = new HashMap<String, String>();
    430         paramMap.put("qt", "/suggest");
    431         paramMap.put("spellcheck.build", "true");
    432         SolrParams params = new MapSolrParams(paramMap);
    433         solrServer.query(params);
     428        LOG.info("Building index for autocompletion.");
     429        HashMap<String, String> paramMap = new HashMap<String, String>();
     430        paramMap.put("qt", "/suggest");
     431        paramMap.put("spellcheck.build", "true");
     432        SolrParams params = new MapSolrParams(paramMap);
     433        solrServer.query(params);
    434434    }
    435435
    436436    public static VloConfig config;
    437    
     437
    438438    /**
    439439     * @param args
    440440     * @throws IOException
    441441     */
    442     public static void main(String[] args) throws MalformedURLException, IOException {
    443 
    444        
     442    public static void main(String[] args) throws MalformedURLException, IOException {
     443
    445444        // path to the configuration file
    446445        String configFile = null;
    447        
     446
    448447        // use the Apache cli framework for getting command line parameters
    449448        Options options = new Options();
     
    461460            CommandLine cmd = parser.parse(options, args);
    462461            if (cmd.hasOption("c")) {
    463                
     462
    464463                // the "c" option was specified, now get its value
    465464                configFile = cmd.getOptionValue("c");
     
    467466
    468467        } catch (org.apache.commons.cli.ParseException ex) {
    469            
     468
    470469            /**
    471470             * Caught an exception caused by command line parsing. Try to get
     
    473472             * property.
    474473             */
    475 
    476474            String message = "Command line parsing failed. " + ex.getMessage();
    477475            LOG.error(message);
    478476            System.err.println(message);
    479477        }
    480        
    481         if (configFile == null){
     478
     479        if (configFile == null) {
    482480
    483481            String message;
     
    485483            message = "Could not get config file name via the command line, trying the system properties.";
    486484            LOG.info(message);
    487            
     485
    488486            String key;
    489487
     
    493491
    494492        if (configFile == null) {
    495            
     493
    496494            String message;
    497            
     495
    498496            message = "Could not get filename as system property either - stopping.";
    499497            LOG.error(message);
    500498        } else {
    501499            // read the configuration from the externally supplied file
    502             XmlVloConfigFactory configFactory = new XmlVloConfigFactory(new URL(configFile));
     500            final URL configUrl;
     501            if (configFile.startsWith("file:")) {
     502                configUrl = new URL(configFile);
     503            } else {
     504                configUrl = new File(configFile).toURI().toURL();
     505            }
     506            System.out.println("Reading configuration from " + configUrl.toString());
     507            final XmlVloConfigFactory configFactory = new XmlVloConfigFactory(configUrl);
    503508            MetadataImporter.config = configFactory.newConfig();
    504509
    505510            // optionally, modify the configuration here
    506 
    507511            // create and start the importer
    508512            MetadataImporter importer = new MetadataImporter();
     
    510514
    511515            // finished importing
    512 
    513516            if (MetadataImporter.config.printMapping()) {
    514517                File file = new File("xsdMapping.txt");
Note: See TracChangeset for help on using the changeset viewer.