source: vlo/trunk/vlo_importer/src/main/java/eu/clarin/cmdi/vlo/importer/MetadataImporter.java @ 2774

Last change on this file since 2774 was 2774, checked in by keeloo, 11 years ago

Corrected typos in config package. Added list initialisation to VloConfig. Also fixed ticket 297, access to the packaged National Projects mapping file. Finally, made provisions for displaying the landing page in the web application.

File size: 17.9 KB
Line 
1package eu.clarin.cmdi.vlo.importer;
2
3import eu.clarin.cmdi.vlo.CommonUtils;
4import eu.clarin.cmdi.vlo.FacetConstants;
5import eu.clarin.cmdi.vlo.config.DataRoot;
6import eu.clarin.cmdi.vlo.config.VloConfig;
7import java.io.File;
8import java.io.IOException;
9import java.net.MalformedURLException;
10import java.util.ArrayList;
11import java.util.Collection;
12import java.util.HashMap;
13import java.util.HashSet;
14import java.util.List;
15import java.util.Map;
16import java.util.Set;
17import org.apache.commons.cli.CommandLine;
18import org.apache.commons.cli.CommandLineParser;
19import org.apache.commons.cli.Options;
20import org.apache.commons.cli.PosixParser;
21import org.apache.commons.io.FileUtils;
22import org.apache.solr.client.solrj.SolrServerException;
23import org.apache.solr.client.solrj.impl.StreamingUpdateSolrServer;
24import org.apache.solr.client.solrj.util.ClientUtils;
25import org.apache.solr.common.SolrInputDocument;
26import org.apache.solr.common.params.MapSolrParams;
27import org.apache.solr.common.params.SolrParams;
28import org.slf4j.Logger;
29import org.slf4j.LoggerFactory;
30
31
/**
 * The main MetadataImporter class. It also contains the main function.
 *
 * The metadata importer reads the configuration and then, for each metadata
 * file in each configured data root, parses the file and imports it as
 * defined in the configuration. The startImport method drives the complete
 * import run.
 */
40
41@SuppressWarnings({"serial"})
42public class MetadataImporter {
43
44    /**
45     * Defines which files to try and parse.
46     * In this case all files ending in "xml" or "cmdi".
47     */
48    private static final String[] VALID_CMDI_EXTENSIONS = new String[] { "xml", "cmdi" };
49
50    /**
51     * Log log log log
52     */
53    private final static Logger LOG = LoggerFactory.getLogger(MetadataImporter.class);
54    /**
55     * Some place to store errors.
56     */
57    private static Throwable serverError;
58    /**
59     * the solr server.
60     */
61    private StreamingUpdateSolrServer solrServer;
62    /**
63     * Defines the post-processor associations. At import, for each facet value,
64     * this map is checked and all postprocessors associated with the facet
65     * _type_ are applied to the value before storing the new value in the solr
66     * document.
67     */
68    final static Map<String, PostProcessor> POST_PROCESSORS = new HashMap<String, PostProcessor>();
69    static {
70        POST_PROCESSORS.put(FacetConstants.FIELD_COUNTRY, new CountryNamePostProcessor());
71        POST_PROCESSORS.put(FacetConstants.FIELD_LANGUAGE, new LanguageCodePostProcessor());
72        POST_PROCESSORS.put(FacetConstants.FIELD_RESOURCE_TYPE, new ResourceTypePostProcessor());
73        POST_PROCESSORS.put(FacetConstants.FIELD_LANGUAGES, new LanguageLinkPostProcessor());
74        POST_PROCESSORS.put(FacetConstants.FIELD_NATIONAL_PROJECT, new NationalProjectPostProcessor());
75        POST_PROCESSORS.put(FacetConstants.FIELD_CLARIN_PROFILE, new CMDIComponentProfileNamePostProcessor());
76    }
77   
78    /**
79     * Constructor
80     *
81     * @param
82     */
83    public MetadataImporter (){
84    }
85
    /**
     * Ids of the documents handled so far (usually MdSelfLinks). A file whose
     * id is already in this set is not processed a second time.
     */
    private final Set<String> processedIds = new HashSet<String>();
    /**
     * Buffer of Solr documents waiting to be sent. Documents are collected
     * here and handed to the Solr server in batches rather than one by one.
     */
    protected List<SolrInputDocument> docs = new ArrayList<SolrInputDocument>();

    // Statistics gathered during an import run; they are logged at the end of startImport.
    // Number of documents handed to the Solr server.
    private int nrOFDocumentsUpdated;
    // Number of files passed to processCmdi.
    private int nrOfFilesAnalyzed = 0;
    // Number of files for which an id had to be made up from the file name.
    private int nrOfFilesWithoutId = 0;
    // Number of files skipped because they only link to other metadata files.
    private int nrOfFilesWithoutDataResources = 0;
    // Number of files whose processing raised an exception.
    private int nrOfFilesWithError = 0;
103
104    /**
105     * Retrieve all files with VALID_CMDI_EXTENSIONS from all DataRoot entries
106     * and starts processing for every single file
107     *
108     * @throws MalformedURLException
109     */
110    void startImport() throws MalformedURLException {
111
112        initSolrServer();
113        List<DataRoot> dataRoots = checkDataRoots();
114        long start = System.currentTimeMillis();
115        try {
116            // Delete the whole Solr db
117            if (VloConfig.deleteAllFirst()) {
118                LOG.info("Deleting original data...");
119                solrServer.deleteByQuery("*:*");
120                solrServer.commit();
121                LOG.info("Deleting original data done.");
122            }
123            for (DataRoot dataRoot : dataRoots) {
124                LOG.info("Start of processing: " + dataRoot.getOriginName());
125                if (dataRoot.deleteFirst()) {
126                    LOG.info("Deleting data for data provider: " + dataRoot.getOriginName());
127                    solrServer.deleteByQuery(FacetConstants.FIELD_DATA_PROVIDER + ":" + ClientUtils.escapeQueryChars(dataRoot.getOriginName()));
128                    LOG.info("Deleting data of provider done.");
129                }
130                CMDIDataProcessor processor = new CMDIParserVTDXML(POST_PROCESSORS);
131                List<File> files = getFilesFromDataRoot(dataRoot.getRootFile());
132                for (File file : files) {
133                    LOG.debug("PROCESSING FILE: " + file.getAbsolutePath());
134                    processCmdi(file, dataRoot, processor);
135                }
136                if (!docs.isEmpty()) {
137                    sendDocs();
138                }
139                LOG.info("End of processing: " + dataRoot.getOriginName());
140            }
141        } catch (SolrServerException e) {
142            LOG.error("error updating files:\n", e);
143            LOG.error("Also see vlo_solr server logs for more information");
144        } catch (IOException e) {
145            LOG.error("error updating files:\n", e);
146        } finally {
147            try {
148                if (solrServer != null) {
149                    solrServer.commit();
150                    buildSuggesterIndex();
151                }               
152            } catch (SolrServerException e) {
153                LOG.error("cannot commit:\n", e);
154            } catch (IOException e) {
155                LOG.error("cannot commit:\n", e);
156            }
157        }
158        long took = (System.currentTimeMillis() - start) / 1000;
159        LOG.info("Found " + nrOfFilesWithoutId + " file(s) without an id. (id is generated based on fileName but that may not be unique)");
160        LOG.info("Found " + nrOfFilesWithError + " file(s) with errors.");
161        LOG.info("Found " + nrOfFilesWithoutDataResources
162                + " file(s) without data resources (metadata descriptions without resources are ignored).");
163        LOG.info("Update of " + nrOFDocumentsUpdated + " took " + took + " secs. Total nr of files analyzed " + nrOfFilesAnalyzed);
164    }
165
166    /**
167     * Check a List of DataRoots for existence of RootFile (typically parent
168     * directory of metadata files)
169     *
170     * @return
171     */
172    private List<DataRoot> checkDataRoots() {
173        List<DataRoot> dataRoots = VloConfig.getDataRoots();
174        for (DataRoot dataRoot : dataRoots) {
175            if (!dataRoot.getRootFile().exists()) {
176                LOG.error("Root file " + dataRoot.getRootFile() + " does not exist. Probable configuration error so stopping import.");
177                System.exit(1);
178            }
179        }
180        return dataRoots;
181    }
182
183    /**
184     * Get the rootFile or all files with VALID_CMDI_EXTENSIONS if rootFile is a
185     * directory
186     *
187     * @param rootFile
188     * @return List with the rootFile or all contained files if rootFile is a
189     * directory
190     */
191    private List<File> getFilesFromDataRoot(File rootFile) {
192        List<File> result = new ArrayList<File>();
193        if (rootFile.isFile()) {
194            result.add(rootFile);
195        } else {
196            Collection<File> listFiles = FileUtils.listFiles(rootFile, VALID_CMDI_EXTENSIONS, true);
197            result.addAll(listFiles);
198        }
199        return result;
200    }
201
202    /**
203     * Initialize SolrServer as specified in configuration file
204     *
205     * @throws MalformedURLException
206     */
207    protected void initSolrServer() throws MalformedURLException {
208        String solrUrl = VloConfig.getSolrUrl();
209        LOG.info("Initializing Solr Server on " + solrUrl);
210        solrServer = new StreamingUpdateSolrServer(solrUrl, 1000, 2) {
211            @Override
212            public void handleError(Throwable ex) {
213                super.handleError(ex);
214                serverError = ex;
215            }
216        };
217    }
218
219    /**
220     * Process single CMDI file with CMDIDataProcessor
221     *
222     * @param file CMDI input file
223     * @param dataOrigin
224     * @param processor
225     * @throws SolrServerException
226     * @throws IOException
227     */
228    private void processCmdi(File file, DataRoot dataOrigin, CMDIDataProcessor processor) throws SolrServerException, IOException {
229        nrOfFilesAnalyzed++;
230        CMDIData cmdiData = null;
231        try {
232            cmdiData = processor.process(file);
233            if (!idOk(cmdiData.getId())) {
234                cmdiData.setId(dataOrigin.getOriginName() + "/" + file.getName()); //No id found in the metadata file so making one up based on the file name. Not quaranteed to be unique, but we have to set something.
235                nrOfFilesWithoutId++;
236            }
237        } catch (Exception e) {
238            LOG.error("error in file: " + file + " Exception", e);
239            nrOfFilesWithError++;
240        }
241        if (cmdiData != null && processedIds.add(cmdiData.getId())) {
242            SolrInputDocument solrDocument = cmdiData.getSolrDocument();
243            if (solrDocument != null) {
244                if (!cmdiData.getDataResources().isEmpty() || cmdiData.getMetadataResources().isEmpty()) {
245                    // We only add metadata files that have data resources (1) or files that don't link to other metadata files (2):
246                    //  1) files with data resources are obviously interesting
247                    //  2) files without metadata links and without dataResource can be interesting e.g. olac files describing a corpus with a link to the original archive.
248                    // Other files will have only metadata resources and are considered 'collection' metadata files they
249                    // are usually not very interesting (think imdi corpus files) and will not be included.
250                    updateDocument(solrDocument, cmdiData, file, dataOrigin);
251                } else {
252                    nrOfFilesWithoutDataResources++;
253                }
254            }
255        }
256    }
257
258    /**
259     * Check id for validness
260     *
261     * @param id
262     * @return true if id is acceptable, false otherwise
263     */
264    private boolean idOk(String id) {
265        return id != null && !id.isEmpty();
266    }
267
268    /**
269     * Adds some additional information from DataRoot to solrDocument, add
270     * solrDocument to document list, submits list to SolrServer every 1000
271     * files
272     *
273     * @param solrDocument
274     * @param cmdiData
275     * @param file
276     * @param dataOrigin
277     * @throws SolrServerException
278     * @throws IOException
279     */
280    private void updateDocument(SolrInputDocument solrDocument, CMDIData cmdiData, File file, DataRoot dataOrigin) throws SolrServerException,
281            IOException {
282        if (!solrDocument.containsKey(FacetConstants.FIELD_COLLECTION)) {
283            solrDocument.addField(FacetConstants.FIELD_COLLECTION, dataOrigin.getOriginName());
284        }
285        solrDocument.addField(FacetConstants.FIELD_DATA_PROVIDER, dataOrigin.getOriginName());
286        solrDocument.addField(FacetConstants.FIELD_ID, cmdiData.getId());
287        solrDocument.addField(FacetConstants.FIELD_FILENAME, file.getAbsolutePath());
288
289        String metadataSourceUrl = dataOrigin.getPrefix();
290        //System.out.println(dataOrigin.getTostrip());
291        //System.out.println(dataOrigin.getTostrip().length());
292        //System.out.println(file.getAbsolutePath());
293        metadataSourceUrl += file.getAbsolutePath().substring(dataOrigin.getToStrip().length());
294
295        solrDocument.addField(FacetConstants.FIELD_COMPLETE_METADATA, metadataSourceUrl);
296
297        // add SearchServices (should be CQL endpoint)
298        for (Resource resource : cmdiData.getSearchResources()) {
299            solrDocument.addField(FacetConstants.FIELD_SEARCH_SERVICE, resource.getResourceName());
300        }
301
302        // add landing page resource
303        for (Resource resource : cmdiData.getLandingPageResources()) {
304            solrDocument.addField(FacetConstants.FIELD_LANDINGPAGE, resource.getResourceName());
305        }
306       
307        addResourceData(solrDocument, cmdiData);
308        docs.add(solrDocument);
309        if (docs.size() == VloConfig.getMaxOnHeap()) {
310            sendDocs();
311        }
312    }
313
314    /**
315     * Adds two fields FIELD_RESOURCE_TYPE and FIELD_RESOURCE. The Type can be
316     * specified in the "ResourceType" element of an imdi file or possibly
317     * overwritten by some more specific xpath (as in the LRT cmdi files). So if
318     * a type is overwritten and already in the solrDocument we take that type.
319     */
320    private void addResourceData(SolrInputDocument solrDocument, CMDIData cmdiData) {
321        List<Object> fieldValues = solrDocument.containsKey(FacetConstants.FIELD_RESOURCE_TYPE) ? new ArrayList<Object>(solrDocument
322                .getFieldValues(FacetConstants.FIELD_RESOURCE_TYPE)) : null;
323        solrDocument.removeField(FacetConstants.FIELD_RESOURCE_TYPE); //Remove old values they might be overwritten.
324        List<Resource> resources = cmdiData.getDataResources();
325        for (int i = 0; i < resources.size(); i++) {
326            Resource resource = resources.get(i);
327            String mimeType = resource.getMimeType();
328            String resourceType = mimeType;
329            if (mimeType == null) {
330                if (fieldValues != null && i < fieldValues.size()) {
331                    resourceType = fieldValues.get(i).toString(); //assuming there will be as many resource types overwritten as there are specified
332                    mimeType = CommonUtils.normalizeMimeType(resourceType);
333                } else {
334                    mimeType = CommonUtils.normalizeMimeType("");
335                    resourceType = mimeType;
336                }
337            } else {
338                resourceType = CommonUtils.normalizeMimeType(mimeType);
339            }
340            solrDocument.addField(FacetConstants.FIELD_RESOURCE_TYPE, resourceType);
341            solrDocument.addField(FacetConstants.FIELD_RESOURCE, mimeType + FacetConstants.FIELD_RESOURCE_SPLIT_CHAR
342                    + resource.getResourceName());
343        }
344    }
345
346    /**
347     * Send current list of SolrImputDocuments to SolrServer and clears list
348     * afterwards
349     *
350     * @throws SolrServerException
351     * @throws IOException
352     */
353    protected void sendDocs() throws SolrServerException, IOException {
354        LOG.info("Sending " + docs.size() + " docs to solr server. Total number of docs updated till now: " + nrOFDocumentsUpdated);
355        nrOFDocumentsUpdated += docs.size();
356        solrServer.add(docs);
357        if (serverError != null) {
358            throw new SolrServerException(serverError);
359        }
360        docs = new ArrayList<SolrInputDocument>();
361    }
362   
363    /**
364     * Builds suggester index for autocompletion
365     *
366     * @throws SolrServerException
367     * @throws MalformedURLException
368     */
369    private void buildSuggesterIndex() throws SolrServerException, MalformedURLException {
370        LOG.info("Building index for autocompletion.");
371        HashMap<String,String> paramMap = new HashMap<String, String>();
372        paramMap.put("qt", "/suggest");
373        paramMap.put("spellcheck.build", "true");
374        SolrParams params = new MapSolrParams(paramMap);
375        solrServer.query(params);
376    }
377
378    /**
379     * @param args
380     * @throws IOException
381     */
382    public static void main(String[] args) throws MalformedURLException, IOException { 
383
384        // application configuration
385        VloConfig config;
386       
387        // path to the configuration file
388        String configFile = null;
389       
390        // use the Apache cli framework for getting command line parameters
391        Options options = new Options();
392
393        /**
394         * Add a "c" option, the option indicating the specification of an XML
395         * configuration file
396         */
397        options.addOption("c", true, "-c <file> : use parameters specified in <file>");
398
399        CommandLineParser parser = new PosixParser();
400
401        try {
402            // parse the command line arguments
403            CommandLine cmd = parser.parse(options, args);
404            if (cmd.hasOption("c")) {
405               
406                // the "c" option was specified, now get its value
407                configFile = cmd.getOptionValue("c");
408            }
409
410        } catch (org.apache.commons.cli.ParseException ex) {
411           
412            /**
413             * Caught an exception caused by command line parsing. Try to get
414             * the name of the configuration file by querying the system
415             * property.
416             */
417
418            String message = "Command line parsing failed. " + ex.getMessage();
419            LOG.error(message);
420            System.err.println(message);
421        }
422       
423        if (configFile == null){
424
425            String message;
426
427            message = "Could not get config file name via the command line, trying the system properties.";
428            LOG.info(message);
429           
430            String key;
431
432            key = "configFile";
433            configFile = System.getProperty(key);
434        }
435
436        if (configFile == null) {
437           
438            String message;
439           
440            message = "Could not get filename as system property either - stopping.";
441            LOG.error(message);
442        } else {
443            // read the configuration from the externally supplied file
444            VloConfig.readConfig(configFile);
445
446            // optionally, modify the configuration here
447
448            // create and start the importer
449            MetadataImporter importer = new MetadataImporter();
450            importer.startImport();
451
452            // finished importing
453
454            if (VloConfig.printMapping()) {
455                File file = new File("xsdMapping.txt");
456                FacetMappingFactory.printMapping(file);
457                LOG.info("Printed facetMapping in " + file);
458            }
459        }
460    }
461}
Note: See TracBrowser for help on using the repository browser.