source: vlo/branches/vlo-3.0/vlo-importer/src/main/java/eu/clarin/cmdi/vlo/importer/MetadataImporter.java @ 4612

Last change on this file since 4612 was 4612, checked in by teckart, 10 years ago

Fix ticket #454: data cleansing of facet "format": only valid MIME-types are accepted. Other values are mapped to a default "unknown" String.

package eu.clarin.cmdi.vlo.importer;

import eu.clarin.cmdi.vlo.CommonUtils;
import eu.clarin.cmdi.vlo.FacetConstants;
import eu.clarin.cmdi.vlo.config.DataRoot;
import eu.clarin.cmdi.vlo.config.VloConfig;
import eu.clarin.cmdi.vlo.config.XmlVloConfigFactory;

import java.io.File;
import java.io.IOException;
import java.net.MalformedURLException;
import java.net.URL;
import java.text.SimpleDateFormat;
import java.util.ArrayList;
import java.util.Collection;
import java.util.Date;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.TimeZone;

import org.apache.commons.cli.CommandLine;
import org.apache.commons.cli.CommandLineParser;
import org.apache.commons.cli.Options;
import org.apache.commons.cli.PosixParser;
import org.apache.commons.io.FileUtils;
import org.apache.solr.client.solrj.SolrServerException;
import org.apache.solr.client.solrj.impl.ConcurrentUpdateSolrServer;
import org.apache.solr.client.solrj.util.ClientUtils;
import org.apache.solr.common.SolrInputDocument;
import org.apache.solr.common.params.MapSolrParams;
import org.apache.solr.common.params.SolrParams;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/**
 * The main MetadataImporter class; it also contains the main method.
 *
 * The importer reads all the config files and then, for each metadata file in
 * each configured directory structure, parses and imports it as defined in
 * the configuration. The startImport method starts the import.
 */
@SuppressWarnings({"serial"})
public class MetadataImporter {

    /**
     * Defines which files to try to parse: all files ending in "xml" or
     * "cmdi".
     */
    private static final String[] VALID_CMDI_EXTENSIONS = new String[] { "xml", "cmdi" };

    /**
     * Logger for this class.
     */
    protected final static Logger LOG = LoggerFactory.getLogger(MetadataImporter.class);
    /**
     * Some place to store errors reported by the Solr server threads.
     */
    private static Throwable serverError;
    /**
     * The Solr server.
     */
    private ConcurrentUpdateSolrServer solrServer;
    /**
     * Defines the post-processor associations. At import, for each facet value,
     * this map is checked and all post-processors associated with the facet
     * _type_ are applied to the value before storing the new value in the Solr
     * document.
     */
    final static Map<String, PostProcessor> POST_PROCESSORS = new HashMap<String, PostProcessor>();
    static {
        POST_PROCESSORS.put(FacetConstants.FIELD_ID, new IdPostProcessor());
        POST_PROCESSORS.put(FacetConstants.FIELD_CONTINENT, new ContinentNamePostProcessor());
        POST_PROCESSORS.put(FacetConstants.FIELD_COUNTRY, new CountryNamePostProcessor());
        POST_PROCESSORS.put(FacetConstants.FIELD_LANGUAGE, new LanguageCodePostProcessor());
        POST_PROCESSORS.put(FacetConstants.FIELD_LANGUAGES, new LanguageLinkPostProcessor());
        POST_PROCESSORS.put(FacetConstants.FIELD_YEAR, new YearPostProcessor());
        POST_PROCESSORS.put(FacetConstants.FIELD_NATIONAL_PROJECT, new NationalProjectPostProcessor());
        POST_PROCESSORS.put(FacetConstants.FIELD_CLARIN_PROFILE, new CMDIComponentProfileNamePostProcessor());
    }
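
    // The POST_PROCESSORS map is handed to CMDIParserVTDXML in startImport();
    // facet values for facets without an entry here are presumably stored
    // unmodified (the actual lookup happens inside the parser).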

    /**
     * Constructor
     */
    public MetadataImporter() {
    }

    /**
     * Contains MdSelfLinks (usually); used to keep track of the records that
     * have already been processed.
     */
    protected final Set<String> processedIds = new HashSet<String>();
    /**
     * Buffer for Solr documents (sending a whole batch of documents to the
     * Solr server at once is more efficient than sending them one by one).
     */
    protected List<SolrInputDocument> docs = new ArrayList<SolrInputDocument>();

    // SOME STATS
    protected int nrOFDocumentsSend;
    protected int nrOfFilesAnalyzed = 0;
    protected int nrOfFilesWithoutId = 0;
    protected int nrOfIgnoredFiles = 0;
    protected int nrOfFilesWithError = 0;

    /**
     * Retrieves all files with VALID_CMDI_EXTENSIONS from all DataRoot entries
     * and starts processing for every single file
     *
     * @throws MalformedURLException
     */
    void startImport() throws MalformedURLException {

        initSolrServer();
        List<DataRoot> dataRoots = checkDataRoots();
        long start = System.currentTimeMillis();
        try {
            // Delete the whole Solr db
            if (config.getDeleteAllFirst()) {
                LOG.info("Deleting original data...");
                solrServer.deleteByQuery("*:*");
                solrServer.commit();
                LOG.info("Deleting original data done.");
            }

            // Import the specified data roots
            for (DataRoot dataRoot : dataRoots) {
                LOG.info("Start of processing: " + dataRoot.getOriginName());
                if (dataRoot.deleteFirst()) {
                    LOG.info("Deleting data for data provider: " + dataRoot.getOriginName());
                    solrServer.deleteByQuery(FacetConstants.FIELD_DATA_PROVIDER + ":" + ClientUtils.escapeQueryChars(dataRoot.getOriginName()));
                    LOG.info("Deleting data of provider done.");
                }
                CMDIDataProcessor processor = new CMDIParserVTDXML(POST_PROCESSORS);
                List<File> files = getFilesFromDataRoot(dataRoot.getRootFile());
                for (File file : files) {
                    if (config.getMaxFileSize() > 0
                            && file.length() > config.getMaxFileSize()) {
                        LOG.info("Skipping " + file.getAbsolutePath() + " because it is too large.");
                    } else {
                        LOG.debug("PROCESSING FILE: " + file.getAbsolutePath());
                        processCmdi(file, dataRoot, processor);
                    }
                }
                if (!docs.isEmpty()) {
                    sendDocs();
                }
                LOG.info("End of processing: " + dataRoot.getOriginName());
            }

            // delete outdated entries (based on maxDaysInSolr parameter)
            if (config.getMaxDaysInSolr() > 0 && !config.getDeleteAllFirst()) {
                LOG.info("Deleting old files that were not seen for more than " + config.getMaxDaysInSolr() + " days...");
                solrServer.deleteByQuery(FacetConstants.FIELD_LAST_SEEN + ":[* TO NOW-" + config.getMaxDaysInSolr() + "DAYS]");
                LOG.info("Deleting old files done.");
            }
        } catch (SolrServerException e) {
            LOG.error("error updating files:\n", e);
            LOG.error("Also see vlo_solr server logs for more information");
        } catch (IOException e) {
            LOG.error("error updating files:\n", e);
        } finally {
            try {
                if (solrServer != null) {
                    solrServer.commit();
                    buildSuggesterIndex();
                }
            } catch (SolrServerException e) {
                LOG.error("cannot commit:\n", e);
            } catch (IOException e) {
                LOG.error("cannot commit:\n", e);
            }
        }
        long took = (System.currentTimeMillis() - start) / 1000;
        LOG.info("Found " + nrOfFilesWithoutId + " file(s) without an id. (id is generated based on fileName but that may not be unique)");
        LOG.info("Found " + nrOfFilesWithError + " file(s) with errors.");
        LOG.info("Found " + nrOfIgnoredFiles
                + " file(s) that were ignored (files without resources or any link to a search service or landing page are ignored).");
        LOG.info("Update of " + nrOFDocumentsSend + " took " + took + " secs. Total nr of files analyzed " + nrOfFilesAnalyzed);
    }

    /**
     * Checks a List of DataRoots for existence of the RootFile (typically the
     * parent directory of the metadata files)
     *
     * @return the list of DataRoots, all of which have an existing RootFile
     */
    protected List<DataRoot> checkDataRoots() {
        List<DataRoot> dataRoots = config.getDataRoots();
        for (DataRoot dataRoot : dataRoots) {
            if (!dataRoot.getRootFile().exists()) {
                LOG.error("Root file " + dataRoot.getRootFile() + " does not exist. Probable configuration error so stopping import.");
                System.exit(1);
            }
        }
        return dataRoots;
    }
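
    // Note that a single missing root file aborts the whole importer via
    // System.exit(1); there is no partial import over the remaining data roots.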

    /**
     * Gets the rootFile, or all files with VALID_CMDI_EXTENSIONS if rootFile
     * is a directory
     *
     * @param rootFile
     * @return List with the rootFile, or all contained files if rootFile is a
     * directory
     */
    protected List<File> getFilesFromDataRoot(File rootFile) {
        List<File> result = new ArrayList<File>();
        if (rootFile.isFile()) {
            result.add(rootFile);
        } else {
            // the 'true' argument makes listFiles recurse into subdirectories
            Collection<File> listFiles = FileUtils.listFiles(rootFile, VALID_CMDI_EXTENSIONS, true);
            result.addAll(listFiles);
        }
        return result;
    }

    /**
     * Creates an interface to the Solr server.
     *
     * After the interface has been created, the importer can send documents to
     * the server. Sending documents involves a queue: the importer adds
     * documents to the queue, and dedicated threads will empty it and
     * effectively store the documents.
     *
     * @throws MalformedURLException
     */
    protected void initSolrServer() throws MalformedURLException {
        String solrUrl = config.getSolrUrl();
        LOG.info("Initializing Solr Server on " + solrUrl);

        /* Specify the number of documents in the queue that will trigger the
         * threads, two of them, emptying it.
         */
        solrServer = new ConcurrentUpdateSolrServer(solrUrl,
                config.getMinDocsInSolrQueue(), 2) {
            /*
             * Let the super class method handle exceptions. Make the
             * exception available to the importer in the form of the
             * serverError variable.
             */
            @Override
            public void handleError(Throwable exception) {
                super.handleError(exception);
                serverError = exception;
            }
        };
    }
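
    // Note: ConcurrentUpdateSolrServer reports failures asynchronously via
    // handleError(), so an error in one batch only surfaces when sendDocs()
    // inspects the serverError variable while submitting a later batch.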

    /**
     * Processes a single CMDI file with a CMDIDataProcessor
     *
     * @param file CMDI input file
     * @param dataOrigin
     * @param processor
     * @throws SolrServerException
     * @throws IOException
     */
    protected void processCmdi(File file, DataRoot dataOrigin, CMDIDataProcessor processor) throws SolrServerException, IOException {
        nrOfFilesAnalyzed++;
        CMDIData cmdiData = null;
        try {
            cmdiData = processor.process(file);
            if (!idOk(cmdiData.getId())) {
                // No id found in the metadata file, so make one up based on the file name.
                // Not guaranteed to be unique, but we have to set something.
                cmdiData.setId(dataOrigin.getOriginName() + "/" + file.getName());
                nrOfFilesWithoutId++;
            }
        } catch (Exception e) {
            LOG.error("error in file: " + file + " Exception", e);
            nrOfFilesWithError++;
        }
        if (cmdiData != null && processedIds.add(cmdiData.getId())) {
            SolrInputDocument solrDocument = cmdiData.getSolrDocument();
            if (solrDocument != null) {
                if (!cmdiData.getDataResources().isEmpty() || !cmdiData.getLandingPageResources().isEmpty()
                        || !cmdiData.getSearchResources().isEmpty() || !cmdiData.getSearchPageResources().isEmpty()
                        || cmdiData.getMetadataResources().isEmpty()) {
                    // We only add metadata files that have
                    //  1) data resources, or
                    //  2) a landing page, or
                    //  3) a search service (like SRU/CQL), or
                    //  4) a search page, or
                    //  5) none of the above but also no metadata links (e.g. OLAC files describing a corpus with a link to the original archive).
                    // Other files will have only metadata resources and are considered 'collection' metadata files;
                    // they are usually not very interesting (think IMDI corpus files) and will not be included.
                    updateDocument(solrDocument, cmdiData, file, dataOrigin);
                } else {
                    nrOfIgnoredFiles++;
                }
            }
        }
    }

    /**
     * Checks an id for validity
     *
     * @param id
     * @return true if the id is acceptable, false otherwise
     */
    protected boolean idOk(String id) {
        return id != null && !id.isEmpty();
    }

    /**
     * Adds some additional information from the DataRoot to the solrDocument,
     * adds the solrDocument to the document list, and submits the list to the
     * Solr server whenever it reaches the configured maximum size
     * (maxDocsInList)
     *
     * @param solrDocument
     * @param cmdiData
     * @param file
     * @param dataOrigin
     * @throws SolrServerException
     * @throws IOException
     */
    protected void updateDocument(SolrInputDocument solrDocument, CMDIData cmdiData, File file, DataRoot dataOrigin) throws SolrServerException,
            IOException {
        if (!solrDocument.containsKey(FacetConstants.FIELD_COLLECTION)) {
            solrDocument.addField(FacetConstants.FIELD_COLLECTION, dataOrigin.getOriginName());
        }
        solrDocument.addField(FacetConstants.FIELD_DATA_PROVIDER, dataOrigin.getOriginName());
        solrDocument.addField(FacetConstants.FIELD_ID, cmdiData.getId());
        solrDocument.addField(FacetConstants.FIELD_FILENAME, file.getAbsolutePath());

        String metadataSourceUrl = dataOrigin.getPrefix();
        metadataSourceUrl += file.getAbsolutePath().substring(dataOrigin.getToStrip().length());

        solrDocument.addField(FacetConstants.FIELD_COMPLETE_METADATA, metadataSourceUrl);

        // add SearchServices (should be CQL endpoint)
        for (Resource resource : cmdiData.getSearchResources()) {
            solrDocument.addField(FacetConstants.FIELD_SEARCH_SERVICE, resource.getResourceName());
        }

        // add landing page resource
        for (Resource resource : cmdiData.getLandingPageResources()) {
            solrDocument.addField(FacetConstants.FIELD_LANDINGPAGE, resource.getResourceName());
        }

        // add search page resource
        for (Resource resource : cmdiData.getSearchPageResources()) {
            solrDocument.addField(FacetConstants.FIELD_SEARCHPAGE, resource.getResourceName());
        }

        // add timestamp; the 'Z' in the pattern is a literal UTC designator,
        // so the formatter must be set to UTC explicitly to produce a correct value
        Date dt = new Date();
        SimpleDateFormat df = new SimpleDateFormat("yyyy-MM-dd'T'HH:mm:ss'Z'");
        df.setTimeZone(TimeZone.getTimeZone("UTC"));
        solrDocument.addField(FacetConstants.FIELD_LAST_SEEN, df.format(dt));

        // add resource proxies
        addResourceData(solrDocument, cmdiData);
        docs.add(solrDocument);
        if (docs.size() == config.getMaxDocsInList()) {
            sendDocs();
        }
    }

    /**
     * Adds the two fields FIELD_FORMAT and FIELD_RESOURCE. The type can be
     * specified in the "ResourceType" element of an IMDI file, or possibly
     * overwritten by some more specific XPath (as in the LRT CMDI files). So
     * if a type is overwritten and already in the solrDocument, we take that
     * type.
     *
     * TODO evaluate odd connection between FIELD_FORMAT and ResourceProxy-Mimetypes
     */
    protected void addResourceData(SolrInputDocument solrDocument, CMDIData cmdiData) {
        List<Object> fieldValues = solrDocument.containsKey(FacetConstants.FIELD_FORMAT) ? new ArrayList<Object>(solrDocument
                .getFieldValues(FacetConstants.FIELD_FORMAT)) : null;
        solrDocument.removeField(FacetConstants.FIELD_FORMAT); // remove old values; they might be overwritten
        List<Resource> resources = cmdiData.getDataResources();
        for (int i = 0; i < resources.size(); i++) {
            Resource resource = resources.get(i);
            String mimeType = resource.getMimeType();
            String format = mimeType;
            if (mimeType == null) {
                if (fieldValues != null && i < fieldValues.size()) {
                    // assuming there will be as many formats overwritten as there are specified
                    format = fieldValues.get(i).toString();
                    mimeType = CommonUtils.normalizeMimeType(format);
                } else {
                    mimeType = CommonUtils.normalizeMimeType("");
                    format = mimeType;
                }
            } else {
                format = CommonUtils.normalizeMimeType(mimeType);
            }

            FormatPostProcessor processor = new FormatPostProcessor();
            mimeType = processor.process(mimeType);

            // TODO: this check should probably be moved into Solr (by using some minimum length filter)
            if (!mimeType.equals("")) {
                solrDocument.addField(FacetConstants.FIELD_FORMAT, mimeType);
            }
            solrDocument.addField(FacetConstants.FIELD_RESOURCE, mimeType + FacetConstants.FIELD_RESOURCE_SPLIT_CHAR
                    + resource.getResourceName());
        }
    }
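
    // The FIELD_RESOURCE value has the shape
    // "<mimeType><FIELD_RESOURCE_SPLIT_CHAR><resourceName>"; the actual split
    // character is defined in FacetConstants and is assumed not to occur in
    // MIME types, so the pair can be split again unambiguously on display.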

    /**
     * Sends the current list of SolrInputDocuments to the Solr server and
     * clears the list afterwards
     *
     * @throws SolrServerException
     * @throws IOException
     */
    protected void sendDocs() throws SolrServerException, IOException {
        LOG.info("Sending " + docs.size() + " docs to solr server. Total number of docs updated till now: " + nrOFDocumentsSend);
        nrOFDocumentsSend += docs.size();
        solrServer.add(docs);
        if (serverError != null) {
            // an asynchronous error from an earlier batch was recorded by handleError()
            throw new SolrServerException(serverError);
        }
        docs = new ArrayList<SolrInputDocument>();
    }

    /**
     * Builds the suggester index for autocompletion
     *
     * @throws SolrServerException
     * @throws MalformedURLException
     */
    private void buildSuggesterIndex() throws SolrServerException, MalformedURLException {
        LOG.info("Building index for autocompletion.");
        HashMap<String, String> paramMap = new HashMap<String, String>();
        paramMap.put("qt", "/suggest");
        paramMap.put("spellcheck.build", "true");
        SolrParams params = new MapSolrParams(paramMap);
        solrServer.query(params);
    }
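
    // The query above is roughly equivalent to requesting
    // <solrUrl>/suggest?spellcheck.build=true, assuming a suggester component
    // is configured under the /suggest request handler in solrconfig.xml.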

    public static VloConfig config;

    /**
     * @param args
     * @throws IOException
     */
    public static void main(String[] args) throws MalformedURLException, IOException {

        // path to the configuration file
        String configFile = null;
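
        // Typical invocation (hypothetical path; note that the value is
        // interpreted as a URL further below):
        //   java ... MetadataImporter -c file:/etc/vlo/VloConfig.xml
        // or, alternatively, via the system property:
        //   java -DconfigFile=file:/etc/vlo/VloConfig.xml ... MetadataImporter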

        // use the Apache commons-cli framework for getting command line parameters
        Options options = new Options();

        /**
         * Add a "c" option, the option indicating the specification of an XML
         * configuration file
         */
        options.addOption("c", true, "-c <file> : use parameters specified in <file>");

        CommandLineParser parser = new PosixParser();

        try {
            // parse the command line arguments
            CommandLine cmd = parser.parse(options, args);
            if (cmd.hasOption("c")) {
                // the "c" option was specified, now get its value
                configFile = cmd.getOptionValue("c");
            }
        } catch (org.apache.commons.cli.ParseException ex) {
            /**
             * Caught an exception caused by command line parsing. Try to get
             * the name of the configuration file by querying the system
             * property.
             */
            String message = "Command line parsing failed. " + ex.getMessage();
            LOG.error(message);
            System.err.println(message);
        }

        if (configFile == null) {
            String message = "Could not get config file name via the command line, trying the system properties.";
            LOG.info(message);

            String key = "configFile";
            configFile = System.getProperty(key);
        }

        if (configFile == null) {
            String message = "Could not get the file name as a system property either - stopping.";
            LOG.error(message);
        } else {
            // read the configuration from the externally supplied file
            XmlVloConfigFactory configFactory = new XmlVloConfigFactory(new URL(configFile));
            MetadataImporter.config = configFactory.newConfig();

            // optionally, modify the configuration here

            // create and start the importer
            MetadataImporter importer = new MetadataImporter();
            importer.startImport();

            // finished importing

            if (MetadataImporter.config.printMapping()) {
                File file = new File("xsdMapping.txt");
                FacetMappingFactory.printMapping(file);
                LOG.info("Printed facetMapping in " + file);
            }
        }
    }
}