source: vlo/branches/vlo-2.13-param/vlo_importer/src/main/java/eu/clarin/cmdi/vlo/importer/MetadataImporter.java @ 2659

Last change on this file since 2659 was 2659, checked in by keeloo, 11 years ago

Removed VloConfig.get() and made the member variables static themselves. Cleaned up the VloConfig class.

File size: 17.2 KB
Line 
1package eu.clarin.cmdi.vlo.importer;
2
3import eu.clarin.cmdi.vlo.CommonUtils;
4import eu.clarin.cmdi.vlo.FacetConstants;
5import eu.clarin.cmdi.vlo.config.DataRoot;
6import eu.clarin.cmdi.vlo.config.VloConfig;
7import java.io.File;
8import java.io.IOException;
9import java.net.MalformedURLException;
10import java.util.ArrayList;
11import java.util.Collection;
12import java.util.HashMap;
13import java.util.HashSet;
14import java.util.List;
15import java.util.Map;
16import java.util.Set;
17import org.apache.commons.cli.CommandLine;
18import org.apache.commons.cli.CommandLineParser;
19import org.apache.commons.cli.Options;
20import org.apache.commons.cli.PosixParser;
21import org.apache.commons.io.FileUtils;
22import org.apache.solr.client.solrj.SolrServerException;
23import org.apache.solr.client.solrj.impl.StreamingUpdateSolrServer;
24import org.apache.solr.client.solrj.util.ClientUtils;
25import org.apache.solr.common.SolrInputDocument;
26import org.apache.solr.common.params.MapSolrParams;
27import org.apache.solr.common.params.SolrParams;
28import org.slf4j.Logger;
29import org.slf4j.LoggerFactory;
30
31
32/**
33 * The main metadataImporter class. Also contains the main function.
34 *
35 * The metadataimporter reads all the config files and then, for each
36 * metadatafile in each defined directory structure parses and imports them as
37 * defined in the configuration. The startImport function starts the importing
38 * and so on.
39 */
40
41@SuppressWarnings({"serial"})
42public class MetadataImporter {
43
44    /**
45     * Defines which files to try and parse.
46     * In this case all files ending in "xml" or "cmdi".
47     */
48    private static final String[] VALID_CMDI_EXTENSIONS = new String[] { "xml", "cmdi" };
49
50    /**
51     * Log log log log
52     */
53    private final static Logger LOG = LoggerFactory.getLogger(MetadataImporter.class);
54    /**
55     * Some place to store errors.
56     */
57    private static Throwable serverError;
58    /**
59     * the solr server.
60     */
61    private StreamingUpdateSolrServer solrServer;
62    /**
63     * Defines the post-processor associations. At import, for each facet value,
64     * this map is checked and all postprocessors associated with the facet
65     * _type_ are applied to the value before storing the new value in the solr
66     * document.
67     */
68    final static Map<String, PostProcessor> POST_PROCESSORS = new HashMap<String, PostProcessor>();
69    static {
70        POST_PROCESSORS.put(FacetConstants.FIELD_COUNTRY, new CountryNamePostProcessor());
71        POST_PROCESSORS.put(FacetConstants.FIELD_LANGUAGE, new LanguageCodePostProcessor());
72        POST_PROCESSORS.put(FacetConstants.FIELD_RESOURCE_TYPE, new ResourceTypePostProcessor());
73        POST_PROCESSORS.put(FacetConstants.FIELD_LANGUAGES, new LanguageLinkPostProcessor());
74        POST_PROCESSORS.put(FacetConstants.FIELD_NATIONAL_PROJECT, new NationalProjectPostProcessor());
75        POST_PROCESSORS.put(FacetConstants.FIELD_CLARIN_PROFILE, new CMDIComponentProfileNamePostProcessor());
76    }
77   
78    // VLO configuration
79    VloConfig config;
80   
81    /**
82     * Constructor
83     *
84     * @param
85     */
86    public MetadataImporter (VloConfig config){
87        this.config = config;
88    }
89
90    /**
91     * Contains MDSelflinks (usually).
92     * Just to know what we have already done.
93     */
94    private final Set<String> processedIds = new HashSet<String>();
95    /**
96     * Some caching for solr documents (we are more efficient if we ram a whole
97     * bunch to the solr server at once.
98     */
99    protected List<SolrInputDocument> docs = new ArrayList<SolrInputDocument>();
100
101    // SOME STATS
102    private int nrOFDocumentsUpdated;
103    private int nrOfFilesAnalyzed = 0;
104    private int nrOfFilesWithoutId = 0;
105    private int nrOfFilesWithoutDataResources = 0;
106    private int nrOfFilesWithError = 0;
107
108    /**
109     * Retrieve all files with VALID_CMDI_EXTENSIONS from all DataRoot entries
110     * and starts processing for every single file
111     *
112     * @throws MalformedURLException
113     */
114    void startImport() throws MalformedURLException {
115
116        initSolrServer();
117        List<DataRoot> dataRoots = checkDataRoots();
118        long start = System.currentTimeMillis();
119        try {
120            // Delete the whole Solr db
121            if (VloConfig.isDeleteAllFirst()) {
122                LOG.info("Deleting original data...");
123                solrServer.deleteByQuery("*:*");
124                solrServer.commit();
125                LOG.info("Deleting original data done.");
126            }
127            for (DataRoot dataRoot : dataRoots) {
128                LOG.info("Start of processing: " + dataRoot.getOriginName());
129                if (dataRoot.isDeleteFirst()) {
130                    LOG.info("Deleting data for data provider: " + dataRoot.getOriginName());
131                    solrServer.deleteByQuery(FacetConstants.FIELD_DATA_PROVIDER + ":" + ClientUtils.escapeQueryChars(dataRoot.getOriginName()));
132                    LOG.info("Deleting data of provider done.");
133                }
134                CMDIDataProcessor processor = new CMDIParserVTDXML(POST_PROCESSORS);
135                List<File> files = getFilesFromDataRoot(dataRoot.getRootFile());
136                for (File file : files) {
137                    LOG.debug("PROCESSING FILE: " + file.getAbsolutePath());
138                    processCmdi(file, dataRoot, processor);
139                }
140                if (!docs.isEmpty()) {
141                    sendDocs();
142                }
143                LOG.info("End of processing: " + dataRoot.getOriginName());
144            }
145        } catch (SolrServerException e) {
146            LOG.error("error updating files:\n", e);
147            LOG.error("Also see vlo_solr server logs for more information");
148        } catch (IOException e) {
149            LOG.error("error updating files:\n", e);
150        } finally {
151            try {
152                if (solrServer != null) {
153                    solrServer.commit();
154                    buildSuggesterIndex();
155                }               
156            } catch (SolrServerException e) {
157                LOG.error("cannot commit:\n", e);
158            } catch (IOException e) {
159                LOG.error("cannot commit:\n", e);
160            }
161        }
162        long took = (System.currentTimeMillis() - start) / 1000;
163        LOG.info("Found " + nrOfFilesWithoutId + " file(s) without an id. (id is generated based on fileName but that may not be unique)");
164        LOG.info("Found " + nrOfFilesWithError + " file(s) with errors.");
165        LOG.info("Found " + nrOfFilesWithoutDataResources
166                + " file(s) without data resources (metadata descriptions without resources are ignored).");
167        LOG.info("Update of " + nrOFDocumentsUpdated + " took " + took + " secs. Total nr of files analyzed " + nrOfFilesAnalyzed);
168    }
169
170    /**
171     * Check a List of DataRoots for existence of RootFile (typically parent
172     * directory of metadata files)
173     *
174     * @return
175     */
176    private List<DataRoot> checkDataRoots() {
177        List<DataRoot> dataRoots = VloConfig.getDataRoots();
178        for (DataRoot dataRoot : dataRoots) {
179            if (!dataRoot.getRootFile().exists()) {
180                LOG.error("Root file " + dataRoot.getRootFile() + " does not exist. Probable configuration error so stopping import.");
181                System.exit(1);
182            }
183        }
184        return dataRoots;
185    }
186
187    /**
188     * Get the rootFile or all files with VALID_CMDI_EXTENSIONS if rootFile is a
189     * directory
190     *
191     * @param rootFile
192     * @return List with the rootFile or all contained files if rootFile is a
193     * directory
194     */
195    private List<File> getFilesFromDataRoot(File rootFile) {
196        List<File> result = new ArrayList<File>();
197        if (rootFile.isFile()) {
198            result.add(rootFile);
199        } else {
200            Collection<File> listFiles = FileUtils.listFiles(rootFile, VALID_CMDI_EXTENSIONS, true);
201            result.addAll(listFiles);
202        }
203        return result;
204    }
205
206    /**
207     * Initialize SolrServer as specified in configuration file
208     *
209     * @throws MalformedURLException
210     */
211    protected void initSolrServer() throws MalformedURLException {
212        String solrUrl = VloConfig.getSolrUrl();
213        LOG.info("Initializing Solr Server on " + solrUrl);
214        solrServer = new StreamingUpdateSolrServer(solrUrl, 1000, 2) {
215            @Override
216            public void handleError(Throwable ex) {
217                super.handleError(ex);
218                serverError = ex;
219            }
220        };
221    }
222
223    /**
224     * Process single CMDI file with CMDIDataProcessor
225     *
226     * @param file CMDI input file
227     * @param dataOrigin
228     * @param processor
229     * @throws SolrServerException
230     * @throws IOException
231     */
232    private void processCmdi(File file, DataRoot dataOrigin, CMDIDataProcessor processor) throws SolrServerException, IOException {
233        nrOfFilesAnalyzed++;
234        CMDIData cmdiData = null;
235        try {
236            cmdiData = processor.process(file);
237            if (!idOk(cmdiData.getId())) {
238                cmdiData.setId(dataOrigin.getOriginName() + "/" + file.getName()); //No id found in the metadata file so making one up based on the file name. Not quaranteed to be unique, but we have to set something.
239                nrOfFilesWithoutId++;
240            }
241        } catch (Exception e) {
242            LOG.error("error in file: " + file + " Exception", e);
243            nrOfFilesWithError++;
244        }
245        if (cmdiData != null && processedIds.add(cmdiData.getId())) {
246            SolrInputDocument solrDocument = cmdiData.getSolrDocument();
247            if (solrDocument != null) {
248                if (!cmdiData.getDataResources().isEmpty() || cmdiData.getMetadataResources().isEmpty()) {
249                    // We only add metadata files that have data resources (1) or files that don't link to other metadata files (2):
250                    //  1) files with data resources are obviously interesting
251                    //  2) files without metadata links and without dataResource can be interesting e.g. olac files describing a corpus with a link to the original archive.
252                    // Other files will have only metadata resources and are considered 'collection' metadata files they
253                    // are usually not very interesting (think imdi corpus files) and will not be included.
254                    updateDocument(solrDocument, cmdiData, file, dataOrigin);
255                } else {
256                    nrOfFilesWithoutDataResources++;
257                }
258            }
259        }
260    }
261
262    /**
263     * Check id for validness
264     *
265     * @param id
266     * @return true if id is acceptable, false otherwise
267     */
268    private boolean idOk(String id) {
269        return id != null && !id.isEmpty();
270    }
271
272    /**
273     * Adds some additional information from DataRoot to solrDocument, add
274     * solrDocument to document list, submits list to SolrServer every 1000
275     * files
276     *
277     * @param solrDocument
278     * @param cmdiData
279     * @param file
280     * @param dataOrigin
281     * @throws SolrServerException
282     * @throws IOException
283     */
284    private void updateDocument(SolrInputDocument solrDocument, CMDIData cmdiData, File file, DataRoot dataOrigin) throws SolrServerException,
285            IOException {
286        if (!solrDocument.containsKey(FacetConstants.FIELD_COLLECTION)) {
287            solrDocument.addField(FacetConstants.FIELD_COLLECTION, dataOrigin.getOriginName());
288        }
289        solrDocument.addField(FacetConstants.FIELD_DATA_PROVIDER, dataOrigin.getOriginName());
290        solrDocument.addField(FacetConstants.FIELD_ID, cmdiData.getId());
291        solrDocument.addField(FacetConstants.FIELD_FILENAME, file.getAbsolutePath());
292
293        String metadataSourceUrl = dataOrigin.getPrefix();
294        //System.out.println(dataOrigin.getTostrip());
295        //System.out.println(dataOrigin.getTostrip().length());
296        //System.out.println(file.getAbsolutePath());
297        metadataSourceUrl += file.getAbsolutePath().substring(dataOrigin.getTostrip().length());
298
299        solrDocument.addField(FacetConstants.FIELD_COMPLETE_METADATA, metadataSourceUrl);
300       
301        // add SearchServices (should be CQL endpoint)
302        for(Resource resource : cmdiData.getSearchResources())
303                solrDocument.addField(FacetConstants.FIELD_SEARCH_SERVICE, resource.getResourceName());       
304       
305        addResourceData(solrDocument, cmdiData);
306        docs.add(solrDocument);
307        if (docs.size() == 1000) {
308            sendDocs();
309        }
310    }
311
312    /**
313     * Adds two fields FIELD_RESOURCE_TYPE and FIELD_RESOURCE. The Type can be
314     * specified in the "ResourceType" element of an imdi file or possibly
315     * overwritten by some more specific xpath (as in the LRT cmdi files). So if
316     * a type is overwritten and already in the solrDocument we take that type.
317     */
318    private void addResourceData(SolrInputDocument solrDocument, CMDIData cmdiData) {
319        List<Object> fieldValues = solrDocument.containsKey(FacetConstants.FIELD_RESOURCE_TYPE) ? new ArrayList<Object>(solrDocument
320                .getFieldValues(FacetConstants.FIELD_RESOURCE_TYPE)) : null;
321        solrDocument.removeField(FacetConstants.FIELD_RESOURCE_TYPE); //Remove old values they might be overwritten.
322        List<Resource> resources = cmdiData.getDataResources();
323        for (int i = 0; i < resources.size(); i++) {
324            Resource resource = resources.get(i);
325            String mimeType = resource.getMimeType();
326            String resourceType = mimeType;
327            if (mimeType == null) {
328                if (fieldValues != null && i < fieldValues.size()) {
329                    resourceType = fieldValues.get(i).toString(); //assuming there will be as many resource types overwritten as there are specified
330                    mimeType = CommonUtils.normalizeMimeType(resourceType);
331                } else {
332                    mimeType = CommonUtils.normalizeMimeType("");
333                    resourceType = mimeType;
334                }
335            } else {
336                resourceType = CommonUtils.normalizeMimeType(mimeType);
337            }
338            solrDocument.addField(FacetConstants.FIELD_RESOURCE_TYPE, resourceType);
339            solrDocument.addField(FacetConstants.FIELD_RESOURCE, mimeType + FacetConstants.FIELD_RESOURCE_SPLIT_CHAR
340                    + resource.getResourceName());
341        }
342    }
343
344    /**
345     * Send current list of SolrImputDocuments to SolrServer and clears list
346     * afterwards
347     *
348     * @throws SolrServerException
349     * @throws IOException
350     */
351    protected void sendDocs() throws SolrServerException, IOException {
352        LOG.info("Sending " + docs.size() + " docs to solr server. Total number of docs updated till now: " + nrOFDocumentsUpdated);
353        nrOFDocumentsUpdated += docs.size();
354        solrServer.add(docs);
355        if (serverError != null) {
356            throw new SolrServerException(serverError);
357        }
358        docs = new ArrayList<SolrInputDocument>();
359    }
360   
361    /**
362     * Builds suggester index for autocompletion
363     *
364     * @throws SolrServerException
365     * @throws MalformedURLException
366     */
367    private void buildSuggesterIndex() throws SolrServerException, MalformedURLException {
368        LOG.info("Building index for autocompletion.");
369        HashMap<String,String> paramMap = new HashMap<String, String>();
370        paramMap.put("qt", "/suggest");
371        paramMap.put("spellcheck.build", "true");
372        SolrParams params = new MapSolrParams(paramMap);
373        solrServer.query(params);
374    }
375
376    /**
377     * @param args
378     * @throws IOException
379     */
380    public static void main(String[] args) throws MalformedURLException, IOException {
381
382        // application configuration
383        VloConfig config;
384       
385        // use the Apache cli framework for getting command line parameters
386        Options options = new Options();
387
388        /**
389         * Add a "c" option, the option indicating the specification of an XML
390         * configuration file
391         */
392        options.addOption("c", true, "-c <file> : use parameters specified in <file>");
393
394        CommandLineParser parser = new PosixParser();
395
396        try {
397            // parse the command line arguments
398            CommandLine cmd = parser.parse(options, args);
399            if (cmd.hasOption("c")) {
400               
401                // the "c" option was specified, now get its value
402                String fileName;
403                fileName = cmd.getOptionValue("c");
404               
405                // optionally, check for file existence here
406               
407                // read the configuration defined in the file
408                config = VloConfig.readTestConfig(fileName);
409
410                // optionally, modify the configuration here
411               
412                // create and start the importer
413                MetadataImporter importer = new MetadataImporter(config);
414                importer.startImport();
415               
416                // finished importing
417               
418                if (config.isPrintMapping()) {
419                    File file = new File("xsdMapping.txt");
420                    FacetMappingFactory.printMapping(file);
421                    LOG.info("Printed facetMapping in " + file);
422                }
423            }
424
425        } catch (org.apache.commons.cli.ParseException ex) {
426           
427            // caught an exception caused by command line parsing
428           
429            String message = "Command line parsing failed. " + ex.getMessage();
430                   
431            LOG.error(message);
432            System.err.println(message);
433        }
434    }
435}
Note: See TracBrowser for help on using the repository browser.