source: vlo/trunk/vlo_importer/src/main/java/eu/clarin/cmdi/vlo/importer/MetadataImporter.java @ 2826

Last change on this file since 2826 was 2826, checked in by keeloo, 11 years ago

Repaired the importer test: made it transparent regarding the
deleteAllFirst parameter.

Added the parameter and logic that causes large files to be
filtered. Added accompanying tests in the vlo test configuration
package.

File size: 18.2 KB
Line 
1package eu.clarin.cmdi.vlo.importer;
2
3import eu.clarin.cmdi.vlo.CommonUtils;
4import eu.clarin.cmdi.vlo.FacetConstants;
5import eu.clarin.cmdi.vlo.config.DataRoot;
6import eu.clarin.cmdi.vlo.config.VloConfig;
7import java.io.File;
8import java.io.IOException;
9import java.net.MalformedURLException;
10import java.util.ArrayList;
11import java.util.Collection;
12import java.util.HashMap;
13import java.util.HashSet;
14import java.util.List;
15import java.util.Map;
16import java.util.Set;
17import org.apache.commons.cli.CommandLine;
18import org.apache.commons.cli.CommandLineParser;
19import org.apache.commons.cli.Options;
20import org.apache.commons.cli.PosixParser;
21import org.apache.commons.io.FileUtils;
22import org.apache.solr.client.solrj.SolrServerException;
23import org.apache.solr.client.solrj.impl.StreamingUpdateSolrServer;
24import org.apache.solr.client.solrj.util.ClientUtils;
25import org.apache.solr.common.SolrInputDocument;
26import org.apache.solr.common.params.MapSolrParams;
27import org.apache.solr.common.params.SolrParams;
28import org.slf4j.Logger;
29import org.slf4j.LoggerFactory;
30
31
32/**
33 * The main metadataImporter class. Also contains the main function.
34 *
35 * The metadataimporter reads all the config files and then, for each
36 * metadatafile in each defined directory structure parses and imports them as
37 * defined in the configuration. The startImport function starts the importing
38 * and so on.
39 */
40
41@SuppressWarnings({"serial"})
42public class MetadataImporter {
43
44    /**
45     * Defines which files to try and parse.
46     * In this case all files ending in "xml" or "cmdi".
47     */
48    private static final String[] VALID_CMDI_EXTENSIONS = new String[] { "xml", "cmdi" };
49
50    /**
51     * Log log log log
52     */
53    protected final static Logger LOG = LoggerFactory.getLogger(MetadataImporter.class);
54    /**
55     * Some place to store errors.
56     */
57    private static Throwable serverError;
58    /**
59     * the solr server.
60     */
61    private StreamingUpdateSolrServer solrServer;
62    /**
63     * Defines the post-processor associations. At import, for each facet value,
64     * this map is checked and all postprocessors associated with the facet
65     * _type_ are applied to the value before storing the new value in the solr
66     * document.
67     */
68    final static Map<String, PostProcessor> POST_PROCESSORS = new HashMap<String, PostProcessor>();
69    static {
70        POST_PROCESSORS.put(FacetConstants.FIELD_COUNTRY, new CountryNamePostProcessor());
71        POST_PROCESSORS.put(FacetConstants.FIELD_LANGUAGE, new LanguageCodePostProcessor());
72        POST_PROCESSORS.put(FacetConstants.FIELD_RESOURCE_TYPE, new ResourceTypePostProcessor());
73        POST_PROCESSORS.put(FacetConstants.FIELD_LANGUAGES, new LanguageLinkPostProcessor());
74        POST_PROCESSORS.put(FacetConstants.FIELD_NATIONAL_PROJECT, new NationalProjectPostProcessor());
75        POST_PROCESSORS.put(FacetConstants.FIELD_CLARIN_PROFILE, new CMDIComponentProfileNamePostProcessor());
76    }
77   
78    /**
79     * Constructor
80     *
81     * @param
82     */
83    public MetadataImporter (){
84    }
85
86    /**
87     * Contains MDSelflinks (usually).
88     * Just to know what we have already done.
89     */
90    protected final Set<String> processedIds = new HashSet<String>();
91    /**
92     * Some caching for solr documents (we are more efficient if we ram a whole
93     * bunch to the solr server at once.
94     */
95    protected List<SolrInputDocument> docs = new ArrayList<SolrInputDocument>();
96
97    // SOME STATS
98    protected int nrOFDocumentsUpdated;
99    protected int nrOfFilesAnalyzed = 0;
100    protected int nrOfFilesWithoutId = 0;
101    protected int nrOfFilesWithoutDataResources = 0;
102    protected int nrOfFilesWithError = 0;
103
104    /**
105     * Retrieve all files with VALID_CMDI_EXTENSIONS from all DataRoot entries
106     * and starts processing for every single file
107     *
108     * @throws MalformedURLException
109     */
110    void startImport() throws MalformedURLException {
111
112        initSolrServer();
113        List<DataRoot> dataRoots = checkDataRoots();
114        long start = System.currentTimeMillis();
115        try {
116            // Delete the whole Solr db
117            if (VloConfig.deleteAllFirst()) {
118                LOG.info("Deleting original data...");
119                solrServer.deleteByQuery("*:*");
120                solrServer.commit();
121                LOG.info("Deleting original data done.");
122            }
123            for (DataRoot dataRoot : dataRoots) {
124                LOG.info("Start of processing: " + dataRoot.getOriginName());
125                if (dataRoot.deleteFirst()) {
126                    LOG.info("Deleting data for data provider: " + dataRoot.getOriginName());
127                    solrServer.deleteByQuery(FacetConstants.FIELD_DATA_PROVIDER + ":" + ClientUtils.escapeQueryChars(dataRoot.getOriginName()));
128                    LOG.info("Deleting data of provider done.");
129                }
130                CMDIDataProcessor processor = new CMDIParserVTDXML(POST_PROCESSORS);
131                List<File> files = getFilesFromDataRoot(dataRoot.getRootFile());
132                for (File file : files) {
133                    if (VloConfig.getUseMaxFileSize() && 
134                            file.length() > VloConfig.getMaxFileSize()) {
135                        LOG.info("Skipping " + file.getAbsolutePath() + " because it is too large.");
136                    } else {
137                        LOG.debug("PROCESSING FILE: " + file.getAbsolutePath());
138                        processCmdi(file, dataRoot, processor);
139                    }
140                }
141                if (!docs.isEmpty()) {
142                    sendDocs();
143                }
144                LOG.info("End of processing: " + dataRoot.getOriginName());
145            }
146        } catch (SolrServerException e) {
147            LOG.error("error updating files:\n", e);
148            LOG.error("Also see vlo_solr server logs for more information");
149        } catch (IOException e) {
150            LOG.error("error updating files:\n", e);
151        } finally {
152            try {
153                if (solrServer != null) {
154                    solrServer.commit();
155                    buildSuggesterIndex();
156                }               
157            } catch (SolrServerException e) {
158                LOG.error("cannot commit:\n", e);
159            } catch (IOException e) {
160                LOG.error("cannot commit:\n", e);
161            }
162        }
163        long took = (System.currentTimeMillis() - start) / 1000;
164        LOG.info("Found " + nrOfFilesWithoutId + " file(s) without an id. (id is generated based on fileName but that may not be unique)");
165        LOG.info("Found " + nrOfFilesWithError + " file(s) with errors.");
166        LOG.info("Found " + nrOfFilesWithoutDataResources
167                + " file(s) without data resources (metadata descriptions without resources are ignored).");
168        LOG.info("Update of " + nrOFDocumentsUpdated + " took " + took + " secs. Total nr of files analyzed " + nrOfFilesAnalyzed);
169    }
170
171    /**
172     * Check a List of DataRoots for existence of RootFile (typically parent
173     * directory of metadata files)
174     *
175     * @return
176     */
177    protected List<DataRoot> checkDataRoots() {
178        List<DataRoot> dataRoots = VloConfig.getDataRoots();
179        for (DataRoot dataRoot : dataRoots) {
180            if (!dataRoot.getRootFile().exists()) {
181                LOG.error("Root file " + dataRoot.getRootFile() + " does not exist. Probable configuration error so stopping import.");
182                System.exit(1);
183            }
184        }
185        return dataRoots;
186    }
187
188    /**
189     * Get the rootFile or all files with VALID_CMDI_EXTENSIONS if rootFile is a
190     * directory
191     *
192     * @param rootFile
193     * @return List with the rootFile or all contained files if rootFile is a
194     * directory
195     */
196    protected List<File> getFilesFromDataRoot(File rootFile) {
197        List<File> result = new ArrayList<File>();
198        if (rootFile.isFile()) {
199            result.add(rootFile);
200        } else {
201            Collection<File> listFiles = FileUtils.listFiles(rootFile, VALID_CMDI_EXTENSIONS, true);
202            result.addAll(listFiles);
203        }
204        return result;
205    }
206
207    /**
208     * Initialize SolrServer as specified in configuration file
209     *
210     * @throws MalformedURLException
211     */
212    protected void initSolrServer() throws MalformedURLException {
213        String solrUrl = VloConfig.getSolrUrl();
214        LOG.info("Initializing Solr Server on " + solrUrl);
215        solrServer = new StreamingUpdateSolrServer(solrUrl, 1000, 2) {
216            @Override
217            public void handleError(Throwable ex) {
218                super.handleError(ex);
219                serverError = ex;
220            }
221        };
222    }
223
224    /**
225     * Process single CMDI file with CMDIDataProcessor
226     *
227     * @param file CMDI input file
228     * @param dataOrigin
229     * @param processor
230     * @throws SolrServerException
231     * @throws IOException
232     */
233    protected void processCmdi(File file, DataRoot dataOrigin, CMDIDataProcessor processor) throws SolrServerException, IOException {
234        nrOfFilesAnalyzed++;
235        CMDIData cmdiData = null;
236        try {
237            cmdiData = processor.process(file);
238            if (!idOk(cmdiData.getId())) {
239                cmdiData.setId(dataOrigin.getOriginName() + "/" + file.getName()); //No id found in the metadata file so making one up based on the file name. Not quaranteed to be unique, but we have to set something.
240                nrOfFilesWithoutId++;
241            }
242        } catch (Exception e) {
243            LOG.error("error in file: " + file + " Exception", e);
244            nrOfFilesWithError++;
245        }
246        if (cmdiData != null && processedIds.add(cmdiData.getId())) {
247            SolrInputDocument solrDocument = cmdiData.getSolrDocument();
248            if (solrDocument != null) {
249                if (!cmdiData.getDataResources().isEmpty() || cmdiData.getMetadataResources().isEmpty()) {
250                    // We only add metadata files that have data resources (1) or files that don't link to other metadata files (2):
251                    //  1) files with data resources are obviously interesting
252                    //  2) files without metadata links and without dataResource can be interesting e.g. olac files describing a corpus with a link to the original archive.
253                    // Other files will have only metadata resources and are considered 'collection' metadata files they
254                    // are usually not very interesting (think imdi corpus files) and will not be included.
255                    updateDocument(solrDocument, cmdiData, file, dataOrigin);
256                } else {
257                    nrOfFilesWithoutDataResources++;
258                }
259            }
260        }
261    }
262
263    /**
264     * Check id for validness
265     *
266     * @param id
267     * @return true if id is acceptable, false otherwise
268     */
269    protected boolean idOk(String id) {
270        return id != null && !id.isEmpty();
271    }
272
273    /**
274     * Adds some additional information from DataRoot to solrDocument, add
275     * solrDocument to document list, submits list to SolrServer every 1000
276     * files
277     *
278     * @param solrDocument
279     * @param cmdiData
280     * @param file
281     * @param dataOrigin
282     * @throws SolrServerException
283     * @throws IOException
284     */
285    protected void updateDocument(SolrInputDocument solrDocument, CMDIData cmdiData, File file, DataRoot dataOrigin) throws SolrServerException,
286            IOException {
287        if (!solrDocument.containsKey(FacetConstants.FIELD_COLLECTION)) {
288            solrDocument.addField(FacetConstants.FIELD_COLLECTION, dataOrigin.getOriginName());
289        }
290        solrDocument.addField(FacetConstants.FIELD_DATA_PROVIDER, dataOrigin.getOriginName());
291        solrDocument.addField(FacetConstants.FIELD_ID, cmdiData.getId());
292        solrDocument.addField(FacetConstants.FIELD_FILENAME, file.getAbsolutePath());
293
294        String metadataSourceUrl = dataOrigin.getPrefix();
295        //System.out.println(dataOrigin.getTostrip());
296        //System.out.println(dataOrigin.getTostrip().length());
297        //System.out.println(file.getAbsolutePath());
298        metadataSourceUrl += file.getAbsolutePath().substring(dataOrigin.getToStrip().length());
299
300        solrDocument.addField(FacetConstants.FIELD_COMPLETE_METADATA, metadataSourceUrl);
301
302        // add SearchServices (should be CQL endpoint)
303        for (Resource resource : cmdiData.getSearchResources()) {
304            solrDocument.addField(FacetConstants.FIELD_SEARCH_SERVICE, resource.getResourceName());
305        }
306
307        // add landing page resource
308        for (Resource resource : cmdiData.getLandingPageResources()) {
309            solrDocument.addField(FacetConstants.FIELD_LANDINGPAGE, resource.getResourceName());
310        }
311       
312        addResourceData(solrDocument, cmdiData);
313        docs.add(solrDocument);
314        if (docs.size() == VloConfig.getMaxOnHeap()) {
315            sendDocs();
316        }
317    }
318
319    /**
320     * Adds two fields FIELD_RESOURCE_TYPE and FIELD_RESOURCE. The Type can be
321     * specified in the "ResourceType" element of an imdi file or possibly
322     * overwritten by some more specific xpath (as in the LRT cmdi files). So if
323     * a type is overwritten and already in the solrDocument we take that type.
324     */
325    protected void addResourceData(SolrInputDocument solrDocument, CMDIData cmdiData) {
326        List<Object> fieldValues = solrDocument.containsKey(FacetConstants.FIELD_RESOURCE_TYPE) ? new ArrayList<Object>(solrDocument
327                .getFieldValues(FacetConstants.FIELD_RESOURCE_TYPE)) : null;
328        solrDocument.removeField(FacetConstants.FIELD_RESOURCE_TYPE); //Remove old values they might be overwritten.
329        List<Resource> resources = cmdiData.getDataResources();
330        for (int i = 0; i < resources.size(); i++) {
331            Resource resource = resources.get(i);
332            String mimeType = resource.getMimeType();
333            String resourceType = mimeType;
334            if (mimeType == null) {
335                if (fieldValues != null && i < fieldValues.size()) {
336                    resourceType = fieldValues.get(i).toString(); //assuming there will be as many resource types overwritten as there are specified
337                    mimeType = CommonUtils.normalizeMimeType(resourceType);
338                } else {
339                    mimeType = CommonUtils.normalizeMimeType("");
340                    resourceType = mimeType;
341                }
342            } else {
343                resourceType = CommonUtils.normalizeMimeType(mimeType);
344            }
345            solrDocument.addField(FacetConstants.FIELD_RESOURCE_TYPE, resourceType);
346            solrDocument.addField(FacetConstants.FIELD_RESOURCE, mimeType + FacetConstants.FIELD_RESOURCE_SPLIT_CHAR
347                    + resource.getResourceName());
348        }
349    }
350
351    /**
352     * Send current list of SolrImputDocuments to SolrServer and clears list
353     * afterwards
354     *
355     * @throws SolrServerException
356     * @throws IOException
357     */
358    protected void sendDocs() throws SolrServerException, IOException {
359        LOG.info("Sending " + docs.size() + " docs to solr server. Total number of docs updated till now: " + nrOFDocumentsUpdated);
360        nrOFDocumentsUpdated += docs.size();
361        solrServer.add(docs);
362        if (serverError != null) {
363            throw new SolrServerException(serverError);
364        }
365        docs = new ArrayList<SolrInputDocument>();
366    }
367   
368    /**
369     * Builds suggester index for autocompletion
370     *
371     * @throws SolrServerException
372     * @throws MalformedURLException
373     */
374    private void buildSuggesterIndex() throws SolrServerException, MalformedURLException {
375        LOG.info("Building index for autocompletion.");
376        HashMap<String,String> paramMap = new HashMap<String, String>();
377        paramMap.put("qt", "/suggest");
378        paramMap.put("spellcheck.build", "true");
379        SolrParams params = new MapSolrParams(paramMap);
380        solrServer.query(params);
381    }
382
383    /**
384     * @param args
385     * @throws IOException
386     */
387    public static void main(String[] args) throws MalformedURLException, IOException { 
388
389        // application configuration
390        VloConfig config;
391       
392        // path to the configuration file
393        String configFile = null;
394       
395        // use the Apache cli framework for getting command line parameters
396        Options options = new Options();
397
398        /**
399         * Add a "c" option, the option indicating the specification of an XML
400         * configuration file
401         */
402        options.addOption("c", true, "-c <file> : use parameters specified in <file>");
403
404        CommandLineParser parser = new PosixParser();
405
406        try {
407            // parse the command line arguments
408            CommandLine cmd = parser.parse(options, args);
409            if (cmd.hasOption("c")) {
410               
411                // the "c" option was specified, now get its value
412                configFile = cmd.getOptionValue("c");
413            }
414
415        } catch (org.apache.commons.cli.ParseException ex) {
416           
417            /**
418             * Caught an exception caused by command line parsing. Try to get
419             * the name of the configuration file by querying the system
420             * property.
421             */
422
423            String message = "Command line parsing failed. " + ex.getMessage();
424            LOG.error(message);
425            System.err.println(message);
426        }
427       
428        if (configFile == null){
429
430            String message;
431
432            message = "Could not get config file name via the command line, trying the system properties.";
433            LOG.info(message);
434           
435            String key;
436
437            key = "configFile";
438            configFile = System.getProperty(key);
439        }
440
441        if (configFile == null) {
442           
443            String message;
444           
445            message = "Could not get filename as system property either - stopping.";
446            LOG.error(message);
447        } else {
448            // read the configuration from the externally supplied file
449            VloConfig.readConfig(configFile);
450
451            // optionally, modify the configuration here
452
453            // create and start the importer
454            MetadataImporter importer = new MetadataImporter();
455            importer.startImport();
456
457            // finished importing
458
459            if (VloConfig.printMapping()) {
460                File file = new File("xsdMapping.txt");
461                FacetMappingFactory.printMapping(file);
462                LOG.info("Printed facetMapping in " + file);
463            }
464        }
465    }
466}
Note: See TracBrowser for help on using the repository browser.