source: vlo/branches/vlo-2.13-param/vlo_importer/src/main/java/eu/clarin/cmdi/vlo/importer/MetadataImporter.java @ 2660

Last change on this file was revision 2660, checked in by keeloo, 11 years ago

Made VloConfig truly static.

File size: 17.1 KB
package eu.clarin.cmdi.vlo.importer;

import eu.clarin.cmdi.vlo.CommonUtils;
import eu.clarin.cmdi.vlo.FacetConstants;
import eu.clarin.cmdi.vlo.config.DataRoot;
import eu.clarin.cmdi.vlo.config.VloConfig;
import java.io.File;
import java.io.IOException;
import java.net.MalformedURLException;
import java.util.ArrayList;
import java.util.Collection;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;
import org.apache.commons.cli.CommandLine;
import org.apache.commons.cli.CommandLineParser;
import org.apache.commons.cli.Options;
import org.apache.commons.cli.PosixParser;
import org.apache.commons.io.FileUtils;
import org.apache.solr.client.solrj.SolrServerException;
import org.apache.solr.client.solrj.impl.StreamingUpdateSolrServer;
import org.apache.solr.client.solrj.util.ClientUtils;
import org.apache.solr.common.SolrInputDocument;
import org.apache.solr.common.params.MapSolrParams;
import org.apache.solr.common.params.SolrParams;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/**
 * The main MetadataImporter class. Also contains the main method.
 *
 * The metadata importer reads all the configuration files and then, for each
 * metadata file in each configured directory structure, parses the file and
 * imports it as defined in the configuration. The startImport method starts
 * the import.
 */

@SuppressWarnings({"serial"})
public class MetadataImporter {

    /**
     * Defines which files to try and parse.
     * In this case all files ending in "xml" or "cmdi".
     */
    private static final String[] VALID_CMDI_EXTENSIONS = new String[] { "xml", "cmdi" };

    /**
     * Logger for this class.
     */
    private final static Logger LOG = LoggerFactory.getLogger(MetadataImporter.class);
    /**
     * Holds the last error reported by the Solr server.
     */
    private static Throwable serverError;
    /**
     * The Solr server used for indexing.
     */
    private StreamingUpdateSolrServer solrServer;
    /**
     * Defines the post-processor associations. At import, for each facet value,
     * this map is checked and all post-processors associated with the facet
     * _type_ are applied to the value before storing the new value in the Solr
     * document.
     */
    final static Map<String, PostProcessor> POST_PROCESSORS = new HashMap<String, PostProcessor>();
    static {
        POST_PROCESSORS.put(FacetConstants.FIELD_COUNTRY, new CountryNamePostProcessor());
        POST_PROCESSORS.put(FacetConstants.FIELD_LANGUAGE, new LanguageCodePostProcessor());
        POST_PROCESSORS.put(FacetConstants.FIELD_RESOURCE_TYPE, new ResourceTypePostProcessor());
        POST_PROCESSORS.put(FacetConstants.FIELD_LANGUAGES, new LanguageLinkPostProcessor());
        POST_PROCESSORS.put(FacetConstants.FIELD_NATIONAL_PROJECT, new NationalProjectPostProcessor());
        POST_PROCESSORS.put(FacetConstants.FIELD_CLARIN_PROFILE, new CMDIComponentProfileNamePostProcessor());
    }

    /**
     * Constructor
     */
    public MetadataImporter() {
    }

    /**
     * Contains the IDs of all records processed so far (usually their
     * MdSelfLinks), used to skip documents that have already been imported.
     */
    private final Set<String> processedIds = new HashSet<String>();
    /**
     * Buffer of Solr documents; sending a whole batch to the Solr server at
     * once is more efficient than sending documents one by one.
     */
    protected List<SolrInputDocument> docs = new ArrayList<SolrInputDocument>();

    // SOME STATS
    private int nrOFDocumentsUpdated;
    private int nrOfFilesAnalyzed = 0;
    private int nrOfFilesWithoutId = 0;
    private int nrOfFilesWithoutDataResources = 0;
    private int nrOfFilesWithError = 0;

    /**
     * Retrieves all files with VALID_CMDI_EXTENSIONS from all DataRoot entries
     * and starts processing for every single file.
     *
     * @throws MalformedURLException
     */
    void startImport() throws MalformedURLException {

        initSolrServer();
        List<DataRoot> dataRoots = checkDataRoots();
        long start = System.currentTimeMillis();
        try {
            // Delete the whole Solr db
            if (VloConfig.isDeleteAllFirst()) {
                LOG.info("Deleting original data...");
                solrServer.deleteByQuery("*:*");
                solrServer.commit();
                LOG.info("Deleting original data done.");
            }
            for (DataRoot dataRoot : dataRoots) {
                LOG.info("Start of processing: " + dataRoot.getOriginName());
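                // If configured for this data root, first delete all documents
                // previously imported for this data provider so that stale
                // records from earlier imports do not linger in the index.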
                if (dataRoot.isDeleteFirst()) {
                    LOG.info("Deleting data for data provider: " + dataRoot.getOriginName());
                    solrServer.deleteByQuery(FacetConstants.FIELD_DATA_PROVIDER + ":" + ClientUtils.escapeQueryChars(dataRoot.getOriginName()));
                    LOG.info("Deleting data of provider done.");
                }
                CMDIDataProcessor processor = new CMDIParserVTDXML(POST_PROCESSORS);
                List<File> files = getFilesFromDataRoot(dataRoot.getRootFile());
                for (File file : files) {
                    LOG.debug("PROCESSING FILE: " + file.getAbsolutePath());
                    processCmdi(file, dataRoot, processor);
                }
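                // Flush any documents still buffered for this data root.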
                if (!docs.isEmpty()) {
                    sendDocs();
                }
                LOG.info("End of processing: " + dataRoot.getOriginName());
            }
        } catch (SolrServerException e) {
            LOG.error("error updating files:\n", e);
            LOG.error("Also see vlo_solr server logs for more information");
        } catch (IOException e) {
            LOG.error("error updating files:\n", e);
        } finally {
            try {
                if (solrServer != null) {
                    solrServer.commit();
                    buildSuggesterIndex();
                }
            } catch (SolrServerException e) {
                LOG.error("cannot commit:\n", e);
            } catch (IOException e) {
                LOG.error("cannot commit:\n", e);
            }
        }
        long took = (System.currentTimeMillis() - start) / 1000;
        LOG.info("Found " + nrOfFilesWithoutId + " file(s) without an id. (id is generated based on fileName but that may not be unique)");
        LOG.info("Found " + nrOfFilesWithError + " file(s) with errors.");
        LOG.info("Found " + nrOfFilesWithoutDataResources
                + " file(s) without data resources (metadata descriptions without resources are ignored).");
        LOG.info("Update of " + nrOFDocumentsUpdated + " took " + took + " secs. Total nr of files analyzed " + nrOfFilesAnalyzed);
    }

    /**
     * Checks a List of DataRoots for existence of the RootFile (typically the
     * parent directory of metadata files)
     *
     * @return the list of configured data roots
     */
    private List<DataRoot> checkDataRoots() {
        List<DataRoot> dataRoots = VloConfig.getDataRoots();
        for (DataRoot dataRoot : dataRoots) {
            if (!dataRoot.getRootFile().exists()) {
                LOG.error("Root file " + dataRoot.getRootFile() + " does not exist. Probable configuration error so stopping import.");
                System.exit(1);
            }
        }
        return dataRoots;
    }

    /**
     * Get the rootFile or all files with VALID_CMDI_EXTENSIONS if rootFile is a
     * directory
     *
     * @param rootFile
     * @return List with the rootFile or all contained files if rootFile is a
     * directory
     */
    private List<File> getFilesFromDataRoot(File rootFile) {
        List<File> result = new ArrayList<File>();
        if (rootFile.isFile()) {
            result.add(rootFile);
        } else {
            Collection<File> listFiles = FileUtils.listFiles(rootFile, VALID_CMDI_EXTENSIONS, true);
            result.addAll(listFiles);
        }
        return result;
    }

    /**
     * Initialize SolrServer as specified in configuration file
     *
     * @throws MalformedURLException
     */
    protected void initSolrServer() throws MalformedURLException {
        String solrUrl = VloConfig.getSolrUrl();
        LOG.info("Initializing Solr Server on " + solrUrl);
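        // Stream updates to the Solr server, buffering up to 1000 documents and
        // using 2 parallel threads; any error reported by the server is remembered
        // in serverError so that sendDocs() can detect it and fail.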
        solrServer = new StreamingUpdateSolrServer(solrUrl, 1000, 2) {
            @Override
            public void handleError(Throwable ex) {
                super.handleError(ex);
                serverError = ex;
            }
        };
    }

    /**
     * Process a single CMDI file with the given CMDIDataProcessor
     *
     * @param file CMDI input file
     * @param dataOrigin the data root the file belongs to
     * @param processor the processor used to parse the file
     * @throws SolrServerException
     * @throws IOException
     */
    private void processCmdi(File file, DataRoot dataOrigin, CMDIDataProcessor processor) throws SolrServerException, IOException {
        nrOfFilesAnalyzed++;
        CMDIData cmdiData = null;
        try {
            cmdiData = processor.process(file);
            if (!idOk(cmdiData.getId())) {
                cmdiData.setId(dataOrigin.getOriginName() + "/" + file.getName()); // No id found in the metadata file, so make one up based on the file name. Not guaranteed to be unique, but we have to set something.
                nrOfFilesWithoutId++;
            }
        } catch (Exception e) {
            LOG.error("error in file: " + file + " Exception", e);
            nrOfFilesWithError++;
        }
        if (cmdiData != null && processedIds.add(cmdiData.getId())) {
            SolrInputDocument solrDocument = cmdiData.getSolrDocument();
            if (solrDocument != null) {
                if (!cmdiData.getDataResources().isEmpty() || cmdiData.getMetadataResources().isEmpty()) {
                    // We only add metadata files that have data resources (1) or files that don't link to other metadata files (2):
                    //  1) files with data resources are obviously interesting
                    //  2) files without metadata links and without data resources can be interesting, e.g. OLAC files describing a corpus with a link to the original archive.
                    // Other files will have only metadata resources and are considered 'collection' metadata files; they
                    // are usually not very interesting (think IMDI corpus files) and will not be included.
                    updateDocument(solrDocument, cmdiData, file, dataOrigin);
                } else {
                    nrOfFilesWithoutDataResources++;
                }
            }
        }
    }

    /**
     * Check the id for validity
     *
     * @param id
     * @return true if the id is acceptable, false otherwise
     */
    private boolean idOk(String id) {
        return id != null && !id.isEmpty();
    }

    /**
     * Adds some additional information from the DataRoot to the solrDocument,
     * adds the solrDocument to the document list, and submits the list to the
     * Solr server after every 1000 files
     *
     * @param solrDocument
     * @param cmdiData
     * @param file
     * @param dataOrigin
     * @throws SolrServerException
     * @throws IOException
     */
    private void updateDocument(SolrInputDocument solrDocument, CMDIData cmdiData, File file, DataRoot dataOrigin) throws SolrServerException,
            IOException {
        if (!solrDocument.containsKey(FacetConstants.FIELD_COLLECTION)) {
            solrDocument.addField(FacetConstants.FIELD_COLLECTION, dataOrigin.getOriginName());
        }
        solrDocument.addField(FacetConstants.FIELD_DATA_PROVIDER, dataOrigin.getOriginName());
        solrDocument.addField(FacetConstants.FIELD_ID, cmdiData.getId());
        solrDocument.addField(FacetConstants.FIELD_FILENAME, file.getAbsolutePath());

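        // Derive the public URL of the metadata file: replace the local part of
        // the file path (the data root's 'tostrip' prefix) with the configured
        // URL prefix of this data root.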
        String metadataSourceUrl = dataOrigin.getPrefix();
        //System.out.println(dataOrigin.getTostrip());
        //System.out.println(dataOrigin.getTostrip().length());
        //System.out.println(file.getAbsolutePath());
        metadataSourceUrl += file.getAbsolutePath().substring(dataOrigin.getTostrip().length());

        solrDocument.addField(FacetConstants.FIELD_COMPLETE_METADATA, metadataSourceUrl);

        // add SearchServices (should be CQL endpoint)
        for (Resource resource : cmdiData.getSearchResources()) {
            solrDocument.addField(FacetConstants.FIELD_SEARCH_SERVICE, resource.getResourceName());
        }

        addResourceData(solrDocument, cmdiData);
        docs.add(solrDocument);
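        // Submit the buffered documents to the Solr server in batches of 1000.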
        if (docs.size() == 1000) {
            sendDocs();
        }
    }

    /**
     * Adds the two fields FIELD_RESOURCE_TYPE and FIELD_RESOURCE. The type can be
     * specified in the "ResourceType" element of an IMDI file or possibly
     * overwritten by some more specific XPath (as in the LRT CMDI files). So if
     * a type has been overwritten and is already in the solrDocument, we take that type.
     */
    private void addResourceData(SolrInputDocument solrDocument, CMDIData cmdiData) {
        List<Object> fieldValues = solrDocument.containsKey(FacetConstants.FIELD_RESOURCE_TYPE) ? new ArrayList<Object>(solrDocument
                .getFieldValues(FacetConstants.FIELD_RESOURCE_TYPE)) : null;
        solrDocument.removeField(FacetConstants.FIELD_RESOURCE_TYPE); // Remove old values; they might be overwritten.
        List<Resource> resources = cmdiData.getDataResources();
        for (int i = 0; i < resources.size(); i++) {
            Resource resource = resources.get(i);
            String mimeType = resource.getMimeType();
            String resourceType = mimeType;
            if (mimeType == null) {
                if (fieldValues != null && i < fieldValues.size()) {
                    resourceType = fieldValues.get(i).toString(); // assuming there are as many overwritten resource types as there are resources specified
                    mimeType = CommonUtils.normalizeMimeType(resourceType);
                } else {
                    mimeType = CommonUtils.normalizeMimeType("");
                    resourceType = mimeType;
                }
            } else {
                resourceType = CommonUtils.normalizeMimeType(mimeType);
            }
            solrDocument.addField(FacetConstants.FIELD_RESOURCE_TYPE, resourceType);
            solrDocument.addField(FacetConstants.FIELD_RESOURCE, mimeType + FacetConstants.FIELD_RESOURCE_SPLIT_CHAR
                    + resource.getResourceName());
        }
    }

    /**
     * Sends the current list of SolrInputDocuments to the Solr server and clears
     * the list afterwards
     *
     * @throws SolrServerException
     * @throws IOException
     */
    protected void sendDocs() throws SolrServerException, IOException {
        LOG.info("Sending " + docs.size() + " docs to solr server. Total number of docs updated till now: " + nrOFDocumentsUpdated);
        nrOFDocumentsUpdated += docs.size();
        solrServer.add(docs);
        if (serverError != null) {
            throw new SolrServerException(serverError);
        }
        docs = new ArrayList<SolrInputDocument>();
    }

    /**
     * Builds suggester index for autocompletion
     *
     * @throws SolrServerException
     * @throws MalformedURLException
     */
    private void buildSuggesterIndex() throws SolrServerException, MalformedURLException {
        LOG.info("Building index for autocompletion.");
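        // Query the /suggest request handler with spellcheck.build=true, which
        // instructs Solr to (re)build the dictionary backing the suggester.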
        HashMap<String, String> paramMap = new HashMap<String, String>();
        paramMap.put("qt", "/suggest");
        paramMap.put("spellcheck.build", "true");
        SolrParams params = new MapSolrParams(paramMap);
        solrServer.query(params);
    }

    /**
     * @param args
     * @throws IOException
     */
    public static void main(String[] args) throws MalformedURLException, IOException {

        // application configuration
        VloConfig config;

        // use the Apache cli framework for getting command line parameters
        Options options = new Options();

        /**
         * Add a "c" option, the option indicating the specification of an XML
         * configuration file
         */
        options.addOption("c", true, "-c <file> : use parameters specified in <file>");
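
        // Illustrative invocation (the class path and configuration file location
        // below are hypothetical):
        //   java -cp vlo_importer.jar eu.clarin.cmdi.vlo.importer.MetadataImporter -c /path/to/VloConfig.xml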

        CommandLineParser parser = new PosixParser();

        try {
            // parse the command line arguments
            CommandLine cmd = parser.parse(options, args);
            if (cmd.hasOption("c")) {

                // the "c" option was specified, now get its value
                String fileName;
                fileName = cmd.getOptionValue("c");

                // optionally, check for file existence here

                // read the configuration from the externally supplied file
                VloConfig.readConfig(fileName);

                // optionally, modify the configuration here

                // create and start the importer
                MetadataImporter importer = new MetadataImporter();
                importer.startImport();

                // finished importing

                if (VloConfig.isPrintMapping()) {
                    File file = new File("xsdMapping.txt");
                    FacetMappingFactory.printMapping(file);
                    LOG.info("Printed facetMapping in " + file);
                }
            }

        } catch (org.apache.commons.cli.ParseException ex) {

            // caught an exception caused by command line parsing

            String message = "Command line parsing failed. " + ex.getMessage();

            LOG.error(message);
            System.err.println(message);
        }
    }
}