source: vlo/trunk/vlo_webapp/src/main/java/eu/clarin/cmdi/vlo/importer/MetadataImporter.java @ 2496

Last change on this file since 2496 was 2496, checked in by teckart, 11 years ago

Added method that starts building autocomplete (suggester) index after import

File size: 15.9 KB
Line 
1package eu.clarin.cmdi.vlo.importer;
2
3import java.io.File;
4import java.io.IOException;
5import java.net.MalformedURLException;
6import java.util.ArrayList;
7import java.util.Collection;
8import java.util.HashMap;
9import java.util.HashSet;
10import java.util.List;
11import java.util.Map;
12import java.util.Set;
13
14import org.apache.commons.io.FileUtils;
15import org.apache.solr.client.solrj.SolrServerException;
16import org.apache.solr.client.solrj.impl.StreamingUpdateSolrServer;
17import org.apache.solr.client.solrj.util.ClientUtils;
18import org.apache.solr.common.SolrInputDocument;
19import org.apache.solr.common.params.MapSolrParams;
20import org.apache.solr.common.params.SolrParams;
21import org.slf4j.Logger;
22import org.slf4j.LoggerFactory;
23import org.springframework.beans.factory.BeanFactory;
24import org.springframework.context.support.ClassPathXmlApplicationContext;
25
26import eu.clarin.cmdi.vlo.CommonUtils;
27import eu.clarin.cmdi.vlo.Configuration;
28import eu.clarin.cmdi.vlo.FacetConstants;
29
30
/**
 * The main MetadataImporter class; also contains the main method.
 *
 * The metadata importer reads all the config files and then, for each metadata file in each defined directory structure, parses the file and imports it as defined in the configuration.
 * The startImport method starts the import process.
 */
37
@SuppressWarnings({"serial"})
public class MetadataImporter {

    /**
     * Defines which files to try and parse.
     * In this case all files ending in "xml" or "cmdi".
     */
    private static final String[] VALID_CMDI_EXTENSIONS = new String[] { "xml", "cmdi" };

    /** Class logger. */
    private final static Logger LOG = LoggerFactory.getLogger(MetadataImporter.class);
    /**
     * Last asynchronous error reported by the Solr server; written from the
     * handleError() callback installed in initSolrServer() and checked after
     * every batch in sendDocs().
     */
    private static Throwable serverError;
    /**
     * The Solr server used for indexing; created by initSolrServer().
     */
    private StreamingUpdateSolrServer solrServer;

    /**
     * Defines the post-processor associations.
     * At import, for each facet value, this map is checked and all postprocessors associated with the facet _type_ are applied to the value before storing the new value in the solr document.
     */
    final static Map<String, PostProcessor> POST_PROCESSORS = new HashMap<String, PostProcessor>();
    static {
        POST_PROCESSORS.put(FacetConstants.FIELD_COUNTRY, new CountryNamePostProcessor());
        POST_PROCESSORS.put(FacetConstants.FIELD_LANGUAGE, new LanguageCodePostProcessor());
        POST_PROCESSORS.put(FacetConstants.FIELD_RESOURCE_TYPE, new ResourceTypePostProcessor());
        POST_PROCESSORS.put(FacetConstants.FIELD_LANGUAGES, new LanguageLinkPostProcessor());
        POST_PROCESSORS.put(FacetConstants.FIELD_NATIONAL_PROJECT, new NationalProjectPostProcessor());
        POST_PROCESSORS.put(FacetConstants.FIELD_CLARIN_PROFILE, new CMDIComponentProfileNamePostProcessor());
    }

    /**
     * Ids of records processed so far (usually MDSelflinks); used in
     * processCmdi() to skip duplicates.
     */
    private final Set<String> processedIds = new HashSet<String>();
    /**
     * Buffer of Solr documents: documents are sent to the Solr server in
     * batches (see sendDocs), which is more efficient than one-by-one updates.
     */
    protected List<SolrInputDocument> docs = new ArrayList<SolrInputDocument>();
    /**
     * Importer configuration (data roots, delete flags, ...).
     */
    private final ImporterConfig config;

    // Import statistics, reported at the end of startImport().
    private int nrOFDocumentsUpdated;              // documents sent to the Solr server
    private int nrOfFilesAnalyzed = 0;             // metadata files read from disk
    private int nrOfFilesWithoutId = 0;            // files for which an id had to be generated
    private int nrOfFilesWithoutDataResources = 0; // 'collection' files that were skipped
    private int nrOfFilesWithError = 0;            // files that failed to parse
    /**
     * Constructor, wants to know the config.
     * @param config the importer configuration (data roots, delete flags, ...)
     */
    public MetadataImporter(ImporterConfig config) {
        this.config = config;
    }
102
    /**
     * Retrieve all files with VALID_CMDI_EXTENSIONS from all DataRoot entries and starts processing for every single file.
     * Whatever happens, the buffered documents are committed and the suggester
     * index is rebuilt in the finally block, so a partial import still becomes
     * visible in Solr.
     * @throws MalformedURLException when the configured Solr URL is malformed
     */
    void startImport() throws MalformedURLException {

        initSolrServer();
        List<DataRoot> dataRoots = checkDataRoots();
        long start = System.currentTimeMillis();
        try {
            // Delete the whole Solr db
            if (config.isDeleteAllFirst()) {
                LOG.info("Deleting original data...");
                solrServer.deleteByQuery("*:*");
                solrServer.commit();
                LOG.info("Deleting original data done.");
            }
            for (DataRoot dataRoot : dataRoots) {
                LOG.info("Start of processing: " + dataRoot.getOriginName());
                if (dataRoot.isDeleteFirst()) {
                    LOG.info("Deleting data for data provider: " + dataRoot.getOriginName());
                    // escape the origin name so special query characters cannot break the delete query
                    solrServer.deleteByQuery(FacetConstants.FIELD_DATA_PROVIDER + ":" + ClientUtils.escapeQueryChars(dataRoot.getOriginName()));
                    LOG.info("Deleting data of provider done.");
                }
                CMDIDataProcessor processor = new CMDIParserVTDXML(POST_PROCESSORS);
                List<File> files = getFilesFromDataRoot(dataRoot.getRootFile());
                for (File file : files) {
                    LOG.debug("PROCESSING FILE: " + file.getAbsolutePath());
                    processCmdi(file, dataRoot, processor);
                }
                // flush any documents still buffered for this data root
                if (!docs.isEmpty()) {
                    sendDocs();
                }
                LOG.info("End of processing: " + dataRoot.getOriginName());
            }
        } catch (SolrServerException e) {
            LOG.error("error updating files:\n", e);
            LOG.error("Also see vlo_solr server logs for more information");
        } catch (IOException e) {
            LOG.error("error updating files:\n", e);
        } finally {
            try {
                // commit even after an error so already-sent documents are persisted
                if (solrServer != null) {
                    solrServer.commit();
                    buildSuggesterIndex();
                }
            } catch (SolrServerException e) {
                LOG.error("cannot commit:\n", e);
            } catch (IOException e) {
                LOG.error("cannot commit:\n", e);
            }
        }
        long took = (System.currentTimeMillis() - start) / 1000;
        LOG.info("Found " + nrOfFilesWithoutId + " file(s) without an id. (id is generated based on fileName but that may not be unique)");
        LOG.info("Found " + nrOfFilesWithError + " file(s) with errors.");
        LOG.info("Found " + nrOfFilesWithoutDataResources
                + " file(s) without data resources (metadata descriptions without resources are ignored).");
        LOG.info("Update of " + nrOFDocumentsUpdated + " took " + took + " secs. Total nr of files analyzed " + nrOfFilesAnalyzed);
    }
162
163    /**
164     * Check a List of DataRoots for existence of RootFile (typically parent directory of metadata files)
165     * @return
166     */
167    private List<DataRoot> checkDataRoots() {
168        List<DataRoot> dataRoots = config.getDataRoots();
169        for (DataRoot dataRoot : dataRoots) {
170            if (!dataRoot.getRootFile().exists()) {
171                LOG.error("Root file " + dataRoot.getRootFile() + " does not exist. Probable configuration error so stopping import.");
172                System.exit(1);
173            }
174        }
175        return dataRoots;
176    }
177
178    /**
179     * Get the rootFile or all files with VALID_CMDI_EXTENSIONS if rootFile is a directory
180     * @param rootFile
181     * @return List with the rootFile or all contained files if rootFile is a directory
182     */
183    private List<File> getFilesFromDataRoot(File rootFile) {
184        List<File> result = new ArrayList<File>();
185        if (rootFile.isFile()) {
186            result.add(rootFile);
187        } else {
188            Collection<File> listFiles = FileUtils.listFiles(rootFile, VALID_CMDI_EXTENSIONS, true);
189            result.addAll(listFiles);
190        }
191        return result;
192    }
193
194    /**
195     * Initialize SolrServer as specified in configuration file
196     * @throws MalformedURLException
197     */
198    protected void initSolrServer() throws MalformedURLException {
199        String solrUrl = Configuration.getInstance().getSolrUrl();
200        LOG.info("Initializing Solr Server on " + solrUrl);
201        solrServer = new StreamingUpdateSolrServer(solrUrl, 1000, 2) {
202            @Override
203            public void handleError(Throwable ex) {
204                super.handleError(ex);
205                serverError = ex;
206            }
207        };
208    }
209
210    /**
211     * Process single CMDI file with CMDIDataProcessor
212     * @param file CMDI input file
213     * @param dataOrigin
214     * @param processor
215     * @throws SolrServerException
216     * @throws IOException
217     */
218    private void processCmdi(File file, DataRoot dataOrigin, CMDIDataProcessor processor) throws SolrServerException, IOException {
219        nrOfFilesAnalyzed++;
220        CMDIData cmdiData = null;
221        try {
222            cmdiData = processor.process(file);
223            if (!idOk(cmdiData.getId())) {
224                cmdiData.setId(dataOrigin.getOriginName() + "/" + file.getName()); //No id found in the metadata file so making one up based on the file name. Not quaranteed to be unique, but we have to set something.
225                nrOfFilesWithoutId++;
226            }
227        } catch (Exception e) {
228            LOG.error("error in file: " + file + " Exception", e);
229            nrOfFilesWithError++;
230        }
231        if (cmdiData != null && processedIds.add(cmdiData.getId())) {
232            SolrInputDocument solrDocument = cmdiData.getSolrDocument();
233            if (solrDocument != null) {
234                if (!cmdiData.getDataResources().isEmpty() || cmdiData.getMetadataResources().isEmpty()) {
235                    // We only add metadata files that have data resources (1) or files that don't link to other metadata files (2):
236                    //  1) files with data resources are obviously interesting
237                    //  2) files without metadata links and without dataResource can be interesting e.g. olac files describing a corpus with a link to the original archive.
238                    // Other files will have only metadata resources and are considered 'collection' metadata files they
239                    // are usually not very interesting (think imdi corpus files) and will not be included.
240                    updateDocument(solrDocument, cmdiData, file, dataOrigin);
241                } else {
242                    nrOfFilesWithoutDataResources++;
243                }
244            }
245        }
246    }
247
248    /**
249     * Check id for validness
250     * @param id
251     * @return true if id is acceptable, false otherwise
252     */
253    private boolean idOk(String id) {
254        return id != null && !id.isEmpty();
255    }
256
257    /**
258     * Adds some additional information from DataRoot to solrDocument, add solrDocument to document list, submits list to SolrServer every 1000 files
259     * @param solrDocument
260     * @param cmdiData
261     * @param file
262     * @param dataOrigin
263     * @throws SolrServerException
264     * @throws IOException
265     */
266    private void updateDocument(SolrInputDocument solrDocument, CMDIData cmdiData, File file, DataRoot dataOrigin) throws SolrServerException,
267            IOException {
268        if (!solrDocument.containsKey(FacetConstants.FIELD_COLLECTION)) {
269            solrDocument.addField(FacetConstants.FIELD_COLLECTION, dataOrigin.getOriginName());
270        }
271        solrDocument.addField(FacetConstants.FIELD_DATA_PROVIDER, dataOrigin.getOriginName());
272        solrDocument.addField(FacetConstants.FIELD_ID, cmdiData.getId());
273        solrDocument.addField(FacetConstants.FIELD_FILENAME, file.getAbsolutePath());
274
275        String metadataSourceUrl = dataOrigin.getPrefix();
276        //System.out.println(dataOrigin.getTostrip());
277        //System.out.println(dataOrigin.getTostrip().length());
278        //System.out.println(file.getAbsolutePath());
279        metadataSourceUrl += file.getAbsolutePath().substring(dataOrigin.getTostrip().length());
280
281        solrDocument.addField(FacetConstants.FIELD_COMPLETE_METADATA, metadataSourceUrl);
282       
283        // add SearchServices (should be CQL endpoint)
284        for(Resource resource : cmdiData.getSearchResources())
285                solrDocument.addField(FacetConstants.FIELD_SEARCH_SERVICE, resource.getResourceName());       
286       
287        addResourceData(solrDocument, cmdiData);
288        docs.add(solrDocument);
289        if (docs.size() == 1000) {
290            sendDocs();
291        }
292    }
293
    /**
     * Adds two fields FIELD_RESOURCE_TYPE and FIELD_RESOURCE. The Type can be specified in the "ResourceType" element of an imdi file or
     * possibly overwritten by some more specific xpath (as in the LRT cmdi files). So if a type is overwritten and already in the
     * solrDocument we take that type.
     *
     * NOTE(review): the i-th pre-existing FIELD_RESOURCE_TYPE value is matched to the
     * i-th data resource purely by index — this assumes both lists are in the same
     * order and of matching length.
     */
    private void addResourceData(SolrInputDocument solrDocument, CMDIData cmdiData) {
        // Snapshot any resource types already set on the document; they may override mime types below.
        List<Object> fieldValues = solrDocument.containsKey(FacetConstants.FIELD_RESOURCE_TYPE) ? new ArrayList<Object>(solrDocument
                .getFieldValues(FacetConstants.FIELD_RESOURCE_TYPE)) : null;
        solrDocument.removeField(FacetConstants.FIELD_RESOURCE_TYPE); //Remove old values they might be overwritten.
        List<Resource> resources = cmdiData.getDataResources();
        for (int i = 0; i < resources.size(); i++) {
            Resource resource = resources.get(i);
            String mimeType = resource.getMimeType();
            String resourceType = mimeType;
            if (mimeType == null) {
                // No mime type on the resource: fall back to the overridden type (by index), else to the normalized empty default.
                if (fieldValues != null && i < fieldValues.size()) {
                    resourceType = fieldValues.get(i).toString(); //assuming there will be as many resource types overwritten as there are specified
                    mimeType = CommonUtils.normalizeMimeType(resourceType);
                } else {
                    mimeType = CommonUtils.normalizeMimeType("");
                    resourceType = mimeType;
                }
            } else {
                resourceType = CommonUtils.normalizeMimeType(mimeType);
            }
            solrDocument.addField(FacetConstants.FIELD_RESOURCE_TYPE, resourceType);
            // FIELD_RESOURCE combines mime type and resource name separated by FIELD_RESOURCE_SPLIT_CHAR.
            solrDocument.addField(FacetConstants.FIELD_RESOURCE, mimeType + FacetConstants.FIELD_RESOURCE_SPLIT_CHAR
                    + resource.getResourceName());
        }
    }
324
325    /**
326     * Send current list of SolrImputDocuments to SolrServer and clears list afterwards
327     * @throws SolrServerException
328     * @throws IOException
329     */
330    protected void sendDocs() throws SolrServerException, IOException {
331        LOG.info("Sending " + docs.size() + " docs to solr server. Total number of docs updated till now: " + nrOFDocumentsUpdated);
332        nrOFDocumentsUpdated += docs.size();
333        solrServer.add(docs);
334        if (serverError != null) {
335            throw new SolrServerException(serverError);
336        }
337        docs = new ArrayList<SolrInputDocument>();
338    }
339   
340    /**
341     * Builds suggester index for autocompletion
342     * @throws SolrServerException
343     * @throws MalformedURLException
344     */
345    private void buildSuggesterIndex() throws SolrServerException, MalformedURLException {
346        LOG.info("Building index for autocompletion.");
347        HashMap<String,String> paramMap = new HashMap<String, String>();
348        paramMap.put("qt", "/suggest");
349        paramMap.put("spellcheck.build", "true");
350        SolrParams params = new MapSolrParams(paramMap);
351        solrServer.query(params);
352    }
353
354    /**
355     * @param args
356     * @throws IOException
357     */
358    public static void main(String[] args) throws IOException {
359        BeanFactory factory = new ClassPathXmlApplicationContext(new String[] { Configuration.CONFIG_FILE });
360        factory.getBean("configuration");
361        Configuration cfg = Configuration.getInstance();
362
363        BeanFactory factory2 = new ClassPathXmlApplicationContext(new String[] { cfg.getImporterConfigFile() } );
364
365        ImporterConfig config = (ImporterConfig) factory2.getBean("importerConfig", ImporterConfig.class);
366
367        MetadataImporter importer = new MetadataImporter(config);
368        importer.startImport();
369        if (config.isPrintMapping()) {
370            File file = new File("xsdMapping.txt");
371            FacetMappingFactory.printMapping(file);
372            LOG.info("Printed facetMapping in " + file);
373        }
374    }
375
376}
Note: See TracBrowser for help on using the repository browser.