Context Navigation

source: vlo/trunk/vlo-importer/src/main/java/eu/clarin/cmdi/vlo/importer/MetadataImporter.java @ 5979

Last change on this file since 5979 was 5979, checked in by teckart@informatik.uni-leipzig.de, 9 years ago
Added support for using local XML schema files instead of using the component registry (#522), also stricter check when extracting profile ID from CMDI instance file
File size: 21.3 KB

Line
1	package eu.clarin.cmdi.vlo.importer;
2
3	import eu.clarin.cmdi.vlo.CommonUtils;
4	import eu.clarin.cmdi.vlo.FacetConstants;
5	import eu.clarin.cmdi.vlo.config.DataRoot;
6	import eu.clarin.cmdi.vlo.config.VloConfig;
7	import eu.clarin.cmdi.vlo.config.XmlVloConfigFactory;
8
9	import java.io.File;
10	import java.io.IOException;
11	import java.net.MalformedURLException;
12	import java.net.URL;
13	import java.text.SimpleDateFormat;
14	import java.util.ArrayList;
15	import java.util.Collection;
16	import java.util.Date;
17	import java.util.HashMap;
18	import java.util.HashSet;
19	import java.util.List;
20	import java.util.Map;
21	import java.util.Set;
22
23	import org.apache.commons.cli.CommandLine;
24	import org.apache.commons.cli.CommandLineParser;
25	import org.apache.commons.cli.Options;
26	import org.apache.commons.cli.PosixParser;
27	import org.apache.commons.io.FileUtils;
28	import org.apache.solr.client.solrj.SolrServerException;
29	import org.apache.solr.client.solrj.impl.ConcurrentUpdateSolrServer;
30	import org.apache.solr.client.solrj.util.ClientUtils;
31	import org.apache.solr.common.SolrInputDocument;
32	import org.apache.solr.common.params.MapSolrParams;
33	import org.apache.solr.common.params.SolrParams;
34	import org.slf4j.Logger;
35	import org.slf4j.LoggerFactory;
36
/**
 * The main metadata importer class; it also contains the main function.
 *
 * The importer reads the configuration and then, for each metadata file
 * found under each configured data root, parses the file and submits the
 * resulting document to the Solr server as defined in the configuration.
 * The startImport function starts the import.
 */
45	@SuppressWarnings({"serial"})
46	public class MetadataImporter {
47
/**
 * Extensions of the files the importer tries to parse: all files ending in
 * "xml" or "cmdi".
 */
private static final String[] VALID_CMDI_EXTENSIONS = new String[]{"xml", "cmdi"};

/**
 * Logger used for all progress and error reporting of the importer.
 */
protected final static Logger LOG = LoggerFactory.getLogger(MetadataImporter.class);
/**
 * Last error reported by the Solr client, or null when none was reported.
 * It is assigned in the handleError override installed by initSolrServer()
 * and checked in sendDocs(). The field is volatile because the client
 * empties its queue on dedicated threads, so the error is presumably
 * reported from one of those threads and must be visible to the importing
 * thread.
 */
private static volatile Throwable serverError;
/**
 * the solr server.
 */
private ConcurrentUpdateSolrServer solrServer;
/**
 * Defines the post-processor associations. At import, for each facet value,
 * this map is checked and all postprocessors associated with the facet
 * _type_ are applied to the value before storing the new value in the solr
 * document.
 */
final static Map<String, PostProcessor> POST_PROCESSORS = new HashMap<String, PostProcessor>();

static {
    POST_PROCESSORS.put(FacetConstants.FIELD_ID, new IdPostProcessor());
    POST_PROCESSORS.put(FacetConstants.FIELD_CONTINENT, new ContinentNamePostProcessor());
    POST_PROCESSORS.put(FacetConstants.FIELD_COUNTRY, new CountryNamePostProcessor());
    POST_PROCESSORS.put(FacetConstants.FIELD_LANGUAGE, new LanguageNamePostProcessor());
    POST_PROCESSORS.put(FacetConstants.FIELD_LANGUAGES, new LanguageLinkPostProcessor());
    POST_PROCESSORS.put(FacetConstants.FIELD_LANGUAGE_CODE, new LanguageCodePostProcessor());
    POST_PROCESSORS.put(FacetConstants.FIELD_LICENSE, new LicensePostProcessor());
    POST_PROCESSORS.put(FacetConstants.FIELD_ORGANISATION, new OrganisationPostProcessor());
    POST_PROCESSORS.put(FacetConstants.FIELD_YEAR, new YearPostProcessor());
    POST_PROCESSORS.put(FacetConstants.FIELD_NATIONAL_PROJECT, new NationalProjectPostProcessor());
    POST_PROCESSORS.put(FacetConstants.FIELD_CLARIN_PROFILE, new CMDIComponentProfileNamePostProcessor());
    POST_PROCESSORS.put(FacetConstants.FIELD_RESOURCE_CLASS, new ResourceClassPostProcessor());
}
88
89	/**
90	* Constructor
91	*/
92	public MetadataImporter() {}
93
94	/**
95	* Contains MDSelflinks (usually). Just to know what we have already done.
96	*/
97	protected final Set<String> processedIds = new HashSet<String>();
98	/**
99	* Some caching for solr documents (we are more efficient if we ram a whole
100	* bunch to the solr server at once.
101	*/
102	protected List<SolrInputDocument> docs = new ArrayList<SolrInputDocument>();
103
104	// SOME STATS
105	protected int nrOFDocumentsSend;
106	protected int nrOfFilesAnalyzed = 0;
107	protected int nrOfFilesWithoutId = 0;
108	protected int nrOfIgnoredFiles = 0;
109	protected int nrOfFilesWithError = 0;
110
111	/**
112	* Retrieve all files with VALID_CMDI_EXTENSIONS from all DataRoot entries
113	* and starts processing for every single file
114	*
115	* @throws MalformedURLException
116	*/
117	void startImport() throws MalformedURLException {
118
119	initSolrServer();
120	List<DataRoot> dataRoots = checkDataRoots();
121	long start = System.currentTimeMillis();
122	try {
123	// Delete the whole Solr db
124	if (config.getDeleteAllFirst()) {
125	LOG.info("Deleting original data...");
126	solrServer.deleteByQuery(":");
127	solrServer.commit();
128	LOG.info("Deleting original data done.");
129	}
130
131	// Import the specified data roots
132	for (DataRoot dataRoot : dataRoots) {
133	LOG.info("Start of processing: " + dataRoot.getOriginName());
134	if (dataRoot.deleteFirst()) {
135	LOG.info("Deleting data for data provider: " + dataRoot.getOriginName());
136	solrServer.deleteByQuery(FacetConstants.FIELD_DATA_PROVIDER + ":" + ClientUtils.escapeQueryChars(dataRoot.getOriginName()));
137	LOG.info("Deleting data of provider done.");
138	}
139	CMDIDataProcessor processor = new CMDIParserVTDXML(POST_PROCESSORS, false);
140	List<File> files = getFilesFromDataRoot(dataRoot.getRootFile());
141	for (File file : files) {
142	if (config.getMaxFileSize() > 0
143	&& file.length() > config.getMaxFileSize()) {
144	LOG.info("Skipping " + file.getAbsolutePath() + " because it is too large.");
145	} else {
146	LOG.debug("PROCESSING FILE: {}", file.getAbsolutePath());
147	processCmdi(file, dataRoot, processor);
148	}
149	}
150	if (!docs.isEmpty()) {
151	sendDocs();
152	}
153	LOG.info("End of processing: " + dataRoot.getOriginName());
154	}
155
156	// delete outdated entries (based on maxDaysInSolr parameter)
157	if (config.getMaxDaysInSolr() > 0 && config.getDeleteAllFirst() == false) {
158	LOG.info("Deleting old files that were not seen for more than " + config.getMaxDaysInSolr() + " days...");
159	solrServer.deleteByQuery(FacetConstants.FIELD_LAST_SEEN + ":[* TO NOW-" + config.getMaxDaysInSolr() + "DAYS]");
160	LOG.info("Deleting old files done.");
161	}
162	} catch (SolrServerException e) {
163	LOG.error("error updating files:\n", e);
164	LOG.error("Also see vlo_solr server logs for more information");
165	} catch (IOException e) {
166	LOG.error("error updating files:\n", e);
167	} finally {
168	try {
169	if (solrServer != null) {
170	solrServer.commit();
171	buildSuggesterIndex();
172	}
173	} catch (SolrServerException e) {
174	LOG.error("cannot commit:\n", e);
175	} catch (IOException e) {
176	LOG.error("cannot commit:\n", e);
177	}
178	}
179	long took = (System.currentTimeMillis() - start) / 1000;
180	LOG.info("Found " + nrOfFilesWithoutId + " file(s) without an id. (id is generated based on fileName but that may not be unique)");
181	LOG.info("Found " + nrOfFilesWithError + " file(s) with errors.");
182	LOG.info("Found " + nrOfIgnoredFiles
183	+ " file(s) that where ignored (files without resources or any link to a search service or landing page are ignored).");
184	LOG.info("Update of " + nrOFDocumentsSend + " took " + took + " secs. Total nr of files analyzed " + nrOfFilesAnalyzed);
185	solrServer.shutdown();
186	}
187
188	/**
189	* Check a List of DataRoots for existence of RootFile (typically parent
190	* directory of metadata files)
191	*
192	* @return
193	*/
194	protected List<DataRoot> checkDataRoots() {
195	List<DataRoot> dataRoots = config.getDataRoots();
196	for (DataRoot dataRoot : dataRoots) {
197	if (!dataRoot.getRootFile().exists()) {
198	LOG.error("Root file " + dataRoot.getRootFile() + " does not exist. Probable configuration error so stopping import.");
199	System.exit(1);
200	}
201	}
202	return dataRoots;
203	}
204
205	/**
206	* Get the rootFile or all files with VALID_CMDI_EXTENSIONS if rootFile is a
207	* directory
208	*
209	* @param rootFile
210	* @return List with the rootFile or all contained files if rootFile is a
211	* directory
212	*/
213	protected List<File> getFilesFromDataRoot(File rootFile) {
214	List<File> result = new ArrayList<File>();
215	if (rootFile.isFile()) {
216	result.add(rootFile);
217	} else {
218	Collection<File> listFiles = FileUtils.listFiles(rootFile, VALID_CMDI_EXTENSIONS, true);
219	result.addAll(listFiles);
220	}
221	return result;
222	}
223
224	/**
225	* Create an interface to the SOLR server.
226	*
227	* After the interface has been created the importer can send documents to
228	* the server. Sending documents involves a queue. The importer adds
229	* documents to a queue, and dedicated threads will empty it, and
230	* effectively store store the documents.
231	*
232	* @throws MalformedURLException
233	*/
234	protected void initSolrServer() throws MalformedURLException {
235	String solrUrl = config.getSolrUrl();
236	LOG.info("Initializing Solr Server on " + solrUrl);
237
238	/* Specify the number of documents in the queue that will trigger the
239	* threads, two of them, emptying it.
240	*/
241	solrServer = new ConcurrentUpdateSolrServer(solrUrl,
242	config.getMinDocsInSolrQueue(), 2) {
243	/*
244	* Let the super class method handle exceptions. Make the
245	* exception available to the importer in the form of the
246	* serverError variable.
247	*/
248	@Override
249	public void handleError(Throwable exception) {
250	super.handleError(exception);
251	serverError = exception;
252	}
253	};
254	}
255
256	/**
257	* Process single CMDI file with CMDIDataProcessor
258	*
259	* @param file CMDI input file
260	* @param dataOrigin
261	* @param processor
262	* @throws SolrServerException
263	* @throws IOException
264	*/
265	protected void processCmdi(File file, DataRoot dataOrigin, CMDIDataProcessor processor) throws SolrServerException, IOException {
266	nrOfFilesAnalyzed++;
267	CMDIData cmdiData = null;
268	try {
269	cmdiData = processor.process(file);
270	if (!idOk(cmdiData.getId())) {
271	cmdiData.setId(dataOrigin.getOriginName() + "/" + file.getName()); //No id found in the metadata file so making one up based on the file name. Not quaranteed to be unique, but we have to set something.
272	nrOfFilesWithoutId++;
273	}
274	} catch (Exception e) {
275	LOG.error("error in file: {}", file, e);
276	nrOfFilesWithError++;
277	}
278	if (cmdiData != null) {
279	if (processedIds.add(cmdiData.getId())) {
280	SolrInputDocument solrDocument = cmdiData.getSolrDocument();
281	if (solrDocument != null) {
282	if (!cmdiData.getDataResources().isEmpty() \|\| !cmdiData.getLandingPageResources().isEmpty()
283	\|\| !cmdiData.getSearchResources().isEmpty() \|\| !cmdiData.getSearchPageResources().isEmpty()
284	\|\| cmdiData.getMetadataResources().isEmpty()) {
285	// We only add metadata files that have
286	// 1) data resources or
287	// 2) a landing page or
288	// 3) a search service (like SRU/CQL) or
289	// 4) a search page or
290	// 5) that have none of the above but also lack any metadata links (e.g. olac files describing a corpus with a link to the original archive).
291	// Other files will have only metadata resources and are considered 'collection' metadata files they
292	// are usually not very interesting (think imdi corpus files) and will not be included.
293	updateDocument(solrDocument, cmdiData, file, dataOrigin);
294	} else {
295	nrOfIgnoredFiles++;
296	}
297	}
298	} else {
299	LOG.warn("Skipping {}, already processed id: {}", file, cmdiData.getId());
300	}
301	}
302	}
303
304	/**
305	* Check id for validness
306	*
307	* @param id
308	* @return true if id is acceptable, false otherwise
309	*/
310	protected boolean idOk(String id) {
311	return id != null && !id.isEmpty();
312	}
313
314	/**
315	* Adds some additional information from DataRoot to solrDocument, add
316	* solrDocument to document list, submits list to SolrServer every 1000
317	* files
318	*
319	* @param solrDocument
320	* @param cmdiData
321	* @param file
322	* @param dataOrigin
323	* @throws SolrServerException
324	* @throws IOException
325	*/
326	protected void updateDocument(SolrInputDocument solrDocument, CMDIData cmdiData, File file, DataRoot dataOrigin) throws SolrServerException,
327	IOException {
328	if (!solrDocument.containsKey(FacetConstants.FIELD_COLLECTION)) {
329	solrDocument.addField(FacetConstants.FIELD_COLLECTION, dataOrigin.getOriginName());
330	}
331	solrDocument.addField(FacetConstants.FIELD_DATA_PROVIDER, dataOrigin.getOriginName());
332	solrDocument.addField(FacetConstants.FIELD_ID, cmdiData.getId());
333	solrDocument.addField(FacetConstants.FIELD_FILENAME, file.getAbsolutePath());
334
335	String metadataSourceUrl = dataOrigin.getPrefix();
336	metadataSourceUrl += file.getAbsolutePath().substring(dataOrigin.getToStrip().length());
337
338	solrDocument.addField(FacetConstants.FIELD_COMPLETE_METADATA, metadataSourceUrl);
339
340	// add SearchServices (should be CQL endpoint)
341	for (Resource resource : cmdiData.getSearchResources()) {
342	solrDocument.addField(FacetConstants.FIELD_SEARCH_SERVICE, resource.getResourceName());
343	}
344
345	// add landing page resource
346	for (Resource resource : cmdiData.getLandingPageResources()) {
347	solrDocument.addField(FacetConstants.FIELD_LANDINGPAGE, resource.getResourceName());
348	}
349
350	// add search page resource
351	for (Resource resource : cmdiData.getSearchPageResources()) {
352	solrDocument.addField(FacetConstants.FIELD_SEARCHPAGE, resource.getResourceName());
353	}
354
355	// add timestamp
356	Date dt = new Date();
357	SimpleDateFormat df = new SimpleDateFormat("yyyy-MM-dd'T'HH:mm:ss'Z'");
358	solrDocument.addField(FacetConstants.FIELD_LAST_SEEN, df.format(dt));
359
360	// add resource proxys
361	addResourceData(solrDocument, cmdiData);
362
363	LOG.debug("Adding document for submission to SOLR: {}", file);
364	docs.add(solrDocument);
365	if (docs.size() == config.getMaxDocsInList()) {
366	sendDocs();
367	}
368	}
369
370	/**
371	* Adds two fields FIELD_FORMAT and FIELD_RESOURCE. The Type can be
372	* specified in the "ResourceType" element of an imdi file or possibly
373	* overwritten by some more specific xpath (as in the LRT cmdi files). So if
374	* a type is overwritten and already in the solrDocument we take that type.
375	*
376	* @param solrDocument
377	* @param cmdiData
378	*/
379	protected void addResourceData(SolrInputDocument solrDocument, CMDIData cmdiData) {
380	List<Object> fieldValues = solrDocument.containsKey(FacetConstants.FIELD_FORMAT) ? new ArrayList<Object>(solrDocument
381	.getFieldValues(FacetConstants.FIELD_FORMAT)) : null;
382	solrDocument.removeField(FacetConstants.FIELD_FORMAT); //Remove old values they might be overwritten.
383	List<Resource> resources = cmdiData.getDataResources();
384	for (int i = 0; i < resources.size(); i++) {
385	Resource resource = resources.get(i);
386	String mimeType = resource.getMimeType();
387	if (mimeType == null) {
388	if (fieldValues != null && i < fieldValues.size()) {
389	mimeType = CommonUtils.normalizeMimeType(fieldValues.get(i).toString());
390	} else {
391	mimeType = CommonUtils.normalizeMimeType("");
392	}
393	}
394
395	FormatPostProcessor processor = new FormatPostProcessor();
396	mimeType = processor.process(mimeType).get(0);
397
398	// TODO check should probably be moved into Solr (by using some minimum length filter)
399	if (!mimeType.equals("")) {
400	solrDocument.addField(FacetConstants.FIELD_FORMAT, mimeType);
401	}
402	solrDocument.addField(FacetConstants.FIELD_RESOURCE, mimeType + FacetConstants.FIELD_RESOURCE_SPLIT_CHAR
403	+ resource.getResourceName());
404	}
405	}
406
407	/**
408	* Send current list of SolrImputDocuments to SolrServer and clears list
409	* afterwards
410	*
411	* @throws SolrServerException
412	* @throws IOException
413	*/
414	protected void sendDocs() throws SolrServerException, IOException {
415	LOG.info("Sending " + docs.size() + " docs to solr server. Total number of docs updated till now: " + nrOFDocumentsSend);
416	nrOFDocumentsSend += docs.size();
417	solrServer.add(docs);
418	if (serverError != null) {
419	throw new SolrServerException(serverError);
420	}
421	docs = new ArrayList<SolrInputDocument>();
422	}
423
424	/**
425	* Builds suggester index for autocompletion
426	*
427	* @throws SolrServerException
428	* @throws MalformedURLException
429	*/
430	private void buildSuggesterIndex() throws SolrServerException, MalformedURLException {
431	LOG.info("Building index for autocompletion.");
432	HashMap<String, String> paramMap = new HashMap<String, String>();
433	paramMap.put("qt", "/suggest");
434	paramMap.put("spellcheck.build", "true");
435	SolrParams params = new MapSolrParams(paramMap);
436	solrServer.query(params);
437	}
438
439	public static VloConfig config;
440
441	/**
442	* @param args
443	* @throws MalformedURLException
444	* @throws IOException
445	*/
446	public static void main(String[] args) throws MalformedURLException, IOException {
447
448	// path to the configuration file
449	String configFile = null;
450
451	// use the Apache cli framework for getting command line parameters
452	Options options = new Options();
453
454	/**
455	* Add a "c" option, the option indicating the specification of an XML
456	* configuration file
457	*/
458	options.addOption("c", true, "-c <file> : use parameters specified in <file>");
459
460	CommandLineParser parser = new PosixParser();
461
462	try {
463	// parse the command line arguments
464	CommandLine cmd = parser.parse(options, args);
465	if (cmd.hasOption("c")) {
466
467	// the "c" option was specified, now get its value
468	configFile = cmd.getOptionValue("c");
469	}
470
471	} catch (org.apache.commons.cli.ParseException ex) {
472
473	/**
474	* Caught an exception caused by command line parsing. Try to get
475	* the name of the configuration file by querying the system
476	* property.
477	*/
478	String message = "Command line parsing failed. " + ex.getMessage();
479	LOG.error(message);
480	System.err.println(message);
481	}
482
483	if (configFile == null) {
484
485	String message;
486
487	message = "Could not get config file name via the command line, trying the system properties.";
488	LOG.info(message);
489
490	String key;
491
492	key = "configFile";
493	configFile = System.getProperty(key);
494	}
495
496	if (configFile == null) {
497
498	String message;
499
500	message = "Could not get filename as system property either - stopping.";
501	LOG.error(message);
502	} else {
503	// read the configuration from the externally supplied file
504	final URL configUrl;
505	if (configFile.startsWith("file:")) {
506	configUrl = new URL(configFile);
507	} else {
508	configUrl = new File(configFile).toURI().toURL();
509	}
510	System.out.println("Reading configuration from " + configUrl.toString());
511	final XmlVloConfigFactory configFactory = new XmlVloConfigFactory(configUrl);
512	MetadataImporter.config = configFactory.newConfig();
513
514	// optionally, modify the configuration here
515	// create and start the importer
516	MetadataImporter importer = new MetadataImporter();
517	importer.startImport();
518
519	// finished importing
520	if (MetadataImporter.config.printMapping()) {
521	File file = new File("xsdMapping.txt");
522	FacetMappingFactory.printMapping(file);
523	LOG.info("Printed facetMapping in " + file);
524	}
525	}
526	}
527	}

Note: See TracBrowser for help on using the repository browser.

Download in other formats: