source: vlo/branches/vlo-3.0/vlo-importer/src/main/java/eu/clarin/cmdi/vlo/importer/MetadataImporter.java @ 4612

Last change on this file since 4612 was 4612, checked in by teckart, 10 years ago

Fix ticket #454: data cleansing of facet "format": only valid MIME-types are accepted. Other values are mapped to a default "unknown" String.

package eu.clarin.cmdi.vlo.importer;

import eu.clarin.cmdi.vlo.CommonUtils;
import eu.clarin.cmdi.vlo.FacetConstants;
import eu.clarin.cmdi.vlo.config.DataRoot;
import eu.clarin.cmdi.vlo.config.VloConfig;
import eu.clarin.cmdi.vlo.config.XmlVloConfigFactory;

import java.io.File;
import java.io.IOException;
import java.net.MalformedURLException;
import java.net.URL;
import java.text.SimpleDateFormat;
import java.util.ArrayList;
import java.util.Collection;
import java.util.Date;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.TimeZone;

import org.apache.commons.cli.CommandLine;
import org.apache.commons.cli.CommandLineParser;
import org.apache.commons.cli.Options;
import org.apache.commons.cli.PosixParser;
import org.apache.commons.io.FileUtils;
import org.apache.solr.client.solrj.SolrServerException;
import org.apache.solr.client.solrj.impl.ConcurrentUpdateSolrServer;
import org.apache.solr.client.solrj.util.ClientUtils;
import org.apache.solr.common.SolrInputDocument;
import org.apache.solr.common.params.MapSolrParams;
import org.apache.solr.common.params.SolrParams;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/**
 * The main MetadataImporter class; it also contains the main method.
 *
 * The importer reads all the config files and then, for each metadata file in
 * each configured directory structure, parses and imports it as defined in
 * the configuration. The startImport method starts the import.
 */
@SuppressWarnings({"serial"})
public class MetadataImporter {

    /**
     * Defines which files to try to parse: all files ending in "xml" or
     * "cmdi".
     */
    private static final String[] VALID_CMDI_EXTENSIONS = new String[] { "xml", "cmdi" };

    /**
     * Logger for this class.
     */
    protected final static Logger LOG = LoggerFactory.getLogger(MetadataImporter.class);
    /**
     * Some place to store errors reported by the Solr server threads.
     */
    private static Throwable serverError;
    /**
     * The Solr server.
     */
    private ConcurrentUpdateSolrServer solrServer;
    /**
     * Defines the post-processor associations. At import, for each facet value,
     * this map is checked and all post-processors associated with the facet
     * _type_ are applied to the value before storing the new value in the Solr
     * document.
     */
    final static Map<String, PostProcessor> POST_PROCESSORS = new HashMap<String, PostProcessor>();
    static {
        POST_PROCESSORS.put(FacetConstants.FIELD_ID, new IdPostProcessor());
        POST_PROCESSORS.put(FacetConstants.FIELD_CONTINENT, new ContinentNamePostProcessor());
        POST_PROCESSORS.put(FacetConstants.FIELD_COUNTRY, new CountryNamePostProcessor());
        POST_PROCESSORS.put(FacetConstants.FIELD_LANGUAGE, new LanguageCodePostProcessor());
        POST_PROCESSORS.put(FacetConstants.FIELD_LANGUAGES, new LanguageLinkPostProcessor());
        POST_PROCESSORS.put(FacetConstants.FIELD_YEAR, new YearPostProcessor());
        POST_PROCESSORS.put(FacetConstants.FIELD_NATIONAL_PROJECT, new NationalProjectPostProcessor());
        POST_PROCESSORS.put(FacetConstants.FIELD_CLARIN_PROFILE, new CMDIComponentProfileNamePostProcessor());
    }
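
    // The POST_PROCESSORS map is handed to CMDIParserVTDXML in startImport();
    // facet values for facets without an entry here are presumably stored
    // unmodified (the actual lookup happens inside the parser).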

    /**
     * Constructor
     */
    public MetadataImporter() {
    }

    /**
     * Contains MdSelfLinks (usually); used to keep track of the records that
     * have already been processed.
     */
    protected final Set<String> processedIds = new HashSet<String>();
    /**
     * Buffer for Solr documents (sending a whole batch of documents to the
     * Solr server at once is more efficient than sending them one by one).
     */
    protected List<SolrInputDocument> docs = new ArrayList<SolrInputDocument>();

    // SOME STATS
    protected int nrOFDocumentsSend;
    protected int nrOfFilesAnalyzed = 0;
    protected int nrOfFilesWithoutId = 0;
    protected int nrOfIgnoredFiles = 0;
    protected int nrOfFilesWithError = 0;

    /**
     * Retrieves all files with VALID_CMDI_EXTENSIONS from all DataRoot entries
     * and starts processing for every single file
     *
     * @throws MalformedURLException
     */
    void startImport() throws MalformedURLException {

        initSolrServer();
        List<DataRoot> dataRoots = checkDataRoots();
        long start = System.currentTimeMillis();
        try {
            // Delete the whole Solr db
            if (config.getDeleteAllFirst()) {
                LOG.info("Deleting original data...");
                solrServer.deleteByQuery("*:*");
                solrServer.commit();
                LOG.info("Deleting original data done.");
            }

            // Import the specified data roots
            for (DataRoot dataRoot : dataRoots) {
                LOG.info("Start of processing: " + dataRoot.getOriginName());
                if (dataRoot.deleteFirst()) {
                    LOG.info("Deleting data for data provider: " + dataRoot.getOriginName());
                    solrServer.deleteByQuery(FacetConstants.FIELD_DATA_PROVIDER + ":" + ClientUtils.escapeQueryChars(dataRoot.getOriginName()));
                    LOG.info("Deleting data of provider done.");
                }
                CMDIDataProcessor processor = new CMDIParserVTDXML(POST_PROCESSORS);
                List<File> files = getFilesFromDataRoot(dataRoot.getRootFile());
                for (File file : files) {
                    if (config.getMaxFileSize() > 0
                            && file.length() > config.getMaxFileSize()) {
                        LOG.info("Skipping " + file.getAbsolutePath() + " because it is too large.");
                    } else {
                        LOG.debug("PROCESSING FILE: " + file.getAbsolutePath());
                        processCmdi(file, dataRoot, processor);
                    }
                }
                if (!docs.isEmpty()) {
                    sendDocs();
                }
                LOG.info("End of processing: " + dataRoot.getOriginName());
            }

            // delete outdated entries (based on maxDaysInSolr parameter)
            if (config.getMaxDaysInSolr() > 0 && !config.getDeleteAllFirst()) {
                LOG.info("Deleting old files that were not seen for more than " + config.getMaxDaysInSolr() + " days...");
                solrServer.deleteByQuery(FacetConstants.FIELD_LAST_SEEN + ":[* TO NOW-" + config.getMaxDaysInSolr() + "DAYS]");
                LOG.info("Deleting old files done.");
            }
        } catch (SolrServerException e) {
            LOG.error("error updating files:\n", e);
            LOG.error("Also see vlo_solr server logs for more information");
        } catch (IOException e) {
            LOG.error("error updating files:\n", e);
        } finally {
            try {
                if (solrServer != null) {
                    solrServer.commit();
                    buildSuggesterIndex();
                }
            } catch (SolrServerException e) {
                LOG.error("cannot commit:\n", e);
            } catch (IOException e) {
                LOG.error("cannot commit:\n", e);
            }
        }
        long took = (System.currentTimeMillis() - start) / 1000;
        LOG.info("Found " + nrOfFilesWithoutId + " file(s) without an id. (id is generated based on fileName but that may not be unique)");
        LOG.info("Found " + nrOfFilesWithError + " file(s) with errors.");
        LOG.info("Found " + nrOfIgnoredFiles
                + " file(s) that were ignored (files without resources or any link to a search service or landing page are ignored).");
        LOG.info("Update of " + nrOFDocumentsSend + " took " + took + " secs. Total nr of files analyzed " + nrOfFilesAnalyzed);
    }

    /**
     * Checks a List of DataRoots for existence of the RootFile (typically the
     * parent directory of the metadata files)
     *
     * @return the list of DataRoots, all of which have an existing RootFile
     */
    protected List<DataRoot> checkDataRoots() {
        List<DataRoot> dataRoots = config.getDataRoots();
        for (DataRoot dataRoot : dataRoots) {
            if (!dataRoot.getRootFile().exists()) {
                LOG.error("Root file " + dataRoot.getRootFile() + " does not exist. Probable configuration error so stopping import.");
                System.exit(1);
            }
        }
        return dataRoots;
    }
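
    // Note that a single missing root file aborts the whole importer via
    // System.exit(1); there is no partial import over the remaining data roots.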

    /**
     * Gets the rootFile, or all files with VALID_CMDI_EXTENSIONS if rootFile
     * is a directory
     *
     * @param rootFile
     * @return List with the rootFile, or all contained files if rootFile is a
     * directory
     */
    protected List<File> getFilesFromDataRoot(File rootFile) {
        List<File> result = new ArrayList<File>();
        if (rootFile.isFile()) {
            result.add(rootFile);
        } else {
            // the 'true' argument makes listFiles recurse into subdirectories
            Collection<File> listFiles = FileUtils.listFiles(rootFile, VALID_CMDI_EXTENSIONS, true);
            result.addAll(listFiles);
        }
        return result;
    }

    /**
     * Creates an interface to the Solr server.
     *
     * After the interface has been created, the importer can send documents to
     * the server. Sending documents involves a queue: the importer adds
     * documents to the queue, and dedicated threads will empty it and
     * effectively store the documents.
     *
     * @throws MalformedURLException
     */
    protected void initSolrServer() throws MalformedURLException {
        String solrUrl = config.getSolrUrl();
        LOG.info("Initializing Solr Server on " + solrUrl);

        /* Specify the number of documents in the queue that will trigger the
         * threads, two of them, emptying it.
         */
        solrServer = new ConcurrentUpdateSolrServer(solrUrl,
                config.getMinDocsInSolrQueue(), 2) {
            /*
             * Let the super class method handle exceptions. Make the
             * exception available to the importer in the form of the
             * serverError variable.
             */
            @Override
            public void handleError(Throwable exception) {
                super.handleError(exception);
                serverError = exception;
            }
        };
    }
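
    // Note: ConcurrentUpdateSolrServer reports failures asynchronously via
    // handleError(), so an error in one batch only surfaces when sendDocs()
    // inspects the serverError variable while submitting a later batch.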

    /**
     * Processes a single CMDI file with a CMDIDataProcessor
     *
     * @param file CMDI input file
     * @param dataOrigin
     * @param processor
     * @throws SolrServerException
     * @throws IOException
     */
    protected void processCmdi(File file, DataRoot dataOrigin, CMDIDataProcessor processor) throws SolrServerException, IOException {
        nrOfFilesAnalyzed++;
        CMDIData cmdiData = null;
        try {
            cmdiData = processor.process(file);
            if (!idOk(cmdiData.getId())) {
                // No id found in the metadata file, so make one up based on the file name.
                // Not guaranteed to be unique, but we have to set something.
                cmdiData.setId(dataOrigin.getOriginName() + "/" + file.getName());
                nrOfFilesWithoutId++;
            }
        } catch (Exception e) {
            LOG.error("error in file: " + file + " Exception", e);
            nrOfFilesWithError++;
        }
        if (cmdiData != null && processedIds.add(cmdiData.getId())) {
            SolrInputDocument solrDocument = cmdiData.getSolrDocument();
            if (solrDocument != null) {
                if (!cmdiData.getDataResources().isEmpty() || !cmdiData.getLandingPageResources().isEmpty()
                        || !cmdiData.getSearchResources().isEmpty() || !cmdiData.getSearchPageResources().isEmpty()
                        || cmdiData.getMetadataResources().isEmpty()) {
                    // We only add metadata files that have
                    //  1) data resources, or
                    //  2) a landing page, or
                    //  3) a search service (like SRU/CQL), or
                    //  4) a search page, or
                    //  5) none of the above but also no metadata links (e.g. OLAC files describing a corpus with a link to the original archive).
                    // Other files will have only metadata resources and are considered 'collection' metadata files;
                    // they are usually not very interesting (think IMDI corpus files) and will not be included.
                    updateDocument(solrDocument, cmdiData, file, dataOrigin);
                } else {
                    nrOfIgnoredFiles++;
                }
            }
        }
    }

    /**
     * Checks an id for validity
     *
     * @param id
     * @return true if the id is acceptable, false otherwise
     */
    protected boolean idOk(String id) {
        return id != null && !id.isEmpty();
    }

    /**
     * Adds some additional information from the DataRoot to the solrDocument,
     * adds the solrDocument to the document list, and submits the list to the
     * Solr server whenever it reaches the configured maximum size
     * (maxDocsInList)
     *
     * @param solrDocument
     * @param cmdiData
     * @param file
     * @param dataOrigin
     * @throws SolrServerException
     * @throws IOException
     */
    protected void updateDocument(SolrInputDocument solrDocument, CMDIData cmdiData, File file, DataRoot dataOrigin) throws SolrServerException,
            IOException {
        if (!solrDocument.containsKey(FacetConstants.FIELD_COLLECTION)) {
            solrDocument.addField(FacetConstants.FIELD_COLLECTION, dataOrigin.getOriginName());
        }
        solrDocument.addField(FacetConstants.FIELD_DATA_PROVIDER, dataOrigin.getOriginName());
        solrDocument.addField(FacetConstants.FIELD_ID, cmdiData.getId());
        solrDocument.addField(FacetConstants.FIELD_FILENAME, file.getAbsolutePath());

        String metadataSourceUrl = dataOrigin.getPrefix();
        metadataSourceUrl += file.getAbsolutePath().substring(dataOrigin.getToStrip().length());

        solrDocument.addField(FacetConstants.FIELD_COMPLETE_METADATA, metadataSourceUrl);

        // add SearchServices (should be CQL endpoint)
        for (Resource resource : cmdiData.getSearchResources()) {
            solrDocument.addField(FacetConstants.FIELD_SEARCH_SERVICE, resource.getResourceName());
        }

        // add landing page resource
        for (Resource resource : cmdiData.getLandingPageResources()) {
            solrDocument.addField(FacetConstants.FIELD_LANDINGPAGE, resource.getResourceName());
        }

        // add search page resource
        for (Resource resource : cmdiData.getSearchPageResources()) {
            solrDocument.addField(FacetConstants.FIELD_SEARCHPAGE, resource.getResourceName());
        }

        // add timestamp; the 'Z' in the pattern is a literal UTC designator,
        // so the formatter must be set to UTC explicitly to produce a correct value
        Date dt = new Date();
        SimpleDateFormat df = new SimpleDateFormat("yyyy-MM-dd'T'HH:mm:ss'Z'");
        df.setTimeZone(TimeZone.getTimeZone("UTC"));
        solrDocument.addField(FacetConstants.FIELD_LAST_SEEN, df.format(dt));

        // add resource proxies
        addResourceData(solrDocument, cmdiData);
        docs.add(solrDocument);
        if (docs.size() == config.getMaxDocsInList()) {
            sendDocs();
        }
    }

    /**
     * Adds the two fields FIELD_FORMAT and FIELD_RESOURCE. The type can be
     * specified in the "ResourceType" element of an IMDI file, or possibly
     * overwritten by some more specific XPath (as in the LRT CMDI files). So
     * if a type is overwritten and already in the solrDocument, we take that
     * type.
     *
     * TODO evaluate odd connection between FIELD_FORMAT and ResourceProxy-Mimetypes
     */
    protected void addResourceData(SolrInputDocument solrDocument, CMDIData cmdiData) {
        List<Object> fieldValues = solrDocument.containsKey(FacetConstants.FIELD_FORMAT) ? new ArrayList<Object>(solrDocument
                .getFieldValues(FacetConstants.FIELD_FORMAT)) : null;
        solrDocument.removeField(FacetConstants.FIELD_FORMAT); // remove old values; they might be overwritten
        List<Resource> resources = cmdiData.getDataResources();
        for (int i = 0; i < resources.size(); i++) {
            Resource resource = resources.get(i);
            String mimeType = resource.getMimeType();
            String format = mimeType;
            if (mimeType == null) {
                if (fieldValues != null && i < fieldValues.size()) {
                    // assuming there will be as many formats overwritten as there are specified
                    format = fieldValues.get(i).toString();
                    mimeType = CommonUtils.normalizeMimeType(format);
                } else {
                    mimeType = CommonUtils.normalizeMimeType("");
                    format = mimeType;
                }
            } else {
                format = CommonUtils.normalizeMimeType(mimeType);
            }

            FormatPostProcessor processor = new FormatPostProcessor();
            mimeType = processor.process(mimeType);

            // TODO: this check should probably be moved into Solr (by using some minimum length filter)
            if (!mimeType.equals("")) {
                solrDocument.addField(FacetConstants.FIELD_FORMAT, mimeType);
            }
            solrDocument.addField(FacetConstants.FIELD_RESOURCE, mimeType + FacetConstants.FIELD_RESOURCE_SPLIT_CHAR
                    + resource.getResourceName());
        }
    }
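
    // The FIELD_RESOURCE value has the shape
    // "<mimeType><FIELD_RESOURCE_SPLIT_CHAR><resourceName>"; the actual split
    // character is defined in FacetConstants and is assumed not to occur in
    // MIME types, so the pair can be split again unambiguously on display.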

    /**
     * Sends the current list of SolrInputDocuments to the Solr server and
     * clears the list afterwards
     *
     * @throws SolrServerException
     * @throws IOException
     */
    protected void sendDocs() throws SolrServerException, IOException {
        LOG.info("Sending " + docs.size() + " docs to solr server. Total number of docs updated till now: " + nrOFDocumentsSend);
        nrOFDocumentsSend += docs.size();
        solrServer.add(docs);
        if (serverError != null) {
            // an asynchronous error from an earlier batch was recorded by handleError()
            throw new SolrServerException(serverError);
        }
        docs = new ArrayList<SolrInputDocument>();
    }

    /**
     * Builds the suggester index for autocompletion
     *
     * @throws SolrServerException
     * @throws MalformedURLException
     */
    private void buildSuggesterIndex() throws SolrServerException, MalformedURLException {
        LOG.info("Building index for autocompletion.");
        HashMap<String, String> paramMap = new HashMap<String, String>();
        paramMap.put("qt", "/suggest");
        paramMap.put("spellcheck.build", "true");
        SolrParams params = new MapSolrParams(paramMap);
        solrServer.query(params);
    }
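
    // The query above is roughly equivalent to requesting
    // <solrUrl>/suggest?spellcheck.build=true, assuming a suggester component
    // is configured under the /suggest request handler in solrconfig.xml.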

    public static VloConfig config;

    /**
     * @param args
     * @throws IOException
     */
    public static void main(String[] args) throws MalformedURLException, IOException {

        // path to the configuration file
        String configFile = null;
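
        // Typical invocation (hypothetical path; note that the value is
        // interpreted as a URL further below):
        //   java ... MetadataImporter -c file:/etc/vlo/VloConfig.xml
        // or, alternatively, via the system property:
        //   java -DconfigFile=file:/etc/vlo/VloConfig.xml ... MetadataImporter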

        // use the Apache commons-cli framework for getting command line parameters
        Options options = new Options();

        /**
         * Add a "c" option, the option indicating the specification of an XML
         * configuration file
         */
        options.addOption("c", true, "-c <file> : use parameters specified in <file>");

        CommandLineParser parser = new PosixParser();

        try {
            // parse the command line arguments
            CommandLine cmd = parser.parse(options, args);
            if (cmd.hasOption("c")) {
                // the "c" option was specified, now get its value
                configFile = cmd.getOptionValue("c");
            }
        } catch (org.apache.commons.cli.ParseException ex) {
            /**
             * Caught an exception caused by command line parsing. Try to get
             * the name of the configuration file by querying the system
             * property.
             */
            String message = "Command line parsing failed. " + ex.getMessage();
            LOG.error(message);
            System.err.println(message);
        }

        if (configFile == null) {
            String message = "Could not get config file name via the command line, trying the system properties.";
            LOG.info(message);

            String key = "configFile";
            configFile = System.getProperty(key);
        }

        if (configFile == null) {
            String message = "Could not get the file name as a system property either - stopping.";
            LOG.error(message);
        } else {
            // read the configuration from the externally supplied file
            XmlVloConfigFactory configFactory = new XmlVloConfigFactory(new URL(configFile));
            MetadataImporter.config = configFactory.newConfig();

            // optionally, modify the configuration here

            // create and start the importer
            MetadataImporter importer = new MetadataImporter();
            importer.startImport();

            // finished importing

            if (MetadataImporter.config.printMapping()) {
                File file = new File("xsdMapping.txt");
                FacetMappingFactory.printMapping(file);
                LOG.info("Printed facetMapping in " + file);
            }
        }
    }
}