source: vlo/trunk/vlo_webapp/src/main/java/eu/clarin/cmdi/vlo/importer/MetadataImporter.java @ 2496

Last change on this file since 2496 was 2496, checked in by teckart, 11 years ago

Added method that starts building autocomplete (suggester) index after import

File size: 15.9 KB
Line 
1package eu.clarin.cmdi.vlo.importer;
2
3import java.io.File;
4import java.io.IOException;
5import java.net.MalformedURLException;
6import java.util.ArrayList;
7import java.util.Collection;
8import java.util.HashMap;
9import java.util.HashSet;
10import java.util.List;
11import java.util.Map;
12import java.util.Set;
13
14import org.apache.commons.io.FileUtils;
15import org.apache.solr.client.solrj.SolrServerException;
16import org.apache.solr.client.solrj.impl.StreamingUpdateSolrServer;
17import org.apache.solr.client.solrj.util.ClientUtils;
18import org.apache.solr.common.SolrInputDocument;
19import org.apache.solr.common.params.MapSolrParams;
20import org.apache.solr.common.params.SolrParams;
21import org.slf4j.Logger;
22import org.slf4j.LoggerFactory;
23import org.springframework.beans.factory.BeanFactory;
24import org.springframework.context.support.ClassPathXmlApplicationContext;
25
26import eu.clarin.cmdi.vlo.CommonUtils;
27import eu.clarin.cmdi.vlo.Configuration;
28import eu.clarin.cmdi.vlo.FacetConstants;
29
30
/**
 * The main MetadataImporter class; also contains the main method.
 *
 * The metadata importer reads all the config files and then, for each metadata file in each defined directory structure, parses the file and imports it as defined in the configuration.
 * The startImport method starts the import process.
 */
37
@SuppressWarnings({"serial"})
public class MetadataImporter {

    /**
     * Defines which files to try and parse.
     * In this case all files ending in "xml" or "cmdi".
     */
    private static final String[] VALID_CMDI_EXTENSIONS = new String[] { "xml", "cmdi" };

    /** Class logger. */
    private final static Logger LOG = LoggerFactory.getLogger(MetadataImporter.class);
    /**
     * Last asynchronous error reported by the Solr server; written from the
     * handleError() callback installed in initSolrServer() and checked after
     * every batch in sendDocs().
     */
    private static Throwable serverError;
    /**
     * The Solr server used for indexing; created by initSolrServer().
     */
    private StreamingUpdateSolrServer solrServer;

    /**
     * Defines the post-processor associations.
     * At import, for each facet value, this map is checked and all postprocessors associated with the facet _type_ are applied to the value before storing the new value in the solr document.
     */
    final static Map<String, PostProcessor> POST_PROCESSORS = new HashMap<String, PostProcessor>();
    static {
        POST_PROCESSORS.put(FacetConstants.FIELD_COUNTRY, new CountryNamePostProcessor());
        POST_PROCESSORS.put(FacetConstants.FIELD_LANGUAGE, new LanguageCodePostProcessor());
        POST_PROCESSORS.put(FacetConstants.FIELD_RESOURCE_TYPE, new ResourceTypePostProcessor());
        POST_PROCESSORS.put(FacetConstants.FIELD_LANGUAGES, new LanguageLinkPostProcessor());
        POST_PROCESSORS.put(FacetConstants.FIELD_NATIONAL_PROJECT, new NationalProjectPostProcessor());
        POST_PROCESSORS.put(FacetConstants.FIELD_CLARIN_PROFILE, new CMDIComponentProfileNamePostProcessor());
    }

    /**
     * Ids of records processed so far (usually MDSelflinks); used in
     * processCmdi() to skip duplicates.
     */
    private final Set<String> processedIds = new HashSet<String>();
    /**
     * Buffer of Solr documents: documents are sent to the Solr server in
     * batches (see sendDocs), which is more efficient than one-by-one updates.
     */
    protected List<SolrInputDocument> docs = new ArrayList<SolrInputDocument>();
    /**
     * Importer configuration (data roots, delete flags, ...).
     */
    private final ImporterConfig config;

    // Import statistics, reported at the end of startImport().
    private int nrOFDocumentsUpdated;              // documents sent to the Solr server
    private int nrOfFilesAnalyzed = 0;             // metadata files read from disk
    private int nrOfFilesWithoutId = 0;            // files for which an id had to be generated
    private int nrOfFilesWithoutDataResources = 0; // 'collection' files that were skipped
    private int nrOfFilesWithError = 0;            // files that failed to parse
    /**
     * Constructor, wants to know the config.
     * @param config the importer configuration (data roots, delete flags, ...)
     */
    public MetadataImporter(ImporterConfig config) {
        this.config = config;
    }
102
    /**
     * Retrieve all files with VALID_CMDI_EXTENSIONS from all DataRoot entries and starts processing for every single file.
     * Whatever happens, the buffered documents are committed and the suggester
     * index is rebuilt in the finally block, so a partial import still becomes
     * visible in Solr.
     * @throws MalformedURLException when the configured Solr URL is malformed
     */
    void startImport() throws MalformedURLException {

        initSolrServer();
        List<DataRoot> dataRoots = checkDataRoots();
        long start = System.currentTimeMillis();
        try {
            // Delete the whole Solr db
            if (config.isDeleteAllFirst()) {
                LOG.info("Deleting original data...");
                solrServer.deleteByQuery("*:*");
                solrServer.commit();
                LOG.info("Deleting original data done.");
            }
            for (DataRoot dataRoot : dataRoots) {
                LOG.info("Start of processing: " + dataRoot.getOriginName());
                if (dataRoot.isDeleteFirst()) {
                    LOG.info("Deleting data for data provider: " + dataRoot.getOriginName());
                    // escape the origin name so special query characters cannot break the delete query
                    solrServer.deleteByQuery(FacetConstants.FIELD_DATA_PROVIDER + ":" + ClientUtils.escapeQueryChars(dataRoot.getOriginName()));
                    LOG.info("Deleting data of provider done.");
                }
                CMDIDataProcessor processor = new CMDIParserVTDXML(POST_PROCESSORS);
                List<File> files = getFilesFromDataRoot(dataRoot.getRootFile());
                for (File file : files) {
                    LOG.debug("PROCESSING FILE: " + file.getAbsolutePath());
                    processCmdi(file, dataRoot, processor);
                }
                // flush any documents still buffered for this data root
                if (!docs.isEmpty()) {
                    sendDocs();
                }
                LOG.info("End of processing: " + dataRoot.getOriginName());
            }
        } catch (SolrServerException e) {
            LOG.error("error updating files:\n", e);
            LOG.error("Also see vlo_solr server logs for more information");
        } catch (IOException e) {
            LOG.error("error updating files:\n", e);
        } finally {
            try {
                // commit even after an error so already-sent documents are persisted
                if (solrServer != null) {
                    solrServer.commit();
                    buildSuggesterIndex();
                }
            } catch (SolrServerException e) {
                LOG.error("cannot commit:\n", e);
            } catch (IOException e) {
                LOG.error("cannot commit:\n", e);
            }
        }
        long took = (System.currentTimeMillis() - start) / 1000;
        LOG.info("Found " + nrOfFilesWithoutId + " file(s) without an id. (id is generated based on fileName but that may not be unique)");
        LOG.info("Found " + nrOfFilesWithError + " file(s) with errors.");
        LOG.info("Found " + nrOfFilesWithoutDataResources
                + " file(s) without data resources (metadata descriptions without resources are ignored).");
        LOG.info("Update of " + nrOFDocumentsUpdated + " took " + took + " secs. Total nr of files analyzed " + nrOfFilesAnalyzed);
    }
162
163    /**
164     * Check a List of DataRoots for existence of RootFile (typically parent directory of metadata files)
165     * @return
166     */
167    private List<DataRoot> checkDataRoots() {
168        List<DataRoot> dataRoots = config.getDataRoots();
169        for (DataRoot dataRoot : dataRoots) {
170            if (!dataRoot.getRootFile().exists()) {
171                LOG.error("Root file " + dataRoot.getRootFile() + " does not exist. Probable configuration error so stopping import.");
172                System.exit(1);
173            }
174        }
175        return dataRoots;
176    }
177
178    /**
179     * Get the rootFile or all files with VALID_CMDI_EXTENSIONS if rootFile is a directory
180     * @param rootFile
181     * @return List with the rootFile or all contained files if rootFile is a directory
182     */
183    private List<File> getFilesFromDataRoot(File rootFile) {
184        List<File> result = new ArrayList<File>();
185        if (rootFile.isFile()) {
186            result.add(rootFile);
187        } else {
188            Collection<File> listFiles = FileUtils.listFiles(rootFile, VALID_CMDI_EXTENSIONS, true);
189            result.addAll(listFiles);
190        }
191        return result;
192    }
193
194    /**
195     * Initialize SolrServer as specified in configuration file
196     * @throws MalformedURLException
197     */
198    protected void initSolrServer() throws MalformedURLException {
199        String solrUrl = Configuration.getInstance().getSolrUrl();
200        LOG.info("Initializing Solr Server on " + solrUrl);
201        solrServer = new StreamingUpdateSolrServer(solrUrl, 1000, 2) {
202            @Override
203            public void handleError(Throwable ex) {
204                super.handleError(ex);
205                serverError = ex;
206            }
207        };
208    }
209
210    /**
211     * Process single CMDI file with CMDIDataProcessor
212     * @param file CMDI input file
213     * @param dataOrigin
214     * @param processor
215     * @throws SolrServerException
216     * @throws IOException
217     */
218    private void processCmdi(File file, DataRoot dataOrigin, CMDIDataProcessor processor) throws SolrServerException, IOException {
219        nrOfFilesAnalyzed++;
220        CMDIData cmdiData = null;
221        try {
222            cmdiData = processor.process(file);
223            if (!idOk(cmdiData.getId())) {
224                cmdiData.setId(dataOrigin.getOriginName() + "/" + file.getName()); //No id found in the metadata file so making one up based on the file name. Not quaranteed to be unique, but we have to set something.
225                nrOfFilesWithoutId++;
226            }
227        } catch (Exception e) {
228            LOG.error("error in file: " + file + " Exception", e);
229            nrOfFilesWithError++;
230        }
231        if (cmdiData != null && processedIds.add(cmdiData.getId())) {
232            SolrInputDocument solrDocument = cmdiData.getSolrDocument();
233            if (solrDocument != null) {
234                if (!cmdiData.getDataResources().isEmpty() || cmdiData.getMetadataResources().isEmpty()) {
235                    // We only add metadata files that have data resources (1) or files that don't link to other metadata files (2):
236                    //  1) files with data resources are obviously interesting
237                    //  2) files without metadata links and without dataResource can be interesting e.g. olac files describing a corpus with a link to the original archive.
238                    // Other files will have only metadata resources and are considered 'collection' metadata files they
239                    // are usually not very interesting (think imdi corpus files) and will not be included.
240                    updateDocument(solrDocument, cmdiData, file, dataOrigin);
241                } else {
242                    nrOfFilesWithoutDataResources++;
243                }
244            }
245        }
246    }
247
248    /**
249     * Check id for validness
250     * @param id
251     * @return true if id is acceptable, false otherwise
252     */
253    private boolean idOk(String id) {
254        return id != null && !id.isEmpty();
255    }
256
257    /**
258     * Adds some additional information from DataRoot to solrDocument, add solrDocument to document list, submits list to SolrServer every 1000 files
259     * @param solrDocument
260     * @param cmdiData
261     * @param file
262     * @param dataOrigin
263     * @throws SolrServerException
264     * @throws IOException
265     */
266    private void updateDocument(SolrInputDocument solrDocument, CMDIData cmdiData, File file, DataRoot dataOrigin) throws SolrServerException,
267            IOException {
268        if (!solrDocument.containsKey(FacetConstants.FIELD_COLLECTION)) {
269            solrDocument.addField(FacetConstants.FIELD_COLLECTION, dataOrigin.getOriginName());
270        }
271        solrDocument.addField(FacetConstants.FIELD_DATA_PROVIDER, dataOrigin.getOriginName());
272        solrDocument.addField(FacetConstants.FIELD_ID, cmdiData.getId());
273        solrDocument.addField(FacetConstants.FIELD_FILENAME, file.getAbsolutePath());
274
275        String metadataSourceUrl = dataOrigin.getPrefix();
276        //System.out.println(dataOrigin.getTostrip());
277        //System.out.println(dataOrigin.getTostrip().length());
278        //System.out.println(file.getAbsolutePath());
279        metadataSourceUrl += file.getAbsolutePath().substring(dataOrigin.getTostrip().length());
280
281        solrDocument.addField(FacetConstants.FIELD_COMPLETE_METADATA, metadataSourceUrl);
282       
283        // add SearchServices (should be CQL endpoint)
284        for(Resource resource : cmdiData.getSearchResources())
285                solrDocument.addField(FacetConstants.FIELD_SEARCH_SERVICE, resource.getResourceName());       
286       
287        addResourceData(solrDocument, cmdiData);
288        docs.add(solrDocument);
289        if (docs.size() == 1000) {
290            sendDocs();
291        }
292    }
293
    /**
     * Adds two fields FIELD_RESOURCE_TYPE and FIELD_RESOURCE. The Type can be specified in the "ResourceType" element of an imdi file or
     * possibly overwritten by some more specific xpath (as in the LRT cmdi files). So if a type is overwritten and already in the
     * solrDocument we take that type.
     *
     * NOTE(review): the i-th pre-existing FIELD_RESOURCE_TYPE value is matched to the
     * i-th data resource purely by index — this assumes both lists are in the same
     * order and of matching length.
     */
    private void addResourceData(SolrInputDocument solrDocument, CMDIData cmdiData) {
        // Snapshot any resource types already set on the document; they may override mime types below.
        List<Object> fieldValues = solrDocument.containsKey(FacetConstants.FIELD_RESOURCE_TYPE) ? new ArrayList<Object>(solrDocument
                .getFieldValues(FacetConstants.FIELD_RESOURCE_TYPE)) : null;
        solrDocument.removeField(FacetConstants.FIELD_RESOURCE_TYPE); //Remove old values they might be overwritten.
        List<Resource> resources = cmdiData.getDataResources();
        for (int i = 0; i < resources.size(); i++) {
            Resource resource = resources.get(i);
            String mimeType = resource.getMimeType();
            String resourceType = mimeType;
            if (mimeType == null) {
                // No mime type on the resource: fall back to the overridden type (by index), else to the normalized empty default.
                if (fieldValues != null && i < fieldValues.size()) {
                    resourceType = fieldValues.get(i).toString(); //assuming there will be as many resource types overwritten as there are specified
                    mimeType = CommonUtils.normalizeMimeType(resourceType);
                } else {
                    mimeType = CommonUtils.normalizeMimeType("");
                    resourceType = mimeType;
                }
            } else {
                resourceType = CommonUtils.normalizeMimeType(mimeType);
            }
            solrDocument.addField(FacetConstants.FIELD_RESOURCE_TYPE, resourceType);
            // FIELD_RESOURCE combines mime type and resource name separated by FIELD_RESOURCE_SPLIT_CHAR.
            solrDocument.addField(FacetConstants.FIELD_RESOURCE, mimeType + FacetConstants.FIELD_RESOURCE_SPLIT_CHAR
                    + resource.getResourceName());
        }
    }
324
325    /**
326     * Send current list of SolrImputDocuments to SolrServer and clears list afterwards
327     * @throws SolrServerException
328     * @throws IOException
329     */
330    protected void sendDocs() throws SolrServerException, IOException {
331        LOG.info("Sending " + docs.size() + " docs to solr server. Total number of docs updated till now: " + nrOFDocumentsUpdated);
332        nrOFDocumentsUpdated += docs.size();
333        solrServer.add(docs);
334        if (serverError != null) {
335            throw new SolrServerException(serverError);
336        }
337        docs = new ArrayList<SolrInputDocument>();
338    }
339   
340    /**
341     * Builds suggester index for autocompletion
342     * @throws SolrServerException
343     * @throws MalformedURLException
344     */
345    private void buildSuggesterIndex() throws SolrServerException, MalformedURLException {
346        LOG.info("Building index for autocompletion.");
347        HashMap<String,String> paramMap = new HashMap<String, String>();
348        paramMap.put("qt", "/suggest");
349        paramMap.put("spellcheck.build", "true");
350        SolrParams params = new MapSolrParams(paramMap);
351        solrServer.query(params);
352    }
353
354    /**
355     * @param args
356     * @throws IOException
357     */
358    public static void main(String[] args) throws IOException {
359        BeanFactory factory = new ClassPathXmlApplicationContext(new String[] { Configuration.CONFIG_FILE });
360        factory.getBean("configuration");
361        Configuration cfg = Configuration.getInstance();
362
363        BeanFactory factory2 = new ClassPathXmlApplicationContext(new String[] { cfg.getImporterConfigFile() } );
364
365        ImporterConfig config = (ImporterConfig) factory2.getBean("importerConfig", ImporterConfig.class);
366
367        MetadataImporter importer = new MetadataImporter(config);
368        importer.startImport();
369        if (config.isPrintMapping()) {
370            File file = new File("xsdMapping.txt");
371            FacetMappingFactory.printMapping(file);
372            LOG.info("Printed facetMapping in " + file);
373        }
374    }
375
376}
Note: See TracBrowser for help on using the repository browser.