Context Navigation

source: vlo/branches/vlo-3.3-oeaw/vlo-importer/src/main/java/eu/clarin/cmdi/vlo/importer/MetadataImporter.java @ 6400

Last change on this file since 6400 was 6400, checked in by davor.ostojic@oeaw.ac.at, 9 years ago
enrichment with data for new facets
File size: 28.8 KB

Line
1	package eu.clarin.cmdi.vlo.importer;
2
3	import java.io.File;
4	import java.io.IOException;
5	import java.net.MalformedURLException;
6	import java.net.URL;
7	import java.text.SimpleDateFormat;
8	import java.util.ArrayList;
9	import java.util.Arrays;
10	import java.util.Date;
11	import java.util.HashMap;
12	import java.util.HashSet;
13	import java.util.Iterator;
14	import java.util.LinkedList;
15	import java.util.List;
16	import java.util.Map;
17	import java.util.Set;
18
19	import org.apache.commons.cli.CommandLine;
20	import org.apache.commons.cli.CommandLineParser;
21	import org.apache.commons.cli.Options;
22	import org.apache.commons.cli.PosixParser;
23	import org.apache.commons.io.FileUtils;
24	import org.apache.solr.client.solrj.SolrQuery;
25	import org.apache.solr.client.solrj.SolrServerException;
26	import org.apache.solr.client.solrj.impl.ConcurrentUpdateSolrServer;
27	import org.apache.solr.client.solrj.util.ClientUtils;
28	import org.apache.solr.common.SolrDocumentList;
29	import org.apache.solr.common.SolrInputDocument;
30	import org.apache.solr.common.params.MapSolrParams;
31	import org.apache.solr.common.params.SolrParams;
32	import org.slf4j.Logger;
33	import org.slf4j.LoggerFactory;
34
35	import eu.clarin.cmdi.vlo.CommonUtils;
36	import eu.clarin.cmdi.vlo.FacetConstants;
37	import eu.clarin.cmdi.vlo.LanguageCodeUtils;
38	import eu.clarin.cmdi.vlo.config.DataRoot;
39	import eu.clarin.cmdi.vlo.config.VloConfig;
40	import eu.clarin.cmdi.vlo.config.XmlVloConfigFactory;
41
42	/**
43	* The main metadata importer class. Also contains the main function.
44	*
45	* The importer reads the configuration and then, for each metadata file
46	* found under each configured data root, parses the file and imports it
47	* as defined in the configuration. The startImport method drives the whole
48	* import.
49	*/
50	@SuppressWarnings({"serial"})
51	public class MetadataImporter {
52
53	/**
54	* Defines which files to try and parse. In this case all files ending in
55	* "xml" or "cmdi".
56	*/
57	private static final String[] VALID_CMDI_EXTENSIONS = new String[]{"xml", "cmdi"};
58
59	/**
60	* Logger shared by this class and its subclasses.
61	*/
62	protected final static Logger LOG = LoggerFactory.getLogger(MetadataImporter.class);
63	/**
64	* Some place to store errors.
65	*/
66	private static Throwable serverError;
67	/**
68	* the solr server.
69	*/
70	private ConcurrentUpdateSolrServer solrServer;
/**
 * Registry of post-processors keyed by facet name. During import the value
 * of a facet is run through the post-processor registered for that facet
 * before the resulting value is stored in the Solr document.
 */
static final Map<String, PostProcessor> POST_PROCESSORS = new HashMap<String, PostProcessor>();

static {
    final Map<String, PostProcessor> registry = POST_PROCESSORS;

    // identifier
    registry.put(FacetConstants.FIELD_ID, new IdPostProcessor());

    // geography
    registry.put(FacetConstants.FIELD_CONTINENT, new ContinentNamePostProcessor());
    registry.put(FacetConstants.FIELD_COUNTRY, new CountryNamePostProcessor());

    // language
    registry.put(FacetConstants.FIELD_LANGUAGE_CODE, new LanguageCodePostProcessor());
    registry.put(FacetConstants.FIELD_LANGUAGE_NAME, new LanguageNamePostProcessor());

    // remaining facets
    registry.put(FacetConstants.FIELD_AVAILABILITY, new AvailabilityPostProcessor());
    registry.put(FacetConstants.FIELD_ORGANISATION, new OrganisationPostProcessor());
    registry.put(FacetConstants.FIELD_TEMPORAL_COVERAGE, new TemporalCoveragePostProcessor());
    registry.put(FacetConstants.FIELD_NATIONAL_PROJECT, new NationalProjectPostProcessor());
    registry.put(FacetConstants.FIELD_CLARIN_PROFILE, new CMDIComponentProfileNamePostProcessor());
    registry.put(FacetConstants.FIELD_RESOURCE_CLASS, new ResourceClassPostProcessor());
}
92
/**
 * Creates an importer without a command line data root list, so no
 * data root filtering is applied.
 */
public MetadataImporter() {
    this(null);
}

/**
 * Creates an importer restricted to the data roots named on the command
 * line.
 *
 * @param clDatarootsList whitespace separated list of data root paths, or
 * {@code null} to import every configured data root
 */
public MetadataImporter(String clDatarootsList) {
    this.clDatarootsList = clDatarootsList;
}
101
102	/**
103	* Contains MDSelflinks (usually). Just to know what we have already done.
104	*/
105	protected final Set<String> processedIds = new HashSet<String>();
106	/**
107	* Some caching for solr documents (we are more efficient if we ram a whole
108	* bunch to the solr server at once.
109	*/
110	protected List<SolrInputDocument> docs = new ArrayList<SolrInputDocument>();
111
112	// SOME STATS
113	protected int nrOFDocumentsSend;
114	protected int nrOfFilesAnalyzed = 0;
115	protected int nrOfFilesWithoutId = 0;
116	protected int nrOfIgnoredFiles = 0;
117	protected int nrOfFilesWithError = 0;
118
119	/**
120	* Retrieve all files with VALID_CMDI_EXTENSIONS from all DataRoot entries
121	* and starts processing for every single file
122	*
123	* @throws MalformedURLException
124	*/
125	void startImport() throws MalformedURLException {
126
127	initSolrServer();
128	List<DataRoot> dataRoots = checkDataRoots();
129
130	dataRoots = filterDataRootsWithCLArgs(dataRoots);
131
132	long start = System.currentTimeMillis();
133	try {
134	// Delete the whole Solr db
135	if (config.getDeleteAllFirst()) {
136	LOG.info("Deleting original data...");
137	solrServer.deleteByQuery(":");
138	solrServer.commit();
139	LOG.info("Deleting original data done.");
140	}
141
142	// Import the specified data roots
143	for (DataRoot dataRoot : dataRoots) {
144	LOG.info("Start of processing: " + dataRoot.getOriginName());
145	if (dataRoot.deleteFirst()) {
146	LOG.info("Deleting data for data provider: " + dataRoot.getOriginName());
147	solrServer.deleteByQuery(FacetConstants.FIELD_DATA_PROVIDER + ":" + ClientUtils.escapeQueryChars(dataRoot.getOriginName()));
148	LOG.info("Deleting data of provider done.");
149	}
150	CMDIDataProcessor processor = new CMDIParserVTDXML(POST_PROCESSORS, false);
151	List<List<File>> centreFilesList = getFilesFromDataRoot(dataRoot.getRootFile());
152	// import files from every endpoint
153	for(List<File> centreFiles : centreFilesList) {
154	LOG.info("Processing directory: {}", centreFiles.get(0).getParent());
155	for (File file : centreFiles) {
156	if (config.getMaxFileSize() > 0
157	&& file.length() > config.getMaxFileSize()) {
158	LOG.info("Skipping " + file.getAbsolutePath() + " because it is too large.");
159	} else {
160	LOG.debug("PROCESSING FILE: {}", file.getAbsolutePath());
161	processCmdi(file, dataRoot, processor);
162	}
163	}
164	if (!docs.isEmpty()) {
165	sendDocs();
166	}
167	solrServer.commit();
168	updateDocumentHierarchy();
169	}
170	LOG.info("End of processing: " + dataRoot.getOriginName());
171	}
172
173	// delete outdated entries (based on maxDaysInSolr parameter)
174	if (config.getMaxDaysInSolr() > 0 && config.getDeleteAllFirst() == false) {
175	LOG.info("Deleting old files that were not seen for more than " + config.getMaxDaysInSolr() + " days...");
176	solrServer.deleteByQuery(FacetConstants.FIELD_LAST_SEEN + ":[* TO NOW-" + config.getMaxDaysInSolr() + "DAYS]");
177	LOG.info("Deleting old files done.");
178	}
179	} catch (SolrServerException e) {
180	LOG.error("error updating files:\n", e);
181	LOG.error("Also see vlo_solr server logs for more information");
182	} catch (IOException e) {
183	LOG.error("error updating files:\n", e);
184	} finally {
185	try {
186	if (solrServer != null) {
187	solrServer.commit();
188	buildSuggesterIndex();
189	}
190	} catch (SolrServerException e) {
191	LOG.error("cannot commit:\n", e);
192	} catch (IOException e) {
193	LOG.error("cannot commit:\n", e);
194	}
195	}
196	long took = (System.currentTimeMillis() - start) / 1000;
197	LOG.info("Found " + nrOfFilesWithoutId + " file(s) without an id. (id is generated based on fileName but that may not be unique)");
198	LOG.info("Found " + nrOfFilesWithError + " file(s) with errors.");
199	LOG.info("Found " + nrOfIgnoredFiles
200	+ " file(s) that where ignored (files without resources or any link to a search service or landing page are ignored).");
201	LOG.info("Update of " + nrOFDocumentsSend + " took " + took + " secs. Total nr of files analyzed " + nrOfFilesAnalyzed);
202	solrServer.shutdown();
203	}
204
205	/**
206	* Check a List of DataRoots for existence of RootFile (typically parent
207	* directory of metadata files)
208	*
209	* @return
210	*/
211	protected List<DataRoot> checkDataRoots() {
212	List<DataRoot> dataRoots = config.getDataRoots();
213	List<DataRoot> existingDataRoots = new LinkedList<DataRoot>();
214	for (DataRoot dataRoot : dataRoots) {
215	if (!dataRoot.getRootFile().exists()) {
216	LOG.warn("Root file " + dataRoot.getRootFile() + " does not exist. It could be configuration error! Proceeding with next ...");
217	} else{
218	existingDataRoots.add(dataRoot);
219	}
220
221	}
222	return existingDataRoots;
223	}
224
225	/**
226	* if user specified which data roots should be imported,
227	* list of existing data roots will be filtered with the list from user
228	*
229	* @return
230	*/
231	protected List<DataRoot> filterDataRootsWithCLArgs(List<DataRoot> dataRoots){
232	if(clDatarootsList == null)
233	return dataRoots;
234
235
236	LOG.info("Filtering configured data root files with command line arguments: \"" + clDatarootsList + "\"" ) ;
237
238	LinkedList<File> fsDataRoots = new LinkedList<File>();
239
240	List<String> paths = Arrays.asList((clDatarootsList.split("\\s+")));
241
242	//Convert String paths to File objects for comparison
243	for(String path: paths)
244	fsDataRoots.add(new File(path));
245
246	List<DataRoot> filteredDataRoots = new LinkedList<DataRoot>();
247	try{
248	//filter data
249	dr: for(DataRoot dataRoot: dataRoots){
250	for(File fsDataRoot: fsDataRoots){
251	if(fsDataRoot.getCanonicalPath().equals(dataRoot.getRootFile().getCanonicalPath())){
252	filteredDataRoots.add(dataRoot);
253	fsDataRoots.remove(fsDataRoot);
254	continue dr;
255	}
256	}
257	LOG.info("Root file " + dataRoot.getRootFile() + " will be omitted from processing");
258	}
259	}catch (IOException e){
260	filteredDataRoots = dataRoots;
261	}
262
263
264	return filteredDataRoots;
265	}
266
267	/**
268	* Get all files with VALID_CMDI_EXTENSIONS if rootFile is a
269	* directory that contains center directories or rootFile if it is a file
270	*
271	* @param rootFile
272	* @return List with centre Lists of all contained CMDI files if rootFile is a
273	* directory or rootFile if it is a File
274	*/
275	protected List<List<File>> getFilesFromDataRoot(File rootFile) {
276	List<List<File>> result = new ArrayList<List<File>>();
277	if(rootFile.isFile()) {
278	List<File> singleFileList = new ArrayList<File>();
279	singleFileList.add(rootFile);
280	result.add(singleFileList);
281	} else {
282	File[] centerDirs = rootFile.listFiles();
283	for(File centerDir : centerDirs) {
284	List<File> centerFileList = new ArrayList<File>();
285	if(centerDir.isDirectory()) {
286	centerFileList.addAll(FileUtils.listFiles(centerDir, VALID_CMDI_EXTENSIONS, true));
287	}
288
289	if(!centerFileList.isEmpty())
290	result.add(centerFileList);
291	}
292	}
293	return result;
294	}
295
296	/**
297	* Create an interface to the SOLR server.
298	*
299	* After the interface has been created the importer can send documents to
300	* the server. Sending documents involves a queue. The importer adds
301	* documents to a queue, and dedicated threads will empty it, and
302	* effectively store store the documents.
303	*
304	* @throws MalformedURLException
305	*/
306	protected void initSolrServer() throws MalformedURLException {
307	String solrUrl = config.getSolrUrl();
308	LOG.info("Initializing Solr Server on " + solrUrl);
309
310	/* Specify the number of documents in the queue that will trigger the
311	* threads, two of them, emptying it.
312	*/
313	solrServer = new ConcurrentUpdateSolrServer(solrUrl,
314	config.getMinDocsInSolrQueue(), 2) {
315	/*
316	* Let the super class method handle exceptions. Make the
317	* exception available to the importer in the form of the
318	* serverError variable.
319	*/
320	@Override
321	public void handleError(Throwable exception) {
322	super.handleError(exception);
323	serverError = exception;
324	}
325	};
326	}
327
328	/**
329	* Process single CMDI file with CMDIDataProcessor
330	*
331	* @param file CMDI input file
332	* @param dataOrigin
333	* @param processor
334	* @throws SolrServerException
335	* @throws IOException
336	*/
337	protected void processCmdi(File file, DataRoot dataOrigin, CMDIDataProcessor processor) throws SolrServerException, IOException {
338	nrOfFilesAnalyzed++;
339	CMDIData cmdiData = null;
340	try {
341	cmdiData = processor.process(file);
342	if (!idOk(cmdiData.getId())) {
343	cmdiData.setId(dataOrigin.getOriginName() + "/" + file.getName()); //No id found in the metadata file so making one up based on the file name. Not quaranteed to be unique, but we have to set something.
344	nrOfFilesWithoutId++;
345	}
346	} catch (Exception e) {
347	LOG.error("error in file: {}", file, e);
348	nrOfFilesWithError++;
349	}
350	if (cmdiData != null) {
351	if (processedIds.add(cmdiData.getId())) {
352	SolrInputDocument solrDocument = cmdiData.getSolrDocument();
353	if (solrDocument != null) {
354	if (!cmdiData.getDataResources().isEmpty() \|\| !cmdiData.getLandingPageResources().isEmpty()
355	\|\| !cmdiData.getSearchResources().isEmpty() \|\| !cmdiData.getSearchPageResources().isEmpty()
356	\|\| cmdiData.getMetadataResources().isEmpty()) {
357	// We only add metadata files that have
358	// 1) data resources or
359	// 2) a landing page or
360	// 3) a search service (like SRU/CQL) or
361	// 4) a search page or
362	// 5) that have none of the above but also lack any metadata links (e.g. olac files describing a corpus with a link to the original archive).
363	// Other files will have only metadata resources and are considered 'collection' metadata files they
364	// are usually not very interesting (think imdi corpus files) and will not be included.
365	updateDocument(solrDocument, cmdiData, file, dataOrigin);
366	} else {
367	nrOfIgnoredFiles++;
368	}
369	}
370	} else {
371	LOG.warn("Skipping {}, already processed id: {}", file, cmdiData.getId());
372	}
373	}
374	}
375
376	/**
377	* Check id for validness
378	*
379	* @param id
380	* @return true if id is acceptable, false otherwise
381	*/
382	protected boolean idOk(String id) {
383	return id != null && !id.isEmpty();
384	}
385
386	/**
387	* Adds some additional information from DataRoot to solrDocument, add
388	* solrDocument to document list, submits list to SolrServer every 1000
389	* files
390	*
391	* @param solrDocument
392	* @param cmdiData
393	* @param file
394	* @param dataOrigin
395	* @throws SolrServerException
396	* @throws IOException
397	*/
398	protected void updateDocument(SolrInputDocument solrDocument, CMDIData cmdiData, File file, DataRoot dataOrigin) throws SolrServerException,
399	IOException {
400	if (!solrDocument.containsKey(FacetConstants.FIELD_COLLECTION)) {
401	solrDocument.addField(FacetConstants.FIELD_COLLECTION, dataOrigin.getOriginName());
402	}
403	solrDocument.addField(FacetConstants.FIELD_DATA_PROVIDER, dataOrigin.getOriginName());
404	solrDocument.addField(FacetConstants.FIELD_ID, cmdiData.getId());
405	solrDocument.addField(FacetConstants.FIELD_FILENAME, file.getAbsolutePath());
406
407	String metadataSourceUrl = dataOrigin.getPrefix();
408	metadataSourceUrl += file.getAbsolutePath().substring(dataOrigin.getToStrip().length());
409
410	solrDocument.addField(FacetConstants.FIELD_COMPLETE_METADATA, metadataSourceUrl);
411
412	// add SearchServices (should be CQL endpoint)
413	for (Resource resource : cmdiData.getSearchResources()) {
414	solrDocument.addField(FacetConstants.FIELD_SEARCH_SERVICE, resource.getResourceName());
415	}
416
417	// add landing page resource
418	for (Resource resource : cmdiData.getLandingPageResources()) {
419	solrDocument.addField(FacetConstants.FIELD_LANDINGPAGE, resource.getResourceName());
420	}
421
422	// add search page resource
423	for (Resource resource : cmdiData.getSearchPageResources()) {
424	solrDocument.addField(FacetConstants.FIELD_SEARCHPAGE, resource.getResourceName());
425	}
426
427	// add timestamp
428	Date dt = new Date();
429	SimpleDateFormat df = new SimpleDateFormat("yyyy-MM-dd'T'HH:mm:ss'Z'");
430	solrDocument.addField(FacetConstants.FIELD_LAST_SEEN, df.format(dt));
431
432	// add resource proxys
433	addResourceData(solrDocument, cmdiData);
434
435
436	//Add profileId & profileName & original resource type
437	solrDocument.addField("profileName", cmdiData.getProfileName());
438	solrDocument.addField("profileId", cmdiData.getProfileId());
439	solrDocument.addField("resourceClassOrig", cmdiData.getOriginalResourceType());
440
441
442	//Add missing facets with "null" val
443	for(String facet: config.getFacetFields()){
444	if(!solrDocument.getFieldNames().contains(facet))
445	solrDocument.addField(facet, "null");
446	}
447
448
449	LOG.debug("Adding document for submission to SOLR: {}", file);
450	docs.add(solrDocument);
451	if (docs.size() == config.getMaxDocsInList()) {
452	sendDocs();
453	}
454	}
455
456	/**
457	* Adds two fields FIELD_FORMAT and FIELD_RESOURCE. The Type can be
458	* specified in the "ResourceType" element of an imdi file or possibly
459	* overwritten by some more specific xpath (as in the LRT cmdi files). So if
460	* a type is overwritten and already in the solrDocument we take that type.
461	*
462	* @param solrDocument
463	* @param cmdiData
464	*/
465	protected void addResourceData(SolrInputDocument solrDocument, CMDIData cmdiData) {
466	List<Object> fieldValues = solrDocument.containsKey(FacetConstants.FIELD_FORMAT) ? new ArrayList<Object>(solrDocument
467	.getFieldValues(FacetConstants.FIELD_FORMAT)) : null;
468	solrDocument.removeField(FacetConstants.FIELD_FORMAT); //Remove old values they might be overwritten.
469	List<Resource> resources = cmdiData.getDataResources();
470	for (int i = 0; i < resources.size(); i++) {
471	Resource resource = resources.get(i);
472	String mimeType = resource.getMimeType();
473	if (mimeType == null) {
474	if (fieldValues != null && i < fieldValues.size()) {
475	mimeType = CommonUtils.normalizeMimeType(fieldValues.get(i).toString());
476	} else {
477	mimeType = CommonUtils.normalizeMimeType("");
478	}
479	}
480
481	FormatPostProcessor processor = new FormatPostProcessor();
482	mimeType = processor.process(mimeType).get(0);
483
484	// TODO check should probably be moved into Solr (by using some minimum length filter)
485	if (!mimeType.equals("")) {
486	solrDocument.addField(FacetConstants.FIELD_FORMAT, mimeType);
487	}
488	solrDocument.addField(FacetConstants.FIELD_RESOURCE, mimeType + FacetConstants.FIELD_RESOURCE_SPLIT_CHAR
489	+ resource.getResourceName());
490	}
491	solrDocument.addField(FacetConstants.FIELD_RESOURCE_COUNT, resources.size());
492	}
493
494	/**
495	* Send current list of SolrImputDocuments to SolrServer and clears list
496	* afterwards
497	*
498	* @throws SolrServerException
499	* @throws IOException
500	*/
501	protected void sendDocs() throws SolrServerException, IOException {
502	LOG.info("Sending " + docs.size() + " docs to solr server. Total number of docs updated till now: " + nrOFDocumentsSend);
503	nrOFDocumentsSend += docs.size();
504	solrServer.add(docs);
505	if (serverError != null) {
506	throw new SolrServerException(serverError);
507	}
508	docs = new ArrayList<SolrInputDocument>();
509	}
510
511	/**
512	* Builds suggester index for autocompletion
513	*
514	* @throws SolrServerException
515	* @throws MalformedURLException
516	*/
517	private void buildSuggesterIndex() throws SolrServerException, MalformedURLException {
518	LOG.info("Building index for autocompletion.");
519	HashMap<String, String> paramMap = new HashMap<String, String>();
520	paramMap.put("qt", "/suggest");
521	paramMap.put("spellcheck.build", "true");
522	SolrParams params = new MapSolrParams(paramMap);
523	solrServer.query(params);
524	}
525
526	/**
527	* Updates documents in Solr with their hierarchy weight and lists of related resources (hasPart & isPartOf)
528	* @throws SolrServerException
529	* @throws MalformedURLException
530	*/
531	private void updateDocumentHierarchy() throws SolrServerException, MalformedURLException, IOException {
532	LOG.info(ResourceStructureGraph.printStatistics(0));
533	Boolean updatedDocs = false;
534	List<SolrInputDocument> updateDocs = new ArrayList<SolrInputDocument>();
535	Iterator<CmdiVertex> vertexIter = ResourceStructureGraph.getFoundVertices().iterator();
536	while(vertexIter.hasNext()) {
537	CmdiVertex vertex = vertexIter.next();
538	List<String> incomingVertexNames = ResourceStructureGraph.getIncomingVertexNames(vertex);
539	List<String> outgoingVertexNames = ResourceStructureGraph.getOutgoingVertexNames(vertex);
540
541	SolrQuery query;
542	// update vertex if changes are necessary (necessary if non-default weight or edges to other resources)
543	if(vertex.getHierarchyWeight() != 0 \|\| !incomingVertexNames.isEmpty() \|\| !outgoingVertexNames.isEmpty()) {
544	updatedDocs = true;
545
546	// get document
547	query = new SolrQuery();
548	query.set("q", FacetConstants.FIELD_ID+":"+vertex.getId());
549	SolrDocumentList response = solrServer.query(query).getResults();
550
551	// empty result set? may be the case if CMDI file was rejected due to missing ResourceProxys in {@link #processCmdi(File, DataRoot, CMDIDataProcessor) processCmdi}
552	if(response.size() == 0) {
553	LOG.debug("Doc "+vertex.getId()+" not found while updating document hierarchy information");
554	continue;
555	}
556	SolrInputDocument doc = ClientUtils.toSolrInputDocument(response.get(0));
557
558	if(vertex.getHierarchyWeight() != 0) {
559	doc.setField(FacetConstants.FIELD_HIERARCHY_WEIGHT, Math.abs(vertex.getHierarchyWeight()));
560	}
561
562	if(!incomingVertexNames.isEmpty()) {
563	doc.setField(FacetConstants.FIELD_HAS_PART, incomingVertexNames);
564	doc.setField(FacetConstants.FIELD_HAS_PART_COUNT, incomingVertexNames.size());
565	}
566
567	if(!outgoingVertexNames.isEmpty()) {
568	doc.setField(FacetConstants.FIELD_IS_PART_OF, outgoingVertexNames);
569	}
570	updateDocs.add(doc);
571	}
572
573	if (updateDocs.size() == config.getMaxDocsInList()) {
574	solrServer.add(updateDocs);
575	if (serverError != null) {
576	throw new SolrServerException(serverError);
577	}
578	updateDocs = new ArrayList<SolrInputDocument>();
579	}
580	}
581	if(!updateDocs.isEmpty()) {
582	solrServer.add(updateDocs);
583	if (serverError != null) {
584	throw new SolrServerException(serverError);
585	}
586	}
587
588	if(updatedDocs) {
589	solrServer.commit();
590	}
591
592	ResourceStructureGraph.clearResourceGraph();
593	}
594
595	public static VloConfig config;
596
597	public static LanguageCodeUtils languageCodeUtils;
598
599	//data roots passed from command line
600	private String clDatarootsList = null;
601
602	/**
603	* @param args
604	* @throws MalformedURLException
605	* @throws IOException
606	*/
607	public static void main(String[] args) throws MalformedURLException, IOException {
608
609	// path to the configuration file
610	String configFile = null;
611
612	// use the Apache cli framework for getting command line parameters
613	Options options = new Options();
614
615	// Data root list passed from command line with -l option
616	String cldrList = null;
617
618	/**
619	* Add a "c" option, the option indicating the specification of an XML
620	* configuration file
621	*
622	* "l" option - to specify which data roots (from config file) to import
623	* imports all by default
624	*/
625	options.addOption("c", true, "-c <file> : use parameters specified in <file>");
626	options.addOption("l", true, "-l <dataroot> [ ' ' <dataroot> ]* : space separated list of dataroots to be processed.\n"
627	+ "If dataroot is not specified in config file it will be ignored.");
628	options.getOption("l").setOptionalArg(true);
629
630	CommandLineParser parser = new PosixParser();
631
632	try {
633	// parse the command line arguments
634	CommandLine cmd = parser.parse(options, args);
635	if (cmd.hasOption("c")) {
636
637	// the "c" option was specified, now get its value
638	configFile = cmd.getOptionValue("c");
639	}
640
641	if(cmd.hasOption("l")){
642	cldrList = cmd.getOptionValue("l");
643	}
644
645	} catch (org.apache.commons.cli.ParseException ex) {
646
647	/**
648	* Caught an exception caused by command line parsing. Try to get
649	* the name of the configuration file by querying the system
650	* property.
651	*/
652	String message = "Command line parsing failed. " + ex.getMessage();
653	LOG.error(message);
654	System.err.println(message);
655	}
656
657	if (configFile == null) {
658
659	String message;
660
661	message = "Could not get config file name via the command line, trying the system properties.";
662	LOG.info(message);
663
664	String key;
665
666	key = "configFile";
667	configFile = System.getProperty(key);
668	}
669
670	if (configFile == null) {
671
672	String message;
673
674	message = "Could not get filename as system property either - stopping.";
675	LOG.error(message);
676	} else {
677	// read the configuration from the externally supplied file
678	final URL configUrl;
679	if (configFile.startsWith("file:")) {
680	configUrl = new URL(configFile);
681	} else {
682	configUrl = new File(configFile).toURI().toURL();
683	}
684	System.out.println("Reading configuration from " + configUrl.toString());
685	LOG.info("Reading configuration from " + configUrl.toString());
686	final XmlVloConfigFactory configFactory = new XmlVloConfigFactory(configUrl);
687	MetadataImporter.config = configFactory.newConfig();
688	MetadataImporter.languageCodeUtils = new LanguageCodeUtils(MetadataImporter.config);
689
690	// optionally, modify the configuration here
691	// create and start the importer
692	MetadataImporter importer = new MetadataImporter(cldrList);
693	importer.startImport();
694
695	// finished importing
696	if (MetadataImporter.config.printMapping()) {
697	File file = new File("xsdMapping.txt");
698	FacetMappingFactory.printMapping(file);
699	LOG.info("Printed facetMapping in " + file);
700	}
701	}
702	}
703	}

Note: See TracBrowser for help on using the repository browser.

Download in other formats: