Context Navigation

source: vlo/branches/vlo-3.3/vlo-importer/src/main/java/eu/clarin/cmdi/vlo/importer/MetadataImporter.java @ 6700

Last change on this file since 6700 was 6700, checked in by Twan Goosen, 9 years ago
Merged fix for #790 to 3.3 branch
File size: 29.6 KB

Line
1	package eu.clarin.cmdi.vlo.importer;
2
3	import java.io.File;
4	import java.io.IOException;
5	import java.net.MalformedURLException;
6	import java.net.URL;
7	import java.text.SimpleDateFormat;
8	import java.util.ArrayList;
9	import java.util.Arrays;
10	import java.util.Date;
11	import java.util.HashMap;
12	import java.util.HashSet;
13	import java.util.Iterator;
14	import java.util.LinkedList;
15	import java.util.List;
16	import java.util.Map;
17	import java.util.Set;
18
19	import org.apache.commons.cli.CommandLine;
20	import org.apache.commons.cli.CommandLineParser;
21	import org.apache.commons.cli.Options;
22	import org.apache.commons.cli.PosixParser;
23	import org.apache.commons.io.FileUtils;
24	import org.apache.solr.client.solrj.SolrQuery;
25	import org.apache.solr.client.solrj.SolrServerException;
26	import org.apache.solr.client.solrj.impl.ConcurrentUpdateSolrServer;
27	import org.apache.solr.client.solrj.util.ClientUtils;
28	import org.apache.solr.common.SolrDocumentList;
29	import org.apache.solr.common.SolrInputDocument;
30	import org.apache.solr.common.params.MapSolrParams;
31	import org.apache.solr.common.params.SolrParams;
32	import org.slf4j.Logger;
33	import org.slf4j.LoggerFactory;
34
35	import eu.clarin.cmdi.vlo.CommonUtils;
36	import eu.clarin.cmdi.vlo.FacetConstants;
37	import eu.clarin.cmdi.vlo.LanguageCodeUtils;
38	import eu.clarin.cmdi.vlo.config.DataRoot;
39	import eu.clarin.cmdi.vlo.config.VloConfig;
40	import eu.clarin.cmdi.vlo.config.XmlVloConfigFactory;
41
42	/**
43	* The main metadataImporter class. Also contains the main function.
44	*
45	* The metadataimporter reads all the config files and then, for each
46	* metadatafile in each defined directory structure parses and imports them as
47	* defined in the configuration. The startImport function starts the importing
48	* and so on.
49	*/
50	public class MetadataImporter {
51
/**
 * Extensions (without the dot) of the files the importer tries to parse:
 * everything ending in "xml" or "cmdi".
 */
private static final String[] VALID_CMDI_EXTENSIONS = new String[]{"xml", "cmdi"};

/**
 * Logger used by the importer (and available to subclasses).
 */
protected final static Logger LOG = LoggerFactory.getLogger(MetadataImporter.class);
/**
 * Last error reported by the Solr client through its handleError callback
 * (see initSolrServer), or null if none was reported.
 *
 * Declared volatile because the callback is invoked by the Solr client,
 * which empties its queue on dedicated threads, while the value is read by
 * the importing thread after adding documents.
 */
private static volatile Throwable serverError;
/**
 * The Solr server that documents are sent to; created by initSolrServer.
 */
private ConcurrentUpdateSolrServer solrServer;
/**
 * Defines the post-processor associations. At import, for each facet value,
 * this map is checked and all postprocessors associated with the facet
 * _type_ are applied to the value before storing the new value in the solr
 * document.
 */
final static Map<String, PostProcessor> POST_PROCESSORS = new HashMap<String, PostProcessor>();

static {
    POST_PROCESSORS.put(FacetConstants.FIELD_ID, new IdPostProcessor());
    POST_PROCESSORS.put(FacetConstants.FIELD_CONTINENT, new ContinentNamePostProcessor());
    POST_PROCESSORS.put(FacetConstants.FIELD_COUNTRY, new CountryNamePostProcessor());
    POST_PROCESSORS.put(FacetConstants.FIELD_LANGUAGE_CODE, new LanguageCodePostProcessor());
    POST_PROCESSORS.put(FacetConstants.FIELD_LANGUAGE_NAME, new LanguageNamePostProcessor());
    POST_PROCESSORS.put(FacetConstants.FIELD_AVAILABILITY, new AvailabilityPostProcessor());
    POST_PROCESSORS.put(FacetConstants.FIELD_ORGANISATION, new OrganisationPostProcessor());
    POST_PROCESSORS.put(FacetConstants.FIELD_TEMPORAL_COVERAGE, new TemporalCoveragePostProcessor());
    POST_PROCESSORS.put(FacetConstants.FIELD_NATIONAL_PROJECT, new NationalProjectPostProcessor());
    POST_PROCESSORS.put(FacetConstants.FIELD_CLARIN_PROFILE, new CMDIComponentProfileNamePostProcessor());
    POST_PROCESSORS.put(FacetConstants.FIELD_RESOURCE_CLASS, new ResourceClassPostProcessor());
}
91
/**
 * Creates an importer without a command line data root filter, so every
 * existing configured data root is imported.
 */
public MetadataImporter() {
    this(null);
}

/**
 * Creates an importer that only imports the data roots named on the command
 * line.
 *
 * @param clDatarootsList whitespace separated list of data root paths, or
 * null to apply no filtering
 */
public MetadataImporter(String clDatarootsList) {
    this.clDatarootsList = clDatarootsList;
}
100
101	/**
102	* Contains MDSelflinks (usually). Just to know what we have already done.
103	*/
104	protected final Set<String> processedIds = new HashSet<String>();
105	/**
106	* Some caching for solr documents (we are more efficient if we ram a whole
107	* bunch to the solr server at once.
108	*/
109	protected List<SolrInputDocument> docs = new ArrayList<SolrInputDocument>();
110
111	// SOME STATS
112	protected int nrOFDocumentsSend;
113	protected int nrOfFilesAnalyzed = 0;
114	protected int nrOfFilesWithoutId = 0;
115	protected int nrOfIgnoredFiles = 0;
116	protected int nrOfFilesWithError = 0;
117
118	/**
119	* Retrieve all files with VALID_CMDI_EXTENSIONS from all DataRoot entries
120	* and starts processing for every single file
121	*
122	* @throws MalformedURLException
123	*/
124	void startImport() throws MalformedURLException {
125
126	initSolrServer();
127	List<DataRoot> dataRoots = checkDataRoots();
128
129	dataRoots = filterDataRootsWithCLArgs(dataRoots);
130
131	long start = System.currentTimeMillis();
132	try {
133	// Delete the whole Solr db
134	if (config.getDeleteAllFirst()) {
135	LOG.info("Deleting original data...");
136	solrServer.deleteByQuery(":");
137	solrServer.commit();
138	LOG.info("Deleting original data done.");
139	}
140
141	// Import the specified data roots
142	for (DataRoot dataRoot : dataRoots) {
143	LOG.info("Start of processing: " + dataRoot.getOriginName());
144	if (dataRoot.deleteFirst()) {
145	LOG.info("Deleting data for data provider: " + dataRoot.getOriginName());
146	solrServer.deleteByQuery(FacetConstants.FIELD_DATA_PROVIDER + ":" + ClientUtils.escapeQueryChars(dataRoot.getOriginName()));
147	LOG.info("Deleting data of provider done.");
148	}
149	CMDIDataProcessor processor = new CMDIParserVTDXML(POST_PROCESSORS, false);
150	List<List<File>> centreFilesList = getFilesFromDataRoot(dataRoot.getRootFile());
151	// import files from every endpoint
152	for(List<File> centreFiles : centreFilesList) {
153	LOG.info("Processing directory: {}", centreFiles.get(0).getParent());
154	for (File file : centreFiles) {
155	if (config.getMaxFileSize() > 0
156	&& file.length() > config.getMaxFileSize()) {
157	LOG.info("Skipping " + file.getAbsolutePath() + " because it is too large.");
158	} else {
159	LOG.debug("PROCESSING FILE: {}", file.getAbsolutePath());
160	processCmdi(file, dataRoot, processor);
161	}
162	}
163	if (!docs.isEmpty()) {
164	sendDocs();
165	}
166	solrServer.commit();
167	if(config.isProcessHierarchies()){
168	updateDocumentHierarchy();
169	}
170	}
171	LOG.info("End of processing: " + dataRoot.getOriginName());
172	}
173
174	// delete outdated entries (based on maxDaysInSolr parameter)
175	if (config.getMaxDaysInSolr() > 0 && config.getDeleteAllFirst() == false) {
176	LOG.info("Deleting old files that were not seen for more than " + config.getMaxDaysInSolr() + " days...");
177	solrServer.deleteByQuery(FacetConstants.FIELD_LAST_SEEN + ":[* TO NOW-" + config.getMaxDaysInSolr() + "DAYS]");
178	LOG.info("Deleting old files done.");
179	}
180	} catch (SolrServerException e) {
181	LOG.error("error updating files:\n", e);
182	LOG.error("Also see vlo_solr server logs for more information");
183	} catch (IOException e) {
184	LOG.error("error updating files:\n", e);
185	} finally {
186	try {
187	if (solrServer != null) {
188	solrServer.commit();
189	buildSuggesterIndex();
190	}
191	} catch (SolrServerException e) {
192	LOG.error("cannot commit:\n", e);
193	} catch (IOException e) {
194	LOG.error("cannot commit:\n", e);
195	}
196	}
197	long took = (System.currentTimeMillis() - start) / 1000;
198	LOG.info("Found " + nrOfFilesWithoutId + " file(s) without an id. (id is generated based on fileName but that may not be unique)");
199	LOG.info("Found " + nrOfFilesWithError + " file(s) with errors.");
200	LOG.info("Found " + nrOfIgnoredFiles
201	+ " file(s) that where ignored (files without resources or any link to a search service or landing page are ignored).");
202	LOG.info("Update of " + nrOFDocumentsSend + " took " + took + " secs. Total nr of files analyzed " + nrOfFilesAnalyzed);
203	solrServer.shutdown();
204	}
205
206	/**
207	* Check a List of DataRoots for existence of RootFile (typically parent
208	* directory of metadata files)
209	*
210	* @return
211	*/
212	protected List<DataRoot> checkDataRoots() {
213	List<DataRoot> dataRoots = config.getDataRoots();
214	List<DataRoot> existingDataRoots = new LinkedList<DataRoot>();
215	for (DataRoot dataRoot : dataRoots) {
216	if (!dataRoot.getRootFile().exists()) {
217	LOG.warn("Root file " + dataRoot.getRootFile() + " does not exist. It could be configuration error! Proceeding with next ...");
218	} else{
219	existingDataRoots.add(dataRoot);
220	}
221
222	}
223	return existingDataRoots;
224	}
225
226	/**
227	* if user specified which data roots should be imported,
228	* list of existing data roots will be filtered with the list from user
229	*
230	* @param dataRoots complete list of DataRoots
231	* @return list of DataRoots without DataRoots excluded by the user
232	*/
233	protected List<DataRoot> filterDataRootsWithCLArgs(List<DataRoot> dataRoots){
234	if(clDatarootsList == null)
235	return dataRoots;
236
237
238	LOG.info("Filtering configured data root files with command line arguments: \"" + clDatarootsList + "\"" ) ;
239
240	LinkedList<File> fsDataRoots = new LinkedList<File>();
241
242	List<String> paths = Arrays.asList((clDatarootsList.split("\\s+")));
243
244	//Convert String paths to File objects for comparison
245	for(String path: paths)
246	fsDataRoots.add(new File(path));
247
248	List<DataRoot> filteredDataRoots = new LinkedList<DataRoot>();
249	try{
250	//filter data
251	dr: for(DataRoot dataRoot: dataRoots){
252	for(File fsDataRoot: fsDataRoots){
253	if(fsDataRoot.getCanonicalPath().equals(dataRoot.getRootFile().getCanonicalPath())){
254	filteredDataRoots.add(dataRoot);
255	fsDataRoots.remove(fsDataRoot);
256	continue dr;
257	}
258	}
259	LOG.info("Root file " + dataRoot.getRootFile() + " will be omitted from processing");
260	}
261	}catch (IOException e){
262	filteredDataRoots = dataRoots;
263	}
264
265
266	return filteredDataRoots;
267	}
268
269	/**
270	* Get all files with VALID_CMDI_EXTENSIONS if rootFile is a
271	* directory that contains center directories or rootFile if it is a file
272	*
273	* @param rootFile
274	* @return List with centre Lists of all contained CMDI files if rootFile is a
275	* directory or rootFile if it is a File
276	*/
277	protected List<List<File>> getFilesFromDataRoot(File rootFile) {
278	List<List<File>> result = new ArrayList<List<File>>();
279	if(rootFile.isFile()) {
280	List<File> singleFileList = new ArrayList<File>();
281	singleFileList.add(rootFile);
282	result.add(singleFileList);
283	} else {
284	File[] centerDirs = rootFile.listFiles();
285	for(File centerDir : centerDirs) {
286	List<File> centerFileList = new ArrayList<File>();
287	if(centerDir.isDirectory()) {
288	centerFileList.addAll(FileUtils.listFiles(centerDir, VALID_CMDI_EXTENSIONS, true));
289	}
290
291	if(!centerFileList.isEmpty())
292	result.add(centerFileList);
293	}
294	}
295	return result;
296	}
297
298	/**
299	* Create an interface to the SOLR server.
300	*
301	* After the interface has been created the importer can send documents to
302	* the server. Sending documents involves a queue. The importer adds
303	* documents to a queue, and dedicated threads will empty it, and
304	* effectively store store the documents.
305	*
306	* @throws MalformedURLException
307	*/
308	protected void initSolrServer() throws MalformedURLException {
309	String solrUrl = config.getSolrUrl();
310	LOG.info("Initializing Solr Server on " + solrUrl);
311
312	/* Specify the number of documents in the queue that will trigger the
313	* threads, two of them, emptying it.
314	*/
315	solrServer = new ConcurrentUpdateSolrServer(solrUrl,
316	config.getMinDocsInSolrQueue(), 2) {
317	/*
318	* Let the super class method handle exceptions. Make the
319	* exception available to the importer in the form of the
320	* serverError variable.
321	*/
322	@Override
323	public void handleError(Throwable exception) {
324	super.handleError(exception);
325	serverError = exception;
326	}
327	};
328	}
329
330	/**
331	* Process single CMDI file with CMDIDataProcessor
332	*
333	* @param file CMDI input file
334	* @param dataOrigin
335	* @param processor
336	* @throws SolrServerException
337	* @throws IOException
338	*/
339	protected void processCmdi(File file, DataRoot dataOrigin, CMDIDataProcessor processor) throws SolrServerException, IOException {
340	nrOfFilesAnalyzed++;
341	CMDIData cmdiData = null;
342	try {
343	cmdiData = processor.process(file);
344	if (!idOk(cmdiData.getId())) {
345	cmdiData.setId(dataOrigin.getOriginName() + "/" + file.getName()); //No id found in the metadata file so making one up based on the file name. Not quaranteed to be unique, but we have to set something.
346	nrOfFilesWithoutId++;
347	}
348	} catch (Exception e) {
349	LOG.error("error in file: {}", file, e);
350	nrOfFilesWithError++;
351	}
352	if (cmdiData != null) {
353	if (processedIds.add(cmdiData.getId())) {
354	SolrInputDocument solrDocument = cmdiData.getSolrDocument();
355	if (solrDocument != null) {
356	if (!cmdiData.getDataResources().isEmpty() \|\| !cmdiData.getLandingPageResources().isEmpty()
357	\|\| !cmdiData.getSearchResources().isEmpty() \|\| !cmdiData.getSearchPageResources().isEmpty()
358	\|\| cmdiData.getMetadataResources().isEmpty()) {
359	// We only add metadata files that have
360	// 1) data resources or
361	// 2) a landing page or
362	// 3) a search service (like SRU/CQL) or
363	// 4) a search page or
364	// 5) that have none of the above but also lack any metadata links (e.g. olac files describing a corpus with a link to the original archive).
365	// Other files will have only metadata resources and are considered 'collection' metadata files they
366	// are usually not very interesting (think imdi corpus files) and will not be included.
367	updateDocument(solrDocument, cmdiData, file, dataOrigin);
368	if(ResourceStructureGraph.getVertex(cmdiData.getId()) != null)
369	ResourceStructureGraph.getVertex(cmdiData.getId()).setWasImported(true);
370	} else {
371	nrOfIgnoredFiles++;
372	}
373	}
374	} else {
375	LOG.warn("Skipping {}, already processed id: {}", file, cmdiData.getId());
376	}
377	}
378	}
379
380	/**
381	* Check id for validness
382	*
383	* @param id
384	* @return true if id is acceptable, false otherwise
385	*/
386	protected boolean idOk(String id) {
387	return id != null && !id.isEmpty();
388	}
389
390	/**
391	* Adds some additional information from DataRoot to solrDocument, add
392	* solrDocument to document list, submits list to SolrServer every 1000
393	* files
394	*
395	* @param solrDocument
396	* @param cmdiData
397	* @param file
398	* @param dataOrigin
399	* @throws SolrServerException
400	* @throws IOException
401	*/
402	protected void updateDocument(SolrInputDocument solrDocument, CMDIData cmdiData, File file, DataRoot dataOrigin) throws SolrServerException,
403	IOException {
404	if (!solrDocument.containsKey(FacetConstants.FIELD_COLLECTION)) {
405	solrDocument.addField(FacetConstants.FIELD_COLLECTION, dataOrigin.getOriginName());
406	}
407	solrDocument.addField(FacetConstants.FIELD_DATA_PROVIDER, dataOrigin.getOriginName());
408	solrDocument.addField(FacetConstants.FIELD_ID, cmdiData.getId());
409	solrDocument.addField(FacetConstants.FIELD_FILENAME, file.getAbsolutePath());
410
411	String metadataSourceUrl = dataOrigin.getPrefix();
412	metadataSourceUrl += file.getAbsolutePath().substring(dataOrigin.getToStrip().length());
413
414	solrDocument.addField(FacetConstants.FIELD_COMPLETE_METADATA, metadataSourceUrl);
415
416	// add SearchServices (should be CQL endpoint)
417	for (Resource resource : cmdiData.getSearchResources()) {
418	solrDocument.addField(FacetConstants.FIELD_SEARCH_SERVICE, resource.getResourceName());
419	}
420
421	// add landing page resource
422	for (Resource resource : cmdiData.getLandingPageResources()) {
423	solrDocument.addField(FacetConstants.FIELD_LANDINGPAGE, resource.getResourceName());
424	}
425
426	// add search page resource
427	for (Resource resource : cmdiData.getSearchPageResources()) {
428	solrDocument.addField(FacetConstants.FIELD_SEARCHPAGE, resource.getResourceName());
429	}
430
431	// add timestamp
432	Date dt = new Date();
433	SimpleDateFormat df = new SimpleDateFormat("yyyy-MM-dd'T'HH:mm:ss'Z'");
434	solrDocument.addField(FacetConstants.FIELD_LAST_SEEN, df.format(dt));
435
436	// add resource proxys
437	addResourceData(solrDocument, cmdiData);
438
439	LOG.debug("Adding document for submission to SOLR: {}", file);
440	docs.add(solrDocument);
441	if (docs.size() == config.getMaxDocsInList()) {
442	sendDocs();
443	}
444	}
445
446	/**
447	* Adds two fields FIELD_FORMAT and FIELD_RESOURCE. The Type can be
448	* specified in the "ResourceType" element of an imdi file or possibly
449	* overwritten by some more specific xpath (as in the LRT cmdi files). So if
450	* a type is overwritten and already in the solrDocument we take that type.
451	*
452	* @param solrDocument
453	* @param cmdiData
454	*/
455	protected void addResourceData(SolrInputDocument solrDocument, CMDIData cmdiData) {
456	List<Object> fieldValues = solrDocument.containsKey(FacetConstants.FIELD_FORMAT) ? new ArrayList<Object>(solrDocument
457	.getFieldValues(FacetConstants.FIELD_FORMAT)) : null;
458	solrDocument.removeField(FacetConstants.FIELD_FORMAT); //Remove old values they might be overwritten.
459	List<Resource> resources = cmdiData.getDataResources();
460	for (int i = 0; i < resources.size(); i++) {
461	Resource resource = resources.get(i);
462	String mimeType = resource.getMimeType();
463	if (mimeType == null) {
464	if (fieldValues != null && i < fieldValues.size()) {
465	mimeType = CommonUtils.normalizeMimeType(fieldValues.get(i).toString());
466	} else {
467	mimeType = CommonUtils.normalizeMimeType("");
468	}
469	}
470
471	FormatPostProcessor processor = new FormatPostProcessor();
472	mimeType = processor.process(mimeType).get(0);
473
474	// TODO check should probably be moved into Solr (by using some minimum length filter)
475	if (!mimeType.equals("")) {
476	solrDocument.addField(FacetConstants.FIELD_FORMAT, mimeType);
477	}
478	solrDocument.addField(FacetConstants.FIELD_RESOURCE, mimeType + FacetConstants.FIELD_RESOURCE_SPLIT_CHAR
479	+ resource.getResourceName());
480	}
481	solrDocument.addField(FacetConstants.FIELD_RESOURCE_COUNT, resources.size());
482	}
483
484	/**
485	* Send current list of SolrImputDocuments to SolrServer and clears list
486	* afterwards
487	*
488	* @throws SolrServerException
489	* @throws IOException
490	*/
491	protected void sendDocs() throws SolrServerException, IOException {
492	LOG.info("Sending " + docs.size() + " docs to solr server. Total number of docs updated till now: " + nrOFDocumentsSend);
493	nrOFDocumentsSend += docs.size();
494	solrServer.add(docs);
495	if (serverError != null) {
496	throw new SolrServerException(serverError);
497	}
498	docs = new ArrayList<SolrInputDocument>();
499	}
500
501	/**
502	* Builds suggester index for autocompletion
503	*
504	* @throws SolrServerException
505	* @throws MalformedURLException
506	*/
507	private void buildSuggesterIndex() throws SolrServerException, MalformedURLException {
508	LOG.info("Building index for autocompletion.");
509	HashMap<String, String> paramMap = new HashMap<String, String>();
510	paramMap.put("qt", "/suggest");
511	paramMap.put("spellcheck.build", "true");
512	SolrParams params = new MapSolrParams(paramMap);
513	solrServer.query(params);
514	}
515
516	/**
517	* Updates documents in Solr with their hierarchy weight and lists of related resources (hasPart & isPartOf)
518	* @throws SolrServerException
519	* @throws MalformedURLException
520	*/
521	private void updateDocumentHierarchy() throws SolrServerException, MalformedURLException, IOException {
522	LOG.info(ResourceStructureGraph.printStatistics(0));
523	Boolean updatedDocs = false;
524	List<SolrInputDocument> updateDocs = new ArrayList<SolrInputDocument>();
525	Iterator<CmdiVertex> vertexIter = ResourceStructureGraph.getFoundVertices().iterator();
526	while(vertexIter.hasNext()) {
527	CmdiVertex vertex = vertexIter.next();
528	List<String> incomingVertexNames = ResourceStructureGraph.getIncomingVertexNames(vertex);
529	List<String> outgoingVertexNames = ResourceStructureGraph.getOutgoingVertexNames(vertex);
530
531	SolrQuery query;
532	// update vertex if changes are necessary (necessary if non-default weight or edges to other resources)
533	if(vertex.getHierarchyWeight() != 0 \|\| !incomingVertexNames.isEmpty() \|\| !outgoingVertexNames.isEmpty()) {
534	updatedDocs = true;
535
536	// get document
537	query = new SolrQuery();
538	query.setRequestHandler(FacetConstants.SOLR_REQUEST_HANDLER_FAST);
539	query.set("q", FacetConstants.FIELD_ID+":"+vertex.getId());
540	SolrDocumentList response = solrServer.query(query).getResults();
541
542	// empty result set? may be the case if CMDI file was rejected due to missing ResourceProxys in {@link #processCmdi(File, DataRoot, CMDIDataProcessor) processCmdi}
543	if(response.size() == 0) {
544	LOG.debug("Doc "+vertex.getId()+" not found while updating document hierarchy information");
545	continue;
546	}
547	SolrInputDocument doc = ClientUtils.toSolrInputDocument(response.get(0));
548
549	if(vertex.getHierarchyWeight() != 0) {
550	doc.setField(FacetConstants.FIELD_HIERARCHY_WEIGHT, Math.abs(vertex.getHierarchyWeight()));
551	}
552
553	// remove vertices that were not imported
554	Iterator<String> incomingVertexIter = incomingVertexNames.iterator();
555	while(incomingVertexIter.hasNext()) {
556	String vertexId = incomingVertexIter.next();
557	if(ResourceStructureGraph.getVertex(vertexId) == null \|\| !ResourceStructureGraph.getVertex(vertexId).getWasImported())
558	incomingVertexIter.remove();
559	}
560	Iterator<String> outgoingVertexIter = outgoingVertexNames.iterator();
561	while(outgoingVertexIter.hasNext()) {
562	String vertexId = outgoingVertexIter.next();
563	if(ResourceStructureGraph.getVertex(vertexId) == null \|\| !ResourceStructureGraph.getVertex(vertexId).getWasImported())
564	outgoingVertexIter.remove();
565	}
566
567	if(!incomingVertexNames.isEmpty()) {
568	doc.setField(FacetConstants.FIELD_HAS_PART, incomingVertexNames);
569	doc.setField(FacetConstants.FIELD_HAS_PART_COUNT, incomingVertexNames.size());
570	}
571
572	if(!outgoingVertexNames.isEmpty()) {
573	doc.setField(FacetConstants.FIELD_IS_PART_OF, outgoingVertexNames);
574	}
575	updateDocs.add(doc);
576	}
577
578	if (updateDocs.size() == config.getMaxDocsInList()) {
579	solrServer.add(updateDocs);
580	if (serverError != null) {
581	throw new SolrServerException(serverError);
582	}
583	updateDocs = new ArrayList<SolrInputDocument>();
584	}
585	}
586	if(!updateDocs.isEmpty()) {
587	solrServer.add(updateDocs);
588	if (serverError != null) {
589	throw new SolrServerException(serverError);
590	}
591	}
592
593	if(updatedDocs) {
594	solrServer.commit();
595	}
596
597	ResourceStructureGraph.clearResourceGraph();
598	}
599
600	public static VloConfig config;
601
602	public static LanguageCodeUtils languageCodeUtils;
603
604	//data roots passed from command line
605	private String clDatarootsList = null;
606
607	/**
608	* @param args
609	* @throws MalformedURLException
610	* @throws IOException
611	*/
612	public static void main(String[] args) throws MalformedURLException, IOException {
613
614	// path to the configuration file
615	String configFile = null;
616
617	// use the Apache cli framework for getting command line parameters
618	Options options = new Options();
619
620	// Data root list passed from command line with -l option
621	String cldrList = null;
622
623	/**
624	* Add a "c" option, the option indicating the specification of an XML
625	* configuration file
626	*
627	* "l" option - to specify which data roots (from config file) to import
628	* imports all by default
629	*/
630	options.addOption("c", true, "-c <file> : use parameters specified in <file>");
631	options.addOption("l", true, "-l <dataroot> [ ' ' <dataroot> ]* : space separated list of dataroots to be processed.\n"
632	+ "If dataroot is not specified in config file it will be ignored.");
633	options.getOption("l").setOptionalArg(true);
634
635	CommandLineParser parser = new PosixParser();
636
637	try {
638	// parse the command line arguments
639	CommandLine cmd = parser.parse(options, args);
640	if (cmd.hasOption("c")) {
641
642	// the "c" option was specified, now get its value
643	configFile = cmd.getOptionValue("c");
644	}
645
646	if(cmd.hasOption("l")){
647	cldrList = cmd.getOptionValue("l");
648	}
649
650	} catch (org.apache.commons.cli.ParseException ex) {
651
652	/**
653	* Caught an exception caused by command line parsing. Try to get
654	* the name of the configuration file by querying the system
655	* property.
656	*/
657	String message = "Command line parsing failed. " + ex.getMessage();
658	LOG.error(message);
659	System.err.println(message);
660	}
661
662	if (configFile == null) {
663
664	String message;
665
666	message = "Could not get config file name via the command line, trying the system properties.";
667	LOG.info(message);
668
669	String key;
670
671	key = "configFile";
672	configFile = System.getProperty(key);
673	}
674
675	if (configFile == null) {
676
677	String message;
678
679	message = "Could not get filename as system property either - stopping.";
680	LOG.error(message);
681	} else {
682	// read the configuration from the externally supplied file
683	final URL configUrl;
684	if (configFile.startsWith("file:")) {
685	configUrl = new URL(configFile);
686	} else {
687	configUrl = new File(configFile).toURI().toURL();
688	}
689	System.out.println("Reading configuration from " + configUrl.toString());
690	LOG.info("Reading configuration from " + configUrl.toString());
691	final XmlVloConfigFactory configFactory = new XmlVloConfigFactory(configUrl);
692	MetadataImporter.config = configFactory.newConfig();
693	MetadataImporter.languageCodeUtils = new LanguageCodeUtils(MetadataImporter.config);
694
695	// optionally, modify the configuration here
696	// create and start the importer
697	MetadataImporter importer = new MetadataImporter(cldrList);
698	importer.startImport();
699
700	// finished importing
701	if (MetadataImporter.config.printMapping()) {
702	File file = new File("xsdMapping.txt");
703	FacetMappingFactory.printMapping(file);
704	LOG.info("Printed facetMapping in " + file);
705	}
706	}
707	}
708	}

Note: See TracBrowser for help on using the repository browser.

Download in other formats: