Changeset 996
- Timestamp:
- 12/15/10 12:48:10 (13 years ago)
- Location:
- vlo/trunk/vlo_webapp
- Files:
-
- 2 added
- 6 edited
- 1 moved
Legend:
- Unmodified
- Added
- Removed
-
vlo/trunk/vlo_webapp/pom.xml
r957 r996 76 76 77 77 <dependency> 78 <groupId>commons-digester</groupId>79 <artifactId>commons-digester</artifactId>80 <version>2.1</version>78 <groupId>com.ximpleware</groupId> 79 <artifactId>vtd-xml</artifactId> 80 <version>2.9</version> 81 81 </dependency> 82 82 </dependencies> -
vlo/trunk/vlo_webapp/src/main/java/eu/clarin/cmdi/vlo/importer/CMDIDigester.java
r995 r996 14 14 import javax.xml.xpath.XPathFactory; 15 15 16 import org.slf4j.Logger;17 import org.slf4j.LoggerFactory;18 16 import org.w3c.dom.Document; 19 17 import org.w3c.dom.Node; … … 22 20 import org.xml.sax.SAXException; 23 21 24 public class CMDIDigester { 25 private final static Logger LOG = LoggerFactory.getLogger(CMDIDigester.class); 22 23 /** 24 * @deprecated 25 * Dom parsing implementation, use the @see CMDIParserVTDXML it is much faster. 26 * Keeping this for now just in case we run into issues with the vlt parsing. 27 * patdui 15 December 2010 28 */ 29 public class CMDIDigester implements CMDIDataProcessor { 26 30 private final FacetMapping facetMapping; 27 31 private DocumentBuilder builder; -
vlo/trunk/vlo_webapp/src/main/java/eu/clarin/cmdi/vlo/importer/DataRoot.java
r894 r996 8 8 private String originName; 9 9 private File rootFile; 10 private boolean deleteFirst = false; 10 11 11 12 public void setFacetMapping(FacetMapping facetMapping) { … … 33 34 } 34 35 36 public void setDeleteFirst(boolean deleteFirst) { 37 this.deleteFirst = deleteFirst; 38 } 39 40 public boolean isDeleteFirst() { 41 return deleteFirst; 42 } 43 35 44 } -
vlo/trunk/vlo_webapp/src/main/java/eu/clarin/cmdi/vlo/importer/ImporterConfig.java
r950 r996 5 5 public class ImporterConfig { 6 6 7 private boolean deleteFirst = false;7 private boolean deleteAllFirst = false; 8 8 9 9 private List<DataRoot> dataRoots; … … 17 17 } 18 18 19 public void setDeleteFirst(boolean deleteFirst) {20 this.deleteFirst = deleteFirst;19 public void setDeleteAllFirst(boolean deleteAllFirst) { 20 this.deleteAllFirst = deleteAllFirst; 21 21 } 22 22 23 public boolean isDeleteFirst() {24 return deleteFirst;23 public boolean isDeleteAllFirst() { 24 return deleteAllFirst; 25 25 } 26 26 -
vlo/trunk/vlo_webapp/src/main/java/eu/clarin/cmdi/vlo/importer/MetadataImporter.java
r988 r996 9 9 import java.util.Set; 10 10 11 import javax.xml.xpath.XPathExpressionException;12 13 11 import org.apache.solr.client.solrj.SolrServerException; 14 12 import org.apache.solr.client.solrj.impl.StreamingUpdateSolrServer; … … 18 16 import org.springframework.beans.factory.BeanFactory; 19 17 import org.springframework.context.support.ClassPathXmlApplicationContext; 20 import org.xml.sax.SAXException;21 18 22 19 import eu.clarin.cmdi.vlo.Configuration; … … 52 49 } 53 50 54 //TODO PD can have multiple origins55 51 private void startImport() { 56 52 List<DataRoot> dataRoots = config.getDataRoots(); … … 64 60 long start = System.currentTimeMillis(); 65 61 try { 66 if (config.isDelete First()) {62 if (config.isDeleteAllFirst()) { 67 63 LOG.info("Deleting original data..."); 68 64 solrServer.deleteByQuery("*:*");//Delete the whole solr db. … … 70 66 } 71 67 for (DataRoot dataRoot : dataRoots) { 72 LOG.info("Start of processing: "+dataRoot.getOriginName()); 73 CMDIDigester digester = new CMDIDigester(dataRoot.getFacetMapping()); 74 processCmdi(dataRoot.getRootFile(), dataRoot.getOriginName(), digester); 68 LOG.info("Start of processing: " + dataRoot.getOriginName()); 69 if (dataRoot.isDeleteFirst()) { 70 LOG.info("Deleting data for origin: " + dataRoot.getOriginName()); 71 solrServer.deleteByQuery(FacetConstants.FIELD_ORIGIN + ":" + dataRoot.getOriginName()); 72 LOG.info("Deleting data for origin done."); 73 } 74 CMDIDataProcessor processor = new CMDIParserVTDXML(dataRoot.getFacetMapping()); 75 processCmdi(dataRoot.getRootFile(), dataRoot.getOriginName(), processor); 75 76 if (!docs.isEmpty()) { 76 77 sendDocs(); 77 78 } 78 LOG.info("End of processing: " +dataRoot.getOriginName());79 LOG.info("End of processing: " + dataRoot.getOriginName()); 79 80 } 80 81 } catch (SolrServerException e) { … … 98 99 } 99 100 100 private void processCmdi(File file, String origin, CMDID igester digester) throws SolrServerException, IOException {101 private void processCmdi(File 
file, String origin, CMDIDataProcessor processor) throws SolrServerException, IOException { 101 102 nrOfFilesAnalyzed++; 102 103 CMDIData cmdiData = null; 103 104 try { 104 cmdiData = digester.process(file); 105 } catch (IOException e) { 106 LOG.error("error in file: " + file + " Exception", e); 107 } catch (SAXException e) { 108 LOG.error("error in file: " + file + " Exception", e); 109 } catch (XPathExpressionException e) { 105 cmdiData = processor.process(file); 106 } catch (Exception e) { 110 107 LOG.error("error in file: " + file + " Exception", e); 111 108 } … … 119 116 File resourceFile = new File(file.getParentFile(), cmdiResource); 120 117 if (resourceFile.exists()) { 121 processCmdi(resourceFile, origin, digester);118 processCmdi(resourceFile, origin, processor); 122 119 } else { 123 120 nrOfNonExistendResourceFiles++; … … 146 143 147 144 private void sendDocs() throws SolrServerException, IOException { 148 LOG.info("Sending " +docs.size()+" docs to solr server. Total number of docs updated till now: "+nrOFDocumentsUpdated);145 LOG.info("Sending " + docs.size() + " docs to solr server. Total number of docs updated till now: " + nrOFDocumentsUpdated); 149 146 nrOFDocumentsUpdated += docs.size(); 150 147 solrServer.add(docs); -
vlo/trunk/vlo_webapp/src/main/resources/importerConfig.xml
r995 r996 4 4 5 5 <bean id="importerConfig" class="eu.clarin.cmdi.vlo.importer.ImporterConfig"> 6 <!-- <property name="deleteFirst" value="true"/>-->6 <property name="deleteAllFirst" value="true"/> 7 7 <property name="dataRoots"> 8 8 <list> 9 9 <bean class="eu.clarin.cmdi.vlo.importer.DataRoot"> 10 <!-- <property name="deleteFirst" value="true"/>--> 10 11 <property name="originName" value="OLAC Metadata Providers" /> 11 12 <property name="rootFile" … … 16 17 <property name="originName" value="Nijmegen corpora of casual speech" /> 17 18 <property name="rootFile" 18 19 19 value="/Users/patdui/data/snapshots2/data/corpora/qfs1/media-archive/casual_speech/Corpusstructure/casual_speech.imdi.cmdi" /> 20 20 <property name="facetMapping" ref="imdiMapping"></property> -
vlo/trunk/vlo_webapp/src/test/java/eu/clarin/cmdi/vlo/importer/CMDIDataProcessorTest.java
r995 r996 7 7 import java.io.File; 8 8 import java.io.IOException; 9 import java.util.ArrayList; 10 import java.util.Collection; 11 import java.util.Collections; 9 12 import java.util.List; 10 13 … … 17 20 import org.springframework.context.support.ClassPathXmlApplicationContext; 18 21 19 public class CMDID igesterTest {22 public class CMDIDataProcessorTest { 20 23 21 24 private static File testDir; 25 26 private CMDIDataProcessor getDataParser(FacetMapping map) { 27 return new CMDIParserVTDXML(map); 28 // return new CMDIDigester(map); 29 } 22 30 23 31 @Test … … 66 74 content += "</CMD>\n"; 67 75 File cmdiFile = createCmdiFile("testCorpus", content); 68 CMDID igester digester = new CMDIDigester(getIMDIFacetMap());69 CMDIData data = digester.process(cmdiFile);76 CMDIDataProcessor processor = getDataParser(getIMDIFacetMap()); 77 CMDIData data = processor.process(cmdiFile); 70 78 assertEquals("test-hdl:1839/00-0000-0000-0000-0001-D", data.getId()); 71 79 List<String> resources = data.getResources(); … … 362 370 content += "</CMD>\n"; 363 371 File cmdiFile = createCmdiFile("testSession", content); 364 CMDID igester digester = new CMDIDigester(getIMDIFacetMap());365 CMDIData data = digester.process(cmdiFile);372 CMDIDataProcessor processor = getDataParser(getIMDIFacetMap()); 373 CMDIData data = processor.process(cmdiFile); 366 374 assertEquals("test-hdl:1839/00-0000-0000-0009-294C-9", data.getId()); 367 375 List<String> resources = data.getResources(); … … 452 460 content += "</CMD>\n"; 453 461 File cmdiFile = createCmdiFile("testSession", content); 454 CMDID igester digester = new CMDIDigester(getIMDIFacetMap());455 CMDIData data = digester.process(cmdiFile);462 CMDIDataProcessor processor = getDataParser(getIMDIFacetMap()); 463 CMDIData data = processor.process(cmdiFile); 456 464 assertEquals("test-hdl:1839/00-0000-0000-0009-294C-9", data.getId()); 457 465 List<String> resources = data.getResources(); … … 505 513 content += " <description>The one-eyed grandmother is 
one of many traditional Kuna stories performed in the Kuna gathering house. This story, performed here by Pedro Arias, combines European derived motifs (Tom Thumb and Hansel and Gretel) with themes that seem more Kuna in origin. All are woven together and a moral is provided. Pedro Arias performed this story before a gathered audience in the morning..\n"; 506 514 content += " </description>\n"; 515 content += " <description>Test</description>\n"; 507 516 content += " <identifier>http://uts.cc.utexas.edu/~ailla/audio/sherzer/one_eyed_grandmother.ram</identifier>\n"; 508 517 content += " <identifier>http://uts.cc.utexas.edu/~ailla/texts/sherzer/one_eyed_grandmother.pdf</identifier>\n"; … … 516 525 517 526 File cmdiFile = createCmdiFile("testOlac", content); 518 CMDID igester digester = new CMDIDigester(getOlacFacetMap());519 CMDIData data = digester.process(cmdiFile);527 CMDIDataProcessor processor = getDataParser(getOlacFacetMap()); 528 CMDIData data = processor.process(cmdiFile); 520 529 assertEquals("oai:ailla.utexas.edu:1", data.getId()); 521 530 List<String> resources = data.getResources(); … … 532 541 assertEquals("transcription", doc.getFieldValue("genre")); 533 542 // assertEquals("Kuna", doc.getFieldValue("subject")); 534 assertEquals(2, doc.getFieldValues("description").size()); 543 Collection<Object> fieldValues = doc.getFieldValues("description"); 544 assertEquals(3, fieldValues.size()); 545 List<String> descriptions = new ArrayList(fieldValues); 546 Collections.sort(descriptions); 547 assertEquals("\n Channel: Talking;\n Genre: Traditional Narrative / Story;\n Country: Panama;\n" 548 + " Place of Recording: Mulatuppu;\n Event: Community Gathering;\n" 549 + " Institutional Affiliation: University of Texas at Austin;\n Participant Information: Political Leader;\n" 550 + " ", descriptions.get(0).toString()); 551 assertEquals("Test", descriptions.get(1).toString()); 552 assertEquals("The one-eyed grandmother is one of many traditional Kuna stories performed 
" 553 + "in the Kuna gathering house. This story, performed here by Pedro Arias, combines " 554 + "European derived motifs (Tom Thumb and Hansel and Gretel) with themes that seem more " 555 + "Kuna in origin. All are woven together and a moral is provided. Pedro Arias performed " 556 + "this story before a gathered audience in the morning..\n ", descriptions.get(2).toString()); 535 557 } 536 558 … … 568 590 569 591 File cmdiFile = createCmdiFile("testOlac", content); 570 CMDID igester digester = new CMDIDigester(getOlacFacetMap());571 CMDIData data = digester.process(cmdiFile);592 CMDIDataProcessor processor = getDataParser(getOlacFacetMap()); 593 CMDIData data = processor.process(cmdiFile); 572 594 assertEquals("collection_ATILF_Resources.cmdi", data.getId()); 573 595 List<String> resources = data.getResources(); … … 616 638 617 639 File cmdiFile = createCmdiFile("testOlac", content); 618 CMDID igester digester = new CMDIDigester(getLrtFacetMap());619 CMDIData data = digester.process(cmdiFile);640 CMDIDataProcessor processor = getDataParser(getLrtFacetMap()); 641 CMDIData data = processor.process(cmdiFile); 620 642 assertEquals("clarin.eu:lrt:433", data.getId()); 621 643 List<String> resources = data.getResources();
Note: See TracChangeset
for help on using the changeset viewer.