Changeset 995
- Timestamp:
- 12/15/10 09:23:33 (13 years ago)
- Location:
- vlo/trunk/vlo_webapp/src
- Files:
- 4 edited
Legend:
- Unmodified
- Added
- Removed
vlo/trunk/vlo_webapp/src/main/java/eu/clarin/cmdi/vlo/importer/CMDIDigester.java
r992 r995 25 25 private final static Logger LOG = LoggerFactory.getLogger(CMDIDigester.class); 26 26 private final FacetMapping facetMapping; 27 // private XMLReader xmlReader;28 27 private DocumentBuilder builder; 29 28 … … 37 36 throw new RuntimeException("Cannot instantiate documentBuilder:", e); 38 37 } 39 // try {40 // xmlReader = createXmlReader();41 // } catch (SAXException e) {42 // throw new RuntimeException("Cannot instantiate xmlReader:", e);43 // }44 38 } 45 39 … … 50 44 XPath xpath = XPathFactory.newInstance().newXPath(); 51 45 result = createCMDIData(xpath, inputSource); 52 53 /**54 * Do not reuse the digester it holds state on bad parses. We can reuse the xmlReader. Creating a new Digester or reusing an55 * instance gives similar performance.56 * @see org.apache.commons.digester.Digester57 */58 //result = (CMDIData) createDigester().parse(inputSource);59 46 return result; 60 47 } … … 92 79 } // else do nothing it is perfectly acceptable that not all data is in a cmdi file so not everything will be matched. E.G xpath expression evaluation CMDI session files will never match on CMD corpus files. 
93 80 } 94 95 // private Digester createDigester() {96 // Digester digester = new Digester(xmlReader);97 // digester.setValidating(false);98 // digester.addObjectCreate("CMD", CMDIData.class);99 // digester.addBeanPropertySetter(facetMapping.getIdMapping(), "id");100 // digester.addCallMethod("CMD/Resources/ResourceProxyList/ResourceProxy/", "addResource", 2);101 // digester.addCallParam("CMD/Resources/ResourceProxyList/ResourceProxy/ResourceRef", 0);102 // digester.addCallParam("CMD/Resources/ResourceProxyList/ResourceProxy/ResourceType", 1);103 // // Map<String, String> facetMap = facetMapping.getFacetMap();104 // // for (String facet : facetMap.keySet()) {105 // // matchDocumentField(digester, facetMap.get(facet), facet);106 // // }107 // return digester;108 // }109 //110 // private void matchDocumentField(Digester digester, String pattern, String fieldName) {111 // String[] split = pattern.split(",@", 2);112 // String path = split[0];113 // String attribute = split.length == 2 ? split[1] : null;114 // digester.addCallMethod(path, "addDocField", 2);115 // digester.addObjectParam(path, 0, fieldName);116 // digester.addCallParam(path, 1, attribute);117 // }118 //119 // private XMLReader createXmlReader() throws SAXException {120 // XMLReader xmlReader = XMLReaderFactory.createXMLReader();121 // xmlReader.setFeature("http://xml.org/sax/features/validation", true);122 // xmlReader.setFeature("http://xml.org/sax/features/namespaces", true);123 // xmlReader.setProperty("http://java.sun.com/xml/jaxp/properties/schemaLanguage", "http://www.w3.org/2001/XMLSchema");124 // return xmlReader;125 // }126 127 81 } -
vlo/trunk/vlo_webapp/src/main/java/eu/clarin/cmdi/vlo/importer/FacetConfiguration.java
r992 r995 30 30 return name; 31 31 } 32 33 @Override 34 public String toString() { 35 return "name="+name+", pattern="+pattern; 36 } 32 37 } -
vlo/trunk/vlo_webapp/src/main/resources/importerConfig.xml
r994 r995 136 136 137 137 <bean id="imdiMapping" class="eu.clarin.cmdi.vlo.importer.FacetMapping"> <!-- add year? --> 138 <property name="idMapping" value=" CMD/Header/MdSelfLink/text()" />138 <property name="idMapping" value="/CMD/Header/MdSelfLink/text()" /> 139 139 <property name="facets"> 140 140 <list> 141 141 <bean class="eu.clarin.cmdi.vlo.importer.FacetConfiguration"> 142 142 <property name="name" value="name" /> 143 <property name="pattern" value=" CMD/Components/Session/Name/text()" />143 <property name="pattern" value="/CMD/Components/Session/Name/text()" /> 144 144 </bean> 145 145 <bean class="eu.clarin.cmdi.vlo.importer.FacetConfiguration"> 146 146 <property name="name" value="year" /> 147 <property name="pattern" value=" CMD/Components/Session/Date/text()" />147 <property name="pattern" value="/CMD/Components/Session/Date/text()" /> 148 148 </bean> 149 149 <bean class="eu.clarin.cmdi.vlo.importer.FacetConfiguration"> 150 150 <property name="name" value="continent" /> 151 <property name="pattern" value=" CMD/Components/Session/MDGroup/Location/Continent/text()" />151 <property name="pattern" value="/CMD/Components/Session/MDGroup/Location/Continent/text()" /> 152 152 </bean> 153 153 <bean class="eu.clarin.cmdi.vlo.importer.FacetConfiguration"> 154 154 <property name="name" value="country" /> 155 <property name="pattern" value=" CMD/Components/Session/MDGroup/Location/Country/text()" />155 <property name="pattern" value="/CMD/Components/Session/MDGroup/Location/Country/text()" /> 156 156 </bean> 157 157 <bean class="eu.clarin.cmdi.vlo.importer.FacetConfiguration"> 158 158 <property name="name" value="language" /> 159 <property name="pattern" value=" CMD/Components/Session/MDGroup/Content/Content_Languages/Content_Language/Id/text()" />159 <property name="pattern" value="/CMD/Components/Session/MDGroup/Content/Content_Languages/Content_Language/Id/text()" /> 160 160 </bean> 161 161 <bean class="eu.clarin.cmdi.vlo.importer.FacetConfiguration"> 162 162 
<property name="name" value="organisation" /> 163 <property name="pattern" value=" CMD/Components/Session/MDGroup/Project/Contact/Organisation/text()" />163 <property name="pattern" value="/CMD/Components/Session/MDGroup/Project/Contact/Organisation/text()" /> 164 164 </bean> 165 165 <bean class="eu.clarin.cmdi.vlo.importer.FacetConfiguration"> 166 166 <property name="name" value="genre" /> 167 <property name="pattern" value=" CMD/Components/Session/MDGroup/Content/Genre/text()" />167 <property name="pattern" value="/CMD/Components/Session/MDGroup/Content/Genre/text()" /> 168 168 <property name="caseInsensitive" value="true" /> 169 169 </bean> 170 170 <bean class="eu.clarin.cmdi.vlo.importer.FacetConfiguration"> 171 171 <property name="name" value="subject" /> 172 <property name="pattern" value=" CMD/Components/Session/MDGroup/Content/Subject/text()" />172 <property name="pattern" value="/CMD/Components/Session/MDGroup/Content/Subject/text()" /> 173 173 <property name="caseInsensitive" value="true" /> 174 174 </bean> 175 175 <bean class="eu.clarin.cmdi.vlo.importer.FacetConfiguration"> 176 176 <property name="name" value="description" /> 177 <property name="pattern" value=" CMD/Components/Session/descriptions/Description/text()" />177 <property name="pattern" value="/CMD/Components/Session/descriptions/Description/text()" /> 178 178 </bean> 179 179 </list> … … 187 187 188 188 <bean id="olacMapping" class="eu.clarin.cmdi.vlo.importer.FacetMapping"> 189 <property name="idMapping" value=" CMD/Header/MdSelfLink/text()" /> <!-- And some other example see http://trac.clarin.eu/wiki/CmdiVirtualLanguageObservatory -->189 <property name="idMapping" value="/CMD/Header/MdSelfLink/text()" /> <!-- And some other example see http://trac.clarin.eu/wiki/CmdiVirtualLanguageObservatory --> 190 190 <property name="facets"> 191 191 <list> 192 192 <bean class="eu.clarin.cmdi.vlo.importer.FacetConfiguration"> 193 193 <property name="name" value="name" /> 194 <property name="pattern" 
value=" CMD/Components/OLAC-DcmiTerms/title/text()" />194 <property name="pattern" value="/CMD/Components/OLAC-DcmiTerms/title/text()" /> 195 195 </bean> 196 196 <bean class="eu.clarin.cmdi.vlo.importer.FacetConfiguration"> 197 197 <property name="name" value="country" /> 198 <property name="pattern" value=" CMD/Components/OLAC-DcmiTerms/spatial/text()" />198 <property name="pattern" value="/CMD/Components/OLAC-DcmiTerms/spatial/text()" /> 199 199 <!-- 200 200 /CMD/Components/OLAC-DcmiTerms/spatial[@dcterms-type="ISO3166"] -> country … … 204 204 <bean class="eu.clarin.cmdi.vlo.importer.FacetConfiguration"> 205 205 <property name="name" value="language" /> 206 <property name="pattern" value=" CMD/Components/OLAC-DcmiTerms/language/@olac-language" />206 <property name="pattern" value="/CMD/Components/OLAC-DcmiTerms/language/@olac-language" /> 207 207 </bean> 208 208 <bean class="eu.clarin.cmdi.vlo.importer.FacetConfiguration"> 209 209 <property name="name" value="organisation" /> 210 <property name="pattern" value=" CMD/Components/OLAC-DcmiTerms/publisher/text()" />210 <property name="pattern" value="/CMD/Components/OLAC-DcmiTerms/publisher/text()" /> 211 211 </bean> 212 212 <bean class="eu.clarin.cmdi.vlo.importer.FacetConfiguration"> 213 213 <property name="name" value="genre" /> 214 <property name="pattern" value=" CMD/Components/OLAC-DcmiTerms/type/@olac-linguistic-type" />214 <property name="pattern" value="/CMD/Components/OLAC-DcmiTerms/type/@olac-linguistic-type" /> 215 215 <property name="caseInsensitive" value="true" /> 216 216 </bean> 217 217 <bean class="eu.clarin.cmdi.vlo.importer.FacetConfiguration"> 218 218 <property name="name" value="description" /> 219 <property name="pattern" value=" CMD/Components/OLAC-DcmiTerms/description/text()" />219 <property name="pattern" value="/CMD/Components/OLAC-DcmiTerms/description/text()" /> 220 220 </bean> 221 221 <!-- … … 244 244 245 245 <bean id="lrtMapping" class="eu.clarin.cmdi.vlo.importer.FacetMapping"> 246 
<property name="idMapping" value=" CMD/Header/MdSelfLink/text()" />246 <property name="idMapping" value="/CMD/Header/MdSelfLink/text()" /> 247 247 <property name="facets"> 248 248 <list> 249 249 <bean class="eu.clarin.cmdi.vlo.importer.FacetConfiguration"> 250 250 <property name="name" value="name" /> 251 <property name="pattern" value=" CMD/Components/LrtInventoryResource/LrtCommon/ResourceName/text()" />251 <property name="pattern" value="/CMD/Components/LrtInventoryResource/LrtCommon/ResourceName/text()" /> 252 252 </bean> 253 253 <bean class="eu.clarin.cmdi.vlo.importer.FacetConfiguration"> 254 254 <property name="name" value="country" /> 255 <property name="pattern" value=" CMD/Components/LrtInventoryResource/LrtCommon/Countries/Country/code/text()" />255 <property name="pattern" value="/CMD/Components/LrtInventoryResource/LrtCommon/Countries/Country/Code/text()" /> 256 256 </bean> 257 257 <bean class="eu.clarin.cmdi.vlo.importer.FacetConfiguration"> 258 258 <property name="name" value="language" /> 259 <property name="pattern" value=" CMD/Components/LrtInventoryResource/LrtCommon/Languages/ISO639/iso-639-3-code/text()" />259 <property name="pattern" value="/CMD/Components/LrtInventoryResource/LrtCommon/Languages/ISO639/iso-639-3-code/text()" /> 260 260 </bean> 261 261 <bean class="eu.clarin.cmdi.vlo.importer.FacetConfiguration"> 262 262 <property name="name" value="organisation" /> 263 <property name="pattern" value=" CMD/Components/LrtInventoryResource/LrtCommon/Institute/text()" />263 <property name="pattern" value="/CMD/Components/LrtInventoryResource/LrtCommon/Institute/text()" /> 264 264 </bean> 265 265 <bean class="eu.clarin.cmdi.vlo.importer.FacetConfiguration"> 266 266 <property name="name" value="year" /> 267 267 <property name="pattern" 268 value=" CMD/Components/LrtInventoryResource/LrtCommon/FinalizationYearResourceCreation/text()" />268 value="/CMD/Components/LrtInventoryResource/LrtCommon/FinalizationYearResourceCreation/text()" /> 269 269 
</bean> 270 270 <bean class="eu.clarin.cmdi.vlo.importer.FacetConfiguration"> 271 271 <property name="name" value="description" /> 272 <property name="pattern" value=" CMD/Components/LrtInventoryResource/LrtCommon/Description/text()" />272 <property name="pattern" value="/CMD/Components/LrtInventoryResource/LrtCommon/Description/text()" /> 273 273 </bean> 274 274 <!-- <entry key="continent">--> -
vlo/trunk/vlo_webapp/src/test/java/eu/clarin/cmdi/vlo/importer/CMDIDigesterTest.java
r989 r995 379 379 "This recording was made to generate a freely available test resource including speech and gestures. The annotations were created by Peter and Kita who is gesture researcher at the MPI for Psycholinguistics.", 380 380 doc.getFieldValue("description")); 381 assertEquals("2002-10-30", doc.getFieldValue("year")); //TODO PD curate year needs to be only the year also for olac, lrt381 assertEquals("2002-10-30", doc.getFieldValue("year")); 382 382 assertEquals(null, doc.getFieldValue("subject")); 383 //TODO PD make it work check TRAC ticket to get resources in384 // Collection<Object> fieldValue = doc.getFieldValues("resource");385 // assertEquals(2, fieldValue.size());386 383 } 387 384 … … 534 531 assertEquals(null, doc.getFieldValue("organisation")); 535 532 assertEquals("transcription", doc.getFieldValue("genre")); 536 // assertEquals("Kuna", doc.getFieldValue("subject"));533 // assertEquals("Kuna", doc.getFieldValue("subject")); 537 534 assertEquals(2, doc.getFieldValues("description").size()); 538 535 } … … 581 578 } 582 579 583 //TODO PD add LRT test 584 580 @Test 581 public void testLrtCollection() throws Exception { 582 String content = ""; 583 content += "<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n"; 584 content += "<CMD ns0:schemaLocation=\"http://www.clarin.eu/cmd http://catalog.clarin.eu/ds/ComponentRegistry/rest/registry/profiles/clarin.eu:cr1:p_1289827960126/xsd\" xmlns:ns0=\"http://www.w3.org/2001/XMLSchema-instance\">\n"; 585 content += " <Header>\n"; 586 content += " <MdCreator>lrt2cmdi.py</MdCreator>\n"; 587 content += " <MdCreationDate>2010-11-17</MdCreationDate>\n"; 588 content += " <MdSelfLink>clarin.eu:lrt:433</MdSelfLink>\n"; 589 content += " <MdProfile>clarin.eu:cr1:p_1289827960126</MdProfile>\n"; 590 content += " </Header>\n"; 591 content += " <Resources>\n"; 592 content += " <ResourceProxyList />\n"; 593 content += " <JournalFileProxyList />\n"; 594 content += " <ResourceRelationList />\n"; 595 content += " </Resources>\n"; 596 
content += " <Components>\n"; 597 content += " <LrtInventoryResource>\n"; 598 content += " <LrtCommon>\n"; 599 content += " <ResourceName>Corpus of Present-day Written Estonian</ResourceName>\n"; 600 content += " <ResourceType>Written Corpus</ResourceType>\n"; 601 content += " <LanguagesOther />\n"; 602 content += " <Description>written general; 95 mio words; TEI/SGML</Description>\n"; 603 content += " <ContactPerson>Kadri.Muischnek@ut.ee</ContactPerson>\n"; 604 content += " <Format />\n"; 605 content += " <Institute>Test</Institute>\n"; 606 content += " <MetadataLink />\n"; 607 content += " <Publications />\n"; 608 content += " <ReadilyAvailable>true</ReadilyAvailable>\n"; 609 content += " <ReferenceLink /> \n"; 610 content += " <Languages><ISO639><iso-639-3-code>est</iso-639-3-code></ISO639></Languages>\n"; 611 content += " <Countries><Country><Code>EE</Code></Country></Countries>\n"; 612 content += " </LrtCommon>\n"; 613 content += " </LrtInventoryResource>\n"; 614 content += " </Components>\n"; 615 content += "</CMD>\n"; 616 617 File cmdiFile = createCmdiFile("testOlac", content); 618 CMDIDigester digester = new CMDIDigester(getLrtFacetMap()); 619 CMDIData data = digester.process(cmdiFile); 620 assertEquals("clarin.eu:lrt:433", data.getId()); 621 List<String> resources = data.getResources(); 622 assertEquals(0, resources.size()); 623 SolrInputDocument doc = data.getSolrDocument(); 624 assertNotNull(doc); 625 assertEquals(5, doc.getFieldNames().size()); 626 assertEquals("Corpus of Present-day Written Estonian", doc.getFieldValue("name")); 627 assertEquals(null, doc.getFieldValue("continent")); 628 assertEquals(1, doc.getFieldValues("language").size()); 629 assertEquals("est", doc.getFieldValue("language")); 630 assertEquals("EE", doc.getFieldValue("country")); 631 assertEquals("Test", doc.getFieldValue("organisation")); 632 assertEquals(null, doc.getFieldValue("year")); 633 assertEquals(null, doc.getFieldValue("genre")); 634 assertEquals("written general; 95 mio 
words; TEI/SGML", doc.getFieldValue("description")); 635 } 636 585 637 private FacetMapping getOlacFacetMap() { 586 BeanFactory factory = new ClassPathXmlApplicationContext(new String[] { "importerConfig.xml" }); 638 BeanFactory factory = new ClassPathXmlApplicationContext(new String[] { "importerConfig.xml" }); 587 639 FacetMapping facetMapping = (FacetMapping) factory.getBean("olacMapping"); 588 640 return facetMapping; … … 592 644 BeanFactory factory = new ClassPathXmlApplicationContext(new String[] { "importerConfig.xml" }); 593 645 FacetMapping facetMapping = (FacetMapping) factory.getBean("imdiMapping"); 646 return facetMapping; 647 } 648 649 private FacetMapping getLrtFacetMap() { 650 BeanFactory factory = new ClassPathXmlApplicationContext(new String[] { "importerConfig.xml" }); 651 FacetMapping facetMapping = (FacetMapping) factory.getBean("lrtMapping"); 594 652 return facetMapping; 595 653 }
Note: See TracChangeset for help on using the changeset viewer.