source: vlo/trunk/vlo-importer/src/test/java/eu/clarin/cmdi/vlo/importer/CMDIDataProcessorTest.java @ 5979

Last change on this file since 5979 was 5979, checked in by teckart@informatik.uni-leipzig.de, 9 years ago

Added support for using local XML schema files instead of using the component registry (#522), also stricter check when extracting profile ID from CMDI instance file

File size: 55.4 KB
Line 
1package eu.clarin.cmdi.vlo.importer;
2
3import eu.clarin.cmdi.vlo.FacetConstants;
4import java.io.File;
5import java.util.ArrayList;
6import java.util.Collection;
7import java.util.Collections;
8import java.util.List;
9import org.apache.solr.common.SolrInputDocument;
10import static org.junit.Assert.assertEquals;
11import static org.junit.Assert.assertNotNull;
12import static org.junit.Assert.assertNull;
13import static org.junit.Assert.assertTrue;
14import org.junit.Test;
15
16public class CMDIDataProcessorTest extends ImporterTestcase {
17
18    private CMDIDataProcessor getDataParser() {
19        return new CMDIParserVTDXML(MetadataImporter.POST_PROCESSORS, true);
20    }
21   
22    @Test
23    public void testCreateCMDIDataFromCorpus() throws Exception {
24       
25        // make sure the mapping file for testing is used
26        config.setFacetConceptsFile("/facetConceptsTest.xml");
27
28        String content = "";
29        content += "<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n";
30        content += "<CMD xmlns=\"http://www.clarin.eu/cmd/\" xmlns:xsi=\"http://www.w3.org/2001/XMLSchema-instance\">\n";
31        content += "   <Header>\n";
32        content += "      <MdCreationDate>2003-01-14</MdCreationDate>\n";
33        content += "      <MdSelfLink>test-hdl:1839/00-0000-0000-0000-0001-D</MdSelfLink>\n";
34        content += "      <MdProfile>clarin.eu:cr1:p_1274880881885</MdProfile>\n";
35        content += "   </Header>\n";
36        content += "   <Resources>\n";
37        content += "      <ResourceProxyList>\n";
38        content += "         <ResourceProxy id=\"d28635e19\">\n";
39        content += "            <ResourceType>Metadata</ResourceType>\n";
40        content += "            <ResourceRef>../acqui_data/Corpusstructure/acqui.imdi.cmdi</ResourceRef>\n";
41        content += "         </ResourceProxy>\n";
42        content += "         <ResourceProxy id=\"d28635e23\">\n";
43        content += "            <ResourceType>Metadata</ResourceType>\n";
44        content += "            <ResourceRef>../Comprehension/Corpusstructure/comprehension.imdi.cmdi</ResourceRef>\n";
45        content += "         </ResourceProxy>\n";
46        content += "         <ResourceProxy id=\"d28635e26\">\n";
47        content += "            <ResourceType>Metadata</ResourceType>\n";
48        content += "            <ResourceRef>../lac_data/Corpusstructure/lac.imdi.cmdi</ResourceRef>\n";
49        content += "         </ResourceProxy>\n";
50        content += "      </ResourceProxyList>\n";
51        content += "      <JournalFileProxyList/>\n";
52        content += "      <ResourceRelationList/>\n";
53        content += "   </Resources>\n";
54        content += "   <Components>\n";
55        content += "      <imdi-corpus>\n";
56        content += "         <Corpus>\n";
57        content += "            <Name>MPI corpora</Name>\n";
58        content += "            <Title>Corpora of the Max-Planck Institute for Psycholinguistics</Title>\n";
59        content += "            <CorpusLink Name=\"Acquisition\">../acqui_data/Corpusstructure/acqui.imdi</CorpusLink>\n";
60        content += "            <CorpusLink Name=\"Comprehension\">../Comprehension/Corpusstructure/comprehension.imdi</CorpusLink>\n";
61        content += "            <CorpusLink Name=\"Language and Cognition\">../lac_data/Corpusstructure/lac.imdi</CorpusLink>\n";
62        content += "            <descriptions>\n";
63        content += "               <Description LanguageId=\"\">IMDI corpora</Description>\n";
64        content += "               <Description LanguageId=\"\"/>\n";
65        content += "            </descriptions>\n";
66        content += "         </Corpus>\n";
67        content += "      </imdi-corpus>\n";
68        content += "   </Components>\n";
69        content += "</CMD>\n";
70        File cmdiFile = createCmdiFile("testCorpus", content);
71        CMDIDataProcessor processor = getDataParser();
72        CMDIData data = processor.process(cmdiFile);
73        assertEquals("test-hdl_58_1839_47_00-0000-0000-0000-0001-D", data.getId());
74        List<Resource> resources = data.getMetadataResources();
75        assertEquals(3, resources.size());
76        Resource res = resources.get(0);
77        assertEquals("../acqui_data/Corpusstructure/acqui.imdi.cmdi", res.getResourceName());
78        assertEquals(null, res.getMimeType());
79        assertEquals(0, data.getDataResources().size());
80        SolrInputDocument doc = data.getSolrDocument();
81        // TODO FIX bad test case. Depends on the presence of an internet connection! (BAD!)
82        assertTrue(doc.getFieldValues(FacetConstants.FIELD_CLARIN_PROFILE).contains("imdi-corpus"));
83        assertNotNull(doc);
84    }
85
86    @Test
87    public void testCreateCMDIDataFromSession() throws Exception {
88       
89        // make sure the mapping file for testing is used
90        config.setFacetConceptsFile("/facetConceptsTest.xml");
91
92        String content = "";
93        content += "<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n";
94        content += "<CMD xmlns=\"http://www.clarin.eu/cmd/\" xmlns:xsi=\"http://www.w3.org/2001/XMLSchema-instance\"\n";
95        content += "     xsi:schemaLocation=\"http://www.clarin.eu/cmd http://catalog.clarin.eu/ds/ComponentRegistry/rest/registry/profiles/clarin.eu:cr1:p_1271859438204/xsd\">\n";
96        content += "   <Header>\n";
97        content += "      <MdCreationDate>2008-05-27</MdCreationDate>\n";
98        content += "      <MdSelfLink>test-hdl:1839/00-0000-0000-0009-294C-9</MdSelfLink>\n";
99        content += "      <MdProfile>clarin.eu:cr1:p_1271859438204</MdProfile>\n";
100        content += "   </Header>\n";
101        content += "   <Resources>\n";
102        content += "      <ResourceProxyList>\n";
103        content += "         <ResourceProxy id=\"d314e408\">\n";
104        content += "            <ResourceType mimetype=\"video/x-mpeg1\" >Resource</ResourceType>\n";
105        content += "            <ResourceRef>../Media/elan-example1.mpg</ResourceRef>\n";
106        content += "         </ResourceProxy>\n";
107        content += "         <ResourceProxy id=\"d314e471\">\n";
108        content += "            <ResourceType mimetype=\"audio/mpeg\" >Resource</ResourceType>\n";
109        content += "            <ResourceRef>../Media/elan-example1.mp3</ResourceRef>\n";
110        content += "         </ResourceProxy>\n";
111        content += "      </ResourceProxyList>\n";
112        content += "      <JournalFileProxyList/>\n";
113        content += "      <ResourceRelationList/>\n";
114        content += "   </Resources>\n";
115        content += "   <Components>\n";
116        content += "      <Session>\n";
117        content += "         <Name>kleve-route</Name>\n";
118        content += "         <Title>route description to Kleve</Title>\n";
119        content += "         <Date>2002-10-30</Date>\n";
120        content += "         <descriptions>\n";
121        content += "            <Description xml:lang='eng' LanguageId=\"ISO639-2:eng\">This  recording was made to generate a freely available test resource including speech and gestures. The annotations were created by Peter and Kita who is gesture researcher at the MPI for Psycholinguistics.</Description>\n";
122        content += "            <Description LanguageId=\"ISO639-2:ger\">Diese Aufnahme wurde erzeugt, um eine frei verf\\u00fcgbare Test Resource zur Verf\\u00fcgung stellen zu k\\u00f6nnen, die Sprache und Gestik umfasst. Die Annotationen wurden von Peter und Kita, dem Gestik Researcher am MPI erzeugt.</Description>\n";
123        content += "         </descriptions>\n";
124        content += "         <MDGroup>\n";
125        content += "            <Location>\n";
126        content += "               <Continent>Europe</Continent>\n";
127        content += "               <Country>Netherlands</Country>\n";
128        content += "               <Region/>\n";
129        content += "               <Address>Wundtlaan 1, Nijmegen</Address>\n";
130        content += "            </Location>\n";
131        content += "            <Project>\n";
132        content += "               <Name>Peter Wittenburg</Name>\n";
133        content += "               <Title>Route description test resource</Title>\n";
134        content += "               <Id/>\n";
135        content += "               <Contact>\n";
136        content += "                  <Name>Peter Wittenburg</Name>\n";
137        content += "                  <Address>Wundtlaan 1, 6525 XD Nijmegen</Address>\n";
138        content += "                  <Email>peter.wittenburg@mpi.nl</Email>\n";
139        content += "                  <Organisation>Max Planck Institute for Psycholinguistics</Organisation>\n";
140        content += "               </Contact>\n";
141        content += "               <descriptions>\n";
142        content += "                  <Description LanguageId=\"\"/>\n";
143        content += "               </descriptions>\n";
144        content += "            </Project>\n";
145        content += "            <Keys>\n";
146        content += "               <Key Name=\"conversion.IMDI.1.9to3.0.warning\">Unknown mapping of Genre: conversation|explanation|unspecified --&gt; ???</Key>\n";
147        content += "            </Keys>\n";
148        content += "            <Content>\n";
149        content += "               <Genre>Demo</Genre>\n";
150        content += "               <SubGenre>Unspecified</SubGenre>\n";
151        content += "               <Task>route description</Task>\n";
152        content += "               <Modalities>Speech; Gestures</Modalities>\n";
153        content += "               <CommunicationContext>\n";
154        content += "                  <Interactivity>interactive</Interactivity>\n";
155        content += "                  <PlanningType>semi-spontaneous</PlanningType>\n";
156        content += "                  <Involvement>elicited</Involvement>\n";
157        content += "                  <SocialContext>Unspecified</SocialContext>\n";
158        content += "                  <EventStructure>Unspecified</EventStructure>\n";
159        content += "                  <Channel>Unspecified</Channel>\n";
160        content += "               </CommunicationContext>\n";
161        content += "               <Content_Languages>\n";
162        content += "                  <descriptions>\n";
163        content += "                     <Description LanguageId=\"\"/>\n";
164        content += "                  </descriptions>\n";
165        content += "                  <Content_Language>\n";
166        content += "                     <Id>ISO639-3:eng</Id>\n";
167        content += "                     <Name>English</Name>\n";
168        content += "                     <descriptions>\n";
169        content += "                        <Description LanguageId=\"\"/>\n";
170        content += "                     </descriptions>\n";
171        content += "                  </Content_Language>\n";
172        content += "               </Content_Languages>\n";
173        content += "               <Keys>\n";
174        content += "                  <Key Name=\"IMDI__1_9.Interactional\">conversation</Key>\n";
175        content += "                  <Key Name=\"IMDI__1_9.Discursive\">explanation</Key>\n";
176        content += "                  <Key Name=\"IMDI__1_9.Interactional\">Unspecified</Key>\n";
177        content += "               </Keys>\n";
178        content += "               <descriptions>\n";
179        content += "                  <Description LanguageId=\"ISO639:eng\">This file was generated from an IMDI 1.9 file and transformed to IMDI 3.0. The substructure of Genre is replaced by two elements named \"Genre\" and \"SubGenre\". The original content of Genre substructure was: Interactional = 'conversation', Discursive = 'explanation', Performance = 'Unspecified'. These values have been added as Keys to the Content information.</Description>\n";
180        content += "                  <Description LanguageId=\"ISO639:eng\">Peter explains how to come from Nijmegen to Kleve by car, such that Kita would be able to get there.</Description>\n";
181        content += "               </descriptions>\n";
182        content += "            </Content>\n";
183        content += "            <Actors>\n";
184        content += "               <descriptions>\n";
185        content += "                  <Description LanguageId=\"\"/>\n";
186        content += "               </descriptions>\n";
187        content += "               <Actor>\n";
188        content += "                  <Role>interviewee</Role>\n";
189        content += "                  <Name>Peter</Name>\n";
190        content += "                  <FullName>Peter Wittenburg</FullName>\n";
191        content += "                  <Code>W</Code>\n";
192        content += "                  <FamilySocialRole>Unspecified</FamilySocialRole>\n";
193        content += "                  <EthnicGroup/>\n";
194        content += "                  <Age>Unknown</Age>\n";
195        content += "                  <BirthDate>Unspecified</BirthDate>\n";
196        content += "                  <Sex>Unknown</Sex>\n";
197        content += "                  <Education>university</Education>\n";
198        content += "                  <Anonymized>true</Anonymized>\n";
199        content += "                  <Contact>\n";
200        content += "                     <Name/>\n";
201        content += "                     <Address/>\n";
202        content += "                     <Email/>\n";
203        content += "                     <Organisation/>\n";
204        content += "                  </Contact>\n";
205        content += "                  <Keys/>\n";
206        content += "                  <descriptions>\n";
207        content += "                     <Description LanguageId=\"\"/>\n";
208        content += "                  </descriptions>\n";
209        content += "                  <Actor_Languages>\n";
210        content += "                     <descriptions>\n";
211        content += "                        <Description LanguageId=\"\"/>\n";
212        content += "                     </descriptions>\n";
213        content += "                     <Actor_Language>\n";
214        content += "                        <Id>ISO639-3:nld</Id>\n";
215        content += "                        <Name>Dutch</Name>\n";
216        content += "                        <descriptions>\n";
217        content += "                           <Description LanguageId=\"\"/>\n";
218        content += "                        </descriptions>\n";
219        content += "                     </Actor_Language>\n";
220        content += "                     <Actor_Language>\n";
221        content += "                        <Id>ISO639-3:deu</Id>\n";
222        content += "                        <Name>German</Name>\n";
223        content += "                        <descriptions>\n";
224        content += "                           <Description LanguageId=\"\"/>\n";
225        content += "                        </descriptions>\n";
226        content += "                     </Actor_Language>\n";
227        content += "                     <Actor_Language>\n";
228        content += "                        <Id>ISO639-3:eng</Id>\n";
229        content += "                        <Name>English</Name>\n";
230        content += "                        <descriptions>\n";
231        content += "                           <Description LanguageId=\"\"/>\n";
232        content += "                        </descriptions>\n";
233        content += "                     </Actor_Language>\n";
234        content += "                  </Actor_Languages>\n";
235        content += "               </Actor>\n";
236        content += "               <Actor>\n";
237        content += "                  <Role>interviewer</Role>\n";
238        content += "                  <Name>Kita</Name>\n";
239        content += "                  <FullName>Sotaro Kita</FullName>\n";
240        content += "                  <Code>k</Code>\n";
241        content += "                  <FamilySocialRole>Unspecified</FamilySocialRole>\n";
242        content += "                  <EthnicGroup/>\n";
243        content += "                  <Age>Unknown</Age>\n";
244        content += "                  <BirthDate>Unspecified</BirthDate>\n";
245        content += "                  <Sex>Unknown</Sex>\n";
246        content += "                  <Education>university</Education>\n";
247        content += "                  <Anonymized>true</Anonymized>\n";
248        content += "                  <Contact>\n";
249        content += "                     <Name/>\n";
250        content += "                     <Address/>\n";
251        content += "                     <Email/>\n";
252        content += "                     <Organisation/>\n";
253        content += "                  </Contact>\n";
254        content += "                  <Keys/>\n";
255        content += "                  <descriptions>\n";
256        content += "                     <Description LanguageId=\"\"/>\n";
257        content += "                  </descriptions>\n";
258        content += "                  <Actor_Languages>\n";
259        content += "                     <descriptions>\n";
260        content += "                        <Description LanguageId=\"\"/>\n";
261        content += "                     </descriptions>\n";
262        content += "                     <Actor_Language>\n";
263        content += "                        <Id>ISO639-3:eng</Id>\n";
264        content += "                        <Name>English</Name>\n";
265        content += "                        <descriptions>\n";
266        content += "                           <Description LanguageId=\"\"/>\n";
267        content += "                        </descriptions>\n";
268        content += "                     </Actor_Language>\n";
269        content += "                     <Actor_Language>\n";
270        content += "                        <Id>ISO639-3:jpn</Id>\n";
271        content += "                        <Name>Japanese</Name>\n";
272        content += "                        <descriptions>\n";
273        content += "                           <Description LanguageId=\"\"/>\n";
274        content += "                        </descriptions>\n";
275        content += "                     </Actor_Language>\n";
276        content += "                  </Actor_Languages>\n";
277        content += "               </Actor>\n";
278        content += "               <Actor>\n";
279        content += "                  <Role>Collector</Role>\n";
280        content += "                  <Name>Peter Wittenburg</Name>\n";
281        content += "                  <FullName>Peter Wittenburg</FullName>\n";
282        content += "                  <Code>Unspecified</Code>\n";
283        content += "                  <FamilySocialRole>Unspecified</FamilySocialRole>\n";
284        content += "                  <EthnicGroup/>\n";
285        content += "                  <Age>Unspecified</Age>\n";
286        content += "                  <BirthDate>Unspecified</BirthDate>\n";
287        content += "                  <Sex>Unspecified</Sex>\n";
288        content += "                  <Education/>\n";
289        content += "                  <Anonymized>false</Anonymized>\n";
290        content += "                  <Contact>\n";
291        content += "                     <Name>Peter Wittenburg</Name>\n";
292        content += "                     <Address>Wundtlaan 1, 6525 XD Nijmegen</Address>\n";
293        content += "                     <Email>peter.wittenburg@mpi.nl</Email>\n";
294        content += "                     <Organisation>Max-Planck-Institute for Psycholinguistics</Organisation>\n";
295        content += "                  </Contact>\n";
296        content += "                  <Keys/>\n";
297        content += "                  <descriptions>\n";
298        content += "                     <Description LanguageId=\"\"/>\n";
299        content += "                  </descriptions>\n";
300        content += "                  <Actor_Languages/>\n";
301        content += "               </Actor>\n";
302        content += "            </Actors>\n";
303        content += "         </MDGroup>\n";
304        content += "         <Resources>\n";
305        content += "            <MediaFile ref=\"d314e408\">\n";
306        content += "               <ResourceLink>../Media/elan-example1.mpg</ResourceLink>\n";
307        content += "               <Type>video</Type>\n";
308        content += "               <Format>video/x-mpeg1</Format>\n";
309        content += "               <Size/>\n";
310        content += "               <Quality>Unknown</Quality>\n";
311        content += "               <RecordingConditions>excellent</RecordingConditions>\n";
312        content += "               <TimePosition>\n";
313        content += "                  <Start>Unknown</Start>\n";
314        content += "                  <End>Unknown</End>\n";
315        content += "               </TimePosition>\n";
316        content += "               <Access>\n";
317        content += "                  <Availability>openly available</Availability>\n";
318        content += "                  <Date>2003-02-12</Date>\n";
319        content += "                  <Owner>MPI for Psycholinguistics</Owner>\n";
320        content += "                  <Publisher/>\n";
321        content += "                  <Contact>\n";
322        content += "                     <Name>Romuald Skiba</Name>\n";
323        content += "                     <Address/>\n";
324        content += "                     <Email/>\n";
325        content += "                     <Organisation/>\n";
326        content += "                  </Contact>\n";
327        content += "                  <descriptions>\n";
328        content += "                     <Description LanguageId=\"\"/>\n";
329        content += "                  </descriptions>\n";
330        content += "               </Access>\n";
331        content += "               <descriptions>\n";
332        content += "                  <Description LanguageId=\"\"/>\n";
333        content += "               </descriptions>\n";
334        content += "               <Keys/>\n";
335        content += "            </MediaFile>\n";
336        content += "            <MediaFile ref=\"d314e471\">\n";
337        content += "               <ResourceLink>../Media/elan-example1.mp4</ResourceLink>\n";
338        content += "               <Type>video</Type>\n";
339        content += "               <Format>video/mp4</Format>\n";
340        content += "               <Size/>\n";
341        content += "               <Quality>Unknown</Quality>\n";
342        content += "               <RecordingConditions>excellent</RecordingConditions>\n";
343        content += "               <TimePosition>\n";
344        content += "                  <Start>Unknown</Start>\n";
345        content += "                  <End>Unknown</End>\n";
346        content += "               </TimePosition>\n";
347        content += "               <Access>\n";
348        content += "                  <Availability>openly available</Availability>\n";
349        content += "                  <Date>2003-02-12</Date>\n";
350        content += "                  <Owner>MPI for Psycholinguistics</Owner>\n";
351        content += "                  <Publisher/>\n";
352        content += "                  <Contact>\n";
353        content += "                     <Name>Romuald Skiba</Name>\n";
354        content += "                     <Address/>\n";
355        content += "                     <Email/>\n";
356        content += "                     <Organisation/>\n";
357        content += "                  </Contact>\n";
358        content += "                  <descriptions>\n";
359        content += "                     <Description LanguageId=\"\"/>\n";
360        content += "                  </descriptions>\n";
361        content += "               </Access>\n";
362        content += "               <descriptions>\n";
363        content += "                  <Description LanguageId=\"\"/>\n";
364        content += "               </descriptions>\n";
365        content += "               <Keys/>\n";
366        content += "            </MediaFile>\n";
367        content += "         </Resources>\n";
368        content += "         <References>\n";
369        content += "            <descriptions>\n";
370        content += "               <Description LanguageId=\"\"/>\n";
371        content += "            </descriptions>\n";
372        content += "         </References>\n";
373        content += "      </Session>\n";
374        content += "   </Components>\n";
375        content += "</CMD>\n";
376        File cmdiFile = createCmdiFile("testSession", content);
377        CMDIDataProcessor processor = getDataParser();
378        CMDIData data = processor.process(cmdiFile);
379        assertEquals("test-hdl_58_1839_47_00-0000-0000-0009-294C-9", data.getId());
380        List<Resource> resources = data.getMetadataResources();
381        assertEquals(0, resources.size());
382        List<Resource> dataResources = data.getDataResources();
383        assertEquals(2, dataResources.size());
384        Resource res = dataResources.get(0);
385        assertEquals("../Media/elan-example1.mpg", res.getResourceName());
386        assertEquals("video/x-mpeg1", res.getMimeType());
387        res = dataResources.get(1);
388        assertEquals("../Media/elan-example1.mp3", res.getResourceName());
389        assertEquals("audio/mpeg", res.getMimeType());
390        SolrInputDocument doc = data.getSolrDocument();
391        assertNotNull(doc);
392        assertEquals(16, doc.getFieldNames().size());
393        assertEquals("test-hdl:1839/00-0000-0000-0009-294C-9", doc.getFieldValue("_selfLink"));
394        assertEquals("kleve-route", doc.getFieldValue("name"));
395        assertEquals("Peter Wittenburg", doc.getFieldValue(FacetConstants.FIELD_PROJECT_NAME));
396        assertEquals("Europe", doc.getFieldValue("continent"));
397        assertEquals("English", doc.getFieldValue("language"));
398        assertEquals("Netherlands", doc.getFieldValue("country"));
399        assertEquals("MPI for Psycholinguistics, MPG", doc.getFieldValue("organisation"));
400        assertEquals("demo", doc.getFieldValue("genre"));
401        assertEquals(
402                "{lang='eng'}This  recording was made to generate a freely available test resource including speech and gestures. The annotations were created by Peter and Kita who is gesture researcher at the MPI for Psycholinguistics.",
403                doc.getFieldValue("description"));
404        assertEquals("2002", doc.getFieldValue("year"));
405        List<String> fieldValues = new ArrayList(doc.getFieldValues(FacetConstants.FIELD_FORMAT));
406        assertEquals(2, fieldValues.size());
407        assertEquals("video/x-mpeg1", fieldValues.get(0));
408        assertEquals("video/mp4", fieldValues.get(1));
409        assertEquals(null, doc.getFieldValue("subject"));
410    }
411
412    @Test
413    public void testCreateCMDISessionSmall() throws Exception {
414       
415        // make sure the mapping file for testing is used
416        config.setFacetConceptsFile("/facetConceptsTest.xml");
417
418        String content = "";
419        content += "<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n";
420        content += "<CMD xmlns=\"http://www.clarin.eu/cmd/\" xmlns:xsi=\"http://www.w3.org/2001/XMLSchema-instance\"\n";
421        content += "     xsi:schemaLocation=\"http://www.clarin.eu/cmd http://catalog.clarin.eu/ds/ComponentRegistry/rest/registry/profiles/clarin.eu:cr1:p_1271859438204/xsd\">\n";
422        content += "   <Header>\n";
423        content += "      <MdCreationDate>2008-05-27</MdCreationDate>\n";
424        content += "      <MdSelfLink>test-hdl:1839/00-0000-0000-0009-294C-9</MdSelfLink>\n";
425        content += "      <MdProfile>clarin.eu:cr1:p_1271859438204</MdProfile>\n";
426        content += "   </Header>\n";
427        content += "   <Resources>\n";
428        content += "    </Resources>\n";
429        content += "   <Components>\n";
430        content += "      <Session>\n";
431        content += "         <Name>kleve-route</Name>\n";
432        content += "      </Session>\n";
433        content += "   </Components>\n";
434        content += "</CMD>\n";
435        File cmdiFile = createCmdiFile("testSession", content);
436        CMDIDataProcessor processor = getDataParser();
437        CMDIData data = processor.process(cmdiFile);
438        assertEquals("kleve-route", data.getSolrDocument().getFieldValue(FacetConstants.FIELD_NAME));
439    }
440
441    @Test
442    public void testEmptyFieldsShouldBeNull() throws Exception {
443       
444        // make sure the mapping file for testing is used
445        config.setFacetConceptsFile("/facetConceptsTest.xml");
446
447        String content = "";
448        content += "<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n";
449        content += "<CMD xmlns=\"http://www.clarin.eu/cmd/\" xmlns:xsi=\"http://www.w3.org/2001/XMLSchema-instance\"\n";
450        content += "     xsi:schemaLocation=\"http://www.clarin.eu/cmd http://catalog.clarin.eu/ds/ComponentRegistry/rest/registry/profiles/clarin.eu:cr1:p_1271859438204/xsd\">\n";
451        content += "   <Header>\n";
452        content += "      <MdCreationDate>2008-05-27</MdCreationDate>\n";
453        content += "      <MdSelfLink>test-hdl:1839/00-0000-0000-0009-294C-9</MdSelfLink>\n";
454        content += "      <MdProfile>clarin.eu:cr1:p_1271859438204</MdProfile>\n";
455        content += "   </Header>\n";
456        content += "   <Resources>\n";
457        content += "      <ResourceProxyList>\n";
458        content += "      </ResourceProxyList>\n";
459        content += "      <JournalFileProxyList/>\n";
460        content += "      <ResourceRelationList/>\n";
461        content += "   </Resources>\n";
462        content += "   <Components>\n";
463        content += "      <Session>\n";
464        content += "         <Name>kleve-route</Name>\n";
465        content += "         <Title>route description to Kleve</Title>\n";
466        content += "         <Date></Date>\n";
467        content += "         <descriptions>\n";
468        content += "            <Description LanguageId=\"ISO639-2:eng\">Test.</Description>\n";
469        content += "         </descriptions>\n";
470        content += "         <MDGroup>\n";
471        content += "            <Location>\n";
472        content += "               <Continent>Europe</Continent>\n";
473        content += "               <Country>Netherlands</Country>\n";
474        content += "               <Region/>\n";
475        content += "               <Address>Wundtlaan 1, Nijmegen</Address>\n";
476        content += "            </Location>\n";
477        content += "            <Project>\n";
478        content += "               <Name></Name>\n";
479        content += "               <Title></Title>\n";
480        content += "               <Id/>\n";
481        content += "               <Contact>\n";
482        content += "                  <Name></Name>\n";
483        content += "                  <Address></Address>\n";
484        content += "                  <Email></Email>\n";
485        content += "                  <Organisation></Organisation>\n";
486        content += "               </Contact>\n";
487        content += "               <descriptions>\n";
488        content += "                  <Description LanguageId=\"\"/>\n";
489        content += "               </descriptions>\n";
490        content += "            </Project>\n";
491        content += "            <Keys>\n";
492        content += "            </Keys>\n";
493        content += "            <Content>\n";
494        content += "               <Genre>Demo</Genre>\n";
495        content += "               <SubGenre>Unspecified</SubGenre>\n";
496        content += "               <Task>route description</Task>\n";
497        content += "               <Modalities>Speech; Gestures</Modalities>\n";
498        content += "               <CommunicationContext>\n";
499        content += "               </CommunicationContext>\n";
500        content += "               <Content_Languages>\n";
501        content += "               </Content_Languages>\n";
502        content += "               <descriptions>\n";
503        content += "               </descriptions>\n";
504        content += "            </Content>\n";
505        content += "            <Actors>\n";
506        content += "            </Actors>\n";
507        content += "         </MDGroup>\n";
508        content += "         <Resources>\n";
509        content += "         </Resources>\n";
510        content += "      </Session>\n";
511        content += "   </Components>\n";
512        content += "</CMD>\n";
513        File cmdiFile = createCmdiFile("testSession", content);
514        CMDIDataProcessor processor = getDataParser();
515        CMDIData data = processor.process(cmdiFile);
516        assertEquals("test-hdl_58_1839_47_00-0000-0000-0009-294C-9", data.getId()); //modified handle -> 'clean' id
517        List<Resource> resources = data.getMetadataResources();
518        assertEquals(0, resources.size());
519        SolrInputDocument doc = data.getSolrDocument();
520        assertNotNull(doc);
521        assertEquals(9, doc.getFieldNames().size());
522        assertEquals("test-hdl:1839/00-0000-0000-0009-294C-9", doc.getFieldValue("_selfLink")); //unmodified handle
523        assertEquals("kleve-route", doc.getFieldValue("name"));
524        assertEquals("Europe", doc.getFieldValue("continent"));
525        assertEquals("Netherlands", doc.getFieldValue("country"));
526        assertEquals("demo", doc.getFieldValue("genre"));
527        assertEquals("{lang='und'}Test.", doc.getFieldValue("description"));
528        assertEquals("Should be null not empty string", null, doc.getFieldValue("organisation"));
529        assertEquals(null, doc.getFieldValue("language"));
530        assertEquals(null, doc.getFieldValue("subject"));
531        assertEquals(null, doc.getFieldValue("year"));
532    }
533
534    @Test
535    public void testOlac() throws Exception {
536
537        // make sure the mapping file for testing is used
538        config.setFacetConceptsFile("/facetConceptsTest.xml");
539
540        String content = "";
541        content += "<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n";
542        content += "<CMD xmlns=\"http://www.clarin.eu/cmd/\" xmlns:xsi=\"http://www.w3.org/2001/XMLSchema-instance\"\n";
543        content += "     xmlns:dc=\"http://purl.org/dc/elements/1.1/\"\n";
544        content += "     xmlns:oai_dc=\"http://www.openarchives.org/OAI/2.0/oai_dc/\"\n";
545        content += "     xmlns:defns=\"http://www.openarchives.org/OAI/2.0/\"\n";
546        content += "     xsi:schemaLocation=\"http://www.clarin.eu/cmd http://catalog.clarin.eu/ds/ComponentRegistry/rest/registry/profiles/clarin.eu:cr1:p_1288172614026/xsd\">\n";
547        content += "   <Header>\n";
548        content += "      <MdCreator>olac2cmdi.xsl</MdCreator>\n";
549        content += "      <MdCreationDate>2002-12-14</MdCreationDate>\n";
550        content += "      <MdSelfLink>oai:ailla.utexas.edu:1</MdSelfLink>\n";
551        content += "      <MdProfile>clarin.eu:cr1:p_1288172614026</MdProfile>\n";
552        content += "   </Header>\n";
553        content += "   <Resources>\n";
554        content += "      <ResourceProxyList/>\n";
555        content += "      <JournalFileProxyList/>\n";
556        content += "      <ResourceRelationList/>\n";
557        content += "   </Resources>\n";
558        content += "   <Components>\n";
559        content += "      <OLAC-DcmiTerms>\n";
560        content += "         <creator>Joel Sherzer (recorder)</creator>\n";
561        content += "         <description>\n";
562        content += "    Channel: Talking;\n";
563        content += "    Genre: Traditional Narrative / Story;\n";
564        content += "    Country: Panama;\n";
565        content += "    Place of Recording: Mulatuppu;\n";
566        content += "    Event: Community Gathering;\n";
567        content += "    Institutional Affiliation: University of Texas at Austin;\n";
568        content += "    Participant Information: Political Leader;\n";
569        content += "      </description>\n";
570        content += "         <description>The one-eyed grandmother is one of many traditional Kuna stories performed in the Kuna gathering house. This story, performed here by Pedro Arias, combines European derived motifs (Tom Thumb and Hansel and Gretel) with themes that seem more Kuna in origin. All are woven together and a moral is provided. Pedro Arias performed this story before a gathered audience in the morning..\n";
571        content += "      </description>\n";
572        content += "         <description>Test</description>\n";
573        content += "         <identifier>http://uts.cc.utexas.edu/~ailla/audio/sherzer/one_eyed_grandmother.ram</identifier>\n";
574        content += "         <identifier>http://uts.cc.utexas.edu/~ailla/texts/sherzer/one_eyed_grandmother.pdf</identifier>\n";
575        content += "         <language olac-language=\"x-sil-CHN\"/>\n";
576        content += "         <language>Chinese</language>\n";
577        content += "         <subject olac-linguistic-field=\"testSubject\">Kuna</subject>\n";
578        content += "         <type olac-linguistic-type=\"Transcription\"/>\n";
579        content += "         <format>WAV</format>\n";
580        content += "        <type dcterms-type=\"DCMIType\">Sound</type>\n";
581        content += "      </OLAC-DcmiTerms>\n";
582        content += "   </Components>\n";
583        content += "</CMD>\n";
584
585        File cmdiFile = createCmdiFile("testOlac", content);
586        CMDIDataProcessor processor = getDataParser();
587        CMDIData data = processor.process(cmdiFile);
588        assertEquals("oai_58_ailla.utexas.edu_58_1", data.getId());
589        List<Resource> resources = data.getMetadataResources();
590        assertEquals(0, resources.size());
591        List<Resource> dataResources = data.getDataResources();
592        assertEquals(0, dataResources.size());
593        SolrInputDocument doc = data.getSolrDocument();
594        assertNotNull(doc);
595        assertEquals(11, doc.getFieldNames().size());
596        assertEquals("oai:ailla.utexas.edu:1", doc.getFieldValue("_selfLink"));
597        assertEquals(null, doc.getFieldValue("name"));
598        assertEquals(null, doc.getFieldValue("continent"));
599        assertEquals(1, doc.getFieldValues("language").size());
600        assertEquals("Chinese", doc.getFieldValue("language"));
601        assertEquals(null, doc.getFieldValue("country"));
602        assertEquals(null, doc.getFieldValue("organisation"));
603        assertEquals("transcription", doc.getFieldValue("genre"));
604        assertEquals("kuna", doc.getFieldValue("subject"));
605        Collection<Object> fieldValues = doc.getFieldValues("description");
606        assertEquals(3, fieldValues.size());
607        List<String> descriptions = new ArrayList(fieldValues);
608        Collections.sort(descriptions);
609        assertEquals("{lang='und'}Channel: Talking;\n    Genre: Traditional Narrative / Story;\n    Country: Panama;\n"
610                + "    Place of Recording: Mulatuppu;\n    Event: Community Gathering;\n"
611                + "    Institutional Affiliation: University of Texas at Austin;\n    Participant Information: Political Leader;", descriptions.get(0).toString());
612        assertEquals("{lang='und'}Test", descriptions.get(1).toString());
613        assertEquals("{lang='und'}The one-eyed grandmother is one of many traditional Kuna stories performed "
614                + "in the Kuna gathering house. This story, performed here by Pedro Arias, combines "
615                + "European derived motifs (Tom Thumb and Hansel and Gretel) with themes that seem more "
616                + "Kuna in origin. All are woven together and a moral is provided. Pedro Arias performed "
617                + "this story before a gathered audience in the morning..", descriptions.get(2).toString());
618        assertEquals("Sound", doc.getFieldValue(FacetConstants.FIELD_RESOURCE_CLASS));
619    }
620
621    @Test
622    public void testOlacMultiFacets() throws Exception {
623       
624        // make sure the mapping file for testing is used
625        config.setFacetConceptsFile("/facetConceptsTest.xml");
626
627        String content = "";
628        content += "<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n";
629        content += "<CMD xmlns=\"http://www.clarin.eu/cmd/\">\n";
630        content += "   <Header>\n";
631        content += "      <MdProfile>clarin.eu:cr1:p_1288172614026</MdProfile>\n";
632        content += "   </Header>\n";
633        content += "   <Components>\n";
634        content += "      <OLAC-DcmiTerms>\n";
635        content += "         <subject olac-linguistic-field=\"testSubject\">Kuna</subject>\n";
636        content += "         <subject dcterms-type=\"LCSH\">testSubjectFallback</subject>\n";
637        content += "         <spatial dcterms-type=\"ISO3166\">testCountry1</spatial>\n";
638        content += "         <coverage dcterms-type=\"ISO3166\">testCountry2</coverage>\n";
639        content += "         <language olac-language=\"language1\">test1</language>\n";
640        content += "         <subject olac-language=\"language2\">test2</subject>\n";
641        content += "         <subject olac-language=\"language2\">test2</subject>\n";
642        content += "      </OLAC-DcmiTerms>\n";
643        content += "   </Components>\n";
644        content += "</CMD>\n";
645
646        File cmdiFile = createCmdiFile("testOlac", content);
647        CMDIDataProcessor processor = getDataParser();
648        CMDIData data = processor.process(cmdiFile);
649        SolrInputDocument doc = data.getSolrDocument();
650        assertNull(doc.getFieldValue("_selfLink"));
651        assertEquals(3, doc.getFieldValues(FacetConstants.FIELD_SUBJECT).size());
652        assertTrue(doc.getFieldValues(FacetConstants.FIELD_SUBJECT).contains("kuna"));
653        assertEquals(2, doc.getFieldValues(FacetConstants.FIELD_COUNTRY).size());
654        assertTrue(doc.getFieldValues(FacetConstants.FIELD_COUNTRY).contains("testCountry1"));
655        assertTrue(doc.getFieldValues(FacetConstants.FIELD_COUNTRY).contains("testCountry2"));
656        assertEquals(1, doc.getFieldValues(FacetConstants.FIELD_LANGUAGE).size());
657        assertTrue(doc.getFieldValues(FacetConstants.FIELD_LANGUAGE).contains("test1"));
658
659        content = "";
660        content += "<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n";
661        content += "<CMD xmlns=\"http://www.clarin.eu/cmd/\">\n";
662        content += "   <Header>\n";
663        content += "      <MdProfile>clarin.eu:cr1:p_1288172614026</MdProfile>\n";
664        content += "   </Header>\n";
665        content += "   <Components>\n";
666        content += "      <OLAC-DcmiTerms>\n";
667        content += "         <subject dcterms-type=\"LCSH\">testSubjectFallback</subject>\n";
668        content += "         <coverage dcterms-type=\"ISO3166\">testCountry2</coverage>\n";
669        content += "         <language olac-language=\"language1\">test1</language>\n";
670        content += "         <subject olac-language=\"language2\">test2</subject>\n";
671        content += "      </OLAC-DcmiTerms>\n";
672        content += "   </Components>\n";
673        content += "</CMD>\n";
674
675        cmdiFile = createCmdiFile("testOlac", content);
676        processor = getDataParser();
677        data = processor.process(cmdiFile);
678        doc = data.getSolrDocument();
679        assertEquals(2, doc.getFieldValues(FacetConstants.FIELD_SUBJECT).size());
680        assertEquals("testsubjectfallback", doc.getFieldValue(FacetConstants.FIELD_SUBJECT));
681        assertEquals(1, doc.getFieldValues(FacetConstants.FIELD_COUNTRY).size());
682        assertEquals("testCountry2", doc.getFieldValue(FacetConstants.FIELD_COUNTRY));
683        assertEquals(1, doc.getFieldValues(FacetConstants.FIELD_LANGUAGE).size());
684        assertEquals("test1", doc.getFieldValue(FacetConstants.FIELD_LANGUAGE));
685
686        content = "";
687        content += "<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n";
688        content += "<CMD xmlns=\"http://www.clarin.eu/cmd/\">\n";
689        content += "   <Header>\n";
690        content += "      <MdProfile>clarin.eu:cr1:p_1288172614026</MdProfile>\n";
691        content += "   </Header>\n";
692        content += "   <Components>\n";
693        content += "      <OLAC-DcmiTerms>\n";
694        content += "         <subject dcterms-type=\"LCSH\">testSubjectFallback</subject>\n";
695        content += "         <subject olac-linguistic-field=\"testSubject\">Kuna</subject>\n";
696        content += "         <coverage dcterms-type=\"ISO3166\">testCountry2</coverage>\n";
697        content += "         <spatial dcterms-type=\"ISO3166\">testCountry1</spatial>\n";
698        content += "         <subject olac-language=\"language1\">test2</subject>\n";
699        content += "         <language olac-language=\"language1\">test1</language>\n";
700        content += "      </OLAC-DcmiTerms>\n";
701        content += "   </Components>\n";
702        content += "</CMD>\n";
703
704        cmdiFile = createCmdiFile("testOlac", content);
705        processor = getDataParser();
706        data = processor.process(cmdiFile);
707        doc = data.getSolrDocument();
708        assertEquals(3, doc.getFieldValues("subject").size());
709        assertEquals("testsubjectfallback", doc.getFieldValue("subject"));
710        assertEquals(2, doc.getFieldValues(FacetConstants.FIELD_COUNTRY).size());
711        assertTrue(doc.getFieldValues(FacetConstants.FIELD_COUNTRY).contains("testCountry1"));
712        assertTrue(doc.getFieldValues(FacetConstants.FIELD_COUNTRY).contains("testCountry2"));
713        assertEquals(1, doc.getFieldValues(FacetConstants.FIELD_LANGUAGE).size());
714        assertTrue(doc.getFieldValues(FacetConstants.FIELD_LANGUAGE).contains("test1"));
715    }
716
717    @Test
718    public void testIgnoreWhiteSpaceFacets() throws Exception {
719       
720        // make sure the mapping file for testing is used
721        config.setFacetConceptsFile("/facetConceptsTest.xml");
722
723        String content = "";
724        content += "<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n";
725        content += "<CMD xmlns=\"http://www.clarin.eu/cmd/\">\n";
726        content += "   <Header>\n";
727        content += "      <MdProfile>clarin.eu:cr1:p_1288172614026</MdProfile>\n";
728        content += "   </Header>\n";
729        content += "   <Components>\n";
730        content += "      <OLAC-DcmiTerms>\n";
731        content += "         <subject olac-linguistic-field=\"\n\n\t\t\t\">Kuna</subject>\n";
732        content += "      </OLAC-DcmiTerms>\n";
733        content += "   </Components>\n";
734        content += "</CMD>\n";
735
736        File cmdiFile = createCmdiFile("testOlac", content);
737        CMDIDataProcessor processor = getDataParser();
738        CMDIData data = processor.process(cmdiFile);
739        SolrInputDocument doc = data.getSolrDocument();
740        assertTrue(doc.getFieldValues("subject").contains("kuna"));
741    }
742
743    @Test
744    public void testCountryCodesPostProcessing() throws Exception {
745       
746        // make sure the mapping file for testing is used
747        config.setFacetConceptsFile("/facetConceptsTest.xml");
748
749        String content = "";
750        content += "<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n";
751        content += "<CMD xmlns=\"http://www.clarin.eu/cmd/\">\n";
752        content += "   <Header>\n";
753        content += "      <MdProfile>clarin.eu:cr1:p_1288172614026</MdProfile>\n";
754        content += "   </Header>\n";
755        content += "   <Components>\n";
756        content += "      <OLAC-DcmiTerms>\n";
757        content += "         <coverage dcterms-type=\"ISO3166\">NL</coverage>\n";
758        content += "      </OLAC-DcmiTerms>\n";
759        content += "   </Components>\n";
760        content += "</CMD>\n";
761
762        File cmdiFile = createCmdiFile("testOlac", content);
763        CMDIDataProcessor processor = getDataParser();
764        CMDIData data = processor.process(cmdiFile);
765        SolrInputDocument doc = data.getSolrDocument();
766        assertEquals("Netherlands", doc.getFieldValue(FacetConstants.FIELD_COUNTRY));
767    }
768
769    @Test
770    public void testLanguageCodesPostProcessing() throws Exception {
771       
772        // make sure the mapping file for testing is used
773        config.setFacetConceptsFile("/facetConceptsTest.xml");
774
775        String content = "";
776        content += "<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n";
777        content += "<CMD xmlns=\"http://www.clarin.eu/cmd/\">\n";
778        content += "   <Header>\n";
779        content += "      <MdProfile>clarin.eu:cr1:p_1288172614026</MdProfile>\n";
780        content += "   </Header>\n";
781        content += "   <Components>\n";
782        content += "      <OLAC-DcmiTerms>\n";
783        content += "         <language olac-language=\"fr\"/>\n";
784        content += "         <language olac-language=\"spa\"/>\n";
785        content += "      </OLAC-DcmiTerms>\n";
786        content += "   </Components>\n";
787        content += "</CMD>\n";
788
789        File cmdiFile = createCmdiFile("testOlac", content);
790        CMDIDataProcessor processor = getDataParser();
791        CMDIData data = processor.process(cmdiFile);
792        SolrInputDocument doc = data.getSolrDocument();
793//        Collection<Object> values = doc.getFieldValues(FacetConstants.FIELD_LANGUAGE);
794//        assertEquals(2, values.size());
795//        Iterator<Object> iter = values.iterator();
796//        assertEquals("French", iter.next());
797//        assertEquals("Spanish; Castilian", iter.next());
798    }
799
800    @Test
801    public void testOlacCollection() throws Exception {
802       
803        // make sure the mapping file for testing is used
804        config.setFacetConceptsFile("/facetConceptsTest.xml");
805
806        String content = "";
807        content += "<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n";
808        content += "<CMD xmlns=\"http://www.clarin.eu/cmd/\" xmlns:xsi=\"http://www.w3.org/2001/XMLSchema-instance\"\n";
809        content += "    xsi:schemaLocation=\"http://www.clarin.eu/cmd http://catalog.clarin.eu/ds/ComponentRegistry/rest/registry/profiles/clarin.eu:cr1:p_1288172614026/xsd\">\n";
810        content += "    <Header>\n";
811        content += "        <MdCreator>dir2cmdicollection.py</MdCreator>\n";
812        content += "        <MdCreationDate>2010-10-11</MdCreationDate>\n";
813        content += "        <MdSelfLink>collection_ATILF_Resources.cmdi</MdSelfLink>\n";
814        content += "        <MdProfile>clarin.eu:cr1:p_1288172614026</MdProfile>\n";
815        content += "    </Header>\n";
816        content += "    <Resources>\n";
817        content += "        <ResourceProxyList>\n";
818        content += "<ResourceProxy id=\"ATILF_Resources_0_oai_atilf_inalf_fr_0001.xml.cmdi\"><ResourceType>Metadata</ResourceType><ResourceRef>ATILF_Resources/0/oai_atilf_inalf_fr_0001.xml.cmdi</ResourceRef></ResourceProxy>\n";
819        content += "<ResourceProxy id=\"ATILF_Resources_0_oai_atilf_inalf_fr_0002.xml.cmdi\"><ResourceType>Metadata</ResourceType><ResourceRef>ATILF_Resources/0/oai_atilf_inalf_fr_0002.xml.cmdi</ResourceRef></ResourceProxy>\n";
820        content += "<ResourceProxy id=\"ATILF_Resources_0_oai_atilf_inalf_fr_0003.xml.cmdi\"><ResourceType>Metadata</ResourceType><ResourceRef>ATILF_Resources/0/oai_atilf_inalf_fr_0003.xml.cmdi</ResourceRef></ResourceProxy>\n";
821        content += "<ResourceProxy id=\"ATILF_Resources_0_oai_atilf_inalf_fr_0004.xml.cmdi\"><ResourceType>Metadata</ResourceType><ResourceRef>ATILF_Resources/0/oai_atilf_inalf_fr_0004.xml.cmdi</ResourceRef></ResourceProxy>\n";
822        content += "<ResourceProxy id=\"ATILF_Resources_0_oai_atilf_inalf_fr_0005_a.xml.cmdi\"><ResourceType>Metadata</ResourceType><ResourceRef>ATILF_Resources/0/oai_atilf_inalf_fr_0005_a.xml.cmdi</ResourceRef></ResourceProxy>\n";
823        content += "<ResourceProxy id=\"ATILF_Resources_0_oai_atilf_inalf_fr_0005_b.xml.cmdi\"><ResourceType>Metadata</ResourceType><ResourceRef>ATILF_Resources/0/oai_atilf_inalf_fr_0005_b.xml.cmdi</ResourceRef></ResourceProxy>\n";
824        content += "<ResourceProxy id=\"ATILF_Resources_0_oai_atilf_inalf_fr_0006.xml.cmdi\"><ResourceType>Metadata</ResourceType><ResourceRef>ATILF_Resources/0/oai_atilf_inalf_fr_0006.xml.cmdi</ResourceRef></ResourceProxy>\n";
825        content += "<ResourceProxy id=\"ATILF_Resources_0_oai_atilf_inalf_fr_M277.xml.cmdi\"><ResourceType>Metadata</ResourceType><ResourceRef>ATILF_Resources/0/oai_atilf_inalf_fr_M277.xml.cmdi</ResourceRef></ResourceProxy>\n";
826        content += "<ResourceProxy id=\"ATILF_Resources_0_oai_atilf_inalf_fr_M592.xml.cmdi\"><ResourceType>Metadata</ResourceType><ResourceRef>ATILF_Resources/0/oai_atilf_inalf_fr_M592.xml.cmdi</ResourceRef></ResourceProxy>\n";
827        content += "        </ResourceProxyList>\n";
828        content += "        <JournalFileProxyList/>\n";
829        content += "        <ResourceRelationList/>\n";
830        content += "    </Resources>\n";
831        content += "    <Components>\n";
832        content += "        <olac></olac>\n";
833        content += "    </Components>\n";
834        content += "</CMD>\n";
835
836        File cmdiFile = createCmdiFile("testOlac", content);
837        CMDIDataProcessor processor = getDataParser();
838        CMDIData data = processor.process(cmdiFile);
839        assertEquals("collection_ATILF_Resources.cmdi", data.getId());
840        assertEquals("collection_ATILF_Resources.cmdi", data.getSolrDocument().getFieldValue("_selfLink"));
841        List<Resource> resources = data.getMetadataResources();
842        assertEquals(9, resources.size());
843        Resource res = resources.get(0);
844        assertEquals("ATILF_Resources/0/oai_atilf_inalf_fr_0001.xml.cmdi", res.getResourceName());
845        assertEquals(null, res.getMimeType());
846        assertEquals(0, data.getDataResources().size());
847        SolrInputDocument doc = data.getSolrDocument();
848        assertNotNull(doc);
849        List<Resource> dataResources = data.getDataResources();
850        assertEquals(0, dataResources.size());
851    }
852
853    @Test
854    public void testLrtCollection() throws Exception {
855       
856        // make sure the mapping file for testing is used
857        config.setFacetConceptsFile("/facetConceptsTest.xml");
858
859        String content = "";
860        content += "<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n";
861        content += "<CMD xmlns=\"http://www.clarin.eu/cmd/\" ns0:schemaLocation=\"http://www.clarin.eu/cmd http://catalog.clarin.eu/ds/ComponentRegistry/rest/registry/profiles/clarin.eu:cr1:p_1289827960126/xsd\" xmlns:ns0=\"http://www.w3.org/2001/XMLSchema-instance\">\n";
862        content += "    <Header>\n";
863        content += "        <MdCreator>lrt2cmdi.py</MdCreator>\n";
864        content += "        <MdCreationDate>2010-11-17</MdCreationDate>\n";
865        content += "        <MdSelfLink>clarin.eu:lrt:433</MdSelfLink>\n";
866        content += "        <MdProfile>clarin.eu:cr1:p_1289827960126</MdProfile>\n";
867        content += "    </Header>\n";
868        content += "    <Resources>\n";
869        content += "        <ResourceProxyList />\n";
870        content += "        <JournalFileProxyList />\n";
871        content += "        <ResourceRelationList />\n";
872        content += "    </Resources>\n";
873        content += "    <Components>\n";
874        content += "        <LrtInventoryResource>\n";
875        content += "            <LrtCommon>\n";
876        content += "                <ResourceName>Corpus of Present-day Written Estonian</ResourceName>\n";
877        content += "                <ResourceType>Written Corpus</ResourceType>\n";
878        content += "                <LanguagesOther />\n";
879        content += "                <Description xml:lang='en'>written general; 95 mio words; TEI/SGML</Description>\n";
880        content += "                <ContactPerson>Kadri.Muischnek@ut.ee</ContactPerson>\n";
881        content += "                <Format />\n";
882        content += "                <Institute>Test</Institute>\n";
883        content += "                <MetadataLink />\n";
884        content += "                <Publications />\n";
885        content += "                <ReadilyAvailable>true</ReadilyAvailable>\n";
886        content += "                <ReferenceLink />         \n";
887        content += "                <Languages><ISO639><iso-639-3-code>est</iso-639-3-code></ISO639></Languages>\n";
888        content += "                <Countries><Country><Code>EE</Code></Country></Countries>\n";
889        content += "            </LrtCommon>\n";
890        content += "       </LrtInventoryResource>\n";
891        content += "    </Components>\n";
892        content += "</CMD>\n";
893
894        File cmdiFile = createCmdiFile("testOlac", content);
895        CMDIDataProcessor processor = getDataParser();
896        CMDIData data = processor.process(cmdiFile);
897        assertEquals("clarin.eu_58_lrt_58_433", data.getId());
898        List<Resource> resources = data.getMetadataResources();
899        assertEquals(0, resources.size());
900        List<Resource> dataResources = data.getDataResources();
901        assertEquals(0, dataResources.size());
902        SolrInputDocument doc = data.getSolrDocument();
903        assertNotNull(doc);
904        assertEquals(11, doc.getFieldNames().size());
905        assertEquals("clarin.eu:lrt:433", doc.getFieldValue("_selfLink"));
906        assertEquals("Corpus of Present-day Written Estonian", doc.getFieldValue("name"));
907        assertEquals(null, doc.getFieldValue("continent"));
908        assertEquals(1, doc.getFieldValues("language").size());
909        assertEquals("Estonian", doc.getFieldValue("language"));
910        assertEquals("Estonia", doc.getFieldValue("country"));
911        assertEquals("Test", doc.getFieldValue("organisation"));
912        assertEquals(null, doc.getFieldValue("year"));
913        assertEquals(null, doc.getFieldValue("genre"));
914        assertEquals("{lang='eng'}written general; 95 mio words; TEI/SGML", doc.getFieldValue("description"));
915        assertEquals("Written Corpus", doc.getFieldValue(FacetConstants.FIELD_RESOURCE_CLASS));
916    }
917}
Note: See TracBrowser for help on using the repository browser.