source: vlo/branches/vlo-3.3-oeaw/vlo-importer/src/test/java/eu/clarin/cmdi/vlo/importer/CMDIDataProcessorTest.java @ 6760

Last change on this file since 6760 was 6760, checked in by davor.ostojic@oeaw.ac.at, 9 years ago

merged with trunk 3.4

File size: 54.6 KB
Line 
1package eu.clarin.cmdi.vlo.importer;
2
3import eu.clarin.cmdi.vlo.FacetConstants;
4import java.io.File;
5import java.net.URL;
6import java.util.ArrayList;
7import java.util.Collection;
8import java.util.Collections;
9import java.util.List;
10import org.apache.solr.common.SolrInputDocument;
11import static org.junit.Assert.assertEquals;
12import static org.junit.Assert.assertNotNull;
13import static org.junit.Assert.assertNull;
14import static org.junit.Assert.assertTrue;
15import org.junit.Before;
16import org.junit.Test;
17
18public class CMDIDataProcessorTest extends ImporterTestcase {
19
20    private CMDIDataProcessor getDataParser() {
21        return new CMDIParserVTDXML(MetadataImporter.POST_PROCESSORS, true);
22    }
23
24    @Before
25    @Override
26    public void setup() throws Exception {
27        super.setup();
28        // make sure the mapping file for testing is used
29        config.setFacetConceptsFile(getTestFacetConceptFilePath());
30    }
31
32    @Test
33    public void testCreateCMDIDataFromCorpus() throws Exception {
34        String content = "";
35        content += "<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n";
36        content += "<CMD xmlns=\"http://www.clarin.eu/cmd/\" xmlns:xsi=\"http://www.w3.org/2001/XMLSchema-instance\">\n";
37        content += "   <Header>\n";
38        content += "      <MdCreationDate>2003-01-14</MdCreationDate>\n";
39        content += "      <MdSelfLink>test-hdl:1839/00-0000-0000-0000-0001-D</MdSelfLink>\n";
40        content += "      <MdProfile>clarin.eu:cr1:p_1274880881885</MdProfile>\n";
41        content += "   </Header>\n";
42        content += "   <Resources>\n";
43        content += "      <ResourceProxyList>\n";
44        content += "         <ResourceProxy id=\"d28635e19\">\n";
45        content += "            <ResourceType>Metadata</ResourceType>\n";
46        content += "            <ResourceRef>../acqui_data/Corpusstructure/acqui.imdi.cmdi</ResourceRef>\n";
47        content += "         </ResourceProxy>\n";
48        content += "         <ResourceProxy id=\"d28635e23\">\n";
49        content += "            <ResourceType>Metadata</ResourceType>\n";
50        content += "            <ResourceRef>../Comprehension/Corpusstructure/comprehension.imdi.cmdi</ResourceRef>\n";
51        content += "         </ResourceProxy>\n";
52        content += "         <ResourceProxy id=\"d28635e26\">\n";
53        content += "            <ResourceType>Metadata</ResourceType>\n";
54        content += "            <ResourceRef>../lac_data/Corpusstructure/lac.imdi.cmdi</ResourceRef>\n";
55        content += "         </ResourceProxy>\n";
56        content += "      </ResourceProxyList>\n";
57        content += "      <JournalFileProxyList/>\n";
58        content += "      <ResourceRelationList/>\n";
59        content += "   </Resources>\n";
60        content += "   <Components>\n";
61        content += "      <imdi-corpus>\n";
62        content += "         <Corpus>\n";
63        content += "            <Name>MPI corpora</Name>\n";
64        content += "            <Title>Corpora of the Max-Planck Institute for Psycholinguistics</Title>\n";
65        content += "            <CorpusLink Name=\"Acquisition\">../acqui_data/Corpusstructure/acqui.imdi</CorpusLink>\n";
66        content += "            <CorpusLink Name=\"Comprehension\">../Comprehension/Corpusstructure/comprehension.imdi</CorpusLink>\n";
67        content += "            <CorpusLink Name=\"Language and Cognition\">../lac_data/Corpusstructure/lac.imdi</CorpusLink>\n";
68        content += "            <descriptions>\n";
69        content += "               <Description LanguageId=\"\">IMDI corpora</Description>\n";
70        content += "               <Description LanguageId=\"\"/>\n";
71        content += "            </descriptions>\n";
72        content += "         </Corpus>\n";
73        content += "      </imdi-corpus>\n";
74        content += "   </Components>\n";
75        content += "</CMD>\n";
76        File cmdiFile = createCmdiFile("testCorpus", content);
77        CMDIDataProcessor processor = getDataParser();
78        CMDIData data = processor.process(cmdiFile);
79        assertEquals("test-hdl_58_1839_47_00-0000-0000-0000-0001-D", data.getId());
80        List<Resource> resources = data.getMetadataResources();
81        assertEquals(3, resources.size());
82        Resource res = resources.get(0);
83        assertEquals("../acqui_data/Corpusstructure/acqui.imdi.cmdi", res.getResourceName());
84        assertEquals(null, res.getMimeType());
85        assertEquals(0, data.getDataResources().size());
86        SolrInputDocument doc = data.getSolrDocument();
87        // TODO FIX bad test case. Depends on the presence of an internet connection! (BAD!)
88        assertTrue(doc.getFieldValues(FacetConstants.FIELD_CLARIN_PROFILE).contains("imdi-corpus"));
89        assertNotNull(doc);
90    }
91
92    @Test
93    public void testCreateCMDIDataFromSession() throws Exception {
94        String content = "";
95        content += "<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n";
96        content += "<CMD xmlns=\"http://www.clarin.eu/cmd/\" xmlns:xsi=\"http://www.w3.org/2001/XMLSchema-instance\"\n";
97        content += "     xsi:schemaLocation=\"http://www.clarin.eu/cmd http://catalog.clarin.eu/ds/ComponentRegistry/rest/registry/profiles/clarin.eu:cr1:p_1271859438204/xsd\">\n";
98        content += "   <Header>\n";
99        content += "      <MdCreationDate>2008-05-27</MdCreationDate>\n";
100        content += "      <MdSelfLink>test-hdl:1839/00-0000-0000-0009-294C-9</MdSelfLink>\n";
101        content += "      <MdProfile>clarin.eu:cr1:p_1271859438204</MdProfile>\n";
102        content += "   </Header>\n";
103        content += "   <Resources>\n";
104        content += "      <ResourceProxyList>\n";
105        content += "         <ResourceProxy id=\"d314e408\">\n";
106        content += "            <ResourceType mimetype=\"video/x-mpeg1\" >Resource</ResourceType>\n";
107        content += "            <ResourceRef>../Media/elan-example1.mpg</ResourceRef>\n";
108        content += "         </ResourceProxy>\n";
109        content += "         <ResourceProxy id=\"d314e471\">\n";
110        content += "            <ResourceType mimetype=\"audio/mpeg\" >Resource</ResourceType>\n";
111        content += "            <ResourceRef>../Media/elan-example1.mp3</ResourceRef>\n";
112        content += "         </ResourceProxy>\n";
113        content += "      </ResourceProxyList>\n";
114        content += "      <JournalFileProxyList/>\n";
115        content += "      <ResourceRelationList/>\n";
116        content += "   </Resources>\n";
117        content += "   <Components>\n";
118        content += "      <Session>\n";
119        content += "         <Name>kleve-route</Name>\n";
120        content += "         <Title>route description to Kleve</Title>\n";
121        content += "         <Date>2002-10-30</Date>\n";
122        content += "         <descriptions>\n";
123        content += "            <Description xml:lang='eng' LanguageId=\"ISO639-2:eng\">This  recording was made to generate a freely available test resource including speech and gestures. The annotations were created by Peter and Kita who is gesture researcher at the MPI for Psycholinguistics.</Description>\n";
124        content += "            <Description LanguageId=\"ISO639-2:ger\">Diese Aufnahme wurde erzeugt, um eine frei verf\\u00fcgbare Test Resource zur Verf\\u00fcgung stellen zu k\\u00f6nnen, die Sprache und Gestik umfasst. Die Annotationen wurden von Peter und Kita, dem Gestik Researcher am MPI erzeugt.</Description>\n";
125        content += "         </descriptions>\n";
126        content += "         <MDGroup>\n";
127        content += "            <Location>\n";
128        content += "               <Continent>Europe</Continent>\n";
129        content += "               <Country>Netherlands</Country>\n";
130        content += "               <Region/>\n";
131        content += "               <Address>Wundtlaan 1, Nijmegen</Address>\n";
132        content += "            </Location>\n";
133        content += "            <Project>\n";
134        content += "               <Name>Peter Wittenburg</Name>\n";
135        content += "               <Title>Route description test resource</Title>\n";
136        content += "               <Id/>\n";
137        content += "               <Contact>\n";
138        content += "                  <Name>Peter Wittenburg</Name>\n";
139        content += "                  <Address>Wundtlaan 1, 6525 XD Nijmegen</Address>\n";
140        content += "                  <Email>peter.wittenburg@mpi.nl</Email>\n";
141        content += "                  <Organisation>Max Planck Institute for Psycholinguistics</Organisation>\n";
142        content += "               </Contact>\n";
143        content += "               <descriptions>\n";
144        content += "                  <Description LanguageId=\"\"/>\n";
145        content += "               </descriptions>\n";
146        content += "            </Project>\n";
147        content += "            <Keys>\n";
148        content += "               <Key Name=\"conversion.IMDI.1.9to3.0.warning\">Unknown mapping of Genre: conversation|explanation|unspecified --&gt; ???</Key>\n";
149        content += "            </Keys>\n";
150        content += "            <Content>\n";
151        content += "               <Genre>Demo</Genre>\n";
152        content += "               <SubGenre>Unspecified</SubGenre>\n";
153        content += "               <Task>route description</Task>\n";
154        content += "               <Modalities>Speech; Gestures</Modalities>\n";
155        content += "               <CommunicationContext>\n";
156        content += "                  <Interactivity>interactive</Interactivity>\n";
157        content += "                  <PlanningType>semi-spontaneous</PlanningType>\n";
158        content += "                  <Involvement>elicited</Involvement>\n";
159        content += "                  <SocialContext>Unspecified</SocialContext>\n";
160        content += "                  <EventStructure>Unspecified</EventStructure>\n";
161        content += "                  <Channel>Unspecified</Channel>\n";
162        content += "               </CommunicationContext>\n";
163        content += "               <Content_Languages>\n";
164        content += "                  <descriptions>\n";
165        content += "                     <Description LanguageId=\"\"/>\n";
166        content += "                  </descriptions>\n";
167        content += "                  <Content_Language>\n";
168        content += "                     <Id>ISO639-3:eng</Id>\n";
169        content += "                     <Name>English</Name>\n";
170        content += "                     <descriptions>\n";
171        content += "                        <Description LanguageId=\"\"/>\n";
172        content += "                     </descriptions>\n";
173        content += "                  </Content_Language>\n";
174        content += "               </Content_Languages>\n";
175        content += "               <Keys>\n";
176        content += "                  <Key Name=\"IMDI__1_9.Interactional\">conversation</Key>\n";
177        content += "                  <Key Name=\"IMDI__1_9.Discursive\">explanation</Key>\n";
178        content += "                  <Key Name=\"IMDI__1_9.Interactional\">Unspecified</Key>\n";
179        content += "               </Keys>\n";
180        content += "               <descriptions>\n";
181        content += "                  <Description LanguageId=\"ISO639:eng\">This file was generated from an IMDI 1.9 file and transformed to IMDI 3.0. The substructure of Genre is replaced by two elements named \"Genre\" and \"SubGenre\". The original content of Genre substructure was: Interactional = 'conversation', Discursive = 'explanation', Performance = 'Unspecified'. These values have been added as Keys to the Content information.</Description>\n";
182        content += "                  <Description LanguageId=\"ISO639:eng\">Peter explains how to come from Nijmegen to Kleve by car, such that Kita would be able to get there.</Description>\n";
183        content += "               </descriptions>\n";
184        content += "            </Content>\n";
185        content += "            <Actors>\n";
186        content += "               <descriptions>\n";
187        content += "                  <Description LanguageId=\"\"/>\n";
188        content += "               </descriptions>\n";
189        content += "               <Actor>\n";
190        content += "                  <Role>interviewee</Role>\n";
191        content += "                  <Name>Peter</Name>\n";
192        content += "                  <FullName>Peter Wittenburg</FullName>\n";
193        content += "                  <Code>W</Code>\n";
194        content += "                  <FamilySocialRole>Unspecified</FamilySocialRole>\n";
195        content += "                  <EthnicGroup/>\n";
196        content += "                  <Age>Unknown</Age>\n";
197        content += "                  <BirthDate>Unspecified</BirthDate>\n";
198        content += "                  <Sex>Unknown</Sex>\n";
199        content += "                  <Education>university</Education>\n";
200        content += "                  <Anonymized>true</Anonymized>\n";
201        content += "                  <Contact>\n";
202        content += "                     <Name/>\n";
203        content += "                     <Address/>\n";
204        content += "                     <Email/>\n";
205        content += "                     <Organisation/>\n";
206        content += "                  </Contact>\n";
207        content += "                  <Keys/>\n";
208        content += "                  <descriptions>\n";
209        content += "                     <Description LanguageId=\"\"/>\n";
210        content += "                  </descriptions>\n";
211        content += "                  <Actor_Languages>\n";
212        content += "                     <descriptions>\n";
213        content += "                        <Description LanguageId=\"\"/>\n";
214        content += "                     </descriptions>\n";
215        content += "                     <Actor_Language>\n";
216        content += "                        <Id>ISO639-3:nld</Id>\n";
217        content += "                        <Name>Dutch</Name>\n";
218        content += "                        <descriptions>\n";
219        content += "                           <Description LanguageId=\"\"/>\n";
220        content += "                        </descriptions>\n";
221        content += "                     </Actor_Language>\n";
222        content += "                     <Actor_Language>\n";
223        content += "                        <Id>ISO639-3:deu</Id>\n";
224        content += "                        <Name>German</Name>\n";
225        content += "                        <descriptions>\n";
226        content += "                           <Description LanguageId=\"\"/>\n";
227        content += "                        </descriptions>\n";
228        content += "                     </Actor_Language>\n";
229        content += "                     <Actor_Language>\n";
230        content += "                        <Id>ISO639-3:eng</Id>\n";
231        content += "                        <Name>English</Name>\n";
232        content += "                        <descriptions>\n";
233        content += "                           <Description LanguageId=\"\"/>\n";
234        content += "                        </descriptions>\n";
235        content += "                     </Actor_Language>\n";
236        content += "                  </Actor_Languages>\n";
237        content += "               </Actor>\n";
238        content += "               <Actor>\n";
239        content += "                  <Role>interviewer</Role>\n";
240        content += "                  <Name>Kita</Name>\n";
241        content += "                  <FullName>Sotaro Kita</FullName>\n";
242        content += "                  <Code>k</Code>\n";
243        content += "                  <FamilySocialRole>Unspecified</FamilySocialRole>\n";
244        content += "                  <EthnicGroup/>\n";
245        content += "                  <Age>Unknown</Age>\n";
246        content += "                  <BirthDate>Unspecified</BirthDate>\n";
247        content += "                  <Sex>Unknown</Sex>\n";
248        content += "                  <Education>university</Education>\n";
249        content += "                  <Anonymized>true</Anonymized>\n";
250        content += "                  <Contact>\n";
251        content += "                     <Name/>\n";
252        content += "                     <Address/>\n";
253        content += "                     <Email/>\n";
254        content += "                     <Organisation/>\n";
255        content += "                  </Contact>\n";
256        content += "                  <Keys/>\n";
257        content += "                  <descriptions>\n";
258        content += "                     <Description LanguageId=\"\"/>\n";
259        content += "                  </descriptions>\n";
260        content += "                  <Actor_Languages>\n";
261        content += "                     <descriptions>\n";
262        content += "                        <Description LanguageId=\"\"/>\n";
263        content += "                     </descriptions>\n";
264        content += "                     <Actor_Language>\n";
265        content += "                        <Id>ISO639-3:eng</Id>\n";
266        content += "                        <Name>English</Name>\n";
267        content += "                        <descriptions>\n";
268        content += "                           <Description LanguageId=\"\"/>\n";
269        content += "                        </descriptions>\n";
270        content += "                     </Actor_Language>\n";
271        content += "                     <Actor_Language>\n";
272        content += "                        <Id>ISO639-3:jpn</Id>\n";
273        content += "                        <Name>Japanese</Name>\n";
274        content += "                        <descriptions>\n";
275        content += "                           <Description LanguageId=\"\"/>\n";
276        content += "                        </descriptions>\n";
277        content += "                     </Actor_Language>\n";
278        content += "                  </Actor_Languages>\n";
279        content += "               </Actor>\n";
280        content += "               <Actor>\n";
281        content += "                  <Role>Collector</Role>\n";
282        content += "                  <Name>Peter Wittenburg</Name>\n";
283        content += "                  <FullName>Peter Wittenburg</FullName>\n";
284        content += "                  <Code>Unspecified</Code>\n";
285        content += "                  <FamilySocialRole>Unspecified</FamilySocialRole>\n";
286        content += "                  <EthnicGroup/>\n";
287        content += "                  <Age>Unspecified</Age>\n";
288        content += "                  <BirthDate>Unspecified</BirthDate>\n";
289        content += "                  <Sex>Unspecified</Sex>\n";
290        content += "                  <Education/>\n";
291        content += "                  <Anonymized>false</Anonymized>\n";
292        content += "                  <Contact>\n";
293        content += "                     <Name>Peter Wittenburg</Name>\n";
294        content += "                     <Address>Wundtlaan 1, 6525 XD Nijmegen</Address>\n";
295        content += "                     <Email>peter.wittenburg@mpi.nl</Email>\n";
296        content += "                     <Organisation>Max-Planck-Institute for Psycholinguistics</Organisation>\n";
297        content += "                  </Contact>\n";
298        content += "                  <Keys/>\n";
299        content += "                  <descriptions>\n";
300        content += "                     <Description LanguageId=\"\"/>\n";
301        content += "                  </descriptions>\n";
302        content += "                  <Actor_Languages/>\n";
303        content += "               </Actor>\n";
304        content += "            </Actors>\n";
305        content += "         </MDGroup>\n";
306        content += "         <Resources>\n";
307        content += "            <MediaFile ref=\"d314e408\">\n";
308        content += "               <ResourceLink>../Media/elan-example1.mpg</ResourceLink>\n";
309        content += "               <Type>video</Type>\n";
310        content += "               <Format>video/x-mpeg1</Format>\n";
311        content += "               <Size/>\n";
312        content += "               <Quality>Unknown</Quality>\n";
313        content += "               <RecordingConditions>excellent</RecordingConditions>\n";
314        content += "               <TimePosition>\n";
315        content += "                  <Start>Unknown</Start>\n";
316        content += "                  <End>Unknown</End>\n";
317        content += "               </TimePosition>\n";
318        content += "               <Access>\n";
319        content += "                  <Availability>openly available</Availability>\n";
320        content += "                  <Date>2003-02-12</Date>\n";
321        content += "                  <Owner>MPI for Psycholinguistics</Owner>\n";
322        content += "                  <Publisher/>\n";
323        content += "                  <Contact>\n";
324        content += "                     <Name>Romuald Skiba</Name>\n";
325        content += "                     <Address/>\n";
326        content += "                     <Email/>\n";
327        content += "                     <Organisation/>\n";
328        content += "                  </Contact>\n";
329        content += "                  <descriptions>\n";
330        content += "                     <Description LanguageId=\"\"/>\n";
331        content += "                  </descriptions>\n";
332        content += "               </Access>\n";
333        content += "               <descriptions>\n";
334        content += "                  <Description LanguageId=\"\"/>\n";
335        content += "               </descriptions>\n";
336        content += "               <Keys/>\n";
337        content += "            </MediaFile>\n";
338        content += "            <MediaFile ref=\"d314e471\">\n";
339        content += "               <ResourceLink>../Media/elan-example1.mp4</ResourceLink>\n";
340        content += "               <Type>video</Type>\n";
341        content += "               <Format>video/mp4</Format>\n";
342        content += "               <Size/>\n";
343        content += "               <Quality>Unknown</Quality>\n";
344        content += "               <RecordingConditions>excellent</RecordingConditions>\n";
345        content += "               <TimePosition>\n";
346        content += "                  <Start>Unknown</Start>\n";
347        content += "                  <End>Unknown</End>\n";
348        content += "               </TimePosition>\n";
349        content += "               <Access>\n";
350        content += "                  <Availability>openly available</Availability>\n";
351        content += "                  <Date>2003-02-12</Date>\n";
352        content += "                  <Owner>MPI for Psycholinguistics</Owner>\n";
353        content += "                  <Publisher/>\n";
354        content += "                  <Contact>\n";
355        content += "                     <Name>Romuald Skiba</Name>\n";
356        content += "                     <Address/>\n";
357        content += "                     <Email/>\n";
358        content += "                     <Organisation/>\n";
359        content += "                  </Contact>\n";
360        content += "                  <descriptions>\n";
361        content += "                     <Description LanguageId=\"\"/>\n";
362        content += "                  </descriptions>\n";
363        content += "               </Access>\n";
364        content += "               <descriptions>\n";
365        content += "                  <Description LanguageId=\"\"/>\n";
366        content += "               </descriptions>\n";
367        content += "               <Keys/>\n";
368        content += "            </MediaFile>\n";
369        content += "         </Resources>\n";
370        content += "         <References>\n";
371        content += "            <descriptions>\n";
372        content += "               <Description LanguageId=\"\"/>\n";
373        content += "            </descriptions>\n";
374        content += "         </References>\n";
375        content += "      </Session>\n";
376        content += "   </Components>\n";
377        content += "</CMD>\n";
378        File cmdiFile = createCmdiFile("testSession", content);
379        CMDIDataProcessor processor = getDataParser();
380        CMDIData data = processor.process(cmdiFile);
381        assertEquals("test-hdl_58_1839_47_00-0000-0000-0009-294C-9", data.getId());
382        List<Resource> resources = data.getMetadataResources();
383        assertEquals(0, resources.size());
384        List<Resource> dataResources = data.getDataResources();
385        assertEquals(2, dataResources.size());
386        Resource res = dataResources.get(0);
387        assertEquals("../Media/elan-example1.mpg", res.getResourceName());
388        assertEquals("video/x-mpeg1", res.getMimeType());
389        res = dataResources.get(1);
390        assertEquals("../Media/elan-example1.mp3", res.getResourceName());
391        assertEquals("audio/mpeg", res.getMimeType());
392        SolrInputDocument doc = data.getSolrDocument();
393        assertNotNull(doc);
394        assertEquals(14, doc.getFieldNames().size());
395        assertEquals("test-hdl:1839/00-0000-0000-0009-294C-9", doc.getFieldValue("_selfLink"));
396        assertEquals("kleve-route", doc.getFieldValue("name"));
397        assertEquals("Peter Wittenburg", doc.getFieldValue(FacetConstants.FIELD_PROJECT_NAME));
398        assertEquals("Europe", doc.getFieldValue("continent"));
399        assertEquals("code:eng", doc.getFieldValue("languageCode"));
400        assertEquals("Netherlands", doc.getFieldValue("country"));
401        assertEquals("Max Planck Institute for Psycholinguistics", doc.getFieldValue("organisation"));
402        assertEquals("demo", doc.getFieldValue("genre"));
403        assertEquals(
404                "{code:eng}This  recording was made to generate a freely available test resource including speech and gestures. The annotations were created by Peter and Kita who is gesture researcher at the MPI for Psycholinguistics.",
405                doc.getFieldValue("description"));
406        assertEquals("2002-10-30", doc.getFieldValue("temporalCoverage"));
407        List<String> fieldValues = new ArrayList(doc.getFieldValues(FacetConstants.FIELD_FORMAT));
408        assertEquals(2, fieldValues.size());
409        assertEquals("video/x-mpeg1", fieldValues.get(0));
410        assertEquals("video/mp4", fieldValues.get(1));
411        assertEquals(null, doc.getFieldValue("subject"));
412    }
413
414    @Test
415    public void testCreateCMDISessionSmall() throws Exception {
416        String content = "";
417        content += "<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n";
418        content += "<CMD xmlns=\"http://www.clarin.eu/cmd/\" xmlns:xsi=\"http://www.w3.org/2001/XMLSchema-instance\"\n";
419        content += "     xsi:schemaLocation=\"http://www.clarin.eu/cmd http://catalog.clarin.eu/ds/ComponentRegistry/rest/registry/profiles/clarin.eu:cr1:p_1271859438204/xsd\">\n";
420        content += "   <Header>\n";
421        content += "      <MdCreationDate>2008-05-27</MdCreationDate>\n";
422        content += "      <MdSelfLink>test-hdl:1839/00-0000-0000-0009-294C-9</MdSelfLink>\n";
423        content += "      <MdProfile>clarin.eu:cr1:p_1271859438204</MdProfile>\n";
424        content += "   </Header>\n";
425        content += "   <Resources>\n";
426        content += "    </Resources>\n";
427        content += "   <Components>\n";
428        content += "      <Session>\n";
429        content += "         <Name>kleve-route</Name>\n";
430        content += "      </Session>\n";
431        content += "   </Components>\n";
432        content += "</CMD>\n";
433        File cmdiFile = createCmdiFile("testSession", content);
434        CMDIDataProcessor processor = getDataParser();
435        CMDIData data = processor.process(cmdiFile);
436        assertEquals("kleve-route", data.getSolrDocument().getFieldValue(FacetConstants.FIELD_NAME));
437    }
438
439    @Test
440    public void testEmptyFieldsShouldBeNull() throws Exception {
441        String content = "";
442        content += "<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n";
443        content += "<CMD xmlns=\"http://www.clarin.eu/cmd/\" xmlns:xsi=\"http://www.w3.org/2001/XMLSchema-instance\"\n";
444        content += "     xsi:schemaLocation=\"http://www.clarin.eu/cmd http://catalog.clarin.eu/ds/ComponentRegistry/rest/registry/profiles/clarin.eu:cr1:p_1271859438204/xsd\">\n";
445        content += "   <Header>\n";
446        content += "      <MdCreationDate>2008-05-27</MdCreationDate>\n";
447        content += "      <MdSelfLink>test-hdl:1839/00-0000-0000-0009-294C-9</MdSelfLink>\n";
448        content += "      <MdProfile>clarin.eu:cr1:p_1271859438204</MdProfile>\n";
449        content += "   </Header>\n";
450        content += "   <Resources>\n";
451        content += "      <ResourceProxyList>\n";
452        content += "      </ResourceProxyList>\n";
453        content += "      <JournalFileProxyList/>\n";
454        content += "      <ResourceRelationList/>\n";
455        content += "   </Resources>\n";
456        content += "   <Components>\n";
457        content += "      <Session>\n";
458        content += "         <Name>kleve-route</Name>\n";
459        content += "         <Title>route description to Kleve</Title>\n";
460        content += "         <Date></Date>\n";
461        content += "         <descriptions>\n";
462        content += "            <Description LanguageId=\"ISO639-2:eng\">Test.</Description>\n";
463        content += "         </descriptions>\n";
464        content += "         <MDGroup>\n";
465        content += "            <Location>\n";
466        content += "               <Continent>Europe</Continent>\n";
467        content += "               <Country>Netherlands</Country>\n";
468        content += "               <Region/>\n";
469        content += "               <Address>Wundtlaan 1, Nijmegen</Address>\n";
470        content += "            </Location>\n";
471        content += "            <Project>\n";
472        content += "               <Name></Name>\n";
473        content += "               <Title></Title>\n";
474        content += "               <Id/>\n";
475        content += "               <Contact>\n";
476        content += "                  <Name></Name>\n";
477        content += "                  <Address></Address>\n";
478        content += "                  <Email></Email>\n";
479        content += "                  <Organisation></Organisation>\n";
480        content += "               </Contact>\n";
481        content += "               <descriptions>\n";
482        content += "                  <Description LanguageId=\"\"/>\n";
483        content += "               </descriptions>\n";
484        content += "            </Project>\n";
485        content += "            <Keys>\n";
486        content += "            </Keys>\n";
487        content += "            <Content>\n";
488        content += "               <Genre>Demo</Genre>\n";
489        content += "               <SubGenre>Unspecified</SubGenre>\n";
490        content += "               <Task>route description</Task>\n";
491        content += "               <Modalities>Speech; Gestures</Modalities>\n";
492        content += "               <CommunicationContext>\n";
493        content += "               </CommunicationContext>\n";
494        content += "               <Content_Languages>\n";
495        content += "               </Content_Languages>\n";
496        content += "               <descriptions>\n";
497        content += "               </descriptions>\n";
498        content += "            </Content>\n";
499        content += "            <Actors>\n";
500        content += "            </Actors>\n";
501        content += "         </MDGroup>\n";
502        content += "         <Resources>\n";
503        content += "         </Resources>\n";
504        content += "      </Session>\n";
505        content += "   </Components>\n";
506        content += "</CMD>\n";
507        File cmdiFile = createCmdiFile("testSession", content);
508        CMDIDataProcessor processor = getDataParser();
509        CMDIData data = processor.process(cmdiFile);
510        assertEquals("test-hdl_58_1839_47_00-0000-0000-0009-294C-9", data.getId()); //modified handle -> 'clean' id
511        List<Resource> resources = data.getMetadataResources();
512        assertEquals(0, resources.size());
513        SolrInputDocument doc = data.getSolrDocument();
514        assertNotNull(doc);
515        assertEquals(9, doc.getFieldNames().size());
516        assertEquals("test-hdl:1839/00-0000-0000-0009-294C-9", doc.getFieldValue("_selfLink")); //unmodified handle
517        assertEquals("kleve-route", doc.getFieldValue("name"));
518        assertEquals("Europe", doc.getFieldValue("continent"));
519        assertEquals("Netherlands", doc.getFieldValue("country"));
520        assertEquals("demo", doc.getFieldValue("genre"));
521        assertEquals("{code:und}Test.", doc.getFieldValue("description"));
522        assertEquals("Should be null not empty string", null, doc.getFieldValue("organisation"));
523        assertEquals(null, doc.getFieldValue("language"));
524        assertEquals(null, doc.getFieldValue("subject"));
525        assertEquals(null, doc.getFieldValue("year"));
526    }
527
528    @Test
529    public void testOlac() throws Exception {
530        String content = "";
531        content += "<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n";
532        content += "<CMD xmlns=\"http://www.clarin.eu/cmd/\" xmlns:xsi=\"http://www.w3.org/2001/XMLSchema-instance\"\n";
533        content += "     xmlns:dc=\"http://purl.org/dc/elements/1.1/\"\n";
534        content += "     xmlns:oai_dc=\"http://www.openarchives.org/OAI/2.0/oai_dc/\"\n";
535        content += "     xmlns:defns=\"http://www.openarchives.org/OAI/2.0/\"\n";
536        content += "     xsi:schemaLocation=\"http://www.clarin.eu/cmd http://catalog.clarin.eu/ds/ComponentRegistry/rest/registry/profiles/clarin.eu:cr1:p_1288172614026/xsd\">\n";
537        content += "   <Header>\n";
538        content += "      <MdCreator>olac2cmdi.xsl</MdCreator>\n";
539        content += "      <MdCreationDate>2002-12-14</MdCreationDate>\n";
540        content += "      <MdSelfLink>oai:ailla.utexas.edu:1</MdSelfLink>\n";
541        content += "      <MdProfile>clarin.eu:cr1:p_1288172614026</MdProfile>\n";
542        content += "   </Header>\n";
543        content += "   <Resources>\n";
544        content += "      <ResourceProxyList/>\n";
545        content += "      <JournalFileProxyList/>\n";
546        content += "      <ResourceRelationList/>\n";
547        content += "   </Resources>\n";
548        content += "   <Components>\n";
549        content += "      <OLAC-DcmiTerms>\n";
550        content += "         <creator>Joel Sherzer (recorder)</creator>\n";
551        content += "         <description>\n";
552        content += "    Channel: Talking;\n";
553        content += "    Genre: Traditional Narrative / Story;\n";
554        content += "    Country: Panama;\n";
555        content += "    Place of Recording: Mulatuppu;\n";
556        content += "    Event: Community Gathering;\n";
557        content += "    Institutional Affiliation: University of Texas at Austin;\n";
558        content += "    Participant Information: Political Leader;\n";
559        content += "      </description>\n";
560        content += "         <description>The one-eyed grandmother is one of many traditional Kuna stories performed in the Kuna gathering house. This story, performed here by Pedro Arias, combines European derived motifs (Tom Thumb and Hansel and Gretel) with themes that seem more Kuna in origin. All are woven together and a moral is provided. Pedro Arias performed this story before a gathered audience in the morning..\n";
561        content += "      </description>\n";
562        content += "         <description>Test</description>\n";
563        content += "         <identifier>http://uts.cc.utexas.edu/~ailla/audio/sherzer/one_eyed_grandmother.ram</identifier>\n";
564        content += "         <identifier>http://uts.cc.utexas.edu/~ailla/texts/sherzer/one_eyed_grandmother.pdf</identifier>\n";
565        content += "         <language olac-language=\"x-sil-CHN\"/>\n";
566        content += "         <language>Chinese</language>\n";
567        content += "         <subject olac-linguistic-field=\"testSubject\">Kuna</subject>\n";
568        content += "         <type olac-linguistic-type=\"Transcription\"/>\n";
569        content += "         <format>WAV</format>\n";
570        content += "        <type dcterms-type=\"DCMIType\">Sound</type>\n";
571        content += "      </OLAC-DcmiTerms>\n";
572        content += "   </Components>\n";
573        content += "</CMD>\n";
574
575        File cmdiFile = createCmdiFile("testOlac", content);
576        CMDIDataProcessor processor = getDataParser();
577        CMDIData data = processor.process(cmdiFile);
578        assertEquals("oai_58_ailla.utexas.edu_58_1", data.getId());
579        List<Resource> resources = data.getMetadataResources();
580        assertEquals(0, resources.size());
581        List<Resource> dataResources = data.getDataResources();
582        assertEquals(0, dataResources.size());
583        SolrInputDocument doc = data.getSolrDocument();
584        assertNotNull(doc);
585        assertEquals(9, doc.getFieldNames().size());
586        assertEquals("oai:ailla.utexas.edu:1", doc.getFieldValue("_selfLink"));
587        assertEquals(null, doc.getFieldValue("name"));
588        assertEquals(null, doc.getFieldValue("continent"));
589        assertEquals(1, doc.getFieldValues("languageCode").size());
590        assertEquals("code:zho", doc.getFieldValue("languageCode"));
591        assertEquals(null, doc.getFieldValue("country"));
592        assertEquals(null, doc.getFieldValue("organisation"));
593        assertEquals("transcription", doc.getFieldValue("genre"));
594        assertEquals("kuna", doc.getFieldValue("subject"));
595        Collection<Object> fieldValues = doc.getFieldValues("description");
596        assertEquals(3, fieldValues.size());
597        List<String> descriptions = new ArrayList(fieldValues);
598        Collections.sort(descriptions);
599        assertEquals("{code:und}Channel: Talking;\n    Genre: Traditional Narrative / Story;\n    Country: Panama;\n"
600                + "    Place of Recording: Mulatuppu;\n    Event: Community Gathering;\n"
601                + "    Institutional Affiliation: University of Texas at Austin;\n    Participant Information: Political Leader;", descriptions.get(0).toString());
602        assertEquals("{code:und}Test", descriptions.get(1).toString());
603        assertEquals("{code:und}The one-eyed grandmother is one of many traditional Kuna stories performed "
604                + "in the Kuna gathering house. This story, performed here by Pedro Arias, combines "
605                + "European derived motifs (Tom Thumb and Hansel and Gretel) with themes that seem more "
606                + "Kuna in origin. All are woven together and a moral is provided. Pedro Arias performed "
607                + "this story before a gathered audience in the morning..", descriptions.get(2).toString());
608        assertEquals("audioRecording", doc.getFieldValue(FacetConstants.FIELD_RESOURCE_CLASS));
609        //assertEquals("Sound", doc.getFieldValue(FacetConstants.FIELD_RESOURCE_CLASS));
610    }
611
612    @Test
613    public void testOlacMultiFacets() throws Exception {
614        String content = "";
615        content += "<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n";
616        content += "<CMD xmlns=\"http://www.clarin.eu/cmd/\">\n";
617        content += "   <Header>\n";
618        content += "      <MdProfile>clarin.eu:cr1:p_1288172614026</MdProfile>\n";
619        content += "   </Header>\n";
620        content += "   <Components>\n";
621        content += "      <OLAC-DcmiTerms>\n";
622        content += "         <subject olac-linguistic-field=\"testSubject\">Kuna</subject>\n";
623        content += "         <subject dcterms-type=\"LCSH\">testSubjectFallback</subject>\n";
624        content += "         <spatial dcterms-type=\"ISO3166\">testCountry1</spatial>\n";
625        content += "         <coverage dcterms-type=\"ISO3166\">testCountry2</coverage>\n";
626        content += "         <language olac-language=\"language1\">test1</language>\n";
627        content += "         <subject olac-language=\"language2\">test2</subject>\n";
628        content += "         <subject olac-language=\"language2\">test2</subject>\n";
629        content += "      </OLAC-DcmiTerms>\n";
630        content += "   </Components>\n";
631        content += "</CMD>\n";
632
633        File cmdiFile = createCmdiFile("testOlac", content);
634        CMDIDataProcessor processor = getDataParser();
635        CMDIData data = processor.process(cmdiFile);
636        SolrInputDocument doc = data.getSolrDocument();
637        assertNull(doc.getFieldValue("_selfLink"));
638        assertEquals(3, doc.getFieldValues(FacetConstants.FIELD_SUBJECT).size());
639        assertTrue(doc.getFieldValues(FacetConstants.FIELD_SUBJECT).contains("kuna"));
640        assertEquals(2, doc.getFieldValues(FacetConstants.FIELD_COUNTRY).size());
641        assertTrue(doc.getFieldValues(FacetConstants.FIELD_COUNTRY).contains("testCountry1"));
642        assertTrue(doc.getFieldValues(FacetConstants.FIELD_COUNTRY).contains("testCountry2"));
643        assertEquals(1, doc.getFieldValues(FacetConstants.FIELD_LANGUAGE_CODE).size());
644        assertTrue(doc.getFieldValues(FacetConstants.FIELD_LANGUAGE_CODE).contains("name:test1"));
645
646        content = "";
647        content += "<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n";
648        content += "<CMD xmlns=\"http://www.clarin.eu/cmd/\">\n";
649        content += "   <Header>\n";
650        content += "      <MdProfile>clarin.eu:cr1:p_1288172614026</MdProfile>\n";
651        content += "   </Header>\n";
652        content += "   <Components>\n";
653        content += "      <OLAC-DcmiTerms>\n";
654        content += "         <subject dcterms-type=\"LCSH\">testSubjectFallback</subject>\n";
655        content += "         <coverage dcterms-type=\"ISO3166\">testCountry2</coverage>\n";
656        content += "         <language olac-language=\"language1\">test1</language>\n";
657        content += "         <subject olac-language=\"language2\">test2</subject>\n";
658        content += "      </OLAC-DcmiTerms>\n";
659        content += "   </Components>\n";
660        content += "</CMD>\n";
661
662        cmdiFile = createCmdiFile("testOlac", content);
663        processor = getDataParser();
664        data = processor.process(cmdiFile);
665        doc = data.getSolrDocument();
666        assertEquals(2, doc.getFieldValues(FacetConstants.FIELD_SUBJECT).size());
667        assertEquals("testsubjectfallback", doc.getFieldValue(FacetConstants.FIELD_SUBJECT));
668        assertEquals(1, doc.getFieldValues(FacetConstants.FIELD_COUNTRY).size());
669        assertEquals("testCountry2", doc.getFieldValue(FacetConstants.FIELD_COUNTRY));
670        assertEquals(1, doc.getFieldValues(FacetConstants.FIELD_LANGUAGE_CODE).size());
671        assertEquals("name:test1", doc.getFieldValue(FacetConstants.FIELD_LANGUAGE_CODE));
672
673        content = "";
674        content += "<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n";
675        content += "<CMD xmlns=\"http://www.clarin.eu/cmd/\">\n";
676        content += "   <Header>\n";
677        content += "      <MdProfile>clarin.eu:cr1:p_1288172614026</MdProfile>\n";
678        content += "   </Header>\n";
679        content += "   <Components>\n";
680        content += "      <OLAC-DcmiTerms>\n";
681        content += "         <subject dcterms-type=\"LCSH\">testSubjectFallback</subject>\n";
682        content += "         <subject olac-linguistic-field=\"testSubject\">Kuna</subject>\n";
683        content += "         <coverage dcterms-type=\"ISO3166\">testCountry2</coverage>\n";
684        content += "         <spatial dcterms-type=\"ISO3166\">testCountry1</spatial>\n";
685        content += "         <subject olac-language=\"language1\">test2</subject>\n";
686        content += "         <language olac-language=\"language1\">test1</language>\n";
687        content += "      </OLAC-DcmiTerms>\n";
688        content += "   </Components>\n";
689        content += "</CMD>\n";
690
691        cmdiFile = createCmdiFile("testOlac", content);
692        processor = getDataParser();
693        data = processor.process(cmdiFile);
694        doc = data.getSolrDocument();
695        assertEquals(3, doc.getFieldValues("subject").size());
696        assertEquals("testsubjectfallback", doc.getFieldValue("subject"));
697        assertEquals(2, doc.getFieldValues(FacetConstants.FIELD_COUNTRY).size());
698        assertTrue(doc.getFieldValues(FacetConstants.FIELD_COUNTRY).contains("testCountry1"));
699        assertTrue(doc.getFieldValues(FacetConstants.FIELD_COUNTRY).contains("testCountry2"));
700        assertEquals(1, doc.getFieldValues(FacetConstants.FIELD_LANGUAGE_CODE).size());
701        assertTrue(doc.getFieldValues(FacetConstants.FIELD_LANGUAGE_CODE).contains("name:test1"));
702    }
703
704    @Test
705    public void testIgnoreWhiteSpaceFacets() throws Exception {
706        String content = "";
707        content += "<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n";
708        content += "<CMD xmlns=\"http://www.clarin.eu/cmd/\">\n";
709        content += "   <Header>\n";
710        content += "      <MdProfile>clarin.eu:cr1:p_1288172614026</MdProfile>\n";
711        content += "   </Header>\n";
712        content += "   <Components>\n";
713        content += "      <OLAC-DcmiTerms>\n";
714        content += "         <subject olac-linguistic-field=\"\n\n\t\t\t\">Kuna</subject>\n";
715        content += "      </OLAC-DcmiTerms>\n";
716        content += "   </Components>\n";
717        content += "</CMD>\n";
718
719        File cmdiFile = createCmdiFile("testOlac", content);
720        CMDIDataProcessor processor = getDataParser();
721        CMDIData data = processor.process(cmdiFile);
722        SolrInputDocument doc = data.getSolrDocument();
723        assertTrue(doc.getFieldValues("subject").contains("kuna"));
724    }
725
726    @Test
727    public void testCountryCodesPostProcessing() throws Exception {
728        String content = "";
729        content += "<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n";
730        content += "<CMD xmlns=\"http://www.clarin.eu/cmd/\">\n";
731        content += "   <Header>\n";
732        content += "      <MdProfile>clarin.eu:cr1:p_1288172614026</MdProfile>\n";
733        content += "   </Header>\n";
734        content += "   <Components>\n";
735        content += "      <OLAC-DcmiTerms>\n";
736        content += "         <coverage dcterms-type=\"ISO3166\">NL</coverage>\n";
737        content += "      </OLAC-DcmiTerms>\n";
738        content += "   </Components>\n";
739        content += "</CMD>\n";
740
741        File cmdiFile = createCmdiFile("testOlac", content);
742        CMDIDataProcessor processor = getDataParser();
743        CMDIData data = processor.process(cmdiFile);
744        SolrInputDocument doc = data.getSolrDocument();
745        assertEquals("Netherlands", doc.getFieldValue(FacetConstants.FIELD_COUNTRY));
746    }
747
748    @Test
749    public void testLanguageCodesPostProcessing() throws Exception {
750        String content = "";
751        content += "<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n";
752        content += "<CMD xmlns=\"http://www.clarin.eu/cmd/\">\n";
753        content += "   <Header>\n";
754        content += "      <MdProfile>clarin.eu:cr1:p_1288172614026</MdProfile>\n";
755        content += "   </Header>\n";
756        content += "   <Components>\n";
757        content += "      <OLAC-DcmiTerms>\n";
758        content += "         <language olac-language=\"fr\"/>\n";
759        content += "         <language olac-language=\"spa\"/>\n";
760        content += "      </OLAC-DcmiTerms>\n";
761        content += "   </Components>\n";
762        content += "</CMD>\n";
763
764        File cmdiFile = createCmdiFile("testOlac", content);
765        CMDIDataProcessor processor = getDataParser();
766        CMDIData data = processor.process(cmdiFile);
767        SolrInputDocument doc = data.getSolrDocument();
768//        Collection<Object> values = doc.getFieldValues(FacetConstants.DEPRECATED_FIELD_LANGUAGE);
769//        assertEquals(2, values.size());
770//        Iterator<Object> iter = values.iterator();
771//        assertEquals("French", iter.next());
772//        assertEquals("Spanish; Castilian", iter.next());
773    }
774
775    @Test
776    public void testOlacCollection() throws Exception {
777        String content = "";
778        content += "<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n";
779        content += "<CMD xmlns=\"http://www.clarin.eu/cmd/\" xmlns:xsi=\"http://www.w3.org/2001/XMLSchema-instance\"\n";
780        content += "    xsi:schemaLocation=\"http://www.clarin.eu/cmd http://catalog.clarin.eu/ds/ComponentRegistry/rest/registry/profiles/clarin.eu:cr1:p_1288172614026/xsd\">\n";
781        content += "    <Header>\n";
782        content += "        <MdCreator>dir2cmdicollection.py</MdCreator>\n";
783        content += "        <MdCreationDate>2010-10-11</MdCreationDate>\n";
784        content += "        <MdSelfLink>collection_ATILF_Resources.cmdi</MdSelfLink>\n";
785        content += "        <MdProfile>clarin.eu:cr1:p_1288172614026</MdProfile>\n";
786        content += "    </Header>\n";
787        content += "    <Resources>\n";
788        content += "        <ResourceProxyList>\n";
789        content += "<ResourceProxy id=\"ATILF_Resources_0_oai_atilf_inalf_fr_0001.xml.cmdi\"><ResourceType>Metadata</ResourceType><ResourceRef>ATILF_Resources/0/oai_atilf_inalf_fr_0001.xml.cmdi</ResourceRef></ResourceProxy>\n";
790        content += "<ResourceProxy id=\"ATILF_Resources_0_oai_atilf_inalf_fr_0002.xml.cmdi\"><ResourceType>Metadata</ResourceType><ResourceRef>ATILF_Resources/0/oai_atilf_inalf_fr_0002.xml.cmdi</ResourceRef></ResourceProxy>\n";
791        content += "<ResourceProxy id=\"ATILF_Resources_0_oai_atilf_inalf_fr_0003.xml.cmdi\"><ResourceType>Metadata</ResourceType><ResourceRef>ATILF_Resources/0/oai_atilf_inalf_fr_0003.xml.cmdi</ResourceRef></ResourceProxy>\n";
792        content += "<ResourceProxy id=\"ATILF_Resources_0_oai_atilf_inalf_fr_0004.xml.cmdi\"><ResourceType>Metadata</ResourceType><ResourceRef>ATILF_Resources/0/oai_atilf_inalf_fr_0004.xml.cmdi</ResourceRef></ResourceProxy>\n";
793        content += "<ResourceProxy id=\"ATILF_Resources_0_oai_atilf_inalf_fr_0005_a.xml.cmdi\"><ResourceType>Metadata</ResourceType><ResourceRef>ATILF_Resources/0/oai_atilf_inalf_fr_0005_a.xml.cmdi</ResourceRef></ResourceProxy>\n";
794        content += "<ResourceProxy id=\"ATILF_Resources_0_oai_atilf_inalf_fr_0005_b.xml.cmdi\"><ResourceType>Metadata</ResourceType><ResourceRef>ATILF_Resources/0/oai_atilf_inalf_fr_0005_b.xml.cmdi</ResourceRef></ResourceProxy>\n";
795        content += "<ResourceProxy id=\"ATILF_Resources_0_oai_atilf_inalf_fr_0006.xml.cmdi\"><ResourceType>Metadata</ResourceType><ResourceRef>ATILF_Resources/0/oai_atilf_inalf_fr_0006.xml.cmdi</ResourceRef></ResourceProxy>\n";
796        content += "<ResourceProxy id=\"ATILF_Resources_0_oai_atilf_inalf_fr_M277.xml.cmdi\"><ResourceType>Metadata</ResourceType><ResourceRef>ATILF_Resources/0/oai_atilf_inalf_fr_M277.xml.cmdi</ResourceRef></ResourceProxy>\n";
797        content += "<ResourceProxy id=\"ATILF_Resources_0_oai_atilf_inalf_fr_M592.xml.cmdi\"><ResourceType>Metadata</ResourceType><ResourceRef>ATILF_Resources/0/oai_atilf_inalf_fr_M592.xml.cmdi</ResourceRef></ResourceProxy>\n";
798        content += "        </ResourceProxyList>\n";
799        content += "        <JournalFileProxyList/>\n";
800        content += "        <ResourceRelationList/>\n";
801        content += "    </Resources>\n";
802        content += "    <Components>\n";
803        content += "        <olac></olac>\n";
804        content += "    </Components>\n";
805        content += "</CMD>\n";
806
807        File cmdiFile = createCmdiFile("testOlac", content);
808        CMDIDataProcessor processor = getDataParser();
809        CMDIData data = processor.process(cmdiFile);
810        assertEquals("collection_ATILF_Resources.cmdi", data.getId());
811        assertEquals("collection_ATILF_Resources.cmdi", data.getSolrDocument().getFieldValue("_selfLink"));
812        List<Resource> resources = data.getMetadataResources();
813        assertEquals(9, resources.size());
814        Resource res = resources.get(0);
815        assertEquals("ATILF_Resources/0/oai_atilf_inalf_fr_0001.xml.cmdi", res.getResourceName());
816        assertEquals(null, res.getMimeType());
817        assertEquals(0, data.getDataResources().size());
818        SolrInputDocument doc = data.getSolrDocument();
819        assertNotNull(doc);
820        List<Resource> dataResources = data.getDataResources();
821        assertEquals(0, dataResources.size());
822    }
823
824    @Test
825    public void testLrtCollection() throws Exception {
826        String content = "";
827        content += "<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n";
828        content += "<CMD xmlns=\"http://www.clarin.eu/cmd/\" ns0:schemaLocation=\"http://www.clarin.eu/cmd http://catalog.clarin.eu/ds/ComponentRegistry/rest/registry/profiles/clarin.eu:cr1:p_1289827960126/xsd\" xmlns:ns0=\"http://www.w3.org/2001/XMLSchema-instance\">\n";
829        content += "    <Header>\n";
830        content += "        <MdCreator>lrt2cmdi.py</MdCreator>\n";
831        content += "        <MdCreationDate>2010-11-17</MdCreationDate>\n";
832        content += "        <MdSelfLink>clarin.eu:lrt:433</MdSelfLink>\n";
833        content += "        <MdProfile>clarin.eu:cr1:p_1289827960126</MdProfile>\n";
834        content += "    </Header>\n";
835        content += "    <Resources>\n";
836        content += "        <ResourceProxyList />\n";
837        content += "        <JournalFileProxyList />\n";
838        content += "        <ResourceRelationList />\n";
839        content += "    </Resources>\n";
840        content += "    <Components>\n";
841        content += "        <LrtInventoryResource>\n";
842        content += "            <LrtCommon>\n";
843        content += "                <ResourceName>Corpus of Present-day Written Estonian</ResourceName>\n";
844        content += "                <ResourceType>Written Corpus</ResourceType>\n";
845        content += "                <LanguagesOther />\n";
846        content += "                <Description xml:lang='en'>written general; 95 mio words; TEI/SGML</Description>\n";
847        content += "                <ContactPerson>Kadri.Muischnek@ut.ee</ContactPerson>\n";
848        content += "                <Format />\n";
849        content += "                <Institute>Test</Institute>\n";
850        content += "                <MetadataLink />\n";
851        content += "                <Publications />\n";
852        content += "                <ReadilyAvailable>true</ReadilyAvailable>\n";
853        content += "                <ReferenceLink />         \n";
854        content += "                <Languages><ISO639><iso-639-3-code>est</iso-639-3-code></ISO639></Languages>\n";
855        content += "                <Countries><Country><Code>EE</Code></Country></Countries>\n";
856        content += "            </LrtCommon>\n";
857        content += "       </LrtInventoryResource>\n";
858        content += "    </Components>\n";
859        content += "</CMD>\n";
860
861        File cmdiFile = createCmdiFile("testOlac", content);
862        CMDIDataProcessor processor = getDataParser();
863        CMDIData data = processor.process(cmdiFile);
864        assertEquals("clarin.eu_58_lrt_58_433", data.getId());
865        List<Resource> resources = data.getMetadataResources();
866        assertEquals(0, resources.size());
867        List<Resource> dataResources = data.getDataResources();
868        assertEquals(0, dataResources.size());
869        SolrInputDocument doc = data.getSolrDocument();
870        assertNotNull(doc);
871        assertEquals(9, doc.getFieldNames().size());
872        assertEquals("clarin.eu:lrt:433", doc.getFieldValue("_selfLink"));
873        assertEquals("Corpus of Present-day Written Estonian", doc.getFieldValue("name"));
874        assertEquals(null, doc.getFieldValue("continent"));
875        assertEquals(1, doc.getFieldValues("languageCode").size());
876        assertEquals("code:est", doc.getFieldValue("languageCode"));
877        assertEquals("Estonia", doc.getFieldValue("country"));
878        assertEquals("Test", doc.getFieldValue("organisation"));
879        assertEquals(null, doc.getFieldValue("year"));
880        assertEquals(null, doc.getFieldValue("genre"));
881        assertEquals("{code:eng}written general; 95 mio words; TEI/SGML", doc.getFieldValue("description"));
882        assertEquals("corpus", doc.getFieldValue(FacetConstants.FIELD_RESOURCE_CLASS));
883        //assertEquals("Written Corpus", doc.getFieldValue(FacetConstants.FIELD_RESOURCE_CLASS));
884    }
885}
Note: See TracBrowser for help on using the repository browser.