- Timestamp:
- 11/09/15 13:49:39 (9 years ago)
- Location:
- vlo/branches/vlo-3.3-oeaw/vlo-importer
- Files:
-
- 7 edited
- 1 copied
Legend:
- Unmodified
- Added
- Removed
-
vlo/branches/vlo-3.3-oeaw/vlo-importer
-
Property
svn:mergeinfo
set to
(toggle deleted branches)
/vlo/trunk/vlo-importer merged eligible /vlo/branches/to-wicket-1.6-twagoo/vlo-importer 4212-4219 /vlo/branches/vlo-2.13-param/vlo-importer 2570-2767 /vlo/branches/vlo-3.0/vlo-importer 5201 /vlo/branches/vlo-3.2-ticket575/vlo-importer 6112-6188 /vlo/branches/vlo-ticket761/vlo-importer 6189-6283
-
Property
svn:mergeinfo
set to
(toggle deleted branches)
-
vlo/branches/vlo-3.3-oeaw/vlo-importer/pom.xml
r6554 r6760
 7  7        <groupId>eu.clarin.cmdi</groupId>
 8  8        <artifactId>vlo</artifactId>
 9     -     <version>3.3-SNAPSHOT</version>
    9  +     <version>3.4-SNAPSHOT</version>
10 10    </parent>
11 11
 …  …
13 13    <groupId>eu.clarin.cmdi</groupId>
14 14    <artifactId>vlo-importer</artifactId>
15     -  <version>3.3-SNAPSHOT</version>
   15  +  <version>3.4-SNAPSHOT</version>
16 16    <packaging>jar</packaging>
17 17
-
vlo/branches/vlo-3.3-oeaw/vlo-importer/src/main/java/eu/clarin/cmdi/vlo/importer/CMDIParserVTDXML.java
r6730 r6760 36 36 private final static Logger LOG = LoggerFactory.getLogger(CMDIParserVTDXML.class); 37 37 38 private static final String DEFAULT_LANGUAGE = " und";38 private static final String DEFAULT_LANGUAGE = "code:und"; 39 39 40 40 public CMDIParserVTDXML(Map<String, PostProcessor> postProcessors, Boolean useLocalXSDCache) { … … 291 291 292 292 // ignore non-English language names for facet LANGUAGE_CODE 293 if (config.getName().equals(FacetConstants.FIELD_LANGUAGE_CODE) && !languageCode.equals(" en") && !languageCode.equals("eng") && !languageCode.equals("und")) {293 if (config.getName().equals(FacetConstants.FIELD_LANGUAGE_CODE) && !languageCode.equals("code:eng") && !languageCode.equals("code:und")) { 294 294 index = ap.evalXPath(); 295 295 continue; … … 329 329 // extract language code in xml:lang if available 330 330 Integer langAttrIndex = nav.getAttrVal("xml:lang"); 331 String languageCode = DEFAULT_LANGUAGE;331 String languageCode; 332 332 if (langAttrIndex != -1) { 333 333 languageCode = nav.toString(langAttrIndex).trim(); 334 } 335 // replace 2-letter with 3-letter codes 336 if (MetadataImporter.languageCodeUtils.getSilToIso639Map().containsKey(languageCode)) { 337 languageCode = MetadataImporter.languageCodeUtils.getSilToIso639Map().get(languageCode); 338 } 339 return languageCode; 334 } else { 335 return DEFAULT_LANGUAGE; 336 } 337 338 return postProcessors.get(FacetConstants.FIELD_LANGUAGE_CODE).process(languageCode).get(0); 340 339 } 341 340 … … 363 362 String fieldValue = valueList.get(i).trim(); 364 363 if (name.equals(FacetConstants.FIELD_DESCRIPTION)) { 365 fieldValue = "{ lang='" + languageCode + "'}" + fieldValue;364 fieldValue = "{" + languageCode + "}" + fieldValue; 366 365 } 367 366 cmdiData.addDocField(name, fieldValue, caseInsensitive); -
vlo/branches/vlo-3.3-oeaw/vlo-importer/src/main/java/eu/clarin/cmdi/vlo/importer/MetadataImporter.java
r6732 r6760 48 48 * and so on. 49 49 */ 50 @SuppressWarnings({"serial"})51 50 public class MetadataImporter { 52 51 … … 166 165 } 167 166 solrServer.commit(); 168 updateDocumentHierarchy(); 167 if(config.isProcessHierarchies()){ 168 updateDocumentHierarchy(); 169 } 169 170 } 170 171 LOG.info("End of processing: " + dataRoot.getOriginName()); … … 227 228 * list of existing data roots will be filtered with the list from user 228 229 * 229 * @return 230 * @param dataRoots complete list of DataRoots 231 * @return list of DataRoots without DataRoots excluded by the user 230 232 */ 231 233 protected List<DataRoot> filterDataRootsWithCLArgs(List<DataRoot> dataRoots){ … … 364 366 // are usually not very interesting (think imdi corpus files) and will not be included. 365 367 updateDocument(solrDocument, cmdiData, file, dataOrigin); 368 if(ResourceStructureGraph.getVertex(cmdiData.getId()) != null) 369 ResourceStructureGraph.getVertex(cmdiData.getId()).setWasImported(true); 366 370 } else { 367 371 nrOfIgnoredFiles++; … … 538 542 // get document 539 543 query = new SolrQuery(); 544 query.setRequestHandler(FacetConstants.SOLR_REQUEST_HANDLER_FAST); 540 545 query.set("q", FacetConstants.FIELD_ID+":"+vertex.getId()); 541 546 SolrDocumentList response = solrServer.query(query).getResults(); … … 550 555 if(vertex.getHierarchyWeight() != 0) { 551 556 doc.setField(FacetConstants.FIELD_HIERARCHY_WEIGHT, Math.abs(vertex.getHierarchyWeight())); 557 } 558 559 // remove vertices that were not imported 560 Iterator<String> incomingVertexIter = incomingVertexNames.iterator(); 561 while(incomingVertexIter.hasNext()) { 562 String vertexId = incomingVertexIter.next(); 563 if(ResourceStructureGraph.getVertex(vertexId) == null || !ResourceStructureGraph.getVertex(vertexId).getWasImported()) 564 incomingVertexIter.remove(); 565 } 566 Iterator<String> outgoingVertexIter = outgoingVertexNames.iterator(); 567 while(outgoingVertexIter.hasNext()) { 568 String vertexId = outgoingVertexIter.next(); 
569 if(ResourceStructureGraph.getVertex(vertexId) == null || !ResourceStructureGraph.getVertex(vertexId).getWasImported()) 570 outgoingVertexIter.remove(); 552 571 } 553 572 -
vlo/branches/vlo-3.3-oeaw/vlo-importer/src/main/java/eu/clarin/cmdi/vlo/importer/ResourceStructureGraph.java
r6213 r6760 53 53 */ 54 54 public static void addResource(String mdSelfLink) { 55 if (!vertexIdMap.containsKey(mdSelfLink)) { 56 CmdiVertex newVertex = new CmdiVertex(StringUtils.normalizeIdString(mdSelfLink)); 57 vertexIdMap.put(mdSelfLink, newVertex); 58 } 59 60 if (!foundVerticesSet.contains(vertexIdMap.get(mdSelfLink))) { 61 graph.addVertex(vertexIdMap.get(mdSelfLink)); 62 foundVerticesSet.add(vertexIdMap.get(mdSelfLink)); 55 String normalizedMdSelfLink = StringUtils.normalizeIdString(mdSelfLink); 56 if (!vertexIdMap.containsKey(normalizedMdSelfLink)) { 57 CmdiVertex newVertex = new CmdiVertex(normalizedMdSelfLink); 58 vertexIdMap.put(normalizedMdSelfLink, newVertex); 59 } 60 61 if (!foundVerticesSet.contains(vertexIdMap.get(normalizedMdSelfLink))) { 62 graph.addVertex(vertexIdMap.get(normalizedMdSelfLink)); 63 foundVerticesSet.add(vertexIdMap.get(normalizedMdSelfLink)); 63 64 } else { 64 65 LOG.debug("Duplicate resource vertex mdSelfLink: " + mdSelfLink); … … 73 74 */ 74 75 public static void addEdge(String sourceVertexId, String targetVertexId) { 76 String normalizedSourceVertexId = StringUtils.normalizeIdString(sourceVertexId); 77 String normalizedTargetVertexId = StringUtils.normalizeIdString(targetVertexId); 78 75 79 // add vertices 76 if (!vertexIdMap.containsKey( sourceVertexId)) {77 CmdiVertex sourceVertex = new CmdiVertex( StringUtils.normalizeIdString(sourceVertexId));78 vertexIdMap.put( sourceVertexId, sourceVertex);80 if (!vertexIdMap.containsKey(normalizedSourceVertexId)) { 81 CmdiVertex sourceVertex = new CmdiVertex(normalizedSourceVertexId); 82 vertexIdMap.put(normalizedSourceVertexId, sourceVertex); 79 83 graph.addVertex(sourceVertex); 80 84 } 81 85 82 if (!vertexIdMap.containsKey( targetVertexId)) {83 CmdiVertex targetVertex = new CmdiVertex( StringUtils.normalizeIdString(targetVertexId));84 vertexIdMap.put( targetVertexId, targetVertex);86 if (!vertexIdMap.containsKey(normalizedTargetVertexId)) { 87 CmdiVertex targetVertex = new 
CmdiVertex(normalizedTargetVertexId); 88 vertexIdMap.put(normalizedTargetVertexId, targetVertex); 85 89 graph.addVertex(targetVertex); 86 90 } … … 88 92 // add edge 89 93 try { 90 graph.addEdge(vertexIdMap.get( sourceVertexId), vertexIdMap.get(targetVertexId));91 updateDepthValues(vertexIdMap.get( sourceVertexId), new HashSet<CmdiVertex>());94 graph.addEdge(vertexIdMap.get(normalizedSourceVertexId), vertexIdMap.get(normalizedTargetVertexId)); 95 updateDepthValues(vertexIdMap.get(normalizedSourceVertexId), new HashSet<CmdiVertex>()); 92 96 } catch (IllegalArgumentException cfe) { 93 97 // was a cycle -> ignore … … 162 166 return foundVerticesSet; 163 167 } 168 169 public static CmdiVertex getVertex(String vertexId) { 170 return vertexIdMap.get(vertexId); 171 } 172 173 public static Map<String, CmdiVertex> getVertexIdMap() { 174 return vertexIdMap; 175 } 164 176 165 177 /** 166 178 * Get all vertices that are source of an edge where targetVertex is target. 167 * In other words get all vertices that are part oftargetVertex.179 * In other words get all resource vertices that are part of resource targetVertex. 168 180 * 169 181 * @param targetVertex … … 187 199 /** 188 200 * Get all vertices that are target of an edge where sourceVertex is source. 189 * In other words get all vertices of whichsourceVertex is part of.201 * In other words get all resource vertices of which resource sourceVertex is part of. 190 202 * 191 203 * @param sourceVertex … … 273 285 private final String id; 274 286 private int hierarchyWeight = 0; 287 private boolean wasImported = false; 275 288 276 289 public CmdiVertex(String id) { … … 288 301 public int getHierarchyWeight() { 289 302 return hierarchyWeight; 303 } 304 305 public void setWasImported(boolean wasImported) { 306 this.wasImported = wasImported; 307 } 308 309 public boolean getWasImported() { 310 return wasImported; 290 311 } 291 312 -
vlo/branches/vlo-3.3-oeaw/vlo-importer/src/test/java/eu/clarin/cmdi/vlo/importer/CMDIDataProcessorTest.java
r6419 r6760 402 402 assertEquals("demo", doc.getFieldValue("genre")); 403 403 assertEquals( 404 "{ lang='eng'}This recording was made to generate a freely available test resource including speech and gestures. The annotations were created by Peter and Kita who is gesture researcher at the MPI for Psycholinguistics.",404 "{code:eng}This recording was made to generate a freely available test resource including speech and gestures. The annotations were created by Peter and Kita who is gesture researcher at the MPI for Psycholinguistics.", 405 405 doc.getFieldValue("description")); 406 406 assertEquals("2002-10-30", doc.getFieldValue("temporalCoverage")); … … 519 519 assertEquals("Netherlands", doc.getFieldValue("country")); 520 520 assertEquals("demo", doc.getFieldValue("genre")); 521 assertEquals("{ lang='und'}Test.", doc.getFieldValue("description"));521 assertEquals("{code:und}Test.", doc.getFieldValue("description")); 522 522 assertEquals("Should be null not empty string", null, doc.getFieldValue("organisation")); 523 523 assertEquals(null, doc.getFieldValue("language")); … … 597 597 List<String> descriptions = new ArrayList(fieldValues); 598 598 Collections.sort(descriptions); 599 assertEquals("{ lang='und'}Channel: Talking;\n Genre: Traditional Narrative / Story;\n Country: Panama;\n"599 assertEquals("{code:und}Channel: Talking;\n Genre: Traditional Narrative / Story;\n Country: Panama;\n" 600 600 + " Place of Recording: Mulatuppu;\n Event: Community Gathering;\n" 601 601 + " Institutional Affiliation: University of Texas at Austin;\n Participant Information: Political Leader;", descriptions.get(0).toString()); 602 assertEquals("{ lang='und'}Test", descriptions.get(1).toString());603 assertEquals("{ lang='und'}The one-eyed grandmother is one of many traditional Kuna stories performed "602 assertEquals("{code:und}Test", descriptions.get(1).toString()); 603 assertEquals("{code:und}The one-eyed grandmother is one of many traditional Kuna stories performed " 604 604 
+ "in the Kuna gathering house. This story, performed here by Pedro Arias, combines " 605 605 + "European derived motifs (Tom Thumb and Hansel and Gretel) with themes that seem more " … … 879 879 assertEquals(null, doc.getFieldValue("year")); 880 880 assertEquals(null, doc.getFieldValue("genre")); 881 assertEquals("{ lang='eng'}written general; 95 mio words; TEI/SGML", doc.getFieldValue("description"));881 assertEquals("{code:eng}written general; 95 mio words; TEI/SGML", doc.getFieldValue("description")); 882 882 assertEquals("corpus", doc.getFieldValue(FacetConstants.FIELD_RESOURCE_CLASS)); 883 883 //assertEquals("Written Corpus", doc.getFieldValue(FacetConstants.FIELD_RESOURCE_CLASS)); -
vlo/branches/vlo-3.3-oeaw/vlo-importer/src/test/java/eu/clarin/cmdi/vlo/importer/LanguageCodePostProcessorTest.java
r6374 r6760
 1  1    package eu.clarin.cmdi.vlo.importer;
 2  2
    3  + import eu.clarin.cmdi.vlo.config.DefaultVloConfigFactory;
 3  4    import static org.junit.Assert.assertEquals;
 4  5    import org.junit.Before;
 5  6    import org.junit.Test;
 6  7
 7  8    public class LanguageCodePostProcessorTest extends ImporterTestcase {
    9  +
   10  +     @Before
   11  +     public void setUp() throws Exception {
   12  +
   13  +         // read the configuration from the packaged configuration file
   14  +         new DefaultVloConfigFactory().newConfig();
   15  +
   16  +         // optionally, modify the configuration here
   17  +     }
 8 18
 9 19        @Test
 …  …
12 22            assertEquals("code:nld", processor.process("NL").get(0));
13 23            assertEquals("code:eng", processor.process("en").get(0));
14     -         assertEquals("code:nld", processor.process("nl").get(0));
15 24            assertEquals("code:fry", processor.process("fry").get(0));
16 25            assertEquals("name:test", processor.process("test").get(0));
 …  …
23 32            assertEquals("code:eng", processor.process("eng").get(0));
24 33            assertEquals("code:eng", processor.process("English").get(0));
25     -         assertEquals("code:deu", processor.process("German").get(0));
26 34            assertEquals("code:esn", processor.process("Salvadoran Sign Language").get(0));
27 35            assertEquals("code:eng", processor.process("en_US").get(0));
 …  …
29 37            assertEquals("code:eng", processor.process("ISO639-2:eng").get(0));
30 38            assertEquals("code:spa", processor.process("Spanish, Castilian").get(0));
31     -
32 39            assertEquals("code:ron", processor.process("Romanian").get(0));
33 40        }
34 41    }
Note: See TracChangeset
for help on using the changeset viewer.