Changeset 6760


Ignore:
Timestamp:
11/09/15 13:49:39 (9 years ago)
Author:
davor.ostojic@oeaw.ac.at
Message:

merged with trunk 3.4

Location:
vlo/branches/vlo-3.3-oeaw/vlo-importer
Files:
7 edited
1 copied

Legend:

Unmodified
Added
Removed
  • vlo/branches/vlo-3.3-oeaw/vlo-importer

    • Property svn:mergeinfo set to (toggle deleted branches)
      /vlo/trunk/vlo-importer (merged, eligible)
      /vlo/branches/to-wicket-1.6-twagoo/vlo-importer: 4212-4219
      /vlo/branches/vlo-2.13-param/vlo-importer: 2570-2767
      /vlo/branches/vlo-3.0/vlo-importer: 5201
      /vlo/branches/vlo-3.2-ticket575/vlo-importer: 6112-6188
      /vlo/branches/vlo-ticket761/vlo-importer: 6189-6283
  • vlo/branches/vlo-3.3-oeaw/vlo-importer/pom.xml

    r6554 r6760  
    77        <groupId>eu.clarin.cmdi</groupId>
    88        <artifactId>vlo</artifactId>
    9         <version>3.3-SNAPSHOT</version>
     9        <version>3.4-SNAPSHOT</version>
    1010    </parent>
    1111   
     
    1313    <groupId>eu.clarin.cmdi</groupId>
    1414    <artifactId>vlo-importer</artifactId>
    15     <version>3.3-SNAPSHOT</version>
     15    <version>3.4-SNAPSHOT</version>
    1616    <packaging>jar</packaging>
    1717
  • vlo/branches/vlo-3.3-oeaw/vlo-importer/src/main/java/eu/clarin/cmdi/vlo/importer/CMDIParserVTDXML.java

    r6730 r6760  
    3636    private final static Logger LOG = LoggerFactory.getLogger(CMDIParserVTDXML.class);
    3737
    38     private static final String DEFAULT_LANGUAGE = "und";
     38    private static final String DEFAULT_LANGUAGE = "code:und";
    3939
    4040    public CMDIParserVTDXML(Map<String, PostProcessor> postProcessors, Boolean useLocalXSDCache) {
     
    291291
    292292            // ignore non-English language names for facet LANGUAGE_CODE
    293             if (config.getName().equals(FacetConstants.FIELD_LANGUAGE_CODE) && !languageCode.equals("en") && !languageCode.equals("eng") && !languageCode.equals("und")) {
     293            if (config.getName().equals(FacetConstants.FIELD_LANGUAGE_CODE) && !languageCode.equals("code:eng") && !languageCode.equals("code:und")) {
    294294                index = ap.evalXPath();
    295295                continue;
     
    329329        // extract language code in xml:lang if available
    330330        Integer langAttrIndex = nav.getAttrVal("xml:lang");
    331         String languageCode = DEFAULT_LANGUAGE;
     331        String languageCode;
    332332        if (langAttrIndex != -1) {
    333333            languageCode = nav.toString(langAttrIndex).trim();
    334         }
    335         // replace 2-letter with 3-letter codes
    336         if (MetadataImporter.languageCodeUtils.getSilToIso639Map().containsKey(languageCode)) {
    337             languageCode = MetadataImporter.languageCodeUtils.getSilToIso639Map().get(languageCode);
    338         }
    339         return languageCode;
     334        } else {
     335            return DEFAULT_LANGUAGE;
     336        }
     337
     338        return postProcessors.get(FacetConstants.FIELD_LANGUAGE_CODE).process(languageCode).get(0);
    340339    }
    341340
     
    363362            String fieldValue = valueList.get(i).trim();
    364363            if (name.equals(FacetConstants.FIELD_DESCRIPTION)) {
    365                 fieldValue = "{lang='" + languageCode + "'}" + fieldValue;
     364                fieldValue = "{" + languageCode + "}" + fieldValue;
    366365            }
    367366            cmdiData.addDocField(name, fieldValue, caseInsensitive);
  • vlo/branches/vlo-3.3-oeaw/vlo-importer/src/main/java/eu/clarin/cmdi/vlo/importer/MetadataImporter.java

    r6732 r6760  
    4848 * and so on.
    4949 */
    50 @SuppressWarnings({"serial"})
    5150public class MetadataImporter {
    5251
     
    166165                    }
    167166                    solrServer.commit();
    168                     updateDocumentHierarchy();
     167                    if(config.isProcessHierarchies()){
     168                        updateDocumentHierarchy();
     169                    }
    169170                }
    170171                LOG.info("End of processing: " + dataRoot.getOriginName());
     
    227228     * list of existing data roots will be filtered with the list from user
    228229     *
    229      * @return
     230     * @param dataRoots complete list of DataRoots
     231     * @return list of DataRoots without DataRoots excluded by the user
    230232     */
    231233    protected List<DataRoot> filterDataRootsWithCLArgs(List<DataRoot> dataRoots){
     
    364366                        // are usually not very interesting (think imdi corpus files) and will not be included.
    365367                        updateDocument(solrDocument, cmdiData, file, dataOrigin);
     368                        if(ResourceStructureGraph.getVertex(cmdiData.getId()) != null)
     369                            ResourceStructureGraph.getVertex(cmdiData.getId()).setWasImported(true);
    366370                    } else {
    367371                        nrOfIgnoredFiles++;
     
    538542                // get document
    539543                query = new SolrQuery();
     544                query.setRequestHandler(FacetConstants.SOLR_REQUEST_HANDLER_FAST);
    540545                query.set("q", FacetConstants.FIELD_ID+":"+vertex.getId());
    541546                SolrDocumentList response = solrServer.query(query).getResults();
     
    550555                if(vertex.getHierarchyWeight() != 0) {
    551556                    doc.setField(FacetConstants.FIELD_HIERARCHY_WEIGHT, Math.abs(vertex.getHierarchyWeight()));
     557                }
     558               
     559                // remove vertices that were not imported
     560                Iterator<String> incomingVertexIter = incomingVertexNames.iterator();
     561                while(incomingVertexIter.hasNext()) {
     562                    String vertexId = incomingVertexIter.next();
     563                    if(ResourceStructureGraph.getVertex(vertexId) == null || !ResourceStructureGraph.getVertex(vertexId).getWasImported())
     564                        incomingVertexIter.remove();
     565                }
     566                Iterator<String> outgoingVertexIter = outgoingVertexNames.iterator();
     567                while(outgoingVertexIter.hasNext()) {
     568                    String vertexId = outgoingVertexIter.next();
     569                    if(ResourceStructureGraph.getVertex(vertexId) == null || !ResourceStructureGraph.getVertex(vertexId).getWasImported())
     570                        outgoingVertexIter.remove();
    552571                }
    553572               
  • vlo/branches/vlo-3.3-oeaw/vlo-importer/src/main/java/eu/clarin/cmdi/vlo/importer/ResourceStructureGraph.java

    r6213 r6760  
    5353     */
    5454    public static void addResource(String mdSelfLink) {
    55         if (!vertexIdMap.containsKey(mdSelfLink)) {
    56             CmdiVertex newVertex = new CmdiVertex(StringUtils.normalizeIdString(mdSelfLink));
    57             vertexIdMap.put(mdSelfLink, newVertex);
    58         }
    59 
    60         if (!foundVerticesSet.contains(vertexIdMap.get(mdSelfLink))) {
    61             graph.addVertex(vertexIdMap.get(mdSelfLink));
    62             foundVerticesSet.add(vertexIdMap.get(mdSelfLink));
     55        String normalizedMdSelfLink = StringUtils.normalizeIdString(mdSelfLink);
     56        if (!vertexIdMap.containsKey(normalizedMdSelfLink)) {
     57            CmdiVertex newVertex = new CmdiVertex(normalizedMdSelfLink);
     58            vertexIdMap.put(normalizedMdSelfLink, newVertex);
     59        }
     60
     61        if (!foundVerticesSet.contains(vertexIdMap.get(normalizedMdSelfLink))) {
     62            graph.addVertex(vertexIdMap.get(normalizedMdSelfLink));
     63            foundVerticesSet.add(vertexIdMap.get(normalizedMdSelfLink));
    6364        } else {
    6465            LOG.debug("Duplicate resource vertex mdSelfLink: " + mdSelfLink);
     
    7374     */
    7475    public static void addEdge(String sourceVertexId, String targetVertexId) {
     76        String normalizedSourceVertexId = StringUtils.normalizeIdString(sourceVertexId);
     77        String normalizedTargetVertexId = StringUtils.normalizeIdString(targetVertexId);
     78       
    7579        // add vertices
    76         if (!vertexIdMap.containsKey(sourceVertexId)) {
    77             CmdiVertex sourceVertex = new CmdiVertex(StringUtils.normalizeIdString(sourceVertexId));
    78             vertexIdMap.put(sourceVertexId, sourceVertex);
     80        if (!vertexIdMap.containsKey(normalizedSourceVertexId)) {
     81            CmdiVertex sourceVertex = new CmdiVertex(normalizedSourceVertexId);
     82            vertexIdMap.put(normalizedSourceVertexId, sourceVertex);
    7983            graph.addVertex(sourceVertex);
    8084        }
    8185
    82         if (!vertexIdMap.containsKey(targetVertexId)) {
    83             CmdiVertex targetVertex = new CmdiVertex(StringUtils.normalizeIdString(targetVertexId));
    84             vertexIdMap.put(targetVertexId, targetVertex);
     86        if (!vertexIdMap.containsKey(normalizedTargetVertexId)) {
     87            CmdiVertex targetVertex = new CmdiVertex(normalizedTargetVertexId);
     88            vertexIdMap.put(normalizedTargetVertexId, targetVertex);
    8589            graph.addVertex(targetVertex);
    8690        }
     
    8892        // add edge
    8993        try {
    90             graph.addEdge(vertexIdMap.get(sourceVertexId), vertexIdMap.get(targetVertexId));
    91             updateDepthValues(vertexIdMap.get(sourceVertexId), new HashSet<CmdiVertex>());
     94            graph.addEdge(vertexIdMap.get(normalizedSourceVertexId), vertexIdMap.get(normalizedTargetVertexId));
     95            updateDepthValues(vertexIdMap.get(normalizedSourceVertexId), new HashSet<CmdiVertex>());
    9296        } catch (IllegalArgumentException cfe) {
    9397            // was a cycle -> ignore
     
    162166        return foundVerticesSet;
    163167    }
     168   
     169    public static CmdiVertex getVertex(String vertexId) {
     170        return vertexIdMap.get(vertexId);       
     171    }
     172   
     173    public static Map<String, CmdiVertex> getVertexIdMap() {
     174        return vertexIdMap;
     175    }
    164176
    165177    /**
    166178     * Get all vertices that are source of an edge where targetVertex is target.
    167      * In other words get all vertices that are part of targetVertex.
     179     * In other words get all resource vertices that are part of resource targetVertex.
    168180     *
    169181     * @param targetVertex
     
    187199    /**
    188200     * Get all vertices that are target of an edge where sourceVertex is source.
    189      * In other words get all vertices of which sourceVertex is part of.
     201     * In other words get all resource vertices of which resource sourceVertex is part of.
    190202     *
    191203     * @param sourceVertex
     
    273285    private final String id;
    274286    private int hierarchyWeight = 0;
     287    private boolean wasImported = false;
    275288
    276289    public CmdiVertex(String id) {
     
    288301    public int getHierarchyWeight() {
    289302        return hierarchyWeight;
     303    }
     304   
     305    public void setWasImported(boolean wasImported) {
     306        this.wasImported = wasImported;
     307    }
     308   
     309    public boolean getWasImported() {
     310        return wasImported;
    290311    }
    291312
  • vlo/branches/vlo-3.3-oeaw/vlo-importer/src/test/java/eu/clarin/cmdi/vlo/importer/CMDIDataProcessorTest.java

    r6419 r6760  
    402402        assertEquals("demo", doc.getFieldValue("genre"));
    403403        assertEquals(
    404                 "{lang='eng'}This  recording was made to generate a freely available test resource including speech and gestures. The annotations were created by Peter and Kita who is gesture researcher at the MPI for Psycholinguistics.",
     404                "{code:eng}This  recording was made to generate a freely available test resource including speech and gestures. The annotations were created by Peter and Kita who is gesture researcher at the MPI for Psycholinguistics.",
    405405                doc.getFieldValue("description"));
    406406        assertEquals("2002-10-30", doc.getFieldValue("temporalCoverage"));
     
    519519        assertEquals("Netherlands", doc.getFieldValue("country"));
    520520        assertEquals("demo", doc.getFieldValue("genre"));
    521         assertEquals("{lang='und'}Test.", doc.getFieldValue("description"));
     521        assertEquals("{code:und}Test.", doc.getFieldValue("description"));
    522522        assertEquals("Should be null not empty string", null, doc.getFieldValue("organisation"));
    523523        assertEquals(null, doc.getFieldValue("language"));
     
    597597        List<String> descriptions = new ArrayList(fieldValues);
    598598        Collections.sort(descriptions);
    599         assertEquals("{lang='und'}Channel: Talking;\n    Genre: Traditional Narrative / Story;\n    Country: Panama;\n"
     599        assertEquals("{code:und}Channel: Talking;\n    Genre: Traditional Narrative / Story;\n    Country: Panama;\n"
    600600                + "    Place of Recording: Mulatuppu;\n    Event: Community Gathering;\n"
    601601                + "    Institutional Affiliation: University of Texas at Austin;\n    Participant Information: Political Leader;", descriptions.get(0).toString());
    602         assertEquals("{lang='und'}Test", descriptions.get(1).toString());
    603         assertEquals("{lang='und'}The one-eyed grandmother is one of many traditional Kuna stories performed "
     602        assertEquals("{code:und}Test", descriptions.get(1).toString());
     603        assertEquals("{code:und}The one-eyed grandmother is one of many traditional Kuna stories performed "
    604604                + "in the Kuna gathering house. This story, performed here by Pedro Arias, combines "
    605605                + "European derived motifs (Tom Thumb and Hansel and Gretel) with themes that seem more "
     
    879879        assertEquals(null, doc.getFieldValue("year"));
    880880        assertEquals(null, doc.getFieldValue("genre"));
    881         assertEquals("{lang='eng'}written general; 95 mio words; TEI/SGML", doc.getFieldValue("description"));
     881        assertEquals("{code:eng}written general; 95 mio words; TEI/SGML", doc.getFieldValue("description"));
    882882        assertEquals("corpus", doc.getFieldValue(FacetConstants.FIELD_RESOURCE_CLASS));
    883883        //assertEquals("Written Corpus", doc.getFieldValue(FacetConstants.FIELD_RESOURCE_CLASS));
  • vlo/branches/vlo-3.3-oeaw/vlo-importer/src/test/java/eu/clarin/cmdi/vlo/importer/LanguageCodePostProcessorTest.java

    r6374 r6760  
    11package eu.clarin.cmdi.vlo.importer;
    22
     3import eu.clarin.cmdi.vlo.config.DefaultVloConfigFactory;
    34import static org.junit.Assert.assertEquals;
    4 
     5import org.junit.Before;
    56import org.junit.Test;
    67
    78public class LanguageCodePostProcessorTest extends ImporterTestcase {
     9
     10    @Before
     11    public void setUp() throws Exception {
     12
     13        // read the configuration from the packaged configuration file
     14        new DefaultVloConfigFactory().newConfig();
     15
     16        // optionally, modify the configuration here
     17    }
    818
    919    @Test
     
    1222        assertEquals("code:nld", processor.process("NL").get(0));
    1323        assertEquals("code:eng", processor.process("en").get(0));
    14         assertEquals("code:nld", processor.process("nl").get(0));
    1524        assertEquals("code:fry", processor.process("fry").get(0));
    1625        assertEquals("name:test", processor.process("test").get(0));
     
    2332        assertEquals("code:eng", processor.process("eng").get(0));
    2433        assertEquals("code:eng", processor.process("English").get(0));
    25         assertEquals("code:deu", processor.process("German").get(0));
    2634        assertEquals("code:esn", processor.process("Salvadoran Sign Language").get(0));
    2735        assertEquals("code:eng", processor.process("en_US").get(0));
     
    2937        assertEquals("code:eng", processor.process("ISO639-2:eng").get(0));
    3038        assertEquals("code:spa", processor.process("Spanish, Castilian").get(0));
    31        
    32        
     39        assertEquals("code:ron", processor.process("Romanian").get(0));
    3340    }
    3441}
Note: See TracChangeset for help on using the changeset viewer.