Changeset 6653


Ignore:
Timestamp:
10/07/15 10:51:24 (9 years ago)
Author:
teckart@informatik.uni-leipzig.de
Message:

More consistent handling of xml:lang attributes for description values. The content's language is now indicated using the same scheme as for the languageCode facet (#780)

Location:
vlo/trunk/vlo-importer/src
Files:
2 edited

Legend:

Unmodified
Added
Removed
  • vlo/trunk/vlo-importer/src/main/java/eu/clarin/cmdi/vlo/importer/CMDIParserVTDXML.java

    r6387 r6653  
    2929    private final static Logger LOG = LoggerFactory.getLogger(CMDIParserVTDXML.class);
    3030
    31     private static final String DEFAULT_LANGUAGE = "und";
     31    private static final String DEFAULT_LANGUAGE = "code:und";
    3232
    3333    public CMDIParserVTDXML(Map<String, PostProcessor> postProcessors, Boolean useLocalXSDCache) {
     
    285285
    286286            // ignore non-English language names for facet LANGUAGE_CODE
    287             if (config.getName().equals(FacetConstants.FIELD_LANGUAGE_CODE) && !languageCode.equals("en") && !languageCode.equals("eng") && !languageCode.equals("und")) {
     287            if (config.getName().equals(FacetConstants.FIELD_LANGUAGE_CODE) && !languageCode.equals("code:eng") && !languageCode.equals("code:und")) {
    288288                index = ap.evalXPath();
    289289                continue;
     
    314314        // extract language code in xml:lang if available
    315315        Integer langAttrIndex = nav.getAttrVal("xml:lang");
    316         String languageCode = DEFAULT_LANGUAGE;
     316        String languageCode;
    317317        if (langAttrIndex != -1) {
    318318            languageCode = nav.toString(langAttrIndex).trim();
    319         }
    320         // replace 2-letter with 3-letter codes
    321         if (MetadataImporter.languageCodeUtils.getSilToIso639Map().containsKey(languageCode)) {
    322             languageCode = MetadataImporter.languageCodeUtils.getSilToIso639Map().get(languageCode);
    323         }
    324         return languageCode;
     319        } else {
     320            return DEFAULT_LANGUAGE;
     321        }
     322
     323        return postProcessors.get(FacetConstants.FIELD_LANGUAGE_CODE).process(languageCode).get(0);
    325324    }
    326325
     
    332331            String fieldValue = valueList.get(i).trim();
    333332            if (name.equals(FacetConstants.FIELD_DESCRIPTION)) {
    334                 fieldValue = "{lang='" + languageCode + "'}" + fieldValue;
     333                fieldValue = "{" + languageCode + "}" + fieldValue;
    335334            }
    336335            cmdiData.addDocField(name, fieldValue, caseInsensitive);
  • vlo/trunk/vlo-importer/src/test/java/eu/clarin/cmdi/vlo/importer/CMDIDataProcessorTest.java

    r6208 r6653  
    402402        assertEquals("demo", doc.getFieldValue("genre"));
    403403        assertEquals(
    404                 "{lang='eng'}This  recording was made to generate a freely available test resource including speech and gestures. The annotations were created by Peter and Kita who is gesture researcher at the MPI for Psycholinguistics.",
     404                "{code:eng}This  recording was made to generate a freely available test resource including speech and gestures. The annotations were created by Peter and Kita who is gesture researcher at the MPI for Psycholinguistics.",
    405405                doc.getFieldValue("description"));
    406406        assertEquals("2002-10-30", doc.getFieldValue("temporalCoverage"));
     
    519519        assertEquals("Netherlands", doc.getFieldValue("country"));
    520520        assertEquals("demo", doc.getFieldValue("genre"));
    521         assertEquals("{lang='und'}Test.", doc.getFieldValue("description"));
     521        assertEquals("{code:und}Test.", doc.getFieldValue("description"));
    522522        assertEquals("Should be null not empty string", null, doc.getFieldValue("organisation"));
    523523        assertEquals(null, doc.getFieldValue("language"));
     
    597597        List<String> descriptions = new ArrayList(fieldValues);
    598598        Collections.sort(descriptions);
    599         assertEquals("{lang='und'}Channel: Talking;\n    Genre: Traditional Narrative / Story;\n    Country: Panama;\n"
     599        assertEquals("{code:und}Channel: Talking;\n    Genre: Traditional Narrative / Story;\n    Country: Panama;\n"
    600600                + "    Place of Recording: Mulatuppu;\n    Event: Community Gathering;\n"
    601601                + "    Institutional Affiliation: University of Texas at Austin;\n    Participant Information: Political Leader;", descriptions.get(0).toString());
    602         assertEquals("{lang='und'}Test", descriptions.get(1).toString());
    603         assertEquals("{lang='und'}The one-eyed grandmother is one of many traditional Kuna stories performed "
     602        assertEquals("{code:und}Test", descriptions.get(1).toString());
     603        assertEquals("{code:und}The one-eyed grandmother is one of many traditional Kuna stories performed "
    604604                + "in the Kuna gathering house. This story, performed here by Pedro Arias, combines "
    605605                + "European derived motifs (Tom Thumb and Hansel and Gretel) with themes that seem more "
     
    878878        assertEquals(null, doc.getFieldValue("year"));
    879879        assertEquals(null, doc.getFieldValue("genre"));
    880         assertEquals("{lang='eng'}written general; 95 mio words; TEI/SGML", doc.getFieldValue("description"));
     880        assertEquals("{code:eng}written general; 95 mio words; TEI/SGML", doc.getFieldValue("description"));
    881881        assertEquals("Written Corpus", doc.getFieldValue(FacetConstants.FIELD_RESOURCE_CLASS));
    882882    }
Note: See TracChangeset for help on using the changeset viewer.