Changeset 6142


Ignore:
Timestamp:
04/13/15 12:37:17 (9 years ago)
Author:
teckart@informatik.uni-leipzig.de
Message:

Added support for mapping language name variants to "official" language names (#749)

Location:
vlo/trunk/vlo-importer/src
Files:
2 edited

Legend:

Unmodified
Added
Removed
  • vlo/trunk/vlo-importer/src/main/java/eu/clarin/cmdi/vlo/importer/LanguageCodePostProcessor.java

    r5997 r6142  
    22
    33import eu.clarin.cmdi.vlo.LanguageCodeUtils;
     4import java.io.IOException;
     5import java.io.InputStream;
     6import java.net.MalformedURLException;
    47import org.slf4j.Logger;
    58import org.slf4j.LoggerFactory;
    69
    710import java.util.ArrayList;
     11import java.util.HashMap;
    812import java.util.List;
     13import java.util.Map;
    914import java.util.regex.Matcher;
    1015import java.util.regex.Pattern;
     16import javax.xml.parsers.DocumentBuilder;
     17import javax.xml.parsers.DocumentBuilderFactory;
     18import javax.xml.parsers.ParserConfigurationException;
     19import javax.xml.xpath.XPath;
     20import javax.xml.xpath.XPathConstants;
     21import javax.xml.xpath.XPathExpressionException;
     22import javax.xml.xpath.XPathFactory;
     23import org.w3c.dom.Document;
     24import org.w3c.dom.Node;
     25import org.w3c.dom.NodeList;
     26import org.xml.sax.SAXException;
    1127
    1228public class LanguageCodePostProcessor implements PostProcessor{
    1329
    1430    private final static Logger LOG = LoggerFactory.getLogger(LanguageCodePostProcessor.class);
     31   
     32    private static Map<String, String> languageNameVariantsMap = null;
    1533
    1634    protected static final String CODE_PREFIX = "code:";
     
    4260    protected String extractLanguageCode(String value) {
    4361        final LanguageCodeUtils languageCodeUtils = MetadataImporter.languageCodeUtils;
    44        
    4562        String result = value;
    4663       
    4764        result = result.replaceFirst(ISO639_2_PREFIX, "").replaceFirst(ISO639_3_PREFIX, "").replaceFirst(SIL_CODE_PREFIX, "").replaceFirst(SIL_CODE_PREFIX_alt, "");
    4865       
     66        // map known language name variants to their official name
     67        if(getLanguageNamesVariantsMap().containsKey(result))
     68            result = getLanguageNamesVariantsMap().get(result);
     69       
    4970        // input is already ISO 639-3?
    50         if(languageCodeUtils.getIso639ToLanguageNameMap().keySet().contains(result.toUpperCase()))
     71        if(languageCodeUtils.getIso639ToLanguageNameMap().keySet().contains(result.toUpperCase())) {
    5172            return CODE_PREFIX + result.toLowerCase();
    52        
     73        }       
    5374        // input is 2-letter code -> map to ISO 639-3
    5475        if(languageCodeUtils.getSilToIso639Map().containsKey(result.toLowerCase())) {
     
    7596        return result;
    7697    }
     98   
     99        private Map<String, String> getLanguageNamesVariantsMap() {
     100        if (languageNameVariantsMap == null) {
     101            try {
     102                // load records from file, in the future this should be loaded from CLAVAS directly and the file only used as fallback
     103                languageNameVariantsMap = createControlledVocabularyMap(MetadataImporter.config.getLanguageNameVariantsUrl());
     104            } catch (Exception e) {
     105                throw new RuntimeException("Cannot instantiate postProcessor:", e);
     106            }
     107        }
     108        return languageNameVariantsMap;
     109    }
     110
     111    private Map<String, String> createControlledVocabularyMap(String languageNamesUrl) throws ParserConfigurationException, MalformedURLException, SAXException, XPathExpressionException, IOException {
     112        Map<String, String> result = new HashMap<String, String>();
     113        DocumentBuilderFactory domFactory = DocumentBuilderFactory.newInstance();
     114        domFactory.setNamespaceAware(true);
     115
     116        InputStream mappingFileAsStream;
     117        mappingFileAsStream = NationalProjectPostProcessor.class.getResourceAsStream(languageNamesUrl);
     118
     119        DocumentBuilder builder = domFactory.newDocumentBuilder();
     120        Document doc = builder.parse(mappingFileAsStream);
     121        XPath xpath = XPathFactory.newInstance().newXPath();
     122        NodeList nodeList = (NodeList) xpath.evaluate("//Language", doc, XPathConstants.NODESET);
     123        for (int i = 0; i < nodeList.getLength(); i++) {
     124            Node node = nodeList.item(i);
     125            String languageName = node.getAttributes().getNamedItem("name").getTextContent();
     126            NodeList childNodeList = node.getChildNodes();
     127            for (int j = 0; j < childNodeList.getLength(); j++) {
     128                String variation = childNodeList.item(j).getTextContent();
     129                result.put(variation, languageName);
     130            }
     131        }
     132        return result;
     133    }
    77134}
  • vlo/trunk/vlo-importer/src/test/java/eu/clarin/cmdi/vlo/importer/LanguageCodePostProcessorTest.java

    r5779 r6142  
    3838        assertEquals("code:nld", processor.process("nl-NL").get(0));
    3939        assertEquals("code:eng", processor.process("ISO639-2:eng").get(0));
     40        assertEquals("code:spa", processor.process("Spanish, Castilian").get(0));
    4041    }
    4142}
Note: See TracChangeset for help on using the changeset viewer.