source: vlo/trunk/vlo-importer/src/main/java/eu/clarin/cmdi/vlo/importer/LanguageCodePostProcessor.java @ 5848

Last change on this file since 5848 was 5848, checked in by teckart@informatik.uni-leipzig.de, 10 years ago

Minor refactoring: moving utility methods to new class LanguageCodeUtils

File size: 3.4 KB
Line 
1package eu.clarin.cmdi.vlo.importer;
2
3import java.net.URL;
4import java.util.HashMap;
5import java.util.Map;
6
7import javax.xml.parsers.DocumentBuilder;
8import javax.xml.parsers.DocumentBuilderFactory;
9import javax.xml.xpath.XPath;
10import javax.xml.xpath.XPathConstants;
11import javax.xml.xpath.XPathFactory;
12
13import org.slf4j.Logger;
14import org.slf4j.LoggerFactory;
15import org.w3c.dom.Document;
16import org.w3c.dom.Node;
17import org.w3c.dom.NodeList;
18
19import eu.clarin.cmdi.vlo.CommonUtils;
20import java.util.ArrayList;
21import java.util.List;
22import java.util.regex.Matcher;
23import java.util.regex.Pattern;
24
25public class LanguageCodePostProcessor implements PostProcessor{
26
27    private final static Logger LOG = LoggerFactory.getLogger(LanguageCodePostProcessor.class);
28   
29    protected static final String CODE_PREFIX = "code:";
30    protected static final String LANG_NAME_PREFIX = "name:";
31    protected static final String ISO639_2_PREFIX = "ISO639-2:";
32    protected static final String ISO639_3_PREFIX = "ISO639-3:";
33    protected static final String SIL_CODE_PREFIX = "RFC1766:x-sil-";
34    protected static final String SIL_CODE_PREFIX_alt = "RFC-1766:x-sil-";
35   
36    private static final Pattern RFC1766_Pattern = Pattern.compile("^([a-z]{2,3})[-_][a-zA-Z]{2}$");
37
38    /**
39     * Returns the language code based on the mapping defined in the CMDI components: See http://trac.clarin.eu/ticket/40 for the mapping.
40     * If no mapping is found the original value is returned.
41     * @param value extracted language value (language code or language name) from CMDI file
42     * @return ISO 639-3 code
43     */
44    @Override
45    public List<String> process(String value) {
46        List<String> resultList = new ArrayList<String>();
47       
48        if (value != null)
49            resultList.add(extractLanguageCode(value));
50        else
51            resultList.add(null);
52        return resultList;
53    }
54
55    protected String extractLanguageCode(String value) {
56        String result = value;
57       
58        result = result.replaceFirst(ISO639_2_PREFIX, "").replaceFirst(ISO639_3_PREFIX, "").replaceFirst(SIL_CODE_PREFIX, "").replaceFirst(SIL_CODE_PREFIX_alt, "");
59       
60        // input is already ISO 639-3?
61        if(LanguageCodeUtils.getIso639ToLanguageNameMap().keySet().contains(result.toUpperCase()))
62            return CODE_PREFIX + result.toLowerCase();
63       
64        // input is 2-letter code -> map to ISO 639-3
65        if(LanguageCodeUtils.getSilToIso639Map().containsKey(result.toLowerCase())) {
66            return CODE_PREFIX + LanguageCodeUtils.getSilToIso639Map().get(result.toLowerCase());
67        }
68
69        if(LanguageCodeUtils.getLanguageNameToIso639Map().containsKey(result)) { // (english) language name?
70            return CODE_PREFIX + LanguageCodeUtils.getLanguageNameToIso639Map().get(result);
71        }
72
73        // convert ISO 639-2/T codes to ISO 639-3
74        if (LanguageCodeUtils.getIso6392TToISO6393Map().containsKey(result.toLowerCase())) {
75            return CODE_PREFIX + LanguageCodeUtils.getIso6392TToISO6393Map().get(result.toLowerCase());
76        }
77       
78        Matcher matcher = RFC1766_Pattern.matcher(result);
79        if(matcher.find()) {
80            return extractLanguageCode(matcher.group(1));
81        }
82           
83        // language code not identified? -> language name
84        if(!result.equals(""))
85            result = LANG_NAME_PREFIX + result;
86        return result;
87    }
88}
Note: See TracBrowser for help on using the repository browser.