1 | package eu.clarin.cmdi.vlo.importer; |
---|
2 | |
---|
3 | import java.net.URL; |
---|
4 | import java.util.HashMap; |
---|
5 | import java.util.Map; |
---|
6 | |
---|
7 | import javax.xml.parsers.DocumentBuilder; |
---|
8 | import javax.xml.parsers.DocumentBuilderFactory; |
---|
9 | import javax.xml.xpath.XPath; |
---|
10 | import javax.xml.xpath.XPathConstants; |
---|
11 | import javax.xml.xpath.XPathFactory; |
---|
12 | |
---|
13 | import org.slf4j.Logger; |
---|
14 | import org.slf4j.LoggerFactory; |
---|
15 | import org.w3c.dom.Document; |
---|
16 | import org.w3c.dom.Node; |
---|
17 | import org.w3c.dom.NodeList; |
---|
18 | |
---|
19 | import eu.clarin.cmdi.vlo.CommonUtils; |
---|
20 | import java.util.ArrayList; |
---|
21 | import java.util.List; |
---|
22 | import java.util.regex.Matcher; |
---|
23 | import java.util.regex.Pattern; |
---|
24 | |
---|
25 | public class LanguageCodePostProcessor implements PostProcessor{ |
---|
26 | |
---|
27 | private final static Logger LOG = LoggerFactory.getLogger(LanguageCodePostProcessor.class); |
---|
28 | |
---|
29 | protected static final String CODE_PREFIX = "code:"; |
---|
30 | protected static final String LANG_NAME_PREFIX = "name:"; |
---|
31 | protected static final String ISO639_2_PREFIX = "ISO639-2:"; |
---|
32 | protected static final String ISO639_3_PREFIX = "ISO639-3:"; |
---|
33 | protected static final String SIL_CODE_PREFIX = "RFC1766:x-sil-"; |
---|
34 | protected static final String SIL_CODE_PREFIX_alt = "RFC-1766:x-sil-"; |
---|
35 | |
---|
36 | private static final Pattern RFC1766_Pattern = Pattern.compile("^([a-z]{2,3})[-_][a-zA-Z]{2}$"); |
---|
37 | |
---|
38 | /** |
---|
39 | * Returns the language code based on the mapping defined in the CMDI components: See http://trac.clarin.eu/ticket/40 for the mapping. |
---|
40 | * If no mapping is found the original value is returned. |
---|
41 | * @param value extracted language value (language code or language name) from CMDI file |
---|
42 | * @return ISO 639-3 code |
---|
43 | */ |
---|
44 | @Override |
---|
45 | public List<String> process(String value) { |
---|
46 | List<String> resultList = new ArrayList<String>(); |
---|
47 | |
---|
48 | if (value != null) |
---|
49 | resultList.add(extractLanguageCode(value)); |
---|
50 | else |
---|
51 | resultList.add(null); |
---|
52 | return resultList; |
---|
53 | } |
---|
54 | |
---|
55 | protected String extractLanguageCode(String value) { |
---|
56 | String result = value; |
---|
57 | |
---|
58 | result = result.replaceFirst(ISO639_2_PREFIX, "").replaceFirst(ISO639_3_PREFIX, "").replaceFirst(SIL_CODE_PREFIX, "").replaceFirst(SIL_CODE_PREFIX_alt, ""); |
---|
59 | |
---|
60 | // input is already ISO 639-3? |
---|
61 | if(LanguageCodeUtils.getIso639ToLanguageNameMap().keySet().contains(result.toUpperCase())) |
---|
62 | return CODE_PREFIX + result.toLowerCase(); |
---|
63 | |
---|
64 | // input is 2-letter code -> map to ISO 639-3 |
---|
65 | if(LanguageCodeUtils.getSilToIso639Map().containsKey(result.toLowerCase())) { |
---|
66 | return CODE_PREFIX + LanguageCodeUtils.getSilToIso639Map().get(result.toLowerCase()); |
---|
67 | } |
---|
68 | |
---|
69 | if(LanguageCodeUtils.getLanguageNameToIso639Map().containsKey(result)) { // (english) language name? |
---|
70 | return CODE_PREFIX + LanguageCodeUtils.getLanguageNameToIso639Map().get(result); |
---|
71 | } |
---|
72 | |
---|
73 | // convert ISO 639-2/T codes to ISO 639-3 |
---|
74 | if (LanguageCodeUtils.getIso6392TToISO6393Map().containsKey(result.toLowerCase())) { |
---|
75 | return CODE_PREFIX + LanguageCodeUtils.getIso6392TToISO6393Map().get(result.toLowerCase()); |
---|
76 | } |
---|
77 | |
---|
78 | Matcher matcher = RFC1766_Pattern.matcher(result); |
---|
79 | if(matcher.find()) { |
---|
80 | return extractLanguageCode(matcher.group(1)); |
---|
81 | } |
---|
82 | |
---|
83 | // language code not identified? -> language name |
---|
84 | if(!result.equals("")) |
---|
85 | result = LANG_NAME_PREFIX + result; |
---|
86 | return result; |
---|
87 | } |
---|
88 | } |
---|