Changeset 6142
- Timestamp: 04/13/15 12:37:17 (9 years ago)
- Location: vlo/trunk/vlo-importer/src
- Files: 2 edited
Legend:
- Unmodified
- Added
- Removed
vlo/trunk/vlo-importer/src/main/java/eu/clarin/cmdi/vlo/importer/LanguageCodePostProcessor.java
r5997 r6142 2 2 3 3 import eu.clarin.cmdi.vlo.LanguageCodeUtils; 4 import java.io.IOException; 5 import java.io.InputStream; 6 import java.net.MalformedURLException; 4 7 import org.slf4j.Logger; 5 8 import org.slf4j.LoggerFactory; 6 9 7 10 import java.util.ArrayList; 11 import java.util.HashMap; 8 12 import java.util.List; 13 import java.util.Map; 9 14 import java.util.regex.Matcher; 10 15 import java.util.regex.Pattern; 16 import javax.xml.parsers.DocumentBuilder; 17 import javax.xml.parsers.DocumentBuilderFactory; 18 import javax.xml.parsers.ParserConfigurationException; 19 import javax.xml.xpath.XPath; 20 import javax.xml.xpath.XPathConstants; 21 import javax.xml.xpath.XPathExpressionException; 22 import javax.xml.xpath.XPathFactory; 23 import org.w3c.dom.Document; 24 import org.w3c.dom.Node; 25 import org.w3c.dom.NodeList; 26 import org.xml.sax.SAXException; 11 27 12 28 public class LanguageCodePostProcessor implements PostProcessor{ 13 29 14 30 private final static Logger LOG = LoggerFactory.getLogger(LanguageCodePostProcessor.class); 31 32 private static Map<String, String> languageNameVariantsMap = null; 15 33 16 34 protected static final String CODE_PREFIX = "code:"; … … 42 60 protected String extractLanguageCode(String value) { 43 61 final LanguageCodeUtils languageCodeUtils = MetadataImporter.languageCodeUtils; 44 45 62 String result = value; 46 63 47 64 result = result.replaceFirst(ISO639_2_PREFIX, "").replaceFirst(ISO639_3_PREFIX, "").replaceFirst(SIL_CODE_PREFIX, "").replaceFirst(SIL_CODE_PREFIX_alt, ""); 48 65 66 // map known language name variants to their offical name 67 if(getLanguageNamesVariantsMap().containsKey(result)) 68 result = getLanguageNamesVariantsMap().get(result); 69 49 70 // input is already ISO 639-3? 
50 if(languageCodeUtils.getIso639ToLanguageNameMap().keySet().contains(result.toUpperCase())) 71 if(languageCodeUtils.getIso639ToLanguageNameMap().keySet().contains(result.toUpperCase())) { 51 72 return CODE_PREFIX + result.toLowerCase(); 52 73 } 53 74 // input is 2-letter code -> map to ISO 639-3 54 75 if(languageCodeUtils.getSilToIso639Map().containsKey(result.toLowerCase())) { … … 75 96 return result; 76 97 } 98 99 private Map<String, String> getLanguageNamesVariantsMap() { 100 if (languageNameVariantsMap == null) { 101 try { 102 // load records from file, in the future this should be loaded from CLAVAS directly and the file only used as fallback 103 languageNameVariantsMap = createControlledVocabularyMap(MetadataImporter.config.getLanguageNameVariantsUrl()); 104 } catch (Exception e) { 105 throw new RuntimeException("Cannot instantiate postProcessor:", e); 106 } 107 } 108 return languageNameVariantsMap; 109 } 110 111 private Map<String, String> createControlledVocabularyMap(String languageNamesUrl) throws ParserConfigurationException, MalformedURLException, SAXException, XPathExpressionException, IOException { 112 Map<String, String> result = new HashMap<String, String>(); 113 DocumentBuilderFactory domFactory = DocumentBuilderFactory.newInstance(); 114 domFactory.setNamespaceAware(true); 115 116 InputStream mappingFileAsStream; 117 mappingFileAsStream = NationalProjectPostProcessor.class.getResourceAsStream(languageNamesUrl); 118 119 DocumentBuilder builder = domFactory.newDocumentBuilder(); 120 Document doc = builder.parse(mappingFileAsStream); 121 XPath xpath = XPathFactory.newInstance().newXPath(); 122 NodeList nodeList = (NodeList) xpath.evaluate("//Language", doc, XPathConstants.NODESET); 123 for (int i = 0; i < nodeList.getLength(); i++) { 124 Node node = nodeList.item(i); 125 String languageName = node.getAttributes().getNamedItem("name").getTextContent(); 126 NodeList childNodeList = node.getChildNodes(); 127 for (int j = 0; j < 
childNodeList.getLength(); j++) { 128 String variation = childNodeList.item(j).getTextContent(); 129 result.put(variation, languageName); 130 } 131 } 132 return result; 133 } 77 134 } -
vlo/trunk/vlo-importer/src/test/java/eu/clarin/cmdi/vlo/importer/LanguageCodePostProcessorTest.java
r5779 r6142 38 38 assertEquals("code:nld", processor.process("nl-NL").get(0)); 39 39 assertEquals("code:eng", processor.process("ISO639-2:eng").get(0)); 40 assertEquals("code:spa", processor.process("Spanish, Castilian").get(0)); 40 41 } 41 42 }
Note: See TracChangeset for help on using the changeset viewer.