source: vlo/trunk/vlo-importer/src/main/java/eu/clarin/cmdi/vlo/importer/OrganisationPostProcessor.java @ 6020

Last change on this file since 6020 was 6020, checked in by teckart@informatik.uni-leipzig.de, 9 years ago

Extended value normalization and some updates on the organisation name file (#683)

File size: 3.7 KB
Line 
1package eu.clarin.cmdi.vlo.importer;
2
3import java.io.IOException;
4import java.io.InputStream;
5import java.net.MalformedURLException;
6import java.util.Arrays;
7import java.util.HashMap;
8import java.util.List;
9import java.util.Map;
10import javax.xml.parsers.DocumentBuilder;
11import javax.xml.parsers.DocumentBuilderFactory;
12import javax.xml.parsers.ParserConfigurationException;
13import javax.xml.xpath.XPath;
14import javax.xml.xpath.XPathConstants;
15import javax.xml.xpath.XPathExpressionException;
16import javax.xml.xpath.XPathFactory;
17import org.w3c.dom.Document;
18import org.w3c.dom.Node;
19import org.w3c.dom.NodeList;
20import org.xml.sax.SAXException;
21
22public class OrganisationPostProcessor implements PostProcessor {
23
24    private static Map<String, String> organisationNamesMap = null;
25
26    /**
27     * Splits values for organisation facet at delimiter ';' and replaces
28     * organisation name variants with their official name from a controlled
29     * vocabulary
30     *
31     * @param value extracted organisation name/names
32     * @return List of organisation names (splitted at semicolon) and variations
33     * replaced with controlled vocabulary
34     */
35    @Override
36    public List<String> process(String value) {
37        String[] splitArray = normalizeInputString(value).split(";");
38        for (int i = 0; i < splitArray.length; i++) {
39            String orgaName = splitArray[i];
40            if (getNormalizedOrganisationNamesMap().containsKey(normalizeVariant(orgaName))) {
41                splitArray[i] = getNormalizedOrganisationNamesMap().get(normalizeVariant(orgaName));
42            }
43        }
44       
45        return Arrays.asList(splitArray);
46    }
47   
48    private String normalizeInputString(String value) {
49        return value.replaceAll("\\s+", " ");
50    }
51   
52    private String normalizeVariant(String key) {
53        return key.toLowerCase().replaceAll("-", " ");
54    }
55
56    private Map<String, String> getNormalizedOrganisationNamesMap() {
57        if (organisationNamesMap == null) {
58            try {
59                // load records from file, in the future this should be loaded from CLAVAS directly and the file only used as fallback
60                organisationNamesMap = createControlledVocabularyMap(MetadataImporter.config.getOrganisationNamesUrl());
61            } catch (Exception e) {
62                throw new RuntimeException("Cannot instantiate postProcessor:", e);
63            }
64        }
65        return organisationNamesMap;
66    }
67
68    private Map<String, String> createControlledVocabularyMap(String urlToVocabularyFile) throws ParserConfigurationException, MalformedURLException, SAXException, XPathExpressionException, IOException {
69        Map<String, String> result = new HashMap<String, String>();
70        DocumentBuilderFactory domFactory = DocumentBuilderFactory.newInstance();
71        domFactory.setNamespaceAware(true);
72
73        InputStream mappingFileAsStream;
74        mappingFileAsStream = NationalProjectPostProcessor.class.getResourceAsStream(urlToVocabularyFile);
75
76        DocumentBuilder builder = domFactory.newDocumentBuilder();
77        Document doc = builder.parse(mappingFileAsStream);
78        XPath xpath = XPathFactory.newInstance().newXPath();
79        NodeList nodeList = (NodeList) xpath.evaluate("//Organisation", doc, XPathConstants.NODESET);
80        for (int i = 0; i < nodeList.getLength(); i++) {
81            Node node = nodeList.item(i);
82            String organisationName = node.getAttributes().getNamedItem("name").getTextContent();
83            NodeList childNodeList = node.getChildNodes();
84            for (int j = 0; j < childNodeList.getLength(); j++) {
85                String variation = normalizeVariant(childNodeList.item(j).getTextContent());
86                result.put(variation, organisationName);
87            }
88        }
89        return result;
90    }
91}
Note: See TracBrowser for help on using the repository browser.