Changeset 6020


Ignore:
Timestamp:
02/23/15 15:49:09 (9 years ago)
Author:
teckart@informatik.uni-leipzig.de
Message:

Extended value normalization and some updates on the organisation name file (#683)

Location:
vlo/trunk
Files:
3 edited

Legend:

Unmodified
Added
Removed
  • vlo/trunk/vlo-commons/src/main/resources/OrganisationControlledVocabulary.xml

    r5885 r6020  
    761761        <Variation>University of Helsinki, Department of General Linguistics.</Variation>
    762762    </Organisation>
    763     <Organisation name="MPI for Psycholinguistics, MPG">
    764         <Variation>MPI</Variation>
    765         <Variation>Max Planck Institute for Psycholinguistics / MPG</Variation>
    766         <Variation>MPI for Psycholinguistics</Variation>
    767         <Variation>Max Planck Institute for Psycholinguistics</Variation>
    768         <Variation>MPI Nijmegen</Variation>
    769     </Organisation>
    770763    <Organisation name="Heidelberger Akademie der Wissenschaften, University of Heidelberg">
    771764        <Variation>University of Heidelberg, Heidelberger Akademie der Wissenschaften</Variation>
     
    14881481    <Organisation name="MPI">
    14891482        <Variation>Max Planck Institute</Variation>
     1483        <Variation>Max Planck Institut</Variation>
    14901484    </Organisation>
    14911485    <Organisation name="Finanstilsynet, OEM"/>
     
    18681862        <Variation>Université Paris X- Nanterre</Variation>
    18691863        <Variation>Université Paris X Nanterre</Variation>
    1870         <Variation>Max Planck Institute for Evolutionary Anthropology</Variation>
    18711864    </Organisation>
    18721865    <Organisation name="Goethe University Frankfurt, Institute of Empirical Linguistics">
     
    18791872    <Organisation name="Max Planck Institute for Evolutionary Anthropology">
    18801873        <Variation>Max-Planck-Institute for Evolutionary Anthropology</Variation>
     1874        <Variation>Max Planck Institute for Evolutionary Anthropology</Variation>
    18811875    </Organisation>
    18821876    <Organisation name="National Foreign Language Resource Center, University of Hawai?i"/>
     
    20632057        <Variation>Max Planck Institute for Psycolinguistics</Variation>
    20642058        <Variation>Max Planck Institute for Psycholinguisticsc</Variation>
     2059        <Variation>Max Planck Institut fuer Psycholinguistik, Nijmegen, Nl.</Variation>
     2060        <Variation>Max-Planck-Institut fÃŒr Psycholinguistik</Variation>
     2061        <Variation>Max Planck Institute for Psycholinguistics / MPG</Variation>
     2062        <Variation>MPI for Psycholinguistics</Variation>
     2063        <Variation>MPI for psycholinguistics, Nijmegen</Variation>
     2064        <Variation>Max Planck Institute for Psycholinguistics</Variation>
     2065        <Variation>MPI Nijmegen</Variation>
     2066        <Variation>Max-Planck-Institut fÌr Psycholinguistik</Variation>
     2067        <Variation>Max Planck for Psycholinguisics, Nijmegen, NL</Variation>
     2068        <Variation>Max-Planck-InstitÃŒt fÃŒr Psycholinguïstik</Variation>
     2069        <Variation>Max-Planck-InstitÃŒt fÃŒr Psycholinguïstiek</Variation>
     2070        <Variation>Mac Planck Institute for Psycholinguistics</Variation>
     2071        <Variation>Max Planck Institut fÌr Psychlinguistik</Variation>
     2072        <Variation>Max Planck Institut fÌr Psychlinguistik</Variation>
     2073        <Variation>Max Planck Institute for Psycho-Linguistics</Variation>
     2074        <Variation>MPI for Psycholinguistics (Nijmegen, Netherlands)</Variation>
     2075        <Variation>Max Plank Institute for Psycholinguistics</Variation>
     2076        <Variation>MPI for Psycholinguistics Nijmegen</Variation>
     2077        <Variation>Max-Planck-InstitÃŒt fÃŒr Psucholinguïstik</Variation>
     2078        <Variation>Max-Planck-InstitÃŒt fÃŒr Psychlinguïstik</Variation>
    20652079    </Organisation>
    20662080    <Organisation name="Witwatersrand University Press">
     
    21352149    <Organisation name="Institute of Cybernetics, Tallinn University of Technology">
    21362150        <Variation>Institute of Cybernetics at Tallinn University of Technology</Variation>
    2137     </Organisation>
    2138     <Organisation name="MPI fÃŒr Psycholinguistik">
    2139         <Variation>Max Planck Institut fuer Psycholinguistik, Nijmegen, Nl.</Variation>
    2140         <Variation>Max-Planck-Institut fÃŒr Psycholinguistik</Variation>
    21412151    </Organisation>
    21422152    <Organisation name="Sheldon Press">
  • vlo/trunk/vlo-importer/src/main/java/eu/clarin/cmdi/vlo/importer/OrganisationPostProcessor.java

    r6001 r6020  
    3535    @Override
    3636    public List<String> process(String value) {
    37         String[] splitArray = normalizeString(value).split(";");
     37        String[] splitArray = normalizeInputString(value).split(";");
    3838        for (int i = 0; i < splitArray.length; i++) {
    3939            String orgaName = splitArray[i];
    40             if (getNormalizedOrganisationNamesMap().containsKey(orgaName)) {
    41                 splitArray[i] = getNormalizedOrganisationNamesMap().get(orgaName);
     40            if (getNormalizedOrganisationNamesMap().containsKey(normalizeVariant(orgaName))) {
     41                splitArray[i] = getNormalizedOrganisationNamesMap().get(normalizeVariant(orgaName));
    4242            }
    4343        }
     
    4646    }
    4747   
    48     private String normalizeString(String value) {
     48    private String normalizeInputString(String value) {
    4949        return value.replaceAll("\\s+", " ");
     50    }
     51   
     52    private String normalizeVariant(String key) {
     53        return key.toLowerCase().replaceAll("-", " ");
    5054    }
    5155
     
    7983            NodeList childNodeList = node.getChildNodes();
    8084            for (int j = 0; j < childNodeList.getLength(); j++) {
    81                 String variation = childNodeList.item(j).getTextContent();
     85                String variation = normalizeVariant(childNodeList.item(j).getTextContent());
    8286                result.put(variation, organisationName);
    8387            }
  • vlo/trunk/vlo-importer/src/test/java/eu/clarin/cmdi/vlo/importer/CMDIDataProcessorTest.java

    r5979 r6020  
    397397        assertEquals("English", doc.getFieldValue("language"));
    398398        assertEquals("Netherlands", doc.getFieldValue("country"));
    399         assertEquals("MPI for Psycholinguistics, MPG", doc.getFieldValue("organisation"));
     399        assertEquals("Max Planck Institute for Psycholinguistics", doc.getFieldValue("organisation"));
    400400        assertEquals("demo", doc.getFieldValue("genre"));
    401401        assertEquals(
Note: See TracChangeset for help on using the changeset viewer.