Context Navigation

← Previous Change
Next Change →

Changeset 6708 for vlo

Timestamp:

10/30/15 10:19:12 (9 years ago)

Author:

davor.ostojic@oeaw.ac.at

Message:

reimplementation regarding new use cases

Location:

vlo/branches/vlo-3.3-oeaw/vlo-vocabularies/src/main/java/eu/clarin/cmdi/vlo

Files:

3 edited

main/TransformAllMaps.java (modified) (1 diff)
pojo/Constants.java (modified) (3 diffs)
transformers/CSVTransformer.java (modified) (5 diffs)

Legend:

Unmodified
Added
Removed

vlo/branches/vlo-3.3-oeaw/vlo-vocabularies/src/main/java/eu/clarin/cmdi/vlo/main/TransformAllMaps.java

r6505	r6708
8	8	import eu.clarin.cmdi.vlo.pojo.Constants;
9	9	import eu.clarin.cmdi.vlo.transformers.CSVTransformer;
10		~~import eu.clarin.cmdi.vlo.transformers.XML2CSVTransformer;~~
11	10
12	11	public class TransformAllMaps {

vlo/branches/vlo-3.3-oeaw/vlo-vocabularies/src/main/java/eu/clarin/cmdi/vlo/pojo/Constants.java

-                      r6589
+                      r6708
         //PATHS
         public static final String CSV_PATH = "maps/csv/";
+        public static final String EXCEL_PATH = "maps/excel/";
         public static final String MAPS_PATH = "maps/uniform_maps/";
 …
         public static final String LANGUAGE_CODE = "LanguageNameVariantsMap";
         public static final String LICENCE = "LicenseAvailabilityMap";
         public static final String RESOURCE_TYPE = "resourceTypeMap";
+        public static final String RESOURCE_TYPE = "ACDH_resourceType";
         public static final String NATIONAL_PROJECT = "nationalProjectsMapping";
         public static final String ORGANISATION = "OrganisationControlledVocabulary";
 …
         static{
                 maps.add(LANGUAGE_CODE);
                 maps.add(LICENCE);
                 maps.add(NATIONAL_PROJECT);
                 maps.add(ORGANISATION);
+//              maps.add(LANGUAGE_CODE);
+//              maps.add(LICENCE);
+//              maps.add(NATIONAL_PROJECT);
+//              maps.add(ORGANISATION);
                 maps.add(RESOURCE_TYPE);
+        }

vlo/branches/vlo-3.3-oeaw/vlo-vocabularies/src/main/java/eu/clarin/cmdi/vlo/transformers/CSVTransformer.java

-                      r6631
+                      r6708
 import javax.xml.bind.JAXBException;
+import org.apache.poi.util.ArrayUtil;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 …
                 _logger.info("reading from file {}{}.xml.", Constants.MAPS_PATH, map);
                 InputStream input = new FileInputStream(Constants.MAPS_PATH + map + ".xml");
                 _logger.info("output will be saved into {}{}.csv.", Constants.CSV_PATH, map);
                 OutputStream output = new FileOutputStream(Constants.CSV_PATH + map + ".csv");
+                _logger.info("output will be saved into {}{}.txt.", Constants.CSV_PATH, map);
+                OutputStream output = new FileOutputStream(Constants.CSV_PATH + map + ".txt");
                 xml2csv(input, output);
 …
         public synchronized void csv2xml(String map) throws IOException, JAXBException{
                 _logger.info("reading from file {}{}.csv.", Constants.CSV_PATH, map);
                 InputStream input = new FileInputStream(Constants.CSV_PATH + map + ".csv");
+                _logger.info("reading from file {}{}.txt.", Constants.CSV_PATH, map);
+                InputStream input = new FileInputStream(Constants.CSV_PATH + map + ".txt");
                 _logger.info("output will be saved into {}{}.xml.", Constants.MAPS_PATH, map);
             OutputStream output = new FileOutputStream(Constants.MAPS_PATH + map + ".xml");
 …
     private VariantsMap csv2xmlTransformer(BufferedReader br) throws IOException{
         boolean regExpresionExists = false;
+        boolean remarksExists = false;
         VariantsMap map = new VariantsMap();
         map.setMappings(new ArrayList<Mapping>());
 …
         String line = br.readLine();
+        //process column names
+        if(!line.contains(Constants.NORMALIZED_VALUE))
+                throw new RuntimeException("The first row of CSV file must contain column names");
+        _logger.info("CSV file contains following headers {}", line);
+        for(String col: line.split("\\t")){
+                if(col.equals(Constants.REMARKS))
+                        break;//last column - don't need it in xml map
+                if(col.equals(Constants.REGULAR_EXPRESSION))
+                        regExpresionExists = true;
+                columnNames.add(col.toLowerCase());
+        }
+        while ((line = br.readLine()) != null) {
+                csvRowCnt++;
+                while(line.charAt(line.length() - 1) != '\t'){//line must end with a tab, otherwise means that value contains line break -> &#xA;
+                        _logger.debug("line {} contains line break!", csvRowCnt);
+                        line += "\n" + br.readLine();
+                }
+                String[] tokens = line.split("\\t");
+                String normalizedVal = regExpresionExists? tokens[3] : tokens[2];
+                if(normalizedVal.isEmpty())
+                        throw new RuntimeException("Normalized value (the 4th column) is mandatory");
+                Variant var = null;
+                if(!tokens[0].isEmpty()){ // Organization vocab has normalized values without variants
+                        var = new Variant();
+                        var.setValue(tokens[0]);
+                        _logger.debug("new varinat was found, variant value is {}. ", var.getValue());
+                        //we are setting regEx attr only if it's true
+                        if(regExpresionExists && tokens[1].toLowerCase().equals("true"))
+                                var.setRegExp(true);
+                        List<CrossMapping> crossMappings = new ArrayList<CrossMapping>();
+                        for(int i = regExpresionExists? 4 : 3; i < tokens.length; i++){
+                                if(tokens[i].isEmpty()) //skip when val is null (empty)
+                                        continue;
+        try{
+                //process column names
+                if(!line.contains(Constants.NORMALIZED_VALUE))
+                        throw new RuntimeException("The first row of CSV file must contain column names");
+                _logger.info("CSV file contains following headers {}", line);
+                for(String col: line.split("\\t")){
+                        if(col.toUpperCase().equals(Constants.REMARKS)){
+                                remarksExists = true;
+                                break;//last column - don't need it in xml map
+                        }
+                        if(col.toUpperCase().equals(Constants.REGULAR_EXPRESSION))
+                                regExpresionExists = true;
+                        columnNames.add(col.toLowerCase());
+                }
+                while ((line = br.readLine()) != null) {
+                        csvRowCnt++;
+                        while(!remarksExists && line.charAt(line.length() - 1) != '\t'){//line must end with a tab, otherwise means that value contains line break -> &#xA;
+                                _logger.debug("line {} contains line break!", csvRowCnt);
+                                line += "\n" + br.readLine();
+                        }
+                                CrossMapping crossMapping = new CrossMapping();
+                                crossMapping.setFacet(columnNames.get(i));
+                                crossMapping.setValue(tokens[i]);
+                                crossMappings.add(crossMapping);
+                                _logger.debug("new cross-mapping was found {}:{} ", crossMapping.getFacet(), crossMapping.getValue());
+                        }//end cross-mappings
+                        if(crossMappings.size() > 0)
+                                var.setCrossMappings(crossMappings);
+                }
+                Mapping mapping = processedNormalizedVals.get(normalizedVal);
+                if(mapping == null){
+                        mapping = new Mapping();
+                        mapping.setValue(normalizedVal);
+                        processedNormalizedVals.put(normalizedVal, mapping);
+                        _logger.debug("new mapping was found, normalized value is {}", mapping.getValue());
+                        csvNormValsCnt++;
+                }
+                if(var != null){
+                        mapping.getVariants().add(var);
+                }else{
+                        mapping.setVariants(null); //to skip it in XML
+                }
+            }//end while
+        map.setField(columnNames.get(0));
+        for(String s: processedNormalizedVals.keySet()){
+                map.getMappings().add(processedNormalizedVals.get(s));
+                        String[] tokens = line.split("\\t");
+                        if(remarksExists && tokens.length == columnNames.size() + 1){ //remove last col (remarks)
+                                String[] tokenstWithoutRemarks = new String[tokens.length - 1];
+                                System.arraycopy(tokens, 0, tokenstWithoutRemarks, 0, tokenstWithoutRemarks.length);
+                                tokens = tokenstWithoutRemarks;
+                        }
+                        String normalizedVal = regExpresionExists? tokens[3] : tokens[2];
+                        if(normalizedVal.isEmpty())
+                                throw new RuntimeException("Normalized value (the 4th column) is mandatory");
+                        Variant var = null;
+                        if(!tokens[0].isEmpty()){ // Organization vocab has normalized values without variants
+                                var = new Variant();
+                                var.setValue(tokens[0]);
+                                _logger.debug("new varinat was found, variant value is {}. ", var.getValue());
+                        //we are setting regEx attr only if it's true
+                        if(regExpresionExists && tokens[1].toLowerCase().equals("true"))
+                                var.setRegExp(true);
+                        List<CrossMapping> crossMappings = new ArrayList<CrossMapping>();
+                        for(int i = regExpresionExists? 4 : 3; i < tokens.length; i++){
+                                if(tokens[i].isEmpty()) //skip when val is null (empty)
+                                        continue;
+                                CrossMapping crossMapping = new CrossMapping();
+                                crossMapping.setFacet(columnNames.get(i));
+                                crossMapping.setValue(tokens[i]);
+                                crossMappings.add(crossMapping);
+                                _logger.debug("new cross-mapping was found {}:{} ", crossMapping.getFacet(), crossMapping.getValue());
+                        }//end cross-mappings
+                        if(crossMappings.size() > 0)
+                                var.setCrossMappings(crossMappings);
+                        }
+                        Mapping mapping = processedNormalizedVals.get(normalizedVal);
+                        if(mapping == null){
+                                mapping = new Mapping();
+                                mapping.setValue(normalizedVal);
+                                processedNormalizedVals.put(normalizedVal, mapping);
+                                _logger.debug("new mapping was found, normalized value is {}", mapping.getValue());
+                                csvNormValsCnt++;
+                        }
+                        if(var != null){
+                                mapping.getVariants().add(var);
+                        }else{
+                                mapping.setVariants(null); //to skip it in XML
+                        }
+            }//end while
+                map.setField(columnNames.get(0));
+                for(String s: processedNormalizedVals.keySet()){
+                        map.getMappings().add(processedNormalizedVals.get(s));
+                }
+        }catch(Exception e){
+                _logger.error("Error on line {}", line, e);
+        }

Note: See TracChangeset for help on using the changeset viewer.