Changeset 5228
- Timestamp: 05/19/14 11:38:48 (10 years ago)
- File: 1 edited
Legend:
- Unmodified
- Added
- Removed
-
vlo/trunk/vlo-importer/src/main/java/eu/clarin/cmdi/vlo/importer/CMDIParserVTDXML.java
r5008 r5228 26 26 27 27 @Override 28 29 CMDIData result= new CMDIData();28 public CMDIData process(File file) throws VTDException, IOException { 29 CMDIData cmdiData = new CMDIData(); 30 30 VTDGen vg = new VTDGen(); 31 31 FileInputStream fileInputStream = new FileInputStream(file); 32 32 vg.setDoc(IOUtils.toByteArray(fileInputStream)); 33 33 vg.parse(true); 34 fileInputStream.close(); ;34 fileInputStream.close(); 35 35 36 36 VTDNav nav = vg.getNav(); 37 setNameSpace(nav); //setting namespace once, all other instance of AutoPilot keep the setting (a bit tricky).37 setNameSpace(nav); //setting namespace once, all other instance of AutoPilot keep the setting (a bit tricky). 38 38 FacetMapping facetMapping = getFacetMapping(nav.cloneNav(), file.getAbsolutePath()); 39 /** New nice error log to find erroneous files */ 40 if(facetMapping.getFacets(). size() == 0){41 LOG.error("Problems mapping facets for file: " +file.getAbsolutePath());39 40 if(facetMapping.getFacets().isEmpty()){ 41 LOG.error("Problems mapping facets for file: {}", file.getAbsolutePath()); 42 42 } 43 43 44 44 nav.toElement(VTDNav.ROOT); 45 processResources( result, nav);46 processFacets( result, nav, facetMapping);47 return result;45 processResources(cmdiData, nav); 46 processFacets(cmdiData, nav, facetMapping); 47 return cmdiData; 48 48 } 49 49 … … 53 53 } 54 54 55 private FacetMapping getFacetMapping(VTDNav nav, String tolog) throws VTDException { 55 /** 56 * Extracts valid XML patterns for all facet definitions 57 * @param nav VTD Navigator 58 * @param cmdiFilePath Absolute path of the XML file for which nav was created 59 * @return the facet mapping used to map meta data to facets 60 * @throws VTDException 61 */ 62 private FacetMapping getFacetMapping(VTDNav nav, String cmdiFilePath) throws VTDException { 56 63 String xsd = extractXsd(nav); 57 64 if (xsd == null) { … … 59 66 } 60 67 if (xsd.indexOf("http") != xsd.lastIndexOf("http")){ 61 LOG.info(" FILE WITH WEIRD HTTP THINGY! 
" + tolog);68 LOG.info("No valid CMDI schema URL was extracted. This is an indication of a broken CMDI file (like false content in //MdProfile element). {}", cmdiFilePath); 62 69 } 63 70 String facetConceptsFile = MetadataImporter.config.getFacetConceptsFile(); … … 69 76 } 70 77 78 /** 79 * Try two approaches to extract the XSD schema information from the CMDI file 80 * @param nav VTD Navigator 81 * @return URL of CMDI schema, or null if neither the CMDI header nor the XMLSchema-instance's attributes contained the information 82 * @throws VTDException 83 */ 71 84 String extractXsd(VTDNav nav) throws VTDException { 72 85 String xsd = getXsdFromHeader(nav); … … 77 90 } 78 91 92 /** 93 * Extract XSD schema information from CMDI header (using element //Header/MdProfile) 94 * @param nav VTD Navigator 95 * @return URL to CMDI schema, or null if content of //Header/MdProfile element could not be read 96 * @throws XPathParseException 97 * @throws XPathEvalException 98 * @throws NavException 99 */ 79 100 private String getXsdFromHeader(VTDNav nav) throws XPathParseException, XPathEvalException, NavException { 80 101 String result = null; … … 90 111 } 91 112 113 /** 114 * Extract XSD schema information from schemaLocation or noNamespaceSchemaLocation attributes 115 * @param nav VTD Navigator 116 * @return URL to CMDI schema, or null if attributes don't exist 117 * @throws NavException 118 */ 92 119 private String getXsdFromSchemaLocation(VTDNav nav) throws NavException { 93 120 String result = null; … … 106 133 } 107 134 108 private void processResources(CMDIData result, VTDNav nav) throws VTDException { 135 /** 136 * Extract ResourceProxies from ResourceProxyList 137 * @param cmdiData representation of the CMDI document 138 * @param nav VTD Navigator 139 * @throws VTDException 140 */ 141 private void processResources(CMDIData cmdiData, VTDNav nav) throws VTDException { 109 142 110 143 AutoPilot resourceProxy = new AutoPilot(nav); … … 125 158 if (!ref.equals("") && 
!type.equals("")) { 126 159 // note that the mime type could be empty 127 result.addResource(ref, type, mimeType); 128 } 129 } 130 } 131 132 private void processFacets(CMDIData result, VTDNav nav, FacetMapping facetMapping) throws VTDException { 160 cmdiData.addResource(ref, type, mimeType); 161 } 162 } 163 } 164 165 /** 166 * Extracts facet values according to the facetMapping 167 * @param cmdiData representation of the CMDI document 168 * @param nav VTD Navigator 169 * @param facetMapping the facet mapping used to map meta data to facets 170 * @throws VTDException 171 */ 172 private void processFacets(CMDIData cmdiData, VTDNav nav, FacetMapping facetMapping) throws VTDException { 133 173 List<FacetConfiguration> facetList = facetMapping.getFacets(); 134 174 for (FacetConfiguration config : facetList) { 135 175 List<String> patterns = config.getPatterns(); 136 176 for (String pattern : patterns) { 137 boolean matchedPattern = matchPattern( result, nav, config, pattern, config.getAllowMultipleValues());177 boolean matchedPattern = matchPattern(cmdiData, nav, config, pattern, config.getAllowMultipleValues()); 138 178 if (matchedPattern && !config.getAllowMultipleValues()) { 139 179 break; … … 143 183 } 144 184 145 private boolean matchPattern(CMDIData result, VTDNav nav, FacetConfiguration config, String pattern, Boolean allowMultipleValues) throws VTDException { 185 /** 186 * Extracts content from CMDI file for a specific facet based on a single XPath expression 187 * @param cmdiData representation of the CMDI document 188 * @param nav VTD Navigator 189 * @param config facet configuration 190 * @param pattern XPath expression 191 * @param allowMultipleValues information if multiple values are allowed in this facet 192 * @return pattern matched a node in the CMDI file? 
193 * @throws VTDException 194 */ 195 private boolean matchPattern(CMDIData cmdiData, VTDNav nav, FacetConfiguration config, String pattern, Boolean allowMultipleValues) throws VTDException { 146 196 boolean matchedPattern = false; 147 197 AutoPilot ap = new AutoPilot(nav); … … 156 206 String value = nav.toString(index); 157 207 value = postProcess(config.getName(), value); 158 result.addDocField(config.getName(), value, config.isCaseInsensitive());208 cmdiData.addDocField(config.getName(), value, config.isCaseInsensitive()); 159 209 index = ap.evalXPath(); 160 210 … … 165 215 } 166 216 167 private String postProcess(String name, String value) { 168 String result = value; 169 if (postProcessors.containsKey(name)) { 170 PostProcessor processor = postProcessors.get(name); 171 result = processor.process(value); 217 /** 218 * Applies registered PostProcessor to extracted values 219 * @param facetName name of the facet for which value was extracted 220 * @param extractedValue extracted value from CMDI file 221 * @return value after applying matching PostProcessor or the original value if no PostProcessor was registered for the facet 222 */ 223 private String postProcess(String facetName, String extractedValue) { 224 String result = extractedValue; 225 if (postProcessors.containsKey(facetName)) { 226 PostProcessor processor = postProcessors.get(facetName); 227 result = processor.process(extractedValue); 172 228 } 173 229 return result.trim(); 174 230 } 175 176 231 }
Note: See TracChangeset for help on using the changeset viewer.