1 | package eu.clarin.cmdi.vlo.importer; |
---|
2 | |
---|
3 | import java.io.File; |
---|
4 | import java.io.FileInputStream; |
---|
5 | import java.io.IOException; |
---|
6 | import java.net.URI; |
---|
7 | import java.util.ArrayList; |
---|
8 | import java.util.Arrays; |
---|
9 | import java.util.HashMap; |
---|
10 | import java.util.LinkedList; |
---|
11 | import java.util.List; |
---|
12 | import java.util.Map; |
---|
13 | import java.util.Map.Entry; |
---|
14 | import java.util.regex.Matcher; |
---|
15 | import java.util.regex.Pattern; |
---|
16 | |
---|
17 | import org.apache.commons.io.IOUtils; |
---|
18 | import org.slf4j.Logger; |
---|
19 | import org.slf4j.LoggerFactory; |
---|
20 | |
---|
21 | import com.ximpleware.AutoPilot; |
---|
22 | import com.ximpleware.NavException; |
---|
23 | import com.ximpleware.VTDException; |
---|
24 | import com.ximpleware.VTDGen; |
---|
25 | import com.ximpleware.VTDNav; |
---|
26 | import com.ximpleware.XPathEvalException; |
---|
27 | import com.ximpleware.XPathParseException; |
---|
28 | |
---|
29 | import eu.clarin.cmdi.vlo.FacetConstants; |
---|
30 | |
---|
31 | public class CMDIParserVTDXML implements CMDIDataProcessor { |
---|
32 | |
---|
33 | private final Map<String, PostProcessor> postProcessors; |
---|
34 | private final Boolean useLocalXSDCache; |
---|
35 | private static final Pattern PROFILE_ID_PATTERN = Pattern.compile(".*(clarin.eu:cr1:p_[0-9]+).*"); |
---|
36 | private final static Logger LOG = LoggerFactory.getLogger(CMDIParserVTDXML.class); |
---|
37 | |
---|
38 | private static final String DEFAULT_LANGUAGE = "code:und"; |
---|
39 | |
---|
40 | public CMDIParserVTDXML(Map<String, PostProcessor> postProcessors, Boolean useLocalXSDCache) { |
---|
41 | this.postProcessors = postProcessors; |
---|
42 | this.useLocalXSDCache = useLocalXSDCache; |
---|
43 | } |
---|
44 | |
---|
45 | @Override |
---|
46 | public CMDIData process(File file) throws VTDException, IOException { |
---|
47 | CMDIData cmdiData = new CMDIData(); |
---|
48 | VTDGen vg = new VTDGen(); |
---|
49 | FileInputStream fileInputStream = new FileInputStream(file); |
---|
50 | vg.setDoc(IOUtils.toByteArray(fileInputStream)); |
---|
51 | vg.parse(true); |
---|
52 | fileInputStream.close(); |
---|
53 | |
---|
54 | VTDNav nav = vg.getNav(); |
---|
55 | String profileId = extractXsd(nav.cloneNav()); |
---|
56 | cmdiData.setProfileId(profileId); |
---|
57 | FacetMapping facetMapping = getFacetMapping(profileId); |
---|
58 | |
---|
59 | if (facetMapping.getFacets().isEmpty()) { |
---|
60 | LOG.error("Problems mapping facets for file: {}", file.getAbsolutePath()); |
---|
61 | } |
---|
62 | |
---|
63 | nav.toElement(VTDNav.ROOT); |
---|
64 | processResources(cmdiData, nav); |
---|
65 | processFacets(cmdiData, nav, facetMapping); |
---|
66 | return cmdiData; |
---|
67 | } |
---|
68 | |
---|
69 | /** |
---|
70 | * Setting namespace for Autopilot ap |
---|
71 | * |
---|
72 | * @param ap |
---|
73 | */ |
---|
74 | private void setNameSpace(AutoPilot ap) { |
---|
75 | ap.declareXPathNameSpace("c", "http://www.clarin.eu/cmd/"); |
---|
76 | } |
---|
77 | |
---|
78 | /** |
---|
79 | * Extracts valid XML patterns for all facet definitions |
---|
80 | * |
---|
81 | * @param nav VTD Navigator |
---|
82 | * @return the facet mapping used to map meta data to facets |
---|
83 | * @throws VTDException |
---|
84 | */ |
---|
85 | private FacetMapping getFacetMapping(String profileId) throws VTDException { |
---|
86 | if (profileId == null) { |
---|
87 | throw new RuntimeException("Cannot get xsd schema so cannot get a proper mapping. Parse failed!"); |
---|
88 | } |
---|
89 | // final VloConfig config = MetadataImporter.config; |
---|
90 | // final URI facetConceptsFile |
---|
91 | // = FacetConceptsMarshaller.resolveFacetsFile(config.getConfigLocation(), config.getFacetConceptsFile()); |
---|
92 | // final String facetConceptsFilePath = new File(facetConceptsFile).getAbsolutePath(); |
---|
93 | // return FacetMappingFactory.getFacetMapping(facetConceptsFilePath, profileId, useLocalXSDCache); |
---|
94 | |
---|
95 | String facetConceptsFile = MetadataImporter.config.getFacetConceptsFile(); |
---|
96 | |
---|
97 | //resolve against config location? (empty = default location) |
---|
98 | if (facetConceptsFile != null && !facetConceptsFile.isEmpty()) { |
---|
99 | URI configLocation = MetadataImporter.config.getConfigLocation(); |
---|
100 | if (configLocation != null && !configLocation.getScheme().equals("jar")) { |
---|
101 | URI facetConceptsLocation = configLocation.resolve(facetConceptsFile); |
---|
102 | facetConceptsFile = new File(facetConceptsLocation).getAbsolutePath(); |
---|
103 | } |
---|
104 | } |
---|
105 | |
---|
106 | return FacetMappingFactory.getFacetMapping(facetConceptsFile, profileId, useLocalXSDCache); |
---|
107 | } |
---|
108 | |
---|
109 | /** |
---|
110 | * Try two approaches to extract the XSD schema information from the CMDI |
---|
111 | * file |
---|
112 | * |
---|
113 | * @param nav VTD Navigator |
---|
114 | * @return ID of CMDI schema, or null if neither the CMDI header nor the |
---|
115 | * XMLSchema-instance's attributes contained the information |
---|
116 | * @throws VTDException |
---|
117 | */ |
---|
118 | String extractXsd(VTDNav nav) throws VTDException { |
---|
119 | String profileID = getProfileIdFromHeader(nav); |
---|
120 | if (profileID == null) { |
---|
121 | profileID = getProfileIdFromSchemaLocation(nav); |
---|
122 | } |
---|
123 | return profileID; |
---|
124 | } |
---|
125 | |
---|
126 | /** |
---|
127 | * Extract XSD schema information from CMDI header (using element |
---|
128 | * //Header/MdProfile) |
---|
129 | * |
---|
130 | * @param nav VTD Navigator |
---|
131 | * @return ID of CMDI schema, or null if content of //Header/MdProfile |
---|
132 | * element could not be read |
---|
133 | * @throws XPathParseException |
---|
134 | * @throws XPathEvalException |
---|
135 | * @throws NavException |
---|
136 | */ |
---|
137 | private String getProfileIdFromHeader(VTDNav nav) throws XPathParseException, XPathEvalException, NavException { |
---|
138 | nav.toElement(VTDNav.ROOT); |
---|
139 | AutoPilot ap = new AutoPilot(nav); |
---|
140 | setNameSpace(ap); |
---|
141 | ap.selectXPath("/c:CMD/c:Header/c:MdProfile/text()"); |
---|
142 | int index = ap.evalXPath(); |
---|
143 | String profileId = null; |
---|
144 | if (index != -1) { |
---|
145 | profileId = nav.toString(index).trim(); |
---|
146 | } |
---|
147 | return profileId; |
---|
148 | } |
---|
149 | |
---|
150 | /** |
---|
151 | * Extract XSD schema information from schemaLocation or |
---|
152 | * noNamespaceSchemaLocation attributes |
---|
153 | * |
---|
154 | * @param nav VTD Navigator |
---|
155 | * @return ID of CMDI schema, or null if attributes don't exist |
---|
156 | * @throws NavException |
---|
157 | */ |
---|
158 | private String getProfileIdFromSchemaLocation(VTDNav nav) throws NavException { |
---|
159 | String result = null; |
---|
160 | nav.toElement(VTDNav.ROOT); |
---|
161 | int index = nav.getAttrValNS("http://www.w3.org/2001/XMLSchema-instance", "schemaLocation"); |
---|
162 | if (index != -1) { |
---|
163 | String schemaLocation = nav.toNormalizedString(index); |
---|
164 | result = schemaLocation.split(" ")[1]; |
---|
165 | } else { |
---|
166 | index = nav.getAttrValNS("http://www.w3.org/2001/XMLSchema-instance", "noNamespaceSchemaLocation"); |
---|
167 | if (index != -1) { |
---|
168 | result = nav.toNormalizedString(index); |
---|
169 | } |
---|
170 | } |
---|
171 | |
---|
172 | // extract profile ID |
---|
173 | if (result != null) { |
---|
174 | Matcher m = PROFILE_ID_PATTERN.matcher(result); |
---|
175 | if (m.find()) { |
---|
176 | return m.group(1); |
---|
177 | } |
---|
178 | } |
---|
179 | return null; |
---|
180 | } |
---|
181 | |
---|
182 | /** |
---|
183 | * Extract ResourceProxies from ResourceProxyList |
---|
184 | * |
---|
185 | * @param cmdiData representation of the CMDI document |
---|
186 | * @param nav VTD Navigator |
---|
187 | * @throws VTDException |
---|
188 | */ |
---|
189 | private void processResources(CMDIData cmdiData, VTDNav nav) throws VTDException { |
---|
190 | AutoPilot mdSelfLink = new AutoPilot(nav); |
---|
191 | setNameSpace(mdSelfLink); |
---|
192 | mdSelfLink.selectXPath("/c:CMD/c:Header/c:MdSelfLink"); |
---|
193 | String mdSelfLinkString = mdSelfLink.evalXPathToString(); |
---|
194 | ResourceStructureGraph.addResource(mdSelfLinkString); |
---|
195 | |
---|
196 | AutoPilot resourceProxy = new AutoPilot(nav); |
---|
197 | setNameSpace(resourceProxy); |
---|
198 | resourceProxy.selectXPath("/c:CMD/c:Resources/c:ResourceProxyList/c:ResourceProxy"); |
---|
199 | |
---|
200 | AutoPilot resourceRef = new AutoPilot(nav); |
---|
201 | setNameSpace(resourceRef); |
---|
202 | resourceRef.selectXPath("c:ResourceRef"); |
---|
203 | |
---|
204 | AutoPilot resourceType = new AutoPilot(nav); |
---|
205 | setNameSpace(resourceType); |
---|
206 | resourceType.selectXPath("c:ResourceType"); |
---|
207 | |
---|
208 | AutoPilot resourceMimeType = new AutoPilot(nav); |
---|
209 | setNameSpace(resourceMimeType); |
---|
210 | resourceMimeType.selectXPath("c:ResourceType/@mimetype"); |
---|
211 | |
---|
212 | while (resourceProxy.evalXPath() != -1) { |
---|
213 | String ref = resourceRef.evalXPathToString(); |
---|
214 | String type = resourceType.evalXPathToString(); |
---|
215 | String mimeType = resourceMimeType.evalXPathToString(); |
---|
216 | |
---|
217 | if (!ref.equals("") && !type.equals("")) { |
---|
218 | // note that the mime type could be empty |
---|
219 | cmdiData.addResource(ref, type, mimeType); |
---|
220 | } |
---|
221 | |
---|
222 | // resource hierarchy information? |
---|
223 | if (type.toLowerCase().equals("metadata")) { |
---|
224 | ResourceStructureGraph.addEdge(ref, mdSelfLinkString); |
---|
225 | } |
---|
226 | } |
---|
227 | } |
---|
228 | |
---|
229 | /** |
---|
230 | * Extracts facet values according to the facetMapping |
---|
231 | * |
---|
232 | * @param cmdiData representation of the CMDI document |
---|
233 | * @param nav VTD Navigator |
---|
234 | * @param facetMapping the facet mapping used to map meta data to facets |
---|
235 | * @throws VTDException |
---|
236 | */ |
---|
237 | private void processFacets(CMDIData cmdiData, VTDNav nav, FacetMapping facetMapping) throws VTDException { |
---|
238 | |
---|
239 | List<FacetConfiguration> facetList = facetMapping.getFacets(); |
---|
240 | for (FacetConfiguration config : facetList) { |
---|
241 | boolean matchedPattern = false; |
---|
242 | List<String> patterns = config.getPatterns(); |
---|
243 | for (String pattern : patterns) { |
---|
244 | matchedPattern = matchPattern(cmdiData, nav, config, pattern, config.getAllowMultipleValues()); |
---|
245 | if (matchedPattern && !config.getAllowMultipleValues()) { |
---|
246 | break; |
---|
247 | } |
---|
248 | } |
---|
249 | |
---|
250 | // using fallback patterns if extraction failed |
---|
251 | if (matchedPattern == false) { |
---|
252 | for (String pattern : config.getFallbackPatterns()) { |
---|
253 | matchedPattern = matchPattern(cmdiData, nav, config, pattern, config.getAllowMultipleValues()); |
---|
254 | if (matchedPattern && !config.getAllowMultipleValues()) { |
---|
255 | break; |
---|
256 | } |
---|
257 | } |
---|
258 | } |
---|
259 | } |
---|
260 | } |
---|
261 | |
---|
262 | /** |
---|
263 | * Extracts content from CMDI file for a specific facet based on a single |
---|
264 | * XPath expression |
---|
265 | * |
---|
266 | * @param cmdiData representation of the CMDI document |
---|
267 | * @param nav VTD Navigator |
---|
268 | * @param config facet configuration |
---|
269 | * @param pattern XPath expression |
---|
270 | * @param allowMultipleValues information if multiple values are allowed in |
---|
271 | * this facet |
---|
272 | * @return pattern matched a node in the CMDI file? |
---|
273 | * @throws VTDException |
---|
274 | */ |
---|
275 | private boolean matchPattern(CMDIData cmdiData, VTDNav nav, FacetConfiguration config, String pattern, Boolean allowMultipleValues) throws VTDException { |
---|
276 | final AutoPilot ap = new AutoPilot(nav); |
---|
277 | setNameSpace(ap); |
---|
278 | ap.selectXPath(pattern); |
---|
279 | |
---|
280 | boolean matchedPattern = false; |
---|
281 | int index = ap.evalXPath(); |
---|
282 | while (index != -1) { |
---|
283 | matchedPattern = true; |
---|
284 | if (nav.getTokenType(index) == VTDNav.TOKEN_ATTR_NAME) { |
---|
285 | //if it is an attribute you need to add 1 to the index to get the right value |
---|
286 | index++; |
---|
287 | } |
---|
288 | final String value = nav.toString(index); |
---|
289 | |
---|
290 | final String languageCode = extractLanguageCode(nav); |
---|
291 | |
---|
292 | // ignore non-English language names for facet LANGUAGE_CODE |
---|
293 | if (config.getName().equals(FacetConstants.FIELD_LANGUAGE_CODE) && !languageCode.equals("code:eng") && !languageCode.equals("code:und")) { |
---|
294 | index = ap.evalXPath(); |
---|
295 | continue; |
---|
296 | } |
---|
297 | |
---|
298 | final List<String> values = postProcess(config.getName(), value); |
---|
299 | |
---|
300 | insertFacetValues(config.getName(), values, cmdiData, languageCode, allowMultipleValues, config.isCaseInsensitive(), true); |
---|
301 | |
---|
302 | //in case of profile name forward normalized value (not profileId) |
---|
303 | crossMap(config, config.getName().equals(FacetConstants.FIELD_CLARIN_PROFILE)? values.get(0) : value, cmdiData, languageCode); |
---|
304 | |
---|
305 | //add also non curated resource type |
---|
306 | if(config.getName().equals(FacetConstants.FIELD_RESOURCE_CLASS)) |
---|
307 | cmdiData.setOriginalResourceType(value); |
---|
308 | |
---|
309 | |
---|
310 | // insert post-processed values into derived facet(s) if configured |
---|
311 | for (String derivedFacet : config.getDerivedFacets()) { |
---|
312 | final List<String> derivedValues = new ArrayList<String>(); |
---|
313 | for (String postProcessedValue : values) { |
---|
314 | derivedValues.addAll(postProcess(derivedFacet, postProcessedValue)); |
---|
315 | } |
---|
316 | insertFacetValues(derivedFacet, derivedValues, cmdiData, languageCode, allowMultipleValues, config.isCaseInsensitive(), true); |
---|
317 | } |
---|
318 | |
---|
319 | index = ap.evalXPath(); |
---|
320 | |
---|
321 | if (!allowMultipleValues) { |
---|
322 | break; |
---|
323 | } |
---|
324 | } |
---|
325 | return matchedPattern; |
---|
326 | } |
---|
327 | |
---|
328 | private String extractLanguageCode(VTDNav nav) throws NavException { |
---|
329 | // extract language code in xml:lang if available |
---|
330 | Integer langAttrIndex = nav.getAttrVal("xml:lang"); |
---|
331 | String languageCode; |
---|
332 | if (langAttrIndex != -1) { |
---|
333 | languageCode = nav.toString(langAttrIndex).trim(); |
---|
334 | } else { |
---|
335 | return DEFAULT_LANGUAGE; |
---|
336 | } |
---|
337 | |
---|
338 | return postProcessors.get(FacetConstants.FIELD_LANGUAGE_CODE).process(languageCode).get(0); |
---|
339 | } |
---|
340 | |
---|
341 | |
---|
342 | /* |
---|
343 | * Add values to facet either they come from MD fields either from cross mapping |
---|
344 | * Advantage is given to the values from MD fields. They will be always at the begging of the list and in case |
---|
345 | * when facet doesn't allow multiple values and we already had value from cross mapping this value will be overridden |
---|
346 | * |
---|
347 | */ |
---|
348 | private void insertFacetValues(String name, List<String> valueList, CMDIData cmdiData, String languageCode, boolean allowMultipleValues, boolean caseInsensitive, boolean comesFromConceptMapping) { |
---|
349 | |
---|
350 | //keep only values from original concepts, not from cross mappings |
---|
351 | if(comesFromConceptMapping && !allowMultipleValues && cmdiData.getSolrDocument() != null && cmdiData.getSolrDocument().containsKey(name)){ |
---|
352 | cmdiData.getSolrDocument().remove(name); |
---|
353 | } |
---|
354 | |
---|
355 | if(!comesFromConceptMapping && !allowMultipleValues && cmdiData.getSolrDocument() != null && cmdiData.getSolrDocument().containsKey(name)) |
---|
356 | return; |
---|
357 | |
---|
358 | for (int i = 0; i < valueList.size(); i++) { |
---|
359 | if (!allowMultipleValues && i > 0) { |
---|
360 | break; |
---|
361 | } |
---|
362 | String fieldValue = valueList.get(i).trim(); |
---|
363 | if (name.equals(FacetConstants.FIELD_DESCRIPTION)) { |
---|
364 | fieldValue = "{" + languageCode + "}" + fieldValue; |
---|
365 | } |
---|
366 | cmdiData.addDocField(name, fieldValue, caseInsensitive); |
---|
367 | } |
---|
368 | } |
---|
369 | |
---|
370 | /** |
---|
371 | * Applies registered PostProcessor to extracted values |
---|
372 | * |
---|
373 | * @param facetName name of the facet for which value was extracted |
---|
374 | * @param extractedValue extracted value from CMDI file |
---|
375 | * @return value after applying matching PostProcessor or the original value |
---|
376 | * if no PostProcessor was registered for the facet |
---|
377 | */ |
---|
378 | private List<String> postProcess(String facetName, String extractedValue) { |
---|
379 | List<String> resultList = new ArrayList<String>(); |
---|
380 | if (postProcessors.containsKey(facetName)) { |
---|
381 | PostProcessor processor = postProcessors.get(facetName); |
---|
382 | resultList = processor.process(extractedValue); |
---|
383 | } else { |
---|
384 | resultList.add(extractedValue); |
---|
385 | } |
---|
386 | return resultList; |
---|
387 | } |
---|
388 | |
---|
389 | private void crossMap(FacetConfiguration config, String extractedValue, CMDIData cmdiData, String languageCode){ |
---|
390 | //skip if not enabled |
---|
391 | if(!MetadataImporter.config.isUseCrossMapping()) |
---|
392 | return; |
---|
393 | |
---|
394 | if (postProcessors.containsKey(config.getName())){ |
---|
395 | PostProcessor processor = postProcessors.get(config.getName()); |
---|
396 | if(processor instanceof PostProcessorsWithVocabularyMap){ |
---|
397 | |
---|
398 | List<String> facetNames = MetadataImporter.config.getAllFacetFields(); |
---|
399 | |
---|
400 | Map<String, String> crossMap = ((PostProcessorsWithVocabularyMap) processor).getCrossMappings(extractedValue); |
---|
401 | if(crossMap != null) |
---|
402 | for(Entry e: crossMap.entrySet()){ |
---|
403 | String toFacet = (String) e.getKey(); |
---|
404 | String value = (String) e.getValue(); |
---|
405 | for(String facetName: facetNames){ |
---|
406 | if(toFacet.toLowerCase().equals(facetName.toLowerCase())){//normalize facet name, map can contain it in any case |
---|
407 | insertFacetValues(facetName, Arrays.asList(value), cmdiData, languageCode, config.getAllowMultipleValues(), config.isCaseInsensitive(), false); |
---|
408 | } |
---|
409 | } |
---|
410 | } |
---|
411 | } |
---|
412 | } |
---|
413 | } |
---|
414 | |
---|
415 | } |
---|