1 | package eu.clarin.cmdi.vlo.importer; |
---|
2 | |
---|
3 | import com.ximpleware.AutoPilot; |
---|
4 | import com.ximpleware.NavException; |
---|
5 | import com.ximpleware.VTDException; |
---|
6 | import com.ximpleware.VTDGen; |
---|
7 | import com.ximpleware.VTDNav; |
---|
8 | import com.ximpleware.XPathEvalException; |
---|
9 | import com.ximpleware.XPathParseException; |
---|
10 | import eu.clarin.cmdi.vlo.FacetConstants; |
---|
11 | import java.io.File; |
---|
12 | import java.io.FileInputStream; |
---|
13 | import java.io.IOException; |
---|
14 | import java.util.ArrayList; |
---|
15 | import java.util.List; |
---|
16 | import java.util.Map; |
---|
17 | import org.apache.commons.io.IOUtils; |
---|
18 | import org.slf4j.Logger; |
---|
19 | import org.slf4j.LoggerFactory; |
---|
20 | |
---|
21 | public class CMDIParserVTDXML implements CMDIDataProcessor { |
---|
22 | private final Map<String, PostProcessor> postProcessors; |
---|
23 | private final static Logger LOG = LoggerFactory.getLogger(CMDIParserVTDXML.class); |
---|
24 | |
---|
25 | private static final String DEFAULT_LANGUAGE = "und"; |
---|
26 | |
---|
27 | public CMDIParserVTDXML(Map<String, PostProcessor> postProcessors) { |
---|
28 | this.postProcessors = postProcessors; |
---|
29 | } |
---|
30 | |
---|
31 | @Override |
---|
32 | public CMDIData process(File file) throws VTDException, IOException { |
---|
33 | CMDIData cmdiData = new CMDIData(); |
---|
34 | VTDGen vg = new VTDGen(); |
---|
35 | FileInputStream fileInputStream = new FileInputStream(file); |
---|
36 | vg.setDoc(IOUtils.toByteArray(fileInputStream)); |
---|
37 | vg.parse(true); |
---|
38 | fileInputStream.close(); |
---|
39 | |
---|
40 | VTDNav nav = vg.getNav(); |
---|
41 | FacetMapping facetMapping = getFacetMapping(nav.cloneNav(), file.getAbsolutePath()); |
---|
42 | |
---|
43 | if(facetMapping.getFacets().isEmpty()){ |
---|
44 | LOG.error("Problems mapping facets for file: {}", file.getAbsolutePath()); |
---|
45 | } |
---|
46 | |
---|
47 | nav.toElement(VTDNav.ROOT); |
---|
48 | processResources(cmdiData, nav); |
---|
49 | processFacets(cmdiData, nav, facetMapping); |
---|
50 | return cmdiData; |
---|
51 | } |
---|
52 | |
---|
53 | /** |
---|
54 | * Setting namespace for Autopilot ap |
---|
55 | * @param ap |
---|
56 | */ |
---|
57 | private void setNameSpace(AutoPilot ap) { |
---|
58 | ap.declareXPathNameSpace("c", "http://www.clarin.eu/cmd/"); |
---|
59 | } |
---|
60 | |
---|
61 | /** |
---|
62 | * Extracts valid XML patterns for all facet definitions |
---|
63 | * @param nav VTD Navigator |
---|
64 | * @param cmdiFilePath Absolute path of the XML file for which nav was created |
---|
65 | * @return the facet mapping used to map meta data to facets |
---|
66 | * @throws VTDException |
---|
67 | */ |
---|
68 | private FacetMapping getFacetMapping(VTDNav nav, String cmdiFilePath) throws VTDException { |
---|
69 | String xsd = extractXsd(nav); |
---|
70 | if (xsd == null) { |
---|
71 | throw new RuntimeException("Cannot get xsd schema so cannot get a proper mapping. Parse failed!"); |
---|
72 | } |
---|
73 | if (xsd.indexOf("http") != xsd.lastIndexOf("http")){ |
---|
74 | LOG.info("No valid CMDI schema URL was extracted. This is an indication of a broken CMDI file (like false content in //MdProfile element). {}", cmdiFilePath); |
---|
75 | } |
---|
76 | String facetConceptsFile = MetadataImporter.config.getFacetConceptsFile(); |
---|
77 | if (facetConceptsFile.length() == 0){ |
---|
78 | // use the packaged facet mapping file |
---|
79 | facetConceptsFile = "/facetConcepts.xml"; |
---|
80 | } |
---|
81 | return FacetMappingFactory.getFacetMapping(facetConceptsFile, xsd); |
---|
82 | } |
---|
83 | |
---|
84 | /** |
---|
85 | * Try two approaches to extract the XSD schema information from the CMDI file |
---|
86 | * @param nav VTD Navigator |
---|
87 | * @return URL of CMDI schema, or null if neither the CMDI header nor the XMLSchema-instance's attributes contained the information |
---|
88 | * @throws VTDException |
---|
89 | */ |
---|
90 | String extractXsd(VTDNav nav) throws VTDException { |
---|
91 | String xsd = getXsdFromHeader(nav); |
---|
92 | if (xsd == null) { |
---|
93 | xsd = getXsdFromSchemaLocation(nav); |
---|
94 | } |
---|
95 | return xsd; |
---|
96 | } |
---|
97 | |
---|
98 | /** |
---|
99 | * Extract XSD schema information from CMDI header (using element //Header/MdProfile) |
---|
100 | * @param nav VTD Navigator |
---|
101 | * @return URL to CMDI schema, or null if content of //Header/MdProfile element could not be read |
---|
102 | * @throws XPathParseException |
---|
103 | * @throws XPathEvalException |
---|
104 | * @throws NavException |
---|
105 | */ |
---|
106 | private String getXsdFromHeader(VTDNav nav) throws XPathParseException, XPathEvalException, NavException { |
---|
107 | String result = null; |
---|
108 | nav.toElement(VTDNav.ROOT); |
---|
109 | AutoPilot ap = new AutoPilot(nav); |
---|
110 | setNameSpace(ap); |
---|
111 | ap.selectXPath("/c:CMD/c:Header/c:MdProfile/text()"); |
---|
112 | int index = ap.evalXPath(); |
---|
113 | if (index != -1) { |
---|
114 | String profileId = nav.toString(index).trim(); |
---|
115 | result = MetadataImporter.config.getComponentRegistryProfileSchema(profileId); |
---|
116 | } |
---|
117 | return result; |
---|
118 | } |
---|
119 | |
---|
120 | /** |
---|
121 | * Extract XSD schema information from schemaLocation or noNamespaceSchemaLocation attributes |
---|
122 | * @param nav VTD Navigator |
---|
123 | * @return URL to CMDI schema, or null if attributes don't exist |
---|
124 | * @throws NavException |
---|
125 | */ |
---|
126 | private String getXsdFromSchemaLocation(VTDNav nav) throws NavException { |
---|
127 | String result = null; |
---|
128 | nav.toElement(VTDNav.ROOT); |
---|
129 | int index = nav.getAttrValNS("http://www.w3.org/2001/XMLSchema-instance", "schemaLocation"); |
---|
130 | if (index != -1) { |
---|
131 | String schemaLocation = nav.toNormalizedString(index); |
---|
132 | result = schemaLocation.split(" ")[1]; |
---|
133 | } else { |
---|
134 | index = nav.getAttrValNS("http://www.w3.org/2001/XMLSchema-instance", "noNamespaceSchemaLocation"); |
---|
135 | if (index != -1) { |
---|
136 | result = nav.toNormalizedString(index); |
---|
137 | } |
---|
138 | } |
---|
139 | return result; |
---|
140 | } |
---|
141 | |
---|
142 | /** |
---|
143 | * Extract ResourceProxies from ResourceProxyList |
---|
144 | * @param cmdiData representation of the CMDI document |
---|
145 | * @param nav VTD Navigator |
---|
146 | * @throws VTDException |
---|
147 | */ |
---|
148 | private void processResources(CMDIData cmdiData, VTDNav nav) throws VTDException { |
---|
149 | |
---|
150 | AutoPilot resourceProxy = new AutoPilot(nav); |
---|
151 | setNameSpace(resourceProxy); |
---|
152 | resourceProxy.selectXPath("/c:CMD/c:Resources/c:ResourceProxyList/c:ResourceProxy"); |
---|
153 | |
---|
154 | AutoPilot resourceRef = new AutoPilot(nav); |
---|
155 | setNameSpace(resourceRef); |
---|
156 | resourceRef.selectXPath("c:ResourceRef"); |
---|
157 | |
---|
158 | AutoPilot resourceType = new AutoPilot(nav); |
---|
159 | setNameSpace(resourceType); |
---|
160 | resourceType.selectXPath("c:ResourceType"); |
---|
161 | |
---|
162 | AutoPilot resourceMimeType = new AutoPilot(nav); |
---|
163 | setNameSpace(resourceMimeType); |
---|
164 | resourceMimeType.selectXPath("c:ResourceType/@mimetype"); |
---|
165 | |
---|
166 | while (resourceProxy.evalXPath() != -1) { |
---|
167 | String ref = resourceRef.evalXPathToString(); |
---|
168 | String type = resourceType.evalXPathToString(); |
---|
169 | String mimeType = resourceMimeType.evalXPathToString(); |
---|
170 | |
---|
171 | if (!ref.equals("") && !type.equals("")) { |
---|
172 | // note that the mime type could be empty |
---|
173 | cmdiData.addResource(ref, type, mimeType); |
---|
174 | } |
---|
175 | } |
---|
176 | } |
---|
177 | |
---|
178 | /** |
---|
179 | * Extracts facet values according to the facetMapping |
---|
180 | * @param cmdiData representation of the CMDI document |
---|
181 | * @param nav VTD Navigator |
---|
182 | * @param facetMapping the facet mapping used to map meta data to facets |
---|
183 | * @throws VTDException |
---|
184 | */ |
---|
185 | private void processFacets(CMDIData cmdiData, VTDNav nav, FacetMapping facetMapping) throws VTDException { |
---|
186 | List<FacetConfiguration> facetList = facetMapping.getFacets(); |
---|
187 | for (FacetConfiguration config : facetList) { |
---|
188 | List<String> patterns = config.getPatterns(); |
---|
189 | for (String pattern : patterns) { |
---|
190 | boolean matchedPattern = matchPattern(cmdiData, nav, config, pattern, config.getAllowMultipleValues()); |
---|
191 | if (matchedPattern && !config.getAllowMultipleValues()) { |
---|
192 | break; |
---|
193 | } |
---|
194 | } |
---|
195 | } |
---|
196 | } |
---|
197 | |
---|
198 | /** |
---|
199 | * Extracts content from CMDI file for a specific facet based on a single XPath expression |
---|
200 | * @param cmdiData representation of the CMDI document |
---|
201 | * @param nav VTD Navigator |
---|
202 | * @param config facet configuration |
---|
203 | * @param pattern XPath expression |
---|
204 | * @param allowMultipleValues information if multiple values are allowed in this facet |
---|
205 | * @return pattern matched a node in the CMDI file? |
---|
206 | * @throws VTDException |
---|
207 | */ |
---|
208 | private boolean matchPattern(CMDIData cmdiData, VTDNav nav, FacetConfiguration config, String pattern, Boolean allowMultipleValues) throws VTDException { |
---|
209 | boolean matchedPattern = false; |
---|
210 | AutoPilot ap = new AutoPilot(nav); |
---|
211 | setNameSpace(ap); |
---|
212 | ap.selectXPath(pattern); |
---|
213 | int index = ap.evalXPath(); |
---|
214 | while (index != -1) { |
---|
215 | matchedPattern = true; |
---|
216 | if (nav.getTokenType(index) == VTDNav.TOKEN_ATTR_NAME) { |
---|
217 | //if it is an attribute you need to add 1 to the index to get the right value |
---|
218 | index++; |
---|
219 | } |
---|
220 | String value = nav.toString(index); |
---|
221 | |
---|
222 | // extract language code in xml:lang if available |
---|
223 | Integer langAttrIndex = nav.getAttrVal("xml:lang"); |
---|
224 | String languageCode = DEFAULT_LANGUAGE; |
---|
225 | if(langAttrIndex != -1) |
---|
226 | languageCode = nav.toString(langAttrIndex).trim(); |
---|
227 | // replace 2-letter with 3-letter codes |
---|
228 | if(LanguageCodeUtils.getSilToIso639Map().containsKey(languageCode)) |
---|
229 | languageCode = LanguageCodeUtils.getSilToIso639Map().get(languageCode); |
---|
230 | |
---|
231 | List<String> valueList = postProcess(config.getName(), value); |
---|
232 | for(int i=0; i<valueList.size(); i++) { |
---|
233 | if(!allowMultipleValues && i>0) |
---|
234 | break; |
---|
235 | String fieldValue = valueList.get(i).trim(); |
---|
236 | if(config.getName().equals(FacetConstants.FIELD_DESCRIPTION)) |
---|
237 | fieldValue = "{lang='"+languageCode+"'}"+fieldValue; |
---|
238 | cmdiData.addDocField(config.getName(), fieldValue, config.isCaseInsensitive()); |
---|
239 | } |
---|
240 | index = ap.evalXPath(); |
---|
241 | |
---|
242 | if(!allowMultipleValues) |
---|
243 | break; |
---|
244 | } |
---|
245 | return matchedPattern; |
---|
246 | } |
---|
247 | |
---|
248 | /** |
---|
249 | * Applies registered PostProcessor to extracted values |
---|
250 | * @param facetName name of the facet for which value was extracted |
---|
251 | * @param extractedValue extracted value from CMDI file |
---|
252 | * @return value after applying matching PostProcessor or the original value if no PostProcessor was registered for the facet |
---|
253 | */ |
---|
254 | private List<String> postProcess(String facetName, String extractedValue) { |
---|
255 | List<String> resultList = new ArrayList<String>(); |
---|
256 | if (postProcessors.containsKey(facetName)) { |
---|
257 | PostProcessor processor = postProcessors.get(facetName); |
---|
258 | resultList = processor.process(extractedValue); |
---|
259 | } else { |
---|
260 | resultList.add(extractedValue); |
---|
261 | } |
---|
262 | return resultList; |
---|
263 | } |
---|
264 | } |
---|