1 | package eu.clarin.cmdi.vlo.importer; |
---|
2 | |
---|
3 | import java.io.File; |
---|
4 | import java.io.FileInputStream; |
---|
5 | import java.io.IOException; |
---|
6 | import java.util.List; |
---|
7 | |
---|
8 | import javax.xml.parsers.DocumentBuilder; |
---|
9 | import javax.xml.parsers.DocumentBuilderFactory; |
---|
10 | import javax.xml.parsers.ParserConfigurationException; |
---|
11 | import javax.xml.xpath.XPath; |
---|
12 | import javax.xml.xpath.XPathConstants; |
---|
13 | import javax.xml.xpath.XPathExpressionException; |
---|
14 | import javax.xml.xpath.XPathFactory; |
---|
15 | |
---|
16 | import org.slf4j.Logger; |
---|
17 | import org.slf4j.LoggerFactory; |
---|
18 | import org.w3c.dom.Document; |
---|
19 | import org.w3c.dom.Node; |
---|
20 | import org.w3c.dom.NodeList; |
---|
21 | import org.xml.sax.InputSource; |
---|
22 | import org.xml.sax.SAXException; |
---|
23 | |
---|
24 | public class CMDIDigester { |
---|
25 | private final static Logger LOG = LoggerFactory.getLogger(CMDIDigester.class); |
---|
26 | private final FacetMapping facetMapping; |
---|
27 | // private XMLReader xmlReader; |
---|
28 | private DocumentBuilder builder; |
---|
29 | |
---|
30 | public CMDIDigester(FacetMapping facetMapping) { |
---|
31 | this.facetMapping = facetMapping; |
---|
32 | DocumentBuilderFactory domFactory = DocumentBuilderFactory.newInstance(); |
---|
33 | domFactory.setNamespaceAware(true); |
---|
34 | try { |
---|
35 | builder = domFactory.newDocumentBuilder(); |
---|
36 | } catch (ParserConfigurationException e) { |
---|
37 | throw new RuntimeException("Cannot instantiate documentBuilder:", e); |
---|
38 | } |
---|
39 | // try { |
---|
40 | // xmlReader = createXmlReader(); |
---|
41 | // } catch (SAXException e) { |
---|
42 | // throw new RuntimeException("Cannot instantiate xmlReader:", e); |
---|
43 | // } |
---|
44 | } |
---|
45 | |
---|
46 | public CMDIData process(File file) throws IOException, SAXException, XPathExpressionException { |
---|
47 | CMDIData result = null; |
---|
48 | InputSource inputSource = new InputSource(new FileInputStream(file)); |
---|
49 | inputSource.setSystemId(file.toString()); |
---|
50 | XPath xpath = XPathFactory.newInstance().newXPath(); |
---|
51 | result = createCMDIData(xpath, inputSource); |
---|
52 | |
---|
53 | /** |
---|
54 | * Do not reuse the digester it holds state on bad parses. We can reuse the xmlReader. Creating a new Digester or reusing an |
---|
55 | * instance gives similar performance. |
---|
56 | * @see org.apache.commons.digester.Digester |
---|
57 | */ |
---|
58 | //result = (CMDIData) createDigester().parse(inputSource); |
---|
59 | return result; |
---|
60 | } |
---|
61 | |
---|
62 | private CMDIData createCMDIData(XPath xpath, InputSource inputSource) throws XPathExpressionException, SAXException, IOException { |
---|
63 | CMDIData result = new CMDIData(); |
---|
64 | Document doc = builder.parse(inputSource); |
---|
65 | Node node = (Node) xpath.evaluate(facetMapping.getIdMapping(), doc, XPathConstants.NODE); |
---|
66 | if (node != null) { |
---|
67 | result.setId(node.getNodeValue()); |
---|
68 | } |
---|
69 | NodeList nodes = (NodeList) xpath.evaluate("CMD/Resources/ResourceProxyList/ResourceProxy", doc, XPathConstants.NODESET); |
---|
70 | for (int i = 0; i < nodes.getLength(); i++) { |
---|
71 | Node resourceNode = nodes.item(i); |
---|
72 | Node ref = (Node) xpath.evaluate("ResourceRef/text()", resourceNode, XPathConstants.NODE); |
---|
73 | Node type = (Node) xpath.evaluate("ResourceType/text()", resourceNode, XPathConstants.NODE); |
---|
74 | if (ref != null && type != null) { |
---|
75 | result.addResource(ref.getNodeValue(), type.getNodeValue()); |
---|
76 | } |
---|
77 | } |
---|
78 | List<FacetConfiguration> facetList = facetMapping.getFacets(); |
---|
79 | for (FacetConfiguration facetConfiguration : facetList) { |
---|
80 | matchDocumentField(result, facetConfiguration, doc, xpath); |
---|
81 | } |
---|
82 | return result; |
---|
83 | } |
---|
84 | |
---|
85 | private void matchDocumentField(CMDIData result, FacetConfiguration facetConfig, Document doc, XPath xpath) |
---|
86 | throws XPathExpressionException { |
---|
87 | NodeList nodes = (NodeList) xpath.evaluate(facetConfig.getPattern(), doc, XPathConstants.NODESET); |
---|
88 | if (nodes != null) { |
---|
89 | for (int i = 0; i < nodes.getLength(); i++) { |
---|
90 | result.addDocField(facetConfig.getName(), nodes.item(i).getNodeValue(), facetConfig.isCaseInsensitive()); |
---|
91 | } |
---|
92 | } // else do nothing it is perfectly acceptable that not all data is in a cmdi file so not everything will be matched. E.G xpath expression evaluation CMDI session files will never match on CMD corpus files. |
---|
93 | } |
---|
94 | |
---|
95 | // private Digester createDigester() { |
---|
96 | // Digester digester = new Digester(xmlReader); |
---|
97 | // digester.setValidating(false); |
---|
98 | // digester.addObjectCreate("CMD", CMDIData.class); |
---|
99 | // digester.addBeanPropertySetter(facetMapping.getIdMapping(), "id"); |
---|
100 | // digester.addCallMethod("CMD/Resources/ResourceProxyList/ResourceProxy/", "addResource", 2); |
---|
101 | // digester.addCallParam("CMD/Resources/ResourceProxyList/ResourceProxy/ResourceRef", 0); |
---|
102 | // digester.addCallParam("CMD/Resources/ResourceProxyList/ResourceProxy/ResourceType", 1); |
---|
103 | // // Map<String, String> facetMap = facetMapping.getFacetMap(); |
---|
104 | // // for (String facet : facetMap.keySet()) { |
---|
105 | // // matchDocumentField(digester, facetMap.get(facet), facet); |
---|
106 | // // } |
---|
107 | // return digester; |
---|
108 | // } |
---|
109 | // |
---|
110 | // private void matchDocumentField(Digester digester, String pattern, String fieldName) { |
---|
111 | // String[] split = pattern.split(",@", 2); |
---|
112 | // String path = split[0]; |
---|
113 | // String attribute = split.length == 2 ? split[1] : null; |
---|
114 | // digester.addCallMethod(path, "addDocField", 2); |
---|
115 | // digester.addObjectParam(path, 0, fieldName); |
---|
116 | // digester.addCallParam(path, 1, attribute); |
---|
117 | // } |
---|
118 | // |
---|
119 | // private XMLReader createXmlReader() throws SAXException { |
---|
120 | // XMLReader xmlReader = XMLReaderFactory.createXMLReader(); |
---|
121 | // xmlReader.setFeature("http://xml.org/sax/features/validation", true); |
---|
122 | // xmlReader.setFeature("http://xml.org/sax/features/namespaces", true); |
---|
123 | // xmlReader.setProperty("http://java.sun.com/xml/jaxp/properties/schemaLanguage", "http://www.w3.org/2001/XMLSchema"); |
---|
124 | // return xmlReader; |
---|
125 | // } |
---|
126 | |
---|
127 | } |
---|