1 | package eu.clarin.cmdi.vlo.importer; |
---|
2 | |
---|
3 | import java.io.File; |
---|
4 | import java.io.FileInputStream; |
---|
5 | import java.io.IOException; |
---|
6 | import java.util.List; |
---|
7 | |
---|
8 | import javax.xml.parsers.DocumentBuilder; |
---|
9 | import javax.xml.parsers.DocumentBuilderFactory; |
---|
10 | import javax.xml.parsers.ParserConfigurationException; |
---|
11 | import javax.xml.xpath.XPath; |
---|
12 | import javax.xml.xpath.XPathConstants; |
---|
13 | import javax.xml.xpath.XPathExpressionException; |
---|
14 | import javax.xml.xpath.XPathFactory; |
---|
15 | |
---|
16 | import org.w3c.dom.Document; |
---|
17 | import org.w3c.dom.NamedNodeMap; |
---|
18 | import org.w3c.dom.Node; |
---|
19 | import org.w3c.dom.NodeList; |
---|
20 | import org.xml.sax.InputSource; |
---|
21 | import org.xml.sax.SAXException; |
---|
22 | |
---|
23 | /** |
---|
24 | * @deprecated Dom parsing implementation, use the @see CMDIParserVTDXML it is much faster. Keeping this for now just in case we run into |
---|
25 | * issues with the vlt parsing. patdui 15 December 2010 |
---|
26 | */ |
---|
27 | public class CMDIDigester implements CMDIDataProcessor { |
---|
28 | private final FacetMapping facetMapping; |
---|
29 | private DocumentBuilder builder; |
---|
30 | |
---|
31 | public CMDIDigester(FacetMapping facetMapping) { |
---|
32 | this.facetMapping = facetMapping; |
---|
33 | DocumentBuilderFactory domFactory = DocumentBuilderFactory.newInstance(); |
---|
34 | domFactory.setNamespaceAware(true); |
---|
35 | try { |
---|
36 | builder = domFactory.newDocumentBuilder(); |
---|
37 | } catch (ParserConfigurationException e) { |
---|
38 | throw new RuntimeException("Cannot instantiate documentBuilder:", e); |
---|
39 | } |
---|
40 | } |
---|
41 | |
---|
42 | public CMDIData process(File file) throws IOException, SAXException, XPathExpressionException { |
---|
43 | CMDIData result = null; |
---|
44 | InputSource inputSource = new InputSource(new FileInputStream(file)); |
---|
45 | inputSource.setSystemId(file.toString()); |
---|
46 | XPath xpath = XPathFactory.newInstance().newXPath(); |
---|
47 | result = createCMDIData(xpath, inputSource); |
---|
48 | return result; |
---|
49 | } |
---|
50 | |
---|
51 | private CMDIData createCMDIData(XPath xpath, InputSource inputSource) throws XPathExpressionException, SAXException, IOException { |
---|
52 | CMDIData result = new CMDIData(); |
---|
53 | Document doc = builder.parse(inputSource); |
---|
54 | Node node = (Node) xpath.evaluate(facetMapping.getIdMapping(), doc, XPathConstants.NODE); |
---|
55 | if (node != null) { |
---|
56 | result.setId(node.getNodeValue()); |
---|
57 | } |
---|
58 | NodeList nodes = (NodeList) xpath.evaluate("CMD/Resources/ResourceProxyList/ResourceProxy", doc, XPathConstants.NODESET); |
---|
59 | for (int i = 0; i < nodes.getLength(); i++) { |
---|
60 | Node resourceNode = nodes.item(i); |
---|
61 | Node ref = (Node) xpath.evaluate("ResourceRef/text()", resourceNode, XPathConstants.NODE); |
---|
62 | Node nodeType = (Node) xpath.evaluate("ResourceType", resourceNode, XPathConstants.NODE); |
---|
63 | if (ref != null && nodeType != null) { |
---|
64 | String mimeType = null; |
---|
65 | NamedNodeMap attributes = nodeType.getAttributes(); |
---|
66 | if (attributes != null) { |
---|
67 | Node n = attributes.getNamedItem("mimetype"); |
---|
68 | if (n != null) { |
---|
69 | mimeType = n.getNodeValue(); |
---|
70 | } |
---|
71 | } |
---|
72 | String type = nodeType.getTextContent(); |
---|
73 | result.addResource(ref.getNodeValue(), type, mimeType); |
---|
74 | } |
---|
75 | } |
---|
76 | List<FacetConfiguration> facetList = facetMapping.getFacets(); |
---|
77 | for (FacetConfiguration facetConfiguration : facetList) { |
---|
78 | matchDocumentField(result, facetConfiguration, doc, xpath); |
---|
79 | } |
---|
80 | return result; |
---|
81 | } |
---|
82 | |
---|
83 | private void matchDocumentField(CMDIData result, FacetConfiguration facetConfig, Document doc, XPath xpath) |
---|
84 | throws XPathExpressionException { |
---|
85 | List<String> patterns = facetConfig.getPatterns(); |
---|
86 | for (String pattern : patterns) { |
---|
87 | boolean matchedPattern = matchPattern(result, facetConfig, doc, xpath, pattern); |
---|
88 | if (matchedPattern) { |
---|
89 | break; |
---|
90 | } |
---|
91 | } |
---|
92 | } |
---|
93 | |
---|
94 | private boolean matchPattern(CMDIData result, FacetConfiguration facetConfig, Document doc, XPath xpath, String pattern) |
---|
95 | throws XPathExpressionException { |
---|
96 | boolean matchedPattern = false; |
---|
97 | NodeList nodes = (NodeList) xpath.evaluate(pattern, doc, XPathConstants.NODESET); |
---|
98 | if (nodes != null) { |
---|
99 | matchedPattern = true; |
---|
100 | for (int i = 0; i < nodes.getLength(); i++) { |
---|
101 | result.addDocField(facetConfig.getName(), nodes.item(i).getNodeValue(), facetConfig.isCaseInsensitive()); |
---|
102 | } |
---|
103 | } // else do nothing it is perfectly acceptable that not all data is in a cmdi file so not everything will be matched. E.G xpath expression evaluation CMDI session files will never match on CMD corpus files. |
---|
104 | return matchedPattern; |
---|
105 | } |
---|
106 | } |
---|