source: vlo/trunk/vlo_webapp/src/main/java/eu/clarin/cmdi/vlo/importer/CMDIDigester.java @ 1007

Last change on this file since 1007 was 1007, checked in by patdui, 13 years ago
  • added resourceType facet and added a list of resources on the show result page.
  • added i18n properties
  • fixed some olac facets supporting multiple patterns to match a facet
File size: 4.6 KB
Line 
1package eu.clarin.cmdi.vlo.importer;
2
3import java.io.File;
4import java.io.FileInputStream;
5import java.io.IOException;
6import java.util.List;
7
8import javax.xml.parsers.DocumentBuilder;
9import javax.xml.parsers.DocumentBuilderFactory;
10import javax.xml.parsers.ParserConfigurationException;
11import javax.xml.xpath.XPath;
12import javax.xml.xpath.XPathConstants;
13import javax.xml.xpath.XPathExpressionException;
14import javax.xml.xpath.XPathFactory;
15
16import org.w3c.dom.Document;
17import org.w3c.dom.NamedNodeMap;
18import org.w3c.dom.Node;
19import org.w3c.dom.NodeList;
20import org.xml.sax.InputSource;
21import org.xml.sax.SAXException;
22
23/**
24 * @deprecated Dom parsing implementation, use the @see CMDIParserVTDXML it is much faster. Keeping this for now just in case we run into
25 *             issues with the vlt parsing. patdui 15 December 2010
26 */
27public class CMDIDigester implements CMDIDataProcessor {
28    private final FacetMapping facetMapping;
29    private DocumentBuilder builder;
30
31    public CMDIDigester(FacetMapping facetMapping) {
32        this.facetMapping = facetMapping;
33        DocumentBuilderFactory domFactory = DocumentBuilderFactory.newInstance();
34        domFactory.setNamespaceAware(true);
35        try {
36            builder = domFactory.newDocumentBuilder();
37        } catch (ParserConfigurationException e) {
38            throw new RuntimeException("Cannot instantiate documentBuilder:", e);
39        }
40    }
41
42    public CMDIData process(File file) throws IOException, SAXException, XPathExpressionException {
43        CMDIData result = null;
44        InputSource inputSource = new InputSource(new FileInputStream(file));
45        inputSource.setSystemId(file.toString());
46        XPath xpath = XPathFactory.newInstance().newXPath();
47        result = createCMDIData(xpath, inputSource);
48        return result;
49    }
50
51    private CMDIData createCMDIData(XPath xpath, InputSource inputSource) throws XPathExpressionException, SAXException, IOException {
52        CMDIData result = new CMDIData();
53        Document doc = builder.parse(inputSource);
54        Node node = (Node) xpath.evaluate(facetMapping.getIdMapping(), doc, XPathConstants.NODE);
55        if (node != null) {
56            result.setId(node.getNodeValue());
57        }
58        NodeList nodes = (NodeList) xpath.evaluate("CMD/Resources/ResourceProxyList/ResourceProxy", doc, XPathConstants.NODESET);
59        for (int i = 0; i < nodes.getLength(); i++) {
60            Node resourceNode = nodes.item(i);
61            Node ref = (Node) xpath.evaluate("ResourceRef/text()", resourceNode, XPathConstants.NODE);
62            Node nodeType = (Node) xpath.evaluate("ResourceType", resourceNode, XPathConstants.NODE);
63            if (ref != null && nodeType != null) {
64                String mimeType = null;
65                NamedNodeMap attributes = nodeType.getAttributes();
66                if (attributes != null) {
67                    Node n = attributes.getNamedItem("mimetype");
68                    if (n != null) {
69                        mimeType = n.getNodeValue();
70                    }
71                }
72                String type = nodeType.getTextContent();
73                result.addResource(ref.getNodeValue(), type, mimeType);
74            }
75        }
76        List<FacetConfiguration> facetList = facetMapping.getFacets();
77        for (FacetConfiguration facetConfiguration : facetList) {
78            matchDocumentField(result, facetConfiguration, doc, xpath);
79        }
80        return result;
81    }
82
83    private void matchDocumentField(CMDIData result, FacetConfiguration facetConfig, Document doc, XPath xpath)
84            throws XPathExpressionException {
85        List<String> patterns = facetConfig.getPatterns();
86        for (String pattern : patterns) {
87            boolean matchedPattern = matchPattern(result, facetConfig, doc, xpath, pattern);
88            if (matchedPattern) {
89                break;
90            }
91        }
92    }
93
94    private boolean matchPattern(CMDIData result, FacetConfiguration facetConfig, Document doc, XPath xpath, String pattern)
95            throws XPathExpressionException {
96        boolean matchedPattern = false;
97        NodeList nodes = (NodeList) xpath.evaluate(pattern, doc, XPathConstants.NODESET);
98        if (nodes != null) {
99            matchedPattern = true;
100            for (int i = 0; i < nodes.getLength(); i++) {
101                result.addDocField(facetConfig.getName(), nodes.item(i).getNodeValue(), facetConfig.isCaseInsensitive());
102            }
103        } // else do nothing it is perfectly acceptable that not all data is in a cmdi file so not everything will be matched. E.G xpath expression evaluation CMDI session files will never match on CMD corpus files.
104        return matchedPattern;
105    }
106}
Note: See TracBrowser for help on using the repository browser.