1 | package clarind.fcs; |
---|
2 | |
---|
3 | import java.io.InputStream; |
---|
4 | import java.net.URL; |
---|
5 | import java.util.*; |
---|
6 | import javax.xml.parsers.DocumentBuilder; |
---|
7 | import javax.xml.parsers.DocumentBuilderFactory; |
---|
8 | import javax.xml.xpath.XPath; |
---|
9 | import javax.xml.xpath.XPathConstants; |
---|
10 | import javax.xml.xpath.XPathExpressionException; |
---|
11 | import javax.xml.xpath.XPathFactory; |
---|
12 | import org.w3c.dom.Node; |
---|
13 | import org.w3c.dom.NodeList; |
---|
14 | |
---|
15 | public class Harvester { |
---|
16 | |
---|
17 | final String crStartpoint = "http://130.183.206.32/restxml/"; |
---|
18 | |
---|
19 | private NodeList evaluateXPath(String statement, org.w3c.dom.Document domtree) { |
---|
20 | NodeList result = null; |
---|
21 | |
---|
22 | XPath xpath = XPathFactory.newInstance().newXPath(); |
---|
23 | try { |
---|
24 | result = (NodeList) xpath.evaluate(statement, domtree, XPathConstants.NODESET); |
---|
25 | } catch (XPathExpressionException ex) { |
---|
26 | System.out.println(ex.getMessage()); |
---|
27 | } |
---|
28 | return result; |
---|
29 | } |
---|
30 | |
---|
31 | public String evaluateXPathToString(String statement, org.w3c.dom.Document domtree) { |
---|
32 | String result = null; |
---|
33 | |
---|
34 | XPath xpath = XPathFactory.newInstance().newXPath(); |
---|
35 | try { |
---|
36 | result = (String) xpath.evaluate(statement, domtree, XPathConstants.STRING); |
---|
37 | } catch (XPathExpressionException ex) { |
---|
38 | System.out.println(ex.getMessage()); |
---|
39 | } |
---|
40 | return result; |
---|
41 | } |
---|
42 | |
---|
43 | public ArrayList<Endpoint> getEndpoints() throws Exception { |
---|
44 | ArrayList<Endpoint> ep = new ArrayList<Endpoint>(); |
---|
45 | |
---|
46 | URL u = new URL(crStartpoint); |
---|
47 | InputStream is = u.openStream(); |
---|
48 | |
---|
49 | DocumentBuilderFactory factory = DocumentBuilderFactory.newInstance(); |
---|
50 | DocumentBuilder builder = factory.newDocumentBuilder(); |
---|
51 | org.w3c.dom.Document document = builder.parse(is); |
---|
52 | |
---|
53 | is.close(); |
---|
54 | String instituteName = evaluateXPathToString("//Name", document); |
---|
55 | |
---|
56 | NodeList institutionsUrls = evaluateXPath("//Center_id_link", document); |
---|
57 | |
---|
58 | int i, i2; |
---|
59 | |
---|
60 | for (i = 0; i < institutionsUrls.getLength(); i++) { |
---|
61 | u = new URL(institutionsUrls.item(i).getTextContent()); |
---|
62 | is = u.openStream(); |
---|
63 | |
---|
64 | org.w3c.dom.Document doc = builder.parse(is); |
---|
65 | is.close(); |
---|
66 | ////WebReference[./Description[text()="CQL"]]/Website |
---|
67 | |
---|
68 | NodeList endpointsUrls = evaluateXPath("//WebReference[./Description[text()=\"CQL\"]]/Website", doc); |
---|
69 | |
---|
70 | for (i2 = 0; i2 < endpointsUrls.getLength(); i2++) { |
---|
71 | String epUrl = endpointsUrls.item(i2).getTextContent(); |
---|
72 | ep.add(new Endpoint(epUrl, instituteName)); |
---|
73 | } // for i2 |
---|
74 | |
---|
75 | } // for i ... |
---|
76 | |
---|
77 | |
---|
78 | return ep; |
---|
79 | } //getEndpoints |
---|
80 | |
---|
81 | public ArrayList<Corpus> getCorporaOfAnEndpoint(String endpointUrl) throws Exception { |
---|
82 | |
---|
83 | ArrayList<Corpus> corpora = new ArrayList<Corpus>(); |
---|
84 | String urlToCall = endpointUrl + "?operation=scan&scanClause=fcs.resource&version=1.2"; |
---|
85 | URL u = new URL(urlToCall); |
---|
86 | |
---|
87 | System.out.println("getCorporaOfAnEndpoint: " + urlToCall); |
---|
88 | |
---|
89 | InputStream is = u.openStream(); |
---|
90 | |
---|
91 | DocumentBuilderFactory factory = DocumentBuilderFactory.newInstance(); |
---|
92 | DocumentBuilder builder = factory.newDocumentBuilder(); |
---|
93 | org.w3c.dom.Document document = builder.parse(is); |
---|
94 | |
---|
95 | is.close(); |
---|
96 | |
---|
97 | //http://clarinws.informatik.uni-leipzig.de:8080/CQL? |
---|
98 | |
---|
99 | //NodeList corporaNodes = evaluateXPath("//*[local-name()='term']/*[local-name()='value']", document); |
---|
100 | NodeList corporaNodes = evaluateXPath("//*[local-name()='terms']/*[local-name()='term']", document); |
---|
101 | |
---|
102 | int i, i2; |
---|
103 | if (corporaNodes.getLength() > 0) { |
---|
104 | |
---|
105 | System.out.println("Length of corpora: " + corporaNodes.getLength()); |
---|
106 | |
---|
107 | for (i = 0; i < corporaNodes.getLength(); i++) { |
---|
108 | Node n = corporaNodes.item(i); |
---|
109 | |
---|
110 | System.out.println("NODENAEM: " + n.getNodeName()); |
---|
111 | |
---|
112 | Corpus c = new Corpus(); |
---|
113 | |
---|
114 | for (i2 = 0; i2 < n.getChildNodes().getLength(); i2++) { |
---|
115 | Node child = n.getChildNodes().item(i2); |
---|
116 | |
---|
117 | if (child.getNodeName().endsWith("value")) { |
---|
118 | c.setValue(child.getTextContent()); |
---|
119 | } |
---|
120 | |
---|
121 | if (child.getNodeName().endsWith("displayTerm")) { |
---|
122 | c.setDisplayTerm(child.getTextContent()); |
---|
123 | } |
---|
124 | |
---|
125 | if (child.getNodeName().endsWith("numberOfRecords")) { |
---|
126 | c.setNumberOfRecords(child.getTextContent()); |
---|
127 | } |
---|
128 | |
---|
129 | } //for i2 |
---|
130 | |
---|
131 | corpora.add(c); |
---|
132 | |
---|
133 | } // for i ... |
---|
134 | } // if coporaNodes ... |
---|
135 | |
---|
136 | System.out.println("------------"); |
---|
137 | return corpora; |
---|
138 | } // getCorporaOfAnEndpoint |
---|
139 | |
---|
140 | public static void main(String[] args) throws Exception { |
---|
141 | Harvester cr = new Harvester(); |
---|
142 | ArrayList<Endpoint> ep = cr.getEndpoints(); |
---|
143 | |
---|
144 | int i; |
---|
145 | |
---|
146 | for (i = 0; i < ep.size(); i++) { |
---|
147 | System.out.println(ep.get(i).getInstitution() + " " + ep.get(i).getUrl()); |
---|
148 | } // for i ... |
---|
149 | |
---|
150 | |
---|
151 | } |
---|
152 | } |
---|