1 | package clarind.fcs; |
---|
2 | |
---|
3 | import java.io.InputStream; |
---|
4 | import java.net.URL; |
---|
5 | import java.util.*; |
---|
6 | import javax.xml.parsers.DocumentBuilder; |
---|
7 | import javax.xml.parsers.DocumentBuilderFactory; |
---|
8 | import javax.xml.xpath.XPath; |
---|
9 | import javax.xml.xpath.XPathConstants; |
---|
10 | import javax.xml.xpath.XPathExpressionException; |
---|
11 | import javax.xml.xpath.XPathFactory; |
---|
12 | import org.w3c.dom.Node; |
---|
13 | import org.w3c.dom.NodeList; |
---|
14 | |
---|
15 | public class Harvester { |
---|
16 | |
---|
17 | final String crStartpoint = "http://130.183.206.32/restxml/"; |
---|
18 | |
---|
19 | private NodeList evaluateXPath(String statement, org.w3c.dom.Document domtree){ |
---|
20 | NodeList result = null; |
---|
21 | |
---|
22 | XPath xpath = XPathFactory.newInstance().newXPath(); |
---|
23 | try { |
---|
24 | result = (NodeList) xpath.evaluate(statement, domtree, XPathConstants.NODESET); |
---|
25 | } catch (XPathExpressionException ex) { |
---|
26 | System.out.println(ex.getMessage()); |
---|
27 | } |
---|
28 | return result; |
---|
29 | } |
---|
30 | |
---|
31 | public String evaluateXPathToString(String statement, org.w3c.dom.Document domtree) { |
---|
32 | String result = null; |
---|
33 | |
---|
34 | XPath xpath = XPathFactory.newInstance().newXPath(); |
---|
35 | try { |
---|
36 | result = (String) xpath.evaluate(statement, domtree, XPathConstants.STRING); |
---|
37 | } catch (XPathExpressionException ex) { |
---|
38 | System.out.println(ex.getMessage()); |
---|
39 | } |
---|
40 | return result; |
---|
41 | } |
---|
42 | |
---|
43 | |
---|
44 | public ArrayList<Endpoint> getEndpoints() throws Exception { |
---|
45 | ArrayList<Endpoint> ep = new ArrayList<Endpoint>(); |
---|
46 | |
---|
47 | URL u = new URL(crStartpoint); |
---|
48 | InputStream is = u.openStream(); |
---|
49 | |
---|
50 | DocumentBuilderFactory factory = DocumentBuilderFactory.newInstance(); |
---|
51 | DocumentBuilder builder = factory.newDocumentBuilder(); |
---|
52 | org.w3c.dom.Document document = builder.parse(is); |
---|
53 | |
---|
54 | is.close(); |
---|
55 | String instituteName = evaluateXPathToString("//Name", document); |
---|
56 | |
---|
57 | NodeList institutionsUrls = evaluateXPath("//Center_id_link", document); |
---|
58 | |
---|
59 | int i, i2; |
---|
60 | |
---|
61 | for(i=0; i<institutionsUrls.getLength();i++){ |
---|
62 | u = new URL(institutionsUrls.item(i).getTextContent()); |
---|
63 | is = u.openStream(); |
---|
64 | |
---|
65 | org.w3c.dom.Document doc = builder.parse(is); |
---|
66 | is.close(); |
---|
67 | ////WebReference[./Description[text()="CQL"]]/Website |
---|
68 | |
---|
69 | NodeList endpointsUrls = evaluateXPath("//WebReference[./Description[text()=\"CQL\"]]/Website", doc); |
---|
70 | |
---|
71 | for(i2=0; i2<endpointsUrls.getLength();i2++){ |
---|
72 | String epUrl = endpointsUrls.item(i2).getTextContent(); |
---|
73 | ep.add(new Endpoint(epUrl, instituteName)); |
---|
74 | } // for i2 |
---|
75 | |
---|
76 | } // for i ... |
---|
77 | |
---|
78 | |
---|
79 | return ep; |
---|
80 | } //getEndpoints |
---|
81 | |
---|
82 | |
---|
83 | public ArrayList<String> getCorporaOfAnEndpoint(String endpointUrl) throws Exception { |
---|
84 | System.out.println("getCorporaOfAnEndpoint: " + endpointUrl); |
---|
85 | ArrayList<String> corpora = new ArrayList<String>(); |
---|
86 | |
---|
87 | URL u = new URL(endpointUrl + "?operation=scan&scanClause=fcs.resource"); |
---|
88 | InputStream is = u.openStream(); |
---|
89 | |
---|
90 | DocumentBuilderFactory factory = DocumentBuilderFactory.newInstance(); |
---|
91 | DocumentBuilder builder = factory.newDocumentBuilder(); |
---|
92 | org.w3c.dom.Document document = builder.parse(is); |
---|
93 | |
---|
94 | is.close(); |
---|
95 | |
---|
96 | //http://clarinws.informatik.uni-leipzig.de:8080/CQL? |
---|
97 | |
---|
98 | NodeList corporaNodes = evaluateXPath("//*[local-name()='term']/*[local-name()='value']", document); |
---|
99 | |
---|
100 | int i, i2; |
---|
101 | |
---|
102 | for(i=0; i<corporaNodes.getLength();i++){ |
---|
103 | corpora.add(corporaNodes.item(i).getTextContent()); |
---|
104 | |
---|
105 | } // for i ... |
---|
106 | return corpora; |
---|
107 | } // getCorporaOfAnEndpoint |
---|
108 | |
---|
109 | |
---|
110 | public static void main (String[] args) throws Exception { |
---|
111 | Harvester cr = new Harvester(); |
---|
112 | ArrayList<Endpoint> ep = cr.getEndpoints(); |
---|
113 | |
---|
114 | int i; |
---|
115 | |
---|
116 | for(i=0; i<ep.size();i++){ |
---|
117 | System.out.println(ep.get(i).getInstitution() + " " + ep.get(i).getUrl()); |
---|
118 | } // for i ... |
---|
119 | |
---|
120 | |
---|
121 | } |
---|
122 | |
---|
123 | |
---|
124 | |
---|
125 | |
---|
126 | } |
---|