1 | package eu.clarin.cmdi.oai.harvester; |
---|
2 | |
---|
import java.io.BufferedInputStream;
import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.util.Arrays;
import java.util.Date;
import java.util.List;
import java.util.zip.ZipEntry;
import java.util.zip.ZipException;
import java.util.zip.ZipOutputStream;

import javax.xml.stream.XMLOutputFactory;
import javax.xml.stream.XMLStreamConstants;
import javax.xml.stream.XMLStreamReader;
import javax.xml.stream.XMLStreamWriter;

import eu.clarin.cmdi.oai.harvester.HarvestHandlerAdapter;
import eu.clarin.cmdi.oai.harvester.impl.SimpleHarvester;
import eu.clarin.cmdi.oai.harvester.util.XMLStreamCopier;
---|
23 | |
---|
24 | |
---|
25 | public class HarvesterTest { |
---|
26 | private static XMLOutputFactory factory = |
---|
27 | XMLOutputFactory.newInstance(); |
---|
28 | |
---|
29 | private class MyHarvestHandler extends HarvestHandlerAdapter { |
---|
30 | private ZipOutputStream output; |
---|
31 | private int reqNum; |
---|
32 | private String prefix; |
---|
33 | |
---|
34 | public MyHarvestHandler(File file) { |
---|
35 | try { |
---|
36 | FileOutputStream fos = new FileOutputStream(file); |
---|
37 | output = new ZipOutputStream(fos); |
---|
38 | output.setMethod(ZipOutputStream.DEFLATED); |
---|
39 | } catch (Exception e) { |
---|
40 | throw new RuntimeException("xxx", e); |
---|
41 | } |
---|
42 | } |
---|
43 | |
---|
44 | public void close() { |
---|
45 | try { |
---|
46 | if (output != null) { |
---|
47 | output.flush(); |
---|
48 | output.close(); |
---|
49 | output = null; |
---|
50 | } |
---|
51 | } catch (ZipException e) { |
---|
52 | /* IGNORE */ |
---|
53 | } catch (Exception e) { |
---|
54 | throw new RuntimeException("xxx", e); |
---|
55 | } |
---|
56 | } |
---|
57 | |
---|
58 | @Override |
---|
59 | public void onIdentify(Repository repository) { |
---|
60 | System.err.println("onIdentify"); |
---|
61 | System.err.println(" repositoryName: " + |
---|
62 | repository.getRepositoryName()); |
---|
63 | System.err.println(" baseURL: " + |
---|
64 | repository.getBaseURL()); |
---|
65 | System.err.println(" protocolVersion: " + |
---|
66 | repository.getProtocolVersion()); |
---|
67 | for (String adminEmail : repository.getAdminEmail()) { |
---|
68 | System.err.println(" adminEmail: " + adminEmail); |
---|
69 | } |
---|
70 | System.err.println(" earliestTimestamp: " + |
---|
71 | repository.getEarliestTimestamp()); |
---|
72 | System.err.println(" deletedPolicy: " + |
---|
73 | repository.getDeletedNotion()); |
---|
74 | System.err.println(" granularity: " + |
---|
75 | repository.getGranularity()); |
---|
76 | StringBuilder sb = new StringBuilder(); |
---|
77 | final int mask = repository.getCompressionMask(); |
---|
78 | if (mask != 0) { |
---|
79 | if ((mask & Repository.COMPRESSION_METHOD_DEFLATE) > 0) { |
---|
80 | sb.append("DEFLATE"); |
---|
81 | } |
---|
82 | if ((mask & Repository.COMPRESSION_METHOD_GZIP) > 0) { |
---|
83 | if (sb.length() > 0) { |
---|
84 | sb.append(", "); |
---|
85 | } |
---|
86 | sb.append("GZIP"); |
---|
87 | } |
---|
88 | } else { |
---|
89 | sb.append("N/A"); |
---|
90 | } |
---|
91 | System.err.println(" compression: " + sb.toString()); |
---|
92 | } |
---|
93 | |
---|
94 | @Override |
---|
95 | public void onListMetadataFormats(List<MetadataFormat> metadataFormats) { |
---|
96 | System.err.println("onListMetadataFormats"); |
---|
97 | for (MetadataFormat format : metadataFormats) { |
---|
98 | System.err.println("==> MetadataFormat"); |
---|
99 | System.err.println(" Prefix: " + format.getPrefix()); |
---|
100 | System.err.println(" Schema: " + format.getSchema()); |
---|
101 | System.err.println(" Namespace: " +format.getNamespace()); |
---|
102 | } |
---|
103 | } |
---|
104 | |
---|
105 | @Override |
---|
106 | public void onStartListingRecords(String prefix) { |
---|
107 | System.err.println("onStartListingRecords: " + prefix); |
---|
108 | this.prefix = prefix; |
---|
109 | } |
---|
110 | |
---|
111 | @Override |
---|
112 | public void onFinishListingRecords(String prefix, Date latestDatestamp) { |
---|
113 | System.err.println("onFinishListingRecords: " + prefix + |
---|
114 | ", latest = " + latestDatestamp); |
---|
115 | this.prefix = null; |
---|
116 | } |
---|
117 | |
---|
118 | |
---|
119 | @Override |
---|
120 | public void onRecordMetadata(Header header, XMLStreamReader reader) { |
---|
121 | System.err.println("==> Record"); |
---|
122 | System.err.println(" Identifier: " + header.getIdentifier()); |
---|
123 | System.err.println(" Datestamp: " + header.getDatestamp()); |
---|
124 | if (header.getSets() != null) { |
---|
125 | System.err.println(" Sets: " + header.getSets()); |
---|
126 | } |
---|
127 | String filename = header.getIdentifier(); |
---|
128 | filename = replaceBadChars(filename) + ".xml"; |
---|
129 | if (prefix != null) { |
---|
130 | filename = replaceBadChars(prefix) + "/" + filename; |
---|
131 | } |
---|
132 | try { |
---|
133 | ZipEntry entry = new ZipEntry(filename); |
---|
134 | output.putNextEntry(entry); |
---|
135 | XMLStreamWriter writer = |
---|
136 | factory.createXMLStreamWriter(output); |
---|
137 | XMLStreamCopier.copy(reader, writer); |
---|
138 | writer.flush(); |
---|
139 | writer.close(); |
---|
140 | output.closeEntry(); |
---|
141 | output.flush(); |
---|
142 | reader.close(); |
---|
143 | } catch (Exception e) { |
---|
144 | e.printStackTrace(); |
---|
145 | throw new RuntimeException("record failed"); |
---|
146 | } |
---|
147 | } |
---|
148 | |
---|
149 | @Override |
---|
150 | public InputStream wrap(InputStream stream) throws IOException { |
---|
151 | final int size = 16*1024*1024; |
---|
152 | BufferedInputStream in = |
---|
153 | new BufferedInputStream(stream, size); |
---|
154 | in.mark(size); |
---|
155 | String x = Integer.toHexString(reqNum++); |
---|
156 | while (x.length() < 4) { |
---|
157 | x = "0" + x; |
---|
158 | } |
---|
159 | final String filename = "request/0x" + x + ".xml"; |
---|
160 | System.err.println("FN: " + filename); |
---|
161 | ZipEntry entry = new ZipEntry(filename); |
---|
162 | output.putNextEntry(entry); |
---|
163 | byte[] buffer = new byte[8192]; |
---|
164 | int r = -1; |
---|
165 | do { |
---|
166 | r = in.read(buffer, 0, buffer.length - 1); |
---|
167 | if (r > 0) { |
---|
168 | output.write(buffer, 0, r); |
---|
169 | } |
---|
170 | } while (r > 0); |
---|
171 | output.closeEntry(); |
---|
172 | output.flush(); |
---|
173 | in.reset(); |
---|
174 | return in; |
---|
175 | } |
---|
176 | |
---|
177 | private String replaceBadChars(String s) { |
---|
178 | return s.replace(':', '_').replace('.', '_').replace('\\', '_') |
---|
179 | .replace('/', '_'); |
---|
180 | } |
---|
181 | }; |
---|
182 | |
---|
183 | public void run(String[] args) { |
---|
184 | if (args.length < 2) { |
---|
185 | System.err.println("Usage: [baseURI] [outfile]"); |
---|
186 | System.exit(1); |
---|
187 | } |
---|
188 | final String repos = args[0]; |
---|
189 | MyHarvestHandler handler = new MyHarvestHandler(new File(args[1])); |
---|
190 | try { |
---|
191 | Harvester harvester = SimpleHarvester.newInstance(); |
---|
192 | HarvestJob job = harvester.createJob(repos, handler); |
---|
193 | // Calendar cal = Calendar.getInstance(TimeZone.getDefault()); |
---|
194 | // cal.clear(); |
---|
195 | // cal.set(2010, Calendar.DECEMBER, 15); |
---|
196 | // job.setFrom(cal.getTime()); |
---|
197 | job.setMetadataPrefixes(Arrays.asList("oai_dc")); |
---|
198 | job.run(); |
---|
199 | |
---|
200 | System.err.println("==> " + job.getState() + ": "+ |
---|
201 | job.getRecordCount() + " record(s) [resumed: " + |
---|
202 | job.getResumptionCount() + " time(s)] / " + |
---|
203 | job.getRequestCount() + " request(s) / " + |
---|
204 | job.getBytesTransferred() + " bytes transferred"); |
---|
205 | System.err.println(" " + |
---|
206 | (job.getTotalTime() / 1000f) + " total / " + |
---|
207 | (job.getNetworkTime() / 1000f) + " network / " + |
---|
208 | (job.getWaitTime() / 1000f) + " wait / " + |
---|
209 | (job.getProcessingTime() / 1000f) + " processing"); |
---|
210 | } catch (HarvesterProtocolErrorException e) { |
---|
211 | System.err.println("Protocol error: "); |
---|
212 | if (e.getErrors() != null) { |
---|
213 | for (ProtocolError error : e.getErrors()) { |
---|
214 | System.err.println(" " + error); |
---|
215 | } |
---|
216 | } |
---|
217 | } catch (Exception e) { |
---|
218 | e.printStackTrace(); |
---|
219 | } finally { |
---|
220 | handler.close(); |
---|
221 | } |
---|
222 | } |
---|
223 | |
---|
224 | public static void main(String[] args) { |
---|
225 | HarvesterTest main = new HarvesterTest(); |
---|
226 | main.run(args); |
---|
227 | } |
---|
228 | |
---|
229 | @SuppressWarnings("unused") |
---|
230 | private static void dump(XMLStreamReader reader) { |
---|
231 | StringBuilder sb = new StringBuilder("Main: "); |
---|
232 | switch (reader.getEventType()) { |
---|
233 | case XMLStreamConstants.START_DOCUMENT: |
---|
234 | sb.append("START_DOC"); |
---|
235 | break; |
---|
236 | case XMLStreamConstants.END_DOCUMENT: |
---|
237 | sb.append("END_DOC"); |
---|
238 | break; |
---|
239 | case XMLStreamConstants.COMMENT: |
---|
240 | sb.append("COMMENT["); |
---|
241 | sb.append(reader.getTextCharacters(), reader.getTextStart(), |
---|
242 | reader.getTextLength()); |
---|
243 | sb.append("]"); |
---|
244 | break; |
---|
245 | case XMLStreamConstants.START_ELEMENT: |
---|
246 | sb.append("START["); |
---|
247 | sb.append(reader.getNamespaceURI()); |
---|
248 | sb.append(","); |
---|
249 | sb.append(reader.getLocalName()); |
---|
250 | sb.append("]"); |
---|
251 | break; |
---|
252 | case XMLStreamConstants.END_ELEMENT: |
---|
253 | sb.append("END["); |
---|
254 | sb.append(reader.getNamespaceURI()); |
---|
255 | sb.append(","); |
---|
256 | sb.append(reader.getLocalName()); |
---|
257 | sb.append("]"); |
---|
258 | break; |
---|
259 | case XMLStreamConstants.CHARACTERS: |
---|
260 | sb.append("CHARACTERS["); |
---|
261 | String text = reader.getText(); |
---|
262 | text = text.replace("\n", "\\n").replace("\r", "\\r") |
---|
263 | .replace("\t", "\\t"); |
---|
264 | sb.append(text); |
---|
265 | sb.append("]"); |
---|
266 | break; |
---|
267 | case XMLStreamConstants.CDATA: |
---|
268 | sb.append("CDATA["); |
---|
269 | sb.append(reader.getText()); |
---|
270 | sb.append("]"); |
---|
271 | break; |
---|
272 | default: |
---|
273 | sb.append(Integer.toString(reader.getEventType())); |
---|
274 | } |
---|
275 | System.err.println(sb.toString()); |
---|
276 | } |
---|
277 | |
---|
278 | } // class HarvesterTest |
---|