/**
 * This software is copyright (c) 2012-2022 by
 *  - Leibniz-Institut fuer Deutsche Sprache (http://www.ids-mannheim.de)
 * This is free software. You can redistribute it
 * and/or modify it under the terms described in
 * the GNU General Public License v3 of which you
 * should have received a copy. Otherwise you can download
 * it from
 *
 *   http://www.gnu.org/licenses/gpl-3.0.txt
 *
 * @copyright Leibniz-Institut fuer Deutsche Sprache (http://www.ids-mannheim.de)
 *
 * @license http://www.gnu.org/licenses/gpl-3.0.txt
 *  GNU General Public License v3
 */
17 | package eu.clarin.sru.client.fcs; |
---|
18 | |
---|
19 | import java.net.URI; |
---|
20 | import java.net.URISyntaxException; |
---|
21 | import java.util.ArrayList; |
---|
22 | import java.util.HashMap; |
---|
23 | import java.util.List; |
---|
24 | import java.util.Map; |
---|
25 | |
---|
26 | import javax.xml.namespace.QName; |
---|
27 | import javax.xml.stream.XMLStreamException; |
---|
28 | import javax.xml.stream.XMLStreamReader; |
---|
29 | |
---|
30 | import org.slf4j.Logger; |
---|
31 | import org.slf4j.LoggerFactory; |
---|
32 | |
---|
33 | import eu.clarin.sru.client.SRUClientException; |
---|
34 | import eu.clarin.sru.client.XmlStreamReaderUtils; |
---|
35 | |
---|
36 | |
---|
37 | /** |
---|
38 | * An implementation of a Data View parser that parses Advanced Data Views. This |
---|
39 | * parser expects input that conforms to the CLARIN-FCS specification for the |
---|
40 | * Advanced Data View. |
---|
41 | * |
---|
42 | * @see DataViewAdvanced |
---|
43 | */ |
---|
44 | public final class DataViewParserAdvanced implements DataViewParser { |
---|
45 | private static final String FCS_ADV_NS = |
---|
46 | "http://clarin.eu/fcs/dataview/advanced"; |
---|
47 | private static final String UNIT_ITEM = "item"; |
---|
48 | private static final String UNIT_TIMESTAMP = "timestamp"; |
---|
49 | private static final Logger logger = |
---|
50 | LoggerFactory.getLogger(DataViewParserAdvanced.class); |
---|
51 | |
---|
52 | |
---|
53 | @Override |
---|
54 | public boolean acceptType(String type) { |
---|
55 | return DataViewAdvanced.TYPE.equals(type); |
---|
56 | } |
---|
57 | |
---|
58 | |
---|
59 | @Override |
---|
60 | public int getPriority() { |
---|
61 | return 1000; |
---|
62 | } |
---|
63 | |
---|
64 | |
---|
65 | @Override |
---|
66 | public DataView parse(XMLStreamReader reader, String type, String pid, |
---|
67 | String ref) throws XMLStreamException, SRUClientException { |
---|
68 | XmlStreamReaderUtils.readStart(reader, FCS_ADV_NS, "Advanced", true, true); |
---|
69 | final DataViewAdvanced.Unit unit = readUnit(reader); |
---|
70 | logger.debug("Advanced: unit={}", unit); |
---|
71 | reader.next(); // skip start tag |
---|
72 | |
---|
73 | // Segments |
---|
74 | final Map<String, DataViewAdvanced.Segment> segments = new HashMap<>(); |
---|
75 | XmlStreamReaderUtils.readStart(reader, FCS_ADV_NS, "Segments", true); |
---|
76 | while (XmlStreamReaderUtils.readStart(reader, FCS_ADV_NS, "Segment", |
---|
77 | segments.isEmpty(), true)) { |
---|
78 | final String id = |
---|
79 | XmlStreamReaderUtils.readAttributeValue(reader, null, "id"); |
---|
80 | final long start = readOffset(reader, "start", unit); |
---|
81 | final long end = readOffset(reader, "end", unit); |
---|
82 | final URI reference = readAttributeURI(reader, null, "ref", false); |
---|
83 | if (start > end) { |
---|
84 | throw new SRUClientException("invalid offsets: start > end"); |
---|
85 | } |
---|
86 | reader.next(); // skip start element |
---|
87 | XmlStreamReaderUtils.readEnd(reader, FCS_ADV_NS, "Segment"); |
---|
88 | |
---|
89 | logger.debug("segment: id={}, start={}, end={}, ref={}", |
---|
90 | id, start, end, reference); |
---|
91 | DataViewAdvanced.Segment segment = |
---|
92 | new DataViewAdvanced.Segment(id, start, end, reference); |
---|
93 | segments.put(id, segment); |
---|
94 | } // while |
---|
95 | XmlStreamReaderUtils.readEnd(reader, FCS_ADV_NS, "Segments"); |
---|
96 | |
---|
97 | // Layers |
---|
98 | List<DataViewAdvanced.Layer> layers = |
---|
99 | new ArrayList<>(); |
---|
100 | |
---|
101 | XmlStreamReaderUtils.readStart(reader, FCS_ADV_NS, "Layers", true); |
---|
102 | while (XmlStreamReaderUtils.readStart(reader, FCS_ADV_NS, "Layer", |
---|
103 | layers.isEmpty(), true)) { |
---|
104 | String id = XmlStreamReaderUtils.readAttributeValue(reader, null, "id"); |
---|
105 | reader.next(); // skip start element |
---|
106 | logger.debug("layer: id={}", id); |
---|
107 | |
---|
108 | final List<DataViewAdvanced.Span> spans = |
---|
109 | new ArrayList<>(); |
---|
110 | while (XmlStreamReaderUtils.readStart(reader, FCS_ADV_NS, "Span", |
---|
111 | spans.isEmpty(), true)) { |
---|
112 | String segment_ref = XmlStreamReaderUtils.readAttributeValue(reader, null, "ref"); |
---|
113 | String highlight = XmlStreamReaderUtils.readAttributeValue(reader, null, "highlight", false); |
---|
114 | String altValue = XmlStreamReaderUtils.readAttributeValue(reader, null, "alt-value", false); |
---|
115 | reader.next(); // skip start element |
---|
116 | String content = XmlStreamReaderUtils.readString(reader, false); |
---|
117 | XmlStreamReaderUtils.readEnd(reader, FCS_ADV_NS, "Span"); |
---|
118 | |
---|
119 | logger.debug("span: ref={}, highlight={}, alt-value={}, content={}", |
---|
120 | segment_ref, highlight, altValue, content); |
---|
121 | DataViewAdvanced.Segment segment = segments.get(segment_ref); |
---|
122 | if (segment == null) { |
---|
123 | throw new XMLStreamException("No segment with id '" + |
---|
124 | segment_ref + "' found", reader.getLocation()); |
---|
125 | } |
---|
126 | DataViewAdvanced.Span span = |
---|
127 | new DataViewAdvanced.Span(segment, highlight, altValue, content); |
---|
128 | spans.add(span); |
---|
129 | } // while |
---|
130 | XmlStreamReaderUtils.readEnd(reader, FCS_ADV_NS, "Layer"); |
---|
131 | |
---|
132 | DataViewAdvanced.Layer layer = |
---|
133 | new DataViewAdvanced.Layer(id, spans); |
---|
134 | layers.add(layer); |
---|
135 | } // while |
---|
136 | XmlStreamReaderUtils.readEnd(reader, FCS_ADV_NS, "Layers"); |
---|
137 | |
---|
138 | XmlStreamReaderUtils.readEnd(reader, FCS_ADV_NS, "Advanced"); |
---|
139 | return new DataViewAdvanced(pid, ref, unit, layers); |
---|
140 | } |
---|
141 | |
---|
142 | |
---|
143 | private static final DataViewAdvanced.Unit readUnit(XMLStreamReader reader) |
---|
144 | throws XMLStreamException { |
---|
145 | |
---|
146 | final String s = XmlStreamReaderUtils.readAttributeValue(reader, null, |
---|
147 | "unit", true); |
---|
148 | if (UNIT_ITEM.equals(s)) { |
---|
149 | return DataViewAdvanced.Unit.ITEM; |
---|
150 | } else if (UNIT_TIMESTAMP.equals(s)) { |
---|
151 | return DataViewAdvanced.Unit.TIMESTAMP; |
---|
152 | } else { |
---|
153 | throw new XMLStreamException( |
---|
154 | "Attribute 'unit' may only have values '" + UNIT_ITEM + |
---|
155 | "' or '" + UNIT_TIMESTAMP + "'", |
---|
156 | reader.getLocation()); |
---|
157 | } |
---|
158 | } |
---|
159 | |
---|
160 | |
---|
161 | private static final URI readAttributeURI(XMLStreamReader reader, |
---|
162 | String namespaceURI, String localName, boolean required) |
---|
163 | throws XMLStreamException, SRUClientException { |
---|
164 | final String s = XmlStreamReaderUtils.readAttributeValue(reader, |
---|
165 | namespaceURI, localName, required); |
---|
166 | if (s != null) { |
---|
167 | try { |
---|
168 | return new URI(s); |
---|
169 | } catch (URISyntaxException e) { |
---|
170 | throw new XMLStreamException("malformed URI in attribute '" + |
---|
171 | new QName(namespaceURI, localName) + "'", |
---|
172 | reader.getLocation(), e); |
---|
173 | } |
---|
174 | } else { |
---|
175 | return null; |
---|
176 | } |
---|
177 | } |
---|
178 | |
---|
179 | |
---|
180 | private static final long readOffset(XMLStreamReader reader, |
---|
181 | String localName, DataViewAdvanced.Unit unit) |
---|
182 | throws XMLStreamException, SRUClientException { |
---|
183 | String s = XmlStreamReaderUtils.readAttributeValue(reader, |
---|
184 | null, localName, true); |
---|
185 | switch (unit) { |
---|
186 | case ITEM: |
---|
187 | try { |
---|
188 | long num = Long.parseLong(s); |
---|
189 | if (num < 0) { |
---|
190 | throw new XMLStreamException("offset is smaller than '0'", |
---|
191 | reader.getLocation()); |
---|
192 | } |
---|
193 | return num; |
---|
194 | } catch (NumberFormatException e) { |
---|
195 | throw new XMLStreamException( |
---|
196 | "invalid number in attribute '" + localName + "'", |
---|
197 | reader.getLocation(), e); |
---|
198 | } |
---|
199 | case TIMESTAMP: |
---|
200 | throw new SRUClientException("no support for 'timestamp' offsets, yet!"); |
---|
201 | default: |
---|
202 | throw new SRUClientException("internal error: invalid unit (" + |
---|
203 | unit + ")"); |
---|
204 | } |
---|
205 | } |
---|
206 | |
---|
207 | } // class DataViewParserAdvanced |
---|