[6933] | 1 | /** |
---|
[7274] | 2 | * This software is copyright (c) 2012-2022 by |
---|
| 3 | * - Leibniz-Institut fuer Deutsche Sprache (http://www.ids-mannheim.de) |
---|
[6933] | 4 | * This is free software. You can redistribute it |
---|
| 5 | * and/or modify it under the terms described in |
---|
| 6 | * the GNU General Public License v3 of which you |
---|
| 7 | * should have received a copy. Otherwise you can download |
---|
| 8 | * it from |
---|
| 9 | * |
---|
| 10 | * http://www.gnu.org/licenses/gpl-3.0.txt |
---|
| 11 | * |
---|
[7274] | 12 | * @copyright Leibniz-Institut fuer Deutsche Sprache (http://www.ids-mannheim.de) |
---|
[6933] | 13 | * |
---|
| 14 | * @license http://www.gnu.org/licenses/gpl-3.0.txt |
---|
| 15 | * GNU General Public License v3 |
---|
| 16 | */ |
---|
| 17 | package eu.clarin.sru.client.fcs; |
---|
| 18 | |
---|
| 19 | import java.net.URI; |
---|
| 20 | import java.net.URISyntaxException; |
---|
| 21 | import java.util.ArrayList; |
---|
| 22 | import java.util.HashMap; |
---|
| 23 | import java.util.List; |
---|
| 24 | import java.util.Map; |
---|
| 25 | |
---|
| 26 | import javax.xml.namespace.QName; |
---|
| 27 | import javax.xml.stream.XMLStreamException; |
---|
| 28 | import javax.xml.stream.XMLStreamReader; |
---|
| 29 | |
---|
| 30 | import org.slf4j.Logger; |
---|
| 31 | import org.slf4j.LoggerFactory; |
---|
| 32 | |
---|
| 33 | import eu.clarin.sru.client.SRUClientException; |
---|
| 34 | import eu.clarin.sru.client.XmlStreamReaderUtils; |
---|
| 35 | |
---|
| 36 | |
---|
| 37 | /** |
---|
| 38 | * An implementation of a Data View parser that parses Advanced Data Views. This |
---|
| 39 | * parser expects input that conforms to the CLARIN-FCS specification for the |
---|
| 40 | * Advanced Data View. |
---|
| 41 | * |
---|
| 42 | * @see DataViewAdvanced |
---|
| 43 | */ |
---|
| 44 | public final class DataViewParserAdvanced implements DataViewParser { |
---|
| 45 | private static final String FCS_ADV_NS = |
---|
| 46 | "http://clarin.eu/fcs/dataview/advanced"; |
---|
| 47 | private static final String UNIT_ITEM = "item"; |
---|
| 48 | private static final String UNIT_TIMESTAMP = "timestamp"; |
---|
| 49 | private static final Logger logger = |
---|
| 50 | LoggerFactory.getLogger(DataViewParserAdvanced.class); |
---|
| 51 | |
---|
| 52 | |
---|
| 53 | @Override |
---|
| 54 | public boolean acceptType(String type) { |
---|
| 55 | return DataViewAdvanced.TYPE.equals(type); |
---|
| 56 | } |
---|
| 57 | |
---|
| 58 | |
---|
| 59 | @Override |
---|
| 60 | public int getPriority() { |
---|
| 61 | return 1000; |
---|
| 62 | } |
---|
| 63 | |
---|
| 64 | |
---|
| 65 | @Override |
---|
| 66 | public DataView parse(XMLStreamReader reader, String type, String pid, |
---|
| 67 | String ref) throws XMLStreamException, SRUClientException { |
---|
| 68 | XmlStreamReaderUtils.readStart(reader, FCS_ADV_NS, "Advanced", true, true); |
---|
| 69 | final DataViewAdvanced.Unit unit = readUnit(reader); |
---|
| 70 | logger.debug("Advanced: unit={}", unit); |
---|
| 71 | reader.next(); // skip start tag |
---|
| 72 | |
---|
| 73 | // Segments |
---|
[7280] | 74 | final Map<String, DataViewAdvanced.Segment> segments = new HashMap<>(); |
---|
[6933] | 75 | XmlStreamReaderUtils.readStart(reader, FCS_ADV_NS, "Segments", true); |
---|
| 76 | while (XmlStreamReaderUtils.readStart(reader, FCS_ADV_NS, "Segment", |
---|
| 77 | segments.isEmpty(), true)) { |
---|
| 78 | final String id = |
---|
| 79 | XmlStreamReaderUtils.readAttributeValue(reader, null, "id"); |
---|
| 80 | final long start = readOffset(reader, "start", unit); |
---|
| 81 | final long end = readOffset(reader, "end", unit); |
---|
| 82 | final URI reference = readAttributeURI(reader, null, "ref", false); |
---|
| 83 | if (start > end) { |
---|
| 84 | throw new SRUClientException("invalid offsets: start > end"); |
---|
| 85 | } |
---|
| 86 | reader.next(); // skip start element |
---|
| 87 | XmlStreamReaderUtils.readEnd(reader, FCS_ADV_NS, "Segment"); |
---|
| 88 | |
---|
| 89 | logger.debug("segment: id={}, start={}, end={}, ref={}", |
---|
| 90 | id, start, end, reference); |
---|
| 91 | DataViewAdvanced.Segment segment = |
---|
| 92 | new DataViewAdvanced.Segment(id, start, end, reference); |
---|
| 93 | segments.put(id, segment); |
---|
| 94 | } // while |
---|
| 95 | XmlStreamReaderUtils.readEnd(reader, FCS_ADV_NS, "Segments"); |
---|
| 96 | |
---|
| 97 | // Layers |
---|
| 98 | List<DataViewAdvanced.Layer> layers = |
---|
[7280] | 99 | new ArrayList<>(); |
---|
[6933] | 100 | |
---|
| 101 | XmlStreamReaderUtils.readStart(reader, FCS_ADV_NS, "Layers", true); |
---|
| 102 | while (XmlStreamReaderUtils.readStart(reader, FCS_ADV_NS, "Layer", |
---|
| 103 | layers.isEmpty(), true)) { |
---|
| 104 | String id = XmlStreamReaderUtils.readAttributeValue(reader, null, "id"); |
---|
| 105 | reader.next(); // skip start element |
---|
| 106 | logger.debug("layer: id={}", id); |
---|
[6938] | 107 | |
---|
[6933] | 108 | final List<DataViewAdvanced.Span> spans = |
---|
[7280] | 109 | new ArrayList<>(); |
---|
[6933] | 110 | while (XmlStreamReaderUtils.readStart(reader, FCS_ADV_NS, "Span", |
---|
| 111 | spans.isEmpty(), true)) { |
---|
| 112 | String segment_ref = XmlStreamReaderUtils.readAttributeValue(reader, null, "ref"); |
---|
| 113 | String highlight = XmlStreamReaderUtils.readAttributeValue(reader, null, "highlight", false); |
---|
| 114 | String altValue = XmlStreamReaderUtils.readAttributeValue(reader, null, "alt-value", false); |
---|
| 115 | reader.next(); // skip start element |
---|
| 116 | String content = XmlStreamReaderUtils.readString(reader, false); |
---|
| 117 | XmlStreamReaderUtils.readEnd(reader, FCS_ADV_NS, "Span"); |
---|
| 118 | |
---|
| 119 | logger.debug("span: ref={}, highlight={}, alt-value={}, content={}", |
---|
| 120 | segment_ref, highlight, altValue, content); |
---|
| 121 | DataViewAdvanced.Segment segment = segments.get(segment_ref); |
---|
| 122 | if (segment == null) { |
---|
| 123 | throw new XMLStreamException("No segment with id '" + |
---|
| 124 | segment_ref + "' found", reader.getLocation()); |
---|
| 125 | } |
---|
| 126 | DataViewAdvanced.Span span = |
---|
| 127 | new DataViewAdvanced.Span(segment, highlight, altValue, content); |
---|
| 128 | spans.add(span); |
---|
| 129 | } // while |
---|
| 130 | XmlStreamReaderUtils.readEnd(reader, FCS_ADV_NS, "Layer"); |
---|
[6938] | 131 | |
---|
[6933] | 132 | DataViewAdvanced.Layer layer = |
---|
| 133 | new DataViewAdvanced.Layer(id, spans); |
---|
| 134 | layers.add(layer); |
---|
| 135 | } // while |
---|
| 136 | XmlStreamReaderUtils.readEnd(reader, FCS_ADV_NS, "Layers"); |
---|
| 137 | |
---|
| 138 | XmlStreamReaderUtils.readEnd(reader, FCS_ADV_NS, "Advanced"); |
---|
| 139 | return new DataViewAdvanced(pid, ref, unit, layers); |
---|
| 140 | } |
---|
| 141 | |
---|
[6938] | 142 | |
---|
[6933] | 143 | private static final DataViewAdvanced.Unit readUnit(XMLStreamReader reader) |
---|
| 144 | throws XMLStreamException { |
---|
| 145 | |
---|
| 146 | final String s = XmlStreamReaderUtils.readAttributeValue(reader, null, |
---|
| 147 | "unit", true); |
---|
| 148 | if (UNIT_ITEM.equals(s)) { |
---|
| 149 | return DataViewAdvanced.Unit.ITEM; |
---|
| 150 | } else if (UNIT_TIMESTAMP.equals(s)) { |
---|
| 151 | return DataViewAdvanced.Unit.TIMESTAMP; |
---|
| 152 | } else { |
---|
| 153 | throw new XMLStreamException( |
---|
| 154 | "Attribute 'unit' may only have values '" + UNIT_ITEM + |
---|
| 155 | "' or '" + UNIT_TIMESTAMP + "'", |
---|
| 156 | reader.getLocation()); |
---|
| 157 | } |
---|
| 158 | } |
---|
[6938] | 159 | |
---|
| 160 | |
---|
[6933] | 161 | private static final URI readAttributeURI(XMLStreamReader reader, |
---|
| 162 | String namespaceURI, String localName, boolean required) |
---|
| 163 | throws XMLStreamException, SRUClientException { |
---|
| 164 | final String s = XmlStreamReaderUtils.readAttributeValue(reader, |
---|
| 165 | namespaceURI, localName, required); |
---|
| 166 | if (s != null) { |
---|
| 167 | try { |
---|
| 168 | return new URI(s); |
---|
| 169 | } catch (URISyntaxException e) { |
---|
| 170 | throw new XMLStreamException("malformed URI in attribute '" + |
---|
| 171 | new QName(namespaceURI, localName) + "'", |
---|
| 172 | reader.getLocation(), e); |
---|
| 173 | } |
---|
| 174 | } else { |
---|
| 175 | return null; |
---|
| 176 | } |
---|
| 177 | } |
---|
| 178 | |
---|
[6938] | 179 | |
---|
[6933] | 180 | private static final long readOffset(XMLStreamReader reader, |
---|
| 181 | String localName, DataViewAdvanced.Unit unit) |
---|
| 182 | throws XMLStreamException, SRUClientException { |
---|
| 183 | String s = XmlStreamReaderUtils.readAttributeValue(reader, |
---|
| 184 | null, localName, true); |
---|
| 185 | switch (unit) { |
---|
| 186 | case ITEM: |
---|
| 187 | try { |
---|
| 188 | long num = Long.parseLong(s); |
---|
| 189 | if (num < 0) { |
---|
| 190 | throw new XMLStreamException("offset is smaller than '0'", |
---|
| 191 | reader.getLocation()); |
---|
| 192 | } |
---|
| 193 | return num; |
---|
| 194 | } catch (NumberFormatException e) { |
---|
| 195 | throw new XMLStreamException( |
---|
| 196 | "invalid number in attribute '" + localName + "'", |
---|
| 197 | reader.getLocation(), e); |
---|
| 198 | } |
---|
| 199 | case TIMESTAMP: |
---|
| 200 | throw new SRUClientException("no support for 'timestamp' offsets, yet!"); |
---|
| 201 | default: |
---|
| 202 | throw new SRUClientException("internal error: invalid unit (" + |
---|
| 203 | unit + ")"); |
---|
| 204 | } |
---|
| 205 | } |
---|
| 206 | |
---|
| 207 | } // class DataViewParserAdvanced |
---|