Changeset 2394


Ignore:
Timestamp:
12/07/12 11:51:39 (11 years ago)
Author:
oschonef
Message:
  • re-work to support specific data view parsers
  • generic data view now returns a Document instead of a DocumentFragment
  • minor updates to license header

HEADS UP:

  • the following incompatible changes have been made:
    • class KWICDataView has been renamed to DataViewKWIC
    • class GenericDataView has been renamed to DataViewGenericDOM

Please update your code accordingly!

Location:
SRUClient/trunk/src
Files:
3 added
5 edited
2 moved

Legend:

Unmodified
Added
Removed
  • SRUClient/trunk/src/main/java/eu/clarin/sru/fcs/ClarinFCSRecordParser.java

    r2387 r2394  
    11/**
    2  * This software is copyright (c) 2011 by
     2 * This software is copyright (c) 2011-2012 by
    33 *  - Institut fuer Deutsche Sprache (http://www.ids-mannheim.de)
    44 * This is free software. You can redistribute it
     
    2020import java.util.List;
    2121
    22 import javax.xml.parsers.DocumentBuilder;
    23 import javax.xml.parsers.DocumentBuilderFactory;
    24 import javax.xml.parsers.ParserConfigurationException;
    2522import javax.xml.stream.XMLStreamException;
    2623import javax.xml.stream.XMLStreamReader;
    27 import javax.xml.transform.Transformer;
    28 import javax.xml.transform.TransformerConfigurationException;
    29 import javax.xml.transform.TransformerException;
    30 import javax.xml.transform.TransformerFactory;
    31 import javax.xml.transform.dom.DOMResult;
    32 import javax.xml.transform.stax.StAXSource;
    33 
    3424import org.slf4j.Logger;
    3525import org.slf4j.LoggerFactory;
    36 import org.w3c.dom.Document;
    37 import org.w3c.dom.DocumentFragment;
    38 import org.w3c.dom.NodeList;
    39 
    4026import eu.clarin.sru.client.SRUClientException;
    4127import eu.clarin.sru.client.SRURecordData;
     
    4531
    4632/**
    47  * A record parse to parse records conforming to CLARIN FCS specification. The
    48  * parser currently supports the KWIC view.
     33 * A record parse to parse records conforming to CLARIN FCS specification.
    4934 */
    5035public class ClarinFCSRecordParser implements SRURecordDataParser {
    51     private static class TransformHelper {
    52         private final DocumentBuilder builder;
    53         private final Transformer transformer;
    54         private Document document;
    55 
    56 
    57         private TransformHelper(DocumentBuilder builder,
    58                 Transformer transformer) {
    59             if (builder == null) {
    60                 throw new NullPointerException("builder == null");
    61             }
    62             this.builder = builder;
    63             if (transformer == null) {
    64                 throw new NullPointerException("transformer == null");
    65             }
    66             this.transformer = transformer;
    67         }
    68 
    69 
    70         private DocumentFragment transform(XMLStreamReader reader)
    71                 throws XMLStreamException, TransformerException {
    72             if (document == null) {
    73                 document = builder.newDocument();
    74             }
    75 
    76             // parse STAX to DOM fragment
    77             DocumentFragment fragment = document.createDocumentFragment();
    78             DOMResult result = new DOMResult(fragment);
    79             transformer.transform(new StAXSource(reader), result);
    80             return fragment;
    81         }
    82 
    83 
    84         private void reset() {
    85             builder.reset();
    86             transformer.reset();
    87             document = null;
    88         }
    89     } // private class TransformHelper
    9036    private static final Logger logger =
    9137            LoggerFactory.getLogger(ClarinFCSRecordParser.class);
    9238    private static final String FCS_NS =
    9339            ClarinFCSRecordData.RECORD_SCHEMA;
    94     private static final String FCS_KWIC_NS = "http://clarin.eu/fcs/1.0/kwic";
    95     private static final String DATAVIEW_KWIC_LEGACY_TYPE = "kwic";
    96     private final ThreadLocal<TransformHelper> transformHelper;
    97 
    98 
    99     public ClarinFCSRecordParser() {
    100         this(DocumentBuilderFactory.newInstance(),
    101                 TransformerFactory.newInstance());
    102     }
    103 
    104 
    105     public ClarinFCSRecordParser(final DocumentBuilderFactory builderFactory,
    106             final TransformerFactory transformerFactory) {
    107         if (builderFactory == null) {
    108             throw new NullPointerException("builderFactory == null");
    109         }
    110         if (transformerFactory == null) {
    111             throw new NullPointerException("transformerFactory == null");
    112         }
    113         this.transformHelper = new ThreadLocal<TransformHelper>() {
    114             @Override
    115             protected TransformHelper initialValue() {
    116                 try {
    117                     return new TransformHelper(builderFactory.newDocumentBuilder(),
    118                                    transformerFactory.newTransformer());
    119                 } catch (TransformerConfigurationException e) {
    120                     throw new InternalError("unexpected error creating new transformer");
    121                 } catch (ParserConfigurationException e) {
    122                     throw new InternalError("unexpected error creating new document builder");
    123                 }
    124             }
    125         };
    126     }
     40    // TODO: make this configurable
     41    private final DataViewParser[] parsers = new DataViewParser[] {
     42            new DataViewParserGenericDOM(),
     43            new DataViewParserKWIC()
     44    };
    12745
    12846
     
    13856        logger.debug("parsing CLARIN-FCS record");
    13957
    140         final TransformHelper helper = transformHelper.get();
    141         try {
    142             // Resource
    143             XmlStreamReaderUtils.readStart(reader, FCS_NS, "Resource", true, true);
    144             String pid = XmlStreamReaderUtils.readAttributeValue(reader, null, "pid");
    145             String ref = XmlStreamReaderUtils.readAttributeValue(reader, null, "ref");
    146             XmlStreamReaderUtils.consumeStart(reader);
     58        // Resource
     59        XmlStreamReaderUtils.readStart(reader, FCS_NS, "Resource", true, true);
     60        String pid = XmlStreamReaderUtils.readAttributeValue(reader, null, "pid");
     61        String ref = XmlStreamReaderUtils.readAttributeValue(reader, null, "ref");
     62        XmlStreamReaderUtils.consumeStart(reader);
    14763
    148             // Resource/Resource (optional)
    149             if (XmlStreamReaderUtils.readStart(reader, FCS_NS, "Resource", false)) {
    150                 logger.info("skipping nested <Resource> element");
    151                 XmlStreamReaderUtils.readEnd(reader, FCS_NS, "Resource", true);
    152             }
     64        // Resource/Resource (optional)
     65        if (XmlStreamReaderUtils.readStart(reader, FCS_NS, "Resource", false)) {
     66            logger.info("skipping nested <Resource> element");
     67            XmlStreamReaderUtils.readEnd(reader, FCS_NS, "Resource", true);
     68        }
    15369
    154             // Resource/DataView
    155             final List<DataView> dataviews = parseDataViews(reader, helper);
     70        // Resource/DataView
     71        final List<DataView> dataviews = parseDataViews(reader);
    15672
    157             // Resource/ResourceFragment
    158             final List<Resource.ResourceFragment> resourceFragments =
    159                     parseResourceFragments(reader, helper);
     73        // Resource/ResourceFragment
     74        final List<Resource.ResourceFragment> resourceFragments =
     75                parseResourceFragments(reader);
    16076
    161             XmlStreamReaderUtils.readEnd(reader, FCS_NS, "Resource", true);
     77        XmlStreamReaderUtils.readEnd(reader, FCS_NS, "Resource", true);
    16278
    163             return new ClarinFCSRecordData(pid, ref, dataviews,
    164                     resourceFragments);
    165         } finally {
    166             // make sure, we reset the helper
    167             helper.reset();
    168         }
     79        return new ClarinFCSRecordData(pid, ref, dataviews, resourceFragments);
    16980    }
    17081
    17182
    172     private static List<DataView> parseDataViews(XMLStreamReader reader,
    173             TransformHelper foo) throws XMLStreamException, SRUClientException {
     83    private List<DataView> parseDataViews(XMLStreamReader reader)
     84            throws XMLStreamException, SRUClientException {
    17485        List<DataView> dataviews = null;
    17586
     
    196107            XmlStreamReaderUtils.consumeWhitespace(reader);
    197108
    198             logger.debug("found DataView of type = {}", type);
     109            logger.debug("processing <DataView> of type = {}", type);
     110
     111            DataViewParser parser = null;
     112            for (int i = 0; i < parsers.length; i++) {
     113                if (parsers[i].acceptType(type) &&
     114                        ((parser == null) ||
     115                         (parser.getPriority() < parsers[i].getPriority()))) {
     116                    parser = parsers[i];
     117                }
     118            }
     119
    199120            DataView dataview = null;
    200             if (KWICDataView.MIMETYPE.equals(type) ||
    201                     DATAVIEW_KWIC_LEGACY_TYPE.equals(type)) {
    202                 logger.debug("parsing dataview using FCS-KWIC parser");
    203                 dataview = parseDataViewKWIC(reader, pid, ref);
     121            if (parser != null) {
     122                dataview = parser.parse(reader, type, pid, ref);
    204123            } else {
    205                 logger.debug("parsing dataview using generic parser");
    206                 dataview = parseDataViewGeneric(reader, foo, type, pid, ref);
     124                logger.warn("no parser found for <DataView> of type = {}", type);
    207125            }
    208126
     
    215133                dataviews.add(dataview);
    216134            } else {
    217                 logger.info("DataView of type = {} skipped", type);
     135                logger.warn("skipped <DataView> of type = {}", type);
    218136            }
    219137        } // while
     
    222140
    223141
    224     private static List<Resource.ResourceFragment> parseResourceFragments(
    225             XMLStreamReader reader, TransformHelper foo)
    226             throws XMLStreamException, SRUClientException {
     142    private List<Resource.ResourceFragment> parseResourceFragments(
     143            XMLStreamReader reader) throws XMLStreamException,
     144            SRUClientException {
    227145        List<Resource.ResourceFragment> resourceFragments = null;
    228146        while (XmlStreamReaderUtils.readStart(reader, FCS_NS, "ResourceFragment", false, true)) {
     
    231149            String ref = XmlStreamReaderUtils.readAttributeValue(reader, null, "ref");
    232150            XmlStreamReaderUtils.consumeStart(reader);
    233             final List<DataView> dataviews = parseDataViews(reader, foo);
     151            final List<DataView> dataviews = parseDataViews(reader);
    234152            XmlStreamReaderUtils.readEnd(reader, FCS_NS, "ResourceFragment", true);
    235153
     
    242160    }
    243161
    244 
    245     private static DataView parseDataViewGeneric(XMLStreamReader reader,
    246             TransformHelper helper, String type, String pid, String ref)
    247             throws XMLStreamException, SRUClientException {
    248         try {
    249             final DocumentFragment fragment = helper.transform(reader);
    250             final NodeList children = fragment.getChildNodes();
    251             if ((children != null) && (children.getLength() > 0)) {
    252                 return new GenericDataView(type, pid, ref, fragment);
    253             } else {
    254                 throw new SRUClientException("element <DataView> does not " +
    255                         "contain any nested elements");
    256             }
    257         } catch (TransformerException e) {
    258             throw new SRUClientException("error while parsing dataview", e);
    259         }
    260     }
    261 
    262 
    263     private static DataView parseDataViewKWIC(XMLStreamReader reader,
    264             String pid, String ref) throws XMLStreamException,
    265             SRUClientException {
    266         String left = null;
    267         String keyword = null;
    268         String right = null;
    269 
    270         XmlStreamReaderUtils.readStart(reader, FCS_KWIC_NS, "kwic", true);
    271         if (XmlStreamReaderUtils.readStart(reader, FCS_KWIC_NS, "c", false)) {
    272             left = XmlStreamReaderUtils.readString(reader, false);
    273             XmlStreamReaderUtils.readEnd(reader, FCS_KWIC_NS, "c");
    274         }
    275         keyword = XmlStreamReaderUtils.readContent(reader, FCS_KWIC_NS, "kw", true);
    276         if (XmlStreamReaderUtils.readStart(reader, FCS_KWIC_NS, "c", false)) {
    277             right = XmlStreamReaderUtils.readString(reader, false);
    278             XmlStreamReaderUtils.readEnd(reader, FCS_KWIC_NS, "c");
    279         }
    280         XmlStreamReaderUtils.readEnd(reader, FCS_KWIC_NS, "kwic");
    281 
    282         logger.debug("left='{}' keyword='{}', right='{}'", new Object[] {
    283                 left, keyword, right }
    284         );
    285         return new KWICDataView(pid, ref, left, keyword, right);
    286     }
    287 
    288 } // class ClarinFederatedContentSearchRecordParser
     162} // class ClarinFCSRecordParser
  • SRUClient/trunk/src/main/java/eu/clarin/sru/fcs/DataView.java

    r2306 r2394  
     1/**
     2 * This software is copyright (c) 2011-2012 by
     3 *  - Institut fuer Deutsche Sprache (http://www.ids-mannheim.de)
     4 * This is free software. You can redistribute it
     5 * and/or modify it under the terms described in
     6 * the GNU General Public License v3 of which you
     7 * should have received a copy. Otherwise you can download
     8 * it from
     9 *
     10 *   http://www.gnu.org/licenses/gpl-3.0.txt
     11 *
     12 * @copyright Institut fuer Deutsche Sprache (http://www.ids-mannheim.de)
     13 *
     14 * @license http://www.gnu.org/licenses/gpl-3.0.txt
     15 *  GNU General Public License v3
     16 */
    117package eu.clarin.sru.fcs;
    218
  • SRUClient/trunk/src/main/java/eu/clarin/sru/fcs/DataViewGenericDOM.java

    r2388 r2394  
     1/**
     2 * This software is copyright (c) 2011-2012 by
     3 *  - Institut fuer Deutsche Sprache (http://www.ids-mannheim.de)
     4 * This is free software. You can redistribute it
     5 * and/or modify it under the terms described in
     6 * the GNU General Public License v3 of which you
     7 * should have received a copy. Otherwise you can download
     8 * it from
     9 *
     10 *   http://www.gnu.org/licenses/gpl-3.0.txt
     11 *
     12 * @copyright Institut fuer Deutsche Sprache (http://www.ids-mannheim.de)
     13 *
     14 * @license http://www.gnu.org/licenses/gpl-3.0.txt
     15 *  GNU General Public License v3
     16 */
    117package eu.clarin.sru.fcs;
    218
    3 import org.w3c.dom.DocumentFragment;
     19import org.w3c.dom.Document;
    420
    521
    6 public class GenericDataView extends DataView {
    7     private final DocumentFragment fragment;
     22public class DataViewGenericDOM extends DataView {
     23    private final Document document;
    824
    925
    10     protected GenericDataView(String mimetype, String pid, String ref,
    11             DocumentFragment fragment) {
     26    protected DataViewGenericDOM(String mimetype, String pid, String ref,
     27            Document document) {
    1228        super(mimetype, pid, ref);
    13         this.fragment = fragment;
     29        this.document = document;
    1430    }
    1531
    1632
    17     public DocumentFragment getDocumentFragment() {
    18         return fragment;
     33    public Document getDocument() {
     34        return document;
    1935    }
    2036
  • SRUClient/trunk/src/main/java/eu/clarin/sru/fcs/DataViewKWIC.java

    r2388 r2394  
     1/**
     2 * This software is copyright (c) 2011-2012 by
     3 *  - Institut fuer Deutsche Sprache (http://www.ids-mannheim.de)
     4 * This is free software. You can redistribute it
     5 * and/or modify it under the terms described in
     6 * the GNU General Public License v3 of which you
     7 * should have received a copy. Otherwise you can download
     8 * it from
     9 *
     10 *   http://www.gnu.org/licenses/gpl-3.0.txt
     11 *
     12 * @copyright Institut fuer Deutsche Sprache (http://www.ids-mannheim.de)
     13 *
     14 * @license http://www.gnu.org/licenses/gpl-3.0.txt
     15 *  GNU General Public License v3
     16 */
    117package eu.clarin.sru.fcs;
    218
     
    420 * A CLARIN FCS KWIC DataView.
    521 */
    6 public final class KWICDataView extends DataView {
     22public final class DataViewKWIC extends DataView {
    723    /**
    824     * The MIME type for CLARIN FCS KWIC dataviews.
     
    1632    /**
    1733     * Constructor.
    18      * 
     34     *
    1935     * @param pid
    2036     *            a persistent identifier or <code>null</code>
     
    2844     *            the right KWIC context
    2945     */
    30     KWICDataView(String pid, String ref, String left, String keyword,
     46    DataViewKWIC(String pid, String ref, String left, String keyword,
    3147            String right) {
    3248        super(MIMETYPE, pid, ref);
  • SRUClient/trunk/src/main/java/eu/clarin/sru/fcs/Resource.java

    r2304 r2394  
     1/**
     2 * This software is copyright (c) 2011-2012 by
     3 *  - Institut fuer Deutsche Sprache (http://www.ids-mannheim.de)
     4 * This is free software. You can redistribute it
     5 * and/or modify it under the terms described in
     6 * the GNU General Public License v3 of which you
     7 * should have received a copy. Otherwise you can download
     8 * it from
     9 *
     10 *   http://www.gnu.org/licenses/gpl-3.0.txt
     11 *
     12 * @copyright Institut fuer Deutsche Sprache (http://www.ids-mannheim.de)
     13 *
     14 * @license http://www.gnu.org/licenses/gpl-3.0.txt
     15 *  GNU General Public License v3
     16 */
    117package eu.clarin.sru.fcs;
    218
     
    3046        /**
    3147         * Get the persistent identifier for this resource fragment.
    32          * 
     48         *
    3349         * @return a persistent identifier or <code>null</code> of this resource
    3450         *         fragment has none
     
    4157        /**
    4258         * Get the reference URI for this resource fragment.
    43          * 
     59         *
    4460         * @return a reference URI or <code>null</code> of this resource
    4561         *         fragment has none
     
    5369         * Convenience method to check if this resource fragment has any
    5470         * dataviews.
    55          * 
     71         *
    5672         * @return <code>true</code> if this resource fragment has dataviews,
    5773         *         <code>false</code> otherwise
     
    6480        /**
    6581         * Get the list of dataview objects for this this resource fragment.
    66          * 
     82         *
    6783         * @return a list of {@link DataView} objects or <code>null</code>,
    6884         *         or <code>null</code> if this resource fragment has
     
    99115    /**
    100116     * Get the persistent identifier for this resource.
    101      * 
     117     *
    102118     * @return a persistent identifier or <code>null</code> of this resource has
    103119     *         none
     
    110126    /**
    111127     * Get the reference URI for this resource.
    112      * 
     128     *
    113129     * @return a reference URI or <code>null</code> of this resource has
    114130     *         none
     
    121137    /**
    122138     * Convenience method to check if this resource has any dataviews.
    123      * 
     139     *
    124140     * @return <code>true</code> if this resource has dataviews,
    125141     *         <code>false</code> otherwise
     
    132148    /**
    133149     * Get the list of dataview objects for this this resource.
    134      * 
     150     *
    135151     * @return a list of {@link DataView} objects or <code>null</code>, or
    136152     *         <code>null</code> if this resource has none
     
    143159    /**
    144160     * Convenience method to check if this resource has any resource fragments.
    145      * 
     161     *
    146162     * @return <code>true</code> if this resource has resource fragments,
    147163     *         <code>false</code> otherwise
     
    154170    /**
    155171     * Get the list of resource fragment objects for this this resource.
    156      * 
     172     *
    157173     * @return a list of {@link ResourceFragment} objects or <code>null</code>,
    158174     *         or <code>null</code> if this resource has none
  • SRUClient/trunk/src/test/java/eu/clarin/sru/client/TestSimpleClient.java

    r2388 r2394  
    2424import eu.clarin.sru.fcs.ClarinFCSRecordData;
    2525import eu.clarin.sru.fcs.ClarinFCSRecordParser;
    26 import eu.clarin.sru.fcs.DataView;
    27 import eu.clarin.sru.fcs.KWICDataView;
    28 import eu.clarin.sru.fcs.Resource;
    2926
    3027
     
    112109                        ClarinFCSRecordData record =
    113110                                (ClarinFCSRecordData) data;
    114                         dumpResource(record.getResource());
     111                        TestUtils.dumpResource(record.getResource());
    115112                    }
    116113                }
     
    157154
    158155
    159     private static void dumpResource(Resource resource) {
    160         logger.info("CLARIN-FCS: pid={}, ref={}",
    161                 resource.getPid(), resource.getRef());
    162         if (resource.hasDataViews()) {
    163             dumpDataView("CLARIN-FCS: ", resource.getDataViews());
    164         }
    165         if (resource.hasResourceFragments()) {
    166             for (Resource.ResourceFragment fragment : resource.getResourceFragments()) {
    167                 logger.debug("CLARIN-FCS: ResourceFragment: pid={}, ref={}",
    168                         fragment.getPid(), fragment.getRef());
    169                 if (fragment.hasDataViews()) {
    170                     dumpDataView("CLARIN-FCS: ResourceFragment/", fragment.getDataViews());
    171                 }
    172             }
    173         }
    174     }
    175 
    176 
    177     private static void dumpDataView(String s, List<DataView> dataviews) {
    178         for (DataView dataview : dataviews) {
    179             logger.info("{}DataView: type={}, pid={}, ref={}",
    180                     new Object[] {
    181                         s,
    182                         dataview.getMimeType(),
    183                         dataview.getPid(),
    184                         dataview.getRef()
    185                     });
    186             if (dataview.isMimeType(KWICDataView.MIMETYPE)) {
    187                 final KWICDataView kw = (KWICDataView) dataview;
    188                 logger.info("{}DataView: {} / {} / {}",
    189                         new Object[] {
    190                             s,
    191                             kw.getLeft(),
    192                             kw.getKeyword(),
    193                             kw.getRight() });
    194             }
    195         }
    196     }
    197 
    198 
    199156    static {
    200157        org.apache.log4j.BasicConfigurator.configure(
  • SRUClient/trunk/src/test/java/eu/clarin/sru/client/TestUtils.java

    r2388 r2394  
     1/**
     2 * This software is copyright (c) 2011-2012 by
     3 *  - Institut fuer Deutsche Sprache (http://www.ids-mannheim.de)
     4 * This is free software. You can redistribute it
     5 * and/or modify it under the terms described in
     6 * the GNU General Public License v3 of which you
     7 * should have received a copy. Otherwise you can download
     8 * it from
     9 *
     10 *   http://www.gnu.org/licenses/gpl-3.0.txt
     11 *
     12 * @copyright Institut fuer Deutsche Sprache (http://www.ids-mannheim.de)
     13 *
     14 * @license http://www.gnu.org/licenses/gpl-3.0.txt
     15 *  GNU General Public License v3
     16 */
    117package eu.clarin.sru.client;
    218
     
    521import org.slf4j.Logger;
    622import org.slf4j.LoggerFactory;
     23import org.w3c.dom.Node;
    724
    825import eu.clarin.sru.fcs.ClarinFCSRecordData;
    926import eu.clarin.sru.fcs.DataView;
    10 import eu.clarin.sru.fcs.GenericDataView;
    11 import eu.clarin.sru.fcs.KWICDataView;
     27import eu.clarin.sru.fcs.DataViewGenericDOM;
     28import eu.clarin.sru.fcs.DataViewKWIC;
    1229import eu.clarin.sru.fcs.Resource;
    1330
     
    126143
    127144
    128     private static void dumpResource(Resource resource) {
     145    public static void dumpResource(Resource resource) {
    129146        logger.info("CLARIN-FCS: pid={}, ref={}",
    130147                resource.getPid(), resource.getRef());
     
    153170                        dataview.getRef()
    154171                    });
    155             if (dataview instanceof GenericDataView) {
    156                 final GenericDataView view = (GenericDataView) dataview;
    157                 logger.info("{}DataView: DocumentFragment with root element <{}>",
    158                             s, view.getDocumentFragment().getFirstChild().getNodeName());
    159             } else  if (dataview.isMimeType(KWICDataView.MIMETYPE)) {
    160                 final KWICDataView kw = (KWICDataView) dataview;
     172            if (dataview instanceof DataViewGenericDOM) {
     173                final DataViewGenericDOM view = (DataViewGenericDOM) dataview;
     174                final Node root = view.getDocument().getFirstChild();
     175                logger.info("{}DataView: root element <{}> / {}",
     176                        new Object[] {
     177                            s,
     178                            root.getNodeName(),
     179                            root.getOwnerDocument().hashCode() });
     180            } else  if (dataview.isMimeType(DataViewKWIC.MIMETYPE)) {
     181                final DataViewKWIC kw = (DataViewKWIC) dataview;
    161182                logger.info("{}DataView: {} / {} / {}",
    162183                        new Object[] {
     
    169190    }
    170191
    171 }
     192} // class TestUtils
Note: See TracChangeset for help on using the changeset viewer.