Changeset 996


Ignore:
Timestamp:
12/15/10 12:48:10 (13 years ago)
Author:
patdui
Message:
  • Added a vtd-xml parser which is 10x faster (5 mins vs. 1 hour for 200,000 cmdi files) than the DOM parser. It still supports XPath to query the data out.
  • Made it possible to delete only part of the index: you can delete all data of a single origin instead of the whole index.
Location:
vlo/trunk/vlo_webapp
Files:
2 added
6 edited
1 moved

Legend:

Unmodified
Added
Removed
  • vlo/trunk/vlo_webapp/pom.xml

    r957 r996  
    7676
    7777    <dependency>
    78       <groupId>commons-digester</groupId>
    79       <artifactId>commons-digester</artifactId>
    80       <version>2.1</version>
     78         <groupId>com.ximpleware</groupId>
     79         <artifactId>vtd-xml</artifactId>
     80         <version>2.9</version>
    8181    </dependency>
    8282  </dependencies>
  • vlo/trunk/vlo_webapp/src/main/java/eu/clarin/cmdi/vlo/importer/CMDIDigester.java

    r995 r996  
    1414import javax.xml.xpath.XPathFactory;
    1515
    16 import org.slf4j.Logger;
    17 import org.slf4j.LoggerFactory;
    1816import org.w3c.dom.Document;
    1917import org.w3c.dom.Node;
     
    2220import org.xml.sax.SAXException;
    2321
    24 public class CMDIDigester {
    25     private final static Logger LOG = LoggerFactory.getLogger(CMDIDigester.class);
     22
     23/**
     24 * @deprecated
     25 * DOM parsing implementation; use {@link CMDIParserVTDXML} instead, it is much faster.
     26 * Keeping this for now just in case we run into issues with the vtd parsing.
     27 * patdui 15 December 2010
     28 */
     29public class CMDIDigester implements CMDIDataProcessor {
    2630    private final FacetMapping facetMapping;
    2731    private DocumentBuilder builder;
  • vlo/trunk/vlo_webapp/src/main/java/eu/clarin/cmdi/vlo/importer/DataRoot.java

    r894 r996  
    88    private String originName;
    99    private File rootFile;
     10    private boolean deleteFirst = false;
    1011
    1112    public void setFacetMapping(FacetMapping facetMapping) {
     
    3334    }
    3435
     36    public void setDeleteFirst(boolean deleteFirst) {
     37        this.deleteFirst = deleteFirst;
     38    }
     39
     40    public boolean isDeleteFirst() {
     41        return deleteFirst;
     42    }
     43
    3544}
  • vlo/trunk/vlo_webapp/src/main/java/eu/clarin/cmdi/vlo/importer/ImporterConfig.java

    r950 r996  
    55public class ImporterConfig {
    66
    7     private boolean deleteFirst = false;
     7    private boolean deleteAllFirst = false;
    88   
    99    private List<DataRoot> dataRoots;
     
    1717    }
    1818
    19     public void setDeleteFirst(boolean deleteFirst) {
    20         this.deleteFirst = deleteFirst;
     19    public void setDeleteAllFirst(boolean deleteAllFirst) {
     20        this.deleteAllFirst = deleteAllFirst;
    2121    }
    2222
    23     public boolean isDeleteFirst() {
    24         return deleteFirst;
     23    public boolean isDeleteAllFirst() {
     24        return deleteAllFirst;
    2525    }
    2626
  • vlo/trunk/vlo_webapp/src/main/java/eu/clarin/cmdi/vlo/importer/MetadataImporter.java

    r988 r996  
    99import java.util.Set;
    1010
    11 import javax.xml.xpath.XPathExpressionException;
    12 
    1311import org.apache.solr.client.solrj.SolrServerException;
    1412import org.apache.solr.client.solrj.impl.StreamingUpdateSolrServer;
     
    1816import org.springframework.beans.factory.BeanFactory;
    1917import org.springframework.context.support.ClassPathXmlApplicationContext;
    20 import org.xml.sax.SAXException;
    2118
    2219import eu.clarin.cmdi.vlo.Configuration;
     
    5249    }
    5350
    54     //TODO PD can have multiple origins
    5551    private void startImport() {
    5652        List<DataRoot> dataRoots = config.getDataRoots();
     
    6460        long start = System.currentTimeMillis();
    6561        try {
    66             if (config.isDeleteFirst()) {
     62            if (config.isDeleteAllFirst()) {
    6763                LOG.info("Deleting original data...");
    6864                solrServer.deleteByQuery("*:*");//Delete the whole solr db.
     
    7066            }
    7167            for (DataRoot dataRoot : dataRoots) {
    72                 LOG.info("Start of processing: "+dataRoot.getOriginName());
    73                 CMDIDigester digester = new CMDIDigester(dataRoot.getFacetMapping());
    74                 processCmdi(dataRoot.getRootFile(), dataRoot.getOriginName(), digester);
     68                LOG.info("Start of processing: " + dataRoot.getOriginName());
     69                if (dataRoot.isDeleteFirst()) {
     70                    LOG.info("Deleting data for origin: " + dataRoot.getOriginName());
     71                    solrServer.deleteByQuery(FacetConstants.FIELD_ORIGIN + ":" + dataRoot.getOriginName());
     72                    LOG.info("Deleting data for origin done.");
     73                }
     74                CMDIDataProcessor processor = new CMDIParserVTDXML(dataRoot.getFacetMapping());
     75                processCmdi(dataRoot.getRootFile(), dataRoot.getOriginName(), processor);
    7576                if (!docs.isEmpty()) {
    7677                    sendDocs();
    7778                }
    78                 LOG.info("End of processing: "+dataRoot.getOriginName());
     79                LOG.info("End of processing: " + dataRoot.getOriginName());
    7980            }
    8081        } catch (SolrServerException e) {
     
    9899    }
    99100
    100     private void processCmdi(File file, String origin, CMDIDigester digester) throws SolrServerException, IOException {
     101    private void processCmdi(File file, String origin, CMDIDataProcessor processor) throws SolrServerException, IOException {
    101102        nrOfFilesAnalyzed++;
    102103        CMDIData cmdiData = null;
    103104        try {
    104             cmdiData = digester.process(file);
    105         } catch (IOException e) {
    106             LOG.error("error in file: " + file + " Exception", e);
    107         } catch (SAXException e) {
    108             LOG.error("error in file: " + file + " Exception", e);
    109         } catch (XPathExpressionException e) {
     105            cmdiData = processor.process(file);
     106        } catch (Exception e) {
    110107            LOG.error("error in file: " + file + " Exception", e);
    111108        }
     
    119116                File resourceFile = new File(file.getParentFile(), cmdiResource);
    120117                if (resourceFile.exists()) {
    121                     processCmdi(resourceFile, origin, digester);
     118                    processCmdi(resourceFile, origin, processor);
    122119                } else {
    123120                    nrOfNonExistendResourceFiles++;
     
    146143
    147144    private void sendDocs() throws SolrServerException, IOException {
    148         LOG.info("Sending "+docs.size()+" docs to solr server. Total number of docs updated till now: "+nrOFDocumentsUpdated);
     145        LOG.info("Sending " + docs.size() + " docs to solr server. Total number of docs updated till now: " + nrOFDocumentsUpdated);
    149146        nrOFDocumentsUpdated += docs.size();
    150147        solrServer.add(docs);
  • vlo/trunk/vlo_webapp/src/main/resources/importerConfig.xml

    r995 r996  
    44
    55  <bean id="importerConfig" class="eu.clarin.cmdi.vlo.importer.ImporterConfig">
    6     <!--    <property name="deleteFirst" value="true"/>-->
     6    <property name="deleteAllFirst" value="true"/>
    77    <property name="dataRoots">
    88      <list>
    99        <bean class="eu.clarin.cmdi.vlo.importer.DataRoot">
     10<!--          <property name="deleteFirst"  value="true"/>-->
    1011          <property name="originName" value="OLAC Metadata Providers" />
    1112          <property name="rootFile"
     
    1617          <property name="originName" value="Nijmegen corpora of casual speech" />
    1718          <property name="rootFile"
    18 
    1919            value="/Users/patdui/data/snapshots2/data/corpora/qfs1/media-archive/casual_speech/Corpusstructure/casual_speech.imdi.cmdi" />
    2020          <property name="facetMapping" ref="imdiMapping"></property>
  • vlo/trunk/vlo_webapp/src/test/java/eu/clarin/cmdi/vlo/importer/CMDIDataProcessorTest.java

    r995 r996  
    77import java.io.File;
    88import java.io.IOException;
     9import java.util.ArrayList;
     10import java.util.Collection;
     11import java.util.Collections;
    912import java.util.List;
    1013
     
    1720import org.springframework.context.support.ClassPathXmlApplicationContext;
    1821
    19 public class CMDIDigesterTest {
     22public class CMDIDataProcessorTest {
    2023
    2124    private static File testDir;
     25
     26    private CMDIDataProcessor getDataParser(FacetMapping map) {
     27        return new CMDIParserVTDXML(map);
     28        //        return new CMDIDigester(map);
     29    }
    2230
    2331    @Test
     
    6674        content += "</CMD>\n";
    6775        File cmdiFile = createCmdiFile("testCorpus", content);
    68         CMDIDigester digester = new CMDIDigester(getIMDIFacetMap());
    69         CMDIData data = digester.process(cmdiFile);
     76        CMDIDataProcessor processor = getDataParser(getIMDIFacetMap());
     77        CMDIData data = processor.process(cmdiFile);
    7078        assertEquals("test-hdl:1839/00-0000-0000-0000-0001-D", data.getId());
    7179        List<String> resources = data.getResources();
     
    362370        content += "</CMD>\n";
    363371        File cmdiFile = createCmdiFile("testSession", content);
    364         CMDIDigester digester = new CMDIDigester(getIMDIFacetMap());
    365         CMDIData data = digester.process(cmdiFile);
     372        CMDIDataProcessor processor = getDataParser(getIMDIFacetMap());
     373        CMDIData data = processor.process(cmdiFile);
    366374        assertEquals("test-hdl:1839/00-0000-0000-0009-294C-9", data.getId());
    367375        List<String> resources = data.getResources();
     
    452460        content += "</CMD>\n";
    453461        File cmdiFile = createCmdiFile("testSession", content);
    454         CMDIDigester digester = new CMDIDigester(getIMDIFacetMap());
    455         CMDIData data = digester.process(cmdiFile);
     462        CMDIDataProcessor processor = getDataParser(getIMDIFacetMap());
     463        CMDIData data = processor.process(cmdiFile);
    456464        assertEquals("test-hdl:1839/00-0000-0000-0009-294C-9", data.getId());
    457465        List<String> resources = data.getResources();
     
    505513        content += "         <description>The one-eyed grandmother is one of many traditional Kuna stories performed in the Kuna gathering house. This story, performed here by Pedro Arias, combines European derived motifs (Tom Thumb and Hansel and Gretel) with themes that seem more Kuna in origin. All are woven together and a moral is provided. Pedro Arias performed this story before a gathered audience in the morning..\n";
    506514        content += "      </description>\n";
     515        content += "         <description>Test</description>\n";
    507516        content += "         <identifier>http://uts.cc.utexas.edu/~ailla/audio/sherzer/one_eyed_grandmother.ram</identifier>\n";
    508517        content += "         <identifier>http://uts.cc.utexas.edu/~ailla/texts/sherzer/one_eyed_grandmother.pdf</identifier>\n";
     
    516525
    517526        File cmdiFile = createCmdiFile("testOlac", content);
    518         CMDIDigester digester = new CMDIDigester(getOlacFacetMap());
    519         CMDIData data = digester.process(cmdiFile);
     527        CMDIDataProcessor processor = getDataParser(getOlacFacetMap());
     528        CMDIData data = processor.process(cmdiFile);
    520529        assertEquals("oai:ailla.utexas.edu:1", data.getId());
    521530        List<String> resources = data.getResources();
     
    532541        assertEquals("transcription", doc.getFieldValue("genre"));
    533542        //  assertEquals("Kuna", doc.getFieldValue("subject"));
    534         assertEquals(2, doc.getFieldValues("description").size());
     543        Collection<Object> fieldValues = doc.getFieldValues("description");
     544        assertEquals(3, fieldValues.size());
     545        List<String> descriptions = new ArrayList(fieldValues);
     546        Collections.sort(descriptions);
     547        assertEquals("\n    Channel: Talking;\n    Genre: Traditional Narrative / Story;\n    Country: Panama;\n"
     548                + "    Place of Recording: Mulatuppu;\n    Event: Community Gathering;\n"
     549                + "    Institutional Affiliation: University of Texas at Austin;\n    Participant Information: Political Leader;\n"
     550                + "      ", descriptions.get(0).toString());
     551        assertEquals("Test", descriptions.get(1).toString());
     552        assertEquals("The one-eyed grandmother is one of many traditional Kuna stories performed "
     553                + "in the Kuna gathering house. This story, performed here by Pedro Arias, combines "
     554                + "European derived motifs (Tom Thumb and Hansel and Gretel) with themes that seem more "
     555                + "Kuna in origin. All are woven together and a moral is provided. Pedro Arias performed "
     556                + "this story before a gathered audience in the morning..\n      ", descriptions.get(2).toString());
    535557    }
    536558
     
    568590
    569591        File cmdiFile = createCmdiFile("testOlac", content);
    570         CMDIDigester digester = new CMDIDigester(getOlacFacetMap());
    571         CMDIData data = digester.process(cmdiFile);
     592        CMDIDataProcessor processor = getDataParser(getOlacFacetMap());
     593        CMDIData data = processor.process(cmdiFile);
    572594        assertEquals("collection_ATILF_Resources.cmdi", data.getId());
    573595        List<String> resources = data.getResources();
     
    616638
    617639        File cmdiFile = createCmdiFile("testOlac", content);
    618         CMDIDigester digester = new CMDIDigester(getLrtFacetMap());
    619         CMDIData data = digester.process(cmdiFile);
     640        CMDIDataProcessor processor = getDataParser(getLrtFacetMap());
     641        CMDIData data = processor.process(cmdiFile);
    620642        assertEquals("clarin.eu:lrt:433", data.getId());
    621643        List<String> resources = data.getResources();
Note: See TracChangeset for help on using the changeset viewer.