Changeset 5720 for SRUAggregator
- Timestamp:
- 10/16/14 07:56:31 (10 years ago)
- Location:
- SRUAggregator/trunk
- Files:
-
- 23 added
- 19 deleted
- 13 edited
- 2 moved
Legend:
- Unmodified
- Added
- Removed
-
SRUAggregator/trunk/nb-configuration.xml
r5291 r5720 14 14 Any value defined here will override the pom.xml file value but is only applicable to the current project. 15 15 --> 16 <org-netbeans-modules-maven-j2ee.netbeans_2e_hint_2e_deploy_2e_server>Tomcat</org-netbeans-modules-maven-j2ee.netbeans_2e_hint_2e_deploy_2e_server> 16 17 <org-netbeans-modules-maven-jaxws.rest_2e_config_2e_type>ide</org-netbeans-modules-maven-jaxws.rest_2e_config_2e_type> 17 18 </properties> -
SRUAggregator/trunk/pom.xml
r5291 r5720 1 1 <project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" 2 xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd"> 3 <modelVersion>4.0.0</modelVersion> 4 <groupId>eu.clarin.sru.fcs</groupId> 5 <artifactId>aggregator</artifactId> 6 <version>1.5-SNAPSHOT</version> 7 <properties> 8 <endorsed.dir>${project.build.directory}/endorsed</endorsed.dir> 9 <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding> 10 <slf4j.version>1.7.2</slf4j.version> 11 <httpClient.version>4.2.5</httpClient.version> 12 </properties> 13 <packaging>war</packaging> 14 <name>Aggregator</name> 15 <description>The Aggregator Project</description> 16 <organization> 17 <name>SfS Uni Tuebingen</name> 18 <url>http://www.sfs.uni-tuebingen.de/</url> 19 </organization> 20 <licenses> 21 <license> 22 <name>GNU GENERAL PUBLIC LICENSE, Version 3</name> 23 <url>http://www.gnu.org/licenses/gpl.html</url> 24 <distribution>repo</distribution> 25 </license> 26 </licenses> 27 <repositories> 28 <repository> 29 <id>clarin</id> 30 <url>http://catalog.clarin.eu/ds/nexus/content/repositories/Clarin/</url> 31 </repository> 32 <repository> 33 <id>sardine-google-svn-repo</id> 34 <snapshots> 35 <enabled>true</enabled> 36 </snapshots> 37 <name>Sardine maven repo at Google Code</name> 38 <url>http://sardine.googlecode.com/svn/maven/</url> 39 </repository> 40 </repositories> 41 <dependencies> 42 <dependency> 43 <groupId>org.zkoss.theme</groupId> 44 <artifactId>silvertail</artifactId> 45 <version>6.5.1</version> 46 </dependency> 47 <dependency> 48 <groupId>org.zkoss.zk</groupId> 49 <artifactId>zkplus</artifactId> 50 <version>6.5.1</version> 51 </dependency> 52 <dependency> 53 <groupId>org.zkoss.zk</groupId> 54 <artifactId>zhtml</artifactId> 55 <version>6.5.1</version> 56 </dependency> 57 <dependency> 58 <groupId>com.google.code.gson</groupId> 59 <artifactId>gson</artifactId> 60 <version>2.2.2</version> 61 
</dependency> 62 <dependency> 63 <groupId>eu.clarin.sru</groupId> 64 <artifactId>sru-client</artifactId> 65 <version>0.9.2</version> 66 </dependency> 67 <dependency> 68 <groupId>eu.clarin.weblicht</groupId> 69 <artifactId>wlfxb</artifactId> 70 <version>1.2.9</version> 71 </dependency> 72 <dependency> 73 <groupId>com.sun.jersey</groupId> 74 <artifactId>jersey-client</artifactId> 75 <version>1.17.1</version> 76 </dependency> 77 <dependency> 78 <groupId>com.sun.jersey</groupId> 79 <artifactId>jersey-servlet</artifactId> 80 <version>1.17.1</version> 81 </dependency> 82 <dependency> 83 <groupId>org.apache.opennlp</groupId> 84 <artifactId>opennlp-tools</artifactId> 85 <version>1.5.3</version> 86 </dependency> 87 <dependency> 88 <groupId>org.apache.poi</groupId> 89 <artifactId>poi-ooxml</artifactId> 90 <version>3.10-beta2</version> 91 </dependency> 92 <dependency> 93 <groupId>com.googlecode.sardine</groupId> 94 <artifactId>sardine</artifactId> 95 <version>314</version> 96 <type>jar</type> 97 <exclusions> 98 <exclusion> 99 <groupId>org.apache.httpcomponents</groupId> 100 <artifactId>httpcore</artifactId> 101 </exclusion> 102 <exclusion> 103 <artifactId>commons-codec</artifactId> 104 <groupId>commons-codec</groupId> 105 </exclusion> 106 </exclusions> 107 </dependency> 108 <dependency> 109 <groupId>org.slf4j</groupId> 110 <artifactId>slf4j-jdk14</artifactId> 111 </dependency> 112 113 <dependency> 114 <groupId>eu.clarin.weblicht</groupId> 115 <artifactId>connectors</artifactId> 116 <version>1.0.6</version> 117 </dependency> 118 <dependency> 119 <groupId>eu.clarin.weblicht</groupId> 120 <artifactId>bindings</artifactId> 121 <version>1.0.4</version> 122 </dependency> 123 <dependency> 124 <groupId>joda-time</groupId> 125 <artifactId>joda-time</artifactId> 126 <version>2.2</version> 127 </dependency> 128 129 <dependency> 130 <groupId>junit</groupId> 131 <artifactId>junit</artifactId> 132 <version>4.10</version> 133 <scope>test</scope> 134 <type>jar</type> 135 </dependency> 136 
137 <dependency> 138 <groupId>org.mockito</groupId> 139 <artifactId>mockito-all</artifactId> 140 <version>1.9.5</version> 141 <scope>test</scope> 142 </dependency> 143 <dependency> 144 <groupId>javax.servlet</groupId> 145 <artifactId>javax.servlet-api</artifactId> 146 <version>3.1.0</version> 147 <scope>provided</scope> 148 </dependency> 149 </dependencies> 150 151 <dependencyManagement> 152 <dependencies> 153 <dependency> 154 <groupId>org.slf4j</groupId> 155 <artifactId>slf4j-api</artifactId> 156 <version>${slf4j.version}</version> 157 </dependency> 158 <dependency> 159 <groupId>org.slf4j</groupId> 160 <artifactId>slf4j-jdk14</artifactId> 161 <version>${slf4j.version}</version> 162 </dependency> 163 <dependency> 164 <groupId>org.apache.httpcomponents</groupId> 165 <artifactId>httpclient</artifactId> 166 <version>${httpClient.version}</version> 167 </dependency> 168 </dependencies> 169 </dependencyManagement> 170 171 <build> 2 xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd"> 3 <modelVersion>4.0.0</modelVersion> 4 5 <groupId>eu.clarin.sru.fcs</groupId> 6 <artifactId>Aggregator2</artifactId> 7 <version>2.0.0-ALPHA</version> 8 <packaging>war</packaging> 9 10 <name>Aggregator2</name> 11 12 <properties> 13 <endorsed.dir>${project.build.directory}/endorsed</endorsed.dir> 14 <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding> 15 </properties> 16 17 <repositories> 18 <repository> 19 <id>clarin</id> 20 <url>http://catalog.clarin.eu/ds/nexus/content/repositories/Clarin/</url> 21 </repository> 22 <repository> 23 <id>sardine-google-svn-repo</id> 24 <snapshots> 25 <enabled>true</enabled> 26 </snapshots> 27 <name>Sardine maven repo at Google Code</name> 28 <url>http://sardine.googlecode.com/svn/maven/</url> 29 </repository> 30 </repositories> 31 32 33 <dependencies> 34 <dependency> 35 <groupId>eu.clarin.sru</groupId> 36 <artifactId>sru-client</artifactId> 37 <version>0.9.2</version> 38 </dependency> 39 40 41 
<dependency> 42 <groupId>eu.clarin.weblicht</groupId> 43 <artifactId>wlfxb</artifactId> 44 <version>1.2.9</version> 45 </dependency> 46 <dependency> 47 <groupId>eu.clarin.weblicht</groupId> 48 <artifactId>connectors</artifactId> 49 <version>1.0.6</version> 50 </dependency> 51 <dependency> 52 <groupId>eu.clarin.weblicht</groupId> 53 <artifactId>bindings</artifactId> 54 <version>1.0.4</version> 55 </dependency> 56 57 58 <dependency> 59 <groupId>org.apache.opennlp</groupId> 60 <artifactId>opennlp-tools</artifactId> 61 <version>1.5.3</version> 62 </dependency> 63 <dependency> 64 <groupId>com.googlecode.sardine</groupId> 65 <artifactId>sardine</artifactId> 66 <version>314</version> 67 <type>jar</type> 68 <exclusions> 69 <exclusion> 70 <groupId>org.apache.httpcomponents</groupId> 71 <artifactId>httpcore</artifactId> 72 </exclusion> 73 <exclusion> 74 <artifactId>commons-codec</artifactId> 75 <groupId>commons-codec</groupId> 76 </exclusion> 77 </exclusions> 78 </dependency> 79 <dependency> 80 <groupId>org.apache.poi</groupId> 81 <artifactId>poi-ooxml</artifactId> 82 <version>3.10-beta2</version> 83 </dependency> 84 <dependency> 85 <groupId>joda-time</groupId> 86 <artifactId>joda-time</artifactId> 87 <version>2.2</version> 88 </dependency> 89 <dependency> 90 <groupId>junit</groupId> 91 <artifactId>junit</artifactId> 92 <version>4.10</version> 93 <scope>test</scope> 94 <type>jar</type> 95 </dependency> 96 <dependency> 97 <groupId>javax</groupId> 98 <artifactId>javaee-web-api</artifactId> 99 <version>6.0</version> 100 <scope>provided</scope> 101 </dependency> 102 <dependency> 103 <groupId>com.sun.jersey</groupId> 104 <artifactId>jersey-server</artifactId> 105 <version>1.18.1</version> 106 </dependency> 107 <dependency> 108 <groupId>com.sun.jersey</groupId> 109 <artifactId>jersey-servlet</artifactId> 110 <version>1.18.1</version> 111 </dependency> 112 <dependency> 113 <groupId>com.sun.jersey</groupId> 114 <artifactId>jersey-client</artifactId> 115 <version>1.18.1</version> 116 
</dependency> 117 <dependency> 118 <groupId>com.sun.jersey</groupId> 119 <artifactId>jersey-core</artifactId> 120 <version>1.18.1</version> 121 </dependency> 122 <dependency> 123 <groupId>com.sun.jersey</groupId> 124 <artifactId>jersey-json</artifactId> 125 <version>1.18.1</version> 126 </dependency> 127 <dependency> 128 <groupId>com.sun.jersey.contribs</groupId> 129 <artifactId>jersey-multipart</artifactId> 130 <version>1.18.1</version> 131 </dependency> 132 133 </dependencies> 134 135 <build> 172 136 <finalName>${project.artifactId}</finalName> 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 137 <plugins> 138 <plugin> 139 <groupId>org.apache.maven.plugins</groupId> 140 <artifactId>maven-compiler-plugin</artifactId> 141 <version>2.3.2</version> 142 <configuration> 143 <source>1.6</source> 144 <target>1.6</target> 145 <compilerArguments> 146 <endorseddirs>${endorsed.dir}</endorseddirs> 147 </compilerArguments> 148 </configuration> 149 </plugin> 150 <plugin> 151 <groupId>org.apache.maven.plugins</groupId> 152 <artifactId>maven-war-plugin</artifactId> 153 <version>2.1.1</version> 154 <configuration> 155 <failOnMissingWebXml>false</failOnMissingWebXml> 156 </configuration> 157 </plugin> 158 <plugin> 159 <groupId>org.apache.maven.plugins</groupId> 160 <artifactId>maven-dependency-plugin</artifactId> 161 <version>2.1</version> 162 <executions> 163 <execution> 164 <phase>validate</phase> 165 <goals> 166 <goal>copy</goal> 167 </goals> 168 <configuration> 169 <outputDirectory>${endorsed.dir}</outputDirectory> 170 <silent>true</silent> 171 <artifactItems> 172 <artifactItem> 173 <groupId>javax</groupId> 174 <artifactId>javaee-endorsed-api</artifactId> 175 <version>6.0</version> 176 <type>jar</type> 177 </artifactItem> 178 </artifactItems> 179 </configuration> 180 </execution> 181 </executions> 182 </plugin> 183 </plugins> 184 
</build> 221 185 </project> -
SRUAggregator/trunk/src/main/java/eu/clarin/sru/fcs/aggregator/app/Aggregator.java
r5041 r5720 1 1 package eu.clarin.sru.fcs.aggregator.app; 2 2 3 import eu.clarin.sru.fcs.aggregator.search.Search; 4 import eu.clarin.sru.fcs.aggregator.cache.ScanCrawlTask; 5 import eu.clarin.sru.fcs.aggregator.cache.ScanCrawler; 6 import eu.clarin.sru.fcs.aggregator.cache.ScanCacheFile; 7 import eu.clarin.sru.fcs.aggregator.cache.SimpleInMemScanCache; 8 import eu.clarin.sru.client.SRUThreadedClient; 9 import eu.clarin.sru.client.SRUVersion; 10 import eu.clarin.sru.client.fcs.ClarinFCSRecordParser; 11 import eu.clarin.sru.fcs.aggregator.cache.EndpointUrlFilter; 12 import eu.clarin.sru.fcs.aggregator.registry.CenterRegistryLive; 13 import eu.clarin.sru.fcs.aggregator.cache.ScanCache; 14 import eu.clarin.sru.fcs.aggregator.registry.Corpus; 15 import java.io.File; 16 import java.io.IOException; 17 import java.io.InputStream; 18 import java.util.Collections; 19 import java.util.HashMap; 20 import java.util.List; 3 21 import java.util.Map; 4 import java.util.Set; 5 import java.util.logging.*; 6 import org.zkoss.zk.ui.Component; 7 import org.zkoss.zk.ui.Executions; 8 import org.zkoss.zk.ui.event.Event; 9 import org.zkoss.zk.ui.select.SelectorComposer; 10 import org.zkoss.zk.ui.select.annotation.Listen; 11 import org.zkoss.zk.ui.select.annotation.Wire; 12 import org.zkoss.zul.Label; 13 import org.zkoss.zul.Messagebox; 14 import org.zkoss.zul.Textbox; 15 import eu.clarin.sru.fcs.aggregator.sopt.Corpus; 16 import eu.clarin.sru.fcs.aggregator.sopt.Languages; 17 import eu.clarin.sru.fcs.aggregator.util.SRUCQL; 22 import java.util.concurrent.ExecutorService; 23 import java.util.concurrent.Executors; 24 import java.util.concurrent.ScheduledExecutorService; 25 import java.util.concurrent.TimeUnit; 26 import java.util.concurrent.atomic.AtomicReference; 27 import java.util.logging.Level; 28 import java.util.logging.Logger; 18 29 import javax.naming.InitialContext; 19 30 import javax.naming.NamingException; 20 import org.zkoss.zul.A; 21 import org.zkoss.zul.Div; 22 import 
org.zkoss.zul.Menubar; 23 import org.zkoss.zul.Menuitem; 24 import org.zkoss.zul.North; 25 import org.zkoss.zul.Popup; 26 import org.zkoss.zul.Progressmeter; 27 import org.zkoss.zul.South; 31 import javax.servlet.ServletContextEvent; 32 import javax.servlet.ServletContextListener; 33 import opennlp.tools.tokenize.TokenizerModel; 28 34 29 35 /** 30 * Main component of the Aggregator application intended to provide 31 * users access to CLARIN-FCS resources. 32 * 33 * The webapp base URL corresponds to the default behavior of displaying 34 * the main aggregator page, where the user can enter query, select the 35 * resources of CQL endpoints (as specified in the Clarin center registry), 36 * and search in these resources. The endpoints/resources selection is 37 * optional, by default all the endpoints root resources are selected. 38 * 39 * If invoked with 'x-aggregation-context' and 'query' parameter, 40 * the aggregator will pre-select provided resources and fill in the query field. 41 * This mechanism is currently used by VLO. 42 * Example: 43 * POST http://weblicht.sfs.uni-tuebingen.de/Aggregator HTTP/1.1 44 * operation = searchRetrieve & 45 * version = 1.2 & 46 * query = bellen & 47 * x-aggregation-context = {"http://fedora.clarin-d.uni-saarland.de/sru/":["hdl:11858/00-246C-0000-0008-5F2A-0"]} 48 * 49 * 50 * Additionally, if run with the a URL query string parameter 'mode', the 36 * Main component of the Aggregator application intended to provide users access 37 * to CLARIN-FCS resources. 38 * 39 * The webapp base URL corresponds to the default behavior of displaying the 40 * main aggregator page, where the user can enter query, select the resources of 41 * CQL endpoints (as specified in the Clarin center registry), and search in 42 * these resources. The endpoints/resources selection is optional, by default 43 * all the endpoints root resources are selected. 
44 * 45 * If invoked with 'x-aggregation-context' and 'query' parameter, the aggregator 46 * will pre-select provided resources and fill in the query field. This 47 * mechanism is currently used by VLO. Example: POST 48 * http://weblicht.sfs.uni-tuebingen.de/Aggregator HTTP/1.1 operation = 49 * searchRetrieve & version = 1.2 & query = bellen & x-aggregation-context = 50 * {"http://fedora.clarin-d.uni-saarland.de/sru/":["hdl:11858/00-246C-0000-0008-5F2A-0"]} 51 * 52 * 53 * Additionally, if run with the a URL query string parameter 'mode', the 51 54 * special behavior of the aggregator is triggered: 52 * 53 * /?mode=testing 54 * corresponds to the mode where the CQL endpoints are taken not from Clarin 55 * center repository, but from a hard-coded endpoints list; this functionality 56 * is useful for testing the development instances of endpoints, before they 57 * are moved to production. Was done to meet the request from MPI. 58 * 59 * /?mode=search 60 * corresponds to the mode where the aggregator page is requested with the 61 * already known query and (optionally) resources to search in, and if the 62 * immediate search is desired. In this case the aggregator search results 63 * page is displayed and search results of the provided query start to fill 64 * it in immediately (i.e. users don't need to click 'search' in the aggregator 65 * page). Was done to meet the request from CLARIN ERIC (Martin Wynne 66 * contacted us). 67 * 68 * /?mode=live 69 * corresponds to the mode where the information about corpora are taken not 70 * from the scan cache (crawled in advance), but loaded live, starting from 71 * the request to center registry and then performing scan operation requests on 72 * each CQL endpoint listed there. It takes time to get the corresponding 73 * responses from the endpoints, therefore the Aggregator page loads very slow 74 * in this mode. But this mode is useful for testing of the newly added or 75 * changed corpora without waiting for the next crawl. 
76 * 77 * 55 * 56 * /?mode=testing corresponds to the mode where the CQL endpoints are taken not 57 * from Clarin center repository, but from a hard-coded endpoints list; this 58 * functionality is useful for testing the development instances of endpoints, 59 * before they are moved to production. Was done to meet the request from MPI. 60 * 61 * /?mode=search corresponds to the mode where the aggregator page is requested 62 * with the already known query and (optionally) resources to search in, and if 63 * the immediate search is desired. In this case the aggregator search results 64 * page is displayed and search results of the provided query start to fill it 65 * in immediately (i.e. users don't need to click 'search' in the aggregator 66 * page). Was done to meet the request from CLARIN ERIC (Martin Wynne contacted 67 * us). 68 * 69 * /?mode=live corresponds to the mode where the information about corpora are 70 * taken not from the scan cache (crawled in advance), but loaded live, starting 71 * from the request to center registry and then performing scan operation 72 * requests on each CQL endpoint listed there. It takes time to get the 73 * corresponding responses from the endpoints, therefore the Aggregator page 74 * loads very slow in this mode. But this mode is useful for testing of the 75 * newly added or changed corpora without waiting for the next crawl. 76 * 77 * 78 * Adds Application initialization and clean up: only one SRU threaded client is 79 * used in the application, it has to be shut down when the application stops. 80 * One Languages object instance is used within the application. 
81 * 78 82 * @author Yana Panchenko 83 * @author edima 79 84 */ 80 public class Aggregator extends SelectorComposer<Component> { 81 82 private static final Logger LOGGER = Logger.getLogger(Aggregator.class.getName()); 83 @Wire 84 private Textbox searchString; 85 @Wire 86 private Popup wspaceSigninpop; 87 @Wire 88 private Textbox wspaceUserName; 89 @Wire 90 private Textbox wspaceUserPwd; 91 private int exportDataType = 1; 92 @Wire 93 private Div aboutDiv; 94 @Wire 95 private Label aboutLabel; 96 @Wire 97 private Div soDiv; 98 private SearchOptions searchOptionsComposer; 99 @Wire 100 private Label soLabel; 101 @Wire 102 private Div srDiv; 103 private SearchResults searchResultsComposer; 104 @Wire 105 private Label srLabel; 106 @Wire 107 private Div helpDiv; 108 @Wire 109 private Label helpLabel; 110 @Wire 111 private Progressmeter pMeter; 112 @Wire 113 private Menubar menubar; 114 @Wire 115 private North controls1; 116 @Wire 117 private South controls2; 118 @Wire 119 private A prevButton; 120 @Wire 121 private A nextButton; 122 @Wire 123 private Label tooltipPrevText; 124 @Wire 125 private Label tooltipNextText; 126 @Wire 127 private Menuitem weblichtTcf; 128 129 private int[] searchOffset = new int[]{1, 0}; // start and size 130 private ControlsVisibility controlsVisibility; 131 private PagesVisibility pagesVisibility; 132 133 private String weblichtUrl; // defined in web.xml 134 public static final String MODE_PARAM = "mode"; 135 public static final String MODE_PARAM_VALUE_TEST = "testing"; 136 public static final String MODE_PARAM_VALUE_SEARCH = "search"; 137 public static final String MODE_PARAM_VALUE_LIVE = "live"; 138 139 140 141 @Override 142 public void doAfterCompose(Component comp) throws Exception { 143 super.doAfterCompose(comp); 144 processContext(); 145 processParameters(); 146 searchOptionsComposer = (SearchOptions) soDiv.getChildren().get(0).getChildren().get(0).getAttribute("$" + SearchOptions.class.getSimpleName()); 147 
searchOptionsComposer.setAggregatorController(this); 148 searchResultsComposer = (SearchResults) srDiv.getChildren().get(0).getChildren().get(0).getAttribute("$" + SearchResults.class.getSimpleName()); 149 pagesVisibility = new PagesVisibility(aboutDiv, aboutLabel, soDiv, soLabel, srDiv, srLabel, helpDiv, helpLabel); 150 controlsVisibility = new ControlsVisibility(controls1, controls2, pMeter, menubar, prevButton, nextButton); 151 searchResultsComposer.setVisibilityControllers(pagesVisibility, controlsVisibility); 152 } 153 154 @Listen("onClick = #searchButton") 155 public void onExecuteSearch(Event ev) { 156 Map<String, Set<Corpus>> selectedCorpora = searchOptionsComposer.getSelectedCorpora(); 157 boolean emptyCorpora = true; 158 for (Set<Corpus> corpora : selectedCorpora.values()) { 159 if (!corpora.isEmpty()) { 160 emptyCorpora = false; 161 break; 162 } 163 } 164 if (emptyCorpora) { 165 Messagebox.show("No corpora is selected. To perform the search, please select corus/corpora of interest by checking the corpora checkboxes.", "FCS", 0, Messagebox.INFORMATION); 166 } else if (searchString.getText().isEmpty()) { 167 Messagebox.show("No query is specified. To perform the search, please enter a keyword of interest in the search input field, e.g. 
Elefant, and press the 'Search' button.", "FCS", 0, Messagebox.INFORMATION); 168 } else { 169 int maxRecords = searchOptionsComposer.getMaxRecords(); 170 String searchLang = searchOptionsComposer.getSearchLang(); 171 //searchOffset = new int[]{1, 0}; 172 searchOffset = new int[]{1, 0}; 173 searchOffset[0] = searchOffset[0] + searchOffset[1]; 174 searchOffset[1] = maxRecords; 175 searchResultsComposer.executeSearch(selectedCorpora, searchOffset[0], maxRecords, searchString.getText(), searchLang); 176 if (searchLang.equals(Languages.ANY_LANGUAGE_NAME)) { 177 this.weblichtTcf.setVisible(false); 178 } else { 179 this.weblichtTcf.setVisible(true); 180 } 181 onClickSearchResult(null); 182 } 183 } 184 185 @Listen("onOK = #searchString") 186 public void onEnterSearchString(Event ev) { 187 onExecuteSearch(ev); 188 } 189 190 @Listen("onClick=#clearResults") 191 public void onClearResults(Event ev) { 192 this.searchResultsComposer.clearResults(); 193 } 194 195 @Listen("onClick=#downloadCSV") 196 public void onExportResultsCSV(Event ev) { 197 searchResultsComposer.exportCSV(); 198 } 199 200 @Listen("onClick=#downloadTCF") 201 public void onExportResultsTCF(Event ev) { 202 searchResultsComposer.exportTCF(); 203 } 204 205 @Listen("onClick=#downloadText") 206 public void onExportResultsText(Event ev) { 207 searchResultsComposer.exportText(); 208 } 209 210 @Listen("onClick=#downloadExcel") 211 public void onExportResultsExcel(Event ev) { 212 searchResultsComposer.exportExcel(); 213 } 214 215 @Listen("onClick=#exportPWCSV") 216 public void onExportResultsPWCSV(Event ev) { 217 exportDataType = 1; 218 wspaceSigninpop.open(srDiv, "top_center"); 219 } 220 221 @Listen("onClick=#exportPWTCF") 222 public void onExportResultsPWTCF(Event ev) { 223 exportDataType = 0; 224 wspaceSigninpop.open(srDiv, "top_center"); 225 } 226 227 @Listen("onClick=#exportPWText") 228 public void onExportResultsPWText(Event ev) { 229 exportDataType = 2; 230 wspaceSigninpop.open(srDiv, "top_center"); 231 } 232 
233 @Listen("onClick=#exportPWExcel") 234 public void onExportResultsPWExcel(Event ev) { 235 exportDataType = 3; 236 wspaceSigninpop.open(srDiv, "top_center"); 237 } 238 239 @Listen("onClick=#weblichtText") 240 public void onUseWebLichtOnText(Event ev) { 241 String url = searchResultsComposer.useWebLichtOnText(); 242 if (url != null) { 243 Executions.getCurrent().sendRedirect(weblichtUrl 244 + url, "_blank"); 245 } 246 } 247 248 @Listen("onClick=#weblichtTcf") 249 public void onUseWebLichtOnTcf(Event ev) { 250 String url = searchResultsComposer.useWebLichtOnToks(); 251 if (url != null) { 252 Executions.getCurrent().sendRedirect(weblichtUrl 253 + url, "_blank"); 254 } 255 } 256 257 @Listen("onClick=#wspaceSigninBtn") 258 public void onSignInExportResults(Event ev) { 259 String user = wspaceUserName.getValue(); 260 String pswd = wspaceUserPwd.getValue(); 261 wspaceUserPwd.setValue(""); 262 if (user.isEmpty() || pswd.isEmpty()) { 263 Messagebox.show("Need user name and password!"); 264 } else { 265 wspaceSigninpop.close(); 266 if (exportDataType == 0) { 267 searchResultsComposer.exportPWTCF(user, pswd); 268 } else if (exportDataType == 1) { 269 searchResultsComposer.exportPWCSV(user, pswd); 270 } else if (exportDataType == 2) { 271 searchResultsComposer.exportPWText(user, pswd); 272 } else if (exportDataType == 3) { 273 searchResultsComposer.exportPWExcel(user, pswd); 274 } 275 } 276 } 277 278 @Listen("onOK=#wspaceUserPwd") 279 public void onSignInExportResultsPwdOK(Event ev) { 280 onSignInExportResults(ev); 281 } 282 283 @Listen("onClick=#wspaceCancelBtn") 284 public void onSignInPWCancel(Event ev) { 285 wspaceUserPwd.setValue(""); 286 wspaceSigninpop.close(); 287 } 288 289 @Listen("onClick = #helpLabel") 290 public void onClickHelp(Event ev) { 291 this.pagesVisibility.openHelp(); 292 this.controlsVisibility.disableControls1(); 293 this.controlsVisibility.disableControls2(); 294 } 295 296 @Listen("onClick = #aboutLabel") 297 public void onClickAbout(Event ev) { 298 
this.pagesVisibility.openAbout(); 299 this.controlsVisibility.disableControls1(); 300 this.controlsVisibility.disableControls2(); 301 } 302 303 @Listen("onClick = #soLabel") 304 public void onClickAdvSearch(Event ev) { 305 this.pagesVisibility.openSearchOptions(); 306 this.controlsVisibility.disableControls1(); 307 this.controlsVisibility.disableControls2(); 308 } 309 310 @Listen("onClick = #srLabel") 311 public void onClickSearchResult(Event ev) { 312 setupPrevNextSearchTooltips(); 313 this.pagesVisibility.openSearchResult(); 314 if (this.searchResultsComposer.hasSearchInProgress()) { 315 this.controlsVisibility.enableControls2(); 316 } 317 if (this.searchResultsComposer.hasResults()) { 318 this.controlsVisibility.enableControls1(); 319 this.controlsVisibility.enableControls2(); 320 } 321 322 } 323 324 @Listen("onClick = #prevButton") 325 public void onSearchPrev(Event ev) { 326 Map<String, Set<Corpus>> selectedCorpora = searchOptionsComposer.getSelectedCorpora(); 327 boolean emptyCorpora = true; 328 for (Set<Corpus> corpora : selectedCorpora.values()) { 329 if (!corpora.isEmpty()) { 330 emptyCorpora = false; 331 break; 332 } 333 } 334 if (emptyCorpora) { 335 Messagebox.show("No corpora is selected. To perform the search, please select corus/corpora of interest by checking the corpora checkboxes.", "FCS", 0, Messagebox.INFORMATION); 336 } else if (searchString.getText().isEmpty()) { 337 Messagebox.show("No query is specified. To perform the search, please enter a keyword of interest in the search input field, e.g. 
Elefant, and press the 'Search' button.", "FCS", 0, Messagebox.INFORMATION); 338 } else { 339 int maxRecords = searchOptionsComposer.getMaxRecords(); 340 String searchLang = searchOptionsComposer.getSearchLang(); 341 //searchOffset[0] = searchOffset[0] - searchOffset[1]; 342 searchOffset[0] = searchOffset[0] - maxRecords; 343 if (searchOffset[0] < 1) { 344 searchOffset[0] = 1; 345 } 346 searchOffset[1] = maxRecords; 347 searchResultsComposer.executeSearch(selectedCorpora, searchOffset[0], maxRecords, searchString.getText(), searchLang); 348 if (searchLang.equals(Languages.ANY_LANGUAGE_NAME)) { 349 this.weblichtTcf.setVisible(false); 350 } else { 351 this.weblichtTcf.setVisible(true); 352 } 353 onClickSearchResult(null); 354 } 355 } 356 357 358 @Listen("onClick = #nextButton") 359 public void onSearchNext(Event ev) { 360 Map<String, Set<Corpus>> selectedCorpora = searchOptionsComposer.getSelectedCorpora(); 361 boolean emptyCorpora = true; 362 for (Set<Corpus> corpora : selectedCorpora.values()) { 363 if (!corpora.isEmpty()) { 364 emptyCorpora = false; 365 break; 366 } 367 } 368 if (emptyCorpora) { 369 Messagebox.show("No corpora is selected. To perform the search, please select corus/corpora of interest by checking the corpora checkboxes.", "FCS", 0, Messagebox.INFORMATION); 370 } else if (searchString.getText().isEmpty()) { 371 Messagebox.show("No query is specified. To perform the search, please enter a keyword of interest in the search input field, e.g. 
Elefant, and press the 'Search' button.", "FCS", 0, Messagebox.INFORMATION); 372 } else { 373 int maxRecords = searchOptionsComposer.getMaxRecords(); 374 String searchLang = searchOptionsComposer.getSearchLang(); 375 searchOffset[0] = searchOffset[0] + searchOffset[1]; 376 searchOffset[1] = maxRecords; 377 searchResultsComposer.executeSearch(selectedCorpora, searchOffset[0], maxRecords, searchString.getText(), searchLang); 378 if (searchLang.equals(Languages.ANY_LANGUAGE_NAME)) { 379 this.weblichtTcf.setVisible(false); 380 } else { 381 this.weblichtTcf.setVisible(true); 382 } 383 onClickSearchResult(null); 384 } 385 } 386 387 private void processParameters() { 388 String[] paramValue; 389 String query = null; 390 paramValue = Executions.getCurrent().getParameterMap().get(SRUCQL.SEARCH_QUERY_PARAMETER); 391 if (paramValue != null) { 392 query = paramValue[0].trim(); 393 searchString.setValue(query); 394 } 395 LOGGER.log(Level.INFO, "Received parameter: query[{0}], ", query); 396 paramValue = Executions.getCurrent().getParameterMap().get(SRUCQL.OPERATION); 397 String operationString = null; 398 if (paramValue != null) { 399 operationString = paramValue[0].trim(); 400 if (!operationString.equals(SRUCQL.SEARCH_RETRIEVE)) { 401 Messagebox.show("Not supported operation " + operationString, "FCS", 0, Messagebox.INFORMATION); 402 } 403 } 404 LOGGER.log(Level.INFO, "Received parameter: operation[{0}], ", operationString); 405 } 406 407 private void setupPrevNextSearchTooltips() { 408 int startHit = searchOffset[0] - searchOptionsComposer.getMaxRecords(); 409 if (startHit < 1) { 410 startHit = 1; 411 } 412 int endHit = searchOffset[0] - 1; 413 tooltipPrevText.setValue("hits " + 414 startHit + "-" + endHit); 415 startHit = searchOffset[0] + searchOffset[1]; 416 endHit = startHit + searchOptionsComposer.getMaxRecords() - 1; 417 tooltipNextText.setValue("hits " + 418 startHit + "-" + endHit); 419 } 420 421 private void processContext() { 422 InitialContext context; 423 try { 
424 context = new InitialContext(); 425 weblichtUrl = (String) context.lookup("java:comp/env/weblicht-url"); 426 } catch (NamingException ex) { 427 LOGGER.log(Level.SEVERE, null, ex); 428 } 429 } 430 85 public class Aggregator implements ServletContextListener { 86 87 private static final Logger LOGGER = Logger.getLogger(Aggregator.class.getName()); 88 89 public static final int WAITING_TIME_FOR_SHUTDOWN_MS = 10000; 90 public static final String DE_TOK_MODEL = "/tokenizer/de-tuebadz-8.0-token.bin"; 91 private static final String DEFAULT_DATA_LOCATION = "/data"; 92 private static final String SCAN_DIR_NAME = "scan"; 93 94 private static final ScheduledExecutorService scheduler = Executors.newScheduledThreadPool(1); 95 private static Aggregator instance; 96 97 private AtomicReference<ScanCache> scanCacheAtom = new AtomicReference<ScanCache>(); 98 private TokenizerModel model; 99 private SRUThreadedClient sruClient = null; 100 private Map<Long, Search> activeSearches = Collections.synchronizedMap(new HashMap<Long, Search>()); 101 102 public static Aggregator getInstance() { 103 return instance; 104 } 105 106 public ScanCache getScanCache() { 107 return scanCacheAtom.get(); 108 } 109 110 @Override 111 public void contextInitialized(ServletContextEvent servletContextEvent) { 112 LOGGER.info("Aggregator is starting now."); 113 instance = this; 114 try { 115 sruClient = new SRUThreadedClient(); 116 sruClient.registerRecordParser(new ClarinFCSRecordParser()); 117 118 InitialContext context = new InitialContext(); 119 Integer cacheMaxDepth = (Integer) context.lookup("java:comp/env/scan-max-depth"); 120 EndpointUrlFilter filter //= null; 121 = new EndpointUrlFilter("uni-tuebingen.de", ".mpi.nl", "dspin.dwds.de", "lindat."); 122 ScanCrawler scanCrawler = new ScanCrawler(new CenterRegistryLive(), sruClient, filter, cacheMaxDepth); 123 124 ScanCacheFile scanCacheFile = new ScanCacheFile(getScanDirectory()); 125 LOGGER.info("Start cache read"); 126 try { 127 
scanCacheAtom.set(scanCacheFile.read()); 128 LOGGER.info("Finished cache read, number of root corpora: " + scanCacheAtom.get().getRootCorpora().size()); 129 } catch (Exception e) { 130 LOGGER.log(Level.SEVERE, "Error while reading the scan cache!", e); 131 scanCacheAtom.set(new SimpleInMemScanCache()); 132 } 133 134 String updateIntervalUnitString = (String) context.lookup("java:comp/env/update-interval-unit"); 135 TimeUnit cacheUpdateIntervalUnit = TimeUnit.valueOf(updateIntervalUnitString); 136 Integer cacheUpdateInterval = (Integer) context.lookup("java:comp/env/update-interval"); 137 scheduler.scheduleAtFixedRate( 138 new ScanCrawlTask(scanCrawler, scanCacheFile, scanCacheAtom), 139 0, cacheUpdateInterval, cacheUpdateIntervalUnit); 140 141 model = setUpTokenizers(); 142 LOGGER.info("Aggregator initialization finished."); 143 } catch (Exception ex) { 144 LOGGER.log(Level.SEVERE, null, ex); 145 instance = null; // force crash 146 } 147 } 148 149 @Override 150 public void contextDestroyed(ServletContextEvent sce) { 151 LOGGER.info("Aggregator is shutting down."); 152 for (Search search : activeSearches.values()) { 153 search.shutdown(); 154 } 155 shutdownAndAwaitTermination(sruClient, scheduler); 156 LOGGER.info("Aggregator shutdown complete."); 157 } 158 159 public static SRUVersion getSRUVersion(String sruversion) { 160 SRUVersion version = SRUVersion.VERSION_1_2; 161 if (sruversion.equals("1.2")) { 162 version = SRUVersion.VERSION_1_2; 163 } else if (sruversion.equals("1.1")) { 164 version = SRUVersion.VERSION_1_1; 165 } else { 166 return null; 167 } 168 return version; 169 } 170 171 // this function should be thread-safe 172 public Search startSearch(SRUVersion version, List<Corpus> corpora, String searchString, String searchLang, int maxRecords) throws Exception { 173 if (corpora.isEmpty()) { 174 // No corpora 175 return null; 176 } else if (searchString.isEmpty()) { 177 // No query 178 return null; 179 } else { 180 Search sr = new Search(sruClient, version, 
corpora, searchString, searchLang, 1, maxRecords); 181 activeSearches.put(sr.getId(), sr); 182 return sr; 183 } 184 } 185 186 public Search getSearchById(Long id) { 187 return activeSearches.get(id); 188 } 189 190 private static String getScanDirectory() throws NamingException { 191 InitialContext context = new InitialContext(); 192 String dataLocationPropertyName = (String) context.lookup("java:comp/env/data-location-property"); 193 String aggregatorDirName = (String) context.lookup("java:comp/env/aggregator-folder"); 194 // see if data location is set in properties 195 String dataLocation = System.getProperty(dataLocationPropertyName); 196 if (dataLocation == null || !(new File(dataLocation, aggregatorDirName).exists())) { 197 dataLocation = DEFAULT_DATA_LOCATION; 198 if (!(new File(dataLocation, aggregatorDirName).exists())) { 199 dataLocation = System.getProperty("user.home"); 200 } 201 if ((new File(dataLocation, aggregatorDirName).exists())) { 202 LOGGER.info(dataLocationPropertyName + " property is not defined, " 203 + "setting to default: " + dataLocation); 204 } else { 205 LOGGER.info(dataLocationPropertyName + " property is not defined, " 206 + "default location does not exist: " + dataLocation); 207 throw new RuntimeException("Data location not found"); 208 } 209 } 210 211 File aggregatorDir = new File(dataLocation, aggregatorDirName); 212 if (!aggregatorDir.exists()) { 213 LOGGER.severe("Aggregator directory does not exist: " 214 + aggregatorDir.getAbsolutePath()); 215 } 216 File scanDir = new File(aggregatorDir, SCAN_DIR_NAME); 217 if (!scanDir.exists()) { 218 if (!scanDir.mkdir()) { 219 LOGGER.severe("Scan directory does not exist and cannot be created: " 220 + aggregatorDir.getAbsolutePath()); 221 } 222 } 223 String scanPath = scanDir.getAbsolutePath(); 224 LOGGER.info("Scan data location: " + scanPath); 225 return scanPath; 226 } 227 228 private static void shutdownAndAwaitTermination(SRUThreadedClient sruClient, ExecutorService scheduler) { 229 try 
{ 230 sruClient.shutdown(); 231 scheduler.shutdown(); 232 Thread.sleep(WAITING_TIME_FOR_SHUTDOWN_MS); 233 sruClient.shutdownNow(); 234 scheduler.shutdownNow(); 235 Thread.sleep(WAITING_TIME_FOR_SHUTDOWN_MS); 236 } catch (InterruptedException ie) { 237 sruClient.shutdownNow(); 238 scheduler.shutdownNow(); 239 Thread.currentThread().interrupt(); 240 } 241 } 242 243 private static TokenizerModel setUpTokenizers() { 244 TokenizerModel model = null; 245 try { 246 InputStream tokenizerModelDeAsIS = Thread.currentThread().getContextClassLoader().getResourceAsStream(DE_TOK_MODEL); 247 model = new TokenizerModel(tokenizerModelDeAsIS); 248 tokenizerModelDeAsIS.close(); 249 } catch (IOException ex) { 250 LOGGER.log(Level.SEVERE, "Failed to load tokenizer model", ex); 251 } 252 return model; 253 } 431 254 } -
SRUAggregator/trunk/src/main/java/eu/clarin/sru/fcs/aggregator/cache/EndpointFilter.java
r5035 r5720 1 1 package eu.clarin.sru.fcs.aggregator.cache; 2 2 3 import eu.clarin.sru.fcs.aggregator. sopt.Endpoint;3 import eu.clarin.sru.fcs.aggregator.registry.Endpoint; 4 4 5 5 /** -
SRUAggregator/trunk/src/main/java/eu/clarin/sru/fcs/aggregator/cache/EndpointUrlFilter.java
r5035 r5720 1 1 package eu.clarin.sru.fcs.aggregator.cache; 2 2 3 import eu.clarin.sru.fcs.aggregator. sopt.Endpoint;3 import eu.clarin.sru.fcs.aggregator.registry.Endpoint; 4 4 import java.util.ArrayList; 5 import java.util.Collections; 5 6 import java.util.List; 6 7 7 8 /** 8 * Filters for the cache of scan data (endpoint/resources descriptions) based 9 * on endpoint url. Only endpoints containing one of the specified string in the9 * Filters for the cache of scan data (endpoint/resources descriptions) based on 10 * endpoint url. Only endpoints containing one of the specified string in the 10 11 * endpoint url will be cached. Useful for testing the endpoints. 11 * 12 * 12 13 * @author yanapanchenko 13 14 */ 14 15 public class EndpointUrlFilter implements EndpointFilter { 15 16 private String[] urlShouldContain = new String[0];17 18 public void urlShouldContainAnyOf(String ... urlSubstrings) {19 urlShouldContain = urlSubstrings;20 }21 16 22 @Override 23 public Iterable<Endpoint> filter(Iterable<Endpoint> endpoints) { 24 List<Endpoint> filtered = new ArrayList<Endpoint>(); 25 26 for (Endpoint endp : endpoints) { 27 for (String urlSubstring : urlShouldContain) { 28 if (endp.getUrl().contains(urlSubstring)) { 29 filtered.add(endp); 30 break; 31 } 32 } 33 } 34 35 return filtered; 36 } 37 17 private List<String> allow = new ArrayList<String>(); 18 19 public EndpointUrlFilter(String... fragments) { 20 Collections.addAll(allow, fragments); 21 } 22 23 @Override 24 public Iterable<Endpoint> filter(Iterable<Endpoint> endpoints) { 25 List<Endpoint> filtered = new ArrayList<Endpoint>(); 26 27 for (Endpoint endp : endpoints) { 28 for (String urlSubstring : allow) { 29 if (endp.getUrl().contains(urlSubstring)) { 30 filtered.add(endp); 31 break; 32 } 33 } 34 } 35 36 return filtered; 37 } 38 38 39 } -
SRUAggregator/trunk/src/main/java/eu/clarin/sru/fcs/aggregator/cache/ScanCache.java
r5035 r5720 1 1 package eu.clarin.sru.fcs.aggregator.cache; 2 2 3 import eu.clarin.sru.fcs.aggregator. sopt.Corpus;4 import eu.clarin.sru.fcs.aggregator. sopt.Institution;3 import eu.clarin.sru.fcs.aggregator.registry.Corpus; 4 import eu.clarin.sru.fcs.aggregator.registry.Institution; 5 5 import java.util.List; 6 6 import java.util.Map; -
SRUAggregator/trunk/src/main/java/eu/clarin/sru/fcs/aggregator/cache/ScanCacheFile.java
r5701 r5720 1 1 package eu.clarin.sru.fcs.aggregator.cache; 2 2 3 import eu.clarin.sru.fcs.aggregator. sopt.Corpus;4 import eu.clarin.sru.fcs.aggregator. sopt.Endpoint;5 import eu.clarin.sru.fcs.aggregator. sopt.Institution;3 import eu.clarin.sru.fcs.aggregator.registry.Corpus; 4 import eu.clarin.sru.fcs.aggregator.registry.Endpoint; 5 import eu.clarin.sru.fcs.aggregator.registry.Institution; 6 6 import java.io.BufferedOutputStream; 7 7 import java.io.BufferedReader; … … 37 37 * @author yanapanchenko 38 38 */ 39 public class ScanCacheFile d{39 public class ScanCacheFile { 40 40 41 41 private String scanDirectory; … … 48 48 public static final String NL = "\n"; 49 49 public static final String SPACE = " "; 50 private static final Logger LOGGER = Logger.getLogger(ScanCacheFile d.class.getName());50 private static final Logger LOGGER = Logger.getLogger(ScanCacheFile.class.getName()); 51 51 52 52 /** … … 56 56 * ScanCache data are/should be stored. 57 57 */ 58 public ScanCacheFile d(String scanDirectory) {58 public ScanCacheFile(String scanDirectory) { 59 59 this.scanDirectory = scanDirectory; 60 60 } -
SRUAggregator/trunk/src/main/java/eu/clarin/sru/fcs/aggregator/cache/ScanCrawlTask.java
r5035 r5720 1 1 package eu.clarin.sru.fcs.aggregator.cache; 2 2 3 import static eu.clarin.sru.fcs.aggregator.app.WebAppListener.CORPUS_CACHE;3 import java.util.concurrent.atomic.AtomicReference; 4 4 import java.util.logging.Level; 5 5 import java.util.logging.Logger; 6 import org.zkoss.zk.ui.WebApp;7 6 8 7 /** … … 19 18 20 19 private final ScanCrawler scanCrawler; 21 private ScanCacheFiledscanCacheFiled;22 private WebApp webapp;20 private ScanCacheFile scanCacheFiled; 21 private AtomicReference<ScanCache> scanCacheAtom; 23 22 24 public ScanCrawlTask( 25 ScanCrawler scanCrawler, ScanCacheFiled scanCacheFiled, WebApp webapp) { 23 public ScanCrawlTask(ScanCrawler scanCrawler, ScanCacheFile scanCacheFiled, AtomicReference<ScanCache> scanCacheAtom) { 26 24 this.scanCrawler = scanCrawler; 27 28 this.webapp = webapp;25 this.scanCacheFiled = scanCacheFiled; 26 this.scanCacheAtom = scanCacheAtom; 29 27 } 30 28 … … 42 40 } else { 43 41 logger.log(Level.INFO, "Started cache write into the file"); 44 45 webapp.setAttribute(CORPUS_CACHE,cacheNew);42 scanCacheFiled.write(cacheNew); 43 scanCacheAtom.set(cacheNew); 46 44 logger.log(Level.INFO, "Finished cache write into the file"); 47 45 } -
SRUAggregator/trunk/src/main/java/eu/clarin/sru/fcs/aggregator/cache/ScanCrawler.java
r5039 r5720 1 1 package eu.clarin.sru.fcs.aggregator.cache; 2 2 3 import eu.clarin.sru.client.SRUClientException;4 3 import eu.clarin.sru.client.SRUScanRequest; 5 4 import eu.clarin.sru.client.SRUScanResponse; 6 5 import eu.clarin.sru.client.SRUTerm; 7 6 import eu.clarin.sru.client.SRUThreadedClient; 8 import eu.clarin.sru.fcs.aggregator. sopt.CenterRegistryI;9 import eu.clarin.sru.fcs.aggregator. sopt.Corpus;10 import eu.clarin.sru.fcs.aggregator. sopt.Endpoint;11 import eu.clarin.sru.fcs.aggregator. sopt.Institution;7 import eu.clarin.sru.fcs.aggregator.registry.CenterRegistryI; 8 import eu.clarin.sru.fcs.aggregator.registry.Corpus; 9 import eu.clarin.sru.fcs.aggregator.registry.Endpoint; 10 import eu.clarin.sru.fcs.aggregator.registry.Institution; 12 11 import eu.clarin.sru.fcs.aggregator.util.SRUCQL; 13 12 import java.util.ArrayList; 14 13 import java.util.List; 15 import java.util.concurrent.ExecutionException;16 14 import java.util.concurrent.Future; 17 15 import java.util.concurrent.TimeUnit; … … 25 23 26 24 /** 27 * Crawler for collecting endpoint scan operation responses of FCS specification.28 * Collects all the endpoints and resources descriptions.29 * 25 * Crawler for collecting endpoint scan operation responses of FCS 26 * specification. Collects all the endpoints and resources descriptions. 
27 * 30 28 * @author yanapanchenko 31 29 */ 32 30 public class ScanCrawler { 33 31 34 private static final Logger LOGGER = Logger.getLogger(ScanCrawler.class.getName()); 35 private CenterRegistryI cr; 36 private SRUThreadedClient sruScanClient; 37 private int maxDepth = 1; 38 private EndpointFilter filter = null; 39 40 public ScanCrawler(CenterRegistryI centerRegistry, SRUThreadedClient sruScanClient) { 41 cr = centerRegistry; 42 this.sruScanClient = sruScanClient; 43 } 44 45 public ScanCrawler(CenterRegistryI centerRegistry, SRUThreadedClient sruScanClient, EndpointFilter filter, int maxDepth) { 46 this(centerRegistry, sruScanClient); 47 this.maxDepth = maxDepth; 48 this.filter = filter; 49 } 50 51 /** 52 * Crawler of scan operation of FCS specification. Collects all the endpoints 53 * and resources descriptions into the provided cache. 54 * 55 * @param cache cache into which the endpoints and resources descriptions 56 * from scan operation responses should be collected. 57 */ 58 public void crawl(ScanCache cache) { 59 60 //TODO remember not responding root corpora and come back to them later... ? 61 List<Institution> institutions = cr.getCQLInstitutions(); 62 //LOGGER.info(institutions.toString()); 63 for (Institution institution : institutions) { 64 cache.addInstitution(institution); 65 Iterable<Endpoint> endpoints = institution.getEndpoints(); 66 if (filter != null) { 67 endpoints = filter.filter(endpoints); 68 } 69 for (Endpoint endp : endpoints) { 70 Corpus parentCorpus = null;// i.e. 
it's root 71 addCorpora(sruScanClient, endp.getUrl(), institution, 0, parentCorpus, cache); 72 } 73 } 74 75 } 76 77 78 79 private void addCorpora(SRUThreadedClient sruScanClient, String endpointUrl, 80 Institution institution, int depth, Corpus parentCorpus, ScanCache cache) { 81 82 83 depth++; 84 if (depth > maxDepth) { 85 return; 86 } 87 88 89 List<Corpus> childrenCorpora = doScan(sruScanClient, endpointUrl, institution, parentCorpus); 90 91 //if (childrenCorpora.isEmpty()) { 92 for (Corpus c : childrenCorpora) { 93 // don't add corpus that introduces cyclic references 94 // as of March 2014, there are 2 such endpoints... 95 if (cache.getCorpus(c.getHandle())!= null) { 96 LOGGER.warning("Cyclic reference in corpus " + c.getHandle() + " of endpoint " + endpointUrl); 97 continue; 98 } 99 //Corpus c = createCorpus(institution, endpointUrl, term); 100 //Corpus c = new Corpus(institution, endpointUrl); 101 // c.setHandle(term.getValue()); 102 // c.setDisplayName(term.getDisplayTerm()); 103 // if (term.getNumberOfRecords() > 0) { 104 // c.setNumberOfRecords(term.getNumberOfRecords()); 105 // } 106 // addExtraInfo(c, term); 107 cache.addCorpus(c, parentCorpus); 108 // how not to query default corpus??? 109 addCorpora(sruScanClient, c.getEndpointUrl(), c.getInstitution(), 110 depth, c, cache); 111 } 112 //} else if () { 113 // TODO if diagnistics came back, try simple scan without the 114 // SRUCQLscan.RESOURCE_INFO_PARAMETER 115 // } 116 //else { 117 // if (parentCorpus == null) { // means root 118 // create default root corpus: 119 // Corpus c = new Corpus(institution, endpointUrl); 120 // cache.addCorpus(c); 121 // } 122 // } 123 124 125 // SRUScanResponse response = doScan(sruScanClient, endpointUrl, parentCorpus); 126 // 127 // 128 // if (response != null && response.hasTerms()) { 129 // for (SRUTerm term : response.getTerms()) { 130 // // don't add corpus that introduces cyclic references 131 // // as of March 2014, there are 2 such endpoints... 
132 // if (cache.getCorpus(term.getValue())!= null) { 133 // LOGGER.warning("Cyclic reference in corpus " + term.getValue() + " of endpoint " + endpointUrl); 134 // continue; 135 // } 136 // Corpus c = createCorpus(institution, endpointUrl, term); 137 // //Corpus c = new Corpus(institution, endpointUrl); 138 //// c.setHandle(term.getValue()); 139 //// c.setDisplayName(term.getDisplayTerm()); 140 //// if (term.getNumberOfRecords() > 0) { 141 //// c.setNumberOfRecords(term.getNumberOfRecords()); 142 //// } 143 //// addExtraInfo(c, term); 144 // cache.addCorpus(c, parentCorpus); 145 // addCorpora(sruScanClient, c.getEndpointUrl(), c.getInstitution(), 146 // depth, c, cache); 147 // } 148 // //} else if () { 149 // // TODO if diagnistics came back, try simple scan without the 150 // // SRUCQLscan.RESOURCE_INFO_PARAMETER 151 // } else { 152 // if (parentCorpus == null) { // means root 153 // // create default root corpus: 154 // Corpus c = new Corpus(institution, endpointUrl); 155 // cache.addCorpus(c); 156 // } 157 // } 158 159 } 160 161 private static String normalizeHandle(Corpus corpus, boolean root) { 162 if (root) { 163 return Corpus.ROOT_HANDLE; 164 } 165 String handle = corpus.getHandle(); 166 if (Corpus.HANDLE_WITH_SPECIAL_CHARS.matcher(handle).matches()) { 167 //resourceValue = "%22" + resourceValue + "%22"; 168 handle = "\"" + handle + "\""; 169 } 170 return handle; 171 } 172 173 // public static SRUScanResponse doScan(SRUThreadedClient sruScanClient, 174 // String endpointUrl, Corpus parentCorpus) { 175 // 176 // Future<SRUScanResponse> corporaResponse = null; 177 // SRUScanResponse response = null; 178 // try { 179 // SRUScanRequest corporaRequest = new SRUScanRequest(endpointUrl); 180 // StringBuilder scanClause = new StringBuilder(SRUCQL.SCAN_RESOURCE_PARAMETER); 181 // scanClause.append("="); 182 // String normalizedHandle = normalizeHandle(parentCorpus, parentCorpus == null); 183 // scanClause.append(normalizedHandle); 184 // 
corporaRequest.setScanClause(scanClause.toString()); 185 // corporaRequest.setExtraRequestData(SRUCQL.SCAN_RESOURCE_INFO_PARAMETER, 186 // SRUCQL.SCAN_RESOURCE_INFO_PARAMETER_DEFAULT_VALUE); 187 // corporaResponse = sruScanClient.scan(corporaRequest); 188 // Thread.sleep(5000); 189 // response = corporaResponse.get(600, TimeUnit.SECONDS); 190 // return response; 191 // } catch (TimeoutException ex) { 192 // LOGGER.log(Level.SEVERE, "Timeout scanning corpora {0} at {1} {2} {3}", 193 // new String[]{Corpus.ROOT_HANDLE, endpointUrl, ex.getClass().getName(), ex.getMessage()}); 194 // } catch (Exception ex) { 195 // LOGGER.log(Level.SEVERE, "Error accessing corpora {0} at {1} {2} {3}", 196 // new String[]{Corpus.ROOT_HANDLE, endpointUrl, ex.getClass().getName(), ex.getMessage()}); 197 // } finally { 198 // if (corporaResponse != null && !corporaResponse.isDone()) { 199 // corporaResponse.cancel(true); 200 // } 201 // } 202 // 203 // return response; 204 // } 205 206 207 public static List<Corpus> doScan(SRUThreadedClient sruScanClient, 208 String endpointUrl, Institution institution, Corpus parentCorpus) { 209 210 List<Corpus> corpora = new ArrayList<Corpus>(); 211 Future<SRUScanResponse> corporaResponse = null; 212 SRUScanResponse response = null; 213 try { 214 SRUScanRequest corporaRequest = new SRUScanRequest(endpointUrl); 215 StringBuilder scanClause = new StringBuilder(SRUCQL.SCAN_RESOURCE_PARAMETER); 216 scanClause.append("="); 217 String normalizedHandle = normalizeHandle(parentCorpus, parentCorpus == null); 218 scanClause.append(normalizedHandle); 219 corporaRequest.setScanClause(scanClause.toString()); 220 corporaRequest.setExtraRequestData(SRUCQL.SCAN_RESOURCE_INFO_PARAMETER, 221 SRUCQL.SCAN_RESOURCE_INFO_PARAMETER_DEFAULT_VALUE); 222 corporaResponse = sruScanClient.scan(corporaRequest); 223 Thread.sleep(5000); 224 response = corporaResponse.get(600, TimeUnit.SECONDS); 225 } catch (TimeoutException ex) { 226 LOGGER.log(Level.SEVERE, "Timeout scanning corpora 
{0} at {1} {2} {3}", 227 new String[]{Corpus.ROOT_HANDLE, endpointUrl, ex.getClass().getName(), ex.getMessage()}); 228 } catch (Exception ex) { 229 LOGGER.log(Level.SEVERE, "Error accessing corpora {0} at {1} {2} {3}", 230 new String[]{Corpus.ROOT_HANDLE, endpointUrl, ex.getClass().getName(), ex.getMessage()}); 231 } finally { 232 if (corporaResponse != null && !corporaResponse.isDone()) { 233 corporaResponse.cancel(true); 234 } 235 } 236 237 238 if (response != null && response.hasTerms()) { 239 for (SRUTerm term : response.getTerms()) { 240 // don't add corpus that introduces cyclic references 241 // as of March 2014, there are 2 such endpoints... 242 //if (cache.getCorpus(term.getValue())!= null) { 243 // LOGGER.warning("Cyclic reference in corpus " + term.getValue() + " of endpoint " + endpointUrl); 244 // continue; 245 //} 246 Corpus c = createCorpus(institution, endpointUrl, term); 247 corpora.add(c); 248 } 249 //} else if () { 250 // TODO if diagnistics came back, try simple scan without the 251 // SRUCQLscan.RESOURCE_INFO_PARAMETER 252 } else { 253 if (parentCorpus == null) { // means root 254 // create default root corpus: 255 Corpus c = new Corpus(institution, endpointUrl); 256 corpora.add(c); 257 } 258 } 259 return corpora; 260 } 261 262 private static Corpus createCorpus(Institution institution, String endpointUrl, SRUTerm term) { 263 Corpus c = new Corpus(institution, endpointUrl); 264 c.setHandle(term.getValue()); 265 c.setDisplayName(term.getDisplayTerm()); 266 if (term.getNumberOfRecords() > 0) { 267 c.setNumberOfRecords(term.getNumberOfRecords()); 268 } 269 addExtraInfo(c, term); 270 return c; 271 } 272 273 // TODO: ask Oliver to add API support for the extra info in the 274 // SRU client/server libraries, so that it's not necessary to work 275 // with DocumentFragment 276 private static void addExtraInfo(Corpus c, SRUTerm term) { 277 278 DocumentFragment extraInfo = term.getExtraTermData(); 279 String enDescription = null; 280 if (extraInfo != 
null) { 281 NodeList infoNodes = extraInfo.getChildNodes().item(0).getChildNodes(); 282 for (int i = 0; i < infoNodes.getLength(); i++) { 283 Node infoNode = infoNodes.item(i); 284 if (infoNode.getNodeType() == Node.ELEMENT_NODE && infoNode.getLocalName().equals("LandingPageURI")) { 285 c.setLandingPage(infoNode.getTextContent().trim()); 286 } else if (infoNode.getNodeType() == Node.ELEMENT_NODE && infoNode.getLocalName().equals("Languages")) { 287 NodeList languageNodes = infoNode.getChildNodes(); 288 for (int j = 0; j < languageNodes.getLength(); j++) { 289 if (languageNodes.item(j).getNodeType() == Node.ELEMENT_NODE && languageNodes.item(j).getLocalName().equals("Language")) { 290 Element languageNode = (Element) languageNodes.item(j); 291 String languageText = languageNode.getTextContent().trim(); 292 if (!languageText.isEmpty()) { 293 c.addLanguage(languageText.trim()); 294 } 295 } 296 297 } 298 } else if (infoNode.getNodeType() == Node.ELEMENT_NODE && infoNode.getLocalName().equals("Description")) { 299 Element element = (Element) infoNode; 300 String descr = infoNode.getTextContent().replaceAll("<br/>", " "); 301 descr = descr.replaceAll("<br/>", " "); 302 descr = descr.replaceAll("[\t\n\r ]+", " "); 303 c.setDescription(descr.trim()); 304 //String lang = element.getAttributeNS("http://clarin.eu/fcs/1.0/resource-info", "lang"); 305 //System.out.println("ATTRIBUTE LANG: " + lang); 306 if ("en".equals(element.getAttribute("xml:lang"))) { 307 enDescription = c.getDescription(); 308 } 309 } 310 } 311 // description in Engish has priority 312 if (enDescription != null && !enDescription.isEmpty()) { 313 c.setDescription(enDescription); 314 } 315 } 316 } 32 private static final Logger LOGGER = Logger.getLogger(ScanCrawler.class.getName()); 33 private CenterRegistryI cr; 34 private SRUThreadedClient sruScanClient; 35 private int maxDepth = 1; 36 private EndpointFilter filter = null; 37 38 public ScanCrawler(CenterRegistryI centerRegistry, SRUThreadedClient 
sruScanClient) { 39 cr = centerRegistry; 40 this.sruScanClient = sruScanClient; 41 } 42 43 public ScanCrawler(CenterRegistryI centerRegistry, SRUThreadedClient sruScanClient, EndpointFilter filter, int maxDepth) { 44 this(centerRegistry, sruScanClient); 45 this.maxDepth = maxDepth; 46 this.filter = filter; 47 } 48 49 /** 50 * Crawler of scan operation of FCS specification. Collects all the 51 * endpoints and resources descriptions into the provided cache. 52 * 53 * @param cache cache into which the endpoints and resources descriptions 54 * from scan operation responses should be collected. 55 */ 56 public void crawl(ScanCache cache) { 57 List<Institution> institutions = cr.getCQLInstitutions(); 58 for (Institution institution : institutions) { 59 cache.addInstitution(institution); 60 Iterable<Endpoint> endpoints = institution.getEndpoints(); 61 if (filter != null) { 62 endpoints = filter.filter(endpoints); 63 } 64 for (Endpoint endp : endpoints) { 65 Corpus parentCorpus = null;// i.e. it's root 66 addCorpora(sruScanClient, endp.getUrl(), institution, 0, parentCorpus, cache); 67 } 68 } 69 70 } 71 72 private void addCorpora(SRUThreadedClient sruScanClient, String endpointUrl, 73 Institution institution, int depth, Corpus parentCorpus, ScanCache cache) { 74 depth++; 75 if (depth > maxDepth) { 76 return; 77 } 78 79 List<Corpus> childrenCorpora = doScan(sruScanClient, endpointUrl, institution, parentCorpus); 80 81 for (Corpus c : childrenCorpora) { 82 // don't add corpus that introduces cyclic references 83 // as of March 2014, there are 2 such endpoints... 
84 if (cache.getCorpus(c.getHandle()) != null) { 85 LOGGER.warning("Cyclic reference in corpus " + c.getHandle() + " of endpoint " + endpointUrl); 86 continue; 87 } 88 cache.addCorpus(c, parentCorpus); 89 addCorpora(sruScanClient, c.getEndpointUrl(), c.getInstitution(), 90 depth, c, cache); 91 } 92 } 93 94 public static List<Corpus> doScan(SRUThreadedClient sruScanClient, 95 String endpointUrl, Institution institution, Corpus parentCorpus) { 96 97 List<Corpus> corpora = new ArrayList<Corpus>(); 98 Future<SRUScanResponse> corporaResponse = null; 99 SRUScanResponse response = null; 100 try { 101 SRUScanRequest corporaRequest = new SRUScanRequest(endpointUrl); 102 StringBuilder scanClause = new StringBuilder(SRUCQL.SCAN_RESOURCE_PARAMETER); 103 scanClause.append("="); 104 String normalizedHandle = normalizeHandle(parentCorpus, parentCorpus == null); 105 scanClause.append(normalizedHandle); 106 corporaRequest.setScanClause(scanClause.toString()); 107 corporaRequest.setExtraRequestData(SRUCQL.SCAN_RESOURCE_INFO_PARAMETER, 108 SRUCQL.SCAN_RESOURCE_INFO_PARAMETER_DEFAULT_VALUE); 109 corporaResponse = sruScanClient.scan(corporaRequest); 110 Thread.sleep(5000); 111 response = corporaResponse.get(600, TimeUnit.SECONDS); 112 } catch (TimeoutException ex) { 113 LOGGER.log(Level.SEVERE, "Timeout scanning corpora {0} at {1} {2} {3}", 114 new String[]{Corpus.ROOT_HANDLE, endpointUrl, ex.getClass().getName(), ex.getMessage()}); 115 } catch (Exception ex) { 116 LOGGER.log(Level.SEVERE, "Error accessing corpora {0} at {1} {2} {3}", 117 new String[]{Corpus.ROOT_HANDLE, endpointUrl, ex.getClass().getName(), ex.getMessage()}); 118 } finally { 119 if (corporaResponse != null && !corporaResponse.isDone()) { 120 corporaResponse.cancel(true); 121 } 122 } 123 124 if (response != null && response.hasTerms()) { 125 for (SRUTerm term : response.getTerms()) { 126 Corpus c = createCorpus(institution, endpointUrl, term); 127 corpora.add(c); 128 } 129 } else { 130 if (parentCorpus == null) { // 
means root 131 // create default root corpus: 132 Corpus c = new Corpus(institution, endpointUrl); 133 corpora.add(c); 134 } 135 } 136 return corpora; 137 } 138 139 private static String normalizeHandle(Corpus corpus, boolean root) { 140 if (root) { 141 return Corpus.ROOT_HANDLE; 142 } 143 String handle = corpus.getHandle(); 144 if (Corpus.HANDLE_WITH_SPECIAL_CHARS.matcher(handle).matches()) { 145 //resourceValue = "%22" + resourceValue + "%22"; 146 handle = "\"" + handle + "\""; 147 } 148 return handle; 149 } 150 151 private static Corpus createCorpus(Institution institution, String endpointUrl, SRUTerm term) { 152 Corpus c = new Corpus(institution, endpointUrl); 153 c.setHandle(term.getValue()); 154 c.setDisplayName(term.getDisplayTerm()); 155 if (term.getNumberOfRecords() > 0) { 156 c.setNumberOfRecords(term.getNumberOfRecords()); 157 } 158 addExtraInfo(c, term); 159 return c; 160 } 161 162 // TODO: ask Oliver to add API support for the extra info in the 163 // SRU client/server libraries, so that it's not necessary to work 164 // with DocumentFragment 165 private static void addExtraInfo(Corpus c, SRUTerm term) { 166 DocumentFragment extraInfo = term.getExtraTermData(); 167 String enDescription = null; 168 if (extraInfo != null) { 169 NodeList infoNodes = extraInfo.getChildNodes().item(0).getChildNodes(); 170 for (int i = 0; i < infoNodes.getLength(); i++) { 171 Node infoNode = infoNodes.item(i); 172 if (infoNode.getNodeType() == Node.ELEMENT_NODE && infoNode.getLocalName().equals("LandingPageURI")) { 173 c.setLandingPage(infoNode.getTextContent().trim()); 174 } else if (infoNode.getNodeType() == Node.ELEMENT_NODE && infoNode.getLocalName().equals("Languages")) { 175 NodeList languageNodes = infoNode.getChildNodes(); 176 for (int j = 0; j < languageNodes.getLength(); j++) { 177 if (languageNodes.item(j).getNodeType() == Node.ELEMENT_NODE && languageNodes.item(j).getLocalName().equals("Language")) { 178 Element languageNode = (Element) languageNodes.item(j); 179 
String languageText = languageNode.getTextContent().trim(); 180 if (!languageText.isEmpty()) { 181 c.addLanguage(languageText.trim()); 182 } 183 } 184 } 185 } else if (infoNode.getNodeType() == Node.ELEMENT_NODE && infoNode.getLocalName().equals("Description")) { 186 Element element = (Element) infoNode; 187 String descr = infoNode.getTextContent().replaceAll("<br/>", " "); 188 descr = descr.replaceAll("<br/>", " "); 189 descr = descr.replaceAll("[\t\n\r ]+", " "); 190 c.setDescription(descr.trim()); 191 //String lang = element.getAttributeNS("http://clarin.eu/fcs/1.0/resource-info", "lang"); 192 //System.out.println("ATTRIBUTE LANG: " + lang); 193 if ("en".equals(element.getAttribute("xml:lang"))) { 194 enDescription = c.getDescription(); 195 } 196 } 197 } 198 // description in Engish has priority 199 if (enDescription != null && !enDescription.isEmpty()) { 200 c.setDescription(enDescription); 201 } 202 } 203 } 317 204 } -
SRUAggregator/trunk/src/main/java/eu/clarin/sru/fcs/aggregator/cache/SimpleInMemScanCache.java
r5036 r5720 1 1 package eu.clarin.sru.fcs.aggregator.cache; 2 2 3 import eu.clarin.sru.fcs.aggregator. sopt.Corpus;4 import eu.clarin.sru.fcs.aggregator. sopt.Institution;3 import eu.clarin.sru.fcs.aggregator.registry.Corpus; 4 import eu.clarin.sru.fcs.aggregator.registry.Institution; 5 5 import java.util.ArrayList; 6 6 import java.util.HashMap; … … 14 14 15 15 /** 16 * Implementation of the cached scan data (endpoints descriptions) that 17 * storesthe cache in memory in maps.16 * Implementation of the cached scan data (endpoints descriptions) that stores 17 * the cache in memory in maps. 18 18 * 19 19 * @author yanapanchenko … … 21 21 public class SimpleInMemScanCache implements ScanCache { 22 22 23 24 25 26 27 28 29 23 private Map<String, List<Corpus>> enpUrlToRootCorpora = new LinkedHashMap<String, List<Corpus>>(30); 24 private Map<String, List<Corpus>> corpusToChildren = new HashMap<String, List<Corpus>>(); 25 //private Map<String, String> childToParent = new HashMap<String, String>(); 26 private Map<String, Corpus> handleToCorpus = new HashMap<String, Corpus>(); 27 private Map<String, Set<Corpus>> langToRootCorpora = new HashMap<String, Set<Corpus>>(); 28 private Map<String, Set<Corpus>> langToTopUniqueCorpora = new HashMap<String, Set<Corpus>>(); 29 private List<Institution> institutions = new ArrayList<Institution>(); 30 30 31 private static final Logger LOGGER = Logger.getLogger(SimpleInMemScanCache.class.getName()); 32 33 @Override 34 public List<Institution> getInstitutions() { 35 return institutions; 36 } 31 private static final Logger LOGGER = Logger.getLogger(SimpleInMemScanCache.class.getName()); 37 32 38 @Override 39 public List<Corpus> getRootCorporaOfEndpoint(String enpointUrl) { 40 List<Corpus> roots = new ArrayList<Corpus>(); 41 if (enpUrlToRootCorpora.containsKey(enpointUrl)) { 42 roots.addAll(enpUrlToRootCorpora.get(enpointUrl)); 43 } 44 return roots; 45 } 33 @Override 34 public List<Institution> getInstitutions() { 35 return institutions; 36 } 
46 37 47 @Override 48 public void addInstitution(Institution institution) { 49 institutions.add(institution); 50 } 38 @Override 39 public List<Corpus> getRootCorporaOfEndpoint(String enpointUrl) { 40 List<Corpus> roots = new ArrayList<Corpus>(); 41 if (enpUrlToRootCorpora.containsKey(enpointUrl)) { 42 roots.addAll(enpUrlToRootCorpora.get(enpointUrl)); 43 } 44 return roots; 45 } 51 46 52 53 public void addCorpus(Corpus c) {54 addCorpus(c, null);55 47 @Override 48 public void addInstitution(Institution institution) { 49 institutions.add(institution); 50 } 56 51 57 @Override 58 public void addCorpus(Corpus c, Corpus parentCorpus) { 52 @Override 53 public void addCorpus(Corpus c) { 54 addCorpus(c, null); 55 } 59 56 60 57 @Override 58 public void addCorpus(Corpus c, Corpus parentCorpus) { 61 59 62 60 handleToCorpus.put(c.getHandle(), c); 63 61 64 if (parentCorpus == null) { //i.e it's a root corpus 65 // index root corpora as for their languages 66 for (String lang : c.getLanguages()) { 67 if (!langToRootCorpora.containsKey(lang)) { 68 langToRootCorpora.put(lang, new HashSet<Corpus>()); 69 } 70 langToRootCorpora.get(lang).add(c); 71 } 72 // index root corpora as for their endpint url 73 if (!enpUrlToRootCorpora.containsKey(c.getEndpointUrl())) { 74 enpUrlToRootCorpora.put(c.getEndpointUrl(), new ArrayList<Corpus>()); 75 } 76 enpUrlToRootCorpora.get(c.getEndpointUrl()).add(c); 77 //childToParent.put(c.getHandle(), Corpus.ROOT_HANDLE); 78 } else { 79 if (!corpusToChildren.containsKey(parentCorpus.getHandle())) { 80 corpusToChildren.put(parentCorpus.getHandle(), new ArrayList<Corpus>()); 81 } 82 corpusToChildren.get(parentCorpus.getHandle()).add(c); 83 //childToParent.put(c.getHandle(), parentCorpus.getHandle()); 84 } 85 86 // index top corpora with unique language as for their languages 87 if (c.getLanguages().size() == 1 && 88 (parentCorpus == null || parentCorpus.getLanguages().size() > 0)) { 89 String lang = getElementOfStringUnitset(c.getLanguages()); 90 if 
(!langToTopUniqueCorpora.containsKey(lang)) { 91 langToTopUniqueCorpora.put(lang, new LinkedHashSet<Corpus>()); 92 } 93 langToTopUniqueCorpora.get(lang).add(c); 94 } 95 } 62 if (parentCorpus == null) { //i.e it's a root corpus 63 // index root corpora as for their languages 64 for (String lang : c.getLanguages()) { 65 if (!langToRootCorpora.containsKey(lang)) { 66 langToRootCorpora.put(lang, new HashSet<Corpus>()); 67 } 68 langToRootCorpora.get(lang).add(c); 69 } 70 // index root corpora as for their endpint url 71 if (!enpUrlToRootCorpora.containsKey(c.getEndpointUrl())) { 72 enpUrlToRootCorpora.put(c.getEndpointUrl(), new ArrayList<Corpus>()); 73 } 74 enpUrlToRootCorpora.get(c.getEndpointUrl()).add(c); 75 //childToParent.put(c.getHandle(), Corpus.ROOT_HANDLE); 76 } else { 77 if (!corpusToChildren.containsKey(parentCorpus.getHandle())) { 78 corpusToChildren.put(parentCorpus.getHandle(), new ArrayList<Corpus>()); 79 } 80 corpusToChildren.get(parentCorpus.getHandle()).add(c); 81 //childToParent.put(c.getHandle(), parentCorpus.getHandle()); 82 } 96 83 97 @Override 98 public String toString() { 99 return "cache{\n" + "institutions=" + institutions + "\n" 100 + "enpUrlToRootCorpora=" + enpUrlToRootCorpora 101 + "\n corpusToChildren=" + corpusToChildren 102 + "\n langToTopUniqueCorpora=" + langToTopUniqueCorpora + "\n}"; 103 } 84 // index top corpora with unique language as for their languages 85 if (c.getLanguages().size() == 1 86 && (parentCorpus == null || parentCorpus.getLanguages().size() > 0)) { 87 String lang = getElementOfStringUnitset(c.getLanguages()); 88 if (!langToTopUniqueCorpora.containsKey(lang)) { 89 langToTopUniqueCorpora.put(lang, new LinkedHashSet<Corpus>()); 90 } 91 langToTopUniqueCorpora.get(lang).add(c); 92 } 93 } 104 94 105 @Override 106 public boolean isEmpty() { 107 return enpUrlToRootCorpora.isEmpty(); 108 } 95 @Override 96 public String toString() { 97 return "cache{\n" + "institutions=" + institutions + "\n" 98 + "enpUrlToRootCorpora=" + 
enpUrlToRootCorpora 99 + "\n corpusToChildren=" + corpusToChildren 100 + "\n langToTopUniqueCorpora=" + langToTopUniqueCorpora + "\n}"; 101 } 109 102 110 @Override 111 public List<Corpus> getRootCorpora() { 112 List<Corpus> rootCorpora = new ArrayList<Corpus>(enpUrlToRootCorpora.size()); 113 for (List<Corpus> corpora : this.enpUrlToRootCorpora.values()) { 114 rootCorpora.addAll(corpora); 115 } 116 return rootCorpora; 117 } 103 @Override 104 public boolean isEmpty() { 105 return enpUrlToRootCorpora.isEmpty(); 106 } 118 107 119 @Override 120 public Set<String> getLanguages() { 121 Set<String> languages = new HashSet<String>(this.langToRootCorpora.size()); 122 languages.addAll(this.langToRootCorpora.keySet()); 123 return languages; 124 } 108 @Override 109 public List<Corpus> getRootCorpora() { 110 List<Corpus> rootCorpora = new ArrayList<Corpus>(enpUrlToRootCorpora.size()); 111 for (List<Corpus> corpora : this.enpUrlToRootCorpora.values()) { 112 rootCorpora.addAll(corpora); 113 } 114 return rootCorpora; 115 } 125 116 126 @Override 127 public List<Corpus> getChildren(Corpus corpus) { 128 List<Corpus> corpora = this.corpusToChildren.get(corpus.getHandle()); 129 if (corpora == null) { 130 return (new ArrayList<Corpus>()); 131 } else { 132 List<Corpus> corporaCopy = new ArrayList<Corpus>(corpora); 133 return corporaCopy; 134 } 135 } 117 @Override 118 public Set<String> getLanguages() { 119 Set<String> languages = new HashSet<String>(this.langToRootCorpora.size()); 120 languages.addAll(this.langToRootCorpora.keySet()); 121 return languages; 122 } 136 123 137 @Override 138 public Map<String, Set<Corpus>> getRootCorporaForLang() { 139 return langToRootCorpora; 140 } 124 @Override 125 public List<Corpus> getChildren(Corpus corpus) { 126 List<Corpus> corpora = this.corpusToChildren.get(corpus.getHandle()); 127 if (corpora == null) { 128 return (new ArrayList<Corpus>()); 129 } else { 130 List<Corpus> corporaCopy = new ArrayList<Corpus>(corpora); 131 return corporaCopy; 132 } 
133 } 141 134 142 @Override 143 public List<Corpus> getRootCorporaForLang(String lang) { 144 List<Corpus> rootCorpora = new ArrayList<Corpus>(enpUrlToRootCorpora.size()); 145 for (List<Corpus> corpora : this.enpUrlToRootCorpora.values()) { 146 for (Corpus corpus : corpora) { 147 if (corpus.getLanguages().contains(lang)) { 148 rootCorpora.add(corpus); 149 } 150 } 151 } 152 return rootCorpora; 153 } 135 @Override 136 public Map<String, Set<Corpus>> getRootCorporaForLang() { 137 return langToRootCorpora; 138 } 154 139 155 @Override 156 public Map<String, Set<Corpus>> getTopUniqueLangToCorpora() { 157 return this.langToTopUniqueCorpora; 158 } 140 @Override 141 public List<Corpus> getRootCorporaForLang(String lang) { 142 List<Corpus> rootCorpora = new ArrayList<Corpus>(enpUrlToRootCorpora.size()); 143 for (List<Corpus> corpora : this.enpUrlToRootCorpora.values()) { 144 for (Corpus corpus : corpora) { 145 if (corpus.getLanguages().contains(lang)) { 146 rootCorpora.add(corpus); 147 } 148 } 149 } 150 return rootCorpora; 151 } 159 152 160 @Override 161 public List<Corpus> getTopUniqueLanguageCorpora(String lang) { 162 ArrayList<Corpus> corpora = new ArrayList<Corpus>(langToTopUniqueCorpora.get(lang).size()); 163 corpora.addAll(langToTopUniqueCorpora.get(lang)); 164 return corpora; 165 } 153 @Override 154 public Map<String, Set<Corpus>> getTopUniqueLangToCorpora() { 155 return this.langToTopUniqueCorpora; 156 } 166 157 167 @Override 168 public Corpus getCorpus(String handle) { 169 return this.handleToCorpus.get(handle); 170 } 158 @Override 159 public List<Corpus> getTopUniqueLanguageCorpora(String lang) { 160 ArrayList<Corpus> corpora = new ArrayList<Corpus>(langToTopUniqueCorpora.get(lang).size()); 161 corpora.addAll(langToTopUniqueCorpora.get(lang)); 162 return corpora; 163 } 171 164 172 private String getElementOfStringUnitset(Set<String> stringUnitSet) { 173 return stringUnitSet.iterator().next(); 174 } 165 @Override 166 public Corpus getCorpus(String handle) { 167 
return this.handleToCorpus.get(handle); 168 } 169 170 private String getElementOfStringUnitset(Set<String> stringUnitSet) { 171 return stringUnitSet.iterator().next(); 172 } 175 173 } -
SRUAggregator/trunk/src/main/java/eu/clarin/sru/fcs/aggregator/util/SRUCQL.java
r5041 r5720 3 3 /** 4 4 * Utility for storing constants related to SRU/CQL specification. 5 * 5 * 6 6 * @author Yana Panchenko 7 7 */ 8 8 public class SRUCQL { 9 10 public static final String OPERATION = "operation"; 11 12 public static final String VERSION = "version"; 13 14 15 public static final String SEARCH_RETRIEVE = "searchRetrieve"; 16 public static final String SEARCH_CORPUS_HANDLE_PARAMETER = "x-cmd-context"; 17 public static final String SEARCH_QUERY_PARAMETER = "query"; 18 19 20 public static final String SCAN = "scan"; 21 public static final String SCAN_RESOURCE_PARAMETER = "fcs.resource"; 22 public static final String SCAN_RESOURCE_PARAMETER_DEFAULT_VALUE = "root"; 23 public static final String SCAN_RESOURCE_INFO_PARAMETER = "x-cmd-resource-info"; 24 public static final String SCAN_RESOURCE_INFO_PARAMETER_DEFAULT_VALUE = "true"; 25 26 27 public static final String EXPLAIN = "explain"; 28 29 public static final String AGGREGATION_CONTEXT = "x-aggregation-context"; 30 31 9 10 public static final String VERSION = "version"; 11 12 public static final String SEARCH_RETRIEVE = "searchRetrieve"; 13 public static final String SEARCH_CORPUS_HANDLE_PARAMETER = "x-cmd-context"; 14 public static final String SEARCH_QUERY_PARAMETER = "query"; 15 16 public static final String SCAN = "scan"; 17 public static final String SCAN_RESOURCE_PARAMETER = "fcs.resource"; 18 public static final String SCAN_RESOURCE_PARAMETER_DEFAULT_VALUE = "root"; 19 public static final String SCAN_RESOURCE_INFO_PARAMETER = "x-cmd-resource-info"; 20 public static final String SCAN_RESOURCE_INFO_PARAMETER_DEFAULT_VALUE = "true"; 21 22 public static final String EXPLAIN = "explain"; 23 24 public static final String AGGREGATION_CONTEXT = "x-aggregation-context"; 32 25 } -
SRUAggregator/trunk/src/main/webapp/META-INF/context.xml
r2450 r5720 1 1 <?xml version="1.0" encoding="UTF-8"?> 2 <Context antiJARLocking="true" path="/ aggregator"/>2 <Context antiJARLocking="true" path="/Aggregator2"/> -
SRUAggregator/trunk/src/main/webapp/WEB-INF/web.xml
r5041 r5720 1 1 <?xml version="1.0" encoding="UTF-8"?> 2 2 <web-app version="3.0" xmlns="http://java.sun.com/xml/ns/javaee" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://java.sun.com/xml/ns/javaee http://java.sun.com/xml/ns/javaee/web-app_3_0.xsd"> 3 <description>CLARIN-D Federated Content Search Aggregator</description> 4 <display-name>CLARIN-D Federated Content Search Aggregator</display-name> 5 6 <env-entry> 7 <env-entry-name>center-registry-url</env-entry-name> 8 <env-entry-type>java.lang.String</env-entry-type> 9 <env-entry-value>http://centerregistry-clarin.esc.rzg.mpg.de/restxml/</env-entry-value> 10 </env-entry> 11 <env-entry> 12 <env-entry-name>weblicht-url</env-entry-name> 13 <env-entry-type>java.lang.String</env-entry-type> 14 <env-entry-value>https://weblicht.sfs.uni-tuebingen.de/WebLicht-4/?input=</env-entry-value> 15 </env-entry> 16 <env-entry> 17 <env-entry-name>update-interval-unit</env-entry-name> 18 <env-entry-type>java.lang.String</env-entry-type> 19 <env-entry-value>HOURS</env-entry-value> 20 </env-entry> 21 <env-entry> 22 <env-entry-name>update-interval</env-entry-name> 23 <env-entry-type>java.lang.Integer</env-entry-type> 24 <env-entry-value>6</env-entry-value> 25 </env-entry> 26 <env-entry> 27 <env-entry-name>scan-max-depth</env-entry-name> 28 <env-entry-type>java.lang.Integer</env-entry-type> 29 <env-entry-value>3</env-entry-value> 30 </env-entry> 31 <!-- Value of this property (data.location) should be specified in CATALINA_OPT 32 (e.g. inside /etc/init.d/tomcat7*), unless the default is used. 
33 Currently defaults to /data/ or user.home --> 34 <env-entry> 35 <env-entry-name>data-location-property</env-entry-name> 36 <env-entry-type>java.lang.String</env-entry-type> 37 <env-entry-value>data.location</env-entry-value> 38 </env-entry> 39 <!-- Folder for the data specific to the current Aggregator application, 40 supposed to be inside the data location folder above --> 41 <env-entry> 42 <env-entry-name>aggregator-folder</env-entry-name> 43 <env-entry-type>java.lang.String</env-entry-type> 44 <env-entry-value>fcsAggregator</env-entry-value> 45 </env-entry> 46 47 <listener> 48 <description>ZK listener for session cleanup</description> 49 <listener-class>org.zkoss.zk.ui.http.HttpSessionListener</listener-class> 50 </listener> 51 52 <servlet> 53 <description>ZK loader for ZUML pages</description> 54 <servlet-name>zkLoader</servlet-name> 55 <servlet-class>org.zkoss.zk.ui.http.DHtmlLayoutServlet</servlet-class> 56 <init-param> 57 <param-name>update-uri</param-name> 58 <param-value>/zkau</param-value> 59 </init-param> 60 <load-on-startup>1</load-on-startup> 61 </servlet> 62 <servlet-mapping> 63 <servlet-name>zkLoader</servlet-name> 64 <url-pattern>*.zul</url-pattern> 65 </servlet-mapping> 66 <servlet-mapping> 67 <servlet-name>zkLoader</servlet-name> 68 <url-pattern>*.zhtml</url-pattern> 69 </servlet-mapping> 70 <!-- Optional. Uncomment it if you want to use richlets. 
71 <servlet-mapping> 72 <servlet-name>zkLoader</servlet-name> 73 <url-pattern>/zk/*</url-pattern> 74 </servlet-mapping> 75 --> 76 <servlet> 77 <description>The asynchronous update engine for ZK</description> 78 <servlet-name>auEngine</servlet-name> 79 <servlet-class>org.zkoss.zk.au.http.DHtmlUpdateServlet</servlet-class> 80 </servlet> 81 <servlet-mapping> 82 <servlet-name>auEngine</servlet-name> 83 <url-pattern>/zkau/*</url-pattern> 84 </servlet-mapping> 85 86 <servlet> 87 <servlet-name>ServletAdaptor</servlet-name> 88 <servlet-class>com.sun.jersey.spi.container.servlet.ServletContainer</servlet-class> 89 <init-param> 90 <param-name>javax.ws.rs.Application</param-name> 91 <param-value>eu.clarin.sru.fcs.aggregator.rest.AggregatorService</param-value> 92 </init-param> 93 <load-on-startup>1</load-on-startup> 94 </servlet> 95 <servlet-mapping> 96 <servlet-name>ServletAdaptor</servlet-name> 97 <url-pattern>/service/*</url-pattern> 98 </servlet-mapping> 99 100 <welcome-file-list> 101 <welcome-file>index.zul</welcome-file> 102 <welcome-file>index.zhtml</welcome-file> 103 <welcome-file>index.html</welcome-file> 104 <welcome-file>index.htm</welcome-file> 105 </welcome-file-list> 106 3 <description>CLARIN-D Federated Content Search Aggregator</description> 4 <display-name>CLARIN-D Federated Content Search Aggregator</display-name> 5 6 <env-entry> 7 <env-entry-name>center-registry-url</env-entry-name> 8 <env-entry-type>java.lang.String</env-entry-type> 9 <env-entry-value>http://centerregistry-clarin.esc.rzg.mpg.de/restxml/</env-entry-value> 10 </env-entry> 11 <env-entry> 12 <env-entry-name>weblicht-url</env-entry-name> 13 <env-entry-type>java.lang.String</env-entry-type> 14 <env-entry-value>https://weblicht.sfs.uni-tuebingen.de/WebLicht-4/?input=</env-entry-value> 15 </env-entry> 16 <env-entry> 17 <env-entry-name>update-interval-unit</env-entry-name> 18 <env-entry-type>java.lang.String</env-entry-type> 19 <env-entry-value>HOURS</env-entry-value> 20 </env-entry> 21 <env-entry> 
22 <env-entry-name>update-interval</env-entry-name> 23 <env-entry-type>java.lang.Integer</env-entry-type> 24 <env-entry-value>6</env-entry-value> 25 </env-entry> 26 <env-entry> 27 <env-entry-name>scan-max-depth</env-entry-name> 28 <env-entry-type>java.lang.Integer</env-entry-type> 29 <env-entry-value>3</env-entry-value> 30 </env-entry> 31 <!-- Value of this property (data.location) should be specified in CATALINA_OPT 32 (e.g. inside /etc/init.d/tomcat7*), unless the default is used. 33 Currently defaults to /data/ or user.home --> 34 <env-entry> 35 <env-entry-name>data-location-property</env-entry-name> 36 <env-entry-type>java.lang.String</env-entry-type> 37 <env-entry-value>data.location</env-entry-value> 38 </env-entry> 39 <!-- Folder for the data specific to the current Aggregator application, 40 supposed to be inside the data location folder above --> 41 <env-entry> 42 <env-entry-name>aggregator-folder</env-entry-name> 43 <env-entry-type>java.lang.String</env-entry-type> 44 <env-entry-value>fcsAggregator</env-entry-value> 45 </env-entry> 46 47 48 <servlet> 49 <servlet-name>Jersey REST Service</servlet-name> 50 <servlet-class>com.sun.jersey.spi.container.servlet.ServletContainer</servlet-class> 51 <init-param> 52 <param-name>com.sun.jersey.config.property.packages</param-name> 53 <param-value>eu.clarin.sru.fcs.aggregator.rest;org.codehaus.jackson.jaxrs</param-value> 54 </init-param> 55 <load-on-startup>1</load-on-startup> 56 </servlet> 57 <servlet-mapping> 58 <servlet-name>Jersey REST Service</servlet-name> 59 <url-pattern>/rest/*</url-pattern> 60 </servlet-mapping> 61 62 <session-config> 63 <session-timeout>30</session-timeout> 64 </session-config> 65 <welcome-file-list> 66 <welcome-file>index.html</welcome-file> 67 </welcome-file-list> 68 69 70 <listener> 71 <listener-class>eu.clarin.sru.fcs.aggregator.app.Aggregator</listener-class> 72 </listener> 73 107 74 </web-app> -
SRUAggregator/trunk/src/test/java/eu/clarin/sru/fcs/aggregator/app/ScanCacheFileTest.java
r5701 r5720 1 1 package eu.clarin.sru.fcs.aggregator.app; 2 2 3 import eu.clarin.sru.fcs.aggregator.cache.ScanCacheFiled;4 import eu.clarin.sru.fcs.aggregator.cache.SimpleInMemScanCache;5 3 import eu.clarin.sru.fcs.aggregator.cache.ScanCache; 6 import eu.clarin.sru.fcs.aggregator.sopt.Corpus; 7 import eu.clarin.sru.fcs.aggregator.sopt.Endpoint; 4 import eu.clarin.sru.fcs.aggregator.cache.ScanCacheFile; 5 import eu.clarin.sru.fcs.aggregator.registry.Corpus; 6 import eu.clarin.sru.fcs.aggregator.registry.Endpoint; 8 7 import java.io.File; 9 8 import java.util.List; 10 9 import org.junit.Assert; 10 import org.junit.Ignore; 11 11 import org.junit.Test; 12 12 … … 15 15 * @author yanapanchenko 16 16 */ 17 public class ScanCacheFiledTest { 18 19 20 21 @Test 22 public void testReadWriteDepth1() { 23 String scanDir = "/scan-bas"; 24 String scanPath1 = this.getClass().getResource(scanDir).getFile(); 25 String scanPath2 = "/tmp/scan-bas"; 26 File scanDir2 = new File(scanPath2); 27 if (!scanDir2.exists()) { 28 scanDir2.mkdir(); 29 } 30 31 ScanCacheFiled scanFiled1 = new ScanCacheFiled(scanPath1); 32 ScanCache cacheOrig = scanFiled1.read(); 33 34 ScanCacheFiled scanFiled2 = new ScanCacheFiled(scanPath2); 35 scanFiled2.write(cacheOrig); 36 37 ScanCacheFiled scanFiled3 = new ScanCacheFiled(scanPath2); 38 ScanCache cacheRewritten = scanFiled3.read(); 39 40 //make sure caches contain the same info after read-write 41 Assert.assertEquals(cacheOrig.getInstitutions().size(), cacheRewritten.getInstitutions().size()); 42 Endpoint epOrig = cacheOrig.getInstitutions().get(2).getEndpoint(0); 43 Endpoint epRewritten = cacheRewritten.getInstitutions().get(2).getEndpoint(0); 44 Assert.assertEquals(epOrig.getUrl(), epRewritten.getUrl()); 45 Assert.assertEquals(epOrig, epRewritten); 46 List<Corpus> rootCorporaOrig = cacheOrig.getRootCorporaOfEndpoint(epOrig.getUrl()); 47 List<Corpus> rootCorporaRewritten = cacheRewritten.getRootCorporaOfEndpoint(epOrig.getUrl()); 48 
Assert.assertEquals(rootCorporaOrig.size(), rootCorporaRewritten.size()); 49 Assert.assertEquals(3, rootCorporaRewritten.size()); 50 Assert.assertEquals(rootCorporaOrig.get(0), rootCorporaRewritten.get(0)); 51 List<Corpus> childenOrig = cacheOrig.getChildren(rootCorporaOrig.get(0)); 52 List<Corpus> childenRewritten = cacheRewritten.getChildren(rootCorporaOrig.get(0)); 53 Assert.assertEquals(childenOrig, childenRewritten); 54 Assert.assertEquals(rootCorporaOrig.get(0).getLanguages(), rootCorporaRewritten.get(0).getLanguages()); 55 17 @Ignore 18 public class ScanCacheFileTest { 19 @Test 20 public void testReadWriteDepth1() { 21 String scanDir = "/scan-bas"; 22 String scanPath1 = this.getClass().getResource(scanDir).getFile(); 23 String scanPath2 = "/tmp/scan-bas"; 24 File scanDir2 = new File(scanPath2); 25 if (!scanDir2.exists()) { 26 scanDir2.mkdir(); 27 } 28 29 ScanCacheFile scanFiled1 = new ScanCacheFile(scanPath1); 30 ScanCache cacheOrig = scanFiled1.read(); 31 32 ScanCacheFile scanFiled2 = new ScanCacheFile(scanPath2); 33 scanFiled2.write(cacheOrig); 34 35 ScanCacheFile scanFiled3 = new ScanCacheFile(scanPath2); 36 ScanCache cacheRewritten = scanFiled3.read(); 37 38 //make sure caches contain the same info after read-write 39 Assert.assertEquals(cacheOrig.getInstitutions().size(), cacheRewritten.getInstitutions().size()); 40 Endpoint epOrig = cacheOrig.getInstitutions().get(2).getEndpoint(0); 41 Endpoint epRewritten = cacheRewritten.getInstitutions().get(2).getEndpoint(0); 42 Assert.assertEquals(epOrig.getUrl(), epRewritten.getUrl()); 43 Assert.assertEquals(epOrig, epRewritten); 44 List<Corpus> rootCorporaOrig = cacheOrig.getRootCorporaOfEndpoint(epOrig.getUrl()); 45 List<Corpus> rootCorporaRewritten = cacheRewritten.getRootCorporaOfEndpoint(epOrig.getUrl()); 46 Assert.assertEquals(rootCorporaOrig.size(), rootCorporaRewritten.size()); 47 Assert.assertEquals(3, rootCorporaRewritten.size()); 48 Assert.assertEquals(rootCorporaOrig.get(0), 
rootCorporaRewritten.get(0)); 49 List<Corpus> childenOrig = cacheOrig.getChildren(rootCorporaOrig.get(0)); 50 List<Corpus> childenRewritten = cacheRewritten.getChildren(rootCorporaOrig.get(0)); 51 Assert.assertEquals(childenOrig, childenRewritten); 52 Assert.assertEquals(rootCorporaOrig.get(0).getLanguages(), rootCorporaRewritten.get(0).getLanguages()); 53 56 54 //System.out.println(cacheOrig); 57 58 59 } 60 61 62 63 64 65 66 67 68 69 70 71 ScanCacheFiled scanFiled1 = new ScanCacheFiled(scanPath1);72 73 74 ScanCacheFiled scanFiled2 = new ScanCacheFiled(scanPath2);75 76 77 ScanCacheFiled scanFiled3 = new ScanCacheFiled(scanPath2);78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 55 //System.out.println(); 56 //System.out.println(cacheRewritten); 57 } 58 59 @Test 60 public void testReadWriteDepth2() { 61 String scanDir = "/scan-mpi"; 62 String scanPath1 = this.getClass().getResource(scanDir).getFile(); 63 String scanPath2 = "/tmp/scan-mpi"; 64 File scanDir2 = new File(scanPath2); 65 if (!scanDir2.exists()) { 66 scanDir2.mkdir(); 67 } 68 69 ScanCacheFile scanFiled1 = new ScanCacheFile(scanPath1); 70 ScanCache cacheOrig = scanFiled1.read(); 71 72 ScanCacheFile scanFiled2 = new ScanCacheFile(scanPath2); 73 scanFiled2.write(cacheOrig); 74 75 ScanCacheFile scanFiled3 = new ScanCacheFile(scanPath2); 76 ScanCache cacheRewritten = scanFiled3.read(); 77 78 //make sure caches contain the same info after read-write 79 Assert.assertEquals(cacheOrig.getInstitutions().size(), cacheRewritten.getInstitutions().size()); 80 Endpoint epOrig = cacheOrig.getInstitutions().get(4).getEndpoint(0); 81 Endpoint epRewritten = cacheRewritten.getInstitutions().get(4).getEndpoint(0); 82 Assert.assertEquals(epOrig.getUrl(), epRewritten.getUrl()); 83 Assert.assertEquals(epOrig, epRewritten); 84 List<Corpus> rootCorporaOrig = cacheOrig.getRootCorporaOfEndpoint(epOrig.getUrl()); 85 List<Corpus> rootCorporaRewritten = cacheRewritten.getRootCorporaOfEndpoint(epOrig.getUrl()); 86 
Assert.assertEquals(rootCorporaOrig.size(), rootCorporaRewritten.size()); 87 Assert.assertEquals(3, rootCorporaRewritten.size()); 88 Assert.assertEquals(rootCorporaOrig.get(0), rootCorporaRewritten.get(0)); 89 List<Corpus> childenOrig = cacheOrig.getChildren(rootCorporaOrig.get(0)); 90 List<Corpus> childenRewritten = cacheRewritten.getChildren(rootCorporaOrig.get(0)); 91 Assert.assertEquals(childenOrig, childenRewritten); 92 Assert.assertEquals(2, childenRewritten.size()); 93 Assert.assertEquals(rootCorporaOrig.get(0).getLanguages(), rootCorporaRewritten.get(0).getLanguages()); 94 97 95 // System.out.println(cacheOrig); 98 96 // System.out.println(); 99 97 // System.out.println(cacheRewritten); 100 } 101 102 103 104 105 106 107 108 109 110 111 112 ScanCacheFiled scanFiled1 = new ScanCacheFiled(scanPath1);113 114 115 ScanCacheFiled scanFiled2 = new ScanCacheFiled(scanPath2);116 117 118 ScanCacheFiled scanFiled3 = new ScanCacheFiled(scanPath2);119 120 121 122 123 124 125 126 127 128 129 130 131 132 98 } 99 100 @Test 101 public void testReadWriteDefaultCorpus() { 102 String scanDir = "/scan-def"; 103 String scanPath1 = this.getClass().getResource(scanDir).getFile(); 104 String scanPath2 = "/tmp/scan-def"; 105 File scanDir2 = new File(scanPath2); 106 if (!scanDir2.exists()) { 107 scanDir2.mkdir(); 108 } 109 110 ScanCacheFile scanFiled1 = new ScanCacheFile(scanPath1); 111 ScanCache cacheOrig = scanFiled1.read(); 112 113 ScanCacheFile scanFiled2 = new ScanCacheFile(scanPath2); 114 scanFiled2.write(cacheOrig); 115 116 ScanCacheFile scanFiled3 = new ScanCacheFile(scanPath2); 117 ScanCache cacheRewritten = scanFiled3.read(); 118 119 //make sure caches contain the same info after read-write 120 Assert.assertEquals(cacheOrig.getInstitutions().size(), cacheRewritten.getInstitutions().size()); 121 Endpoint epOrig = cacheOrig.getInstitutions().get(4).getEndpoint(0); 122 Endpoint epRewritten = cacheRewritten.getInstitutions().get(4).getEndpoint(0); 123 
Assert.assertEquals(epOrig.getUrl(), epRewritten.getUrl()); 124 Assert.assertEquals(epOrig, epRewritten); 125 List<Corpus> rootCorporaOrig = cacheOrig.getRootCorporaOfEndpoint(epOrig.getUrl()); 126 List<Corpus> rootCorporaRewritten = cacheRewritten.getRootCorporaOfEndpoint(epOrig.getUrl()); 127 Assert.assertEquals(rootCorporaOrig.size(), rootCorporaRewritten.size()); 128 Assert.assertEquals(1, rootCorporaRewritten.size()); 129 Assert.assertEquals(rootCorporaOrig.get(0).getLanguages(), rootCorporaRewritten.get(0).getLanguages()); 130 133 131 // System.out.println(cacheOrig); 134 132 // System.out.println(); 135 133 // System.out.println(cacheRewritten); 136 } 137 138 139 140 141 142 143 144 145 146 147 148 ScanCacheFiled scanFiled1 = new ScanCacheFiled(scanPath1);149 150 151 ScanCacheFiled scanFiled2 = new ScanCacheFiled(scanPath2);152 153 154 ScanCacheFiled scanFiled3 = new ScanCacheFiled(scanPath2);155 156 157 158 159 160 161 162 163 164 165 166 167 168 134 } 135 136 @Test 137 public void testReadWrite2Endpoints() { 138 String scanDir = "/scan-2ep"; 139 String scanPath1 = this.getClass().getResource(scanDir).getFile(); 140 String scanPath2 = "/tmp/scan-2ep"; 141 File scanDir2 = new File(scanPath2); 142 if (!scanDir2.exists()) { 143 scanDir2.mkdir(); 144 } 145 146 ScanCacheFile scanFiled1 = new ScanCacheFile(scanPath1); 147 ScanCache cacheOrig = scanFiled1.read(); 148 149 ScanCacheFile scanFiled2 = new ScanCacheFile(scanPath2); 150 scanFiled2.write(cacheOrig); 151 152 ScanCacheFile scanFiled3 = new ScanCacheFile(scanPath2); 153 ScanCache cacheRewritten = scanFiled3.read(); 154 155 //make sure caches contain the same info after read-write 156 Assert.assertEquals(cacheOrig.getInstitutions().size(), cacheRewritten.getInstitutions().size()); 157 Assert.assertEquals(cacheOrig.getRootCorpora().size(), cacheRewritten.getRootCorpora().size()); 158 Endpoint epOrig = cacheOrig.getInstitutions().get(2).getEndpoint(0); 159 Endpoint epRewritten = 
cacheRewritten.getInstitutions().get(2).getEndpoint(0); 160 Assert.assertEquals(epOrig.getUrl(), epRewritten.getUrl()); 161 Assert.assertEquals(epOrig, epRewritten); 162 epOrig = cacheOrig.getInstitutions().get(4).getEndpoint(0); 163 epRewritten = cacheRewritten.getInstitutions().get(4).getEndpoint(0); 164 Assert.assertEquals(epOrig.getUrl(), epRewritten.getUrl()); 165 Assert.assertEquals(epOrig, epRewritten); 166 169 167 // List<Corpus> rootCorporaOrig = cacheOrig.getRootCorporaOfEndpoint(epOrig.getUrl()); 170 168 // List<Corpus> rootCorporaRewritten = cacheRewritten.getRootCorporaOfEndpoint(epOrig.getUrl()); … … 176 174 // Assert.assertEquals(childenOrig, childenRewritten); 177 175 // Assert.assertEquals(rootCorporaOrig.get(0).getLanguages(), rootCorporaRewritten.get(0).getLanguages()); 178 179 176 //System.out.println(cacheOrig); 180 181 182 } 183 177 //System.out.println(); 178 //System.out.println(cacheRewritten); 179 } 180 184 181 } -
SRUAggregator/trunk/src/test/java/eu/clarin/sru/fcs/aggregator/app/ScanCrawlerTest.java
r5037 r5720 3 3 import eu.clarin.sru.client.SRUThreadedClient; 4 4 import eu.clarin.sru.fcs.aggregator.cache.EndpointUrlFilter; 5 import eu.clarin.sru.fcs.aggregator.cache.ScanCrawler; 5 6 import eu.clarin.sru.fcs.aggregator.cache.SimpleInMemScanCache; 6 import eu.clarin.sru.fcs.aggregator.cache.ScanCrawler; 7 import eu.clarin.sru.fcs.aggregator.sopt.CenterRegistryLive; 8 import eu.clarin.sru.fcs.aggregator.sopt.Corpus; 7 import eu.clarin.sru.fcs.aggregator.registry.CenterRegistryLive; 8 import eu.clarin.sru.fcs.aggregator.registry.Corpus; 9 9 import java.util.HashSet; 10 10 import java.util.Set; 11 12 11 import org.junit.Assert; 12 import org.junit.Ignore; 13 13 import org.junit.Test; 14 14 … … 17 17 * @author yanapanchenko 18 18 */ 19 @Ignore 19 20 public class ScanCrawlerTest { 20 21 21 // @Test 22 // public void testCrawlForMpiAndTue() { 23 // 24 // SRUThreadedClient sruClient = new SRUThreadedClient(); 25 // 26 // try { 27 // EndpointUrlFilter filter = new EndpointUrlFilter(); 28 // //filter.urlShouldContainAnyOf("leipzig", ".mpi.nl"); 29 // filter.urlShouldContainAnyOf("uni-tuebingen.de", ".mpi.nl"); 30 // //filter.urlShouldContainAnyOf("dspin.dwds.de", "lindat."); 31 // ScanCrawler crawler = new ScanCrawler(new CenterRegistryLive(), sruClient, filter, 2); 32 // SimpleInMemScanCache cache = new SimpleInMemScanCache(); 33 // crawler.crawl(cache); 34 // Corpus tueRootCorpus = cache.getRootCorporaOfEndpoint("http://weblicht.sfs.uni-tuebingen.de/rws/sru/").get(0); 35 // Corpus mpiRootCorpus = cache.getRootCorporaOfEndpoint("http://cqlservlet.mpi.nl/").get(0); 36 // Assert.assertEquals("http://hdl.handle.net/11858/00-1778-0000-0001-DDAF-D", 37 // tueRootCorpus.getHandle()); 38 // Corpus mpiCorpus = cache.getCorpus("hdl:1839/00-0000-0000-0001-53A5-2@format=cmdi"); 39 // Assert.assertEquals("hdl:1839/00-0000-0000-0003-4692-D@format=cmdi", cache.getChildren(mpiCorpus).get(0).getHandle()); 40 // //check if languages and other corpus data is crawled corectly... 
41 // Set<String> tueLangs = new HashSet<String>(); 42 // tueLangs.add("deu"); 43 // Assert.assertEquals(tueLangs, tueRootCorpus.getLanguages()); 44 // String tueDescSubstring = "Tübingen Treebank"; 45 // Assert.assertTrue("Description problem", tueRootCorpus.getDescription().contains(tueDescSubstring)); 46 // String tueNameSubstring = "TuebaDDC"; 47 // Assert.assertTrue("Name problem", tueRootCorpus.getDisplayName().contains(tueNameSubstring)); 48 // String tuePageSubstring = "sfs.uni-tuebingen.de"; 49 // Assert.assertTrue("Landing page problem", tueRootCorpus.getLandingPage().contains(tuePageSubstring)); 50 // Assert.assertTrue("Number of records problem", mpiRootCorpus.getNumberOfRecords() > 10); 51 // 52 // } finally { 53 // sruClient.shutdown(); 54 // } 55 // 56 // } 22 @Test 23 public void testCrawlForMpiAndTue() { 24 25 SRUThreadedClient sruClient = new SRUThreadedClient(); 26 27 try { 28 EndpointUrlFilter filter = new EndpointUrlFilter( 29 "uni-tuebingen.de", ".mpi.nl" //, "leipzig", ".mpi.nl", "dspin.dwds.de", "lindat." 30 ); 31 ScanCrawler crawler = new ScanCrawler(new CenterRegistryLive(), sruClient, filter, 2); 32 SimpleInMemScanCache cache = new SimpleInMemScanCache(); 33 crawler.crawl(cache); 34 Corpus tueRootCorpus = cache.getRootCorporaOfEndpoint("http://weblicht.sfs.uni-tuebingen.de/rws/sru/").get(0); 35 Corpus mpiRootCorpus = cache.getRootCorporaOfEndpoint("http://cqlservlet.mpi.nl/").get(0); 36 Assert.assertEquals("http://hdl.handle.net/11858/00-1778-0000-0001-DDAF-D", 37 tueRootCorpus.getHandle()); 38 Corpus mpiCorpus = cache.getCorpus("hdl:1839/00-0000-0000-0001-53A5-2@format=cmdi"); 39 Assert.assertEquals("hdl:1839/00-0000-0000-0003-4692-D@format=cmdi", cache.getChildren(mpiCorpus).get(0).getHandle()); 40 //check if languages and other corpus data is crawled corectly... 
41 Set<String> tueLangs = new HashSet<String>(); 42 tueLangs.add("deu"); 43 Assert.assertEquals(tueLangs, tueRootCorpus.getLanguages()); 44 String tueDescSubstring = "Tübingen Treebank"; 45 Assert.assertTrue("Description problem", tueRootCorpus.getDescription().contains(tueDescSubstring)); 46 String tueNameSubstring = "TuebaDDC"; 47 Assert.assertTrue("Name problem", tueRootCorpus.getDisplayName().contains(tueNameSubstring)); 48 String tuePageSubstring = "sfs.uni-tuebingen.de"; 49 Assert.assertTrue("Landing page problem", tueRootCorpus.getLandingPage().contains(tuePageSubstring)); 50 Assert.assertTrue("Number of records problem", mpiRootCorpus.getNumberOfRecords() > 10); 51 52 } finally { 53 sruClient.shutdown(); 54 } 55 } 57 56 }
Note: See TracChangeset
for help on using the changeset viewer.