Changeset 5035
- Timestamp:
- 04/24/14 09:59:28 (10 years ago)
- Location:
- SRUAggregator/trunk/src
- Files:
-
- 1 added
- 9 deleted
- 22 edited
- 1 copied
- 1 moved
Legend:
- Unmodified
- Added
- Removed
-
SRUAggregator/trunk/src/main/java/eu/clarin/sru/fcs/aggregator/app/Aggregator.java
r4106 r5035 14 14 import org.zkoss.zul.Textbox; 15 15 import eu.clarin.sru.fcs.aggregator.sopt.Corpus; 16 import eu.clarin.sru.fcs.aggregator.util.SRUCQL; 16 17 import org.zkoss.zul.A; 17 import org.zkoss.zul.Button;18 18 import org.zkoss.zul.Div; 19 19 import org.zkoss.zul.Menubar; … … 25 25 26 26 /** 27 * Main windowof the Aggregator application.27 * Main component of the Aggregator application. 28 28 * 29 29 * @author Yana Panchenko … … 79 79 private PagesVisibility pagesVisibility; 80 80 81 private static final String WEBLICHT_URL = "https://weblicht.sfs.uni-tuebingen.de/WebLicht-4/?input="; 81 private static final String WEBLICHT_URL = 82 "https://weblicht.sfs.uni-tuebingen.de/WebLicht-4/?input="; 82 83 83 84 … … 324 325 String[] paramValue; 325 326 String query = null; 326 paramValue = Executions.getCurrent().getParameterMap().get( "query");327 paramValue = Executions.getCurrent().getParameterMap().get(SRUCQL.SEARCH_QUERY_PARAMETER); 327 328 if (paramValue != null) { 328 329 query = paramValue[0].trim(); … … 330 331 } 331 332 LOGGER.log(Level.INFO, "Received parameter: query[{0}], ", query); 332 paramValue = Executions.getCurrent().getParameterMap().get( "operation");333 paramValue = Executions.getCurrent().getParameterMap().get(SRUCQL.OPERATION); 333 334 String operationString = null; 334 335 if (paramValue != null) { 335 336 operationString = paramValue[0].trim(); 336 if (!operationString.equals( "searchRetrieve")) {337 if (!operationString.equals(SRUCQL.SEARCH_RETRIEVE)) { 337 338 Messagebox.show("Not supported operation " + operationString, "FCS", 0, Messagebox.INFORMATION); 338 339 } -
SRUAggregator/trunk/src/main/java/eu/clarin/sru/fcs/aggregator/app/ControlsVisibility.java
r3038 r5035 2 2 3 3 import org.zkoss.zul.A; 4 import org.zkoss.zul.Button;5 import org.zkoss.zul.Label;6 4 import org.zkoss.zul.Menubar; 7 5 import org.zkoss.zul.North; -
SRUAggregator/trunk/src/main/java/eu/clarin/sru/fcs/aggregator/app/SearchOptions.java
r5034 r5035 19 19 import eu.clarin.sru.fcs.aggregator.sopt.CorpusRendererLive; 20 20 import eu.clarin.sru.fcs.aggregator.sopt.Languages; 21 import eu.clarin.sru.fcs.aggregator.cache.ScanCacheI; 21 import eu.clarin.sru.fcs.aggregator.cache.ScanCache; 22 import eu.clarin.sru.fcs.aggregator.util.SRUCQL; 22 23 import java.lang.reflect.Type; 23 24 import java.util.ArrayList; … … 75 76 76 77 private boolean liveMode = false; 77 78 private SRUVersion version = SRUVersion.VERSION_1_2; 79 80 private ScanCacheI cache; 78 79 private ScanCache cache; 81 80 82 81 @Override 83 82 public void doAfterCompose(Component comp) throws Exception { 84 83 super.doAfterCompose(comp); 85 setUpSRUVersion();86 84 setUpAggerationContext(); 87 cache = (ScanCache I) Executions.getCurrent().getDesktop().getWebApp().getAttribute(WebAppListener.CORPUS_CACHE);85 cache = (ScanCache) Executions.getCurrent().getDesktop().getWebApp().getAttribute(WebAppListener.CORPUS_CACHE); 88 86 //if (cache.isEmpty()) { 89 87 // liveMode = true; … … 111 109 DefaultTreeNode<Corpus> node = (DefaultTreeNode<Corpus>) treeitem.getValue(); 112 110 Corpus corpus = node.getData(); 113 if (corpus.getLanguages().contains(selectedLang) || selectedLang.equals( "anylang")) {111 if (corpus.getLanguages().contains(selectedLang) || selectedLang.equals(Languages.ANY_LANGUAGE_NAME)) { 114 112 treeitem.setVisible(true); 115 113 } else { … … 244 242 245 243 private void setUpAggerationContext() { 246 String[] paramValue = Executions.getCurrent().getParameterMap().get( "x-aggregation-context");244 String[] paramValue = Executions.getCurrent().getParameterMap().get(SRUCQL.AGGREGATION_CONTEXT); 247 245 String contextJson = null; 248 246 if (paramValue != null) { 249 247 contextJson = paramValue[0].trim(); 250 248 } 251 LOGGER.log(Level.INFO, "Received parameter : x-aggregation-context[{0}], ", contextJson);249 LOGGER.log(Level.INFO, "Received parameter {0}:[{1}], ", new String[]{SRUCQL.AGGREGATION_CONTEXT, contextJson}); 252 250 253 251 
if (contextJson != null) { … … 259 257 } catch (Exception ex) { 260 258 LOGGER.log(Level.SEVERE, "Error parsing JSON from x-aggregation-context: {0} {1}", new String[]{ex.getMessage(), contextJson}); 261 Messagebox.show("Error in x-aggregation-context parameter", "FCS", 0, Messagebox.INFORMATION);259 Messagebox.show("Error in " + SRUCQL.AGGREGATION_CONTEXT, "FCS", 0, Messagebox.INFORMATION); 262 260 } 263 261 } … … 301 299 } 302 300 303 private void setUpSRUVersion() {304 String[] paramValue = Executions.getCurrent().getParameterMap().get("version");305 String versionString = null;306 if (paramValue != null) {307 versionString = paramValue[0].trim();308 if (versionString.equals("1.2")) {309 version = SRUVersion.VERSION_1_2;310 } else if (versionString.equals("1.1")) {311 version = SRUVersion.VERSION_1_1;312 } else {313 Messagebox.show("SRU Version " + version + " not supported", "FCS", 0, Messagebox.INFORMATION);314 }315 }316 LOGGER.log(Level.INFO, "Received parameter: version[{0}], ", versionString);317 }318 301 319 302 private void selectCorpora(Treeitem openItem, Corpus data, List<String> handles) { -
SRUAggregator/trunk/src/main/java/eu/clarin/sru/fcs/aggregator/app/SearchResults.java
r4106 r5035 34 34 import eu.clarin.sru.fcs.aggregator.sresult.SearchResult; 35 35 import eu.clarin.sru.fcs.aggregator.sresult.SearchResultRecordRenderer; 36 import eu.clarin.sru.fcs.aggregator.util.SRUCQL searchRetrieve;36 import eu.clarin.sru.fcs.aggregator.util.SRUCQL; 37 37 import eu.clarin.weblicht.wlfxb.io.WLDObjector; 38 38 import eu.clarin.weblicht.wlfxb.io.WLFormatException; … … 53 53 import java.util.concurrent.Future; 54 54 import java.util.concurrent.atomic.AtomicBoolean; 55 import javax.ws.rs.core.MediaType;56 55 import opennlp.tools.tokenize.TokenizerME; 57 56 import opennlp.tools.tokenize.TokenizerModel; … … 114 113 Executions.getCurrent().getDesktop().enableServerPush(true); 115 114 searchClient = (SRUThreadedClient) Executions.getCurrent().getDesktop().getWebApp().getAttribute(WebAppListener.SHARED_SRU_CLIENT); 116 setUpSRUVersion();117 115 // assign the search controller to desktop, so that it can be accessed to be shutdown when the desktop is destroyed 118 116 Executions.getCurrent().getDesktop().setAttribute(this.getClass().getSimpleName(), this); … … 187 185 searchRequest.setStartRecord(searchOffset[0] + searchOffset[1]); 188 186 if (resultsItem.hasCorpusHandler()) { 189 searchRequest.setExtraRequestData(SRUCQL searchRetrieve.CORPUS_HANDLE_PARAMETER, resultsItem.getCorpus().getHandle());187 searchRequest.setExtraRequestData(SRUCQL.SEARCH_CORPUS_HANDLE_PARAMETER, resultsItem.getCorpus().getHandle()); 190 188 } 191 189 try { … … 853 851 854 852 private void setUpSRUVersion() { 855 String[] paramValue = Executions.getCurrent().getParameterMap().get( "version");853 String[] paramValue = Executions.getCurrent().getParameterMap().get(SRUCQL.VERSION); 856 854 String versionString = null; 857 855 if (paramValue != null) { … … 865 863 } 866 864 } 867 LOGGER.log(Level.INFO, "Received parameter: version[{0}], ", versionString);865 LOGGER.log(Level.INFO, "Received parameter: {0}[{1}], ", new String[]{SRUCQL.VERSION,versionString}); 868 866 } 869 867 -
SRUAggregator/trunk/src/main/java/eu/clarin/sru/fcs/aggregator/app/WebAppListener.java
r5034 r5035 1 1 package eu.clarin.sru.fcs.aggregator.app; 2 2 3 import eu.clarin.sru.fcs.aggregator.cache.ScanCrawl erRunnable;3 import eu.clarin.sru.fcs.aggregator.cache.ScanCrawlTask; 4 4 import eu.clarin.sru.fcs.aggregator.cache.ScanCrawler; 5 5 import eu.clarin.sru.fcs.aggregator.cache.ScanCacheFiled; 6 import eu.clarin.sru.fcs.aggregator.cache.S canCache;6 import eu.clarin.sru.fcs.aggregator.cache.SimpleInMemScanCache; 7 7 import eu.clarin.sru.client.SRUThreadedClient; 8 8 import eu.clarin.sru.client.fcs.ClarinFCSRecordParser; 9 import eu.clarin.sru.fcs.aggregator.cache.EndpointUrlFilter; 9 10 import eu.clarin.sru.fcs.aggregator.sopt.CenterRegistryI; 10 11 import eu.clarin.sru.fcs.aggregator.sopt.CenterRegistryLive; 11 12 import eu.clarin.sru.fcs.aggregator.sopt.Languages; 12 import eu.clarin.sru.fcs.aggregator.cache.ScanCache I;13 import eu.clarin.sru.fcs.aggregator.cache.ScanCache; 13 14 import java.io.File; 14 15 import java.io.IOException; … … 90 91 91 92 private String getScanDirectory() { 92 //File aggregatorDir = new File(System.getProperty("user.home"), "/." + AGGREGATOR_DIR_NAME);93 File aggregatorDir = new File("/var/www", "/." + AGGREGATOR_DIR_NAME);93 File aggregatorDir = new File(System.getProperty("user.home"), "/." + AGGREGATOR_DIR_NAME); 94 //File aggregatorDir = new File("/var/www", "/." 
+ AGGREGATOR_DIR_NAME); 94 95 95 96 if (!aggregatorDir.exists()) { … … 110 111 CenterRegistryI centerRegistry = new CenterRegistryLive(); 111 112 SRUThreadedClient sruScanClient = (SRUThreadedClient) webapp.getAttribute(WebAppListener.SHARED_SRU_CLIENT); 112 //EndpointUrlFilter filter = new EndpointUrlFilter();113 EndpointUrlFilter filter = new EndpointUrlFilter(); 113 114 //filter.urlShouldContainAnyOf("leipzig", ".mpi.nl"); 114 115 //filter.urlShouldContainAnyOf("uni-tuebingen.de", ".mpi.nl"); 115 //filter.urlShouldContainAnyOf("dspin.dwds.de", "lindat.");116 //ScanCrawler scanCrawler = new ScanCrawler(centerRegistry, sruScanClient, filter, maxDepth);117 ScanCrawler scanCrawler = new ScanCrawler(centerRegistry, sruScanClient, null, CACHE_MAX_DEPTH);118 ScanCache IscanCache;116 filter.urlShouldContainAnyOf("dspin.dwds.de", "lindat."); 117 ScanCrawler scanCrawler = new ScanCrawler(centerRegistry, sruScanClient, filter, CACHE_MAX_DEPTH); 118 //ScanCrawler scanCrawler = new ScanCrawler(centerRegistry, sruScanClient, null, CACHE_MAX_DEPTH); 119 ScanCache scanCache; 119 120 120 121 //synchronized (scanCrawler) { … … 125 126 } catch (Exception e) { 126 127 LOGGER.log(Level.SEVERE, "Error while reading the scan cache!", e); 127 scanCache = new S canCache();128 scanCache = new SimpleInMemScanCache(); 128 129 } 129 130 //} … … 132 133 133 134 scheduler.scheduleAtFixedRate( 134 new ScanCrawl erRunnable(scanCrawler, scanCacheFiled, webapp),135 new ScanCrawlTask(scanCrawler, scanCacheFiled, webapp), 135 136 0, CACHE_UPDATE_INTERVAL, CACHE_UPDATE_INTERVAL_UNIT); 136 137 -
SRUAggregator/trunk/src/main/java/eu/clarin/sru/fcs/aggregator/cache/EndpointFilter.java
r5034 r5035 4 4 5 5 /** 6 * 6 * Filter for the cache of scan data (endpoint/resources descriptions) - for 7 * specifying if only particular endpoints have not to be cached. Useful for 8 * testing the endpoints. 9 * 7 10 * @author yanapanchenko 8 11 */ -
SRUAggregator/trunk/src/main/java/eu/clarin/sru/fcs/aggregator/cache/EndpointUrlFilter.java
r5034 r5035 6 6 7 7 /** 8 * 8 * Filters for the cache of scan data (endpoint/resources descriptions) based 9 * on endpoint url. Only endpoints containing one of the specified string in the 10 * endpoint url will be cached. Useful for testing the endpoints. 11 * 9 12 * @author yanapanchenko 10 13 */ -
SRUAggregator/trunk/src/main/java/eu/clarin/sru/fcs/aggregator/cache/ScanCache.java
r5034 r5035 2 2 3 3 import eu.clarin.sru.fcs.aggregator.sopt.Corpus; 4 import eu.clarin.sru.fcs.aggregator.sopt.InstitutionI; 5 import eu.clarin.sru.fcs.aggregator.cache.ScanCacheI; 6 import java.util.ArrayList; 7 import java.util.HashMap; 8 import java.util.HashSet; 9 import java.util.LinkedHashMap; 4 import eu.clarin.sru.fcs.aggregator.sopt.Institution; 10 5 import java.util.List; 11 6 import java.util.Map; 12 7 import java.util.Set; 13 import java.util.logging.Logger;14 8 15 9 /** 10 * Interface for the cached scan data (endpoints descriptions). 16 11 * 17 * @author yanapanchenko12 * @author Yana Panchenko 18 13 */ 19 public class ScanCache implements ScanCacheI{14 public interface ScanCache { 20 15 21 private Map<String, List<Corpus>> enpUrlToRootCorpora = new LinkedHashMap<String, List<Corpus>>(30); 22 private Map<String, List<Corpus>> corpusToChildren = new HashMap<String, List<Corpus>>(); 23 private Map<String, String> childToParent = new HashMap<String, String>(); 24 private Map<String, Set<Corpus>> langToRootCorpora = new HashMap<String, Set<Corpus>>(); 25 private Map<String, Set<Corpus>> langToTopUniqueCorpora = new HashMap<String, Set<Corpus>>(); 26 private List<InstitutionI> institutions = new ArrayList<InstitutionI>(); 16 /** 17 * Checks whether the Cache has the endpoints resource tree cached. 
18 * @return true if the Cache is empty, false otherwise 19 */ 20 public boolean isEmpty(); 21 22 23 /** 24 * Gets all the institutions registered in center repository 25 * as having a SRU/CQL endpoint 26 * @return list of institutions that support SRU/CQL 27 */ 28 public List<Institution> getInstitutions(); 27 29 28 private static final Logger LOGGER = Logger.getLogger(ScanCache.class.getName()); 30 /** 31 * Gets all the root corpora of the endpoints (top nodes in the corpus 32 * resource tree) 33 * @return root corpora of the endpoints 34 */ 35 public List<Corpus> getRootCorpora(); 36 29 37 38 /** 39 * Gets languages mapped to all the root corpora of the endpoints 40 * (top nodes in the corpus resource tree) that have the corresponding language 41 * @return a map from the languages to the root corpora of the endpoints 42 * in the corresponding language 43 */ 44 public Map<String, Set<Corpus>> getRootCorporaForLang(); 45 46 /** 47 * Gets all the root corpora of the endpoints (top nodes in the corpus 48 * resource tree) that have the specified language 49 * @param lang language of interest as a three-letter iso code 50 * @return root corpora of the endpoints that are in the specified language 51 */ 52 public List<Corpus> getRootCorporaForLang(String lang); 53 54 /** 55 * In the corpus resource tree, gets all the languages mapped to the corpus 56 * resources that have the corresponding language as the only one language, 57 * and either have no parent resource or its parent has multiple languages. 58 * @return map of languages to top corpora (in the corpus resource tree) 59 * that have the corresponding language as the only language 60 */ 61 public Map<String, Set<Corpus>> getTopUniqueLangToCorpora(); 62 63 /** 64 * In the corpus resource tree, gets all the resource nodes of the tree 65 * that have the specified language as the only language of the resource, 66 * and either have no parent resource or its parent resource has multiple 67 * languages. 
68 * @param lang language of interest as three-letter iso code 69 * @return corpora that have the specified language as the only language 70 * and whose parent corpora do not have it as the only language 71 */ 72 public List<Corpus> getTopUniqueLanguageCorpora(String lang); 73 74 /** 75 * Gets all the root corpora of the specified endpoints 76 * @param enpointUrl the URL of the endpoint of interest 77 * @return root corpora of the endpoint 78 */ 79 public List<Corpus> getRootCorporaOfEndpoint(String enpointUrl); 80 81 /** 82 * Gets all the languages of the existing corpora in the corpus resource 83 * tree (since parent corpora sum-up languages of their children corpora, 84 * that's all the languages of the root corpora) 85 * @return all the languages specified in the corpus resource tree 86 */ 87 public Set<String> getLanguages(); 30 88 31 public List<InstitutionI> getInstitutions() { 32 return institutions; 33 } 89 /** 90 * Gets all children corpora of the specified corpus in the endpoints 91 * resource tree 92 * @param corpus the parent corpus 93 * @return children corpora of the specified parent corpus 94 */ 95 public List<Corpus> getChildren(Corpus corpus); 96 97 98 /** 99 * Adds institution that supports SRU/CQL into the cache. 100 * 101 * @param institution that supports SRU/CQL 102 */ 103 public void addInstitution(Institution institution); 104 105 /** 106 * Adds corpus that is the top level resource of an endpoint 107 * (root corpus that has no parent corpus) into the cache. 108 * Same as addCorpus(c, null) 109 * 110 * @param c root corpus to be added 111 */ 112 public void addCorpus(Corpus c); 34 113 35 @Override36 public List<Corpus> getRootCorporaOfEndpoint(String enpointUrl) {37 List<Corpus> roots = new ArrayList<Corpus>();38 if (enpUrlToRootCorpora.containsKey(enpointUrl)) {39 roots.addAll(enpUrlToRootCorpora.get(enpointUrl));40 }41 return roots;42 }114 115 /** 116 * Adds corpus into the cache. 
117 * 118 * @param c corpus to be added 119 * @param parentCorpus parent of the corpus to be added 120 */ 121 public void addCorpus(Corpus c, Corpus parentCorpus); 43 122 44 public List<Corpus> getChildrenCorpora(String handle) { 45 List<Corpus> children = new ArrayList<Corpus>(); 46 if (corpusToChildren.containsKey(handle)) { 47 children.addAll(corpusToChildren.get(handle)); 48 } 49 return children; 50 } 51 52 public void addInstitution(InstitutionI institution) { 53 institutions.add(institution); 54 } 55 56 public void addCorpus(Corpus c) { 57 addCorpus(c, true, null); 58 } 59 60 public void addCorpus(Corpus c, Corpus parentCorpus) { 61 addCorpus(c, false, parentCorpus); 62 } 63 64 public void addCorpus(Corpus c, boolean root, Corpus parentCorpus) { 65 66 // index top corpora with unique language as for their languages 67 //if (c.getLanguages().size() == 1 && 68 // (root || this.)) 69 70 71 // don't add corpus that introduces cyclic references 72 if (this.childToParent.containsKey(c.getHandle())) { 73 // as of March 2014, there are 2 such endpoints... 
74 LOGGER.warning("Cyclic reference in corpus " + c.getHandle() + " of endpoint " + c.getEndpointUrl()); 75 return; 76 } 77 78 79 if (root) { 80 // index root corpora as for their languages 81 for (String lang : c.getLanguages()) { 82 if (!langToRootCorpora.containsKey(lang)) { 83 langToRootCorpora.put(lang, new HashSet<Corpus>()); 84 } 85 langToRootCorpora.get(lang).add(c); 86 } 87 // index root corpora as for their endpint url 88 if (!enpUrlToRootCorpora.containsKey(c.getEndpointUrl())) { 89 enpUrlToRootCorpora.put(c.getEndpointUrl(), new ArrayList<Corpus>()); 90 } 91 enpUrlToRootCorpora.get(c.getEndpointUrl()).add(c); 92 childToParent.put(c.getHandle(), Corpus.ROOT_HANDLE); 93 } else { 94 if (!corpusToChildren.containsKey(parentCorpus.getHandle())) { 95 corpusToChildren.put(parentCorpus.getHandle(), new ArrayList<Corpus>()); 96 } 97 corpusToChildren.get(parentCorpus.getHandle()).add(c); 98 childToParent.put(c.getHandle(), parentCorpus.getHandle()); 99 } 100 } 101 102 @Override 103 public String toString() { 104 return "cache{\n" + "institutions=" + institutions + "\n" 105 + "enpUrlToRootCorpora=" + enpUrlToRootCorpora 106 + "\n corpusToChildren=" + corpusToChildren 107 + "\n langToTopUniqueCorpora=" + langToTopUniqueCorpora + "\n}"; 108 } 109 110 @Override 111 public boolean isEmpty() { 112 return enpUrlToRootCorpora.isEmpty(); 113 } 114 115 @Override 116 public List<Corpus> getRootCorpora() { 117 List<Corpus> rootCorpora = new ArrayList<Corpus>(enpUrlToRootCorpora.size()); 118 for (List<Corpus> corpora : this.enpUrlToRootCorpora.values()) { 119 rootCorpora.addAll(corpora); 120 } 121 return rootCorpora; 122 } 123 124 @Override 125 public Set<String> getLanguages() { 126 Set<String> languages = new HashSet<String>(this.langToRootCorpora.size()); 127 languages.addAll(this.langToRootCorpora.keySet()); 128 return languages; 129 } 130 131 @Override 132 public List<Corpus> getChildren(Corpus corpus) { 133 List<Corpus> corpora = 
this.corpusToChildren.get(corpus.getHandle()); 134 if (corpora == null) { 135 return (new ArrayList<Corpus>()); 136 } else { 137 List<Corpus> corporaCopy = new ArrayList<Corpus>(corpora); 138 return corporaCopy; 139 } 140 } 141 142 @Override 143 public Map<String, Set<Corpus>> getRootCorporaForLang() { 144 return langToRootCorpora; 145 } 146 147 @Override 148 public List<Corpus> getRootCorporaForLang(String lang) { 149 List<Corpus> rootCorpora = new ArrayList<Corpus>(enpUrlToRootCorpora.size()); 150 for (List<Corpus> corpora : this.enpUrlToRootCorpora.values()) { 151 for (Corpus corpus : corpora) { 152 if (corpus.getLanguages().contains(lang)) { 153 rootCorpora.add(corpus); 154 } 155 } 156 } 157 return rootCorpora; 158 } 159 160 @Override 161 public Map<String, Set<Corpus>> getTopUniqueLangToCorpora() { 162 throw new UnsupportedOperationException("Not supported yet."); 163 } 164 165 @Override 166 public List<Corpus> getTopUniqueLanguageCorpora(String lang) { 167 throw new UnsupportedOperationException("Not supported yet."); 168 } 123 124 /** 125 * Gets Corpus that corresponds to the specified handle. 126 * 127 * @param handle of the Corpus of interest. 128 * @return Corpus of the handle or null if there is no Corpus with 129 * this handle in the cache. 130 */ 131 public Corpus getCorpus(String handle); 132 169 133 } -
SRUAggregator/trunk/src/main/java/eu/clarin/sru/fcs/aggregator/cache/ScanCacheFiled.java
r5034 r5035 1 1 package eu.clarin.sru.fcs.aggregator.cache; 2 2 3 import eu.clarin.sru.fcs.aggregator.app.CacheCorporaScanIntoFileTask;4 import eu.clarin.sru.fcs.aggregator.cache.ScanCache;5 3 import eu.clarin.sru.fcs.aggregator.sopt.Corpus; 6 4 import eu.clarin.sru.fcs.aggregator.sopt.Endpoint; 7 5 import eu.clarin.sru.fcs.aggregator.sopt.Institution; 8 import eu.clarin.sru.fcs.aggregator.sopt.InstitutionI;9 6 import java.io.BufferedOutputStream; 10 7 import java.io.BufferedReader; … … 23 20 24 21 /** 22 * Utility for reading/writing scan data (endpoints descriptions) from/to 23 * ScanCache from/to local files. 25 24 * 26 25 * @author yanapanchenko … … 39 38 private static final Logger LOGGER = Logger.getLogger(ScanCacheFiled.class.getName()); 40 39 40 /** 41 * Constructs ScanCache/files reading/writing utility. 42 * 43 * @param scanDirectory path to local directory were files with 44 * ScanCache data are/should be stored. 45 */ 41 46 public ScanCacheFiled(String scanDirectory) { 42 47 this.scanDirectory = scanDirectory; 43 48 } 44 49 50 /** 51 * Writes ScanCache data (endpoints and resources descriptions) in a special 52 * simple plain text format into the files. 53 * 54 * @param cache ScanCache the data of which should be written into the files. 
55 */ 45 56 public void write(ScanCache cache) { 46 57 … … 51 62 File sruInstitutionsFile = new File(scanDirectory, INSTITUTION_ENDPOINTS_FILENAME); 52 63 os = new OutputStreamWriter(new BufferedOutputStream(new FileOutputStream(sruInstitutionsFile)), ENCODING); 53 for (Institution Iinstitution : cache.getInstitutions()) {64 for (Institution institution : cache.getInstitutions()) { 54 65 writeInstitutionInfo(os, institution); 55 66 for (Endpoint endp : institution.getEndpoints()) { … … 95 106 writeCorpusInfo(os, c); 96 107 97 List<Corpus> children = cache.getChildren Corpora(c.getHandle());108 List<Corpus> children = cache.getChildren(c); 98 109 if (children != null) { 99 110 for (Corpus child : children) { … … 121 132 } 122 133 123 private void writeInstitutionInfo(Writer writer, Institution Iinstitution) throws IOException {134 private void writeInstitutionInfo(Writer writer, Institution institution) throws IOException { 124 135 125 136 writer.write(I); … … 182 193 } 183 194 195 /** 196 * Reads ScanCache data from the files in scanDirectory directory. The files 197 * contain endpoint and resources descriptions in a special simple plain 198 * text format, resulting from applying write(ScanCache scanCache) method. 199 * 200 * @return ScanCache with the data read from the files containing endpoints 201 * and resources descriptions. 
202 */ 184 203 public ScanCache read() { 185 S canCache cache = newScanCache();204 SimpleInMemScanCache cache = new SimpleInMemScanCache(); 186 205 File sruInstitutionsFile = new File(scanDirectory, INSTITUTION_ENDPOINTS_FILENAME); 187 206 BufferedReader reader = null; … … 194 213 line = line.trim(); 195 214 if (line.length() > 0) { 196 String[] splitted = line.split("\\" + CacheCorporaScanIntoFileTask.SEP);197 if (splitted.length == 2 && splitted[0].equals( CacheCorporaScanIntoFileTask.I)) {215 String[] splitted = line.split("\\" + SEP); 216 if (splitted.length == 2 && splitted[0].equals(I)) { 198 217 inst = new Institution(splitted[1], ""); 199 218 if (!institutions.contains(inst)) { … … 201 220 cache.addInstitution(inst); 202 221 } 203 } else if (inst != null && splitted.length == 3 && splitted[0].equals( CacheCorporaScanIntoFileTask.IE)) {222 } else if (inst != null && splitted.length == 3 && splitted[0].equals(IE)) { 204 223 Endpoint ep = inst.add(splitted[2]); 205 224 if (!splitted[1].trim().isEmpty()) { … … 229 248 } 230 249 231 private void readAndAddCorpus(String path, Corpus parentCorpus, Institution I inst,ScanCache cache) {250 private void readAndAddCorpus(String path, Corpus parentCorpus, Institution inst, SimpleInMemScanCache cache) { 232 251 File corpusFile = new File(path, CORPUS_INFO_FILENAME); 233 252 BufferedReader reader = null; … … 258 277 // corpus langs 259 278 Set<String> langs = new HashSet<String>(); 260 for (String lang : line.split("\\" + CacheCorporaScanIntoFileTask.SEP)) {279 for (String lang : line.split("\\" + SEP)) { 261 280 langs.add(lang); 262 281 } -
SRUAggregator/trunk/src/main/java/eu/clarin/sru/fcs/aggregator/cache/ScanCrawlTask.java
r5034 r5035 1 1 package eu.clarin.sru.fcs.aggregator.cache; 2 2 3 import eu.clarin.sru.fcs.aggregator.cache.ScanCrawler;4 import eu.clarin.sru.fcs.aggregator.cache.ScanCacheFiled;5 import eu.clarin.sru.fcs.aggregator.cache.ScanCache;6 3 import static eu.clarin.sru.fcs.aggregator.app.WebAppListener.CORPUS_CACHE; 7 4 import java.util.logging.Level; … … 10 7 11 8 /** 12 * 9 * A task for crawling endpoint scan operation responses of FCS specification. 10 * If successful, saves found endpoints and resources descriptions into a new 11 * ScanCache and updates the web application contexts with this new cache, as 12 * well as rewrites the previously scanned data saved on the disk. 13 * 13 14 * @author yanapanchenko 14 15 */ 15 public class ScanCrawl erRunnableimplements Runnable {16 public class ScanCrawlTask implements Runnable { 16 17 17 private static final Logger logger = Logger.getLogger(ScanCrawl erRunnable.class.getName());18 private static final Logger logger = Logger.getLogger(ScanCrawlTask.class.getName()); 18 19 19 20 private final ScanCrawler scanCrawler; … … 21 22 private WebApp webapp; 22 23 23 public ScanCrawl erRunnable(24 public ScanCrawlTask( 24 25 ScanCrawler scanCrawler, ScanCacheFiled scanCacheFiled, WebApp webapp) { 25 26 this.scanCrawler = scanCrawler; … … 32 33 33 34 logger.info("STARTING CACHING CORPORA SCAN"); 34 ScanCache cacheNew = scanCrawler.crawl(); 35 SimpleInMemScanCache cacheNew = new SimpleInMemScanCache(); 36 scanCrawler.crawl(cacheNew); 35 37 logger.info("New Cache, number of root corpora: " + cacheNew.getRootCorpora().size()); 36 38 -
SRUAggregator/trunk/src/main/java/eu/clarin/sru/fcs/aggregator/cache/ScanCrawler.java
r5034 r5035 1 1 package eu.clarin.sru.fcs.aggregator.cache; 2 2 3 import eu.clarin.sru.fcs.aggregator.cache.ScanCache;4 3 import eu.clarin.sru.client.SRUScanRequest; 5 4 import eu.clarin.sru.client.SRUScanResponse; … … 9 8 import eu.clarin.sru.fcs.aggregator.sopt.Corpus; 10 9 import eu.clarin.sru.fcs.aggregator.sopt.Endpoint; 11 import eu.clarin.sru.fcs.aggregator.sopt.Institution I;12 import eu.clarin.sru.fcs.aggregator.util.SRUCQL scan;10 import eu.clarin.sru.fcs.aggregator.sopt.Institution; 11 import eu.clarin.sru.fcs.aggregator.util.SRUCQL; 13 12 import java.util.List; 14 13 import java.util.concurrent.Future; … … 23 22 24 23 /** 25 * 24 * Crawler for collecting endpoint scan operation responses of FCS specification. 25 * Collects all the endpoints and resources descriptions. 26 * 26 27 * @author yanapanchenko 27 28 */ … … 45 46 } 46 47 47 public ScanCache crawl() { 48 49 ScanCache cache = new ScanCache(); 48 /** 49 * Crawler of scan operation of FCS specification. Collects all the endpoints 50 * and resources descriptions into the provided cache. 51 * 52 * @param cache cache into which the endpoints and resources descriptions 53 * from scan operation responses should be collected. 54 */ 55 public void crawl(ScanCache cache) { 50 56 51 57 //TODO remember not responding root corpora and come back to them later... ? 
52 List<Institution I> institutions = cr.getCQLInstitutions();58 List<Institution> institutions = cr.getCQLInstitutions(); 53 59 //LOGGER.info(institutions.toString()); 54 for (Institution Iinstitution : institutions) {60 for (Institution institution : institutions) { 55 61 cache.addInstitution(institution); 56 62 Iterable<Endpoint> endpoints = institution.getEndpoints(); … … 64 70 } 65 71 66 return cache;67 68 72 } 69 73 74 // TODO: ask Oliver to add API support for the extra info in the 75 // SRU client/server libraries, so that it's not necessary to work 76 // with DocumentFragment 70 77 private void addExtraInfo(Corpus c, SRUTerm term) { 71 78 … … 111 118 112 119 private void addCorpora(SRUThreadedClient sruScanClient, String endpointUrl, 113 InstitutionI institution, int depth, Corpus parentCorpus, ScanCache cache) { 114 //System.out.println("Adding Corpora: " + endpointUrl + " " + handle); 115 120 Institution institution, int depth, Corpus parentCorpus, ScanCache cache) { 121 116 122 Future<SRUScanResponse> corporaResponse = null; 117 123 … … 127 133 128 134 SRUScanRequest corporaRequest = new SRUScanRequest(endpointUrl); 129 StringBuilder scanClause = new StringBuilder(SRUCQL scan.RESOURCE_PARAMETER);135 StringBuilder scanClause = new StringBuilder(SRUCQL.SCAN_RESOURCE_PARAMETER); 130 136 scanClause.append("="); 131 //String normalizedHandle = normalizeHandle(handle, root);132 137 String normalizedHandle = normalizeHandle(parentCorpus, root); 133 138 scanClause.append(normalizedHandle); 134 139 corporaRequest.setScanClause(scanClause.toString()); 135 corporaRequest.setExtraRequestData(SRUCQLscan.RESOURCE_INFO_PARAMETER, "true"); 140 corporaRequest.setExtraRequestData(SRUCQL.SCAN_RESOURCE_INFO_PARAMETER, 141 SRUCQL.SCAN_RESOURCE_INFO_PARAMETER_DEFAULT_VALUE); 136 142 corporaResponse = sruScanClient.scan(corporaRequest); 137 143 Thread.sleep(5000); … … 139 145 if (response != null && response.hasTerms()) { 140 146 for (SRUTerm term : response.getTerms()) { 147 
// don't add corpus that introduces cyclic references 148 // as of March 2014, there are 2 such endpoints... 149 if (cache.getCorpus(term.getValue())!= null) { 150 LOGGER.warning("Cyclic reference in corpus " + term.getValue() + " of endpoint " + endpointUrl); 151 continue; 152 } 141 153 Corpus c = new Corpus(institution, endpointUrl); 142 154 c.setHandle(term.getValue()); … … 144 156 c.setNumberOfRecords(term.getNumberOfRecords()); 145 157 addExtraInfo(c, term); 146 cache.addCorpus(c, root,parentCorpus);158 cache.addCorpus(c, parentCorpus); 147 159 addCorpora(sruScanClient, c.getEndpointUrl(), c.getInstitution(), 148 160 depth, c, cache); -
SRUAggregator/trunk/src/main/java/eu/clarin/sru/fcs/aggregator/cache/SimpleInMemScanCache.java
r5034 r5035 2 2 3 3 import eu.clarin.sru.fcs.aggregator.sopt.Corpus; 4 import eu.clarin.sru.fcs.aggregator.sopt.InstitutionI; 5 import eu.clarin.sru.fcs.aggregator.cache.ScanCacheI; 4 import eu.clarin.sru.fcs.aggregator.sopt.Institution; 6 5 import java.util.ArrayList; 7 6 import java.util.HashMap; … … 14 13 15 14 /** 15 * Implementation of the cached scan data (endpoints descriptions) that 16 * stores the cache in memory in maps. 16 17 * 17 18 * @author yanapanchenko 18 19 */ 19 public class S canCache implements ScanCacheI{20 public class SimpleInMemScanCache implements ScanCache { 20 21 21 22 private Map<String, List<Corpus>> enpUrlToRootCorpora = new LinkedHashMap<String, List<Corpus>>(30); 22 23 private Map<String, List<Corpus>> corpusToChildren = new HashMap<String, List<Corpus>>(); 23 private Map<String, String> childToParent = new HashMap<String, String>(); 24 //private Map<String, String> childToParent = new HashMap<String, String>(); 25 private Map<String, Corpus> handleToCorpus = new HashMap<String, Corpus>(); 24 26 private Map<String, Set<Corpus>> langToRootCorpora = new HashMap<String, Set<Corpus>>(); 25 27 private Map<String, Set<Corpus>> langToTopUniqueCorpora = new HashMap<String, Set<Corpus>>(); 26 private List<Institution I> institutions = new ArrayList<InstitutionI>();28 private List<Institution> institutions = new ArrayList<Institution>(); 27 29 28 private static final Logger LOGGER = Logger.getLogger(S canCache.class.getName());30 private static final Logger LOGGER = Logger.getLogger(SimpleInMemScanCache.class.getName()); 29 31 30 31 public List<Institution I> getInstitutions() {32 @Override 33 public List<Institution> getInstitutions() { 32 34 return institutions; 33 35 } … … 42 44 } 43 45 44 public List<Corpus> getChildrenCorpora(String handle) { 45 List<Corpus> children = new ArrayList<Corpus>(); 46 if (corpusToChildren.containsKey(handle)) { 47 children.addAll(corpusToChildren.get(handle)); 48 } 49 return children; 50 } 51 52 public void 
addInstitution(InstitutionI institution) { 46 @Override 47 public void addInstitution(Institution institution) { 53 48 institutions.add(institution); 54 49 } 55 50 51 @Override 56 52 public void addCorpus(Corpus c) { 57 addCorpus(c, true,null);53 addCorpus(c, null); 58 54 } 59 55 56 @Override 60 57 public void addCorpus(Corpus c, Corpus parentCorpus) { 61 addCorpus(c, false, parentCorpus);62 }63 64 public void addCorpus(Corpus c, boolean root, Corpus parentCorpus) {65 58 66 59 // index top corpora with unique language as for their languages 67 60 //if (c.getLanguages().size() == 1 && 68 61 // (root || this.)) 69 70 71 // don't add corpus that introduces cyclic references72 if (this.childToParent.containsKey(c.getHandle())) {73 // as of March 2014, there are 2 such endpoints...74 LOGGER.warning("Cyclic reference in corpus " + c.getHandle() + " of endpoint " + c.getEndpointUrl());75 return;76 }77 78 62 79 if (root) { 63 handleToCorpus.put(c.getHandle(), c); 64 65 if (parentCorpus == null) { //i.e it's a root corpus 80 66 // index root corpora as for their languages 81 67 for (String lang : c.getLanguages()) { … … 90 76 } 91 77 enpUrlToRootCorpora.get(c.getEndpointUrl()).add(c); 92 childToParent.put(c.getHandle(), Corpus.ROOT_HANDLE);78 //childToParent.put(c.getHandle(), Corpus.ROOT_HANDLE); 93 79 } else { 94 80 if (!corpusToChildren.containsKey(parentCorpus.getHandle())) { … … 96 82 } 97 83 corpusToChildren.get(parentCorpus.getHandle()).add(c); 98 childToParent.put(c.getHandle(), parentCorpus.getHandle());84 //childToParent.put(c.getHandle(), parentCorpus.getHandle()); 99 85 } 100 86 } … … 167 153 throw new UnsupportedOperationException("Not supported yet."); 168 154 } 155 156 @Override 157 public Corpus getCorpus(String handle) { 158 return this.handleToCorpus.get(handle); 159 } 169 160 } -
SRUAggregator/trunk/src/main/java/eu/clarin/sru/fcs/aggregator/rest/AggregatorService.java
r5034 r5035 7 7 8 8 /** 9 * 9 * RESTful service. At the moment does nothing useful and was added just to 10 * make sure that it would be possible to combine REST services with ZK app, 11 * and add to the aggregator the support for its usage as aggregated FCS 12 * server, as was planned in the initial FCS specification. 13 * 10 14 * @author yanapanchenko 11 15 */ -
SRUAggregator/trunk/src/main/java/eu/clarin/sru/fcs/aggregator/sopt/CenterRegistryForTesting.java
r3044 r5035 7 7 8 8 /** 9 * Center registry node. Its children are centers (institutions). 9 * Center registry node. Its children are centers (institutions). 10 * The class is created after a request from MPI to provide them 11 * with a possibility to test their endpoints on development 12 * servers with the aggregator before they put them on production 13 * server. Institutions and endpoint urls that need to be tested are hard-coded. 10 14 * 11 15 * @author Yana Panchenko … … 15 19 private static final Logger logger = Logger.getLogger(CenterRegistryForTesting.class.getName()); 16 20 private boolean hasChildrenLoaded = false; 17 private List<Institution I> centers = new ArrayList<InstitutionI>();21 private List<Institution> centers = new ArrayList<Institution>(); 18 22 private static final String[] INSTITUTION_URLS = new String[]{ 19 23 "http://130.183.206.32/restxml/5" … … 44 48 45 49 @Override 46 public List<Institution I> getCQLInstitutions() {50 public List<Institution> getCQLInstitutions() { 47 51 loadCQLInstitutions(); 48 52 return centers; … … 50 54 51 55 @Override 52 public Institution IgetCQLInstitution(int index) {56 public Institution getCQLInstitution(int index) { 53 57 loadCQLInstitutions(); 54 58 if (index >= centers.size()) { … … 60 64 private void loadCQLInstitutionsForTesting() { 61 65 for (int i = 0; i < INSTITUTION_ENDPOINTS.length; i++) { 62 Institution Iinstitution = new Institution(INSTITUTION_NAMES[i], INSTITUTION_URLS[i]);66 Institution institution = new Institution(INSTITUTION_NAMES[i], INSTITUTION_URLS[i]); 63 67 for (int j = 0; j < INSTITUTION_ENDPOINTS.length; j++) { 64 68 institution.add(INSTITUTION_ENDPOINTS[i][j]); -
SRUAggregator/trunk/src/main/java/eu/clarin/sru/fcs/aggregator/sopt/CenterRegistryI.java
r3044 r5035 16 16 public void loadCQLInstitutions(); 17 17 18 public List<Institution I> getCQLInstitutions();18 public List<Institution> getCQLInstitutions(); 19 19 20 public Institution IgetCQLInstitution(int index);20 public Institution getCQLInstitution(int index); 21 21 22 22 } -
SRUAggregator/trunk/src/main/java/eu/clarin/sru/fcs/aggregator/sopt/CenterRegistryLive.java
r3044 r5035 25 25 //https://centerregistry-clarin.esc.rzg.mpg.de/restxml/ 26 26 private boolean hasInstitutionsLoaded = false; 27 private List<Institution I> centers = new ArrayList<InstitutionI>();27 private List<Institution> centers = new ArrayList<Institution>(); 28 28 29 29 @Override … … 46 46 String institutionUrl = regCenter.getId(); 47 47 String institutionName = regCenter.getCenterName(); 48 Institution Iinstitution = new Institution(institutionName, institutionUrl);48 Institution institution = new Institution(institutionName, institutionUrl); 49 49 // display in the tree only those institutions that have CQL endpoints: 50 50 CenterProfile profile = connector.retrieveCenterProfile(regCenter); … … 77 77 78 78 @Override 79 public List<Institution I> getCQLInstitutions() {79 public List<Institution> getCQLInstitutions() { 80 80 loadCQLInstitutions(); 81 81 return centers; … … 83 83 84 84 @Override 85 public Institution IgetCQLInstitution(int index) {85 public Institution getCQLInstitution(int index) { 86 86 loadCQLInstitutions(); 87 87 if (index >= centers.size()) { -
SRUAggregator/trunk/src/main/java/eu/clarin/sru/fcs/aggregator/sopt/Corpus.java
r3058 r5035 13 13 public static final String ROOT_HANDLE = "root"; 14 14 public static final Pattern HANDLE_WITH_SPECIAL_CHARS = Pattern.compile(".*[<>=/()\\s].*"); 15 private Institution Iinstitution;15 private Institution institution; 16 16 private String endpointUrl; 17 17 private String handle; … … 28 28 } 29 29 30 public Corpus(Institution Iinstitution, String endpointUrl) {30 public Corpus(Institution institution, String endpointUrl) { 31 31 this.institution = institution; 32 32 this.endpointUrl = endpointUrl; … … 65 65 } 66 66 67 public Institution IgetInstitution() {67 public Institution getInstitution() { 68 68 return institution; 69 69 } -
SRUAggregator/trunk/src/main/java/eu/clarin/sru/fcs/aggregator/sopt/CorpusModelCached.java
r5034 r5035 1 1 package eu.clarin.sru.fcs.aggregator.sopt; 2 2 3 import eu.clarin.sru.fcs.aggregator.cache.ScanCache I;3 import eu.clarin.sru.fcs.aggregator.cache.ScanCache; 4 4 import java.util.ArrayList; 5 5 import java.util.Arrays; … … 22 22 private static final Logger logger = Logger.getLogger(CorpusModelCached.class.getName()); 23 23 private Map<String, Set<Corpus>> selectedCorpora = new HashMap<String, Set<Corpus>>(); 24 private ScanCache Icache;24 private ScanCache cache; 25 25 26 public CorpusModelCached(ScanCache Icache) {26 public CorpusModelCached(ScanCache cache) { 27 27 super(new DefaultTreeNode(new Corpus(), new ArrayList<DefaultTreeNode<Corpus>>())); 28 28 this.cache = cache; -
SRUAggregator/trunk/src/main/java/eu/clarin/sru/fcs/aggregator/sopt/CorpusModelLive.java
r3058 r5035 8 8 import eu.clarin.sru.fcs.aggregator.app.WebAppListener; 9 9 import static eu.clarin.sru.fcs.aggregator.sopt.Corpus.ROOT_HANDLE; 10 import eu.clarin.sru.fcs.aggregator.util.SRUCQL scan;10 import eu.clarin.sru.fcs.aggregator.util.SRUCQL; 11 11 import java.util.ArrayList; 12 12 import java.util.Arrays; … … 186 186 187 187 private void initRootChildren(CenterRegistryI startingPoint) { 188 for (Institution Iinstit : startingPoint.getCQLInstitutions()) {188 for (Institution instit : startingPoint.getCQLInstitutions()) { 189 189 for (Endpoint endp : instit.getEndpoints()) { 190 190 try { … … 199 199 Future<SRUScanResponse> corporaResponse = null; 200 200 SRUScanRequest corporaRequest = new SRUScanRequest(endp.getUrl()); 201 StringBuilder scanClause = new StringBuilder(SRUCQL scan.RESOURCE_PARAMETER);201 StringBuilder scanClause = new StringBuilder(SRUCQL.SCAN_RESOURCE_PARAMETER); 202 202 scanClause.append("="); 203 203 scanClause.append(ROOT_HANDLE); 204 204 corporaRequest.setScanClause(scanClause.toString()); 205 corporaRequest.setExtraRequestData(SRUCQL scan.RESOURCE_INFO_PARAMETER, "true");205 corporaRequest.setExtraRequestData(SRUCQL.SCAN_RESOURCE_INFO_PARAMETER, "true"); 206 206 corporaResponse = sruClient.scan(corporaRequest); 207 207 SRUScanResponse response = corporaResponse.get(200, TimeUnit.SECONDS); … … 244 244 try { 245 245 SRUScanRequest corporaRequest = new SRUScanRequest(corpus.getEndpointUrl()); 246 StringBuilder scanClause = new StringBuilder(SRUCQL scan.RESOURCE_PARAMETER);246 StringBuilder scanClause = new StringBuilder(SRUCQL.SCAN_RESOURCE_PARAMETER); 247 247 scanClause.append("="); 248 248 String resourceValue = corpus.getHandle(); -
SRUAggregator/trunk/src/main/java/eu/clarin/sru/fcs/aggregator/sopt/Endpoint.java
r5034 r5035 9 9 10 10 private String url; 11 private Institution Iinstitution;11 private Institution institution; 12 12 13 public Endpoint(String url, Institution Iinstitution) {13 public Endpoint(String url, Institution institution) { 14 14 this.url = url; 15 15 this.institution = institution; … … 24 24 } 25 25 26 public Institution IgetInstitution() {26 public Institution getInstitution() { 27 27 return institution; 28 28 } 29 29 30 public void setInstitution(Institution Iinstitution) {30 public void setInstitution(Institution institution) { 31 31 this.institution = institution; 32 32 } -
SRUAggregator/trunk/src/main/java/eu/clarin/sru/fcs/aggregator/sopt/Institution.java
r5034 r5035 8 8 * @author Yana Panchenko 9 9 */ 10 public class Institution implements InstitutionI{10 public class Institution { 11 11 12 12 private String name; … … 20 20 } 21 21 22 @Override23 22 public Endpoint add(String endpointUrl) { 24 23 Endpoint ep = getEndpoint(endpointUrl); … … 30 29 } 31 30 32 @Override33 31 public String getName() { 34 32 return name; 35 33 } 36 34 37 @Override38 35 public String getLink() { 39 36 return link; 40 37 } 41 38 42 @Override43 39 public List<Endpoint> getEndpoints() { 44 40 return this.endpoints; … … 46 42 47 43 48 @Override49 44 public Endpoint getEndpoint(int index) { 50 45 if (index >= endpoints.size()) { … … 54 49 } 55 50 56 @Override57 51 public Endpoint getEndpoint(String endpointUrl) { 58 52 for (Endpoint ep : endpoints) { … … 64 58 } 65 59 66 @Override67 60 public String toString() { 68 61 if (name != null && name.length() > 0) { … … 73 66 } 74 67 75 76 68 } -
SRUAggregator/trunk/src/main/java/eu/clarin/sru/fcs/aggregator/sopt/Languages.java
r3044 r5035 2 2 3 3 import java.io.BufferedReader; 4 import java.io.FileNotFoundException;5 import java.io.FileReader;6 4 import java.io.IOException; 7 5 import java.io.InputStream; 8 6 import java.io.InputStreamReader; 9 import java.io.UnsupportedEncodingException;10 import java.util.Arrays;11 7 import java.util.HashMap; 12 import java.util.HashSet;13 8 import java.util.Map; 14 9 import java.util.Set; … … 25 20 private Map<String,String> code22Code = new HashMap<String,String>(); 26 21 public static final String LANGUAGES_FILE_PATH = "/lang/ISO-639-2_utf-8.txt"; 22 public static final String LANGUAGES_FILE_ENCODING = "UTF-8"; 23 public static final String ANY_LANGUAGE_NAME = "anylang"; 27 24 28 25 public Languages() { … … 46 43 BufferedReader br = null; 47 44 try { 48 br = new BufferedReader(new InputStreamReader(is, "UTF-8"));45 br = new BufferedReader(new InputStreamReader(is, LANGUAGES_FILE_ENCODING)); 49 46 String line; 50 47 while ((line = br.readLine()) != null) { -
SRUAggregator/trunk/src/test/java/eu/clarin/sru/fcs/aggregator/app/ScanCacheFiledTest.java
r5034 r5035 2 2 3 3 import eu.clarin.sru.fcs.aggregator.cache.ScanCacheFiled; 4 import eu.clarin.sru.fcs.aggregator.cache.SimpleInMemScanCache; 4 5 import eu.clarin.sru.fcs.aggregator.cache.ScanCache; 5 6 import eu.clarin.sru.fcs.aggregator.sopt.Corpus; … … 16 17 public class ScanCacheFiledTest { 17 18 18 //19 //20 //@Test21 //public void testReadWriteDepth1() {22 //String scanDir = "/scan-bas";23 //String scanPath1 = this.getClass().getResource(scanDir).getFile();24 //String scanPath2 = "/tmp/scan-bas";25 //File scanDir2 = new File(scanPath2);26 //if (!scanDir2.exists()) {27 //scanDir2.mkdir();28 //}29 //30 //ScanCacheFiled scanFiled1 = new ScanCacheFiled(scanPath1);31 //ScanCache cacheOrig = scanFiled1.read();32 //33 //ScanCacheFiled scanFiled2 = new ScanCacheFiled(scanPath2);34 //scanFiled2.write(cacheOrig);35 //36 //ScanCacheFiled scanFiled3 = new ScanCacheFiled(scanPath2);37 //ScanCache cacheRewritten = scanFiled3.read();38 //39 ////make sure caches contain the same info after read-write40 //Assert.assertEquals(cacheOrig.getInstitutions().size(), cacheRewritten.getInstitutions().size());41 //Endpoint epOrig = cacheOrig.getInstitutions().get(2).getEndpoint(0);42 //Endpoint epRewritten = cacheRewritten.getInstitutions().get(2).getEndpoint(0);43 //Assert.assertEquals(epOrig.getUrl(), epRewritten.getUrl());44 //Assert.assertEquals(epOrig, epRewritten);45 //List<Corpus> rootCorporaOrig = cacheOrig.getRootCorporaOfEndpoint(epOrig.getUrl());46 //List<Corpus> rootCorporaRewritten = cacheRewritten.getRootCorporaOfEndpoint(epOrig.getUrl());47 //Assert.assertEquals(rootCorporaOrig.size(), rootCorporaRewritten.size());48 //Assert.assertEquals(3, rootCorporaRewritten.size());49 //Assert.assertEquals(rootCorporaOrig.get(0), rootCorporaRewritten.get(0));50 // List<Corpus> childenOrig = cacheOrig.getChildrenCorpora(rootCorporaOrig.get(0).getHandle());51 // List<Corpus> childenRewritten = cacheRewritten.getChildrenCorpora(rootCorporaOrig.get(0).getHandle());52 
//Assert.assertEquals(childenOrig, childenRewritten);53 //Assert.assertEquals(rootCorporaOrig.get(0).getLanguages(), rootCorporaRewritten.get(0).getLanguages());54 //55 ////System.out.println(cacheOrig);56 ////System.out.println();57 ////System.out.println(cacheRewritten);58 //}59 //60 //@Test61 //public void testReadWriteDepth2() {62 //String scanDir = "/scan-mpi";63 //String scanPath1 = this.getClass().getResource(scanDir).getFile();64 //String scanPath2 = "/tmp/scan-mpi";65 //File scanDir2 = new File(scanPath2);66 //if (!scanDir2.exists()) {67 //scanDir2.mkdir();68 //}69 //70 //ScanCacheFiled scanFiled1 = new ScanCacheFiled(scanPath1);71 //ScanCache cacheOrig = scanFiled1.read();72 //73 //ScanCacheFiled scanFiled2 = new ScanCacheFiled(scanPath2);74 //scanFiled2.write(cacheOrig);75 //76 //ScanCacheFiled scanFiled3 = new ScanCacheFiled(scanPath2);77 //ScanCache cacheRewritten = scanFiled3.read();78 //79 ////make sure caches contain the same info after read-write80 //Assert.assertEquals(cacheOrig.getInstitutions().size(), cacheRewritten.getInstitutions().size());81 //Endpoint epOrig = cacheOrig.getInstitutions().get(4).getEndpoint(0);82 //Endpoint epRewritten = cacheRewritten.getInstitutions().get(4).getEndpoint(0);83 //Assert.assertEquals(epOrig.getUrl(), epRewritten.getUrl());84 //Assert.assertEquals(epOrig, epRewritten);85 //List<Corpus> rootCorporaOrig = cacheOrig.getRootCorporaOfEndpoint(epOrig.getUrl());86 //List<Corpus> rootCorporaRewritten = cacheRewritten.getRootCorporaOfEndpoint(epOrig.getUrl());87 //Assert.assertEquals(rootCorporaOrig.size(), rootCorporaRewritten.size());88 //Assert.assertEquals(3, rootCorporaRewritten.size());89 //Assert.assertEquals(rootCorporaOrig.get(0), rootCorporaRewritten.get(0));90 // List<Corpus> childenOrig = cacheOrig.getChildrenCorpora(rootCorporaOrig.get(0).getHandle());91 // List<Corpus> childenRewritten = cacheRewritten.getChildrenCorpora(rootCorporaOrig.get(0).getHandle());92 //Assert.assertEquals(childenOrig, 
childenRewritten);93 //Assert.assertEquals(2, childenRewritten.size());94 //Assert.assertEquals(rootCorporaOrig.get(0).getLanguages(), rootCorporaRewritten.get(0).getLanguages());95 //96 // //System.out.println(cacheOrig);97 // //System.out.println();98 // //System.out.println(cacheRewritten);99 //}100 //101 //@Test102 //public void testReadWriteDefaultCorpus() {103 //String scanDir = "/scan-def";104 //String scanPath1 = this.getClass().getResource(scanDir).getFile();105 //String scanPath2 = "/tmp/scan-def";106 //File scanDir2 = new File(scanPath2);107 //if (!scanDir2.exists()) {108 //scanDir2.mkdir();109 //}110 //111 //ScanCacheFiled scanFiled1 = new ScanCacheFiled(scanPath1);112 //ScanCache cacheOrig = scanFiled1.read();113 //114 //ScanCacheFiled scanFiled2 = new ScanCacheFiled(scanPath2);115 //scanFiled2.write(cacheOrig);116 //117 //ScanCacheFiled scanFiled3 = new ScanCacheFiled(scanPath2);118 //ScanCache cacheRewritten = scanFiled3.read();119 //120 ////make sure caches contain the same info after read-write121 //Assert.assertEquals(cacheOrig.getInstitutions().size(), cacheRewritten.getInstitutions().size());122 //Endpoint epOrig = cacheOrig.getInstitutions().get(4).getEndpoint(0);123 //Endpoint epRewritten = cacheRewritten.getInstitutions().get(4).getEndpoint(0);124 //Assert.assertEquals(epOrig.getUrl(), epRewritten.getUrl());125 //Assert.assertEquals(epOrig, epRewritten);126 //List<Corpus> rootCorporaOrig = cacheOrig.getRootCorporaOfEndpoint(epOrig.getUrl());127 //List<Corpus> rootCorporaRewritten = cacheRewritten.getRootCorporaOfEndpoint(epOrig.getUrl());128 //Assert.assertEquals(rootCorporaOrig.size(), rootCorporaRewritten.size());129 //Assert.assertEquals(1, rootCorporaRewritten.size());130 //Assert.assertEquals(rootCorporaOrig.get(0).getLanguages(), rootCorporaRewritten.get(0).getLanguages());131 //132 // //System.out.println(cacheOrig);133 // //System.out.println();134 // //System.out.println(cacheRewritten);135 //}19 20 21 @Test 22 public void 
testReadWriteDepth1() { 23 String scanDir = "/scan-bas"; 24 String scanPath1 = this.getClass().getResource(scanDir).getFile(); 25 String scanPath2 = "/tmp/scan-bas"; 26 File scanDir2 = new File(scanPath2); 27 if (!scanDir2.exists()) { 28 scanDir2.mkdir(); 29 } 30 31 ScanCacheFiled scanFiled1 = new ScanCacheFiled(scanPath1); 32 ScanCache cacheOrig = scanFiled1.read(); 33 34 ScanCacheFiled scanFiled2 = new ScanCacheFiled(scanPath2); 35 scanFiled2.write(cacheOrig); 36 37 ScanCacheFiled scanFiled3 = new ScanCacheFiled(scanPath2); 38 ScanCache cacheRewritten = scanFiled3.read(); 39 40 //make sure caches contain the same info after read-write 41 Assert.assertEquals(cacheOrig.getInstitutions().size(), cacheRewritten.getInstitutions().size()); 42 Endpoint epOrig = cacheOrig.getInstitutions().get(2).getEndpoint(0); 43 Endpoint epRewritten = cacheRewritten.getInstitutions().get(2).getEndpoint(0); 44 Assert.assertEquals(epOrig.getUrl(), epRewritten.getUrl()); 45 Assert.assertEquals(epOrig, epRewritten); 46 List<Corpus> rootCorporaOrig = cacheOrig.getRootCorporaOfEndpoint(epOrig.getUrl()); 47 List<Corpus> rootCorporaRewritten = cacheRewritten.getRootCorporaOfEndpoint(epOrig.getUrl()); 48 Assert.assertEquals(rootCorporaOrig.size(), rootCorporaRewritten.size()); 49 Assert.assertEquals(3, rootCorporaRewritten.size()); 50 Assert.assertEquals(rootCorporaOrig.get(0), rootCorporaRewritten.get(0)); 51 List<Corpus> childenOrig = cacheOrig.getChildren(rootCorporaOrig.get(0)); 52 List<Corpus> childenRewritten = cacheRewritten.getChildren(rootCorporaOrig.get(0)); 53 Assert.assertEquals(childenOrig, childenRewritten); 54 Assert.assertEquals(rootCorporaOrig.get(0).getLanguages(), rootCorporaRewritten.get(0).getLanguages()); 55 56 //System.out.println(cacheOrig); 57 //System.out.println(); 58 //System.out.println(cacheRewritten); 59 } 60 61 @Test 62 public void testReadWriteDepth2() { 63 String scanDir = "/scan-mpi"; 64 String scanPath1 = this.getClass().getResource(scanDir).getFile(); 65 
String scanPath2 = "/tmp/scan-mpi"; 66 File scanDir2 = new File(scanPath2); 67 if (!scanDir2.exists()) { 68 scanDir2.mkdir(); 69 } 70 71 ScanCacheFiled scanFiled1 = new ScanCacheFiled(scanPath1); 72 ScanCache cacheOrig = scanFiled1.read(); 73 74 ScanCacheFiled scanFiled2 = new ScanCacheFiled(scanPath2); 75 scanFiled2.write(cacheOrig); 76 77 ScanCacheFiled scanFiled3 = new ScanCacheFiled(scanPath2); 78 ScanCache cacheRewritten = scanFiled3.read(); 79 80 //make sure caches contain the same info after read-write 81 Assert.assertEquals(cacheOrig.getInstitutions().size(), cacheRewritten.getInstitutions().size()); 82 Endpoint epOrig = cacheOrig.getInstitutions().get(4).getEndpoint(0); 83 Endpoint epRewritten = cacheRewritten.getInstitutions().get(4).getEndpoint(0); 84 Assert.assertEquals(epOrig.getUrl(), epRewritten.getUrl()); 85 Assert.assertEquals(epOrig, epRewritten); 86 List<Corpus> rootCorporaOrig = cacheOrig.getRootCorporaOfEndpoint(epOrig.getUrl()); 87 List<Corpus> rootCorporaRewritten = cacheRewritten.getRootCorporaOfEndpoint(epOrig.getUrl()); 88 Assert.assertEquals(rootCorporaOrig.size(), rootCorporaRewritten.size()); 89 Assert.assertEquals(3, rootCorporaRewritten.size()); 90 Assert.assertEquals(rootCorporaOrig.get(0), rootCorporaRewritten.get(0)); 91 List<Corpus> childenOrig = cacheOrig.getChildren(rootCorporaOrig.get(0)); 92 List<Corpus> childenRewritten = cacheRewritten.getChildren(rootCorporaOrig.get(0)); 93 Assert.assertEquals(childenOrig, childenRewritten); 94 Assert.assertEquals(2, childenRewritten.size()); 95 Assert.assertEquals(rootCorporaOrig.get(0).getLanguages(), rootCorporaRewritten.get(0).getLanguages()); 96 97 // System.out.println(cacheOrig); 98 // System.out.println(); 99 // System.out.println(cacheRewritten); 100 } 101 102 @Test 103 public void testReadWriteDefaultCorpus() { 104 String scanDir = "/scan-def"; 105 String scanPath1 = this.getClass().getResource(scanDir).getFile(); 106 String scanPath2 = "/tmp/scan-def"; 107 File scanDir2 = new 
File(scanPath2); 108 if (!scanDir2.exists()) { 109 scanDir2.mkdir(); 110 } 111 112 ScanCacheFiled scanFiled1 = new ScanCacheFiled(scanPath1); 113 ScanCache cacheOrig = scanFiled1.read(); 114 115 ScanCacheFiled scanFiled2 = new ScanCacheFiled(scanPath2); 116 scanFiled2.write(cacheOrig); 117 118 ScanCacheFiled scanFiled3 = new ScanCacheFiled(scanPath2); 119 ScanCache cacheRewritten = scanFiled3.read(); 120 121 //make sure caches contain the same info after read-write 122 Assert.assertEquals(cacheOrig.getInstitutions().size(), cacheRewritten.getInstitutions().size()); 123 Endpoint epOrig = cacheOrig.getInstitutions().get(4).getEndpoint(0); 124 Endpoint epRewritten = cacheRewritten.getInstitutions().get(4).getEndpoint(0); 125 Assert.assertEquals(epOrig.getUrl(), epRewritten.getUrl()); 126 Assert.assertEquals(epOrig, epRewritten); 127 List<Corpus> rootCorporaOrig = cacheOrig.getRootCorporaOfEndpoint(epOrig.getUrl()); 128 List<Corpus> rootCorporaRewritten = cacheRewritten.getRootCorporaOfEndpoint(epOrig.getUrl()); 129 Assert.assertEquals(rootCorporaOrig.size(), rootCorporaRewritten.size()); 130 Assert.assertEquals(1, rootCorporaRewritten.size()); 131 Assert.assertEquals(rootCorporaOrig.get(0).getLanguages(), rootCorporaRewritten.get(0).getLanguages()); 132 133 // System.out.println(cacheOrig); 134 // System.out.println(); 135 // System.out.println(cacheRewritten); 136 } 136 137 137 138 @Test -
SRUAggregator/trunk/src/test/java/eu/clarin/sru/fcs/aggregator/app/ScanCrawlerTest.java
r5034 r5035 2 2 3 3 import eu.clarin.sru.client.SRUThreadedClient; 4 import eu.clarin.sru.fcs.aggregator.cache.EndpointUrlFilter; 5 import eu.clarin.sru.fcs.aggregator.cache.SimpleInMemScanCache; 6 import eu.clarin.sru.fcs.aggregator.cache.ScanCrawler; 4 7 import eu.clarin.sru.fcs.aggregator.sopt.CenterRegistryLive; 5 8 import eu.clarin.sru.fcs.aggregator.sopt.Corpus; … … 19 22 public void testCrawlForMpiAndTue() { 20 23 21 // SRUThreadedClient sruClient = new SRUThreadedClient(); 22 // 23 // try { 24 // EndpointUrlFilter filter = new EndpointUrlFilter(); 25 // filter.urlShouldContainAnyOf("uni-tuebingen.de", "mpi.nl"); 26 // ScanCrawler crawler = new ScanCrawler(new CenterRegistryLive(), sruClient, filter, 2); 27 // ScanCache cache = crawler.crawl(); 28 // Corpus tueRootCorpus = cache.getRootCorporaOfEndpoint("http://weblicht.sfs.uni-tuebingen.de/rws/sru/").get(0); 29 // Corpus mpiRootCorpus = cache.getRootCorporaOfEndpoint("http://cqlservlet.mpi.nl/").get(0); 30 // Assert.assertEquals("http://hdl.handle.net/11858/00-1778-0000-0001-DDAF-D", 31 // tueRootCorpus.getHandle()); 32 // Assert.assertEquals("hdl:1839/00-0000-0000-0003-4692-D@format=cmdi", 33 // cache.getChildrenCorpora("hdl:1839/00-0000-0000-0001-53A5-2@format=cmdi").get(0).getHandle()); 34 // //check if languages and other corpus data is crawled corectly... 
35 // Set<String> tueLangs = new HashSet<String>(); 36 // tueLangs.add("deu"); 37 // Assert.assertEquals(tueLangs, tueRootCorpus.getLanguages()); 38 // String tueDescSubstring = "Tübingen Treebank"; 39 // Assert.assertTrue("Description problem", tueRootCorpus.getDescription().contains(tueDescSubstring)); 40 // String tueNameSubstring = "TuebaDDC"; 41 // Assert.assertTrue("Name problem", tueRootCorpus.getDisplayName().contains(tueNameSubstring)); 42 // String tuePageSubstring = "sfs.uni-tuebingen.de"; 43 // Assert.assertTrue("Landing page problem", tueRootCorpus.getLandingPage().contains(tuePageSubstring)); 44 // Assert.assertTrue("Number of records problem", mpiRootCorpus.getNumberOfRecords() > 10); 45 // 46 // } finally { 47 // sruClient.shutdown(); 48 // } 49 // 24 SRUThreadedClient sruClient = new SRUThreadedClient(); 25 26 try { 27 EndpointUrlFilter filter = new EndpointUrlFilter(); 28 //filter.urlShouldContainAnyOf("leipzig", ".mpi.nl"); 29 filter.urlShouldContainAnyOf("uni-tuebingen.de", ".mpi.nl"); 30 //filter.urlShouldContainAnyOf("dspin.dwds.de", "lindat."); 31 ScanCrawler crawler = new ScanCrawler(new CenterRegistryLive(), sruClient, filter, 2); 32 SimpleInMemScanCache cache = new SimpleInMemScanCache(); 33 crawler.crawl(cache); 34 Corpus tueRootCorpus = cache.getRootCorporaOfEndpoint("http://weblicht.sfs.uni-tuebingen.de/rws/sru/").get(0); 35 Corpus mpiRootCorpus = cache.getRootCorporaOfEndpoint("http://cqlservlet.mpi.nl/").get(0); 36 Assert.assertEquals("http://hdl.handle.net/11858/00-1778-0000-0001-DDAF-D", 37 tueRootCorpus.getHandle()); 38 Corpus mpiCorpus = cache.getCorpus("hdl:1839/00-0000-0000-0001-53A5-2@format=cmdi"); 39 Assert.assertEquals("hdl:1839/00-0000-0000-0003-4692-D@format=cmdi", cache.getChildren(mpiCorpus).get(0).getHandle()); 40 //check if languages and other corpus data is crawled corectly... 
41 Set<String> tueLangs = new HashSet<String>(); 42 tueLangs.add("deu"); 43 Assert.assertEquals(tueLangs, tueRootCorpus.getLanguages()); 44 String tueDescSubstring = "Tübingen Treebank"; 45 Assert.assertTrue("Description problem", tueRootCorpus.getDescription().contains(tueDescSubstring)); 46 String tueNameSubstring = "TuebaDDC"; 47 Assert.assertTrue("Name problem", tueRootCorpus.getDisplayName().contains(tueNameSubstring)); 48 String tuePageSubstring = "sfs.uni-tuebingen.de"; 49 Assert.assertTrue("Landing page problem", tueRootCorpus.getLandingPage().contains(tuePageSubstring)); 50 Assert.assertTrue("Number of records problem", mpiRootCorpus.getNumberOfRecords() > 10); 51 52 } finally { 53 sruClient.shutdown(); 54 } 55 50 56 } 51 57 }
Note: See TracChangeset
for help on using the changeset viewer.