Context Navigation

source: SRUAggregator/trunk/src/test/java/eu/clarin/sru/fcs/aggregator/app/ScanCrawlerTest.java @ 5971

Last change on this file since 5971 was 5971, checked in by emanuel.dima@uni-tuebingen.de, 9 years ago
alpha 19: better UI for statistics, misc bugfixes
File size: 2.7 KB

Line
1	package eu.clarin.sru.fcs.aggregator.app;
2
3	import eu.clarin.sru.client.fcs.ClarinFCSClientBuilder;
4	import eu.clarin.sru.fcs.aggregator.scan.Corpora;
5	import eu.clarin.sru.fcs.aggregator.scan.EndpointUrlFilterAllow;
6	import eu.clarin.sru.fcs.aggregator.scan.ScanCrawler;
7	import eu.clarin.sru.fcs.aggregator.client.ThrottledClient;
8	import eu.clarin.sru.fcs.aggregator.scan.CenterRegistry;
9	import eu.clarin.sru.fcs.aggregator.scan.CenterRegistryLive;
10	import eu.clarin.sru.fcs.aggregator.scan.Corpus;
11	import java.util.HashSet;
12	import java.util.Set;
13	import javax.naming.InitialContext;
14	import javax.naming.NamingException;
15	import org.junit.Assert;
16	import org.junit.Ignore;
17	import org.junit.Test;
18
19	/**
20	*
21	* @author yanapanchenko
22	*/
23	@Ignore
24	public class ScanCrawlerTest {
25
26	@Test
27	public void testCrawlForMpiAndTue() throws NamingException {
28
29	ThrottledClient sruClient = new ThrottledClient(
30	new ClarinFCSClientBuilder()
31	.addDefaultDataViewParsers()
32	.buildThreadedClient(), 2);
33
34	try {
35	EndpointUrlFilterAllow filter = new EndpointUrlFilterAllow("uni-tuebingen.de"); //, "leipzig", ".mpi.nl", "dspin.dwds.de", "lindat."
36
37	InitialContext context = new InitialContext();
38	String centerRegistryUrl = (String) context.lookup("java:comp/env/center-registry-url");
39	ScanCrawler crawler = new ScanCrawler(
40	new CenterRegistryLive(centerRegistryUrl, filter).getCQLInstitutions(),
41	sruClient, 2);
42	Corpora cache = crawler.crawl();
43	Corpus tueRootCorpus = cache.findByEndpoint("http://weblicht.sfs.uni-tuebingen.de/rws/sru/").get(0);
44	Corpus mpiRootCorpus = cache.findByEndpoint("http://cqlservlet.mpi.nl/").get(0);
45	Assert.assertEquals("http://hdl.handle.net/11858/00-1778-0000-0001-DDAF-D",
46	tueRootCorpus.getHandle());
47	Corpus mpiCorpus = cache.findByHandle("hdl:1839/00-0000-0000-0001-53A5-2@format=cmdi");
48	Assert.assertEquals("hdl:1839/00-0000-0000-0003-4692-D@format=cmdi", mpiCorpus.getSubCorpora().get(0).getHandle());
49	//check if languages and other corpus data is crawled corectly...
50	Set<String> tueLangs = new HashSet<>();
51	tueLangs.add("deu");
52	Assert.assertEquals(tueLangs, tueRootCorpus.getLanguages());
53	String tueDescSubstring = "TÃŒbingen Treebank";
54	Assert.assertTrue("Description problem", tueRootCorpus.getDescription().contains(tueDescSubstring));
55	String tueNameSubstring = "TuebaDDC";
56	Assert.assertTrue("Name problem", tueRootCorpus.getTitle().contains(tueNameSubstring));
57	String tuePageSubstring = "sfs.uni-tuebingen.de";
58	Assert.assertTrue("Landing page problem", tueRootCorpus.getLandingPage().contains(tuePageSubstring));
59	Assert.assertTrue("Number of records problem", mpiRootCorpus.getNumberOfRecords() > 10);
60
61	} finally {
62	sruClient.shutdown();
63	}
64	}
65	}

Note: See TracBrowser for help on using the repository browser.

Download in other formats: