1 | package eu.clarin.sru.fcs.aggregator.app; |
---|
2 | |
---|
3 | import eu.clarin.sru.client.SRUThreadedClient; |
---|
4 | import eu.clarin.sru.client.fcs.ClarinFCSClientBuilder; |
---|
5 | import eu.clarin.sru.fcs.aggregator.client.MaxConcurrentRequestsCallback; |
---|
6 | import eu.clarin.sru.fcs.aggregator.scan.Corpora; |
---|
7 | import eu.clarin.sru.fcs.aggregator.scan.EndpointUrlFilterAllow; |
---|
8 | import eu.clarin.sru.fcs.aggregator.scan.ScanCrawler; |
---|
9 | import eu.clarin.sru.fcs.aggregator.client.ThrottledClient; |
---|
10 | import eu.clarin.sru.fcs.aggregator.scan.CenterRegistryLive; |
---|
11 | import eu.clarin.sru.fcs.aggregator.scan.Corpus; |
---|
12 | import java.net.URI; |
---|
13 | import java.util.HashSet; |
---|
14 | import java.util.Set; |
---|
15 | import javax.naming.InitialContext; |
---|
16 | import javax.naming.NamingException; |
---|
17 | import org.junit.Assert; |
---|
18 | import org.junit.Ignore; |
---|
19 | import org.junit.Test; |
---|
20 | |
---|
21 | /** |
---|
22 | * |
---|
23 | * @author yanapanchenko |
---|
24 | */ |
---|
25 | @Ignore |
---|
26 | public class ScanCrawlerTest { |
---|
27 | |
---|
28 | @Test |
---|
29 | public void testCrawlForMpiAndTue() throws NamingException { |
---|
30 | SRUThreadedClient sruThreadedClient = new ClarinFCSClientBuilder() |
---|
31 | .addDefaultDataViewParsers() |
---|
32 | .buildThreadedClient(); |
---|
33 | MaxConcurrentRequestsCallback callback = new MaxConcurrentRequestsCallback() { |
---|
34 | @Override |
---|
35 | public int getMaxConcurrentRequest(URI baseURI) { |
---|
36 | return 2; |
---|
37 | } |
---|
38 | }; |
---|
39 | ThrottledClient sruClient = new ThrottledClient( |
---|
40 | sruThreadedClient, callback, |
---|
41 | sruThreadedClient, callback |
---|
42 | ); |
---|
43 | |
---|
44 | try { |
---|
45 | EndpointUrlFilterAllow filter = new EndpointUrlFilterAllow("uni-tuebingen.de"); //, "leipzig", ".mpi.nl", "dspin.dwds.de", "lindat." |
---|
46 | |
---|
47 | InitialContext context = new InitialContext(); |
---|
48 | String centerRegistryUrl = (String) context.lookup("java:comp/env/center-registry-url"); |
---|
49 | ScanCrawler crawler = new ScanCrawler( |
---|
50 | new CenterRegistryLive(centerRegistryUrl, filter).getCQLInstitutions(), |
---|
51 | sruClient, 2); |
---|
52 | Corpora cache = crawler.crawl(); |
---|
53 | Corpus tueRootCorpus = cache.findByEndpoint("http://weblicht.sfs.uni-tuebingen.de/rws/sru/").get(0); |
---|
54 | Corpus mpiRootCorpus = cache.findByEndpoint("http://cqlservlet.mpi.nl/").get(0); |
---|
55 | Assert.assertEquals("http://hdl.handle.net/11858/00-1778-0000-0001-DDAF-D", |
---|
56 | tueRootCorpus.getHandle()); |
---|
57 | Corpus mpiCorpus = cache.findByHandle("hdl:1839/00-0000-0000-0001-53A5-2@format=cmdi"); |
---|
58 | Assert.assertEquals("hdl:1839/00-0000-0000-0003-4692-D@format=cmdi", mpiCorpus.getSubCorpora().get(0).getHandle()); |
---|
59 | //check if languages and other corpus data is crawled corectly... |
---|
60 | Set<String> tueLangs = new HashSet<>(); |
---|
61 | tueLangs.add("deu"); |
---|
62 | Assert.assertEquals(tueLangs, tueRootCorpus.getLanguages()); |
---|
63 | String tueDescSubstring = "TÃŒbingen Treebank"; |
---|
64 | Assert.assertTrue("Description problem", tueRootCorpus.getDescription().contains(tueDescSubstring)); |
---|
65 | String tueNameSubstring = "TuebaDDC"; |
---|
66 | Assert.assertTrue("Name problem", tueRootCorpus.getTitle().contains(tueNameSubstring)); |
---|
67 | String tuePageSubstring = "sfs.uni-tuebingen.de"; |
---|
68 | Assert.assertTrue("Landing page problem", tueRootCorpus.getLandingPage().contains(tuePageSubstring)); |
---|
69 | Assert.assertTrue("Number of records problem", mpiRootCorpus.getNumberOfRecords() > 10); |
---|
70 | |
---|
71 | } finally { |
---|
72 | sruClient.shutdown(); |
---|
73 | } |
---|
74 | } |
---|
75 | } |
---|