1 | package eu.clarin.sru.fcs.aggregator.app; |
---|
2 | |
---|
3 | import eu.clarin.sru.client.fcs.ClarinFCSClientBuilder; |
---|
4 | import eu.clarin.sru.fcs.aggregator.scan.Corpora; |
---|
5 | import eu.clarin.sru.fcs.aggregator.scan.EndpointUrlFilterAllow; |
---|
6 | import eu.clarin.sru.fcs.aggregator.scan.ScanCrawler; |
---|
7 | import eu.clarin.sru.fcs.aggregator.client.ThrottledClient; |
---|
8 | import eu.clarin.sru.fcs.aggregator.scan.CenterRegistry; |
---|
9 | import eu.clarin.sru.fcs.aggregator.scan.CenterRegistryLive; |
---|
10 | import eu.clarin.sru.fcs.aggregator.scan.Corpus; |
---|
11 | import java.util.HashSet; |
---|
12 | import java.util.Set; |
---|
13 | import javax.naming.InitialContext; |
---|
14 | import javax.naming.NamingException; |
---|
15 | import org.junit.Assert; |
---|
16 | import org.junit.Ignore; |
---|
17 | import org.junit.Test; |
---|
18 | |
---|
19 | /** |
---|
20 | * |
---|
21 | * @author yanapanchenko |
---|
22 | */ |
---|
23 | @Ignore |
---|
24 | public class ScanCrawlerTest { |
---|
25 | |
---|
26 | @Test |
---|
27 | public void testCrawlForMpiAndTue() throws NamingException { |
---|
28 | |
---|
29 | ThrottledClient sruClient = new ThrottledClient( |
---|
30 | new ClarinFCSClientBuilder() |
---|
31 | .addDefaultDataViewParsers() |
---|
32 | .buildThreadedClient(), 2, 2); |
---|
33 | |
---|
34 | try { |
---|
35 | EndpointUrlFilterAllow filter = new EndpointUrlFilterAllow("uni-tuebingen.de"); //, "leipzig", ".mpi.nl", "dspin.dwds.de", "lindat." |
---|
36 | |
---|
37 | InitialContext context = new InitialContext(); |
---|
38 | String centerRegistryUrl = (String) context.lookup("java:comp/env/center-registry-url"); |
---|
39 | CenterRegistry centerRegistry = new CenterRegistryLive(centerRegistryUrl, filter); |
---|
40 | ScanCrawler crawler = new ScanCrawler(centerRegistry, sruClient, 2); |
---|
41 | Corpora cache = crawler.crawl(); |
---|
42 | Corpus tueRootCorpus = cache.findByEndpoint("http://weblicht.sfs.uni-tuebingen.de/rws/sru/").get(0); |
---|
43 | Corpus mpiRootCorpus = cache.findByEndpoint("http://cqlservlet.mpi.nl/").get(0); |
---|
44 | Assert.assertEquals("http://hdl.handle.net/11858/00-1778-0000-0001-DDAF-D", |
---|
45 | tueRootCorpus.getHandle()); |
---|
46 | Corpus mpiCorpus = cache.findByHandle("hdl:1839/00-0000-0000-0001-53A5-2@format=cmdi"); |
---|
47 | Assert.assertEquals("hdl:1839/00-0000-0000-0003-4692-D@format=cmdi", mpiCorpus.getSubCorpora().get(0).getHandle()); |
---|
48 | //check if languages and other corpus data is crawled corectly... |
---|
49 | Set<String> tueLangs = new HashSet<>(); |
---|
50 | tueLangs.add("deu"); |
---|
51 | Assert.assertEquals(tueLangs, tueRootCorpus.getLanguages()); |
---|
52 | String tueDescSubstring = "TÃŒbingen Treebank"; |
---|
53 | Assert.assertTrue("Description problem", tueRootCorpus.getDescription().contains(tueDescSubstring)); |
---|
54 | String tueNameSubstring = "TuebaDDC"; |
---|
55 | Assert.assertTrue("Name problem", tueRootCorpus.getDisplayName().contains(tueNameSubstring)); |
---|
56 | String tuePageSubstring = "sfs.uni-tuebingen.de"; |
---|
57 | Assert.assertTrue("Landing page problem", tueRootCorpus.getLandingPage().contains(tuePageSubstring)); |
---|
58 | Assert.assertTrue("Number of records problem", mpiRootCorpus.getNumberOfRecords() > 10); |
---|
59 | |
---|
60 | } finally { |
---|
61 | sruClient.shutdown(); |
---|
62 | } |
---|
63 | } |
---|
64 | } |
---|