1 | package eu.clarin.sru.fcs.aggregator.scan; |
---|
2 | |
---|
3 | import com.fasterxml.jackson.databind.ObjectMapper; |
---|
4 | import eu.clarin.sru.fcs.aggregator.client.ThrottledClient; |
---|
5 | import java.io.File; |
---|
6 | import java.io.IOException; |
---|
7 | import java.util.concurrent.atomic.AtomicReference; |
---|
8 | import org.slf4j.LoggerFactory; |
---|
9 | |
---|
10 | /** |
---|
11 | * @author yanapanchenko |
---|
12 | * @author edima |
---|
13 | */ |
---|
14 | public class ScanCrawlTask implements Runnable { |
---|
15 | |
---|
16 | private static final org.slf4j.Logger log = LoggerFactory.getLogger(ScanCrawlTask.class); |
---|
17 | |
---|
18 | private ThrottledClient sruClient; |
---|
19 | private int cacheMaxDepth; |
---|
20 | private EndpointFilter filter; |
---|
21 | private AtomicReference<Corpora> corporaAtom; |
---|
22 | private File cachedCorpora; |
---|
23 | private AtomicReference<Statistics> statisticsAtom; |
---|
24 | private String centerRegistryUrl; |
---|
25 | |
---|
26 | public ScanCrawlTask(ThrottledClient sruClient, String centerRegistryUrl, |
---|
27 | int cacheMaxDepth, EndpointFilter filter, |
---|
28 | AtomicReference<Corpora> corporaAtom, File cachedCorpora, |
---|
29 | AtomicReference<Statistics> statisticsAtom |
---|
30 | ) { |
---|
31 | this.sruClient = sruClient; |
---|
32 | this.centerRegistryUrl = centerRegistryUrl; |
---|
33 | this.cacheMaxDepth = cacheMaxDepth; |
---|
34 | this.filter = filter; |
---|
35 | this.corporaAtom = corporaAtom; |
---|
36 | this.cachedCorpora = cachedCorpora; |
---|
37 | this.statisticsAtom = statisticsAtom; |
---|
38 | } |
---|
39 | |
---|
40 | @Override |
---|
41 | public void run() { |
---|
42 | try { |
---|
43 | long time0 = System.currentTimeMillis(); |
---|
44 | |
---|
45 | log.info("ScanCrawlTask: Initiating crawl"); |
---|
46 | CenterRegistry centerRegistry = new CenterRegistryLive(centerRegistryUrl, filter); |
---|
47 | ScanCrawler scanCrawler = new ScanCrawler(centerRegistry, sruClient, cacheMaxDepth); |
---|
48 | |
---|
49 | log.info("ScanCrawlTask: Starting crawl"); |
---|
50 | Corpora corpora = scanCrawler.crawl(); |
---|
51 | |
---|
52 | corporaAtom.set(corpora); |
---|
53 | statisticsAtom.set(scanCrawler.getStatistics()); |
---|
54 | long time = System.currentTimeMillis() - time0; |
---|
55 | |
---|
56 | log.info("ScanCrawlTask: crawl done in {}s, number of root corpora: {}", |
---|
57 | time / 1000., corpora.getCorpora().size()); |
---|
58 | |
---|
59 | if (corpora.getCorpora().isEmpty()) { |
---|
60 | log.warn("ScanCrawlTask: Skipped writing to disk (no corpora). Finished."); |
---|
61 | } else { |
---|
62 | ObjectMapper mapper = new ObjectMapper(); |
---|
63 | mapper.writerWithDefaultPrettyPrinter().writeValue(cachedCorpora, corpora); |
---|
64 | log.info("ScanCrawlTask: wrote to disk, finished"); |
---|
65 | } |
---|
66 | } catch (IOException xc) { |
---|
67 | log.error("!!! Scan Crawler task IO exception", xc); |
---|
68 | } catch (Throwable xc) { |
---|
69 | log.error("!!! Scan Crawler task throwable exception", xc); |
---|
70 | throw xc; |
---|
71 | } |
---|
72 | } |
---|
73 | } |
---|