1 | package eu.clarin.sru.fcs.aggregator.app; |
---|
2 | |
---|
3 | import com.fasterxml.jackson.databind.ObjectMapper; |
---|
4 | import com.optimaize.langdetect.LanguageDetector; |
---|
5 | import com.optimaize.langdetect.LanguageDetectorBuilder; |
---|
6 | import com.optimaize.langdetect.ngram.NgramExtractors; |
---|
7 | import com.optimaize.langdetect.profiles.LanguageProfile; |
---|
8 | import com.optimaize.langdetect.profiles.LanguageProfileReader; |
---|
9 | import com.optimaize.langdetect.text.*; |
---|
10 | import eu.clarin.sru.fcs.aggregator.search.Search; |
---|
11 | import eu.clarin.sru.fcs.aggregator.scan.ScanCrawlTask; |
---|
12 | import eu.clarin.sru.fcs.aggregator.scan.Corpora; |
---|
13 | import eu.clarin.sru.client.SRUVersion; |
---|
14 | import eu.clarin.sru.client.fcs.ClarinFCSClientBuilder; |
---|
15 | import eu.clarin.sru.client.fcs.ClarinFCSEndpointDescriptionParser; |
---|
16 | import eu.clarin.sru.fcs.aggregator.client.ThrottledClient; |
---|
17 | import eu.clarin.sru.fcs.aggregator.scan.Corpus; |
---|
18 | import eu.clarin.sru.fcs.aggregator.rest.RestService; |
---|
19 | import eu.clarin.sru.fcs.aggregator.scan.Statistics; |
---|
20 | import eu.clarin.sru.fcs.aggregator.lang.LanguagesISO693_3; |
---|
21 | import io.dropwizard.Application; |
---|
22 | import io.dropwizard.assets.AssetsBundle; |
---|
23 | import io.dropwizard.setup.Bootstrap; |
---|
24 | import io.dropwizard.setup.Environment; |
---|
25 | import java.io.File; |
---|
26 | import java.io.IOException; |
---|
27 | import java.io.InputStream; |
---|
28 | import java.util.ArrayList; |
---|
29 | import java.util.Collections; |
---|
30 | import java.util.HashMap; |
---|
31 | import java.util.List; |
---|
32 | import java.util.Map; |
---|
33 | import java.util.concurrent.ExecutorService; |
---|
34 | import java.util.concurrent.Executors; |
---|
35 | import java.util.concurrent.ScheduledExecutorService; |
---|
36 | import java.util.concurrent.atomic.AtomicReference; |
---|
37 | import opennlp.tools.tokenize.TokenizerModel; |
---|
38 | import org.slf4j.LoggerFactory; |
---|
39 | |
---|
/**
 * Main component of the Aggregator application intended to provide users access
 * to CLARIN-FCS resources.
 *
 * The webapp base URL corresponds to the default behavior of displaying the
 * main aggregator page, where the user can enter a query, select the resources of
 * CQL endpoints (as specified in the CLARIN center registry), and search in
 * these resources. The endpoints/resources selection is optional; by default
 * all the endpoints' root resources are selected.
 *
 * If invoked with 'x-aggregation-context' and 'query' parameters, the aggregator
 * will pre-select the provided resources and fill in the query field. This
 * mechanism is currently used by VLO. Example: POST
 * http://weblicht.sfs.uni-tuebingen.de/Aggregator HTTP/1.1 operation =
 * searchRetrieve &amp; version = 1.2 &amp; query = bellen &amp; x-aggregation-context =
 * {"http://fedora.clarin-d.uni-saarland.de/sru/":["hdl:11858/00-246C-0000-0008-5F2A-0"]}
 *
 *
 * Additionally, if run with a URL query string parameter 'mode', the
 * special behavior of the aggregator is triggered:
 *
 * /?mode=testing corresponds to the mode where the CQL endpoints are taken not
 * from the CLARIN center repository, but from a hard-coded endpoints list; this
 * functionality is useful for testing the development instances of endpoints,
 * before they are moved to production. Was done to meet the request from MPI.
 *
 * /?mode=search corresponds to the mode where the aggregator page is requested
 * with the already known query and (optionally) resources to search in, and if
 * the immediate search is desired. In this case the aggregator search results
 * page is displayed and search results of the provided query start to fill it
 * in immediately (i.e. users don't need to click 'search' in the aggregator
 * page). Was done to meet the request from CLARIN ERIC (Martin Wynne contacted
 * us).
 *
 * /?mode=live corresponds to the mode where the information about corpora is
 * taken not from the scan cache (crawled in advance), but loaded live, starting
 * from the request to the center registry and then performing scan operation
 * requests on each CQL endpoint listed there. It takes time to get the
 * corresponding responses from the endpoints, therefore the Aggregator page
 * loads very slowly in this mode. But this mode is useful for testing of the
 * newly added or changed corpora without waiting for the next crawl.
 *
 *
 * Adds Application initialization and clean up: only one SRU threaded client is
 * used in the application, it has to be shut down when the application stops.
 * One Languages object instance is used within the application.
 *
 * @author Yana Panchenko
 * @author edima
 *
 * TODO: Use weblicht with results
 *
 * TODO: disable popups easily
 *
 * TODO: atomic replace of cached corpora (file)
 *
 * TODO: zoom into the results from a corpus, allow functionality only for
 * the view (search for next set of results)
 *
 * TODO: fix search bug after going to stats
 *
 * TODO: Export search results to personal workspace as csv, excel, tcf, plain
 * text: ask Marie/Wei about oauth ways to do that ndg oauth; ask Menzo, Willem,
 * Twan (they did a test, it worked)
 *
 * TODO: websockets
 *
 * TODO: show multiple hits on the same result in multiple rows, linked visually
 *
 * TODO: optimise page load
 *
 * TODO: test it in IE, other browsers
 *
 * TODO: tri-state for parent collections; search + message implications
 *
 * TODO: improve help page text
 *
 */
---|
118 | public class Aggregator extends Application<AggregatorConfiguration> { |
---|
119 | |
---|
120 | private static final org.slf4j.Logger log = LoggerFactory.getLogger(Aggregator.class); |
---|
121 | |
---|
122 | public static final String DE_TOK_MODEL = "tokenizer/de-tuebadz-8.0-token.bin"; |
---|
123 | |
---|
124 | private static final ScheduledExecutorService scheduler = Executors.newScheduledThreadPool(1); |
---|
125 | private static Aggregator instance; |
---|
126 | private AggregatorConfiguration.Params params; |
---|
127 | |
---|
128 | private AtomicReference<Corpora> scanCacheAtom = new AtomicReference<Corpora>(new Corpora()); |
---|
129 | private AtomicReference<Statistics> scanStatsAtom = new AtomicReference<Statistics>(new Statistics()); |
---|
130 | private AtomicReference<Statistics> searchStatsAtom = new AtomicReference<Statistics>(new Statistics()); |
---|
131 | |
---|
132 | private TokenizerModel tokenizerModel; |
---|
133 | private LanguageDetector languageDetector; |
---|
134 | private TextObjectFactory textObjectFactory; |
---|
135 | |
---|
136 | private ThrottledClient sruScanClient = null; |
---|
137 | private ThrottledClient sruSearchClient = null; |
---|
138 | private Map<Long, Search> activeSearches = Collections.synchronizedMap(new HashMap<Long, Search>()); |
---|
139 | |
---|
140 | public static void main(String[] args) throws Exception { |
---|
141 | new Aggregator().run(args); |
---|
142 | } |
---|
143 | |
---|
144 | @Override |
---|
145 | public String getName() { |
---|
146 | return "FCS Aggregator"; |
---|
147 | } |
---|
148 | |
---|
149 | @Override |
---|
150 | public void initialize(Bootstrap<AggregatorConfiguration> bootstrap) { |
---|
151 | bootstrap.addBundle(new AssetsBundle("/assets", "/", "index.html", "static")); |
---|
152 | } |
---|
153 | |
---|
154 | |
---|
155 | @Override |
---|
156 | public void run(AggregatorConfiguration config, Environment environment) throws Exception { |
---|
157 | params = config.aggregatorParams; |
---|
158 | instance = this; |
---|
159 | |
---|
160 | System.out.println("Using parameters: "); |
---|
161 | try { |
---|
162 | System.out.println(new ObjectMapper().writerWithDefaultPrettyPrinter(). |
---|
163 | writeValueAsString(config.aggregatorParams)); |
---|
164 | } catch (IOException xc) { |
---|
165 | } |
---|
166 | |
---|
167 | environment.getApplicationContext().setErrorHandler(new ErrorPageHandler()); |
---|
168 | environment.jersey().setUrlPattern("/rest/*"); |
---|
169 | environment.jersey().register(new RestService()); |
---|
170 | |
---|
171 | try { |
---|
172 | init(); |
---|
173 | } catch (Exception ex) { |
---|
174 | log.error("INIT EXCEPTION", ex); |
---|
175 | throw ex; // force exit |
---|
176 | } |
---|
177 | } |
---|
178 | |
---|
179 | public static Aggregator getInstance() { |
---|
180 | return instance; |
---|
181 | } |
---|
182 | |
---|
183 | public AggregatorConfiguration.Params getParams() { |
---|
184 | return params; |
---|
185 | } |
---|
186 | |
---|
187 | public Corpora getCorpora() { |
---|
188 | return scanCacheAtom.get(); |
---|
189 | } |
---|
190 | |
---|
191 | public Statistics getScanStatistics() { |
---|
192 | return scanStatsAtom.get(); |
---|
193 | } |
---|
194 | |
---|
195 | public Statistics getSearchStatistics() { |
---|
196 | return searchStatsAtom.get(); |
---|
197 | } |
---|
198 | |
---|
199 | public void init() throws IOException { |
---|
200 | log.info("Aggregator initialization started."); |
---|
201 | sruScanClient = new ThrottledClient( |
---|
202 | new ClarinFCSClientBuilder() |
---|
203 | .setConnectTimeout(params.ENDPOINTS_SCAN_TIMEOUT_MS) |
---|
204 | .setSocketTimeout(params.ENDPOINTS_SCAN_TIMEOUT_MS) |
---|
205 | .addDefaultDataViewParsers() |
---|
206 | .registerExtraResponseDataParser( |
---|
207 | new ClarinFCSEndpointDescriptionParser()) |
---|
208 | .enableLegacySupport() |
---|
209 | .buildThreadedClient(), |
---|
210 | params.SCAN_MAX_CONCURRENT_REQUESTS_PER_ENDPOINT |
---|
211 | ); |
---|
212 | sruSearchClient = new ThrottledClient( |
---|
213 | new ClarinFCSClientBuilder() |
---|
214 | .setConnectTimeout(params.ENDPOINTS_SEARCH_TIMEOUT_MS) |
---|
215 | .setSocketTimeout(params.ENDPOINTS_SEARCH_TIMEOUT_MS) |
---|
216 | .addDefaultDataViewParsers() |
---|
217 | .enableLegacySupport() |
---|
218 | .buildThreadedClient(), |
---|
219 | params.SEARCH_MAX_CONCURRENT_REQUESTS_PER_ENDPOINT |
---|
220 | ); |
---|
221 | |
---|
222 | File corporaCacheFile = new File(params.AGGREGATOR_FILE_PATH); |
---|
223 | try { |
---|
224 | Corpora corpora = new ObjectMapper().readValue(corporaCacheFile, Corpora.class); |
---|
225 | scanCacheAtom.set(corpora); |
---|
226 | log.info("corpus list read from file; number of root corpora: " + scanCacheAtom.get().getCorpora().size()); |
---|
227 | } catch (Exception e) { |
---|
228 | log.error("Error while reading cached corpora:", e); |
---|
229 | } |
---|
230 | |
---|
231 | LanguagesISO693_3.getInstance(); // force init |
---|
232 | initTokenizer(); |
---|
233 | initLanguageDetector(); |
---|
234 | |
---|
235 | ScanCrawlTask task = new ScanCrawlTask(sruScanClient, |
---|
236 | params.CENTER_REGISTRY_URL, params.SCAN_MAX_DEPTH, |
---|
237 | params.additionalCQLEndpoints, |
---|
238 | null, scanCacheAtom, corporaCacheFile, |
---|
239 | scanStatsAtom, searchStatsAtom); |
---|
240 | scheduler.scheduleAtFixedRate(task, params.SCAN_TASK_INITIAL_DELAY, |
---|
241 | params.SCAN_TASK_INTERVAL, params.getScanTaskTimeUnit()); |
---|
242 | |
---|
243 | log.info("Aggregator initialization finished."); |
---|
244 | } |
---|
245 | |
---|
246 | public void shutdown(AggregatorConfiguration config) { |
---|
247 | log.info("Aggregator is shutting down."); |
---|
248 | for (Search search : activeSearches.values()) { |
---|
249 | search.shutdown(); |
---|
250 | } |
---|
251 | shutdownAndAwaitTermination(config.aggregatorParams, sruScanClient, scheduler); |
---|
252 | shutdownAndAwaitTermination(config.aggregatorParams, sruSearchClient, scheduler); |
---|
253 | log.info("Aggregator shutdown complete."); |
---|
254 | } |
---|
255 | |
---|
256 | // this function should be thread-safe |
---|
257 | public Search startSearch(SRUVersion version, List<Corpus> corpora, |
---|
258 | String searchString, String searchLang, int maxRecords) throws Exception { |
---|
259 | if (corpora.isEmpty()) { |
---|
260 | // No corpora |
---|
261 | return null; |
---|
262 | } else if (searchString.isEmpty()) { |
---|
263 | // No query |
---|
264 | return null; |
---|
265 | } else { |
---|
266 | Search sr = new Search(sruSearchClient, version, searchStatsAtom.get(), |
---|
267 | corpora, searchString, searchLang, 1, maxRecords); |
---|
268 | if ((activeSearches.size() % 100) == 0) { |
---|
269 | List<Long> toBeRemoved = new ArrayList<Long>(); |
---|
270 | long t0 = System.currentTimeMillis(); |
---|
271 | for (Map.Entry<Long, Search> e : activeSearches.entrySet()) { |
---|
272 | if (t0 - e.getValue().getCreatedAt() > 1800 * 1000) { |
---|
273 | toBeRemoved.add(e.getKey()); |
---|
274 | } |
---|
275 | } |
---|
276 | for (Long l : toBeRemoved) { |
---|
277 | activeSearches.remove(l); |
---|
278 | } |
---|
279 | } |
---|
280 | activeSearches.put(sr.getId(), sr); |
---|
281 | return sr; |
---|
282 | } |
---|
283 | } |
---|
284 | |
---|
285 | public Search getSearchById(Long id) { |
---|
286 | return activeSearches.get(id); |
---|
287 | } |
---|
288 | |
---|
289 | public TokenizerModel getTokenizerModel() { |
---|
290 | return tokenizerModel; |
---|
291 | } |
---|
292 | |
---|
293 | private static void shutdownAndAwaitTermination(AggregatorConfiguration.Params params, |
---|
294 | ThrottledClient sruClient, ExecutorService scheduler) { |
---|
295 | try { |
---|
296 | sruClient.shutdown(); |
---|
297 | scheduler.shutdown(); |
---|
298 | Thread.sleep(params.EXECUTOR_SHUTDOWN_TIMEOUT_MS); |
---|
299 | sruClient.shutdownNow(); |
---|
300 | scheduler.shutdownNow(); |
---|
301 | Thread.sleep(params.EXECUTOR_SHUTDOWN_TIMEOUT_MS); |
---|
302 | } catch (InterruptedException ie) { |
---|
303 | sruClient.shutdownNow(); |
---|
304 | scheduler.shutdownNow(); |
---|
305 | Thread.currentThread().interrupt(); |
---|
306 | } |
---|
307 | } |
---|
308 | |
---|
309 | private void initTokenizer() { |
---|
310 | TokenizerModel model = null; |
---|
311 | try { |
---|
312 | try (InputStream tokenizerModelDeAsIS = Thread.currentThread().getContextClassLoader().getResourceAsStream(DE_TOK_MODEL)) { |
---|
313 | model = new TokenizerModel(tokenizerModelDeAsIS); |
---|
314 | } |
---|
315 | } catch (IOException ex) { |
---|
316 | log.error("Failed to load tokenizer model", ex); |
---|
317 | } |
---|
318 | tokenizerModel = model; |
---|
319 | } |
---|
320 | |
---|
321 | public void initLanguageDetector() throws IOException { |
---|
322 | List<LanguageProfile> languageProfiles = new LanguageProfileReader().readAll(); |
---|
323 | languageDetector = LanguageDetectorBuilder |
---|
324 | .create(NgramExtractors.standard()) |
---|
325 | .withProfiles(languageProfiles) |
---|
326 | .build(); |
---|
327 | |
---|
328 | textObjectFactory = CommonTextObjectFactories.forDetectingOnLargeText(); |
---|
329 | } |
---|
330 | |
---|
331 | public String detectLanguage(String text) { |
---|
332 | return languageDetector.detect(textObjectFactory.forText(text)).orNull(); |
---|
333 | } |
---|
334 | } |
---|