source: metadata/trunk/toolkit/scripts/filter_tools_csv.R @ 2063

Last change on this file since 2063 was 2063, checked in by sanmai, 12 years ago
  • Fix <tags/> location in cmdi-lrt-template.xml so as to make it valid CMDI.
  • Small further work on the LRT tools register checking script. (See filter_tools_csv.R for filtering based on the Link checks).
File size: 2.7 KB
Line 
#!/usr/bin/Rscript

## This R script is useful to inspect the table that is put out by the
## check-tools-integrity.py script, with the aim of filtering the original
## CLARIN tools registry CSV for stale/irrelevant/problematic records.
##
## Inputs:
##   /tmp/export_tools  the tools registry export (comma-separated, with header)
##   /tmp/output.tab    the link-check results (tab-separated, with header)
## Outputs: three CSV files (complete table, records with the relevant links
## specified, records with the relevant links working).

## Write a data frame as a comma-separated file with a header row and without
## row names.
write_csv_table <- function(table, path) {
  write.table(table,
              file = path,
              sep = ",",
              row.names = FALSE,
              col.names = TRUE)
}

tools_registry <- read.csv("/tmp/export_tools",
                           check.names = FALSE,
                           header = TRUE)
checks_output_table <- read.table("/tmp/output.tab",
                                  sep = "\t",
                                  check.names = FALSE,
                                  header = TRUE)

## Append " check" to every check column name so that the check columns stay
## distinguishable from the registry columns once both tables are combined.
colnames(checks_output_table) <- paste(colnames(checks_output_table), "check")

## The registry and the check results are combined purely by row position
## below, so fail early with a clear message if they do not line up.
stopifnot(nrow(tools_registry) == nrow(checks_output_table))

## Records whose reference link has been specified at all.
records_to_be_kept <- subset(
  checks_output_table,
  `Reference link (field_tool_reference_link) check` != "unspecified"
)

## Records whose contact person should be warned.
## Because records_to_be_kept already excludes unspecified reference links,
## the first condition never matches here; only the documentation and
## webservice links can still select a record.
records_any_unspecified <- subset(
  records_to_be_kept,
  `Reference link (field_tool_reference_link) check` == "unspecified"
    | `Documentation link (field_tool_document_link) check` == "unspecified"
    | `Webservice link (field_tool_webservice_link) check` == "unspecified"
)

## Full registry with the outcome of every check appended as extra columns.
complete_extended_table <- cbind(tools_registry, checks_output_table)
write_csv_table(
  complete_extended_table,
  "/run/media/sanmai/SAMSUNG/3,MPI/export_tools__complete_extended__7-8-2012.csv"
)

## Records with (reference OR webservice link) AND documentation link specified.
records_relevant_links_specified <- subset(
  records_to_be_kept,
  (`Reference link (field_tool_reference_link) check` != "unspecified"
     | `Webservice link (field_tool_webservice_link) check` != "unspecified")
    & `Documentation link (field_tool_document_link) check` != "unspecified"
)
## subset() keeps the original row names, which are used here to pick the
## matching registry rows.
links_specified_table <- cbind(
  tools_registry[row.names(records_relevant_links_specified), ],
  records_relevant_links_specified
)
write_csv_table(
  links_specified_table,
  "/run/media/sanmai/SAMSUNG/3,MPI/export_tools__relevant_links_specified__7-8-2012.csv"
)

## Records with (reference OR webservice link) AND documentation link working.
records_relevant_links_work <- subset(
  records_to_be_kept,
  (`Reference link (field_tool_reference_link) check` == "works"
     | `Webservice link (field_tool_webservice_link) check` == "works")
    & `Documentation link (field_tool_document_link) check` == "works"
)
links_work_table <- cbind(
  tools_registry[row.names(records_relevant_links_work), ],
  records_relevant_links_work
)
write_csv_table(
  links_work_table,
  "/run/media/sanmai/SAMSUNG/3,MPI/export_tools__relevant_links_work__7-8-2012.csv"
)

## Records with a problematic reference link.
## NOTE(review): this variable was used below without ever being defined, so
## the script aborted at this point. It is defined here as "reference link
## specified but not reported as working", which matches the bar plot
## description further down -- confirm that this is the intended selection.
records_problematic <- subset(
  records_to_be_kept,
  `Reference link (field_tool_reference_link) check` != "works"
)

## NOTE(review): column 17 is presumably the reference link URL -- confirm
## against the registry export header.
URLs <- tools_registry[row.names(records_problematic), 17]

## To inspect the problematic records manually.
## edit() opens an interactive data editor, so it is skipped when the script
## runs non-interactively (e.g. through Rscript).
if (interactive()) {
  edit(records_problematic)
}

## Bar plot of frequencies of problematic Reference link values by country.
## NOTE(review): column 10 is presumably the country column -- confirm against
## the registry export header.
plot(factor(tools_registry[row.names(records_problematic), 10]))
Note: See TracBrowser for help on using the repository browser.