1 | #!/usr/bin/env python |
---|
2 | |
---|
3 | # converts the CSV from the LRT inventory to nice and clean CMDI |
---|
4 | # Dieter says: I deny the existence of this script! |
---|
5 | |
---|
6 | import csv, datetime, pdb, sys, traceback, urllib, xml.etree.ElementTree as ElementTree |
---|
7 | from curses.ascii import ascii |
---|
8 | |
---|
# Warn (but keep running) on interpreters older than the tested one.
# The full major.minor.micro version is reported; the previous message left
# out the major version and printed e.g. "6.5" for Python 2.6.5.
if sys.version_info < (2, 7):
    sys.stderr.write("WARNING: this script was only tested with Python version 2.7.3! You are running version " + ".".join(str(part) for part in sys.version_info[:3]) + " instead.\n")
---|
11 | |
---|
class CmdiFile:
    """One CMDI record, built by filling in a copy of the LRT template.

    The template is read from ``cmdi-lrt-template.xml`` in the current working
    directory and parsed without namespaces; ``serialize`` writes the result to
    ``lrt-<nodeId>.cmdi`` and adds the CMD namespace declaration afterwards.

    Paths passed to the fill methods are ElementTree path expressions. Paths
    starting with "/" are accepted because ``ElementTree.find`` prefixes them
    with "." (emitting a FutureWarning).
    """

    def __init__(self, nodeId):
        """Load the template and fill in the creation date and self link.

        nodeId -- identifier used in the MdSelfLink URL and the output filename.
        """
        # Read as bytes and close the handle right away (it used to be leaked).
        with open("cmdi-lrt-template.xml", "rb") as templateFile:
            template = templateFile.read()
        self.nodeId = nodeId
        self.xmlTree = ElementTree.ElementTree(ElementTree.fromstring(template))
        self.parentmap = self._buildParentMap()
        self.fillElement("//MdCreationDate", datetime.datetime.now().strftime("%Y-%m-%d"))
        self.fillElement("//MdSelfLink", "http://user.clarin.eu/node/%s" % nodeId)

    def _buildParentMap(self):
        """Return a dict mapping every element of the tree to its parent."""
        # Element.iter() replaces the deprecated getiterator().
        return dict((child, parent)
                    for parent in self.xmlTree.getroot().iter()
                    for child in parent)

    def fillElement(self, XPath, value):
        """Set the text of the first element matching XPath to value.strip().

        Any failure (no matching element, value without ``strip``) is reported
        on stdout with a traceback and then drops into the debugger; it is not
        propagated to the caller.
        """
        try:
            self.xmlTree.find(XPath).text = value.strip()
        except Exception:
            # Exception rather than a bare except, so Ctrl-C and SystemExit
            # are no longer swallowed.
            print("Error in filling element " + XPath)
            print(traceback.format_exc())

            pdb.set_trace()

    def fillOptionalElement(self, XPath, value):
        """Fill an optional element; currently the same as fillElement.

        Returns whatever fillElement returns (None).

        TODO: optional elements that end up empty should be removed from their
        parent here; at the moment only removeEmptyNodes() does such a cleanup,
        and only for a fixed list of element names.
        """
        try:
            return self.fillElement(XPath, value)
        except Exception:
            print("Error in filling optional element " + XPath)
            print(traceback.format_exc())

            pdb.set_trace()

    def fillMultipleElement(self, elementname, xpath, values):
        """Store several values as consecutive sibling elements.

        The first value goes into the element already present at xpath; every
        further non-empty value becomes a new ``elementname`` element inserted
        directly after it, in order. Empty values are skipped.
        """
        # fill in the already existing element
        if values and values[0]:
            self.fillElement(xpath, values[0])

        element = self.xmlTree.find(xpath)
        parent = self.parentmap[element]
        # list(parent) replaces the deprecated getchildren().
        position = list(parent).index(element)

        # then add siblings for the other elements
        for value in values[1:]:
            if value:
                position += 1
                newElement = ElementTree.Element(elementname)
                newElement.text = value.strip()
                parent.insert(position, newElement)

    def removeEmptyNodes(self):
        """Remove the listed elements wherever they are left without content."""
        # we maybe added some elements so need to recalculate the parentmap
        self.parentmap = self._buildParentMap()

        removeList = ["ResourceType", "BeginYearResourceCreation", "FinalizationYearResourceCreation", "Institute",
                      "DistributionType", "NonCommercialUsageOnly", "UsageReportRequired", "ModificationsRequireRedeposition", "WorkingLanguages", "Date"]
        for name in removeList:
            for res in self.xmlTree.findall("//%s" % name):
                # An element that has children is not empty even when it has no
                # text of its own (e.g. WorkingLanguages holding ISO639 nodes).
                if not res.text and len(res) == 0:
                    self.parentmap[res].remove(res)

    def serialize(self):
        """Clean up the tree and write it to ``lrt-<nodeId>.cmdi``."""
        self.removeEmptyNodes()
        filename = "lrt-%s.cmdi" % self.nodeId
        self.xmlTree.write(filename, encoding="utf-8", xml_declaration=True)

        # The tree is handled without namespaces, so the CMD namespace is added
        # to the root start tag textually. Only the first "<CMD" is touched so
        # that any other tag starting with "CMD" is left alone.
        with open(filename, "rb") as f:
            content = f.read()
        content = content.replace(b'<CMD', b'<CMD xmlns="http://www.clarin.eu/cmd/"', 1)
        with open(filename, "wb") as f:
            f.write(content)

    def addFormats(self, format):
        """Fill //LrtCommon/Format from a ';'- or ','-separated string.

        ';' takes precedence: when it is present, commas are not split on.
        """
        for separator in (";", ","):
            if separator in format:
                self.fillMultipleElement("Format", "//LrtCommon/Format", format.split(separator))
                return
        self.fillElement("//LrtCommon/Format", format)

    def addInstitutes(self, institute):
        """Fill //LrtCommon/Institute from a ';'-separated string.

        Names are stripped, empty entries dropped and duplicates removed while
        keeping first-seen order, so the output is deterministic. A single
        name without any ';' is stored as well.
        """
        items = []
        for name in institute.split(";"):
            name = name.strip()
            if name and name not in items:
                items.append(name)

        if items:
            self.fillMultipleElement("Institute", "//LrtCommon/Institute", items)

    def addCountries(self, countryList, countries):
        """Append a Country/Code node for every '||'-separated country name.

        countryList -- dict mapping country name to code; an unknown name
                       raises KeyError.
        """
        countriesNode = self.xmlTree.find("//LrtCommon/Countries")
        for country in (c.strip() for c in countries.split("||")):
            if country:
                newCountryNode = ElementTree.Element("Country")
                newCodeNode = ElementTree.Element("Code")
                newCodeNode.text = countryList[country]
                newCountryNode.append(newCodeNode)
                countriesNode.append(newCountryNode)

    def addLanguages(self, isoList, languages, iso639Type = 3, xpath = "//LrtCommon/Languages"):
        """Append an ISO639 node for every '||'-separated language name.

        isoList    -- dict mapping the UTF-8 encoded language name to its code;
                      an unknown name raises KeyError.
        iso639Type -- number used in the code element name
                      (``iso-639-<iso639Type>-code``).
        xpath      -- container element that receives the new nodes.

        Empty entries and the placeholder "-- language not in list --" are
        skipped.
        """
        languagesNode = self.xmlTree.find(xpath)
        for language in (l.strip() for l in languages.split("||")):
            if language and not language == "-- language not in list --":
                newLanguageNode = ElementTree.Element("ISO639")
                newCodeNode = ElementTree.Element("iso-639-%s-code" % iso639Type)
                # the lookup table is keyed by encoded names
                keyLang = language.encode("utf-8")
                newCodeNode.text = isoList[keyLang]
                newLanguageNode.append(newCodeNode)
                languagesNode.append(newLanguageNode)

    def addResourceType(self, types, record, isoList):
        """Fill ResourceType and the detail section(s) the types call for.

        types   -- '||'-separated resource type names.
        record  -- the CSV record; passed on to the detail methods.
        isoList -- language lookup table for the WorkingLanguages elements.
        """
        typeList = [t.strip() for t in types.split("||")]
        self.fillMultipleElement("ResourceType", "//LrtCommon/ResourceType", typeList)
        typeList = frozenset(typeList)

        collectionList = frozenset(("Spoken Corpus", "Written Corpus", "Multimodal Corpus", "Aligned Corpus", "Treebank", "N-Gram Model",))
        lexiconList = frozenset(("Grammar", "Lexicon / Knowledge Source", "Terminological Resource",))

        if typeList.intersection(collectionList):
            self.addCollectionDetails(record, isoList)
        if typeList.intersection(lexiconList):
            self.addLexiconDetails(record, isoList)
        # "Web Service" resources are not handled yet; addServiceDetails() is
        # deliberately not called here.

    def addCollectionDetails(self, record, isoList):
        """Fill the LrtCollectionDetails section from record."""
        LrtCollectionDetails_XPath = "//LrtInventoryResource/LrtCollectionDetails"

        # (element name, CSV column); Size used to be filled twice.
        fields = (
            ("LongTermPreservationBy", "field_longterm_preservation"),
            ("Location", "field_location_0"),
            ("ContentType", "field_content_type"),
            ("FormatDetailed", "field_format_detailed"),
            ("Quality", "field_quality"),
            ("Applications", "field_applications"),
            ("Size", "field_size"),
            ("DistributionForm", "field_distribution_form"),
            ("Access", "field_access"),
            ("Source", "field_source_0"),
        )
        for element, column in fields:
            self.fillOptionalElement(LrtCollectionDetails_XPath + "/" + element, record[column])

        self.addLanguages(isoList, record["field_working_languages"], 1, LrtCollectionDetails_XPath + "/WorkingLanguages")

    def addLexiconDetails(self, record, isoList):
        """Fill the LrtLexiconDetails section from record."""
        LrtLexiconDetails_XPath = "//LrtInventoryResource/LrtLexiconDetails"

        fields = (
            ("Date", "field_date_0"),
            ("Type", "field_type"),
            ("FormatDetailed", "field_format_detailed_1"),
            ("SchemaReference", "field_schema_reference"),
            ("Size", "field_size_0"),
            ("Access", "field_access_1"),
        )
        for element, column in fields:
            self.fillOptionalElement(LrtLexiconDetails_XPath + "/" + element, record[column])

        self.addLanguages(isoList, record["field_working_languages_0"], 1, LrtLexiconDetails_XPath + "/WorkingLanguages")

    def addServiceDetails(self, record):
        """Fill the LrtServiceDetails section (only Date) from record."""
        LrtServiceDetails_XPath = "//LrtInventoryResource/LrtServiceDetails"

        self.fillOptionalElement(LrtServiceDetails_XPath + "/Date", record["field_date_0"])

    def addResourceProxy(self, link):
        """Append a ResourceProxy pointing at link to the ResourceProxyList."""
        template = '''<ResourceProxy id="reflink">
            <ResourceType>Resource</ResourceType>
            <ResourceRef></ResourceRef>
        </ResourceProxy>'''
        partTree = ElementTree.XML(template)
        # Fill the new proxy itself; searching the whole tree for
        # ResourceProxy/ResourceRef would hit an earlier proxy if one exists.
        partTree.find("ResourceRef").text = link.strip()
        parent = self.xmlTree.find(".//ResourceProxyList")
        parent.append(partTree)

    def addTags(self, tags_string):
        """Add one ``tag`` child per comma-separated tag, or drop ``tags``.

        Tags are stripped and empty entries ignored. When nothing remains, the
        ``tags`` element is removed from LrtInventoryResource instead of being
        left behind empty.
        """
        tags_parent_XPath = "//LrtInventoryResource"  # One could use "/..", but that is unnecessary and can lead to mistakes.
        tags_XML_element = self.xmlTree.find("//LrtInventoryResource/tags")
        assert(tags_XML_element is not None)

        # Strip first, then discard empties, so that input consisting only of
        # whitespace and commas counts as "no tags".
        tags = [tag.strip() for tag in tags_string.split(",")]
        tags = [tag for tag in tags if tag]

        if tags:
            for tag in tags:
                tag_XML_element = ElementTree.Element('tag')
                tag_XML_element.text = tag
                tags_XML_element.append(tag_XML_element)
        else:
            tags_parent_element = self.xmlTree.find(tags_parent_XPath)
            tags_parent_element.remove(tags_XML_element)
---|
227 | |
---|
def addChildNode(parent, tag, content):
    """Append a new element named ``tag`` with text ``content`` to ``parent``."""
    # SubElement creates the element and appends it to parent in one step.
    ElementTree.SubElement(parent, tag).text = content
---|
232 | |
---|
233 | |
---|
def parseFirstLine(l):
    """Turn the CSV header row into a list of lower-case field names.

    The first column is only lower-cased. For every later column: when the
    label contains "(", the name is whatever follows the last "(" with all ")"
    removed; otherwise the label is used with spaces replaced by underscores.
    """
    def fieldName(label):
        if "(" in label:
            name = label.split("(")[-1].replace(")", "")
        else:
            name = label.replace(" ", "_")
        return name.lower()

    return [l[0].lower()] + [fieldName(label) for label in l[1:]]
---|
242 | |
---|
243 | |
---|
def loadInfo(url="http://user.clarin.eu/export_resources"):
    """Download the inventory CSV export and return its records.

    url -- location of the CSV export; defaults to the CLARIN inventory export.

    The first row is the header and is normalized with parseFirstLine().
    Returns a dict that maps the row number (1 for the first data row) to a
    dict of unicode field name -> unicode field value. Blank rows are skipped;
    a non-blank row with fewer columns than the header raises IndexError.
    """
    source = urllib.urlopen(url)
    try:
        lines = source.readlines()
    finally:
        # the connection used to be left open
        source.close()

    fieldList = []
    newDict = dict()
    for linenr, row in enumerate(csv.reader(lines)):
        if linenr == 0:
            fieldList = parseFirstLine(row)
            continue
        if not row:
            # csv.reader yields [] for an empty line
            continue
        newDict[linenr] = dict(
            (field.replace(" ", "_").decode('utf-8'), row[colnr].decode('utf-8'))
            for colnr, field in enumerate(fieldList))
    return newDict
---|
263 | |
---|
def loadCsv(filename):
    """Read a two-column CSV lookup table into a dict.

    filename -- anything urllib.urlopen() accepts.

    Returns a dict that maps the second column of each row to the first
    column. Blank rows are skipped; a non-blank row with fewer than two
    columns raises IndexError.
    """
    source = urllib.urlopen(filename)
    try:
        lines = source.readlines()
    finally:
        # the handle used to be left open
        source.close()

    dictionary = dict()
    for row in csv.reader(lines):
        if not row:
            # csv.reader yields [] for an empty line
            continue
        dictionary[row[1]] = row[0]

    return dictionary
---|
271 | |
---|
272 | # only to be used in case we use the namespace, but as it is causing a lot of extra coding we just add an xmlns attribute in the end and ignore it |
---|
273 | # def fixXpath(self, xpath): |
---|
274 | # if xpath[0:2] == "//": |
---|
275 | # xpath = "//{http://www.clarin.eu/cmd/}" + xpath[2:].replace("/", "/{http://www.clarin.eu/cmd/}") |
---|
276 | # else: |
---|
277 | # xpath = xpath.replace("/", "/{http://www.clarin.eu/cmd/}") |
---|
278 | # return xpath |
---|
279 | |
---|
def main():
    """Convert every record of the inventory export into its own CMDI file."""
    infoDict = loadInfo()
    countryList = loadCsv("country_codes.csv")
    iso6393List = loadCsv("639-3-language_codes.csv")
    iso6391List = loadCsv("639-1-language_codes.csv")

    # Textual substitutions applied to a value before it is stored.
    asIs = ()
    yesNo = (("Yes", "true"), ("No", "false"))
    oneZero = (("1", "true"), ("0", "false"))

    # 1-to-1 fields, easy case: (target XPath, CSV column, substitutions)
    simpleFields = (
        ("//LrtCommon/ResourceName", "name", asIs),
        ("//LrtCommon/Description", "field_description", asIs),
        ("//LrtCommon/ContactPerson", "field_creator", asIs),
        ("//LrtCommon/LanguagesOther", "field_languages_other", asIs),
        ("//LrtCommon/BeginYearResourceCreation", "field_year", asIs),
        ("//LrtCommon/FinalizationYearResourceCreation", "field_end_creation_date", asIs),
        ("//LrtCommon/MetadataLink", "field_metadata_link", asIs),
        ("//LrtCommon/Publications", "field_publications", asIs),
        ("//LrtCommon/ReadilyAvailable", "field_resource_available", yesNo),
        ("//LrtCommon/ReferenceLink", "field_reference_link", asIs),

        ("//LrtDistributionClassification/DistributionType", "distribution_type", asIs),
        ("//LrtDistributionClassification/ModificationsRequireRedeposition", "modifications_require_redeposition", oneZero),
        ("//LrtDistributionClassification/NonCommercialUsageOnly", "non-commercial_usage_only", oneZero),
        ("//LrtDistributionClassification/UsageReportRequired", "usage_report_required", oneZero),
        ("//LrtDistributionClassification/OtherDistributionRestrictions", "other_distribution_restrictions", asIs),

        ("//LrtIPR/EthicalReference", "field_ethical_reference", asIs),
        ("//LrtIPR/LegalReference", "field_legal_reference", asIs),
        ("//LrtIPR/LicenseType", "field_license_type", asIs),
        ("//LrtIPR/Description", "field_description_0", asIs),
        ("//LrtIPR/ContactPerson", "field_contact_person", asIs),
    )

    for record in infoDict.values():
        print("creating lrt-%s.cmdi" % record["nid"])

        cmdi = CmdiFile(record["nid"])

        for xpath, column, substitutions in simpleFields:
            value = record[column]
            for old, new in substitutions:
                value = value.replace(old, new)
            cmdi.fillElement(xpath, value)

        # add a ResourceProxy for ReferenceLink
        referenceLink = record["field_reference_link"]
        if "http" in referenceLink:
            cmdi.addResourceProxy(referenceLink)

        # more sophisticated (dirty) tricks needed
        cmdi.addFormats(record["field_format"])

        # org1..org4 followed by field_institute, separated by ";"
        institutes = [record["org" + str(i)] for i in range(1, 5)]
        institutes.append(record["field_institute"])
        cmdi.addInstitutes(";".join(institutes))

        cmdi.addCountries(countryList, record["field_country"])
        cmdi.addLanguages(iso6393List, record["field_languages"])
        cmdi.addResourceType(record["field_resource_type"], record, iso6391List)
        cmdi.addTags(record['tags'])

        cmdi.serialize()
---|
336 | |
---|
# Run the conversion only when executed as a script, so that importing this
# module does not start downloading and writing files.
if __name__ == "__main__":
    main()
---|