source: metadata/trunk/toolkit/scripts/lrt2cmdi.py @ 2565

Last change on this file since 2565 was 2565, checked in by dietuyt, 11 years ago

Removed old version of lrt2cmdi.py, updated links to user.clarin.eu instead of www.clarin.eu

File size: 15.7 KB
Line 
1#!/usr/bin/env python
2
3# converts the CSV from the LRT inventory to nice and clean CMDI
# Dieter says: I deny the existence of this script!
5
6import csv, datetime, pdb, sys, traceback, urllib, xml.etree.ElementTree as ElementTree
7from curses.ascii import ascii
8
# Warn (but keep going) on interpreters older than the one this script was
# tested with.  The full major.minor.micro version is reported; the previous
# message dropped the major number (2.6.5 was shown as "6.5").
if sys.version_info < (2, 7) :
    sys.stderr.write("WARNING: this script was only tested with Python version 2.7.3! You are running version %d.%d.%d instead.\n" % tuple(sys.version_info[:3]))
11
class CmdiFile :
    """One CMDI record built from an XML template and written to lrt-<nodeId>.cmdi.

    The template is parsed into ``self.xmlTree``; the ``fill*`` / ``add*``
    methods populate it and ``serialize()`` writes the result to the current
    working directory.

    XPath arguments may be given in the ``//Element`` form used throughout
    this script; they are evaluated relative to the root element.
    """

    # Set to True to drop into pdb when an element cannot be filled.  Off by
    # default so that unattended runs never block on a debugger prompt.
    debugOnError = False

    # Elements that removeEmptyNodes() deletes when they hold no content.
    _REMOVABLE_WHEN_EMPTY = ("ResourceType", "BeginYearResourceCreation", "FinalizationYearResourceCreation", "Institute",
                             "DistributionType", "NonCommercialUsageOnly", "UsageReportRequired", "ModificationsRequireRedeposition", "WorkingLanguages", "Date")

    # Resource types that trigger the collection / lexicon detail sections.
    _COLLECTION_TYPES = frozenset(("Spoken Corpus", "Written Corpus", "Multimodal Corpus", "Aligned Corpus", "Treebank", "N-Gram Model",))
    _LEXICON_TYPES    = frozenset(("Grammar", "Lexicon / Knowledge Source", "Terminological Resource",))

    def __init__(self, nodeId, templatePath = "cmdi-lrt-template.xml") :
        """Load the template and fill in the creation date and self link.

        nodeId       -- identifier used in MdSelfLink and in the output file name
        templatePath -- template file to parse (defaults to the historical,
                        hard-coded file name in the working directory)
        """
        self.nodeId         = nodeId
        # ElementTree.parse opens and closes the file itself (no leaked handle).
        self.xmlTree        = ElementTree.parse(templatePath)
        self.parentmap      = self._buildParentMap()
        self.fillElement("//MdCreationDate", datetime.datetime.now().strftime("%Y-%m-%d"))
        self.fillElement("//MdSelfLink", "http://user.clarin.eu/node/%s" % nodeId)

    # ---- internal helpers -------------------------------------------------

    def _buildParentMap(self) :
        """Return a child -> parent mapping for the current tree."""
        # iter() replaces getiterator(), which newer Python versions removed.
        return dict((child, parent) for parent in self.xmlTree.iter() for child in parent)

    def _parentOf(self, element) :
        """Return the parent of element, refreshing the map if it is stale.

        Raises KeyError when the element is the root or not part of the tree.
        """
        if element not in self.parentmap :
            self.parentmap = self._buildParentMap()
        return self.parentmap[element]

    @staticmethod
    def _relativePath(XPath) :
        """Turn a leading '/' into './' so the search is explicitly root-relative.

        ElementTree performs the same rewrite internally for absolute paths,
        but emits a FutureWarning each time it does so.
        """
        if XPath.startswith("/") :
            return "." + XPath
        return XPath

    def _find(self, XPath) :
        """Return the first element matching XPath, or None."""
        return self.xmlTree.find(self._relativePath(XPath))

    @staticmethod
    def _lookupCode(codeTable, name) :
        """Return the code stored for name in codeTable.

        The name is tried as given and then UTF-8 encoded, so one helper
        serves both the country and the language tables.

        Raises KeyError when neither form is present.
        """
        if name in codeTable :
            return codeTable[name]
        try :
            encoded = name.encode("utf-8")
        except (UnicodeError, AttributeError) :
            # name cannot be encoded (already bytes): report it as unknown
            raise KeyError(name)
        return codeTable[encoded]

    # ---- generic element filling ------------------------------------------

    def fillElement(self, XPath, value) :
        """Set the text of the first element matching XPath to value.strip().

        Best effort: when the element is missing or value is not a string the
        problem and its traceback are reported on stderr and processing
        continues.  Returns None.
        """
        try :
            self._find(XPath).text = value.strip()
        except Exception :
            sys.stderr.write("Error in filling element " + XPath + "\n")
            sys.stderr.write(traceback.format_exc())

            if self.debugOnError :
                pdb.set_trace()

    def fillOptionalElement(self, XPath, value) :
        """Fill an optional element; currently behaves exactly like fillElement.

        TODO: remove the optional element from its parent instead of leaving
        it in the output when it is filled with an empty string.
        """
        return self.fillElement(XPath, value)

    def fillMultipleElement(self, elementname, xpath, values):
        """Store several values as a run of sibling elements.

        The first non-blank value goes into the element that already exists at
        xpath; every further non-blank value becomes a new <elementname>
        sibling inserted directly after it, in order.  Values are stripped and
        blank ones are skipped.

        Raises KeyError when no element matches xpath.
        """
        element = self._find(xpath)
        if element is None :
            raise KeyError("no element matches " + xpath)

        cleaned = [value.strip() for value in values if value and value.strip()]
        if not cleaned :
            return

        # fill in the already existing element
        element.text = cleaned[0]
        if len(cleaned) == 1 :
            return

        # then add siblings for the other values
        parent = self._parentOf(element)
        position = list(parent).index(element)
        for value in cleaned[1:]:
            position += 1
            newElement = ElementTree.Element(elementname)
            newElement.text = value
            parent.insert(position, newElement)

    def removeEmptyNodes(self):
        """Delete the known optional elements that ended up without content.

        An element counts as empty only when it has no text AND no child
        elements, so containers that were populated with children are kept.
        """
        # we maybe added some elements so need to recalculate the parentmap
        self.parentmap = self._buildParentMap()

        for tag in self._REMOVABLE_WHEN_EMPTY :
            for res in self.xmlTree.findall(".//%s" % tag) :
                if not res.text and len(res) == 0 :
                    self.parentmap[res].remove(res)

    def serialize(self):
        """Write the record to lrt-<nodeId>.cmdi in the working directory.

        The tree is kept namespace-free while it is being filled; the CMD
        namespace is added textually to the root start tag just before the
        single write.
        """
        import io

        self.removeEmptyNodes()
        filename            = "lrt-%s.cmdi" % self.nodeId

        buf                 = io.BytesIO()
        self.xmlTree.write(buf, encoding = "utf-8", xml_declaration = True)
        # Only the first '<CMD' (the root start tag) receives the namespace.
        content             = buf.getvalue().replace(b'<CMD', b'<CMD xmlns="http://www.clarin.eu/cmd/"', 1)

        with open(filename, 'wb') as f :
            f.write(content)

    # ---- field specific helpers -------------------------------------------

    def addFormats(self, format):
        """Fill LrtCommon/Format; a ';' or ',' separated value yields several elements.

        When both separators occur only ';' is used for splitting.
        """
        separator = ";" if ";" in format else ","
        if separator in format:
            self.fillMultipleElement("Format", "//LrtCommon/Format", format.split(separator))
        else:
            self.fillElement("//LrtCommon/Format", format)

    def addInstitutes(self, institute):
        """Fill LrtCommon/Institute from a ';' separated list.

        Names are stripped, blanks dropped and duplicates removed while the
        order of first appearance is kept.  A value without any ';' is treated
        as a single institute.
        """
        items = []
        for item in institute.split(";") :
            item = item.strip()
            if item and item not in items :
                items.append(item)

        if items :
            self.fillMultipleElement("Institute", "//LrtCommon/Institute", items)

    def addCountries(self, countryList, countries):
        """Append a Country/Code element for each '||' separated country name.

        countryList maps country names to codes; an unknown name raises
        KeyError.
        """
        countriesNode = self._find("//LrtCommon/Countries")
        for country in countries.split("||") :
            country = country.strip()
            if country:
                # look the code up first so a failure leaves no partial node
                code = self._lookupCode(countryList, country)
                newCountryNode = ElementTree.SubElement(countriesNode, "Country")
                ElementTree.SubElement(newCountryNode, "Code").text = code

    def addLanguages(self, isoList, languages, iso639Type = 3, xpath = "//LrtCommon/Languages"):
        """Append an ISO639 element for each '||' separated language name.

        isoList    -- mapping from language name to code
        iso639Type -- number used in the code element name (iso-639-<n>-code)
        xpath      -- container element that receives the ISO639 children

        The placeholder "-- language not in list --" and blank entries are
        skipped; an unknown language raises KeyError.
        """
        languagesNode = self._find(xpath)
        for language in languages.split("||") :
            language = language.strip()
            if language and language != "-- language not in list --":
                code = self._lookupCode(isoList, language)
                newLanguageNode = ElementTree.SubElement(languagesNode, "ISO639")
                ElementTree.SubElement(newLanguageNode, "iso-639-%s-code" % iso639Type).text = code

    def addResourceType(self, types, record, isoList):
        """Fill ResourceType and add the detail sections the types call for.

        types is a '||' separated list; collection-like types add the
        collection details, lexicon-like types the lexicon details.
        """
        typeList = [t.strip() for t in types.split("||")]
        self.fillMultipleElement("ResourceType", "//LrtCommon/ResourceType", typeList)
        typeSet = frozenset(typeList)

        if typeSet.intersection(self._COLLECTION_TYPES):
            self.addCollectionDetails(record, isoList)
        if typeSet.intersection(self._LEXICON_TYPES):
            self.addLexiconDetails(record, isoList)
        #if "Web Service" in typeSet:
        #    self.addServiceDetails(record)

    def addCollectionDetails(self, record, isoList):
        """Fill the LrtCollectionDetails section from record."""
        LrtCollectionDetails_XPath  = "//LrtInventoryResource/LrtCollectionDetails"

        # (element name, record key) -- each element is filled exactly once
        fields = (
            ("LongTermPreservationBy",  "field_longterm_preservation"),
            ("Location",                "field_location_0"),
            ("ContentType",             "field_content_type"),
            ("FormatDetailed",          "field_format_detailed"),
            ("Quality",                 "field_quality"),
            ("Applications",            "field_applications"),
            ("Size",                    "field_size"),
            ("DistributionForm",        "field_distribution_form"),
            ("Access",                  "field_access"),
            ("Source",                  "field_source_0"),
        )
        for element, key in fields :
            self.fillOptionalElement(LrtCollectionDetails_XPath + "/" + element, record[key])

        self.addLanguages(isoList, record["field_working_languages"], 1, LrtCollectionDetails_XPath + "/WorkingLanguages")

    def addLexiconDetails(self, record, isoList):
        """Fill the LrtLexiconDetails section from record."""
        LrtLexiconDetails_XPath     = "//LrtInventoryResource/LrtLexiconDetails"

        fields = (
            ("Date",                    "field_date_0"),
            ("Type",                    "field_type"),
            ("FormatDetailed",          "field_format_detailed_1"),
            ("SchemaReference",         "field_schema_reference"),
            ("Size",                    "field_size_0"),
            ("Access",                  "field_access_1"),
        )
        for element, key in fields :
            self.fillOptionalElement(LrtLexiconDetails_XPath + "/" + element, record[key])

        self.addLanguages(isoList, record["field_working_languages_0"], 1, LrtLexiconDetails_XPath + "/WorkingLanguages")

    def addServiceDetails(self, record):
        """Fill the LrtServiceDetails section from record (currently only Date)."""
        LrtServiceDetails_XPath  = "//LrtInventoryResource/LrtServiceDetails"

        self.fillOptionalElement(LrtServiceDetails_XPath + "/Date",                         record["field_date_0"])

    def addResourceProxy(self, link) :
        """Append a ResourceProxy whose ResourceRef is link to ResourceProxyList.

        The new proxy is filled directly, so proxies already present in the
        document are left untouched.
        """
        template = '''<ResourceProxy id="reflink">
                <ResourceType>Resource</ResourceType>
                <ResourceRef></ResourceRef>
            </ResourceProxy>'''
        partTree = ElementTree.XML(template)
        partTree.find("ResourceRef").text = link.strip()

        parent = self._find(".//ResourceProxyList")
        parent.append(partTree)

    def addTags(self, tags_string) :
        """Add one <tag> per comma separated entry, or drop <tags> when none remain.

        Entries are stripped before the blank ones are discarded, so input
        consisting only of commas and whitespace also removes the element.
        """
        tags_parent_XPath               = "//LrtInventoryResource" # One could use "/..", but that is unnecessary and can lead to mistakes.
        tags_XML_element                = self._find("//LrtInventoryResource/tags")
        assert tags_XML_element is not None, "template has no LrtInventoryResource/tags element"

        # str.strip via the instance works for byte and unicode strings alike
        tags = [tag.strip() for tag in tags_string.split(",")]
        tags = [tag for tag in tags if tag]

        if tags :
            for tag in tags :
                ElementTree.SubElement(tags_XML_element, 'tag').text = tag
        else :
            tags_parent_element         = self._find(tags_parent_XPath)
            tags_parent_element.remove(tags_XML_element)
227
def addChildNode(parent, tag, content) :
    """Append a new <tag> element with text content to parent.

    Returns the newly created element so callers can keep building on it.
    """
    node = ElementTree.SubElement(parent, tag)
    node.text = content
    return node
232
233
def parseFirstLine(l):
    """Turn the CSV header row into the list of record keys.

    The first column is only lower-cased.  Every other column that contains
    "(" is reduced to the text after its last "(" with ")" removed; the
    remaining columns get their spaces replaced by underscores.  All keys are
    lower-cased.  An empty header row yields an empty list.
    """
    if not l:
        return []

    keyList = [l[0].lower()]
    for key in l[1:]:
        if "(" in key:
            # e.g. "Description (field_description)" -> "field_description"
            key = key.split("(")[-1].replace(")", "")
        else:
            key = key.replace(" ", "_")
        keyList.append(key.lower())
    return keyList
242
243
def loadInfo(url = "http://user.clarin.eu/export_resources"):
    """Fetch the LRT inventory CSV export and return its records.

    url -- location of the export; anything urllib.urlopen (Python 2) accepts,
           so a local file such as "resources.csv" works as well.

    Returns a dict mapping the CSV row number (the header is row 0, so data
    starts at 1) to a dict of unicode field name -> unicode value.  Field
    names come from parseFirstLine() with spaces replaced by underscores.
    Blank rows are skipped.
    """
    connection = urllib.urlopen(url)
    try:
        lines = connection.readlines()
    finally:
        # always release the connection, also when reading fails
        connection.close()

    newDict = dict()
    fieldList = []
    for linenr, l in enumerate(csv.reader(lines)):
        if linenr == 0:
            # normalise the header once instead of once per cell
            fieldList = [name.replace(" ", "_").decode('utf-8') for name in parseFirstLine(l)]
        elif l:
            newDict[linenr] = dict((field, l[colnr].decode('utf-8')) for colnr, field in enumerate(fieldList))
    return newDict
263
def loadCsv(filename):
    """Load a two-column CSV into a dict keyed by the SECOND column.

    Each row "value,key" becomes dictionary[key] = value.  Rows with fewer
    than two columns (for instance blank lines) are ignored.  The file is
    opened through urllib.urlopen (Python 2).
    """
    handle = urllib.urlopen(filename)
    try:
        lines = handle.readlines()
    finally:
        handle.close()

    dictionary = dict()
    for l in csv.reader(lines):
        if len(l) >= 2:
            dictionary[l[1]] = l[0]

    return dictionary
271
272# only to be used in case we use the namespace, but as it is causing a lot of extra coding we just add an xmlns attribute in the end and ignore it
273#    def fixXpath(self, xpath):
274#        if xpath[0:2] == "//":
275#            xpath = "//{http://www.clarin.eu/cmd/}" + xpath[2:].replace("/", "/{http://www.clarin.eu/cmd/}")
276#        else:
277#            xpath = xpath.replace("/", "/{http://www.clarin.eu/cmd/}")
278#        return xpath
279
def main():
    """Convert every record of the LRT inventory export into a CMDI file.

    Loads the inventory and the country / language code tables, then writes
    one lrt-<nid>.cmdi per record into the current working directory.
    """
    infoDict = loadInfo()
    countryList = loadCsv("country_codes.csv")
    iso6393List = loadCsv("639-3-language_codes.csv")
    iso6391List = loadCsv("639-1-language_codes.csv")

    # 1-to-1 fields, easy case: (target element, record key)
    plainFields = (
        ("//LrtCommon/ResourceName",                                        "name"),
        ("//LrtCommon/Description",                                         "field_description"),
        ("//LrtCommon/ContactPerson",                                       "field_creator"),
        ("//LrtCommon/LanguagesOther",                                      "field_languages_other"),
        ("//LrtCommon/BeginYearResourceCreation",                           "field_year"),
        ("//LrtCommon/FinalizationYearResourceCreation",                    "field_end_creation_date"),
        ("//LrtCommon/MetadataLink",                                        "field_metadata_link"),
        ("//LrtCommon/Publications",                                        "field_publications"),
        ("//LrtCommon/ReferenceLink",                                       "field_reference_link"),
        ("//LrtDistributionClassification/DistributionType",                "distribution_type"),
        ("//LrtDistributionClassification/OtherDistributionRestrictions",   "other_distribution_restrictions"),
        ("//LrtIPR/EthicalReference",                                       "field_ethical_reference"),
        ("//LrtIPR/LegalReference",                                         "field_legal_reference"),
        ("//LrtIPR/LicenseType",                                            "field_license_type"),
        ("//LrtIPR/Description",                                            "field_description_0"),
        ("//LrtIPR/ContactPerson",                                          "field_contact_person"),
    )

    # flag fields: (target element, record key, text meaning true, text meaning false)
    flagFields = (
        ("//LrtCommon/ReadilyAvailable",                                        "field_resource_available",             "Yes",  "No"),
        ("//LrtDistributionClassification/ModificationsRequireRedeposition",    "modifications_require_redeposition",   "1",    "0"),
        ("//LrtDistributionClassification/NonCommercialUsageOnly",              "non-commercial_usage_only",            "1",    "0"),
        ("//LrtDistributionClassification/UsageReportRequired",                 "usage_report_required",                "1",    "0"),
    )

    for record in infoDict.values():
        print("creating lrt-%s.cmdi" % record["nid"])

        cmdi = CmdiFile(record["nid"])

        for xpath, key in plainFields:
            cmdi.fillElement(xpath, record[key])

        for xpath, key, trueText, falseText in flagFields:
            cmdi.fillElement(xpath, record[key].replace(trueText, "true").replace(falseText, "false"))

        # add a ResourceProxy for ReferenceLink
        if "http" in record["field_reference_link"]:
            cmdi.addResourceProxy(record["field_reference_link"])

        # more sophisticated (dirty) tricks needed
        cmdi.addFormats(record["field_format"])

        # org1..org4 followed by the institute field, ';' separated
        institutes = [record["org" + str(i)] for i in range(1, 5)]
        institutes.append(record["field_institute"])
        cmdi.addInstitutes(";".join(institutes))

        cmdi.addCountries(countryList, record["field_country"])

        cmdi.addLanguages(iso6393List, record["field_languages"])

        cmdi.addResourceType(record["field_resource_type"], record, iso6391List)

        cmdi.addTags(record['tags'])

        cmdi.serialize()


# Run the conversion only when executed as a script, so that importing this
# module does not trigger the download.
if __name__ == "__main__":
    main()
Note: See TracBrowser for help on using the repository browser.