source: metadata/trunk/toolkit/scripts/lrt2cmdi.py @ 2615

Last change on this file since 2615 was 2615, checked in by dietuyt, 11 years ago

Changed URLs for LRT inventory to lrt.clarin.eu

File size: 15.7 KB
Line 
1#!/usr/bin/env python
2
3# converts the CSV from the LRT inventory to nice and clean CMDI
# Dieter says: I deny the existence of this script!
5
6import csv, datetime, pdb, sys, traceback, urllib, xml.etree.ElementTree as ElementTree
7from curses.ascii import ascii
8
# Warn (but keep running) on interpreters older than the one this script was tested with.
if sys.version_info < (2, 7):
    # Report the full major.minor.micro version; the previous message left out
    # the major number and so printed e.g. "6.5" for Python 2.6.5.
    running_version = ".".join(str(part) for part in sys.version_info[:3])
    sys.stderr.write("WARNING: this script was only tested with Python version 2.7.3! You are running version " + running_version + " instead.\n")
11
12class CmdiFile :
13    def __init__(self, nodeId) :
14        template            = open("cmdi-lrt-template.xml").read()
15        self.nodeId         = nodeId
16        self.xmlTree        = ElementTree.ElementTree(ElementTree.fromstring(template))
17        self.parentmap      = dict((c, p) for p in self.xmlTree.getiterator() for c in p)
18        self.fillElement("//MdCreationDate", datetime.datetime.now().strftime("%Y-%m-%d"))
19        self.fillElement("//MdSelfLink", "http://lrt.clarin.eu/node/%s" % nodeId)
20
21    def fillElement(self, XPath, value) :
22        try :
23            self.xmlTree.find(XPath).text = value.strip()
24        except :
25            print "Error in filling element " + XPath
26            print traceback.format_exc()
27
28            pdb.set_trace()
29       
30
31    def fillOptionalElement(self, XPath, value) :
32        try :
33            result = self.fillElement(XPath, value)
34        except :
35            print "Error in filling optional element " + XPath
36            print traceback.format_exc()
37
38            pdb.set_trace()
39        else :
40            return result
41
42        ### Conceptual code that should remove optional elements if they are being filled with empty strings.
43        # optional_element_parent_XPath   = XPath + "/.."
44        # optional_element_parent         = self.xmlTree.find(optional_element_parent_XPath)
45        # optional_element                = self.xmlTree.find(XPath)
46
47        # try :
48        #     assert(optional_element_parent is not None)
49        #     assert(optional_element is not None)
50        # except :
51        #     import pdb
52        #     pdb.set_trace()
53
54        # value = str(value).strip()
55        # if len(value) > 1 :
56        #     optional_element.text   = value
57        # else :
58        #     optional_element_parent.remove(optional_element)
59
60    def fillMultipleElement(self, elementname, xpath, values):
61        # fill in the already existing element
62        if (values[0]):
63            self.fillElement(xpath, values[0])
64
65        element = self.xmlTree.find(xpath)
66        parent = self.parentmap[element]
67        position = parent.getchildren().index(element)
68
69        # then add siblings for the other elements
70        for value in values[1:]:
71            if value:
72                # create new sibling of xpath (elementname) = value
73                position += 1
74                newElement = ElementTree.Element(elementname)
75                newElement.text = value.strip()
76                parent.insert(position, newElement)
77
78    def removeEmptyNodes(self):
79        # we maybe added some elements so need to recalculate the parentmap
80        self.parentmap = dict((c, p) for p in self.xmlTree.getiterator() for c in p)
81
82        removeList = ["ResourceType", "BeginYearResourceCreation", "FinalizationYearResourceCreation", "Institute", \
83                      "DistributionType", "NonCommercialUsageOnly", "UsageReportRequired", "ModificationsRequireRedeposition", "WorkingLanguages", "Date"]
84        for r in removeList:
85            results = self.xmlTree.findall("//%s" % r)
86            for res in results:
87                if not res.text:
88                    parentNode = self.parentmap[res]
89                    parentNode.remove(res)
90
91    def serialize(self):
92        self.removeEmptyNodes()
93        #print ElementTree.tostring(self.xmlTree.getroot())
94        filename            = "lrt-%s.cmdi" % self.nodeId
95        self.xmlTree.write(filename, encoding = "utf-8", xml_declaration = True)
96        f                   = open(filename, 'r+' )
97        content             = f.read().replace('<CMD', '<CMD xmlns="http://www.clarin.eu/cmd/"')
98        f.close()
99        f                   = open(filename, 'w' )
100        f.write(content)
101        f.close
102
103    def addFormats(self, format):
104        if ";" in format or "," in format:
105            if ";" in format:
106                formatItems = format.split(";")
107            else:
108                formatItems = format.split(",")
109            self.fillMultipleElement("Format", "//LrtCommon/Format", formatItems)
110        else:
111            self.fillElement("//LrtCommon/Format", format)
112
113    def addInstitutes(self, institute):
114        if ";" in institute:
115                items = institute.split(";")
116                uniqueItems = set(items) # filter out double items
117                items = [i for i in uniqueItems] # convert set back to a list
118                #print items
119
120                self.fillMultipleElement("Institute", "//LrtCommon/Institute", items)
121
122    def addCountries(self, countryList, countries):
123        countriesNode = self.xmlTree.find("//LrtCommon/Countries")
124        goodList = [c.strip() for c in countries.split("||")]
125        for country in goodList:
126            if country:
127                newCountryNode = ElementTree.Element("Country")
128                newCodeNode = ElementTree.Element("Code")
129                newCodeNode.text = countryList[country]
130                newCountryNode.append(newCodeNode)
131                countriesNode.append(newCountryNode)
132
133    def addLanguages(self, isoList, languages, iso639Type = 3, xpath = "//LrtCommon/Languages"):
134        languagesNode = self.xmlTree.find(xpath)
135        languageList = [l.strip() for l in languages.split("||")]
136        for language in languageList:
137            if language and not language == "-- language not in list --":
138                newLanguageNode = ElementTree.Element("ISO639")
139                newCodeNode = ElementTree.Element("iso-639-%s-code" % iso639Type)
140                keyLang = language.encode("utf-8")
141                newCodeNode.text = isoList[keyLang]
142                newLanguageNode.append(newCodeNode)
143                languagesNode.append(newLanguageNode)
144
145
146    def addResourceType(self, types, record, isoList):
147            typeList = [t.strip() for t in types.split("||")]
148            self.fillMultipleElement("ResourceType", "//LrtCommon/ResourceType", typeList)
149            typeList = frozenset(typeList)
150
151            collectionList  = frozenset(("Spoken Corpus", "Written Corpus", "Multimodal Corpus", "Aligned Corpus", "Treebank", "N-Gram Model",))
152            lexiconList     = frozenset(("Grammar", "Lexicon / Knowledge Source", "Terminological Resource",))
153
154            if typeList.intersection(collectionList):
155                self.addCollectionDetails(record, isoList)
156            if typeList.intersection(lexiconList):
157                self.addLexiconDetails(record, isoList)
158            #if "Web Service" in typeList:
159            #    self.addServiceDetails(record)
160
161    def addCollectionDetails(self, record, isoList):
162        LrtCollectionDetails_XPath  = "//LrtInventoryResource/LrtCollectionDetails"
163
164
165        self.fillOptionalElement(LrtCollectionDetails_XPath + "/LongTermPreservationBy",    record["field_longterm_preservation"])
166        self.fillOptionalElement(LrtCollectionDetails_XPath + "/Location",                  record["field_location_0"])
167        self.fillOptionalElement(LrtCollectionDetails_XPath + "/ContentType",               record["field_content_type"])
168        self.fillOptionalElement(LrtCollectionDetails_XPath + "/FormatDetailed",            record["field_format_detailed"])
169        self.fillOptionalElement(LrtCollectionDetails_XPath + "/Quality",                   record["field_quality"])
170        self.fillOptionalElement(LrtCollectionDetails_XPath + "/Applications",              record["field_applications"])
171        self.fillOptionalElement(LrtCollectionDetails_XPath + "/Size",                      record["field_size"])
172        self.fillOptionalElement(LrtCollectionDetails_XPath + "/DistributionForm",          record["field_distribution_form"])
173        self.fillOptionalElement(LrtCollectionDetails_XPath + "/Size",                      record["field_size"])
174        self.fillOptionalElement(LrtCollectionDetails_XPath + "/Access",                    record["field_access"])
175        self.fillOptionalElement(LrtCollectionDetails_XPath + "/Source",                    record["field_source_0"])
176
177        # ok - this can be done in a cleaner way
178        self.addLanguages(isoList, record["field_working_languages"], 1, LrtCollectionDetails_XPath + "/WorkingLanguages")
179
180    def addLexiconDetails(self, record, isoList):
181        LrtLexiconDetails_XPath     = "//LrtInventoryResource/LrtLexiconDetails"
182
183        self.fillOptionalElement(LrtLexiconDetails_XPath + "/Date",                         record["field_date_0"])
184        self.fillOptionalElement(LrtLexiconDetails_XPath + "/Type",                         record["field_type"])
185        self.fillOptionalElement(LrtLexiconDetails_XPath + "/FormatDetailed",               record["field_format_detailed_1"])
186        self.fillOptionalElement(LrtLexiconDetails_XPath + "/SchemaReference",              record["field_schema_reference"])
187        self.fillOptionalElement(LrtLexiconDetails_XPath + "/Size",                         record["field_size_0"])
188        self.fillOptionalElement(LrtLexiconDetails_XPath + "/Access",                       record["field_access_1"])
189        self.addLanguages(isoList, record["field_working_languages_0"], 1, LrtLexiconDetails_XPath + "/WorkingLanguages")
190
191    def addServiceDetails(self, record):
192        LrtLexiconDetails_XPath  = "//LrtInventoryResource/LrtServiceDetails"
193
194        self.fillOptionalElement(LrtLexiconDetails_XPath + "/Date",                         record["field_date_0"])
195
196    def addResourceProxy(self, link) :
197        template = '''<ResourceProxy id="reflink">
198                <ResourceType>Resource</ResourceType>
199                <ResourceRef></ResourceRef>
200            </ResourceProxy>'''
201        partTree = ElementTree.XML(template)
202        parent = self.xmlTree.find(".//ResourceProxyList")
203        parent.append(partTree)
204
205        # and now fill it
206        self.fillElement("//ResourceProxy/ResourceRef", link)
207
208    def addTags(self, tags_string) :
209        tags_parent_XPath               = "//LrtInventoryResource" # One could use "/..", but that is unnecessary and can lead to mistakes.
210        tags_XML_element                = self.xmlTree.find("//LrtInventoryResource/tags")
211        assert(tags_XML_element is not None)
212
213        tags = filter(None, tags_string.split(","))
214        if len(tags) > 0 :
215            # Remove whitespace left and right to tag values
216            tags                        = list(map(unicode.strip, tags)) # X- Python 3 incompatible
217            # Remove empty strings from tags list.
218            tags                        = list(filter(None, tags))
219
220            for tag in tags :
221                tag_XML_element         = ElementTree.Element('tag')
222                tag_XML_element.text    = tag
223                tags_XML_element.append(tag_XML_element)
224        else :
225            tags_parent_element         = self.xmlTree.find(tags_parent_XPath)
226            tags_parent_element.remove(tags_XML_element)
227
def addChildNode(parent, tag, content):
    """Append a new *tag* element with text *content* as last child of *parent*."""
    child = ElementTree.SubElement(parent, tag)
    child.text = content
232
233
def parseFirstLine(l):
    """Derive lower-case field keys from the CSV header row *l*.

    The first column is only lower-cased.  For every other column the text
    after the last "(" (with any ")" removed) is used when the header contains
    a parenthesis; otherwise spaces are replaced by underscores.
    """
    def fieldKey(header):
        if "(" in header:
            name = header.split("(")[-1].replace(")", "")
        else:
            name = header.replace(" ", "_")
        return name.lower()

    return [l[0].lower()] + [fieldKey(header) for header in l[1:]]
242
243
def loadInfo(url="http://lrt.clarin.eu/export_resources"):
    """Download the LRT inventory CSV export and return it as nested dicts.

    The first CSV row is the header and is turned into field keys by
    parseFirstLine().  Every following non-empty row becomes a dict that maps
    field key -> cell value (both decoded from UTF-8); the rows are stored
    under their line number, starting at 1 for the first data row.

    *url* defaults to the live inventory export; anything urllib.urlopen()
    accepts may be passed instead.  A non-empty row with fewer cells than the
    header raises IndexError.
    """
    response = urllib.urlopen(url)
    try:
        lines = response.readlines()
    finally:
        # The response was previously left open.
        response.close()

    newDict = dict()
    fieldList = []
    for linenr, row in enumerate(csv.reader(lines)):
        if linenr == 0:
            # Normalise the header names once instead of once per row.
            fieldList = [field.replace(" ", "_").decode('utf-8') for field in parseFirstLine(row)]
            continue
        if not row:
            # csv.reader yields [] for blank lines; they carry no record.
            continue
        newDict[linenr] = dict((fieldList[colnr], row[colnr].decode('utf-8')) for colnr in range(len(fieldList)))
    return newDict
263
def loadCsv(filename):
    """Load a two-column CSV into a dict mapping column 2 -> column 1.

    *filename* is opened with urllib.urlopen().  Rows with fewer than two
    cells (e.g. blank lines) are skipped.  When a key occurs more than once
    the last row wins.
    """
    source = urllib.urlopen(filename)
    try:
        lines = source.readlines()
    finally:
        # The handle was previously left open.
        source.close()

    dictionary = dict()
    for row in csv.reader(lines):
        if len(row) < 2:
            continue
        dictionary[row[1]] = row[0]

    return dictionary
271
272# only to be used in case we use the namespace, but as it is causing a lot of extra coding we just add an xmlns attribute in the end and ignore it
273#    def fixXpath(self, xpath):
274#        if xpath[0:2] == "//":
275#            xpath = "//{http://www.clarin.eu/cmd/}" + xpath[2:].replace("/", "/{http://www.clarin.eu/cmd/}")
276#        else:
277#            xpath = xpath.replace("/", "/{http://www.clarin.eu/cmd/}")
278#        return xpath
279
def main():
    """Convert every record of the LRT inventory export into a CMDI file.

    Loads the inventory and the country / language code tables, then builds
    and serializes one CmdiFile per record, announcing each file on stdout.
    """
    infoDict = loadInfo()
    countryList = loadCsv("country_codes.csv")
    iso6393List = loadCsv("639-3-language_codes.csv")
    iso6391List = loadCsv("639-1-language_codes.csv")

    def yesNoToBoolean(text):
        return text.replace("Yes", "true").replace("No", "false")

    def flagToBoolean(text):
        return text.replace("1", "true").replace("0", "false")

    # 1-to-1 fields, easy case: (target XPath, record key, optional value conversion)
    simpleFields = (
        ("//LrtCommon/ResourceName",                                        "name",                                 None),
        ("//LrtCommon/Description",                                         "field_description",                    None),
        ("//LrtCommon/ContactPerson",                                       "field_creator",                        None),
        ("//LrtCommon/LanguagesOther",                                      "field_languages_other",                None),
        ("//LrtCommon/BeginYearResourceCreation",                           "field_year",                           None),
        ("//LrtCommon/FinalizationYearResourceCreation",                    "field_end_creation_date",              None),
        ("//LrtCommon/MetadataLink",                                        "field_metadata_link",                  None),
        ("//LrtCommon/Publications",                                        "field_publications",                   None),
        ("//LrtCommon/ReadilyAvailable",                                    "field_resource_available",             yesNoToBoolean),
        ("//LrtCommon/ReferenceLink",                                       "field_reference_link",                 None),
        ("//LrtDistributionClassification/DistributionType",                "distribution_type",                    None),
        ("//LrtDistributionClassification/ModificationsRequireRedeposition", "modifications_require_redeposition",  flagToBoolean),
        ("//LrtDistributionClassification/NonCommercialUsageOnly",          "non-commercial_usage_only",            flagToBoolean),
        ("//LrtDistributionClassification/UsageReportRequired",             "usage_report_required",                flagToBoolean),
        ("//LrtDistributionClassification/OtherDistributionRestrictions",   "other_distribution_restrictions",      None),
        ("//LrtIPR/EthicalReference",                                       "field_ethical_reference",              None),
        ("//LrtIPR/LegalReference",                                         "field_legal_reference",                None),
        ("//LrtIPR/LicenseType",                                            "field_license_type",                   None),
        ("//LrtIPR/Description",                                            "field_description_0",                  None),
        ("//LrtIPR/ContactPerson",                                          "field_contact_person",                 None),
    )

    for record in infoDict.values():
        nodeId = record["nid"]
        print("creating lrt-%s.cmdi" % nodeId)

        cmdi = CmdiFile(nodeId)

        for xpath, key, convert in simpleFields:
            value = record[key]
            if convert is not None:
                value = convert(value)
            cmdi.fillElement(xpath, value)

        # add a ResourceProxy for ReferenceLink
        referenceLink = record["field_reference_link"]
        if "http" in referenceLink:
            cmdi.addResourceProxy(referenceLink)

        # more sophisticated (dirty) tricks needed
        cmdi.addFormats(record["field_format"])

        # org1..org4 followed by field_institute, joined with ";"
        institutes = [record["org" + str(i)] for i in range(1, 5)]
        institutes.append(record["field_institute"])
        cmdi.addInstitutes(";".join(institutes))

        cmdi.addCountries(countryList, record["field_country"])
        cmdi.addLanguages(iso6393List, record["field_languages"])
        cmdi.addResourceType(record["field_resource_type"], record, iso6391List)
        cmdi.addTags(record['tags'])

        cmdi.serialize()
336
# Script entry point: start the conversion.
# NOTE(review): this call is unconditional, so importing the module runs the
# conversion too - consider an `if __name__ == "__main__":` guard.
main()
Note: See TracBrowser for help on using the repository browser.