Context Navigation

← Previous Changeset
Next Changeset →

Changeset 1617

Timestamp:

11/10/11 12:35:07 (13 years ago)

Author:

dietuyt

Message:

Now makes .cmdi files that comply with the new default namespace.

Location:

metadata/trunk/toolkit/scripts

Files:

2 edited

cmdi-lrt-template.xml (modified) (1 diff)
lrt2cmdi.py (modified) (20 diffs)

Legend:

(no marker): Unmodified
+: Added
-: Removed

metadata/trunk/toolkit/scripts/cmdi-lrt-template.xml

-                      r1432
+                      r1617
 <?xml version="1.0" encoding="UTF-8"?>
-<CMD xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
-    xsi:schemaLocation="http://www.clarin.eu/cmd http://catalog.clarin.eu/ds/ComponentRegistry/rest/registry/profiles/clarin.eu:cr1:p_1288172614070/xsd">
+<CMD CMDVersion="1.1" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
+    xsi:schemaLocation="http://www.clarin.eu/cmd http://catalog.clarin.eu/ds/ComponentRegistry/rest/registry/profiles/clarin.eu:cr1:p_1289827960126/xsd">
     <Header>
         <MdCreator>lrt2cmdi.py</MdCreator>
         <MdCreationDate/>
         <MdSelfLink/>
-        <MdProfile>clarin.eu:cr1:p_1288172614070</MdProfile>
+        <MdProfile>clarin.eu:cr1:p_1289827960126</MdProfile>
         <MdCollectionDisplayName>CLARIN LRT inventory</MdCollectionDisplayName>
     </Header>

metadata/trunk/toolkit/scripts/lrt2cmdi.py

-                      r1042
+                      r1617
 # converts the CSV from the LRT inventory to nice and clean CMDI
 # Dieter says: I deny the existance of this script!
+# Dieter says: I deny the existance of this script!
 import urllib, csv, datetime, xml.etree.ElementTree as ElementTree
 …
         self.nodeId = nodeId
         self.xmlTree = ElementTree.ElementTree(ElementTree.fromstring(template))
-        # create dict with links to parent node for each node (= key)
         self.parentmap = dict((c, p) for p in self.xmlTree.getiterator() for c in p)
         self.fillElement("//MdCreationDate", datetime.datetime.now().strftime("%Y-%m-%d"))
         self.fillElement("//MdSelfLink", "http://www.clarin.eu/node/%s" % nodeId)
     def fillElement(self, xpath, value):
+        #print "fill %s with %s" % (xpath, value)
+        self.xmlTree.find(xpath).text = value.strip()
+        self.xmlTree.find(xpath).text = value.strip()
     def fillMultipleElement(self, elementname, xpath, values):
         # fill in the already existing element
-        #print values
         if (values[0]):
-            #print "first one", values[0]
-            #print values[0]
             self.fillElement(xpath, values[0])
+            #print "fill %s with %s" % (xpath, values[0])
+        #print
         element = self.xmlTree.find(xpath)
         parent = self.parentmap[element]
         position = parent.getchildren().index(element)
         # then add siblings for the other elements
         for value in values[1:]:
             if value:
                 # create new sibling of xpath (elementname) = value
-                #print value
-                #print "next one", value
                 position += 1
                 newElement = ElementTree.Element(elementname)
                 newElement.text = value.strip()
                 parent.insert(position, newElement)
     def removeEmptyNodes(self):
         removeList = ["ResourceType", "BeginYearResourceCreation", "FinalizationYearResourceCreation", "Institute"]
 …
                     parentNode = self.parentmap[res]
                     parentNode.remove(res)
     def serialize(self):
+    def serialize(self):
         self.removeEmptyNodes()
+        #print ElementTree.tostring(self.xmlTree.getroot())
         filename = "lrt-%s.cmdi" % self.nodeId
         self.xmlTree.write(filename, encoding="utf-8")
+        self.xmlTree.write(filename, encoding="utf-8", xml_declaration=True)
         f = open(filename, 'r+' )
         content = f.read().replace('<CMD', '<?xml version="1.0" encoding="UTF-8"?>\n<CMD')
+        content = f.read().replace('<CMD', '<CMD xmlns="http://www.clarin.eu/cmd/"')
         f.close()
         f = open(filename, 'w' )
-        #print content
         f.write(content)
         f.close
     def addFormats(self, format):
         if ";" in format or "," in format:
             if ";" in format:
                 formatItems = format.split(";")
+                formatItems = format.split(";")
             else:
                 formatItems = format.split(",")
 …
         else:
             self.fillElement("//LrtCommon/Format", format)
     def addInstitutes(self, institute):
         if ";" in institute:
                 items = institute.split(";")
+                items = institute.split(";")
                 uniqueItems = set(items) # filter out double items
                 items = [i for i in uniqueItems] # convert set back to a list
                 #print items
                 self.fillMultipleElement("Institute", "//LrtCommon/Institute", items)
     def addCountries(self, countryList, countries):
         countriesNode = self.xmlTree.find("//LrtCommon/Countries")
 …
                 newCountryNode.append(newCodeNode)
                 countriesNode.append(newCountryNode)
     def addLanguages(self, isoList, languages, iso639Type=3, xpath="//LrtCommon/Languages"):
         languagesNode = self.xmlTree.find(xpath)
 …
                 newLanguageNode.append(newCodeNode)
                 languagesNode.append(newLanguageNode)
     def addResourceType(self, types, record, isoList):
             typeList = [t.strip() for t in types.split("||")]
             self.fillMultipleElement("ResourceType", "//LrtCommon/ResourceType", typeList)
             collectionList = ["Written Corpus","Multimodal Corpus","Aligned Corpus","Treebank","N-Gram Model"]
             lexiconList = ["Lexicon / Knowledge Source","Terminological Resource"]
             if set(typeList).intersection(set(collectionList)):
                 self.addCollectionDetails(record, isoList)
 …
                 self.addLexiconDetails(record, isoList)
             #if "Web Service" in typeList:
             #    self.addServiceDetails(record)
+            #    self.addServiceDetails(record)
     def addCollectionDetails(self, record, isoList):
         # add the relevant XML subtree
 …
         self.fillElement("//LrtCollectionDetails/Access", record["field_access"])
         self.fillElement("//LrtCollectionDetails/Source", record["field_source_0"])
         # ok - this can be done in a cleaner way
         self.addLanguages(isoList, record["field_working_languages"], 1, "//LrtCollectionDetails/WorkingLanguages")
     def addLexiconDetails(self, record, isoList):
         template = '''<LrtLexiconDetails>
 …
         parent = self.xmlTree.find("//LrtInventoryResource")
         parent.append(partTree)
         # and now fill it
         self.fillElement("//LrtLexiconDetails/Date", record["field_date_0"])
 …
         self.fillElement("//LrtLexiconDetails/Size", record["field_size_0"])
         self.fillElement("//LrtLexiconDetails/Access", record["field_access_1"])
         self.addLanguages(isoList, record["field_working_languages_0"], 1, "//LrtLexiconDetails/WorkingLanguages")
     def addServiceDetails(self, record):
         template = '''<LrtServiceDetails>
 …
         parent = self.xmlTree.find("//LrtInventoryResource")
         parent.append(partTree)
         # and now fill it
         self.fillElement("//LrtLexiconDetails/Date", record["field_date_0"])
     def addResourceProxy(self, link):
         template = '''<ResourceProxy id="reflink">
 …
                 <ResourceRef></ResourceRef>
             </ResourceProxy>'''
         partTree = ElementTree.fromstring(template)
         parent = self.xmlTree.find("//ResourceProxyList")
+        partTree = ElementTree.XML(template)
+        parent = self.xmlTree.find(".//ResourceProxyList")
         parent.append(partTree)
         # and now fill it
         self.fillElement("//ResourceProxy/ResourceRef", link)
 def addChildNode(parent, tag, content):
 …
 def parseFirstLine(l):
     keyList = [l[0].lower()]
+    keyList = [l[0].lower()]
     for key in l[1:]:
         if "(" in key:
 …
             keyList.append(key.replace(" ", "_").lower())
     return keyList
 def loadInfo():
     csvFile = csv.reader(urllib.urlopen("http://www.clarin.eu/export_resources").readlines())
+    #csvFile = csv.reader(urllib.urlopen("resources.csv").readlines())
     #csvFile =[l.decode('utf-8') for l in rawCsvFile]
+    #csvFile = csv.reader(urllib.urlopen("resources.csv").readlines())
     linenr = 0
     newDict = dict()
 …
             for field in fieldList:
                 newDict[linenr][fieldList[colnr].replace(" ", "_").decode('utf-8')] = l[colnr].decode('utf-8')
                 colnr += 1
+                colnr += 1
         linenr += 1
     return newDict
+    return newDict
 def loadCsv(filename):
 …
     return dictionary
+# only to be used in case we use the namespace, but as it is causing a lot of extra coding we just add an xmlns attribute in the end and ignore it
+#    def fixXpath(self, xpath):
+#        if xpath[0:2] == "//":
+#            xpath = "//{http://www.clarin.eu/cmd/}" + xpath[2:].replace("/", "/{http://www.clarin.eu/cmd/}")
+#        else:
+#            xpath = xpath.replace("/", "/{http://www.clarin.eu/cmd/}")
+#        return xpath
 def main():
 …
     iso6393List = loadCsv("639-3-language_codes.csv")
     iso6391List = loadCsv("639-1-language_codes.csv")
     for record in infoDict.values():
         print "creating lrt-%s.cmdi" % record["nid"]
         cmdi = CmdiFile(record["nid"])
         # 1-to-1 fields, easy case
         cmdi.fillElement("//LrtCommon/ResourceName", record["name"])
 …
         cmdi.fillElement("//LrtDistributionClassification/UsageReportRequired", record["usage_report_required"].replace("1","true").replace("0","false"))
         cmdi.fillElement("//LrtDistributionClassification/OtherDistributionRestrictions", record["other_distribution_restrictions"])
         cmdi.fillElement("//LrtIPR/EthicalReference", record["field_ethical_reference"])
         cmdi.fillElement("//LrtIPR/LegalReference", record["field_legal_reference"])
 …
         cmdi.fillElement("//LrtIPR/Description", record["field_description_0"])
         cmdi.fillElement("//LrtIPR/ContactPerson", record["field_contact_person"])
         # add a ResourceProxy for ReferenceLink
         if "http" in record["field_reference_link"]:
             cmdi.addResourceProxy(record["field_reference_link"])
         # more sophisticated (dirty) tricks needed
         cmdi.addFormats(record["field_format"])
 …
             orgList += record["org%s" % i] + ";"
         cmdi.addInstitutes(orgList + record["field_institute"])
         cmdi.addCountries(countryList, record["field_country"])
         cmdi.addLanguages(iso6393List, record["field_languages"])
         cmdi.addResourceType(record["field_resource_type"], record, iso6391List)
         cmdi.serialize()

Note: See TracChangeset for help on using the changeset viewer.

Context Navigation

Changeset 1617

Legend:

metadata/trunk/toolkit/scripts/cmdi-lrt-template.xml

metadata/trunk/toolkit/scripts/lrt2cmdi.py

Download in other formats: