Changeset 1617
- Timestamp:
- 11/10/11 12:35:07 (13 years ago)
- Location:
- metadata/trunk/toolkit/scripts
- Files:
-
- 2 edited
Legend:
- Unmodified
- Added
- Removed
-
metadata/trunk/toolkit/scripts/cmdi-lrt-template.xml
r1432 r1617 1 1 <?xml version="1.0" encoding="UTF-8"?> 2 <CMD xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"3 xsi:schemaLocation="http://www.clarin.eu/cmd http://catalog.clarin.eu/ds/ComponentRegistry/rest/registry/profiles/clarin.eu:cr1:p_128 8172614070/xsd">2 <CMD CMDVersion="1.1" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" 3 xsi:schemaLocation="http://www.clarin.eu/cmd http://catalog.clarin.eu/ds/ComponentRegistry/rest/registry/profiles/clarin.eu:cr1:p_1289827960126/xsd"> 4 4 <Header> 5 5 <MdCreator>lrt2cmdi.py</MdCreator> 6 6 <MdCreationDate/> 7 7 <MdSelfLink/> 8 <MdProfile>clarin.eu:cr1:p_128 8172614070</MdProfile>8 <MdProfile>clarin.eu:cr1:p_1289827960126</MdProfile> 9 9 <MdCollectionDisplayName>CLARIN LRT inventory</MdCollectionDisplayName> 10 10 </Header> -
metadata/trunk/toolkit/scripts/lrt2cmdi.py
r1042 r1617 2 2 3 3 # converts the CSV from the LRT inventory to nice and clean CMDI 4 # Dieter says: I deny the existance of this script! 4 # Dieter says: I deny the existance of this script! 5 5 6 6 import urllib, csv, datetime, xml.etree.ElementTree as ElementTree … … 12 12 self.nodeId = nodeId 13 13 self.xmlTree = ElementTree.ElementTree(ElementTree.fromstring(template)) 14 # create dict with links to parent node for each node (= key)15 14 self.parentmap = dict((c, p) for p in self.xmlTree.getiterator() for c in p) 16 15 self.fillElement("//MdCreationDate", datetime.datetime.now().strftime("%Y-%m-%d")) 17 16 self.fillElement("//MdSelfLink", "http://www.clarin.eu/node/%s" % nodeId) 18 17 18 19 19 20 def fillElement(self, xpath, value): 20 #print "fill %s with %s" % (xpath, value) 21 self.xmlTree.find(xpath).text = value.strip() 22 21 self.xmlTree.find(xpath).text = value.strip() 22 23 23 def fillMultipleElement(self, elementname, xpath, values): 24 24 # fill in the already existing element 25 #print values26 25 if (values[0]): 27 #print "first one", values[0]28 #print values[0]29 26 self.fillElement(xpath, values[0]) 30 #print "fill %s with %s" % (xpath, values[0]) 31 #print 32 27 33 28 element = self.xmlTree.find(xpath) 34 29 parent = self.parentmap[element] 35 30 position = parent.getchildren().index(element) 36 31 37 32 # then add siblings for the other elements 38 33 for value in values[1:]: 39 34 if value: 40 35 # create new sibling of xpath (elementname) = value 41 #print value42 #print "next one", value43 36 position += 1 44 37 newElement = ElementTree.Element(elementname) 45 38 newElement.text = value.strip() 46 39 parent.insert(position, newElement) 47 40 48 41 def removeEmptyNodes(self): 49 42 removeList = ["ResourceType", "BeginYearResourceCreation", "FinalizationYearResourceCreation", "Institute"] … … 54 47 parentNode = self.parentmap[res] 55 48 parentNode.remove(res) 56 57 def serialize(self): 49 50 def serialize(self): 58 51 self.removeEmptyNodes() 52 #print ElementTree.tostring(self.xmlTree.getroot()) 59 53 filename = "lrt-%s.cmdi" % self.nodeId 60 self.xmlTree.write(filename, encoding="utf-8" )54 self.xmlTree.write(filename, encoding="utf-8", xml_declaration=True) 61 55 f = open(filename, 'r+' ) 62 content = f.read().replace('<CMD', '< ?xml version="1.0" encoding="UTF-8"?>\n<CMD')56 content = f.read().replace('<CMD', '<CMD xmlns="http://www.clarin.eu/cmd/"') 63 57 f.close() 64 58 f = open(filename, 'w' ) 65 #print content66 59 f.write(content) 67 60 f.close 68 61 69 62 def addFormats(self, format): 70 63 if ";" in format or "," in format: 71 64 if ";" in format: 72 formatItems = format.split(";") 65 formatItems = format.split(";") 73 66 else: 74 67 formatItems = format.split(",") … … 76 69 else: 77 70 self.fillElement("//LrtCommon/Format", format) 78 71 79 72 def addInstitutes(self, institute): 80 73 if ";" in institute: 81 items = institute.split(";") 74 items = institute.split(";") 82 75 uniqueItems = set(items) # filter out double items 83 76 items = [i for i in uniqueItems] # convert set back to a list 84 77 #print items 85 78 86 79 self.fillMultipleElement("Institute", "//LrtCommon/Institute", items) 87 80 88 81 def addCountries(self, countryList, countries): 89 82 countriesNode = self.xmlTree.find("//LrtCommon/Countries") … … 96 89 newCountryNode.append(newCodeNode) 97 90 countriesNode.append(newCountryNode) 98 91 99 92 def addLanguages(self, isoList, languages, iso639Type=3, xpath="//LrtCommon/Languages"): 100 93 languagesNode = self.xmlTree.find(xpath) … … 108 101 newLanguageNode.append(newCodeNode) 109 102 languagesNode.append(newLanguageNode) 110 111 103 104 112 105 def addResourceType(self, types, record, isoList): 113 106 typeList = [t.strip() for t in types.split("||")] 114 107 self.fillMultipleElement("ResourceType", "//LrtCommon/ResourceType", typeList) 115 108 116 109 collectionList = ["Written Corpus","Multimodal Corpus","Aligned Corpus","Treebank","N-Gram Model"] 117 110 lexiconList = ["Lexicon / Knowledge Source","Terminological Resource"] 118 111 119 112 if set(typeList).intersection(set(collectionList)): 120 113 self.addCollectionDetails(record, isoList) … … 122 115 self.addLexiconDetails(record, isoList) 123 116 #if "Web Service" in typeList: 124 # self.addServiceDetails(record) 125 117 # self.addServiceDetails(record) 118 126 119 def addCollectionDetails(self, record, isoList): 127 120 # add the relevant XML subtree … … 154 147 self.fillElement("//LrtCollectionDetails/Access", record["field_access"]) 155 148 self.fillElement("//LrtCollectionDetails/Source", record["field_source_0"]) 156 149 157 150 # ok - this can be done in a cleaner way 158 151 self.addLanguages(isoList, record["field_working_languages"], 1, "//LrtCollectionDetails/WorkingLanguages") 159 160 161 152 153 154 162 155 def addLexiconDetails(self, record, isoList): 163 156 template = '''<LrtLexiconDetails> … … 173 166 parent = self.xmlTree.find("//LrtInventoryResource") 174 167 parent.append(partTree) 175 168 176 169 # and now fill it 177 170 self.fillElement("//LrtLexiconDetails/Date", record["field_date_0"]) … … 181 174 self.fillElement("//LrtLexiconDetails/Size", record["field_size_0"]) 182 175 self.fillElement("//LrtLexiconDetails/Access", record["field_access_1"]) 183 184 176 self.addLanguages(isoList, record["field_working_languages_0"], 1, "//LrtLexiconDetails/WorkingLanguages") 185 186 177 178 187 179 def addServiceDetails(self, record): 188 180 template = '''<LrtServiceDetails> … … 200 192 parent = self.xmlTree.find("//LrtInventoryResource") 201 193 parent.append(partTree) 202 194 203 195 # and now fill it 204 196 self.fillElement("//LrtLexiconDetails/Date", record["field_date_0"]) 205 197 206 198 def addResourceProxy(self, link): 207 199 template = '''<ResourceProxy id="reflink"> … … 209 201 <ResourceRef></ResourceRef> 210 202 </ResourceProxy>''' 211 partTree = ElementTree. fromstring(template)212 parent = self.xmlTree.find(" //ResourceProxyList")203 partTree = ElementTree.XML(template) 204 parent = self.xmlTree.find(".//ResourceProxyList") 213 205 parent.append(partTree) 214 206 215 207 # and now fill it 216 208 self.fillElement("//ResourceProxy/ResourceRef", link) 217 218 209 219 210 def addChildNode(parent, tag, content): … … 224 215 225 216 def parseFirstLine(l): 226 keyList = [l[0].lower()] 217 keyList = [l[0].lower()] 227 218 for key in l[1:]: 228 219 if "(" in key: … … 231 222 keyList.append(key.replace(" ", "_").lower()) 232 223 return keyList 233 234 224 225 235 226 def loadInfo(): 236 227 csvFile = csv.reader(urllib.urlopen("http://www.clarin.eu/export_resources").readlines()) 228 #csvFile = csv.reader(urllib.urlopen("resources.csv").readlines()) 237 229 #csvFile =[l.decode('utf-8') for l in rawCsvFile] 238 239 #csvFile = csv.reader(urllib.urlopen("resources.csv").readlines()) 230 231 240 232 linenr = 0 241 233 newDict = dict() … … 248 240 for field in fieldList: 249 241 newDict[linenr][fieldList[colnr].replace(" ", "_").decode('utf-8')] = l[colnr].decode('utf-8') 250 colnr += 1 242 colnr += 1 251 243 linenr += 1 252 return newDict 244 return newDict 253 245 254 246 def loadCsv(filename): … … 260 252 return dictionary 261 253 254 # only to be used in case we use the namespace, but as it is causing a lot of extra coding we just add an xmlns attribute in the end and ignore it 255 # def fixXpath(self, xpath): 256 # if xpath[0:2] == "//": 257 # xpath = "//{http://www.clarin.eu/cmd/}" + xpath[2:].replace("/", "/{http://www.clarin.eu/cmd/}") 258 # else: 259 # xpath = xpath.replace("/", "/{http://www.clarin.eu/cmd/}") 260 # return xpath 262 261 263 262 def main(): … … 266 265 iso6393List = loadCsv("639-3-language_codes.csv") 267 266 iso6391List = loadCsv("639-1-language_codes.csv") 268 267 269 268 for record in infoDict.values(): 270 269 print "creating lrt-%s.cmdi" % record["nid"] 271 270 272 271 cmdi = CmdiFile(record["nid"]) 273 272 274 273 # 1-to-1 fields, easy case 275 274 cmdi.fillElement("//LrtCommon/ResourceName", record["name"]) … … 289 288 cmdi.fillElement("//LrtDistributionClassification/UsageReportRequired", record["usage_report_required"].replace("1","true").replace("0","false")) 290 289 cmdi.fillElement("//LrtDistributionClassification/OtherDistributionRestrictions", record["other_distribution_restrictions"]) 291 290 292 291 cmdi.fillElement("//LrtIPR/EthicalReference", record["field_ethical_reference"]) 293 292 cmdi.fillElement("//LrtIPR/LegalReference", record["field_legal_reference"]) … … 295 294 cmdi.fillElement("//LrtIPR/Description", record["field_description_0"]) 296 295 cmdi.fillElement("//LrtIPR/ContactPerson", record["field_contact_person"]) 297 296 298 297 # add a ResourceProxy for ReferenceLink 299 298 if "http" in record["field_reference_link"]: 300 299 cmdi.addResourceProxy(record["field_reference_link"]) 301 300 302 301 # more sophisticated (dirty) tricks needed 303 302 cmdi.addFormats(record["field_format"]) … … 307 306 orgList += record["org%s" % i] + ";" 308 307 cmdi.addInstitutes(orgList + record["field_institute"]) 309 308 310 309 cmdi.addCountries(countryList, record["field_country"]) 311 310 312 311 cmdi.addLanguages(iso6393List, record["field_languages"]) 313 312 314 313 cmdi.addResourceType(record["field_resource_type"], record, iso6391List) 315 314 316 315 cmdi.serialize() 317 316
Note: See TracChangeset
for help on using the changeset viewer.