Changeset 1617


Ignore:
Timestamp:
11/10/11 12:35:07 (13 years ago)
Author:
dietuyt
Message:

Now makes .cmdi files that comply with the new default namespace.

Location:
metadata/trunk/toolkit/scripts
Files:
2 edited

Legend:

Unmodified
Added
Removed
  • metadata/trunk/toolkit/scripts/cmdi-lrt-template.xml

    r1432 r1617  
    11<?xml version="1.0" encoding="UTF-8"?>
    2 <CMD xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
    3     xsi:schemaLocation="http://www.clarin.eu/cmd http://catalog.clarin.eu/ds/ComponentRegistry/rest/registry/profiles/clarin.eu:cr1:p_1288172614070/xsd">
     2<CMD CMDVersion="1.1" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
     3    xsi:schemaLocation="http://www.clarin.eu/cmd http://catalog.clarin.eu/ds/ComponentRegistry/rest/registry/profiles/clarin.eu:cr1:p_1289827960126/xsd">
    44    <Header>
    55        <MdCreator>lrt2cmdi.py</MdCreator>
    66        <MdCreationDate/>
    77        <MdSelfLink/>
    8         <MdProfile>clarin.eu:cr1:p_1288172614070</MdProfile>
     8        <MdProfile>clarin.eu:cr1:p_1289827960126</MdProfile>
    99        <MdCollectionDisplayName>CLARIN LRT inventory</MdCollectionDisplayName>
    1010    </Header>
  • metadata/trunk/toolkit/scripts/lrt2cmdi.py

    r1042 r1617  
    22
    33# converts the CSV from the LRT inventory to nice and clean CMDI
    4 # Dieter says: I deny the existance of this script! 
     4# Dieter says: I deny the existance of this script!
    55
    66import urllib, csv, datetime, xml.etree.ElementTree as ElementTree
     
    1212        self.nodeId = nodeId
    1313        self.xmlTree = ElementTree.ElementTree(ElementTree.fromstring(template))
    14         # create dict with links to parent node for each node (= key)
    1514        self.parentmap = dict((c, p) for p in self.xmlTree.getiterator() for c in p)
    1615        self.fillElement("//MdCreationDate", datetime.datetime.now().strftime("%Y-%m-%d"))
    1716        self.fillElement("//MdSelfLink", "http://www.clarin.eu/node/%s" % nodeId)
    1817
     18
     19
    1920    def fillElement(self, xpath, value):
    20         #print "fill %s with %s" % (xpath, value)
    21         self.xmlTree.find(xpath).text = value.strip()
    22        
     21        self.xmlTree.find(xpath).text = value.strip()
     22
    2323    def fillMultipleElement(self, elementname, xpath, values):
    2424        # fill in the already existing element
    25         #print values
    2625        if (values[0]):
    27             #print "first one", values[0]
    28             #print values[0]
    2926            self.fillElement(xpath, values[0])
    30             #print "fill %s with %s" % (xpath, values[0])
    31         #print
    32        
     27
    3328        element = self.xmlTree.find(xpath)
    3429        parent = self.parentmap[element]
    3530        position = parent.getchildren().index(element)
    36        
     31
    3732        # then add siblings for the other elements
    3833        for value in values[1:]:
    3934            if value:
    4035                # create new sibling of xpath (elementname) = value
    41                 #print value
    42                 #print "next one", value
    4336                position += 1
    4437                newElement = ElementTree.Element(elementname)
    4538                newElement.text = value.strip()
    4639                parent.insert(position, newElement)
    47    
     40
    4841    def removeEmptyNodes(self):
    4942        removeList = ["ResourceType", "BeginYearResourceCreation", "FinalizationYearResourceCreation", "Institute"]
     
    5447                    parentNode = self.parentmap[res]
    5548                    parentNode.remove(res)
    56    
    57     def serialize(self): 
     49
     50    def serialize(self):
    5851        self.removeEmptyNodes()
     52        #print ElementTree.tostring(self.xmlTree.getroot())
    5953        filename = "lrt-%s.cmdi" % self.nodeId
    60         self.xmlTree.write(filename, encoding="utf-8")
     54        self.xmlTree.write(filename, encoding="utf-8", xml_declaration=True)
    6155        f = open(filename, 'r+' )
    62         content = f.read().replace('<CMD', '<?xml version="1.0" encoding="UTF-8"?>\n<CMD')
     56        content = f.read().replace('<CMD', '<CMD xmlns="http://www.clarin.eu/cmd/"')
    6357        f.close()
    6458        f = open(filename, 'w' )
    65         #print content
    6659        f.write(content)
    6760        f.close
    68        
     61
    6962    def addFormats(self, format):
    7063        if ";" in format or "," in format:
    7164            if ";" in format:
    72                 formatItems = format.split(";") 
     65                formatItems = format.split(";")
    7366            else:
    7467                formatItems = format.split(",")
     
    7669        else:
    7770            self.fillElement("//LrtCommon/Format", format)
    78    
     71
    7972    def addInstitutes(self, institute):
    8073        if ";" in institute:
    81                 items = institute.split(";") 
     74                items = institute.split(";")
    8275                uniqueItems = set(items) # filter out double items
    8376                items = [i for i in uniqueItems] # convert set back to a list
    8477                #print items
    85                  
     78
    8679                self.fillMultipleElement("Institute", "//LrtCommon/Institute", items)
    87    
     80
    8881    def addCountries(self, countryList, countries):
    8982        countriesNode = self.xmlTree.find("//LrtCommon/Countries")
     
    9689                newCountryNode.append(newCodeNode)
    9790                countriesNode.append(newCountryNode)
    98        
     91
    9992    def addLanguages(self, isoList, languages, iso639Type=3, xpath="//LrtCommon/Languages"):
    10093        languagesNode = self.xmlTree.find(xpath)
     
    108101                newLanguageNode.append(newCodeNode)
    109102                languagesNode.append(newLanguageNode)
    110                    
    111                    
     103
     104
    112105    def addResourceType(self, types, record, isoList):
    113106            typeList = [t.strip() for t in types.split("||")]
    114107            self.fillMultipleElement("ResourceType", "//LrtCommon/ResourceType", typeList)
    115            
     108
    116109            collectionList = ["Written Corpus","Multimodal Corpus","Aligned Corpus","Treebank","N-Gram Model"]
    117110            lexiconList = ["Lexicon / Knowledge Source","Terminological Resource"]
    118            
     111
    119112            if set(typeList).intersection(set(collectionList)):
    120113                self.addCollectionDetails(record, isoList)
     
    122115                self.addLexiconDetails(record, isoList)
    123116            #if "Web Service" in typeList:
    124             #    self.addServiceDetails(record)   
    125            
     117            #    self.addServiceDetails(record)
     118
    126119    def addCollectionDetails(self, record, isoList):
    127120        # add the relevant XML subtree
     
    154147        self.fillElement("//LrtCollectionDetails/Access", record["field_access"])
    155148        self.fillElement("//LrtCollectionDetails/Source", record["field_source_0"])
    156        
     149
    157150        # ok - this can be done in a cleaner way
    158151        self.addLanguages(isoList, record["field_working_languages"], 1, "//LrtCollectionDetails/WorkingLanguages")
    159        
    160        
    161    
     152
     153
     154
    162155    def addLexiconDetails(self, record, isoList):
    163156        template = '''<LrtLexiconDetails>
     
    173166        parent = self.xmlTree.find("//LrtInventoryResource")
    174167        parent.append(partTree)
    175        
     168
    176169        # and now fill it
    177170        self.fillElement("//LrtLexiconDetails/Date", record["field_date_0"])
     
    181174        self.fillElement("//LrtLexiconDetails/Size", record["field_size_0"])
    182175        self.fillElement("//LrtLexiconDetails/Access", record["field_access_1"])
    183        
    184176        self.addLanguages(isoList, record["field_working_languages_0"], 1, "//LrtLexiconDetails/WorkingLanguages")
    185    
    186    
     177
     178
    187179    def addServiceDetails(self, record):
    188180        template = '''<LrtServiceDetails>
     
    200192        parent = self.xmlTree.find("//LrtInventoryResource")
    201193        parent.append(partTree)
    202        
     194
    203195        # and now fill it
    204196        self.fillElement("//LrtLexiconDetails/Date", record["field_date_0"])
    205        
     197
    206198    def addResourceProxy(self, link):
    207199        template = '''<ResourceProxy id="reflink">
     
    209201                <ResourceRef></ResourceRef>
    210202            </ResourceProxy>'''
    211         partTree = ElementTree.fromstring(template)
    212         parent = self.xmlTree.find("//ResourceProxyList")
     203        partTree = ElementTree.XML(template)
     204        parent = self.xmlTree.find(".//ResourceProxyList")
    213205        parent.append(partTree)
    214        
     206
    215207        # and now fill it
    216208        self.fillElement("//ResourceProxy/ResourceRef", link)
    217        
    218209
    219210def addChildNode(parent, tag, content):
     
    224215
    225216def parseFirstLine(l):
    226     keyList = [l[0].lower()] 
     217    keyList = [l[0].lower()]
    227218    for key in l[1:]:
    228219        if "(" in key:
     
    231222            keyList.append(key.replace(" ", "_").lower())
    232223    return keyList
    233    
    234            
     224
     225
    235226def loadInfo():
    236227    csvFile = csv.reader(urllib.urlopen("http://www.clarin.eu/export_resources").readlines())
     228    #csvFile = csv.reader(urllib.urlopen("resources.csv").readlines())
    237229    #csvFile =[l.decode('utf-8') for l in rawCsvFile]
    238    
    239     #csvFile = csv.reader(urllib.urlopen("resources.csv").readlines()) 
     230
     231
    240232    linenr = 0
    241233    newDict = dict()
     
    248240            for field in fieldList:
    249241                newDict[linenr][fieldList[colnr].replace(" ", "_").decode('utf-8')] = l[colnr].decode('utf-8')
    250                 colnr += 1 
     242                colnr += 1
    251243        linenr += 1
    252     return newDict   
     244    return newDict
    253245
    254246def loadCsv(filename):
     
    260252    return dictionary
    261253
     254# only to be used in case we use the namespace, but as it is causing a lot of extra coding we just add an xmlns attribute in the end and ignore it
     255#    def fixXpath(self, xpath):
     256#        if xpath[0:2] == "//":
     257#            xpath = "//{http://www.clarin.eu/cmd/}" + xpath[2:].replace("/", "/{http://www.clarin.eu/cmd/}")
     258#        else:
     259#            xpath = xpath.replace("/", "/{http://www.clarin.eu/cmd/}")
     260#        return xpath
    262261
    263262def main():
     
    266265    iso6393List = loadCsv("639-3-language_codes.csv")
    267266    iso6391List = loadCsv("639-1-language_codes.csv")
    268    
     267
    269268    for record in infoDict.values():
    270269        print "creating lrt-%s.cmdi" % record["nid"]
    271        
     270
    272271        cmdi = CmdiFile(record["nid"])
    273        
     272
    274273        # 1-to-1 fields, easy case
    275274        cmdi.fillElement("//LrtCommon/ResourceName", record["name"])
     
    289288        cmdi.fillElement("//LrtDistributionClassification/UsageReportRequired", record["usage_report_required"].replace("1","true").replace("0","false"))
    290289        cmdi.fillElement("//LrtDistributionClassification/OtherDistributionRestrictions", record["other_distribution_restrictions"])
    291        
     290
    292291        cmdi.fillElement("//LrtIPR/EthicalReference", record["field_ethical_reference"])
    293292        cmdi.fillElement("//LrtIPR/LegalReference", record["field_legal_reference"])
     
    295294        cmdi.fillElement("//LrtIPR/Description", record["field_description_0"])
    296295        cmdi.fillElement("//LrtIPR/ContactPerson", record["field_contact_person"])
    297        
     296
    298297        # add a ResourceProxy for ReferenceLink
    299298        if "http" in record["field_reference_link"]:
    300299            cmdi.addResourceProxy(record["field_reference_link"])
    301        
     300
    302301        # more sophisticated (dirty) tricks needed
    303302        cmdi.addFormats(record["field_format"])
     
    307306            orgList += record["org%s" % i] + ";"
    308307        cmdi.addInstitutes(orgList + record["field_institute"])
    309        
     308
    310309        cmdi.addCountries(countryList, record["field_country"])
    311        
     310
    312311        cmdi.addLanguages(iso6393List, record["field_languages"])
    313        
     312
    314313        cmdi.addResourceType(record["field_resource_type"], record, iso6391List)
    315        
     314
    316315        cmdi.serialize()
    317316
Note: See TracChangeset for help on using the changeset viewer.