Changeset 819


Ignore:
Timestamp:
10/27/10 08:49:35 (14 years ago)
Author:
vronk
Message:

Bigger rework: get rid of the path-based identifiers, i.e.
fill ResourceRef directly with the handles read from the MdSelfLink of the individual MD records;
also read the ProviderURL file and use its content as the ID in the minimal collection profile.

File:
1 edited

Legend:

Unmodified
Added
Removed
  • metadata/trunk/toolkit/scripts/dir2cmdicollection.py

    r771 r819  
    33# generates CMDI collection file hierarchy for collections of CMDI records
    44# support and questions: Dieter Van Uytvanck <dietuyt@mpi.nl>
     5# rework by Matej id@vronk.net :
     6#       - already filling ResourceRef with handles read from the MdSelfLink of the mdrecords
     7#   - also reading ProviderURL-file and filling as ID in the basic collection-profile
     8#   - does NOT add IsPartOf-elements yet
    59
    610import os, datetime
    711from string import Template
    812
     13target_dir = "_corpusstructure/"
     14
    915def main():
    1016        rootList = []
     17        if not os.path.isdir(target_dir):
     18                os.mkdir(target_dir)
    1119        for root, dirs, files in os.walk(os.getcwd()):
    1220                startpath = os.getcwd()         
     
    1422                        if d == "0":
    1523                                rootList.append(generate_branch(root, dirs))
    16         writeCollection(rootList, "collection_root.cmdi")
     24        writeCollection(rootList, target_dir + "collection_root.cmdi", "olac-root")
    1725               
    1826def generate_branch(root, dirs):
    19         collectionFile = "collection_%s.cmdi" % os.path.relpath(root)
     27        collectionName = os.path.relpath(root)
     28        collectionFile = "_corpusstructure/collection_%s.cmdi" % collectionName
     29       
    2030        dirs.sort()
    2131        collectionList = []     
     
    2636                                newFile = os.path.relpath(os.path.join(fullpath,file))
    2737                                collectionList.append(newFile)         
    28         writeCollection(collectionList, collectionFile)
    29         return collectionFile
     38        collid = writeCollection(collectionList, collectionFile, collectionName)
     39        print "genbranch:" + collid
     40        return collid
    3041
    3142
    32 def writeCollection(collectionList, collectionFile):
     43def writeCollection(collectionList, collectionFile, collectionName):
    3344
    34         resourceTemplate = Template('<ResourceProxy id="$idname"><ResourceType>Metadata</ResourceType><ResourceRef>$filename</ResourceRef></ResourceProxy>')
     45        resourceTemplate = Template('<ResourceProxy id="$idname"><ResourceType>Metadata</ResourceType><ResourceRef>$idx</ResourceRef></ResourceProxy>')
    3546
    3647        outstring = Template("""<?xml version="1.0" encoding="UTF-8"?>
    3748<CMD xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
    38     xsi:schemaLocation="http://www.clarin.eu/cmd http://catalog.clarin.eu/ds/ComponentRegistry/rest/registry/profiles/clarin.eu:cr1:p_1271859438236/xsd">
     49    xsi:schemaLocation="http://www.clarin.eu/cmd http://catalog.clarin.eu/ds/ComponentRegistry/rest/registry/profiles/clarin.eu:cr1:p_1284723009187/xsd">
    3950    <Header>
    4051        <MdCreator>dir2cmdicollection.py</MdCreator>
    4152        <MdCreationDate>$date</MdCreationDate>
    4253        <MdSelfLink>$selflink</MdSelfLink>
    43         <MdProfile>clarin.eu:cr1:p_1271859438236</MdProfile>
     54        <MdProfile>clarin.eu:cr1:p_1284723009187</MdProfile>
    4455    </Header>
    4556    <Resources>
     
    4859        <JournalFileProxyList/>
    4960        <ResourceRelationList/>
    50     </Resources>
     61    </Resources>   
    5162    <Components>
    52         <olac></olac>
     63        <collection>
     64                <GeneralInfo>
     65                <Name>$name</Name>
     66                        <ID>$url</ID>
     67                </GeneralInfo>
     68        </collection>
    5369    </Components>
    5470</CMD>""")
    5571
    5672        resourceProxies = ""
    57         collectionList.sort()   
     73        collectionList.sort()           
     74        if os.path.isfile(collectionName + "/ProviderURL"):
     75                urlf = open(collectionName + "/ProviderURL", 'r')
     76                url = urlf.readline()
     77        else:
     78          url ="?"
     79        name = "OLAC: " + collectionName.replace("_", " ")
     80        idx = ""
    5881        for item in collectionList:
    59                 resourceProxies += "\n" + resourceTemplate.substitute(idname = item.replace("/", "_").replace("\\", "_"), filename = item)
    60         outfile = outstring.substitute(date= datetime.datetime.now().strftime("%Y-%m-%d"), selflink=collectionFile, rp=resourceProxies)
    61         f = open(collectionFile, 'w')
     82                # trying to restore the original id (which is in the MdSelfLink
     83                if os.path.isfile(item):
     84                                for line in open(item):
     85                                        if "<MdSelfLink>" in line:
     86                                                #  WARNING! rocket science employed here !
     87                                                idx = line.replace("<MdSelfLink>","").replace("</MdSelfLink>","").strip()
     88                                                break
     89                else:
     90                         idx = item
     91                #idx = item.replace(".xml.cmdi","").replace("_", ":",1)[::-1].replace("_", ":",1)[::-1].replace("_", "-")
     92                resourceProxies += "\n" + resourceTemplate.substitute(idname = idx.replace(".","_").replace("/","_"), idx = idx)
     93        if collectionName=="olac-root":
     94                collidx = "olac-root"
     95        else:
     96                # print "idx:" + idx
     97                if idx!="":
     98                        collidx = idx[:idx.rfind(":")] # this is just a hack to derive the collection-id from the id of the collection-item (stripping the running number)
     99                else:
     100                        collidx = "olac:" + collectionName
     101        print collidx
     102        outfile = outstring.substitute(date= datetime.datetime.now().strftime("%Y-%m-%d"), selflink=collidx, rp=resourceProxies,url=url, name=name)
     103        f = open(collectionFile, 'w')   
    62104        f.write(outfile)
    63105        f.close()
     106       
    64107        print collectionFile
     108        return collidx
    65109
    66110main()
Note: See TracChangeset for help on using the changeset viewer.