Changeset 819
- Timestamp: 10/27/10 08:49:35 (14 years ago)
- File: 1 edited
Legend:
- Unmodified
- Added
- Removed
metadata/trunk/toolkit/scripts/dir2cmdicollection.py
r771 r819 3 3 # generates CMDI collection file hierarchy for collections of CMDI records 4 4 # support and questions: Dieter Van Uytvanck <dietuyt@mpi.nl> 5 # rework by Matej id@vronk.net : 6 # - already filling ResourceRef with handles read from the MdSelfLink of the mdrecords 7 # - also reading ProviderURL-file and filling as ID in the basic collection-profile 8 # - does NOT add IsPartOf-elements yet 5 9 6 10 import os, datetime 7 11 from string import Template 8 12 13 target_dir = "_corpusstructure/" 14 9 15 def main(): 10 16 rootList = [] 17 if not os.path.isdir(target_dir): 18 os.mkdir(target_dir) 11 19 for root, dirs, files in os.walk(os.getcwd()): 12 20 startpath = os.getcwd() … … 14 22 if d == "0": 15 23 rootList.append(generate_branch(root, dirs)) 16 writeCollection(rootList, "collection_root.cmdi")24 writeCollection(rootList, target_dir + "collection_root.cmdi", "olac-root") 17 25 18 26 def generate_branch(root, dirs): 19 collectionFile = "collection_%s.cmdi" % os.path.relpath(root) 27 collectionName = os.path.relpath(root) 28 collectionFile = "_corpusstructure/collection_%s.cmdi" % collectionName 29 20 30 dirs.sort() 21 31 collectionList = [] … … 26 36 newFile = os.path.relpath(os.path.join(fullpath,file)) 27 37 collectionList.append(newFile) 28 writeCollection(collectionList, collectionFile) 29 return collectionFile 38 collid = writeCollection(collectionList, collectionFile, collectionName) 39 print "genbranch:" + collid 40 return collid 30 41 31 42 32 def writeCollection(collectionList, collectionFile ):43 def writeCollection(collectionList, collectionFile, collectionName): 33 44 34 resourceTemplate = Template('<ResourceProxy id="$idname"><ResourceType>Metadata</ResourceType><ResourceRef>$ filename</ResourceRef></ResourceProxy>')45 resourceTemplate = Template('<ResourceProxy id="$idname"><ResourceType>Metadata</ResourceType><ResourceRef>$idx</ResourceRef></ResourceProxy>') 35 46 36 47 outstring = Template("""<?xml version="1.0" encoding="UTF-8"?> 37 48 
<CMD xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" 38 xsi:schemaLocation="http://www.clarin.eu/cmd http://catalog.clarin.eu/ds/ComponentRegistry/rest/registry/profiles/clarin.eu:cr1:p_12 71859438236/xsd">49 xsi:schemaLocation="http://www.clarin.eu/cmd http://catalog.clarin.eu/ds/ComponentRegistry/rest/registry/profiles/clarin.eu:cr1:p_1284723009187/xsd"> 39 50 <Header> 40 51 <MdCreator>dir2cmdicollection.py</MdCreator> 41 52 <MdCreationDate>$date</MdCreationDate> 42 53 <MdSelfLink>$selflink</MdSelfLink> 43 <MdProfile>clarin.eu:cr1:p_12 71859438236</MdProfile>54 <MdProfile>clarin.eu:cr1:p_1284723009187</MdProfile> 44 55 </Header> 45 56 <Resources> … … 48 59 <JournalFileProxyList/> 49 60 <ResourceRelationList/> 50 </Resources> 61 </Resources> 51 62 <Components> 52 <olac></olac> 63 <collection> 64 <GeneralInfo> 65 <Name>$name</Name> 66 <ID>$url</ID> 67 </GeneralInfo> 68 </collection> 53 69 </Components> 54 70 </CMD>""") 55 71 56 72 resourceProxies = "" 57 collectionList.sort() 73 collectionList.sort() 74 if os.path.isfile(collectionName + "/ProviderURL"): 75 urlf = open(collectionName + "/ProviderURL", 'r') 76 url = urlf.readline() 77 else: 78 url ="?" 79 name = "OLAC: " + collectionName.replace("_", " ") 80 idx = "" 58 81 for item in collectionList: 59 resourceProxies += "\n" + resourceTemplate.substitute(idname = item.replace("/", "_").replace("\\", "_"), filename = item) 60 outfile = outstring.substitute(date= datetime.datetime.now().strftime("%Y-%m-%d"), selflink=collectionFile, rp=resourceProxies) 61 f = open(collectionFile, 'w') 82 # trying to restore the original id (which is in the MdSelfLink 83 if os.path.isfile(item): 84 for line in open(item): 85 if "<MdSelfLink>" in line: 86 # WARNING! rocket science employed here ! 
87 idx = line.replace("<MdSelfLink>","").replace("</MdSelfLink>","").strip() 88 break 89 else: 90 idx = item 91 #idx = item.replace(".xml.cmdi","").replace("_", ":",1)[::-1].replace("_", ":",1)[::-1].replace("_", "-") 92 resourceProxies += "\n" + resourceTemplate.substitute(idname = idx.replace(".","_").replace("/","_"), idx = idx) 93 if collectionName=="olac-root": 94 collidx = "olac-root" 95 else: 96 # print "idx:" + idx 97 if idx!="": 98 collidx = idx[:idx.rfind(":")] # this is just a hack to derive the collection-id from the id of the collection-item (stripping the running number) 99 else: 100 collidx = "olac:" + collectionName 101 print collidx 102 outfile = outstring.substitute(date= datetime.datetime.now().strftime("%Y-%m-%d"), selflink=collidx, rp=resourceProxies,url=url, name=name) 103 f = open(collectionFile, 'w') 62 104 f.write(outfile) 63 105 f.close() 106 64 107 print collectionFile 108 return collidx 65 109 66 110 main()
Note: See TracChangeset for help on using the changeset viewer.