From c315c4b1ff5b3b83d2fb298e18ae461369082fc2 Mon Sep 17 00:00:00 2001 From: wshahn <162052406+wshahn@users.noreply.github.com> Date: Mon, 9 Sep 2024 16:04:09 -0400 Subject: [PATCH 1/2] Update get_collection_storagesize.py Slight alteration of script to ignore processing the root dataverse collection, and only get the size of sub-collections. Additionally, the instance URL has been changed to use 'unc' instead of 'root'. --- get_collection_storagesize.py | 22 +++++++++++++--------- 1 file changed, 13 insertions(+), 9 deletions(-) diff --git a/get_collection_storagesize.py b/get_collection_storagesize.py index cd0b6bc..353a99c 100755 --- a/get_collection_storagesize.py +++ b/get_collection_storagesize.py @@ -30,7 +30,7 @@ def get_size(dataverse,collection,token): # throws I/O errors in TRSA case #dvurl = dataverse + '/api/dataverses/' + collection + '/storagesize?includeCached=true&key=' + token dvurl = dataverse + '/api/dataverses/' + collection + '/storagesize?key=' + token - r = requests.get(dvurl) + r = requests.get(dvurl) j = r.json() # strip out "size of this ... 
bytes" error = "Couldn't get storagesize for collection: " + collection @@ -77,15 +77,16 @@ def format_size(byte_size): readablesize = format_size(size) dvfilecount = get_filecount(dataverse,collection,token) print(collection + ': ' + str(size) + ' bytes' + ' (' + readablesize + '), ' + str(dvfilecount) + ' files.') - + else: - # start with the root dataverse - collection = 'root' - size = get_size(dataverse,collection,token) - dvfilecount = get_filecount(dataverse,collection,token) - print(collection + ': ' + str(size) + ' bytes, ' + str(dvfilecount) + ' files.') - # now iterate through sub-collections - instanceurl = dataverse + '/api/dataverses/root/contents' + # don't process root for performance reasons + #collection = 'root' + #size = get_size(dataverse,collection,token) + #dvfilecount = get_filecount(dataverse,collection,token) + #print(collection + ': ' + str(size) + ' bytes, ' + str(dvfilecount) + ' files.') + + # iterate through sub-collections + instanceurl = dataverse + '/api/dataverses/unc/contents' r = requests.get(instanceurl) j = r.json() for i in range(len(j["data"])): @@ -97,6 +98,9 @@ def format_size(byte_size): ar = requests.get(aliasurl) aj = ar.json() collection = aj["data"]["alias"] + # get size size = get_size(dataverse,collection,token) + readablesize = format_size(size) dvfilecount = get_filecount(dataverse,collection,token) print(collection + ': ' + str(size) + ' bytes' + ' (' + readablesize + '), ' + str(dvfilecount) + ' files.') + From cb84245e681a63872f220ff679aeb28fd5792687 Mon Sep 17 00:00:00 2001 From: wshahn <162052406+wshahn@users.noreply.github.com> Date: Mon, 23 Jun 2025 13:59:12 -0400 Subject: [PATCH 2/2] Add files via upload --- get_collection_storagesize.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/get_collection_storagesize.py b/get_collection_storagesize.py index 353a99c..610cf40 100755 --- a/get_collection_storagesize.py +++ b/get_collection_storagesize.py @@ -10,6 +10,7 @@ 
parser.add_argument('-c', '--collection', help='Sum the size of datasets in the given Dataverse collection.') parser.add_argument('-a', '--all_collections', help='Report the sizes of all collections in the given Dataverse.', action='store_true') parser.add_argument('-t', '--api_token', help='API token of an admin user.') +parser.add_argument('-r', '--root_collection', default='root', help='Specify the name of the root collection.') args = parser.parse_args() if (args.collection is None) and (args.all_collections is False): @@ -25,6 +26,7 @@ collection = args.collection all = args.all_collections token = args.api_token +root_coll = args.root_collection def get_size(dataverse,collection,token): # throws I/O errors in TRSA case @@ -86,7 +88,7 @@ def format_size(byte_size): #print(collection + ': ' + str(size) + ' bytes, ' + str(dvfilecount) + ' files.') # iterate through sub-collections - instanceurl = dataverse + '/api/dataverses/unc/contents' + instanceurl = dataverse + f"/api/dataverses/{root_coll}/contents" r = requests.get(instanceurl) j = r.json() for i in range(len(j["data"])): @@ -103,4 +105,3 @@ def format_size(byte_size): readablesize = format_size(size) dvfilecount = get_filecount(dataverse,collection,token) print(collection + ': ' + str(size) + ' bytes' + ' (' + readablesize + '), ' + str(dvfilecount) + ' files.') -