From 0321a08cf37a79adf1d0a2091c26220c46e4e2df Mon Sep 17 00:00:00 2001 From: Santiago Soler Date: Mon, 13 Mar 2023 11:10:43 -0700 Subject: [PATCH 1/4] Support Dataverse files without a persistentID Add support for downloading Dataverse files that don't have a persistent ID. Use the file ID instead. --- pooch/downloaders.py | 37 ++++++++++++++++++++++++------------- 1 file changed, 24 insertions(+), 13 deletions(-) diff --git a/pooch/downloaders.py b/pooch/downloaders.py index f391fb19..ccc8c48c 100644 --- a/pooch/downloaders.py +++ b/pooch/downloaders.py @@ -1034,20 +1034,31 @@ def download_url(self, file_name): download_url : str The HTTP URL that can be used to download the file. """ - parsed = parse_url(self.archive_url) - - # Iterate over the given files until we find one of the requested name - for filedata in self.api_response.json()["data"]["latestVersion"]["files"]: - if file_name == filedata["dataFile"]["filename"]: - return ( - f"{parsed['protocol']}://{parsed['netloc']}/api/access/datafile/" - f":persistentId?persistentId={filedata['dataFile']['persistentId']}" - ) - - raise ValueError( - f"File '{file_name}' not found in data archive {self.archive_url} (doi:{self.doi})." - ) + response = self.api_response.json() + files = { + file["dataFile"]["filename"]: file["dataFile"] + for file in response["data"]["latestVersion"]["files"] + } + if file_name not in files: + raise ValueError( + f"File '{file_name}' not found in data archive " + f"{self.archive_url} (doi:{self.doi})." 
+ ) # Generate download_url using persistentId or file id + persistent_id = files[file_name]["persistentId"] + if persistent_id: + download_url = ( + f"{parsed['protocol']}://{parsed['netloc']}/api/access/datafile/" + f":persistentId?persistentId={persistent_id}" + ) + else: + file_id = files[file_name]["id"] + download_url = ( + f"{parsed['protocol']}://{parsed['netloc']}/api/access/datafile/" + f"{file_id}" + ) + return download_url def populate_registry(self, pooch): """ From 7ad5073b2f5e99481edbdd5e86e1fa0c8dc27c36 Mon Sep 17 00:00:00 2001 From: Santiago Soler Date: Mon, 20 Mar 2023 14:29:47 -0700 Subject: [PATCH 2/4] Don't assume that persistentId is always present The `persistentId` key might be missing in the API response, while the `id` key is always there. So, don't assume it exists when deciding which id should be used to download the files. --- pooch/downloaders.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pooch/downloaders.py b/pooch/downloaders.py index ccc8c48c..d7a85a08 100644 --- a/pooch/downloaders.py +++ b/pooch/downloaders.py @@ -1046,8 +1046,8 @@ def download_url(self, file_name): f"{self.archive_url} (doi:{self.doi})." ) # Generate download_url using persistentId or file id - persistent_id = files[file_name]["persistentId"] - if persistent_id: + persistent_id = files[file_name].get("persistentId") + if persistent_id is not None and persistent_id: download_url = ( f"{parsed['protocol']}://{parsed['netloc']}/api/access/datafile/" f":persistentId?persistentId={persistent_id}" From 1e670cfcd76bc230358cae3985847a90306bfac3 Mon Sep 17 00:00:00 2001 From: Santiago Soler Date: Mon, 20 Mar 2023 17:38:06 -0700 Subject: [PATCH 3/4] Simplify the if statement Whether `persistent_id` is None or an empty string, it is falsy, so a plain `if persistent_id:` check covers both cases. 
--- pooch/downloaders.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pooch/downloaders.py b/pooch/downloaders.py index d7a85a08..fe1af254 100644 --- a/pooch/downloaders.py +++ b/pooch/downloaders.py @@ -1047,7 +1047,7 @@ def download_url(self, file_name): ) # Generate download_url using persistentId or file id persistent_id = files[file_name].get("persistentId") - if persistent_id is not None and persistent_id: + if persistent_id: download_url = ( f"{parsed['protocol']}://{parsed['netloc']}/api/access/datafile/" f":persistentId?persistentId={persistent_id}" From 15f7536563d02224042426ecf0b69a6cd7bd8944 Mon Sep 17 00:00:00 2001 From: Leonardo Uieda Date: Mon, 19 Feb 2024 16:59:46 -0300 Subject: [PATCH 4/4] Only rely on the file ID, not PID --- pooch/downloaders.py | 18 +++++------------- 1 file changed, 5 insertions(+), 13 deletions(-) diff --git a/pooch/downloaders.py b/pooch/downloaders.py index 56fbbffb..fa3b52da 100644 --- a/pooch/downloaders.py +++ b/pooch/downloaders.py @@ -1134,19 +1134,11 @@ def download_url(self, file_name): f"File '{file_name}' not found in data archive " f"{self.archive_url} (doi:{self.doi})." ) - # Generate download_url using persistentId or file id - persistent_id = files[file_name].get("persistentId") - if persistent_id: - download_url = ( - f"{parsed['protocol']}://{parsed['netloc']}/api/access/datafile/" - f":persistentId?persistentId={persistent_id}" - ) - else: - file_id = files[file_name]["id"] - download_url = ( - f"{parsed['protocol']}://{parsed['netloc']}/api/access/datafile/" - f"{file_id}" - ) + # Generate download_url using the file id + download_url = ( + f"{parsed['protocol']}://{parsed['netloc']}/api/access/datafile/" + f"{files[file_name]['id']}" + ) return download_url def populate_registry(self, pooch):