From 4e531f83c04edd1f6a71c042bea2474fe5b991b2 Mon Sep 17 00:00:00 2001
From: Jesse Khorasanee
Date: Wed, 16 Mar 2022 11:26:27 +1300
Subject: [PATCH 1/8] Context files now passed through 'process_results'

---
 app/scicrunch_process_results.py   | 10 ++++++++--
 app/scicrunch_processing_common.py |  2 ++
 tests/test_scicrunch.py            |  8 ++++++++
 3 files changed, 18 insertions(+), 2 deletions(-)

diff --git a/app/scicrunch_process_results.py b/app/scicrunch_process_results.py
index ae1508b3..0e22283e 100644
--- a/app/scicrunch_process_results.py
+++ b/app/scicrunch_process_results.py
@@ -1,7 +1,7 @@
 import importlib
 import json
 import re
-
+from flask import jsonify
 
 # process_kb_results: Loop through SciCrunch results pulling out desired attributes and processing DOIs and CSV files
 def _prepare_results(results):
@@ -21,6 +21,12 @@ def _prepare_results(results):
         attr = _transform_attributes(attributes_map, hit)
         attr['doi'] = _convert_doi_to_url(attr['doi'])
         attr['took'] = results['took']
+        # find context files by looking through object mimetypes
+        attr['abi-contextual-information'] = [
+            file['dataset']['path']
+            for file in hit['_source']['objects']
+            if file['additional_mimetype']['name'].find('context') != -1
+        ]
         try:
             attr['readme'] = hit['_source']['item']['readme']['description']
         except KeyError:
@@ -38,7 +44,7 @@
 
 
 def process_results(results):
-    return json.dumps({'numberOfHits': results['hits']['total'], 'results': _prepare_results(results)})
+    return jsonify({'numberOfHits': results['hits']['total'], 'results': _prepare_results(results)})
 
 
 def reform_dataset_results(results):
diff --git a/app/scicrunch_processing_common.py b/app/scicrunch_processing_common.py
index 631ad894..c9644057 100644
--- a/app/scicrunch_processing_common.py
+++ b/app/scicrunch_processing_common.py
@@ -13,6 +13,7 @@
 SCAFFOLD_FILE = 'abi-scaffold-metadata-file'
 SCAFFOLD_THUMBNAIL = 'abi-scaffold-thumbnail'
 SCAFFOLD_VIEW_FILE = 'abi-scaffold-view-file'
+CONTEXT_FILE = 'abi-context-file'
 VIDEO = 'video'
 VERSION = 'version'
 README = 'readme'
@@ -32,6 +33,7 @@
     'inode/vnd.abi.scaffold+thumbnail': SCAFFOLD_THUMBNAIL,
     'inode/vnd.abi.scaffold.thumbnail+file': SCAFFOLD_THUMBNAIL,
     'inode/vnd.abi.scaffold.view+file': SCAFFOLD_VIEW_FILE,
+    'application/x.abi.context-information+json': CONTEXT_FILE,
     'text/vnd.abi.plot+Tab-separated-values': PLOT_FILE,
     'text/vnd.abi.plot+csv': PLOT_FILE,
     'image/png': COMMON_IMAGES,
diff --git a/tests/test_scicrunch.py b/tests/test_scicrunch.py
index b5a0712b..7e2224dc 100644
--- a/tests/test_scicrunch.py
+++ b/tests/test_scicrunch.py
@@ -424,3 +424,11 @@ def test_scaffold_files(client):
         key = f"{uri}files/{path}".replace('s3://pennsieve-prod-discover-publish-use1/', '')
         r = client.get(f"/s3-resource/{key}")
         assert r.status_code == 200
+
+def test_contextual_information(client):
+    r = client.get('/dataset_info/using_multiple_discoverIds/?discoverIds=76')
+    results = json.loads(r.data)
+    assert results['numberOfHits'] > 0
+    for item in results['results']:
+        if 'abi-contextual-information' in item:
+            assert r.status_code == 200
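
Note: the object filter added in PATCH 1 is easy to sanity-check in isolation. Below is a
minimal standalone sketch, assuming only the hit layout visible in the diff above
(_source.objects[*].additional_mimetype.name and dataset.path); the function name and the
sample hit are illustrative, not part of the patches. 'context' in name is the idiomatic
spelling of name.find('context') != -1.

def extract_context_paths(hit):
    # Collect dataset-relative paths of objects whose additional mimetype
    # marks them as context-information files.
    return [
        obj['dataset']['path']
        for obj in hit.get('_source', {}).get('objects', [])
        if 'context' in obj.get('additional_mimetype', {}).get('name', '')
    ]

sample_hit = {
    '_source': {
        'objects': [
            {'additional_mimetype': {'name': 'application/x.abi.context-information+json'},
             'dataset': {'path': 'derivative/scaffold_context_info.json'}},
            {'additional_mimetype': {'name': 'image/png'},
             'dataset': {'path': 'derivative/thumbnail.png'}},
        ]
    }
}
assert extract_context_paths(sample_hit) == ['derivative/scaffold_context_info.json']

PATCH 3 below tightens the needle from 'context' to 'abi.context-information'.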
From 22e25826f2ed48a987688d57d8904c7fa7f13183 Mon Sep 17 00:00:00 2001
From: Jesse Khorasanee
Date: Wed, 16 Mar 2022 12:34:40 +1300
Subject: [PATCH 2/8] Fix the context info test, change context mimetype check

---
 app/scicrunch_processing_common.py | 2 +-
 tests/test_scicrunch.py            | 7 +++----
 2 files changed, 4 insertions(+), 5 deletions(-)

diff --git a/app/scicrunch_processing_common.py b/app/scicrunch_processing_common.py
index c9644057..44fab796 100644
--- a/app/scicrunch_processing_common.py
+++ b/app/scicrunch_processing_common.py
@@ -33,7 +33,7 @@
     'inode/vnd.abi.scaffold+thumbnail': SCAFFOLD_THUMBNAIL,
     'inode/vnd.abi.scaffold.thumbnail+file': SCAFFOLD_THUMBNAIL,
     'inode/vnd.abi.scaffold.view+file': SCAFFOLD_VIEW_FILE,
-    'application/x.abi.context-information+json': CONTEXT_FILE,
+    'application/vnd.abi.context-information+json': CONTEXT_FILE,
     'text/vnd.abi.plot+Tab-separated-values': PLOT_FILE,
     'text/vnd.abi.plot+csv': PLOT_FILE,
     'image/png': COMMON_IMAGES,
diff --git a/tests/test_scicrunch.py b/tests/test_scicrunch.py
index 7e2224dc..d73f57a8 100644
--- a/tests/test_scicrunch.py
+++ b/tests/test_scicrunch.py
@@ -425,10 +425,9 @@ def test_scaffold_files(client):
         r = client.get(f"/s3-resource/{key}")
         assert r.status_code == 200
 
-def test_contextual_information(client):
+def test_finding_contextual_information(client):
     r = client.get('/dataset_info/using_multiple_discoverIds/?discoverIds=76')
     results = json.loads(r.data)
-    assert results['numberOfHits'] > 0
+    assert results['numberOfHits'] > 0  # Test we could find the generic colon scaffold dataset
     for item in results['results']:
-        if 'abi-contextual-information' in item:
-            assert r.status_code == 200
+        assert len(item['abi-contextual-information']) > 0  # Check it has contextual information

From 3b448784751a867b183f81470b960e8f4e74865d Mon Sep 17 00:00:00 2001
From: Jesse Khorasanee
Date: Wed, 16 Mar 2022 13:47:42 +1300
Subject: [PATCH 3/8] Use the longest distinguishing substring when finding
 context files

---
 app/scicrunch_process_results.py   | 2 +-
 app/scicrunch_processing_common.py | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/app/scicrunch_process_results.py b/app/scicrunch_process_results.py
index 0e22283e..963e243b 100644
--- a/app/scicrunch_process_results.py
+++ b/app/scicrunch_process_results.py
@@ -25,7 +25,7 @@ def _prepare_results(results):
         attr['abi-contextual-information'] = [
             file['dataset']['path']
             for file in hit['_source']['objects']
-            if file['additional_mimetype']['name'].find('context') != -1
+            if file['additional_mimetype']['name'].find('abi.context-information') != -1
         ]
         try:
             attr['readme'] = hit['_source']['item']['readme']['description']
diff --git a/app/scicrunch_processing_common.py b/app/scicrunch_processing_common.py
index 44fab796..d141d87b 100644
--- a/app/scicrunch_processing_common.py
+++ b/app/scicrunch_processing_common.py
@@ -33,7 +33,7 @@
     'inode/vnd.abi.scaffold+thumbnail': SCAFFOLD_THUMBNAIL,
     'inode/vnd.abi.scaffold.thumbnail+file': SCAFFOLD_THUMBNAIL,
     'inode/vnd.abi.scaffold.view+file': SCAFFOLD_VIEW_FILE,
-    'application/vnd.abi.context-information+json': CONTEXT_FILE,
+    'application/x.vnd.abi.context-information+json': CONTEXT_FILE,
     'text/vnd.abi.plot+Tab-separated-values': PLOT_FILE,
     'text/vnd.abi.plot+csv': PLOT_FILE,
     'image/png': COMMON_IMAGES,
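
Note: PATCH 3 replaces the needle 'context' with the longest substring that uniquely
identifies the context mimetype. A quick illustration of why the short needle was
fragile (the first mimetype here is hypothetical, not one from the patches):

assert 'context' in 'text/vnd.some.other-context+json'                 # false positive
assert 'abi.context-information' not in 'text/vnd.some.other-context+json'
assert 'abi.context-information' in 'application/x.vnd.abi.context-information+json'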
From bd1a8f4f19bddd7c953d41c9d69516c45aaca9ea Mon Sep 17 00:00:00 2001
From: Jesse Khorasanee
Date: Tue, 22 Mar 2022 16:11:36 +1300
Subject: [PATCH 4/8] Create a version of sparc-api that can be used with
 the 'stage' index

---
 app/bfworker.py                  |  69 ++++++++++++++++++++
 app/config.py                    |   2 +-
 app/main.py                      | 105 ++++++++++++++++++++++++-------
 app/scicrunch_process_results.py |   4 ++
 app/scicrunch_requests.py        |   9 +++
 tests/test_api.py                |   8 +++
 6 files changed, 172 insertions(+), 25 deletions(-)
 create mode 100644 app/bfworker.py

diff --git a/app/bfworker.py b/app/bfworker.py
new file mode 100644
index 00000000..8c3119dc
--- /dev/null
+++ b/app/bfworker.py
@@ -0,0 +1,69 @@
+from pennsieve import Pennsieve
+import pennsieve
+from app.config import Config
+
+class BFWorker(object):
+    def __init__(self, id):
+        self.bf = Pennsieve(api_token=Config.PENNSIEVE_API_TOKEN, api_secret=Config.PENNSIEVE_API_SECRET)
+
+
+    def getCollectionAndMetaFromPackageId(self, packageId):
+        pkg = self.bf.get(packageId)
+        if type(pkg) is pennsieve.DataPackage:
+            colId = pkg.parent
+            col = self.bf.get(colId)
+            items = col.items
+            for item in items:
+                if packageId == item.id:
+                    return [colId, item.name]
+        return None
+
+    def getURLFromCollectionIdAndFileName(self, collectionId, fileName):
+        col = self.bf.get(collectionId)
+        if type(col) is pennsieve.Collection:
+            items = col.items
+            for item in items:
+                if fileName == item.name:
+                    pkg = item
+                    try:
+                        bfFile = pkg.files[0]
+                        url = bfFile.url
+                        return url
+                    except:
+                        return None
+        return None
+
+    def getUrlfromPackageId(self, packageId):
+        pId = packageId
+        if ('N:' not in packageId):
+            pId = 'N:' + packageId
+        pk = self.bf.get(pId)
+        return pk.files[0].url
+
+    def getImagefromPackageId(self, packageId):
+        pId = packageId
+        if ('N:' not in packageId):
+            pId = 'N:' + packageId
+        pk = self.bf.get(pId)
+        # resp = requests.get(pk.files[0].url)
+        return pk.files[0].url if pk is not None else ''
+
+    def getURLFromDatasetIdAndFilePath(self, datasetId, filePath):
+        fileArray = filePath.split('/')
+        items = self.bf.get_dataset(datasetId).items
+        count = 0
+        while type(items) is list:
+            item = items[count]
+            for fileName in fileArray:
+                if fileName == item.name:
+                    if type(item) is pennsieve.Collection:
+                        items = item.items
+                        count = 0
+                        continue
+                    else:
+                        try:
+                            return item.files[0].url
+                        except:
+                            return None
+            count += 1
+        return None
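
Note: getURLFromDatasetIdAndFilePath above walks the dataset tree with a manually
managed index; the reset inside the inner loop is the source of the bug PATCH 5 fixes.
A minimal sketch of the same traversal over plain dicts, assuming a hypothetical
{'name', 'items'/'url'} data model rather than the real Pennsieve objects:

def find_file_url(dataset, file_path):
    # Descend one path segment at a time; collections carry 'items', files carry 'url'.
    items = dataset['items']
    for name in file_path.split('/'):
        match = next((item for item in items if item['name'] == name), None)
        if match is None:
            return None
        if 'items' in match:
            items = match['items']   # a collection: descend into it
        else:
            return match.get('url')  # a file: done
    return None

dataset = {'items': [
    {'name': 'derivative', 'items': [
        {'name': 'scaffold_context_info.json', 'url': 'https://example.org/signed'},
    ]},
]}
assert find_file_url(dataset, 'derivative/scaffold_context_info.json') == 'https://example.org/signed'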
+# @app.route("/s3-resource/") +# def direct_download_url(path): +# print(path) +# head_response = s3.head_object( +# Bucket=Config.S3_BUCKET_NAME, +# Key=path, +# RequestPayer="requester" +# ) +# +# content_length = head_response.get('ContentLength', Config.DIRECT_DOWNLOAD_LIMIT) +# if content_length and content_length > Config.DIRECT_DOWNLOAD_LIMIT: # 20 MB +# return abort(413, description=f"File too big to download: {content_length}") +# +# response = s3.get_object( +# Bucket=Config.S3_BUCKET_NAME, +# Key=path, +# RequestPayer="requester" +# ) +# +# encode_base64 = request.args.get("encodeBase64") +# resource = response["Body"].read() +# if encode_base64 is not None: +# return base64.b64encode(resource) +# +# return resource + +# This version of s3-resouces is used for accessing files on staging. Use it as a replacement for 's3-resource' +# No changes are need on the front end, just use s3-resource as normal @app.route("/s3-resource/") def direct_download_url(path): - head_response = s3.head_object( - Bucket=Config.S3_BUCKET_NAME, - Key=path, - RequestPayer="requester" - ) - - content_length = head_response.get('ContentLength', Config.DIRECT_DOWNLOAD_LIMIT) - if content_length and content_length > Config.DIRECT_DOWNLOAD_LIMIT: # 20 MB - return abort(413, description=f"File too big to download: {content_length}") - - response = s3.get_object( - Bucket=Config.S3_BUCKET_NAME, - Key=path, - RequestPayer="requester" - ) - - encode_base64 = request.args.get("encodeBase64") - resource = response["Body"].read() - if encode_base64 is not None: - return base64.b64encode(resource) - - return resource + print(path) + filePath = path.split('files/')[-1] + discoverId = path.split('/')[0] + dataset_query = { + "size": 20, + "from": 0, + "query": { + "query_string": { + "fields": [ + "*pennsieve.identifier" + ], + "query": discoverId + } + }, + "_source": [ + "item.identifier" + ] + } + resp = dataset_search(dataset_query) + pennsieveId = resp['hits']['hits'][0]['_source']['item']['identifier'] + url = bfWorker.getURLFromDatasetIdAndFilePath(pennsieveId, filePath) + if url != None: + resp2 = requests.get(url) + return resp2.json() + return jsonify({'error': 'error with the provided ID '}, status=502) @app.route("/scicrunch-dataset//") @@ -419,6 +452,30 @@ def get_dataset_info_discoverIds(): return process_results(dataset_search(query)) +@app.route('/urlFromPennsieveDatasetIdAndFilePath/') +def getFileUrlFromPennsieve(discoverId): + filePath = request.args.get('filePath') + dataset_query = { + "size": 20, + "from": 0, + "query": { + "query_string": { + "fields": [ + "*pennsieve.identifier" + ], + "query": discoverId + } + }, + "_source": [ + "item.identifier" + ] + } + resp = dataset_search(dataset_query) + pennsieveId = resp['hits']['hits'][0]['_source']['item']['identifier'] + url = bfWorker.getURLFromDatasetIdAndFilePath(pennsieveId, filePath) + if url != None: + return jsonify({'url': url}) + return jsonify({'error': 'error with the provided ID '}, status=502) @app.route("/dataset_info/using_title") def get_dataset_info_title(): diff --git a/app/scicrunch_process_results.py b/app/scicrunch_process_results.py index 963e243b..070d468c 100644 --- a/app/scicrunch_process_results.py +++ b/app/scicrunch_process_results.py @@ -27,6 +27,10 @@ def _prepare_results(results): for file in hit['_source']['objects'] if file['additional_mimetype']['name'].find('abi.context-information') is not -1 ] + print([ + file['additional_mimetype']['name'] + for file in hit['_source']['objects'] + ]) try: attr['readme'] = 
From 8089ad99527fb5418db99e630d29cc30afd9e297 Mon Sep 17 00:00:00 2001
From: Jesse Khorasanee
Date: Fri, 25 Mar 2022 00:12:01 +1300
Subject: [PATCH 5/8] Fix issue in logic:

- First file was getting skipped
- we only return file contents if the file is json
---
 app/bfworker.py | 2 +-
 app/main.py     | 7 +++++--
 2 files changed, 6 insertions(+), 3 deletions(-)

diff --git a/app/bfworker.py b/app/bfworker.py
index 8c3119dc..de876b66 100644
--- a/app/bfworker.py
+++ b/app/bfworker.py
@@ -58,7 +58,7 @@ def getURLFromDatasetIdAndFilePath(self, datasetId, filePath):
             if fileName == item.name:
                 if type(item) is pennsieve.Collection:
                     items = item.items
-                    count = 0
+                    count = -1
                     continue
                 else:
                     try:
diff --git a/app/main.py b/app/main.py
index eb7d5007..e9e714da 100644
--- a/app/main.py
+++ b/app/main.py
@@ -373,8 +373,11 @@ def direct_download_url(path):
     pennsieveId = resp['hits']['hits'][0]['_source']['item']['identifier']
     url = bfWorker.getURLFromDatasetIdAndFilePath(pennsieveId, filePath)
     if url != None:
-        resp2 = requests.get(url)
-        return resp2.json()
+        if '.json' in path:
+            resp2 = requests.get(url)
+            return resp2.json()
+        else:
+            return url
     return jsonify({'error': 'error with the provided ID'}), 502
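
Note on the count fix in PATCH 5: after descending into a collection, execution still
falls through to the unconditional count += 1 at the bottom of the while loop, so the
old reset to 0 made the next read start at index 1 and skip the collection's first
child. Resetting to -1 makes the next index 0. A toy trace with hypothetical data:

children = ['first_child', 'second_child']

count = 0    # old reset: the unconditional increment lands on index 1
count += 1
assert children[count] == 'second_child'   # 'first_child' was skipped

count = -1   # patched reset: the increment lands on index 0
count += 1
assert children[count] == 'first_child'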
os.environ.get("KNOWLEDGEBASE_KEY", "secret-key") DEPLOY_ENV = os.environ.get("DEPLOY_ENV", "development") SPARC_APP_HOST = os.environ.get("SPARC_APP_HOST", "https://sparc-app.herokuapp.com") - SCI_CRUNCH_HOST = os.environ.get("SCICRUNCH_HOST", "https://scicrunch.org/api/1/elastic/SPARC_PortalDatasets_stage") + SCI_CRUNCH_HOST = os.environ.get("SCICRUNCH_HOST", "https://scicrunch.org/api/1/elastic/SPARC_PortalDatasets_dev") MAPSTATE_TABLENAME = os.environ.get("MAPSTATE_TABLENAME", "mapstates") SCAFFOLDSTATE_TABLENAME = os.environ.get("SCAFFOLDSTATE_TABLENAME", "scaffoldstates") WRIKE_TOKEN = os.environ.get("WRIKE_TOKEN") diff --git a/app/main.py b/app/main.py index 1149d0de..d1637ac7 100644 --- a/app/main.py +++ b/app/main.py @@ -373,12 +373,8 @@ def direct_download_url(path): pennsieveId = resp['hits']['hits'][0]['_source']['item']['identifier'] url = bfWorker.getURLFromDatasetIdAndFilePath(pennsieveId, filePath) if url != None: - if '.json' in path: - resp2 = requests.get(url) - return resp2.json() - else: - resp2 = requests.get(url) - return resp2.content + resp2 = requests.get(url) + return resp2.content return jsonify({'error': 'error with the provided ID '}, status=502) diff --git a/app/scicrunch_processing_common.py b/app/scicrunch_processing_common.py index d141d87b..2e5aad48 100644 --- a/app/scicrunch_processing_common.py +++ b/app/scicrunch_processing_common.py @@ -30,6 +30,7 @@ 'application/vnd.mbfbioscience.neurolucida+xml': SEGMENTATION_FILES, 'inode/vnd.abi.scaffold+directory': SCAFFOLD_DIR, 'inode/vnd.abi.scaffold+file': SCAFFOLD_FILE, + 'application/x.vnd.abi.scaffold.meta+json': SCAFFOLD_FILE, 'inode/vnd.abi.scaffold+thumbnail': SCAFFOLD_THUMBNAIL, 'inode/vnd.abi.scaffold.thumbnail+file': SCAFFOLD_THUMBNAIL, 'inode/vnd.abi.scaffold.view+file': SCAFFOLD_VIEW_FILE, From faad292764362462a7607d8dfbcac1df2a5c74d9 Mon Sep 17 00:00:00 2001 From: Jesse Khorasanee Date: Mon, 19 Sep 2022 06:02:12 +0530 Subject: [PATCH 8/8] Add check for future scicrunch processing versions --- app/scicrunch_process_results.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/app/scicrunch_process_results.py b/app/scicrunch_process_results.py index 070d468c..36c2302f 100644 --- a/app/scicrunch_process_results.py +++ b/app/scicrunch_process_results.py @@ -13,6 +13,10 @@ def _prepare_results(results): except KeyError: continue + if version >= '1.1.5': + print('WARINING! Scicrunch processing is out of date!') + version = '1.1.5' + package_version = f'scicrunch_processing_v_{version.replace(".", "_")}' m = importlib.import_module(f'app.{package_version}') attributes_map = getattr(m, 'ATTRIBUTES_MAP')