diff --git a/app/bfworker.py b/app/bfworker.py
new file mode 100644
index 00000000..de876b66
--- /dev/null
+++ b/app/bfworker.py
@@ -0,0 +1,70 @@
+from pennsieve import Pennsieve
+import pennsieve
+from app.config import Config
+
+
+class BFWorker(object):
+    def __init__(self, id):
+        self.bf = Pennsieve(api_token=Config.PENNSIEVE_API_TOKEN, api_secret=Config.PENNSIEVE_API_SECRET)
+
+    def getCollectionAndMetaFromPackageId(self, packageId):
+        pkg = self.bf.get(packageId)
+        if isinstance(pkg, pennsieve.DataPackage):
+            colId = pkg.parent
+            col = self.bf.get(colId)
+            items = col.items
+            for item in items:
+                if packageId == item.id:
+                    return [colId, item.name]
+        return None
+
+    def getURLFromCollectionIdAndFileName(self, collectionId, fileName):
+        col = self.bf.get(collectionId)
+        if isinstance(col, pennsieve.Collection):
+            items = col.items
+            for item in items:
+                if fileName == item.name:
+                    pkg = item
+                    try:
+                        bfFile = pkg.files[0]
+                        url = bfFile.url
+                        return url
+                    except Exception:
+                        return None
+        return None
+
+    def getUrlfromPackageId(self, packageId):
+        pId = packageId
+        if 'N:' not in packageId:
+            pId = 'N:' + packageId
+        pk = self.bf.get(pId)
+        return pk.files[0].url
+
+    def getImagefromPackageId(self, packageId):
+        pId = packageId
+        if 'N:' not in packageId:
+            pId = 'N:' + packageId
+        pk = self.bf.get(pId)
+        return pk.files[0].url if pk is not None else ''
+
+    def getURLFromDatasetIdAndFilePath(self, datasetId, filePath):
+        # Walk the dataset tree, descending into any collection whose name
+        # matches a segment of filePath, until the file itself is found.
+        fileArray = filePath.split('/')
+        items = self.bf.get_dataset(datasetId).items
+        count = 0
+        while isinstance(items, list) and count < len(items):
+            item = items[count]
+            for fileName in fileArray:
+                if fileName == item.name:
+                    if isinstance(item, pennsieve.Collection):
+                        items = item.items
+                        count = -1  # restart the sibling scan inside this collection
+                        continue
+                    else:
+                        try:
+                            return item.files[0].url
+                        except Exception:
+                            return None
+            count += 1
+        return None
diff --git a/app/config.py b/app/config.py
index e5b9f6f1..1eddca26 100644
--- a/app/config.py
+++ b/app/config.py
@@ -30,7 +30,7 @@ class Config(object):
     KNOWLEDGEBASE_KEY = os.environ.get("KNOWLEDGEBASE_KEY", "secret-key")
     DEPLOY_ENV = os.environ.get("DEPLOY_ENV", "development")
     SPARC_APP_HOST = os.environ.get("SPARC_APP_HOST", "https://sparc-app.herokuapp.com")
-    SCI_CRUNCH_HOST = os.environ.get("SCICRUNCH_HOST", "https://scicrunch.org/api/1/elastic/SPARC_PortalDatasets_pr")
+    SCI_CRUNCH_HOST = os.environ.get("SCICRUNCH_HOST", "https://scicrunch.org/api/1/elastic/SPARC_PortalDatasets_dev")
     MAPSTATE_TABLENAME = os.environ.get("MAPSTATE_TABLENAME", "mapstates")
     SCAFFOLDSTATE_TABLENAME = os.environ.get("SCAFFOLDSTATE_TABLENAME", "scaffoldstates")
     WRIKE_TOKEN = os.environ.get("WRIKE_TOKEN")
diff --git a/app/main.py b/app/main.py
index b514258e..d1637ac7 100644
--- a/app/main.py
+++ b/app/main.py
@@ -30,6 +30,7 @@
 from app.utilities import img_to_base64_str
 from app.osparc import run_simulation
 from app.biolucida_process_results import process_results as process_biolucida_results
+from app.bfworker import BFWorker
 
 app = Flask(__name__)
 # set environment variable
@@ -39,6 +40,7 @@
 ma = Marshmallow(app)
 
 email_sender = EmailSender()
+bfWorker = BFWorker(None)
 
 ps = None
 s3 = boto3.client(
@@ -317,32 +319,63 @@ def presign_resource_url():
 
 
 # Reverse proxy for objects from S3, a simple get object
 # operation. This is used by scaffoldvuer and its
-# important to keep the relative for accessing
-# other required files.
+# # important to keep the relative for accessing
+# # other required files.
+# @app.route("/s3-resource/") +# def direct_download_url(path): +# print(path) +# head_response = s3.head_object( +# Bucket=Config.S3_BUCKET_NAME, +# Key=path, +# RequestPayer="requester" +# ) +# +# content_length = head_response.get('ContentLength', Config.DIRECT_DOWNLOAD_LIMIT) +# if content_length and content_length > Config.DIRECT_DOWNLOAD_LIMIT: # 20 MB +# return abort(413, description=f"File too big to download: {content_length}") +# +# response = s3.get_object( +# Bucket=Config.S3_BUCKET_NAME, +# Key=path, +# RequestPayer="requester" +# ) +# +# encode_base64 = request.args.get("encodeBase64") +# resource = response["Body"].read() +# if encode_base64 is not None: +# return base64.b64encode(resource) +# +# return resource + +# This version of s3-resouces is used for accessing files on staging. Use it as a replacement for 's3-resource' +# No changes are need on the front end, just use s3-resource as normal @app.route("/s3-resource/") def direct_download_url(path): - head_response = s3.head_object( - Bucket=Config.S3_BUCKET_NAME, - Key=path, - RequestPayer="requester" - ) - - content_length = head_response.get('ContentLength', Config.DIRECT_DOWNLOAD_LIMIT) - if content_length and content_length > Config.DIRECT_DOWNLOAD_LIMIT: # 20 MB - return abort(413, description=f"File too big to download: {content_length}") - - response = s3.get_object( - Bucket=Config.S3_BUCKET_NAME, - Key=path, - RequestPayer="requester" - ) - - encode_base64 = request.args.get("encodeBase64") - resource = response["Body"].read() - if encode_base64 is not None: - return base64.b64encode(resource) - - return resource + print(path) + filePath = path.split('files/')[-1] + discoverId = path.split('/')[0] + dataset_query = { + "size": 20, + "from": 0, + "query": { + "query_string": { + "fields": [ + "*pennsieve.identifier" + ], + "query": discoverId + } + }, + "_source": [ + "item.identifier" + ] + } + resp = dataset_search(dataset_query) + pennsieveId = resp['hits']['hits'][0]['_source']['item']['identifier'] + url = bfWorker.getURLFromDatasetIdAndFilePath(pennsieveId, filePath) + if url != None: + resp2 = requests.get(url) + return resp2.content + return jsonify({'error': 'error with the provided ID '}, status=502) @app.route("/scicrunch-dataset//") @@ -419,6 +452,30 @@ def get_dataset_info_discoverIds(): return process_results(dataset_search(query)) +@app.route('/urlFromPennsieveDatasetIdAndFilePath/') +def getFileUrlFromPennsieve(discoverId): + filePath = request.args.get('filePath') + dataset_query = { + "size": 20, + "from": 0, + "query": { + "query_string": { + "fields": [ + "*pennsieve.identifier" + ], + "query": discoverId + } + }, + "_source": [ + "item.identifier" + ] + } + resp = dataset_search(dataset_query) + pennsieveId = resp['hits']['hits'][0]['_source']['item']['identifier'] + url = bfWorker.getURLFromDatasetIdAndFilePath(pennsieveId, filePath) + if url != None: + return jsonify({'url': url}) + return jsonify({'error': 'error with the provided ID '}, status=502) @app.route("/dataset_info/using_title") def get_dataset_info_title(): diff --git a/app/scicrunch_process_results.py b/app/scicrunch_process_results.py index ae1508b3..36c2302f 100644 --- a/app/scicrunch_process_results.py +++ b/app/scicrunch_process_results.py @@ -1,7 +1,7 @@ import importlib import json import re - +from flask import jsonify # process_kb_results: Loop through SciCrunch results pulling out desired attributes and processing DOIs and CSV files def _prepare_results(results): @@ -13,6 +13,10 @@ def _prepare_results(results): 
@@ -13,6 +13,10 @@
         except KeyError:
             continue
 
+        if version >= '1.1.5':
+            print('WARNING! SciCrunch processing is out of date!')
+            version = '1.1.5'
+
         package_version = f'scicrunch_processing_v_{version.replace(".", "_")}'
         m = importlib.import_module(f'app.{package_version}')
         attributes_map = getattr(m, 'ATTRIBUTES_MAP')
@@ -21,6 +25,16 @@
         attr = _transform_attributes(attributes_map, hit)
         attr['doi'] = _convert_doi_to_url(attr['doi'])
         attr['took'] = results['took']
+        # find context files by looking through object mimetypes
+        attr['abi-contextual-information'] = [
+            file['dataset']['path']
+            for file in hit['_source']['objects']
+            if file['additional_mimetype']['name'].find('abi.context-information') != -1
+        ]
+        print([
+            file['additional_mimetype']['name']
+            for file in hit['_source']['objects']
+        ])
         try:
             attr['readme'] = hit['_source']['item']['readme']['description']
         except KeyError:
@@ -38,7 +52,7 @@
 
 
 def process_results(results):
-    return json.dumps({'numberOfHits': results['hits']['total'], 'results': _prepare_results(results)})
+    return jsonify({'numberOfHits': results['hits']['total'], 'results': _prepare_results(results)})
 
 
 def reform_dataset_results(results):
diff --git a/app/scicrunch_processing_common.py b/app/scicrunch_processing_common.py
index 631ad894..2e5aad48 100644
--- a/app/scicrunch_processing_common.py
+++ b/app/scicrunch_processing_common.py
@@ -13,6 +13,7 @@
 SCAFFOLD_FILE = 'abi-scaffold-metadata-file'
 SCAFFOLD_THUMBNAIL = 'abi-scaffold-thumbnail'
 SCAFFOLD_VIEW_FILE = 'abi-scaffold-view-file'
+CONTEXT_FILE = 'abi-context-file'
 VIDEO = 'video'
 VERSION = 'version'
 README = 'readme'
@@ -29,9 +30,11 @@
     'application/vnd.mbfbioscience.neurolucida+xml': SEGMENTATION_FILES,
     'inode/vnd.abi.scaffold+directory': SCAFFOLD_DIR,
     'inode/vnd.abi.scaffold+file': SCAFFOLD_FILE,
+    'application/x.vnd.abi.scaffold.meta+json': SCAFFOLD_FILE,
     'inode/vnd.abi.scaffold+thumbnail': SCAFFOLD_THUMBNAIL,
     'inode/vnd.abi.scaffold.thumbnail+file': SCAFFOLD_THUMBNAIL,
     'inode/vnd.abi.scaffold.view+file': SCAFFOLD_VIEW_FILE,
+    'application/x.vnd.abi.context-information+json': CONTEXT_FILE,
     'text/vnd.abi.plot+Tab-separated-values': PLOT_FILE,
     'text/vnd.abi.plot+csv': PLOT_FILE,
     'image/png': COMMON_IMAGES,
diff --git a/app/scicrunch_requests.py b/app/scicrunch_requests.py
index 5b0886c9..50144066 100644
--- a/app/scicrunch_requests.py
+++ b/app/scicrunch_requests.py
@@ -1,3 +1,4 @@
+import json
 def create_query_string(query_string):
     return {
         "from": 0,
@@ -20,6 +21,14 @@ def create_doi_query(doi):
     }
 
 def create_multiple_doi_query(dois, size=10, from_=0):
+    print(json.dumps({
+        "size": 999,
+        "query": {
+            "terms": {
+                "item.curie": dois
+            }
+        }
+    }))
     return {
         "size": 999,
         "query": {
diff --git a/tests/test_api.py b/tests/test_api.py
index fcfc07a1..ffd61939 100644
--- a/tests/test_api.py
+++ b/tests/test_api.py
@@ -22,6 +22,14 @@ def test_direct_download_url_small_file(client):
     assert r.status_code == 200
     assert b"proximal colon" in r.data
 
+def test_pennsieve_file_path_download(client):
+    colon_dataset_id = 76
+    colon_file_path = 'derivative%2Fscaffold_context_info.json'
+    r = client.get(f"/urlFromPennsieveDatasetIdAndFilePath/{colon_dataset_id}?filePath={colon_file_path}")
+    assert r.status_code == 200
+    assert 'url' in r.json
+
+
 def test_direct_download_url_thumbnail(client):
     small_s3_file = '95/1/files/derivative%2FScaffold%2Fthumbnail.png'
diff --git a/tests/test_scicrunch.py b/tests/test_scicrunch.py
index b5a0712b..d73f57a8 100644
--- a/tests/test_scicrunch.py
+++ b/tests/test_scicrunch.py
@@ -424,3 +424,10 @@ def test_scaffold_files(client):
     key = f"{uri}files/{path}".replace('s3://pennsieve-prod-discover-publish-use1/', '')
     r = client.get(f"/s3-resource/{key}")
     assert r.status_code == 200
+
+def test_finding_contextual_information(client):
+    r = client.get('/dataset_info/using_multiple_discoverIds/?discoverIds=76')
+    results = json.loads(r.data)
+    assert results['numberOfHits'] > 0  # Check that we found the generic colon scaffold dataset
+    for item in results['results']:
+        assert len(item['abi-contextual-information']) > 0  # Check that each hit has contextual information
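
To try the new endpoint by hand, the snippet below is a minimal sketch (not part of the patch); the host URL is an assumption for a local dev server, and dataset 76 is reused from the tests. requests percent-encodes the slash in filePath, matching the encoded path used in test_pennsieve_file_path_download.

import requests

API_HOST = "http://localhost:5000"  # assumed dev server; adjust to your deployment

# Resolve a Pennsieve file URL from a Discover dataset id plus a file path.
r = requests.get(
    f"{API_HOST}/urlFromPennsieveDatasetIdAndFilePath/76",
    params={"filePath": "derivative/scaffold_context_info.json"},
)
if r.ok:
    print(r.json()["url"])  # pre-signed Pennsieve file URL
else:
    print(r.status_code, r.json())  # 502 with an error payload on failure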
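Server-side, the same lookup can be driven through BFWorker directly; a sketch with a placeholder dataset id (real N:dataset ids come from the SciCrunch identifier query in main.py):

from app.bfworker import BFWorker

worker = BFWorker(None)  # the id argument is currently unused

# Hypothetical id for illustration only. getURLFromDatasetIdAndFilePath walks
# the dataset's collections segment by segment and returns a file URL, or
# None if the path cannot be resolved.
url = worker.getURLFromDatasetIdAndFilePath(
    "N:dataset:00000000-0000-0000-0000-000000000000",
    "derivative/scaffold_context_info.json",
)
print(url)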