Staging sparc api #23

Draft · wants to merge 8 commits into main
69 changes: 69 additions & 0 deletions app/bfworker.py
@@ -0,0 +1,69 @@
from pennsieve import Pennsieve
import pennsieve
from app.config import Config

class BFWorker(object):
def __init__(self, id):
# 'id' is currently unused; the client always authenticates with the configured credentials.
self.bf = Pennsieve(api_token=Config.PENNSIEVE_API_TOKEN, api_secret=Config.PENNSIEVE_API_SECRET)


def getCollectionAndMetaFromPackageId(self, packageId):
pkg = self.bf.get(packageId)
if type(pkg) is pennsieve.DataPackage:
colId = pkg.parent
col = self.bf.get(colId)
items = col.items
for item in items:
if packageId == item.id:
return [colId, item.name]
return None

def getURLFromCollectionIdAndFileName(self, collectionId, fileName):
col = self.bf.get(collectionId)
if type(col) is pennsieve.Collection:
items = col.items
for item in items:
if fileName == item.name:
pkg = item
try:
bfFile = pkg.files[0]
url = bfFile.url
return url
except Exception:
return None
return None

def getUrlfromPackageId(self, packageId):
pId = packageId
if ('N:' not in packageId):
pId = 'N:' + packageId
pk = self.bf.get(pId)
return pk.files[0].url

def getImagefromPackageId(self, packageId):
pId = packageId
if ('N:' not in packageId):
pId = 'N:' + packageId
pk = self.bf.get(pId)
# resp = requests.get(pk.files[0].url)
return pk.files[0].url if pk is not None else ''

def getURLFromDatasetIdAndFilePath(self, datasetId, filePath):
# Walk the dataset tree, matching each path component against item names,
# until a file (rather than a collection) is reached.
fileArray = filePath.split('/')
items = self.bf.get_dataset(datasetId).items
count = 0
while type(items) is list:
item = items[count]
for fileName in fileArray:
if fileName == item.name:
if type(item) is pennsieve.Collection:
items = item.items
count = -1
continue
else:
try:
return item.files[0].url
except Exception:
return None
count += 1
return None
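
For reviewers, a minimal usage sketch of the new helper (not part of this diff); it assumes valid Pennsieve credentials in Config, and the dataset id and file path below are made up:

# Illustrative sketch only -- not part of this PR. Assumes Config provides valid
# PENNSIEVE_API_TOKEN / PENNSIEVE_API_SECRET; the dataset id and path are invented.
from app.bfworker import BFWorker

worker = BFWorker(None)  # the id argument is currently unused

# Walks the dataset's collections following the path components and returns the
# first matching file's URL, or None if nothing matches.
url = worker.getURLFromDatasetIdAndFilePath(
    'N:dataset:00000000-0000-0000-0000-000000000000',  # hypothetical dataset id
    'derivative/scaffold_context_info.json',
)
if url is not None:
    print(url)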
2 changes: 1 addition & 1 deletion app/config.py
@@ -30,7 +30,7 @@ class Config(object):
KNOWLEDGEBASE_KEY = os.environ.get("KNOWLEDGEBASE_KEY", "secret-key")
DEPLOY_ENV = os.environ.get("DEPLOY_ENV", "development")
SPARC_APP_HOST = os.environ.get("SPARC_APP_HOST", "https://sparc-app.herokuapp.com")
SCI_CRUNCH_HOST = os.environ.get("SCICRUNCH_HOST", "https://scicrunch.org/api/1/elastic/SPARC_PortalDatasets_pr")
SCI_CRUNCH_HOST = os.environ.get("SCICRUNCH_HOST", "https://scicrunch.org/api/1/elastic/SPARC_PortalDatasets_dev")
MAPSTATE_TABLENAME = os.environ.get("MAPSTATE_TABLENAME", "mapstates")
SCAFFOLDSTATE_TABLENAME = os.environ.get("SCAFFOLDSTATE_TABLENAME", "scaffoldstates")
WRIKE_TOKEN = os.environ.get("WRIKE_TOKEN")
105 changes: 81 additions & 24 deletions app/main.py
@@ -30,6 +30,7 @@
from app.utilities import img_to_base64_str
from app.osparc import run_simulation
from app.biolucida_process_results import process_results as process_biolucida_results
from app.bfworker import BFWorker

app = Flask(__name__)
# set environment variable
@@ -39,6 +40,7 @@

ma = Marshmallow(app)
email_sender = EmailSender()
bfWorker = BFWorker(None)

ps = None
s3 = boto3.client(
@@ -317,32 +319,63 @@ def presign_resource_url():

# Reverse proxy for objects from S3, a simple get object
# operation. This is used by scaffoldvuer and its
# important to keep the relative <path> for accessing
# other required files.
# # important to keep the relative <path> for accessing
# # other required files.
# @app.route("/s3-resource/<path:path>")
# def direct_download_url(path):
# print(path)
# head_response = s3.head_object(
# Bucket=Config.S3_BUCKET_NAME,
# Key=path,
# RequestPayer="requester"
# )
#
# content_length = head_response.get('ContentLength', Config.DIRECT_DOWNLOAD_LIMIT)
# if content_length and content_length > Config.DIRECT_DOWNLOAD_LIMIT: # 20 MB
# return abort(413, description=f"File too big to download: {content_length}")
#
# response = s3.get_object(
# Bucket=Config.S3_BUCKET_NAME,
# Key=path,
# RequestPayer="requester"
# )
#
# encode_base64 = request.args.get("encodeBase64")
# resource = response["Body"].read()
# if encode_base64 is not None:
# return base64.b64encode(resource)
#
# return resource

# This version of 's3-resource' is used for accessing files on staging. Use it as a replacement for 's3-resource'.
# No changes are needed on the front end; just use s3-resource as normal.
@app.route("/s3-resource/<path:path>")
def direct_download_url(path):
head_response = s3.head_object(
Bucket=Config.S3_BUCKET_NAME,
Key=path,
RequestPayer="requester"
)

content_length = head_response.get('ContentLength', Config.DIRECT_DOWNLOAD_LIMIT)
if content_length and content_length > Config.DIRECT_DOWNLOAD_LIMIT: # 20 MB
return abort(413, description=f"File too big to download: {content_length}")

response = s3.get_object(
Bucket=Config.S3_BUCKET_NAME,
Key=path,
RequestPayer="requester"
)

encode_base64 = request.args.get("encodeBase64")
resource = response["Body"].read()
if encode_base64 is not None:
return base64.b64encode(resource)

return resource
print(path)
filePath = path.split('files/')[-1]
discoverId = path.split('/')[0]
dataset_query = {
"size": 20,
"from": 0,
"query": {
"query_string": {
"fields": [
"*pennsieve.identifier"
],
"query": discoverId
}
},
"_source": [
"item.identifier"
]
}
resp = dataset_search(dataset_query)
pennsieveId = resp['hits']['hits'][0]['_source']['item']['identifier']
url = bfWorker.getURLFromDatasetIdAndFilePath(pennsieveId, filePath)
if url is not None:
resp2 = requests.get(url)
return resp2.content
return jsonify({'error': 'error with the provided ID'}), 502


@app.route("/scicrunch-dataset/<doi1>/<doi2>")
@@ -419,6 +452,30 @@ def get_dataset_info_discoverIds():

return process_results(dataset_search(query))

@app.route('/urlFromPennsieveDatasetIdAndFilePath/<discoverId>')
def getFileUrlFromPennsieve(discoverId):
filePath = request.args.get('filePath')
dataset_query = {
"size": 20,
"from": 0,
"query": {
"query_string": {
"fields": [
"*pennsieve.identifier"
],
"query": discoverId
}
},
"_source": [
"item.identifier"
]
}
resp = dataset_search(dataset_query)
pennsieveId = resp['hits']['hits'][0]['_source']['item']['identifier']
url = bfWorker.getURLFromDatasetIdAndFilePath(pennsieveId, filePath)
if url is not None:
return jsonify({'url': url})
return jsonify({'error': 'error with the provided ID'}), 502

@app.route("/dataset_info/using_title")
def get_dataset_info_title():
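
For reference, a rough sketch of how a client might exercise the two routes added above; the host, Discover id and file path are assumptions, not taken from this diff:

# Illustrative sketch only -- not part of this PR. Host, Discover id and file
# path are assumptions for demonstration.
import requests

BASE = 'http://localhost:5000'  # hypothetical local sparc-api instance

# Resolve a Pennsieve file URL from a Discover dataset id plus a file path.
r = requests.get(
    f'{BASE}/urlFromPennsieveDatasetIdAndFilePath/76',
    params={'filePath': 'derivative/scaffold_context_info.json'},
)
print(r.json().get('url'))

# The reworked /s3-resource route proxies the file body itself.
r = requests.get(f'{BASE}/s3-resource/76/1/files/derivative%2Fscaffold_context_info.json')
print(r.status_code, len(r.content))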
18 changes: 16 additions & 2 deletions app/scicrunch_process_results.py
@@ -1,7 +1,7 @@
import importlib
import json
import re

from flask import jsonify

# process_kb_results: Loop through SciCrunch results pulling out desired attributes and processing DOIs and CSV files
def _prepare_results(results):
@@ -13,6 +13,10 @@ def _prepare_results(results):
except KeyError:
continue

if version >= '1.1.5':
print('WARNING! SciCrunch processing is out of date!')
version = '1.1.5'

package_version = f'scicrunch_processing_v_{version.replace(".", "_")}'
m = importlib.import_module(f'app.{package_version}')
attributes_map = getattr(m, 'ATTRIBUTES_MAP')
@@ -21,6 +25,16 @@ def _prepare_results(results):
attr = _transform_attributes(attributes_map, hit)
attr['doi'] = _convert_doi_to_url(attr['doi'])
attr['took'] = results['took']
# find context files by looking through object mimetypes
attr['abi-contextual-information'] = [
file['dataset']['path']
for file in hit['_source']['objects']
if file['additional_mimetype']['name'].find('abi.context-information') != -1
]
print([
file['additional_mimetype']['name']
for file in hit['_source']['objects']
])
try:
attr['readme'] = hit['_source']['item']['readme']['description']
except KeyError:
@@ -38,7 +52,7 @@ def _prepare_results(results):


def process_results(results):
return json.dumps({'numberOfHits': results['hits']['total'], 'results': _prepare_results(results)})
return jsonify({'numberOfHits': results['hits']['total'], 'results': _prepare_results(results)})


def reform_dataset_results(results):
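
To make the new 'abi-contextual-information' field concrete, a self-contained sketch of the mimetype filter run against a hand-built hit; field names mirror the diff, values are invented:

# Illustrative sketch only -- the hit below is hand-built; field names mirror
# the ones used in _prepare_results, values are invented.
hit = {
    '_source': {
        'objects': [
            {'additional_mimetype': {'name': 'application/x.vnd.abi.context-information+json'},
             'dataset': {'path': 'derivative/scaffold_context_info.json'}},
            {'additional_mimetype': {'name': 'image/png'},
             'dataset': {'path': 'derivative/thumbnail.png'}},
        ]
    }
}

# Keep only the paths whose mimetype marks them as ABI context information.
context_paths = [
    obj['dataset']['path']
    for obj in hit['_source']['objects']
    if obj['additional_mimetype']['name'].find('abi.context-information') != -1
]
assert context_paths == ['derivative/scaffold_context_info.json']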
3 changes: 3 additions & 0 deletions app/scicrunch_processing_common.py
@@ -13,6 +13,7 @@
SCAFFOLD_FILE = 'abi-scaffold-metadata-file'
SCAFFOLD_THUMBNAIL = 'abi-scaffold-thumbnail'
SCAFFOLD_VIEW_FILE = 'abi-scaffold-view-file'
CONTEXT_FILE = 'abi-context-file'
VIDEO = 'video'
VERSION = 'version'
README = 'readme'
@@ -29,9 +30,11 @@
'application/vnd.mbfbioscience.neurolucida+xml': SEGMENTATION_FILES,
'inode/vnd.abi.scaffold+directory': SCAFFOLD_DIR,
'inode/vnd.abi.scaffold+file': SCAFFOLD_FILE,
'application/x.vnd.abi.scaffold.meta+json': SCAFFOLD_FILE,
'inode/vnd.abi.scaffold+thumbnail': SCAFFOLD_THUMBNAIL,
'inode/vnd.abi.scaffold.thumbnail+file': SCAFFOLD_THUMBNAIL,
'inode/vnd.abi.scaffold.view+file': SCAFFOLD_VIEW_FILE,
'application/x.vnd.abi.context-information+json': CONTEXT_FILE,
'text/vnd.abi.plot+Tab-separated-values': PLOT_FILE,
'text/vnd.abi.plot+csv': PLOT_FILE,
'image/png': COMMON_IMAGES,
9 changes: 9 additions & 0 deletions app/scicrunch_requests.py
@@ -1,3 +1,4 @@
import json
def create_query_string(query_string):
return {
"from": 0,
@@ -20,6 +21,14 @@ def create_doi_query(doi):
}

def create_multiple_doi_query(dois, size=10, from_=0):
print(json.dumps({
"size": 999,
"query": {
"terms": {
"item.curie": dois
}
}
}))
return {
"size": 999,
"query": {
8 changes: 8 additions & 0 deletions tests/test_api.py
@@ -22,6 +22,14 @@ def test_direct_download_url_small_file(client):
assert r.status_code == 200
assert b"proximal colon" in r.data

def test_pennsieve_file_path_download(client):
colon_dataset_id = 76
colon_file_path = 'derivative%2Fscaffold_context_info.json'
r = client.get(f"/urlFromPennsieveDatasetIdAndFilePath/{colon_dataset_id}?filePath={colon_file_path}")
assert r.status_code == 200
assert 'url' in r.json



def test_direct_download_url_thumbnail(client):
small_s3_file = '95/1/files/derivative%2FScaffold%2Fthumbnail.png'
7 changes: 7 additions & 0 deletions tests/test_scicrunch.py
@@ -424,3 +424,10 @@ def test_scaffold_files(client):
key = f"{uri}files/{path}".replace('s3://pennsieve-prod-discover-publish-use1/', '')
r = client.get(f"/s3-resource/{key}")
assert r.status_code == 200

def test_finding_contextual_information(client):
r = client.get('/dataset_info/using_multiple_discoverIds/?discoverIds=76')
results = json.loads(r.data)
assert results['numberOfHits'] > 0 # Test we could find the generic colon scaffold dataset
for item in results['results']:
assert len(item['abi-contextual-information']) > 0 # Check it has contextual information