diff --git a/app/config.py b/app/config.py
index 4b2fa5cb..bb89b043 100644
--- a/app/config.py
+++ b/app/config.py
@@ -31,7 +31,7 @@ class Config(object):
     KNOWLEDGEBASE_KEY = os.environ.get("KNOWLEDGEBASE_KEY", "secret-key")
     DEPLOY_ENV = os.environ.get("DEPLOY_ENV", "development")
     SPARC_APP_HOST = os.environ.get("SPARC_APP_HOST", "https://sparc-app.herokuapp.com")
-    SCI_CRUNCH_HOST = os.environ.get("SCICRUNCH_HOST", "https://scicrunch.org/api/1/elastic/SPARC_Datasets_pr")
+    SCI_CRUNCH_HOST = os.environ.get("SCICRUNCH_HOST", "https://scicrunch.org/api/1/elastic/SPARC_PortalDatasets_dev")
     MAPSTATE_TABLENAME = os.environ.get("MAPSTATE_TABLENAME", "mapstates")
     SCAFFOLDSTATE_TABLENAME = os.environ.get("SCAFFOLDSTATE_TABLENAME", "scaffoldstates")
     WRIKE_TOKEN = os.environ.get("WRIKE_TOKEN")
diff --git a/app/main.py b/app/main.py
index 178ffd67..91e408b3 100644
--- a/app/main.py
+++ b/app/main.py
@@ -210,6 +210,63 @@ def direct_download_url(path):
     resource = response["Body"].read()
     return resource
 
+# /scicrunch-dataset/<doi1>/<doi2>: Returns the raw SciCrunch result for a given DOI
+@app.route("/scicrunch-dataset/<doi1>/<doi2>")
+def sci_doi(doi1, doi2):
+    doi = doi1 + '/' + doi2
+    print(doi)
+    data = create_doi_request(doi)
+    try:
+        response = requests.post(
+            f'{Config.SCI_CRUNCH_HOST}/_search?api_key={Config.KNOWLEDGEBASE_KEY}',
+            json=data)
+        return response.json()
+    except requests.exceptions.HTTPError as err:
+        logging.error(err)
+        return json.dumps({'error': str(err)})
+
+# /scicrunch-dataset-processed/<doi1>/<doi2>: Returns processed SciCrunch results for a given DOI
+@app.route("/scicrunch-dataset-processed/<doi1>/<doi2>")
+def sci_doi_processed(doi1, doi2):
+    doi = doi1 + '/' + doi2
+    print(doi)
+    data = create_doi_request(doi)
+    try:
+        response = requests.post(
+            f'{Config.SCI_CRUNCH_HOST}/_search?api_key={Config.KNOWLEDGEBASE_KEY}',
+            json=data)
+        return process_kb_results(response.json())
+    except requests.exceptions.HTTPError as err:
+        logging.error(err)
+        return json.dumps({'error': str(err)})
+
+# /scicrunch-query-string/: Returns results for a given organ curie. These can be processed by the sidebar
+@app.route("/scicrunch-query-string/")
+def sci_organ():
+    fields = request.args.getlist('field')
+    curie = request.args.get('curie')
+    # field example: "*organ.curie"
+    data = {
+        "size": 20,
+        "from": 0,
+        "query": {
+            "query_string": {
+                "fields": fields,
+                "query": curie
+            }
+        }
+    }
+
+    try:
+        response = requests.post(
+            f'{Config.SCI_CRUNCH_HOST}/_search?api_key={Config.KNOWLEDGEBASE_KEY}',
+            json=data)
+        return process_kb_results(response.json())
+    except requests.exceptions.HTTPError as err:
+        logging.error(err)
+        return json.dumps({'error': str(err)})
+
+
 
 # /search/: Returns scicrunch results for a given query
 @app.route("/search/", defaults={'query': ''})
diff --git a/app/process_kb_results.py b/app/process_kb_results.py
index 86320309..53094bbe 100644
--- a/app/process_kb_results.py
+++ b/app/process_kb_results.py
@@ -17,6 +17,23 @@
     'csvFiles': ['objects']
 }
 
+def create_doi_request(doi):
+
+    query = {
+        "query": {
+            "bool": {
+                "must": [{"match_all": {}}],
+                "should": [],
+                "filter": {
+                    "term": {
+                        "_id": f'DOI:{doi}'
+                    }
+                }
+            }
+        }
+    }
+
+    return query
 
 # create_facet_query(type): Generates facet search request data for scicrunch given a 'type'; where
 # 'type' is either 'species', 'gender', or 'genotype' at this stage.
@@ -25,7 +42,8 @@ def create_facet_query(type):
     type_map = {
         'species': ['organisms.primary.species.name.aggregate', 'organisms.sample.species.name.aggregate'],
         'gender': ['attributes.subject.sex.value'],
-        'genotype': ['anatomy.organ.name.aggregate']
+        'genotype': ['anatomy.organ.name.aggregate'],
+        'organ': ['anatomy.organ.name.aggregate']
     }
 
     data = {
@@ -69,7 +87,8 @@ def create_filter_request(query, terms, facets, size, start):
     type_map = {
         'species': ['organisms.primary.species.name.aggregate', 'organisms.sample.species.name'],
         'gender': ['attributes.subject.sex.value', 'attributes.sample.sex.value'],
-        'genotype': ['anatomy.organ.name.aggregate']
+        'genotype': ['anatomy.organ.name.aggregate'],
+        'organ': ['anatomy.organ.name.aggregate']
     }
 
     # Data structure of a scicrunch search
@@ -136,7 +155,9 @@ def process_kb_results(results):
     for i, hit in enumerate(hits):
         attr = get_attributes(attributes, hit)
         attr['doi'] = convert_doi_to_url(attr['doi'])
-        attr['csvFiles'] = find_csv_files(attr['csvFiles'])
+        objects = attr['csvFiles']  # Have to do this as not all datasets return objects
+        attr['csvFiles'] = find_csv_files(objects)
+        attr['scaffolds'] = find_scaffold_json_files(objects)
         output.append(attr)
 
     return json.dumps({'numberOfHits': results['hits']['total'], 'results': output})
@@ -146,11 +167,36 @@ def convert_doi_to_url(doi):
         return doi
     return doi.replace('DOI:', 'https://doi.org/')
 
+def convert_url_to_doi(doi):
+    if not doi:
+        return doi
+    return doi.replace('https://doi.org/', 'DOI:')
+
 
 def find_csv_files(obj_list):
     if not obj_list:
         return obj_list
-    return [obj for obj in obj_list if obj.get('mimetype', 'none') == 'text/csv']
+    return [obj for obj in obj_list if obj.get('mimetype', {}).get('name', 'none') == 'text/csv']
+
+
+def find_scaffold_json_files(obj_list):
+    if not obj_list:
+        return obj_list
+    return [obj for obj in obj_list if obj.get('additional_mimetype', {}).get('name', 'none') == 'inode/vnd.abi.scaffold+file']
+
+
+attributes = {
+    'scaffolds': ['scaffolds'],
+    'samples': ['attributes', 'sample', 'subject'],
+    'name': ['item', 'name'],
+    'identifier': ['item', 'identifier'],
+    'uri': ['distributions', 'current', 'uri'],
+    'updated': ['dates', 'updated'],
+    'organs': ['anatomy', 'organ'],
+    'contributors': ['contributors'],
+    'doi': ['item', 'curie'],
+    'csvFiles': ['objects']
+}
 
 
 # get_attributes: Use 'attributes' (defined at top of this document) to step through the large scicrunch result dict
@@ -160,11 +206,12 @@ def get_attributes(attributes, dataset):
     for k, attr in attributes.items():
         subset = dataset['_source']  # set our subset to the full dataset result
         key_attr = False
-        for key in attr:
+        for n, key in enumerate(attr):  # step through attributes
             if isinstance(subset, dict):
-                if key in subset.keys():
+                if key in subset.keys():  # continue if keys are found
                     subset = subset[key]
-                    key_attr = subset
+                    if n + 1 == len(attr):  # if we made it to the end, save this subset
+                        key_attr = subset
         found_attr[k] = key_attr
 
     return found_attr
diff --git a/tests/test_scicrunch.py b/tests/test_scicrunch.py
index bdc650e0..bd68c253 100644
--- a/tests/test_scicrunch.py
+++ b/tests/test_scicrunch.py
@@ -13,6 +13,11 @@ def test_scicrunch_keys(client):
     assert r.status_code == 200
     assert 'numberOfHits' in json.loads(r.data).keys()
 
+def test_scicrunch_dataset_doi(client):
+    r = client.get('/scicrunch-dataset/DOI%3A10.26275%2Fpzek-91wx')
+    assert json.loads(r.data)['hits']['hits'][0]['_id'] == "DOI:10.26275/pzek-91wx"
+
+
 def test_scicrunch_search(client):
     r = client.get('/search/heart')
     assert r.status_code == 200
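
Review note, not part of the diff: the new DOI lookup is a `match_all` query constrained by a `term` filter on the dataset's Elasticsearch `_id`. A minimal sketch of the payload `create_doi_request` builds, assuming `app.process_kb_results` is importable as in the test suite:

```python
from app.process_kb_results import create_doi_request

# The DOI arrives split across two path segments; sci_doi re-joins them
# before building the request body.
body = create_doi_request('10.26275/pzek-91wx')
assert body == {
    "query": {
        "bool": {
            "must": [{"match_all": {}}],
            "should": [],
            "filter": {"term": {"_id": "DOI:10.26275/pzek-91wx"}}
        }
    }
}
```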
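
Review note, not part of the diff: the `get_attributes` change means a value is kept only when the entire key path resolves; previously any resolved prefix leaked into `key_attr`. A standalone sketch of the corrected walk, using a hypothetical helper and made-up `_source` fragments:

```python
def walk(source, path):
    # Follow `path` one key at a time; return False unless every key resolves.
    subset = source
    for key in path:
        if not (isinstance(subset, dict) and key in subset):
            return False
        subset = subset[key]
    return subset  # only reached when the full path resolved

assert walk({'item': {'name': 'heart scaffold'}}, ['item', 'name']) == 'heart scaffold'
assert walk({'item': {}}, ['item', 'name']) is False  # partial match no longer returns a value
```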