Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Dataverse data access module #124

Draft
wants to merge 23 commits into
base: dev
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 6 additions & 2 deletions docs/src/en/alphafold.md
Original file line number Diff line number Diff line change
Expand Up @@ -67,5 +67,9 @@ gget.pdb("2K42", save=True)

<iframe width="560" height="315" src="https://www.youtube.com/embed/4qxGF1tbZ3I?si=mEqQ5oSnDYtg2OP7" title="YouTube video player" frameborder="0" allow="accelerometer; autoplay; clipboard-write; encrypted-media; gyroscope; picture-in-picture; web-share" allowfullscreen></iframe>

### [Example in Google Colab](https://github.com/pachterlab/gget_examples/blob/main/gget_alphafold.ipynb)
### [gget alphafold FAQ](https://github.com/pachterlab/gget/discussions/39)
# Tutorials
### [🔗 Google Colab tutorial](https://github.com/pachterlab/gget_examples/blob/main/gget_alphafold.ipynb)

### [🔗 Protein structure prediction with comparison to related crystal structures](https://github.com/pachterlab/gget_examples/blob/main/protein_structure_prediction_comparison.ipynb)

### [🔗 gget alphafold FAQ](https://github.com/pachterlab/gget/discussions/39)
7 changes: 6 additions & 1 deletion docs/src/en/elm.md
Original file line number Diff line number Diff line change
Expand Up @@ -87,4 +87,9 @@ regex_df:
|ELME000231 |DEG_APCC_DBOX_1 |APCC-binding Destruction motifs|DEG |An RxxL-based motif that binds to the Cdh1 and Cdc20 components of APC/C thereby targeting the protein for destruction in a cell cycle dependent manner|SRVKLNIVR |Saccharomyces cerevisiae S288c|… |
|… |… |… |… |… |… |… |… |

#### [More examples](https://github.com/pachterlab/gget_examples)
# Tutorials
### [🔗 General `gget elm` demo](https://github.com/pachterlab/gget_examples/blob/main/gget_elm_demo.ipynb)

### [🔗 A point mutation in BRCA2 is carcinogenic due to the loss of a protein interaction motif](https://github.com/pachterlab/gget_examples/blob/main/gget_elm_BRCA2_example.ipynb)

### [🔗 Filter `gget elm` results based on disordered protein regions](https://github.com/pachterlab/gget_examples/blob/main/gget_elm_IUPred3_tutorial.ipynb)
3 changes: 2 additions & 1 deletion docs/src/en/enrichr.md
Original file line number Diff line number Diff line change
Expand Up @@ -220,4 +220,5 @@ df |>
xlab("-log10(adjusted P value)")
```

#### [More examples](https://github.com/pachterlab/gget_examples)
# Tutorials
### [🔗 Using `gget enrichr` with background genes](https://github.com/pachterlab/gget_examples/blob/main/gget_enrichr_with_background_genes.ipynb)
1 change: 1 addition & 0 deletions gget/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@
from .gget_diamond import diamond
from .gget_cosmic import cosmic
from .gget_mutate import mutate
from .gget_dataverse import dataverse
from .gget_opentargets import opentargets
from .gget_cbio import cbio_plot, cbio_search
from .gget_bgee import bgee
Expand Down
3 changes: 3 additions & 0 deletions gget/constants.py
Original file line number Diff line number Diff line change
Expand Up @@ -63,6 +63,9 @@
COSMIC_GET_URL = "https://cancer.sanger.ac.uk/cosmic/search/"
COSMIC_RELEASE_URL = "https://cancer.sanger.ac.uk/cosmic/release_notes"

# Harvard dataverse API server
DATAVERSE_GET_URL = "https://dataverse.harvard.edu/api/access/datafile/"

# OpenTargets API endpoint
OPENTARGETS_GRAPHQL_API = "https://api.platform.opentargets.org/api/v4/graphql"

Expand Down
125 changes: 125 additions & 0 deletions gget/gget_dataverse.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,125 @@
import os
import requests
from tqdm import tqdm
import urllib.request
import json
from .utils import print_sys
from .constants import DATAVERSE_GET_URL

def dataverse_downloader(url, path, file_name):
    """Download a single Dataverse datafile to disk with a progress bar.

    Args:
        url (str): The URL of the datafile to download.
        path (str): The directory in which to save the file.
        file_name (str): The name under which the file is saved locally.

    Raises:
        requests.HTTPError: If the server responds with an error status
            (prevents silently writing an HTML error page to disk).
    """
    save_path = os.path.join(path, file_name)
    # Stream the download so large files are never held in memory at once;
    # the context manager guarantees the connection is released.
    with requests.get(url, stream=True) as response:
        # Fail fast on HTTP errors instead of saving the error body as data
        response.raise_for_status()
        # Content-Length may be absent (e.g. chunked transfer encoding);
        # tqdm then falls back to an indeterminate total of 0.
        total_size_in_bytes = int(response.headers.get("content-length", 0))
        block_size = 1024
        with tqdm(total=total_size_in_bytes, unit="iB", unit_scale=True) as progress_bar:
            with open(save_path, "wb") as file:
                for data in response.iter_content(block_size):
                    progress_bar.update(len(data))
                    file.write(data)


def download_wrapper(entry, path, return_type=None):
    """Download one dataset entry to `path`, skipping files already present.

    Args:
        entry (dict): The dataset entry to download. Must include 'id',
            'name', and 'type' keys.
        path (str): The directory in which to save the dataset.
        return_type (str or list, optional): What to return. Defaults to
            None (return nothing). Can be "url", "filename", or
            ["url", "filename"].

    Returns:
        str, tuple, or None: The download URL, the local filename, both,
        or None, depending on `return_type`.
    """
    url = DATAVERSE_GET_URL + str(entry["id"])

    # Create the target directory (including any missing parents);
    # exist_ok avoids the check-then-create race of os.path.exists + mkdir.
    os.makedirs(path, exist_ok=True)

    filename = f"{entry['name']}.{entry['type']}"

    if os.path.exists(os.path.join(path, filename)):
        # Local copy already present — nothing to download.
        print_sys(f"Found local copy for {entry['id']} datafile as (unknown) ...")
    else:
        print_sys(f"Downloading {entry['id']} datafile as (unknown) ...")
        dataverse_downloader(url, path, filename)

    if return_type == "url":
        return url
    if return_type == "filename":
        return filename
    if return_type == ["url", "filename"]:
        return url, filename


def process_local_json(filename):
    """Load a local JSON file.

    Args:
        filename (str): The path to the local JSON file.

    Returns:
        dict: The parsed contents of the JSON file.

    Raises:
        FileNotFoundError: If `filename` does not exist.
        json.JSONDecodeError: If the file is not valid JSON.
    """
    # Context manager ensures the file handle is closed even on parse errors
    # (the original left it open for the lifetime of the process).
    with open(filename, "r", encoding="utf-8") as f:
        return json.load(f)


def process_remote_json(url, save=False):
    """Fetch and parse a remote JSON file.

    Args:
        url (str): The URL of the remote JSON file.
        save (str or bool, optional): If truthy, a path at which to save
            the fetched JSON locally. Defaults to False (do not save).

    Returns:
        dict: The parsed contents of the remote JSON file.
    """
    # Context manager ensures the HTTP connection is closed after reading
    # (the original never closed the response).
    with urllib.request.urlopen(url) as response:
        data = json.loads(response.read())

    # Optionally persist a local copy; `save` doubles as the output filename.
    if save:
        with open(save, "w", encoding="utf-8") as f:
            json.dump(data, f, ensure_ascii=False)

    return data


def dataverse(data, path=None, run_download=False, save_json=None):
    """Process a JSON description of Dataverse datasets and optionally download them.

    Args:
        data (str or dict): The datasets to download, described in JSON
            format. A URL, a path to a local .json file, or a Python dict.
        path (str, optional): The directory in which to save the datasets.
            Required when run_download=True. Defaults to None.
        run_download (bool, optional): Whether to download the datasets.
            Defaults to False.
        save_json (str, optional): Path at which to save a local copy of a
            remotely fetched JSON file. Defaults to None.

    Raises:
        ValueError: If `data` is not a URL, .json path, or dict; if it lacks
            a 'datasets' key; or if run_download=True without a `path`.
    """
    # Resolve `data` to a dict. Check isinstance first: the original ran the
    # substring test `"https" in data` before knowing `data` was a string,
    # which raises TypeError for non-str, non-dict input.
    if isinstance(data, str):
        if "https" in data or "http" in data:
            data = process_remote_json(data, save=save_json)
        elif ".json" in data:
            data = process_local_json(data)
        else:
            raise ValueError(
                "String input must be a URL or a path to a .json file"
            )
    elif not isinstance(data, dict):
        raise ValueError(
            "data must be a URL, a path to a .json file, or a dictionary"
        )

    if "datasets" not in data:
        # TODO: Add more error handling
        raise ValueError("The json file must include proper 'datasets' key")

    if run_download:
        if not path:
            # Original message told the user to "set run_download=True" even
            # though it already was; only the missing path is the problem.
            raise ValueError("Please provide a path to save the datasets")
        for entry in data["datasets"]:
            download_wrapper(entry, path)
9 changes: 9 additions & 0 deletions gget/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,7 @@
from .gget_diamond import diamond
from .gget_cosmic import cosmic
from .gget_mutate import mutate
from .gget_dataverse import dataverse
from .gget_opentargets import opentargets, OPENTARGETS_RESOURCES
from .gget_cbio import cbio_plot, cbio_search
from .gget_bgee import bgee
Expand Down Expand Up @@ -3262,6 +3263,14 @@ def main():
else:
print(json.dumps(pdb_results, ensure_ascii=False, indent=4))

## dataverse return
if args.command == "dataverse":
dataverse(
data = args.json,
path = args.out,
run_download=True,
save_json=args.out + 'dataverse.json'
)
## opentargets return
if args.command == "opentargets":
flag_to_filter_id = {
Expand Down
10 changes: 10 additions & 0 deletions gget/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
# from requests.adapters import HTTPAdapter, Retry
# import time
import re
import sys
import os
import uuid
import pandas as pd
Expand Down Expand Up @@ -59,6 +60,15 @@ def set_up_logger():
logger = set_up_logger()


def print_sys(s):
    """Emit a system message on stderr, flushing immediately.

    Args:
        s (str): The message to print.
    """
    sys.stderr.write(f"{s}\n")
    sys.stderr.flush()


def flatten(xss):
"""
Function to flatten a list of lists.
Expand Down