From eea9f8e43a22a4336ec843715741409cf3f1ca51 Mon Sep 17 00:00:00 2001 From: Lili Meszaros Date: Tue, 20 Feb 2024 16:09:04 +0000 Subject: [PATCH 1/4] RT728517: fix order based on portal api query --- python3/enaDataGet.py | 2 +- python3/enaGroupGet.py | 2 +- python3/utils.py | 14 +++++++------- 3 files changed, 9 insertions(+), 9 deletions(-) diff --git a/python3/enaDataGet.py b/python3/enaDataGet.py index c0af1fa..ef37afd 100644 --- a/python3/enaDataGet.py +++ b/python3/enaDataGet.py @@ -52,7 +52,7 @@ def set_parser(): parser.add_argument('-as', '--aspera-settings', default=None, help="""Use the provided settings file, will otherwise check for environment variable or default settings file location.""") - parser.add_argument('-v', '--version', action='version', version='%(prog)s 1.7.0') + parser.add_argument('-v', '--version', action='version', version='%(prog)s 1.7.1') return parser diff --git a/python3/enaGroupGet.py b/python3/enaGroupGet.py index 4d88f9e..55a498c 100644 --- a/python3/enaGroupGet.py +++ b/python3/enaGroupGet.py @@ -58,7 +58,7 @@ def set_parser(): for environment variable or default settings file location.""") parser.add_argument('-t', '--subtree', action='store_true', help='Include subordinate taxa (taxon subtree) when querying with NCBI tax ID (default is false)') - parser.add_argument('-v', '--version', action='version', version='%(prog)s 1.7.0') + parser.add_argument('-v', '--version', action='version', version='%(prog)s 1.7.1') return parser diff --git a/python3/utils.py b/python3/utils.py index ca9dcbc..b3e4ca4 100644 --- a/python3/utils.py +++ b/python3/utils.py @@ -590,15 +590,15 @@ def split_filelist(filelist_string): def parse_file_search_result_line(line, accession, output_format): cols = line.split('\t') - data_acc = cols[0].strip() - sub_filelist = split_filelist(cols[1]) - sub_md5list = split_filelist(cols[2]) + data_acc = cols[-1].strip() + sub_filelist = split_filelist(cols[0]) + sub_md5list = split_filelist(cols[1]) if is_analysis(accession): return data_acc, sub_filelist, sub_md5list - sra_filelist = split_filelist(cols[3]) - sra_md5list = split_filelist(cols[4]) - fastq_filelist = split_filelist(cols[5]) - fastq_md5list = split_filelist(cols[6]) + sra_filelist = split_filelist(cols[2]) + sra_md5list = split_filelist(cols[3]) + fastq_filelist = split_filelist(cols[4]) + fastq_md5list = split_filelist(cols[5]) if output_format is None: if len(sub_filelist) > 0: output_format = SUBMITTED_FORMAT From 703ac4217854cb12f88ed2fa2b8a640b3c7791a6 Mon Sep 17 00:00:00 2001 From: Lili Meszaros Date: Wed, 21 Feb 2024 10:12:11 +0000 Subject: [PATCH 2/4] RT728517: add example --- python3/utils.py | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/python3/utils.py b/python3/utils.py index b3e4ca4..27bab25 100644 --- a/python3/utils.py +++ b/python3/utils.py @@ -590,6 +590,15 @@ def split_filelist(filelist_string): def parse_file_search_result_line(line, accession, output_format): cols = line.split('\t') + # example: + # submitted_ftp submitted_md5 sra_ftp sra_md5 fastq_ftp fastq_md5 run_accession + # ftp.sra.ebi.ac.uk/vol1/run/ERR251/ERR2512031/20104421_S5_L999_R1_001.fastq.gz;ftp.sra.ebi.ac.uk/vol1/run/ERR251/ERR2512031/20104421_S5_L999_R2_001.fastq.gz + # 5267a0aa15395983b08318af330bfe47;e351cea6ed9d2f45f2d5fc01238789e5 + # ftp.sra.ebi.ac.uk/vol1/err/ERR251/001/ERR2512031 + # 1cf4167dfebec580f3a9f8927c546cc7 + # ftp.sra.ebi.ac.uk/vol1/fastq/ERR251/001/ERR2512031/ERR2512031_1.fastq.gz;ftp.sra.ebi.ac.uk/vol1/fastq/ERR251/001/ERR2512031/ERR2512031_2.fastq.gz + # 950123b5264f6483040901575a8e8383;8bb209553d1c0292593524813cffb67f + # ERR2512031 data_acc = cols[-1].strip() sub_filelist = split_filelist(cols[0]) sub_md5list = split_filelist(cols[1]) From 1b940484eff78fb9ca976032261d9d52bcabdeca Mon Sep 17 00:00:00 2001 From: Lili Meszaros Date: Wed, 21 Feb 2024 13:27:02 +0000 Subject: [PATCH 3/4] RT728517: parse through json --- python3/readGet.py | 4 ++-- python3/utils.py | 44 ++++++++++++++++++++++++++++---------------- 2 files changed, 30 insertions(+), 18 deletions(-) diff --git a/python3/readGet.py b/python3/readGet.py index 9399e92..e63fcba 100644 --- a/python3/readGet.py +++ b/python3/readGet.py @@ -97,9 +97,9 @@ def download_files(accession, output_format, dest_dir, fetch_meta, aspera): lines = utils.download_report_from_portal(search_url) - for line in lines[1:]: + for line in lines: data_accession, filelist, md5list = utils.parse_file_search_result_line( - line, accession, output_format) + line, accession, output_format, aspera) # create run directory if downloading all data for an experiment if is_experiment: run_dir = os.path.join(accession_dir, data_accession) diff --git a/python3/utils.py b/python3/utils.py index 27bab25..e99d9f7 100644 --- a/python3/utils.py +++ b/python3/utils.py @@ -27,6 +27,7 @@ import requests import urllib.error as urlerror import urllib.parse as urlparse +import json from configparser import SafeConfigParser @@ -523,10 +524,7 @@ def get_report_from_portal(url): def download_report_from_portal(url): response = get_report_from_portal(url) - lines = [] - for line in response: - lines.append(line.decode('utf-8')) - return lines + return json.loads(response.read().decode('utf-8')) def get_accession_query(accession): @@ -575,11 +573,16 @@ def get_result(accession): else: # is_analysis(accession) return ANALYSIS_RESULT +def get_result_accession(accession): + if is_run(accession) or is_experiment(accession) or is_sample(accession): + return 'run_accession' + else: # is_analysis(accession) + return 'analysis_accession' def get_file_search_query(accession, aspera): return PORTAL_SEARCH_BASE + get_accession_query(accession) + '&' + \ get_result(accession) + '&' + \ - get_file_fields(accession, aspera) + '&limit=0' + get_file_fields(accession, aspera) + '&format=json&limit=0' def split_filelist(filelist_string): @@ -588,8 +591,7 @@ def split_filelist(filelist_string): return filelist_string.strip().split(';') -def parse_file_search_result_line(line, accession, output_format): - cols = line.split('\t') +def parse_file_search_result_line(item, accession, output_format, aspera): # example: # submitted_ftp submitted_md5 sra_ftp sra_md5 fastq_ftp fastq_md5 run_accession # ftp.sra.ebi.ac.uk/vol1/run/ERR251/ERR2512031/20104421_S5_L999_R1_001.fastq.gz;ftp.sra.ebi.ac.uk/vol1/run/ERR251/ERR2512031/20104421_S5_L999_R2_001.fastq.gz @@ -599,15 +601,24 @@ def parse_file_search_result_line(line, accession, output_format): # ftp.sra.ebi.ac.uk/vol1/fastq/ERR251/001/ERR2512031/ERR2512031_1.fastq.gz;ftp.sra.ebi.ac.uk/vol1/fastq/ERR251/001/ERR2512031/ERR2512031_2.fastq.gz # 950123b5264f6483040901575a8e8383;8bb209553d1c0292593524813cffb67f # ERR2512031 - data_acc = cols[-1].strip() - sub_filelist = split_filelist(cols[0]) - sub_md5list = split_filelist(cols[1]) + acc = get_result_accession(accession) + data_acc = item[acc] + + if aspera: + sub_filelist = split_filelist(item[SUBMITTED_ASPERA_FIELD]) + sra_filelist = split_filelist(item[SRA_ASPERA_FIELD]) + fastq_filelist = split_filelist(item[FASTQ_ASPERA_FIELD]) + else: + sub_filelist = split_filelist(item[SUBMITTED_FIELD]) + sra_filelist = split_filelist(item[SRA_FIELD]) + fastq_filelist = split_filelist(item[FASTQ_FIELD]) + + sub_md5list = split_filelist(item[SUBMITTED_MD5_FIELD]) + sra_md5list = split_filelist(item[SRA_MD5_FIELD]) + fastq_md5list = split_filelist(item[FASTQ_MD5_FIELD]) + if is_analysis(accession): return data_acc, sub_filelist, sub_md5list - sra_filelist = split_filelist(cols[2]) - sra_md5list = split_filelist(cols[3]) - fastq_filelist = split_filelist(cols[4]) - fastq_md5list = split_filelist(cols[5]) if output_format is None: if len(sub_filelist) > 0: output_format = SUBMITTED_FORMAT @@ -617,9 +628,10 @@ def parse_file_search_result_line(line, accession, output_format): output_format = FASTQ_FORMAT if output_format == SUBMITTED_FORMAT: return data_acc, sub_filelist, sub_md5list - if output_format == SRA_FORMAT: + elif output_format == SRA_FORMAT: return data_acc, sra_filelist, sra_md5list - return data_acc, fastq_filelist, fastq_md5list + else: + return data_acc, fastq_filelist, fastq_md5list def create_dir(dir_path): From 787b54a057c27746ca8a8b4e34665d4ca453b126 Mon Sep 17 00:00:00 2001 From: Lili Meszaros Date: Thu, 22 Feb 2024 12:12:49 +0000 Subject: [PATCH 4/4] RT728517: add checks --- python3/utils.py | 26 ++++++++++++++++++++------ 1 file changed, 20 insertions(+), 6 deletions(-) diff --git a/python3/utils.py b/python3/utils.py index e99d9f7..5746e5f 100644 --- a/python3/utils.py +++ b/python3/utils.py @@ -570,18 +570,27 @@ def get_file_fields(accession, aspera): def get_result(accession): if is_run(accession) or is_experiment(accession) or is_sample(accession): return RUN_RESULT - else: # is_analysis(accession) + elif is_analysis(accession): return ANALYSIS_RESULT + else: + raise TypeError('Only runs, experiments, samples and analyses are allowed') def get_result_accession(accession): if is_run(accession) or is_experiment(accession) or is_sample(accession): return 'run_accession' - else: # is_analysis(accession) + elif is_analysis(accession): return 'analysis_accession' + else: + raise TypeError('Only runs, experiments, samples and analyses are allowed') def get_file_search_query(accession, aspera): - return PORTAL_SEARCH_BASE + get_accession_query(accession) + '&' + \ - get_result(accession) + '&' + \ + try: + result = get_result(accession) + except TypeError as e: + print("Error:", e) + raise RuntimeError("Failed to get result for accession: {}".format(accession)) from e + + return PORTAL_SEARCH_BASE + get_accession_query(accession) + '&' + result + '&' + \ get_file_fields(accession, aspera) + '&format=json&limit=0' @@ -601,8 +610,13 @@ def parse_file_search_result_line(item, accession, output_format, aspera): # ftp.sra.ebi.ac.uk/vol1/fastq/ERR251/001/ERR2512031/ERR2512031_1.fastq.gz;ftp.sra.ebi.ac.uk/vol1/fastq/ERR251/001/ERR2512031/ERR2512031_2.fastq.gz # 950123b5264f6483040901575a8e8383;8bb209553d1c0292593524813cffb67f # ERR2512031 - acc = get_result_accession(accession) - data_acc = item[acc] + try: + result_accession = get_result_accession(accession) + except TypeError as e: + print("Error:", e) + raise RuntimeError("Failed to get result for accession: {}".format(accession)) from e + + data_acc = item[result_accession] if aspera: sub_filelist = split_filelist(item[SUBMITTED_ASPERA_FIELD])