Skip to content

Commit

Permalink
RT728517: parse through json
Browse files Browse the repository at this point in the history
  • Loading branch information
lilim-ebi committed Feb 21, 2024
1 parent 703ac42 commit 1b94048
Show file tree
Hide file tree
Showing 2 changed files with 30 additions and 18 deletions.
4 changes: 2 additions & 2 deletions python3/readGet.py
Original file line number Diff line number Diff line change
Expand Up @@ -97,9 +97,9 @@ def download_files(accession, output_format, dest_dir, fetch_meta, aspera):

lines = utils.download_report_from_portal(search_url)

for line in lines[1:]:
for line in lines:
data_accession, filelist, md5list = utils.parse_file_search_result_line(
line, accession, output_format)
line, accession, output_format, aspera)
# create run directory if downloading all data for an experiment
if is_experiment:
run_dir = os.path.join(accession_dir, data_accession)
Expand Down
44 changes: 28 additions & 16 deletions python3/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,7 @@
import requests
import urllib.error as urlerror
import urllib.parse as urlparse
import json

from configparser import SafeConfigParser

Expand Down Expand Up @@ -523,10 +524,7 @@ def get_report_from_portal(url):

def download_report_from_portal(url):
    """Fetch a portal report and return its JSON-decoded content.

    The response obtained from ``get_report_from_portal(url)`` is read in
    full, decoded as UTF-8 and parsed with ``json.loads``.

    An empty (or whitespace-only) body is returned as an empty list, so
    callers that iterate over the result simply see no records instead of
    a ``json.JSONDecodeError``.
    """
    response = get_report_from_portal(url)
    body = response.read().decode('utf-8')
    # An empty body is not valid JSON; treat it as "no records".
    if not body.strip():
        return []
    return json.loads(body)


def get_accession_query(accession):
Expand Down Expand Up @@ -575,11 +573,16 @@ def get_result(accession):
else: # is_analysis(accession)
return ANALYSIS_RESULT

def get_result_accession(accession):
    """Return the record key that holds the data accession.

    Run, experiment and sample accessions map to ``'run_accession'``;
    any other accession is treated as an analysis and maps to
    ``'analysis_accession'``.
    """
    # Checked in the same order, and with the same short-circuiting, as
    # an ``or`` chain over the three predicates.
    run_like = any(
        check(accession) for check in (is_run, is_experiment, is_sample)
    )
    return 'run_accession' if run_like else 'analysis_accession'

def get_file_search_query(accession, aspera):
    """Build the portal search URL for the data files of *accession*.

    The URL is ``PORTAL_SEARCH_BASE`` followed by the '&'-joined accession
    query, result type and file fields (the latter depends on *aspera*),
    plus ``format=json`` and ``limit=0``.

    ``format=json`` is required because the report is parsed as JSON
    rather than split into tab-separated lines.
    """
    query_parts = [
        get_accession_query(accession),
        get_result(accession),
        get_file_fields(accession, aspera),
        'format=json',
        # NOTE(review): presumably limit=0 asks for all rows -- confirm
        # against the portal API documentation.
        'limit=0',
    ]
    return PORTAL_SEARCH_BASE + '&'.join(query_parts)


def split_filelist(filelist_string):
Expand All @@ -588,8 +591,7 @@ def split_filelist(filelist_string):
return filelist_string.strip().split(';')


def parse_file_search_result_line(line, accession, output_format):
cols = line.split('\t')
def parse_file_search_result_line(item, accession, output_format, aspera):
# example:
# submitted_ftp submitted_md5 sra_ftp sra_md5 fastq_ftp fastq_md5 run_accession
# ftp.sra.ebi.ac.uk/vol1/run/ERR251/ERR2512031/20104421_S5_L999_R1_001.fastq.gz;ftp.sra.ebi.ac.uk/vol1/run/ERR251/ERR2512031/20104421_S5_L999_R2_001.fastq.gz
Expand All @@ -599,15 +601,24 @@ def parse_file_search_result_line(line, accession, output_format):
# ftp.sra.ebi.ac.uk/vol1/fastq/ERR251/001/ERR2512031/ERR2512031_1.fastq.gz;ftp.sra.ebi.ac.uk/vol1/fastq/ERR251/001/ERR2512031/ERR2512031_2.fastq.gz
# 950123b5264f6483040901575a8e8383;8bb209553d1c0292593524813cffb67f
# ERR2512031
data_acc = cols[-1].strip()
sub_filelist = split_filelist(cols[0])
sub_md5list = split_filelist(cols[1])
acc = get_result_accession(accession)
data_acc = item[acc]

if aspera:
sub_filelist = split_filelist(item[SUBMITTED_ASPERA_FIELD])
sra_filelist = split_filelist(item[SRA_ASPERA_FIELD])
fastq_filelist = split_filelist(item[FASTQ_ASPERA_FIELD])
else:
sub_filelist = split_filelist(item[SUBMITTED_FIELD])
sra_filelist = split_filelist(item[SRA_FIELD])
fastq_filelist = split_filelist(item[FASTQ_FIELD])

sub_md5list = split_filelist(item[SUBMITTED_MD5_FIELD])
sra_md5list = split_filelist(item[SRA_MD5_FIELD])
fastq_md5list = split_filelist(item[FASTQ_MD5_FIELD])

if is_analysis(accession):
return data_acc, sub_filelist, sub_md5list
sra_filelist = split_filelist(cols[2])
sra_md5list = split_filelist(cols[3])
fastq_filelist = split_filelist(cols[4])
fastq_md5list = split_filelist(cols[5])
if output_format is None:
if len(sub_filelist) > 0:
output_format = SUBMITTED_FORMAT
Expand All @@ -617,9 +628,10 @@ def parse_file_search_result_line(line, accession, output_format):
output_format = FASTQ_FORMAT
if output_format == SUBMITTED_FORMAT:
return data_acc, sub_filelist, sub_md5list
if output_format == SRA_FORMAT:
elif output_format == SRA_FORMAT:
return data_acc, sra_filelist, sra_md5list
return data_acc, fastq_filelist, fastq_md5list
else:
return data_acc, fastq_filelist, fastq_md5list


def create_dir(dir_path):
Expand Down

0 comments on commit 1b94048

Please sign in to comment.