From eb310b62ce21e22e63e907355a92e69d249781ea Mon Sep 17 00:00:00 2001
From: thinnerichs
Date: Thu, 9 Dec 2021 16:43:37 +0100
Subject: [PATCH 1/3] Update README

---
 README.md | 7 +++++++
 1 file changed, 7 insertions(+)

diff --git a/README.md b/README.md
index 0d952bd..60b122f 100644
--- a/README.md
+++ b/README.md
@@ -10,6 +10,13 @@ This repository contains code for:
 
 Requires Matlab and python.
 The [AllenSDK package](http://alleninstitute.github.io/AllenSDK/install.html) for python must be installed.
 
+Commands to install the AllenSDK (pinned to Python 3.7, as this pipeline is incompatible with newer Python versions):
+```
+conda create -n allensdk python=3.7
+conda activate allensdk
+pip install allensdk
+```
+
 If anything is unclear or needs improvement, please send questions by [raising an Issue](https://docs.github.com/en/github/managing-your-work-on-github/creating-an-issue) or [sending me an email](mailto:ben.d.fulcher@gmail.com).
 
 This pipeline is based on code developed for [Fulcher and Fornito, _PNAS_ (2016)](https://doi.org/10.1073/pnas.1513302113), and used for [Fulcher et al., _PNAS_ (2019)](https://doi.org/10.1073/pnas.1814144116).

From b95ebb7a6f1445786cbce61cd61929eea343ec40 Mon Sep 17 00:00:00 2001
From: thinnerichs
Date: Mon, 13 Dec 2021 14:44:08 +0100
Subject: [PATCH 2/3] Run 2to3 on the .py files and upload full data

---
 AllGenes.py           |  24 ++--
 README.md             |   2 +
 RetrieveGene.py       | 117 +++++++++++------
 WriteStructureInfo.py |  11 +-
 4 files changed, 102 insertions(+), 52 deletions(-)

diff --git a/AllGenes.py b/AllGenes.py
index dd22e98..59d7bf2 100644
--- a/AllGenes.py
+++ b/AllGenes.py
@@ -24,13 +24,13 @@ def QueryAPI(model,criteriaString,includeString="",optionsString="",writeOut=[])
     api = RmaApi()
     # Settings for retrieval
     rows = []
-    blockSize = 2000
+    blockSize = 5000
     done = False
     startRow = 0
     # for i in range(0, total_rows, blockSize):
 
     while not done:
-        print "Row %d, attempting to retrieve %d rows..." % (startRow, blockSize)
+        print("Row %d, attempting to retrieve %d rows..." % (startRow, blockSize))
 
         tot_rows = len(rows)
 
         if len(includeString)==0:
@@ -50,16 +50,16 @@ def QueryAPI(model,criteriaString,includeString="",optionsString="",writeOut=[])
         numRows = len(rows) - tot_rows # additional rows retrieved on running the query
         startRow += numRows
 
-        print "%d rows retrieved." % numRows
+        print("%d rows retrieved." % numRows)
 
         # Check if we're at the end of the road
         if numRows == 0 or numRows < blockSize:
             done = True
 
     # Write out the results as they come in, if requested
-    if isinstance(writeOut, basestring):
+    if isinstance(writeOut, str):
         json_utilities.write(json_file_name, rows)
-        print "Wrote to %s" % json_file_name
+        print("Wrote to %s" % json_file_name)
 
     return rows
 #-------------------------------------------------------------------------------
@@ -106,7 +106,7 @@ def SaveSectionsToCSV(sections):
         })
     # To dataframe:
     df_sections = pd.DataFrame.from_records(sectionList) #,index='section_id')
-    df_sections = df_sections.sort('section_id')
+    df_sections = df_sections.sort_values(by=['section_id'])
     # Save as a .csv file:
     df_sections.to_csv(sectionDatasetFilename)
@@ -123,19 +123,19 @@ def SaveSectionsToCSV(sections):
         })
     # To dataframe:
     df_genes = pd.DataFrame.from_records(geneList) #,index='entrez_id')
-    df_genes = df_genes.sort('entrez_id')
+    df_genes = df_genes.sort_values(by=['entrez_id'])
     numGenesFull = df_genes.shape[0]
     df_genes = df_genes.drop_duplicates()
     numGenesFiltered = df_genes.shape[0]
-    print "Genes filtered from %u to %u" % (numGenesFull, numGenesFiltered)
+    print("Genes filtered from %u to %u" % (numGenesFull, numGenesFiltered))
     # Save as a .csv file:
     df_genes.to_csv(geneInfoFilename)
 #-------------------------------------------------------------------------------
 def SaveListCSV(stringList,fileName):
     # Outputs a csv from a given list of strings
-    resultFile = open(fileName,'wb')
-    wr = csv.writer(resultFile, dialect='excel')
-    wr.writerow(stringList)
+    with open(file=fileName, mode='w', newline='') as resultFile:
+        wr = csv.writer(resultFile, dialect='excel')
+        wr.writerow(stringList)
 #-------------------------------------------------------------------------------
@@ -154,7 +154,7 @@ def SaveListCSV(stringList,fileName):
 entrezSet = set(geneEntrezList)
 geneEntrezList = list(entrezSet)
 geneEntrezList.sort()
-print "There are %u unique genes in section datasets" % len(entrezSet)
+print("There are %u unique genes in section datasets" % len(entrezSet))
 
 SaveListCSV(geneEntrezList,entrezIDFilename) # Saves to:

diff --git a/README.md b/README.md
index 60b122f..33a87e0 100644
--- a/README.md
+++ b/README.md
@@ -1,5 +1,7 @@
 # AllenSDK
 
+This is a Python 3 port of [https://github.com/benfulcher/AllenSDK](https://github.com/benfulcher/AllenSDK), with the download steps parallelised using `joblib`.
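+
+A minimal sketch of the `joblib` pattern the scripts use for the parallel downloads (illustrative only; `fetch` is a stand-in for the real API calls):
+
+```python
+from joblib import Parallel, delayed
+
+def fetch(block):
+    return [x * 2 for x in block]  # placeholder for one blocked API request
+
+blocks = [[1, 2], [3, 4], [5, 6]]
+results = Parallel(n_jobs=3)(delayed(fetch)(b) for b in blocks)
+flat = sum(results, [])  # per-job result lists are flattened the same way in RetrieveGene.py
+```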
+
 [![DOI](https://zenodo.org/badge/104984017.svg)](https://zenodo.org/badge/latestdoi/104984017)
 [![Twitter](https://img.shields.io/twitter/url/https/twitter.com/bendfulcher.svg?style=social&label=Follow%20%40bendfulcher)](https://twitter.com/bendfulcher)

diff --git a/RetrieveGene.py b/RetrieveGene.py
index 829c71e..afef29b 100644
--- a/RetrieveGene.py
+++ b/RetrieveGene.py
@@ -5,11 +5,14 @@
 import time
 import csv # For saving string data to csv
 
+from joblib import Parallel, delayed
+import os
+
 #-------------------------------------------------------------------------------
 # Globals:
 #-------------------------------------------------------------------------------
 # INPUT file names:
-structIDSource = 'structureIDs.csv' # 'layerIDs.csv' # 'structureIDs.csv'
+structIDSource = 'structIDs_all.csv' # 'layerIDs.csv' # 'structureIDs.csv'
 entrezSource = 'geneEntrezID.csv' # 'geneEntrezID.csv'
 
 # OUTPUT file names:
@@ -24,7 +27,7 @@
 json_file_name = 'unionizes.json'
 #-------------------------------------------------------------------------------
 
-def download_mouse_unionizes(geneEntrezID,structureIDs,doReduced=False,writeOut=False):
+def download_mouse_unionizes(geneEntrezID,structureIDs,doReduced='medium',writeOut=False):
     # Given genes and set of structures, retrieves expression data from Allen API
     # Can set geneEntrezID to empty to retrieve for all genes
     # Can set geneEntrezID to be a single number (array length 1) for a single gene
@@ -35,7 +38,10 @@
     string_structureIDs = ','.join(str(sid) for sid in structureIDs)
 
     # Set defaults:
-    if doReduced:
+    if doReduced == 'struc_gene_density':
+        print("Only retrieving a minimal subset of results from the queries")
+        optionsString = "[only$eq'structures.id,data_sets.id,genes.entrez_id,structure_unionizes.expression_density']"
+    elif doReduced=='medium':
         print("Only retrieving a reduced subset of results from the queries")
         optionsString = "[only$eq'structures.id,data_sets.id,data_sets.plane_of_section_id,genes.entrez_id,structure_unionizes.expression_energy,structure_unionizes.expression_density']"
     else:
@@ -55,14 +61,14 @@
         ]
     elif len([geneEntrezID])==1:
         # Get for a single gene:
-        print("Get for the single gene: entrez_id = %u" % geneEntrezID[0])
+        print(("Get for the single gene: entrez_id = %u" % geneEntrezID[0]))
         criteria = [
             "structure[id$in%s][graph_id$eq%d]," % (string_structureIDs, mouse_graph_id),
             "section_data_set[failed$eqfalse](products[id$eq%d],genes[entrez_id$eq%d])" % (mouse_product_id, geneEntrezID[0])
         ]
     elif len(geneEntrezID)>1:
         # Get for multiple genes:
-        print("Getting expression results for %u genes" % len(geneEntrezID))
+        print(("Getting expression results for %u genes" % len(geneEntrezID)))
         string_geneEntrezIDs = ','.join(str(eid) for eid in geneEntrezIDs)
         criteria = [
             "structure[id$in%s][graph_id$eq%d]," % (string_structureIDs, mouse_graph_id),
@@ -84,30 +90,34 @@ def QueryAPI(model,criteriaString,includeString=None,optionsString=None,writeOut
 
     # Settings for retrieval
     rows = []
-    blockSize = 2000
+    blockSize = 1000
     done = False
     startRow = 0
     # for i in range(0, total_rows, blockSize):
 
     while not done:
-        print("Row %d, attempting to retrieve %d rows..." % (startRow, blockSize))
+        print(("Row %d, attempting to retrieve %d rows..." % (startRow, blockSize)))
 
         tot_rows = len(rows)
 
         # apiQueryPartial = partial(api.model_query,model=model,criteria=criteriaString,
         #                             startRow=startRow,num_rows=blockSize)
 
+        print(model, criteriaString, includeString, optionsString, startRow, blockSize)
         rows += api.model_query(model=model,
                                 criteria=criteriaString,
                                 include=includeString,
                                 options=optionsString,
                                 start_row=startRow,
                                 num_rows=blockSize)
 
+        print(rows)
+        raise Exception
+
         numRows = len(rows) - tot_rows # additional rows retrieved on running the query
         startRow += numRows
 
-        print("%d rows retrieved." % numRows)
+        print(("%d rows retrieved." % numRows))
 
         # Check if we're at the end of the road
         if numRows == 0 or numRows < blockSize:
@@ -116,7 +126,7 @@ def QueryAPI(model,criteriaString,includeString=None,optionsString=None,writeOut
     # Write out the results as they come in, if requested
     if writeOut:
         json_utilities.write(json_file_name, rows)
-        print("Wrote to %s" % json_file_name)
+        print(("Wrote to %s" % json_file_name))
 
     return rows
 #-------------------------------------------------------------------------------
@@ -153,7 +163,7 @@ def GetStructureDetails(structureIDs,filterFields,graph_id=1):
     structs = QueryAPI(model="Structure",criteriaString=("[id$in%s][graph_id$eq%u]" \
                     % (string_structureIDs,graph_id))) # (Mouse Brain Atlas)
 
-    print("%u mouse-brain structures retrieved!\n" % len(structs))
+    print(("%u mouse-brain structures retrieved!\n" % len(structs)))
 
     # Filter to only certain fields, specified as filterFields:
     structsFilt = []
@@ -182,16 +192,30 @@ def GetStructureDetails(structureIDs,filterFields,graph_id=1):
 #---------------------------------------------------------------------------
 # Read in structure IDs from csv -> save detailed structure info to file
 #---------------------------------------------------------------------------
-structureIDs = np.genfromtxt(structIDSource,delimiter=',')
-print("Retrieved %d structures from %s..." % (len(structureIDs),structIDSource))
+structureIDs = np.genfromtxt(structIDSource,delimiter=',', dtype=int)
+print(("Retrieved %d structures from %s..." % (len(structureIDs),structIDSource)))
 # Get details of the structures:
 filterFields = ['id','name','acronym','color_hex_triplet']
-structs = GetStructureDetails(structureIDs,filterFields,1)
+
+# parallel downloading through joblib.Parallel, alter num_jobs for more simultaneous downloads
+num_jobs = 6
+num_structures = len(structureIDs)
+
+def structure_details_wrapper(struc_ids):
+    return GetStructureDetails(struc_ids, filterFields, 1)
+struct_list = Parallel(n_jobs=num_jobs)(delayed(structure_details_wrapper)(sub_list)
+                                        for sub_list
+                                        in np.split(structureIDs, list(range(0,num_structures,num_structures//num_jobs))[1:-1]))
+
+structs = sum(struct_list, [])
+# structs = GetStructureDetails(structureIDs,filterFields,1)
 # To dataframe:
 df_struct = pd.DataFrame.from_records(structs)
 # Save as a .csv file:
 df_struct.to_csv(structInfoFilename)
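+# Note on the chunking above (added comment; worked example, not from the
+# original code): np.split cuts structureIDs at the interior boundaries of
+# range(0, num_structures, num_structures//num_jobs). E.g. with 13 structures
+# and num_jobs = 6, the step is 13//6 = 2, the cut points are [2,4,6,8,10],
+# and np.split yields 6 chunks of sizes 2,2,2,2,2,3 -- one per parallel job,
+# with the remainder folded into the last chunk.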
+
+'''
 #---------------------------------------------------------------------------
 # RETRIEVING GENES ONE AT A TIME
 #---------------------------------------------------------------------------
@@ -200,7 +224,7 @@ def GetStructureDetails(structureIDs,filterFields,graph_id=1):
 
 # Read in gene entrez IDs from csv:
 geneEntrezIDs = np.genfromtxt(entrezSource,delimiter=',')
-print("Read in %d genes from %s..." % (len(geneEntrezIDs), entrezSource))
+print(("Read in %d genes from %s..." % (len(geneEntrezIDs), entrezSource)))
 
 # geneEntrezIDs = np.concatenate((geneEntrezIDs[0:1],[20604.]),axis=0)
 unionizes = []
@@ -213,18 +237,19 @@ def GetStructureDetails(structureIDs,filterFields,graph_id=1):
     # df = pd.concat([df,df_i])
     timeSoFar = time.time() - startTime
     timeRemaining = timeSoFar/(ind+1)*(len(geneEntrezIDs)-ind+1)/60
-    print("We're at %d/%d, %f min remaining" % (ind+1,len(geneEntrezIDs),timeRemaining))
+    print(("We're at %d/%d, %f min remaining" % (ind+1,len(geneEntrezIDs),timeRemaining)))
 totalTime = (time.time() - startTime)/60.
-print("Took %f min total" % totalTime)
+print(("Took %f min total" % totalTime))
 
 # Make a data frame of the datasets downloaded:
 df = unionizes_to_dataframe(unionizes)
 df = df.sort_values(by=['structure_id','data_set_id'])
 df.to_csv(allDataFilename)
-print("Saved data frame to %s" % allDataFilename)
+print(("Saved data frame to %s" % allDataFilename))
 print(df)
 
 # df = pd.read_csv(allDataFilename)
+'''
 
 #-------------------------------------------------------------------------------
 # RETRIEVING GENES ALL TOGETHER IN ONE QUERY
@@ -233,44 +258,58 @@ def GetStructureDetails(structureIDs,filterFields,graph_id=1):
 # back to the matching genes...? (25924 rows per structure...):
 # No need to write to a json file as you go (slows down)
 # Only retrieve the necessary fields (doReduced)
-# unionizes = download_mouse_unionizes([],structureIDs,doReduced=True,writeOut=False)
 
+# Read in gene entrez IDs from csv:
+geneEntrezIDs = np.genfromtxt(entrezSource,delimiter=',')
+print(("Read in %d genes from %s..." % (len(geneEntrezIDs), entrezSource)))
+
+if not os.path.exists(json_file_name):
+    # Parallelize download using num_jobs and num_structures from the previous parallelization
+    def download_mouse_unionizes_wrapper(struc_ids):
+        return download_mouse_unionizes([],struc_ids,doReduced='medium',writeOut=False)
+    unionizes = Parallel(n_jobs=num_jobs)(delayed(download_mouse_unionizes_wrapper)(sub_list)
+                                          for sub_list
+                                          in np.split(structureIDs, list(range(0,num_structures,num_structures//num_jobs))[1:-1]))
+    unionizes = sum(unionizes, [])
+
+    # unionizes = download_mouse_unionizes([],structureIDs,doReduced='medium',writeOut=False)
+
+    # Write the agglomerated datasets to file
+    json_utilities.write(json_file_name, unionizes)
+    print("Wrote %d unionizes to %s" % (len(unionizes),json_file_name))
 #-------------------------------------------------------------------------------
-# Write the agglomerated datasets to file
-# json_utilities.write(json_file_name, unionizes)
-# print("Wrote %d unionizes to %s" % (len(unionizes),json_file_name))
 
 # Read in previously-downloaded results:
-# unionizes = json_utilities.read(json_file_name)
+unionizes = json_utilities.read(json_file_name)
 
 # To dataframe and then filter to include only genes in our geneEntrezIDs list:
-# dfFull = unionizes_to_dataframe(unionizes)
-# dfFull.shape[0]
+dfFull = unionizes_to_dataframe(unionizes)
+dfFull.shape[0]
 # # Now we need to filter to include only genes in our geneEntrezIDs list:
 # # (actually this filtering is not necessary) -- potentially a nice check that we only retrieved
 # # entrez_ids corresponding to section datasets with expression (22,000)
-# df = dfFull.copy()
-# df = df[df['gene_entrez'].isin(geneEntrezIDs)]
-# df.shape[0]
-# len(geneEntrezIDs)
+df = dfFull.copy()
+df = df[df['gene_entrez'].isin(geneEntrezIDs)]
+df.shape[0]
+len(geneEntrezIDs)
 # # (so something like 3,000 duplicates?)
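+# (Added note: duplicates are expected here -- a gene can have more than one
+# section dataset, e.g. separate coronal and sagittal experiments, so rows
+# can outnumber the unique entrez_ids.)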
-# df = df.sort(['structure_id','data_set_id'])
-# df.tail
-# df_filter = df.drop_duplicates()
-# df['data_set_id']
+df = df.sort_values(by=['structure_id','data_set_id'])
+df.tail()
+df_filter = df.drop_duplicates()
+df['data_set_id']
 
 # # Save out just the gene information:
 # dfGenes = df[]
 #
 # # Save data summary as a .csv file:
-# csv_file_name = 'filteredDataFrame.csv'
-# df.to_csv(csv_file_name)
+csv_file_name = 'filteredDataFrame.csv'
+df.to_csv(csv_file_name)
 
 # Compute the mean expression energy for each structure_id:
-# gb = df.groupby(['gene_entrez','structure_id'])['expression_energy'].mean()
-# print(gb)
+gb = df.groupby(['gene_entrez','structure_id'])['expression_energy'].mean()
+print(gb)
 
-# gb.pivot(index="structure_id", columns="gene_entrez", values="expression_energy")
+gb.reset_index().pivot(index="structure_id", columns="gene_entrez", values="expression_energy") # a grouped Series has no .pivot; reset_index() first
 
 print("Writing expression energy and density matrices out")
 for expressionType in ['expression_energy','expression_density']:
     table = pd.pivot_table(df, values=expressionType, index="structure_id",
@@ -282,8 +321,8 @@ def GetStructureDetails(structureIDs,filterFields,graph_id=1):
     # Save mean expression energy to a .csv output file:
     fileName = ("%s_%ux%u.csv" % (expressionType, table.shape[0], table.shape[1]))
     table.to_csv(fileName,na_rep='NaN')
-    print("Saved expression energy of %u datasets over %u structures to %s" \
-                % (table.shape[1],table.shape[0],fileName))
+    print(("Saved expression energy of %u datasets over %u structures to %s" \
+                % (table.shape[1],table.shape[0],fileName)))
 
 # Need to also output the section dataset IDs (columns) and struct IDs (rows)
 dataSetIDs = pd.DataFrame(list(table.axes[0]))
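For reference, the wide gene-expression matrices above are plain `pd.pivot_table` reshapes of the long-format unionize table. A minimal sketch on toy data (illustrative only; the values are invented):

```python
import numpy as np
import pandas as pd

# Long-format rows, as produced by unionizes_to_dataframe (toy values):
df = pd.DataFrame({
    'structure_id':      [22, 22, 1070, 1070],
    'data_set_id':       [1, 2, 1, 2],
    'expression_energy': [0.5, 1.5, 2.0, 4.0],
})

# One row per structure, one column per section dataset:
table = pd.pivot_table(df, values='expression_energy',
                       index='structure_id', columns='data_set_id',
                       aggfunc=np.mean)
print(table)  # 2x2 matrix of mean expression energies
```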
diff --git a/WriteStructureInfo.py b/WriteStructureInfo.py
index 7197341..3f13271 100644
--- a/WriteStructureInfo.py
+++ b/WriteStructureInfo.py
@@ -50,12 +50,21 @@ def writePalomeroStructures(tree):
     palomeroStructs = tree.get_structures_by_id(theStructIDs)
     writeIDsAndInfo(palomeroStructs,'structIDs_palomero.csv','structInfo_palomero.csv')
 #-------------------------------------------------------------------------------
+def writeAllStructures(tree):
+    theStructIDs = [d['id'] for d in tree.nodes()]
+    theStructIDs = list(set(theStructIDs))
+    structs = tree.get_structures_by_id(theStructIDs)
+    writeIDsAndInfo(structs,'structIDs_all.csv','structInfo_all.csv')
+#-------------------------------------------------------------------------------
+
 tree = getFullStructureTree()
-writeWhat = 'palomero'
+writeWhat = 'all'
 if writeWhat=='palomero':
     writePalomeroStructures(tree)
 elif writeWhat=='cortical':
     writeCorticalStructures(tree)
 elif writeWhat=='layer':
     writeLayerStructures(tree)
+elif writeWhat=='all':
+    writeAllStructures(tree)

From d6463892d4461629812ec6d1aa83c44311bbfec4 Mon Sep 17 00:00:00 2001
From: thinnerichs
Date: Thu, 16 Dec 2021 14:45:56 +0100
Subject: [PATCH 3/3] Update RetrieveGene.py for checkpointing

---
 RetrieveGene.py | 36 +++++++++++++++++++++++++++---------
 1 file changed, 27 insertions(+), 9 deletions(-)

diff --git a/RetrieveGene.py b/RetrieveGene.py
index afef29b..78b14ca 100644
--- a/RetrieveGene.py
+++ b/RetrieveGene.py
@@ -103,15 +103,13 @@ def QueryAPI(model,criteriaString,includeString=None,optionsString=None,writeOut
 
         # apiQueryPartial = partial(api.model_query,model=model,criteria=criteriaString,
         #                             startRow=startRow,num_rows=blockSize)
 
-        print(model, criteriaString, includeString, optionsString, startRow, blockSize)
+        # print(model, criteriaString, includeString, optionsString, startRow, blockSize)
         rows += api.model_query(model=model,
                                criteria=criteriaString,
                                 include=includeString,
                                 options=optionsString,
                                 start_row=startRow,
                                 num_rows=blockSize)
 
-        print(rows)
-        raise Exception
 
         numRows = len(rows) - tot_rows # additional rows retrieved on running the query
 
@@ -139,9 +137,9 @@ def unionizes_to_dataframe(unionizes):
         'expression_density': unionize['expression_density'],
         'data_set_id': unionize['section_data_set']['id'],
         'plane_of_section_id': unionize['section_data_set']['plane_of_section_id'],
-        # 'gene_acronym': unionize['section_data_set']['genes'][0]['acronym'],
-        # 'gene_name': unionize['section_data_set']['genes'][0]['name'],
-        # 'gene_entrez': unionize['section_data_set']['genes'][0]['entrez_id'],
+        'gene_acronym': unionize['section_data_set']['genes'][0]['acronym'],
+        'gene_name': unionize['section_data_set']['genes'][0]['name'],
+        'gene_entrez': unionize['section_data_set']['genes'][0]['entrez_id'],
         })
     return pd.DataFrame.from_records(fdata)
 #-------------------------------------------------------------------------------
@@ -215,7 +213,6 @@ def structure_details_wrapper(struc_ids):
 df_struct.to_csv(structInfoFilename)
 
-'''
 #---------------------------------------------------------------------------
 # RETRIEVING GENES ONE AT A TIME
 #---------------------------------------------------------------------------
@@ -223,12 +220,14 @@ def structure_details_wrapper(struc_ids):
 # to get all genes at once?
 
 # Read in gene entrez IDs from csv:
-geneEntrezIDs = np.genfromtxt(entrezSource,delimiter=',')
+geneEntrezIDs = np.genfromtxt(entrezSource,delimiter=',',dtype=int)
 print(("Read in %d genes from %s..." % (len(geneEntrezIDs), entrezSource)))
 
 # geneEntrezIDs = np.concatenate((geneEntrezIDs[0:1],[20604.]),axis=0)
 unionizes = []
 startTime = time.time()
+storage_path = './data/'
+os.makedirs(storage_path, exist_ok=True) # make sure the checkpoint directory exists before writing
+'''
 for ind, geneEntrezID in enumerate(geneEntrezIDs):
     unionizes_gid = download_mouse_unionizes([geneEntrezID],structureIDs,
                     doReduced=True,writeOut=False)
@@ -238,9 +237,27 @@ def structure_details_wrapper(struc_ids):
     timeSoFar = time.time() - startTime
     timeRemaining = timeSoFar/(ind+1)*(len(geneEntrezIDs)-ind+1)/60
     print(("We're at %d/%d, %f min remaining" % (ind+1,len(geneEntrezIDs),timeRemaining)))
+'''
+def download_unionizes_wrapper(i, gene_id):
+    try:
+        unionizes_gid = download_mouse_unionizes([gene_id],structureIDs, doReduced=True,writeOut=False)
+    except Exception:
+        file = 'missed_genes'
+        with open(file=file,mode='a') as f:
+            print(gene_id, file=f)
+        return # skip this gene; without returning, unionizes_gid would be undefined below
+    struc_df = unionizes_to_dataframe(unionizes_gid)
+    struc_df.to_csv(storage_path+str(gene_id)+'_structure_unionizes.csv')
+    print(f'{i} of {len(geneEntrezIDs)} done.')
+    return
+
+n_jobs = 6
+Parallel(n_jobs=n_jobs)(delayed(download_unionizes_wrapper)(i, geneEntrezID) for i, geneEntrezID in enumerate(geneEntrezIDs))
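+# (Sketch, added for illustration -- genes logged to 'missed_genes' above could
+# be retried in a second pass reusing the same wrapper, e.g.:
+#     missed = [int(g) for g in open('missed_genes')]
+#     Parallel(n_jobs=n_jobs)(delayed(download_unionizes_wrapper)(i, g)
+#                             for i, g in enumerate(missed))
+# This retry loop is hypothetical and not part of the original patch.)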
+
 totalTime = (time.time() - startTime)/60.
 print(("Took %f min total" % totalTime))
 
+raise Exception # deliberate stop for now: the aggregation below still expects the in-memory results of the disabled loop
+
 # Make a data frame of the datasets downloaded:
 df = unionizes_to_dataframe(unionizes)
 df = df.sort_values(by=['structure_id','data_set_id'])
@@ -249,7 +266,6 @@ def structure_details_wrapper(struc_ids):
 print(df)
 
 # df = pd.read_csv(allDataFilename)
-'''
 
 #-------------------------------------------------------------------------------
 # RETRIEVING GENES ALL TOGETHER IN ONE QUERY
@@ -259,6 +275,7 @@ def structure_details_wrapper(struc_ids):
 # No need to write to a json file as you go (slows down)
 # Only retrieve the necessary fields (doReduced)
 
+'''
 # Read in gene entrez IDs from csv:
 geneEntrezIDs = np.genfromtxt(entrezSource,delimiter=',')
 print(("Read in %d genes from %s..." % (len(geneEntrezIDs), entrezSource)))
@@ -281,6 +298,7 @@ def download_mouse_unionizes_wrapper(struc_ids):
 
 # Read in previously-downloaded results:
 unionizes = json_utilities.read(json_file_name)
+'''
 
 # To dataframe and then filter to include only genes in our geneEntrezIDs list:
 dfFull = unionizes_to_dataframe(unionizes)
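Patch 3 stops before reassembling its per-gene checkpoint files. A minimal sketch of how the `./data/*_structure_unionizes.csv` files written by `download_unionizes_wrapper` could be stitched back into the single data frame (`bigDataFrame.csv`); the glob pattern and index handling are assumptions, not part of the patches:

```python
import glob
import pandas as pd

# Collect every per-gene checkpoint written by download_unionizes_wrapper:
frames = [pd.read_csv(fname, index_col=0)
          for fname in glob.glob('./data/*_structure_unionizes.csv')]

# Stitch them into one long-format table, mirroring the disabled
# aggregation code at the end of RetrieveGene.py:
df = pd.concat(frames, ignore_index=True)
df = df.sort_values(by=['structure_id', 'data_set_id'])
df.to_csv('bigDataFrame.csv')
```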