From eb310b62ce21e22e63e907355a92e69d249781ea Mon Sep 17 00:00:00 2001
From: thinnerichs
Date: Thu, 9 Dec 2021 16:43:37 +0100
Subject: [PATCH 1/3] Update README

---
 README.md | 7 +++++++
 1 file changed, 7 insertions(+)

diff --git a/README.md b/README.md
index 0d952bd..60b122f 100644
--- a/README.md
+++ b/README.md
@@ -10,6 +10,13 @@ This repository contains code for:
 
 Requires Matlab and python.
 The [AllenSDK package](http://alleninstitute.github.io/AllenSDK/install.html) for python must be installed.
 
+Commands to install the AllenSDK (pinned to Python 3.7, as this pipeline is incompatible with newer Python versions):
+```
+conda create -n allensdk python=3.7
+conda activate allensdk
+pip install allensdk
+```
+
 If anything is unclear or needs improvement, please send questions by [raising an Issue](https://docs.github.com/en/github/managing-your-work-on-github/creating-an-issue) or [sending me an email](mailto:ben.d.fulcher@gmail.com).
 
 This pipeline is based on code developed for [Fulcher and Fornito, _PNAS_ (2016)](https://doi.org/10.1073/pnas.1513302113), and used for [Fulcher et al., _PNAS_ (2019)](https://doi.org/10.1073/pnas.1814144116).

From b95ebb7a6f1445786cbce61cd61929eea343ec40 Mon Sep 17 00:00:00 2001
From: thinnerichs
Date: Mon, 13 Dec 2021 14:44:08 +0100
Subject: [PATCH 2/3] Run 2to3 on the .py files and upload full data

---
 AllGenes.py           |  24 ++--
 README.md             |   2 +
 RetrieveGene.py       | 117 +++++++++++------
 WriteStructureInfo.py |  11 +-
 4 files changed, 102 insertions(+), 52 deletions(-)

diff --git a/AllGenes.py b/AllGenes.py
index dd22e98..59d7bf2 100644
--- a/AllGenes.py
+++ b/AllGenes.py
@@ -24,13 +24,13 @@ def QueryAPI(model,criteriaString,includeString="",optionsString="",writeOut=[])
     api = RmaApi()
     # Settings for retrieval
     rows = []
-    blockSize = 2000
+    blockSize = 5000
     done = False
     startRow = 0
     # for i in range(0, total_rows, blockSize):
 
     while not done:
-        print "Row %d, attempting to retrieve %d rows..." % (startRow, blockSize)
+        print("Row %d, attempting to retrieve %d rows..." % (startRow, blockSize))
 
         tot_rows = len(rows)
 
         if len(includeString)==0:
@@ -50,16 +50,16 @@ def QueryAPI(model,criteriaString,includeString="",optionsString="",writeOut=[])
         numRows = len(rows) - tot_rows # additional rows retrieved on running the query
         startRow += numRows
 
-        print "%d rows retrieved." % numRows
+        print("%d rows retrieved." % numRows)
 
         # Check if we're at the end of the road
         if numRows == 0 or numRows < blockSize:
             done = True
 
     # Write out the results as they come in, if requested
-    if isinstance(writeOut, basestring):
+    if isinstance(writeOut, str):
         json_utilities.write(json_file_name, rows)
-        print "Wrote to %s" % json_file_name
+        print("Wrote to %s" % json_file_name)
 
     return rows
 #-------------------------------------------------------------------------------
@@ -106,7 +106,7 @@ def SaveSectionsToCSV(sections):
         })
     # To dataframe:
     df_sections = pd.DataFrame.from_records(sectionList) #,index='section_id')
-    df_sections = df_sections.sort('section_id')
+    df_sections = df_sections.sort_values(by=['section_id'])
     # Save as a .csv file:
     df_sections.to_csv(sectionDatasetFilename)
@@ -123,19 +123,19 @@ def SaveSectionsToCSV(sections):
         })
     # To dataframe:
     df_genes = pd.DataFrame.from_records(geneList) #,index='entrez_id')
-    df_genes = df_genes.sort('entrez_id')
+    df_genes = df_genes.sort_values(by=['entrez_id'])
     numGenesFull = df_genes.shape[0]
     df_genes = df_genes.drop_duplicates()
     numGenesFiltered = df_genes.shape[0]
-    print "Genes filtered from %u to %u" % (numGenesFull, numGenesFiltered)
+    print("Genes filtered from %u to %u" % (numGenesFull, numGenesFiltered))
     # Save as a .csv file:
     df_genes.to_csv(geneInfoFilename)
 #-------------------------------------------------------------------------------
 def SaveListCSV(stringList,fileName):
     # Outputs a csv from a given list of strings
-    resultFile = open(fileName,'wb')
-    wr = csv.writer(resultFile, dialect='excel')
-    wr.writerow(stringList)
+    with open(file=fileName, mode='w', newline='') as resultFile:
+        wr = csv.writer(resultFile, dialect='excel')
+        wr.writerow(stringList)
 #-------------------------------------------------------------------------------
@@ -154,7 +154,7 @@ def SaveListCSV(stringList,fileName):
 entrezSet = set(geneEntrezList)
 geneEntrezList = list(entrezSet)
 geneEntrezList.sort()
-print "There are %u unique genes in section datasets" % len(entrezSet)
+print("There are %u unique genes in section datasets" % len(entrezSet))
 
 SaveListCSV(geneEntrezList,entrezIDFilename) # Saves to:

diff --git a/README.md b/README.md
index 60b122f..33a87e0 100644
--- a/README.md
+++ b/README.md
@@ -1,5 +1,7 @@
 # AllenSDK
 
+This is a Python 3 port of [https://github.com/benfulcher/AllenSDK](https://github.com/benfulcher/AllenSDK), with the download steps parallelised using `joblib`.
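+
+A minimal sketch of the `joblib` pattern the scripts use for the parallel downloads (illustrative only; `fetch` is a stand-in for the real API calls):
+
+```python
+from joblib import Parallel, delayed
+
+def fetch(block):
+    return [x * 2 for x in block]  # placeholder for one blocked API request
+
+blocks = [[1, 2], [3, 4], [5, 6]]
+results = Parallel(n_jobs=3)(delayed(fetch)(b) for b in blocks)
+flat = sum(results, [])  # per-job result lists are flattened the same way in RetrieveGene.py
+```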
+
 [![DOI](https://zenodo.org/badge/104984017.svg)](https://zenodo.org/badge/latestdoi/104984017)
 [![Twitter](https://img.shields.io/twitter/url/https/twitter.com/bendfulcher.svg?style=social&label=Follow%20%40bendfulcher)](https://twitter.com/bendfulcher)

diff --git a/RetrieveGene.py b/RetrieveGene.py
index 829c71e..afef29b 100644
--- a/RetrieveGene.py
+++ b/RetrieveGene.py
@@ -5,11 +5,14 @@
 import time
 import csv # For saving string data to csv
 
+from joblib import Parallel, delayed
+import os
+
 #-------------------------------------------------------------------------------
 # Globals:
 #-------------------------------------------------------------------------------
 # INPUT file names:
-structIDSource = 'structureIDs.csv' # 'layerIDs.csv' # 'structureIDs.csv'
+structIDSource = 'structIDs_all.csv' # 'layerIDs.csv' # 'structureIDs.csv'
 entrezSource = 'geneEntrezID.csv' # 'geneEntrezID.csv'
 
 # OUTPUT file names:
@@ -24,7 +27,7 @@
 json_file_name = 'unionizes.json'
 #-------------------------------------------------------------------------------
 
-def download_mouse_unionizes(geneEntrezID,structureIDs,doReduced=False,writeOut=False):
+def download_mouse_unionizes(geneEntrezID,structureIDs,doReduced='medium',writeOut=False):
     # Given genes and set of structures, retrieves expression data from Allen API
     # Can set geneEntrezID to empty to retrieve for all genes
     # Can set geneEntrezID to be a single number (array length 1) for a single gene
@@ -35,7 +38,10 @@
     string_structureIDs = ','.join(str(sid) for sid in structureIDs)
 
     # Set defaults:
-    if doReduced:
+    if doReduced == 'struc_gene_density':
+        print("Only retrieving a minimal subset of results from the queries")
+        optionsString = "[only$eq'structures.id,data_sets.id,genes.entrez_id,structure_unionizes.expression_density']"
+    elif doReduced=='medium':
         print("Only retrieving a reduced subset of results from the queries")
         optionsString = "[only$eq'structures.id,data_sets.id,data_sets.plane_of_section_id,genes.entrez_id,structure_unionizes.expression_energy,structure_unionizes.expression_density']"
     else:
@@ -55,14 +61,14 @@
         ]
     elif len([geneEntrezID])==1:
         # Get for a single gene:
-        print("Get for the single gene: entrez_id = %u" % geneEntrezID[0])
+        print(("Get for the single gene: entrez_id = %u" % geneEntrezID[0]))
         criteria = [
             "structure[id$in%s][graph_id$eq%d]," % (string_structureIDs, mouse_graph_id),
             "section_data_set[failed$eqfalse](products[id$eq%d],genes[entrez_id$eq%d])" % (mouse_product_id, geneEntrezID[0])
         ]
     elif len(geneEntrezID)>1:
         # Get for multiple genes:
-        print("Getting expression results for %u genes" % len(geneEntrezID))
+        print(("Getting expression results for %u genes" % len(geneEntrezID)))
         string_geneEntrezIDs = ','.join(str(eid) for eid in geneEntrezIDs)
         criteria = [
             "structure[id$in%s][graph_id$eq%d]," % (string_structureIDs, mouse_graph_id),
@@ -84,30 +90,34 @@ def QueryAPI(model,criteriaString,includeString=None,optionsString=None,writeOut
 
     # Settings for retrieval
     rows = []
-    blockSize = 2000
+    blockSize = 1000
     done = False
     startRow = 0
     # for i in range(0, total_rows, blockSize):
 
     while not done:
-        print("Row %d, attempting to retrieve %d rows..." % (startRow, blockSize))
+        print(("Row %d, attempting to retrieve %d rows..." % (startRow, blockSize)))
 
         tot_rows = len(rows)
 
         # apiQueryPartial = partial(api.model_query,model=model,criteria=criteriaString,
         #                             startRow=startRow,num_rows=blockSize)
 
+        print(model, criteriaString, includeString, optionsString, startRow, blockSize)
         rows += api.model_query(model=model,
                                 criteria=criteriaString,
                                 include=includeString,
                                 options=optionsString,
                                 start_row=startRow,
                                 num_rows=blockSize)
 
+        print(rows)
+        raise Exception
+
         numRows = len(rows) - tot_rows # additional rows retrieved on running the query
         startRow += numRows
 
-        print("%d rows retrieved." % numRows)
+        print(("%d rows retrieved." % numRows))
 
         # Check if we're at the end of the road
         if numRows == 0 or numRows < blockSize:
@@ -116,7 +126,7 @@ def QueryAPI(model,criteriaString,includeString=None,optionsString=None,writeOut
     # Write out the results as they come in, if requested
     if writeOut:
         json_utilities.write(json_file_name, rows)
-        print("Wrote to %s" % json_file_name)
+        print(("Wrote to %s" % json_file_name))
 
     return rows
 #-------------------------------------------------------------------------------
@@ -153,7 +163,7 @@ def GetStructureDetails(structureIDs,filterFields,graph_id=1):
     structs = QueryAPI(model="Structure",criteriaString=("[id$in%s][graph_id$eq%u]" \
                     % (string_structureIDs,graph_id))) # (Mouse Brain Atlas)
 
-    print("%u mouse-brain structures retrieved!\n" % len(structs))
+    print(("%u mouse-brain structures retrieved!\n" % len(structs)))
 
     # Filter to only certain fields, specified as filterFields:
     structsFilt = []
@@ -182,16 +192,30 @@ def GetStructureDetails(structureIDs,filterFields,graph_id=1):
 #---------------------------------------------------------------------------
 # Read in structure IDs from csv -> save detailed structure info to file
 #---------------------------------------------------------------------------
-structureIDs = np.genfromtxt(structIDSource,delimiter=',')
-print("Retrieved %d structures from %s..." % (len(structureIDs),structIDSource))
+structureIDs = np.genfromtxt(structIDSource,delimiter=',', dtype=int)
+print(("Retrieved %d structures from %s..." % (len(structureIDs),structIDSource)))
 # Get details of the structures:
 filterFields = ['id','name','acronym','color_hex_triplet']
-structs = GetStructureDetails(structureIDs,filterFields,1)
+
+# parallel downloading through joblib.Parallel, alter num_jobs for more simultaneous downloads
+num_jobs = 6
+num_structures = len(structureIDs)
+
+def structure_details_wrapper(struc_ids):
+    return GetStructureDetails(struc_ids, filterFields, 1)
+struct_list = Parallel(n_jobs=num_jobs)(delayed(structure_details_wrapper)(sub_list)
+                                        for sub_list
+                                        in np.split(structureIDs, list(range(0,num_structures,num_structures//num_jobs))[1:-1]))
+
+structs = sum(struct_list, [])
+# structs = GetStructureDetails(structureIDs,filterFields,1)
 # To dataframe:
 df_struct = pd.DataFrame.from_records(structs)
 # Save as a .csv file:
 df_struct.to_csv(structInfoFilename)
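+# Note on the chunking above (added comment; worked example, not from the
+# original code): np.split cuts structureIDs at the interior boundaries of
+# range(0, num_structures, num_structures//num_jobs). E.g. with 13 structures
+# and num_jobs = 6, the step is 13//6 = 2, the cut points are [2,4,6,8,10],
+# and np.split yields 6 chunks of sizes 2,2,2,2,2,3 -- one per parallel job,
+# with the remainder folded into the last chunk.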
+
+'''
 #---------------------------------------------------------------------------
 # RETRIEVING GENES ONE AT A TIME
 #---------------------------------------------------------------------------
@@ -200,7 +224,7 @@ def GetStructureDetails(structureIDs,filterFields,graph_id=1):
 
 # Read in gene entrez IDs from csv:
 geneEntrezIDs = np.genfromtxt(entrezSource,delimiter=',')
-print("Read in %d genes from %s..." % (len(geneEntrezIDs), entrezSource))
+print(("Read in %d genes from %s..." % (len(geneEntrezIDs), entrezSource)))
 
 # geneEntrezIDs = np.concatenate((geneEntrezIDs[0:1],[20604.]),axis=0)
 unionizes = []
@@ -213,18 +237,19 @@ def GetStructureDetails(structureIDs,filterFields,graph_id=1):
     # df = pd.concat([df,df_i])
     timeSoFar = time.time() - startTime
     timeRemaining = timeSoFar/(ind+1)*(len(geneEntrezIDs)-ind+1)/60
-    print("We're at %d/%d, %f min remaining" % (ind+1,len(geneEntrezIDs),timeRemaining))
+    print(("We're at %d/%d, %f min remaining" % (ind+1,len(geneEntrezIDs),timeRemaining)))
 totalTime = (time.time() - startTime)/60.
-print("Took %f min total" % totalTime)
+print(("Took %f min total" % totalTime))
 
 # Make a data frame of the datasets downloaded:
 df = unionizes_to_dataframe(unionizes)
 df = df.sort_values(by=['structure_id','data_set_id'])
 df.to_csv(allDataFilename)
-print("Saved data frame to %s" % allDataFilename)
+print(("Saved data frame to %s" % allDataFilename))
 print(df)
 
 # df = pd.read_csv(allDataFilename)
+'''
 
 #-------------------------------------------------------------------------------
 # RETRIEVING GENES ALL TOGETHER IN ONE QUERY
@@ -233,44 +258,58 @@ def GetStructureDetails(structureIDs,filterFields,graph_id=1):
 # back to the matching genes...? (25924 rows per structure...):
 # No need to write to a json file as you go (slows down)
 # Only retrieve the necessary fields (doReduced)
-# unionizes = download_mouse_unionizes([],structureIDs,doReduced=True,writeOut=False)
 
+# Read in gene entrez IDs from csv:
+geneEntrezIDs = np.genfromtxt(entrezSource,delimiter=',')
+print(("Read in %d genes from %s..." % (len(geneEntrezIDs), entrezSource)))
+
+if not os.path.exists(json_file_name):
+    # Parallelize download using num_jobs and num_structures from the previous parallelization
+    def download_mouse_unionizes_wrapper(struc_ids):
+        return download_mouse_unionizes([],struc_ids,doReduced='medium',writeOut=False)
+    unionizes = Parallel(n_jobs=num_jobs)(delayed(download_mouse_unionizes_wrapper)(sub_list)
+                                          for sub_list
+                                          in np.split(structureIDs, list(range(0,num_structures,num_structures//num_jobs))[1:-1]))
+    unionizes = sum(unionizes, [])
+
+    # unionizes = download_mouse_unionizes([],structureIDs,doReduced='medium',writeOut=False)
+
+    # Write the agglomerated datasets to file
+    json_utilities.write(json_file_name, unionizes)
+    print("Wrote %d unionizes to %s" % (len(unionizes),json_file_name))
 #-------------------------------------------------------------------------------
-# Write the agglomerated datasets to file
-# json_utilities.write(json_file_name, unionizes)
-# print("Wrote %d unionizes to %s" % (len(unionizes),json_file_name))
 
 # Read in previously-downloaded results:
-# unionizes = json_utilities.read(json_file_name)
+unionizes = json_utilities.read(json_file_name)
 
 # To dataframe and then filter to include only genes in our geneEntrezIDs list:
-# dfFull = unionizes_to_dataframe(unionizes)
-# dfFull.shape[0]
+dfFull = unionizes_to_dataframe(unionizes)
+dfFull.shape[0]
 # # Now we need to filter to include only genes in our geneEntrezIDs list:
 # # (actually this filtering is not necessary) -- potentially a nice check that we only retrieved
 # # entrez_ids corresponding to section datasets with expression (22,000)
-# df = dfFull.copy()
-# df = df[df['gene_entrez'].isin(geneEntrezIDs)]
-# df.shape[0]
-# len(geneEntrezIDs)
+df = dfFull.copy()
+df = df[df['gene_entrez'].isin(geneEntrezIDs)]
+df.shape[0]
+len(geneEntrezIDs)
 # # (so something like 3,000 duplicates?)
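+# (Added note: duplicates are expected here -- a gene can have more than one
+# section dataset, e.g. separate coronal and sagittal experiments, so rows
+# can outnumber the unique entrez_ids.)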
-# df = df.sort(['structure_id','data_set_id'])
-# df.tail
-# df_filter = df.drop_duplicates()
-# df['data_set_id']
+df = df.sort_values(by=['structure_id','data_set_id'])
+df.tail()
+df_filter = df.drop_duplicates()
+df['data_set_id']
 
 # # Save out just the gene information:
 # dfGenes = df[]
 #
 # # Save data summary as a .csv file:
-# csv_file_name = 'filteredDataFrame.csv'
-# df.to_csv(csv_file_name)
+csv_file_name = 'filteredDataFrame.csv'
+df.to_csv(csv_file_name)
 
 # Compute the mean expression energy for each structure_id:
-# gb = df.groupby(['gene_entrez','structure_id'])['expression_energy'].mean()
-# print(gb)
+gb = df.groupby(['gene_entrez','structure_id'])['expression_energy'].mean()
+print(gb)
 
-# gb.pivot(index="structure_id", columns="gene_entrez", values="expression_energy")
+gb.reset_index().pivot(index="structure_id", columns="gene_entrez", values="expression_energy") # a grouped Series has no .pivot; reset_index() first
 
 print("Writing expression energy and density matrices out")
 for expressionType in ['expression_energy','expression_density']:
     table = pd.pivot_table(df, values=expressionType, index="structure_id",
@@ -282,8 +321,8 @@ def GetStructureDetails(structureIDs,filterFields,graph_id=1):
     # Save mean expression energy to a .csv output file:
     fileName = ("%s_%ux%u.csv" % (expressionType, table.shape[0], table.shape[1]))
     table.to_csv(fileName,na_rep='NaN')
-    print("Saved expression energy of %u datasets over %u structures to %s" \
-                % (table.shape[1],table.shape[0],fileName))
+    print(("Saved expression energy of %u datasets over %u structures to %s" \
+                % (table.shape[1],table.shape[0],fileName)))
 
 # Need to also output the section dataset IDs (columns) and struct IDs (rows)
 dataSetIDs = pd.DataFrame(list(table.axes[0]))
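For reference, the wide gene-expression matrices above are plain `pd.pivot_table` reshapes of the long-format unionize table. A minimal sketch on toy data (illustrative only; the values are invented):

```python
import numpy as np
import pandas as pd

# Long-format rows, as produced by unionizes_to_dataframe (toy values):
df = pd.DataFrame({
    'structure_id':      [22, 22, 1070, 1070],
    'data_set_id':       [1, 2, 1, 2],
    'expression_energy': [0.5, 1.5, 2.0, 4.0],
})

# One row per structure, one column per section dataset:
table = pd.pivot_table(df, values='expression_energy',
                       index='structure_id', columns='data_set_id',
                       aggfunc=np.mean)
print(table)  # 2x2 matrix of mean expression energies
```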
diff --git a/WriteStructureInfo.py b/WriteStructureInfo.py
index 7197341..3f13271 100644
--- a/WriteStructureInfo.py
+++ b/WriteStructureInfo.py
@@ -50,12 +50,21 @@ def writePalomeroStructures(tree):
     palomeroStructs = tree.get_structures_by_id(theStructIDs)
     writeIDsAndInfo(palomeroStructs,'structIDs_palomero.csv','structInfo_palomero.csv')
 #-------------------------------------------------------------------------------
+def writeAllStructures(tree):
+    theStructIDs = [d['id'] for d in tree.nodes()]
+    theStructIDs = list(set(theStructIDs))
+    structs = tree.get_structures_by_id(theStructIDs)
+    writeIDsAndInfo(structs,'structIDs_all.csv','structInfo_all.csv')
+#-------------------------------------------------------------------------------
+
 tree = getFullStructureTree()
-writeWhat = 'palomero'
+writeWhat = 'all'
 if writeWhat=='palomero':
     writePalomeroStructures(tree)
 elif writeWhat=='cortical':
     writeCorticalStructures(tree)
 elif writeWhat=='layer':
     writeLayerStructures(tree)
+elif writeWhat=='all':
+    writeAllStructures(tree)

From d6463892d4461629812ec6d1aa83c44311bbfec4 Mon Sep 17 00:00:00 2001
From: thinnerichs
Date: Thu, 16 Dec 2021 14:45:56 +0100
Subject: [PATCH 3/3] Update RetrieveGene.py for checkpointing

---
 RetrieveGene.py | 36 +++++++++++++++++++++++++++---------
 1 file changed, 27 insertions(+), 9 deletions(-)

diff --git a/RetrieveGene.py b/RetrieveGene.py
index afef29b..78b14ca 100644
--- a/RetrieveGene.py
+++ b/RetrieveGene.py
@@ -103,15 +103,13 @@ def QueryAPI(model,criteriaString,includeString=None,optionsString=None,writeOut
 
         # apiQueryPartial = partial(api.model_query,model=model,criteria=criteriaString,
         #                             startRow=startRow,num_rows=blockSize)
 
-        print(model, criteriaString, includeString, optionsString, startRow, blockSize)
+        # print(model, criteriaString, includeString, optionsString, startRow, blockSize)
         rows += api.model_query(model=model,
                                criteria=criteriaString,
                                 include=includeString,
                                 options=optionsString,
                                 start_row=startRow,
                                 num_rows=blockSize)
 
-        print(rows)
-        raise Exception
 
         numRows = len(rows) - tot_rows # additional rows retrieved on running the query
 
@@ -139,9 +137,9 @@ def unionizes_to_dataframe(unionizes):
         'expression_density': unionize['expression_density'],
         'data_set_id': unionize['section_data_set']['id'],
         'plane_of_section_id': unionize['section_data_set']['plane_of_section_id'],
-        # 'gene_acronym': unionize['section_data_set']['genes'][0]['acronym'],
-        # 'gene_name': unionize['section_data_set']['genes'][0]['name'],
-        # 'gene_entrez': unionize['section_data_set']['genes'][0]['entrez_id'],
+        'gene_acronym': unionize['section_data_set']['genes'][0]['acronym'],
+        'gene_name': unionize['section_data_set']['genes'][0]['name'],
+        'gene_entrez': unionize['section_data_set']['genes'][0]['entrez_id'],
         })
     return pd.DataFrame.from_records(fdata)
 #-------------------------------------------------------------------------------
@@ -215,7 +213,6 @@ def structure_details_wrapper(struc_ids):
 df_struct.to_csv(structInfoFilename)
 
-'''
 #---------------------------------------------------------------------------
 # RETRIEVING GENES ONE AT A TIME
 #---------------------------------------------------------------------------
@@ -223,12 +220,14 @@ def structure_details_wrapper(struc_ids):
 # to get all genes at once?
 
 # Read in gene entrez IDs from csv:
-geneEntrezIDs = np.genfromtxt(entrezSource,delimiter=',')
+geneEntrezIDs = np.genfromtxt(entrezSource,delimiter=',',dtype=int)
 print(("Read in %d genes from %s..." % (len(geneEntrezIDs), entrezSource)))
 
 # geneEntrezIDs = np.concatenate((geneEntrezIDs[0:1],[20604.]),axis=0)
 unionizes = []
 startTime = time.time()
+storage_path = './data/'
+os.makedirs(storage_path, exist_ok=True) # make sure the checkpoint directory exists before writing
+'''
 for ind, geneEntrezID in enumerate(geneEntrezIDs):
     unionizes_gid = download_mouse_unionizes([geneEntrezID],structureIDs,
                     doReduced=True,writeOut=False)
@@ -238,9 +237,27 @@ def structure_details_wrapper(struc_ids):
     timeSoFar = time.time() - startTime
     timeRemaining = timeSoFar/(ind+1)*(len(geneEntrezIDs)-ind+1)/60
     print(("We're at %d/%d, %f min remaining" % (ind+1,len(geneEntrezIDs),timeRemaining)))
+'''
+def download_unionizes_wrapper(i, gene_id):
+    try:
+        unionizes_gid = download_mouse_unionizes([gene_id],structureIDs, doReduced=True,writeOut=False)
+    except Exception:
+        file = 'missed_genes'
+        with open(file=file,mode='a') as f:
+            print(gene_id, file=f)
+        return # skip this gene; without returning, unionizes_gid would be undefined below
+    struc_df = unionizes_to_dataframe(unionizes_gid)
+    struc_df.to_csv(storage_path+str(gene_id)+'_structure_unionizes.csv')
+    print(f'{i} of {len(geneEntrezIDs)} done.')
+    return
+
+n_jobs = 6
+Parallel(n_jobs=n_jobs)(delayed(download_unionizes_wrapper)(i, geneEntrezID) for i, geneEntrezID in enumerate(geneEntrezIDs))
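+# (Sketch, added for illustration -- genes logged to 'missed_genes' above could
+# be retried in a second pass reusing the same wrapper, e.g.:
+#     missed = [int(g) for g in open('missed_genes')]
+#     Parallel(n_jobs=n_jobs)(delayed(download_unionizes_wrapper)(i, g)
+#                             for i, g in enumerate(missed))
+# This retry loop is hypothetical and not part of the original patch.)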
+
 totalTime = (time.time() - startTime)/60.
 print(("Took %f min total" % totalTime))
 
+raise Exception # deliberate stop for now: the aggregation below still expects the in-memory results of the disabled loop
+
 # Make a data frame of the datasets downloaded:
 df = unionizes_to_dataframe(unionizes)
 df = df.sort_values(by=['structure_id','data_set_id'])
@@ -249,7 +266,6 @@ def structure_details_wrapper(struc_ids):
 print(df)
 
 # df = pd.read_csv(allDataFilename)
-'''
 
 #-------------------------------------------------------------------------------
 # RETRIEVING GENES ALL TOGETHER IN ONE QUERY
@@ -259,6 +275,7 @@ def structure_details_wrapper(struc_ids):
 # No need to write to a json file as you go (slows down)
 # Only retrieve the necessary fields (doReduced)
 
+'''
 # Read in gene entrez IDs from csv:
 geneEntrezIDs = np.genfromtxt(entrezSource,delimiter=',')
 print(("Read in %d genes from %s..." % (len(geneEntrezIDs), entrezSource)))
@@ -281,6 +298,7 @@ def download_mouse_unionizes_wrapper(struc_ids):
 
 # Read in previously-downloaded results:
 unionizes = json_utilities.read(json_file_name)
+'''
 
 # To dataframe and then filter to include only genes in our geneEntrezIDs list:
 dfFull = unionizes_to_dataframe(unionizes)
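Patch 3 stops before reassembling its per-gene checkpoint files. A minimal sketch of how the `./data/*_structure_unionizes.csv` files written by `download_unionizes_wrapper` could be stitched back into the single data frame (`bigDataFrame.csv`); the glob pattern and index handling are assumptions, not part of the patches:

```python
import glob
import pandas as pd

# Collect every per-gene checkpoint written by download_unionizes_wrapper:
frames = [pd.read_csv(fname, index_col=0)
          for fname in glob.glob('./data/*_structure_unionizes.csv')]

# Stitch them into one long-format table, mirroring the disabled
# aggregation code at the end of RetrieveGene.py:
df = pd.concat(frames, ignore_index=True)
df = df.sort_values(by=['structure_id', 'data_set_id'])
df.to_csv('bigDataFrame.csv')
```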