diff --git a/Colab_CATE/CATE_on_Colab.ipynb b/Colab_CATE/CATE_on_Colab.ipynb new file mode 100644 index 0000000..bc3284c --- /dev/null +++ b/Colab_CATE/CATE_on_Colab.ipynb @@ -0,0 +1,422 @@ +{ + "nbformat": 4, + "nbformat_minor": 0, + "metadata": { + "colab": { + "provenance": [], + "gpuType": "T4" + }, + "kernelspec": { + "name": "python3", + "display_name": "Python 3" + }, + "language_info": { + "name": "python" + }, + "accelerator": "GPU" + }, + "cells": [ + { + "cell_type": "markdown", + "source": [ + "# Welcome to CATE on Google Colab!\n", + "\n", + "*Please make sure that the Colab session you have connected to has an active GPU (e.g., T4).*\n", + "\n", + "#### How to Cite\n", + "\n", + "CATE has been successfully published in the journal Methods in Ecology and Evolution (MEE). If you find this framework or the software solution useful in your analyses, please CITE the published article available in [MEE, CATE: A fast and scalable CUDA implementation to conduct highly parallelized evolutionary tests on large scale genomic data](https://doi.org/10.1111/2041-210X.14168).\n", + "\n", + "To cite CATE's code, please use the Zenodo release:\n", + "\n", + "[![DOI](https://zenodo.org/badge/DOI/10.5281/zenodo.7987769.svg)](https://doi.org/10.5281/zenodo.7987769)\n", + "\n", + "The details of the citation are listed below:\n", + "\n", + "Perera, D., Reisenhofer, E., Hussein, S., Higgins, E., Huber, C. D., & Long, Q. (2023).\n", + "CATE: A fast and scalable CUDA implementation to conduct highly parallelized evolutionary tests on large scale genomic data.\n", + "Methods in Ecology and Evolution, 00, 1–15.\n", + "[https://doi.org/10.1111/2041-210X.14168](https://doi.org/10.1111/2041-210X.14168)." 
+ ], + "metadata": { + "id": "QQJm3QGohP3h" + } + }, + { + "cell_type": "markdown", + "source": [ + "## Installing CATE on Google Colab" + ], + "metadata": { + "id": "stes-jMBipuZ" + } + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "y6mhVj-FfYLN", + "outputId": "9ec46f93-2bed-4bb9-de2f-ac33569087c1" + }, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "/content\n", + "Reading package lists... Done\n", + "Building dependency tree... Done\n", + "Reading state information... Done\n", + "git is already the newest version (1:2.34.1-1ubuntu1.11).\n", + "0 upgraded, 0 newly installed, 0 to remove and 49 not upgraded.\n", + "Cloning into 'CATE'...\n", + "remote: Enumerating objects: 1064, done.\u001b[K\n", + "remote: Counting objects: 100% (362/362), done.\u001b[K\n", + "remote: Compressing objects: 100% (212/212), done.\u001b[K\n", + "remote: Total 1064 (delta 242), reused 238 (delta 148), pack-reused 702 (from 1)\u001b[K\n", + "Receiving objects: 100% (1064/1064), 42.63 MiB | 21.25 MiB/s, done.\n", + "Resolving deltas: 100% (702/702), done.\n", + "/content/CATE\n" + ] + } + ], + "source": [ + "import os\n", + "\n", + "# Set a known valid directory as the current working directory\n", + "os.chdir('/content')\n", + "\n", + "# Verify the current working directory\n", + "!pwd\n", + "\n", + "# Install git if it's not already installed\n", + "!apt-get install git -y\n", + "\n", + "# Clone the CATE GitHub repository\n", + "!git clone https://github.com/theLongLab/CATE\n", + "\n", + "# Check if the directory exists before changing to it\n", + "if os.path.isdir(\"CATE\"):\n", + " %cd CATE\n", + "else:\n", + " print(\"Directory 'CATE' does not exist. 
Cloning might have failed.\")" + ] + }, + { + "cell_type": "code", + "source": [ + "# Compile every CUDA (*.cu) and C++ (*.cpp) source file in the current directory with nvcc (C++17) into one executable named CATE\n", + "!nvcc -std=c++17 *.cu *.cpp -o \"CATE\"\n", + "\n", + "# Run the freshly compiled program from the current directory\n", + "# -h prints CATE's help menu, which lists the currently available functions\n", + "!./CATE -h\n" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "_Sg9egmcf3Mn", + "outputId": "f870e94a-551e-4b60-db12-76aa592abfea" + }, + "execution_count": null, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + " /\\\\\\\\\\\\\\\\\\ /\\\\\\\\\\\\\\\\\\ /\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\ /\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\n", + " /\\\\\\//////// /\\\\\\\\\\\\\\\\\\\\\\\\\\ \\///////\\\\\\///// \\/\\\\\\///////////\n", + "/\\\\\\/ /\\\\\\/////////\\\\\\ \\/\\\\\\ \\/\\\\\\\n", + "/\\\\\\ \\/\\\\\\ \\/\\\\\\ \\/\\\\\\ \\/\\\\\\\\\\\\\\\\\\\\\\\n", + "\\/\\\\\\ \\/\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\ \\/\\\\\\ \\/\\\\\\///////\n", + " \\//\\\\\\ \\/\\\\\\/////////\\\\\\ \\/\\\\\\ \\/\\\\\\\n", + " \\///\\\\\\ \\/\\\\\\ \\/\\\\\\ \\/\\\\\\ \\/\\\\\\\n", + " \\////\\\\\\\\\\\\\\\\\\ \\/\\\\\\ \\/\\\\\\ \\/\\\\\\ \\/\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\n", + " \\///////// \\/// \\/// \\/// \\///////////////\n", + "\n", + "CATE: CUDA Accelerated Testing of Evolution\n", + "Evolutionary tests for large scale genomic data\n", + "----------------------------------------------\n", + "HOW TO CITE:\n", + "Perera, D., Reisenhofer, E., Hussein, S., Higgins, E., Huber, C. D., & Long, Q. 
(2023).\n", + "CATE: A fast and scalable CUDA implementation to conduct highly parallelized evolutionary tests on large scale genomic data.\n", + "Methods in Ecology and Evolution, 00, 1–15.\n", + "https://doi.org/10.1111/2041-210X.14168.\n", + "----------------------------------------------\n", + "\n", + "HELP MENU\n", + "---------\n", + "\n", + "Execution format: \"[--function or -f] properties_file.json\"\n", + "\n", + "** Available functions are (not CaSe sensitive) **\n", + "\n", + "PROCESS MODES:\n", + "A high performance and fully customizable mode called PROMETHEUS is available.\n", + "\n", + "PROMETHEUS is available for the three neutrality tests (Tajima's D, Fay and Wu tests and Fu and Li tests).\n", + "PROMETHEUS is designed for power users on (High Performance Computing) HPC systems.\n", + "PROMETHEUS is activated via the parameters file. All other protocols of test execution remains the same.\n", + "PROMETHEUS uses a CUDA powered engine, therefore, requires a CUDA capable GPU.\n", + "\n", + "PROMETHEUS can be configured by the following five parameters:\n", + "1. Prometheus activate: \"YES\" or \"NO\" parameters used to turn the mode ON or OFF.\n", + "2. CPU cores : Controls the maximum number of cores that can be used at a time.\n", + "3. SNPs per time : Controls the max number of SNPs that will be processed on the GPU at a time.\n", + "4. Number of genes : Controls the number gene combinations that will be processed at a time.\n", + "5. Multi read : \"YES\" or \"NO\" parameters used to control the ability to read multiple files at once.\n", + "\n", + "CALCULATION MODES:\n", + "CATE can perform tests for pre-defined gene regions or the classical sliding window mechanism.\n", + "\n", + "Parameters for calculation mode is as follows:\n", + "Calculation mode: Can be either \"WINDOW\" or \"FILE\".\n", + "\n", + "If the calculation mode is \"WINDOW\" then the following two parameters need to be configured:\n", + "1. 
Window size: Base pair size of the window or range of the combination.\n", + "2. Step size : The base pair amount by which the next window's start will be incremented.\n", + "NOTE: If \"Step size\" is set to \"0\" then CATE will shift to a continuous sliding window mode.\n", + "\n", + "If the calculation mode is \"FILE\" then the following two parameters need to be configured:\n", + "1. Universal gene list: Configure the location for the tab deliminated gene list file for all tests.\n", + "2. * gene list : Specify the location of the per test file or set it as \"universal\" to access the universal list.\n", + "\n", + "TOOLS:\n", + "A set of simple tools used to manipulate and alter VCF and FASTA files.\n", + "\n", + "-svcf or --splitvcf\t: Splits VCF file\n", + ". \t\t There are two major modes, \"CHR\" and \"CTSPLIT\".\n", + " \t\t CHR mode splits a VCF file by chromosome as well as extracts just the GT column data.\n", + " \t\t CTSPLIT mode creates CATE's file heirarchy from a vcf file.\n", + " \t\t For CTSPLIT the vcf must have only a single chromosome's data only the GT column present.\n", + " \t\t CTSPLIT can separate VCF data by population and even carry out by population MAF filtration.\n", + " \t\t CTSPLIT files are placed in their respective population folders.\n", + " \t\t CTSPLIT files are named as follows: \"CHROMOSOMEnumber_COUNTRY_STARTposition_ENDposition.vcf\".\n", + "\n", + "-sfasta or --splitfasta\t: Split a user specified FASTA file to individual FASTA files.\n", + " \t\t Can be used to extract a singular user specified sequence as well.\n", + " \t\t Split files are placed in a user specified folder.\n", + " \t\t Each FASTA file name will be the name of the respective sequence entry.\n", + "\n", + "-mfasta or --mergefasta\t: Merge all FASTA files in a user specified folder to an individual FASTA file.\n", + " \t\t Ensure that the FASTA files have the APPROPRIATE extensions: .fasta, .fna, .ffn, .faa, .frn, .fa\n", + "\n", + "-egenes or 
--extractgenes : Reads the gene list file to extract the gene sequences from the reference genome.\n", + " FASTA format reference genome must be specified.\n", + " All gene sequences will be generated into separate FASTA files.\n", + "\n", + "-g2g or --gff2gene : Creates the gene list file in a *.txt format from the input GFF3 file.\n", + " Note that only regions annotated as genes will be extracted.\n", + "\n", + "-hapext or --hapfromvcf : Extracts haplotypes and their sequences for a predefined gene list from a (split) VCF (indexed) folder provided the reference sequence.\n", + " The reference genome must be provided in a FASTA file.\n", + " The system will automatically identify each haplotype present.\n", + " In addition to the summary output each haplotype present for each gene will be generated in a separate FASTA file.\n", + " IF \"Population out\" is set to \"YES\" then the entire population's FASTA configuration will be generated as well.\n", + " Uses a CUDA powered engine, therefore, requires a CUDA capable GPU.\n", + " File format is *.hsum (a tab deliminated text file).\n", + "\n", + "-m2g or --map2gene\t: Creates the gene list file in a *.txt format from the input MAP file.\n", + "\n", + "-pparam or --printparam : Prints a sample layout of the parameter file to the specified location.\n", + " State the path with the name of the parameter file after the \"-pparam\" function.\n", + "\n", + "EVOLUTION TESTS:\n", + "Core functions optimized for conducting Evolution tests on VCF files.\n", + "\n", + "-t or --tajima\t: Calculates the Tajima's D statistic (1989) using a (split) VCF (indexed) folder.\n", + " \t Uses a CUDA powered engine, therefore, requires a CUDA capable GPU.\n", + " \t File format is *.td (a tab deliminated text file).\n", + "\n", + "-f or --fuli\t: Calculates the Fu and Li's D, D*, F and F* statistics (1993) using a (split) VCF (indexed) folder.\n", + " \t ** The D, D* and F statistics are calculated based on the original paper by Fu et al 
(1993).\n", + " \t ** The F* statistic's vf* and uf* are calculated based on the corrected equations in Simonsen et al (1995).\n", + " \t Uses a CUDA powered engine, therefore, requires a CUDA capable GPU.\n", + " \t File format is *.fl (a tab deliminated text file).\n", + "\n", + "-w or --faywu\t: Calculates the Fay and Wu's normalized H and E statistics (2006) using a (split) VCF (indexed) folder.\n", + " \t Uses a CUDA powered engine, therefore, requires a CUDA capable GPU.\n", + " \t File format is *.fw (a tab deliminated text file).\n", + "\n", + "-n or --neutrality: Calculates the above three Neutrality tests (Tajima's Fu and Li's and Fay and Wu's) at once.\n", + " Uses a CUDA powered engine, therefore, requires a CUDA capable GPU.\n", + " File format is *.nt (a tab deliminated text file).\n", + "\n", + "-m or --mk \t: Calculates the McDonald–Kreitman Neutrality Index (NI) (1991) for a predefined gene list using a (split) vcf (indexed) folder.\n", + " \t The reference genome must be provided in a FASTA format file.\n", + " \t Two \"MODES\" exist. \"CHROM\" mode and \"GENE\" mode. 
Either must be specified\n", + " \t CHROM mode: Conducts the test on an alignment does across the entire chromosomes\n", + " \t Alignment file of the reference genome to the outgroup genome must also be provided in a *.maf format file.\n", + " \t PLEASE ensure that the REFERENCE sequence is first and OUTGROUP sequence is second in the MAF file.\n", + " \t ** TIP: Chromosome wide whole genome alignment software: GSAlign (https://github.com/hsinnan75/GSAlign).\n", + " \t GENE mode : Conducts the test on alignments per gene between the reference gene and the outgroup gene.\n", + " \t Each gene's alignment file location must be provided as a third column in tab deliminated the gene list file.\n", + " \t Alignments must be provided in the blastn *.txt format.\n", + " \t PLEASE ensure that the REFERENCE gene sequence is the QUERY and OUTGROUP gene sequence is the SUBJECT.\n", + " \t NCBI's online blastn (https://blast.ncbi.nlm.nih.gov/Blast.cgi),\n", + " \t or command line BLAST+ (https://anaconda.org/bioconda/blast) can be used.\n", + " \t Uses a CUDA powered engine, therefore, requires a CUDA capable GPU.\n", + " \t File format is *.mc (a tab deliminated text file).\n", + "\n", + "-x or --fst \t: Calculates the Fixation Index (Fst) (1965) using a (split) vcf (indexed) folder.\n", + " \t The population index of the sequenced samples must be provided in a tab deliminated *txt format file.\n", + " \t Uses a CUDA powered engine, therefore, requires a CUDA capable GPU.\n", + " \t File format is *.fst (a tab deliminated text file).\n", + "\n", + "-e or --ehh \t: Calculates the Extended Haplotype Homozygosity (EHH) (2002) for a predefined gene list using a (split) vcf (indexed) folder.\n", + " \t The \"MODE\" used to generate the extended haplotype region must be specified.\n", + " \t Either \"FILE\" or \"FIXED\" mode can be used where the core haplotype spans over a single SNP.\n", + " \t FILE mode: In the tab deliminated gene list file a tertiary column containing the 
extended regions dimension's will be present.\n", + " \t Formats include \"START_position:END_position\" or +VALUE or -VALUE.\n", + " \t \"+\" Causes the START_position of the gene region to be incremented by the user specified value.\n", + " \t \"-\" Causes the START_position of the gene region to be reduced by the user specified value.\n", + " \t FIXED mode: In the parameters file's \"FIXED mode\" section specify the +VALUE or -VALUE. It will be applied to all gene regions.\n", + " \t \"SNP\" or \"BP\" mode can be used where the core haplotype spans only a single SNP.\n", + " \t In the tab deliminated gene list file the single SNP will be specified in the second column.\n", + " \t Format include \"CHROMOSOME_NUMBER:GENOMIC_POSITION\".\n", + " \t SNP mode: \"SNP default count\" is used to specify the number of SNPs that will be displaced on either side of the core SNP.\n", + " \t BP mode: \"SNP BP displacement\" is used to specify the number of base pairs that will be displaced on either side of the core SNP.\n", + " \t Uses a CUDA powered engine, therefore, requires a CUDA capable GPU.\n", + " \t File format is *.ehh (a tab deliminated text file).\n", + "\n", + " _____\n", + " (, / | /) /)\n", + " /---| __ ___// // ___\n", + " ) / |_/_)_(_)(/_(/_(_)\n", + "(_/ .-/\n", + " (_/\n", + "\n", + "CATE powered viral simulator\n", + "----------------------------------------------\n", + "\n", + "This section covers the Apollo simulator function and its complimentary tools.\n", + "\n", + "MAIN APOLLO FUNCTION:\n", + "\n", + "-sim or --simulator : Executes the Apollo simulator. 
Designed to simulate epidemics in a population.\n", + " Simulations are complete with within host dynamics.\n", + " Requires the configuration of 4 parameter files (Master, network, host & genome).\n", + "\n", + "APOLLO UTILITY TOOLS:\n", + "\n", + "-hr or --hapretrieve : Retrieves unique sequence configurations complete with the tissue, generation, sequence configuration and frequency.\n", + " Three tab-delimited *.csv are created. They contain haplotype information on all virions (all_Haplotype_Frequencies.csv),\n", + " virions that survived to the next generation (alive_Haplotype_Frequencies.csv) and,\n", + " those that formed progeny by becoming parents (parent_Haplotype_Frequencies.csv).\n", + "\n", + "-pedr or --pedretrieve : It provides the pedigree of all sequences found in the given generation of the host’s tissues.\n", + " The pedigree is traced for each sequence till the initial ancestral genomes are identified.\n", + " It creates two tab delimited *.csv files.\n", + " pedigree_Relationships.csv: Provides information of the parent progeny relationships.\n", + " sequence_Information.csv: Provides the sequence information of each ancestral virion.\n", + "\n", + "-segm or --segmatch : In an individual, identifies a given set of query sequences by alignment\n", + " to simulated sequences by the given degree of matching segregating sites.\n", + " Will create a folder called seg_Match within the nodes results and provide a tab delimited *.csv file\n", + " with the complete information of the virions with matching sequences.\n", + "\n", + "-s2j or --site2json : Converts the base substitution *.csv to Apollo’s JSON format,\n", + "\n", + "-r2j or --recom2json : Converts the recombination hotspots *.csv to Apollo’s JSON format.\n", + "\n", + "EXTRAS:\n", + "\n", + "-c or --cuda\t: Lists all available CUDA capable devices on machine.\n", + " \t Use the \"GPU number\" to select the desired CUDA device in the parameter file.\n", + "\n", + "-h or --help\t: Accesses 
this help menu where the software's currently available functions are listed.\n", + "\n", + "Program has completed its run.\n" + ] + } + ] + }, + { + "cell_type": "markdown", + "source": [ + "\n", + "\n", + "---\n", + "\n", + "## Running CATE on Google Colab" + ], + "metadata": { + "id": "LKtwJyIKiwY_" + } + }, + { + "cell_type": "code", + "source": [ + "# Test run: -c lists the CUDA capable GPU(s) on this machine; use the reported GPU number to select a device in the parameter file\n", + "!./CATE -c" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "LsKzbKC2grCC", + "outputId": "fa2e72e1-0151-433a-a3c5-22ff2359fffe" + }, + "execution_count": null, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + " /\\\\\\\\\\\\\\\\\\ /\\\\\\\\\\\\\\\\\\ /\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\ /\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\n", + " /\\\\\\//////// /\\\\\\\\\\\\\\\\\\\\\\\\\\ \\///////\\\\\\///// \\/\\\\\\///////////\n", + "/\\\\\\/ /\\\\\\/////////\\\\\\ \\/\\\\\\ \\/\\\\\\\n", + "/\\\\\\ \\/\\\\\\ \\/\\\\\\ \\/\\\\\\ \\/\\\\\\\\\\\\\\\\\\\\\\\n", + "\\/\\\\\\ \\/\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\ \\/\\\\\\ \\/\\\\\\///////\n", + " \\//\\\\\\ \\/\\\\\\/////////\\\\\\ \\/\\\\\\ \\/\\\\\\\n", + " \\///\\\\\\ \\/\\\\\\ \\/\\\\\\ \\/\\\\\\ \\/\\\\\\\n", + " \\////\\\\\\\\\\\\\\\\\\ \\/\\\\\\ \\/\\\\\\ \\/\\\\\\ \\/\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\n", + " \\///////// \\/// \\/// \\/// \\///////////////\n", + "\n", + "CATE: CUDA Accelerated Testing of Evolution\n", + "Evolutionary tests for large scale genomic data\n", + "----------------------------------------------\n", + "HOW TO CITE:\n", + "Perera, D., Reisenhofer, E., Hussein, S., Higgins, E., Huber, C. D., & Long, Q. 
(2023).\n", + "CATE: A fast and scalable CUDA implementation to conduct highly parallelized evolutionary tests on large scale genomic data.\n", + "Methods in Ecology and Evolution, 00, 1–15.\n", + "https://doi.org/10.1111/2041-210X.14168.\n", + "----------------------------------------------\n", + "\n", + "Listing all CUDA capable devices:\n", + "\n", + "GPU number\t: 0\n", + "GPU name\t: Tesla T4\n", + "GPU memory (GB)\t: 15\n", + "GPU number of multiprocessor(s)\t: 40\n", + "GPU block(s) per multiprocessor\t: 16\n", + "GPU thread(s) per block\t: 1024\n", + "\n", + "All CUDA capable devices have been listed\n", + "\n", + "Program has completed its run.\n" + ] + } + ] + }, + { + "cell_type": "markdown", + "source": [ + "\n", + "\n", + "---\n", + "\n", + "\n", + "**Please refer to [CATE's wiki](https://github.com/theLongLab/CATE/wiki/How-to-use) for a detailed breakdown of how to execute different functions provided by the software.**\n", + "\n" + ], + "metadata": { + "id": "1b9oIrx8iGSJ" + } + } + ] +} \ No newline at end of file