diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index e3ff5e02..bd430bf4 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -62,6 +62,8 @@ jobs: "test_10x_sc", "test_clontech_umi", "test_nebnext_umi", + "test_rnaseq_bulk", + "test_rnaseq_sc", ] fail-fast: false steps: diff --git a/CHANGELOG.md b/CHANGELOG.md index 2689754a..a9cf4c5e 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -18,6 +18,10 @@ and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0. | Dependency | Old version | New version | | ---------- | ----------- | ----------- | | biopython | 1.71 | 1.81 | +| enchantr | 0.1.15 | 0.1.16 | +| scoper | 1.2.1 | 1.3.0 | +| dowser | 1.2.0 | 2.1.0 | +| igphyml | 1.1.5 | 2.0.0 | ## [4.0] - 2024-04-22 Ascendio diff --git a/conf/modules.config b/conf/modules.config index 3dc63fa9..dd315b85 100644 --- a/conf/modules.config +++ b/conf/modules.config @@ -422,6 +422,14 @@ process { ] } + withName: CHANGEO_PARSEDB_SELECT_LOCUS { + publishDir = [ + path: { "${params.outdir}/vdj_annotation/select-locus/${meta.id}" }, + mode: params.publish_dir_mode, + saveAs: { filename -> filename.equals('versions.yml') ? null : filename } + ] + } + withName: CHANGEO_PARSEDB_SPLIT { publishDir = [ path: { "${params.outdir}/vdj_annotation/04-select-productive/${meta.id}" }, diff --git a/conf/test_rnaseq_bulk.config b/conf/test_rnaseq_bulk.config new file mode 100644 index 00000000..eb10e0d9 --- /dev/null +++ b/conf/test_rnaseq_bulk.config @@ -0,0 +1,26 @@ +/* + * ------------------------------------------------- + * Nextflow config file for running tests + * ------------------------------------------------- + * Defines bundled input files and everything required + * to run a fast and simple test. 
Use as follows: + * nextflow run nf-core/airrflow -profile test_rnaseq_bulk,<docker/singularity> + */ + +params { + config_profile_name = 'Test bulk RNA-seq based workflow using TRUST4' + config_profile_description = 'Minimal test dataset to check pipeline function with raw bulk RNA-seq data' + + // Limit resources so that this can run on GitHub Actions + max_cpus = 2 + max_memory = 6.GB + max_time = 48.h + + // params + mode = 'fastq' + library_generation_method = 'trust4' + clonal_threshold = 0 + + // Input data + input = 'https://raw.githubusercontent.com/nf-core/test-datasets/airrflow/testdata-rnaseq/rnaseq_metadata.tsv' +} diff --git a/conf/test_rnaseq_sc.config b/conf/test_rnaseq_sc.config new file mode 100644 index 00000000..de2bd2f5 --- /dev/null +++ b/conf/test_rnaseq_sc.config @@ -0,0 +1,30 @@ +/* + * ------------------------------------------------- + * Nextflow config file for running tests + * ------------------------------------------------- + * Defines bundled input files and everything required + * to run a fast and simple test.
Use as follows: + * nextflow run nf-core/airrflow -profile test_rnaseq_sc,<docker/singularity> + */ + +params { + config_profile_name = 'Test single-cell RNA-seq based workflow using TRUST4' + config_profile_description = 'Minimal test dataset to check pipeline function with raw single-cell RNA-seq data' + + // Limit resources so that this can run on GitHub Actions + max_cpus = 2 + max_memory = 6.GB + max_time = 48.h + + // params + mode = 'fastq' + library_generation_method = 'trust4' + clonal_threshold = 0 + barcode_read = "R1" + umi_read = "R1" + read_format = "bc:0:15,um:16:27" + skip_lineage = true + + // Input data + input = 'https://raw.githubusercontent.com/nf-core/test-datasets/airrflow/testdata-rnaseq/sc_rnaseq_metadata.tsv' +} diff --git a/docs/usage.md b/docs/usage.md index cef9b38c..3f6f7963 100644 --- a/docs/usage.md +++ b/docs/usage.md @@ -14,41 +14,40 @@ The nf-core/airrflow pipeline allows processing BCR and TCR targeted sequencing ### Quickstart -A typical command for running the pipeline for **bulk raw fastq files** is: +A typical command for running the pipeline for **bulk raw fastq files** using available pre-set protocol profiles is shown below. The full list of supported profiles can be found in the section [Supported protocol profiles](#supported-protocol-profiles). ```bash nextflow run nf-core/airrflow \ --profile \ ---mode fastq \ +-profile nebnext_umi_bcr,docker \ --input input_samplesheet.tsv \ ---library_generation_method specific_pcr_umi \ ---cprimers CPrimers.fasta \ ---vprimers VPrimers.fasta \ ---umi_length 12 \ ---umi_position R1 \ --outdir results ``` -You can optionally set a protocol profile if you're running the pipeline with data from one of the supported profiles. The full list of supported profiles can be found in the section [Supported protocol profiles](#supported-protocol-profiles).
An example command running the NEBNext UMI protocol profile with docker containers is: +It is also possible to process custom sequencing protocols with custom primers by manually specifying the primers, UMI length (if available) and position: ```bash nextflow run nf-core/airrflow \ --profile nebnext_umi,docker \ +-profile \ --mode fastq \ --input input_samplesheet.tsv \ +--library_generation_method specific_pcr_umi \ +--cprimers CPrimers.fasta \ +--vprimers VPrimers.fasta \ +--umi_length 12 \ +--umi_position R1 \ --outdir results ``` A typical command to run the pipeline from **single cell raw fastq files** is: ```bash -nextflow run nf-core/airrflow -r dev \ +nextflow run nf-core/airrflow \ -profile \ --mode fastq \ --input input_samplesheet.tsv \ --library_generation_method sc_10x_genomics \ --reference_10x reference/refdata-cellranger-vdj-GRCh38-alts-ensembl-5.0.0.tar.gz \ ---outdir ./results +--outdir results ``` A typical command for running the pipeline departing from **single-cell AIRR rearrangement tables or assembled bulk sequencing fasta** data is: @@ -93,7 +92,6 @@ with `params.yaml` containing: ```yaml input: './samplesheet.csv' outdir: './results/' -genome: 'GRCh37' <...> ``` @@ -123,7 +121,7 @@ If you wish to share such profile (such as upload as supplementary material for ## Input samplesheet -### Fastq input samplesheet (bulk sequencing) +### Fastq input samplesheet (bulk AIRR sequencing) The required input file for processing raw BCR or TCR bulk targeted sequencing data is a sample sheet in TSV format (tab separated). The columns `sample_id`, `filename_R1`, `filename_R2`, `subject_id`, `species`, `tissue`, `pcr_target_locus`, `single_cell`, `sex`, `age` and `biomaterial_provider` are required. An example samplesheet is: @@ -145,7 +143,7 @@ The required input file for processing raw BCR or TCR bulk targeted sequencing d - `age`: Subject biological age. - `single_cell`: TRUE or FALSE. -Other optional columns can be added. 
These columns will be available when building the contrasts for the repertoire comparison report. It is recommended that these columns also follow the AIRR nomenclature. Examples are: +Other optional columns can be added. These columns will be available as metadata in the final repertoire table. It is recommended that these columns also follow the AIRR nomenclature. Examples are: - `intervention`: Description of intervention. - `disease_diagnosis`: Diagnosis of subject. @@ -153,19 +151,19 @@ Other optional columns can be added. These columns will be available when buildi - `collection_time_point_reference`: Event in the study schedule to which `Sample collection time` relates to (e.g. primary vaccination, intervention start). - `cell_subset`: Commonly-used designation of isolated cell population. -The metadata specified in the input file will then be automatically annotated in a column with the same header in the tables generated by the pipeline. +It is possible to provide several fastq files per sample (e.g. sequenced over different chips or lanes). In this case the different fastq files per sample will be merged together prior to processing. Provide one fastq pair R1/R2 per row, and the same `sample_id` field for these rows. ### Fastq input samplesheet (single cell sequencing) -The required input file for processing raw BCR or TCR single cell targeted sequencing data is a sample sheet in TSV format (tab separated). The columns `sample_id`, `filename_R1`, `filename_R2`, `subject_id`, `species`, `tissue`, `pcr_target_locus`, `single_cell`, `sex`, `age` and `biomaterial_provider` are required. You can refer to the bulk fastq input section for documentation on the individual columns. +The required input file for processing raw BCR or TCR single cell targeted sequencing data is a sample sheet in TSV format (tab separated). 
The columns `sample_id`, `filename_R1`, `filename_R2`, `subject_id`, `species`, `tissue`, `pcr_target_locus`, `single_cell`, `sex`, `age` and `biomaterial_provider` are required. Any other columns you add will be available in the final repertoire file as extra metadata fields. You can refer to the bulk fastq input section for documentation on the individual columns. An example samplesheet is: -| sample_id | filename_R1 | filename_R2 | subject_id | species | pcr_target_locus | tissue | sex | age | biomaterial_provider | single_cell | intervention | collection_time_point_relative | cell_subset | -| --------- | ------------------------------- | ------------------------------- | ---------- | ------- | ---------------- | ------ | ------ | --- | -------------------- | ----------- | -------------- | ------------------------------ | ------------ | -| sample01 | sample1_S1_L001_R1_001.fastq.gz | sample1_S1_L001_R2_001.fastq.gz | Subject02 | human | IG | blood | NA | 53 | sequencing_facility | FALSE | Drug_treatment | Baseline | plasmablasts | -| sample02 | sample2_S1_L001_R1_001.fastq.gz | sample2_S1_L001_R2_001.fastq.gz | Subject02 | human | TR | blood | female | 78 | sequencing_facility | FALSE | Drug_treatment | Baseline | plasmablasts | +| sample_id | filename_R1 | filename_R2 | subject_id | species | pcr_target_locus | tissue | sex | age | biomaterial_provider | single_cell | +| --------- | -------------------------------- | -------------------------------- | ---------- | ------- | ---------------- | ------ | ------ | --- | -------------------- | ----------- | +| sample01 | sample01_S1_L001_R1_001.fastq.gz | sample01_S1_L001_R2_001.fastq.gz | Subject02 | human | IG | blood | NA | 53 | sequencing_facility | TRUE | +| sample02 | sample02_S1_L001_R1_001.fastq.gz | sample02_S1_L001_R2_001.fastq.gz | Subject02 | human | TR | blood | female | 78 | sequencing_facility | TRUE | -> FASTQ files must confirm the 10xGenomics cellranger naming conventions
>**`[SAMPLE-NAME]`_S1_L00`[LANE-NUMBER]` _`[READ-TYPE]`\_001.fastq.gz** +> FASTQ files must conform with the 10xGenomics cellranger naming conventions with the same sample name as provided in the `sample_id` column
>**`[SAMPLE-NAME]`\_S`[CHIP-NUMBER]`\_L00`[LANE-NUMBER]`\_`[R1/R2]`\_001.fastq.gz** > > Read type is one of > @@ -174,6 +172,13 @@ An example samplesheet is: > - `R1`: Read 1 > - `R2`: Read 2 +It is possible to provide several fastq files per sample (e.g. sequenced over different chips or lanes). In this case the different fastq files per sample will be provided to the same cellranger process. These rows should then have an identical `sample_id` field. + +### Fastq input samplesheet (untargeted bulk or sc RNA sequencing) + +When running the untargeted protocol, BCR or TCR sequences will be extracted from the untargeted bulk or single-cell RNA sequencing with tools such as [TRUST4](https://github.com/liulab-dfci/TRUST4). +The required input file is the same as for the [Fastq bulk AIRR samplesheet](#fastq-input-samplesheet-bulk-airr-sequencing) or [Fastq single-cell AIRR samplesheet](#fastq-input-samplesheet-single-cell-sequencing) depending on the input data type (bulk RNAseq or single-cell RNAseq). + ### Assembled input samplesheet (bulk or single-cell sequencing) The required input file for processing raw BCR or TCR bulk targeted sequencing data is a sample sheet in TSV format (tab separated). The columns `sample_id`, `filename`, `subject_id`, `species`, `tissue`, `single_cell`, `sex`, `age` and `biomaterial_provider` are required. All fields are explained in the previous section, with the only difference being that there is only one `filename` column for the assembled input samplesheet. The provided file will be different from assembled single-cell or bulk data: @@ -230,18 +235,29 @@ This profile executes the commands based on the pRESTO pre-set pipeline [presto- - Align and annotate the internal C Region (for the BCR specific protocol) for a more specific isotype annotation. - Remove duplicate sequences and filter to sequences with at least 2 supporting sources. -Please note that the default primer sequences and internal CRegion sequences are for human.
If you wish to run this protocol on mouse or other species, please provide the alternative primers: +Please note that the default primer sequences and internal CRegion sequences are for human. If you wish to run this protocol on mouse or other species, please provide the alternative primers. Here is an example using the mouse IG primers on the Immcantation Bitbucket repository: ```bash nextflow run nf-core/airrflow -r \ -profile nebnext_umi_bcr,docker \ --input input_samplesheet.tsv \ ---cprimers \ ---internal_cregion_sequences \ +--cprimers https://bitbucket.org/kleinstein/immcantation/raw/354f49228a43b4c2858d67fb09886126b314e317/protocols/AbSeq/AbSeq_R1_Mouse_IG_Primers.fasta \ +--internal_cregion_sequences https://bitbucket.org/kleinstein/immcantation/raw/354f49228a43b4c2858d67fb09886126b314e317/protocols/AbSeq/AbSeq_Mouse_IG_InternalCRegion.fasta \ --outdir results ``` -### Clontech / Takara SMARTer Human BCR Profiling kit +And similarly for TCR libraries: + +```bash +nextflow run nf-core/airrflow -r <release> \ +-profile nebnext_umi_tcr,docker \ +--input input_samplesheet.tsv \ +--cprimers https://bitbucket.org/kleinstein/immcantation/raw/354f49228a43b4c2858d67fb09886126b314e317/protocols/AbSeq/AbSeq_R1_Mouse_TR_Primers.fasta \ +--internal_cregion_sequences https://bitbucket.org/kleinstein/immcantation/raw/354f49228a43b4c2858d67fb09886126b314e317/protocols/AbSeq/AbSeq_Mouse_TR_InternalCRegion.fasta \ +--outdir results +``` + +### Clontech / Takara SMARTer Human BCR/TCR Profiling kit - [TaKaRa SMARTer Human BCR kit](https://www.takarabio.com/products/next-generation-sequencing/immune-profiling/human-repertoire/human-bcr-profiling-kit-for-illumina-sequencing) @@ -266,13 +282,23 @@ This profile executes the sequence assembly commands based on the pRESTO pre-set After the sequence assembly steps, the remaining steps are common for all protocols. -Please note that the default primer sequences and internal CRegion sequences are for human.
If you wish to run this protocol on mouse or other species, please provide the alternative primer sequences: +Please note that the default primer sequences and internal CRegion sequences are for human. If you wish to run this protocol on mouse or other species, please provide the alternative primer sequences. Here is an example using the mouse IG primers on the Immcantation Bitbucket repository: ```bash nextflow run nf-core/airrflow -r \ -profile clontech_umi_bcr,docker \ --input input_samplesheet.tsv \ ---cprimers \ +--cprimers https://bitbucket.org/kleinstein/immcantation/raw/c98269b194e9c6262fe3b098be3600ba7f64b85c/protocols/Universal/Mouse_IG_CRegion_RC.fasta \ +--outdir results +``` + +And for TCR data: + +```bash +nextflow run nf-core/airrflow -r <release> \ +-profile clontech_umi_tcr,docker \ +--input input_samplesheet.tsv \ +--cprimers https://bitbucket.org/kleinstein/immcantation/raw/c98269b194e9c6262fe3b098be3600ba7f64b85c/protocols/Universal/Mouse_TR_CRegion_RC.fasta \ --outdir results ``` @@ -281,18 +307,18 @@ nextflow run nf-core/airrflow -r \ When processing bulk sequencing data departing from raw `fastq` reads, several sequencing protocols are supported which can be provided with the parameter `--library_generation_method`. The following table matches the library generation methods as described in the [AIRR metadata annotation guidelines](https://docs.airr-community.org/en/stable/miairr/metadata_guidelines.html#library-generation-method) to the value that can be provided to the `--library_generation_method` parameter.
-| Library generation methods (AIRR) | Description | Name in pipeline | Commercial protocols | -| --------------------------------- | ------------------------------------------------------------------------------------------ | ---------------- | ----------------------------------------- | -| RT(RHP)+PCR | RT-PCR using random hexamer primers | Not supported | | -| RT(oligo-dT)+PCR | RT-PCR using oligo-dT primers | Not supported | | -| RT(oligo-dT)+TS+PCR | 5’-RACE PCR (i.e. RT is followed by a template switch (TS) step) using oligo-dT primers | dt_5p_race | | -| RT(oligo-dT)+TS(UMI)+PCR | 5’-RACE PCR using oligo-dT primers and template switch primers containing UMI | dt_5p_race_umi | TAKARA SMARTer TCR v2, TAKARA SMARTer BCR | -| RT(specific)+PCR | RT-PCR using transcript-specific primers | specific_pcr | | -| RT(specific)+TS+PCR | 5’-RACE PCR using transcript- specific primers | Not supported | | -| RT(specific)+TS(UMI)+PCR | 5’-RACE PCR using transcript- specific primers and template switch primers containing UMIs | Not supported | | -| RT(specific+UMI)+PCR | RT-PCR using transcript-specific primers containing UMIs | specific_pcr_umi | | -| RT(specific+UMI)+TS+PCR | 5’-RACE PCR using transcript- specific primers containing UMIs | Not supported | | -| RT(specific)+TS | RT-based generation of dsDNA without subsequent PCR. This is used by RNA-seq kits. | Not supported | | +| Library generation methods (AIRR) | Description | Name in pipeline | +| --------------------------------- | ------------------------------------------------------------------------------------------ | ---------------- | +| RT(RHP)+PCR | RT-PCR using random hexamer primers | Not supported | +| RT(oligo-dT)+PCR | RT-PCR using oligo-dT primers | Not supported | +| RT(oligo-dT)+TS+PCR | 5’-RACE PCR (i.e. 
RT is followed by a template switch (TS) step) using oligo-dT primers | dt_5p_race | +| RT(oligo-dT)+TS(UMI)+PCR | 5’-RACE PCR using oligo-dT primers and template switch primers containing UMI | dt_5p_race_umi | +| RT(specific)+PCR | RT-PCR using transcript-specific primers | specific_pcr | +| RT(specific)+TS+PCR | 5’-RACE PCR using transcript- specific primers | Not supported | +| RT(specific)+TS(UMI)+PCR | 5’-RACE PCR using transcript- specific primers and template switch primers containing UMIs | Not supported | +| RT(specific+UMI)+PCR | RT-PCR using transcript-specific primers containing UMIs | specific_pcr_umi | +| RT(specific+UMI)+TS+PCR | 5’-RACE PCR using transcript- specific primers containing UMIs | Not supported | +| RT(specific)+TS | RT-based generation of dsDNA without subsequent PCR. This is used by RNA-seq kits. | Not supported | ### Multiplex specific PCR (with or without UMI) @@ -405,68 +431,6 @@ nextflow run nf-core/airrflow -profile docker \ --outdir ./results ``` -### dT-Oligo RT and 5'RACE PCR - -This sequencing type requires setting `--library_generation_method race_5p_umi` or `--library_generation_method race_5p_umi` if UMIs are not being employed, and providing sequences for the C-region primers as well as the linker or template switch oligo sequences with the parameter `--race_linker`. Examples are provided below to run airrflow to process amplicons generated with the TAKARA 5'RACE SMARTer Human BCR and TCR protocols (library structure schema shown below). 
- -#### Takara Bio SMARTer Human BCR - -The read configuration when sequencing with the TAKARA Bio SMARTer Human BCR protocol is the following: - -![nf-core/airrflow](images/TAKARA_RACE_BCR.png) - -```bash -nextflow run nf-core/airrflow -profile docker \ ---input samplesheet.tsv \ ---library_generation_method dt_5p_race_umi \ ---cprimers CPrimers.fasta \ ---race_linker linker.fasta \ ---umi_length 12 \ ---umi_position R2 \ ---cprimer_start 7 \ ---cprimer_position R1 \ ---outdir ./results -``` - -#### Takara Bio SMARTer Human TCR v2 - -The read configuration when sequencing with the Takara Bio SMARTer Human TCR v2 protocol is the following: - -![nf-core/airrflow](images/TAKARA_RACE_TCR.png) - -```bash -nextflow run nf-core/airrflow -profile docker \ ---input samplesheet.tsv \ ---library_generation_method dt_5p_race_umi \ ---cprimers CPrimers.fasta \ ---race_linker linker.fasta \ ---umi_length 12 \ ---umi_position R2 \ ---cprimer_start 5 \ ---cprimer_position R1 \ ---outdir ./results -``` - -For this protocol, the takara linkers are: - -```txt ->takara-linker -GTAC -``` - -And the C-region primers are: - -```txt ->TRAC -CAGGGTCAGGGTTCTGGATATN ->TRBC -GGAACACSTTKTTCAGGTCCTC ->TRDC -GTTTGGTATGAGGCTGACTTCN ->TRGC -CATCTGCATCAAGTTGTTTATC -``` - ## UMI barcode handling Unique Molecular Identifiers (UMIs) enable the quantification of BCR or TCR abundance in the original sample by allowing to distinguish PCR duplicates from original sample duplicates. @@ -511,6 +475,42 @@ nextflow run nf-core/airrflow -r dev \ - The 10xGenomics reference can be downloaded from the [download page](https://www.10xgenomics.com/support/software/cell-ranger/downloads) - To generate a V(D)J segment fasta file as reference from IMGT one can follow the [cellranger docs](https://support.10xgenomics.com/single-cell-vdj/software/pipelines/latest/advanced/references#imgt). 
+## Supported unselected RNA-seq based methods + +nf-core/airrflow supports unselected bulk or single-cell RNA-seq fastq files as input. [TRUST4](https://github.com/liulab-dfci/TRUST4) is used to extract TCR/BCR sequences from these files. The resulting AIRR tables are then fed into airrflow's Immcantation based workflow.
+To use unselected RNA-seq based input, specify `--library_generation_method trust4`. + +### Bulk RNA-seq + +A typical command to run the pipeline from **bulk RNA-seq fastq files** is: + +```bash +nextflow run nf-core/airrflow \ +-profile <docker/singularity> \ +--mode fastq \ +--input input_samplesheet.tsv \ +--library_generation_method trust4 \ +--outdir results +``` + +### Single-cell RNA-seq + +A typical command to run the pipeline from **single-cell RNA-seq fastq files** is: + +```bash +nextflow run nf-core/airrflow \ +-profile <docker/singularity> \ +--mode fastq \ +--input input_samplesheet.tsv \ +--library_generation_method trust4 \ +--umi_read R1 \ +--read_format bc:0:15,um:16:27 \ +--outdir results +``` + +- If UMIs are present, the read containing them must be specified using the `--umi_read` parameter. +- The `--read_format` parameter can be used to specify the Barcode and UMI position within the reads (see TRUST4 [docs](https://github.com/liulab-dfci/TRUST4?tab=readme-ov-file#10x-genomics-data-and-barcode-based-single-cell-data)) + ## Core Nextflow arguments :::note diff --git a/modules.json b/modules.json index 3a6e053c..e561d2ec 100644 --- a/modules.json +++ b/modules.json @@ -34,6 +34,11 @@ "branch": "master", "git_sha": "b7ebe95761cd389603f9cc0e0dc384c0f663815a", "installed_by": ["modules"] + }, + "trust4": { + "branch": "master", + "git_sha": "bbb9636dbe460f45fe786d0866f8fd7337e4fc7a", + "installed_by": ["modules"] } } }, diff --git a/modules/local/airrflow_report/airrflow_report.nf b/modules/local/airrflow_report/airrflow_report.nf index b4422153..243d2c75 100644 --- a/modules/local/airrflow_report/airrflow_report.nf +++ b/modules/local/airrflow_report/airrflow_report.nf @@ -6,8 +6,8 @@ process AIRRFLOW_REPORT { error "nf-core/airrflow currently does not support Conda. Please use a container profile instead." } container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ?
- 'docker.io/immcantation/airrflow:4.0.0': - 'docker.io/immcantation/airrflow:4.0.0' }" + 'docker.io/immcantation/airrflow:4.1.0': + 'docker.io/immcantation/airrflow:4.1.0' }" input: tuple val(meta), path(tab) // sequence tsv table in AIRR format diff --git a/modules/local/changeo/changeo_parsedb_select.nf b/modules/local/changeo/changeo_parsedb_select_locus.nf similarity index 72% rename from modules/local/changeo/changeo_parsedb_select.nf rename to modules/local/changeo/changeo_parsedb_select_locus.nf index 2bba4916..32805c26 100644 --- a/modules/local/changeo/changeo_parsedb_select.nf +++ b/modules/local/changeo/changeo_parsedb_select_locus.nf @@ -1,4 +1,4 @@ -process CHANGEO_PARSEDB_SELECT { +process CHANGEO_PARSEDB_SELECT_LOCUS { tag "$meta.id" label 'process_low' label 'immcantation' @@ -18,25 +18,21 @@ process CHANGEO_PARSEDB_SELECT { path "versions.yml" , emit: versions script: - def args = task.ext.args ?: '' - def args2 = task.ext.args2 ?: '' if (meta.locus.toUpperCase() == 'IG'){ """ - ParseDb.py select -d $tab $args --outname ${meta.id} > ${meta.id}_select_command_log.txt + ParseDb.py select -d $tab -f locus -u "IG[HKL]" --regex --outname ${meta.id} > ${meta.id}_select_command_log.txt cat <<-END_VERSIONS > versions.yml "${task.process}": - igblastn: \$( igblastn -version | grep -o "igblast[0-9\\. ]\\+" | grep -o "[0-9\\. ]\\+" ) changeo: \$( ParseDb.py --version | awk -F' ' '{print \$2}' ) END_VERSIONS """ } else if (meta.locus.toUpperCase() == 'TR'){ """ - ParseDb.py select -d $tab $args2 --outname ${meta.id} > "${meta.id}_command_log.txt" + ParseDb.py select -d $tab -f locus -u "TR[ABDG]" --regex --outname ${meta.id} > "${meta.id}_command_log.txt" cat <<-END_VERSIONS > versions.yml "${task.process}": - igblastn: \$( igblastn -version | grep -o "igblast[0-9\\. ]\\+" | grep -o "[0-9\\. 
]\\+" ) changeo: \$( ParseDb.py --version | awk -F' ' '{print \$2}' ) END_VERSIONS """ diff --git a/modules/local/enchantr/collapse_duplicates.nf b/modules/local/enchantr/collapse_duplicates.nf index 903824fe..839af7b1 100644 --- a/modules/local/enchantr/collapse_duplicates.nf +++ b/modules/local/enchantr/collapse_duplicates.nf @@ -8,8 +8,8 @@ process COLLAPSE_DUPLICATES { error "nf-core/airrflow currently does not support Conda. Please use a container profile instead." } container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? - 'docker.io/immcantation/airrflow:4.0.0': - 'docker.io/immcantation/airrflow:4.0.0' }" + 'docker.io/immcantation/airrflow:4.1.0': + 'docker.io/immcantation/airrflow:4.1.0' }" input: tuple val(meta), path(tabs) // tuple [val(meta), sequence tsv in AIRR format ] diff --git a/modules/local/enchantr/define_clones.nf b/modules/local/enchantr/define_clones.nf index 64b8e7df..7efbc49c 100644 --- a/modules/local/enchantr/define_clones.nf +++ b/modules/local/enchantr/define_clones.nf @@ -25,8 +25,8 @@ process DEFINE_CLONES { error "nf-core/airrflow currently does not support Conda. Please use a container profile instead." } container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? - 'docker.io/immcantation/airrflow:4.0.0': - 'docker.io/immcantation/airrflow:4.0.0' }" + 'docker.io/immcantation/airrflow:4.1.0': + 'docker.io/immcantation/airrflow:4.1.0' }" input: tuple val(meta), path(tabs) // meta, sequence tsv in AIRR format diff --git a/modules/local/enchantr/detect_contamination.nf b/modules/local/enchantr/detect_contamination.nf index aae3ef92..a7e5044c 100644 --- a/modules/local/enchantr/detect_contamination.nf +++ b/modules/local/enchantr/detect_contamination.nf @@ -9,8 +9,8 @@ process DETECT_CONTAMINATION { error "nf-core/airrflow currently does not support Conda. Please use a container profile instead." 
} container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? - 'docker.io/immcantation/airrflow:4.0.0': - 'docker.io/immcantation/airrflow:4.0.0' }" + 'docker.io/immcantation/airrflow:4.1.0': + 'docker.io/immcantation/airrflow:4.1.0' }" input: path(tabs) diff --git a/modules/local/enchantr/dowser_lineages.nf b/modules/local/enchantr/dowser_lineages.nf index 03444f19..53f16ae2 100644 --- a/modules/local/enchantr/dowser_lineages.nf +++ b/modules/local/enchantr/dowser_lineages.nf @@ -25,8 +25,8 @@ process DOWSER_LINEAGES { error "nf-core/airrflow currently does not support Conda. Please use a container profile instead." } container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? - 'docker.io/immcantation/airrflow:4.0.0': - 'docker.io/immcantation/airrflow:4.0.0' }" + 'docker.io/immcantation/airrflow:4.1.0': + 'docker.io/immcantation/airrflow:4.1.0' }" input: tuple val(meta), path(tabs) diff --git a/modules/local/enchantr/find_threshold.nf b/modules/local/enchantr/find_threshold.nf index 8632e081..8e741bea 100644 --- a/modules/local/enchantr/find_threshold.nf +++ b/modules/local/enchantr/find_threshold.nf @@ -25,8 +25,8 @@ process FIND_THRESHOLD { error "nf-core/airrflow currently does not support Conda. Please use a container profile instead." } container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? - 'docker.io/immcantation/airrflow:4.0.0': - 'docker.io/immcantation/airrflow:4.0.0' }" + 'docker.io/immcantation/airrflow:4.1.0': + 'docker.io/immcantation/airrflow:4.1.0' }" input: diff --git a/modules/local/enchantr/remove_chimeric.nf b/modules/local/enchantr/remove_chimeric.nf index 94805169..b8f9db1e 100644 --- a/modules/local/enchantr/remove_chimeric.nf +++ b/modules/local/enchantr/remove_chimeric.nf @@ -9,8 +9,8 @@ process REMOVE_CHIMERIC { error "nf-core/airrflow currently does not support Conda. 
Please use a container profile instead." } container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? - 'docker.io/immcantation/airrflow:4.0.0': - 'docker.io/immcantation/airrflow:4.0.0' }" + 'docker.io/immcantation/airrflow:4.1.0': + 'docker.io/immcantation/airrflow:4.1.0' }" input: diff --git a/modules/local/enchantr/report_file_size.nf b/modules/local/enchantr/report_file_size.nf index 4fc4c3fa..fe7070ed 100644 --- a/modules/local/enchantr/report_file_size.nf +++ b/modules/local/enchantr/report_file_size.nf @@ -10,8 +10,8 @@ process REPORT_FILE_SIZE { error "nf-core/airrflow currently does not support Conda. Please use a container profile instead." } container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? - 'docker.io/immcantation/airrflow:4.0.0': - 'docker.io/immcantation/airrflow:4.0.0' }" + 'docker.io/immcantation/airrflow:4.1.0': + 'docker.io/immcantation/airrflow:4.1.0' }" input: path logs diff --git a/modules/local/enchantr/single_cell_qc.nf b/modules/local/enchantr/single_cell_qc.nf index 49e97796..c330d1e3 100644 --- a/modules/local/enchantr/single_cell_qc.nf +++ b/modules/local/enchantr/single_cell_qc.nf @@ -24,8 +24,8 @@ process SINGLE_CELL_QC { error "nf-core/airrflow currently does not support Conda. Please use a container profile instead." } container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? - 'docker.io/immcantation/airrflow:4.0.0': - 'docker.io/immcantation/airrflow:4.0.0' }" + 'docker.io/immcantation/airrflow:4.1.0': + 'docker.io/immcantation/airrflow:4.1.0' }" input: path(tabs) diff --git a/modules/local/enchantr/validate_input.nf b/modules/local/enchantr/validate_input.nf index db8ab075..278184f3 100644 --- a/modules/local/enchantr/validate_input.nf +++ b/modules/local/enchantr/validate_input.nf @@ -10,8 +10,8 @@ process VALIDATE_INPUT { error "nf-core/airrflow currently does not support Conda. 
Please use a container profile instead." } container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? - 'docker.io/immcantation/airrflow:4.0.0': - 'docker.io/immcantation/airrflow:4.0.0' }" + 'docker.io/immcantation/airrflow:4.1.0': + 'docker.io/immcantation/airrflow:4.1.0' }" input: file samplesheet diff --git a/modules/local/prepare_trust4_reference.nf b/modules/local/prepare_trust4_reference.nf new file mode 100644 index 00000000..dce204c8 --- /dev/null +++ b/modules/local/prepare_trust4_reference.nf @@ -0,0 +1,24 @@ +process PREPARE_TRUST4_REFERENCE { + tag "$meta.id" + label 'process_medium' + + conda "bioconda::trust4=1.0.13" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'https://depot.galaxyproject.org/singularity/trust4:1.0.13--h43eeafb_0': + 'biocontainers/trust4:1.0.13--h43eeafb_0' }" + + input: + tuple val(meta), path(R1), path(R2) + path(reference_igblast) + + output: + tuple val(meta), path("trust4_reference.fa") , emit: trust4_reference + + script: + // Concatenate each imgt_<species>_*.fasta under the IgBLAST reference directory exactly once into trust4_reference.fa + """ + cat ${reference_igblast}/fasta/imgt_${meta.species.toLowerCase()}_*.fasta > trust4_reference.fa + """ + + +} diff --git a/modules/local/reveal/add_meta_to_tab.nf b/modules/local/reveal/add_meta_to_tab.nf index 8413cebc..a46b9e3f 100644 --- a/modules/local/reveal/add_meta_to_tab.nf +++ b/modules/local/reveal/add_meta_to_tab.nf @@ -7,8 +7,8 @@ process ADD_META_TO_TAB { error "nf-core/airrflow currently does not support Conda. Please use a container profile instead." } container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ?
- 'docker.io/immcantation/airrflow:4.0.0': - 'docker.io/immcantation/airrflow:4.0.0' }" + 'docker.io/immcantation/airrflow:4.1.0': + 'docker.io/immcantation/airrflow:4.1.0' }" cache 'deep' // Without 'deep' this process would run when using -resume diff --git a/modules/local/reveal/filter_junction_mod3.nf b/modules/local/reveal/filter_junction_mod3.nf index f792aca2..ba4484b2 100644 --- a/modules/local/reveal/filter_junction_mod3.nf +++ b/modules/local/reveal/filter_junction_mod3.nf @@ -7,8 +7,8 @@ process FILTER_JUNCTION_MOD3 { error "nf-core/airrflow currently does not support Conda. Please use a container profile instead." } container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? - 'docker.io/immcantation/airrflow:4.0.0': - 'docker.io/immcantation/airrflow:4.0.0' }" + 'docker.io/immcantation/airrflow:4.1.0': + 'docker.io/immcantation/airrflow:4.1.0' }" input: tuple val(meta), path(tab) // sequence tsv in AIRR format diff --git a/modules/local/reveal/filter_quality.nf b/modules/local/reveal/filter_quality.nf index aa803279..a08472e7 100644 --- a/modules/local/reveal/filter_quality.nf +++ b/modules/local/reveal/filter_quality.nf @@ -7,8 +7,8 @@ process FILTER_QUALITY { error "nf-core/airrflow currently does not support Conda. Please use a container profile instead." } container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 
- 'docker.io/immcantation/airrflow:4.0.0': - 'docker.io/immcantation/airrflow:4.0.0' }" + 'docker.io/immcantation/airrflow:4.1.0': + 'docker.io/immcantation/airrflow:4.1.0' }" input: tuple val(meta), path(tab) // sequence tsv in AIRR format diff --git a/modules/nf-core/trust4/environment.yml b/modules/nf-core/trust4/environment.yml new file mode 100644 index 00000000..9270eee2 --- /dev/null +++ b/modules/nf-core/trust4/environment.yml @@ -0,0 +1,9 @@ +--- +# yaml-language-server: $schema=https://raw.githubusercontent.com/nf-core/modules/master/modules/environment-schema.json +name: "trust4" +channels: + - conda-forge + - bioconda + - defaults +dependencies: + - "bioconda::trust4=1.0.13" diff --git a/modules/nf-core/trust4/main.nf b/modules/nf-core/trust4/main.nf new file mode 100644 index 00000000..1d822fb8 --- /dev/null +++ b/modules/nf-core/trust4/main.nf @@ -0,0 +1,105 @@ +process TRUST4 { + tag "$meta.id" + label 'process_medium' + + conda "bioconda::trust4=1.0.13" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 
+ 'https://depot.galaxyproject.org/singularity/trust4:1.0.13--h43eeafb_0': + 'biocontainers/trust4:1.0.13--h43eeafb_0' }" + + input: + tuple val(meta), path(bam), path(reads) + tuple val(meta2), path(fasta) + tuple val(meta3), path(vdj_reference) + tuple val(meta4), val(barcode_read) + tuple val(meta5), val(umi_read) + + output: + tuple val(meta), path("*.tsv") , emit: tsv + tuple val(meta), path("*_airr.tsv") , emit: airr_files + tuple val(meta), path("${meta.id}_airr.tsv") , emit: airr_tsv + tuple val(meta), path("*_report.tsv") , emit: report_tsv + tuple val(meta), path("*.fa") , emit: fasta + tuple val(meta), path("*.out") , emit: out + tuple val(meta), path("*.fq") , emit: fq + tuple val(meta), path("**") , emit: outs + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + def prefix = task.ext.prefix ?: "${meta.id}" + def bam_mode = bam ? "-b ${bam}" : '' + def single_end_mode = reads && meta.single_end ? "-u ${reads}" : '' + // reference is optional for fastq input + def reference = vdj_reference ? "--ref ${vdj_reference}" : "" + // separate forward from reverse pairs + def (forward, reverse) = reads.collate(2).transpose() + def paired_end_mode = reads && (meta.single_end == false) ? "-1 ${forward[0]} -2 ${reverse[0]}" : '' + // read format is optional + def readFormat = params.read_format ? 
"--readFormat ${params.read_format}" : '' + // add barcode information if present + if (barcode_read) { + if (barcode_read == "R1") { + barcode = "--barcode ${forward[0]}" + } else if (barcode_read == "R2") { + barcode = "--barcode ${reverse[0]}" + } + } + else { + barcode = '' + } + // add umi information if present + if (umi_read) { + if (umi_read == "R1") { + umi = "--UMI ${forward[0]}" + } else if (umi_read == "R2") { + umi = "--UMI ${reverse[0]}" + } + } + else { + umi = '' + } + + """ + run-trust4 \\ + ${bam_mode} \\ + ${single_end_mode} \\ + ${paired_end_mode} \\ + ${barcode} \\ + ${readFormat} \\ + ${umi} \\ + -t $task.cpus \\ + -f ${fasta} \\ + -o ${prefix} \\ + ${reference} \\ + $args + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + trust4: \$(run-trust4 2>&1 | grep -o 'v[0-9.]*-r[0-9]*' | sed 's/^/TRUST4 using /' ) + END_VERSIONS + """ + + stub: + def args = task.ext.args ?: '' + def prefix = task.ext.prefix ?: "${meta.id}" + """ + touch ${prefix}_airr.tsv + touch ${prefix}_airr_align.tsv + touch ${prefix}_report.tsv + touch ${prefix}_assembled_reads.fa + touch ${prefix}_annot.fa + touch ${prefix}_cdr3.out + touch ${prefix}_raw.out + touch ${prefix}_final.out + touch ${prefix}_toassemble.fq + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + trust4: \$(run-trust4 2>&1 | grep -o 'v[0-9.]*-r[0-9]*' | sed 's/^/TRUST4 using /' ) + END_VERSIONS + """ +} diff --git a/modules/nf-core/trust4/meta.yml b/modules/nf-core/trust4/meta.yml new file mode 100644 index 00000000..89bc4d29 --- /dev/null +++ b/modules/nf-core/trust4/meta.yml @@ -0,0 +1,75 @@ +--- +# yaml-language-server: $schema=https://raw.githubusercontent.com/nf-core/modules/master/modules/meta-schema.json +name: "trust4" +description: Run TRUST4 on RNA-seq data +keywords: + - sort + - example + - genomics +tools: + - "trust4": + description: "TCR and BCR assembly from bulk or single-cell RNA-seq data" + homepage: "https://github.com/liulab-dfci/TRUST4" + documentation: 
"https://github.com/liulab-dfci/TRUST4" + tool_dev_url: "https://github.com/liulab-dfci/TRUST4" + licence: ["GPL v3"] + +input: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. `[ id:'sample1', single_end:false ]` + - bam: + type: file + description: BAM file from bulk or single-cell RNA-seq data + pattern: "*.bam" + - reads: + type: file + description: List of input FastQ files of size 1 and 2 for single-end and paired-end data, respectively + - fasta: + type: file + description: Path to the fasta file coordinate and sequence of V/D/J/C genes + - ref: + type: file + description: Path to detailed V/D/J/C gene reference file, such as from IMGT database. + +output: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. `[ id:'sample1', single_end:false ]` + - tsv: + type: file + description: tsv files created by TRUST4 + pattern: "*.tsv" + - airr_tsv: + type: file + description: TRUST4 results in AIRR format + pattern: "*_airr.tsv" + - report_tsv: + type: file + description: TRUST4 report in tsv format + pattern: "*_report.tsv" + - fasta: + type: file + description: Fasta files created by TRUST4 + pattern: "*.fa" + - out: + type: file + description: Further report files + pattern: "*.out" + - fq: + type: file + description: Fastq files created by TRUST4 + pattern: "*.fq" + - versions: + type: file + description: File containing software versions + pattern: "versions.yml" + +authors: + - "@mapo9, @Joaodemeirelles" +maintainers: + - "@mapo9" diff --git a/nextflow.config b/nextflow.config index 3dfaec00..2751e888 100644 --- a/nextflow.config +++ b/nextflow.config @@ -86,6 +86,7 @@ params { fetch_imgt = false save_databases = true isotype_column = 'c_call' + skip_alignment_filter = false // ----------------------- // bulk filtering options @@ -123,6 +124,13 @@ params { // ----------------------- reference_10x = null + // ----------------------- + // raw RNA seq input options + // 
----------------------- + barcode_read = null + read_format = null + umi_read = null + // ----------------------- // generic nf-core options @@ -305,6 +313,8 @@ profiles { test_10x_sc { includeConfig 'conf/test_10x_sc.config' } test_clontech_umi { includeConfig 'conf/test_clontech_umi.config' } test_nebnext_umi { includeConfig 'conf/test_nebnext_umi.config' } + test_rnaseq_bulk { includeConfig 'conf/test_rnaseq_bulk.config' } + test_rnaseq_sc { includeConfig 'conf/test_rnaseq_sc.config' } nebnext_umi_tcr { includeConfig 'conf/nebnext_umi_tcr.config' } nebnext_umi_bcr { includeConfig 'conf/nebnext_umi_bcr.config' } clontech_umi_bcr { includeConfig 'conf/clontech_umi_bcr.config' } diff --git a/nextflow_schema.json b/nextflow_schema.json index e07c007b..566dafa2 100644 --- a/nextflow_schema.json +++ b/nextflow_schema.json @@ -61,7 +61,14 @@ "type": "string", "fa_icon": "fas fa-flask", "description": "Protocol used for the V(D)J amplicon sequencing library generation.", - "enum": ["specific_pcr_umi", "specific_pcr", "dt_5p_race", "dt_5p_race_umi", "sc_10x_genomics"], + "enum": [ + "specific_pcr_umi", + "specific_pcr", + "dt_5p_race", + "dt_5p_race_umi", + "sc_10x_genomics", + "trust4" + ], "help_text": "Available protocols are:\n- `specific_pcr_umi`: RT-PCR using transcript-specific primers containing UMIs.\n- `specific_pcr`: RT-PCR using transcript-specific primers.\n- `dt_5p_race_umi`: 5\u2019-RACE PCR using oligo-dT primers and template switch primers containing UMI.\n- `dt_5p_race`: 5\u2019-RACE PCR (i.e. RT is followed by a template switch (TS) step) using oligo-dT primers.\n- `sc_10x_genomics`:10x genomics library preparation protocol for scVDJ sequencing." 
}, "race_linker": { @@ -336,19 +343,22 @@ "save_databases": { "type": "boolean", "description": "Save databases so you can use the cache in future runs.", - "fa_icon": "fas fa-file-download" + "fa_icon": "fas fa-file-download", + "default": true }, "reference_fasta": { "type": "string", "description": "Path to the germline reference fasta.", "help_text": "By default, we provide a pre-downloaded version of the IMGT database. It is also possible to provide a custom reference fasta database. To fetch a fresh version of IMGT, set the `--fetch_imgt` parameter instead.", - "fa_icon": "fas fa-database" + "fa_icon": "fas fa-database", + "default": "https://raw.githubusercontent.com/nf-core/test-datasets/airrflow/database-cache/imgtdb_base.zip" }, "reference_igblast": { "type": "string", "description": "Path to the cached igblast database.", "help_text": "By default, we provide a pre-downloaded version of the IMGT database. It is also possible to provide a custom reference fasta database. To fetch a fresh version of IMGT, set the `--fetch_imgt` parameter instead.", - "fa_icon": "fas fa-database" + "fa_icon": "fas fa-database", + "default": "https://raw.githubusercontent.com/nf-core/test-datasets/airrflow/database-cache/igblast_base.zip" }, "fetch_imgt": { "type": "boolean", @@ -478,6 +488,36 @@ "help_text": "Options for running raw single cell data.", "fa_icon": "fab fa-pagelines" }, + "rnaseq_based_analysis_options": { + "title": "Unselected RNA-seq based analysis options", + "type": "object", + "description": "Options specific for raw unselected RNA-seq input.", + "default": "", + "properties": { + "barcode_read": { + "type": "string", + "description": "Specifies which read holds the barcodes", + "enum": ["R1", "R2"], + "fa_icon": "fas fa-terminal", + "help_text": "file containing the barcodes" + }, + "umi_read": { + "type": "string", + "description": "Indicate if UMI indices are recorded in the R1 or R2 fastq file.", + "help_text": "file containing 10x
Genomics-like UMIs", + "enum": ["R1", "R2"], + "fa_icon": "fas fa-barcode" + }, + "read_format": { + "type": "string", + "description": "Specifies where in the read the barcodes and UMIs can be found.", + "help_text": "For further information see the TRUST4 [docs](https://github.com/liulab-dfci/TRUST4?tab=readme-ov-file#10x-genomics-data-and-barcode-based-single-cell-data).", + "fa_icon": "fas fa-terminal" + } + }, + "help_text": "Options for running raw RNA seq data.", + "fa_icon": "fab fa-pagelines" + }, "report_options": { "title": "Report options", "type": "object", @@ -790,6 +830,9 @@ { "$ref": "#/definitions/single_cell_analysis_options" }, + { + "$ref": "#/definitions/rnaseq_based_analysis_options" + }, { "$ref": "#/definitions/institutional_config_options" }, @@ -799,5 +842,10 @@ { "$ref": "#/definitions/generic_options" } - ] + ], + "properties": { + "skip_alignment_filter": { + "type": "boolean" + } + } } diff --git a/subworkflows/local/fastq_input_check.nf b/subworkflows/local/fastq_input_check.nf index e14cfc21..91412b15 100644 --- a/subworkflows/local/fastq_input_check.nf +++ b/subworkflows/local/fastq_input_check.nf @@ -29,8 +29,8 @@ workflow FASTQ_INPUT_CHECK { ch_versions = SAMPLESHEET_CHECK.out.versions - // Merge multi-lane sample fastq for protocols except for 10x genomics (cellranger handles multi-fastq per sample) - if (params.library_generation_method == 'sc_10x_genomics') { + // Merge multi-lane sample fastq for protocols except for 10x genomics, trust4 (cellranger handles multi-fastq per sample) + if (params.library_generation_method == 'sc_10x_genomics' || params.library_generation_method == 'trust4') { ch_merged_reads = ch_reads.single.mix( ch_reads.multiple ) @@ -85,10 +85,9 @@ def create_fastq_channels(LinkedHashMap col) { } array = [ meta, [ file(col.filename_R1), file(col.filename_R2), file(col.filename_I1) ] ] } else { - array = [ meta, [ file(col.filename_R1), file(col.filename_R2) ] ] if (params.index_file) { - error "ERROR: 
--index_file was provided but the index file path is not specified in the samplesheet!" + error "ERROR: Index file path was provided but the index file path is not specified in the samplesheet!" } } return array diff --git a/subworkflows/local/repertoire_analysis_reporting.nf b/subworkflows/local/repertoire_analysis_reporting.nf index 2a796751..905c2d85 100644 --- a/subworkflows/local/repertoire_analysis_reporting.nf +++ b/subworkflows/local/repertoire_analysis_reporting.nf @@ -30,7 +30,7 @@ workflow REPERTOIRE_ANALYSIS_REPORTING { main: ch_versions = Channel.empty() - if (params.mode == "fastq" && params.library_generation_method != "sc_10x_genomics") { + if (params.mode == "fastq" && params.library_generation_method != "sc_10x_genomics" && params.library_generation_method != "trust4" ) { PARSE_LOGS( ch_presto_filterseq_logs, ch_presto_maskprimers_logs, diff --git a/subworkflows/local/rnaseq_input.nf b/subworkflows/local/rnaseq_input.nf new file mode 100644 index 00000000..6469e6b5 --- /dev/null +++ b/subworkflows/local/rnaseq_input.nf @@ -0,0 +1,135 @@ +include { PREPARE_TRUST4_REFERENCE } from '../../modules/local/prepare_trust4_reference' +include { TRUST4 } from '../../modules/nf-core/trust4/main' +include { FASTQ_INPUT_CHECK } from '../../subworkflows/local/fastq_input_check' +include { CHANGEO_PARSEDB_SELECT_LOCUS } from '../../modules/local/changeo/changeo_parsedb_select_locus' +include { CHANGEO_CONVERTDB_FASTA as CHANGEO_CONVERTDB_FASTA_FROM_AIRR } from '../../modules/local/changeo/changeo_convertdb_fasta' +include { FASTP } from '../../modules/nf-core/fastp/main' +include { RENAME_FASTQ as RENAME_FASTQ_TRUST4 } from '../../modules/local/rename_fastq' + + + +workflow RNASEQ_INPUT { + + take: + ch_input + ch_igblast_reference + + main: + + ch_versions = Channel.empty() + ch_logs = Channel.empty() + + // + // read in samplesheet, validate and stage input files + // + FASTQ_INPUT_CHECK( + ch_input + ) + ch_versions =
ch_versions.mix(FASTQ_INPUT_CHECK.out.versions) + + ch_reads = FASTQ_INPUT_CHECK.out.reads + + + // validate library generation method parameters + if (params.vprimers) { + error "The TRUST4 library generation method does not require V-region primers, please provide a reference file instead or select another library method option." + } else if (params.race_linker) { + error "The TRUST4 library generation method does not require the --race_linker parameter, please provide a reference file instead or select another library method option." + } + if (params.cprimers) { + error "The TRUST4 library generation method does not require C-region primers, please provide a reference file instead or select another library method option." + } + if (params.umi_length > 0) { + error "TRUST4 library generation method does not require to set the UMI length, please provide a reference file instead or select another library method option." + } + if (params.reference_10x) { + error "The TRUST4 library generation method does not require the --reference_10x parameter, please provide a compliant reference file instead or select another library method option."
+ } + + // Fastp + save_merged = false + FASTP ( + ch_reads, + [], + [], + save_merged + ) + ch_versions = ch_versions.mix(FASTP.out.versions) + + ch_rename_fastq = FASTP.out.reads.map { meta, reads -> [meta, reads[0], reads[1]] } + + // rename fastp output + RENAME_FASTQ_TRUST4( + ch_rename_fastq + ) + + ch_reads_fastp_filtered = RENAME_FASTQ_TRUST4.out.reads.dump(tag: "fastp_filtered") + + PREPARE_TRUST4_REFERENCE( + ch_reads_fastp_filtered, + ch_igblast_reference + ) + + + // create trust4 input + ch_reads_trust4 = ch_reads_fastp_filtered.map{ meta, read_1, read_2 -> [ meta, [], [read_1, read_2] ] } + + PREPARE_TRUST4_REFERENCE.out.trust4_reference.dump(tag: "trust4_reference") + + ch_reads_trust4.dump(tag: "trust4_input") + + // create barcode and umi channels for nf-core trust4 module + barcode_channel = ch_reads_fastp_filtered.map { meta, read_1, read_2 -> [meta, params.barcode_read] } + umi_channel = ch_reads_fastp_filtered.map { meta, read_1, read_2 -> [meta, params.umi_read] } + + TRUST4( + ch_reads_trust4, + PREPARE_TRUST4_REFERENCE.out.trust4_reference, + Channel.of([[], []]).collect(), + barcode_channel, + umi_channel + ) + + ch_trust4_out = TRUST4.out.outs + + // check whether input is sc or bulk and extract respective airr file for downstream processing + ch_trust4_out + .branch { + meta, out_files -> + bulk : meta["single_cell"] == "false" + return [ meta, out_files.find { it.endsWith("${meta.id}_airr.tsv") } ] + sc : meta["single_cell"] == "true" + return [ meta, out_files.find { it.endsWith("${meta.id}_barcode_airr.tsv") } ] + } + .set { ch_trust4_airr_file } + + + // create channel with airr file + ch_trust4_airr_file.bulk.mix ( ch_trust4_airr_file.sc ).set { ch_trust4_airr } + + // select only provided locus + CHANGEO_PARSEDB_SELECT_LOCUS(ch_trust4_airr) + + + // convert airr tsv to fasta + CHANGEO_CONVERTDB_FASTA_FROM_AIRR( + CHANGEO_PARSEDB_SELECT_LOCUS.out.tab + ) + + ch_fasta = CHANGEO_CONVERTDB_FASTA_FROM_AIRR.out.fasta + + + emit: + 
versions = ch_versions + // fastp + fastp_reads_json = FASTP.out.json.collect{ meta,json -> json } + fastp_reads_html = FASTP.out.html.collect{ meta,html -> html } + // complete trust4 output + outs = ch_trust4_out + // trust4 airr file + airr = ch_trust4_airr + // trust4 output converted to FASTA format + fasta = ch_fasta + samplesheet = FASTQ_INPUT_CHECK.out.samplesheet + +} diff --git a/subworkflows/local/vdj_annotation.nf b/subworkflows/local/vdj_annotation.nf index 692320ec..18d052bf 100644 --- a/subworkflows/local/vdj_annotation.nf +++ b/subworkflows/local/vdj_annotation.nf @@ -38,19 +38,25 @@ workflow VDJ_ANNOTATION { ch_assigned_tab = CHANGEO_MAKEDB.out.tab ch_assignment_logs = CHANGEO_MAKEDB.out.logs - // Apply quality filters: - // - locus should match v_call chain - // - seq alignment min length informative positions 200 - // - max 10% N nucleotides - FILTER_QUALITY( - ch_assigned_tab - ) - ch_logs = ch_logs.mix(FILTER_QUALITY.out.logs) - ch_versions = ch_versions.mix(FILTER_QUALITY.out.versions) + if (!params.skip_alignment_filter){ + // Apply quality filters: + // - locus should match v_call chain + // - seq alignment min length informative positions 200 + // - max 10% N nucleotides + FILTER_QUALITY( + ch_assigned_tab + ) + ch_for_parsedb_split = FILTER_QUALITY.out.tab + ch_logs = ch_logs.mix(FILTER_QUALITY.out.logs) + ch_versions = ch_versions.mix(FILTER_QUALITY.out.versions) + } else { + ch_for_parsedb_split = ch_assigned_tab + } + if (params.productive_only) { CHANGEO_PARSEDB_SPLIT ( - FILTER_QUALITY.out.tab + ch_for_parsedb_split ) ch_logs = ch_logs.mix(CHANGEO_PARSEDB_SPLIT.out.logs) ch_versions = ch_versions.mix(CHANGEO_PARSEDB_SPLIT.out.versions) diff --git a/workflows/airrflow.nf b/workflows/airrflow.nf index e9165cf7..b5815419 100644 --- a/workflows/airrflow.nf +++ b/workflows/airrflow.nf @@ -43,6 +43,7 @@ include { CLONAL_ANALYSIS } from '../subworkflows/local/clonal_ana include { REPERTOIRE_ANALYSIS_REPORTING } from 
'../subworkflows/local/repertoire_analysis_reporting' include { SC_RAW_INPUT } from '../subworkflows/local/sc_raw_input' include { FASTQ_INPUT_CHECK } from '../subworkflows/local/fastq_input_check' +include { RNASEQ_INPUT } from '../subworkflows/local/rnaseq_input' /* ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ @@ -94,24 +95,51 @@ workflow AIRRFLOW { ch_validated_samplesheet = SC_RAW_INPUT.out.samplesheet.collect() - ch_presto_filterseq_logs = Channel.empty() - ch_presto_maskprimers_logs = Channel.empty() - ch_presto_pairseq_logs = Channel.empty() - ch_presto_clustersets_logs = Channel.empty() - ch_presto_buildconsensus_logs = Channel.empty() - ch_presto_postconsensus_pairseq_logs = Channel.empty() - ch_presto_assemblepairs_logs = Channel.empty() - ch_presto_collapseseq_logs = Channel.empty() - ch_presto_splitseq_logs = Channel.empty() - ch_fastp_html = Channel.empty() - ch_fastp_json = Channel.empty() - ch_fastqc_postassembly_mqc = Channel.empty() - } else { - // Perform sequence assembly if input type is fastq from bulk sequencing data - SEQUENCE_ASSEMBLY( - ch_input, - DATABASES.out.igblast.collect() - ) + ch_presto_filterseq_logs = Channel.empty() + ch_presto_maskprimers_logs = Channel.empty() + ch_presto_pairseq_logs = Channel.empty() + ch_presto_clustersets_logs = Channel.empty() + ch_presto_buildconsensus_logs = Channel.empty() + ch_presto_postconsensus_pairseq_logs = Channel.empty() + ch_presto_assemblepairs_logs = Channel.empty() + ch_presto_collapseseq_logs = Channel.empty() + ch_presto_splitseq_logs = Channel.empty() + ch_fastp_html = Channel.empty() + ch_fastp_json = Channel.empty() + ch_fastqc_postassembly_mqc = Channel.empty() + + } else if (params.library_generation_method == "trust4") { + // Extract VDJ sequences from "general" RNA seq data using TRUST4 + + RNASEQ_INPUT ( + ch_input, + DATABASES.out.igblast.collect() + ) + + ch_fasta = RNASEQ_INPUT.out.fasta + ch_versions = 
ch_versions.mix(RNASEQ_INPUT.out.versions) + + ch_validated_samplesheet = RNASEQ_INPUT.out.samplesheet.collect() + + ch_presto_filterseq_logs = Channel.empty() + ch_presto_maskprimers_logs = Channel.empty() + ch_presto_pairseq_logs = Channel.empty() + ch_presto_clustersets_logs = Channel.empty() + ch_presto_buildconsensus_logs = Channel.empty() + ch_presto_postconsensus_pairseq_logs = Channel.empty() + ch_presto_assemblepairs_logs = Channel.empty() + ch_presto_collapseseq_logs = Channel.empty() + ch_presto_splitseq_logs = Channel.empty() + ch_fastp_html = RNASEQ_INPUT.out.fastp_reads_html + ch_fastp_json = RNASEQ_INPUT.out.fastp_reads_json + ch_fastqc_postassembly_mqc = Channel.empty() + } + else { + // Perform sequence assembly if input type is fastq from bulk sequencing data + SEQUENCE_ASSEMBLY( + ch_input, + DATABASES.out.igblast.collect() + ) ch_fasta = SEQUENCE_ASSEMBLY.out.fasta ch_versions = ch_versions.mix(SEQUENCE_ASSEMBLY.out.versions)