diff --git a/CHANGELOG.md b/CHANGELOG.md index 0fb91b64..d8fdba6f 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -8,6 +8,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ### `Added` - Support for brukers tdf format by adding tdf2mzml converter [#263](https://github.com/nf-core/mhcquant/issues/263) +- DeepLC retention time prediction ### `Fixed` diff --git a/bin/check_samplesheet.py b/bin/check_samplesheet.py index 4fe2eb65..583058a9 100755 --- a/bin/check_samplesheet.py +++ b/bin/check_samplesheet.py @@ -28,7 +28,12 @@ class RowChecker: VALID_FORMATS = (".raw", ".mzML", ".d") def __init__( - self, id_col="ID", sample_col="Sample", condition_col="Condition", filename_col="ReplicateFileName", **kwargs + self, + id_col="ID", + sample_col="Sample", + condition_col="Condition", + filename_col="ReplicateFileName", + **kwargs, ): """ Initialize the row checker with the expected column names. diff --git a/bin/deeplc_cli.py b/bin/deeplc_cli.py new file mode 100755 index 00000000..0ab42681 --- /dev/null +++ b/bin/deeplc_cli.py @@ -0,0 +1,418 @@ +#!/usr/bin/env python +# Written by Jonas Scheid and Steffen Lemke + +import click +import logging +import math +import os +import pandas as pd +import numpy as np +import sys +import tensorflow as tf +from deeplc import DeepLC +from pyopenms import IdXMLFile +from sklearn.preprocessing import MinMaxScaler + +os.environ["TF_CPP_MIN_LOG_LEVEL"] = "3" # Set TensorFlow logging level to suppress warnings +tf.get_logger().setLevel(logging.ERROR) # Filter out specific warnings + +# initate logger +console = logging.StreamHandler() +formatter = logging.Formatter("%(asctime)s - %(name)s - %(levelname)s - %(message)s") +console.setFormatter(formatter) +LOG = logging.getLogger("DeepLC prediction") +LOG.addHandler(console) +LOG.setLevel(logging.INFO) + + +def parse_idxml(path: str) -> tuple[list, list]: + """ + Parse idXML file and return PeptideIdentification and ProteinIdentification objects. + + :param path: path to idXML file + :type path: str + :return: ProteinIdentification and PeptideIdentification objects + :rtype: (list, list) + """ + protein_ids = [] + peptide_ids = [] + IdXMLFile().load(path, protein_ids, peptide_ids) + + return protein_ids, peptide_ids + + +def generate_deeplc_input(peptide_ids: list) -> pd.DataFrame: + """ + Generate input for DeepLC from PeptideIdentification objects. + + :param peptide_ids: list of PeptideIdentification objects + :type peptide_ids: list + :return: Pandas DataFrame containing the input for DeepLC + :rtype: pd.DataFrame + + """ + data = [] + for peptide_id in peptide_ids: + tr = peptide_id.getRT() + scan_id = peptide_id.getMetaValue("spectrum_reference") + for hit in peptide_id.getHits(): + sequence = hit.getSequence() + unmodified_sequence = sequence.toUnmodifiedString() + x_corr = hit.getMetaValue("MS:1002252") + target_decoy = hit.getMetaValue("target_decoy") + + # get all modificaitons + hit_mods = [] + for pos in range(0, sequence.size()): + residue = sequence.getResidue(pos) + if residue.isModified(): + hit_mods.append("|".join([str(pos + 1), residue.getModificationName()])) + if hit_mods == []: + modifications = "" + else: + modifications = "|".join(hit_mods) + + data.append([unmodified_sequence, modifications, tr, x_corr, target_decoy, str(sequence), scan_id]) + + df_deeplc_input = pd.DataFrame( + data, columns=["seq", "modifications", "tr", "x_corr", "target_decoy", "seq_with_mods", "scan_id"] + ) + + return df_deeplc_input + + +def generate_calibration_df(df: pd.DataFrame, num_bins: int) -> pd.DataFrame: + """ + Generates a pandas DataFrame containing calibration peptides for DeepLC. + The input DataFrame is sorted by measured retention time and sliced into + bins of equal peptide count. For each bin the peptides with the highest + x_correlation is selected and returned in a Pandas DataFrame + + :param df: Input DataFrame with retention time of each peptide and xcorr score + :type df: pd.DataFrame + :param num_bins: Number of bins/number of calibratoin peptides to be extracted + :type num_bins: int + :return: Pandas DataFrame containing calibration peptides equal to index-based num_bins + :rtype: pd.DataFrame + + """ + # remove decoys + df = df[df["target_decoy"] != "decoy"] + + # Compute the bin size based on the number of bins + bin_size = len(df) // num_bins + + # Sort the dataframe by tr values + sorted_df = df.sort_values("tr") + + # Rows for dataframe + filtered_row = [] + + # Iterate over the bins + for i in range(num_bins): + # Get the start and end indices of the current bin + start_index = i * bin_size + end_index = start_index + bin_size + + # Get the subset of the dataframe for the current bin + bin_df = sorted_df.iloc[start_index:end_index] + + # Find the row with the maximum x_corr value in the current bin + max_row = bin_df.loc[bin_df["x_corr"].idxmax()] + + # Append the max row to the filtered dataframe + filtered_row.append(max_row) + + # Create DataFrame + calibration_df = pd.DataFrame(filtered_row) + + return calibration_df.copy() + + +def generate_calibration_df_with_RT_bins(df: pd.DataFrame, num_bins: int) -> pd.DataFrame: + """ + Generates a pandas DataFrame containing calibration peptides for DeepLC. + The input DataFrame is sorted by measured retention time and sliced into bins of equal retention time. + For each bin the peptides with the highest x_correlation is selected and return in a Pandas DataFrame + + :param df: Input DataFrame with retention time of each peptide and xcorr score + :type df: pd.DataFrame + :param num_bins: Number of bins/number of calibratoin peptides to be extracted + :type num_bins: int + :return: Pandas DataFrame containing calibration peptides equal to RT-based num_bins + :rtype: pd.DataFrame + """ + # remove decoys + df = df[df["target_decoy"] != "decoy"] + + # Sort the dataframe by tr values + sorted_df = df.sort_values("tr") + + # Create list of linear bins between min and max tr with num_bins and access dataframe with index + bin_size = (sorted_df["tr"].max() - sorted_df["tr"].min()) / num_bins + + # Rows for dataframe + filtered_row = [] + + # Iterate over the bins + for i in range(num_bins): + # Get the start and end indices of the current bin + start_tr = sorted_df["tr"].min() + i * bin_size + end_tr = start_tr + bin_size + + # Get the subset of the dataframe for the current bin + bin_df = sorted_df[(sorted_df["tr"] >= start_tr) & (sorted_df["tr"] < end_tr)] + + # skip if bin is empty (no measurements in RT bin) + if len(bin_df) == 0: + continue + + # Find the row with the maximum x_corr value in the current bin + max_row = bin_df.loc[bin_df["x_corr"].idxmax()] + + # Append the max row to the filtered dataframe + filtered_row.append(max_row) + + # Create DataFrame + calibration_df = pd.DataFrame(filtered_row) + + return calibration_df + + +def min_max_scaler(df: pd.DataFrame) -> pd.DataFrame: + """ + Scales the predicted retention time values of the input DataFrame to the range of the measured retention time values + + :param df: Input DataFrame with predicted retention time values + :type df: pd.DataFrame + :return: DataFrame with scaled predicted retention time values + :rtype: pd.DataFrame + """ + scaler = MinMaxScaler((min(df["tr"]), max(df["tr"]))) + df["predicted_RT"] = scaler.fit_transform(df[["predicted_RT"]]) + + return df + + +def run_deeplc(df: pd.DataFrame, calibration_df: pd.DataFrame = None) -> pd.DataFrame: + dlc = DeepLC() + if calibration_df is not None: + dlc.calibrate_preds(seq_df=calibration_df) + preds = dlc.make_preds(seq_df=df) + df["predicted_RT"] = preds + else: + preds = dlc.make_preds(seq_df=df, calibrate=False) + df["predicted_RT"] = preds + df = min_max_scaler(df) + + return df + + +def add_rt_error( + peptide_ids: list, + prediction_dict: dict, + add_abs_rt_error: bool = False, + add_sqr_rt_error: bool = False, + add_log_rt_error: bool = False, +) -> list: + """ + Adds the error of the predicted retention time in comparison to the measured retention time to each peptide hit. + Different error scores can be selected. + + :param peptide_ids: list of PeptideIdentification objects + :type peptide_ids: list + :param prediction_dict: dictionary containing the predicted retention time for each peptide sequence + :type prediction_dict: dict + :param add_abs_rt_error: add absolute RT prediction errors to idXML + :type add_abs_rt_error: bool + :param add_sqr_rt_error: add squared RT prediction errors to idXML + :type add_sqr_rt_error: bool + :param add_log_rt_error: add log RT prediction errors to idXML + :type add_log_rt_error: bool + :return: list of PeptideIdentification objects with added error scores + :rtype: list + """ + noncanonical_aa = ["B", "J", "O", "U", "X", "Z"] + peptide_hits_noncanonical_aa = {} + abs_rt_errors = [] + sqr_rt_errors = [] + log_rt_errors = [] + + for peptide_id in peptide_ids: + # Get measured Retention time + measured_rt = peptide_id.getRT() + scan_id = peptide_id.getMetaValue("spectrum_reference") + + # Initilaize list for edited hits (with added features) + new_hits = [] + for hit in peptide_id.getHits(): + sequence = hit.getSequence() + unmodified_sequence = sequence.toUnmodifiedString() + # Catch peptides with noncanonical amino acids and save spectrum reference and hit in dictionary + if any(aa in noncanonical_aa for aa in unmodified_sequence): + peptide_hits_noncanonical_aa[(peptide_id.getMetaValue("spectrum_reference"), sequence)] = hit + continue + + predicted_rt = prediction_dict[(str(sequence), scan_id)] + + # calculate abs error + if add_abs_rt_error: + abs_error = abs(measured_rt - predicted_rt) + hit.setMetaValue("deeplc_abs_error", abs_error) + abs_rt_errors.append(abs_error) + + # calculate seq error + if add_sqr_rt_error: + sqr_error = abs(measured_rt - predicted_rt) ** 2 + hit.setMetaValue("deeplc_sqr_error", sqr_error) + sqr_rt_errors.append(sqr_error) + + # calcultae log error + if add_log_rt_error: + log_error = math.log(abs(measured_rt - predicted_rt)) + hit.setMetaValue("deeplc_log_error", log_error) + log_rt_errors.append(log_error) + + new_hits.append(hit) + peptide_id.setHits(new_hits) + + # Add peptides with noncanonical amino acids to peptide_ids and return the median error + for scan_id, sequence in peptide_hits_noncanonical_aa.keys(): + LOG.info( + f"Peptide {sequence} hit of spectrum {scan_id} contains noncanonical amino acids. Adding median error(s)" + ) + # get peptide id for scan id + peptide_id = [ + peptide_id for peptide_id in peptide_ids if peptide_id.getMetaValue("spectrum_reference") == scan_id + ][0] + hit = peptide_hits_noncanonical_aa[(scan_id, sequence)] + if add_abs_rt_error: + hit.setMetaValue("deeplc_abs_error", np.median(abs_rt_errors)) + if add_sqr_rt_error: + hit.setMetaValue("deeplc_sqr_error", np.median(sqr_rt_errors)) + if add_log_rt_error: + hit.setMetaValue("deeplc_log_error", np.median(log_rt_errors)) + peptide_id.insertHit(hit) + + peptide_ids = peptide_ids + + return peptide_ids + + +@click.command() +@click.option("-i", "--input", help="input path of idXML", required=True) +@click.option("-o", "--output", help="output path of idXML", required=True) +@click.option( + "--calibration_mode", + type=click.Choice(["idx_bin", "rt_bin", "min_max"]), + default="idx_bin", + help="Calibration method", +) +@click.option( + "--calibration_bins", + type=click.IntRange(min=2), + default=20, + help="number of bins for calibration", +) +@click.option( + "--add_abs_rt_error", + is_flag=True, + help="add absolute RT prediction errors to idXML", +) +@click.option( + "--add_sqr_rt_error", + is_flag=True, + help="add squared RT prediction errors to idXML (default if nothing is selected)", +) +@click.option("--add_log_rt_error", is_flag=True, help="add log RT prediction errors to idXML") +@click.option( + "--debug", + is_flag=True, + help="Additionally write out calibration file and deeplc output", +) +def main( + input: str, + output: str, + calibration_mode: str, + calibration_bins: int, + add_abs_rt_error: bool, + add_sqr_rt_error: bool, + add_log_rt_error: bool, + debug: bool, +): + # check if at least one error is selected, if not set squared error to true + num_true = sum([add_abs_rt_error, add_sqr_rt_error, add_log_rt_error]) + if num_true == 0: + LOG.info("No error calculation was set, falling back to squared error") + add_sqr_rt_error = True + + LOG.info("Parse idXML") + protein_ids, peptide_ids = parse_idxml(input) + + LOG.info("Generate DeepLC input") + df_deeplc_input = generate_deeplc_input(peptide_ids) + # Skip sequences with noncanonical amino acids, DeepLC cannot predict them + # Add them later with median error + df_deeplc_input = df_deeplc_input[~df_deeplc_input["seq"].str.contains("B|J|O|U|X|Z")] + + if len(df_deeplc_input[df_deeplc_input["target_decoy"] != "decoy"]) <= calibration_bins: + LOG.info("Number of peptide hits is smaller than calibration bins. Falling back to min/max scaling") + calibration_mode = "min_max" + + # Run DeepLC + if calibration_mode == "rt_bin": + LOG.info("Run DeepLC with RT bin calibration") + calibration_df = generate_calibration_df_with_RT_bins(df_deeplc_input, calibration_bins) + if debug: + calibration_df.to_csv(output + "_calibration.tsv", index=False, sep="\t") + df_deeplc_input.to_csv(output + "_deeplc_input.tsv", index=False, sep="\t") + df_deeplc_output = run_deeplc(df_deeplc_input, calibration_df) + + elif calibration_mode == "idx_bin": + LOG.info("Run DeepLC with index bin calibration") + calibration_df = generate_calibration_df(df_deeplc_input, calibration_bins) + if debug: + calibration_df.to_csv(output + "_calibration.tsv", index=False, sep="\t") + df_deeplc_input.to_csv(output + "_deeplc_input.tsv", index=False, sep="\t") + df_deeplc_output = run_deeplc(df_deeplc_input, calibration_df) + + elif calibration_mode == "min_max": + LOG.info("Run DeepLC with min/max calibration") + if debug: + df_deeplc_input.to_csv(output + "_deeplc_input.tsv", index=False, sep="\t") + df_deeplc_output = run_deeplc(df_deeplc_input) + + if debug: + df_deeplc_output.to_csv(output + "_deeplc_output.tsv", index=False, sep="\t") + + # Create map containing the predicted retention time for each peptide sequence and modification + sequence_to_prediction = {} + for seq_mod, scan_id, pred_rt in zip( + df_deeplc_output["seq_with_mods"], + df_deeplc_output["scan_id"], + df_deeplc_output["predicted_RT"], + ): + sequence_to_prediction[(seq_mod, scan_id)] = pred_rt + + LOG.info("Add error to idXML") + peptide_ids_pred_RT = add_rt_error( + peptide_ids, + sequence_to_prediction, + add_abs_rt_error, + add_sqr_rt_error, + add_log_rt_error, + ) + + LOG.info("Write idXML") + IdXMLFile().store(output, protein_ids, peptide_ids_pred_RT) + + if debug: + df_deeplc_input.to_csv(output + "_deeplc_input.tsv", index=False, sep="\t") + if calibration_mode == "rt_bin" or calibration_mode == "idx_bin": + calibration_df.to_csv(output + "_calibration.tsv", index=False, sep="\t") + + return 0 + + +if __name__ == "__main__": + sys.exit(main()) diff --git a/bin/markdown_to_html.py b/bin/markdown_to_html.py index 06bc6b10..168daee6 100755 --- a/bin/markdown_to_html.py +++ b/bin/markdown_to_html.py @@ -12,7 +12,14 @@ def convert_markdown(in_fn): input_md = io.open(in_fn, mode="r", encoding="utf-8").read() html = markdown.markdown( "[TOC]\n" + input_md, - extensions=["pymdownx.extra", "pymdownx.b64", "pymdownx.highlight", "pymdownx.emoji", "pymdownx.tilde", "toc"], + extensions=[ + "pymdownx.extra", + "pymdownx.b64", + "pymdownx.highlight", + "pymdownx.emoji", + "pymdownx.tilde", + "toc", + ], extension_configs={ "pymdownx.b64": {"base_path": os.path.dirname(in_fn)}, "pymdownx.highlight": {"noclasses": True}, @@ -74,9 +81,18 @@ def wrap_html(contents): def parse_args(args=None): parser = argparse.ArgumentParser() - parser.add_argument("mdfile", type=argparse.FileType("r"), nargs="?", help="File to convert. Defaults to stdin.") parser.add_argument( - "-o", "--out", type=argparse.FileType("w"), default=sys.stdout, help="Output file name. Defaults to stdout." + "mdfile", + type=argparse.FileType("r"), + nargs="?", + help="File to convert. Defaults to stdin.", + ) + parser.add_argument( + "-o", + "--out", + type=argparse.FileType("w"), + default=sys.stdout, + help="Output file name. Defaults to stdout.", ) return parser.parse_args(args) diff --git a/bin/mhcnuggets_predict_peptides.py b/bin/mhcnuggets_predict_peptides.py index d4ae30ee..724e89b3 100755 --- a/bin/mhcnuggets_predict_peptides.py +++ b/bin/mhcnuggets_predict_peptides.py @@ -176,7 +176,12 @@ def main(): supp_alleles = parse_alleles(args.alleles) for allele in supp_alleles: - predict(class_="II", peptides_path=args.peptides, mhc=allele, output=allele + args.output) + predict( + class_="II", + peptides_path=args.peptides, + mhc=allele, + output=allele + args.output, + ) else: op = open("predicted_neoepitopes_class_2", "w") diff --git a/bin/preprocess_neoepitopes_mhcnuggets.py b/bin/preprocess_neoepitopes_mhcnuggets.py index 3c08b32a..d454d1c2 100755 --- a/bin/preprocess_neoepitopes_mhcnuggets.py +++ b/bin/preprocess_neoepitopes_mhcnuggets.py @@ -41,7 +41,10 @@ def main(): model.add_argument("-n", "--neoepitopes", type=str, help="neoepitopes input file") model.add_argument( - "-o", "--output", type=str, help="preprocess neoepitope file for subsequent mhcnuggets prediction" + "-o", + "--output", + type=str, + help="preprocess neoepitope file for subsequent mhcnuggets prediction", ) args = model.parse_args() diff --git a/bin/resolve_neoepitopes.py b/bin/resolve_neoepitopes.py index 5f30ab70..378798c1 100755 --- a/bin/resolve_neoepitopes.py +++ b/bin/resolve_neoepitopes.py @@ -149,7 +149,13 @@ def main(): model.add_argument("-m", "--mztab", type=str, help="Path to mztab file") - model.add_argument("-f", "--file_format", type=str, default="csv", help="File format for output file") + model.add_argument( + "-f", + "--file_format", + type=str, + default="csv", + help="File format for output file", + ) model.add_argument("-o", "--output", type=str, required=True, help="Output file path") diff --git a/bin/variants2fasta.py b/bin/variants2fasta.py index b1b900da..308795a3 100755 --- a/bin/variants2fasta.py +++ b/bin/variants2fasta.py @@ -161,7 +161,11 @@ def main(): model.add_argument("-f", "--fasta_ref", type=str, default=None, help="Path to the fasta input file") model.add_argument( - "-p", "--proteins", type=str, default=None, help="Path to the protein ID input file (in HGNC-ID)" + "-p", + "--proteins", + type=str, + default=None, + help="Path to the protein ID input file (in HGNC-ID)", ) model.add_argument( @@ -173,7 +177,10 @@ def main(): ) model.add_argument( - "-fINDEL", "--filterINDEL", action="store_true", help="Filter insertions and deletions (including frameshifts)" + "-fINDEL", + "--filterINDEL", + action="store_true", + help="Filter insertions and deletions (including frameshifts)", ) model.add_argument("-fFS", "--filterFSINDEL", action="store_true", help="Filter frameshift INDELs") @@ -215,12 +222,20 @@ def main(): if args.filterINDEL: variants = filter( lambda x: x.type - not in [VariationType.INS, VariationType.DEL, VariationType.FSDEL, VariationType.FSINS], + not in [ + VariationType.INS, + VariationType.DEL, + VariationType.FSDEL, + VariationType.FSINS, + ], variants, ) if args.filterFSINDEL: - variants = filter(lambda x: x.type not in [VariationType.FSDEL, VariationType.FSINS], variants) + variants = filter( + lambda x: x.type not in [VariationType.FSDEL, VariationType.FSINS], + variants, + ) if not variants: sys.stderr.write("No variants left after filtering. Please refine your filtering criteria.\n") diff --git a/bin/vcf_neoepitope_predictor.py b/bin/vcf_neoepitope_predictor.py index b4227d23..95c08733 100755 --- a/bin/vcf_neoepitope_predictor.py +++ b/bin/vcf_neoepitope_predictor.py @@ -224,19 +224,35 @@ def main(): ) model.add_argument( - "-p", "--proteins", type=str, default=None, help="Path to the protein ID input file (in HGNC-ID)" + "-p", + "--proteins", + type=str, + default=None, + help="Path to the protein ID input file (in HGNC-ID)", ) model.add_argument( - "-minl", "--peptide_min_length", type=int, default=8, help="Minimum peptide length for epitope prediction" + "-minl", + "--peptide_min_length", + type=int, + default=8, + help="Minimum peptide length for epitope prediction", ) model.add_argument( - "-maxl", "--peptide_max_length", type=int, default=12, help="Maximum peptide length for epitope prediction" + "-maxl", + "--peptide_max_length", + type=int, + default=12, + help="Maximum peptide length for epitope prediction", ) model.add_argument( - "-a", "--alleles", type=str, required=True, help="Path to the allele file (one per line in new nomenclature)" + "-a", + "--alleles", + type=str, + required=True, + help="Path to the allele file (one per line in new nomenclature)", ) model.add_argument( @@ -248,7 +264,10 @@ def main(): ) model.add_argument( - "-fINDEL", "--filterINDEL", action="store_true", help="Filter insertions and deletions (including frameshifts)" + "-fINDEL", + "--filterINDEL", + action="store_true", + help="Filter insertions and deletions (including frameshifts)", ) model.add_argument("-fFS", "--filterFSINDEL", action="store_true", help="Filter frameshift INDELs") @@ -294,12 +313,20 @@ def main(): if args.filterINDEL: variants = filter( lambda x: x.type - not in [VariationType.INS, VariationType.DEL, VariationType.FSDEL, VariationType.FSINS], + not in [ + VariationType.INS, + VariationType.DEL, + VariationType.FSDEL, + VariationType.FSINS, + ], variants, ) if args.filterFSINDEL: - variants = filter(lambda x: x.type not in [VariationType.FSDEL, VariationType.FSINS], variants) + variants = filter( + lambda x: x.type not in [VariationType.FSDEL, VariationType.FSINS], + variants, + ) if not variants: sys.stderr.write("No variants left after filtering. Please refine your filtering criteria.\n") @@ -340,7 +367,11 @@ def main(): if protein_seq is not None: transcript_to_genes[ensembl_ids[EAdapterFields.TRANSID]] = l.strip() proteins.append( - Protein(protein_seq, gene_id=l.strip(), transcript_id=ensembl_ids[EAdapterFields.TRANSID]) + Protein( + protein_seq, + gene_id=l.strip(), + transcript_id=ensembl_ids[EAdapterFields.TRANSID], + ) ) epitopes = [] for length in range(args.peptide_min_length, args.peptide_max_length): diff --git a/bin/vcf_reader.py b/bin/vcf_reader.py index 743ab03b..9ad879fa 100755 --- a/bin/vcf_reader.py +++ b/bin/vcf_reader.py @@ -237,7 +237,15 @@ def read_vcf(filename, pass_only=True): if coding: pos, reference, alternative = get_fred2_annotation(vt, p, r, str(alt)) var = Variant( - "line" + str(num), vt, c, pos, reference, alternative, coding, isHomozygous, isSynonymous + "line" + str(num), + vt, + c, + pos, + reference, + alternative, + coding, + isHomozygous, + isSynonymous, ) var.gene = gene var.log_metadata("vardbid", variation_dbid) @@ -254,7 +262,17 @@ def read_vcf(filename, pass_only=True): for tId, vs in transToVar.iteritems(): if len(vs) > 10: for v in vs: - vs_new = Variant(v.id, v.type, v.chrom, v.genomePos, v.ref, v.obs, v.coding, True, v.isSynonymous) + vs_new = Variant( + v.id, + v.type, + v.chrom, + v.genomePos, + v.ref, + v.obs, + v.coding, + True, + v.isSynonymous, + ) vs_new.gene = v.gene for m in ["vardbid"]: vs_new.log_metadata(m, v.get_metadata(m)) diff --git a/conf/modules.config b/conf/modules.config index a53e6553..66fcc46c 100644 --- a/conf/modules.config +++ b/conf/modules.config @@ -433,3 +433,23 @@ process { } } } + + +process { + if (params.use_deeplc) { + withName: 'DEEPLC' { + publishDir = [ + path: {"${params.outdir}/DeepLC"}, + mode: params.publish_dir_mode, + pattern: '*.idXML' + ] + } + // DeepLC settings + use_deeplc = false + deeplc_calibration_mode = 'rt_bin' + deeplc_calibration_bins = 20 + deeplc_add_abs_rt_error = false + deeplc_add_sqr_rt_error = false + deeplc_add_log_rt_error = false + } +} diff --git a/modules/local/deeplc.nf b/modules/local/deeplc.nf new file mode 100644 index 00000000..800c510f --- /dev/null +++ b/modules/local/deeplc.nf @@ -0,0 +1,39 @@ +process DEEPLC { + tag "$meta.id" + label 'process_medium' + + conda "bioconda::deeplc=2.2.0 bioconda::pyopenms=2.9.1" + container 'ghcr.io/jonasscheid/mhcquant:deeplc' + + input: + tuple val(meta), path(idxml_in) + + output: + tuple val(meta), path('*.idXML'), emit: idxml + path "versions.yml", emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def prefix = idxml_in.baseName + def add_abs_rt_error = params.deeplc_add_abs_rt_error ? "--add_abs_rt_error" : "" + def add_sqr_rt_error = params.deeplc_add_sqr_rt_error ? "--add_sqr_rt_error" : "" + def add_log_rt_error = params.deeplc_add_log_rt_error ? "--add_log_rt_error" : "" + + """ + deeplc_cli.py \\ + --input $idxml_in \\ + --output ${prefix}_deeplc.idXML \\ + --calibration_mode ${params.deeplc_calibration_mode} \\ + --calibration_bins ${params.deeplc_calibration_bins} \\ + $add_abs_rt_error \\ + $add_sqr_rt_error \\ + $add_log_rt_error + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + DeepLC: \$(deeplc --version) + END_VERSIONS + """ +} diff --git a/modules/local/openms_psmfeatureextractor.nf b/modules/local/openms_psmfeatureextractor.nf index b2f1680b..dc9117eb 100644 --- a/modules/local/openms_psmfeatureextractor.nf +++ b/modules/local/openms_psmfeatureextractor.nf @@ -20,11 +20,25 @@ process OPENMS_PSMFEATUREEXTRACTOR { script: def prefix = task.ext.prefix ?: "${merged.baseName}_psm" def args = task.ext.args ?: '' + def extra_features = "" + if(params.use_deeplc){ + extra_features = "-extra" + if(params.add_abs_rt_error){ + extra_features = "${extra_features} deeplc_abs_error" + } + if(params.add_log_rt_error){ + extra_features = "${extra_features} deeplc_log_error" + } + if(params.add_sqr_rt_error || (!params.add_sqr_rt_error && !params.add_abs_rt_error && !params.add_log_rt_error)){ + extra_features = "${extra_features} deeplc_sqr_error" + } + } """ PSMFeatureExtractor -in $merged \\ -out ${prefix}.idXML \\ -threads $task.cpus \\ + $extra_features \\ $args cat <<-END_VERSIONS > versions.yml diff --git a/nextflow.config b/nextflow.config index d39c10ed..dd9e8f79 100644 --- a/nextflow.config +++ b/nextflow.config @@ -63,6 +63,14 @@ params { vcf_sheet = null annotate_ions = false + // DeepLC settings + use_deeplc = false + deeplc_calibration_mode = 'rt_bin' + deeplc_calibration_bins = 20 + deeplc_add_abs_rt_error = false + deeplc_add_sqr_rt_error = false + deeplc_add_log_rt_error = false + // MultiQC options skip_multiqc = false multiqc_config = null diff --git a/nextflow_schema.json b/nextflow_schema.json index ddbf4152..502cdb78 100644 --- a/nextflow_schema.json +++ b/nextflow_schema.json @@ -246,8 +246,8 @@ } } }, - "fdr_scoring": { - "title": "FDR Scoring", + "rescoring": { + "title": "Rescoring", "type": "object", "fa_icon": "fas fa-star-half-stroke", "description": "", @@ -294,6 +294,38 @@ "type": "integer", "fa_icon": "fas fa-train-track", "description": "Maximum subset for percolator training iterations" + }, + "use_deeplc": { + "type": "boolean", + "fa_icon": "fas fa-microchip", + "description": "Use DeepLC retention time features for Percolator rescoring", + "help_text": "https://www.nature.com/articles/s41592-021-01301-5" + }, + "deeplc_calibration_bins": { + "type": "integer", + "fa_icon": "fas fa-train-track", + "description": "Number of bins (peptides) used for DeepLC calibration. For each bin the best hit is used." + }, + "deeplc_calibration_mode": { + "type": "string", + "fa_icon": "fas fa-train-track", + "description": "Specify the DeepLC calibration mode. rt_bin: bin peptides by RT, idx_bin: bin peptides by index, min_max: scale uncalibrated predictions to experimental RT range", + "enum": ["rt_bin", "idx_bin", "min_max"] + }, + "deeplc_add_abs_rt_error": { + "type": "boolean", + "fa_icon": "fas fa-train-track", + "description": "Add absolute RT error to of experimental and predicted RT to the feature set" + }, + "deeplc_add_sqr_rt_error": { + "type": "boolean", + "fa_icon": "fas fa-train-track", + "description": "Add squared RT error to of experimental and predicted RT to the feature set" + }, + "deeplc_add_log_rt_error": { + "type": "boolean", + "fa_icon": "fas fa-train-track", + "description": "Add log RT error to of experimental and predicted RT to the feature set" } } }, @@ -609,7 +641,7 @@ "$ref": "#/definitions/mass_spectrometry_data_processing" }, { - "$ref": "#/definitions/fdr_scoring" + "$ref": "#/definitions/rescoring" }, { "$ref": "#/definitions/quantification_options" diff --git a/workflows/mhcquant.nf b/workflows/mhcquant.nf index 5c106c2f..b3b9dd9b 100644 --- a/workflows/mhcquant.nf +++ b/workflows/mhcquant.nf @@ -64,6 +64,7 @@ include { TDF2MZML } from include { OPENMS_PEAKPICKERHIRES } from '../modules/local/openms_peakpickerhires' include { OPENMS_COMETADAPTER } from '../modules/local/openms_cometadapter' include { OPENMS_PEPTIDEINDEXER } from '../modules/local/openms_peptideindexer' +include { DEEPLC } from '../modules/local/deeplc' include { OPENMS_TEXTEXPORTER as OPENMS_TEXTEXPORTER_COMET } from '../modules/local/openms_textexporter' @@ -135,7 +136,7 @@ workflow MHCQUANT { tdf : meta.ext == 'd' return [ meta, filename ] other : true } - .set { ms_files } + .set { branched_ms_files } // Input fasta file Channel.fromPath(params.fasta) @@ -167,15 +168,16 @@ workflow MHCQUANT { ch_decoy_db = ch_fasta_file } + ch_ms_files = (branched_ms_files.mzml) // Raw file conversion - THERMORAWFILEPARSER(ms_files.raw) + THERMORAWFILEPARSER(branched_ms_files.raw) ch_versions = ch_versions.mix(THERMORAWFILEPARSER.out.versions.ifEmpty(null)) - ch_ms_files = THERMORAWFILEPARSER.out.mzml.mix(ms_files.mzml.map{ it -> [it[0], it[1][0]] }) + ch_ms_files = ch_ms_files.mix(THERMORAWFILEPARSER.out.mzml) // timsTOF data conversion - TDF2MZML(ms_files.tdf) - ch_versions = ch_versions.mix(TDF2MZML.out.versions.ifEmpty(null)) - ch_ms_files = TDF2MZML.out.mzml.mix(ms_files.mzml.map{ it -> [it[0], it[1][0]] }) + TDF2MZML(branched_ms_files.tdf) + ch_versions = ch_versions.mix(TDF2MZML.out.versions.ifEmpty(null)) + ch_ms_files = ch_ms_files.mix(TDF2MZML.out.mzml) if (params.run_centroidisation) { // Optional: Run Peak Picking as Preprocessing @@ -189,11 +191,21 @@ workflow MHCQUANT { // Run comet database search OPENMS_COMETADAPTER( ch_mzml_file.join(ch_decoy_db, remainder:true)) + + // Run DeepLC if specified + if (params.use_deeplc){ + DEEPLC(OPENMS_COMETADAPTER.out.idxml) + ch_versions = ch_versions.mix(DEEPLC.out.versions.ifEmpty(null)) + ch_comet_out_idxml = DEEPLC.out.idxml + } else { + ch_comet_out_idxml = OPENMS_COMETADAPTER.out.idxml + } + // Write this information to an tsv file - OPENMS_TEXTEXPORTER_COMET(OPENMS_COMETADAPTER.out.idxml) + OPENMS_TEXTEXPORTER_COMET(ch_comet_out_idxml) ch_versions = ch_versions.mix(OPENMS_COMETADAPTER.out.versions.ifEmpty(null)) // Index decoy and target hits - OPENMS_PEPTIDEINDEXER(OPENMS_COMETADAPTER.out.idxml.join(ch_decoy_db)) + OPENMS_PEPTIDEINDEXER(ch_comet_out_idxml.join(ch_decoy_db)) ch_versions = ch_versions.mix(OPENMS_PEPTIDEINDEXER.out.versions.ifEmpty(null)) //