diff --git a/workflow/rules/de_analysis.smk b/workflow/rules/de_analysis.smk index 3ae1f64..1eed6d3 100644 --- a/workflow/rules/de_analysis.smk +++ b/workflow/rules/de_analysis.smk @@ -1,32 +1,6 @@ -rule make_dea_input: - input: - f"{OUTPUT}/Quantification/mt_normalized.tsv" if len(mt_exps) > 0 else f"{OUTPUT}/Metaproteomics/mp_normalized.tsv" - output: - f"{OUTPUT}/DE_analysis/dea_input.tsv" - threads: - 1 - run: - result = pd.DataFrame(columns=['sseqid']) - for file in input: - df = pd.read_csv(file, sep='\t') - sample = os.path.basename(os.path.dirname(file)) - # parse only first two columns of blast - blast = pd.read_csv( - f"{OUTPUT}/Annotation/{sample}/aligned.blast", sep='\t', usecols=[0,1], names=['qseqid', 'sseqid']) - df = pd.merge(df, blast, how="left", on="qseqid") - df = df.groupby('sseqid')[df.columns.tolist()[1:-1]].sum() - # remove rows of df with less than 2 non-missing values - df = df[(df > 0.0).sum(axis=1) > 1] - if len(result) > 0: - # inner join to keep all samples from having more than 2 non-missing values - result = pd.merge(result, df, how="inner", on="sseqid") - else: - result = pd.merge(result, df, how="outer", on="sseqid") - result.replace(0.0, np.nan).to_csv(output[0], sep='\t', index=False) - rule de_analysis: input: - f"{OUTPUT}/DE_analysis/dea_input.tsv" + f"{OUTPUT}/Quantification/dea_input.tsv" output: f"{OUTPUT}/DE_analysis/condition_treated_results.tsv", threads: diff --git a/workflow/rules/entry_report.smk b/workflow/rules/entry_report.smk index 9e24f9c..95b4393 100644 --- a/workflow/rules/entry_report.smk +++ b/workflow/rules/entry_report.smk @@ -1,9 +1,7 @@ rule entry_report: input: p_reports = expand("{output}/MOSCA_{sample}_Protein_Report.tsv", output=OUTPUT, sample=set(mg_exps['Sample'])), - norm = ( - expand("{output}/Quantification/mt_normalized.tsv", output=OUTPUT) + - expand("{output}/Metaproteomics/mp_normalized.tsv", output=OUTPUT)) + norm = f"{OUTPUT}/Quantification/mt_normalized.tsv" if len(mt_exps) > 0 else 
f"{OUTPUT}/Metaproteomics/mp_normalized.tsv" output: f"{OUTPUT}/MOSCA_Entry_Report.xlsx", f"{OUTPUT}/MOSCA_Entry_Report.tsv" diff --git a/workflow/rules/normalization.smk b/workflow/rules/normalization.smk index 6321991..76963a3 100644 --- a/workflow/rules/normalization.smk +++ b/workflow/rules/normalization.smk @@ -1,7 +1,7 @@ rule normalization: input: - f"{OUTPUT}/Quantification/mg_readcounts.tsv", - f"{OUTPUT}/Quantification/mt_readcounts.tsv" if len(mt_exps) > 0 else f"{OUTPUT}/Metaproteomics/mp_spectracounts.tsv" + f"{OUTPUT}/Quantification/mg_entry_quant.tsv", + f"{OUTPUT}/Quantification/mt_entry_quant.tsv" if len(mt_exps) > 0 else f"{OUTPUT}/Metaproteomics/mp_entry_quant.tsv" output: f"{OUTPUT}/Quantification/mg_normalized.tsv", f"{OUTPUT}/Quantification/mt_normalized.tsv" if len(mt_exps) > 0 else f"{OUTPUT}/Metaproteomics/mp_normalized.tsv" diff --git a/workflow/rules/protein_report.smk b/workflow/rules/protein_report.smk index 704f2b7..f872cfa 100644 --- a/workflow/rules/protein_report.smk +++ b/workflow/rules/protein_report.smk @@ -2,14 +2,17 @@ rule protein_report: input: expand("{output}/Annotation/{sample}/UPIMAPI_results.tsv", output=OUTPUT, sample=set(EXPS['Sample'])), expand("{output}/Annotation/{sample}/reCOGnizer_results.xlsx", output=OUTPUT, sample=set(EXPS["Sample"])), - expand("{output}/Quantification/{sample}_mg_readcounts.tsv", output=OUTPUT, sample=set(mg_exps['Sample'])), - expand("{output}/Quantification/{sample}_mt_readcounts.tsv", output=OUTPUT, sample=set(mt_exps['Sample'])), - expand("{output}/Metaproteomics/{sample}_mp_spectracounts.tsv", output=OUTPUT, sample=set(mp_exps['Sample'])) + expand("{output}/Quantification/{sample}_mg.readcounts", output=OUTPUT, sample=set(mg_exps['Sample'])), + expand("{output}/Quantification/{sample}_mg_norm.tsv", output=OUTPUT, sample=set(mg_exps['Sample'])), + expand("{output}/Quantification/{sample}_mt.readcounts", output=OUTPUT, sample=set(mt_exps['Sample'])), + 
expand("{output}/Quantification/{sample}_mt_norm.tsv", output=OUTPUT, sample=set(mg_exps['Sample'])), + expand("{output}/Metaproteomics/{sample}_mp.spectracounts", output=OUTPUT, sample=set(mp_exps['Sample'])) output: expand("{output}/MOSCA_{sample}_Protein_Report.tsv", output=OUTPUT, sample=set(mg_exps['Sample'])), f"{OUTPUT}/MOSCA_Protein_Report.xlsx", - f"{OUTPUT}/Quantification/mg_readcounts.tsv", - f"{OUTPUT}/Quantification/mt_readcounts.tsv" if len(mt_exps) > 0 else f"{OUTPUT}/Metaproteomics/mp_spectracounts.tsv" + f"{OUTPUT}/Quantification/dea_input.tsv", + f"{OUTPUT}/Quantification/mg_entry_quant.tsv", + f"{OUTPUT}/Quantification/mt_entry_quant.tsv" if len(mt_exps) > 0 else f"{OUTPUT}/Metaproteomics/mp_entry_quant.tsv" threads: 1 params: diff --git a/workflow/rules/quantification.smk b/workflow/rules/quantification.smk index ba5942c..b2ff10e 100644 --- a/workflow/rules/quantification.smk +++ b/workflow/rules/quantification.smk @@ -6,8 +6,10 @@ rule quantification: expand("{output}/Annotation/{sample}/fgs.ffn", output=OUTPUT, sample=set(EXPS["Sample"])) output: expand("{output}/Quantification/{name}.readcounts", output=OUTPUT, name=set(not_mp_exps['Name'])), - expand("{output}/Quantification/{sample}_mg_readcounts.tsv", output=OUTPUT, sample=set(mg_exps['Sample'])), - expand("{output}/Quantification/{sample}_mt_readcounts.tsv", output=OUTPUT, sample=set(mt_exps['Sample'])) + expand("{output}/Quantification/{sample}_mg.readcounts", output=OUTPUT, sample=set(mg_exps['Sample'])), + expand("{output}/Quantification/{sample}_mg_norm.tsv", output=OUTPUT, sample=set(mg_exps['Sample'])), + expand("{output}/Quantification/{sample}_mt.readcounts", output=OUTPUT, sample=set(mt_exps['Sample'])), + expand("{output}/Quantification/{sample}_mt_norm.tsv", output=OUTPUT, sample=set(mg_exps['Sample'])) threads: config["threads"] params: diff --git a/workflow/rules/summary_report.smk b/workflow/rules/summary_report.smk index 66d0e6b..d4b49f6 100644 --- 
a/workflow/rules/summary_report.smk +++ b/workflow/rules/summary_report.smk @@ -1,6 +1,6 @@ rule summary_report: input: - f"{OUTPUT}/MOSCA_{sample}_Protein_Report.tsv", + expand("{output}/MOSCA_{sample}_Protein_Report.tsv", output=OUTPUT, sample=set(EXPS['Sample'])), f"{OUTPUT}/MOSCA_Entry_Report.xlsx", f"{OUTPUT}/DE_analysis/condition_treated_results.tsv" output: diff --git a/workflow/scripts/main_reports.py b/workflow/scripts/main_reports.py index ad23c78..3ef8ba9 100644 --- a/workflow/scripts/main_reports.py +++ b/workflow/scripts/main_reports.py @@ -39,34 +39,28 @@ def make_protein_report(out, exps, sample, mg_preport, mt_preport, mp_preport): mg_names = exps[(exps['Sample'] == sample) & (exps['Data type'] == 'dna')]['Name'].tolist() mt_names = exps[(exps['Sample'] == sample) & (exps['Data type'] == 'mrna')]['Name'].tolist() mp_names = exps[(exps['Sample'] == sample) & (exps['Data type'] == 'protein')]['Name'].tolist() - for mg_name in mg_names: - readcounts = pd.read_csv( - f'{out}/Quantification/{mg_name}.readcounts.norm', sep='\t', header=None, names=['Contig', mg_name]) - readcounts['Contig'] = readcounts['Contig'].apply(lambda x: x.split('_')[1]) - report = pd.merge(report, readcounts, on='Contig', how='left') - for mt_name in mt_names: - readcounts = pd.read_csv( - f'{out}/Quantification/{mt_name}.readcounts.norm', sep='\t', header=None, names=['qseqid', mt_name]) - report = pd.merge(report, readcounts, on='qseqid', how='left') - if len(mp_names) > 0: - spectracounts = pd.read_csv(f'{out}/Metaproteomics/{sample}/spectracounts.tsv', sep='\t') - spectracounts.rename(columns={'Main Accession': 'qseqid'}, inplace=True) - report = pd.merge(report, spectracounts, on='qseqid', how='left') if len(mg_names) > 0: + mg_counts = pd.read_csv(f'{out}/Quantification/{sample}_mg_norm.tsv', sep='\t') + mg_counts['Contig'] = mg_counts['Contig'].apply(lambda x: x.split('_')[1]) + report = pd.merge(report, mg_counts, on='Contig', how='left') mg_preport = pd.merge(mg_preport, 
report[['Entry'] + mg_names], on='Entry', how='outer') if len(mt_names) > 0: + report = pd.merge( + report, pd.read_csv(f'{out}/Quantification/{sample}_mt_norm.tsv', sep='\t', names=[ + 'qseqid'] + mt_names), on='qseqid', how='left') mt_preport = pd.merge(mt_preport, report[['Entry'] + mt_names], on='Entry', how='outer') if len(mp_names) > 0: + spectracounts = pd.read_csv(f'{out}/Metaproteomics/{sample}/spectracounts.tsv', sep='\t') + spectracounts.rename(columns={'Main Accession': 'qseqid'}, inplace=True) + report = pd.merge(report, spectracounts, on='qseqid', how='left') mp_preport = pd.merge(mp_preport, report[['Entry'] + mp_names], on='Entry', how='outer') - report[mg_names + mt_names + mp_names] = report[mg_names + mt_names + mp_names].fillna(value=0).astype(int) + report[mg_names + mt_names + mp_names] = report[mg_names + mt_names + mp_names].fillna(value=0).astype(float).astype(int) report.to_csv(f'{out}/MOSCA_{sample}_Protein_Report.tsv', sep='\t', index=False) return report, mg_preport, mt_preport, mp_preport def make_protein_reports(out, exps, max_lines=1000000): - mg_report = pd.DataFrame(columns=['Entry']) - mt_report = pd.DataFrame(columns=['Entry']) - mp_report = pd.DataFrame(columns=['Entry']) + mg_report = mt_report = mp_report = pd.DataFrame(columns=['Entry']) writer = pd.ExcelWriter(f'{out}/MOSCA_Protein_Report.xlsx', engine='xlsxwriter') for sample in set(exps['Sample']): report, mg_report, mt_report, mp_report = make_protein_report( @@ -82,14 +76,30 @@ def make_protein_reports(out, exps, max_lines=1000000): timed_message(f'Wrote Protein Report for sample: {sample}.') writer.close() # Write quantification matrices to normalize all together - mg_report = mg_report.groupby('Entry')[mg_report.columns.tolist()[1:]].sum().reset_index() - mt_report = mt_report.groupby('Entry')[mt_report.columns.tolist()[1:]].sum().reset_index() - mp_report = mp_report.groupby('Entry')[mp_report.columns.tolist()[1:]].sum().reset_index() - 
mg_report.to_csv(f'{out}/Quantification/mg_entries.tsv', sep='\t', index=False) - mt_report.to_csv(f'{out}/Quantification/mt_entries.tsv', sep='\t', index=False) - mp_report[mp_report[mp_report.columns.tolist()[1:]].isnull().sum( - axis=1) < len(mp_report.columns.tolist()[1:])].drop_duplicates().to_csv( - f'{out}/Metaproteomics/mp.spectracounts', sep='\t', index=False) + if len(mg_report) > 0: + mg_report = mg_report.groupby('Entry')[mg_report.columns.tolist()[1:]].sum().reset_index() + mg_report.to_csv(f'{out}/Quantification/mg_entry_quant.tsv', sep='\t', index=False) + if len(mt_report) > 0: + mt_report = mt_report.groupby('Entry')[mt_report.columns.tolist()[1:]].sum().reset_index() + mt_report.to_csv(f'{out}/Quantification/mt_entry_quant.tsv', sep='\t', index=False) + if len(mp_report) > 0: + mp_report = mp_report.groupby('Entry')[mp_report.columns.tolist()[1:]].sum().reset_index() + mp_report[mp_report[mp_report.columns.tolist()[1:]].isnull().sum( + axis=1) < len(mp_report.columns.tolist()[1:])].drop_duplicates().to_csv( + f'{out}/Metaproteomics/mp_entry_quant.tsv', sep='\t', index=False) + + +def write_dea_input(out, exps): + mt_samples = exps[exps['Data type'] == 'mrna']['Sample'].unique().tolist() + mp_samples = exps[exps['Data type'] == 'protein']['Sample'].unique().tolist() + if len(mt_samples) > 0: + col, samples, folder, term, tdata = 'Gene', mt_samples, 'Quantification', 'readcounts', 'mt' + else: + col, samples, folder, term, tdata = 'Main Accession', mp_samples, 'Metaproteomics', 'spectracounts', 'mp' + dea_input = pd.DataFrame(columns=[col]) + for sample in samples: + dea_input = pd.merge(dea_input, pd.read_csv(f'{out}/{folder}/{sample}_{tdata}.{term}', sep='\t'), how='outer', on=col) + dea_input.to_csv(f'{out}/Quantification/dea_input.tsv', sep='\t', index=False) def estimate_cog_for_entries(e_report): @@ -158,7 +168,7 @@ def get_lowest_otu(entry, tax_cols): i = 0 for col in tax_cols[::-1]: if not pd.isna(entry[col]): - if i == 0 or 'Candidatus' in entry[col]: + if 
i == 0 or 'Candidatus' in entry[col] or 'Other' in entry[col]: return entry[col] return f'Other {entry[col]}' i += 1 @@ -195,6 +205,9 @@ def make_entry_report(out, exps): timed_message('Writing KEGGcharter input.') entry_report.to_csv(f'{out}/keggcharter_input.tsv', sep='\t', index=False) timed_message('Writing Krona plots.') + for i in range(len(taxonomy_columns)): + entry_report[taxonomy_columns[i]] = entry_report.apply( + lambda x: get_lowest_otu(x, taxonomy_columns[:i + 1]), axis=1) entry_report['Taxonomic lineage (SPECIES)'] = entry_report.apply( lambda x: get_lowest_otu(x, taxonomy_columns), axis=1) write_kronas( @@ -208,8 +221,9 @@ def run(): if snakemake.params.report == 'protein': make_protein_reports(snakemake.params.output, exps) + write_dea_input(snakemake.params.output, exps) elif snakemake.params.report == 'entry': - make_entry_reports(snakemake.params.output, exps) + make_entry_report(snakemake.params.output, exps) if __name__ == '__main__': diff --git a/workflow/scripts/quantification.py b/workflow/scripts/quantification.py index ab322f4..c62c2e8 100644 --- a/workflow/scripts/quantification.py +++ b/workflow/scripts/quantification.py @@ -16,8 +16,8 @@ def run(): exps = pd.read_csv(snakemake.params.exps, sep='\t') for sample in set(exps['Sample']): - mg_result = pd.DataFrame(columns=['Contig']) - mt_result = pd.DataFrame(columns=['Gene']) + mg_result = mg_result_norm = pd.DataFrame(columns=['Contig']) + mt_result = mt_result_norm = pd.DataFrame(columns=['Gene']) pexps = exps[(exps['Sample'] == sample)] for i in pexps.index: if pexps.loc[i]['Data type'] == 'mrna': @@ -34,6 +34,9 @@ def run(): perform_alignment( reference, reads, f"{snakemake.params.output}/Quantification/{pexps.loc[i]['Name']}", threads=snakemake.threads) + counts = pd.read_csv( + f"{snakemake.params.output}/Quantification/{pexps.loc[i]['Name']}.readcounts", + sep='\t', names=['Gene' if pexps.loc[i]['Data type'] == 'mrna' else 'Contig', pexps.loc[i]['Name']]) normalize_counts_by_size( 
f"{snakemake.params.output}/Quantification/{pexps.loc[i]['Name']}.readcounts", reference) # Read the results of alignment and add them to the readcounts result at sample level @@ -41,14 +44,21 @@ def run(): f"{snakemake.params.output}/Quantification/{pexps.loc[i]['Name']}.readcounts.norm", sep='\t', names=['Gene' if pexps.loc[i]['Data type'] == 'mrna' else 'Contig', pexps.loc[i]['Name']]) if pexps.loc[i]['Data type'] == 'dna': - mg_result = pd.merge(mg_result, normalized_counts, how='outer', on='Contig') + mg_result = pd.merge(mg_result, counts, how='outer', on='Contig') + mg_result_norm = pd.merge(mg_result_norm, normalized_counts, how='outer', on='Contig') else: - mt_result = pd.merge(mt_result, normalized_counts, how='outer', on='Gene') + mt_result = pd.merge(mt_result, counts, how='outer', on='Gene') + mt_result_norm = pd.merge(mt_result_norm, normalized_counts, how='outer', on='Gene') if len(mg_result) > 0: - mg_result.to_csv(f"{snakemake.params.output}/Quantification/{sample}_mg_readcounts.tsv", sep='\t', index=False) + mg_result.to_csv( + f"{snakemake.params.output}/Quantification/{sample}_mg.readcounts", sep='\t', index=False) + mg_result_norm.to_csv( + f"{snakemake.params.output}/Quantification/{sample}_mg_norm.tsv", sep='\t', index=False) if len(mt_result) > 0: - mt_result.astype(int, errors='ignore').to_csv( - f"{snakemake.params.output}/Quantification/{sample}_mt_readcounts.tsv", sep='\t', index=False) + mt_result.to_csv( + f"{snakemake.params.output}/Quantification/{sample}_mt.readcounts", sep='\t', index=False) + mt_result_norm.astype(int, errors='ignore').to_csv( + f"{snakemake.params.output}/Quantification/{sample}_mt_norm.tsv", sep='\t', index=False) if __name__ == '__main__':