Skip to content

Commit

Permalink
Some more fixes to "sample" handling
Browse files Browse the repository at this point in the history
  • Loading branch information
iquasere committed Dec 7, 2023
1 parent 89bc83d commit ceddd63
Show file tree
Hide file tree
Showing 8 changed files with 75 additions and 74 deletions.
28 changes: 1 addition & 27 deletions workflow/rules/de_analysis.smk
Original file line number Diff line number Diff line change
@@ -1,32 +1,6 @@
rule make_dea_input:
    # Builds the differential-expression input table: maps each quantification
    # table's qseqid to a UniProt sseqid via the sample's BLAST alignment, then
    # keeps only entries quantified in every input file.
    input:
        f"{OUTPUT}/Quantification/mt_normalized.tsv" if len(mt_exps) > 0 else f"{OUTPUT}/Metaproteomics/mp_normalized.tsv"
    output:
        f"{OUTPUT}/DE_analysis/dea_input.tsv"
    threads:
        1
    run:
        # Accumulator seeded with only the join key; filled on the first pass.
        result = pd.DataFrame(columns=['sseqid'])
        for file in input:
            df = pd.read_csv(file, sep='\t')
            # NOTE(review): dirname of the input file is 'Quantification' or
            # 'Metaproteomics', not a sample name — the BLAST path below
            # presumably resolves to a non-existent Annotation subfolder;
            # confirm against the Annotation/{sample}/ layout.
            sample = os.path.basename(os.path.dirname(file))
            # parse only first two columns of blast (qseqid -> sseqid mapping)
            blast = pd.read_csv(
                f"{OUTPUT}/Annotation/{sample}/aligned.blast", sep='\t', usecols=[0,1], names=['qseqid', 'sseqid'])
            df = pd.merge(df, blast, how="left", on="qseqid")
            # Sum the quantification columns per sseqid (first column is the
            # qseqid key, last is the sseqid added by the merge above).
            df = df.groupby('sseqid')[df.columns.tolist()[1:-1]].sum()
            # remove rows of df with less than 2 non-missing values
            df = df[(df > 0.0).sum(axis=1) > 1]
            if len(result) > 0:
                # inner join to keep all samples from having more than 2 non-missing values
                result = pd.merge(result, df, how="inner", on="sseqid")
            else:
                # First file: outer join against the empty seed frame so all
                # of its entries are retained.
                result = pd.merge(result, df, how="outer", on="sseqid")
        # Zeros become NaN so downstream DE tooling treats them as missing.
        result.replace(0.0, np.nan).to_csv(output[0], sep='\t', index=False)

rule de_analysis:
input:
f"{OUTPUT}/DE_analysis/dea_input.tsv"
f"{OUTPUT}/Quantification/dea_input.tsv"
output:
f"{OUTPUT}/DE_analysis/condition_treated_results.tsv",
threads:
Expand Down
4 changes: 1 addition & 3 deletions workflow/rules/entry_report.smk
Original file line number Diff line number Diff line change
@@ -1,9 +1,7 @@
rule entry_report:
input:
p_reports = expand("{output}/MOSCA_{sample}_Protein_Report.tsv", output=OUTPUT, sample=set(mg_exps['Sample'])),
norm = (
expand("{output}/Quantification/mt_normalized.tsv", output=OUTPUT) +
expand("{output}/Metaproteomics/mp_normalized.tsv", output=OUTPUT))
norm = f"{OUTPUT}/Quantification/mt_normalized.tsv" if len(mt_exps) > 0 else f"{OUTPUT}/Metaproteomics/mp_normalized.tsv"
output:
f"{OUTPUT}/MOSCA_Entry_Report.xlsx",
f"{OUTPUT}/MOSCA_Entry_Report.tsv"
Expand Down
4 changes: 2 additions & 2 deletions workflow/rules/normalization.smk
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
rule normalization:
input:
f"{OUTPUT}/Quantification/mg_readcounts.tsv",
f"{OUTPUT}/Quantification/mt_readcounts.tsv" if len(mt_exps) > 0 else f"{OUTPUT}/Metaproteomics/mp_spectracounts.tsv"
f"{OUTPUT}/Quantification/mg_entry_quant.tsv",
f"{OUTPUT}/Quantification/mt_entry_quant.tsv" if len(mt_exps) > 0 else f"{OUTPUT}/Metaproteomics/mp_entry_quant.tsv"
output:
f"{OUTPUT}/Quantification/mg_normalized.tsv",
f"{OUTPUT}/Quantification/mt_normalized.tsv" if len(mt_exps) > 0 else f"{OUTPUT}/Metaproteomics/mp_normalized.tsv"
Expand Down
13 changes: 8 additions & 5 deletions workflow/rules/protein_report.smk
Original file line number Diff line number Diff line change
Expand Up @@ -2,14 +2,17 @@ rule protein_report:
input:
expand("{output}/Annotation/{sample}/UPIMAPI_results.tsv", output=OUTPUT, sample=set(EXPS['Sample'])),
expand("{output}/Annotation/{sample}/reCOGnizer_results.xlsx", output=OUTPUT, sample=set(EXPS["Sample"])),
expand("{output}/Quantification/{sample}_mg_readcounts.tsv", output=OUTPUT, sample=set(mg_exps['Sample'])),
expand("{output}/Quantification/{sample}_mt_readcounts.tsv", output=OUTPUT, sample=set(mt_exps['Sample'])),
expand("{output}/Metaproteomics/{sample}_mp_spectracounts.tsv", output=OUTPUT, sample=set(mp_exps['Sample']))
expand("{output}/Quantification/{sample}_mg.readcounts", output=OUTPUT, sample=set(mg_exps['Sample'])),
expand("{output}/Quantification/{sample}_mg_norm.tsv", output=OUTPUT, sample=set(mg_exps['Sample'])),
expand("{output}/Quantification/{sample}_mt.readcounts", output=OUTPUT, sample=set(mt_exps['Sample'])),
expand("{output}/Quantification/{sample}_mt_norm.tsv", output=OUTPUT, sample=set(mg_exps['Sample'])),
expand("{output}/Metaproteomics/{sample}_mp.spectracounts", output=OUTPUT, sample=set(mp_exps['Sample']))
output:
expand("{output}/MOSCA_{sample}_Protein_Report.tsv", output=OUTPUT, sample=set(mg_exps['Sample'])),
f"{OUTPUT}/MOSCA_Protein_Report.xlsx",
f"{OUTPUT}/Quantification/mg_readcounts.tsv",
f"{OUTPUT}/Quantification/mt_readcounts.tsv" if len(mt_exps) > 0 else f"{OUTPUT}/Metaproteomics/mp_spectracounts.tsv"
f"{OUTPUT}/Quantification/dea_input.tsv",
f"{OUTPUT}/Quantification/mg_entry_quant.tsv",
f"{OUTPUT}/Quantification/mt_entry_quant.tsv" if len(mt_exps) > 0 else f"{OUTPUT}/Metaproteomics/mp_entry_quant.tsv"
threads:
1
params:
Expand Down
6 changes: 4 additions & 2 deletions workflow/rules/quantification.smk
Original file line number Diff line number Diff line change
Expand Up @@ -6,8 +6,10 @@ rule quantification:
expand("{output}/Annotation/{sample}/fgs.ffn", output=OUTPUT, sample=set(EXPS["Sample"]))
output:
expand("{output}/Quantification/{name}.readcounts", output=OUTPUT, name=set(not_mp_exps['Name'])),
expand("{output}/Quantification/{sample}_mg_readcounts.tsv", output=OUTPUT, sample=set(mg_exps['Sample'])),
expand("{output}/Quantification/{sample}_mt_readcounts.tsv", output=OUTPUT, sample=set(mt_exps['Sample']))
expand("{output}/Quantification/{sample}_mg.readcounts", output=OUTPUT, sample=set(mg_exps['Sample'])),
expand("{output}/Quantification/{sample}_mg_norm.tsv", output=OUTPUT, sample=set(mg_exps['Sample'])),
expand("{output}/Quantification/{sample}_mt.readcounts", output=OUTPUT, sample=set(mt_exps['Sample'])),
expand("{output}/Quantification/{sample}_mt_norm.tsv", output=OUTPUT, sample=set(mg_exps['Sample']))
threads:
config["threads"]
params:
Expand Down
2 changes: 1 addition & 1 deletion workflow/rules/summary_report.smk
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
rule summary_report:
input:
f"{OUTPUT}/MOSCA_{sample}_Protein_Report.tsv",
expand("{output}/MOSCA_{sample}_Protein_Report.tsv", output=OUTPUT, sample=set(EXPS['Sample'])),
f"{OUTPUT}/MOSCA_Entry_Report.xlsx",
f"{OUTPUT}/DE_analysis/condition_treated_results.tsv"
output:
Expand Down
68 changes: 41 additions & 27 deletions workflow/scripts/main_reports.py
Original file line number Diff line number Diff line change
Expand Up @@ -39,34 +39,28 @@ def make_protein_report(out, exps, sample, mg_preport, mt_preport, mp_preport):
mg_names = exps[(exps['Sample'] == sample) & (exps['Data type'] == 'dna')]['Name'].tolist()
mt_names = exps[(exps['Sample'] == sample) & (exps['Data type'] == 'mrna')]['Name'].tolist()
mp_names = exps[(exps['Sample'] == sample) & (exps['Data type'] == 'protein')]['Name'].tolist()
for mg_name in mg_names:
readcounts = pd.read_csv(
f'{out}/Quantification/{mg_name}.readcounts.norm', sep='\t', header=None, names=['Contig', mg_name])
readcounts['Contig'] = readcounts['Contig'].apply(lambda x: x.split('_')[1])
report = pd.merge(report, readcounts, on='Contig', how='left')
for mt_name in mt_names:
readcounts = pd.read_csv(
f'{out}/Quantification/{mt_name}.readcounts.norm', sep='\t', header=None, names=['qseqid', mt_name])
report = pd.merge(report, readcounts, on='qseqid', how='left')
if len(mp_names) > 0:
spectracounts = pd.read_csv(f'{out}/Metaproteomics/{sample}/spectracounts.tsv', sep='\t')
spectracounts.rename(columns={'Main Accession': 'qseqid'}, inplace=True)
report = pd.merge(report, spectracounts, on='qseqid', how='left')
if len(mg_names) > 0:
mg_counts = pd.read_csv(f'{out}/Quantification/{sample}_mg_norm.tsv', sep='\t')
mg_counts['Contig'] = mg_counts['Contig'].apply(lambda x: x.split('_')[1])
report = pd.merge(report, mg_counts, on='Contig', how='left')
mg_preport = pd.merge(mg_preport, report[['Entry'] + mg_names], on='Entry', how='outer')
if len(mt_names) > 0:
report = pd.merge(
report, pd.read_csv(f'{out}/Quantification/{sample}_mt_norm.tsv', sep='\t', names=[
'qseqid'] + mt_names), on='qseqid', how='left')
mt_preport = pd.merge(mt_preport, report[['Entry'] + mt_names], on='Entry', how='outer')
if len(mp_names) > 0:
spectracounts = pd.read_csv(f'{out}/Metaproteomics/{sample}/spectracounts.tsv', sep='\t')
spectracounts.rename(columns={'Main Accession': 'qseqid'}, inplace=True)
report = pd.merge(report, spectracounts, on='qseqid', how='left')
mp_preport = pd.merge(mp_preport, report[['Entry'] + mp_names], on='Entry', how='outer')
report[mg_names + mt_names + mp_names] = report[mg_names + mt_names + mp_names].fillna(value=0).astype(int)
report[mg_names + mt_names + mp_names] = report[mg_names + mt_names + mp_names].fillna(value=0).astype(float).astype(int)
report.to_csv(f'{out}/MOSCA_{sample}_Protein_Report.tsv', sep='\t', index=False)
return report, mg_preport, mt_preport, mp_preport


def make_protein_reports(out, exps, max_lines=1000000):
mg_report = pd.DataFrame(columns=['Entry'])
mt_report = pd.DataFrame(columns=['Entry'])
mp_report = pd.DataFrame(columns=['Entry'])
mg_report = mt_report = mp_report = pd.DataFrame(columns=['Entry'])
writer = pd.ExcelWriter(f'{out}/MOSCA_Protein_Report.xlsx', engine='xlsxwriter')
for sample in set(exps['Sample']):
report, mg_report, mt_report, mp_report = make_protein_report(
Expand All @@ -82,14 +76,30 @@ def make_protein_reports(out, exps, max_lines=1000000):
timed_message(f'Wrote Protein Report for sample: {sample}.')
writer.close()
# Write quantification matrices to normalize all together
mg_report = mg_report.groupby('Entry')[mg_report.columns.tolist()[1:]].sum().reset_index()
mt_report = mt_report.groupby('Entry')[mt_report.columns.tolist()[1:]].sum().reset_index()
mp_report = mp_report.groupby('Entry')[mp_report.columns.tolist()[1:]].sum().reset_index()
mg_report.to_csv(f'{out}/Quantification/mg_entries.tsv', sep='\t', index=False)
mt_report.to_csv(f'{out}/Quantification/mt_entries.tsv', sep='\t', index=False)
mp_report[mp_report[mp_report.columns.tolist()[1:]].isnull().sum(
axis=1) < len(mp_report.columns.tolist()[1:])].drop_duplicates().to_csv(
f'{out}/Metaproteomics/mp.spectracounts', sep='\t', index=False)
if len(mg_report) > 0:
mg_report = mg_report.groupby('Entry')[mg_report.columns.tolist()[1:]].sum().reset_index()
mg_report.to_csv(f'{out}/Quantification/mg_entry_quant.tsv', sep='\t', index=False)
if len(mt_report) > 0:
mt_report = mt_report.groupby('Entry')[mt_report.columns.tolist()[1:]].sum().reset_index()
mt_report.to_csv(f'{out}/Quantification/mt_entry_quant.tsv', sep='\t', index=False)
if len(mp_report) > 0:
mp_report = mp_report.groupby('Entry')[mp_report.columns.tolist()[1:]].sum().reset_index()
mp_report[mp_report[mp_report.columns.tolist()[1:]].isnull().sum(
axis=1) < len(mp_report.columns.tolist()[1:])].drop_duplicates().to_csv(
f'{out}/Metaproteomics/mp_entry_quant', sep='\t', index=False)


def write_dea_input(out, exps):
    """
    Build the input matrix for differential expression analysis by merging the
    per-sample count files into a single table.

    Uses metatranscriptomics readcounts when any 'mrna' experiment exists,
    otherwise falls back to metaproteomics spectracounts.

    :param out: str - output directory of the MOSCA run
    :param exps: pd.DataFrame - experiments table with at least the
        'Sample' and 'Data type' columns
    """
    # Bug fix: the original filtered on an undefined `sample` variable (it was
    # only bound later, inside the loop). The DEA input must cover every
    # sample of the relevant data type, so select per data type only.
    mt_samples = exps[exps['Data type'] == 'mrna']['Sample'].unique().tolist()
    mp_samples = exps[exps['Data type'] == 'protein']['Sample'].unique().tolist()
    if len(mt_samples) > 0:
        col, samples, folder, term, tdata = 'Gene', mt_samples, 'Quantification', 'readcounts', 'mt'
    else:
        col, samples, folder, term, tdata = 'Main Accession', mp_samples, 'Metaproteomics', 'spectracounts', 'mp'
    dea_input = pd.DataFrame(columns=[col])
    for sample in samples:
        # Bug fix: the original called pd.merge with a single argument (which
        # raises TypeError) and never accumulated anything. Merge each
        # sample's counts into the running table on the key column. The count
        # files are tab-separated, so sep='\t' is required when reading.
        counts = pd.read_csv(f'{out}/{folder}/{sample}_{tdata}.{term}', sep='\t')
        dea_input = pd.merge(dea_input, counts, how='outer', on=col)
    # Bug fix: write where rule de_analysis expects its input
    # ({OUTPUT}/Quantification/dea_input.tsv), and drop the meaningless
    # integer index for consistency with the other to_csv calls in this file.
    dea_input.to_csv(f'{out}/Quantification/dea_input.tsv', sep='\t', index=False)


def estimate_cog_for_entries(e_report):
Expand Down Expand Up @@ -158,7 +168,7 @@ def get_lowest_otu(entry, tax_cols):
i = 0
for col in tax_cols[::-1]:
if not pd.isna(entry[col]):
if i == 0 or 'Candidatus' in entry[col]:
if i == 0 or 'Candidatus' in entry[col] or 'Other' in entry[col]:
return entry[col]
return f'Other {entry[col]}'
i += 1
Expand Down Expand Up @@ -195,6 +205,9 @@ def make_entry_report(out, exps):
timed_message('Writing KEGGcharter input.')
entry_report.to_csv(f'{out}/keggcharter_input.tsv', sep='\t', index=False)
timed_message('Writing Krona plots.')
for i in range(len(taxonomy_columns)):
entry_report[taxonomy_columns[i]] = entry_report.apply(
lambda x: get_lowest_otu(x, taxonomy_columns[:i + 1]), axis=1)
entry_report['Taxonomic lineage (SPECIES)'] = entry_report.apply(
lambda x: get_lowest_otu(x, taxonomy_columns), axis=1)
write_kronas(
Expand All @@ -208,8 +221,9 @@ def run():

if snakemake.params.report == 'protein':
make_protein_reports(snakemake.params.output, exps)
write_dea_input(snakemake.params.output, exps)
elif snakemake.params.report == 'entry':
make_entry_reports(snakemake.params.output, exps)
make_entry_report(snakemake.params.output, exps)


if __name__ == '__main__':
Expand Down
24 changes: 17 additions & 7 deletions workflow/scripts/quantification.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,8 +16,8 @@ def run():
exps = pd.read_csv(snakemake.params.exps, sep='\t')

for sample in set(exps['Sample']):
mg_result = pd.DataFrame(columns=['Contig'])
mt_result = pd.DataFrame(columns=['Gene'])
mg_result = mg_result_norm = pd.DataFrame(columns=['Contig'])
mt_result = mt_result_norm = pd.DataFrame(columns=['Gene'])
pexps = exps[(exps['Sample'] == sample)]
for i in pexps.index:
if pexps.loc[i]['Data type'] == 'mrna':
Expand All @@ -34,21 +34,31 @@ def run():
perform_alignment(
reference, reads, f"{snakemake.params.output}/Quantification/{pexps.loc[i]['Name']}",
threads=snakemake.threads)
counts = pd.read_csv(
f"{snakemake.params.output}/Quantification/{pexps.loc[i]['Name']}.readcounts",
sep='\t', names=['Gene' if pexps.loc[i]['Data type'] == 'mrna' else 'Contig', pexps.loc[i]['Name']])
normalize_counts_by_size(
f"{snakemake.params.output}/Quantification/{pexps.loc[i]['Name']}.readcounts", reference)
# Read the results of alignment and add them to the readcounts result at sample level
normalized_counts = pd.read_csv(
f"{snakemake.params.output}/Quantification/{pexps.loc[i]['Name']}.readcounts.norm",
sep='\t', names=['Gene' if pexps.loc[i]['Data type'] == 'mrna' else 'Contig', pexps.loc[i]['Name']])
if pexps.loc[i]['Data type'] == 'dna':
mg_result = pd.merge(mg_result, normalized_counts, how='outer', on='Contig')
mg_result = pd.merge(mg_result, counts, how='outer', on='Contig')
mg_result_norm = pd.merge(mg_result_norm, normalized_counts, how='outer', on='Contig')
else:
mt_result = pd.merge(mt_result, normalized_counts, how='outer', on='Gene')
mt_result = pd.merge(mt_result, counts, how='outer', on='Gene')
mt_result_norm = pd.merge(mt_result_norm, normalized_counts, how='outer', on='Gene')
if len(mg_result) > 0:
mg_result.to_csv(f"{snakemake.params.output}/Quantification/{sample}_mg_readcounts.tsv", sep='\t', index=False)
mg_result.to_csv(
f"{snakemake.params.output}/Quantification/{sample}_mg.readcounts", sep='\t', index=False)
mg_result_norm.to_csv(
f"{snakemake.params.output}/Quantification/{sample}_mg_norm.tsv", sep='\t', index=False)
if len(mt_result) > 0:
mt_result.astype(int, errors='ignore').to_csv(
f"{snakemake.params.output}/Quantification/{sample}_mt_readcounts.tsv", sep='\t', index=False)
mt_result.to_csv(
f"{snakemake.params.output}/Quantification/{sample}_mt.readcounts", sep='\t', index=False)
mt_result_norm.astype(int, errors='ignore').to_csv(
f"{snakemake.params.output}/Quantification/{sample}_mt_norm.tsv", sep='\t', index=False)


if __name__ == '__main__':
Expand Down

0 comments on commit ceddd63

Please sign in to comment.