From 63a447438eab88682a9dd9be6a46badad838fdba Mon Sep 17 00:00:00 2001 From: bbimber Date: Wed, 2 Jul 2025 15:53:54 -0700 Subject: [PATCH 01/20] Allow subset to use dplyr --- .../pipeline/singlecell/SubsetSeurat.java | 33 ++++++++++++------- 1 file changed, 22 insertions(+), 11 deletions(-) diff --git a/singlecell/src/org/labkey/singlecell/pipeline/singlecell/SubsetSeurat.java b/singlecell/src/org/labkey/singlecell/pipeline/singlecell/SubsetSeurat.java index 13ae96797..9ec09710e 100644 --- a/singlecell/src/org/labkey/singlecell/pipeline/singlecell/SubsetSeurat.java +++ b/singlecell/src/org/labkey/singlecell/pipeline/singlecell/SubsetSeurat.java @@ -9,7 +9,6 @@ import org.labkey.api.singlecell.pipeline.SingleCellStep; import java.util.ArrayList; -import java.util.Arrays; import java.util.Collection; import java.util.HashSet; import java.util.List; @@ -34,7 +33,8 @@ public Provider() put("height", 150); put("width", 600); put("delimiter", DELIM); - }}, null) + }}, null), + ToolParameterDescriptor.create("useDplyr", "Use dplyr", "If checked, the subset will be executed using dplyr::filter rather than Seurat::subset. This should allow more complex expressions to be used, including negations", "checkbox", null, false) ), List.of("/sequenceanalysis/field/TrimmingTextArea.js"), null); } @@ -71,6 +71,9 @@ protected List loadChunkFromFile() throws PipelineJobException final String[] values = val.split(DELIM); + ToolParameterDescriptor pd2 = getProvider().getParameterByName("useDplyr"); + final boolean useDplyr = pd2.extractValue(getPipelineCtx().getJob(), getProvider(), getStepIdx(), Boolean.class, false); + List ret = new ArrayList<>(); for (String line : super.loadChunkFromFile()) { @@ -82,15 +85,23 @@ protected List loadChunkFromFile() throws PipelineJobException ret.add("\tif (!is.null(seuratObj)) {"); ret.add("\tprint(paste0('Subsetting dataset: ', datasetId, ' with the expression: " + subsetEscaped + "'))"); - ret.add("\t\tcells <- c()"); - ret.add("\t\ttryCatch({"); - ret.add("\t\t\tcells <- WhichCells(seuratObj, expression = " + subset + ")"); - ret.add("\t\t}, error = function(e){"); - ret.add("\t\t\tif (!is.null(e) && e$message == 'Cannot find cells provided') {"); - ret.add("\t\t\t\tprint(paste0('There were no cells remaining after the subset: ', '" + subsetEscaped + "'))"); - ret.add("\t\t\t}"); - ret.add("\t\t})"); - ret.add(""); + if (useDplyr) + { + ret.add("\t\t\tcells <- rownames(seuratObj@meta.data %>% dplyr::filter( " + subset + " ))"); + } + else + { + ret.add("\t\tcells <- c()"); + ret.add("\t\ttryCatch({"); + ret.add("\t\t\tcells <- WhichCells(seuratObj, expression = " + subset + ")"); + ret.add("\t\t}, error = function(e){"); + ret.add("\t\t\tif (!is.null(e) && e$message == 'Cannot find cells provided') {"); + ret.add("\t\t\t\tprint(paste0('There were no cells remaining after the subset: ', '" + subsetEscaped + "'))"); + ret.add("\t\t\t}"); + ret.add("\t\t})"); + ret.add(""); + } + ret.add("\t\tif (length(cells) == 0) {"); ret.add("\t\t\tprint(paste0('There were no cells after subsetting for dataset: ', datasetId, ', with subset: ', '" + subsetEscaped + "'))"); ret.add("\t\t\tseuratObj <- NULL"); From 5f7d8c32c1cfd665956eaea410466c84f99bf638 Mon Sep 17 00:00:00 2001 From: bbimber Date: Wed, 2 Jul 2025 16:26:53 -0700 Subject: [PATCH 02/20] Bugfix to vireo when using donor file --- .../pipeline/singlecell/VireoHandler.java | 70 ++++++++++++++----- 1 file changed, 54 insertions(+), 16 deletions(-) diff --git a/singlecell/src/org/labkey/singlecell/pipeline/singlecell/VireoHandler.java b/singlecell/src/org/labkey/singlecell/pipeline/singlecell/VireoHandler.java index eb7ae055d..28da76fae 100644 --- a/singlecell/src/org/labkey/singlecell/pipeline/singlecell/VireoHandler.java +++ b/singlecell/src/org/labkey/singlecell/pipeline/singlecell/VireoHandler.java @@ -12,6 +12,7 @@ import org.labkey.api.sequenceanalysis.SequenceAnalysisService; import org.labkey.api.sequenceanalysis.SequenceOutputFile; import org.labkey.api.sequenceanalysis.pipeline.AbstractParameterizedOutputHandler; +import org.labkey.api.sequenceanalysis.pipeline.BcftoolsRunner; import org.labkey.api.sequenceanalysis.pipeline.ReferenceGenome; import org.labkey.api.sequenceanalysis.pipeline.SequenceAnalysisJobSupport; import org.labkey.api.sequenceanalysis.pipeline.SequenceOutputHandler; @@ -256,6 +257,49 @@ public void processFilesRemote(List inputFiles, JobContext c } } + File cellSnpBaseVcf = new File(cellsnpDir, "cellSNP.base.vcf.gz"); + if (!cellSnpBaseVcf.exists()) + { + throw new PipelineJobException("Unable to find cellsnp base VCF"); + } + + + File cellSnpCellsVcf = new File(cellsnpDir, "cellSNP.cells.vcf.gz"); + if (!cellSnpCellsVcf.exists()) + { + throw new PipelineJobException("Unable to find cellsnp calls VCF"); + } + + sortAndFixVcf(cellSnpBaseVcf, genome, ctx.getLogger()); + sortAndFixVcf(cellSnpCellsVcf, genome, ctx.getLogger()); + + int vcfFile = ctx.getParams().optInt(REF_VCF, -1); + File refVcfSubset = null; + if (vcfFile > -1) + { + File vcf = ctx.getSequenceSupport().getCachedData(vcfFile); + if (vcf == null || !vcf.exists()) + { + throw new PipelineJobException("Unable to find file with ID: " + vcfFile); + } + + refVcfSubset = new File(ctx.getWorkingDirectory(), vcf.getName()); + BcftoolsRunner bcftoolsRunner = new BcftoolsRunner(ctx.getLogger()); + bcftoolsRunner.execute(Arrays.asList( + BcftoolsRunner.getBcfToolsPath().getAbsolutePath(), + "view", + vcf.getPath(), + "-R", + cellSnpCellsVcf.getPath(), + "-Oz", + "-o", + refVcfSubset.getPath() + )); + + ctx.getFileManager().addIntermediateFile(refVcfSubset); + ctx.getFileManager().addIntermediateFile(new File(refVcfSubset.getPath() + ".tbi")); + } + List vireo = new ArrayList<>(); vireo.add("vireo"); vireo.add("-c"); @@ -277,6 +321,12 @@ public void processFilesRemote(List inputFiles, JobContext c throw new PipelineJobException("Must provide nDonors"); } + if (refVcfSubset != null) + { + vireo.add("-d"); + vireo.add(refVcfSubset.getPath()); + } + vireo.add("-N"); vireo.add(String.valueOf(nDonors)); @@ -312,25 +362,13 @@ else if (outFiles.length > 1) so.setName(inputFiles.get(0).getName() + ": Vireo Demultiplexing"); } so.setCategory("Vireo Demultiplexing"); + if (vcfFile > -1) + { + so.setDescription("Reference VCF ID: " + vcfFile); + } ctx.addSequenceOutput(so); } - File cellSnpBaseVcf = new File(cellsnpDir, "cellSNP.base.vcf.gz"); - if (!cellSnpBaseVcf.exists()) - { - throw new PipelineJobException("Unable to find cellsnp base VCF"); - } - - - File cellSnpCellsVcf = new File(cellsnpDir, "cellSNP.cells.vcf.gz"); - if (!cellSnpCellsVcf.exists()) - { - throw new PipelineJobException("Unable to find cellsnp calls VCF"); - } - - sortAndFixVcf(cellSnpBaseVcf, genome, ctx.getLogger()); - sortAndFixVcf(cellSnpCellsVcf, genome, ctx.getLogger()); - if (storeCellSnpVcf) { SequenceOutputFile so = new SequenceOutputFile(); From c83a8e94e35cc2dc256fad0cf70dedab312c676b Mon Sep 17 00:00:00 2001 From: bbimber Date: Wed, 2 Jul 2025 16:41:53 -0700 Subject: [PATCH 03/20] Avoid dplyr loading --- .../org/labkey/singlecell/pipeline/singlecell/SubsetSeurat.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/singlecell/src/org/labkey/singlecell/pipeline/singlecell/SubsetSeurat.java b/singlecell/src/org/labkey/singlecell/pipeline/singlecell/SubsetSeurat.java index 9ec09710e..7a646eb38 100644 --- a/singlecell/src/org/labkey/singlecell/pipeline/singlecell/SubsetSeurat.java +++ b/singlecell/src/org/labkey/singlecell/pipeline/singlecell/SubsetSeurat.java @@ -87,7 +87,7 @@ protected List loadChunkFromFile() throws PipelineJobException ret.add("\tprint(paste0('Subsetting dataset: ', datasetId, ' with the expression: " + subsetEscaped + "'))"); if (useDplyr) { - ret.add("\t\t\tcells <- rownames(seuratObj@meta.data %>% dplyr::filter( " + subset + " ))"); + ret.add("\t\t\tcells <- rownames(seuratObj@meta.data |> dplyr::filter( " + subset + " ))"); } else { From 16e9682b42f78ecb85b5cc5827da8dce2d2ddf23 Mon Sep 17 00:00:00 2001 From: bbimber Date: Thu, 3 Jul 2025 05:29:30 -0700 Subject: [PATCH 04/20] Handle missing values --- singlecell/resources/chunks/FindClustersAndDimRedux.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/singlecell/resources/chunks/FindClustersAndDimRedux.R b/singlecell/resources/chunks/FindClustersAndDimRedux.R index b7fdc7c0f..175472c6c 100644 --- a/singlecell/resources/chunks/FindClustersAndDimRedux.R +++ b/singlecell/resources/chunks/FindClustersAndDimRedux.R @@ -19,7 +19,7 @@ for (datasetId in names(seuratObjects)) { printName(datasetId) seuratObj <- readSeuratRDS(seuratObjects[[datasetId]]) - if (all(is.null(clusterResolutions))) { + if (all(is.null(clusterResolutions)) || clusterResolutions == '') { clusterResolutions <- c(0.2, 0.4, 0.6, 0.8, 1.2) } else if (is.character(clusterResolutions)) { clusterResolutionsOrig <- clusterResolutions From a45c0c0cf60d5de04b749d377a4fcbab3a5aad32 Mon Sep 17 00:00:00 2001 From: bbimber Date: Thu, 3 Jul 2025 06:03:24 -0700 Subject: [PATCH 05/20] Bugfix to FindClustersAndDimRedux --- .../chunks/FindClustersAndDimRedux.R | 30 ++++++++++--------- 1 file changed, 16 insertions(+), 14 deletions(-) diff --git a/singlecell/resources/chunks/FindClustersAndDimRedux.R b/singlecell/resources/chunks/FindClustersAndDimRedux.R index 175472c6c..baba68718 100644 --- a/singlecell/resources/chunks/FindClustersAndDimRedux.R +++ b/singlecell/resources/chunks/FindClustersAndDimRedux.R @@ -15,24 +15,26 @@ if (!reticulate::py_module_available(module = 'leidenalg')) { } } +if (all(is.null(clusterResolutions)) || clusterResolutions == '') { + clusterResolutions <- c(0.2, 0.4, 0.6, 0.8, 1.2) +} else if (is.character(clusterResolutions)) { + clusterResolutionsOrig <- clusterResolutions + clusterResolutions <- gsub(clusterResolutions, pattern = ' ', replacement = '') + clusterResolutions <- unlist(strsplit(clusterResolutions, split = ',')) + clusterResolutions <- as.numeric(clusterResolutions) + if (any(is.na(clusterResolutions))) { + stop(paste0('Some values for clusterResolutions were not numeric: ', clusterResolutionsOrig)) + } +else if (is.numeric(clusterResolutions)) { + # No action needed +} else { + stop('Must provide a value for clusterResolutions') +} + for (datasetId in names(seuratObjects)) { printName(datasetId) seuratObj <- readSeuratRDS(seuratObjects[[datasetId]]) - if (all(is.null(clusterResolutions)) || clusterResolutions == '') { - clusterResolutions <- c(0.2, 0.4, 0.6, 0.8, 1.2) - } else if (is.character(clusterResolutions)) { - clusterResolutionsOrig <- clusterResolutions - clusterResolutions <- gsub(clusterResolutions, pattern = ' ', replacement = '') - clusterResolutions <- unlist(strsplit(clusterResolutions, split = ',')) - clusterResolutions <- as.numeric(clusterResolutions) - if (any(is.na(clusterResolutions))) { - stop(paste0('Some values for clusterResolutions were not numeric: ', clusterResolutionsOrig)) - } - } else { - stop('Must provide a value for clusterResolutions') - } - seuratObj <- CellMembrane::FindClustersAndDimRedux(seuratObj, minDimsToUse = minDimsToUse, useLeiden = useLeiden, clusterResolutions = clusterResolutions) saveData(seuratObj, datasetId) From 5a42760aa647544ab800494364c78856ae755604 Mon Sep 17 00:00:00 2001 From: bbimber Date: Thu, 3 Jul 2025 15:52:32 -0700 Subject: [PATCH 06/20] Bugfix to FindClustersAndDimRedux --- singlecell/resources/chunks/FindClustersAndDimRedux.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/singlecell/resources/chunks/FindClustersAndDimRedux.R b/singlecell/resources/chunks/FindClustersAndDimRedux.R index baba68718..47ecfa6c2 100644 --- a/singlecell/resources/chunks/FindClustersAndDimRedux.R +++ b/singlecell/resources/chunks/FindClustersAndDimRedux.R @@ -25,7 +25,7 @@ if (all(is.null(clusterResolutions)) || clusterResolutions == '') { if (any(is.na(clusterResolutions))) { stop(paste0('Some values for clusterResolutions were not numeric: ', clusterResolutionsOrig)) } -else if (is.numeric(clusterResolutions)) { +} else if (is.numeric(clusterResolutions)) { # No action needed } else { stop('Must provide a value for clusterResolutions') From 057d0276ffefe93103a4fcc64adb26b1df439fd9 Mon Sep 17 00:00:00 2001 From: bbimber Date: Tue, 8 Jul 2025 20:42:07 -0700 Subject: [PATCH 07/20] Update vireo args --- .../singlecell/pipeline/singlecell/VireoHandler.java | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/singlecell/src/org/labkey/singlecell/pipeline/singlecell/VireoHandler.java b/singlecell/src/org/labkey/singlecell/pipeline/singlecell/VireoHandler.java index 28da76fae..b37f73ac2 100644 --- a/singlecell/src/org/labkey/singlecell/pipeline/singlecell/VireoHandler.java +++ b/singlecell/src/org/labkey/singlecell/pipeline/singlecell/VireoHandler.java @@ -202,12 +202,6 @@ public void processFilesRemote(List inputFiles, JobContext c cellsnp.add(maxThreads.toString()); } - cellsnp.add("--minMAF"); - cellsnp.add("0.1"); - - cellsnp.add("--minCOUNT"); - cellsnp.add("100"); - String maxDepth = StringUtils.trimToNull(ctx.getParams().optString("maxDepth")); if (maxDepth != null) { @@ -240,6 +234,12 @@ public void processFilesRemote(List inputFiles, JobContext c cellsnp.add("--chrom"); cellsnp.add(contigs); } + + cellsnp.add("--minMAF"); + cellsnp.add("0.1"); + + cellsnp.add("--minCOUNT"); + cellsnp.add("100"); } new SimpleScriptWrapper(ctx.getLogger()).execute(cellsnp); From 77203cec29edbc9812d4a1f6b25eb05d79f1307a Mon Sep 17 00:00:00 2001 From: bbimber Date: Tue, 8 Jul 2025 21:44:12 -0700 Subject: [PATCH 08/20] Update vireo args --- .../pipeline/singlecell/VireoHandler.java | 28 +++++++++++-------- 1 file changed, 17 insertions(+), 11 deletions(-) diff --git a/singlecell/src/org/labkey/singlecell/pipeline/singlecell/VireoHandler.java b/singlecell/src/org/labkey/singlecell/pipeline/singlecell/VireoHandler.java index b37f73ac2..341126b95 100644 --- a/singlecell/src/org/labkey/singlecell/pipeline/singlecell/VireoHandler.java +++ b/singlecell/src/org/labkey/singlecell/pipeline/singlecell/VireoHandler.java @@ -41,8 +41,8 @@ public class VireoHandler extends AbstractParameterizedOutputHandler(PageFlowUtil.set("sequenceanalysis/field/SequenceOutputFileSelectorField.js")), Arrays.asList( - ToolParameterDescriptor.create("nDonors", "# Donors", "The number of donors to demultiplex", "ldk-integerfield", new JSONObject(){{ - put("allowBlank", false); + ToolParameterDescriptor.create("nDonors", "# Donors", "The number of donors to demultiplex. This can be blank only if a reference VCF is provided.", "ldk-integerfield", new JSONObject(){{ + }}, null), ToolParameterDescriptor.create("maxDepth", "Max Depth", "At a position, read maximally INT reads per input file, to avoid excessive memory usage", "ldk-integerfield", new JSONObject(){{ put("minValue", 0); @@ -112,6 +112,13 @@ public void init(JobContext ctx, List inputFiles, List inputFiles, JobContext c File barcodesGz = getBarcodesFile(inputFiles.get(0).getFile()); File bam = getBamFile(inputFiles.get(0).getFile()); - File barcodes = new File(ctx.getWorkingDirectory(), "barcodes.csv"); + File barcodes = new File(ctx.getWorkingDirectory(), "barcodes.tsv"); try (BufferedReader reader = IOUtil.openFileForBufferedUtf8Reading(barcodesGz); PrintWriter writer = PrintWriters.getPrintWriter(barcodes)) { String line; @@ -314,21 +321,20 @@ public void processFilesRemote(List inputFiles, JobContext c vireo.add("-o"); vireo.add(ctx.getWorkingDirectory().getPath()); - int nDonors = ctx.getParams().optInt("nDonors", 0); boolean storeCellSnpVcf = ctx.getParams().optBoolean("storeCellSnpVcf", false); - if (nDonors == 0) - { - throw new PipelineJobException("Must provide nDonors"); - } - if (refVcfSubset != null) { vireo.add("-d"); vireo.add(refVcfSubset.getPath()); } - vireo.add("-N"); - vireo.add(String.valueOf(nDonors)); + // Note: this value should be checked in init(). It is required only if refVCF is null + int nDonors = ctx.getParams().optInt("nDonors", -1); + if (nDonors > -1) + { + vireo.add("-N"); + vireo.add(String.valueOf(nDonors)); + } if (nDonors == 1) { From 392881bd7fcc764e654e564025f17a1e22a9d96b Mon Sep 17 00:00:00 2001 From: bbimber Date: Wed, 9 Jul 2025 07:39:25 -0700 Subject: [PATCH 09/20] Reduce logging --- .../sequenceanalysis/run/AbstractCommandWrapper.java | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/SequenceAnalysis/api-src/org/labkey/api/sequenceanalysis/run/AbstractCommandWrapper.java b/SequenceAnalysis/api-src/org/labkey/api/sequenceanalysis/run/AbstractCommandWrapper.java index e9b0df727..74e665c07 100644 --- a/SequenceAnalysis/api-src/org/labkey/api/sequenceanalysis/run/AbstractCommandWrapper.java +++ b/SequenceAnalysis/api-src/org/labkey/api/sequenceanalysis/run/AbstractCommandWrapper.java @@ -44,6 +44,7 @@ abstract public class AbstractCommandWrapper implements CommandWrapper private File _outputDir = null; private File _workingDir = null; private Logger _log = null; + private boolean _logPath = false; private Level _logLevel = Level.DEBUG; private boolean _warnNonZeroExits = true; private boolean _throwNonZeroExits = true; @@ -229,11 +230,19 @@ private void setPath(ProcessBuilder pb) path = fileExe.getParent() + File.pathSeparatorChar + path; } - getLogger().debug("using path: " + path); + if (_logPath) + { + getLogger().debug("using path: " + path); + } pb.environment().put("PATH", path); } } + public void setLogPath(boolean logPath) + { + _logPath = logPath; + } + public void setOutputDir(File outputDir) { _outputDir = outputDir; From c57d7e792afc530d6dcc0fd03e927465cc408f7f Mon Sep 17 00:00:00 2001 From: bbimber Date: Wed, 9 Jul 2025 07:56:23 -0700 Subject: [PATCH 10/20] Defer sorting of cellsnp VCF --- .../pipeline/singlecell/VireoHandler.java | 28 ++++++++++++------- 1 file changed, 18 insertions(+), 10 deletions(-) diff --git a/singlecell/src/org/labkey/singlecell/pipeline/singlecell/VireoHandler.java b/singlecell/src/org/labkey/singlecell/pipeline/singlecell/VireoHandler.java index 341126b95..42db6bc8a 100644 --- a/singlecell/src/org/labkey/singlecell/pipeline/singlecell/VireoHandler.java +++ b/singlecell/src/org/labkey/singlecell/pipeline/singlecell/VireoHandler.java @@ -277,9 +277,6 @@ public void processFilesRemote(List inputFiles, JobContext c throw new PipelineJobException("Unable to find cellsnp calls VCF"); } - sortAndFixVcf(cellSnpBaseVcf, genome, ctx.getLogger()); - sortAndFixVcf(cellSnpCellsVcf, genome, ctx.getLogger()); - int vcfFile = ctx.getParams().optInt(REF_VCF, -1); File refVcfSubset = null; if (vcfFile > -1) @@ -377,33 +374,44 @@ else if (outFiles.length > 1) if (storeCellSnpVcf) { + File fixedVcf = sortAndFixVcf(cellSnpBaseVcf, genome, ctx.getLogger(), ctx.getWorkingDirectory()); + SequenceOutputFile so = new SequenceOutputFile(); so.setReadset(inputFiles.get(0).getReadset()); so.setLibrary_id(inputFiles.get(0).getLibrary_id()); - so.setFile(cellSnpCellsVcf); + so.setFile(fixedVcf); if (so.getReadset() != null) { so.setName(ctx.getSequenceSupport().getCachedReadset(so.getReadset()).getName() + ": Cellsnp-lite VCF"); } else { - so.setName(inputFiles.get(0).getName() + ": Cellsnp-lite VCF"); + so.setName(inputFiles.get(0).getName() + ": Cellsnp-lite Base VCF"); } so.setCategory("VCF File"); ctx.addSequenceOutput(so); } + else + { + ctx.getFileManager().addIntermediateFile(cellSnpBaseVcf.getParentFile()); + } } - private void sortAndFixVcf(File vcf, ReferenceGenome genome, Logger log) throws PipelineJobException + private File sortAndFixVcf(File vcf, ReferenceGenome genome, Logger log, File outDir) throws PipelineJobException { + File outVcf = new File(outDir, vcf.getName()); + // This original VCF is generally not properly sorted, and has an invalid index. This is redundant, the VCF is not that large: try { - SequencePipelineService.get().sortROD(vcf, log, 2); - SequenceAnalysisService.get().ensureVcfIndex(vcf, log, true); + FileUtils.copyFile(vcf, outVcf); + SequencePipelineService.get().sortROD(outVcf, log, 2); + SequenceAnalysisService.get().ensureVcfIndex(outVcf, log, true); + + new UpdateVCFSequenceDictionary(log).execute(outVcf, genome.getSequenceDictionary()); + SequenceAnalysisService.get().ensureVcfIndex(outVcf, log); - new UpdateVCFSequenceDictionary(log).execute(vcf, genome.getSequenceDictionary()); - SequenceAnalysisService.get().ensureVcfIndex(vcf, log); + return outVcf; } catch (IOException e) { From ac24a995012775a5bf5ea56c6f563a6ba0575a84 Mon Sep 17 00:00:00 2001 From: bbimber Date: Wed, 9 Jul 2025 10:31:40 -0700 Subject: [PATCH 11/20] More obvious link to ManageFileRootAction --- .../org/labkey/discvrcore/DiscvrCoreController.java | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/discvrcore/src/org/labkey/discvrcore/DiscvrCoreController.java b/discvrcore/src/org/labkey/discvrcore/DiscvrCoreController.java index ea2596b7f..3382d1a0d 100644 --- a/discvrcore/src/org/labkey/discvrcore/DiscvrCoreController.java +++ b/discvrcore/src/org/labkey/discvrcore/DiscvrCoreController.java @@ -291,4 +291,16 @@ public URLHelper getRedirectURL(Object o) throws Exception return DetailsURL.fromString("laboratory/setTableIncrementValue.view", getContainer()).getActionURL(); } } + + // This allows registration of this action without creating a dependency between laboratory and discvrcore + @UtilityAction(label = "Manage File Roots", description = "This standalone file root management action can be used on folder types that do not support the normal 'Manage Folder' UI.") + @RequiresPermission(AdminPermission.class) + public class ManageFileRootAction extends SimpleRedirectAction + { + @Override + public URLHelper getRedirectURL(Object o) throws Exception + { + return DetailsURL.fromString("admin/manageFileRoot.view", getContainer()).getActionURL(); + } + } } From fdfaab204781567692df01d8a1f084f5d32c04ae Mon Sep 17 00:00:00 2001 From: bbimber Date: Wed, 9 Jul 2025 10:32:23 -0700 Subject: [PATCH 12/20] Report summary for vireo --- .../pipeline/singlecell/VireoHandler.java | 32 ++++++++++++++++++- 1 file changed, 31 insertions(+), 1 deletion(-) diff --git a/singlecell/src/org/labkey/singlecell/pipeline/singlecell/VireoHandler.java b/singlecell/src/org/labkey/singlecell/pipeline/singlecell/VireoHandler.java index 42db6bc8a..75f26c2e6 100644 --- a/singlecell/src/org/labkey/singlecell/pipeline/singlecell/VireoHandler.java +++ b/singlecell/src/org/labkey/singlecell/pipeline/singlecell/VireoHandler.java @@ -1,5 +1,6 @@ package org.labkey.singlecell.pipeline.singlecell; +import au.com.bytecode.opencsv.CSVReader; import htsjdk.samtools.util.IOUtil; import org.apache.commons.io.FileUtils; import org.apache.commons.lang3.StringUtils; @@ -9,6 +10,7 @@ import org.labkey.api.pipeline.PipelineJob; import org.labkey.api.pipeline.PipelineJobException; import org.labkey.api.pipeline.RecordedAction; +import org.labkey.api.reader.Readers; import org.labkey.api.sequenceanalysis.SequenceAnalysisService; import org.labkey.api.sequenceanalysis.SequenceOutputFile; import org.labkey.api.sequenceanalysis.pipeline.AbstractParameterizedOutputHandler; @@ -365,10 +367,38 @@ else if (outFiles.length > 1) so.setName(inputFiles.get(0).getName() + ": Vireo Demultiplexing"); } so.setCategory("Vireo Demultiplexing"); + StringBuilder description = new StringBuilder(); if (vcfFile > -1) { - so.setDescription("Reference VCF ID: " + vcfFile); + description.append("Reference VCF ID: \n").append(vcfFile); } + + File summary = new File(ctx.getOutputDir(), "summary.tsv"); + if (!summary.exists()) + { + throw new PipelineJobException("Missing file: " + summary.getPath()); + } + + description.append("Results:\n"); + try (CSVReader reader = new CSVReader(Readers.getReader(summary))) + { + String[] line; + while ((line = reader.readNext()) != null) + { + if ("Var1".equals(line[0])) + { + continue; + } + + description.append(line[0]).append(": ").append(line[1]).append("\n"); + } + } + catch (IOException e) + { + throw new PipelineJobException(e); + } + + so.setDescription(StringUtils.trimToEmpty(description.toString())); ctx.addSequenceOutput(so); } From 05be6caa1f0243b0af634c583bacc820eb9657d1 Mon Sep 17 00:00:00 2001 From: bbimber Date: Wed, 9 Jul 2025 11:13:12 -0700 Subject: [PATCH 13/20] Always re-cache lookups after clear --- .../src/org/labkey/studies/query/LookupSetsTable.java | 6 +++--- .../org/labkey/studies/query/StudiesUserSchema.java | 10 ++++++++++ 2 files changed, 13 insertions(+), 3 deletions(-) diff --git a/Studies/src/org/labkey/studies/query/LookupSetsTable.java b/Studies/src/org/labkey/studies/query/LookupSetsTable.java index 4d6ff8230..83c8c78ef 100644 --- a/Studies/src/org/labkey/studies/query/LookupSetsTable.java +++ b/Studies/src/org/labkey/studies/query/LookupSetsTable.java @@ -38,14 +38,14 @@ public UpdateService(SimpleUserSchema.SimpleTable ti) @Override protected void afterInsertUpdate(int count, BatchValidationException errors) { - LookupSetsManager.get().getCache().clear(); + StudiesUserSchema.repopulateCaches(getUserSchema().getUser(), getUserSchema().getContainer()); } @Override protected Map deleteRow(User user, Container container, Map oldRowMap) throws QueryUpdateServiceException, SQLException, InvalidKeyException { Map row = super.deleteRow(user, container, oldRowMap); - LookupSetsManager.get().getCache().clear(); + StudiesUserSchema.repopulateCaches(getUserSchema().getUser(), getUserSchema().getContainer()); return row; } @@ -53,7 +53,7 @@ protected Map deleteRow(User user, Container container, Map> getPropertySetNames() { Map> nameMap = (Map>) LookupSetsManager.get().getCache().get(LookupSetTable.getCacheKey(getTargetContainer())); @@ -81,6 +90,7 @@ private Map> getPropertySetNames() return nameMap; } + _log.debug("Populating lookup tables in StudiesUserSchema.getPropertySetNames() for container: " + getTargetContainer().getName()); nameMap = new CaseInsensitiveHashMap<>(); TableSelector ts = new TableSelector(_dbSchema.getTable(TABLE_LOOKUP_SETS), new SimpleFilter(FieldKey.fromString("container"), getTargetContainer().getId()), null); From 7382511e4abe9ef5ab9534eb83812280b147f473 Mon Sep 17 00:00:00 2001 From: bbimber Date: Wed, 9 Jul 2025 16:38:34 -0700 Subject: [PATCH 14/20] TSVs need tab --- .../org/labkey/singlecell/pipeline/singlecell/VireoHandler.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/singlecell/src/org/labkey/singlecell/pipeline/singlecell/VireoHandler.java b/singlecell/src/org/labkey/singlecell/pipeline/singlecell/VireoHandler.java index 75f26c2e6..d4ddc7326 100644 --- a/singlecell/src/org/labkey/singlecell/pipeline/singlecell/VireoHandler.java +++ b/singlecell/src/org/labkey/singlecell/pipeline/singlecell/VireoHandler.java @@ -380,7 +380,7 @@ else if (outFiles.length > 1) } description.append("Results:\n"); - try (CSVReader reader = new CSVReader(Readers.getReader(summary))) + try (CSVReader reader = new CSVReader(Readers.getReader(summary), '\t')) { String[] line; while ((line = reader.readNext()) != null) From 7d11ce99a224c527e733f742bf70aa53cd81a572 Mon Sep 17 00:00:00 2001 From: bbimber Date: Thu, 10 Jul 2025 15:08:48 -0700 Subject: [PATCH 15/20] Support sawfish --- .../pipeline_code/extra_tools_install.sh | 14 ++ .../SequenceAnalysisModule.java | 4 + .../run/analysis/SawfishAnalysis.java | 90 +++++++++ .../analysis/SawfishJointCallingHandler.java | 183 ++++++++++++++++++ 4 files changed, 291 insertions(+) create mode 100644 SequenceAnalysis/src/org/labkey/sequenceanalysis/run/analysis/SawfishAnalysis.java create mode 100644 SequenceAnalysis/src/org/labkey/sequenceanalysis/run/analysis/SawfishJointCallingHandler.java diff --git a/SequenceAnalysis/pipeline_code/extra_tools_install.sh b/SequenceAnalysis/pipeline_code/extra_tools_install.sh index e3cce3c55..8ecd60f37 100755 --- a/SequenceAnalysis/pipeline_code/extra_tools_install.sh +++ b/SequenceAnalysis/pipeline_code/extra_tools_install.sh @@ -319,3 +319,17 @@ then else echo "Already installed" fi + +if [[ ! -e ${LKTOOLS_DIR}/sawfish || ! -z $FORCE_REINSTALL ]]; +then + echo "Cleaning up previous installs" + rm -Rf $LKTOOLS_DIR/sawfish* + + wget https://github.com/PacificBiosciences/sawfish/releases/download/v2.0.0/sawfish-v2.0.0-x86_64-unknown-linux-gnu.tar.gz + tar -xzf sawfish-v2.0.0-x86_64-unknown-linux-gnu.tar.gz + + mv sawfish-v2.0.0-x86_64-unknown-linux-gnu $LKTOOLS_DIR/ + ln -s $LKTOOLS_DIR/sawfish-v2.0.0/bin/sawfish $LKTOOLS_DIR/ +else + echo "Already installed" +fi diff --git a/SequenceAnalysis/src/org/labkey/sequenceanalysis/SequenceAnalysisModule.java b/SequenceAnalysis/src/org/labkey/sequenceanalysis/SequenceAnalysisModule.java index 7a10338fd..8dd0d99b8 100644 --- a/SequenceAnalysis/src/org/labkey/sequenceanalysis/SequenceAnalysisModule.java +++ b/SequenceAnalysis/src/org/labkey/sequenceanalysis/SequenceAnalysisModule.java @@ -123,6 +123,8 @@ import org.labkey.sequenceanalysis.run.analysis.PbsvAnalysis; import org.labkey.sequenceanalysis.run.analysis.PbsvJointCallingHandler; import org.labkey.sequenceanalysis.run.analysis.PindelAnalysis; +import org.labkey.sequenceanalysis.run.analysis.SawfishAnalysis; +import org.labkey.sequenceanalysis.run.analysis.SawfishJointCallingHandler; import org.labkey.sequenceanalysis.run.analysis.SequenceBasedTypingAnalysis; import org.labkey.sequenceanalysis.run.analysis.SnpCountAnalysis; import org.labkey.sequenceanalysis.run.analysis.SubreadAnalysis; @@ -342,6 +344,7 @@ public static void registerPipelineSteps() SequencePipelineService.get().registerPipelineStep(new PindelAnalysis.Provider()); SequencePipelineService.get().registerPipelineStep(new PbsvAnalysis.Provider()); SequencePipelineService.get().registerPipelineStep(new GenrichStep.Provider()); + SequencePipelineService.get().registerPipelineStep(new SawfishAnalysis.Provider()); SequencePipelineService.get().registerPipelineStep(new PARalyzerAnalysis.Provider()); SequencePipelineService.get().registerPipelineStep(new RnaSeQCStep.Provider()); @@ -400,6 +403,7 @@ public static void registerPipelineSteps() SequenceAnalysisService.get().registerFileHandler(new NextCladeHandler()); SequenceAnalysisService.get().registerFileHandler(new ConvertToCramHandler()); SequenceAnalysisService.get().registerFileHandler(new PbsvJointCallingHandler()); + SequenceAnalysisService.get().registerFileHandler(new SawfishJointCallingHandler()); SequenceAnalysisService.get().registerFileHandler(new DeepVariantHandler()); SequenceAnalysisService.get().registerFileHandler(new GLNexusHandler()); SequenceAnalysisService.get().registerFileHandler(new ParagraphStep()); diff --git a/SequenceAnalysis/src/org/labkey/sequenceanalysis/run/analysis/SawfishAnalysis.java b/SequenceAnalysis/src/org/labkey/sequenceanalysis/run/analysis/SawfishAnalysis.java new file mode 100644 index 000000000..04abb26ef --- /dev/null +++ b/SequenceAnalysis/src/org/labkey/sequenceanalysis/run/analysis/SawfishAnalysis.java @@ -0,0 +1,90 @@ +package org.labkey.sequenceanalysis.run.analysis; + +import org.labkey.api.pipeline.PipelineJobException; +import org.labkey.api.sequenceanalysis.model.AnalysisModel; +import org.labkey.api.sequenceanalysis.model.Readset; +import org.labkey.api.sequenceanalysis.pipeline.AbstractAnalysisStepProvider; +import org.labkey.api.sequenceanalysis.pipeline.AbstractPipelineStep; +import org.labkey.api.sequenceanalysis.pipeline.AnalysisOutputImpl; +import org.labkey.api.sequenceanalysis.pipeline.AnalysisStep; +import org.labkey.api.sequenceanalysis.pipeline.PipelineContext; +import org.labkey.api.sequenceanalysis.pipeline.PipelineStepProvider; +import org.labkey.api.sequenceanalysis.pipeline.ReferenceGenome; +import org.labkey.api.sequenceanalysis.pipeline.SequencePipelineService; +import org.labkey.api.sequenceanalysis.run.SimpleScriptWrapper; + +import java.io.File; +import java.util.ArrayList; +import java.util.List; + +public class SawfishAnalysis extends AbstractPipelineStep implements AnalysisStep +{ + public SawfishAnalysis(PipelineStepProvider provider, PipelineContext ctx) + { + super(provider, ctx); + } + + public static class Provider extends AbstractAnalysisStepProvider + { + public Provider() + { + super("sawfish", "Sawfish Analysis", null, "This will run sawfish SV dicvoery and calling on the selected BAMs", List.of(), null, null); + } + + + @Override + public SawfishAnalysis create(PipelineContext ctx) + { + return new SawfishAnalysis(this, ctx); + } + } + + @Override + public Output performAnalysisPerSampleRemote(Readset rs, File inputBam, ReferenceGenome referenceGenome, File outputDir) throws PipelineJobException + { + AnalysisOutputImpl output = new AnalysisOutputImpl(); + + List args = new ArrayList<>(); + args.add(getExe().getPath()); + args.add("discover"); + + args.add("--bam"); + args.add(inputBam.getPath()); + + args.add("--ref"); + args.add(referenceGenome.getWorkingFastaFile().getPath()); + + File svOutDir = new File(outputDir, "sawfish"); + args.add("--output-dir"); + args.add(svOutDir.getPath()); + + Integer maxThreads = SequencePipelineService.get().getMaxThreads(getPipelineCtx().getLogger()); + if (maxThreads != null) + { + args.add("-threads"); + args.add(String.valueOf(maxThreads)); + } + + new SimpleScriptWrapper(getPipelineCtx().getLogger()).execute(args); + + File vcf = new File(svOutDir, "genotyped.sv.vcf.gz"); + if (!vcf.exists()) + { + throw new PipelineJobException("Unable to find file: " + vcf.getPath()); + } + + output.addSequenceOutput(vcf, rs.getName() + ": sawfish", "Sawfish SV Discovery", rs.getReadsetId(), null, referenceGenome.getGenomeId(), null); + return output; + } + + @Override + public Output performAnalysisPerSampleLocal(AnalysisModel model, File inputBam, File referenceFasta, File outDir) throws PipelineJobException + { + return null; + } + + private File getExe() + { + return SequencePipelineService.get().getExeForPackage("SAWFISHPATH", "sawfish"); + } +} \ No newline at end of file diff --git a/SequenceAnalysis/src/org/labkey/sequenceanalysis/run/analysis/SawfishJointCallingHandler.java b/SequenceAnalysis/src/org/labkey/sequenceanalysis/run/analysis/SawfishJointCallingHandler.java new file mode 100644 index 000000000..50957ddd3 --- /dev/null +++ b/SequenceAnalysis/src/org/labkey/sequenceanalysis/run/analysis/SawfishJointCallingHandler.java @@ -0,0 +1,183 @@ +package org.labkey.sequenceanalysis.run.analysis; + +import org.apache.commons.io.FileUtils; +import org.json.JSONObject; +import org.labkey.api.module.ModuleLoader; +import org.labkey.api.pipeline.PipelineJob; +import org.labkey.api.pipeline.PipelineJobException; +import org.labkey.api.pipeline.RecordedAction; +import org.labkey.api.sequenceanalysis.SequenceAnalysisService; +import org.labkey.api.sequenceanalysis.SequenceOutputFile; +import org.labkey.api.sequenceanalysis.pipeline.AbstractParameterizedOutputHandler; +import org.labkey.api.sequenceanalysis.pipeline.ReferenceGenome; +import org.labkey.api.sequenceanalysis.pipeline.SequenceAnalysisJobSupport; +import org.labkey.api.sequenceanalysis.pipeline.SequenceOutputHandler; +import org.labkey.api.sequenceanalysis.pipeline.SequencePipelineService; +import org.labkey.api.sequenceanalysis.pipeline.ToolParameterDescriptor; +import org.labkey.api.sequenceanalysis.run.SimpleScriptWrapper; +import org.labkey.sequenceanalysis.SequenceAnalysisModule; +import org.labkey.sequenceanalysis.util.SequenceUtil; + +import java.io.File; +import java.io.IOException; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.LinkedHashSet; +import java.util.List; +import java.util.stream.Collectors; + +public class SawfishJointCallingHandler extends AbstractParameterizedOutputHandler +{ + private static final String OUTPUT_CATEGORY = "Sawfish VCF"; + + public SawfishJointCallingHandler() + { + super(ModuleLoader.getInstance().getModule(SequenceAnalysisModule.NAME), "Sawfish Joint-Call", "Runs sawfish joint-call, which jointly calls SVs from PacBio CCS data", new LinkedHashSet<>(List.of("sequenceanalysis/panel/VariantScatterGatherPanel.js")), Arrays.asList( + ToolParameterDescriptor.create("fileName", "VCF Filename", "The name of the resulting file.", "textfield", new JSONObject(){{ + put("allowBlank", false); + put("doNotIncludeInTemplates", true); + }}, null) + )); + } + + @Override + public boolean canProcess(SequenceOutputFile o) + { + return o.getFile() != null && SequenceUtil.FILETYPE.vcf.getFileType().isType(o.getFile()); + } + + @Override + public boolean doRunRemote() + { + return true; + } + + @Override + public boolean doRunLocal() + { + return false; + } + + @Override + public SequenceOutputProcessor getProcessor() + { + return new Processor(); + } + + public static class Processor implements SequenceOutputProcessor + { + @Override + public void processFilesOnWebserver(PipelineJob job, SequenceAnalysisJobSupport support, List inputFiles, JSONObject params, File outputDir, List actions, List outputsToCreate) throws UnsupportedOperationException, PipelineJobException + { + + } + + @Override + public void processFilesRemote(List inputFiles, JobContext ctx) throws UnsupportedOperationException, PipelineJobException + { + List filesToProcess = inputFiles.stream().map(SequenceOutputFile::getFile).collect(Collectors.toList()); + + ReferenceGenome genome = ctx.getSequenceSupport().getCachedGenomes().iterator().next(); + String outputBaseName = ctx.getParams().getString("fileName"); + if (!outputBaseName.toLowerCase().endsWith(".gz")) + { + outputBaseName = outputBaseName.replaceAll(".gz$", ""); + } + + if (!outputBaseName.toLowerCase().endsWith(".vcf")) + { + outputBaseName = outputBaseName.replaceAll(".vcf$", ""); + } + + File expectedFinalOutput = new File(ctx.getOutputDir(), outputBaseName + ".vcf.gz"); + File expectedFinalOutputIdx = new File(expectedFinalOutput.getPath() + ".tbi"); + boolean jobCompleted = expectedFinalOutputIdx.exists(); // this would occur if the job died during the cleanup phase + + File ouputVcf = runSawfishCall(ctx, filesToProcess, genome, outputBaseName); + + SequenceOutputFile so = new SequenceOutputFile(); + so.setName("Sawfish call: " + outputBaseName); + so.setFile(ouputVcf); + so.setCategory(OUTPUT_CATEGORY); + so.setLibrary_id(genome.getGenomeId()); + + ctx.addSequenceOutput(so); + } + + private File runSawfishCall(JobContext ctx, List inputs, ReferenceGenome genome, String outputBaseName) throws PipelineJobException + { + if (inputs.isEmpty()) + { + throw new PipelineJobException("No inputs provided"); + } + + List args = new ArrayList<>(); + args.add(getExe().getPath()); + args.add("joint-call"); + + Integer maxThreads = SequencePipelineService.get().getMaxThreads(ctx.getLogger()); + if (maxThreads != null) + { + args.add("--threads"); + args.add(String.valueOf(maxThreads)); + } + + args.add("--ref"); + args.add(genome.getWorkingFastaFile().getPath()); + + for (File sample : inputs) + { + args.add("--sample"); + args.add(sample.getParentFile().getPath()); + } + + File outDir = new File(ctx.getOutputDir(), "sawfish"); + args.add("--output-dir"); + args.add(outDir.getPath()); + + new SimpleScriptWrapper(ctx.getLogger()).execute(args); + + File vcfOut = new File(outDir, "genotyped.sv.vcf.gz"); + if (!vcfOut.exists()) + { + throw new PipelineJobException("Unable to find file: " + vcfOut.getPath()); + } + + File vcfOutFinal = new File(ctx.getOutputDir(), outputBaseName + ".vcf.gz"); + + try + { + if (vcfOutFinal.exists()) + { + vcfOutFinal.delete(); + FileUtils.moveFile(vcfOut, vcfOutFinal); + + File targetIndex = new File(vcfOutFinal.getPath() + ".tbi"); + if (targetIndex.exists()) + { + targetIndex.delete(); + } + + File origIndex = new File(vcfOut.getPath() + ".tbi"); + if (origIndex.exists()) + { + FileUtils.moveFile(origIndex, targetIndex); + } + + SequenceAnalysisService.get().ensureVcfIndex(vcfOutFinal, ctx.getLogger(), true); + } + } + catch (IOException e) + { + throw new PipelineJobException(e); + } + + return vcfOutFinal; + } + + private File getExe() + { + return SequencePipelineService.get().getExeForPackage("PBSVPATH", "pbsv"); + } + } +} From 6b1801611e54d5a34530261498633bd8b30f7bde Mon Sep 17 00:00:00 2001 From: bbimber Date: Thu, 10 Jul 2025 16:12:53 -0700 Subject: [PATCH 16/20] Correct sawfish argument --- .../labkey/sequenceanalysis/run/analysis/SawfishAnalysis.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/SequenceAnalysis/src/org/labkey/sequenceanalysis/run/analysis/SawfishAnalysis.java b/SequenceAnalysis/src/org/labkey/sequenceanalysis/run/analysis/SawfishAnalysis.java index 04abb26ef..df2e534cb 100644 --- a/SequenceAnalysis/src/org/labkey/sequenceanalysis/run/analysis/SawfishAnalysis.java +++ b/SequenceAnalysis/src/org/labkey/sequenceanalysis/run/analysis/SawfishAnalysis.java @@ -61,7 +61,7 @@ public Output performAnalysisPerSampleRemote(Readset rs, File inputBam, Referenc Integer maxThreads = SequencePipelineService.get().getMaxThreads(getPipelineCtx().getLogger()); if (maxThreads != null) { - args.add("-threads"); + args.add("--threads"); args.add(String.valueOf(maxThreads)); } From 23bf99992f3b28b854db0b619f91015868fa4577 Mon Sep 17 00:00:00 2001 From: bbimber Date: Thu, 10 Jul 2025 21:51:01 -0700 Subject: [PATCH 17/20] Provide sawfish with BAM input --- .../run/analysis/SawfishAnalysis.java | 55 ++++++++++++++++++- 1 file changed, 54 insertions(+), 1 deletion(-) diff --git a/SequenceAnalysis/src/org/labkey/sequenceanalysis/run/analysis/SawfishAnalysis.java b/SequenceAnalysis/src/org/labkey/sequenceanalysis/run/analysis/SawfishAnalysis.java index df2e534cb..1e6c7ef4c 100644 --- a/SequenceAnalysis/src/org/labkey/sequenceanalysis/run/analysis/SawfishAnalysis.java +++ b/SequenceAnalysis/src/org/labkey/sequenceanalysis/run/analysis/SawfishAnalysis.java @@ -1,5 +1,7 @@ package org.labkey.sequenceanalysis.run.analysis; +import org.apache.logging.log4j.Logger; +import org.jetbrains.annotations.Nullable; import org.labkey.api.pipeline.PipelineJobException; import org.labkey.api.sequenceanalysis.model.AnalysisModel; import org.labkey.api.sequenceanalysis.model.Readset; @@ -10,8 +12,10 @@ import org.labkey.api.sequenceanalysis.pipeline.PipelineContext; import org.labkey.api.sequenceanalysis.pipeline.PipelineStepProvider; import org.labkey.api.sequenceanalysis.pipeline.ReferenceGenome; +import org.labkey.api.sequenceanalysis.pipeline.SamtoolsRunner; import org.labkey.api.sequenceanalysis.pipeline.SequencePipelineService; import org.labkey.api.sequenceanalysis.run.SimpleScriptWrapper; +import org.labkey.sequenceanalysis.util.SequenceUtil; import java.io.File; import java.util.ArrayList; @@ -44,12 +48,24 @@ public Output performAnalysisPerSampleRemote(Readset rs, File inputBam, Referenc { AnalysisOutputImpl output = new AnalysisOutputImpl(); + File inputFile = inputBam; + if (SequenceUtil.FILETYPE.cram.getFileType().isType(inputFile)) + { + CramToBam samtoolsRunner = new CramToBam(getPipelineCtx().getLogger()); + File bam = new File(getPipelineCtx().getWorkingDirectory(), inputFile.getName().replaceAll(".cram$", ".bam")); + samtoolsRunner.convert(inputFile, bam, referenceGenome.getWorkingFastaFile(), SequencePipelineService.get().getMaxThreads(getPipelineCtx().getLogger())); + inputFile = bam; + + output.addIntermediateFile(bam); + output.addIntermediateFile(new File(bam.getPath() + ".bai")); + } + List args = new ArrayList<>(); args.add(getExe().getPath()); args.add("discover"); args.add("--bam"); - args.add(inputBam.getPath()); + args.add(inputFile.getPath()); args.add("--ref"); args.add(referenceGenome.getWorkingFastaFile().getPath()); @@ -87,4 +103,41 @@ private File getExe() { return SequencePipelineService.get().getExeForPackage("SAWFISHPATH", "sawfish"); } + + private static class CramToBam extends SamtoolsRunner + { + public CramToBam(Logger log) + { + super(log); + } + + public void convert(File inputCram, File outputBam, File fasta, @Nullable Integer threads) throws PipelineJobException + { + getLogger().info("Converting CRAM to BAM"); + + execute(getParams(inputCram, outputBam, fasta, threads)); + } + + private List getParams(File inputCram, File outputBam, File fasta, @Nullable Integer threads) + { + List params = new ArrayList<>(); + params.add(getSamtoolsPath().getPath()); + params.add("view"); + params.add("-b"); + params.add("-T"); + params.add(fasta.getPath()); + params.add("-o"); + params.add(outputBam.getPath()); + + if (threads != null) + { + params.add("-@"); + params.add(String.valueOf(threads)); + } + + params.add(inputCram.getPath()); + + return params; + } + } } \ No newline at end of file From 311f538917afb8de6b70c288c6828afa8aebc990 Mon Sep 17 00:00:00 2001 From: bbimber Date: Thu, 10 Jul 2025 22:28:09 -0700 Subject: [PATCH 18/20] Reduce logging --- .../api/sequenceanalysis/run/AbstractCommandWrapper.java | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/SequenceAnalysis/api-src/org/labkey/api/sequenceanalysis/run/AbstractCommandWrapper.java b/SequenceAnalysis/api-src/org/labkey/api/sequenceanalysis/run/AbstractCommandWrapper.java index 74e665c07..5b5a830ec 100644 --- a/SequenceAnalysis/api-src/org/labkey/api/sequenceanalysis/run/AbstractCommandWrapper.java +++ b/SequenceAnalysis/api-src/org/labkey/api/sequenceanalysis/run/AbstractCommandWrapper.java @@ -206,9 +206,11 @@ private void setPath(ProcessBuilder pb) { String path = System.getenv("PATH"); - getLogger().debug("Existing PATH: " + path); - getLogger().debug("toolDir: " + toolDir); - + if (_logPath) + { + getLogger().debug("Existing PATH: " + path); + getLogger().debug("toolDir: " + toolDir); + } if (path == null) { From dccf4c7248de90266f90aaacdcbc2a02a660f404 Mon Sep 17 00:00:00 2001 From: bbimber Date: Thu, 10 Jul 2025 22:34:44 -0700 Subject: [PATCH 19/20] Ensure BAM index exists --- .../run/analysis/SawfishAnalysis.java | 15 +++++++++++++-- 1 file changed, 13 insertions(+), 2 deletions(-) diff --git a/SequenceAnalysis/src/org/labkey/sequenceanalysis/run/analysis/SawfishAnalysis.java b/SequenceAnalysis/src/org/labkey/sequenceanalysis/run/analysis/SawfishAnalysis.java index 1e6c7ef4c..bc346470b 100644 --- a/SequenceAnalysis/src/org/labkey/sequenceanalysis/run/analysis/SawfishAnalysis.java +++ b/SequenceAnalysis/src/org/labkey/sequenceanalysis/run/analysis/SawfishAnalysis.java @@ -12,6 +12,7 @@ import org.labkey.api.sequenceanalysis.pipeline.PipelineContext; import org.labkey.api.sequenceanalysis.pipeline.PipelineStepProvider; import org.labkey.api.sequenceanalysis.pipeline.ReferenceGenome; +import org.labkey.api.sequenceanalysis.pipeline.SamtoolsIndexer; import org.labkey.api.sequenceanalysis.pipeline.SamtoolsRunner; import org.labkey.api.sequenceanalysis.pipeline.SequencePipelineService; import org.labkey.api.sequenceanalysis.run.SimpleScriptWrapper; @@ -53,11 +54,21 @@ public Output performAnalysisPerSampleRemote(Readset rs, File inputBam, Referenc { CramToBam samtoolsRunner = new CramToBam(getPipelineCtx().getLogger()); File bam = new File(getPipelineCtx().getWorkingDirectory(), inputFile.getName().replaceAll(".cram$", ".bam")); - samtoolsRunner.convert(inputFile, bam, referenceGenome.getWorkingFastaFile(), SequencePipelineService.get().getMaxThreads(getPipelineCtx().getLogger())); + File bamIdx = new File(bam.getPath() + ".bai"); + if (!bamIdx.exists()) + { + samtoolsRunner.convert(inputFile, bam, referenceGenome.getWorkingFastaFile(), SequencePipelineService.get().getMaxThreads(getPipelineCtx().getLogger())); + new SamtoolsIndexer(getPipelineCtx().getLogger()).execute(bam); + } + else + { + getPipelineCtx().getLogger().debug("BAM index exists, will not re-convert CRAM"); + } + inputFile = bam; output.addIntermediateFile(bam); - output.addIntermediateFile(new File(bam.getPath() + ".bai")); + output.addIntermediateFile(bamIdx); } List args = new ArrayList<>(); From 3c454e35be5b70edb7d6d152212970209550a436 Mon Sep 17 00:00:00 2001 From: bbimber Date: Fri, 11 Jul 2025 06:52:40 -0700 Subject: [PATCH 20/20] Bugfix to sawfish --- .../run/analysis/SawfishAnalysis.java | 19 ++++++++---- .../analysis/SawfishJointCallingHandler.java | 30 +++++++++---------- .../sequenceanalysis/util/SequenceUtil.java | 1 + 3 files changed, 30 insertions(+), 20 deletions(-) diff --git a/SequenceAnalysis/src/org/labkey/sequenceanalysis/run/analysis/SawfishAnalysis.java b/SequenceAnalysis/src/org/labkey/sequenceanalysis/run/analysis/SawfishAnalysis.java index bc346470b..421f77ffb 100644 --- a/SequenceAnalysis/src/org/labkey/sequenceanalysis/run/analysis/SawfishAnalysis.java +++ b/SequenceAnalysis/src/org/labkey/sequenceanalysis/run/analysis/SawfishAnalysis.java @@ -92,15 +92,24 @@ public Output performAnalysisPerSampleRemote(Readset rs, File inputBam, Referenc args.add(String.valueOf(maxThreads)); } - new SimpleScriptWrapper(getPipelineCtx().getLogger()).execute(args); + File bcf = new File(svOutDir, "candidate.sv.bcf"); + File bcfIdx = new File(bcf.getPath() + ".csi"); + if (bcfIdx.exists()) + { + getPipelineCtx().getLogger().debug("BCF index already exists, reusing output"); + } + else + { + new SimpleScriptWrapper(getPipelineCtx().getLogger()).execute(args); + } - File vcf = new File(svOutDir, "genotyped.sv.vcf.gz"); - if (!vcf.exists()) + if (!bcf.exists()) { - throw new PipelineJobException("Unable to find file: " + vcf.getPath()); + throw new PipelineJobException("Unable to find file: " + bcf.getPath()); } - output.addSequenceOutput(vcf, rs.getName() + ": sawfish", "Sawfish SV Discovery", rs.getReadsetId(), null, referenceGenome.getGenomeId(), null); + output.addSequenceOutput(bcf, rs.getName() + ": sawfish", "Sawfish SV Discovery", rs.getReadsetId(), null, referenceGenome.getGenomeId(), null); + return output; } diff --git a/SequenceAnalysis/src/org/labkey/sequenceanalysis/run/analysis/SawfishJointCallingHandler.java b/SequenceAnalysis/src/org/labkey/sequenceanalysis/run/analysis/SawfishJointCallingHandler.java index 50957ddd3..ce3b066a9 100644 --- a/SequenceAnalysis/src/org/labkey/sequenceanalysis/run/analysis/SawfishJointCallingHandler.java +++ b/SequenceAnalysis/src/org/labkey/sequenceanalysis/run/analysis/SawfishJointCallingHandler.java @@ -43,7 +43,7 @@ public SawfishJointCallingHandler() @Override public boolean canProcess(SequenceOutputFile o) { - return o.getFile() != null && SequenceUtil.FILETYPE.vcf.getFileType().isType(o.getFile()); + return o.getFile() != null && SequenceUtil.FILETYPE.bcf.getFileType().isType(o.getFile()); } @Override @@ -90,8 +90,6 @@ public void processFilesRemote(List inputFiles, JobContext c } File expectedFinalOutput = new File(ctx.getOutputDir(), outputBaseName + ".vcf.gz"); - File expectedFinalOutputIdx = new File(expectedFinalOutput.getPath() + ".tbi"); - boolean jobCompleted = expectedFinalOutputIdx.exists(); // this would occur if the job died during the cleanup phase File ouputVcf = runSawfishCall(ctx, filesToProcess, genome, outputBaseName); @@ -150,20 +148,22 @@ private File runSawfishCall(JobContext ctx, List inputs, ReferenceGenome g if (vcfOutFinal.exists()) { vcfOutFinal.delete(); - FileUtils.moveFile(vcfOut, vcfOutFinal); - - File targetIndex = new File(vcfOutFinal.getPath() + ".tbi"); - if (targetIndex.exists()) - { - targetIndex.delete(); - } + } + FileUtils.moveFile(vcfOut, vcfOutFinal); - File origIndex = new File(vcfOut.getPath() + ".tbi"); - if (origIndex.exists()) - { - FileUtils.moveFile(origIndex, targetIndex); - } + File targetIndex = new File(vcfOutFinal.getPath() + ".tbi"); + if (targetIndex.exists()) + { + targetIndex.delete(); + } + File origIndex = new File(vcfOut.getPath() + ".tbi"); + if (origIndex.exists()) + { + FileUtils.moveFile(origIndex, targetIndex); + } + else + { SequenceAnalysisService.get().ensureVcfIndex(vcfOutFinal, ctx.getLogger(), true); } } diff --git a/SequenceAnalysis/src/org/labkey/sequenceanalysis/util/SequenceUtil.java b/SequenceAnalysis/src/org/labkey/sequenceanalysis/util/SequenceUtil.java index 0a2ce0784..a5357d176 100644 --- a/SequenceAnalysis/src/org/labkey/sequenceanalysis/util/SequenceUtil.java +++ b/SequenceAnalysis/src/org/labkey/sequenceanalysis/util/SequenceUtil.java @@ -96,6 +96,7 @@ public enum FILETYPE bed(Collections.singletonList(".bed"), true), bw(Collections.singletonList(".bw"), false), vcf(List.of(".vcf"), true), + bcf(List.of(".bcf"), true), gvcf(List.of(".g.vcf"), true); private final List _extensions;