Merge pull request #1 from crickbabs/master

Minor bugs and fixes
nf-core · Nov 21, 2018 · 4516f13 · 4516f13
2 parents 5409a37 + 695dc81
commit 4516f13
Show file tree

Hide file tree

Showing 15 changed files with 275 additions and 257 deletions.
diff --git a/README.md b/README.md
@@ -43,9 +43,9 @@ The pipeline is built using [Nextflow](https://www.nextflow.io), a workflow tool
     * annotate peaks relative to gene features ([`HOMER`](http://homer.ucsd.edu/homer/download.html))
     * create consensus peakset across all samples and create tabular file to aid in the filtering of the data ([`BEDTools`](https://github.com/arq5x/bedtools2/))
     * count reads in consensus peaks from replicate-level alignments ([`featureCounts`](http://bioinf.wehi.edu.au/featureCounts/))
-    * differential binding analysis, PCA and clustering ([`R`](https://www.r-project.org/), [`DESeq2`](https://bioconductor.org/packages/release/bioc/html/DESeq2.html))
+    * differential accessibility analysis, PCA and clustering ([`R`](https://www.r-project.org/), [`DESeq2`](https://bioconductor.org/packages/release/bioc/html/DESeq2.html))
 7. Create IGV session file containing bigWig tracks, peaks and differential sites for data visualisation ([`IGV`](https://software.broadinstitute.org/software/igv/)).
-8. Present QC for raw read, alignment, peak-calling and differential binding results ([`MultiQC`](http://multiqc.info/), [`R`](https://www.r-project.org/))
+8. Present QC for raw read, alignment, peak-calling and differential accessibility results ([`MultiQC`](http://multiqc.info/), [`R`](https://www.r-project.org/))
 
 ### Documentation
 The nf-core/atacseq pipeline comes with documentation about the pipeline, found in the `docs/` directory:

diff --git a/assets/multiqc/multiqc_config.yaml b/assets/multiqc/multiqc_config.yaml
@@ -7,6 +7,7 @@ skip_generalstats: true
 
 fn_clean_exts:
     - 'fastq.gz'
+    - '_trimmed'
     - '_val'
     - 'sorted.bam'
     - 'mkD'
@@ -27,6 +28,7 @@ module_order:
         target: ''
         path_filters:
             - '*val*_fastqc.zip'
+            - '*trimmed_fastqc.zip'
     - samtools:
         name: 'SAMTools (library-level; unfiltered)'
         info: 'This section of the report shows SAMTools results at the library-level before read filtering.'

diff --git a/assets/nfcore-atacseq_logo.png b/assets/nfcore-atacseq_logo.png
diff --git a/bin/featurecounts_deseq2.r b/bin/featurecounts_deseq2.r
diff --git a/bin/plot_homer_annotatepeaks.r b/bin/plot_homer_annotatepeaks.r
@@ -18,20 +18,20 @@ library(scales)
 ################################################
 
 option_list <- list(make_option(c("-i", "--homer_files"), type="character", default=NULL, help="Comma-separated list of homer annotated text files.", metavar="anno_files"),
-										make_option(c("-s", "--sample_ids"), type="character", default=NULL, help="Comma-separated list of sample ids associated with homer annotated text files. Must be unique and in same order as homer files input.", metavar="sampleids"),
-									  make_option(c("-o", "--outdir"), type="character", default='./', help="Output directory", metavar="path"),
-									  make_option(c("-p", "--outprefix"), type="character", default='homer_annotation', help="Output prefix", metavar="character"))
+                    make_option(c("-s", "--sample_ids"), type="character", default=NULL, help="Comma-separated list of sample ids associated with homer annotated text files. Must be unique and in same order as homer files input.", metavar="sampleids"),
+                    make_option(c("-o", "--outdir"), type="character", default='./', help="Output directory", metavar="path"),
+                    make_option(c("-p", "--outprefix"), type="character", default='homer_annotation', help="Output prefix", metavar="character"))
 
 opt_parser <- OptionParser(option_list=option_list)
 opt <- parse_args(opt_parser)
 
 if (is.null(opt$homer_files)){
-		print_help(opt_parser)
-		stop("At least one homer annotated file must be supplied", call.=FALSE)
+    print_help(opt_parser)
+    stop("At least one homer annotated file must be supplied", call.=FALSE)
 }
 if (is.null(opt$sample_ids)){
-		print_help(opt_parser)
-		stop("Please provide sample ids associated with homer files.", call.=FALSE)
+    print_help(opt_parser)
+    stop("Please provide sample ids associated with homer files.", call.=FALSE)
 }
 
 if (file.exists(opt$outdir) == FALSE) {
@@ -41,8 +41,8 @@ if (file.exists(opt$outdir) == FALSE) {
 HomerFiles <- unlist(strsplit(opt$homer_files,","))
 SampleIDs <- unlist(strsplit(opt$sample_ids,","))
 if (length(HomerFiles) != length(SampleIDs)) {
-		print_help(opt_parser)
-		stop("Number of sample ids must equal number of homer annotated files.", call.=FALSE)
+    print_help(opt_parser)
+    stop("Number of sample ids must equal number of homer annotated files.", call.=FALSE)
 }
 
 ################################################
@@ -56,35 +56,35 @@ plot.dist.dat <- data.frame()
 plot.feature.dat <- data.frame()
 for (idx in 1:length(HomerFiles)) {
 
-		sampleid = SampleIDs[idx]
-		anno.dat <- read.table(HomerFiles[idx], sep="\t", header=TRUE)
-		anno.dat <- anno.dat[,c("Annotation","Distance.to.TSS","Nearest.PromoterID")]
-		anno.dat <- anno.dat[which(!is.na(anno.dat$Distance.to.TSS)),]
-		if (nrow(anno.dat) == 0) {
-				quit(save = "no", status = 0, runLast = FALSE)
-		}
-		anno.dat$name <- rep(sampleid,nrow(anno.dat))
-		anno.dat$Distance.to.TSS <- abs(anno.dat$Distance.to.TSS) + 1
-		plot.dat <- rbind(plot.dat,anno.dat)
-
-		## GET ANNOTATION COUNTS
-		anno.freq <- as.character(lapply(strsplit(as.character(anno.dat$Annotation)," "), function(x) x[1]))
-		anno.freq <- as.data.frame(table(anno.freq))
-		colnames(anno.freq) <- c("feature",sampleid)
-		anno.melt <- melt(anno.freq)
-		plot.feature.dat <- rbind(plot.feature.dat,anno.melt)
-
-		## GET CLOSEST INSTANCE OF GENE TO ANY GIVEN PEAK
-		unique.gene.dat <- anno.dat[order(anno.dat$Distance.to.TSS),]
-		unique.gene.dat <- unique.gene.dat[!duplicated(unique.gene.dat$Nearest.PromoterID), ]
-		dist.freq <- rep("> 10kb",nrow(unique.gene.dat))
-		dist.freq[which(unique.gene.dat$Distance.to.TSS < 10000)] <- "< 10kb"
-		dist.freq[which(unique.gene.dat$Distance.to.TSS < 5000)] <- "< 5kb"
-		dist.freq[which(unique.gene.dat$Distance.to.TSS < 2000)] <- "< 2kb"
-		dist.freq <- as.data.frame(table(dist.freq))
-		colnames(dist.freq) <- c("distance",sampleid)
-		dist.melt <- melt(dist.freq)
-		plot.dist.dat <- rbind(plot.dist.dat,dist.melt)
+    sampleid = SampleIDs[idx]
+    anno.dat <- read.table(HomerFiles[idx], sep="\t", header=TRUE)
+    anno.dat <- anno.dat[,c("Annotation","Distance.to.TSS","Nearest.PromoterID")]
+    anno.dat <- anno.dat[which(!is.na(anno.dat$Distance.to.TSS)),]
+    if (nrow(anno.dat) == 0) {
+        quit(save = "no", status = 0, runLast = FALSE)
+    }
+    anno.dat$name <- rep(sampleid,nrow(anno.dat))
+    anno.dat$Distance.to.TSS <- abs(anno.dat$Distance.to.TSS) + 1
+    plot.dat <- rbind(plot.dat,anno.dat)
+
+    ## GET ANNOTATION COUNTS
+    anno.freq <- as.character(lapply(strsplit(as.character(anno.dat$Annotation)," "), function(x) x[1]))
+    anno.freq <- as.data.frame(table(anno.freq))
+    colnames(anno.freq) <- c("feature",sampleid)
+    anno.melt <- melt(anno.freq)
+    plot.feature.dat <- rbind(plot.feature.dat,anno.melt)
+
+    ## GET CLOSEST INSTANCE OF GENE TO ANY GIVEN PEAK
+    unique.gene.dat <- anno.dat[order(anno.dat$Distance.to.TSS),]
+    unique.gene.dat <- unique.gene.dat[!duplicated(unique.gene.dat$Nearest.PromoterID), ]
+    dist.freq <- rep("> 10kb",nrow(unique.gene.dat))
+    dist.freq[which(unique.gene.dat$Distance.to.TSS < 10000)] <- "< 10kb"
+    dist.freq[which(unique.gene.dat$Distance.to.TSS < 5000)] <- "< 5kb"
+    dist.freq[which(unique.gene.dat$Distance.to.TSS < 2000)] <- "< 2kb"
+    dist.freq <- as.data.frame(table(dist.freq))
+    colnames(dist.freq) <- c("distance",sampleid)
+    dist.melt <- melt(dist.freq)
+    plot.dist.dat <- rbind(plot.dist.dat,dist.melt)
 
 }
 levels(plot.dat$name) <- sort(unique(as.character(plot.dat$name)))
@@ -97,7 +97,7 @@ write.table(summary.dat,file=file.path(opt$outdir,paste(opt$outprefix,".summary.
 
 ################################################
 ################################################
-## PLOTS                											##
+## PLOTS                                      ##
 ################################################
 ################################################
 
@@ -106,52 +106,52 @@ pdf(PlotFile,height=6,width=3*length(HomerFiles))
 
 ## FEATURE COUNT STACKED BARPLOT
 plot  <- ggplot(plot.feature.dat, aes(x=variable, y=value, group=feature)) +
-				 geom_bar(stat="identity", position = "fill", aes(colour=feature,fill=feature), alpha = 0.3) +
-				 xlab("") +
-				 ylab("% Feature") +
-				 ggtitle("Peak Location Relative to Annotation") +
-				 scale_y_continuous(labels = percent_format()) +
-				 theme(panel.grid.major = element_blank(),
-							 panel.grid.minor = element_blank(),
-							 panel.background = element_blank(),
-							 axis.text.y = element_text(colour="black"),
-							 axis.text.x= element_text(colour="black",face="bold"),
-							 axis.line.x = element_line(size = 1, colour = "black", linetype = "solid"),
-							 axis.line.y = element_line(size = 1, colour = "black", linetype = "solid"))
+         geom_bar(stat="identity", position = "fill", aes(colour=feature,fill=feature), alpha = 0.3) +
+         xlab("") +
+         ylab("% Feature") +
+         ggtitle("Peak Location Relative to Annotation") +
+         scale_y_continuous(labels = percent_format()) +
+         theme(panel.grid.major = element_blank(),
+               panel.grid.minor = element_blank(),
+               panel.background = element_blank(),
+               axis.text.y = element_text(colour="black"),
+               axis.text.x= element_text(colour="black",face="bold"),
+               axis.line.x = element_line(size = 1, colour = "black", linetype = "solid"),
+               axis.line.y = element_line(size = 1, colour = "black", linetype = "solid"))
 print(plot)
 
 ## DISTANCE TO CLOSEST GENE ACROSS ALL PEAKS STACKED BARPLOT
 plot  <- ggplot(plot.dist.dat, aes(x=variable, y=value, group=distance)) +
-				 geom_bar(stat="identity", position = "fill", aes(colour=distance,fill=distance), alpha = 0.3) +
-				 xlab("") +
-				 ylab("% Unique genes to closest peak") +
-				 ggtitle("Distance of Closest Peak to Gene") +
-				 scale_y_continuous(labels = percent_format()) +
-				 theme(panel.grid.major = element_blank(),
-							 panel.grid.minor = element_blank(),
-							 panel.background = element_blank(),
-							 axis.text.y = element_text(colour="black"),
-							 axis.text.x= element_text(colour="black",face="bold"),
-							 axis.line.x = element_line(size = 1, colour = "black", linetype = "solid"),
-							 axis.line.y = element_line(size = 1, colour = "black", linetype = "solid"))
+         geom_bar(stat="identity", position = "fill", aes(colour=distance,fill=distance), alpha = 0.3) +
+         xlab("") +
+         ylab("% Unique genes to closest peak") +
+         ggtitle("Distance of Closest Peak to Gene") +
+         scale_y_continuous(labels = percent_format()) +
+         theme(panel.grid.major = element_blank(),
+               panel.grid.minor = element_blank(),
+               panel.background = element_blank(),
+               axis.text.y = element_text(colour="black"),
+               axis.text.x= element_text(colour="black",face="bold"),
+               axis.line.x = element_line(size = 1, colour = "black", linetype = "solid"),
+               axis.line.y = element_line(size = 1, colour = "black", linetype = "solid"))
 print(plot)
 
 ## VIOLIN PLOT OF PEAK DISTANCE TO TSS
 plot  <- ggplot(plot.dat, aes(x=name, y=Distance.to.TSS)) +
-				 geom_violin(aes(colour=name,fill=name), alpha = 0.3) +
-				 geom_boxplot(width=0.1) +
-				 xlab("") +
-				 ylab(expression(log[10]*" distance to TSS")) +
-				 ggtitle("Peak Distribution Relative to TSS") +
-				 scale_y_continuous(trans='log10',breaks = trans_breaks("log10", function(x) 10^x), labels = trans_format("log10", math_format(10^.x))) +
-				 theme(legend.position="none",
-							 panel.grid.major = element_blank(),
-							 panel.grid.minor = element_blank(),
-							 panel.background = element_blank(),
-							 axis.text.y = element_text(colour="black"),
-							 axis.text.x= element_text(colour="black",face="bold"),
-							 axis.line.x = element_line(size = 1, colour = "black", linetype = "solid"),
-							 axis.line.y = element_line(size = 1, colour = "black", linetype = "solid"))
+         geom_violin(aes(colour=name,fill=name), alpha = 0.3) +
+         geom_boxplot(width=0.1) +
+         xlab("") +
+         ylab(expression(log[10]*" distance to TSS")) +
+         ggtitle("Peak Distribution Relative to TSS") +
+         scale_y_continuous(trans='log10',breaks = trans_breaks("log10", function(x) 10^x), labels = trans_format("log10", math_format(10^.x))) +
+         theme(legend.position="none",
+               panel.grid.major = element_blank(),
+               panel.grid.minor = element_blank(),
+               panel.background = element_blank(),
+               axis.text.y = element_text(colour="black"),
+               axis.text.x= element_text(colour="black",face="bold"),
+               axis.line.x = element_line(size = 1, colour = "black", linetype = "solid"),
+               axis.line.y = element_line(size = 1, colour = "black", linetype = "solid"))
 print(plot)
 dev.off()