Skip to content

Commit

Permalink
Merge pull request #1 from crickbabs/master
Browse files Browse the repository at this point in the history
Minor bugs and fixes
  • Loading branch information
drpatelh committed Nov 21, 2018
2 parents 5409a37 + 695dc81 commit 4516f13
Show file tree
Hide file tree
Showing 15 changed files with 275 additions and 257 deletions.
4 changes: 2 additions & 2 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -43,9 +43,9 @@ The pipeline is built using [Nextflow](https://www.nextflow.io), a workflow tool
* annotate peaks relative to gene features ([`HOMER`](http://homer.ucsd.edu/homer/download.html))
* create consensus peakset across all samples and create tabular file to aid in the filtering of the data ([`BEDTools`](https://github.com/arq5x/bedtools2/))
* count reads in consensus peaks from replicate-level alignments ([`featureCounts`](http://bioinf.wehi.edu.au/featureCounts/))
* differential binding analysis, PCA and clustering ([`R`](https://www.r-project.org/), [`DESeq2`](https://bioconductor.org/packages/release/bioc/html/DESeq2.html))
* differential accessibility analysis, PCA and clustering ([`R`](https://www.r-project.org/), [`DESeq2`](https://bioconductor.org/packages/release/bioc/html/DESeq2.html))
7. Create IGV session file containing bigWig tracks, peaks and differential sites for data visualisation ([`IGV`](https://software.broadinstitute.org/software/igv/)).
8. Present QC for raw read, alignment, peak-calling and differential binding results ([`MultiQC`](http://multiqc.info/), [`R`](https://www.r-project.org/))
8. Present QC for raw read, alignment, peak-calling and differential accessibility results ([`MultiQC`](http://multiqc.info/), [`R`](https://www.r-project.org/))

### Documentation
The nf-core/atacseq pipeline comes with documentation about the pipeline, found in the `docs/` directory:
Expand Down
2 changes: 2 additions & 0 deletions assets/multiqc/multiqc_config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@ skip_generalstats: true

fn_clean_exts:
- 'fastq.gz'
- '_trimmed'
- '_val'
- 'sorted.bam'
- 'mkD'
Expand All @@ -27,6 +28,7 @@ module_order:
target: ''
path_filters:
- '*val*_fastqc.zip'
- '*trimmed_fastqc.zip'
- samtools:
name: 'SAMTools (library-level; unfiltered)'
info: 'This section of the report shows SAMTools results at the library-level before read filtering.'
Expand Down
Binary file modified assets/nfcore-atacseq_logo.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
182 changes: 91 additions & 91 deletions bin/featurecounts_deseq2.r

Large diffs are not rendered by default.

154 changes: 77 additions & 77 deletions bin/plot_homer_annotatepeaks.r
Original file line number Diff line number Diff line change
Expand Up @@ -18,20 +18,20 @@ library(scales)
################################################

option_list <- list(make_option(c("-i", "--homer_files"), type="character", default=NULL, help="Comma-separated list of homer annotated text files.", metavar="anno_files"),
make_option(c("-s", "--sample_ids"), type="character", default=NULL, help="Comma-separated list of sample ids associated with homer annotated text files. Must be unique and in same order as homer files input.", metavar="sampleids"),
make_option(c("-o", "--outdir"), type="character", default='./', help="Output directory", metavar="path"),
make_option(c("-p", "--outprefix"), type="character", default='homer_annotation', help="Output prefix", metavar="character"))
make_option(c("-s", "--sample_ids"), type="character", default=NULL, help="Comma-separated list of sample ids associated with homer annotated text files. Must be unique and in same order as homer files input.", metavar="sampleids"),
make_option(c("-o", "--outdir"), type="character", default='./', help="Output directory", metavar="path"),
make_option(c("-p", "--outprefix"), type="character", default='homer_annotation', help="Output prefix", metavar="character"))

opt_parser <- OptionParser(option_list=option_list)
opt <- parse_args(opt_parser)

if (is.null(opt$homer_files)){
print_help(opt_parser)
stop("At least one homer annotated file must be supplied", call.=FALSE)
print_help(opt_parser)
stop("At least one homer annotated file must be supplied", call.=FALSE)
}
if (is.null(opt$sample_ids)){
print_help(opt_parser)
stop("Please provide sample ids associated with homer files.", call.=FALSE)
print_help(opt_parser)
stop("Please provide sample ids associated with homer files.", call.=FALSE)
}

if (file.exists(opt$outdir) == FALSE) {
Expand All @@ -41,8 +41,8 @@ if (file.exists(opt$outdir) == FALSE) {
HomerFiles <- unlist(strsplit(opt$homer_files,","))
SampleIDs <- unlist(strsplit(opt$sample_ids,","))
if (length(HomerFiles) != length(SampleIDs)) {
print_help(opt_parser)
stop("Number of sample ids must equal number of homer annotated files.", call.=FALSE)
print_help(opt_parser)
stop("Number of sample ids must equal number of homer annotated files.", call.=FALSE)
}

################################################
Expand All @@ -56,35 +56,35 @@ plot.dist.dat <- data.frame()
plot.feature.dat <- data.frame()
for (idx in 1:length(HomerFiles)) {

sampleid = SampleIDs[idx]
anno.dat <- read.table(HomerFiles[idx], sep="\t", header=TRUE)
anno.dat <- anno.dat[,c("Annotation","Distance.to.TSS","Nearest.PromoterID")]
anno.dat <- anno.dat[which(!is.na(anno.dat$Distance.to.TSS)),]
if (nrow(anno.dat) == 0) {
quit(save = "no", status = 0, runLast = FALSE)
}
anno.dat$name <- rep(sampleid,nrow(anno.dat))
anno.dat$Distance.to.TSS <- abs(anno.dat$Distance.to.TSS) + 1
plot.dat <- rbind(plot.dat,anno.dat)

## GET ANNOTATION COUNTS
anno.freq <- as.character(lapply(strsplit(as.character(anno.dat$Annotation)," "), function(x) x[1]))
anno.freq <- as.data.frame(table(anno.freq))
colnames(anno.freq) <- c("feature",sampleid)
anno.melt <- melt(anno.freq)
plot.feature.dat <- rbind(plot.feature.dat,anno.melt)

## GET CLOSEST INSTANCE OF GENE TO ANY GIVEN PEAK
unique.gene.dat <- anno.dat[order(anno.dat$Distance.to.TSS),]
unique.gene.dat <- unique.gene.dat[!duplicated(unique.gene.dat$Nearest.PromoterID), ]
dist.freq <- rep("> 10kb",nrow(unique.gene.dat))
dist.freq[which(unique.gene.dat$Distance.to.TSS < 10000)] <- "< 10kb"
dist.freq[which(unique.gene.dat$Distance.to.TSS < 5000)] <- "< 5kb"
dist.freq[which(unique.gene.dat$Distance.to.TSS < 2000)] <- "< 2kb"
dist.freq <- as.data.frame(table(dist.freq))
colnames(dist.freq) <- c("distance",sampleid)
dist.melt <- melt(dist.freq)
plot.dist.dat <- rbind(plot.dist.dat,dist.melt)
sampleid = SampleIDs[idx]
anno.dat <- read.table(HomerFiles[idx], sep="\t", header=TRUE)
anno.dat <- anno.dat[,c("Annotation","Distance.to.TSS","Nearest.PromoterID")]
anno.dat <- anno.dat[which(!is.na(anno.dat$Distance.to.TSS)),]
if (nrow(anno.dat) == 0) {
quit(save = "no", status = 0, runLast = FALSE)
}
anno.dat$name <- rep(sampleid,nrow(anno.dat))
anno.dat$Distance.to.TSS <- abs(anno.dat$Distance.to.TSS) + 1
plot.dat <- rbind(plot.dat,anno.dat)

## GET ANNOTATION COUNTS
anno.freq <- as.character(lapply(strsplit(as.character(anno.dat$Annotation)," "), function(x) x[1]))
anno.freq <- as.data.frame(table(anno.freq))
colnames(anno.freq) <- c("feature",sampleid)
anno.melt <- melt(anno.freq)
plot.feature.dat <- rbind(plot.feature.dat,anno.melt)

## GET CLOSEST INSTANCE OF GENE TO ANY GIVEN PEAK
unique.gene.dat <- anno.dat[order(anno.dat$Distance.to.TSS),]
unique.gene.dat <- unique.gene.dat[!duplicated(unique.gene.dat$Nearest.PromoterID), ]
dist.freq <- rep("> 10kb",nrow(unique.gene.dat))
dist.freq[which(unique.gene.dat$Distance.to.TSS < 10000)] <- "< 10kb"
dist.freq[which(unique.gene.dat$Distance.to.TSS < 5000)] <- "< 5kb"
dist.freq[which(unique.gene.dat$Distance.to.TSS < 2000)] <- "< 2kb"
dist.freq <- as.data.frame(table(dist.freq))
colnames(dist.freq) <- c("distance",sampleid)
dist.melt <- melt(dist.freq)
plot.dist.dat <- rbind(plot.dist.dat,dist.melt)

}
levels(plot.dat$name) <- sort(unique(as.character(plot.dat$name)))
Expand All @@ -97,7 +97,7 @@ write.table(summary.dat,file=file.path(opt$outdir,paste(opt$outprefix,".summary.

################################################
################################################
## PLOTS ##
## PLOTS ##
################################################
################################################

Expand All @@ -106,52 +106,52 @@ pdf(PlotFile,height=6,width=3*length(HomerFiles))

## FEATURE COUNT STACKED BARPLOT
plot <- ggplot(plot.feature.dat, aes(x=variable, y=value, group=feature)) +
geom_bar(stat="identity", position = "fill", aes(colour=feature,fill=feature), alpha = 0.3) +
xlab("") +
ylab("% Feature") +
ggtitle("Peak Location Relative to Annotation") +
scale_y_continuous(labels = percent_format()) +
theme(panel.grid.major = element_blank(),
panel.grid.minor = element_blank(),
panel.background = element_blank(),
axis.text.y = element_text(colour="black"),
axis.text.x= element_text(colour="black",face="bold"),
axis.line.x = element_line(size = 1, colour = "black", linetype = "solid"),
axis.line.y = element_line(size = 1, colour = "black", linetype = "solid"))
geom_bar(stat="identity", position = "fill", aes(colour=feature,fill=feature), alpha = 0.3) +
xlab("") +
ylab("% Feature") +
ggtitle("Peak Location Relative to Annotation") +
scale_y_continuous(labels = percent_format()) +
theme(panel.grid.major = element_blank(),
panel.grid.minor = element_blank(),
panel.background = element_blank(),
axis.text.y = element_text(colour="black"),
axis.text.x= element_text(colour="black",face="bold"),
axis.line.x = element_line(size = 1, colour = "black", linetype = "solid"),
axis.line.y = element_line(size = 1, colour = "black", linetype = "solid"))
print(plot)

## DISTANCE TO CLOSEST GENE ACROSS ALL PEAKS STACKED BARPLOT
plot <- ggplot(plot.dist.dat, aes(x=variable, y=value, group=distance)) +
geom_bar(stat="identity", position = "fill", aes(colour=distance,fill=distance), alpha = 0.3) +
xlab("") +
ylab("% Unique genes to closest peak") +
ggtitle("Distance of Closest Peak to Gene") +
scale_y_continuous(labels = percent_format()) +
theme(panel.grid.major = element_blank(),
panel.grid.minor = element_blank(),
panel.background = element_blank(),
axis.text.y = element_text(colour="black"),
axis.text.x= element_text(colour="black",face="bold"),
axis.line.x = element_line(size = 1, colour = "black", linetype = "solid"),
axis.line.y = element_line(size = 1, colour = "black", linetype = "solid"))
geom_bar(stat="identity", position = "fill", aes(colour=distance,fill=distance), alpha = 0.3) +
xlab("") +
ylab("% Unique genes to closest peak") +
ggtitle("Distance of Closest Peak to Gene") +
scale_y_continuous(labels = percent_format()) +
theme(panel.grid.major = element_blank(),
panel.grid.minor = element_blank(),
panel.background = element_blank(),
axis.text.y = element_text(colour="black"),
axis.text.x= element_text(colour="black",face="bold"),
axis.line.x = element_line(size = 1, colour = "black", linetype = "solid"),
axis.line.y = element_line(size = 1, colour = "black", linetype = "solid"))
print(plot)

## VIOLIN PLOT OF PEAK DISTANCE TO TSS
plot <- ggplot(plot.dat, aes(x=name, y=Distance.to.TSS)) +
geom_violin(aes(colour=name,fill=name), alpha = 0.3) +
geom_boxplot(width=0.1) +
xlab("") +
ylab(expression(log[10]*" distance to TSS")) +
ggtitle("Peak Distribution Relative to TSS") +
scale_y_continuous(trans='log10',breaks = trans_breaks("log10", function(x) 10^x), labels = trans_format("log10", math_format(10^.x))) +
theme(legend.position="none",
panel.grid.major = element_blank(),
panel.grid.minor = element_blank(),
panel.background = element_blank(),
axis.text.y = element_text(colour="black"),
axis.text.x= element_text(colour="black",face="bold"),
axis.line.x = element_line(size = 1, colour = "black", linetype = "solid"),
axis.line.y = element_line(size = 1, colour = "black", linetype = "solid"))
geom_violin(aes(colour=name,fill=name), alpha = 0.3) +
geom_boxplot(width=0.1) +
xlab("") +
ylab(expression(log[10]*" distance to TSS")) +
ggtitle("Peak Distribution Relative to TSS") +
scale_y_continuous(trans='log10',breaks = trans_breaks("log10", function(x) 10^x), labels = trans_format("log10", math_format(10^.x))) +
theme(legend.position="none",
panel.grid.major = element_blank(),
panel.grid.minor = element_blank(),
panel.background = element_blank(),
axis.text.y = element_text(colour="black"),
axis.text.x= element_text(colour="black",face="bold"),
axis.line.x = element_line(size = 1, colour = "black", linetype = "solid"),
axis.line.y = element_line(size = 1, colour = "black", linetype = "solid"))
print(plot)
dev.off()

Expand Down
Loading

0 comments on commit 4516f13

Please sign in to comment.