From b4168c3b625183bb0723a511bd0acdbb65624fcb Mon Sep 17 00:00:00 2001 From: Sascha Steinbiss Date: Sun, 12 Jul 2015 13:35:39 +0100 Subject: [PATCH 1/6] tweak Sanger use parameters --- annot.nf | 2 +- loc_sanger_farm.config | 2 ++ 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/annot.nf b/annot.nf index bf05bed..ded0e36 100644 --- a/annot.nf +++ b/annot.nf @@ -191,7 +191,7 @@ if (params.run_exonerate) { -join -type CDS -translate -retainids 1 > ref.pep """ } - exn_prot_chunk = ref_pep.splitFasta( by: 20) + exn_prot_chunk = ref_pep.splitFasta( by: 100) exn_genome_chunk = pseudochr_seq_exonerate.splitFasta( by: 3) process run_exonerate { cache 'deep' diff --git a/loc_sanger_farm.config b/loc_sanger_farm.config index 99ce198..6f43088 100644 --- a/loc_sanger_farm.config +++ b/loc_sanger_farm.config @@ -22,6 +22,8 @@ process.$contiguate_pseudochromosomes.memory = "8 GB" process.$contiguate_pseudochromosomes.clusterOptions = " -R 'select[mem>8192] rusage[mem=8192]' " process.$run_ratt.memory = "8 GB" process.$run_ratt.clusterOptions = " -R 'select[mem>8192] rusage[mem=8192]' " +process.$run_exonerate.memory = "4 GB" +process.$run_exonerate.clusterOptions = " -R 'select[mem>4096] rusage[mem=4096]' " process.$run_augustus_pseudo.memory = "2 GB" process.$run_augustus_pseudo.clusterOptions = " -R 'select[mem>2048] rusage[mem=2048]' " process.$predict_ncRNA.memory = "8 GB" From c4f63125621691cfa04a575bafdf31569964109a Mon Sep 17 00:00:00 2001 From: Sascha Steinbiss Date: Mon, 13 Jul 2015 10:36:52 +0100 Subject: [PATCH 2/6] tweak memory usage for exonerate runs --- loc_sanger_farm.config | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/loc_sanger_farm.config b/loc_sanger_farm.config index 6f43088..9eb0f60 100644 --- a/loc_sanger_farm.config +++ b/loc_sanger_farm.config @@ -22,8 +22,8 @@ process.$contiguate_pseudochromosomes.memory = "8 GB" process.$contiguate_pseudochromosomes.clusterOptions = " -R 'select[mem>8192] rusage[mem=8192]' " process.$run_ratt.memory = "8 GB" process.$run_ratt.clusterOptions = " -R 'select[mem>8192] rusage[mem=8192]' " -process.$run_exonerate.memory = "4 GB" -process.$run_exonerate.clusterOptions = " -R 'select[mem>4096] rusage[mem=4096]' " +process.$run_exonerate.memory = "6 GB" +process.$run_exonerate.clusterOptions = " -R 'select[mem>6144] rusage[mem=6144]' " process.$run_augustus_pseudo.memory = "2 GB" process.$run_augustus_pseudo.clusterOptions = " -R 'select[mem>2048] rusage[mem=2048]' " process.$predict_ncRNA.memory = "8 GB" From fb5857e4ef61f134d5e91bdf0e617a02081f8a11 Mon Sep 17 00:00:00 2001 From: Sascha Steinbiss Date: Tue, 14 Jul 2015 10:57:30 +0100 Subject: [PATCH 3/6] allow single exonerate runs to fail --- annot.nf | 2 ++ 1 file changed, 2 insertions(+) diff --git a/annot.nf b/annot.nf index ded0e36..9302d56 100644 --- a/annot.nf +++ b/annot.nf @@ -195,6 +195,8 @@ if (params.run_exonerate) { exn_genome_chunk = pseudochr_seq_exonerate.splitFasta( by: 3) process run_exonerate { cache 'deep' + // this process can fail for rogue exonerate processes + errorStrategy 'ignore' input: set file('genome.fasta'), file('prot.fasta') from exn_genome_chunk.spread(exn_prot_chunk) From 8368fd7eaa54f081e213ad7c59ba393066c25bc7 Mon Sep 17 00:00:00 2001 From: Sascha Steinbiss Date: Tue, 14 Jul 2015 10:57:50 +0100 Subject: [PATCH 4/6] increase Sanger farm memory allowance --- loc_sanger_farm.config | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/loc_sanger_farm.config b/loc_sanger_farm.config index 9eb0f60..213784b 100644 --- a/loc_sanger_farm.config +++ b/loc_sanger_farm.config @@ -20,10 +20,12 @@ process.memory = "2 GB" process.clusterOptions = " -R 'select[mem>2048] rusage[mem=2048]' " process.$contiguate_pseudochromosomes.memory = "8 GB" process.$contiguate_pseudochromosomes.clusterOptions = " -R 'select[mem>8192] rusage[mem=8192]' " +process.$blast_for_circos.memory = "8 GB" +process.$blast_for_circos.clusterOptions = " -R 'select[mem>8192] rusage[mem=8192]' " process.$run_ratt.memory = "8 GB" process.$run_ratt.clusterOptions = " -R 'select[mem>8192] rusage[mem=8192]' " -process.$run_exonerate.memory = "6 GB" -process.$run_exonerate.clusterOptions = " -R 'select[mem>6144] rusage[mem=6144]' " +process.$run_exonerate.memory = "8 GB" +process.$run_exonerate.clusterOptions = " -R 'select[mem>8192] rusage[mem=8192]' " process.$run_augustus_pseudo.memory = "2 GB" process.$run_augustus_pseudo.clusterOptions = " -R 'select[mem>2048] rusage[mem=2048]' " process.$predict_ncRNA.memory = "8 GB" From 5cb4c89c538da9318be98c965dee9bc53b63cf87 Mon Sep 17 00:00:00 2001 From: Sascha Steinbiss Date: Tue, 14 Jul 2015 10:58:27 +0100 Subject: [PATCH 5/6] only transfer HMMER hits inside parent polypeptide --- bin/iproscan_gff3_merge.lua | 25 +++++++++++++++++-------- 1 file changed, 17 insertions(+), 8 deletions(-) diff --git a/bin/iproscan_gff3_merge.lua b/bin/iproscan_gff3_merge.lua index 4230e44..a3cfb25 100755 --- a/bin/iproscan_gff3_merge.lua +++ b/bin/iproscan_gff3_merge.lua @@ -142,16 +142,25 @@ function annotate_vis:visit_feature(fn) for k,v in pairs(hits) do if not FILTERED_SOURCES[k] then for _,n in ipairs(v) do - rng = aminoloc_to_dnaloc(fn, n:get_range(), n:get_strand()) - new_node = gt.feature_node_new(fn:get_seqid(), "protein_match", - rng[1], rng[2], fn:get_strand()) - new_node:set_source(k) - for attr, attrv in n:attribute_pairs() do - if not FILTERED_ATTRIBS[attr] then - new_node:set_attribute(attr, string.gsub(attrv, "\"","")) + local rng = aminoloc_to_dnaloc(fn, n:get_range(), n:get_strand()) + if fn:get_range():contains(gt.range_new(rng[1],rng[2])) then + local new_node = gt.feature_node_new(fn:get_seqid(), + "protein_match", + rng[1], rng[2], + fn:get_strand()) + new_node:set_source(k) + for attr, attrv in n:attribute_pairs() do + if not FILTERED_ATTRIBS[attr] then + new_node:set_attribute(attr, string.gsub(attrv, "\"","")) + end end + fn:add_child(new_node) + else + io.stderr:write("coordinates for feature outside of parent: " + .. tostring(fn:get_range()) .. " vs. " + .. tostring(rng) .. " -- not attaching to " + .. "polypeptide parent") end - fn:add_child(new_node) end end end From f234ba7f62490179e7fe73bce19f614e215112a4 Mon Sep 17 00:00:00 2001 From: Sascha Steinbiss Date: Tue, 14 Jul 2015 10:58:44 +0100 Subject: [PATCH 6/6] only sample single transcript reference genes --- bin/sample_ref_genes.lua | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/bin/sample_ref_genes.lua b/bin/sample_ref_genes.lua index a09cd9b..c189179 100755 --- a/bin/sample_ref_genes.lua +++ b/bin/sample_ref_genes.lua @@ -20,7 +20,7 @@ math.randomseed(os.time()) function usage() - io.stderr:write("Randomly samples a number of protein coding gene CCs.\n") + io.stderr:write("Randomly samples a number of single transcript protein coding gene CCs.\n") io.stderr:write(string.format("Usage: %s " .. "\n" , arg[0])) os.exit(1) @@ -36,17 +36,19 @@ cv.out = nil function cv:visit_feature(fn) local gene = false local mrna = false + local nof_transcripts = 0 local cds = false for n in fn:get_children() do if n:get_type() == "gene" then gene = true elseif n:get_type() == "mRNA" then mrna = true + nof_transcripts = nof_transcripts + 1 elseif n:get_type() == "CDS" then cds = true end end - if gene and mrna and cds then + if gene and mrna and nof_transcripts == 1 and cds then self.out = fn else self.out = nil