Skip to content
This repository has been archived by the owner on Nov 7, 2021. It is now read-only.

'Finish line' fixes towards ENA validity #43

Merged
merged 8 commits into from
Jan 20, 2016
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
18 changes: 3 additions & 15 deletions annot.nf
Original file line number Diff line number Diff line change
Expand Up @@ -62,7 +62,7 @@ process sanitize_input {
file 'sanitized.fasta' into sanitized_genome_file

"""
sed 's/['\\''+ &]/_/g' ${genome_file} > sanitized.fasta
sed 's/['\\''+ &]/_/g' ${genome_file} | trim_wildcards.lua > sanitized.fasta
"""
}

Expand All @@ -78,9 +78,6 @@ if (params.do_contiguation) {
file sanitized_genome_file
file ref_chr
file ref_dir
val params.ref_species
val params.GENOME_PREFIX
val params.ABACAS_BIN_CHR

output:
file 'pseudo.pseudochr.fasta' into pseudochr_seq
Expand All @@ -92,7 +89,8 @@ if (params.do_contiguation) {

"""
abacas2.nonparallel.sh \
${ref_chr} ${sanitized_genome_file} 500 85 0 3000
"${ref_chr}" "${sanitized_genome_file}" "${params.ABACAS_MATCH_SIZE}" \
"${params.ABACAS_MATCH_SIM}" 0 3000
abacas_combine.lua . pseudo "${ref_dir}" "${params.ref_species}" \
"${params.GENOME_PREFIX}" "${params.ABACAS_BIN_CHR}" \
"${params.GENOME_PREFIX}"
Expand Down Expand Up @@ -589,7 +587,6 @@ process integrate_genemodels {
input:
file 'merged.gff3' from merged_gff3
file 'sequence.fasta' from pseudochr_seq_integrate
val params.WEIGHT_FILE

output:
file 'integrated.gff3' into integrated_gff3
Expand Down Expand Up @@ -787,7 +784,6 @@ process split_splice_models_at_gaps {
process add_polypeptides {
input:
file 'input.gff3' from genemodels_with_gaps_split_gff3
val params.CHR_PATTERN

output:
file 'output.gff3' into genemodels_with_polypeptides_gff3
Expand Down Expand Up @@ -854,7 +850,6 @@ proteins_target.into{ proteins_orthomcl
process make_ref_input_for_orthomcl {
input:
file omcl_pepfile
val params.ref_species

output:
file 'out.gg' into gg_file
Expand All @@ -873,7 +868,6 @@ process make_ref_input_for_orthomcl {
process make_target_input_for_orthomcl {
input:
file 'pepfile.fas' from proteins_orthomcl
val params.GENOME_PREFIX

output:
file 'out.gg' into gg_file_ref
Expand Down Expand Up @@ -1135,9 +1129,7 @@ if (params.use_reference) {
input:
file 'in.protein.fasta' from refcomp_protein_in
set file('pseudo.in.annotation.gff3'), file('scafs.in.annotation.gff3') from refcomp_gff3_in
val params.GENOME_PREFIX
file ref_dir
val params.ref_species

output:
file 'tree_selection.fasta' into tree_fasta
Expand Down Expand Up @@ -1245,9 +1237,6 @@ if (params.do_contiguation && params.do_circos) {
file 'refannot.gff3' from ref_annot
file 'blast.in' from circos_blastout
file ref_dir
val params.ref_species
val params.CHR_PATTERN
val params.ABACAS_BIN_CHR

output:
file 'links.txt' into circos_input_links
Expand Down Expand Up @@ -1405,7 +1394,6 @@ process make_report {
input:
set file('pseudo.fasta.gz'), file('scaf.fasta.gz') from report_inseq
set file('pseudo.gff3'), file('scaf.gff3') from report_gff3
val params.SPECK_TEMPLATE
val specfile

output:
Expand Down
4 changes: 2 additions & 2 deletions bin/abacas_combine.lua
Original file line number Diff line number Diff line change
Expand Up @@ -198,7 +198,7 @@ table.sort(newkeys)
-- for all toplevel seqs...
for _,seqid in ipairs(newkeys) do
pseudochr_fasta_out:write(">" .. seqid .. "\n")
print_max_width(pseudochr_seq[seqid], pseudochr_fasta_out, 60)
print_max_width(trim_ns(pseudochr_seq[seqid]), pseudochr_fasta_out, 60)
print(seqid .. ": " .. #scafs[seqid])
local i = 1
local s_last_stop = 0
Expand Down Expand Up @@ -252,4 +252,4 @@ end

for k,v in pairs(ref_target_chromosome_map) do
ref_target_mapping_out:write(k .. "\t" .. v[1] .. "\t" .. v[2] .. "\n")
end
end
75 changes: 53 additions & 22 deletions bin/gff3_to_embl.lua
Original file line number Diff line number Diff line change
Expand Up @@ -166,6 +166,8 @@ function embl_vis:visit_feature(fn)
io.write("XX \n")
io.write("AC XXX;\n")
io.write("XX \n")
io.write("AC * _" .. fn:get_seqid() .. ";\n")
io.write("XX \n")
else
io.write("ID " .. fn:get_seqid() .. "; SV 1; linear; "
.. "genomic DNA; STD; UNC; "
Expand Down Expand Up @@ -202,6 +204,9 @@ function embl_vis:visit_feature(fn)
cnt = cnt + 1
end
end
if cnt == 0 then
break
end
io.write("FT CDS ")
if node:get_strand() == "-" then
io.write("complement(")
Expand All @@ -211,14 +216,26 @@ function embl_vis:visit_feature(fn)
end
local i = 1
local coding_length = 0
local start_phase = 0
local end_phase = 0
for cds in node:get_children() do
if cds:get_type() == "CDS" or cds:get_type() == "pseudogenic_exon" then
if i == 1 and fn:get_attribute("Start_range") then
if fn:get_strand() == '-' then
end_phase = tonumber(cds:get_phase())
else
start_phase = tonumber(cds:get_phase())
end
io.write("<")
end
io.write(cds:get_range():get_start())
io.write("..")
if i == cnt and fn:get_attribute("End_range") then
if fn:get_strand() == '-' then
start_phase = tonumber(cds:get_phase())
else
end_phase = tonumber(cds:get_phase())
end
io.write(">")
end
io.write(cds:get_range():get_end())
Expand All @@ -240,28 +257,17 @@ function embl_vis:visit_feature(fn)
format_embl_attrib(node , "ID", "locus_tag", nil)
if fn:get_type() == "pseudogene" then
io.write("FT /pseudo\n")
format_embl_attrib(pp, "product", "note",
function (s)
local pr_a = gff3_extract_structure(s)
local gprod = pr_a[1].term
if gprod then
return "product: " .. gprod
else
return nil
end
end)
else
format_embl_attrib(pp, "product", "product",
function (s)
local pr_a = gff3_extract_structure(s)
local gprod = pr_a[1].term
if gprod then
return gprod
else
return nil
end
end)
end
format_embl_attrib(pp, "product", "product",
function (s)
local pr_a = gff3_extract_structure(s)
local gprod = pr_a[1].term
if gprod then
return gprod
else
return nil
end
end)
format_embl_attrib(pp, "Dbxref", "EC_number",
function (s)
m = string.match(s, "EC:([0-9.-]+)")
Expand All @@ -279,9 +285,31 @@ function embl_vis:visit_feature(fn)
-- translation
local protseq = nil
if node:get_type() == "mRNA" then
cdnaseq = node:extract_sequence("CDS", true, region_mapping)
protseq = node:extract_and_translate_sequence("CDS", true,
region_mapping)
io.write("FT /translation=\"" .. protseq:sub(1,-2) .."\"\n")
if embl_compliant and cdnaseq:len() % 3 > 0 then
-- The ENA expects translations for DNA sequences with a length
-- which is not a multiple of three in a different way from the one
-- produced by GenomeTools. While GenomeTools cuts off the
-- remainder and does not translate the partial codon, the ENA
-- validator fills it up with Ns and translates it. This behaviour
-- is emulated here to make the \translation value validate
-- properly.
cdnaseq = cdnaseq .. string.rep('n', 3 - (cdnaseq:len() % 3))
if start_phase > 0 then
cdnaseq = cdnaseq:sub(start_phase + 1)
end
-- print(">"..geneid .."\n"..protseq)
protseq = gt.translate_dna(cdnaseq)
end

-- clip off stop codon
if protseq:sub(-1,-1) == "*" then
protseq = protseq:sub(1,-2)
end
io.write("FT /translation=\"" .. protseq .."\"\n")
io.write("FT /codon_start=" .. start_phase + 1 .. "\n")
end
io.write("FT /transl_table=1\n")
-- orthologs
Expand Down Expand Up @@ -348,6 +376,7 @@ function embl_vis:visit_feature(fn)
io.write(" (" .. node:get_attribute("anticodon") .. ")")
end
io.write("\"\n")
io.write("FT /gene=\"" .. fn:get_attribute("ID") .. "\"\n")
end
format_embl_attrib(node , "ID", "locus_tag", nil)
elseif string.match(node:get_type(), "snRNA") or string.match(node:get_type(), "snoRNA") then
Expand All @@ -361,6 +390,7 @@ function embl_vis:visit_feature(fn)
end
io.write("\n")
io.write("FT /ncRNA_class=\"" .. node:get_type() .. "\"\n")
io.write("FT /gene=\"" .. fn:get_attribute("ID") .. "\"\n")
format_embl_attrib(node , "ID", "locus_tag", nil)
elseif string.match(node:get_type(), "rRNA") then
io.write("FT rRNA ")
Expand All @@ -373,6 +403,7 @@ function embl_vis:visit_feature(fn)
end
io.write("\n")
io.write("FT /product=\"" .. node:get_type() .. "\"\n")
io.write("FT /gene=\"" .. fn:get_attribute("ID") .. "\"\n")
format_embl_attrib(node , "ID", "locus_tag", nil)
elseif string.match(node:get_type(), "gap") then
io.write("FT gap ")
Expand Down
7 changes: 6 additions & 1 deletion bin/lib.lua
Original file line number Diff line number Diff line change
Expand Up @@ -278,6 +278,11 @@ function deep_copy(root, copy, table)
return copy
end

function trim_ns(inseq)
inseq = inseq:gsub("^[Nn]+", ""):gsub("[Nn]+$", "")
return inseq
end

function print_max_width(str, ioo, width)
local i = 1
while (i + width - 1) < str:len() do
Expand Down Expand Up @@ -368,4 +373,4 @@ function print_r ( t )
sub_print_r(t," ")
end
print()
end
end
15 changes: 9 additions & 6 deletions bin/split_genes_at_gaps.lua
Original file line number Diff line number Diff line change
Expand Up @@ -63,16 +63,20 @@ function stream:process_current_cluster()

-- count and collect gaps between scaffolds -- these always need to be broken
for _,n in ipairs(self.curr_gene_set) do
if n:get_type() == "gap"
and n:get_attribute("gap_type") == "between scaffolds" then
table.insert(gaps, n)
if n:get_type() == "gap" then
table.insert(stream.outqueue, n)
if n:get_attribute("gap_type") == "between scaffolds" then
table.insert(gaps, n)
end
end
end

-- no gaps in cluster, so just pass along genes
-- no relevant gaps in cluster, so just pass along genes
if #gaps == 0 then
for _,n in ipairs(self.curr_gene_set) do
table.insert(stream.outqueue, n)
if n:get_type() ~= "gap" then
table.insert(stream.outqueue, n)
end
end
return 0
end
Expand Down Expand Up @@ -132,7 +136,6 @@ function stream:process_current_cluster()
end
if not skip_gene then
table.insert(outbuf, cur_gene)
table.insert(outbuf, g)
end
cur_gene = rest
end
Expand Down
57 changes: 57 additions & 0 deletions bin/trim_wildcards.lua
Original file line number Diff line number Diff line change
@@ -0,0 +1,57 @@
#!/usr/bin/env gt

--[[
Author: Sascha Steinbiss <ss34@sanger.ac.uk>
Copyright (c) 2015 Genome Research Ltd

Permission to use, copy, modify, and distribute this software for any
purpose with or without fee is hereby granted, provided that the above
copyright notice and this permission notice appear in all copies.

THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
]]

package.path = gt.script_dir .. "/?.lua;" .. package.path
require("lib")

function chomp(s)
return string.gsub(s, "\n$", "")
end

function usage()
io.stderr:write(string.format("Usage: %s\n", arg[0]))
io.stderr:write("Removes leading/trailing Ns from input FASTA sequences.\n")
os.exit(1)
end

local i, j = 0, 0
hdr = ""
seq = {}

for l in io.lines() do
if string.sub(l,1,1) == '>' then
if i > 0 then
j = j + 1
i = 0
print(">"..hdr)
thisseq = trim_ns(table.concat(seq,""))
print_max_width(thisseq, io.stdout, 60)
end
hdr = string.sub(l, 2)
seq = {}
i = i + 1
else
table.insert(seq, tostring(chomp(l)))
end
end
if #seq > 0 then
print(">"..hdr)
thisseq = trim_ns(table.concat(seq,""))
print_max_width(thisseq, io.stdout, 60)
end
Loading