Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Mfannot - big improvement in handling this conversion #483

Merged
merged 9 commits into from
Sep 9, 2024
610 changes: 288 additions & 322 deletions bin/agat_convert_mfannot2gff.pl

Large diffs are not rendered by default.

67 changes: 49 additions & 18 deletions lib/AGAT/OmniscientI.pm
Original file line number Diff line number Diff line change
Expand Up @@ -285,7 +285,7 @@ sub slurp_gff3_file_JD {
# ============================> HASH CASE <============================
elsif(ref($file) eq 'HASH'){

foreach my $level (keys %{$file}){
foreach my $level ( sort { ncmp ($a, $b) } keys %{$file}){
# save header if any
if ($level eq 'other'){
foreach my $thing( keys %{$file->{'other'} } ){
Expand All @@ -301,10 +301,10 @@ sub slurp_gff3_file_JD {
next;
}
if ( ref($file->{$level}) eq 'HASH'){ #Header,level1,level2,#level3
foreach my $tag (keys %{$file->{$level}}){
foreach my $id (keys %{$file->{$level}{$tag}}){
foreach my $tag ( sort { ncmp ($a, $b) } keys %{$file->{$level}}){
foreach my $id ( sort { ncmp ($a, $b) } keys %{$file->{$level}{$tag}}){
if ( ref($file->{$level}{$tag}{$id}) eq 'ARRAY'){ #level2,#level3
foreach my $feature ( @{$file->{$level}{$tag}{$id} }){
foreach my $feature ( sort { ncmp ($a->start."|".$a->end.$a->_tag_value('ID'), $b->start."|".$b->end.$b->_tag_value('ID') ) } @{$file->{$level}{$tag}{$id} }){
($locusTAGvalue, $last_l1_f, $last_l2_f, $last_l3_f, $last_f, $lastL1_new) =
manage_one_feature($ontology, $feature, \%omniscient, \%mRNAGeneLink, \%duplicate, \%hashID, \%locusTAG, \%infoSequential, \%attachedL2Sequential, $locusTAGvalue, $last_l1_f, $last_l2_f, $last_l3_f, $last_f, $lastL1_new, $verbose, $log, $debug);
}
Expand All @@ -318,7 +318,7 @@ sub slurp_gff3_file_JD {
}
}
else{ #extra list of feature
foreach my $feature ( @{$file->{$level}} ) {
foreach my $feature ( sort { ncmp ($a->start."|".$a->end.$a->_tag_value('ID'), $b->start."|".$b->end.$b->_tag_value('ID') ) } @{$file->{$level}} ) {
($locusTAGvalue, $last_l1_f, $last_l2_f, $last_l3_f, $last_f, $lastL1_new) =
manage_one_feature($ontology, $feature, \%omniscient, \%mRNAGeneLink, \%duplicate, \%hashID, \%locusTAG, \%infoSequential, \%attachedL2Sequential, $locusTAGvalue, $last_l1_f, $last_l2_f, $last_l3_f, $last_f, $lastL1_new, $verbose, $log, $debug);
}
Expand Down Expand Up @@ -952,7 +952,7 @@ sub manage_one_feature{
if(!$last_l1_f and $last_l3_f){ #particular case : Two l3 that follow each other, but first one has locus_tag but not the second
dual_print ($log, "::::::::::Push-L3-sequential-2 $last_locusTAGvalue || ".lc($l2_id)." || level3 == ".$feature->gff_string."\n", $verbose) if ($debug);
push( @{$infoSequential->{'locus'}{lc($last_locusTAGvalue)}{lc($l2_id)}{'level3'}}, $feature );
$infoSequential->{'id'}{lc($l2_id)} = $l2_id;
$infoSequential->{'id'}{lc($l2_id)} = $l2_id;#}
return $last_locusTAGvalue, $last_l1_f, $last_l2_f, $feature, $feature, $lastL1_new;
}
else{
Expand Down Expand Up @@ -2842,21 +2842,47 @@ sub _deinterleave_sequential{
dual_print($log, "$resume_case cases of interleaved locus. ( Features of the locus data are defined earlier in the file.\n", $verbose) if($resume_case);
}

# Same locus but L3 and L2 have been put in two different buckets because L2 was met later in the file another parent have been created by AGAT
sub _fixL2andL3inDifferentBucketBecauseL2wasMetLaterInTheFile{
my ($infoSequential) = @_;
my $resume_case=undef;

foreach my $locus ( sort { ncmp($a,$b) } keys %{$infoSequential->{'locus'}} ){
foreach my $bucket (sort { ncmp($a,$b) } keys %{$infoSequential->{'locus'}{$locus} } ){
if( exists_keys($infoSequential,("locus", $locus, $bucket, 'level3') ) and ! exists_keys($infoSequential,("locus", $locus, $bucket, 'level2') ) ){

foreach my $bucket2 (sort { ncmp($a,$b) } keys %{$infoSequential->{'locus'}{$locus}} ){
if ($bucket ne $bucket2){
if (exists_keys($infoSequential,("locus", $locus, $bucket2, 'level2') ) ){
#Push to allow to add different bucket in the level2 bucket
foreach my $feature ( sort { ncmp ($a->start."|".$a->end.$a->_tag_value('ID'), $b->start."|".$b->end.$b->_tag_value('ID') ) } @{$infoSequential->{'locus'}{ $locus }{$bucket}{'level3'} } ){
push( @{$infoSequential->{'locus'}{lc($locus)}{lc($bucket2)}{'level3'}}, $feature );
}
delete $infoSequential->{'locus'}{ $locus }{$bucket}{'level3'};
if( !%{$infoSequential->{'locus'}{ $locus }{$bucket}} ) { delete $infoSequential->{'locus'}{ $locus }{$bucket}; }# remove this hash if empty
}
}
}
}
}
}
}

#
#
# BUCKET = LOCUS_ID
# All Level1 feature are for sure in omniscient, we have only the ID in sequential,
# other level feature are in sequential only if no parent has been found
sub _check_sequential{ # Goes through from L3 to l1
my ($debug, $log, $infoSequential, $omniscient, $hashID, $locusTAG_uniq, $mRNAGeneLink, $verbose) = @_;
my $resume_case_l2=0;
my $resume_case_l3=0;


_deinterleave_sequential($infoSequential, $locusTAG_uniq, $verbose, $log); # PART OF LOCUS LOST BEFORE TO MEET ITS L2 or L1 ... we catch them and re-link everything as it should be
_fixL2andL3inDifferentBucketBecauseL2wasMetLaterInTheFile( $infoSequential);

foreach my $locusNameHIS ( sort { ncmp($a,$b) } keys %{$infoSequential->{'locus'}} ){ #comon tag was l1 id when no real comon tag present

foreach my $bucket (sort { ncmp($a,$b) } keys %{$infoSequential->{'locus'}{$locusNameHIS} } ){ #bucket = level1 or Id L2
foreach my $bucket (sort { ncmp($a,$b) } keys %{$infoSequential->{'locus'}{$locusNameHIS} } ){ #bucket = level1 or Id L2

dual_print($log, "\nlocusNameHIS $locusNameHIS bucket $bucket\n", $verbose) if($debug);

Expand All @@ -2865,7 +2891,7 @@ sub _check_sequential{ # Goes through from L3 to l1
my $must_create_l2=undef;
my $feature_l2 = undef;

#Bucket is an uniq ID created during the reading process. So it can be used as uniq ID.
# NO LEVEL3 FEATURE FOR THIS BUCKET/LOCUS
if(! exists_keys($infoSequential,("locus", $locusNameHIS, $bucket, 'level3') ) ){

# Link the l2 to the L1 feature
Expand Down Expand Up @@ -2903,6 +2929,7 @@ sub _check_sequential{ # Goes through from L3 to l1
#We cannot guess the structure except if it is prokaryote or single exon in eucaryote.
}

# THIS LOCUS HAS LEVEL3 FEATURE
else{
foreach my $feature_L3 (@{$infoSequential->{'locus'}{$locusNameHIS}{$bucket}{'level3'}} ){

Expand All @@ -2928,7 +2955,7 @@ sub _check_sequential{ # Goes through from L3 to l1
}
}
$infoSequential->{'locus'}{$locusNameHIS}{$bucket}{'level2'} = $feature_l2;
}
}
#If locus_tag check from omniscient if feature has same locus tag
elsif ( $common_tag and ( exists_keys($locusTAG_uniq,('topfeature', lc($common_tag) ) ) ) and (! exists_keys($locusTAG_uniq,('topfeature', lc($common_tag),'level1') ) ) ) {
my $id_level2 = undef;
Expand All @@ -2954,13 +2981,17 @@ sub _check_sequential{ # Goes through from L3 to l1

#manage primary tag
my $primary_tag_l2='RNA';
foreach my $feature_L3 (@{$infoSequential->{'locus'}{$locusNameHIS}{$bucket}{'level3'}} ){
if ( $feature_l2->has_tag('agat_parent_type') ){
$primary_tag_l2 = $feature_l2->_tag_value('agat_parent_type');
} else {
foreach my $feature_L3 (@{$infoSequential->{'locus'}{$locusNameHIS}{$bucket}{'level3'}} ){

if ( lc($feature_L3->primary_tag) eq 'cds'){
$primary_tag_l2 ='mRNA';
last;
}
}
if ( lc($feature_L3->primary_tag) eq 'cds'){
$primary_tag_l2 ='mRNA';
last;
}
}
}
$feature_l2->primary_tag($primary_tag_l2);

#Manage ID
Expand Down
3 changes: 2 additions & 1 deletion lib/AGAT/OmniscientO.pm
Original file line number Diff line number Diff line change
Expand Up @@ -608,7 +608,8 @@ sub write_headers{
if (exists_keys( $omniscient, ('other', 'header') ) ){
my $gffXtra=$gffout->{"_filehandle"}; #to add extra lines to gff!!
foreach my $header_line ( @{$omniscient->{'other'}{'header'} } ) {
print $gffXtra $header_line;
$header_line =~ s/[\r\n]+$//g;
print $gffXtra $header_line."\n";
}
}
# to avoid to write again the header later in the file
Expand Down
4 changes: 3 additions & 1 deletion share/feature_levels.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -41,6 +41,7 @@ level1:
est_match: 1
expressed_sequence_match: 1
gene: 1
group_ii_intron: standalone
hat_tir_transposon: standalone
helitron: standalone
insulator: standalone
Expand All @@ -61,6 +62,7 @@ level1:
non_allelic_homologous_recombination_region: standalone
nucleotide_motif: 1
nucleotide_to_protein_match: 1
orf: 1
pif_harbinger_tir_transposon: standalone
pirna_gene: 1
polypeptide: 1
Expand Down Expand Up @@ -203,7 +205,7 @@ spread:
stop_codon: 1
three_prime_utr: 1
utr: 1
uORF: 1
uorf: 1
3utr: 1
5utr: 1

Expand Down
7 changes: 7 additions & 0 deletions t/scripts_output.t
Original file line number Diff line number Diff line change
Expand Up @@ -89,6 +89,13 @@ system(" $script --mfannot $input_folder/test.mfannot -o $outtmp 2>&1 1>/dev/nul
ok( system("diff $result $outtmp") == 0, "output $script");
unlink $outtmp;

$script = $script_prefix."bin/agat_convert_mfannot2gff.pl";
$result = "$output_folder/agat_convert_mfannot2gff_2.gff";
system(" $script --mfannot $input_folder/test.mfannot2 -o $outtmp 2>&1 1>/dev/null");
#run test
ok( system("diff $result $outtmp") == 0, "output $script");
unlink $outtmp;

# -------------------------- check agat_convert_minimap2_bam2gff.pl --------------------------
$script = $script_prefix."bin/agat_convert_minimap2_bam2gff.pl";
$result = "$output_folder/agat_convert_minimap2_bam2gff_1.gff";
Expand Down
Loading
Loading