#!/usr/bin/perl -w

use strict;
use Getopt::Long;
use List::Util qw(min max);

# Unbuffered STDOUT so messages appear immediately.
$|=1;

# Basename of this script (everything after the last '/'), used in the usage text.
my ($script) = ($0 =~ m|([^/]*)$|);

my $Uso = "Use of $script:

-match            correspondence between virus name and accession (Virus.txt)
-g4               file of conserved G4 (produced by matrixScanner_g4.pl)
-alignmentdir     directory containing alignment files
-out              output file

";

my ($match,$g4,$alignmentdir,$out);

my $result = GetOptions ("match=s"=>\$match,
                    "g4=s"=>\$g4,
                    "alignmentdir=s"=>\$alignmentdir,
                    "out=s"=>\$out,
                    );

unless ($result)
{
    print STDERR "Some options are not well formatted\n";
    exit (1);
}

# All four options are mandatory. A missing one is a usage error, so the
# usage text goes to STDERR and the exit status is non-zero, letting calling
# scripts and pipelines detect the failure. Definedness/emptiness is tested
# rather than truthiness so that a file literally named "0" is accepted.
if (grep { !defined($_) || $_ eq '' } ($match,$g4,$alignmentdir,$out))
{
    print STDERR "$Uso";
    exit (1);
}

# Map loaded from the tab-separated -match file: first column (virus name)
# => second column (accession). Both are lowercased before being stored.
my %matches;

#load map of accessions

open (my $match_fh, '<', $match) or die "Cannot open match file '$match': $!\n";

while (my $line = <$match_fh>)
{
    chomp $line;
    $line =~ s/\r\z//;              # tolerate CRLF line endings
    next if ($line =~ /^\s*$/);     # ignore blank lines
    $line = lc($line);
    my @fields = split(/\t/,$line);
    # A usable entry needs a non-empty name and a non-empty accession;
    # anything else would store an undef/empty value that later breaks
    # the lookup of the reference sequence.
    unless (defined $fields[1] && length $fields[0] && length $fields[1])
    {
        print STDERR "Skipping malformed line $. of $match\n";
        next;
    }
    $matches{$fields[0]} = $fields[1];
}
close ($match_fh);

# Main pass over the conserved-G4 file.
#
# The file is read as a series of records: a header line starting with '>'
# followed by tab-separated data lines (lines starting with "Start" and empty
# lines are ignored). For every header the matching alignment file is looked
# up in $alignmentdir, the aligned sequence whose header contains the
# accession from %matches is extracted, and the first two fields of each data
# line (1-based columns of that aligned sequence) are translated to ungapped
# positions via parse_gapped_seq(). Fields 3-5 are copied through unchanged.

my $switch = 0;           # 1 once the reference sequence was found for the current header
my $no_gaps = [];         # array ref: alignment column index -> ungapped reference position
my $found_vir =1;         # starts at 1 so nothing is reported before the first header
my $non_repetitive = 0;   # 1 once the "reference not present" message was printed for this header
my $header = '';

open (my $g4_fh, '<', $g4) or die "Cannot open G4 file '$g4': $!\n";
open (my $out_fh, '>', $out) or die "Cannot open output file '$out' for writing: $!\n";

# The directory listing is the same for every header, so read it only once.
opendir (my $dir_fh, $alignmentdir) or die "Cannot open alignment directory '$alignmentdir': $!\n";
my @alignment_files = grep { !/^\./ } readdir($dir_fh);
closedir ($dir_fh);

print $out_fh "#Start\tEnd\t%conserved\tN seq with G4\tTotal sequences\n\n";
while (my $line = <$g4_fh>)
{
    chomp $line;
    $line =~ s/\r\z//;    # tolerate CRLF line endings
    next if ($line =~ /^Start/);
    next if ($line =~ /^$/);
    if ($line =~ /^\>/)
    {
        # The previous record is complete: report it if no alignment file matched.
        if ($found_vir == 0)
        {
            print STDERR "Multiple alignment not found for $header\n";
        }
        $found_vir = 0;
        $non_repetitive=0;
        $header = $line;
        $no_gaps = [];
        $switch = 0;
        $line = lc($line);
        my @fields = split(/\./,$line);
        my ($name) = ($line =~ /(\w+\.fastaalign\d+)/);
        # Text between the leading '>' and the first '.'; also works for
        # one-character names.
        my ($name_short) = ($fields[0] =~ /^>+(.+)/);
        my $accession = defined $name_short ? $matches{$name_short} : undef;

        unless (defined $accession && length $accession)
        {
            $name_short = '' unless defined $name_short;
            print STDERR "$name_short is not found in Virus.txt, it will not be present in the results!\n";
            # Without an accession there is no reference sequence to look
            # for. An empty pattern would make Perl reuse the last successful
            # regex and pick an arbitrary sequence, so skip this record and
            # suppress the follow-up messages (the problem is already reported).
            $found_vir = 1;
            $non_repetitive = 1;
            next;
        }

        # Header does not name an alignment: $found_vir stays 0, so the
        # record is reported as having no multiple alignment.
        next unless (defined $name);

        foreach my $file (@alignment_files)
        {
            # \Q..\E: the name contains a literal '.'; (?!\d) keeps
            # "x.fastaalign1" from also matching "x.fastaalign10".
            next unless ($file =~ /\Q$name\E(?!\d)/i);

            $found_vir =1;#I found the alignment file corresponding to the virus under analysis
            my $path = "$alignmentdir/$file";
            my $aln_fh;
            unless (open ($aln_fh, '<', $path))
            {
                print STDERR "Cannot open alignment file '$path': $!\n";
                next;
            }
            my $start = 0;   # 1 while reading the sequence lines of the reference
            my $seq = '';
            while (my $line1 = <$aln_fh>)
            {
                chomp $line1;
                $line1 =~ s/\r\z//;   # a stray CR would be counted as a residue
                if ($line1 =~ /^\>/)
                {
                    $start = 0;
                    if ($line1 =~ /\Q$accession\E/i)
                    {
                        print $out_fh "$line\n";
                        $switch = 1;#this alignment contains the reference sequence
                        $start = 1;
                        $seq = '';
                    }
                }
                elsif ($start == 1)
                {
                    $seq .= $line1;
                }
            }
            close ($aln_fh);
            if (length $seq)
            {
                $no_gaps = parse_gapped_seq($seq);
            }
        }
    }
    else
    {
        if ($switch == 1)
        {
            # Limit -1 keeps trailing empty fields so columns stay aligned.
            my @fields = split(/\t/,$line,-1);
            unless (defined $fields[1]
                    && $fields[0] =~ /^\s*\d+\s*$/
                    && $fields[1] =~ /^\s*\d+\s*$/)
            {
                print STDERR "Skipping malformed G4 line for $header: $line\n";
                next;
            }
            my $start = $fields[0] - 1;
            my $end = $fields[1] - 1;
            my $last = $#{$no_gaps};
            # Index -1 would silently wrap to the last element and indices
            # past the end would yield undef, so reject both.
            if ($start < 0 || $end < 0 || $start > $last || $end > $last)
            {
                print STDERR "G4 coordinates $fields[0]-$fields[1] are outside the reference alignment for $header\n";
                next;
            }
            my @rest = map { defined $_ ? $_ : '' } @fields[2..4];
            print $out_fh join("\t", $no_gaps->[$start], $no_gaps->[$end], @rest), "\n";
        }
        else
        {
            if ($non_repetitive == 0)
            {
                print STDERR "Reference sequence not present in the multiple alignment for $header\n";
                $non_repetitive++;
            }
        }
    }
}

# The check above only runs when the next header is seen, so the last record
# has to be reported here.
if ($found_vir == 0)
{
    print STDERR "Multiple alignment not found for $header\n";
}

close ($g4_fh);
# Buffered write errors (e.g. disk full) only surface when the handle is closed.
close ($out_fh) or die "Error while writing '$out': $!\n";

# Build a lookup table from the columns of a gapped (aligned) sequence to
# ungapped positions.
#
# Argument: the aligned sequence as a string, with '-' marking a gap.
# Returns:  an array reference with one entry per character of the input;
#           entry i holds the number of non-gap characters seen in columns
#           0..i. A gap column therefore repeats the count of the residue
#           before it, and gap columns before the first residue hold 0.
sub parse_gapped_seq
{
    my ($aligned) = @_;
    my $residues_seen = 0;
    my @position_of_column;

    foreach my $symbol (split(//, $aligned))
    {
        # Only real residues advance the ungapped coordinate.
        $residues_seen += ($symbol eq '-') ? 0 : 1;
        push(@position_of_column, $residues_seen);
    }

    return \@position_of_column;
}
