#!/usr/bin/perl -w

use strict;
use Getopt::Long;
use List::Util qw(min max);

$|=1;

my ($script) = ($0 =~ m|([^/]*)$|);

my $Uso = "Use of $script:

-fastAlignm       multifasta alignment file (either with '>' or '#')
-coords           file of G4 coords generated with 'findPattern_multifasta.pl'
-threshold        minimum threshold for reporting conserved G4 (integer from 0 to 100)
-outdir           output directory

";

my ($fastAlignm,$threshold,$coords,$outdir);

my $result = GetOptions ("fastAlignm=s"=>\$fastAlignm,
					"threshold=i"=>\$threshold,
                    "coords=s"=>\$coords,
                    "outdir=s"=>\$outdir);

unless ($result)
{
    print STDERR "Some options are not well formatted\n";
    exit (1);
}

if ((!$fastAlignm)||(!$threshold)||(!$coords)||(!$outdir))
{
    print "$Uso";
    exit;
}

#first create a matrix of zeros from multifasta alignment files (columns=length of the alignment, rows=number of sequences)
my @nome = split(/\//, $coords);
my $filename = $nome[-1];

$threshold = $threshold/100;
my $seq = '';
my ($header,%matrix,$matrixLength,%seqStarted,%seqEnded);
my $numSeqs =0;

open (FILE, "$fastAlignm");

while (my $line = <FILE>)
{
	chomp $line;
    next if ($line =~ /^$/);
	if (($line =~ /^\>/) || ($line =~ /^\#/))
	{
		$numSeqs++;
		if ($seq)
		{
            my @zeros;
            my $len = length($seq);
            my @bases = split (//,$seq);
            for (my $i=0;$i<$len;$i++)
            {
                push(@zeros, 0);
                $seqStarted{$header} = $i if (($bases[$i] ne '-') && (not exists $seqStarted{$header}));
                $seqEnded{$header} = $i if (($bases[$i] ne '-'));
            }  
			@{$matrix{$header}} = @zeros;
		}
		$header = $line;
		$seq = '';
	}
	else
	{
		$seq .= uc($line);
	}
}
if ($seq)
{
    my @zeros;
    my $len = length($seq);
    my @bases = split (//,$seq);
    for (my $i=0;$i<$len;$i++)
    {
        push(@zeros, 0);
        $seqStarted{$header} = $i if (($bases[$i] ne '-') && (not exists $seqStarted{$header}));
        $seqEnded{$header} = $i if (($bases[$i] ne '-'));
    }
	@{$matrix{$header}} = @zeros;
	
	$matrixLength = $#zeros; #this is the number of matrix columns
}
close FILE;

#read the G4 coords and convert from 0 to 1 the relative positions in the matrix

my $max_consecutive_g4 = 1;

open(COORDS, "$coords");

while (my $line = <COORDS>)
{
    chomp $line;
    if ($line =~ /^\>/)
    {
        $header = $line;
    }
    else
    {
        my @fields = split(/\t/,$line);
        for (my $i=($fields[0]-1);$i<$fields[1];$i++)
        {
            @{$matrix{$header}}[$i]++;
            $max_consecutive_g4 = @{$matrix{$header}}[$i] if (@{$matrix{$header}}[$i] > $max_consecutive_g4);
        }
    }
}
close COORDS;

#now scan the alignment and record the consensus

my %consensus;
my @seqs_per_position;

for (my $i=1;$i<=$max_consecutive_g4;$i++)
{
    $consensus{$i} = [];#create a empty array for each value from 0 to the max number of consecutice G4
}

for (my $i =0; $i<=$matrixLength;$i++) #parse the matrix position by position
{
    my $tot = 0;
    my $uni = 0;
    
    for (my $m=1;$m<=$max_consecutive_g4;$m++)
    {
        foreach my $key (keys (%matrix))
        {
            if ($matrix{$key}[$i] >= $m)
            {
                $uni++;
            }
            $tot++ if (($seqStarted{$key} <= $i) && ($seqEnded{$key} >= $i));
        }
        my $ratio = sprintf("%.2f",($uni/$tot));
        my $info = $ratio . ':' . $uni . ':' . $tot;
        push(@{$consensus{$m}}, $info);
        $tot=0;
        $uni=0;
    }
}

my $current = 0;
my $max = 0;
my $max_seq = 0;#num seq with g4
my $tot_seq = 10000000;#num seq in that position
my $seqG4 = 0;
my $a=0;
my $start = 0;
my $matrix_length = @{$consensus{1}};#length of the matrix

open(OUT2,">$outdir/$filename.G4coords");
print OUT2 ">$filename\n";

for ($a=0;$a<$matrix_length;$a++)
{
    my @campi = split(':', $consensus{1}[$a]);#(ratio, num seq with G4, tot seq for this position)
    
    if ($campi[0] >= $threshold)
    {
        $max_seq = $campi[1] if ($campi[1] > $max_seq);#keep the max num of seqs with g4
        $tot_seq = $campi[2] if ($campi[2] < $tot_seq);#keep the min num of tot seq (otherwise longer seqs are penalized)
        if ($current < $threshold)
        {
            $start = $a+1;
            print OUT2 "$start\t";
        }
        $max = $campi[0] if ($campi[0] > $max);
    }
    else
    {
        if ($current >= $threshold)
        {
            my $end = $a;
            $max = $max*100;
            print OUT2 "$end\t$max\t$max_seq\t$tot_seq";

            for (my $h=1;$h<=$max_consecutive_g4;$h++)
            {
                next if ($h == 1);
                my $max_local = 0;
                my $max_seqbis = 0;
                my $tot_seqbis = 100000000;
                for (my $s=$start;$s<=$end;$s++)
                {
                    my @campibis = split(':', $consensus{$h}[$s]);
                    $max_local = $campibis[0] if ($campibis[0] > $max_local);
                    $max_seqbis = $campibis[1] if ($campibis[1] > $max_seqbis);
                    $tot_seqbis = $campibis[2] if ($campibis[2] < $tot_seqbis);
                }
                if ($max_local >= $threshold)
                {
                    $max_local = $max_local*100;
                    print OUT2 "\n$start\t$end\t$max_local\t$max_seqbis\t$tot_seqbis";
                }
            }
            print OUT2 "\n";
        }
        $max = 0;
        $max_seq = 0;
        $tot_seq = 10000000;
    }
    $current = $campi[0];
}
#if there is a g4 in the last portion of seqs
my @campi = split(':', $consensus{1}[-1]);#(ratio, num seq with G4, tot seq for this position)
if ($campi[0] >= $threshold)
{
    my @campicampi = split(':', $consensus{1}[-2]);#(ratio, num seq with G4, tot seq for this position)
    if ($campicampi[0] >= $threshold)
    {
        $max = $max*100;
        print OUT2 "$a\t$max\t$max_seq\t$tot_seq";
        for (my $h=1;$h<=$max_consecutive_g4;$h++)
        {
            next if ($h == 1);
            my $max_local = 0;
            my $max_seqbis = 0;
            my $tot_seqbis = 1000000000;
            for (my $s=$start;$s<$a;$s++)
            {
                my @campibis = split(':', $consensus{$h}[$s]);
                $max_local = $campibis[0] if ($campibis[0] > $max_local);
                $max_seqbis = $campibis[1] if ($campibis[1] > $max_seqbis);
                $tot_seqbis = $campibis[2] if ($campibis[2] < $tot_seqbis);
            }
            if ($max_local >= $threshold)
            {
                $max_local = $max_local*100;
                print OUT2 "\n$start\t$a\t$max_local\t$max_seqbis\t$tot_seqbis";
            }
        }
        print OUT2 "\n";
    }
    else
    {
        $max = $max*100;
        print OUT2 "$a\t$max\t$max_seq\t$tot_seq\n";
    }
}
close OUT2;




