#!/usr/bin/perl -w

use strict;

use Getopt::Long;

$| = 1;


my $use = "

This script finds G-quadruplex \"islands\" in a DNA sequence, even those containing gaps because part of an alignment:

-i    Fasta sequence
-l    minimal length of the G-block (default = 2)
-n    minimal number of G-blocks (default = 4)
-d    maximal distance allowed between G-blocks (default = 15)
-m    max length of a G4 (default = 0, over this threshold I try to split the G4, if there are at least 8 G-islands)
-o    output file
-s    optional, put 1 if you need to obtain also gap-free coords of G4

";

my ($file,$minG_block_size,$minG_block_number,$maxG_block_distance,$out,$max_len,$s);

$minG_block_size         = 2;
$minG_block_number   = 4;
$maxG_block_distance = 15;
$max_len = 0;
$s = 0;

my $overShort = 15;
my $overLong = 50;


my $result=GetOptions(
    "i=s"   =>\$file,
    "l=i"  =>\$minG_block_size,
    "n=i"   =>\$minG_block_number,
    "d=i"   =>\$maxG_block_distance,
    "m=i"   =>\$max_len,
    "o=s"  =>\$out,
    "s=i"  =>\$s
);


if ((!$file) || (!$out))
{
    print "$use\n";
    exit (1);
}

qx{/bin/rm -f $out."G"};
qx{/bin/rm -f $out."C"};

my $seq;
my $head;
open (A,"$file") || die "Missing input file\n";
while (my $line = <A>)
{
	chomp ($line);
    if ($line =~ /^>/)
    {
        if ($seq)
        {
            $seq = uc($seq);
            
            my $pattern = 'G';
            my $pattern_inv = 'C';
            my $outG = "$out" . '.' . $pattern;
            G_quad($seq,$pattern,$pattern_inv,$minG_block_size,$maxG_block_distance,$minG_block_number,$outG,$head);
            
            $pattern = 'C';
            $pattern_inv = 'G';
            my $outC = "$out" . '.' . $pattern;
            G_quad($seq,$pattern,$pattern_inv,$minG_block_size,$maxG_block_distance,$minG_block_number,$outC,$head);
        }
        $head = $line;
        $seq = '';
    }
	else
    {
        $seq .= $line;
    }
}

if ($seq)
{
    $seq = uc($seq);
    
    my $pattern = 'G';
    my $pattern_inv = 'C';
    my $outG = "$out" . '.' . $pattern;
    G_quad($seq,$pattern,$pattern_inv,$minG_block_size,$maxG_block_distance,$minG_block_number,$outG,$head);
    
    $pattern = 'C';
    $pattern_inv = 'G';
    my $outC = "$out" . '.' . $pattern;
    G_quad($seq,$pattern,$pattern_inv,$minG_block_size,$maxG_block_distance,$minG_block_number,$outC,$head);
}

close A;

sub G_quad
{
	my $seq = shift;
	my $pattern = shift;
	my $pattern_inv = shift;
	my $GblockLen = shift;
	my $GblockDistance = shift;
	my $minNumGBlock = shift;
	my $out = shift;
    my $head = shift;
	my $countG = 0;
	my $OK = 0;
	my $PreviousEnd = 0;
	my $distance = 0;
	my $startBeginBlock = 0;
    my $end = '';

    my @bases = split(//,$seq);
    
	open(B,">>$out") || die "Cannot write $out\n";
    print B "$head\n";

    my $search = $pattern . '[' . $pattern . '-]{0,}' . $pattern;

	while ($seq =~ /($search)/g)
	{		
        my $len = length($1);
		$end = pos($seq);
		my $start = $end - $len +1;

        my @c = $1 =~ /$pattern/g;
        my $count = @c;
        next if ($count < $GblockLen);#the block of 'G' and '-' has not enough 'G'
        
		my $counter = int($count/$GblockLen);

		unless ($PreviousEnd)
		{
			$PreviousEnd = $end;
			$startBeginBlock = $start;
			#$countG += $counter;
		}
		if ($PreviousEnd)
		{
            #count how many nt are actually present between 2 islands (skip gaps)
            $distance = 0;
            for (my $i=$PreviousEnd;$i<($start-1);$i++)
            {
                $distance++ if (($bases[$i] eq 'A')||($bases[$i] eq 'C')||($bases[$i] eq 'T')||($bases[$i] eq 'G')||($bases[$i] eq 'N'));
            }
            if ($distance <= $GblockDistance)
			{
                $countG += $counter;
				$PreviousEnd = $end;
   			}
			else
			{
				if ($countG >= $minNumGBlock)
				{                    
                    if (($PreviousEnd - $startBeginBlock + 1)  > $max_len)
                    {
                        my $islands = int($countG / $minNumGBlock);
                        for (my $i = 0;$i<$islands;$i++)
                        {
                            print B "$startBeginBlock\t$PreviousEnd\n";
                        }

                    }
                    else
                    {
                        print B "$startBeginBlock\t$PreviousEnd\n";
                    }                     
				}
				else
				{
					#$PreviousEnd = $start + $len;
				}
				$countG = 0;
				$PreviousEnd = $end;
				$countG += $counter;
				$startBeginBlock = $start;
			}
		}
	}
	if ($countG >= $minNumGBlock)
	{
        if (($PreviousEnd - $startBeginBlock + 1)  > $max_len)
        {
            my $islands = int($countG / $minNumGBlock);
            for (my $i = 0;$i<$islands;$i++)
            {
                print B "$startBeginBlock\t$PreviousEnd\n";
            }
            
        }
        else
        {
            print B "$startBeginBlock\t$PreviousEnd\n";
        }
	}
}
