#!/usr/bin/env python

from __future__ import division
import sys, argparse
import random
import numpy as np

def generate_sequence(indexes_to_use, fo, header, len_seq, counter, isl):
	"""Write one record (header line + one sequence line) to fo.

	The sequence line is len_seq characters of 'A' background in which each
	run from indexes_to_use is written at its start position.

	indexes_to_use -- list of (start, run) entries, expected in ascending
	                  start order because gaps are computed from consecutive
	                  starts; may be empty
	fo             -- object with a write() method
	header         -- name prefix; the header line is '<header>_<counter>'
	len_seq        -- total length of the sequence line
	counter        -- value appended to the header after an underscore
	isl            -- length of every run, used to size the 'A' gaps

	With no entries the sequence is made of 'A' only (previously this raised
	IndexError after the header had already been written).
	"""
	fo.write('{0}_{1}\n'.format(header, counter))
	if not indexes_to_use:
		fo.write('A' * len_seq)
		fo.write('\n')
		return
	#end if

	# Background before the first run (a non-positive count writes nothing)
	fo.write('A' * indexes_to_use[0][0])

	# Each run is followed by filler up to the next start; the last run is
	# followed by filler up to the end of the sequence
	next_starts = [entry[0] for entry in indexes_to_use[1:]] + [len_seq]
	for entry, next_start in zip(indexes_to_use, next_starts):
		fo.write(entry[1])
		fo.write('A' * (next_start - entry[0] - isl))
	#end for
	fo.write('\n')
#end def

def _place_runs(indexes_to_pick, count, run, indexes_to_use, isl):
	"""Pick `count` random start positions for `run` and reserve their space.

	For every pick, (start, run) is appended to indexes_to_use and the
	positions around the start are deleted from indexes_to_pick: the start
	itself, up to isl - 1 contiguous positions after it (covered by the run)
	and up to isl - 1 contiguous positions before it (a run starting there
	would overlap this one).
	"""
	while count:
		if len(indexes_to_pick) < isl:
			# Same exception type random.randint raised here, clearer message
			raise ValueError('not enough free positions left to place a run of length {0}'.format(isl))
		#end if
		# The last isl - 1 entries are never chosen, so idx + i below is
		# always a valid list position
		idx = random.randint(0, len(indexes_to_pick) - isl)
		index = indexes_to_pick[idx]
		indexes_to_use.append((index, run))

		# Contiguous positions after the start
		upper_index = 1
		for i in range(1, isl):
			if indexes_to_pick[idx + i] != index + i:
				break
			#end if
			upper_index = i + 1
		#end for

		# Contiguous positions before the start; bounded by idx so the scan
		# never wraps around to the end of the list through negative indexes
		lower_index = 0
		for i in range(1, min(isl, idx + 1)):
			if indexes_to_pick[idx - i] != index - i:
				break
			#end if
			lower_index = i
		#end for

		del indexes_to_pick[idx - lower_index : idx + upper_index]
		count -= 1
	#end while
#end def

def random_indexes(indexes_to_pick, G_counts, C_counts, indexes_to_use, isl, G_run, C_run):
	"""Choose random, non-overlapping start positions for G and C runs.

	indexes_to_pick -- ascending list of free positions; modified in place
	G_counts        -- number of G runs to place
	C_counts        -- number of C runs to place
	indexes_to_use  -- list receiving (start, run) tuples; sorted in place
	                   before returning
	isl             -- length of every run
	G_run, C_run    -- the strings stored next to each chosen start

	G runs are placed first, then C runs. Raises ValueError when fewer than
	isl free positions remain while runs are still to be placed.
	"""
	_place_runs(indexes_to_pick, G_counts, G_run, indexes_to_use, isl)
	_place_runs(indexes_to_pick, C_counts, C_run, indexes_to_use, isl)
	indexes_to_use.sort()
#end def

def _count_islands(seq, isl, G_run, C_run):
	"""Count non-overlapping occurrences of G_run and C_run in seq.

	The sequence is scanned left to right with a window of isl characters; a
	window equal to either run is counted and skipped as a whole, otherwise
	the scan advances by one character. Returns (G_counts, C_counts).
	"""
	G_counts, C_counts = 0, 0
	max_i, i = len(seq) - isl, 0
	while i <= max_i:
		window = seq[i : i + isl]
		if window == G_run:
			G_counts += 1
			i += isl
		elif window == C_run:
			C_counts += 1
			i += isl
		else:
			i += 1
		#end if
	#end while
	return G_counts, C_counts
#end def

def _iter_sequences(path):
	"""Yield the sequence of each record in the '>'-delimited file at path.

	Sequence lines are upper-cased and '-' characters are dropped. Every '>'
	line starts a new record; the text collected so far is yielded when a
	previous header was seen or when it is non-empty. The text collected
	after the last header (possibly empty) is always yielded.
	"""
	seq_parts, seen_header = [], False
	with open(path, 'r') as fi:
		for line in fi:
			if line.startswith('>'):
				if seen_header or any(seq_parts):
					yield ''.join(seq_parts)
				#end if
				seen_header, seq_parts = True, []
			else:
				seq_parts.append(line.rstrip().upper().replace('-', ''))
			#end if
		#end for
	#end with
	yield ''.join(seq_parts)
#end def

def main(args): # use as args['name']
	"""Count G/C runs in an alignment and write randomized sequences.

	args is a dict with the keys:
	  'sequencefile'  -- path of the '>'-delimited sequence file
	  'referenceinfo' -- path of a tab-separated file with the columns
	                     file_name, G_counts, C_counts, len_seq; lines
	                     starting with '#' and blank lines are skipped
	  'islandlen'     -- run length (converted with int(), must be >= 1)
	  'numberrep'     -- number of sequences to generate (converted with int())

	The median G count, C count and length over all records are used when the
	rounded G + C medians exceed the reference G + C counts for this file
	name; otherwise the reference values are used. The chosen values are
	printed, then the sequences are written to
	'<file name>_RANDOMIZED_<islandlen>' in the current directory. If that
	step fails, '<file name> error' is appended to
	'ERRORS_RANDOMIZED_<islandlen>' and the reason is written to stderr.

	Raises ValueError when islandlen is smaller than 1.
	"""

	# Variables
	isl = int(args['islandlen'])
	if isl < 1:
		# An empty window matches every position without advancing the scan
		raise ValueError('island length must be a positive integer')
	#end if
	G_run = 'G' * isl
	C_run = 'C' * isl
	dict_reference = {} #file_name: (G_counts, C_counts, len_seq)
	G_counts_list, C_counts_list, len_seq_list = [], [], []
	name_file = args['sequencefile'].rstrip().split('/')[-1]

	# Reading reference info
	with open(args['referenceinfo']) as fi:
		for line in fi:
			if line.startswith('#') or not line.strip():
				continue
			#end if
			fields = line.rstrip().split('\t')
			# setdefault: the first line seen for a file name wins
			dict_reference.setdefault(fields[0], (int(fields[1]), int(fields[2]), int(fields[3])))
		#end for
	#end with

	# Reading sequences and counting
	for seq in _iter_sequences(args['sequencefile']):
		G_counts, C_counts = _count_islands(seq, isl, G_run, C_run)
		G_counts_list.append(G_counts)
		C_counts_list.append(C_counts)
		len_seq_list.append(len(seq))
	#end for

	# Calculating stats
	G_counts_median = np.median(np.array(G_counts_list))
	C_counts_median = np.median(np.array(C_counts_list))
	len_seq_median = np.median(np.array(len_seq_list))

	try:
		# KeyError here (file name missing from the reference) is reported below
		reference = dict_reference[name_file]
		if (round(G_counts_median) + round(C_counts_median)) > (reference[0] + reference[1]):
			G_counts_use = int(round(G_counts_median))
			C_counts_use = int(round(C_counts_median))
			len_seq_use = int(round(len_seq_median))
		else:
			G_counts_use, C_counts_use, len_seq_use = reference[0], reference[1], reference[2]
		#end if

		# Same text as the Python 2 statement `print a, b, c, d`, but valid on Python 3 too
		sys.stdout.write('{0} {1} {2} {3}\n'.format(name_file, G_counts_use, C_counts_use, len_seq_use))

		# Generating sequences
		with open('{0}_RANDOMIZED_{1}'.format(name_file, isl), 'w') as fo:
			for i in range(int(args['numberrep'])):
				# list(): random_indexes deletes slices, which a Python 3 range object does not support
				indexes_to_pick, indexes_to_use = list(range(len_seq_use)), []
				random_indexes(indexes_to_pick, G_counts_use, C_counts_use, indexes_to_use, isl, G_run, C_run)
				generate_sequence(indexes_to_use, fo, '>RANDOM', len_seq_use, i, isl)
			#end for
		#end with
	except Exception as err:
		# Best effort: record the failing file and keep the reason visible,
		# without trapping KeyboardInterrupt / SystemExit
		sys.stderr.write('{0}: {1}: {2}\n'.format(name_file, type(err).__name__, err))
		with open('ERRORS_RANDOMIZED_{0}'.format(isl), 'a') as fe:
			fe.write('{0} error\n'.format(name_file))
		#end with
	#end try
# end def main
 
 
if __name__ == '__main__':

    # Command-line interface: all four options are mandatory and are handed
    # to main() as a plain dict keyed by the long option name.
    option_table = (
        ('-s', '--sequencefile', 'inputfile with fasta sequences, multiple alignment'),
        ('-r', '--referenceinfo', 'inputfile with reference info'),
        ('-l', '--islandlen', 'island length'),
        ('-n', '--numberrep', 'number of replicates'),
    )

    parser = argparse.ArgumentParser(description='')
    for short_flag, long_flag, help_text in option_table:
        parser.add_argument(short_flag, long_flag, help=help_text, required=True)

    args = vars(parser.parse_args())

    main(args)

# end if
