#!/usr/local/bin/perl -w
# coda_v1.pl

use strict;
use Getopt::Long;
use Digest::MD5;

my $help;
my $uniprot_query;
my $query_taxon;
my $db;
my $max_score = 0.65;
my $seq;
my $md5_query;
my $batch_file;
my $md5_output;

GetOptions
(
	"h|help"	=> \$help,
	"q:s"		=> \$uniprot_query,
	"seq:s"		=> \$seq,
	"md5:s"		=> \$md5_query,
	"b:s"		=> \$batch_file,
	"t:i"		=> \$query_taxon,
	"db:s"		=> \$db,
	"s:s"		=> \$max_score,
	"md5_output"	=> \$md5_output
);

if($help)
{
	print_usage();
	exit;
}

# Query should ultimately be an amino acid sequence
# which can then be converted to an MD5
# For the moment we will only allow MD5s which are in database
# however could eventually scan arbitrary sequence and
# domain finder it
my %batch = ();

if($seq)
{
	print STDERR "Sequence provided - assumed to be amino acid...\n";
	chomp $seq;
	my $query_md5 = Digest::MD5::md5_hex($seq);
	$batch{$query_md5} = 1;
	print STDERR "Seq converted to MD5 - $query_md5\n";
}
elsif ($uniprot_query)
{
	$batch{$uniprot_query} = 1;
}
elsif ($md5_query)
{
	$batch{$md5_query} = 1;
}
elsif($batch_file)
{
	open(B, "<$batch_file") or die "$!";

	while(<B>)
	{
		chomp;
		$batch{$_}=1;
	}
}
else
{
	print_usage("Please provide an Ensembl Protein Accession (-q) or a sequence (-seq)");
	exit;
}

open(DB, "<$db") or die "$!";

my %linked = ();
my %dom_counts = ();

my %query_mdas;

my %target_mdas = ();
my %md52uniprot = ();

while(<DB>)
{
	chomp;

	my($taxon, $md5, $uniprots, $mda) = split /\t/;

	my @uniprots = split /\s/, $uniprots;

	foreach my $id (@uniprots)
	{
		push @{$md52uniprot{$md5}->{$taxon}}, $id;
	}

	my @mda_unsorted = split /\s/, $mda;

	my @mda = sort @mda_unsorted; 

	if($taxon == $query_taxon)
	{
		my $query_flag = 0;
		foreach my $id (@uniprots)
		{
			$query_flag = $id if exists $batch{$id} || exists $batch{$md5};
		}
		if ($query_flag)
		{
			$query_mdas{$md5} = \@mda;

			print STDERR "Found MDA for $query_flag : @mda\n";
		}

		push @{$target_mdas{"@mda"}}, $md5;
	}

	for(my $i=0;$i<scalar @mda;$i++)
	{
		for(my $j=0;$j<$i;$j++)
		{
			if(exists $linked{$mda[$j].'_'.$mda[$i]})
			{
				$linked{$mda[$j].'_'.$mda[$i]}->{$taxon}++;
			}
			else
			{
				$linked{$mda[$i].'_'.$mda[$j]}->{$taxon}++;
			}
		}

		$dom_counts{$mda[$i]}->{$taxon}++;
	}
}
close DB;

if(scalar keys %query_mdas == 0)
{
	print STDERR "No query proteins found in database - maybe they had no domains or were otherwise missing\n";
	exit;
}

my %results = ();

foreach my $query_md5 (sort keys %query_mdas)
{
	foreach my $target_mda (sort keys %target_mdas)
	{
		# Does this protein share any domains with the query domain?
		# If so we'll exclude it, otherwise we'll score all the domain
		# pairs and get a score for this pair of proteins
		my @target_mda = split /\s/, $target_mda;
	
		my $hom_flag = 0;	# Note homologous domains
	
		my $best_score = 0;
		my $best_target_dom = '';
		my $best_query_dom = '';
		my $best_linking_taxon = '';
	
		foreach my $query_dom (@{$query_mdas{$query_md5}})
		{
			foreach my $target_dom (@target_mda)
			{
				if($query_dom eq $target_dom)
				{
					$hom_flag = 1;
					last;
				}
				else
				{
					if(exists $linked{$query_dom.'_'.$target_dom}->{$query_taxon} || exists $linked{$target_dom.'_'.$query_dom}->{$query_taxon})
					{
						next;
					}	
				}
	
				my $dom_string = '';
				if (exists $linked{$query_dom.'_'.$target_dom})
				{
					$dom_string = $query_dom.'_'.$target_dom;
				}
				elsif (exists $linked{$target_dom.'_'.$query_dom})
				{
					$dom_string = $target_dom.'_'.$query_dom
				}
	
				if(defined $dom_string)
				{
					foreach my $linking_taxon (sort keys %{$linked{$dom_string}})
					{
						my $query_dom_qcount = $dom_counts{$query_dom}->{$query_taxon};
						my $query_dom_lcount = $dom_counts{$query_dom}->{$linking_taxon};
						my $target_dom_qcount = $dom_counts{$target_dom}->{$query_taxon};
						my $target_dom_lcount = $dom_counts{$target_dom}->{$linking_taxon};
	
						my $score = (1 / ($dom_counts{$query_dom}->{$query_taxon} + $dom_counts{$query_dom}->{$linking_taxon})) + (1 / ($dom_counts{$target_dom}->{$query_taxon} + $dom_counts{$target_dom}->{$linking_taxon}));
	
						if ($score > $best_score)
						{
							$best_score = $score;
							$best_target_dom = $target_dom;
							$best_query_dom = $query_dom;
							$best_linking_taxon = $linking_taxon;
						}
					}
				}
			}
		}
		next if $hom_flag;
	
		if($best_score > $max_score)
		{
			foreach my $target_md5 (@{$target_mdas{$target_mda}})
			{
				#print "$query_md5\t@{$query_mdas{$query_md5}}\t$target_md5\t$target_mda\t$best_linking_taxon\t$best_query_dom\t$best_target_dom\t$best_score\n";

				my $uniprot_query_string = join "\;", @{$md52uniprot{$query_md5}->{$query_taxon}};
				my $uniprot_target_string = join "\;", @{$md52uniprot{$target_md5}->{$query_taxon}};
				#print "$uniprot_query_string\t$uniprot_target_string\t$best_score\n";
				if (!$md5_output)
				{
					$results{$uniprot_query_string."\t".$uniprot_target_string} = $best_score;
				}
				else
				{
					$results{$query_md5."\t".$target_md5} = $best_score;
				}
			}
		}
	}
}

foreach my $result (sort {$results{$b}<=>$results{$a}} keys %results)
{
	print "$result\t$results{$result}\n";
}

sub print_usage
{
	my $err_msg = shift;

	print <<USAGE;		

	###########################################
	# CODA v1 (Co-Occurance of Domains Analysis)
	# by Adam Reid
	###########################################
	# A program for domain fusion analysis
	###########################################
	# This script will determine links based on
	# domain fusions for a query protein of interest
	# in a given species with a completed genome
	# Currently the sequence must exist in the 
	# database already
	###########################################
	#
	# DATABASE FORMAT
	# 
	# The database must be in the following format
	# <NCBI_taxon_id>\t<MD5>\t<Ensembl Protein Acc>\t<MDA>\n
	# The MDA is the list of domains, separated by
	# spaces
	###########################################

	-h | -help	Print this message
	-t		NCBI taxon code for query organism
	-q		Ensembl Protein Accession (or hex MD5) query 
	-b		Batch file of Ensembl Protein Accessions (or hex MD5s)
			One ID per line
	-seq		Protein sequence of query
	-db		Database of taxon/protein/domains
	-s 		Score cutoff (0-1, 1 is best, 0.65 recommended)

	output is to STDOUT

USAGE

	print "$err_msg\n" if defined $err_msg;
}
