#!/usr/local/bin/perl -w # coda_v1.pl use strict; use Getopt::Long; use Digest::MD5; my $help; my $uniprot_query; my $query_taxon; my $db; my $max_score = 0.65; my $seq; my $md5_query; my $batch_file; my $md5_output; GetOptions ( "h|help" => \$help, "q:s" => \$uniprot_query, "seq:s" => \$seq, "md5:s" => \$md5_query, "b:s" => \$batch_file, "t:i" => \$query_taxon, "db:s" => \$db, "s:s" => \$max_score, "md5_output" => \$md5_output ); if($help) { print_usage(); exit; } # Query should ultimately be an amino acid sequence # which can then be converted to an MD5 # For the moment we will only allow MD5s which are in database # however could eventually scan arbitrary sequence and # domain finder it my %batch = (); if($seq) { print STDERR "Sequence provided - assumed to be amino acid...\n"; chomp $seq; my $query_md5 = Digest::MD5::md5_hex($seq); $batch{$query_md5} = 1; print STDERR "Seq converted to MD5 - $query_md5\n"; } elsif ($uniprot_query) { $batch{$uniprot_query} = 1; } elsif ($md5_query) { $batch{$md5_query} = 1; } elsif($batch_file) { open(B, "<$batch_file") or die "$!"; while() { chomp; $batch{$_}=1; } } else { print_usage("Please provide an Ensembl Protein Accession (-q) or a sequence (-seq)"); exit; } open(DB, "<$db") or die "$!"; my %linked = (); my %dom_counts = (); my %query_mdas; my %target_mdas = (); my %md52uniprot = (); while() { chomp; my($taxon, $md5, $uniprots, $mda) = split /\t/; my @uniprots = split /\s/, $uniprots; foreach my $id (@uniprots) { push @{$md52uniprot{$md5}->{$taxon}}, $id; } my @mda_unsorted = split /\s/, $mda; my @mda = sort @mda_unsorted; if($taxon == $query_taxon) { my $query_flag = 0; foreach my $id (@uniprots) { $query_flag = $id if exists $batch{$id} || exists $batch{$md5}; } if ($query_flag) { $query_mdas{$md5} = \@mda; print STDERR "Found MDA for $query_flag : @mda\n"; } push @{$target_mdas{"@mda"}}, $md5; } for(my $i=0;$i{$taxon}++; } else { $linked{$mda[$i].'_'.$mda[$j]}->{$taxon}++; } } $dom_counts{$mda[$i]}->{$taxon}++; } } close DB; if(scalar keys %query_mdas == 0) { print STDERR "No query proteins found in database - maybe they had no domains or were otherwise missing\n"; exit; } my %results = (); foreach my $query_md5 (sort keys %query_mdas) { foreach my $target_mda (sort keys %target_mdas) { # Does this protein share any domains with the query domain? # If so we'll exclude it, otherwise we'll score all the domain # pairs and get a score for this pair of proteins my @target_mda = split /\s/, $target_mda; my $hom_flag = 0; # Note homologous domains my $best_score = 0; my $best_target_dom = ''; my $best_query_dom = ''; my $best_linking_taxon = ''; foreach my $query_dom (@{$query_mdas{$query_md5}}) { foreach my $target_dom (@target_mda) { if($query_dom eq $target_dom) { $hom_flag = 1; last; } else { if(exists $linked{$query_dom.'_'.$target_dom}->{$query_taxon} || exists $linked{$target_dom.'_'.$query_dom}->{$query_taxon}) { next; } } my $dom_string = ''; if (exists $linked{$query_dom.'_'.$target_dom}) { $dom_string = $query_dom.'_'.$target_dom; } elsif (exists $linked{$target_dom.'_'.$query_dom}) { $dom_string = $target_dom.'_'.$query_dom } if(defined $dom_string) { foreach my $linking_taxon (sort keys %{$linked{$dom_string}}) { my $query_dom_qcount = $dom_counts{$query_dom}->{$query_taxon}; my $query_dom_lcount = $dom_counts{$query_dom}->{$linking_taxon}; my $target_dom_qcount = $dom_counts{$target_dom}->{$query_taxon}; my $target_dom_lcount = $dom_counts{$target_dom}->{$linking_taxon}; my $score = (1 / ($dom_counts{$query_dom}->{$query_taxon} + $dom_counts{$query_dom}->{$linking_taxon})) + (1 / ($dom_counts{$target_dom}->{$query_taxon} + $dom_counts{$target_dom}->{$linking_taxon})); if ($score > $best_score) { $best_score = $score; $best_target_dom = $target_dom; $best_query_dom = $query_dom; $best_linking_taxon = $linking_taxon; } } } } } next if $hom_flag; if($best_score > $max_score) { foreach my $target_md5 (@{$target_mdas{$target_mda}}) { #print "$query_md5\t@{$query_mdas{$query_md5}}\t$target_md5\t$target_mda\t$best_linking_taxon\t$best_query_dom\t$best_target_dom\t$best_score\n"; my $uniprot_query_string = join "\;", @{$md52uniprot{$query_md5}->{$query_taxon}}; my $uniprot_target_string = join "\;", @{$md52uniprot{$target_md5}->{$query_taxon}}; #print "$uniprot_query_string\t$uniprot_target_string\t$best_score\n"; if (!$md5_output) { $results{$uniprot_query_string."\t".$uniprot_target_string} = $best_score; } else { $results{$query_md5."\t".$target_md5} = $best_score; } } } } } foreach my $result (sort {$results{$b}<=>$results{$a}} keys %results) { print "$result\t$results{$result}\n"; } sub print_usage { my $err_msg = shift; print <\t\t\t\n # The MDA is the list of domains, separated by # spaces ########################################### -h | -help Print this message -t NCBI taxon code for query organism -q Ensembl Protein Accession (or hex MD5) query -b Batch file of Ensembl Protein Accessions (or hex MD5s) One ID per line -seq Protein sequence of query -db Database of taxon/protein/domains -s Score cutoff (0-1, 1 is best, 0.65 recommended) output is to STDOUT USAGE print "$err_msg\n" if defined $err_msg; }