Reverse Complement Perl

#!
/usr/bin/perl
#
# A hacky implementation of a genetic algorithm which looks for overrpresented m
otifs in a test set (compared to a control set)
#
use strict;
my @alphabet = ("A", "T", "G", "C", ".", "Y", "R", "S", "B", "D", "H", "V", "M",
"K", "W"); # the alphabet that will make up all of the motifs in our genepool.
Here we use "." instead of N (. matches anything in a regular expression)
my $DEBUG = 0; # if this is set to 1 then the script outputs more debug informat
ion to STDOUT
# some settings
my $max_generations = 100000; # how many generations should we run for before as
suming that the GA isn't going to produce a better result?
my $max_genepool = 32; # how big should the genepool be, initially?
my $max_pattern_length = 12; # how big should the patterns we seed the genepool
with initially be?
my $endpoint = 75; # what's our endpoint? See the tutorial writeup for more info
rmation
my $mutate_rate = 20; # what's the chance (in 1000) of a particular motif being
mutated in any one generation?
my $max_loops = 250; # used when picking motifs to carry over to the next genera
tion
my $toxic_at = 2000; # we keep track of the best fitness in the genepool. If it
stays stagnant then we gradually increase the mutation rate ($mutate_rate). At s
ome point - 2000 - then we should give up as the GA obviously isn't going to pro
duce a better result.
my $check_reverse = 1; # should we check the reverse complement of sequences?
my $check_dir = 1; # should we check both 5' to 3' and 3' to 5'?
my $collect = 32; # how many times should we run the algorithm? (to collect only
one motif, change this to 0)
my %collected; # used to store solutions whose fitness exceeds the endpoint
# data to use during fitness evalution
my @pos = get_seqs_from_fasta("clusters_5_train.fa"); # converts the sequences i
n a FASTA file to an array
my @neg = get_seqs_from_fasta("clusters_5_control.fa"); # ditto
my @genepool = create_life(); # first seed the genepool
my $current_best; # we keep track of the current best fitness in the genepool
my $current_mutate_rate; # we vary the mutation rate; see description of $toxic_
at, above
my %fitness_cache; # we keep track of the fitnesses of motifs so that if the sam
e motif comes up in another generation we don't have to recalculate fitness
for (my $generation = 0; $generation < $max_generations; $generation++) {
print STDERR "GENERATION $generation (COL ".scalar(keys(%collected)).", CMR $c
urrent_mutate_rate, POOLSIZE ".scalar(@genepool).") ==> ";
# assess fitness
my %fitness;
foreach my $chr (@genepool) {
if ($fitness_cache{$chr}) {
$fitness{$chr} = $fitness_cache{$chr};
} else {
$fitness{$chr} = assess_fitness($chr);
$fitness_cache{$chr} = $fitness{$chr};
}
print STDERR $fitness{$chr}." " if $DEBUG;
if ($fitness{$chr} >= $endpoint) { # if the fitness of the motif we are curr
ently evaluating is more than the endpoint...
if (!$collect) { # if collect is 0, just print out the current motif and e
nd the script
print STDERR "\nEndpoint ($endpoint) achieved with $chr\n";
print STDERR assess_fitness($chr, 1);
exit;
}
}
}
print STDERR "\n" if $DEBUG;

# sort by fitness
my @sorted = sort {$fitness{$b} <=> $fitness{$a}} keys(%fitness);
print STDERR "Best: ".$sorted[0].", ".assess_fitness($sorted[0], 1)." (".$fitn
ess{$sorted[0]}.")\n";
if (($collect) && ($fitness{$sorted[0]} >= $endpoint)) { # if the fitness of t
he motif we are currently evaluating is more than the endpoint...
print STDERR "Collected another motif (".$sorted[0]."), reseeding genepool a
nd starting afresh.\n";
$collected{$sorted[0]} = 1;
if (scalar(keys(%collected)) >= $collect) {
print STDERR "Collected enough motifs ($collect).";
open(LOG, ">ga_log.txt");
foreach my $key (keys(%collected)) {
print LOG $key."\n";
}
close(LOG);
exit;
}
@genepool = create_life(); # start the whole process again until we've colle
cted $collect motifs.
$generation = 0;
next;
}
if ($fitness{$sorted[0]} eq $current_best) {
$current_mutate_rate++; # increase the mutation rate if the pool looks stale
(we're stuck at a local maxima)
if ($current_mutate_rate >= $toxic_at) {
print STDERR "Pool too toxic (toxic at $toxic_at)!\n";
foreach my $chr (@genepool) {print STDERR "$chr ==> ".$fitness{$chr}."\n";
}
exit;
}
} else {
$current_mutate_rate = $mutate_rate;
}
$current_best = $fitness{$sorted[0]}; # keep track of the current best

# select half of the pool as breeders
my @selected;
my $loops = 0;
while ((scalar(@selected) <= (scalar(@genepool) / 2)) && ($loops <= $max_loops
)) {
for (my $i=0; $i < scalar(@sorted); $i++) {
if ((rand(100) <= $fitness{$sorted[$i]}) || ($i <= 2)) {
$fitness{$sorted[$i]} = -1; # take it out of the running
push(@selected, $sorted[$i]);
}
}
$loops++;
}

# choose how many children we should create
my $num_children = (scalar(@genepool) / 4) + rand(scalar(@genepool) / 2);
if (scalar(@genepool) >= $max_genepool) {
$num_children = ($num_children / 4) * 3; # reduce number of kids if there's
a surplus population
}

# kill off the unselected motifs
@genepool = @selected;

# breed
for (my $i=0; $i < $num_children; $i++) {
my $child;

# pick two parents
my $parent_a = $genepool[pickrand(\@genepool)];
my $parent_b = $genepool[pickrand(\@genepool)];

# do some recombination
my $half_a = length($parent_a) / 2;
my $half_b = length($parent_b) / 2;

if (rand(100) <= 50) {$child .= substr($parent_a, 0, $half_a);} else {$child
.= substr($parent_a, $half_a);}
if (rand(100) <= 50) {$child .= substr($parent_b, 0, $half_b);} else {$child
.= substr($parent_b, $half_b);}

push(@genepool, $child);
}

# mutate motifs if necessary
for (my $c=0; $c < scalar(@genepool); $c++) {
my $chr = $genepool[$c];

if (rand(1000) <= $current_mutate_rate) {
my $type = rand(100);

if ($type <= 25) {
# polymorphism
my $pos = int(rand(length($chr) - 1));
my $newpos = pickrand(\@alphabet);
my $oldpos = substr($chr, $pos, 1, $newpos);
print STDERR "Mutation: replaced $oldpos with $newpos at pos $pos, chr $
c\n" if $DEBUG;
$genepool[$c] = $chr; # update the genepool
} elsif ($type <= 50) {
# insertion
my $pos = int(rand(length($chr) - 1)); # insert after this base
my $newpos = pickrand(\@alphabet);
my $existing_pos = substr($chr, $pos, 1);
my $oldpos = substr($chr, $pos, 1, $existing_pos.$newpos);
print STDERR "Mutation: added $newpos after $existing_pos at pos $pos, c
hr $c\n" if $DEBUG;
} elsif ($type <= 75) {
# deletion
my $oldpos = substr($chr, $pos, 1, "x");
$chr =~ s/x//g;
print STDERR "Mutation: deleted base $oldpos at pos $pos, chr $c\n" if $
DEBUG;
} else {
# gross aberration (inversion, deletion, duplication)
my $gab = rand(100);
if ($gab <= 33) {
# inversion
my $maxlen = length(substr($chr, $pos));
my $randlen = int(rand($maxlen)) + 1;
my $oldseq = substr($chr, $pos, $randlen);
my $was = substr($chr, $pos, $randlen, reverse_dna($oldseq));
print STDERR "Mutation: gross inversion from pos $pos of len $randlen
(".$was." to ".reverse_dna($oldseq)."), chr $c\n" if $DEBUG;
} elsif ($gab <= 66) {
# deletion
my $was = substr($chr, $pos, $randlen, "");
print STDERR "Mutation: gross deletion from pos $pos of len $randlen (
was len ".length($genepool[$c])." now ".length($chr)."), chr $c\n" if $DEBUG;
} else {
# duplication
my $was = substr($chr, $pos, $randlen, $oldseq.$oldseq);
print STDERR "Mutation: gross duplication of pos $pos, len $randlen ($
oldseq) on chr $c\n" if $DEBUG;
}

}
}
}
}
# picks an element from an array at random (can't remember why this is so comple
x!)
sub pickrand {
my @list = @{$_[0]};
my $rand = rand(1000);
my $num = scalar(@list);
my $binsize = 1000 / $num;

my $bin = int($rand / $binsize);

if ($list[$bin]) {return $list[$bin];} else {return pickrand(\@list);}
}
sub assess_fitness {
my $pattern = $_[0];
my $detailed = $_[1];
# count how many times the pattern occurs in the positive set vs. the negative
set.
# return the ratio.
# check the reverse complement of the sequence, too.

# if we're collecting, flatten the fitness of any seqs we've seen before.
if ($collected{$pattern}) {return 0;}
# don't let empty patterns count
if (length($pattern) <= 0) {return 0;}

my $cpos = count($pattern, \@pos);
my $cneg = count($pattern, \@neg);
my $pos = ($cpos / scalar(@pos)) * 100;
my $neg = ($cneg / scalar(@neg)) * 100;
if ($detailed) {
return sprintf("pos %d %d%%, neg %d %d%%", $cpos, $pos, $cneg, $neg);
} else {
return sprintf("%f", $pos - $neg); # as high as possible out of 100 would be
nice. Min -100, max 100
}
}
sub count {
my @seqs = @{$_[1]};
my %seen;
foreach my $seq (@seqs) {

if (rawcount($pattern, $seq)) {$seen{$seq} = 1; next;}
if ($check_reverse) {
my $revseq = reverse_dna($seq);
if (rawcount($pattern, $revseq)) {$seen{$seq} = 1; next;}
}
if ($check_dir) {
my $dirseq = reverse($seq);
if (rawcount($pattern, $dirseq)) {$seen{$seq} = 1; next;}
if ($check_reverse) {
my $revseq = reverse(reverse_dna($seq));
if (rawcount($pattern, $revseq)) {$seen{$seq} = 1; next;}
}
}
}
#print STDERR "Counted ".scalar(keys(%seen))." of $pattern\n";
return scalar(keys(%seen));
}
sub average {
my @list = @{$_[0]};
my $average;

foreach my $item (@list) {$average += $item;}

return $average / scalar(@list);
}
sub rawcount {
my $seq = $_[1];
# convert our motif into a regular expression
$pattern =~ s/R/\[AG\]/g;
$pattern =~ s/Y/\[CT\]/g;
$pattern =~ s/M/\[CA\]/g;
$pattern =~ s/K/\[TG\]/g;
$pattern =~ s/W/\[TA\]/g;
$pattern =~ s/S/\[CG\]/g;
$pattern =~ s/B/\[CTG\]/g;
$pattern =~ s/D/\[ATG\]/g;
$pattern =~ s/H/\[ATC\]/g;
$pattern =~ s/V/\[ACG\]/g;
$pattern =~ s/N/\[ATCG\]/g;
$pattern =~ s/2/\.\{1,2\}/g;
$pattern =~ s/3/\.\{1,3\}/g;
$pattern =~ s/4/\.\{1,4\}/g;
$pattern =~ s/5/\.\{1,5\}/g;
$pattern =~ s/6/\.\{1,6\}/g;
my $count = 0;
# try to match the regular expression
while ($seq =~ /$pattern/g) {
$count++;
}
return $count;
}
sub reverse_dna {
my $string = $_[0];
$string =~ tr/[A,T,G,C]/[T,A,C,G]/;
$string =~ tr/[R,Y,M,K,W,S]/[Y,R,K,M,S,W]/;
$string =~ tr/[B,D,H,V]/[V,H,D,B]/;
return reverse($string);
}
# this sub creates a new genepool, filling it with $max_genepool random motifs o
f max length $max_pattern_length
sub create_life {
my @genepool;

for (my $i=0; $i < $max_genepool; $i++) {
# what length of pattern to generate?
my $len = rand($max_pattern_length) + 1;
my $pattern;

for (my $k=0; $k < $len; $k++) {
$pattern .= pickrand(\@alphabet);
}

push(@genepool, $pattern);
}
return @genepool;
}
# simple helper function to convert a FASTA file to an array
sub get_seqs_from_fasta {
my $filename = $_[0];
my $seq_size = $_[1];
open(INPUT, $filename) or die("Couldn't open $filename (did you download them?
)");
my @lines = <INPUT>;
close(INPUT);
my $lines = "@lines";
my @seqs;
my @fasta = split(/>/, $lines);
foreach my $seq (@fasta) {
my @elements = split(/\n/, $seq);
$elements[0] = "";
$seq = "@elements";
$seq =~ s/\s//g;
if (length($seq) <= 0) {next;}
$seq = uc($seq); # only return uppercase (for the index function)

if ($seq_size) {
if (length($seq) <= $seq_size) {next;}
if (length($seq) > $seq_size) {
my $extra = length($seq) - $seq_size;
$seq = substr($seq, int($extra / 2), $seq_size);
}
}

if (scalar(@seqs) >= 50) {next;}
push(@seqs, uc($seq));
}
if (!scalar(@seqs)) {die("No sequences in file $filename");}
return @seqs;
}

Reverse Complement Perl

Enviado por

Dados do documento

Direitos autorais

Formatos disponíveis

Compartilhar este documento

Compartilhar ou incorporar documento

Opções de compartilhamento

Você considera este documento útil?

Este conteúdo é inapropriado?

Direitos autorais:

Formatos disponíveis

Reverse Complement Perl

Enviado por

Direitos autorais:

Formatos disponíveis

#!

Você também pode gostar