#!/usr/bin/perl -w -s ##########INPUT Sequence, concatenated into a single string########## #skip annotation lines in case of fasta. if multiple annotation lines, concatenate these too. # unless(@ARGV==1) {die "please provide name of the file in the command line!!\n";} my $filename=$ARGV[0]; #takes filenname from input line open(IN, "< $filename") or die "cannot open $filename:$!"; #assigns filehandle IN to filename or dies my $seq=''; #assigns empty string my $line=''; my $name=''; my @bases=(); #assigns empty list while(defined($line=)){ chomp($line); if ($line=~/^>/) { #look for beginning of line starting with > (^ is an anchor for the beginning of the line) $name .= $line; } else { $seq .= $line ; } } #################### move sequence to array # check for all CAPS, report non ATGCs, remove white spaces # $seq =~ tr/atgc/ATGC/; #translates all ATGC to upper case $seq =~ s/\s//g;# substitutes all white spaces \s with nothing globally in $seq @bases=split(//,$seq); #splits string into separate elements (bases) my$num_bases=@bases; #length of array ###################calculate GC content my$num_GC=0; for ($i=0; $i<($num_bases); $i++) #counts Gs and Cs in @bases Note the number of bases is one larger than the array { if(($bases[$i]=~"G") || ($bases[$i]=~"C")) #if it matches G or C increase counter {$num_GC++; } if (!(($bases[$i]=~"G") || ($bases[$i]=~"A") || ($bases[$i]=~"T") || ($bases[$i]=~"C"))) {print "Warning there is a strange base $bases[$i] before position $i\n"; my$errors++;} } if (defined ($errors)){$num_bases=$num_bases-$errors}; $GC_content=($num_GC/$num_bases)*100; print "\nThe GC content of the sequence in the file ".'"'."$filename".'"'. " is $GC_content\%.\n\n"; if (!($name eq '')) {print "Annotation line(s) in $filename was/were $name\n";} # print "The sequence analyzed was:\n"; # print @bases; ########determine the number of di nucleotide repeats for (my $j=0; $j<($num_bases-1); $j++) { my $di=($bases[$j].$bases[$j+1]); push(@di_nucleotides, $di); } my %di=(); foreach my $i (@di_nucleotides) { if(!(exists $di{$i})) { $di{$i}=1; } else {$di{$i} += 1 } } #foreach $keys (sort keys %di) { # print "$keys => $di{$keys}\t"; # } ########determine the number of tri nucleotide repeats for (my $j=0; $j<($num_bases-2); $j++) { my $tri=($bases[$j].$bases[$j+1].$bases[$j+2]); push(@tri_nucleotides, $tri); } my %tri=(); foreach my $i (@tri_nucleotides) { if(!(exists $tri{$i})) { $tri{$i}=1; } else { $tri{$i} += 1 } } #foreach $keys (sort keys %tri) { # print "$keys => $tri{$keys}\t"; # } ########determine the number of di nucleotide repeats for (my $j=0; $j<($num_bases-3); $j++) { my $quad=($bases[$j].$bases[$j+1].$bases[$j+2].$bases[$j+3]); push(@quad_nucleotides, $quad); } my %quad=(); foreach my $i (@quad_nucleotides) { if(!(exists $quad{$i})) { $quad{$i}=1; # you coul dmake the program faster by skipping the (if !(exist ...) statement and just increase the hash for each encounter. } else { $quad{$i} += 1 } } #you could sort the keys of hash by their their values: @sorted_array = sort {$quad{$b} <=> $quad{$a} or $a cmp $b } (keys(%quad)); open (OUT, "> out.txt") or die "cannot open $filename:$!"; foreach (@sorted_array) { print OUT "$_\t$quad{$_}\n"; print "$_ => $quad{$_}\n"; } print "\n"; close (OUT);