#!/usr/bin/perl
#
# Count overlapping n-mers (triplets by default) in a nucleotide sequence.
#
# Usage: perl <this-script> <sequence-file>
#
# The input may be plain sequence text or FASTA. Lines beginning with '>'
# are annotation lines and are skipped; every other line is concatenated
# into one sequence string. The sequence is normalised (a/t/g/c are
# upper-cased, all whitespace removed) and every window of N consecutive
# characters is counted. One line per distinct n-mer is printed to STDOUT,
# most frequent first, ties broken alphabetically:
#
#     ATG occurred 2 times
#
use strict;
use warnings;

# Run as a program only when executed directly, so the helper subs below
# can be loaded (require/do) without triggering the command-line handling.
main(@ARGV) unless caller;

# main(@args): command-line entry point.
# Expects exactly one argument, the name of the sequence file; dies with a
# usage message otherwise.
sub main {
    my @args = @_;

    unless (@args == 1) {
        die "please provide name of the file in the command line!!\n";
    }
    my $filename = $args[0];

    my $nmer_length = 3;    # if you want different n-mers change this number

    my $seq      = clean_sequence(read_sequence($filename));
    my $count_of = count_nmers($seq, $nmer_length);

    for my $nmer (sorted_nmers($count_of)) {
        print "$nmer occurred $count_of->{$nmer} times\n";
    }
    return;
}

# read_sequence($filename): return the file's sequence lines concatenated
# into a single string (line endings removed). Lines starting with '>' are
# FASTA annotation lines and are not part of the sequence.
# Dies if the file cannot be opened.
sub read_sequence {
    my ($filename) = @_;

    # Three-argument open: the file name is never interpreted as a mode.
    open(my $in, '<', $filename) or die "cannot open $filename:$!";

    my $seq = '';
    while (defined(my $line = <$in>)) {
        chomp $line;
        next if $line =~ /^>/;    # skip annotation line
        $seq .= $line;
    }
    close $in;

    return $seq;
}

# clean_sequence($seq): return a normalised copy of $seq.
# Lower-case a, t, g and c are upper-cased and all whitespace is removed.
# Any other character is left untouched, so it still shows up in the
# n-mer counts.
sub clean_sequence {
    my ($seq) = @_;

    $seq =~ tr/atgc/ATGC/;
    $seq =~ s/\s//g;

    return $seq;
}

# count_nmers($seq, $n): return a hash reference mapping each distinct
# run of $n consecutive characters in $seq to its number of (overlapping)
# occurrences. The result is empty when $seq is shorter than $n or when
# $n is less than 1.
sub count_nmers {
    my ($seq, $n) = @_;

    my %count_of;
    return \%count_of if $n < 1;

    # The last window starts at length - n; if the sequence is too short
    # the range is empty and nothing is counted.
    for my $start (0 .. length($seq) - $n) {
        $count_of{ substr($seq, $start, $n) }++;
    }

    return \%count_of;
}

# sorted_nmers($count_of): return the keys of the given count hash ordered
# by descending count, with ties in ascending string order.
sub sorted_nmers {
    my ($count_of) = @_;

    return sort { $count_of->{$b} <=> $count_of->{$a} or $a cmp $b }
        keys %{$count_of};
}