#!/usr/local/bin/perl
#######################
# @rrays and %ashes   #
#######################
#
# Reads GI numbers from the file named on the command line (each non-blank
# line is taken as one GI number, with all whitespace removed) and prints:
#   * how many GI numbers were read,
#   * how many different GI numbers there are and how often each occurred,
#     first sorted on the keys, then sorted on the counts,
#   * which GI numbers occurred exactly once.
#
# Usage: perl <this-script> <file-with-gi-numbers>

use strict;
use warnings;

############ INPUT same as in cass3.pl ##########

unless (@ARGV == 1) {
    die "please provide name of the file in the command line!!\n";
}

# The file to be read contains a list of GI numbers.
# (Could need a check for contents.)
my $filename = $ARGV[0];

# Again, the program will exit if it cannot open the file (good against
# typos in the filename).  This works because Perl first executes the left
# side of "or"; if open() succeeds the left side is true and Perl does not
# even try to evaluate the right-hand side.
# Three-argument open with a lexical handle: the mode can never be taken
# from the file name, and the handle is not a package-wide bareword.
open(my $in, '<', $filename) or die "cannot open $filename:$!";

my @gi = ();    # array in which the GI numbers are stored

# Populate the array with GI numbers.
while (defined(my $line = <$in>)) {
    # chomp($line) would only remove the \n at the end; instead substitute
    # all white space (\s) with nothing, globally, in $line.
    $line =~ s/\s//g;

    # An empty line must not be treated as a GI number.
    if ($line ne '') {
        push(@gi, $line);
    }
}
close($in);
# Done with input

########## Something was wrong - empty line treated as GI number
# debugging: screening for empty gi numbers
#$dummy=0;
#foreach (@gi) {
#    print " $dummy $_ \n";
#    $dummy ++;
#}
#exit;
# added empty line test above and remove white spaces

# How many GI numbers did I read in?
# $number_gis=@gi;
# print "I read in $number_gis GI numbers\n";
# $number_gis=(@gi);        # would work too
my $number_gis = scalar(@gi);
print "I read in $number_gis GI numbers\n\n";

# With the list of GI numbers I could now send a request to NCBI to return
# me sequences corresponding to these GI numbers.

# Now, suppose that not all GI numbers in my file are unique, i.e. the same
# GIs occur more than once in the list.  This often happens when GIs from
# several BLAST searches are combined.
# So, we want to get rid of duplicates.

#%gi_hash=();
#foreach $i (@gi){
#
#    if(!(exists $gi_hash{$i})){
#        $gi_hash{$i}=1;
#    }
#}

######################################
# task 4.2 :                         #
# HOW MANY DUPLICATES PER GI NUMBER? #
######################################

# Key: GI number.  Value: "how often did it occur?"
my %gi_hash = ();
foreach my $gi (@gi) {
    $gi_hash{$gi} += 1;
}

my @gi_names = sort(keys(%gi_hash));    # keys, sorted as strings
$number_gis = @gi_names;                # array in scalar context = number of different keys
print "I read $number_gis different gi numbers\n";
# above answers task "4.4. Write a script that determines the number of
# elements in a %ash"

foreach my $gi (@gi_names) {
    print "$gi occurred $gi_hash{$gi} times\n";
}
# above answers task 4.5: Write a script (or subroutine) that prints out a
# hash sorted on the keys in alphabetical order.

#########################################################
# But usually you want to SORT BY VALUES not by keys    #
#########################################################
# The sort command allows you to define a subroutine as follows
# (see old handout):
#@sorted_by_value = sort { $gi_hash{$a} <=> $gi_hash{$b}} keys %gi_hash;
# this also works:
#@sorted_by_value = sort by_value keys %gi_hash;
# (in contrast to the comment in the subroutine chapter this does not work
# with &)
#sub by_value { $gi_hash{$a} <=> $gi_hash{$b}}   # smaller before larger (a before b)
# even better:
my @sorted_by_value = sort by_value (keys(%gi_hash));

# Comparison routine for sort: orders GI numbers by their count in %gi_hash,
# smaller before larger (a before b).  If the counts are the same, the keys
# themselves are compared numerically (<=>); cmp would compare them
# alphabetically instead.  $a and $b are the package variables set by sort.
sub by_value {
    return $gi_hash{$a} <=> $gi_hash{$b}
        || $a <=> $b;
}

foreach my $gi (@sorted_by_value) {
    print "$gi => \t$gi_hash{$gi}\n";
}

# exit;

################################
# task 4.3:
# Count how many unique GI numbers are in the list, i.e. count the GI
# numbers in our %gi_hash hash that occurred exactly once.
################################

# @gi_names is already sorted, so filtering it keeps the sorted order.
my @gi_unique_array = grep { $gi_hash{$_} == 1 } @gi_names;
my $unique_count    = @gi_unique_array;

print "\n$unique_count gi numbers occurred only once.\n\n";

foreach my $gi (@gi_unique_array) {
    print "$gi occurred $gi_hash{$gi} times\n";
}