#!/usr/bin/perl

######################
#                    #
#  @rrays and input  #
#                    #
######################

#First, I would like to pass the name of a file to my perl program.
#I want to make sure that the program does not proceed unless it has
#the name of the file, since what I do further in the program does not make sense
#without this file.

# @ARGV is the array holding the command-line arguments handed to the script.
# Compared against a number it is evaluated in scalar context, i.e. it yields
# its length.  If that length is anything other than exactly 1, the program
# "dies": it prints the message below and exits.
die "please provide name of the file in the command line!!\n" if @ARGV != 1;

# Array indices start at 0, so the first (and here only) argument sits in
# slot 0; the list assignment copies that element into $filename.
($filename) = @ARGV;

#We will discuss input/output later, but it is very difficult to talk about arrays
#without a reasonably large dataset.
#The file I will read in contains a list of GI numbers.

# Open the file for reading.  The three-argument form of open keeps the mode
# ('<') separate from the file name, so a name that starts with '>' or ends
# with '|', or that has leading/trailing blanks, is still treated as a plain
# file name.  $in is a lexical filehandle, visible only from here on in this
# file, instead of a global bareword handle.
# The program exits with the system error ($!) if it cannot open the file
# (good against typos in the filename).
open(my $in, '<', $filename) or die "cannot open $filename:$!";

my @seq = ();	# array in which the GI numbers are stored, one per element

# Read the file one line at a time.  In a while condition the assignment from
# <$in> is implicitly wrapped in defined(), so a line consisting of just "0"
# does not end the loop early.
while (my $line = <$in>) {
	chomp($line);		# remove the trailing newline
	push(@seq, $line);	# append the line to the end of the array
}
close($in);

# Print every stored entry, each followed by a blank and a newline.
print "$_ \n" foreach @seq;
print "\n\n\n\n";

#one very useful function is the split function
#Example: split DNA sequence into an array of single nucleotide bases

$seq = "ATCGATGCGCGCGAAAA";

# split cuts the string $seq wherever the pattern between the two slashes
# matches and returns the pieces, which are stored in @bases.  The empty
# pattern // matches between every two characters, so each base ends up in
# its own array element.  Any regular expression can be used as the pattern;
# a useful one is /\s+/, which splits the string at any run of white space.
@bases = split(//, $seq);

# An array evaluated in scalar context yields its number of elements.
$length = scalar(@bases);
print "length of $seq is $length bases\n";

print "every third nucleotide only, starting with the first:\n";
# (remember that counting of array elements starts with 0!)
# Advancing the index by 3 visits exactly the positions 0, 3, 6, ...
for (my $pos = 0; $pos < $length; $pos += 3) {
	print $bases[$pos];
}
print "\n";



# HOMEWORK ASSIGNMENTS
# Create a program that reads in a sequence stored in a file handed to the program on the command line and 
# determines GC content of a sequence. 

# There is more than one way to implement this program.
# One approach is to get the whole sequence into one array;
# another would be to have a loop acting on every line.

# If you are up for a challenge, modify the program to print the numbers of Gs and Cs
# in a rolling window of variable size (100 might be a good starting point?).




