#!/usr/local/bin/perl

########################################################################################
#Class 9
#For a given genome record (in GenBank format) create a table of protein ORF GIs and their 
#corresponding coding regions in the genome
#
#This information is available in the FEATURES:CDS field of GenBank format
#
########################################################################################

use strict;
use warnings;

# ---------------------------------------------------------------------------
# Main script.
#
# Input : one command-line argument, the path of a GenBank flat file.
# Output: "gi2dna.tab" in the current directory, one line per CDS feature
#         that carries a /db_xref="GI:<number>" qualifier:
#             <GI number> TAB <CDS location string>
# Dies  : on wrong argument count, unreadable input, input that does not
#         start with a LOCUS line, or failure to open/close the output file.
#
# Only the feature table of the first record is read: parsing starts after
# the FEATURES line and stops at the first ORIGIN line.
# ---------------------------------------------------------------------------

unless (@ARGV == 1) { die "usage: class9.pl infile_in_GenBank_format"; }

my $infile = $ARGV[0];

# Three-argument open: the file name comes from the command line and must
# never be interpreted as an open mode or a pipe.
open(my $gbk, '<', $infile) or die "Can't open $infile:$!";

# The first line of the file is expected to be the LOCUS line; an empty
# file (undefined first line) is rejected as well.
my $line = <$gbk>;
unless (defined $line && $line =~ /^LOCUS\b/) {
    die "Infile is not in GenBank format";
}

my $outfile = "gi2dna.tab";
open(my $out, '>', $outfile) or die "Can't open $outfile:$!";

# Skip the header section up to and including the FEATURES line.
while (defined($line = <$gbk>)) {
    last if $line =~ /^FEATURES/;
}

my $cds;                # location of the CDS being read; undef outside a CDS
my $in_location = 0;    # true while location continuation lines may follow

while (defined($line = <$gbk>)) {
    chomp $line;
    $line =~ s/\r\z//;  # tolerate files with CRLF line endings

    # ORIGIN at the start of a line marks the end of the feature table.
    last if $line =~ /^ORIGIN/;

    # Feature key line: five leading blanks, the key, then the location.
    # Qualifier lines are indented further and therefore cannot match here.
    if (my ($key, $location) = $line =~ /^ {5}(\S+)\s+(\S+)/) {
        if ($key eq 'CDS') {
            $cds         = $location;
            $in_location = 1;
        }
        else {
            # A new non-CDS feature ends the current CDS, so that its
            # qualifiers are never attributed to the previous CDS.
            $cds         = undef;
            $in_location = 0;
        }
        next;
    }

    # Everything below only matters while inside a CDS feature.
    next unless defined $cds;

    if ($line =~ m{^\s+/}) {
        # First qualifier reached: the location is complete.
        $in_location = 0;

        # Only a GI cross-reference is reported; other db_xref databases
        # (taxon, GeneID, ...) are ignored.
        if ($line =~ m{^\s+/db_xref="GI:(\d+)"}) {
            print {$out} "$1\t$cds\n";
            $cds = undef;    # one output line per CDS
        }
    }
    elsif ($in_location && $line =~ /^\s+(\S+)/) {
        # Long locations such as join(...) wrap over several lines.
        $cds .= $1;
    }
}

close($gbk);
# Buffered write errors only surface at close, so check it.
close($out) or die "Can't close $outfile:$!";



########################################################################################
#Homework:
#
#Modify this program to calculate AT-content of each ORF of the genome 
#and print a table in the form:
# 	protein GI	AT content
#
#There are several ways to obtain AT-content calculation per ORF 
#(some are more difficult than others), 
#it is for you to choose
#
########################################################################################
