#!/usr/local/bin/perl -w

######################################################
#  Get bacterial genomes from NCBI                   #
#                                                    #
#  Usage: FTP GenBank files for microbial            #
#  genomes, concatenate all files for one genome     #
#  (i.e. plasmid and chromosome sequences), in GFF,  #
#  fasta, and "*.genes" files, and                   #
#  create list of genomes.                           #
#                                                    #
# OPTIONS: -f  forces update of all files            #
#          -n  looks for only new genomes            #
#          -s  species_name                          #
#              looks for specified species           #
#                                                    #
#  Kim Wong kwong@bcgsc.bc.ca                        #
#  Sheldon McKay smckay@bcgsc.bc.ca                  #
######################################################

$| = 1;
use strict;
use Getopt::Std;
use Net::FTP;
use Bio::SeqIO;

#
# Command-line switches:
#   -n       only fetch genomes that are not yet present locally
#   -f       force a refresh of every genome
#   -s NAME  restrict the run to the species directory NAME
#
# getopts returns false for an unknown switch or a missing argument;
# the result is kept so the run can be aborted with the usage text below
# instead of silently continuing with a half-parsed command line.
#
my %opt = ();
my $opts_ok = getopts('nfs:', \%opt);


###################################################
# USER-SPECIFIC INFO
#
# MAKE NECESSARY CHANGES BELOW !
###################################################
#
# species root directory
# you will need write permission
my $seq_dir  = '/home/user/path/species';
#
# email (for anonymous ftp password)
my $email = 'somone@somewhere.org';
#
####################################################

# FTP site and directory -- should not have to change these
my $ftp_site = 'ftp.ncbi.nlm.nih.gov';
my $ftp_cwd  = '/genomes/Bacteria';


# Plain assignments rather than "my $x = 1 if COND": perlsyn documents
# the conditional-declaration form as having undefined behaviour.
my $force = $opt{'f'} ? 1 : undef;
my $new   = $opt{'n'} ? 1 : undef;
my $spec  = $opt{'s'} || '';

my $usage = <<END;
Usage: ftp_genomes [-fn] [-s species_name]
       -n : find only new bacterial species on NCBI ftp site
       -f : force an update of all bacterial species on NCBI ftp site 
       -s : specify species to update (use name from NCBI ftp site)
END

# A trailing "-s" with no value leaves the key present but undefined.
die "-s requires an argument\n"
    if exists $opt{'s'} && !defined $opt{'s'};

die $usage unless $opts_ok;
die $usage unless $force || $new || $spec;

if ( $new && $force ) {
    # -n wins over -f when both are given.
    undef $force;
    warn "-f flag ignored; updating new species only\n";
}
elsif ( $force ) {
    warn "Forced update invoked\n";
}
else {
    warn "Adding new records only...\n";
}


#
# Make sure we are in the correct directory
#

# All local paths used later are relative to the species root directory.
chdir $seq_dir
    or die "Did not change to correct directory $seq_dir: $!\n";
chomp ( my $dir=`pwd` );
print "Working directory is $dir\n";

#
# Log on to the ftp site
#
# Net::FTP has no error() method.  A failed constructor leaves its
# reason in $@; failures on an existing connection are reported through
# $ftp->message (the server's last response text).
#

my $ftp = Net::FTP->new($ftp_site)
    or die "Not connected to ftp site: $@\n";
$ftp->login('anonymous', $email)
    or die "Could not log in to ftp site: ", $ftp->message;
$ftp->cwd($ftp_cwd)
    or die "Could not change directories to $ftp_cwd: ", $ftp->message;

print "Logged on to the ftp site\n";

#
# Define arrays to store names of updated folders
# and a complete list of genomes downloaded 
#

# @updated_list  : species directories downloaded during this run
# @complete_list : every species directory present locally after this run
# $count         : running total of CDS features converted to GFF
my @updated_list = ();
my @complete_list = ();
my $count=0;

#
# Find out which genomes are available
# Update all if forced, or add new genomes only
#
# For every species directory on the server:
#   1. skip it if it is already present locally (unless -f was given),
#   2. download all of its *.gbk files,
#   3. convert them into <species>.fa, <species>.genes and <species>.gff,
#   4. delete the *.gbk files and gzip the GFF file.
# Files for several replicons of one genome end up concatenated in the
# same output files, distinguished by the gbk file name (accession).
#

#get_one($spec) if $spec;

FOO: for my $spec_dir ($ftp->ls) {

    next FOO if $spec && $spec ne $spec_dir;

    unless ($ftp->cwd($spec_dir)) {
        warn "$spec_dir not a directory?\n";
        next FOO;
    }

    if (-e $spec_dir) {
        unless ($force) {
            push @complete_list, $spec_dir;
            warn "$spec_dir already done, use -f to force\n";
            $ftp->cwd('..') or die "not in dir $ftp_cwd";
            next FOO;
        }
    }
    else {
        # Built-in mkdir: the name comes from the server and must not
        # be passed through a shell.
        unless (mkdir("$seq_dir/$spec_dir", 0777)) {
            warn "Can't create directory $seq_dir/$spec_dir: $!\n";
            $ftp->cwd('..') or die "not in dir $ftp_cwd";
            next FOO;
        }
    }

    # List context is required here: in scalar context Net::FTP::ls
    # returns an array reference, which is true even for an empty listing.
    my @remote_gbk = $ftp->ls('*.gbk');
    unless (@remote_gbk) {          # some directories are empty
        rmdir $spec_dir;            # only succeeds on an empty directory
        $ftp->cwd('..') or die "not in dir $ftp_cwd";
        next FOO;
    }

    push @updated_list, $spec_dir;
    for my $file (@remote_gbk) {
        if ($ftp->get($file, "$spec_dir/$file")) {
            print "Got $file\n";
        }
        else {
            warn "Could not get $file: ", $ftp->message;
        }
    }

    push @complete_list, $spec_dir;

    #
    # Get Accession and sequence source information
    # from 'Description' line in gbk files
    # and use for Fasta headers.
    #

    unless (chdir $spec_dir) {
        warn "Can't find directory $spec_dir\n";
        $ftp->cwd('..') or die "not in dir $ftp_cwd";
        next FOO;
    }

    # glob returns an empty list when nothing matches (a shell "echo"
    # would hand back the literal pattern instead).
    my @gb_files = glob '*.gbk';

    #
    # Write over any existing files
    #

    open my $genes_fh, '>', "$spec_dir.genes"
        or die "Can't write $spec_dir.genes: $!\n";
    open my $fa_fh, '>', "$spec_dir.fa"
        or die "Can't write $spec_dir.fa: $!\n";
    my @strings = ();

    for my $gb (@gb_files) {
        my ($gb_id) = $gb =~ /(.+)\.gbk/;
        my $in = Bio::SeqIO->new( -file   => $gb,
                                  -format => 'genbank'
                                ) or die "Can't read $gb as GenBank\n";

        while (my $seq = $in->next_seq) {
            my $header = $seq->desc;
            print "Processing ", $header, "..\n";

            # Wrap at 60 columns.  The look-ahead keeps a sequence whose
            # length is an exact multiple of 60 from gaining an extra
            # newline (and thus a blank line in the FASTA file).
            (my $dna = $seq->seq) =~ s/(\w{60})(?=\w)/$1\n/g;
            print {$fa_fh} ">$gb_id $header\n$dna\n";

            #
            # Make a GFF record for each ORF
            # and a simple gene list with coords and strand.
            #
            # First column is Accession which matches original
            # gbk file name and fasta file header
            #
            # NOTE(review): each_tag_value is reportedly deprecated in
            # newer BioPerl releases in favour of get_tag_values --
            # confirm against the installed version.
            #
            for my $feat ($seq->all_SeqFeatures) {
                next unless $feat->primary_tag eq "CDS";

                my $gff = $feat->gff_string;
                next unless $gff;
                $gff =~ s/^SEQ/$gb_id/;
                $count++;
                push @strings, $gff;

                my $strand = $feat->strand < 0 ? '-' : '+';

                # Prefer the 'gene' tag, fall back to 'product'; a CDS
                # with neither is left out of the gene list (it is still
                # in the GFF output).
                my ($gene) = eval { $feat->each_tag_value('gene'); };
                if ($@) {
                    ($gene) = eval { $feat->each_tag_value('product'); };
                    next if $@;
                }
                print {$genes_fh} "$gb_id\t", "$gene\t", $feat->start, "\t",
                    $feat->end, "\t", $strand, "\n";
            }
        }
    }

    close $genes_fh or warn "Problem closing $spec_dir.genes: $!\n";
    close $fa_fh    or warn "Problem closing $spec_dir.fa: $!\n";

    # Write the GFF file once, after every sequence has been collected,
    # rather than truncating and rewriting it for each sequence.
    open my $gff_fh, '>', "$spec_dir.gff"
        or die "Can't write $spec_dir.gff: $!\n";
    print {$gff_fh} "$_\n" for @strings;
    close $gff_fh or die "Problem closing $spec_dir.gff: $!\n";

    unlink(@gb_files) == @gb_files
        or warn "Can't remove gbk file: $!\n";

    # List-form system bypasses the shell; -f forces overwrite of an
    # existing .gz file.  system returns 0 on success.
    system('gzip', '-f', "$spec_dir.gff") == 0
        or warn "gzip of $spec_dir.gff failed\n";

    chdir $dir or die "Can't return to $dir: $!\n";
    $ftp->cwd('..') or die "not in dir $ftp_cwd";

}

#
# Print out a list of all genomes available, and
# indicate the genomes added during the most
# recent run
#

# Timestamp string for the report (localtime in scalar context).
my $day = localtime;

# Genome_list is rewritten from scratch on every run, in the current
# working directory.
open my $list_fh, '>', 'Genome_list'
    or die "Can't write Genome_list: $!\n";
print {$list_fh} "Complete list of available genomes:\n\n";

if ($force) {
    print {$list_fh} "All files updated on: $day\n\n";
}
else {
    print {$list_fh} "(Updated on $day to download\n all newly available genomes)\n\n";
}

# The list announced by the heading: every genome present after this run.
print {$list_fh} "$_\n" for sort @complete_list;

# On a non-forced run, also report which genomes were newly added.
unless ($force) {
    print {$list_fh} "\n\n" . "*" x 20;
    if (@updated_list) {
        print {$list_fh} "\n\nThese genomes were added $day:\n\n";
    }
    else {
        print {$list_fh} "\n\nNo new genomes were added $day\n\n";
    }
    print {$list_fh} "$_\n" for sort @updated_list;
}

close $list_fh or die "Problem closing Genome_list: $!\n";

__END__

=pod

=head1 NAME

 ftp_genome - retrieve and format GenBank files

=head1 DESCRIPTION

  Uses ftp to retrieve GenBank files for microbial            
  genomes, and formats files to use for 'genome_primer.'
  
  Output files :
  
  *.gff.gz  - zipped flat file with gene annotations
  *.fa      - fasta file DNA sequence
  *.genes   - list of genes with coordinates and strand
  
  Files for genomes with more than one chromosome 
  and/or plasmid are concatenated, and differentiated
  by GenBank accession numbers.


=head1 OPTIONS

  -f    - forces update of all files           
  -n    - looks for only new genomes           
  -s species_name                           
        - looks for specified species

       Species name must appear exactly as listed
       in GenBank, with underscores replacing any
       blank spaces.

=cut
