#!/usr/bin/perl -w

use strict ;
use warnings ;
use Carp ;

=head1 NAME 

mci - Index your CPAN mini repository

=head1 DOCUMENTATION

Let you create and search full text indexed database based on the content of files.

=head2 First indexing

Indexing a CPAN Mini repository takes between two and three hours on a modern box if you have your repository
on an old laptop, it's going to take ages. B<mci> can attempt to speedup the indexing by installing a pre-generated 
CPAN mini indexing database if you use the B<--pre_indexed> option.

=head1 USAGE

 $> siim --option --option  file_1 file_2 ... file_n

=head2 Examples

=over 2

=item 

  $> 

=item 

  $> 

=item command line completion (in your current shell)

  $> mci --completion_script > /tmp/mci ; source /tmp/mci ; rm /tmp/mci

=head1 OPTIONS

  'h|help'                display help
  'v|verbose'             
  'completion_script'     generates a bash completion script
  
  'cpan_mini'        cpan mini location or $ENV{CPAN_MINI} or '/devel/cpan'
  

=head1 EXIT STATUS

=head1 AUTHOR

	Nadim ibn hamouda el Khemir
	CPAN ID: NKH
	mailto: nkh@cpan.org

=cut

#------------------------------------------------------------------------------------------------------------------------

our $VERSION = '0.01' ;

use Getopt::Long ;
use Term::Bash::Completion::Generator ;
use Readonly ;
use English qw( -no_match_vars ) ;

use Time::HiRes 'time' ;
use File::Temp ;
use Search::Indexer::Incremental::MD5 qw() ;
use Search::Indexer::Incremental::MD5::Indexer qw() ;
use Search::Indexer::Incremental::MD5::Searcher qw() ;

#---------------------------------------------------------------------------------------------------------

my (@siim_arguments) = @ARGV ;
undef @ARGV ;

mci(@siim_arguments) ;

#---------------------------------------------------------------------------------------------------------

sub mci
{

#~ =head2 mci(@arguments)

#~ 

#~ I<Arguments> - command line arguments to be parsed by Getop::Long

#~ I<Returns> -  Nothing

#~ I<Exceptions> - 

#~ =over 2 

#~ =item Invalid options

#~ =item Invalid index database

#~ =back

#~ =cut

my (@arguments) = @_ ;
my ($options, @files)  = get_options(@arguments) ;

generate_completion_script() if $options->{completion_script} ;

if($options->{show_info})
	{
	my $information = Search::Indexer::Incremental::MD5::show_info($options->{index_directory}) ;
	
	print <<"EOI" ;
Location: $options->{index_directory}
Last updated on: $information->{update_date}
Number of indexed documents: $information->{entries }
Database size: $information->{size} bytes
EOI
	}

#todo: handle pre_indexed option

check_index($options) ;

if($options->{search})
	{
	my @stopwords = (STOPWORDS => $options->{stopwords_file}) if $options->{stopwords_file} ;

	my $searcher 
		= eval 
			{
			Search::Indexer::Incremental::MD5::Searcher->new
				(
				INDEX_DIRECTORY => $options->{index_directory}, 
				USE_POSITIONS => 0, 
				WORD_REGEX => qr/\w+/,
				@stopwords ,
				get_perl_word_regex_and_stopwords(),
				);
			} or croak "No full text index found! $@\n" ;

	my $results  = $searcher->search(SEARCH_STRING => $options->{search}) ;

	my @indexes = map { $_->[0] }
					reverse
						sort { $a->[1] <=> $b->[1] }
							map { [$_, $results->[$_]{SCORE}] }
								0 .. $#$results ;

	for my $index (@indexes)
		{
		my $matching_file = $results->[$index]{PATH} ;
		
		unless($matching_file)
			{
			carp "matched id:'$results->[$index]{ID}' which was removed!\n" ;
			next ;
			}
		
		(my $matching_file_short = $matching_file) =~ s{^/tmp/[^/]+/}{} ;
		
		if($options->{verbose})
			{
			print "'$matching_file_short' [id:'$results->[$index]{ID}', score: '$results->[$index]{SCORE}]'\n" ;
			}
		else
			{
			print "$matching_file_short\n" ;
			}
		}
	}
}

#----------------------------------------------------------------------------------------------------------

sub check_index
{

#~ =head2 check_index($indexer, $options)

#~  brings the cpan mini index database up to date

#~ I<Arguments> - 

#~ $indexer, $options

#~ I<Returns> -  Nothing

#~ I<Exceptions> - 

#~ =cut

my ($options) = @_ ;

my @stopwords = (STOPWORDS => $options->{stopwords_file}) if $options->{stopwords_file} ;

my $indexer = Search::Indexer::Incremental::MD5::Indexer->new
				(
				INDEX_DIRECTORY => $options->{index_directory}, 
				USE_POSITIONS => 0, 
				WORD_REGEX => qr/\w+/,
				@stopwords,
				get_perl_word_regex_and_stopwords(),
				) ;

my $cpan_mini = $options->{cpan_mini} || $ENV{CPAN_MINI} || '/devel/cpan' ;
printf "[CPAN mini repository in '$cpan_mini']\n" if ($options->{verbose}) ;

my $t0_index = time;

#todo: remove tar call
my %modules_in_repository = map {chomp($_) ; $_ => 1} `find $cpan_mini -name '*.tar.gz'` ; #todo: get module list from 02xxxx
my %modules_up_to_date ;
my %indexed_modules_to_remove ;

#~ use Data::TreeDumper ;
#~ print DumpTree \%modules_in_repository,  '%modules_in_repository:', DISPLAY_ADDRESS => 0, QUOTE_HASH_KEYS => 1 ;

$indexer->check_indexed_files
		(
		DONE_ONE_FILE_CALLBACK =>
			sub 
			{
			my ($file, $description, $file_info) = @_ ;			
			
			if(exists $modules_in_repository{$cpan_mini . $description})
				{
				# we can't delete $modules_in_repository{$cpan_mini . $description} as
				# it may contain multiple indexed files
				$modules_up_to_date{$cpan_mini . $description}++ ;
				}
			else
				{
				#~ print "'" . $cpan_mini . $description . "' " ;
				#~ print $modules_in_repository{$cpan_mini . $description} . "\n" ;
				
				$indexed_modules_to_remove{$description}{$file} = $file_info->{ID} ;
				}
			},
		) ;

# remove modules that don't exist anymore
my $t0_remove = time ;

my $number_of_modules = scalar(keys %indexed_modules_to_remove) ;
my $module_index = 0 ;
my $total_number_of_files = 0 ;

for my $module_to_remove(sort keys %indexed_modules_to_remove)
	{
	my $t0_remove_module = time ;
	
	$module_index++ ;
	print "-$module_to_remove\n" ;
	
	my $number_of_files_in_module = 0 ;
	
	for my $module_element (sort keys %{$indexed_modules_to_remove{$module_to_remove}})
		{
		(my $module_element_short = $module_element) =~ s{^/tmp/[^/]+/}{} ;
		
		$total_number_of_files++ ;
		$number_of_files_in_module++ ;
		
		print "\t-$module_element_short\n" if $options->{verbose} ;
		
		$indexer->remove_document_with_id($indexed_modules_to_remove{$module_to_remove}{$module_element})   ;
		}
		
	if ($options->{verbose})
		{
		printf
			"\t[$module_index/$number_of_modules ($number_of_files_in_module) in %.3f s.]\n",
			(time - $t0_remove_module)  ;
		}
	}

if ($options->{verbose})
	{
	printf "[Removed $total_number_of_files files in $number_of_modules modules in %.3f s.]\n", (time - $t0_remove) ; 
	}

# add new modules
$module_index = 0 ;
$total_number_of_files = 0 ;

delete $modules_in_repository{$_} for (keys %modules_up_to_date) ;

$number_of_modules = scalar(keys %modules_in_repository) ;

for my $module (sort keys %modules_in_repository)
	{
	my $t0_module = time ;
	
	$module_index++ ;
	
	my $directory = File::Temp->newdir() ;
	my $extraction_directory = $directory->dirname;

	#todo use module instead for tar and find
	`tar -C $extraction_directory -xzf $module` ;
	
	my @files = grep{/pod|pm|pl$/} `find $extraction_directory/ -name '*.p*' ` ;
	chomp  @files ;
	
	my $number_of_files_in_module = scalar(@files) ;
	$total_number_of_files += $number_of_files_in_module ;
	
	(my $module_to_add_short = $module) =~ s/^$cpan_mini// ;
	print "+$module_to_add_short\n" ;

	my $t0_index = time ;
	
	for my $file (@files)
		{
		(my $file_short = $file) =~ s{^/tmp/[^/]+/}{} ;
		print "\t+$file_short\n" if ($options->{verbose}) ;
			
		$indexer->add_files
			(
			FILES => [map { {NAME => $_, DESCRIPTION => $module_to_add_short} } $file],
			MAXIMUM_DOCUMENT_SIZE => $options->{maximum_document_size},
			) ;
		}
		
	if ($options->{verbose})
		{
		printf
		"\t[$module_index/$number_of_modules ($number_of_files_in_module) in "
		. "%.3f s. (indexing: %.3f s.)]\n",
		(time - $t0_module), (time - $t0_index)  ;
		}
	}
	
#todo: pluralize
if ($options->{verbose})
	{
	printf "[Re-indexed $total_number_of_files files in $number_of_modules modules in %.3f s.]\n", (time - $t0_index) ; 
	}

return ;
}
		
#----------------------------------------------------------------------------------------------------------
		
sub get_perl_word_regex_and_stopwords
{
#~ =head2 get_perl_word_regex_and_stopwords()

#~ creates a $word_regex and $stopwords for the perl language

#~ I<Arguments> - None

#~ I<Returns> -  a list of tuples 

#~ =over 2 

#~ =item (WORD_REGEX => $word_regex) - a key and a regex defining a word in the Perl language

#~ =item (STOPWORDS => $stopwords) - a key and an array reference containing words to ignore in the Perl language

#~ =back

#~ I<Exceptions> - None

#~ =cut

my $id_regex = 
	qr/
	(?![0-9])       # don't start with a digit
	\w\w+           # start with 2 or more word chars ..
	 (?:::\w+)*      # .. and  possibly ::some::more::components
	 /x; 

my $word_regex = 
	qr/
  	    (?:                # either a Perl variable:
	    (?:\$\#?|\@|\%)    #   initial sigil
	    (?:                #     followed by
	       $id_regex       #       an id
	       |               #     or
	       \^\w            #       builtin var with '^' prefix
	       |               #     or
	       (?:[\#\$](?!\w))#       just '$$' or '$#'
	       |               #     or
	       [^{\w\s\$]      #       builtin vars with 1 special char
	     )
	     |                 # or
	     $id_regex         # a plain word or module name
	 )/x;


my @stopwords = 
	(
	'a' .. 'z', '_', '0' .. '9',
	qw/
	__data__ __end__ $class $indexing_operation
	above after all also always an and any are as at
	be because been before being both but by
	can cannot could
	die do don done
	defined do does doesn
	each else elsif eq
	for from
	ge gt
	has have how
	if in into is isn it item its
	keys
	last le lt
	many may me method might must my
	ne new next no nor not
	of on only or other our
	package perl pl pm pod push
	qq qr qw
	ref return
	see set shift should since so some something sub such
	text than that the their them then these they this those to tr
	undef unless until up us use used uses using
	values
	was we what when which while will with would
	you your
	SYNOPSIS DESCRIPTION METHODS  FUNCTIONS 
	BUGS  AUTHOR  
	COPYRIGHT  LICENSE 
	/, 
	'SEE ALSO',
	);

return(WORD_REGEX => $word_regex, STOPWORDS => \@stopwords,) ;
}

#------------------------------------------------------------------------------------------------------------------------

sub get_options_definition
{
#~ =head2 get_options_definition()

#~ I<Arguments> - None

#~ I<Returns> - a list of tuples, config name => container

#~ I<Exceptions> -None

#~ =cut

my ($container) = @_ ;

croak "Error: Expected a hash reference!\n" if (defined $container && 'HASH' eq ref $container) ;

$container ||=
	{
	add_files => 0,
	remove_files => 0,
	check_index => 0,
	use_position => 1, 
	} ;
	
my @definitions = 
	(
	'h|help' => \&display_help,
	'cpan_mini=s' => \$container->{cpan_mini},
	'i|index_directory=s' => \$container->{index_directory},
	'pre_indexed=s' => \$container->{pre_indexed},
	'maximum_document_size' => \$container->{maximum_document_size},
	's|search=s' => \$container->{search},
	'v|verbose' => \$container->{verbose},
	'stopwords_file=s' => \$container->{stopwords_file},
	'show_info' => \$container->{show_info},
	'completion_script' => \$container->{completion_script},
	) ;

return $container, @definitions ;
}	

#------------------------------------------------------------------------------------------------------------------------

sub get_options
{

#~ =head2 get_options(@arguments)

#~ Parses the command line for the module to search and user defined options.

#~ I<Arguments>

#~ =over 2 

#~ =item @arguments - the command line arguments

#~ =back

#~ I<Returns> -  A list containing the module to search followed by the options accepted by this command

#~ I<Exceptions> - exits if an invalid option is passed

#~ =cut

my (@arguments) = @_ ;
local @ARGV = @arguments ;

my ($container, @definitions) = get_options_definition() ;

die "Error: Invalid Option! Try --help.\n" unless GetOptions(@definitions) ;

return($container, @ARGV) ;
}

#---------------------------------------------------------------------------------------------------------

sub display_help
{

#~ =head2 display_help()

#~ I<Arguments> - None

#~ I<Returns> - Nothing

#~ I<Exceptions> - exits with status code B<1>

#~ =cut

my ($this_script) = ($PROGRAM_NAME =~m/(.*)/sxm ) ;

print {*STDERR} `perldoc $this_script`  or croak 'Error: Can\'t display help!' ; ## no critic (InputOutput::ProhibitBacktickOperators)
exit(1) ;
}

#---------------------------------------------------------------------------------------------------------

sub generate_completion_script
{
#~ =head2 generate_completion_script()

#~ I<Arguments> - None

#~ I<Returns> - Nothing

#~ I<Exceptions> - exits with status code B<1> after emitting the completion script on stdout

#~ =cut

my ($container, @definitions) = get_options_definition() ;

my $flip = 0 ;
my @options = grep {++$flip % 2} @definitions ;

print Term::Bash::Completion::Generator::generate_bash_completion_function('mci', [@options], undef, 0) ;
exit(1) ;
}
