#!/usr/bin/perl -w

use strict ;
use warnings ;
use Carp ;

=head1 NAME 

 $> siim - Create and search a full text indexed database

=head1 DOCUMENTATION

Let you create and search full text indexed database based on the content of files.

=head1 USAGE

 $> siim --option --option  file_1 file_2 ... file_n

=head2 Examples

=over 2

=item Adding files to an index database

  $> find | xargs perl siim -i test -a

=item Indexing perl modules

  $> find /devel/perl_modules/My_Module -type f -readable -size -300k| grep -v blib | grep -v _build | xargs siim -p -i test -a

=item Removing some files

  $> xargs siim -i test -r file file ...

=item Checking the index database

  $> siim -p -i test -c

=item Searching for text

  $> siim -i test -s 'your query'

=item command line completion (in your current shell)

  $> siim --completion_script > /tmp/siim ; source /tmp/siim ; rm /tmp/siim

=back

=head1 OPTIONS

  'h|help'              display help
  'completion_script'   generates a bash completion script
  
  'i|index_directory=s' path to the database directory
  'a|add_files'         add files to the database
  'r|remove_files'      remove files from the database
  'c|check_index'       check the database and display file state
  's|search=s'          search strings in the database
  'v|verbose'           display score, state, timing, ...
  'p|perl_mode'         pre-defined perl stopword list
  'stopwords_file=s'   path to files containing stopwords
  'delete_database'     deletes the database in the index directory

=head1 EXIT STATUS

=head1 AUTHOR

	Nadim ibn hamouda el Khemir
	CPAN ID: NKH
	mailto: nkh@cpan.org

=cut

#------------------------------------------------------------------------------------------------------------------------

our $VERSION = '0.02' ;

use Getopt::Long ;
use Term::Bash::Completion::Generator ;
use Readonly ;
use English qw( -no_match_vars ) ;

use Search::Indexer::Incremental::MD5 qw() ;
use Search::Indexer::Incremental::MD5::Indexer qw() ;
use Search::Indexer::Incremental::MD5::Searcher qw() ;

#---------------------------------------------------------------------------------------------------------

my (@siim_arguments) = @ARGV ;
undef @ARGV ;

siim(@siim_arguments) ;

#---------------------------------------------------------------------------------------------------------

sub siim
{

#~ =head2 siim(@arguments)

#~ Create and search  full text indexed database based on the content of files.

#~ I<Arguments> - command line arguments to be parsed by Getop::Long

#~ I<Returns> -  Nothing

#~ I<Exceptions> - 

#~ =over 2 

#~ =item Invalid options

#~ =item Invalid index database

#~ =back

#~ =cut

my (@arguments) = @_ ;
my ($options, @files)  = get_options(@arguments) ;

generate_completion_script() if $options->{completion_script} ;

unless 
	(
	$options->{search}|| $options->{add_files}	|| $options->{remove_files}
	|| $options->{check_index } || $options->{delete_database}
	)
	{
	display_help() ;
	}

if($options->{add_files} + $options->{remove_files} + $options->{check_index } > 1)
	{
	croak "Error: only one of 'add_files', 'remove_files', 'check_index' allowed!\n" ;
	}
	
my @perl_extra_arguments = get_perl_word_regex_and_stopwords() if $options->{perl_mode} ;
my @stopwords = (STOPWORDS => $options->{stopwords_file}) if $options->{stopwords_file} ;

if($options->{delete_database})
	{
	Search::Indexer::Incremental::MD5::delete_indexing_databases($options->{index_directory}) ;
	}
	
if($options->{add_files}	|| $options->{remove_files} || $options->{check_index })
	{
	my $indexer 
		= Search::Indexer::Incremental::MD5::Indexer->new
			(
			INDEX_DIRECTORY => $options->{index_directory}, 
			USE_POSITIONS => 1, 
			WORD_REGEX => qr/\w+/,
			@stopwords,
			@perl_extra_arguments,
			) ;
	
	if($options->{add_files})
		{
		$indexer->add_files
			(
			FILES => \@files,
			DONE_ONE_FILE_CALLBACK => 
				sub
				{
				my ($file, $file_info) = @_ ;
				
				if($file_info->{STATE} == 0)
					{
					if($options->{verbose})
						{
						printf "'$file' up to date %.3f s.\n", $file_info->{TIME} ;
						}
					}
				elsif($file_info->{STATE} == 1)
					{
					if($options->{verbose})
						{
						printf "'$file' re-indexed in %.3f s.\n", $file_info->{TIME} ;
						}
					else
						{
						print "$file\n" ;
						}
					}
				elsif($file_info->{STATE} == 2)
					{
					if($options->{verbose})
						{
						printf "'$file' new file %.3f s.\n", $file_info->{TIME} ;
						}
					else
						{
						print "$file\n" ;
						}
					}
				else
					{
					croak "Error: Unexpected file '$file' state!\n" ;
					}
				}
			) ;
		}
	elsif($options->{remove_files})
		{
		$indexer->remove_files
			(
			FILES => \@files,
			DONE_ONE_FILE_CALLBACK => 
				sub
				{
				my ($file, $file_info) = @_ ;

				if($file_info->{STATE} == 0)
					{
					if($options->{verbose})
						{
						printf "'$file' found and identical in %.3f s.\n", $file_info->{TIME} ;
						}
					else
						{
						print "$file\n" ;
						}
					}
				elsif($file_info->{STATE} == 1)
					{
					if($options->{verbose})
						{
						printf "'$file' file found, contents differ %.3f s.\n", $file_info->{TIME} ;
						}
					else
						{
						print "$file\n" ;
						}
					}
				elsif($file_info->{STATE} == 2)
					{
					if($options->{verbose})
						{
						printf "'$file' not found in %.3f s.\n", $file_info->{TIME} ;
						}
					}
				else
					{
					croak "Error: Unexpected file '$file' state!\n" ;
					}
				}
			) ;
		}
	elsif($options->{check_index})
		{
		$indexer->check_indexed_files
			(
			DONE_ONE_FILE_CALLBACK => 
				sub
				{
				my ($file, $file_info) = @_ ;

				if($file_info->{STATE} == 0)
					{
					if($options->{verbose})
						{
						printf "'$file' found and identical in %.3f s.\n", $file_info->{TIME} ;
						}
					else
						{
						print "$file\n" ;
						}
					}
				elsif($file_info->{STATE} == 1)
					{
					if($options->{verbose})
						{
						printf "'$file' file found, contents differ %.3f s.\n", $file_info->{TIME} ;
						}
					else
						{
						print "$file\n" ;
						}
					}
				elsif($file_info->{STATE} == 2)
					{
					if($options->{verbose})
						{
						printf "'$file' not found in %.3f s.\n", $file_info->{TIME} ;
						}
					else
						{
						print "$file\n" ;
						}
					}
				else
					{
					croak "Error: Unexpected file '$file' state!\n" ;
					}
				}
			) ;
		}
	}

if($options->{search})
	{
	my $searcher 
		= eval 
			{
			Search::Indexer::Incremental::MD5::Searcher->new
				(
				INDEX_DIRECTORY => $options->{index_directory}, 
				USE_POSITIONS => 1, 
				WORD_REGEX => qr/\w+/,
				@stopwords ,
				@perl_extra_arguments,
				);
			} or croak "No full text index found! $@\n" ;

	my $results  = $searcher->search(SEARCH_STRING => $options->{search}) ;

	my @indexes = map { $_->[0] }
					reverse
						sort { $a->[1] <=> $b->[1] }
							map { [$_, $results->[$_]{SCORE}] }
								0 .. $#$results ;

	for (@indexes)
		{
		if($options->{verbose})
			{
			print "'$results->[$_]{PATH}' with scrore $results->[$_]{SCORE}.\n" ;
			}
		else
			{
			print "$results->[$_]{PATH}\n" ;
			}
		}
	}
}

#----------------------------------------------------------------------------------------------------------

sub get_perl_word_regex_and_stopwords
{
#~ =head2 get_perl_word_regex_and_stopwords()

#~ creates a $word_regex and $stopwords for the perl language

#~ I<Arguments> - None

#~ I<Returns> -  a list of tuples 

#~ =over 2 

#~ =item (WORD_REGEX => $word_regex) - a key and a regex defining a word in the Perl language

#~ =item (STOPWORDS => $stopwords) - a key and an array reference containing words to ignore in the Perl language

#~ =back

#~ I<Exceptions> - None

#~ =cut

my $id_regex = 
	qr/
	(?![0-9])       # don't start with a digit
	\w\w+           # start with 2 or more word chars ..
	 (?:::\w+)*      # .. and  possibly ::some::more::components
	 /x; 

my $word_regex = 
	qr/
  	    (?:                # either a Perl variable:
	    (?:\$\#?|\@|\%)    #   initial sigil
	    (?:                #     followed by
	       $id_regex       #       an id
	       |               #     or
	       \^\w            #       builtin var with '^' prefix
	       |               #     or
	       (?:[\#\$](?!\w))#       just '$$' or '$#'
	       |               #     or
	       [^{\w\s\$]      #       builtin vars with 1 special char
	     )
	     |                 # or
	     $id_regex         # a plain word or module name
	 )/x;


my @stopwords = 
	(
	'a' .. 'z', '_', '0' .. '9',
	qw/
	__data__ __end__ $class $indexing_operation
	above after all also always an and any are as at
	be because been before being both but by
	can cannot could
	die do don done
	defined do does doesn
	each else elsif eq
	for from
	ge gt
	has have how
	if in into is isn it item its
	keys
	last le lt
	many may me method might must my
	ne new next no nor not
	of on only or other our
	package perl pl pm pod push
	qq qr qw
	ref return
	see set shift should since so some something sub such
	text than that the their them then these they this those to tr
	undef unless until up us use used uses using
	values
	was we what when which while will with would
	you your
	SYNOPSIS DESCRIPTION METHODS  FUNCTIONS 
	BUGS  AUTHOR  
	COPYRIGHT  LICENSE 
	/, 
	'SEE ALSO',
	);

return(WORD_REGEX => $word_regex, STOPWORDS => \@stopwords,) ;
}

#------------------------------------------------------------------------------------------------------------------------

sub get_options_definition
{
#~ =head2 get_options_definition()

#~ I<Arguments> - None

#~ I<Returns> - a list of tuples, config name => container

#~ I<Exceptions> -None

#~ =cut

my ($container) = @_ ;

croak "Error: Expected a hash reference!\n" if (defined $container && 'HASH' eq ref $container) ;

$container ||=
	{
	add_files => 0,
	remove_files => 0,
	check_index => 0,
	use_position => 1, 
	} ;
	
my @definitions = 
	(
	'h|help' => \&display_help,
	'i|index_directory=s' => \$container->{index_directory},
	'a|add_files' => \$container->{add_files},
	'r|remove_files' => \$container->{remove_files},
	'c|check_index' => \$container->{check_index}, 
	's|search=s' => \$container->{search},
	'v|verbose' => \$container->{verbose},
	'p|perl_mode' => \$container->{perl_mode},
	'use_position=i' => \$container->{use_position},
	'stopwords_file=s' => \$container->{stopwords_file},
	'delete_database' => \$container->{delete_database},
	'completion_script' => \$container->{completion_script},
	) ;

return $container, @definitions ;
}	

#------------------------------------------------------------------------------------------------------------------------

sub get_options
{

#~ =head2 get_options(@arguments)

#~ Parses the command line for the module to search and user defined options.

#~ I<Arguments>

#~ =over 2 

#~ =item @arguments - the command line arguments

#~ =back

#~ I<Returns> -  A list containing the module to search followed by the options accepted by this command

#~ I<Exceptions> - exits if an invalid option is passed

#~ =cut

my (@arguments) = @_ ;
local @ARGV = @arguments ;

my ($container, @definitions) = get_options_definition() ;

die "Error: Invalid Option! Try --help.\n" unless GetOptions(@definitions) ;

return($container, @ARGV) ;
}

#---------------------------------------------------------------------------------------------------------

sub display_help
{

#~ =head2 display_help()

#~ I<Arguments> - None

#~ I<Returns> - Nothing

#~ I<Exceptions> - exits with status code B<1>

#~ =cut

my ($this_script) = ($PROGRAM_NAME =~m/(.*)/sxm ) ;

print {*STDERR} `perldoc $this_script`  or croak 'Error: Can\'t display help!' ; ## no critic (InputOutput::ProhibitBacktickOperators)
exit(1) ;
}

#---------------------------------------------------------------------------------------------------------

sub generate_completion_script
{
#~ =head2 generate_completion_script()

#~ I<Arguments> - None

#~ I<Returns> - Nothing

#~ I<Exceptions> - exits with status code B<1> after emitting the completion script on stdout

#~ =cut

my ($container, @definitions) = get_options_definition() ;

my $flip = 0 ;
my @options = grep {++$flip % 2} @definitions ;

print Term::Bash::Completion::Generator::generate_bash_completion_function('siim', [@options], undef, 0) ;
exit(1) ;
}


#---------------------------------------------------------------------------------------------------------

