package OpenInteract::FullText;

# $Id: FullText.pm,v 1.2 2001/02/04 23:07:55 lachoy Exp $

use strict;
use Lingua::Stem ();
use DBI          qw( SQL_VARCHAR SQL_INTEGER );
use Data::Dumper qw( Dumper );

# Module version, taken from the major/minor numbers of the RCS
# Revision keyword.

$OpenInteract::FullText::VERSION = sprintf("%d.%02d", q$Revision: 1.2 $ =~ /(\d+)\.(\d+)/);
@OpenInteract::FullText::ISA     = ();

# Words that are never treated as index terms: search_ft_index drops
# them from the terms it is given and _tokenize drops them from the
# text being indexed. Keys are the lowercased words, values are 1.

my %STOPWORDS = map { $_ => 1 } qw/
   of a the and an that which are is am they our who what when where how
   why whose but however or not was were could should would to
/;

# Debugging flag; not referenced by any code in this file.

use constant DEBUG  => 0;

# _tokenize discards stemmed words shorter than $MIN_WORD_LENGTH or
# longer than $MAX_WORD_LENGTH characters. $TABLE_NAME is the table
# read and written by the SQL in this module; its statements use the
# columns term, ft_oid and occur.

my $MIN_WORD_LENGTH = 3;
my $MAX_WORD_LENGTH = 30;
my $TABLE_NAME      = 'full_text_index';

##########
# Class Methods
##########

# Used when we first inaugurate a class into the index -- take all the
# existing objects in the class (as returned by $class->fetch_group)
# and call ->reindex_object on each of them.
#
# Returns: the number of objects that were reindexed, or undef if the
# objects could not be fetched (the error is logged at level 0).

sub create_class_index {
  my ( $class ) = @_;

  # $@ is a global: save it immediately, since any eval {} executed
  # while getting hold of the request object would reset it and the
  # fetch failure would go unnoticed.

  my $obj_list    = eval { $class->fetch_group };
  my $fetch_error = $@;
  if ( $fetch_error ) {
    my $R = OpenInteract::Request->instance;
    $R->scrib( 0, "Cannot retrieve objects from $class for indexing: $fetch_error" );

    # Explicit undef (rather than a bare return) is kept so callers
    # see exactly what they always have.

    return undef;
  }

  my $count = 0;
  foreach my $obj ( @{ $obj_list || [] } ) {
    $obj->reindex_object;
    $count++;
  }
  return $count;
}

##########
# Class methods for searching
##########

# Search the full-text index and return the objects that match.
#
# Parameters (all in the hashref $p):
#   search_terms    \@ of terms to look for, as given by the user
#   search_type     'any' (OR -- the default) or 'all' (AND)
#   include_classes \@ of classes to use (and only these)
#   exclude_classes \@ of classes NOT to use
#   skip_security   passed through to the fetch() of each class
#
# Returns: \@ of objects ordered by descending total number of
# occurrences of the terms; the total is left in each object under
# {tmp_ft_score}. The arrayref is empty when there is nothing usable
# to search for.

sub search_ft_index {
  my ( $class, $p ) = @_;
  $p ||= {};
  my $search_terms = $p->{search_terms} || [];
  return [] unless ( scalar @{ $search_terms } );
  my $R = OpenInteract::Request->instance;
  $R->scrib( 1, "Searching for terms: ", join( ',', @{ $search_terms } ) );

  $p->{search_type}     ||= 'any';
  $p->{include_classes} ||= [];
  $p->{exclude_classes} ||= [];
  my %include_classes = map { $_ => 1 } @{ $p->{include_classes} };
  my %exclude_classes = map { $_ => 1 } @{ $p->{exclude_classes} };

  # After this loop @fixed_search holds the distinct, stemmed,
  # lowercased terms that can actually be present in the index. The
  # same rules as in _tokenize apply (no STOPWORDS, length within the
  # limits): a term that is never stored can never match, and counting
  # it -- or counting the same stem twice, as in 'run running' --
  # would make every 'all' search come back empty.

  my %seen_stem    = ();
  my @fixed_search = ();
  foreach my $raw_term ( @{ $search_terms } ) {
    next unless ( defined $raw_term );
    my $term = lc $raw_term;
    next if ( defined $STOPWORDS{ $term } );
    my $stem = Lingua::Stem::stem( $term )->[0];
    next unless ( defined $stem );
    next if ( length( $stem ) < $MIN_WORD_LENGTH );
    next if ( length( $stem ) > $MAX_WORD_LENGTH );
    next if ( defined $STOPWORDS{ $stem } );
    next if ( $seen_stem{ $stem }++ );
    push @fixed_search, $stem;
  }

  # Nothing left to look for: do not go to the database, where an
  # empty list of terms would produce an invalid WHERE clause

  unless ( scalar @fixed_search ) {
    $R->scrib( 1, "No searchable terms left after cleaning up the search terms" );
    return [];
  }

  # \%results is indexed by the object identifier coming from the
  # database, constructed by the _build_ft_object_id method (see the
  # _ft_search_terms comments/pod for an example)

  my $results = $class->_ft_search_terms( { terms => \@fixed_search } );
  $R->scrib( 2, "Raw results listing: ", Dumper( $results ) );

  # If this was an AND search, knock off all the results that didn't
  # have matches for all the terms

  if ( lc $p->{search_type} eq 'all' ) {
    my $num_terms   = scalar @fixed_search;
    my $num_removed = 0;
    foreach my $oi ( keys %{ $results } ) {
      next if ( scalar keys %{ $results->{ $oi } } >= $num_terms );
      delete $results->{ $oi };
      $num_removed++;
    }
    $R->scrib( 1, "Removed ($num_removed) items from the list since they didn't match all the terms" );
  }

  # @object_list keeps the objects constructed from the results

  my @object_list = ();

  # Try and create the object from the object identifier ($oi); if we
  # can, put as a temporary variable the number of results so we can
  # track it; if we wanted some other means of sorting -- for instance,
  # by class so we can break the results down by type -- we can easily
  # do so in this loop

  my $include_use = scalar keys %include_classes;
  my $exclude_use = scalar keys %exclude_classes;

  foreach my $oi ( keys %{ $results } ) {
    my ( $obj_class, $oid ) = $class->_crack_ft_object_id( $oi );

    # Do not keep search result if it's not one of a specified list to
    # include

    next if ( $include_use and ! $include_classes{ $obj_class } );

    # Do not keep search result if we have been told to exclude
    # any results from this class

    next if ( $exclude_use and $exclude_classes{ $obj_class } );

    # Create the object; a fetch that died has already told us why, so
    # skip the 'likely security related' message in that case

    my $obj = eval { $obj_class->fetch( $oid,
                                        { skip_security => $p->{skip_security} } ) };
    if ( $@ ) {
      $R->scrib( 0, "Cannot construct object from $obj_class ($oid): $@" );
      next;
    }
    unless ( $obj and ref $obj ) {
      $R->scrib( 0, "No object resulting from ($obj_class) - ($oid) : likely security related." );
      next;
    }

    # The score is the sum of the occurrences of every term found. It
    # is assigned rather than added so that a score already sitting in
    # the object cannot leak into this search.

    my $score = 0;
    foreach my $term ( keys %{ $results->{ $oi } } ) {
      $score += $results->{ $oi }->{ $term };
    }
    $obj->{tmp_ft_score} = $score;
    push @object_list, $obj;
  }

  # ...but for now we return a list of the objects sorted by the number
  # of occurrences of all the words (so an object with 10 occurrences
  # of term1 and 2 of term2 will be ranked higher -- earlier in the
  # list -- than one with 5 of term1 and 6 of term2)

  return [ sort { $b->{tmp_ft_score} <=> $a->{tmp_ft_score} } @object_list ];
}


# Look up the given terms in the index table.
#
# Parameters (in the hashref $p):
#   terms  \@ of words already stemmed and with the STOPWORDS picked
#          out -- it's a clean list
#
# Returns: a hashref of information,
# { ft_oid => { term => $occur, ... } }
# which is empty when no terms are given. Dies if the statement cannot
# be prepared or executed.

sub _ft_search_terms {
  my ( $class, $p ) = @_;
  my @terms = @{ $p->{terms} || [] };

  # With no terms the WHERE clause below would have no condition and
  # the statement would not be valid SQL; nothing can match anyway.

  return {} unless ( scalar @terms );

  my $R = OpenInteract::Request->instance;

  # Resulting clause is something like:
  # term = ? OR term = ?             -- for two terms
  # term = ? OR term = ? OR term = ? -- for three terms
  # ...

  my $term_clause = join ' OR ', map { ' term = ? ' } @terms;
  my $sql = qq/
     SELECT term, ft_oid, occur
       FROM $TABLE_NAME
      WHERE $term_clause
  /;
  my ( $sth );
  eval {
    $sth = $R->db->prepare( $sql );
    $sth->execute( @terms );
  };
  die "Cannot search for terms!\n$sql\nError: $@" if ( $@ );

  # \%oid becomes a hashref indexed by the object identifier (created
  # by _build_ft_object_id) that points to a hashref with each term as
  # a key and the number of occurrences of that term as the value; note that
  # every object identifier does not have to have all the terms found
  #
  # $oid = { 'OpenInteract::News>>44' => { congress => 2, senate => 3 },
  #          'OpenInteract::BasicPage>>/toolkit/congress' => { congress => 8, senate => 1 },
  #          ... };
  #
  # Note that the object identifier (e.g., 'OpenInteract::News>>44') may
  # sacrifice readability for space in the future

  my $oid = {};
  my ( $term, $ft_oid, $num );
  $sth->bind_columns( undef, \$term, \$ft_oid, \$num );
  while ( $sth->fetch ) {
    $oid->{ $ft_oid }->{ $term } = $num;
  }
  return $oid;
}



##########
# Object Methods
##########

# Register the indexing callbacks in the ruleset table handed to us:
# reindex_object is appended to the post_save_action list and
# remove_object_from_index to the post_remove_action list.
#
# Returns: 1

sub ruleset_add {
  my ( $class, $rs_table ) = @_;

  my %action_for = (
    post_save_action   => \&reindex_object,
    post_remove_action => \&remove_object_from_index,
  );
  foreach my $rule ( qw( post_save_action post_remove_action ) ) {
    push @{ $rs_table->{ $rule } }, $action_for{ $rule };
  }

  # Works whether we were called on a class or on an object

  my $target = ref( $class ) ? ref( $class ) : $class;
  OpenInteract::Request->instance->scrib( 1, "Installed post_save for $target" );
  return 1;
}


# Bring the index up to date for this object: get the text to index,
# break it into counted tokens, remove everything the index holds for
# the object and store the new tokens. ruleset_add installs this as a
# post_save_action.
#
# Returns: 1. An error raised while storing the terms is not caught
# here and propagates to the caller.

sub reindex_object {
  my ( $self ) = @_;
  my $R = OpenInteract::Request->instance;
  $R->scrib( 1, "Trying to index ", ref $self, " (", $self->id, ")" );
  my $indexable = $self->_indexable_object_text;
  $R->scrib( 2, "Indexable text: ", $indexable );
  my $wc = $self->_tokenize( $indexable );
  $R->scrib( 2, "Found the following tokens:", Dumper( $wc ) );

  # In the 'real world', we'd start a transaction here...

  my $remove_rv = $self->remove_object_from_index;
  $R->scrib( 1, "Results of removal: ($remove_rv)" );

  # Only go to the database when there is something to store. (This
  # used to be 'my $store_rv = ... if ( ... )'; a 'my' combined with a
  # statement modifier has undefined behavior, and the value was never
  # used.)

  if ( keys %{ $wc } ) {
    $self->_store_terms( $wc );
  }

  # ... and commit it here if everything went ok
  return 1;
}



# Delete every row the index table holds for this object. ruleset_add
# installs this as a post_remove_action, and reindex_object calls it
# before storing the fresh terms.
#
# Returns: whatever db_delete returns; nothing (undef in scalar
# context) if the delete died, in which case the error is logged
# instead of being silently discarded.

sub remove_object_from_index {
  my ( $self ) = @_;
  my $ft_oid = $self->_build_ft_object_id;
  my $R = OpenInteract::Request->instance;
  $R->scrib( 1, "Trying to remove object: ", ref $self, " (", $self->id, ") from the index." );
  my $rv = eval { $self->db_delete( { table => $TABLE_NAME,
                                      where => 'ft_oid = ?',
                                      value => [ $ft_oid ] } ) };
  if ( $@ ) {

    # Stay best-effort (callers are not prepared for an exception
    # here) but leave a trace of what went wrong

    $R->scrib( 0, "Cannot remove ($ft_oid) from the index: $@" );
    return;
  }
  return $rv;
}



# Build the text to index for this object: the values of the fields
# named in the 'fulltext_field' entry of the object's CONFIG, joined
# with a single space so that everything is indexed as one big field.
#
# Returns: the joined text, or undef (after logging at level 0) when
# 'fulltext_field' is not an arrayref.

sub _indexable_object_text {
  my ( $self ) = @_;
  my $R = OpenInteract::Request->instance;
  my $field_list = $self->CONFIG->{fulltext_field};
  if ( ref( $field_list ) eq 'ARRAY' ) {
    my @values = @{ $self }{ @{ $field_list } };
    return join( ' ', @values );
  }
  $R->scrib( 0, "Cannot index object text -- no fields presented in config file." );
  return undef;
}



# Break up the text into tokens -- stemmed using Lingua::Stem and
# counted for occurrences. Remove the words that are too long, too
# short and those that are found in our STOPWORDS listing.
#
# Parameters:
#   $text  the text to tokenize (may be undef)
#
# Returns: \% with the stemmed words as keys and the number of times
# each one occurs in the text as values.

sub _tokenize {
  my ( $self, $text ) = @_;
  return {} unless ( defined $text );
  $text =~ tr/A-Z/a-z/;  # lowercase
  my %words = ();
  foreach my $word ( $text =~ /\w+/g ) {

    # Check the word itself against the STOPWORDS before stemming it:
    # the stem of a stopword is often a different string ('however'
    # becomes 'howev') which a check on the stem alone lets through

    next if ( $STOPWORDS{ $word } );

    my $stem = Lingua::Stem::stem( $word )->[0];
    next unless ( defined $stem );
    next if ( length( $stem ) < $MIN_WORD_LENGTH );
    next if ( length( $stem ) > $MAX_WORD_LENGTH );

    # A word can also stem down to a stopword

    next if ( $STOPWORDS{ $stem } );
    $words{ $stem }++;
  }
  return \%words;
}

# Insert one row into the index table for every term of this object.
#
# Parameters:
#   \%terms  keys are the stemmed terms, values the number of times
#            each term occurs in the object
#
# Returns: the number of rows inserted. Dies if the statement cannot
# be prepared or if any of the inserts fails.

sub _store_terms {
  my ( $self, $terms ) = @_;
  my $R = OpenInteract::Request->instance;
  my $ft_oid = $self->_build_ft_object_id;
  my $sql = qq/
     INSERT INTO $TABLE_NAME
      ( term, ft_oid, occur )
     VALUES
      ( ?,    ?,       ?     ) 
   /;
  my $sth = eval { $self->global_db_handle->prepare( $sql ) };
  if ( $@ ) {
    die "Cannot prepare statement for inserting terms! $@";
  }

  my $stored = 0;
  for my $word ( keys %{ $terms } ) {
    my $occurrences = $terms->{ $word };
    $R->scrib( 2, "Storing $ft_oid: $word ($occurrences)" );

    # The same statement handle is reused for every term; only the
    # bound values change from one execute to the next

    eval {
      $sth->bind_param( 1, $word, SQL_VARCHAR );
      $sth->bind_param( 2, $ft_oid, SQL_VARCHAR );
      $sth->bind_param( 3, $occurrences, SQL_INTEGER );
      $sth->execute;
      $stored++;
    };
    if ( $@ ) {
      die "Cannot execute\n$sql\nwith $word/$ft_oid/$occurrences: $@";
    }
  }
  return $stored;
}



# Build the identifier under which this object is known in the index:
# the class of the object and its id, joined with '>>'. The identifier
# is also kept in the object under {tmp_ft_oid}.
#
# Returns: the identifier

sub _build_ft_object_id {
  my ( $self ) = @_;
  my $ft_oid = join( '>>', ref( $self ), $self->id );
  $self->{tmp_ft_oid} = $ft_oid;
  return $ft_oid;
}



# Take an identifier of the form 'Class>>id' (as joined with '>>' by
# _build_ft_object_id) apart again.
#
# Parameters:
#   $ft_oid  the object identifier
#
# Returns: a two-element list, ( class, id ). The split is limited to
# two fields so that an id which itself contains '>>' comes back whole
# instead of being cut off at its first '>>'.

sub _crack_ft_object_id {
  my ( $class, $ft_oid ) = @_;
  return split />>/, $ft_oid, 2;
}

1;

__END__

=pod

=head1 NAME

OpenInteract::FullText - Metadata layer for objects to implement simple full-text searching

=head1 SYNOPSIS

 # In object's spops.perl file
 myobj => {
  isa => [ qw/ OpenInteract::FullText ... / ],
  fulltext_field => [ qw/ description title / ],
  ...
 }

 # All 'save()' calls to the object will trigger the object's
 # 'description' and 'title' fields being indexed.

=head1 DESCRIPTION

This module implements a few simple rules, along with some
implementation goop, that allow objects to be full-text indexed just
by changing a few lines in the configuration file. By putting this
module in the @ISA (done through the configuration file) of a SPOPS
data object, that object can then call methods to index itself, and
calls to search the index can return instances of that object.

The design of this module should make it simple to swap out various
text indexing solutions, so objects can treat full-text indexing as a
black box -- some boxes give better (or different) results than
others, but they all take the same inputs and give the same outputs.

=head1 METHODS

B<create_class_index()>

Initialize a class into the index by retrieving all of its objects and
calling the I<reindex_object> method on each (see below).

Currently you can call this from the command line 

B<ruleset_add( $class, \%ruleset_table )>

Adds the necessary rules to the $class that puts this class in its
ISA. Currently, these rules consist of:

=over 4

=item * post_save_action: reindex this object -- first obliterate all
references in the index, then build the references anew (called on
both INSERTs and UPDATEs)

=item * post_remove_action: remove all references to this object from
the index

=back

B<reindex_object()>

The object updates any existing information in the index with its new
or updated content. I<OpenInteract::FullText> takes care of replacing the
old links with the new.

B<remove_object_from_index()>

Removes all instances of the object from the index.

B<search_ft_index( \%params )>

Parameters:

 search_terms (\@)
   Terms to be searched for. You do not need to deal with stemming,
   lowercasing the words or such things -- that is all part of the
   black box :)

 search_type ($) (defaults to 'any')
   Type of search this is. Currently we support only very simple
   boolean operations -- either any of the terms searched, or all of
   them. No in-betweens.

 include_classes (\@) (optional)
   List of classes for which we should display results; any search
   items found not in these classes will not be included in the
   results.

 exclude_classes (\@) (optional)
   List of classes for which results should NOT be included.

=head2 Private Methods

The following methods are private, although some of them might move to
public as the API stabilizes.

B<_indexable_object_text()>

Gets the text out of the object to index. Currently, we treat all text
from the object as one big field.

B<_tokenize( $text )>

Breaks text down into tokens. This process is very simple. First we
break the text into words, then we lower case each word, then we
'stem' each word. Here is a brief description of stemming:

 Truncation - Also referred to as "root/suffix management" or
 "Stemming" or "Word Stemming", truncation allows some search engines
 to recognize and shorten long words such as "plants" or "boating" to
 their root words (or word stems) "plant" and "boat." This makes
 searching for such words much easier because it is not necessary to
 consider every permutation of that word when trying to find it.1 In a
 search, the ability to enter the first part of a keyword, insert a
 symbol (usually *), and accept any variant spellings or word endings,
 from the occurrence of the symbol forward (e.g., femini* retrieves
 feminine, feminism, feminism, etc.).3 See also word variants, plurals
 and singulars.

(From: http://ollie.dcccd.edu/library/Module2/Books/concepts.htm)

We use the L<Lingua::Stem> module for this, which implements the
I<Porter algorithm> for stemming, as do most implementations,
apparently. (This is something that I<OpenInteract::FullText> treats as a
black box itself :)

Parameters:

 $text
   Text to tokenize

B<_store_terms( \%term_info )>

Parameters:

 \%term_info
   Information about the terms associated with this object. Keys are
   the stemmed terms, values are the number of times the stemmed term
   appears in the object.

B<_build_ft_object_id()>

Returns the string used to uniquely identify an object.

B<_crack_ft_object_id( $object_identifier )>

Takes the $object_identifier as created by I<_build_ft_object_id> and
returns a two-element list of the object class and ID.

B<_ft_search_terms( \%params )>

Performs the actual search, putting the results into an intermediate
format. Returns a hashref with the keys as object identifiers and the
values a hashref of stemmed terms with the number of occurrences of
each term as the value there. For instance, doing a search for
'congress' and 'senate' might bring up:

  $oid = { 'OpenInteract::News>>44' => { congress => 2,
                                         senate => 3 },
           'OpenInteract::BasicPage>>/toolkit/congress' => { congress => 8,
                                                             senate => 1 },
           ...
  };

=head1 TO DO

See the TO DO list in the documentation found in this package. (Under
'doc/' in the package distribution, or via the system documentation
available through the OpenInteract browser interface.)

=head1 BUGS

=head1 COPYRIGHT

Copyright (c) 2001 intes.net, inc.. All rights reserved.

This library is free software; you can redistribute it and/or modify
it under the same terms as Perl itself.

=head1 AUTHORS

Chris Winters <chris@cwinters.com>

=cut
