package OpenInteract::FullText;

# $Id: FullText.pm,v 1.8 2001/07/12 23:06:55 lachoy Exp $

use strict;
use Lingua::Stem ();
use DBI          qw( SQL_VARCHAR SQL_INTEGER );
use Data::Dumper qw( Dumper );

BEGIN {
    eval { require OpenInteract::FullTextIterator };
}

$OpenInteract::FullText::VERSION = sprintf("%d.%02d", q$Revision: 1.8 $ =~ /(\d+)\.(\d+)/);
@OpenInteract::FullText::ISA     = ();

# Value passed as the 'column_group' option when the objects for a
# search result listing are fetched (see _fulltext_object_listing).
use constant COLUMN_GROUP    => 'listing';

# Zero-based positions of the index table columns. They are used as
# slots for the bound result columns when reading search results and,
# with 1 added, as placeholder numbers when inserting terms, so they
# need to agree with the order of @SQL_FIELD_NAMES below.
use constant SQL_ORDER_KEY   => 0;
use constant SQL_ORDER_TERM  => 1;
use constant SQL_ORDER_COUNT => 2;

# Key in each per-object result hashref under which the summed
# occurrence count of all matched terms is kept.
use constant FREQUENCY_KEY   => '_FREQ_';

# Stemmed tokens shorter or longer than these bounds are not indexed
# (see _tokenize).
use constant MIN_WORD_LENGTH => 3;
use constant MAX_WORD_LENGTH => 30;

# Lowercase words that are never indexed and are dropped from search
# terms before the index is queried.
my %STOPWORDS = map { $_ => 1 } qw/
   of a the and an that which are is am they our who what when where how
   why whose but however or not was were could should would to
/;

# Table holding the index, and its columns in the order they appear in
# the generated SELECT and INSERT statements.
my $TABLE_NAME      = 'full_text_index';
my @SQL_FIELD_NAMES = qw/ object_key term occur /;


########################################
# CLASS METHODS
########################################


# Used when we first inaugurate a class into the index -- walk all the
# existing objects in the class and call ->reindex_object on each.
#
# Parameters:
#   $class - class name; it must provide fetch_iterator(), the iterator
#            must provide get_next() and every object it hands back
#            must provide reindex_object()
#
# Returns: the number of objects reindex_object was called on, or
# undef (after logging the reason) if no iterator could be retrieved.

sub create_class_index {
    my ( $class ) = @_;
    my $iter  = eval { $class->fetch_iterator };
    my $error = $@;

    # fetch_iterator can fail by dying or by handing back nothing at
    # all; in both cases there is nothing to walk, so report it here
    # rather than crash on the ->get_next call below.
    if ( $error or ! ref $iter ) {
        $error ||= 'no iterator returned';
        my $R = OpenInteract::Request->instance;
        $R->scrib( 0, "Cannot retrieve objects from $class for indexing: $error" );
        return undef;
    }

    my $count = 0;
    while ( my $obj = $iter->get_next ) {
        $obj->reindex_object;
        $count++;
    }
    return $count;
}



##########
# OBJECT METHODS
##########

# Search the full-text index and return the matches, best score first.
#
# Parameters (all in the hashref $p):
#   search_terms - \@ of raw terms; they are lowercased, stripped of
#                  STOPWORDS, stemmed and de-duplicated here
#   search_type  - 'any' (OR -- the default) or 'all' (AND)
#   return       - 'object' (the default), 'raw' or 'iterator'
#
# Any other key in $p (include_classes, exclude_classes, skip_security,
# ...) is not examined by this method; $p is handed on unchanged to the
# iterator constructor or to the object fetch. Note that the defaults
# for 'return' and 'search_type' are written back into $p.
#
# Returns:
#   'raw'      - \@ of [ object_key, frequency ], highest frequency first
#   'iterator' - OpenInteract::FullTextIterator built from that list
#   'object'   - \@ of objects (see _fulltext_object_listing)
# An empty \@ is returned when there is nothing usable to search for,
# and undef (after logging) for an unrecognized 'return' type.
#
# Dies if the database search itself fails.

sub search_fulltext_index {
    my ( $class, $p ) = @_;
    $p ||= {};
    return [] unless ( ref $p->{search_terms} eq 'ARRAY'
                       and scalar @{ $p->{search_terms} } );

    my $R = OpenInteract::Request->instance;
    $R->DEBUG && $R->scrib( 1, "Searching for terms: ", join( ',', @{ $p->{search_terms} } ) );

    $p->{return}          ||= 'object';
    $p->{search_type}     ||= 'any';

    # After this loop @fixed_search holds the stemmed, lowercased
    # non-STOPWORDS the user gave us, each one only once. The
    # de-duplication matters for 'all' searches: two terms with the same
    # stem are stored as a single term per object, so counting them
    # twice would make every object look like it is missing a term.

    my ( %seen, @fixed_search );
    foreach my $raw_term ( @{ $p->{search_terms} } ) {
        next unless ( defined $raw_term );
        my $term = lc $raw_term;
        next if ( $STOPWORDS{ $term } );
        my $stemmed = Lingua::Stem::stem( $term )->[0];
        next unless ( defined $stemmed and length $stemmed );
        next if ( $seen{ $stemmed }++ );
        push @fixed_search, $stemmed;
    }

    # Nothing left (e.g. the search consisted only of stopwords): no
    # object can match and there is no valid WHERE clause to build.

    return [] unless ( scalar @fixed_search );

    # \%results is indexed by the object key stored in the index table;
    # each value is a hashref of term => occurrences plus the summed
    # occurrences under FREQUENCY_KEY

    my $sth = $class->_execute_fulltext_search({ terms => \@fixed_search });
    my $results = $class->_fetch_raw_fulltext_results( $sth );
    $R->DEBUG && $R->scrib( 3, "Raw results listing: ", Dumper( $results ) );

    # Remove any items from the search that don't belong -- for
    # instance, if the search is 'AND' we need to ensure that each
    # term is represented.

    $class->_screen_results( $results, { search_type => $p->{search_type},
                                         terms       => \@fixed_search } );

    # Now create a list of [ object key, frequency ] items, sorted by
    # descending frequency; ties are ordered by object key so the
    # result order is stable from one call to the next

    my @frequency = sort { $b->[1] <=> $a->[1] || $a->[0] cmp $b->[0] }
                    map { [ $_, $results->{ $_ }->{ FREQUENCY_KEY() } ] }
                    keys %{ $results };

    my $return_type = $p->{return};
    if ( $return_type eq 'raw' )      { return \@frequency }
    if ( $return_type eq 'iterator' ) { return OpenInteract::FullTextIterator->new({
                                                            %{ $p }, results => \@frequency }) }
    if ( $return_type eq 'object' )   { return $class->_fulltext_object_listing( \@frequency, $p ) }

    $R->scrib( 0, "Unknown return type ($return_type) requested from full-text search" );
    return undef;
}


# Take the already-sorted list of [ object_key, frequency ] and turn
# it into a list of objects with the 'tmp_fulltext_score' property in
# each set to the frequency.
#
# Parameters:
#   $result_list - \@ of [ object_key, frequency ]
#   $p           - optional \%; every key is passed on as a fetch
#                  option and can override 'column_group'
#
# Returns: \@ of fetched objects in the order of $result_list. Keys for
# which nothing is fetched are skipped. An empty \@ is returned if
# $result_list is not an arrayref.

sub _fulltext_object_listing {
    my ( $class, $result_list, $p ) = @_;
    return [] unless ( ref $result_list eq 'ARRAY' );
    $p ||= {};
    my @object_list = ();
    foreach my $result ( @{ $result_list } ) {
        my ( $object_key, $score ) = @{ $result };
        my $object = OpenInteract::SPOPS->fetch_object_by_key(
                                                     $object_key,
                                                     { column_group  => COLUMN_GROUP,
                                                       skip_security => $p->{skip_security},
                                                       %{ $p } });

        # Nothing came back for this key: skip it. Setting the score
        # through $object_list[-1] would either autovivify a bogus
        # hashref in the list or overwrite the previous object's score.
        next unless ( ref $object );

        $object->{tmp_fulltext_score} = $score;
        push @object_list, $object;
    }
    return \@object_list;
}


########################################
# RULESET METHODS
########################################


# Register the indexing callbacks in the ruleset table handed to us:
# reindex_object is appended to the 'post_save_action' list and
# remove_object_from_index to the 'post_remove_action' list.
#
# Parameters:
#   $class    - class (or object) the rules are being installed for
#   $rs_table - \% ruleset table; the two lists are created if missing
#
# Returns: the name of this package.

sub ruleset_add {
    my ( $class, $rs_table ) = @_;
    my $target = ref $class || $class;

    my @hooks = ( [ post_save_action   => \&reindex_object ],
                  [ post_remove_action => \&remove_object_from_index ] );
    foreach my $hook ( @hooks ) {
        my ( $action, $handler ) = @{ $hook };
        push @{ $rs_table->{ $action } }, $handler;
    }

    my $R = OpenInteract::Request->instance;
    if ( $R->DEBUG ) {
        $R->scrib( 1, "Installed post_save for $target" );
    }
    return __PACKAGE__;
}


# Remove the previous object information from the index, tokenize the
# object and save the tokens/frequencies back to the index.
#
# Installed as a 'post_save_action' rule by ruleset_add. Only $self is
# used; any further arguments are ignored.
#
# Returns: always 1. A failed removal is not treated as an error here;
# a failure while storing terms propagates as an exception from
# _store_terms.

sub reindex_object {
    my ( $self ) = @_;
    my $R = OpenInteract::Request->instance;
    $R->DEBUG && $R->scrib( 1, "Trying to index ", ref $self, " (", $self->id, ")" );
    my $indexable = $self->_indexable_object_text;
    $R->DEBUG && $R->scrib( 2, "Indexable text: ", $indexable );
    my $wc = $self->_tokenize( $indexable );
    $R->DEBUG && $R->scrib( 2, "Found the following tokens:", Dumper( $wc ) );

    # In the 'real world', we'd start a transaction here...

    my $remove_rv = $self->remove_object_from_index;
    $R->DEBUG && $R->scrib( 1, "Results of removal: (",
                               ( defined $remove_rv ? $remove_rv : '' ), ")" );

    # Only touch the database when there is something to store. This is
    # a plain if-block on purpose: 'my $x = ... if COND' leaves the
    # variable in an undefined state when COND is false (see perlsyn).

    my $store_rv = 0;
    if ( ref $wc eq 'HASH' and keys %{ $wc } ) {
        $store_rv = $self->_store_terms( $wc );
    }
    $R->DEBUG && $R->scrib( 1, "Terms stored: ($store_rv)" );

    # ... and then commit it here if everything went ok

    return 1;
}


# Remove all instances of the object's terms from the index.
#
# Installed as a 'post_remove_action' rule by ruleset_add and also
# called by reindex_object before the new terms are stored. Only $self
# is used; any further arguments are ignored.
#
# Returns: whatever db_delete returns on success; a false value if the
# delete raised an exception. The exception is logged rather than
# propagated, so the removal stays best-effort for the caller.

sub remove_object_from_index {
    my ( $self ) = @_;
    my $object_key = $self->generate_object_key;
    my $R = OpenInteract::Request->instance;
    $R->DEBUG && $R->scrib( 1, "Trying to remove from index: ",
                               ref $self, " (", $self->id, ")" );
    my $rv = eval { $self->db_delete({ table => $TABLE_NAME,
                                       where => 'object_key = ?',
                                       value => [ $object_key ] }) };
    if ( $@ ) {
        # Do not let the failure vanish silently -- stale index rows
        # are hard to trace back afterwards.
        $R->scrib( 0, "Cannot remove object ($object_key) from $TABLE_NAME: $@" );
        return;
    }
    return $rv;
}


########################################
# INTERNAL METHODS
########################################


# Collect the text to index: the values of the fields named in the
# 'fulltext_field' entry of the object's CONFIG, joined with a single
# space, since all the text is indexed as one big field.
#
# Returns: the joined text, or undef (after logging) if 'fulltext_field'
# is not an arrayref.

sub _indexable_object_text {
    my ( $self ) = @_;
    my $R = OpenInteract::Request->instance;
    my $indexed_fields = $self->CONFIG->{fulltext_field};

    if ( ref $indexed_fields eq 'ARRAY' ) {
        my @values = ();
        foreach my $field ( @{ $indexed_fields } ) {
            push @values, $self->{ $field };
        }
        return join( ' ', @values );
    }

    $R->scrib( 0, "Cannot index object text -- no fields presented in config file." );
    return undef;
}


# Break up the text into tokens -- stemmed using Lingua::Stem and
# counted for occurrences. Words found in our STOPWORDS listing are
# dropped, as are stems that are too short or too long.
#
# The pipeline mirrors the one applied to search terms: lowercase with
# lc, drop stopwords, then stem. Stopwords have to go BEFORE stemming
# because the stem of a stopword is not necessarily the stopword
# itself, and such stems would otherwise end up in the index. Stems
# that are themselves stopwords are still dropped afterwards, as they
# always have been.
#
# Parameters:
#   $text - text to tokenize; undef is treated as no text
#
# Returns: \% of stemmed term => number of occurrences.

sub _tokenize {
    my ( $self, $text ) = @_;
    return {} unless ( defined $text );

    my $lowered = lc $text;
    my @raw_words = grep { ! $STOPWORDS{ $_ } } ( $lowered =~ /\w+/g );
    return {} unless ( scalar @raw_words );

    # stem() takes the whole word list and hands back an arrayref of
    # stems, so one call covers every word.
    my $stems = Lingua::Stem::stem( @raw_words );

    my %words = ();
    foreach my $stem ( @{ $stems } ) {
        next unless ( defined $stem );
        my $stem_length = length $stem;
        next if ( $stem_length < MIN_WORD_LENGTH || $stem_length > MAX_WORD_LENGTH );
        next if ( $STOPWORDS{ $stem } );
        $words{ $stem }++;
    }
    return \%words;
}


# Insert one index row per term for this object.
#
# Parameters:
#   $terms - \% whose keys are stemmed terms and whose values are the
#            number of times the term appears in the object
#
# Returns: number of terms for which the insert was executed
#
# Dies if the INSERT statement cannot be prepared or if binding or
# executing it for any term raises an exception.

sub _store_terms {
    my ( $self, $terms ) = @_;
    my $R = OpenInteract::Request->instance;
    my $object_key = $self->generate_object_key;

    my $columns      = join( ',', @SQL_FIELD_NAMES );
    my $placeholders = join( ',', ( '?' ) x scalar @SQL_FIELD_NAMES );
    my $sql = " INSERT INTO $TABLE_NAME ( $columns )\n VALUES ( $placeholders )";

    my $sth = eval { $self->global_db_handle->prepare( $sql ) };
    if ( $@ ) {
        die "Cannot prepare statement ($sql) for inserting terms! $@";
    }

    my $stored = 0;
    foreach my $term ( keys %{ $terms } ) {
        my $occur = $terms->{ $term };
        if ( $R->DEBUG ) {
            $R->scrib( 2, "Storing $object_key: $term ($occur)" );
        }

        # Placeholders are numbered from 1, the SQL_ORDER_* constants
        # from 0, hence the '+ 1'.
        eval {
            $sth->bind_param( SQL_ORDER_KEY + 1,   $object_key, SQL_VARCHAR );
            $sth->bind_param( SQL_ORDER_TERM + 1,  $term,       SQL_VARCHAR );
            $sth->bind_param( SQL_ORDER_COUNT + 1, $occur,      SQL_INTEGER );
            $sth->execute;
            $stored++;
        };
        if ( $@ ) {
            die "Cannot execute\n$sql\nwith $object_key/$term/$occur: $@";
        }
    }
    return $stored;
}


# Prepare and execute the search statement for a list of terms.
#
# $p->{terms} should be an arrayref of words already stemmed and with
# the STOPWORDS picked out -- it's a clean list. Each term is bound to
# one placeholder of the statement built by _build_sql_search_statement.
#
# Returns: the executed statement handle, ready to be fetched from
# (see _fetch_raw_fulltext_results)
#
# Dies, with the SQL in the message, if preparing or executing fails.

sub _execute_fulltext_search {
    my ( $class, $p ) = @_;
    my $R   = OpenInteract::Request->instance;
    my $sql = $class->_build_sql_search_statement( $p->{terms} );

    my $sth = eval {
        my $handle = $R->db->prepare( $sql );
        $handle->execute( @{ $p->{terms} } );
        $handle;
    };
    if ( $@ ) {
        die "Cannot search for terms!\n$sql\nError: $@";
    }
    return $sth;
}


# Build the SELECT statement used to look up terms in the index.
#
# Parameters:
#   $term_list - \@ of terms; only the NUMBER of terms matters here,
#                since every term becomes a '?' placeholder to be
#                bound at execute time
#
# Returns: the SQL string, selecting @SQL_FIELD_NAMES in order.

sub _build_sql_search_statement {
    my ( $class, $term_list ) = @_;

    $term_list ||= [];

    # Resulting clause is something like:
    # WHERE term = ?                         -- for one term
    # WHERE term = ? OR term = ?             -- for two terms
    # WHERE term = ? OR term = ? OR term = ? -- for three terms
    # ...
    #
    # With no terms at all a bare 'WHERE' would be invalid SQL, so use
    # a condition that is never true: the statement stays valid, takes
    # no bound values and matches no rows.

    my $term_clause = ( scalar @{ $term_list } )
                        ? 'WHERE ' . join( ' OR ', map { ' term = ? ' } @{ $term_list } )
                        : 'WHERE 1 = 0';
    return "SELECT ". join( ', ', @SQL_FIELD_NAMES ) . "\n" .
           "  FROM $TABLE_NAME\n" .
           " $term_clause";
}


# Read every row from an executed search statement and group the rows
# by object key.
#
# Returns a hashref with:
#
#   object_key->{term} = count
#   object_key->{_FREQ_} = all_counts
#
# For example:
#
#   {
#     '34e22f75a1c8b1fd488114b7ae4de61e' => { congress => 2, senate => 3, _FREQ_ => 5 },
#     '40624e3ef223bf88678b5c3606e00333' => { congress => 8, senate => 1, _FREQ_ => 9 },
#     ...,
#   }
#
# Not every object key has to carry all of the terms searched for.

sub _fetch_raw_fulltext_results {
    my ( $class, $sth ) = @_;

    # One target variable per selected column, slotted in by the
    # column's position in the SELECT list
    my ( $key, $word, $occur );
    my @targets = ();
    $targets[ SQL_ORDER_KEY ]   = \$key;
    $targets[ SQL_ORDER_TERM ]  = \$word;
    $targets[ SQL_ORDER_COUNT ] = \$occur;
    $sth->bind_columns( @targets );

    my %found = ();
    while ( $sth->fetch ) {
        my $entry = ( $found{ $key } ||= {} );
        $entry->{ $word }            = $occur;
        $entry->{ FREQUENCY_KEY() } += $occur;
    }
    return \%found;
}


# Remove results that do not belong in the resultset -- currently this
# only screens out results in an 'AND' search that don't have all the
# terms found.
#
# Parameters:
#   $results - \% of object_key => { term => count, ... } as built by
#              _fetch_raw_fulltext_results; modified in place
#   $p       - \% with 'search_type' (screening only happens for 'all',
#              compared case-insensitively) and 'terms' (\@ of stemmed
#              terms)
#
# Returns: $results

sub _screen_results {
    my ( $class, $results, $p ) = @_;
    my $R = OpenInteract::Request->instance;
    $p ||= {};

    return $results unless ( lc $p->{search_type} eq 'all' );

    # If this was an AND search, knock off all the results that didn't
    # have matches for all the terms. Every distinct term is looked up
    # in the result itself; comparing key counts instead goes wrong as
    # soon as the term list holds the same term twice, since the result
    # can only ever hold it once.

    my %seen = ();
    my @required = grep { ! $seen{ $_ }++ } @{ $p->{terms} || [] };

    my $num_removed = 0;
    foreach my $object_key ( keys %{ $results } ) {
        my $found = $results->{ $object_key };
        next unless ( grep { ! exists $found->{ $_ } } @required );
        delete $results->{ $object_key };
        $num_removed++;
    }
    $R->DEBUG && $R->scrib( 1, "Removed ($num_removed) items from the list ",
                               "since they didn't match all the terms" );
    return $results;
}


# Create a unique identifier for the object from the class and ID (not
# used anymore, will probably be removed)

#sub _build_fulltext_object_id {
#    return $_[0]->{tmp_ft_oid} = join '>>', ref $_[0], $_[0]->id;
#}


# Deserialize the unique object identifier (not used anymore, will
# probably be removed)

#sub _crack_fulltext_object_id {
#    return split '>>', $_[1];
#}

1;

__END__

=pod

=head1 NAME

OpenInteract::FullText - Metadata layer for objects to implement simple full-text searching

=head1 SYNOPSIS

 # In object's spops.perl file
 myobj => {
  isa => [ qw/ OpenInteract::FullText ... / ],
  fulltext_field => [ qw/ description title / ],
  ...
 }

 # All 'save()' calls to the object will trigger the object's
 # 'description' and 'title' fields being indexed.

 my $raw_result_list = OpenInteract::FullText->search_fulltext_index({
                                        return       => 'raw',
                                        search_terms => [ 'google', 'engine' ] });

=head1 DESCRIPTION

This module implements a few simple rules, along with some
implementation goop, that allow objects to be full-text indexed just
by changing a few lines in the configuration file. By putting this
module in the @ISA (done through the configuration file) of a SPOPS
data object, that object can then call methods to index itself, and
calls to search the index can return instances of that object.

The design of this module should make it simple to swap out various
text indexing solutions, so objects can treat full-text indexing as a
black box -- some boxes give better (or different) results than
others, but they all take the same inputs and give the same outputs.

=head1 CLASS METHODS

B<create_class_index()>

Initialize a class into the index by retrieving all of its objects and
calling the I<reindex_object> method on each (see below).

Currently you can call this from the command line with the script
'script/search_index.pl' within the 'full_text' package.

B<search_fulltext_index( \%params )>

Parameters:

=over 4

=item *

B<search_terms> (\@)

Terms to be searched for. You do not need to deal with stemming,
lowercasing the words or such things -- that is all part of the black
box :)

=item *

B<search_type> ($) (defaults to 'any')

Type of search this is. Currently we support only very simple boolean
operations -- either any of the terms searched, or all of them. No
in-betweens.

=item *

B<include_class> (\@) (optional)

List of classes for which we should display results; any search items
found not in these classes will not be included in the results.

=item *

B<exclude_class> (\@) (optional)

List of classes for which results should NOT be included.

=item *

B<return> ($) (optional, default 'object')

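What do you want returned? The options are:

 object   - Arrayref of objects
 raw      - Arrayref of [ object key, frequency ] pairs, highest
            frequency first
 class    - Two arrayrefs: one of classes, one of IDs (not currently
            implemented)
 iterator - OpenInteract::FullTextIterator object created from the
            sorted results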

=back

=head1 RULESET METHODS

B<ruleset_add( $class, \%ruleset_table )>

Adds the necessary rules to the $class that puts this class in its
ISA. Currently, these rules consist of:

=over 4

=item *

B<post_save_action>: reindex this object -- first obliterate all
references in the index, then build the references anew (called on
both INSERTs and UPDATEs)

=item *

B<post_remove_action>: remove all references to this object from the
index

=back

B<reindex_object()>

The object updates any existing information in the index with its new
or updated content. I<OpenInteract::FullText> takes care of replacing the
old links with the new.

B<remove_object_from_index()>

Removes all instances of the object from the index.

=head1 PRIVATE METHODS

The following methods are private, although some of them might move to
public as the API stabilizes.

B<_indexable_object_text()>

Gets the text out of the object to index. Currently, we treat all text
from the object as one big field.

B<_tokenize( $text )>

Breaks text down into tokens. This process is very simple. First we
break the text into words, then we lower case each word, then we
'stem' each word. Here is a brief description of stemming:

 Truncation - Also referred to as "root/suffix management" or
 "Stemming" or "Word Stemming", truncation allows some search engines
 to recognize and shorten long words such as "plants" or "boating" to
 their root words (or word stems) "plant" and "boat." This makes
 searching for such words much easier because it is not necessary to
 consider every permutation of that word when trying to find it. In a
 search, the ability to enter the first part of a keyword, insert a
 symbol (usually *), and accept any variant spellings or word endings,
 from the occurrence of the symbol forward (e.g., femini* retrieves
 feminine, feminism, feminist, etc.). See also word variants, plurals
 and singulars.

(From: http://ollie.dcccd.edu/library/Module2/Books/concepts.htm)

We use the L<Lingua::Stem> module for this, which implements the
I<Porter algorithm> for stemming, as do most implementations,
apparently. (This is something that I<OpenInteract::FullText> treats as a
black box itself :)

Parameters:

=over 4

=item *

B<text> ($)

Text to tokenize

=back

B<_store_terms( \%term_info )>

Parameters:

=over 4

=item *

B<term_info> (\%)

Information about the terms associated with this object. Keys are the
stemmed terms, values are the number of times the stemmed term appears
in the object.

=back

B<_build_fulltext_object_id()>

Returns the string used to uniquely identify an object.

B<_crack_fulltext_object_id( $object_identifier )>

Takes the $object_identifier as created by I<_build_fulltext_object_id> and
returns a two-element list of the object class and ID.

B<_fulltext_search_terms( \%params )>

Performs the actual search, putting the results into an intermediate
format. Returns a hashref with the keys as object identifiers and the
values a hashref of stemmed terms with the number of occurrences of
each term as the value there. For instance, doing a search for
'congress' and 'senate' might bring up:

  $oid = { 'OpenInteract::News||44' => { congress => 2, 
                                         senate => 3 },
           'OpenInteract::BasicPage||/toolkit/congress' => { congress => 8, 
                                                             senate => 1 },
           ... 
  };

=head1 TO DO

See the TO DO list in the documentation found in this package. (Under
'doc/' in the package distribution, or via the system documentation
available through the OpenInteract browser interface.)

=head1 BUGS

None known.

=head1 COPYRIGHT

Copyright (c) 2001 intes.net, inc.. All rights reserved.

This library is free software; you can redistribute it and/or modify
it under the same terms as Perl itself.

=head1 AUTHORS

Chris Winters <chris@cwinters.com>

=cut
