# spell: a mostly traditional implementation of the UNIX spelling checker.
# Copyright 1999 Nathan Scott Thompson ( quimby at city-net dot com )
# You may use this according to the GNU Public License: see http://www.gnu.org
# Hope you have fun with this - maybe even find it useful.
# Documentation at bottom.

use Spelling;
use strict;

# Locate the dictionary directory: $SPELLDIR wins, then /usr/dict/ when it
# exists, otherwise an empty prefix (the current directory).
my $Path     = $ENV{SPELLDIR} || ( -e '/usr/dict/' ? '/usr/dict/' : '' );

# $Path is used below as a plain string prefix, so a directory given without
# a trailing separator (e.g. SPELLDIR=/usr/local/dict) would otherwise yield
# `/usr/local/dicthlista'.  Only existing directories are touched, so a value
# that is deliberately a file-name prefix keeps working as before.
$Path .= '/' if $Path ne '' and $Path !~ m{[/\\]$} and -d $Path;

my $Wordfile = $Path . 'hlista';        # hashed American word list
my $Stopfile = $Path . 'hstop';         # hashed stop list
my $History  = $Path . 'spellhist';     # history file
my $Filter   = 'deroff';                # word filter; -p selects depod
my ($British, $Verbose, $Expose);       # set by -b, -v and -x

# Parse leading option clusters such as `-bv'.  Testing @ARGV first ends the
# loop cleanly once the arguments run out, instead of matching against an
# undefined $ARGV[0] (which warns under -w).
while ( @ARGV and $ARGV[0] =~ /^-(\w+)/ )
{
    foreach ( split //, $1 )
    {
        /b/ and $British = 1;           # British spelling
        /p/ and $Filter = 'depod';      # filter pod instead of troff
        /v/ and $Verbose = 1;           # also report stemmed words
        /x/ and $Expose = 1;            # print every lookup attempt
        /[^bpvx]/ and die "usage: spell [-b] [-p] [-v] [-x] [file] ...\n";
    }
    shift @ARGV;                        # drop the option cluster just handled
}

# Choose the spelling convention.  The -b option swaps in the British word
# list; either way the Stems routine for the chosen convention is called.
$Wordfile = $Path . 'hlistb' if $British;

if ( $British ) { Stems::Britishise();  }
else            { Stems::Americanize(); }

# Load the hashed word list and the hashed stop list.  The three-argument
# form of open treats each name purely as a file to read, so a path built
# from $SPELLDIR can never be taken as an output mode or a piped command.
open( DICT, '<', $Wordfile )
    or die "Can't open word list `$Wordfile': $!\n";
my $wordlist = Hlist->new( FileHandle => \*DICT )
    or die "Bad hashed word list: $Wordfile\n";
close DICT;

open( STOP, '<', $Stopfile )
    or die "Can't open stop list `$Stopfile': $!\n";
my $stoplist = Hlist->new( FileHandle => \*STOP )
    or die "Bad hashed word list: $Stopfile\n";
close STOP;

# Rely on the filter to extract words from the input.
# Direct the output of the filter into a temporary file.
# Is this the most portable approach?

# Capture the filter's output in an anonymous temporary file.  Opening `+>'
# on undef (perl 5.8 or later) yields a private scratch file: it has no name
# for concurrent runs to collide on, needs no writable working directory,
# and disappears on close, so nothing is left behind if the filter dies.
open( TEMP, '+>', undef )
    or die "Can't open temporary file: $!\n";

# The filter is loaded with TEMP selected so that its output lands there.
# NOTE(review): `-w' presumably asks the filter for one word per line --
# confirm against deroff/depod.
select TEMP;
unshift( @ARGV, '-w' );
require $Filter;
select STDOUT;

# Gather just the unique words from the output of the filter: the first
# run of non-blank characters on each line.

my %words;
seek( TEMP, 0, 0 )
    or die "Can't rewind temporary file: $!\n";
while ( my $line = <TEMP> )
{
    # A capture avoids $&, which slows every regex on older perls.
    $words{$1} = 1 if $line =~ /(\S+)/;
}
close TEMP;

# Sort the unique input words.
# Lookup each word, trying variations in capitalization.  
# If the word matches the word list, go to the next word.
# If the word matches the stop list, it's a misspelling.
# Lacking a match, lookup each plausible stem.  
# Print the unmatched word.

# @rejects collects the unmatched words; @stemmed collects (with -v) the
# words accepted only after stemming, each prefixed by the second field
# of the matching stems() entry.
my (@rejects,@stemmed);

# The sort is case-insensitive, with a plain comparison as tie-break so that
# words differing only in case (`Perl', `perl') come out in the same order
# on every run rather than in hash order.
WORD:
foreach my $word ( sort { lc($a) cmp lc($b) or $a cmp $b } keys %words )
{
    foreach my $form ( capitalizations( $word ) )
    {
        print( "=$form\n" ) if $Expose;
        next WORD if $wordlist->contains( $form );      # accepted as written
        last if $stoplist->contains( $form );           # listed misspelling

        # NOTE(review): each stems() entry is used as a two-element array
        # ref -- [0] is looked up, [1] is reported with -v; confirm the
        # exact meaning against stems() in Spelling.
        foreach my $stem ( stems( $form ) )
        {
            print( "=", $stem->[0], "\n" ) if $Expose;
            if ( $wordlist->contains( $stem->[0] ) )
            {
                push @stemmed, sprintf( "%s\t%s", $stem->[1], $word )
                    if $Verbose;
                next WORD;
            }
        }
    }
    push @rejects, $word;
}

# Report one entry per line.  With the output field separator set to a
# newline, ending each list with an empty string terminates every entry,
# the last one included (and prints nothing for an empty list).
# $, stays set for the remainder of the program.
$, = "\n";
foreach my $report ( \@rejects, ( $Verbose ? \@stemmed : () ) )
{
    print @$report, "";
}

# Append this run's rejects to the history file under a `user month-day-year'
# header line.  Best effort: if the file cannot be opened, nothing is recorded.
if ( open( HIST, '>>', $History ) )
{
    # localtime counts months from 0 and years from 1900; convert both to
    # calendar values before writing them out.
    my ($day,$mo,$yr) = (localtime)[3,4,5];
    $mo += 1;
    $yr += 1900;

    # getlogin/getpwuid are not implemented on every platform, hence the
    # eval.  If they die, $name is left undefined, so the fallback has to
    # be applied again afterwards.
    my $name;
    eval { $name = getlogin() || getpwuid($<) || 'Quimby'; };
    $name = 'Quimby' unless defined $name;

    # Build a single string so the layout does not depend on the current
    # value of $, : header line, then one reject per line.
    print HIST join( "\n", "$name $mo-$day-$yr", @rejects, "" );
    close HIST;
}

# Return a list of the given word and variant capitalizations.
# Case is significant in the dictionary; an uncapitalized proper name
# is a spelling error.  Most acronyms or initialisms should be uppercase.
# But we must also accept uppercase spellings of proper names,
# and uppercase or capitalized spellings for normally lowercase words.
# Weird capitalization is not forgiven.
#
# Takes:   the word as it appeared in the input.
# Returns: the word itself first, then the alternative spellings to try,
#          each spelling listed once (in scalar context, their count).

sub capitalizations
{
    my ($word) = @_;
    my @forms = ($word);

    if ( $word =~ /^[A-Z']+s?$/ )       # All caps with optional 's' suffix
    {
        # Given the word `SCALARs', try `Scalars' and `scalars' also.

        push( @forms, ucfirst( lc($word) ), lc($word) );
    }
    elsif ( $word =~ /^[A-Z]/ )         # First character capitalized
    {
        # Given the word `Scalars', try `scalars' also.

        push( @forms, lcfirst($word) );
    }

    # For a word such as `A' or `As' the capitalized variant is the word
    # itself; drop repeats, keeping first-seen order, so that no spelling
    # is looked up twice.

    my %seen;
    return grep { !$seen{$_}++ } @forms;
}

__END__

=head1 NAME

B<spell> - print spelling errors

=head1 SYNOPSIS

 spell [ -b ] [ -p ] [ -v ] [ -x ] [ file ] ...
 spellin {hashfile|size} [ file ] ...
 spellout [ -d ] hashfile [ file ] ...

=head1 DESCRIPTION

B<spell> examines the words in the given files (or standard input)
and prints a list of those that it doesn't recognize.  Capitalization
is significant for proper names, acronyms and initialisms.

B<spell> strips words that look like troff, eqn and tbl requests.
Words are matched to a list of word stems and if no match is found,
prefixes and suffixes are stripped until a match is found or the
word is rejected.  Common misspellings that could be accepted due to the
over-stripping of affixes are rejected with the help of a stop list.

The B<-b> option selects British spelling over American (the default).

The B<-p> option filters Perl pod directives instead of troff (the default).
This filter passes only pod text and removes many Perl keywords, functions
and constructs.

The B<-v> option causes acceptable words to be also printed if they
must be stripped of affixes before they match.  The applied affix rules
are printed with each word.

The B<-x> option causes all lookup attempts to be printed.
Each attempt is indicated by `='.

Results also accumulate in the history file; this file is helpful
in finding words that could be added to the word list.  The history file
must be manually purged.

B<spellin> opens the given hashed word list and adds words, supplied
one per line, from standard input.  The resulting hashed list is printed
to standard output.  If no hashed list is given, the fixed capacity
must be given for the new hashed list to be created.

B<spellout> opens the indicated hashed word list and matches words,
supplied one per line, from standard input.  Words that do not literally
match are printed.  The B<-d> option causes only duplicates to be printed.

=head1 ENVIRONMENT

The environment variable B<SPELLDIR> may provide the dictionary path
to override the default C</usr/dict>.

=head1 FILES

 /usr/dict/hlista:      hashed American word list
 /usr/dict/hlistb:      hashed British word list
 /usr/dict/hstop:       hashed stop list
 /usr/dict/spellhist:   history file

=head1 SEE ALSO 

 deroff
 depod

=head1 BUGS

The lexically-based approach is inherently imperfect.
Misspellings can slip by from over-stemming or hash collision.

B<spell> errs on the side of being overly permissive, where other
spelling checkers may be overly restrictive.
You decide which is to your taste.
