#!/usr/bin/perl -w

# use perl                                  -*- mode: Perl; -*-
eval 'exec perl -S $0 "$@"'
  if $running_under_some_shell;

use vars qw($running_under_some_shell);         # no whining!

require 5.003;

# grepmail version 2.1

# Grepmail searches a normal or gzip'ed mailbox for a given regular expression,
# and returns those emails that match it. Piped input is allowed, and date
# restrictions are supported.

# Do a pod2text on this file to get full documentation, or pod2man to get
# man pages.

# Written by David Coppit (coppit@cs.virginia.edu,
#  http://www.cs.virginia.edu/~dwc3q/index.html)

# Please send me any modifications you make. (for the better, that is. :)

# This code is distributed under the GNU General Public License (GPL). See
# http://www.opensource.org/gpl-license.html and http://www.opensource.org/.

# Version History (major changes only)
# 2.1 Added -l,-r, and -e, as suggested by Reinhard Max <max@suse.de>. Now
#   uses about 1/3 the memory, and is a little faster.
# 2.0 Added POD documentation at the end of the script (thanks, Jeffrey
#   Haemer <jsh@boulder.qms.com>). -h for headers only -b for body only
# 1.9 "Ignore empty files" by Gerald Pfeifer <pfeifer@dbai.tuwien.ac.at>.
#   Emails without dates are now automatically output no matter what the
#   date specification is. (Better safe than sorry!)
# 1.7 Sped up by Andrew Johnson. It no longer looks for dates unless
#   the email matches the search string.
# 1.6 removed use of Compress::Zlib because it was 30% slower, complicated the
#   code, and because any user with gzip'd mail has zcat...
# 1.5 Andrew Johnson <ajohnson@gpu.srv.ualberta.ca> fixed a couple of bugs.
# 1.4 Incorporated conditional loading of the date module, use of
#   compress::Zlib instead of shelling out to gunzip, as well as some bug
#   fixes, as submitted by Andrew Johnson <ajohnson@gpu.srv.ualberta.ca>
#   (Many thanks!). Also restructured the code a bit.
# 1.3 Made it pipeable so you can do:
#   grepmail <pattern> file | grepmail <pattern>
# 1.1 Support for dates.
# 1.0 Initial version, with -v -i, and gzip support

use strict;
use FileHandle;
use Getopt::Std;
use Carp;

sub usage
{
<<EOF;
usage: grepmail [[-e] <expr>] [-vihblr] [-d \"datespec\"] <files...>

-h Search headers only
-b Search body only
-l Output the names of files having an email matching the expression
-r Output the names of the files and the number of emails matching the
   expression
-i Ignore case in the search expression
-v Output emails that don't match the expression
-e Explicitely name expr (when searching for strings beginning with "-")

Date specifications must be of the form of:
a date like "today", "1st thursday in June 1992", "05/10/93",
  "12:30 Dec 12th 1880", "8:00pm december tenth",
OR "before", "after", or "since", followed by a date as defined above,
OR "between <date> and <date>", where <date> is defined as above.

Files can be ASCII or gzip'd ASCII. You can also pipe gzip'd or normal ASCII to
grepmail.
EOF
}

#-------------------------------------------------------------------------------

my (%opts, $pattern, $unzipMethod);

BEGIN
{
  $pattern = "";

  die usage if ($#ARGV < 0);

  if ($ARGV[0] !~ /^-/)
  {
    $pattern = shift @ARGV;
  }

  getopt("ed",\%opts);

  if ($opts{e})
  {
    die "You specified two search patterns\n" if ($pattern ne "");
    
    $pattern = $opts{e};
  }
  elsif ($pattern eq "")
  {
    $pattern = ".";
  }

  if ($opts{d})
  {
    unless (eval "require Date::Manip")
    {
      die "You specified -d, but do not have Date::Manip. Get it from CPAN.\n";
    }

    import Date::Manip;
  }
}

#-------------------------------------------------------------------------------

# Make the pattern insensitive if we need to
$pattern = "(?i)$pattern" if ($opts{i});

my ($daterestriction, $date1, $date2);

if ($opts{d})
{
  ($daterestriction,$date1,$date2) = &ProcessDate($opts{d});
}
else
{
  $daterestriction = "none";
}


# For debugging
#foreach my $i (keys %opts)
#{
#  print "$i: $opts{$i}\n";
#}
#print "PATTERN: $pattern\n";
#print "ARGV: @ARGV\n";
#die;


# If the user provided input files...
if (@ARGV)
{
  # For each input file...
  my $file;
  foreach $file (@ARGV)
  {
    # First of all, ignore empty files.
    next if -z $file;

    # If it's not a gzipped file
    if ($file !~ /\.(gz|Z)$/)
    {
      warn "** Skipping binary file: '$file' **\n" and next if -B $file;
      my $fileHandle = new FileHandle;
      $fileHandle->open($file) || die "Can't open $file.\n";
      ProcessMailFile($fileHandle,$file);
      $fileHandle->close();
    }
    # If it is a gzipped file
    else
    {
      my $tempFile = "/tmp/$$";
      `zcat $file > $tempFile`;
      my $fileHandle = new FileHandle;
      $fileHandle->open($tempFile)
          || die "Can't open temporary file used to decompress the file $file.\n";
      ProcessMailFile($fileHandle,$file);
      $fileHandle->close();
      unlink $tempFile;
    }
    
  }
}
# Using STDIN
else
{ 
  my $fileHandle = new FileHandle;
  $fileHandle->open("<&STDIN") || die "Can't dup STDIN $!";

  # If it looks binary, try to unzip it.
  if (-B $fileHandle)
  {
    binmode $fileHandle;
    my $tempFile = "/tmp/$$";
    open(TMP,"|zcat >$tempFile") || die "Can't create $tempFile $!";
    binmode TMP;
    print TMP while <$fileHandle>;
    close TMP;
    $fileHandle->close()||die "Error writing $tempFile $!";
    $fileHandle->open($tempFile) || die "Can't open $tempFile $!";
    ProcessMailFile($fileHandle,"Gzip'd standard input");
    $fileHandle->close();
    unlink $tempFile;      
  }
  # Otherwise process it directly
  else
  {
    ProcessMailFile($fileHandle,"Standard input");
    $fileHandle->close();
  }
}

#-------------------------------------------------------------------------------

sub ProcessMailFile ($$)
{
my $fileHandle = shift @_;
my $fileName = shift @_;
my ($email,$header,$body,$found,$firstLine,$numberOfMatches);

$firstLine = 1;
$email = "";
$numberOfMatches = 0;

while (<$fileHandle>)
{
  # Code modified from Mail::Util
  if ((!$firstLine && /^From .*\d{4}/) || eof) 
  {

    if ($opts{b} || $opts{h})
    {
      my $bodyIndex = index ($email, "\n\n");

      $found = (substr($email,$bodyIndex+2) =~ /$pattern/o) if ($opts{b});
      $found = (substr($email,0,$bodyIndex) =~ /$pattern/o) if ($opts{h});
    }
    else
    {
      $found = ($email =~ /$pattern/o);
    }

    $found = !$found if ($opts{v});

    # Don't print the email if -l was given
    if ($found && &CheckDate(\$email))
    {
      if ($opts{l})
      {
        print "$fileName\n";

        # We can return since we found at least one email that matches.
        return;
      }
      elsif ($opts{r})
      {
        $numberOfMatches++;
      }
      else
      {
        print $email;
      }
    }

    $email = $_;
    $found = 0;
  }
  else
  {
    $firstLine = 0;
    $email .= $_;
  }
}

if ($opts{r})
{
 print "$fileName: $numberOfMatches\n";
}

}


#-------------------------------------------------------------------------------

sub CheckDate($)
{
my $emailref = shift;
my ($emailDate, $isInDate);
$emailDate = "";
$isInDate = 0;

if ($opts{d})
{
  # The email might not have a date. In this case, print it out anyway.
  if ($$emailref =~ /^Date:\s*(\S*\s*\S*\s*\S*\s*\S*\s*\S*)/m)
  {
    $emailDate = &ParseDate($1);
    $isInDate = &IsInDate($emailDate,$daterestriction,$date1,$date2);
  }
  else
  {
    $isInDate = 1;
  }
}
else
{
  $isInDate = 1;
}

return $isInDate;

}

#-------------------------------------------------------------------------------

# Figure out what kind of date restriction they want, and what the dates in
# question are.
sub ProcessDate($)
{
my ($daterestriction, $date1, $date2);

if(!defined($_[0]))
{
  return ("none","","");
}

my $datestring = $_[0];

if ($datestring =~ /^before (.*)/)
{
  $daterestriction = "before";
  $date1 = &ParseDate($1);
  $date2 = "";

  if (!$date1)
  {
    die "\"$1\" is not a valid date\n";
  }
}
elsif ($datestring =~ /^(after |since )(.*)/)
{
  $daterestriction = "after";
  $date1 = &ParseDate($2);
  $date2 = "";

  if (!$date1)
  {
    die "\"$2\" is not a valid date\n";
  }
}
elsif ($datestring =~ /^between (.*) and (.*)/)
{
  $daterestriction = "between";
  $date1 = &ParseDate($1);
  $date2 = &ParseDate($2);

  if (!$date1)
  {
    die "\"$1\" is not a valid date\n";
  }
  if (!$date2)
  {
    die "\"$2\" is not a valid date\n";
  }

  # Swap the dates if the user gave them backwards.
  if ($date1 gt $date2)
  {
    my $temp;
    $temp = $date1;
    $date1 = $date2;
    $date2 = $temp;
  }

}
elsif ($date1 = &ParseDate($datestring))
{
  $daterestriction = "on"
}
else
{
  die "Invalid date specification. Use \"$0 -h\" for help\n";
}

return ($daterestriction,$date1,$date2);

}

#-------------------------------------------------------------------------------

sub IsInDate($$$$)
{
my ($emailDate,$daterestriction,$date1,$date2);
$emailDate = shift @_;
$daterestriction = shift @_;
$date1 = shift @_;
$date2 = shift @_;

# Here we do the date checking.
if ($daterestriction eq "none")
{
  return 1;
}
else
{
  if ($daterestriction eq "before")
  {
    if ($emailDate lt $date1)
    {
      return 1;
    }
    else
    {
      return 0;
    }
  }
  elsif ($daterestriction eq "after")
  {
    if ($emailDate gt $date1)
    {
      return 1;
    }
    else
    {
      return 0;
    }
  }
  elsif ($daterestriction eq "on")
  {
    if (&UnixDate($emailDate,"%m %d %Y") eq &UnixDate($date1,"%m %d %Y"))
    {
      return 1;
    }
    else
    {
      return 0;
    }
  }
  elsif ($daterestriction eq "between")
  {
    if (($emailDate gt $date1) && ($emailDate lt $date2))
    {
      return 1;
    }
    else
    {
      return 0;
    }
  }
}

}

#-------------------------------------------------------------------------------

=head1 NAME

grepmail - search mailboxes for mail matching a regular expression

=head1 SYNOPSIS

  grepmail [-e <regex>] [-vihblr] [-d "datespec"] [mailbox ...]

=head1 DESCRIPTION

=over 2

I<grepmail> looks for mail messages containing a pattern, and prints the
resulting messages on standard out.

By default I<grepmail> looks in both header and body for the specified pattern.

When redirected to a file, the result is another mailbox, which can, in turn,
be handled by standard User Agents, such as I<elm>, or even used as input for
another instance of I<grepmail>.

The pattern is optional if -d is used, and must precede all flags unless it is
specified using -e.

=back

=head1 OPTIONS AND ARGUMENTS

Many of the options and arguments are analogous to those of grep.

=over 8

=item B<pattern>

The pattern to search for in the mail message.  May be any Perl regular
expression, but should be quoted on the command line to protect against
globbing (shell expansion).

=item B<mailbox>

Mailboxes must be traditional, UNIX C</bin/mail> mailbox format.  The
mailboxes may be zipped by gzip, in which case zcat must be installed on the
system.  If no mailbox is specified, takes input from stdin.

=item B<-b>

Look only in the bodies of mail messages.

=item B<-h>

Look only in the headers of mail messages.

=item B<-i>

Make the search case-insensitive (by analogy to I<grep -i>).

=item B<-v>

Invert the sense of the search, (by analogy to I<grep -v>).

=item B<-l>

Output the names of files having an email matching the expression, (by analogy
to I<grep -l>).

=item B<-r>

Generate a report of the names of the files containing emails matching the
expression, along with a count of the number of matching emails.

=item B<-e>

Explicitely specify the search pattern. This is useful for specifying patterns
that begin with "-", which would otherwise be interpreted as a flag.

=item B<-d>

Date specifications must be of the form of:
  - a date like "today", "1st thursday in June 1992", "05/10/93", "12:30 Dec 12th 1880", "8:00pm december tenth",
  - OR "before", "after", or "since", followed by a date as defined above,
  - OR "between <date> and <date>", where <date> is defined as above.

=back

=head1 EXAMPLES

Get all email that you mailed yesterday

  grepmail -d yesterday sent-mail

Get all email that you mailed before the first thursday in June 1998 that
pertains to research:

  grepmail research -d "before 1st thursday in June 1992" sent-mail

Get all email you received since 8/20/98 that wasn't about research or your
job, ignoring case:

  grepmail "(research|job)" -i -d "since 8/20/98" -v saved-mail

Get all email about mime but not about Netscape:

  grepmail mime -b saved-mail | grepmail Netscape -v

Print a list of all mailboxes containing a message from Rodney:

  grepmail "^From.*Rodney" -h -l saved-mail*

Print a count of the number of messages about grepmail in all saved-mail
mailboxes:

  grepmail grepmail -b -r saved-mail*

=head1 AUTHOR

  David Coppit, <coppit@cs.virginia.edu>,
  http://www.cs.virginia.edu/~dwc3q/index.thml

=head1 SEE ALSO

elm(1), mail(1), grep(1), perl(1), printmail(1), Mail::Internet(3)
Crocker,  D.  H., Standard for the
Format of Arpa Internet Text Messages, RFC822.

=cut

