###########################################################################
#
# EMAILPlug.pm - a plugin for parsing email files
#
# A component of the Greenstone digital library software
# from the New Zealand Digital Library Project at the 
# University of Waikato, New Zealand.
#
# Copyright (C) 1999 New Zealand Digital Library Project
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
#
###########################################################################



# EMAILPlug
#
# by Gordon Paynter (gwp@cs.waikato.ac.nz)
#
# Email plug reads email files.  These are named with a simple
# number (i.e. as they appear in mh_mail folders) or with the 
# extension .email
#
# Document text:
#   The document text consists of all the text 
#   after the first blank line in the document.
#
# Metadata:
#   $Headers      All the header content
#   $Subject      Subject: header
#   $To           To: header
#   $From         From: header - this will be stored as Creator
#   $DateText     Date: header
#   $Date         Date: header in GSDL format (eg: 19990924)

package EMAILPlug;

use SplitPlug;

use sorttools;
use util;


# EMAILPlug is a sub-class of SplitPlug.

sub BEGIN { 
    @ISA = ('SplitPlug');
}

# Create a new EMAILPlug object with which to parse a file.
# Accomplished by creating a new BasPlug and using bless to 
# turn it into an EMAILPlug.

sub new {
    my ($class) = @_;
    my $self = new BasPlug ("EMAILPlug", @_);

    return bless $self, $class;
}

sub get_default_process_exp {
    my $self = shift (@_);

    return q^\d+(\.email)?$^;
}

# This plugin splits the mbox mail files at lines starting with From<sp>
sub get_default_split_exp {
    return q^From .*\n^;
}


# do plugin specific processing of doc_obj
sub process {
    my $self = shift (@_);
    my ($textref, $pluginfo, $base_dir, $file, $metadata, $doc_obj) = @_;
    my $outhandle = $self->{'outhandle'};

    # Check that we're dealing with a valid mail file
    return undef unless (($$textref =~ /From:/) || ($$textref =~ /To:/));

    # slightly more strict validity check, to prevent us from matching 
    # .so.x files ...
    return undef unless (($$textref =~ /^From /) ||
			 ($$textref =~ /^[-A-Za-z]{2,100}:/));

    print $outhandle "EMAILPlug: processing $file\n" 
	if $self->{'verbosity'} > 1;

    my $cursection = $doc_obj->get_top_section();

    #
    # Parse the document's text and extract metadata
    #

    # Separate header from body of message
    my $Headers = $$textref;
    $Headers =~ s/\n\n.*//s;
    $$textref = substr $$textref, (length $Headers);

    # Extract basic metadata from header
    my @headers = ("From", "To", "Subject", "Date");
    my %raw;
    foreach my $name (@headers) {
	$raw{$name} = "No $name value";
    }

    # Examine each line of the headers
    my ($line, $name, $value);
    my @parts;
    foreach $line (split(/\n/, $Headers)) {
	
	# Ignore lines with no content or which begin with whitespace
	next unless ($line =~ /:/);
	next if ($line =~ /^\s/);

	# Find out what metadata is on this line
	@parts = split(/:/, $line);
	$name = shift @parts;
	next unless $name;
	next unless ($raw{$name});

	# Find the value of that metadata
	$value = join(":", @parts);
	$value =~ s/^\s+//;
	$value =~ s/\s+$//;

	# Store the metadata
	$raw{$name} = $value;
    }

    # Process Date information
    if ($raw{"Date"} !~ /No Date/) {
	$raw{"DateText"} = $raw{"Date"};
	
	# Convert the date text to internal date format
	$value = $raw{"Date"};
	my ($day, $month, $year) = $value =~ /(\d?\d)\s([A-Z][a-z][a-z])\s(\d\d\d?\d?)/;
	if ($year < 100) { $year += 1900; }
	$raw{"Date"} = &sorttools::format_date($day, $month, $year);
	
    } else {
	# We have not extracted a date
	$raw{"DateText"} = "Unknown.";
	$raw{"Date"} = "19000000";
    }


    # Add extracted metadata to document object
    foreach my $name (keys %raw) {
	$value = $raw{$name};
	if ($value) {
	    $value = &text_into_html($value);
	} else {
	    $value = "No $name field";
	}
	$doc_obj->add_utf8_metadata ($cursection, $name, $value);
    }

    # Add "All headers" metadata
    $Headers = &text_into_html($Headers);
    $Headers = "No headers" unless ($Headers =~ /\w/);
    $doc_obj->add_utf8_metadata ($cursection, "Headers", $Headers);

    # Add text to document object
    $$textref = &text_into_html($$textref);
    $$textref = "No message" unless ($$textref =~ /\w/);
    $doc_obj->add_utf8_text($cursection, $$textref);

    return 1;
}


# Convert a text string into HTML.
#
# The HTML is going to be inserted into a GML file, so 
# we have to be careful not to use symbols like ">",
# which ocurs frequently in email messages (and use
# &gt instead.
#
# This function also turns links and email addresses into hyperlinks,
# and replaces carriage returns with <BR> tags (and multiple carriage
# returns with <P> tags).


sub text_into_html {
    my ($text) = @_;

    # Convert problem characters into HTML symbols
    $text =~ s/&/&amp;/go;
    $text =~ s/</&lt;/go;
    $text =~ s/>/&gt;/go;
    $text =~ s/\"/&quot;/go;

    # convert email addresses and URLs into links
    $text =~ s/([\w\d\.\-]+@[\w\d\.\-]+)/<a href=\"mailto:$1\">$1<\/a>/g;
    $text =~ s/(http:\/\/[\w\d\.\-]+[\/\w\d\.\-~]*)/<a href=\"$1\">$1<\/a>/g;

    # Clean up whitespace and convert \n charaters to <BR> or <P>
    $text =~ s/ +/ /go;
    $text =~ s/\s*$//o; 
    $text =~ s/^\s*//o; 
    $text =~ s/\n/\n<BR>/go;
    $text =~ s/<BR>\s*<BR>/<P>/go;

    return $text;
}


# Perl packages have to return true if they are run.
1;
