#!/usr/local/bin/perl
# Copyright 1993 by Cornell University and by Xerox Corporation

# This is the front end of a WAIS server
# begun by Miao-Jane Lin (mjlin@cs.cornell.edu) July 1993
# finished by Jim Davis (davis@dri.cornell.edu) Apr 1994
#  with much  help from Jonny Goldman.

# This code provides a WAIS front end which you can attach to any
# back-end information service you wish.  The back end should load
# this file and provide two functions, SearchDoc and readDocContent.
# After loading this file, call &main
# You should not need to modify this file at all, only the back end.

# The interface to SearchDoc is
#  ($databaseName, $seedWord, $maxDocs, *header, *diag)
# where $databaseName is a string
#       $seedWord is a string of words, with space as separator
#       $maxDocs is the maximum # of docs to return
#       *results is a pointer to an array to get results
#       *diag is a pointer to an array to hold diagnostic messages
# return status code: 1 means okay
# the elements of the results array are to be constructed with
#  make_WAIS_search_result 
#    ($docid, $score, $best_score, $size, $nlines, $type, $databaseName, $date, $name, $city)
#  docid is a number
#  score is a number between 0 and 1000
#  size in characters
#  nlines in lines
#  type should be blank for now
#  database name is a string,
#  date is a string in the form YYMMDD
#  name is the name of the document (the "headline")
#  city is a string.  (the location of the server)

# The interface to readDocContent is
#	($text, $errorcode) =
#           &readDocContent($databaseName, $docID, $startChunk, $endChunk);

# The interface to main is main($port);

# backend routine may also call &log (STRING) to write a string to the log file.

#
# if 1, fork a process to handle client.
$standalone = 1 unless defined $standalone;

# If logging is 1 then logging log messages are appended to log_file
$logging = 0 unless defined $logging;
$log_file = "/usr/tmp/SFgate.log" unless defined $log_file;

# if debugging is 1 then calls to dprint go to std output
$debugging = 0 unless defined $debugging;
$| = 1;			# force output

# Comments from Jim.

# Several things about this program trouble me but I don't understand
# them well.
#  1) Several routines return pointers (packed
# with p) to strings allocated in local storage.  I would expect that
# after the subroutine exits, the local storage would be released and
# the pointers would point to trash.  Yet it seems to work.  Maybe
# perl has a garbage collector? 
#  2)  Some routines call local inside a loop.  I think this just
#  wastes space, but does not make the program invalid.
#  3)  The code calls "die" if there's an error.  It should return an error
#  message to the client and log the error.  This is not too terrible, as long
#  as the main loop in the server does not die.  
#  4) the server does nothing with the diagnostics array passed to
#     the backend ocode.  
#  5) I suspect,  but don't know for sure, that this code is depending upon 
#  certain implementation conventions in the WAIS or Z39.50 protocol.  I notice
# lots of places where the code simply assumes that certain tagged
# items will arrive in a certain order or position.  But what's the
# purpose for tagging fields, if the order is fixed?  So I suspect
# that the protocol in general allows for a more flexible ordering of
# fields, but it just happens that the public domain implementation
# always follows a certain order, and this code depends on on that.

# Note: DocIDs not supported in relevance feedback.

# If no search matches, should it offer to return a catalog, or call
# some user-defined routine?

#-------------------------------------------------------------------
# definitions of PDU types: (from zutil.h)
#-------------------------------------------------------------------
$Init_PDUtype = 20;
$Search_PDUtype = 22;
$SearchResponse_PDUtype = 23;

#-------------------------------------------------------------------
# definitions of data tags: (from zutil.h, wprot.h)
#-------------------------------------------------------------------
$DT_ReferenceID = 2; 
$DT_ProtocolVersion = 3;
$DT_Option = 4;
$DT_PrefMsgSize = 5;
$DT_MaxMsgSize = 6;
$DT_IDAuthentication = 7;
$DT_ImplementationID = 8;
$DT_ImplementationName = 9;
$DT_ImplementationVersion = 10;
$DT_ResultSetName = 17;
$DT_DatabaseName = 18;
$DT_ElementSetName = 19;
$DT_QueryType = 20;
$DT_Query = 21;
$DT_ResultSetStatus = 26;
$DT_PresentStatus = 27;

# query terms
$DT_DatabaseDiagnosticRecord = 28;
$DT_AttributeList = 44;
$DT_Term = 45;
$DT_Operator = 46;

$DT_UserInfoLength = 99;
$DT_SeedWord = 106;
$DT_MaxDocumentsRetrieved = 114;
$DT_SeedWordUsed = 115;
$DT_DocumentID = 116;
$DT_VersionNumber = 117;
$DT_Score = 118;
$DT_BestMatch = 119;
$DT_DocumentLength = 120;
$DT_Source = 121;
$DT_Date = 122;
$DT_Headline = 123;
$DT_OriginCity = 124;
$DT_DocumentText = 127;
$DT_Lines = 131;
$DT_TYPE_BLOCK = 132;
$DT_TYPE = 133;

$DT_DocumentHeaderGroup = 150;
$DT_DocumentTextGroup = 153;
#-------------------------------------------------------------------
# definitions of query type (from wprot.h)
#-------------------------------------------------------------------
$BooleanQuery = "1";
$TextRetrievalQuery = $BooleanQuery;
$RelevanceFeedbackQuery = "3";

#-------------------------------------------------------------------
# definitions of term type (refered wprot.h)
#-------------------------------------------------------------------

#-------------------------------------------------------------------
# definitions of chunk type (refered wprot.h)
#-------------------------------------------------------------------

#-------------------------------------------------------------------
# definitions of data types:
#-------------------------------------------------------------------
# any: one long integer and string whose size is the preceding integer
$any_type = 'L1 a*';

#-------------------------------------------------------------------

#-------------------------------------------------------------------
# When the search engine matches a document it returns a "structure"
# in the following format:
# Numbers:Doctype:Source:Date:Headline:Origin City
# where Numbers is five packed long integers,
# docID, score, bestMatch, nchars, nlines
# doc type, source name, data, headline, and origin city are just strings.
# The original routine used packing type p (pointer) but that failed.
# note date is a six char string in the form YYMMDD

#-------------------------------------------------------------------
# diagnosticRecord contains surrogate(int),diagnostics & addinfo are string
#-------------------------------------------------------------------

#-------------------------------------------------------------------
# docObj contains docID(string), doc type(string), chunk code(long),
# start chunkcode(long) and end chunkcode(long)
#-------------------------------------------------------------------
#-------------------------------------------------------------------
# other definitions (refered wais source codes) 
#-------------------------------------------------------------------
$bitsPerByte = 8;
$maxBufSize = 29952; # just give some limit; refered from source code
$defaultID = "TMC";
$defaultName = "Ulrich Pfeifer <pfeifer@ls6.informatik.uni-dortmund.de";
$defaultVersion = $revision;
$diagnosticCodeSize = 3;
$endOfRecord = 29; # 1D in hexadecimal
$UNUSED = -1;
$HeaderVersion = 2;

# readDocContent error codes

#-------------------------------------------------------------------
# main program
#-------------------------------------------------------------------
# once the program is invoked, it keeps listening on port. 
# whenever a client talks to this port, it invokes a child
# process to serve the client. 

# Define some constants for internet hacking.
$AT_docID = "un";		# system control number encodes doc id
$AT_docType = "wt";
$AT_bytePosition = "wb";

# wais_header: 10 byte msg length, 1 byte msg type('z'= z39.50 APDU,  
# 'a'=ACK, 'n'=NACK), 1 byte msg header version(currently 2), 10 byte
# server name or address, 1 byte compression(<sp>=no compression, 
# 'u'= unix compress), 1 byte encoding(<sp>=no encoding, 'h'=hexize, 
# 'u'=uuencode), 1 byte checksum

sub Parse_WAIS_Header {
    local ($header) = @_;
    unpack ("a10 a a a10 a a a", $header);
}
    
sub writeWAISPacketHeader {
    local ($packet_len, $packet_type, $header_vers, $servertype, 
	   $compression, $encoding, $check_sum) = @_;
    sprintf ("%010d%s%s%010s%s%s%s",
	     $packet_len, $packet_type, $header_vers, $servertype, 
	     $compression, $encoding, $check_sum);
}




# from docid.h and docid.c

#typedef struct DocID{
#   any* originalServer;
#   any* originalDatabase;
#   any* originalLocalID;
#   any* distributorServer;
#   any* distributorDatabase;
#   any* distributorLocalID;
#   long copyrightDisposition;
#}

$DT_OriginalServer 	= 1;
$DT_OriginalDatabase 	= 2;
$DT_OriginalLocalID	= 3;
$DT_DistributorServer	= 4;
$DT_DistributorDatabase	= 5;
$DT_DistributorLocalID	= 6;
$DT_CopyrightDisposition = 7;


sub readDocAny {
    local ($buffer) = @_;
    local ($docid);
    local ($tag);

#    &ddump ("Parsing DocAny", $buffer);
    while ($buffer ne "") {
	$tag = &PeekTag ($buffer);
	if ($tag == $DT_OriginalServer ||
            $tag == $DT_OriginalLocalID ||
            $tag == $DT_OriginalDatabase ||
            $tag == $DT_DistributorServer ||
            $tag == $DT_DistributorDatabase ||
            $tag == $DT_DistributorLocalID ) {
	    $docid .= "$tag=".&readString (*buffer).";";} # save this too (up)
	elsif ($tag == $DT_CopyrightDisposition) {
	    $docid .= "$tag=%".sprintf("%02x",&readNumber (*buffer));}
	else {
	    die ("Got Tag type $tag inside a DocAny.\nStopped");}}
    $docid;
}


#-------------------------------------------------------------------
# PeekTag:
# argument: buf- where to read
# read a tag from buf without advancing the buf. return the tag.
#-------------------------------------------------------------------
sub PeekTag {
    local ($buf) = @_;
    local ($number, $byte, $offset);

    $number = 0;
    $offset = 0;

    do {			# get one byte from buf at first
	($byte) = unpack("C", substr ($buf,$offset++,1));
	$number <<= 7;
	$number += ($byte & 127); # 127 = 7F in hexadecimal   
    } until (($byte & 128) == 0); # until the most significant  
    # bit of byte equals to 0
    $number;
}

#-------------------------------------------------------------------
# readBinaryNumber:
# arguments: size- number of bytes to read from string
#            string- a binary stream.  modified
#-------------------------------------------------------------------
sub readBinaryNumber {
    local ($size, *string) = @_;
    local ($byte, $number);

    $number = 0;
    while ($size > 0) {   
        ($byte, $string) = unpack("C1 a*", $string);
        $number <<= $bitsPerByte; # shift left for 8 bits
        $number = $number + $byte;
        $size--;
    }
    $number;
}

# written high byte .... low byte
sub writeBinaryNumber {
    local ($number, $size, *string) = @_;
    local ($byte, $bin_num);
    $bin_num = "";
    while ($size > 0)
    { 
        $byte = $number & 255;	# FF in hexadecimal
        $byte = pack("C", $byte); # pack into binary
        $bin_num = $byte . $bin_num;
        $number >>= $bitsPerByte;
        $size--;
    }
    $string = $string . $bin_num;
}


#-------------------------------------------------------------------
# writtenCompressedBinIntSize:
# return the number of bytes which a decimal number will take when
# written into compressed binary integer. at most return 4 bytes.
# JRD: I think this name is a misnomer but I am not changing.  I
# think this computes the size of an UNCOMPRESSED binary number
#-------------------------------------------------------------------
sub writtenCompressedBinIntSize {
    local ($number) = @_;

    if ($number < 0) { return 4; }
    elsif ($number < 2**8) { return 1; }
    elsif ($number < 2**16) { return 2; }
    elsif ($number < 2**24) {return 3; }
    else 
    { die "Number $number is too big to be written as compressed binary integer.\n"; }
}


#-------------------------------------------------------------------
# readCompressedInteger:
# arguments:  buf- a binary stream.  must be passed as *buf
# take from buf byte by byte. for each byte, transfer the lower 7 bits
# into decimal. test the 8th bit, if it's 1 then go on to take one more
# byte from buf; if it's 0, stop.  buf is globally modified.
#-------------------------------------------------------------------
sub readCompressedInteger {
    local (*buf) = @_;
    local ($number, $byte);

    # this initialisation is just for tuning: most frequent case is
    # 0<=n<=127
    ($byte, $buf) = unpack("C1 a*", $buf);
    return($byte) if (($byte & 128)==0);
    $number = $byte&127;

    do {			# get one byte from buf at first
	($byte, $buf) = unpack("C1 a*", $buf);
	$number <<= 7;
	$number += ($byte & 127); # 127 = 7F in hexadecimal   
    } until (($byte & 128) == 0); # until the most significant  
    # bit of byte equals to 0
    $number;
}                        

#-------------------------------------------------------------------
# writeCompressedInteger:
# arguments: number- the decimal number to be transfered to compressed
#            buf- where the compressed goes
# write the number into the lower 7 bits at each byte. turn on the 8th
# bit of each byte if it is not the least byte. Else, the 8th bit is 0.
#-------------------------------------------------------------------
sub writeCompressedInteger {
    local ($number, *buf) = @_;
    local ($size, $byte, $bin_num, $count);
    $bin_num = "";
    $size = &writtenCompressedIntSize($number);

    for ($count=$size; $count>0; $count--) {
	$byte = $number & 127;	# 127 is 7F in hexadecimal
	$byte = pack("C", $byte); # pack into binary
	if ($count != $size)	# not the last byte of the number
	{  vec($byte, 7, 1) = 1;  } # turn on the most left bit 
	$bin_num = $byte . $bin_num;
	$number >>= 7;
    }

    $buf = $buf . $bin_num;     # store into buf
}

#-------------------------------------------------------------------
# writtenCompressedIntSize:
# return the number of bytes which a decimal number will take when
# written into compressed integer. at most return 4 bytes.
#-------------------------------------------------------------------

sub writtenCompressedIntSize {
    local ($number) = @_;

    if ($number < 2**7) { return 1; } 
    elsif ($number < 2**14) { return 2; }
    elsif ($number < 2**21) { return 3; }
    else 
    { die "Number is too big to be written as compressed integer.\n"; }
}


#-------------------------------------------------------------------
# readNumber:
# arguments: buf- where the number comes from
# take a binary stream from buf and translate it to a decimal number
#-------------------------------------------------------------------
sub readNumber {
    local (*buf) = @_;
    local ($size);

    &readCompressedInteger(*buf); # read the tag - ignore it
    $size = &readCompressedInteger(*buf); # read the size of number
    &readBinaryNumber($size, *buf);
}

#-------------------------------------------------------------------
# writeNumber:
# arguments: number- the number we want to write out
#            tag- the number's tag
#            buf- where the number goes
# if the number is not -1, translate it to binary and write to buf
#-------------------------------------------------------------------
sub writeNumber {
    local ($number, $tag, *buf) = @_;
    local ($size);

    $size = &writtenCompressedBinIntSize($number); 

    if ($number != $UNUSED) {
	&writeCompressedInteger($tag, *buf);
	&writeCompressedInteger($size, *buf);
	&writeBinaryNumber($number, $size, *buf);
    }
}

#-------------------------------------------------------------------
# makeBitMap:
# arguments: bin_string- a binary string
#            numbits- number of bits of the binary string
# fill bytes by a binary string. then pack the bytes to an any
# which we call bit map here. return the bit map.
#-------------------------------------------------------------------
sub makeBitMap {
    local ($numBits, $bin_string) = @_;
    local ($bit_map, $size, $bytes, $count);

    $size = 0;
    do {  $numBits = $numBits - $bitsPerByte; # to see how many
	  $size++;		# bytes it occupies
      } until $numBits <= 0;
    $bytes = "";		# initialize
    for ($count=$size*$bitsPerByte; $count>0; $count--)
    {
	if ($bin_string ne "") 
	{  vec($bytes, $count-1, 1) = substr($bin_string, 0, 1);
	   $bin_string = substr($bin_string, 1);   
       }
	else
	{  vec($bytes, $count-1, 1) = 0;  } # put 0 to left space of byte
    }

    $bit_map = pack($any_type, $size, $bytes);
    return $bit_map;
}

#-------------------------------------------------------------------
# readBitMap:
# arguments: buf- where to read
#            bit_map- an any 
# read a tag and a size from buf. take following size bytes of stream
# from buf. pack the size and bytes to an any which we call bit_map.
#-------------------------------------------------------------------
sub readBitMap {
    local(*bit_map, *buf) = @_;
    local($size, $bytes);

    &readCompressedInteger(*buf); # read the tag
    $size = &readCompressedInteger(*buf); 
    $bytes = substr($buf, 0, $size);
    $buf = substr($buf, $size);

    $bit_map = pack($any_type, $size, $bytes);
}

#-------------------------------------------------------------------
# writeBitMap:
# arguments: buf- where to write
#            bit_map- an any which we want to write out
#            tag- the tag of the bit_map
# if bit_map is not NULL, unpack it to a size and bytes. write out
# the tag, size and copy the bytes to buf.
#-------------------------------------------------------------------
sub writeBitMap {
    local ($bit_map, $tag, *buf) = @_;
    local ($size, $bytes);

    if (length($bit_map) > 0)
    {
	($size, $bytes) = unpack($any_type, $bit_map);

	&writeCompressedInteger($tag, *buf);
	&writeCompressedInteger($size, *buf);
	$buf = $buf . $bytes;
    }
}

  

#-------------------------------------------------------------------
# readString:
# arguments: buf- where to read from
#-------------------------------------------------------------------
sub readString {
    local(*buf) = @_;
    local($size, $bytes, $field);

    &readCompressedInteger(*buf); # tag ignored
    $size = &readCompressedInteger(*buf); 
    $field = substr($buf, 0, $size);
    $buf = substr($buf, $size);

    return $field;
}

#-------------------------------------------------------------------
# writeString:
# arguments: 
#           string- the string we want to write out
#           tag- the tag of the string
#           buf- where to write
# if the string is not NULL, write the tag, size of string and copy
# the string to buf.
#-------------------------------------------------------------------
sub writeString {
    local ($string, $tag, *buf) = @_;
    local ($size);

    if ($string ne "")  {
	$size = length($string);
	&writeCompressedInteger($tag, *buf);
	&writeCompressedInteger($size, *buf);
	$buf = $buf . $string;
    }
}
