/*
 * cache.c: mirror a document, putting it into this server's /mirror directory
 * 
 * Reinier Post (thanks to Rob McCool)
 *
 * $Log: cache.c,v $
 * Revision 0.14  1994/05/17  13:33:04  reinpost
 * major code cleanup, some functionality enhancements:
 * surpassing of cache under certain circumstances, even when translating
 * support for POST requests
 * maybe a few small bug fixes
 *
 *
 * Revision 0.12  1994/04/01  15:06:30  reinpost
 * translate the Location: header for redirections
 *
 * Revision 0.11  1994/03/25  20:40:57  reinpost
 * flush non-regular documents (errors, redirections) directly to stdout
 * this is done in an ugly way - code could possibly use some restructuring
 * to do this, the script had to be turned into a nph- script
 *
 * Revision 0.10  1994/03/11  13:33:55  reinpost
 * fix for #name anchors
 * a slightly different interface to the config.h routines
 * correction in names supplied to DIE_DIR error
 * send_fd() is now avoided, using direct read() / write()
 *
 * Revision 0.9  1994/03/02  21:21:38  reinpost
 * some trivial modifications
 *
 * Revision 0.8  1994/02/25  20:16:33  reinpost
 * proxy support added
 * some small changes in error messages
 * testing for existence of directories (not for all)
 * modified interface to read_config() and read_parameters()
 * some small bug fixes
 *
 * Revision 0.7  1994/02/18  13:17:52  reinpost
 * some compilers don't accept multi-line strings
 *
 * Revision 0.6  1994/02/17  21:23:01  reinpost
 * one interesting change: check cached files for being out of date
 *
 * Revision 0.5  1994/02/17  10:18:33  reinpost
 * new mime.c interface
 * some renamings of error messages etc.
 *
 * Revision 0.4  1994/02/10  10:48:29  reinpost
 * host string is now terminated with \0
 * readn() instead of read() eliminates scripts returning empty documents problem
 * some error messages were modified
 * parameters are now being read in config.c, by read_parameters()
 * and put into global variables
 *
 * log_if_debug is now used
 * stdout is explicitly closed, which again seems to eliminate some problems
 *
 * Revision 0.3  1994/02/01  19:38:04  reinpost
 * reading conf file now happens in config.c
 * a separate getenv() that always returns a string
 * a slightly different interface to filenames: url_to_file()
 * query results are now cached correctly, and optionally
 *
 * Revision 0.2  1994/01/25  14:51:57  reinpost
 * moved error routine die() to new error.c;
 * provided logging facilities (stolen from NCSA httpd 1.0 server code)
 * corrected a bug in request parsing (superfluous / in some occasions)
 * the query string is now appended to the filename used in cache (was a bug)
 * (no suppression of query caching yet)
 *
 * Revision 0.1  1994/01/20  12:07:24  reinpost
 * version which is pretty stable, but it doesn't use a database to
 * get and store the files - plain filenames all in a single directory
 * this is very undesirable because filenames tend to grow very long this way
 * and we need to collect extra info for files anyway
 */

/* RCS identification string for this revision of cache.c */
static char rcsid[] =
  "$Id: cache.c,v 0.14 1994/05/17 13:33:04 reinpost Exp reinpost $";

/*
 * mechanism: suppose you are interested in a document with URL
 *   http://www.win.tue.nl/examples.html
 * and you want to cache it and its descendants on a caching server running on
 * machine wsinis10, port 4322 (which is my current setup).  Then open the URL
 * http://wsinis10:4322/mirror/http:%2f%2fwww.win.tue.nl%2fexamples.html
 * and the document will be served to you from the cache, having been fetched
 * from the original if it wasn't already there, and its links having been
 * translated in the same way, so they'll point back to the cache again.
 * Every document you then access will be cached.  We strongly recommend
 * to use this with Paul De Bra's fish search facility, because searching
 * otherwise is a constant harassment of the Web (and much slower, too).
 * No cache refreshment mechanism has been designed yet, and the software
 * still contains some bugs and problems (e.g. headers are not passed through).
 * there is one special command: if the escaped URL is prefixed with
 * REFRESH_PREFIX, the document will be refreshed in cache.  We can prepend a
 * header to cached HTML documents in which this option is used.
 *
 * However, prepending a header with extra links has some disadvantages, and
 * making the refresh request part of the URL has as a disadvantage that
 * subsequent Reloads on the result will again refresh the cache.
 * I know of no way (except Redirects) to change the URL on a returned document.
 */

/* the #include stuff is not too portable */

#include <unistd.h>
#include <string.h>
#include <stdio.h>
#include <fcntl.h>
#include <sys/types.h>
#include <sys/stat.h>
#include <sys/file.h>
#include <errno.h>
#include <sys/socket.h>
#include <signal.h>
#include <ctype.h>

#include "system.h"	/* system software platform dependencies */

#include "network.h"	/* from ~devet/WWW/Teletekst/wwwgate */

#include "mime.h"	/* to use mime.c (Rob McCool's http_mime.c) */
#include "util.h"	/* basic utilities, again from Rob McCool */

#include "constants.h"	/* basic constants */
#include "html.h"	/* HTML parsing and translation */
#include "database.h"	/* stores info and contents of documents */
#include "error.h"	/* fail printing an HTML error message */
#include "log.h"	/* print stuff to logfiles */
#include "config.h"	/* compile time and runtime configuration */
#include "cleanup.h"	/* cache cleanout */

static void parse_url(char *url, char *query_string,
  char *host, char *port, char *request)
/*
 * Splits the absolute URL given in url (plus query_string) into host, port
 * and request.
 *
 *   url           must start with "http://" (compared case-insensitively);
 *                 otherwise die(DIE_NO_HTTP_URL,url) is called
 *   query_string  if non-empty, appended to the request after a '?'
 *   host          receives the text between "http://" and the first ':',
 *                 '/' or end of string
 *   port          receives the text between ':' and the next '/' (or end of
 *                 string); "80" if the URL names no port
 *   request       receives the rest of the URL from the '/' on, or "/" if
 *                 the URL has no path; never more than HUGE_STRING_LEN
 *                 bytes, terminator included
 *
 * host and port are not bounds-checked here: the caller must pass buffers
 * that can hold the corresponding part of url.
 */
{
  /* this is an ad-hoc parser, to be replaced with a better one */

  char *host_start;
  char *s;
  size_t len;

  if (strncasecmp(url,"http://",7))
  {
    /* it is not a http type URL */
    die(DIE_NO_HTTP_URL,url);
    /* we ought to print a REDIRECT here */
  }

  /* get the host name; stop at the end of the string as well, so that */
  /* a URL without a path ("http://host") doesn't make us run off the end */
  host_start = &url[7];
  s = host_start;
  while (*s != '\0' && *s != ':' && *s != '/') ++s;
  len = (size_t)(s - host_start);
  memcpy(host,host_start,len);
  host[len] = '\0';  /* terminate host string */

  if (*s == ':')
  {
    /* there is a port, too */
    char *port_start = ++s;
    while (*s != '\0' && *s != '/') ++s;
    len = (size_t)(s - port_start);
    memcpy(port,port_start,len);
    port[len] = '\0';
  }
  else
  {
    /* the default port */
    strcpy(port,"80");
  }

  /* s is now on the slash that starts the path, or on the terminator */

  /* set the http request; an absent path means the root document */
  request[0] = '\0';
  strncat(request,(*s ? s : "/"),HUGE_STRING_LEN-1);
  if (*query_string)
  {
    /* we have a query; append it only if '?' still fits, and truncate */
    /* it to the same limit the unguarded code used */
    len = strlen(request);
    if (len + 3 <= (size_t)HUGE_STRING_LEN)
    {
      strcat(request,"?");
      strncat(request,query_string,(size_t)HUGE_STRING_LEN-len-3);
    }
  }
}

static int opened_socket(char *host, char *port)
/*
 * Opens a connection to host, port through open_connection() and returns
 * the descriptor it yields.  A negative result is logged with log_error()
 * and then reported through die(DIE_BADSOCKET,...).
 */
{
  int sock = open_connection(host,port);

  if (sock < 0)
  {
    /* error making connection */
    /* host and port may each be as long as the buffer itself, so the */
    /* messages are formatted with an explicit bound (and truncated if */
    /* necessary) instead of overflowing err */
    char err[MAX_STRING_LEN+1];

    snprintf(err,sizeof err,
        "open_connection() to host %s, port %s failed, rc = %d",
        host,port,sock);
    log_error(err);
    snprintf(err,sizeof err,
        "could not open connection to host %s, port %s",host,port);
    die(DIE_BADSOCKET,err);
  }
  return(sock);
}

static int opened_file(char *file, char mode)
/*
 * Opens file and returns its descriptor.
 *   mode 'R': open read-only; die(DIE_RDCACHE,file) if open() fails
 *   mode 'W': create/truncate for synchronous writing with permissions 0644;
 *             die(DIE_WRCACHE,file) if open() fails
 *   any other mode is reported through die(DIE_BUG,...)
 */
{
  int fd;

  if (mode == 'R')
  {
    fd = open(file,O_RDONLY, 0600);
    if (fd == -1)
    {
      /* we've already established that it exists */
      die(DIE_RDCACHE,file);
    }
  }
  else if (mode == 'W')
  {
    fd = open(file,O_WRONLY | O_SYNC | O_CREAT | O_TRUNC, 0644);
    if (fd == -1)
    {
      /* error creating diskfile */
      die(DIE_WRCACHE,file);
    }
  }
  else
  {
    die(DIE_BUG,"trying to open a file with unknown mode");
  }

  return fd;
}

static void close_fd(int fd)
/* closes the file descriptor fd; the result of close() is not checked */
{
  (void) close(fd);
}

static void die_on_config_error()
/*
 * Inspects current_badness(): any negative value causes the collected
 * warnings to be logged; the value -2 is treated as fatal and reported
 * through die(DIE_CONFIG,...).
 */
{
  int badness = current_badness();

  if (badness >= 0)
  {
    /* nothing to report */
    return;
  }

  log_warnings();

  if (badness == -2)  /* fatal */
  {
    die(DIE_CONFIG,"an argument or configuration value is fatally incorrect");
  }
}

static void die_if_has_no_dir(char *file)
/*
 * Checks that the directory part of file (everything before its last '/')
 * exists and is a directory; calls die(DIE_DIR,...) otherwise.
 * A name without any '/' is checked against ".", a name whose only '/' is
 * the leading one is checked against "/".
 * We don't try to create the directory.
 * Untested for relative filenames.
 */
{
  char dir[MAX_STRING_LEN+1];
  struct stat finfo;
  int on_slash;

  if (strlen(file) > (size_t)MAX_STRING_LEN)
  {
    /* the name doesn't fit our buffer; refuse it rather than overflow */
    die(DIE_DIR,file);
    return;  /* in case die() returns */
  }

  strcpy(dir,file);
  on_slash = rind(dir,'/');  /* presumably index of the last '/', -1 if none */
  if (on_slash == -1)
  {
    /* relative filename; try . */
    strcpy(dir,".");
  }
  else if (on_slash == 0)
  {
    /* file lives directly under the root; cutting at the slash would */
    /* leave an empty name, which stat() can never find */
    strcpy(dir,"/");
  }
  else
  {
    dir[on_slash] = '\0';
  }

  if ((stat(dir,&finfo) == -1) || !S_ISDIR(finfo.st_mode))
  /* don't test whether we can read/write it */
  {
      die(DIE_DIR,dir);
  }
}

void send_document_request(int fd, char *request, int request_has_input)
/*
 * Sends the HTTP request given in request to file descriptor fd through
 * raw_send_http_request(); dies on failure.
 *
 * If request_has_input is nonzero, env_content_length bytes are first read
 * from descriptor 0, followed by two LF characters, and passed along as the
 * request input; stdin is closed afterwards.  Otherwise the input is empty.
 *
 * NOTE(review): when the content length is zero or invalid the function
 * returns without sending anything at all; this is kept as it was, but the
 * caller goes on to read a reply - confirm that this is intended.
 */
{
  char *body = "";     /* what is passed along as the request input */
  char *input = NULL;  /* heap buffer backing body, if there is any input */

  if (request_has_input)
  {
    /* assume the input is still waiting on stdin */
    int cl = atoi(env_content_length);
    int nb;

    if (cl <= 0)
    {
       /* no input or invalid content length; forget about it */
       return;
    }

    /* room for the input itself, the two terminating LFs and the '\0' */
    /* (cl+1 was two bytes short for what the sprintf below writes) */
    input = (char *)malloc((size_t)cl+3);
    if (!input)
    {
       die(DIE_RDIN,"couldn't allocate buffer memory");
    }

    /* nb = fread(input,1,cl,stdin); */
    nb = readn(0,input,cl);
    if (nb < cl)
    {
       die(DIE_RDIN,"request input could not be read, or some input was left");
    }
    sprintf(&input[nb],"%c%c",LF,LF);  /* terminating empty line */
        /* we really wish to use this on input, too */
    fclose(stdin);
    body = input;
  }

  if (raw_send_http_request(request,body,fd) == -1)
  {
     log_error("writing HTTP request to remote site failed");
     die(DIE_BADSOCKET,"couldn't send the document request");
  }

  /* the request has been handed over; release our copy of the input */
  free(input);
}

static void send_document_header(int fd)
/*
 * hands fd to raw_send_http_header() to emit the HTTP document header
 * (MIME headers included); a return value of -1 is reported through
 * die(DIE_WROUT,...)
 */
{
  int rc = raw_send_http_header(fd);

  if (rc == -1)
  {
    /* write error */
    die(DIE_WROUT,"trying to write the document headers");
  }
}

static void get_document_header(int fd)
/*
 * hands fd (which must be open for reading) to raw_get_http_header() to
 * fetch the HTTP document header, MIME header included; a return value
 * of -1 is reported through die(DIE_BADSOCKET,...)
 */
{
  int rc = raw_get_http_header(fd);

  if (rc == -1)
  {
    /* may be I/O or just the fact that no HTTP header was found */
    die(DIE_BADSOCKET,"couldn't receive the document header");
  }
}

static void send_caching_header(char *url)
/*
 * Prints an HTML preamble to stdout telling the reader that the document is
 * a cached copy of url, with links to refresh the copy (cache_prefix +
 * REFRESH_PREFIX + escaped URL) and to leave the cache (the plain url).
 * If no memory is available for the escaped URL nothing is printed.
 *
 * out of date - revise before use
 */
{
  /*
   * escape_url_with_slashes() rewrites the copy in place; presumably it
   * percent-escapes characters, which can grow each one to three (TODO
   * confirm against its definition).  Reserve for that, and keep the 100
   * bytes of headroom that used to be the only allowance.
   */
  size_t url_len = strlen(url);
  char *escaped_url = (char *)malloc(3*url_len+100);

  if (!escaped_url)
  {
    /* the preamble is decoration only; skip it rather than crash */
    return;
  }

  strcpy(escaped_url,url);
  escape_url_with_slashes(escaped_url);
  fprintf(stdout,"<TITLE>blaaah</TITLE><H2>Cached document</H2>");
  fprintf(stdout,"This a copy of the document with URL <B>%s</B>",url);
  fprintf(stdout,"from the caching space <B>%s</B>.<P>",cache_prefix);
  fprintf(stdout,"You can <A HREF=\"%s%s%s\">refresh</A> this copy,",
    cache_prefix,REFRESH_PREFIX,escaped_url);
  fprintf(stdout,"or <A HREF=\"%s\">quit caching</A> this document",url);
  fprintf(stdout,"and its descendants.\n<P><HL>");
  free(escaped_url);
}

static void translate_urls_in_headers(char *abs_url, int transl_esc)
/*
 * If get_location() yields a non-empty location, replaces it with its
 * translation as produced by translate_url(), using set_location().
 *   abs_url     not used at present
 *   transl_esc  passed through to translate_url()
 */
{
  char location[HUGE_STRING_LEN+1];
  char location_tr[HUGE_STRING_LEN+1];

  get_location(location);
  if (!location[0])
  {
    /* no location to translate */
    return;
  }

  /* must be an absolute URL, we assume */
  translate_url(location_tr,location,"",transl_esc);
  set_location(location_tr);
#if DEBUG
  {
    /* both strings plus the fixed text of the message; bounded anyway */
    char msg[2*HUGE_STRING_LEN+32];
    snprintf(msg,sizeof msg,
        "translating location %s to %s",location,location_tr);
    log_if_debug(msg,"");
  }
#endif /* DEBUG */
}

static int fd_size(int fd)
/*
 * returns the st_size that fstat() reports for the open descriptor fd;
 * a failing fstat() or a negative size is reported through
 * die(DIE_RDIN,...)
 */
{
  struct stat st;

  if (fstat(fd,&st) != 0 || st.st_size < 0)
  {
    /* error getting info on diskfile - at least in the context it's used now */
    die(DIE_RDIN,"trouble getting file size");
  }

  return st.st_size;
}

static void translate_html(int infd, int outfd, int in_size,
    char *bare_url, int translate_escaped)
/*
 * Translates the HTML tags in the document read off infd and writes the
 * result, preceded by the HTTP header, to outfd.
 *
 *   infd, outfd        must be open for reading and writing, respectively;
 *                      infd = outfd is allowed
 *   in_size            upper bound on the number of bytes read from infd;
 *                      anything beyond it is not read
 *   bare_url           an absolute URL without query string or relative
 *                      anchor, relative to which relative links are parsed
 *   translate_escaped  passed through to the translation routines
 *
 * BEWARE:
 * assumes the HTTP header of infd was already read and is present in MIME
 * info; does write the HTTP header to outfd though.
 *
 * Dies with DIE_CONV if memory cannot be allocated, the document cannot be
 * read or written, or rel_to_abs() returns 2.
 */

/* doesn't clean up the cache file if anything goes wrong */
/* doesn't treat the headers separately, either */
{
  char *in_buf;
  char *out_buf;
  int in_bufsize;
  int out_bufsize;

  char *in;
  char *out;

  int rc;
  int nb;
  size_t out_len;
  size_t written;

  if (in_size < 0)
  {
    die(DIE_CONV,"couldn't read the document");
  }

  in_bufsize = in_size+1;  /* one extra byte for the terminating '\0' */
  in_buf = (char *)malloc(in_bufsize);
  out_bufsize = (in_bufsize < 10000 ? 20000 : 2*in_bufsize); /* not foolproof */
  out_buf = (char *)malloc(out_bufsize);
  if (!in_buf || !out_buf)
  {
    die(DIE_CONV,"couldn't allocate buffer memory");
  }

  in = in_buf;
  out = out_buf;

  /* read at most in_size bytes, so that the '\0' stored below always */
  /* lands inside the buffer, even when the document fills it completely */
  nb = readn(infd,in_buf,in_size);  /* maybe read() */
  if (nb <= 0)
  {
    die(DIE_CONV,"couldn't read the document");
  }
#if ST_JUTTEMIS
  {
    char err[MAX_STRING_LEN+1];
    sprintf(err,"%d of %d bytes were read",nb,in_size);
    log_if_debug("translate:",err);
  }
#endif /* ST_JUTTEMIS */
  in[nb] = '\0'; /* who knows, maybe not null-terminated yet */

  translate_urls_in_headers(bare_url,translate_escaped);

  if ((rc =
    rel_to_abs(&out,&in,bare_url,out_bufsize,translate_escaped)))
  {
    if (rc == 2)
      die(DIE_CONV,"the link translation process failed");
    /* else rc == 1, an unfinished tag; leave it */
  }
  /* terminate the output if ending halfway a tag */
  *out = '\0';

  send_document_header(outfd);

  /* write() may accept less than it was given; keep going until all is out */
  out_len = strlen(out_buf);
  written = 0;
  while (written < out_len)
  {
    nb = write(outfd,out_buf+written,out_len-written);
    if (nb <= 0)
    {
      if (nb < 0)
      {
        die(DIE_CONV,"couldn't write the document");
      }
      break;  /* no progress; don't spin */
    }
    written += (size_t)nb;
  }
#if ST_JUTTEMIS
  {
    char err[MAX_STRING_LEN+1];
    sprintf(err,"%d of %d bytes were written",(int)written,(int)out_len);
    log_if_debug("translate:",err);
  }
#endif /* ST_JUTTEMIS */
  free(in_buf);
  free(out_buf);
}

/* forward */ static void dph_transfer_document(int infd, int outfd);

static void transfer_document(int infd, int outfd)
/*
 * copies a complete document from infd to outfd, both of which must be open:
 * first the header is parsed off infd, then dph_transfer_document() sends
 * the header and the body on to outfd
 * neither descriptor is closed; dies on failure
 */
{
  /* parse the header first ... */
  get_document_header(infd);

  /* ... then pass on header and body */
  dph_transfer_document(infd,outfd);
}

static void dph_transfer_document(int infd, int outfd)
/*
 * like transfer_document(), except it *d*oesn't *p*arse the *h*eader from
 * infd, assuming this has already taken place:
 * reads the first block of the document from infd, sends the document
 * header to outfd, then copies the document blockwise from infd to outfd
 * doesn't close infd or outfd
 * dies with DIE_RDIN if the first read yields nothing or a later read
 * fails, and with DIE_WROUT if a block cannot be written
 */
{
  int nb;
  char buf[BIG_BUF_LEN];

  buf[BIG_BUF_LEN-1] = '\0';  /* never to be overwritten by reads */

  nb = readn(infd,buf,BIG_BUF_LEN-1);  /* not read() */
  if (nb <= 0)  /* which is now impossible */
  {
    die(DIE_RDIN,"begin of document could not be read");
  }

  send_document_header(outfd);

  /* read the rest of the document and write it to fd */
  /* blockwise copy from input to output */

  while (nb > 0)
  {
    /* write() may accept only part of the block; repeat for the rest */
    int done = 0;

    while (done < nb)
    {
      int w = write(outfd,buf+done,nb-done);
      if (w <= 0)
      {
        die(DIE_WROUT,"document could not be written to cache or to output");
        break;  /* in case die() returns */
      }
      done += w;
    }
#if ST_JUTTEMIS
    {
      /* the tail of the block; all of it if it has fewer than 10 bytes */
      char last[11];
      int n = (nb < 10 ? nb : 10);
      memcpy(last,&buf[nb-n],n);
      last[n] = '\0';
      log_if_debug("last 10 characters written:",last);
    }
#endif /* ST_JUTTEMIS */

    nb = readn(infd,buf,BIG_BUF_LEN - 1);
  }

  if (nb < 0)
  {
    die(DIE_RDIN,"a part of the document could not be read");
  }
}

static int method_is_cached(char *request_method)
/*
 * tells whether results of requests with this method are cached at all:
 * returns 1 if request_method is exactly "GET", 0 for anything else
 */
{
  int is_get = (strcmp(request_method,"GET") == 0);

  /* only GET requests, at present */
  return is_get;
}

static void log_cache_request(char *translated, char *new)
/*
 * Logs the request through log_msg().
 *   translated  says whether it's a translated document
 *   new         says whether it is old, new, or not at all in the cache
 * The message reads "<translated>, <new> in cache:"; it is logged together
 * with the global env_url, to which "?[...]" is appended when the global
 * env_query_string is not empty.
 * Both strings are cut off if they don't fit their buffers.
 */
{
  char msg[HUGE_STRING_LEN+1];
  char short_url[HUGE_STRING_LEN+1];

  /* bounded: env_url plus the suffix may be longer than the buffer */
  snprintf(msg,sizeof msg,"%s, %s in cache:", translated,new);
  snprintf(short_url,sizeof short_url,"%s%s",
      env_url, *env_query_string ? "?[...]" : "");
  log_msg(msg,short_url);
}

/* the main routine: read parameters from environment variables,        */
/* find the file, get it into cache if not already there                */

int main(int argc, char *argv[])
/*
 * Entry point of the cache script.
 *
 * Reads the configuration and the request parameters, checks them, works
 * out the cache filenames (untranslated and translated) and the remote
 * source of the requested URL, and then serves the document on descriptor 1
 * in one of three ways:
 *   1. the translated file is up to date in cache: serve it;
 *   2. the untranslated file is up to date in cache: serve it, translating
 *      it first (and storing the translation) if it is HTML and we are not
 *      acting as a proxy;
 *   3. otherwise: fetch the document from the remote host; irregular
 *      documents and documents that are not to be cached are passed on
 *      directly, all others are stored in cache first and then served as
 *      in 2.
 * 1 and 2 only apply to cached methods and when no refresh was requested.
 * argc and argv are not used.  Exits with status 0; every failure is
 * reported through die().
 */
{
  char file[MAX_STRING_LEN+1];
  char file_tr[MAX_STRING_LEN+1];
  char host[MAX_STRING_LEN+1];
  char port[MAX_STRING_LEN+1];
  char request[HUGE_STRING_LEN];
  char *remote_url;
  int request_has_input;        /* does the HTTP request take input? */

  read_config();                /* configuration variables, from config file */
  read_parameters();            /* from environment variables */
  finish_global_variables();    /* adjust values; derive implicit ones */

  init_logging();
  init_mime();
  set_mime_headers_from_cgi_env();  /* we can't get the headers ourselves */

  die_on_config_error();
  dump_global_vars_if_debug();  /* none are modified beyond this point */

  /* check other parameter values, puke in case of problems */

  if (!strcmp(env_request_method,"GET"))
  {
    request_has_input = 0;
  }
  else if (!strcmp(env_request_method,"POST"))
  {
    request_has_input = 1;
  }
  else
  {
    /* I only handle GET or POST requests */
    die(DIE_WRONG_REQUEST,env_request_method);
  }

  if (strcmp(env_gateway_interface,"CGI/1.0")
   && strcmp(env_gateway_interface,"CGI/1.1"))
  /* I am a CGI 1.0 or 1.1 script and the server must be aware of that */
    die(DIE_WRONG_INTERFACE,env_gateway_interface);

  die_if_has_no_dir(transfer_logfile);
  die_if_has_no_dir(error_logfile);
  /* more? */

  /* get the filenames under which the document is stored in cache */

  url_to_file(file,env_url_with_query);
  url_to_file_tr(file_tr,env_url_with_query);

  die_if_has_no_dir(file);
  die_if_has_no_dir(file_tr);

  remote_url = url_to_remote_source(env_url);
  /* this is the real URL according to expire.conf's d directive */
  log_if_debug("URL to remote source:",remote_url);

  parse_url(remote_url,env_query_string,host,port,request);

  /* serve the document; if necessary, get, cache and/or translate it first */

  if ((cached_file_status(file_tr,CF_REFRESH) == CF_NEW)
        && method_is_cached(env_request_method) && !env_refresh)
  {
    /* the translated file is up to date in cache; serve it */
    int trfile;

    log_cache_request("translated","already");

    trfile = opened_file(file_tr,'R');
    transfer_document(trfile,1);
    close_fd(trfile);
  }
  else if ((cached_file_status(file,CF_REFRESH) == CF_NEW)
        && method_is_cached(env_request_method) && !env_refresh)
  {
    /* the untranslated file is up to date in cache; serve it */
    int diskfile;

    diskfile = opened_file(file,'R');

    get_document_header(diskfile);
    if (!env_its_proxy && is_content_type("text/html"))
    {
      /* we need to translate it before serving */
      int trfile;

      log_cache_request("translated","already");

      trfile = opened_file(file_tr,'W');

      translate_html(diskfile,trfile,fd_size(diskfile),
        env_url, translate_escaped);

      close_fd(diskfile);
      close_fd(trfile);

      /* serve the translation we have just stored */
      trfile = opened_file(file_tr,'R');
      transfer_document(trfile,1);
      close_fd(trfile);
    }
    else
    {
      log_cache_request("untranslated","already");

      /* the header has been parsed above already */
      dph_transfer_document(diskfile,1);
      close_fd(diskfile);
    }
  }
  else
  {
    /* document not to be cached, or not (up to date) in cache */
    int sock;

    sock = opened_socket(host,port);
    send_document_request(sock,request,request_has_input);
    get_document_header(sock);
    if (!is_regular_document_header())
    {
      /* don't cache irregular documents */
      log_cache_request("not a regular document","not stored");

      if (!env_its_proxy)
      {
        /* don't translate them, either; but do translate their headers */
        translate_urls_in_headers(env_url,translate_escaped);
      }
      dph_transfer_document(sock,1);
      close_fd(sock);
    }
    else if (!url_is_cached(env_url,env_query_string)
        || !method_is_cached(env_request_method))
    {
      /* don't cache this document */
      if (!env_its_proxy && is_content_type("text/html"))
      {
        log_cache_request("translated","not stored");
#define BIG_GUESS 100000
/* assuming no HTML doc is ever larger than 100K ... */
        translate_html(sock,1,BIG_GUESS,env_url, translate_escaped);

        close_fd(sock);
      }
      else
      {
        log_cache_request("untranslated","not stored");

        dph_transfer_document(sock,1);
        close_fd(sock);
      }
    }
    else
    {
      /* we need to cache this document */
      int diskfile;
      int was_already_there = (cached_file_status(file,CF_REFRESH) == CF_OLD);

      diskfile = opened_file(file,'W');
      dph_transfer_document(sock,diskfile);
      close_fd(sock);
      /* done writing: close the cache file before it is reopened for */
      /* reading, so the write descriptor isn't leaked */
      close_fd(diskfile);

      diskfile = opened_file(file,'R');
      get_document_header(diskfile);

      if (!env_its_proxy && is_content_type("text/html"))
      {
        int trfile;
        log_cache_request("translated",
            was_already_there ? "refreshed" : "new");

        trfile = opened_file(file_tr,'W');

        translate_html(diskfile,trfile,fd_size(diskfile),
          env_url, translate_escaped);
        close_fd(diskfile);
        close_fd(trfile);

        /* serve the translation we have just stored */
        trfile = opened_file(file_tr,'R');
        transfer_document(trfile,1);
        close_fd(trfile);
      }
      else
      {
        log_cache_request("untranslated",
          was_already_there ? "refreshed" : "new");

        dph_transfer_document(diskfile,1);
        close_fd(diskfile);
      }
    }
  }

  close(1);  /* not fclose(stdout); only unbuffered writes have been issued */

  quit_database();
  log_if_debug("exiting cache script for URL",env_url_with_query);
  quit_logging();

  exit(0);
}
