///////
   //    Scheduler.h
   //    Scheduler Class declaration
   //
   //    Class for managing the crawling process
   //
   //    Part of the ht://Check package
   //
   //    Copyright (c) 1999-2004 Comune di Prato - Prato - Italy
   //    Some Portions Copyright (c) 1995-2000 The ht://Dig Group <www.htdig.org>
   //    Some Portions Copyright (c) 2008 Devise.IT srl <http://www.devise.it/>
   //    Author: Gabriele Bartolini - Prato - Italy <angusgb@users.sourceforge.net>
   //
   //    For copyright details, see the file COPYING in your distribution
   //    or the GNU General Public License version 2 or later 
   //    <http://www.gnu.org/copyleft/gpl.html>
   //
   //    $Id: Scheduler.h,v 1.31 2008-11-16 18:28:51 angusgb Exp $
   //
   //    G.Bartolini
   //    started: 13.09.1999
///////

#ifndef _SCHEDULER_H
#define _SCHEDULER_H

#include <Object.h>

#ifdef HAVE_STD
#include <iostream>
#include <string>
#include <map>
#include <set>
#ifdef HAVE_NAMESPACES
using namespace std;
#endif
#else
#include <iostream.h>
#include <string.h>
#include <map.h>
#include <set.h>
#endif /* HAVE_STD */

#include <HtmysqlDB.h>
#include <HtDateTime.h>
#include <Configuration.h>
#include <HtRegex.h>
#include <Transport.h>
#include <HtHTTPBasic.h>
#include "SchedulerEntry.h"
#include "_Url.h"
#include "_Server.h"
#include "RunInfo.h"

class HtmlParser;

class Scheduler : public Object
{

   // Declaring friend classes (parser classes)
   friend class HtmlParser;

    // Dictionary type (using std::map) for Servers
    typedef std::map <std::string, _Server*> ServersDictionary;

    // String set
    typedef std::set <std::string> StringSet;

   public:
   
      // Construction / Destruction
      Scheduler();
      virtual ~Scheduler();

      enum Scheduler_Query_Type
      {
         Scheduler_Stored,
         Scheduler_Direct
      };

      enum Scheduler_Codes
      {
         Scheduler_OK,
         Scheduler_MemoryError,
         Scheduler_DBError,
         Scheduler_Interrupted
      };

      enum Scheduler_URL_Validation
      {
         Scheduler_URL_Valid,
         Scheduler_URL_MaxHopCount,
         Scheduler_URL_Excludes,
         Scheduler_URL_BadQueryString,
         Scheduler_URL_BadExtension,
         Scheduler_URL_NotValidExtension,
         Scheduler_URL_OutOfLimits,
         Scheduler_URL_FileProtocol,
         Scheduler_URL_EMail,
         Scheduler_URL_Javascript,
         Scheduler_URL_NotValidService,
         Scheduler_URL_Malformed,
	 Scheduler_URL_MaxUrlsCount
      };

///////
   //    Database Selection
///////

      Scheduler_Codes SelectDatabase(const std::string &name);


///////
   //    Restoring info from Database
///////

      Scheduler_Codes RestoreDatabase();


///////
   //    Set-up and Initialization
///////

      void SetOptions(Configuration &config);   // Set the options
      int Initial(const std::string &list);          // Initialize the URL list
      void SetArgumentList(int *_argc, char *** _argv)
         { argc = _argc; argv = _argv;}

///////
   //    Running process
///////

      Scheduler_Codes Run();
      void Stop() { stop = 1;}


///////
   //    Show info at the end of the crawl
///////

      Scheduler_Codes ShowStatusCode(ostream &output = std::cout);
      Scheduler_Codes ShowBrokenLinks(ostream &output = std::cout);
      Scheduler_Codes ShowAnchorNotFound(ostream &output = std::cout);
      //Scheduler_Codes ShowContentTypes(ostream &output = std::cout);
      Scheduler_Codes ShowContentTypesPerServer(ostream &output = std::cout);

      
///////
   //    Public access to protected attributes
///////

      Scheduler_URL_Validation IsAValidURL(const SchedulerEntry &);

      HtmysqlDB *GetDB() { return DB; }
      const HtDateTime *GetStartTime() { return &runinfo.StartTime; }
      const HtDateTime *GetFinishTime() { return &runinfo.FinishTime; }
      const int GetRunningTime() {
         return HtDateTime::GetDiff(runinfo.FinishTime, runinfo.StartTime); }
      void SetFinishTime() { runinfo.FinishTime.SettoNow(); }
      
      void SetDebugLevel(int d) { debug = d; }
      int GetDebugLevel() { return debug; }
	  void SetDropDatabase(const bool f) { drop_database = f; }
	  bool GetDropDatabase() const { return drop_database; }
      void SetStatsLevel(int l) { stats = l; }
      void SetInitializationLevel(int l) { erase = l; }
      const int GetInitializationLevel() const { return erase; }
      void SetUserAgent(const std::string &ua);


   protected:

///////
   //    Protected Methods
///////


 ///////
    //    Retrieve a URL
 ///////

      Transport::DocStatus Retrieve (const SchedulerEntry &s, _Url &url);


 ///////
    //   Add a URL or find an existant in the DB
    //   If OK, stores the new or found ID value into IDUrl
    //   If previous is set to true, CurrentSchedule is considered
    //   as it was the calling URL (so used for the referer setting
    //   and the hop count too). If previous is set to false, we don't
    //   set them, just think for example at the starting URL list.
 ///////

      Scheduler_Codes AddUrl(const std::string &u, unsigned int &IDUrl,
         bool previous=true);


 ///////
    //    Add a new server
 ///////

      _Server *AddServer(_Url &u);
      _Server *FindServer(const std::string &signature);


 ///////
    //    Search the Scheduler
 ///////

      int GetNext();
      int GetNext(const std::string &StrStatus);


 ///////
    //   Depending ont the Link table values (considering only those
    //   records with a LinkType='Direct') this method calculates the
    //   added size to the URL. That is to say: it adds sizes of the direct
    //   linked URLs (only once per document), those that are loaded
    //   automatically by the user agent together with the page (ex.: images).
 ///////

      Scheduler_Codes CalculateUrlSizeAdd(ostream &output = std::cout);

 ///////
    //    Check the HTML anchors
 ///////

      Scheduler_Codes SetHTMLAnchorsResults(ostream &output = std::cout);


 ///////
    //    Deserialize the memory dictionary
 ///////

      Scheduler_Codes DeserializeServers(); // about servers
      Scheduler_Codes DeserializeCookies(); // about servers


 ///////
    //    Tell us if we should retry to retrieve an URL depending on
    //    the first returned document status
 ///////

      int ShouldWeRetry(Transport::DocStatus DocumentStatus);


 ////////
   //     Check for the Proxy
 ///////

      bool UseProxy(const SchedulerEntry &);
      
///////
   //    Protected attributes
///////

      HtmysqlDB      *DB;                    // Database Pointer
      ServersDictionary servers;                // Servers
   
      SchedulerEntry  CurrentSchedule;       // CurrentSchedule retrieved
      SchedulerEntry  CurrentLinkSchedule;   // Current Link schedule
      SchedulerEntry  Referer;               // Referer

      _Server  *CurrentServer;               // Current server queried
      _Url     *CurrentUrl;                  // Current Url
      _Url     *Proxy;                       // Proxy Url
      std::string   Credentials;                  // Credentials string
      std::string   ProxyCredentials;             // Proxy Credentials string
      std::string   AcceptLanguage;               // HTTP accept-language directive
      
      // Temporary results for querying the Schedule database table
      HtmysqlQueryResult   ScheduleTmp;
      
      // Transport Pointers
      Transport            *TransportConnect;
      HtHTTP               *HTTPConnect;
      Transport_Response   *CurrentResponse;


      int erase;  // Erase the DB?
      int stop;   // Catch the signals
      bool drop_database;  // Erase the DB - without dropping the database
      bool deserialized;   // Boolean for deserialization of servers
      bool deserialized_cookies; // Boolean for deserialization of cookies

      HtCookieJar *_cookie_jar;  // Cookie jar manager object   
      RunInfo     runinfo;      // Run time Info

      // Execution info
      int *argc;
      char ***argv;
      std::string options_list;

///////
   //    Configuration variables
///////

      Configuration  *Config;          // Pointer to the configuration
      
      HtRegex     Limits;              // URL limits
      HtRegex     LimitsNormalized;    // URL limits (normalized)
      HtRegex     Excludes;            // URL exclusions
      HtRegex     BadQueryString;      // Bad query string
      HtRegex     ExcludeProxy;        // URLs to be excluded from the proxy
      
      StringSet  ValidExtensions;    // Extensions to be included
      StringSet  BadExtensions;      // Extensions to be excluded

      int debug;  // Debug info
      int stats;  // Statistics available?
      int parsed_urls; // number of parsed URLs
};

#endif
