;;; -*- Mode: Scheme -*-

;;;; Catching and handling errors so they don't kill Yenta.

(yreq "Parameters/persistence")
(yreq "Utilities/yenta-utils")

;;;; Keeping errors from killing us, and capturing the resulting backtraces.

;;; Basic form for protecting something.  This will return the value of the lambda
;;; PROC, unless it gets an error, in which case it returns 'protected:error.
;;; NOTE that, even though we catch the error, it will -also- cause SCM to blat
;;; out a backtrace to stderr.  See capturing-errors for how to avoid that.
(define (protected proc . args)
  (let ((result 'protected:error))
    (call-with-current-continuation
     (lambda (continue)
       (dynamic-wind
	(lambda () 'dummy)
	(lambda () (set! result (apply proc args)))
	(lambda () (continue result)))))))

;;; Runs PROC protected, and returns a cons whose cdr is a string, and whose
;;; car is the result of PROC if it completed, and 'protected:error otherwise.
;;; The string is "" if no error happened; otherwise, it is the backtrace.
;;; 
;;; It uses dynamic-wind to guarantee that we restore the error-output-port
;;; no matter what.  A blowup in the protected form can't screw us even
;;; without the dynamic-wind, but one in call-with-output-string could
;;; conceivably---and so could a user typing ^C, I suspect.  [Untested;
;;; I could try making the protected form some sort of sleep or loop & see.]
;;;
;;; NOTE that this rebinds current-error-port while it runs the protected form!
;;; This means that any legitimate output to cep will wind up being stuck in
;;; where the "backtrace" would be.  If Yenta routinely had to output things
;;; to cep, I'd include a version of scm_stack_trace which checks for a particular
;;; global being bound to a valid output port, and sent its output there if so
;;; (if not, it would log to cep anyway, so backtracing wouldn't be permanently
;;; broken).  However, Yenta -isn't- supposed to be sending -anything- to cep,
;;; since it's supposed to be using httpd as its UI, so this shouldn't be an
;;; issue.  We'll capture any stray output anyway, just in case, for debugging.
;;;
;;; Note also that what counts here is what stream you -think- you're outputting
;;; to---even though we rebind current-output-port to point at current-error-port
;;; when dumping, that doesn't mean that things that rebind current-error-port
;;; will get output sent to current-output-port, so repl still works, etc.
(define (prot:capturing-errors proc . args)
  (let ((old-error-port nil) (result nil) (backtrace nil))
    (dynamic-wind
     (lambda () (set! old-error-port (current-error-port)))
     (lambda () (set! backtrace
		      (call-with-output-string
			(lambda (port)
			  (set-current-error-port port)
			  (set! result (protected (lambda ()
						    (apply proc args))))))))
     (lambda () (set-current-error-port old-error-port)))
    (cons result backtrace)))

;;;; Accumulating errors somewhere, so we can inspect them.

;;; Set by sch-read-request, and logged for debugging in the event some scheduled task gets a fatal error.
;;; If we're running some server (which doesn't have a UI), this will just stay set to #f.  This includes
;;; -all- URL's, even those of images, since this may be the last thing a task does before erring.
(defvar *httpd:last-url-fetched* #f)

;;; This is a yvar, so we can accumulate them across restarts, if we
;;; haven't had a chance to dump them out yet.  (We can always decide
;;; later to clear it...)  Each entry is a single string, which is the
;;; entire contents of one entry produced by prot:accumulating-errors.
;;; We store strings so we can guarantee readability (e.g., no #<...>,
;;; etc) when the persistent-state file is reloaded with try-read.
(def-yenta-var *prot:accumulated-errors* '())

;;; +++
;;; Note!  -Only one- of these variables should ever be set in
;;; a customer world!  If you set both, then any such world that
;;; takes an error will log twice---once with its IP address, and
;;; once with its SID.  That allows unblinding that host's stats.
;;; It's not a -huge- problem unless many Yentas take errors, but
;;; it's also pointless---why get the same backtrace twice?
;;;
;;; If this is set, we'll attempt to log errors directly to the
;;; debugging server.  It's expected that this will only be set
;;; in central server worlds (e.g., stat receiver; bootstrap),
;;; but who knows...
(defvar *prot:errors-to-dbg-server* #f)

;;; If this is set, errors will cause us to log an event, which
;;; will eventually be reported to the stat server.  This is probably
;;; better for customer worlds, since that way the backtrace will be
;;; encrypted and so forth, and it's not clear how much use I could
;;; make of an IP address anyway---contacting the customer for more
;;; details would probably be scary.  Note that this should -not- be
;;; routinely set in devo worlds, except to test the mechanism itself,
;;; since devo worlds take lots of errors (e.g., from typos at the srepl),
;;; and I can't easily tell -their- errors from customer errors unless I
;;; keep track of the SID from every devo world.  (This is when IP addrs
;;; would be easier.)
(defvar *prot:errors-to-stat-server* #f)
;;; ---

;;; If this is set, we also include the current tasklist.
(defvar *prot:verbose-error-reports* #f)

;;; If proc gets an error, records Yenta version, proc, args, and the
;;; error, and optionally logs the event to the debugging server or
;;; creates an event so it will get logged to the stats server.
;;; Returns the result of the capturing-errors, in case the caller
;;; cares about the values and would like to do something with them.
(define (prot:accumulating-errors proc . args)
  (let* ((stuff (prot:capturing-errors apply proc args))
	 (result (car stuff))
	 (backtrace (cdr stuff)))
    (inc! *ctr:fatal-errors*)		; We took a fatal error in a task here, even if we can't manage to compute a backtrace...
    (when (or (eq? result 'protected:error)
	      (positive? (string-length backtrace)))
      (let ((entry (list (list *yenta-component-versions* *wb:dump-times* ; Version information.
			       (if (vector? (scheduler:current-task)) ; If this is a vector, then assume it's valid...
				   (scheduler:current-name)           ; ... and hence we can get the current taskname.
				   "")
			       *httpd:last-url-fetched*)
			 (list result backtrace errobj proc args)))) ; The actual error.
	(push! (format nil "~S" entry) *prot:accumulated-errors*)
	(when (or *prot:errors-to-dbg-server* *prot:errors-to-stat-server*)
	  (let* ((extra (if *prot:verbose-error-reports*
			    (call-with-output-string
			      (lambda (port)
				(scheduler:show-tasklist nil port)))
			    ""))
		 (full (format nil "~S~%~A" entry extra)))
	    (when *prot:errors-to-dbg-server* ; Customer worlds shouldn't have both of these set. ...
	      (logger:log 0 "~A" full))
	    (when *prot:errors-to-stat-server* ; ... See comment above at the variables themselves.
	      (stats:log-event full))))))
    stuff))

;;;; +++ Dead test code.

; (define (test-ce)
;   (format t "CEP is ~S~&" (current-error-port))
;   (force-output (current-output-port))
;   (let ((p (current-error-port)))
;     (capturing-errors
;      (lambda ()
;        (format t "CEP is ~S~&" (current-error-port))
;        (force-output (current-output-port))
;        (format (current-error-port) "Waiting 2 seconds...")
;        (force-output (current-output-port))
;        (sleep 2)
;        (format p "Done waiting.")
;        (force-output (current-output-port))
;        foo				; Force an error.
;        34
;        ))))
; 
; ;;; Returns a cons whose car is T if the PROC finished without error, and whose
; ;;; cdr is the value of PROC.  If an error occurred, returns (nil 'protected:error).
; ;;; This is mostly of pedagogical importance.  C.f. capturing-errors, as above.
; (define (protected-1 proc . args)
;   (let* ((won nil)
; 	 (result
; 	  (protected (lambda ()
; 		       (apply proc ars)
; 		       (set! won t)))))
;     (cons won result)))
; 
; ;;; Runs PROC protected, and returns a cons whose cdr is a string, and whose
; ;;; car is the result of PROC if it completed, and 'protected:error otherwise.
; ;;; The string is "" if no error happened; otherwise, it is the backtrace.
; (define (capturing-errors-unsafe proc . args)
;   (let ((old-error-port (current-error-port))
; 	(result nil))
;     (let ((backtrace
; 	   (call-with-output-string
; 	     (lambda (port)
; 	       (set-current-error-port port)
; 	       (set! result (protected (lambda ()
; 					 (apply proc args))))))))
;       (set-current-error-port old-error-port)
;       (cons result backtrace))))
; 
; ;;; An idea that didn't work.
; 
; (define (capturing-errors proc . args)
;   (let ((result nil) (backtrace nil))
;     (call-with-output-string
;       (lambda (port)
; 	(dynamic-wind
; 	 (lambda () (set! port (set-current-error-port port)))
; 	 (lambda () (set! result (protected (lambda ()
; 					      (apply proc args)))))
; 	 (lambda () (set! port (set-current-error-port port))))))
;     (cons result backtrace)))
; 
; ;;; Demonstrates a scheduler task getting killed due to an error.
; ; (scheduler:add-task! "kill me" 4 scheduler:always (lambda () (format t "I ran.~&") blah))

;;; End of file.
