(**************************************************************)
(*
 *  Ensemble, (Version 0.40)
 *  Copyright 1997 Cornell University
 *  All rights reserved.
 *
 *  See ensemble/doc/license.txt for further information.
 *)
(**************************************************************)
(**************************************************************)
(* SUSPECT.ML *)
(* Author: Mark Hayden, 11/96 *)
(**************************************************************)
open Trans
open Layer
open Event
open Util
open View
(**************************************************************)
let name = Trace.source_file "SUSPECT"
(**************************************************************)

type header = Ping of bool array * ltime array

type state = {
  max_idle	: int ;
  sweep		: Time.t ;

  failed        : bool array ;
  send		: ltime array ;
  recv          : ltime array ;
  mutable next_sweep : Time.t
}

let conv i a = string_of_int_array (Array.map (fun j -> i - j) a)

let dump (ls,vs) s =
  eprintf "SUSPECT:dump:%s\n" ls.name ;
  eprintf "  failed=%s\n" (string_of_bool_array s.failed) ;
  eprintf "  idle  =%s\n" (conv (s.send.(ls.rank)) s.recv)

let init () (ls,vs) = {
    max_idle	= Param.int vs.params "suspect_max_idle" ;
    sweep	= Param.time vs.params "suspect_sweep" ;

    failed      = array_create name ls.nmembers false ;
    send 	= array_create name ls.nmembers 0 ;
    recv 	= array_create name ls.nmembers 0 ;
    next_sweep = Time.invalid
}

let hdlrs s (ls,vs) {up_out=up;upnm_out=upnm;dn_out=dn;dnlm_out=dnlm;dnnm_out=dnnm} =
  let failwith m = dump (ls,vs) s ; failwith (name^":"^m) in
  let log = Trace.log name ls.name in

  let up_hdlr ev abv () = up ev abv
  and uplm_hdlr ev hdr = match getType ev,hdr with

    (* Ping Message: if from a live member, reset idle counter.
     *)
  | (ECast|ESend), Ping(failed,cnts) ->
      let origin = getOrigin ev in
      log (fun () -> sprintf "Ping from %d" origin) ;
      if origin <> ls.rank		(* Check just in case. *)
      && (not failed.(ls.rank))		(* BUG: could auto-fail him *)
      && (not s.failed.(origin))
      then (
	s.recv.(origin) <- cnts.(ls.rank) ;
        s.send.(origin) <- cnts.(origin)
      ) ;
      free name ev

  | _ -> failwith "unknown local message"

  and upnm_hdlr ev = match getType ev with

    (* EFail: mark the failed members.
     *)
  | EFail ->
      List.iter (fun rank ->
      	s.failed.(rank) <- true
      ) (getFailures ev) ;
      upnm ev

    (* EInit: request a first timer alarm to bootstrap things.
     *)
  | EInit ->
      dnnm (timerAlarm name Time.zero) ;
      upnm ev

    (* ETimer: every so often:
     *   1. check for suspicions
     *   2. ping the other members
     *)
  | ETimer ->
      let time = getTime ev in
      if time >= s.next_sweep then (
	s.next_sweep <- Time.add time s.sweep ;
	dnnm (timerAlarm name s.next_sweep) ; (* request next sweep *)
      	log (fun () -> sprintf "Pinging") ;

	array_incr s.send ls.rank ;
	dnlm (create name ECast[Unreliable])
	  (Ping(Array.copy s.failed,Array.copy s.send)) ;

	let suspicions = ref [] in
	for i = 0 to pred ls.nmembers do
	  if i <> ls.rank 
	  && not s.failed.(i) 
	  && s.send.(ls.rank) - s.recv.(i) > s.max_idle
	  then (
	    suspicions := i :: !suspicions
	  )
	done ;
	log (fun () -> sprintf "Idles:%s" (conv s.send.(ls.rank) s.recv)) ;
	if !suspicions <> [] then (
	  log (fun () -> sprintf "Suspect:%s" (string_of_int_list !suspicions)) ;
	  dnnm (suspectReason name !suspicions name) ;
	)
      ) ;
      upnm ev

  | EDump -> dump (ls,vs) s ; upnm ev
  | _ -> upnm ev

  and dn_hdlr ev abv = dn ev abv ()
  and dnnm_hdlr = dnnm

in {up_in=up_hdlr;uplm_in=uplm_hdlr;upnm_in=upnm_hdlr;dn_in=dn_hdlr;dnnm_in=dnnm_hdlr}

let l args vf = Layer.hdr init hdlrs None NoOpt args vf

let _ =
  Param.default "suspect_sweep" (Param.Time (Time.of_float 1.0)) ;
  Param.default "suspect_max_idle" (Param.Int 10) ;
  Layer.install name (Layer.init l)
