#!/usr/bin/perl
package ubic_watchdog;
BEGIN {
  $ubic_watchdog::VERSION = '1.25';
}

use strict;
use warnings;

# ABSTRACT: watchdog which checks all ubic services.


use Getopt::Long 2.33;
use Pod::Usage;

use Ubic::Logger;
use Ubic;
use Try::Tiny;
use List::MoreUtils qw(any);
use POSIX;

# Stop here when this file is loaded from other code instead of being executed
# as a script: caller() is only true in that case, and nothing below should run.
caller() and return 1;

# Number of seconds load_services() allows for loading services;
# can be overridden with --compile-timeout=N.
my $COMPILE_TIMEOUT = 10;

# Set by -v / --verbose; enables the additional INFO messages.
my $verbose;

# Parse command-line options; anything left in @ARGV is a service name or glob.
unless (
    GetOptions(
        'compile-timeout=i' => \$COMPILE_TIMEOUT,
        'v|verbose'         => \$verbose,
    )
) {
    pod2usage(2);
}

# Compiled regexes built from the remaining command-line arguments.
# When the list is empty, no filtering is applied and every service is checked.
my @filter;
for my $arg (@ARGV) {
    push @filter, _glob_to_filter($arg);
}

# Translate one command-line argument (a service name or a shell-style glob)
# into an anchored regex.
#
# Only word characters, '.', '-' and '*' are accepted; '.' is matched
# literally and '*' matches any sequence of characters.
# Works on a copy, so the caller's value (an alias into @ARGV) is not modified.
# Dies if the argument contains any other character, including a trailing
# newline (hence \A and \z rather than ^ and $).
sub _glob_to_filter {
    my ($glob) = @_;
    $glob =~ /\A[*\w.-]+\z/
        or die "Invalid argument '$glob', expected service name or shell-style glob";

    (my $pattern = $glob) =~ s/\./\\./g;
    $pattern =~ s/\*/.*/g;
    return qr/\A$pattern\z/;
}

# Forward declaration: check() is defined after check_all(), which calls it,
# so its ($) prototype is declared here before that call is compiled.
sub check($);

# Tell whether a service name is selected by one filter.
#
#   $name   - full service name, with components separated by dots
#   $filter - compiled regex (one element of @filter)
#
# The name itself is tried first, then each shorter name obtained by dropping
# the last ".component", so a filter matching a parent name also selects
# everything below that parent.
# Returns 1 on a match and nothing (bare return) otherwise.
sub match($$) {
    my ($candidate, $filter) = @_;
    while (1) {
        return 1 if $candidate =~ $filter;

        # Drop the last ".component"; stop once there is nothing left to strip.
        last unless $candidate =~ s/\.[^.]+$//;
    }
    return;
}

# Load the list of services provided by $parent, guarded by a timeout.
#
# $parent can be a multiservice, or it can be "Ubic" class itself; in both
# cases its ->services method is called in list context.
#
# Returns the list of services.
# Dies if loading takes longer than $COMPILE_TIMEOUT seconds, and rethrows any
# exception raised by ->services itself.
sub load_services {
    my ($parent) = @_;

    # Install the handler before arming the timer, so SIGALRM can never be
    # delivered while the handler is not in place yet. "local" restores the
    # previous handler when this sub returns or dies.
    local $SIG{ALRM} = sub {
        die "Couldn't compile $parent services in $COMPILE_TIMEOUT seconds";
    };
    alarm($COMPILE_TIMEOUT);

    my @services;
    my $loaded = eval { @services = $parent->services; 1 };
    my $error = $@;

    alarm(0); # always disarm the timer, even when loading failed
    die $error unless $loaded;

    return @services;
}

# Check the given services, descending into multiservices recursively.
#
# Arguments: a list of service objects providing ->full_name.
#
# Objects that are Ubic::Multiservice instances are always expanded with
# load_services() and their subservices are checked instead; the command-line
# filters are applied only to the remaining (non-multiservice) services.
# Each selected service is checked by check() in its own forked child.
#
# Returns nothing, after all child processes have been reaped.
# Dies if fork fails.
sub check_all {
    my @services = @_;
    for my $service (@services) {
        my $name = $service->full_name;
        if ($service->isa('Ubic::Multiservice')) {
            INFO("$name is multiservice, checking subservices") if $verbose;
            check_all(load_services($service));
            next;
        }
        if (@filter) {
            next unless any { match($name, $_) } @filter;
        }

        # trying to get logs a little bit more ordered
        STDOUT->flush;
        STDERR->flush;

        my $child = fork;
        unless (defined $child) {
            # include the system error, otherwise the cause cannot be diagnosed
            die "fork failed: $!";
        }
        unless ($child) {
            # child process: check one service and exit without returning to the loop
            POSIX::setsid; # so we could kill this watchdog and its children safely later
            check($service);
            exit;
        }
    }

    # parent process: reap every child; wait() returns -1 once none are left
    1 while wait() > 0;
    return;
}

# Check a single service and start it if it is not running.
#
# Argument: a service object providing ->full_name and ->check_timeout.
#
# Meant to be called in a forked child which already called setsid: when the
# check timeout expires, the SIGALRM handler kills the whole process group.
#
# Every exception thrown inside the check is caught and logged with ERROR;
# nothing is rethrown and there is no meaningful return value.
sub check($) {
    my $service = shift;
    my $name = $service->full_name;
    if ($verbose) {
        INFO("Checking $name");
    }
    $0 = "ubic-watchdog $name"; # show the checked service in the process title

    try {
        # Install the handler before arming the timer, so SIGALRM can never be
        # delivered while a previously installed handler is still in place.
        $SIG{ALRM} = sub {
            ERROR("$name check_timeout exceeded");
            STDOUT->flush;
            STDERR->flush;
            kill -9 => $$; # suicide
            ERROR "kill sent, still alive"; # should never happen, we called setsid earlier
        };
        alarm($service->check_timeout);

        # The access guard lives only inside this do block, i.e. only while the
        # watchdog lock object is being created.
        my $watchdog_lock = do {
            my $guard = Ubic->access_guard($name);
            Ubic::SingletonLock->new(Ubic->get_data_dir()."/watchdog/lock/".$name, { blocking => 0 });
        };
        unless ($watchdog_lock) {
            if ($verbose) {
                INFO "$name is locked by another watchdog process";
            }
            # NOTE: return leaves only the try block, not check() itself, so the
            # final "$name checked" message below is still logged.
            return;
        }

        # presumably held until $lock goes out of scope at the end of the try block
        my $lock = Ubic->lock($name);
        unless (Ubic->is_enabled($name)) {
            INFO("$name disabled") if $verbose;
            return;
        }

        my $status = Ubic->status($name);
        unless ($status->status eq 'running') {
            Ubic->set_cached_status($name, $status->status); # following start can throw exception
            ERROR("$name is broken, restarting");
            Ubic->start($name);
        }
        $status = Ubic->status($name);
        alarm(0); # status is known, the timeout no longer applies

        if ($status->status ne 'running') {
            INFO("$name started, but status is still '$status'");
        }
        Ubic->set_cached_status($name, $status); # if service's start implementation is invalid, ubic-watchdog will restart it every minute, so be careful
    }
    catch {
        ERROR("Failed to check $name: $_");
    };

    INFO("$name checked") if $verbose;
}

# Entry point: load the top-level services from the Ubic class and check them all.
check_all(load_services('Ubic'));

__END__
=pod

=head1 NAME

ubic_watchdog - watchdog which checks all ubic services.

=head1 VERSION

version 1.25

=head1 SYNOPSIS

    ubic-watchdog [-v] [--compile-timeout=N]

    ubic-watchdog SERVICE SERVICE2 SERVICE_GLOB ...

=head1 DESCRIPTION

This is a generic watchdog for all ubic services.

It checks every enabled service by asking for its status and tries to start it if the service is down or broken.

Services are checked in parallel, each one in a separate forked process.

=head1 PARAMETERS

B<-v> or B<--verbose> flag can be used to enable detailed logging.

B<--compile-timeout=N> option can be used to override the service compilation timeout in seconds. The default is 10 seconds. This option is experimental and may be removed in future releases.

All other arguments are interpreted as service names. Arguments can contain the C<*> symbol, which matches any sequence of characters, as in a Unix shell glob.

If service names are omitted, all services will be checked.

=head1 DEPLOYMENT

This script should be invoked every minute (or as often as you like), usually as a cron job.

=head1 AUTHOR

Vyacheslav Matyukhin <mmcleric@yandex-team.ru>

=head1 COPYRIGHT AND LICENSE

This software is copyright (c) 2011 by Yandex LLC.

This is free software; you can redistribute it and/or modify it under
the same terms as the Perl 5 programming language system itself.

=cut

