diff --git a/README.md b/README.md index 68c9ae5..a8752d7 100644 --- a/README.md +++ b/README.md @@ -68,7 +68,7 @@ Failed: database error ## Application health -This is the croncheck.php - it is mostly, and was originally only around the cron queues, but has grown to cover other aspects. +Named `croncheck.php` for compatibility with older versions of this plugin, this page runs every Check API check of type `status` and shows any that return a non-OK result. It is a nagios compliant checker to see if cron or any individual tasks are failing, with configurable thresholds @@ -88,6 +88,13 @@ The various thresholds can be configured with query params or cli args see this php croncheck.php -h ``` +## Site health (Moodle 3.9+) + +Accessible via `sitecheck.php`, this page runs all Check API checks of type `status` and reports any errors. + +Moodle 3.9 or higher is required, since this is when the [Check API](https://moodledev.io/docs/apis/subsystems/check) was implemented. This page also does not check the legacy task logs. + +`croncheck.php` overlaps a little, since it also runs the Check API checks; however, this new page is much cleaner, easier to understand and has a more consistent format. ## Failed login detection @@ -111,10 +118,16 @@ php loginchecker.php -h # Branches +| Branch | Version | +| ----------- | ----------- | +| master | Moodle 2.7 + | +| MOODLE_39_STABLE | Moodle 3.9 + | + The master branch is always stable and should retain very deep support for old Totara's and Moodle's back to Moodle 2.7 For this reason we will continue to support php5 for some time. +The MOODLE_39_STABLE branch uses the [Check API](https://moodledev.io/docs/apis/subsystems/check) exclusively. # Installation diff --git a/classes/check/failingtaskcheck.php b/classes/check/failingtaskcheck.php new file mode 100644 index 0000000..fb92a8d --- /dev/null +++ b/classes/check/failingtaskcheck.php @@ -0,0 +1,127 @@ +.
+ +namespace tool_heartbeat\check; + +use core\check\check; +use core\check\result; + +/** + * Task fail delay check + * + * This is very similar to the core tool_task::maxfaildelay check, except the output aggregates the number + * of each task, so if you have thousands of a task failing it does not spam the output. + * + * @package tool_heartbeat + * @copyright 2023 Matthew Hilton + * @license http://www.gnu.org/copyleft/gpl.html GNU GPL v3 or later + */ +class failingtaskcheck extends check { + + /** @var int $warnthreshold Threshold in minutes after which should warn about tasks failing **/ + public $warnthreshold = 60; + + /** @var int $errorthreshold Threshold in minutes after which should error about tasks failing **/ + public $errorthreshold = 600; + + /** + * Constructor + */ + public function __construct() { + $this->id = 'cronfailingtasks'; + $this->name = get_string('checkfailingtaskcheck', 'tool_heartbeat'); + + $this->actionlink = new \action_link( + new \moodle_url('/admin/tasklogs.php'), + get_string('tasklogs', 'tool_task')); + } + + /** + * Return result + * @return result + */ + public function get_result() : result { + global $DB; + + $taskoutputs = []; + + // Instead of using task API here, we read directly from the database. + // This stops errors originating from broken tasks. + $scheduledtasks = $DB->get_records_sql("SELECT * FROM {task_scheduled} WHERE faildelay > 0 AND disabled = 0"); + + foreach ($scheduledtasks as $task) { + $taskoutputs[] = "SCHEDULED TASK: {$task->classname} Delay: {$task->faildelay}\n"; + } + + // Instead of using task API here, we read directly from the database. + // This stops errors originating from broken tasks, and allows the DB to de-duplicate them. 
+ $adhoctasks = $DB->get_records_sql(" SELECT classname, COUNT(*) count, MAX(faildelay) faildelay, SUM(faildelay) cfaildelay + FROM {task_adhoc} + WHERE faildelay > 0 + GROUP BY classname + ORDER BY cfaildelay DESC"); + + foreach ($adhoctasks as $record) { + // Only add duplicate message if there are more than 1. + $duplicatemsg = $record->count > 1 ? " ({$record->count} duplicates!!!)" : ''; + $taskoutputs[] = "ADHOC TASK: {$record->classname} Delay: {$record->faildelay} {$duplicatemsg}\n"; + } + + // Find the largest faildelay out of both adhoc and scheduled tasks. + $alldelays = array_merge(array_column($adhoctasks, 'faildelay'), array_column($scheduledtasks, 'faildelay')); + $maxdelaymins = !empty($alldelays) ? max($alldelays) / 60 : 0; + + // Define a simple function to work out what the message should be based on the task outputs. + // It is given the threshold (in minutes) that was exceeded, not the measured delay, and returns [$summary, $details]. + $taskoutputfn = function($faildelaymins) use ($taskoutputs) { + $count = count($taskoutputs); + + if ($count == 1) { + // Only a single task is failing, so put it at the top level. + return [$taskoutputs[0], '']; + } + + if ($count > 1) { + // More than 1, add a message at the start that indicates how many. + return ["{$count} Moodle tasks reported errors, maximum faildelay > {$faildelaymins} mins", implode("", $taskoutputs)]; + } + + // No tasks are failing, so default to an empty summary and empty details. + return ['', '']; + }; + + // Default to ok. + $status = result::OK; + $delay = 0; + + // Check if warn - if so then upgrade to warn. + if ($maxdelaymins > $this->warnthreshold) { + $status = result::WARNING; + $delay = $this->warnthreshold; + } + + // Check if error - if so then upgrade to error.
+ if ($maxdelaymins > $this->errorthreshold) { + $status = result::ERROR; + $delay = $this->errorthreshold; + } + + list($summary, $details) = $taskoutputfn($delay); + + return new result($status, nl2br($summary), nl2br($details)); + + } +} diff --git a/classes/checker.php b/classes/checker.php new file mode 100644 index 0000000..493e034 --- /dev/null +++ b/classes/checker.php @@ -0,0 +1,300 @@ +. + +namespace tool_heartbeat; + +use core\check\check; +use core\check\result; +use Throwable; + +/** + * Check API checker class + * + * Processes check API results and returns them in a nice format for nagios output. + * + * @package tool_heartbeat + * @author Matthew Hilton + * @copyright 2023, Catalyst IT + * @license http://www.gnu.org/copyleft/gpl.html GNU GPL v3 or later + */ +class checker { + /** @var array Nagios level prefixes **/ + public const NAGIOS_PREFIXES = [ + 0 => "OK", + 1 => "WARNING", + 2 => "CRITICAL", + 3 => "UNKNOWN", + ]; + + /** + * Returns an array of check API messages. + * If exceptions are thrown, they are caught and returned as result messages as well. + * Note - OK results are not returned, and the current output buffer is closed and collected (callers must ob_start() first). + * + * @return array array of resultmessage objects + */ + public static function get_check_messages(): array { + // First try to get the checks, if this fails return a single warning-level message for the exception (code is very broken). + $checks = []; + + try { + $checks = \core\check\manager::get_checks('status'); + } catch (Throwable $e) { + return [self::exception_to_message("Error getting checks: ", $e)]; + } + + // Remove any suppressed checks from the list. + $checks = array_filter($checks, function($check) { + return !in_array(get_class($check), self::supressed_checks()); + }); + + // Execute each check and store their messages. + $messages = []; + + foreach ($checks as $check) { + try { + $messages[] = self::process_check_and_get_result($check); + } catch (Throwable $e) { + $messages[] = self::exception_to_message("Error processing check " . $check->get_ref() .
": ", $e); + } + } + + // Add any output buffer message. + $messages[] = self::get_ob_message(); + + // Filter out any OK messages, we don't care about these. + $messages = array_filter($messages, function($m) { + return $m->level != resultmessage::LEVEL_OK; + }); + + return $messages; + } + + /** + * Closes the output buffering, and if anything was outputted, a warning resultmessage is returned + * @return resultmessage + */ + private static function get_ob_message(): resultmessage { + $contents = ob_get_clean() ?: ''; + + // Default to OK. + $res = new resultmessage(); + $res->level = resultmessage::LEVEL_OK; + $res->title = 'Output buffering: No output buffered'; + $res->message = 'No output buffered'; + + if (!empty($contents)) { + $res->level = resultmessage::LEVEL_WARN; + $res->title = "Output buffering: Unexpected output"; + $res->message = $contents; + } + + // Process these using the HTML cleaning function. + list($title, $message) = self::process_title_and_message($res->title, $res->message, ""); + $res->title = $title; + $res->message = $message; + + return $res; + } + + /** + * Turns the given exception into a warning resultmessage. + * @param string $prefix + * @param Throwable $e + * @return resultmessage + */ + private static function exception_to_message(string $prefix, Throwable $e): resultmessage { + $res = new resultmessage(); + $res->level = resultmessage::LEVEL_WARN; + $res->title = $prefix . $e->getMessage(); + $res->message = (string) $e; + return $res; + } + + /** + * Processes the check and maps its result and status to a resultmessage. + * @param check $check + * @return resultmessage + */ + private static function process_check_and_get_result(check $check): resultmessage { + $res = new resultmessage(); + + $checkresult = $check->get_result(); + + // Map check result to nagios level. 
+ $map = [ + result::OK => resultmessage::LEVEL_OK, + result::NA => resultmessage::LEVEL_OK, + result::WARNING => resultmessage::LEVEL_WARN, + result::CRITICAL => resultmessage::LEVEL_CRITICAL, + result::ERROR => resultmessage::LEVEL_CRITICAL, + result::UNKNOWN => resultmessage::LEVEL_UNKNOWN, + ]; + + // Get the level, or default to unknown. + $status = $checkresult->get_status(); + $res->level = isset($map[$status]) ? $map[$status] : resultmessage::LEVEL_UNKNOWN; + + list($title, $message) = self::process_title_and_message($check->get_name(), $checkresult->get_summary(), + $checkresult->get_details()); + $res->title = $title; + $res->message = $message; + + return $res; + } + + /** + * Parses, cleans and sets up the correct output. + * @param string $title + * @param string $summary + * @param string $details + * @return array array of [$title, $message] + */ + private static function process_title_and_message(string $title, string $summary, string $details): array { + // Strip tags from summary and details. + $summary = self::clean_text($summary); + $details = self::clean_text($details); + + // Get all the lines of the message. + $messagelines = explode("\n", $summary); + $messagelines = array_merge($messagelines, explode("\n", $details)); + + // Clean each one. + $messagelines = array_map(function($line) { + return self::clean_text($line); + }, $messagelines); + + // Remove empty lines. + $messagelines = array_filter($messagelines); + + // Use the first line in the title. + $title .= ": " . array_shift($messagelines); + + // Use the rest in the message. + $message = implode("\n", $messagelines); + + return [$title, $message]; + } + + /** + * Cleans the text ready for output. + * @param string $text + * @return string + */ + private static function clean_text(string $text): string { + // Convert any HTML line breaks (e.g. the "<br />" emitted by nl2br()) to newlines before tags are stripped. + $text = str_replace("<br>", "\n", $text); + $text = str_replace("<br />", "\n", $text);
+ + // Strip tags. + $text = strip_tags($text); + + // Clean any pipe characters from the $msg. This is because pipe characters + // separate Nagios performance data from log data. + $text = str_replace("|", "[pipe]", $text); + + // Strip extra newlines. + $text = trim($text); + + return $text; + } + + /** + * From an array of resultmessage, determines the highest nagios level. + * Note, it considers UNKNOWN to be less than CRITICAL or WARNING. + * + * @param array $messages array of resultmessage objects + * @return int the calculated nagios level + */ + public static function determine_nagios_level(array $messages): int { + // Find the highest level. + $levels = array_column($messages, "level"); + + // Add a default "OK" in case no messages were returned. + $levels[] = resultmessage::LEVEL_OK; + + $hasunknown = !empty(array_filter($levels, function($l) { + return $l == resultmessage::LEVEL_UNKNOWN; + })); + + // Remove unknowns. + $levels = array_filter($levels, function($l) { + return $l != resultmessage::LEVEL_UNKNOWN; + }); + + $highest = max($levels); + + // If highest was OK but it had an unknown, return unknown. + // This stops UNKNOWN from masking WARNING or CRITICAL. + if ($highest == resultmessage::LEVEL_OK && $hasunknown) { + return resultmessage::LEVEL_UNKNOWN; + } + + // Else return the highest. + return $highest; + } + + /** + * Creates a summary from the given messages. + * If there are no messages or only OK, OK is returned. + * If there is a single message, its cleaned title is returned. + * If there are multiple messages, the levels are aggregated and turned into a summary. + * + * @param array $messages array of resultmessage objects + * @return string + */ + public static function create_summary(array $messages): string { + // Filter out any OK messages. + // Usually they are filtered out already, but in case they aren't.
+ $messages = array_filter($messages, function($m) { + return $m->level != resultmessage::LEVEL_OK; + }); + + // If no messages, return OK. + if (count($messages) == 0) { + return "OK"; + } + + // If only one message, use it as the top level. + if (count($messages) == 1) { + return self::clean_text(current($messages)->title); + } + + // Otherwise count how many of each level. + $counts = array_count_values(array_column($messages, 'level')); + + $countswithprefixes = []; + foreach ($counts as $level => $occurrences) { + $prefix = self::NAGIOS_PREFIXES[$level]; + $countswithprefixes[] = "{$occurrences} {$prefix}"; + } + + return "Multiple problems detected: " . implode(", ", $countswithprefixes); + } + + /** + * Stores any checks that are suppressed/ignored by this class. + * @return array array of class name strings of checks to ignore + */ + private static function supressed_checks(): array { + return [ + // These two are suppressed and replaced by a more detailed/useful version in this plugin. + \tool_task\check\maxfaildelay::class, + \tool_task\check\adhocqueue::class, + ]; + } +} + diff --git a/classes/resultmessage.php b/classes/resultmessage.php new file mode 100644 index 0000000..7de5958 --- /dev/null +++ b/classes/resultmessage.php @@ -0,0 +1,49 @@ +. + +namespace tool_heartbeat; + +/** + * A data-only class for holding a message about a result from a check API class.
+ * + * @package tool_heartbeat + * @author Matthew Hilton + * @copyright 2023, Catalyst IT + * @license http://www.gnu.org/copyleft/gpl.html GNU GPL v3 or later + */ +class resultmessage { + /** @var int OK level **/ + public const LEVEL_OK = 0; + + /** @var int WARN level **/ + public const LEVEL_WARN = 1; + + /** @var int CRITICAL level **/ + public const LEVEL_CRITICAL = 2; + + /** @var int UNKNOWN level **/ + public const LEVEL_UNKNOWN = 3; + + /** @var int $level The level of this message **/ + public $level = self::LEVEL_UNKNOWN; + + /** @var string $title Title of the message **/ + public $title = ''; + + /** @var string $message Details of this message **/ + public $message = ''; +} + diff --git a/croncheck.php b/croncheck.php index 5446256..27632cd 100644 --- a/croncheck.php +++ b/croncheck.php @@ -15,387 +15,82 @@ // along with Moodle. If not, see . /** - * CRON health check + * Check API Health Check * * @package tool_heartbeat - * @copyright 2015 Brendan Heywood + * @copyright 2023 Matthew Hilton * @license http://www.gnu.org/copyleft/gpl.html GNU GPL v3 or later * - * This can be run either as a web api, or on the CLI. When run on the - * CLI it conforms to the Nagios plugin standard. - * * See also: * - http://nagios.sourceforge.net/docs/3_0/pluginapi.html * - https://nagios-plugins.org/doc/guidelines.html#PLUGOUTPUT - * */ +use tool_heartbeat\checker; + // @codingStandardsIgnoreStart define('NO_UPGRADE_CHECK', true); -$cronthreshold = 6; // Hours. -$cronwarn = 2; // Hours. -$delaythreshold = 600; // Minutes. -$delaywarn = 60; // Minutes. -$legacythreshold = 60 * 6; // Minute. -$legacywarn = 60 * 2; // Minutes. - -// @codingStandardsIgnoreEnd - -// Start output buffering. This stops for e.g. debugging messages from breaking the output. -// When a nagios.php send_* function is called, they will collect the buffer -// and warn if it is not empty (but do it nicely). -ob_start(); - $dirroot = __DIR__ . 
'/../../../'; -if (isset($argv)) { - // If run from the CLI. - define('CLI_SCRIPT', true); - - $last = $argv[count($argv) - 1]; - if (preg_match("/(.*):(.+)/", $last, $matches)) { - $last = $matches[1]; - } - if ($last && is_dir($last) ) { - $dirroot = $last . '/'; - array_pop($_SERVER['argv']); - } - - require($dirroot.'config.php'); - require_once(__DIR__.'/nagios.php'); - require_once($CFG->libdir.'/clilib.php'); - - list($options, $unrecognized) = cli_get_params( - array( - 'help' => false, - 'cronwarn' => $cronwarn, - 'cronerror' => $cronthreshold, - 'delaywarn' => $delaywarn, - 'delayerror' => $delaythreshold, - 'legacywarn' => $legacywarn, - 'legacyerror' => $legacythreshold, - ), - array( - 'h' => 'help' - ) - ); - - if ($unrecognized) { - $unrecognized = implode("\n ", $unrecognized); - cli_error(get_string('cliunknowoption', 'admin', $unrecognized)); - } - - if ($options['help']) { - print "Check the moodle cron system for when it last ran and any task fail delays - -croncheck.php [options] [moodle path] - -Options: --h, --help Print out this help - --cronwarn=n Threshold for no cron run error in hours (default $cronwarn) - --cronerror=n Threshold for no cron run warn in hours (default $cronthreshold) - --delaywarn=n Threshold for fail delay cron error in minutes (default $delaywarn) - --delayerror=n Threshold for fail delay cron warn in minutes (default $delaythreshold) - --legacywarn=n Threshold for legacy cron warn in minutes (default $legacywarn) - --legacyerror=n Threshold for legacy cron error in minutes (default $legacythreshold) - -Example: -\$sudo -u www-data /usr/bin/php admin/tool/heartbeat/croncheck.php -"; - die; - } - -} else { - // If run from the web. - define('NO_MOODLE_COOKIES', true); - // Add requirement for IP validation. 
- require($dirroot.'config.php'); - require_once(__DIR__.'/nagios.php'); - tool_heartbeat\lib::validate_ip_against_config(); - - $options = array( - 'cronerror' => optional_param('cronerror', $cronthreshold, PARAM_INT), - 'cronwarn' => optional_param('cronwarn', $cronwarn, PARAM_INT), - 'delayerror' => optional_param('delayerror', $delaythreshold, PARAM_INT), - 'delaywarn' => optional_param('delaywarn', $delaywarn, PARAM_INT), - 'legacyerror' => optional_param('legacyerror', $legacythreshold, PARAM_INT), - 'legacywarn' => optional_param('legacywarn', $legacywarn, PARAM_INT), - ); - header("Content-Type: text/plain"); - - // Make sure varnish doesn't cache this. But it still might so go check it! - header('Pragma: no-cache'); - header('Cache-Control: private, no-cache, no-store, max-age=0, must-revalidate, proxy-revalidate'); - header('Expires: Tue, 04 Sep 2012 05:32:29 GMT'); -} - -if (isset($CFG->adminsetuppending)) { - send_critical("Admin setup pending, please set up admin account"); -} - -if (moodle_needs_upgrading()) { - $upgraderunning = get_config(null, 'upgraderunning'); - $initialinstall = during_initial_install(); - - $difference = format_time((time() > $upgraderunning ? (time() - $upgraderunning) : 300)); - - if (!$upgraderunning) { - send_critical("Moodle upgrade pending and is not running, cron execution suspended"); - } - - if ($upgraderunning >= time()) { - // Before the expected finish time. - if (!empty($initialinstall)) { - send_critical("Moodle installation is running, ETA > $difference, cron execution suspended"); - } else { - send_critical("Moolde upgrade is running, ETA > $difference, cron execution suspended"); - } - } - - /* - * After the expected finish time (timeout or other interruption) - * The "core_shutdown_manager::register_function('upgrade_finished_handler');" already handle these cases - * and unset config 'upgraderunning' - * The below critical ones can happen if core_shutdown_manager fails to run the handler function. 
- */ - if (!empty($initialinstall)) { - send_critical("Moodle installation is running, overdue by $difference "); - } else { - send_critical("Moodle upgrade is running, overdue by $difference "); - } -} - -// We want to periodically emit an error_log which we will detect elsewhere to -// confirm that all the various web server logs are not stale. -$nexterror = get_config('tool_heartbeat', 'nexterror'); -$errorperiod = get_config('tool_heartbeat', 'errorlog'); -if (!$errorperiod) { - $errorperiod = 30 * MINSECS; -} - -if (!$nexterror || time() > $nexterror) { - $nexterror = time() + $errorperiod; - $now = userdate(time()); - $next = userdate($nexterror); - $period = format_time($errorperiod); - // @codingStandardsIgnoreStart - error_log("heartbeat test $now, next test expected in $period at $next"); - // @codingStandardsIgnoreEnd - set_config('nexterror', $nexterror, 'tool_heartbeat'); -} - -if ($CFG->branch < 27) { - $lastcron = $DB->get_field_sql('SELECT MAX(lastcron) FROM {modules}'); - $currenttime = time(); - $difference = $currenttime - $lastcron; - - if ( $difference > $options['cronerror'] * 60 * 60 ) { - send_critical("Moodle cron ran > {$options['cronerror']} hours ago\nLast run at $when"); - } - - if ( $difference > $options['cronwarn'] * 60 * 60 ) { - send_warning("Moodle cron ran > {$options['cronwarn']} hours ago\nLast run at $when"); - } - - send_good("MOODLE CRON RUNNING\n"); -} - -$lastcron = $DB->get_field_sql('SELECT MAX(lastruntime) FROM {task_scheduled}'); -$currenttime = time(); -$difference = $currenttime - $lastcron; - -$testing = get_config('tool_heartbeat', 'testing'); -if ($testing == 'error') { - send_critical("Moodle this is a test $CFG->wwwroot/admin/settings.php?section=tool_heartbeat\n"); -} else if ($testing == 'warn') { - send_warning("Moodle this is a test $CFG->wwwroot/admin/settings.php?section=tool_heartbeat\n"); -} - -$when = userdate($lastcron); - -if ( $difference > $options['cronerror'] * 60 * 60 ) { - send_critical("Moodle 
cron ran > {$options['cronerror']} hours ago\nLast run at $when"); -} - -if ( $difference > $options['cronwarn'] * 60 * 60 ) { - send_warning("Moodle cron ran > {$options['cronwarn']} hours ago\nLast run at $when"); -} +// If run from the web. +define('NO_MOODLE_COOKIES', true); +// Add requirement for IP validation. +require($dirroot.'config.php'); +require_once(__DIR__.'/nagios.php'); +tool_heartbeat\lib::validate_ip_against_config(); -$taskoutputs = []; +header("Content-Type: text/plain"); -// Instead of using task API here, we read directly from the database. -// This stops errors originating from broken tasks. -$scheduledtasks = $DB->get_records_sql("SELECT * FROM {task_scheduled} WHERE faildelay > 0 AND disabled = 0"); +// Make sure varnish doesn't cache this. But it still might so go check it! +header('Pragma: no-cache'); +header('Cache-Control: private, no-cache, no-store, max-age=0, must-revalidate, proxy-revalidate'); +header('Expires: Tue, 04 Sep 2012 05:32:29 GMT'); -foreach ($scheduledtasks as $task) { - $taskoutputs[] = "SCHEDULED TASK: {$task->classname} Delay: {$task->faildelay}\n"; +if (isset($CFG->mnet_dispatcher_mode) and $CFG->mnet_dispatcher_mode !== 'off') { + // This is a core bug workaround, see MDL-77247 for more details. + require_once($CFG->dirroot.'/mnet/lib.php'); } -// Instead of using task API here, we read directly from the database. -// This stops errors originating from broken tasks, and allows the DB to de-duplicate them. -$adhoctasks = $DB->get_records_sql(" SELECT classname, COUNT(*) count, MAX(faildelay) faildelay, SUM(faildelay) cfaildelay - FROM {task_adhoc} - WHERE faildelay > 0 - GROUP BY classname - ORDER BY cfaildelay DESC"); - -foreach ($adhoctasks as $record) { - // Only add duplicate message if there are more than 1. - $duplicatemsg = $record->count > 1 ? 
" ({$record->count} duplicates!!!)" : ''; - $taskoutputs[] = "ADHOC TASK: {$record->classname} Delay: {$record->faildelay} {$duplicatemsg}\n"; -} - -// Find the largest faildelay out of both adhoc and scheduled tasks. -$alldelays = array_merge(array_column($adhoctasks, 'faildelay'), array_column($scheduledtasks, 'faildelay')); -$maxdelaymins = !empty($alldelays) ? max($alldelays) / 60 : 0; - -// Define a simple function to work out what the message should be based on the task outputs. -$taskoutputfn = function($faildelaymins) use ($taskoutputs) { - $count = count($taskoutputs); - - if ($count == 1) { - // Only a single task is failing, so put it at the top level. - return $taskoutputs[0]; - } - - if ($count > 1) { - // More than 1, add a message at the start that indicates how many. - return "{$count} Moodle tasks reported errors, maximum faildelay > {$faildelaymins} mins\n" . implode("", $taskoutputs); - } - - // There are 0 tasks are failing, default to nothing. - return ''; -}; - -// Send the warning or critical based on the faildelay. -$sendwarning = $maxdelaymins > $options['delaywarn']; -$sendcritical = $maxdelaymins > $options['delayerror']; - -if ($sendcritical) { - send_critical($taskoutputfn($options['delayerror'])); -} - -if ($sendwarning) { - send_warning($taskoutputfn($options['delaywarn'])); -} - -if ($CFG->branch < 403) { - $legacytask = \core\task\manager::get_scheduled_task('core\task\legacy_plugin_cron_task'); - $legacylastrun = $legacytask->get_last_run_time(); - if (!$legacylastrun) { - send_warning("Moodle legacy task isn't running (ie disabled)\n"); - } - $minsincelegacylastrun = floor((time() - $legacylastrun) / 60); // In minutes. - $when = userdate($legacylastrun); - if ( $minsincelegacylastrun > $options['legacyerror']) { - send_critical("Moodle legacy task last run $minsincelegacylastrun " - . 
"mins ago > {$options['legacyerror']} mins\nLast run at $when"); - } - if ( $minsincelegacylastrun > $options['legacywarn']) { - send_warning("Moodle legacy task last run $minsincelegacylastrun mins ago > {$options['legacywarn']} mins\nLast run at $when"); - } -} - -// If the Check API from 3.9 exists then call those as well. -if (class_exists('\core\check\manager')) { - - if (isset($CFG->mnet_dispatcher_mode) and $CFG->mnet_dispatcher_mode !== 'off') { - // This is a core bug workaround, see MDL-77247 for more details. - require_once($CFG->dirroot.'/mnet/lib.php'); - } +global $PAGE; - // Try find checks and catch any potential exceptions. - $checks = []; - try { - $checks = \core\check\manager::get_checks('status'); - } catch (\Throwable $e) { - // The check API exploded, so there is no point continuing. - send_critical("Error scanning checks: {$e}\n"); - } - - // Define a function to get the check result and determine if the error is critical or not. - $processcheckfn = function($check) { - $output = ''; - $critical = false; - - $ref = $check->get_ref(); - $result = $check->get_result(); - - $status = $result->get_status(); - - // Summary is treated as html. - $summary = $result->get_summary(); - $summary = html_to_text($summary, 80, false); - - if ($status == \core\check\result::WARNING || - $status == \core\check\result::CRITICAL || - $status == \core\check\result::ERROR) { - - // If we have an error, how should we handle it. - if ($status == \core\check\result::ERROR && !$critical) { - $mapping = get_config('tool_heartbeat', 'errorcritical'); - if ($mapping === 'critical') { - $critical = true; - } else if ($mapping === 'criticalbusiness') { - // Here we should only set the critical flag between 0900 and 1700 server time. 
- $time = new DateTime('now', core_date::get_server_timezone_object()); - $hour = (int) $time->format('H'); - $critical = ($hour >= 9 && $hour < 17); - } - } else if (!$critical) { - $critical = $status == \core\check\result::CRITICAL; - } - - $output .= $check->get_name() . "\n"; - $output .= "$summary\n"; - - $detail = new moodle_url('/report/status/index.php', ['detail' => $ref]); - $output .= 'Details: ' . $detail->out() . "\n"; - - $link = $check->get_action_link(); - if ($link) { - $output .= $link->url . "\n"; - } - } - - return [$output, $critical]; - }; - - // Check if any of them are critical, and catch any exceptions that might be thrown. - // This is an array of [$output, $critical]. - $checkoutputs = array_map(function($check) use ($processcheckfn) { - try { - return $processcheckfn($check); - } catch (\Throwable $e) { - $critical = true; - $output = "Check \"{$check->get_name()}\" threw an exception: {$e}\n"; - return [$output, $critical]; - } - }, $checks); - - // Combine outputs and remove any that are empty. - $outputs = array_filter(array_column($checkoutputs, 0)); - $output = implode("\n", $outputs); - - // If > 1 check reported warnings/errors, add a prefix which describes what has happened. - if (count($outputs) > 1) { - $output = count($outputs) . " status checks have reported warnings or errors: \n" . $output; - } - - // Check if any returned critical as true. - $critical = in_array(true, array_column($checkoutputs, 1)); - - // Strictly some of these could a critical but softly softly. - if ($output) { - // For now emit only criticals as criticals. Error status should be a critical later. - if ($critical) { - send_critical($output); - } else { - send_warning($output); - } - } - -} +// Start output buffering. This stops for e.g. debugging messages from breaking the output. +// When a nagios.php send_* function is called, they will collect the buffer +// and warn if it is not empty (but do it nicely). 
+ob_start(); -send_good("MOODLE CRON RUNNING\n"); +$messages = checker::get_check_messages(); + +// Construct the output message. +$PAGE->set_context(\context_system::instance()); + +// Indent the messages. +$msg = array_map(function($message) { + global $OUTPUT; + + $spacer = " "; + + // Add the spacer to the start of each message line. + $indentedlines = explode("\n", $message->message); + $indentedlines = array_map(function($line) use ($spacer) { + return $spacer . $line; + }, $indentedlines); + + $indentedmessage = implode("\n", $indentedlines); + + return $OUTPUT->render_from_template('tool_heartbeat/resultmessage', [ + 'prefix' => checker::NAGIOS_PREFIXES[$message->level], + 'title' => $message->title, + 'message' => $indentedmessage, + ]); +}, $messages); + +$msg = checker::create_summary($messages) . "\n" . implode("\n\n", $msg); +$msg = htmlspecialchars_decode($msg); + +$level = checker::determine_nagios_level($messages); +$prefix = checker::NAGIOS_PREFIXES[$level]; +$now = userdate(time()); +// Pass the values as arguments so a literal "%" in a check message is never parsed as a format specifier. +printf("%s: %s\n\n(Checked %s)\n", $prefix, $msg, $now); +exit($level); diff --git a/lang/en/tool_heartbeat.php b/lang/en/tool_heartbeat.php index f50c665..00761bc 100644 --- a/lang/en/tool_heartbeat.php +++ b/lang/en/tool_heartbeat.php @@ -79,6 +79,8 @@ $string['checktasklatencycheck'] = 'Task latency check'; $string['taskconfigbad'] = 'Bad configurations {$a}'; $string['tasklatencyok'] = 'Task latency OK.'; +$string['checkfailingtaskcheck'] = 'Failing tasks'; + /* * Privacy provider (GDPR) */ diff --git a/lib.php b/lib.php index f248803..1979f50 100644 --- a/lib.php +++ b/lib.php @@ -31,6 +31,7 @@ function tool_heartbeat_status_checks() { new \tool_heartbeat\check\authcheck(), new \tool_heartbeat\check\logstorecheck(), new \tool_heartbeat\check\tasklatencycheck(), + new \tool_heartbeat\check\failingtaskcheck(), ]; } diff --git a/templates/resultmessage.mustache b/templates/resultmessage.mustache new file mode 100644 index 0000000..111a408 --- /dev/null +++
b/templates/resultmessage.mustache @@ -0,0 +1,40 @@ + +{{! + This file is part of Moodle - https://moodle.org/ + + Moodle is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + Moodle is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with Moodle. If not, see + . + }} + {{! + @template tool_heartbeat/resultmessage + + Renders a single check result message as plain text: a bullet line with the level prefix and title, followed by the message body. + + Classes required for JS: + * none + + Context variables required for this template: + * prefix, title, message - the Nagios level prefix, the message title and the (already indented) message body + + Example context (json): + { + "prefix": "CRITICAL", + "title": "Something broke", + "message": "Some more details" + } + }} + +* {{prefix}} {{title}} +{{message}} + diff --git a/tests/checker_test.php b/tests/checker_test.php new file mode 100644 index 0000000..5a261aa --- /dev/null +++ b/tests/checker_test.php @@ -0,0 +1,152 @@ +. + +namespace tool_heartbeat; + +/** + * Test class for tool_heartbeat\checker + * + * @package tool_heartbeat + * @author Matthew Hilton + * @copyright 2023, Catalyst IT + * @license http://www.gnu.org/copyleft/gpl.html GNU GPL v3 or later + */ +class checker_test extends \advanced_testcase { + /** + * Tests get_check_messages function + */ + public function test_get_check_messages() { + // Need to start output buffering, since get_check_messages closes it. + ob_start(); + + // Check API modifies DB state. + $this->resetAfterTest(true); + + // Just test that the check API is working, and this returns some checks (for example the ones included with this plugin).
+ $checks = checker::get_check_messages(); + $this->assertNotEmpty($checks); + } + + /** + * Provides values to determine_nagios_level test + * @return array + */ + public static function determine_nagios_level_provider(): array { + return [ + 'no messages' => [ + 'levels' => [], + 'nagioslevel' => resultmessage::LEVEL_OK, + ], + 'one OK message' => [ + 'levels' => [resultmessage::LEVEL_OK], + 'nagioslevel' => resultmessage::LEVEL_OK, + ], + 'one UNKNOWN message' => [ + 'levels' => [resultmessage::LEVEL_UNKNOWN], + 'nagioslevel' => resultmessage::LEVEL_UNKNOWN, + ], + 'one UNKNOWN and one OK' => [ + 'levels' => [resultmessage::LEVEL_UNKNOWN, resultmessage::LEVEL_OK], + 'nagioslevel' => resultmessage::LEVEL_UNKNOWN, + ], + 'one UNKNOWN and one WARNING' => [ + 'levels' => [resultmessage::LEVEL_UNKNOWN, resultmessage::LEVEL_WARN], + 'nagioslevel' => resultmessage::LEVEL_WARN, + ], + 'one UNKNOWN and on CRITICAL' => [ + 'levels' => [resultmessage::LEVEL_UNKNOWN, resultmessage::LEVEL_CRITICAL], + 'nagioslevel' => resultmessage::LEVEL_CRITICAL, + ], + ]; + } + + /** + * Tests determine_nagios_level function + * @param array $levels + * @param int $expectedlevel + * @dataProvider determine_nagios_level_provider + */ + public function test_determine_nagios_level(array $levels, int $expectedlevel) { + // Generate a series of dummy messages with the given levels. + $messages = array_map(function($level) { + $msg = new resultmessage(); + $msg->level = $level; + return $msg; + }, $levels); + + // Confirm the correct level outputted. 
+ $level = checker::determine_nagios_level($messages); + $this->assertEquals($expectedlevel, $level); + } + + /** + * Provides values to test_create_summary test + * @return array + */ + public static function create_summary_provider(): array { + + $warnmsg = new resultmessage(); + $warnmsg->level = resultmessage::LEVEL_WARN; + $warnmsg->title = "test WARN title"; + + $okmsg = new resultmessage(); + $okmsg->level = resultmessage::LEVEL_OK; + $okmsg->title = "test OK title"; + + $criticalmsg = new resultmessage(); + $criticalmsg->level = resultmessage::LEVEL_CRITICAL; + $criticalmsg->title = "test CRITICAL title"; + + // Pipes should be cleaned from output and replaced with [pipe] + $criticalwithpipemsg = new resultmessage(); + $criticalwithpipemsg->level = resultmessage::LEVEL_CRITICAL; + $criticalwithpipemsg->title = "test CRITICAL title |"; + + return [ + 'no messages (no message displayed)' => [ + 'messages' => [], + 'expectedsummary' => "OK", + ], + 'only OK (no message displayed)' => [ + 'messages' => [$okmsg], + 'expectedsummary' => "OK", + ], + 'only WARNING (shows error in top level)' => [ + 'messages' => [$warnmsg], + 'expectedsummary' => $warnmsg->title, + ], + 'mix of warning levels (shows summary of levels without including OK)' => [ + 'messages' => [$warnmsg, $okmsg, $criticalmsg], + 'expectedsummary' => "Multiple problems detected: 1 WARNING, 1 CRITICAL", + ], + 'pipe char in output is cleaned' => [ + 'messages' => [$criticalwithpipemsg], + 'expectedsummary' => str_replace('|', '[pipe]', $criticalwithpipemsg->title) + ] + ]; + } + + /** + * Tests create_summary function + * @param array $messages + * @param string $expectedsummary + * @dataProvider create_summary_provider + */ + public function test_create_summary(array $messages, string $expectedsummary) { + $summary = checker::create_summary($messages); + $this->assertEquals($expectedsummary, $summary); + } +} diff --git a/version.php b/version.php index 68f9f9c..c64cd0d 100644 --- a/version.php +++ 
b/version.php @@ -26,7 +26,7 @@ $plugin->version = 2023101100; $plugin->release = 2023101100; // Match release exactly to version. -$plugin->requires = 2012120311; // Deep support going back to 2.4. -$plugin->supported = [24, 401]; +$plugin->requires = 2020061500; // Support for 3.9 and above, due to the Check API. +$plugin->supported = [39, 401]; $plugin->component = 'tool_heartbeat'; $plugin->maturity = MATURITY_STABLE;