diff --git a/README.md b/README.md index 68c9ae5..bcb2aad 100644 --- a/README.md +++ b/README.md @@ -88,6 +88,13 @@ The various thresholds can be configured with query params or cli args see this php croncheck.php -h ``` +## Site health (Moodle 3.9+) + +Accessible via `sitecheck.php`, this runs all Check API checks of type `status` and any errors are reported. + +Moodle 3.9 or higher is required, since this is when the [Check API](https://moodledev.io/docs/apis/subsystems/check) was implemented. This page also does not check the legacy task logs. + +`croncheck.php` overlaps a little, since it also checks the Check API; however, this new page is much cleaner, easier to understand and has a more consistent format. ## Failed login detection diff --git a/classes/check/failingtaskcheck.php b/classes/check/failingtaskcheck.php new file mode 100644 index 0000000..fb92a8d --- /dev/null +++ b/classes/check/failingtaskcheck.php @@ -0,0 +1,127 @@ +. + +namespace tool_heartbeat\check; + +use core\check\check; +use core\check\result; + +/** + * Task fail delay check + * + * This is very similar to the core tool_task::maxfaildelay check, except the output aggregates the number + * of each task, so if you have thousands of a task failing it does not spam the output. 
+ *
+ * @package tool_heartbeat
+ * @copyright 2023 Matthew Hilton
+ * @license http://www.gnu.org/copyleft/gpl.html GNU GPL v3 or later
+ */
+class failingtaskcheck extends check {
+
+    /** @var int $warnthreshold Threshold in minutes after which should warn about tasks failing **/
+    public $warnthreshold = 60;
+
+    /** @var int $errorthreshold Threshold in minutes after which should error about tasks failing **/
+    public $errorthreshold = 600;
+
+    /**
+     * Constructor
+     *
+     * Sets the check id and name, and links the check to the task logs admin page.
+     */
+    public function __construct() {
+        $this->id = 'cronfailingtasks';
+        $this->name = get_string('checkfailingtaskcheck', 'tool_heartbeat');
+
+        $this->actionlink = new \action_link(
+            new \moodle_url('/admin/tasklogs.php'),
+            get_string('tasklogs', 'tool_task'));
+    }
+
+    /**
+     * Return result
+     *
+     * Reads failing scheduled and adhoc tasks straight from the database and reports:
+     * - OK when the largest faildelay (divided by 60) does not exceed $warnthreshold,
+     * - WARNING when it exceeds $warnthreshold,
+     * - ERROR when it exceeds $errorthreshold.
+     *
+     * The summary is the single failing task, or a count of failing tasks with the
+     * individual tasks listed in the details.
+     *
+     * @return result
+     */
+    public function get_result() : result {
+        global $DB;
+
+        $taskoutputs = [];
+
+        // Instead of using task API here, we read directly from the database.
+        // This stops errors originating from broken tasks.
+        $scheduledtasks = $DB->get_records_sql("SELECT * FROM {task_scheduled} WHERE faildelay > 0 AND disabled = 0");
+
+        foreach ($scheduledtasks as $task) {
+            $taskoutputs[] = "SCHEDULED TASK: {$task->classname} Delay: {$task->faildelay}\n";
+        }
+
+        // Instead of using task API here, we read directly from the database.
+        // This stops errors originating from broken tasks, and allows the DB to de-duplicate them.
+        // Column aliases are declared with an explicit AS keyword.
+        $adhoctasks = $DB->get_records_sql("
+            SELECT classname, COUNT(*) AS count, MAX(faildelay) AS faildelay, SUM(faildelay) AS cfaildelay
+              FROM {task_adhoc}
+             WHERE faildelay > 0
+          GROUP BY classname
+          ORDER BY cfaildelay DESC");
+
+        foreach ($adhoctasks as $record) {
+            // Only add duplicate message if there are more than 1.
+            $duplicatemsg = $record->count > 1 ? " ({$record->count} duplicates!!!)" : '';
+            $taskoutputs[] = "ADHOC TASK: {$record->classname} Delay: {$record->faildelay} {$duplicatemsg}\n";
+        }
+
+        // Find the largest faildelay out of both adhoc and scheduled tasks.
+        $alldelays = array_merge(array_column($adhoctasks, 'faildelay'), array_column($scheduledtasks, 'faildelay'));
+        // The thresholds are expressed in minutes, so the largest delay is divided by 60 before comparing.
+        $maxdelaymins = !empty($alldelays) ? max($alldelays) / 60 : 0;
+
+        // Define a simple function to work out what the message should be based on the task outputs.
+        // Returns the [$summary, $details].
+        $taskoutputfn = function($faildelaymins) use ($taskoutputs) {
+            $count = count($taskoutputs);
+
+            if ($count == 1) {
+                // Only a single task is failing, so put it at the top level.
+                return [$taskoutputs[0], ''];
+            }
+
+            if ($count > 1) {
+                // More than 1, add a message at the start that indicates how many.
+                return ["{$count} Moodle tasks reported errors, maximum faildelay > {$faildelaymins} mins", implode("", $taskoutputs)];
+            }
+
+            // No tasks are failing, default to nothing.
+            return ['', ''];
+        };
+
+        // Default to ok.
+        $status = result::OK;
+        $delay = 0;
+
+        // Check if warn - if so then upgrade to warn.
+        // $delay records the threshold that was crossed; it is what the summary reports as "faildelay > N mins".
+        if ($maxdelaymins > $this->warnthreshold) {
+            $status = result::WARNING;
+            $delay = $this->warnthreshold;
+        }
+
+        // Check if error - if so then upgrade to error.
+        if ($maxdelaymins > $this->errorthreshold) {
+            $status = result::ERROR;
+            $delay = $this->errorthreshold;
+        }
+
+        list($summary, $details) = $taskoutputfn($delay);
+
+        return new result($status, nl2br($summary), nl2br($details));
+
+    }
+}
diff --git a/classes/checker.php b/classes/checker.php
new file mode 100644
index 0000000..cfca366
--- /dev/null
+++ b/classes/checker.php
@@ -0,0 +1,290 @@
+.
+
+namespace tool_heartbeat;
+
+use core\check\check;
+use core\check\result;
+use Throwable;
+
+/**
+ * Check API checker class
+ *
+ * Processes check API results and returns them in a nice format for nagios output.
+ *
+ * @package tool_heartbeat
+ * @author Matthew Hilton
+ * @copyright 2023, Catalyst IT
+ * @license http://www.gnu.org/copyleft/gpl.html GNU GPL v3 or later
+ */
+class checker {
+    /** @var array Nagios level prefixes, keyed by the resultmessage::LEVEL_* values **/
+    public const NAGIOS_PREFIXES = [
+        0 => "OK",
+        1 => "WARNING",
+        2 => "CRITICAL",
+        3 => "UNKNOWN",
+    ];
+
+    /**
+     * Returns an array of check API messages.
+     * If exceptions are thrown, they are caught and returned as result messages as well.
+     * Note - OK results are not returned.
+     *
+     * This also ends the currently active output buffer (see get_ob_message), so the caller
+     * is expected to have started one beforehand.
+     *
+     * @return array array of resultmessage objects (keys are not re-indexed after filtering)
+     */
+    public static function get_check_messages(): array {
+        // First try to get the checks, if this fails return a critical message (code is very broken).
+        $checks = [];
+
+        try {
+            $checks = \core\check\manager::get_checks('status');
+        } catch (Throwable $e) {
+            return [self::exception_to_message("Error getting checks: ", $e)];
+        }
+
+        // Remove any supressed checks from the list.
+        // Class names are strings on both sides, so compare strictly.
+        $supressed = self::supressed_checks();
+        $checks = array_filter($checks, function($check) use ($supressed) {
+            return !in_array(get_class($check), $supressed, true);
+        });
+
+        // Execute each check and store their messages.
+        $messages = [];
+
+        foreach ($checks as $check) {
+            try {
+                $messages[] = self::process_check_and_get_result($check);
+            } catch (Throwable $e) {
+                $messages[] = self::exception_to_message("Error processing check " . $check->get_ref() . ": ", $e);
+            }
+        }
+
+        // Add any output buffer message.
+        $messages[] = self::get_ob_message();
+
+        // Filter out any OK messages, we don't care about these.
+        $messages = array_filter($messages, function($m) {
+            return $m->level != resultmessage::LEVEL_OK;
+        });
+
+        return $messages;
+    }
+
+    /**
+     * Closes the output buffering, and if anything was outputted, a warning resultmessage is returned
+     * (otherwise an OK resultmessage is returned).
+     * @return resultmessage
+     */
+    private static function get_ob_message(): resultmessage {
+        // ob_get_clean() returns false when no buffer is active; treat that as "nothing buffered".
+        $contents = ob_get_clean() ?: '';
+
+        // Default to OK.
+        $res = new resultmessage();
+        $res->level = resultmessage::LEVEL_OK;
+        $res->title = 'Output buffering: No output buffered';
+        $res->message = 'No output buffered';
+
+        if (!empty($contents)) {
+            $res->level = resultmessage::LEVEL_WARN;
+            $res->title = "Output buffering: Unexpected output";
+            $res->message = $contents;
+        }
+
+        // Process these using the HTML cleaning function.
+        list($title, $message) = self::process_title_and_message($res->title, $res->message, "");
+        $res->title = $title;
+        $res->message = $message;
+
+        return $res;
+    }
+
+    /**
+     * Turns the given exception into a warning resultmessage.
+     * The title is the prefix followed by the exception message; the message is the exception cast to string.
+     * @param string $prefix
+     * @param Throwable $e
+     * @return resultmessage
+     */
+    private static function exception_to_message(string $prefix, Throwable $e): resultmessage {
+        $res = new resultmessage();
+        $res->level = resultmessage::LEVEL_WARN;
+        $res->title = $prefix . $e->getMessage();
+        $res->message = (string) $e;
+        return $res;
+    }
+
+    /**
+     * Processes the check and maps its result and status to a resultmessage.
+     * @param check $check
+     * @return resultmessage
+     */
+    private static function process_check_and_get_result(check $check): resultmessage {
+        $res = new resultmessage();
+
+        $checkresult = $check->get_result();
+
+        // Map check result to nagios level.
+        $map = [
+            result::OK => resultmessage::LEVEL_OK,
+            result::NA => resultmessage::LEVEL_OK,
+            result::WARNING => resultmessage::LEVEL_WARN,
+            result::CRITICAL => resultmessage::LEVEL_CRITICAL,
+            result::ERROR => resultmessage::LEVEL_CRITICAL,
+            result::UNKNOWN => resultmessage::LEVEL_UNKNOWN,
+        ];
+
+        // Get the level, or default to unknown.
+        $status = $checkresult->get_status();
+        $res->level = $map[$status] ?? resultmessage::LEVEL_UNKNOWN;
+
+        list($title, $message) = self::process_title_and_message($check->get_name(), $checkresult->get_summary(),
+            $checkresult->get_details());
+        $res->title = $title;
+        $res->message = $message;
+
+        return $res;
+    }
+
+    /**
+     * Parses, cleans and sets up the correct output.
+     *
+     * The first non-empty line of the summary/details is appended to the title,
+     * the remaining non-empty lines become the message.
+     *
+     * @param string $title
+     * @param string $summary
+     * @param string $details
+     * @return array array of [$title, $message]
+     */
+    private static function process_title_and_message(string $title, string $summary, string $details): array {
+        // Strip tags from summary and details.
+        $summary = self::clean_text($summary);
+        $details = self::clean_text($details);
+
+        // Get all the lines of the message.
+        $messagelines = explode("\n", $summary);
+        $messagelines = array_merge($messagelines, explode("\n", $details));
+
+        // Clean each one.
+        $messagelines = array_map(function($line) {
+            return self::clean_text($line);
+        }, $messagelines);
+
+        // Remove empty lines.
+        $messagelines = array_filter($messagelines);
+
+        // Use the first line in the title.
+        $title .= ": " . array_shift($messagelines);
+
+        // Use the rest in the message.
+        $message = implode("\n", $messagelines);
+
+        return [$title, $message];
+    }
+
+    /**
+     * Cleans the text ready for output.
+     * HTML line breaks become newlines, remaining tags are stripped and surrounding whitespace is trimmed.
+     * @param string $text
+     * @return string
+     */
+    private static function clean_text(string $text): string {
+        // Convert any line breaks to newlines.
+        // This must happen before strip_tags(), which would otherwise drop the breaks entirely.
+        $text = str_replace(["<br>", "<br/>", "<br />"], "\n", $text);
+
+        // Strip tags.
+        $text = strip_tags($text);
+
+        // Strip extra newlines.
+        $text = trim($text);
+
+        return $text;
+    }
+
+    /**
+     * From an array of resultmessage, determines the highest nagios level.
+     * Note, it considers UNKNOWN to be less than CRITICAL or WARNING.
+     *
+     * @param array $messages array of resultmessage objects
+     * @return int the calculated nagios level
+     */
+    public static function determine_nagios_level(array $messages): int {
+        // Find the highest level.
+        $levels = array_column($messages, "level");
+
+        // Add a default "OK" in case no messages were returned.
+        $levels[] = resultmessage::LEVEL_OK;
+
+        $hasunknown = !empty(array_filter($levels, function($l) {
+            return $l == resultmessage::LEVEL_UNKNOWN;
+        }));
+
+        // Remove unknowns.
+        $levels = array_filter($levels, function($l) {
+            return $l != resultmessage::LEVEL_UNKNOWN;
+        });
+
+        $highest = max($levels);
+
+        // If highest was OK but it had an unknown, return unknown.
+        // This stops UNKNOWN from masking WARNING or CRITICAL.
+        if ($highest == resultmessage::LEVEL_OK && $hasunknown) {
+            return resultmessage::LEVEL_UNKNOWN;
+        }
+
+        // Else return the highest.
+        return $highest;
+    }
+
+    /**
+     * Creates a summary from the given messages.
+     * If there are no messages or only OK, OK is returned.
+     * If there is a single non-OK message, its title is returned.
+     * If there are multiple non-OK messages, the levels are aggregated and turned into a summary.
+     *
+     * @param array $messages array of resultmessage objects
+     * @return string
+     */
+    public static function create_summary(array $messages): string {
+        // OK messages are not problems, so they never contribute to the summary.
+        $problems = array_filter($messages, function($m) {
+            return $m->level != resultmessage::LEVEL_OK;
+        });
+
+        // If no problems, return OK.
+        if (count($problems) == 0) {
+            return "OK";
+        }
+
+        // If only one problem, use it as the top level.
+        if (count($problems) == 1) {
+            // array_filter() preserves keys, so take the first element rather than index 0.
+            return reset($problems)->title;
+        }
+
+        // Otherwise count how many of each level.
+        $counts = array_count_values(array_column($problems, 'level'));
+
+        $countswithprefixes = [];
+        foreach ($counts as $level => $occurrences) {
+            // Fall back to UNKNOWN for a level that has no prefix.
+            $prefix = self::NAGIOS_PREFIXES[$level] ?? self::NAGIOS_PREFIXES[resultmessage::LEVEL_UNKNOWN];
+            $countswithprefixes[] = "{$occurrences} {$prefix}";
+        }
+
+        return "Multiple problems detected: " . implode(", ", $countswithprefixes);
+    }
+
+    /**
+     * Stores any checks that are suppressed/ignored by this class.
+     * @return array array of class name strings of checks to ignore
+     */
+    private static function supressed_checks(): array {
+        return [
+            // These two supressed and replaced by a more detailed/useful version in this plugin.
+            \tool_task\check\maxfaildelay::class,
+            \tool_task\check\adhocqueue::class,
+        ];
+    }
+}
+
diff --git a/classes/resultmessage.php b/classes/resultmessage.php
new file mode 100644
index 0000000..7de5958
--- /dev/null
+++ b/classes/resultmessage.php
@@ -0,0 +1,49 @@
+.
+
+namespace tool_heartbeat;
+
+/**
+ * A data-only class for holding a message about a result from a check API class.
+ *
+ * @package tool_heartbeat
+ * @author Matthew Hilton
+ * @copyright 2023, Catalyst IT
+ * @license http://www.gnu.org/copyleft/gpl.html GNU GPL v3 or later
+ */
+class resultmessage {
+    /** @var int OK level (the LEVEL_* values are the keys of checker::NAGIOS_PREFIXES) **/
+    public const LEVEL_OK = 0;
+
+    /** @var int WARN level **/
+    public const LEVEL_WARN = 1;
+
+    /** @var int CRITICAL level **/
+    public const LEVEL_CRITICAL = 2;
+
+    /** @var int UNKNOWN level **/
+    public const LEVEL_UNKNOWN = 3;
+
+    /** @var int $level The level of this message, one of the LEVEL_* constants (UNKNOWN until set) **/
+    public $level = self::LEVEL_UNKNOWN;
+
+    /** @var string $title Title of the message **/
+    public $title = '';
+
+    /** @var string $message Details of this message **/
+    public $message = '';
+}
+
diff --git a/lang/en/tool_heartbeat.php b/lang/en/tool_heartbeat.php
index f50c665..00761bc 100644
--- a/lang/en/tool_heartbeat.php
+++ b/lang/en/tool_heartbeat.php
@@ -79,6 +79,8 @@ $string['checktasklatencycheck'] = 'Task latency check';
 $string['taskconfigbad'] = 'Bad configurations {$a}';
 $string['tasklatencyok'] = 'Task latency OK.';
 
+$string['checkfailingtaskcheck'] = 'Failing tasks';
+
 /*
  * Privacy provider (GDPR)
  */
diff --git a/lib.php b/lib.php
index f248803..1979f50 100644
--- a/lib.php
+++ b/lib.php
@@ -31,6 +31,7 @@ function tool_heartbeat_status_checks() {
         new \tool_heartbeat\check\authcheck(),
         new \tool_heartbeat\check\logstorecheck(),
         new \tool_heartbeat\check\tasklatencycheck(),
+        new \tool_heartbeat\check\failingtaskcheck(),
     ];
 }
 
diff --git a/sitecheck.php b/sitecheck.php
new file mode 100644
index 0000000..52d6a74
--- /dev/null
+++ b/sitecheck.php
@@ -0,0 +1,84 @@
+.
+
+/**
+ * Check API Health Check
+ *
+ * Runs the Check API checks of type "status" through tool_heartbeat\checker and prints
+ * the outcome as a nagios-style status line, using the nagios level as the exit code.
+ *
+ * @package tool_heartbeat
+ * @copyright 2023 Matthew Hilton
+ * @license http://www.gnu.org/copyleft/gpl.html GNU GPL v3 or later
+ *
+ * See also:
+ * - http://nagios.sourceforge.net/docs/3_0/pluginapi.html
+ * - https://nagios-plugins.org/doc/guidelines.html#PLUGOUTPUT
+ */
+
+use tool_heartbeat\checker;
+use tool_heartbeat\resultmessage;
+
+// @codingStandardsIgnoreStart
+define('NO_UPGRADE_CHECK', true);
+
+$dirroot = __DIR__ . '/../../../';
+require($dirroot.'config.php');
+require_once(__DIR__.'/nagios.php');
+
+global $PAGE;
+
+if (isset($CFG->mnet_dispatcher_mode) && $CFG->mnet_dispatcher_mode !== 'off') {
+    // This is a core bug workaround, see MDL-77247 for more details.
+    require_once($CFG->dirroot.'/mnet/lib.php');
+}
+
+// Start output buffering. This stops for e.g. debugging messages from breaking the output.
+// checker::get_check_messages() ends this buffer and, if anything was captured,
+// reports it as a warning message instead of letting it corrupt the status line.
+ob_start();
+
+$messages = checker::get_check_messages();
+
+// Construct the output message.
+$PAGE->set_context(\context_system::instance());
+
+// Indent the messages.
+$msg = array_map(function($message) {
+    global $OUTPUT;
+
+    // The message is emitted as HTML (see nl2br below), where plain leading spaces would collapse.
+    $spacer = "&nbsp;&nbsp;";
+
+    // Add the spacer to the start of each message line.
+    $indentedlines = explode("\n", $message->message);
+    $indentedlines = array_map(function($line) use ($spacer) {
+        return $spacer . $line;
+    }, $indentedlines);
+
+    $indentedmessage = implode("\n", $indentedlines);
+    $indentedmessage = nl2br($indentedmessage);
+
+    return $OUTPUT->render_from_template('tool_heartbeat/resultmessage', [
+        'prefix' => checker::NAGIOS_PREFIXES[$message->level],
+        'title' => $message->title,
+        'message' => $indentedmessage,
+    ]);
+}, $messages);
+
+$msg = checker::create_summary($messages) . "<br>" . implode("", $msg);
+$level = checker::determine_nagios_level($messages);
+$prefix = checker::NAGIOS_PREFIXES[$level];
+$now = userdate(time());
+
+// Pass the values as printf arguments: check output may contain "%" characters,
+// which must not be interpreted as conversion specifications.
+printf("%s: %s (Checked %s)\n", $prefix, $msg, $now);
+exit($level);
diff --git a/templates/resultmessage.mustache b/templates/resultmessage.mustache
new file mode 100644
index 0000000..696ff1f
--- /dev/null
+++ b/templates/resultmessage.mustache
@@ -0,0 +1,40 @@
+
+{{!
+    This file is part of Moodle - https://moodle.org/
+
+    Moodle is free software: you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation, either version 3 of the License, or
+    (at your option) any later version.
+
+    Moodle is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License
+    along with Moodle. If not, see
+    .
+}}
+{{!
+    @template tool_heartbeat/resultmessage
+
+    Renders a single check result message (prefix, title and message) for sitecheck.php.
+
+    Classes required for JS:
+    * none
+
+    Context variables required for this template:
+    * prefix - nagios level prefix of the message
+    * title - title of the message
+    * message - message details, output unescaped
+
+    Example context (json):
+    {
+        "prefix": "CRITICAL",
+        "title": "Something broke",
+        "message": "Some more details"
+    }
+}}
+
+* {{prefix}} {{title}}<br>
+ {{{message}}}<br>
+<br>
diff --git a/tests/checker_test.php b/tests/checker_test.php
new file mode 100644
index 0000000..ea0f15e
--- /dev/null
+++ b/tests/checker_test.php
@@ -0,0 +1,140 @@
+.
+
+namespace tool_heartbeat;
+
+/**
+ * Test class for tool_heartbeat\checker
+ *
+ * @package tool_heartbeat
+ * @author Matthew Hilton
+ * @copyright 2023, Catalyst IT
+ * @license http://www.gnu.org/copyleft/gpl.html GNU GPL v3 or later
+ */
+class checker_test extends \advanced_testcase {
+    /**
+     * Tests get_check_messages function
+     */
+    public function test_get_check_messages(): void {
+        // Check API modifies DB state.
+        $this->resetAfterTest(true);
+
+        // get_check_messages() ends the active output buffer (it calls ob_get_clean()),
+        // so open a dedicated one here rather than letting it close a buffer this test does not own.
+        ob_start();
+
+        // Just test that the check API is working, and this returns some checks (for example the ones included with this plugin).
+        $checks = checker::get_check_messages();
+        $this->assertNotEmpty($checks);
+    }
+
+    /**
+     * Provides values to determine_nagios_level test
+     *
+     * The data set keys match the parameter names of test_determine_nagios_level.
+     *
+     * @return array
+     */
+    public static function determine_nagios_level_provider(): array {
+        return [
+            'no messages' => [
+                'levels' => [],
+                'expectedlevel' => resultmessage::LEVEL_OK,
+            ],
+            'one OK message' => [
+                'levels' => [resultmessage::LEVEL_OK],
+                'expectedlevel' => resultmessage::LEVEL_OK,
+            ],
+            'one UNKNOWN message' => [
+                'levels' => [resultmessage::LEVEL_UNKNOWN],
+                'expectedlevel' => resultmessage::LEVEL_UNKNOWN,
+            ],
+            'one UNKNOWN and one OK' => [
+                'levels' => [resultmessage::LEVEL_UNKNOWN, resultmessage::LEVEL_OK],
+                'expectedlevel' => resultmessage::LEVEL_UNKNOWN,
+            ],
+            'one UNKNOWN and one WARNING' => [
+                'levels' => [resultmessage::LEVEL_UNKNOWN, resultmessage::LEVEL_WARN],
+                'expectedlevel' => resultmessage::LEVEL_WARN,
+            ],
+            'one UNKNOWN and one CRITICAL' => [
+                'levels' => [resultmessage::LEVEL_UNKNOWN, resultmessage::LEVEL_CRITICAL],
+                'expectedlevel' => resultmessage::LEVEL_CRITICAL,
+            ],
+        ];
+    }
+
+    /**
+     * Tests determine_nagios_level function
+     * @param array $levels
+     * @param int $expectedlevel
+     * @dataProvider determine_nagios_level_provider
+     */
+    public function test_determine_nagios_level(array $levels, int $expectedlevel): void {
+        // Generate a series of dummy messages with the given levels.
+        $messages = array_map(function($level) {
+            $msg = new resultmessage();
+            $msg->level = $level;
+            return $msg;
+        }, $levels);
+
+        // Confirm the correct level outputted.
+        $level = checker::determine_nagios_level($messages);
+        $this->assertEquals($expectedlevel, $level);
+    }
+
+    /**
+     * Provides values to test_create_summary test
+     * @return array
+     */
+    public static function create_summary_provider(): array {
+
+        $warnmsg = new resultmessage();
+        $warnmsg->level = resultmessage::LEVEL_WARN;
+        $warnmsg->title = "test WARN title";
+
+        $okmsg = new resultmessage();
+        $okmsg->level = resultmessage::LEVEL_OK;
+        $okmsg->title = "test OK title";
+
+        $criticalmsg = new resultmessage();
+        $criticalmsg->level = resultmessage::LEVEL_CRITICAL;
+        $criticalmsg->title = "test CRITICAL title";
+
+        return [
+            'no messages (no message displayed)' => [
+                'messages' => [],
+                'expectedsummary' => "OK",
+            ],
+            'only OK (no message displayed)' => [
+                'messages' => [$okmsg],
+                'expectedsummary' => "OK",
+            ],
+            'only WARNING (shows error in top level)' => [
+                'messages' => [$warnmsg],
+                'expectedsummary' => $warnmsg->title,
+            ],
+            'mix of warning levels (shows summary of levels without including OK)' => [
+                'messages' => [$warnmsg, $okmsg, $criticalmsg],
+                'expectedsummary' => "Multiple problems detected: 1 WARNING, 1 CRITICAL",
+            ],
+        ];
+    }
+
+    /**
+     * Tests create_summary function
+     * @param array $messages
+     * @param string $expectedsummary
+     * @dataProvider create_summary_provider
+     */
+    public function test_create_summary(array $messages, string $expectedsummary): void {
+        $summary = checker::create_summary($messages);
+        $this->assertEquals($expectedsummary, $summary);
+    }
+}