diff --git a/src/qlever/commands/index.py b/src/qlever/commands/index.py index 9b0b7286..d8393564 100644 --- a/src/qlever/commands/index.py +++ b/src/qlever/commands/index.py @@ -147,7 +147,7 @@ def get_input_options_for_json(self, args) -> str: raise self.InvalidInputJson( f"Element {i} in `MULTI_INPUT_JSON` must only contain " "the keys `format`, `graph`, and `parallel`. Contains " - "extra keys {extra_keys}.", + f"extra keys {extra_keys}.", input_spec, ) # Add the command-line options for this input stream. We use diff --git a/test/qlever/commands/test_index_execute.py b/test/qlever/commands/test_index_execute.py new file mode 100644 index 00000000..ebd1ff24 --- /dev/null +++ b/test/qlever/commands/test_index_execute.py @@ -0,0 +1,368 @@ +from __future__ import annotations + +import shlex +import unittest +from unittest.mock import MagicMock, call, patch + +from qlever.commands.index import IndexCommand + + +# Test execute of index command for basic case with successful execution +class TestIndexCommand(unittest.TestCase): + @patch("qlever.commands.index.run_command") + @patch("qlever.commands.index.Containerize") + @patch("qlever.commands.index.get_existing_index_files") + @patch("qlever.commands.index.get_total_file_size") + @patch("qlever.commands.index.glob") + def test_execute_successful_indexing_without_extras( + self, + mock_glob, + mock_get_total_file_size, + mock_get_existing_index_files, + mock_containerize, + mock_run_command, + ): + # Setup args + args = MagicMock() + args.name = "TestName" + args.format = "turtle" + args.cat_input_files = "cat input.nt" + args.index_binary = "/test/path/index-binary" + args.settings_json = '{"example": "settings"}' + args.input_files = "*.nt" + args.only_pso_and_pos_permutations = False + args.use_patterns = True + args.parallel_parsing = False + args.text_index = "Test Index" + args.stxxl_memory = False + args.system = "native" + args.show = False + args.overwrite_existing = False + args.index_container = "test_container" + args.image = "test_image" + args.multi_input_json = False + + # Mock glob, get_total_file_size, get_existing_index_files, + # run_command and containerize + mock_glob.glob.return_value = ["input1.nt", "input2.nt"] + mock_get_total_file_size.return_value = 5e9 # 5 GB + mock_get_existing_index_files.return_value = [] + mock_run_command.return_value = None + mock_containerize.supported_systems.return_value = ["docker"] + + # Instantiate and executing the IndexCommand + result = IndexCommand().execute(args) + + # Assertions + expected_index_cmd = ( + f"{args.cat_input_files} | {args.index_binary}" + f" -i {args.name} -s {args.name}.settings.json" + f" -F {args.format} -f - | tee" + f" {args.name}.index-log.txt" + ) + index_cmd_call = call(expected_index_cmd, show_output=True) + + expected_settings_json_cmd = ( + f"echo {shlex.quote(args.settings_json)} " + f"> {args.name}.settings.json" + ) + + expected_index_binary_cmd = f"{args.index_binary} --help" + + # Testing if run_command was called exactly 3 times with the correct + # parameters and in the correct order + mock_run_command.assert_has_calls( + [ + call(expected_index_binary_cmd), + call(expected_settings_json_cmd), + index_cmd_call, + ], + any_order=False, + ) + assert result + + # Test execute for file already existing + @patch("qlever.commands.index.run_command") + @patch("qlever.commands.index.Containerize") + @patch("qlever.commands.index.get_existing_index_files") + @patch("qlever.commands.index.get_total_file_size") + @patch("qlever.commands.index.log") + @patch("qlever.commands.index.glob") + def test_execute_indexing_with_already_existing_files( + self, + mock_glob, + mock_log, + mock_get_total_file_size, + mock_get_existing_index_files, + mock_containerize, + mock_run_command, + ): + # Setup args + args = MagicMock() + args.name = "TestName" + args.format = "turtle" + args.cat_input_files = "cat input.nt" + args.index_binary = "/test/path/index-binary" + args.settings_json = '{"example": "settings"}' + args.input_files = "*.nt" + args.only_pso_and_pos_permutations = False + args.use_patterns = True + args.text_index = None + args.stxxl_memory = None + args.system = "native" + args.show = False + args.overwrite_existing = False + args.index_container = "test_container" + args.image = "test_image" + args.multi_input_json = False + + # Mock glob, get_total_file_size, get_existing_index_files, + # run_command and containerize + mock_glob.glob.return_value = ["input1.nt", "input2.nt"] + mock_get_total_file_size.return_value = 5e9 # 5 GB + mock_get_existing_index_files.return_value = ["TestName.index"] + mock_run_command.return_value = None + mock_containerize.supported_systems.return_value = [] + + # Instantiate IndexCommand and execute the function + result = IndexCommand().execute(args) + + # Assertions + assert not result + # Verify that the error message was logged + mock_log.error.assert_called_once_with( + f'Index files for basename "{args.name}" found, if you ' + f"want to overwrite them, use --overwrite-existing" + ) + # Check that the info log contains the exception message + mock_log.info.assert_any_call( + f"Index files found: {mock_get_existing_index_files.return_value}" + ) + # Checking if run_command was only called once + # (not after detecting existing files) + mock_run_command.assert_called_once_with(f"{args.index_binary} --help") + + # Test execute for no index binary found + @patch("qlever.commands.index.run_command") + @patch("qlever.commands.index.Containerize") + @patch("qlever.commands.index.get_existing_index_files") + @patch("qlever.commands.index.get_total_file_size") + @patch("qlever.commands.index.log") + @patch("qlever.commands.index.glob") + def test_execute_fails_if_no_indexing_binary_is_found( + self, + mock_glob, + mock_log, + mock_get_total_file_size, + mock_get_existing_index_files, + mock_containerize, + mock_run_command, + ): + # Setup args + args = MagicMock() + args.name = "TestName" + args.format = "turtle" + args.cat_input_files = "cat input.nt" + args.index_binary = "/test/path/no-binary-found" + args.settings_json = '{"example": "settings"}' + args.input_files = "*.nt" + args.only_pso_and_pos_permutations = False + args.use_patterns = True + args.text_index = None + args.stxxl_memory = None + args.system = "native" + args.show = False + args.overwrite_existing = False + args.index_container = "test_container" + args.image = "test_image" + args.multi_input_json = False + + # Mock glob, get_total_file_size, get_existing_index_files, + # run_command and containerize + # if run_command is called throw an Exception with "Binary not found" + mock_glob.glob.return_value = ["input1.nt", "input2.nt"] + mock_get_total_file_size.return_value = 5e9 # 5 GB + mock_get_existing_index_files.return_value = [] + mock_run_command.side_effect = Exception("Binary not found") + mock_containerize.supported_systems.return_value = [] + + # Instantiate IndexCommand and execute the function + result = IndexCommand().execute(args) + + # Assertions + self.assertFalse(result) + # Verify that the error message was logged + mock_log.error.assert_called_once_with( + f'Running "{args.index_binary}" failed, ' + f"set `--index-binary` to a different binary or " + f"set `--system to a container system`" + ) + # Check that the info log contains the exception message + mock_log.info.assert_any_call( + "The error message was: Binary not found" + ) + # Checking if run_command was only called once + # (not after throwing an Exception) + mock_run_command.assert_called_once_with(f"{args.index_binary} --help") + + # Test execute for file size > 10gb + @patch("qlever.commands.index.run_command") + @patch("qlever.commands.index.Containerize") + @patch("qlever.commands.index.get_existing_index_files") + @patch("qlever.commands.index.get_total_file_size") + @patch("qlever.commands.index.glob") + def test_execute_total_file_size_greater_than_ten_gb( + self, + mock_glob, + mock_get_total_file_size, + mock_get_existing_index_files, + mock_containerize, + mock_run_command, + ): + # Setup args + args = MagicMock() + args.name = "TestName" + args.format = "turtle" + args.cat_input_files = "cat input.nt" + args.index_binary = "/test/path/index-binary" + args.settings_json = '{"example": "settings"}' + args.input_files = "*.nt" + args.only_pso_and_pos_permutations = False + args.use_patterns = True + args.parallel_parsing = False + args.text_index = None + args.stxxl_memory = None + args.system = "native" + args.show = False + args.overwrite_existing = False + args.index_container = "test_container" + args.image = "test_image" + args.multi_input_json = False + + # Mock glob, get_total_file_size, get_existing_index_files, + # run_command and containerize + mock_glob.glob.return_value = ["input1.nt", "input2.nt"] + mock_get_total_file_size.return_value = 15e9 # 15 GB + mock_get_existing_index_files.return_value = [] + mock_run_command.return_value = None + mock_containerize.supported_systems.return_value = [] + + # Instantiate IndexCommand and execute the function + result = IndexCommand().execute(args) + + # Assertions + expected_index_cmd = ( + f"ulimit -Sn 1048576; {args.cat_input_files} | {args.index_binary}" + f" -i {args.name} -s {args.name}.settings.json" + f" -F {args.format} -f -" + f" | tee {args.name}.index-log.txt" + ) + mock_run_command.assert_any_call(expected_index_cmd, show_output=True) + self.assertTrue(result) + + # Test elif branch for multi_input_json + @patch("qlever.commands.index.log") + @patch("qlever.commands.index.json") + def test_execute_get_input_options_error(self, mock_json, mock_log): + # Setup args + args = MagicMock() + args.cat_input_files = False + args.multi_input_json = '{"cmd": "test_data"}' + + # Simulate a JSON loading error + mock_json.loads.side_effect = Exception("Wrong format") + + # Instantiate IndexCommand and execute the function + result = IndexCommand().execute(args) + + error_msg = ( + "Failed to parse `MULTI_INPUT_JSON` as either JSON or " + "JSONL (Wrong format)" + ) + # Asserts + # Verify that the error message was logged + mock_log.error.assert_called_once_with(error_msg) + # Assert that log_info was called exactly 2 times with the + # correct arguments in order + mock_log.info.assert_has_calls( + [call(""), call(args.multi_input_json)], any_order=False + ) + assert not result + + # Test else branch for multi_input_json + @patch("qlever.commands.index.log") + def test_execute_cat_files_and_multi_json(self, mock_log): + # Setup args + args = MagicMock() + args.cat_input_files = True + args.multi_input_json = True + + # Instantiate IndexCommand and execute the function + result = IndexCommand().execute(args) + + error_msg = ( + "Specify exactly one of `CAT_INPUT_FILES` (for a " + "single input stream) or `MULTI_INPUT_JSON` (for " + "multiple input streams)" + ) + log_msg = "See `qlever index --help` for more information" + # Asserts + # Verify that the error message was logged + mock_log.error.assert_called_once_with(error_msg) + # Assert that log_info was called exactly 2 times with the + # correct arguments in order + mock_log.info.assert_has_calls( + [call(""), call(log_msg)], any_order=False + ) + assert not result + + # Tests all the extra additions to the index_cmd and the show option + @patch("qlever.commands.index.IndexCommand.get_input_options_for_json") + @patch("qlever.commands.index.IndexCommand.show") + def test_execute_successful_indexing_with_extras_and_show( + self, mock_show, mock_input_json + ): + # Setup args + args = MagicMock() + args.name = "TestName" + args.index_binary = "/test/path/index-binary" + args.multi_input_json = True + args.cat_input_files = False + args.only_pso_and_pos_permutations = True + args.use_patterns = False + args.text_index = "from_text_records_and_literals" + args.stxxl_memory = True + args.input_files = "*.nt" + args.system = "native" + args.settings_json = '{"example": "settings"}' + args.show = True + + # Mock get_input_options_for_json + mock_input_json.return_value = "test_input_stream" + + # Instantiate and executing the IndexCommand + result = IndexCommand().execute(args) + + # Assertions + expected_index_cmd = ( + f"{args.index_binary}" + f" -i {args.name} -s {args.name}.settings.json" + f" {mock_input_json.return_value}" + f" --only-pso-and-pos-permutations --no-patterns" + f" --no-patterns -w {args.name}.wordsfile.tsv" + f" -d {args.name}.docsfile.tsv" + f" --text-words-from-literals" + f" --stxxl-memory {args.stxxl_memory}" + f" | tee {args.name}.index-log.txt" + ) + settings_json_cmd = ( + f"echo {shlex.quote(args.settings_json)} " + f"> {args.name}.settings.json" + ) + + # Verify that show was called with the right parameters + mock_show.assert_called_once_with( + f"{settings_json_cmd}\n" f"{expected_index_cmd}", + only_show=args.show, + ) + assert result diff --git a/test/qlever/commands/test_index_other_methods.py b/test/qlever/commands/test_index_other_methods.py new file mode 100644 index 00000000..919d68ff --- /dev/null +++ b/test/qlever/commands/test_index_other_methods.py @@ -0,0 +1,202 @@ +import argparse +import unittest +from unittest.mock import MagicMock, patch + +from qlever.commands.index import IndexCommand + + +class TestIndexCommand(unittest.TestCase): + def setUp(self): + # Initialize an IndexCommand instance to access the helper function + self.index_command = IndexCommand() + + def test_description(self): + # Call the method + result = self.index_command.description() + + self.assertEqual(result, "Build the index for a given " "RDF dataset") + + def test_should_have_qleverfile(self): + # Call the method + result = self.index_command.should_have_qleverfile() + + assert result + + def test_relevant_qleverfile_arguments(self): + # Call the method + result = self.index_command.relevant_qleverfile_arguments() + + self.assertEqual( + result, + { + "data": ["name", "format"], + "index": [ + "input_files", + "cat_input_files", + "multi_input_json", + "parallel_parsing", + "settings_json", + "index_binary", + "only_pso_and_pos_permutations", + "use_patterns", + "text_index", + "stxxl_memory", + ], + "runtime": ["system", "image", "index_container"], + }, + ) + + def test_additional_arguments(self): + # Create a parser and a subparser + parser = argparse.ArgumentParser() + subparser = parser.add_argument_group("test") + + # Call the method + self.index_command.additional_arguments(subparser) + # Parse an empty argument list to see the default + args = parser.parse_args([]) + + # Test that the default value for cmdline_regex is set correctly + self.assertEqual(args.overwrite_existing, False) + + # Test that the help text for cmdline_regex is correctly set + argument_help = subparser._group_actions[-1].help + self.assertEqual( + argument_help, + "Overwrite an existing index, " "think twice before using.", + ) + + def test_get_input_options_for_json_valid_input(self): + # Mock args with a valid multi_input_json and format + args = MagicMock() + args.multi_input_json = ( + '[{"cmd": "test_data1", "format": "json"}, ' + '{"cmd": "test_data2"}]' + ) + args.format = "jsonld" # Default format if not specified in JSON + + result = self.index_command.get_input_options_for_json(args) + + # Expected command-line options string based on the JSON data + expected_result = ( + "-f <(test_data1) -g - -F json -f " "<(test_data2) -g - -F jsonld" + ) + self.assertEqual(result, expected_result) + + @patch("qlever.commands.index.json") + def test_get_input_options_for_json_invalid_format(self, mock_json): + # Mock args with invalid JSON format + args = MagicMock() + # Set invalid JSON string instead of array + args.multi_input_json = '{"cmd": "test_data"}' + + # Simulate a JSON loading error + mock_json.loads.side_effect = Exception("Wrong format") + + # Execute the function and check for the exception + with self.assertRaises(IndexCommand.InvalidInputJson) as context: + self.index_command.get_input_options_for_json(args) + + # Check if the correct error message is in the raised exception + self.assertEqual( + context.exception.error_message, + "Failed to parse `MULTI_INPUT_JSON` as either JSON " + "or JSONL (Wrong format)", + ) + self.assertEqual( + context.exception.additional_info, args.multi_input_json + ) + + def test_get_input_options_for_json_no_array(self): + # Mock args with JSON that's not an array + args = MagicMock() + args.multi_input_json = '{"key": "test_data1"}' + + with self.assertRaises(IndexCommand.InvalidInputJson) as context: + self.index_command.get_input_options_for_json(args) + + # Verify error message mentions array requirement + self.assertEqual( + "Element 0 in `MULTI_INPUT_JSON` must contain a key `cmd`", + context.exception.error_message, + ) + self.assertEqual( + {"key": "test_data1"}, context.exception.additional_info + ) + + def test_get_input_options_for_json_empty(self): + # Mock args with an empty JSON array + args = MagicMock() + args.multi_input_json = "[]" + + with self.assertRaises(IndexCommand.InvalidInputJson) as context: + self.index_command.get_input_options_for_json(args) + + # Verify error message mentions non-empty requirement + self.assertEqual( + context.exception.error_message, + "`MULTI_INPUT_JSON` must contain at least one element", + ) + self.assertEqual( + context.exception.additional_info, args.multi_input_json + ) + + def test_get_input_options_for_json_object_structure(self): + # Mock args where one of the JSON objects is missing the + # required "cmd" key + args = MagicMock() + args.multi_input_json = ( + '[{"cmd": "test_data1", "format": "json"}, ' '{"format": "json2"}]' + ) # Missing "cmd" + + with self.assertRaises(IndexCommand.InvalidInputJson) as context: + self.index_command.get_input_options_for_json(args) + + # Verify error mentions the missing `cmd` key + self.assertEqual( + "Element 1 in `MULTI_INPUT_JSON` must contain " "a key `cmd`", + context.exception.error_message, + ) + + self.assertEqual( + {"format": "json2"}, context.exception.additional_info + ) + + def test_get_input_options_for_json_object_type(self): + # Mock args where one of the JSON objects is not a dictionary + args = MagicMock() + args.multi_input_json = ( + '[{"cmd": "test_data1", "format": "json"}, ' "5]" + ) # Missing "cmd" + + with self.assertRaises(IndexCommand.InvalidInputJson) as context: + self.index_command.get_input_options_for_json(args) + + # Verify error mentions the missing `cmd` key + self.assertEqual( + "Element 1 in `MULTI_INPUT_JSON` must be a JSON " "object", + context.exception.error_message, + ) + + self.assertEqual(5, context.exception.additional_info) + + def test_get_input_options_for_json_extra_keys(self): + # Mock args where one of the JSON objects contains an extra key + args = MagicMock() + args.multi_input_json = '[{"cmd": "test_data1", "test_key": "data2"}]' + + with self.assertRaises(IndexCommand.InvalidInputJson) as context: + self.index_command.get_input_options_for_json(args) + + # Verify error mentions the missing `cmd` key + self.assertEqual( + "Element 0 in `MULTI_INPUT_JSON` must only " + "contain the keys `format`, `graph`, and `parallel`. " + "Contains extra keys {'test_key'}.", + context.exception.error_message, + ) + + self.assertEqual( + {"cmd": "test_data1", "test_key": "data2"}, + context.exception.additional_info, + )