From baefba62ba7142e3c7ef6949c4cd126efcd7ed69 Mon Sep 17 00:00:00 2001 From: Sean Mackesey Date: Fri, 20 Dec 2024 07:09:51 -0500 Subject: [PATCH] [components] Add --rebuild-component-registry option, `--skip-venv` option, build cache on code loc construction (#26591) ## Summary & Motivation Add a top-level `--rebuild-component-registry` option that clears the cache and refetches the component registry for the current environment. The reason this is called `--rebuild-component-registry` instead of `--rebuild-cache` is: - It is scoped to just the current environment, whereas the cache encompasses multiple environments - We may cache other info later Also add `--skip-venv` option to `dg generate code-location` to avoid automatic venv setup and cache population on construction. Mostly useful for testing. We also invoke this immediately after `uv sync` when constructing a new code location, so that the cache is loaded. ## How I Tested These Changes New unit tests + manual test of code location generation. --- .../libraries/dagster-dg/dagster_dg/cache.py | 10 ++- .../dagster-dg/dagster_dg/cli/__init__.py | 65 +++++++++++++++++-- .../dagster-dg/dagster_dg/cli/generate.py | 10 ++- .../dagster-dg/dagster_dg/context.py | 52 +++++++++------ .../dagster-dg/dagster_dg/generate.py | 26 ++++---- .../cli_tests/test_generate_commands.py | 40 ++++++++++-- .../dagster-dg/dagster_dg_tests/test_cache.py | 62 ++++++++++++++++-- .../dagster-dg/dagster_dg_tests/utils.py | 4 +- 8 files changed, 219 insertions(+), 50 deletions(-) diff --git a/python_modules/libraries/dagster-dg/dagster_dg/cache.py b/python_modules/libraries/dagster-dg/dagster_dg/cache.py index 71afef32046e9..d5e13eacf009e 100644 --- a/python_modules/libraries/dagster-dg/dagster_dg/cache.py +++ b/python_modules/libraries/dagster-dg/dagster_dg/cache.py @@ -47,9 +47,15 @@ def __init__(self, root_path: Path, logging_enabled: bool): self._root_path.mkdir(parents=True, exist_ok=True) self._logging_enabled = logging_enabled - def clear(self) -> None: + def clear_key(self, key: Tuple[str, ...]) -> None: + path = self._get_path(key) + if path.exists(): + path.unlink() + self.log(f"CACHE [clear-key]: {path}") + + def clear_all(self) -> None: shutil.rmtree(self._root_path) - self.log(f"CACHE [clear]: {self._root_path}") + self.log(f"CACHE [clear-all]: {self._root_path}") def get(self, key: Tuple[str, ...]) -> Optional[str]: path = self._get_path(key) diff --git a/python_modules/libraries/dagster-dg/dagster_dg/cli/__init__.py b/python_modules/libraries/dagster-dg/dagster_dg/cli/__init__.py index de007a61d537e..4d32561d772b3 100644 --- a/python_modules/libraries/dagster-dg/dagster_dg/cli/__init__.py +++ b/python_modules/libraries/dagster-dg/dagster_dg/cli/__init__.py @@ -1,3 +1,4 @@ +import sys from pathlib import Path import click @@ -7,6 +8,14 @@ from dagster_dg.cli.info import info_cli from dagster_dg.cli.list import list_cli from dagster_dg.config import DgConfig, set_config_on_cli_context +from dagster_dg.context import ( + DgContext, + ensure_uv_lock, + fetch_component_registry, + is_inside_code_location_directory, + make_cache_key, + resolve_code_location_root_directory, +) from dagster_dg.utils import DgClickGroup from dagster_dg.version import __version__ @@ -49,6 +58,15 @@ def create_dg_cli(): help="Clear the cache before running the command.", default=False, ) + @click.option( + "--rebuild-component-registry", + is_flag=True, + help=( + "Recompute and cache the set of available component types for the current environment." + " Note that this also happens automatically whenever the cache is detected to be stale." + ), + default=False, + ) @click.option( "--cache-dir", type=Path, @@ -64,6 +82,7 @@ def group( disable_cache: bool, cache_dir: Path, clear_cache: bool, + rebuild_component_registry: bool, ): """CLI tools for working with Dagster components.""" context.ensure_object(dict) @@ -73,19 +92,57 @@ def group( disable_cache=disable_cache, cache_dir=cache_dir, ) - if clear_cache: - DgCache.from_config(config).clear() + set_config_on_cli_context(context, config) + + if clear_cache and rebuild_component_registry: + click.echo( + click.style( + "Cannot specify both --clear-cache and --rebuild-component-registry.", fg="red" + ) + ) + sys.exit(1) + elif clear_cache: + DgCache.from_config(config).clear_all() if context.invoked_subcommand is None: context.exit(0) + elif rebuild_component_registry: + if context.invoked_subcommand is not None: + click.echo( + click.style( + "Cannot specify --rebuild-component-registry with a subcommand.", fg="red" + ) + ) + sys.exit(1) + _rebuild_component_registry(context) elif context.invoked_subcommand is None: click.echo(context.get_help()) context.exit(0) - set_config_on_cli_context(context, config) - return group +def _rebuild_component_registry(cli_context: click.Context): + dg_context = DgContext.from_cli_context(cli_context) + if not is_inside_code_location_directory(Path.cwd()): + click.echo( + click.style( + "This command must be run inside a Dagster code location directory.", fg="red" + ) + ) + sys.exit(1) + if not dg_context.cache: + click.echo( + click.style("Cache is disabled. This command cannot be run without a cache.", fg="red") + ) + sys.exit(1) + root_path = resolve_code_location_root_directory(Path.cwd()) + ensure_uv_lock(root_path) + key = make_cache_key(root_path, "component_registry_data") + dg_context.cache.clear_key(key) + # This will trigger a rebuild of the component registry + fetch_component_registry(Path.cwd(), dg_context) + + ENV_PREFIX = "DAGSTER_DG" cli = create_dg_cli() diff --git a/python_modules/libraries/dagster-dg/dagster_dg/cli/generate.py b/python_modules/libraries/dagster-dg/dagster_dg/cli/generate.py index e3812213cdc59..49fb5f822f26e 100644 --- a/python_modules/libraries/dagster-dg/dagster_dg/cli/generate.py +++ b/python_modules/libraries/dagster-dg/dagster_dg/cli/generate.py @@ -65,9 +65,15 @@ def generate_deployment_command(path: Path) -> None: " the location of the local Dagster clone will be read from the `DAGSTER_GIT_REPO_DIR` environment variable." ), ) +@click.option( + "--skip-venv", + is_flag=True, + default=False, + help="Do not create a virtual environment for the code location.", +) @click.pass_context def generate_code_location_command( - cli_context: click.Context, name: str, use_editable_dagster: Optional[str] + cli_context: click.Context, name: str, use_editable_dagster: Optional[str], skip_venv: bool ) -> None: """Generate a Dagster code location file structure and a uv-managed virtual environment scoped to the code location. @@ -117,7 +123,7 @@ def generate_code_location_command( else: editable_dagster_root = None - generate_code_location(code_location_path, editable_dagster_root) + generate_code_location(code_location_path, dg_context, editable_dagster_root, skip_venv) @generate_cli.command(name="component-type", cls=DgClickCommand) diff --git a/python_modules/libraries/dagster-dg/dagster_dg/context.py b/python_modules/libraries/dagster-dg/dagster_dg/context.py index af25fcd73ff06..1a6bb50966ed1 100644 --- a/python_modules/libraries/dagster-dg/dagster_dg/context.py +++ b/python_modules/libraries/dagster-dg/dagster_dg/context.py @@ -1,6 +1,7 @@ import hashlib import json import os +import subprocess from dataclasses import dataclass from pathlib import Path from typing import Final, Iterable, Optional, Tuple @@ -15,8 +16,10 @@ from dagster_dg.error import DgError from dagster_dg.utils import ( execute_code_location_command, + get_uv_command_env, hash_directory_metadata, hash_file_metadata, + pushd, ) @@ -43,13 +46,13 @@ def _is_deployment_root_directory(path: Path) -> bool: def is_inside_code_location_directory(path: Path) -> bool: try: - _resolve_code_location_root_directory(path) + resolve_code_location_root_directory(path) return True except DgError: return False -def _resolve_code_location_root_directory(path: Path) -> Path: +def resolve_code_location_root_directory(path: Path) -> Path: current_path = path.absolute() while not _is_code_location_root_directory(current_path): current_path = current_path.parent @@ -134,6 +137,31 @@ def make_cache_key(code_location_path: Path, data_type: CachableDataType) -> Tup return ("_".join(path_parts), env_hash, data_type) +def ensure_uv_lock(root_path: Path) -> None: + with pushd(root_path): + if not (root_path / "uv.lock").exists(): + subprocess.run(["uv", "sync"], check=True, env=get_uv_command_env()) + + +def fetch_component_registry(path: Path, dg_context: DgContext) -> RemoteComponentRegistry: + root_path = resolve_code_location_root_directory(path) + + cache = dg_context.cache + if cache: + cache_key = make_cache_key(root_path, "component_registry_data") + + raw_registry_data = cache.get(cache_key) if cache else None + if not raw_registry_data: + raw_registry_data = execute_code_location_command( + root_path, ["list", "component-types"], dg_context + ) + if cache: + cache.set(cache_key, raw_registry_data) + + registry_data = json.loads(raw_registry_data) + return RemoteComponentRegistry.from_dict(registry_data) + + @dataclass class CodeLocationDirectoryContext: """Class encapsulating contextual information about a components code location directory. @@ -155,23 +183,9 @@ class CodeLocationDirectoryContext: @classmethod def from_path(cls, path: Path, dg_context: DgContext) -> Self: - root_path = _resolve_code_location_root_directory(path) - - cache = dg_context.cache - if cache: - cache_key = make_cache_key(root_path, "component_registry_data") - - raw_registry_data = cache.get(cache_key) if cache else None - if not raw_registry_data: - raw_registry_data = execute_code_location_command( - root_path, ["list", "component-types"], dg_context - ) - if cache: - cache.set(cache_key, raw_registry_data) - - registry_data = json.loads(raw_registry_data) - component_registry = RemoteComponentRegistry.from_dict(registry_data) - + root_path = resolve_code_location_root_directory(path) + ensure_uv_lock(root_path) + component_registry = fetch_component_registry(path, dg_context) return cls( root_path=root_path, name=path.name, diff --git a/python_modules/libraries/dagster-dg/dagster_dg/generate.py b/python_modules/libraries/dagster-dg/dagster_dg/generate.py index 788c9c898dd8c..77d6c829d9b8d 100644 --- a/python_modules/libraries/dagster-dg/dagster_dg/generate.py +++ b/python_modules/libraries/dagster-dg/dagster_dg/generate.py @@ -1,20 +1,18 @@ import json import os -import subprocess import textwrap from pathlib import Path from typing import Any, Mapping, Optional import click -from dagster_dg.context import CodeLocationDirectoryContext, DgContext -from dagster_dg.utils import ( - camelcase, - execute_code_location_command, - generate_subtree, - get_uv_command_env, - pushd, +from dagster_dg.context import ( + CodeLocationDirectoryContext, + DgContext, + ensure_uv_lock, + fetch_component_registry, ) +from dagster_dg.utils import camelcase, execute_code_location_command, generate_subtree # ######################## # ##### DEPLOYMENT @@ -89,7 +87,12 @@ def get_pyproject_toml_uv_sources(editable_dagster_root: str) -> str: """) -def generate_code_location(path: Path, editable_dagster_root: Optional[str] = None) -> None: +def generate_code_location( + path: Path, + dg_context: DgContext, + editable_dagster_root: Optional[str] = None, + skip_venv: bool = False, +) -> None: click.echo(f"Creating a Dagster code location at {path}.") dependencies = get_pyproject_toml_dependencies(use_editable_dagster=bool(editable_dagster_root)) @@ -112,8 +115,9 @@ def generate_code_location(path: Path, editable_dagster_root: Optional[str] = No ) # Build the venv - with pushd(path): - subprocess.run(["uv", "sync"], check=True, env=get_uv_command_env()) + if not skip_venv: + ensure_uv_lock(path) + fetch_component_registry(path, dg_context) # Populate the cache # ######################## diff --git a/python_modules/libraries/dagster-dg/dagster_dg_tests/cli_tests/test_generate_commands.py b/python_modules/libraries/dagster-dg/dagster_dg_tests/cli_tests/test_generate_commands.py index 7bc7ebc634b0b..8cc61d0fe7e21 100644 --- a/python_modules/libraries/dagster-dg/dagster_dg_tests/cli_tests/test_generate_commands.py +++ b/python_modules/libraries/dagster-dg/dagster_dg_tests/cli_tests/test_generate_commands.py @@ -7,7 +7,7 @@ import pytest import tomli from dagster_dg.context import CodeLocationDirectoryContext, DgContext -from dagster_dg.utils import discover_git_root, ensure_dagster_dg_tests_import +from dagster_dg.utils import discover_git_root, ensure_dagster_dg_tests_import, pushd ensure_dagster_dg_tests_import() @@ -40,7 +40,12 @@ def test_generate_deployment_command_already_exists_fails() -> None: def test_generate_code_location_inside_deployment_success() -> None: - with ProxyRunner.test() as runner, isolated_example_deployment_foo(runner): + # Don't use the test component lib because it is not present in published dagster-components, + # which this test is currently accessing since we are not doing an editable install. + with ( + ProxyRunner.test(use_test_component_lib=False) as runner, + isolated_example_deployment_foo(runner), + ): result = runner.invoke("generate", "code-location", "bar") assert_runner_result(result) assert Path("code_locations/bar").exists() @@ -60,9 +65,16 @@ def test_generate_code_location_inside_deployment_success() -> None: # No tool.uv.sources added without --use-editable-dagster assert "uv" not in toml["tool"] + # Check cache was populated + with pushd("code_locations/bar"): + result = runner.invoke("--verbose", "list", "component-types") + assert "CACHE [hit]" in result.output + def test_generate_code_location_outside_deployment_success() -> None: - with ProxyRunner.test() as runner, runner.isolated_filesystem(): + # Don't use the test component lib because it is not present in published dagster-components, + # which this test is currently accessing since we are not doing an editable install. + with ProxyRunner.test(use_test_component_lib=False) as runner, runner.isolated_filesystem(): result = runner.invoke("generate", "code-location", "bar") assert_runner_result(result) assert Path("bar").exists() @@ -110,6 +122,24 @@ def test_generate_code_location_editable_dagster_success(mode: str, monkeypatch) } +def test_generate_code_location_skip_venv_success() -> None: + # Don't use the test component lib because it is not present in published dagster-components, + # which this test is currently accessing since we are not doing an editable install. + with ProxyRunner.test() as runner, runner.isolated_filesystem(): + result = runner.invoke("generate", "code-location", "--skip-venv", "bar") + assert_runner_result(result) + assert Path("bar").exists() + assert Path("bar/bar").exists() + assert Path("bar/bar/lib").exists() + assert Path("bar/bar/components").exists() + assert Path("bar/bar_tests").exists() + assert Path("bar/pyproject.toml").exists() + + # Check venv created + assert not Path("bar/.venv").exists() + assert not Path("bar/uv.lock").exists() + + def test_generate_code_location_editable_dagster_no_env_var_no_value_fails(monkeypatch) -> None: monkeypatch.setenv("DAGSTER_GIT_REPO_DIR", "") with ProxyRunner.test() as runner, isolated_example_deployment_foo(runner): @@ -120,9 +150,9 @@ def test_generate_code_location_editable_dagster_no_env_var_no_value_fails(monke def test_generate_code_location_already_exists_fails() -> None: with ProxyRunner.test() as runner, isolated_example_deployment_foo(runner): - result = runner.invoke("generate", "code-location", "bar") + result = runner.invoke("generate", "code-location", "bar", "--skip-venv") assert_runner_result(result) - result = runner.invoke("generate", "code-location", "bar") + result = runner.invoke("generate", "code-location", "bar", "--skip-venv") assert_runner_result(result, exit_0=False) assert "already exists" in result.output diff --git a/python_modules/libraries/dagster-dg/dagster_dg_tests/test_cache.py b/python_modules/libraries/dagster-dg/dagster_dg_tests/test_cache.py index 4636259c921fc..0389966cc75a6 100644 --- a/python_modules/libraries/dagster-dg/dagster_dg_tests/test_cache.py +++ b/python_modules/libraries/dagster-dg/dagster_dg_tests/test_cache.py @@ -1,4 +1,5 @@ import subprocess +from functools import partial from pathlib import Path import pytest @@ -9,9 +10,13 @@ isolated_example_code_location_bar, ) +# For all cache tests, avoid setting up venv in example code location so we do not prepopulate the +# cache (which is part of the venv setup routine). +example_code_location = partial(isolated_example_code_location_bar, skip_venv=True) + def test_load_from_cache(): - with ProxyRunner.test(verbose=True) as runner, isolated_example_code_location_bar(runner): + with ProxyRunner.test(verbose=True) as runner, example_code_location(runner): result = runner.invoke("list", "component-types") assert_runner_result(result) assert "CACHE [miss]" in result.output @@ -22,7 +27,7 @@ def test_load_from_cache(): def test_cache_invalidation_uv_lock(): - with ProxyRunner.test(verbose=True) as runner, isolated_example_code_location_bar(runner): + with ProxyRunner.test(verbose=True) as runner, example_code_location(runner): result = runner.invoke("list", "component-types") assert_runner_result(result) assert "CACHE [miss]" in result.output @@ -36,7 +41,7 @@ def test_cache_invalidation_uv_lock(): def test_cache_invalidation_modified_lib(): - with ProxyRunner.test(verbose=True) as runner, isolated_example_code_location_bar(runner): + with ProxyRunner.test(verbose=True) as runner, example_code_location(runner): result = runner.invoke("list", "component-types") assert_runner_result(result) assert "CACHE [miss]" in result.output @@ -51,7 +56,7 @@ def test_cache_invalidation_modified_lib(): def test_cache_no_invalidation_modified_pkg(): - with ProxyRunner.test(verbose=True) as runner, isolated_example_code_location_bar(runner): + with ProxyRunner.test(verbose=True) as runner, example_code_location(runner): result = runner.invoke("list", "component-types") assert_runner_result(result) assert "CACHE [miss]" in result.output @@ -66,7 +71,7 @@ def test_cache_no_invalidation_modified_pkg(): @pytest.mark.parametrize("with_command", [True, False]) def test_cache_clear(with_command: bool): - with ProxyRunner.test(verbose=True) as runner, isolated_example_code_location_bar(runner): + with ProxyRunner.test(verbose=True) as runner, example_code_location(runner): result = runner.invoke("list", "component-types") assert_runner_result(result) assert "CACHE [miss]" in result.output @@ -74,19 +79,64 @@ def test_cache_clear(with_command: bool): if with_command: result = runner.invoke("--clear-cache", "list", "component-types") + assert "CACHE [clear-all]" in result.output else: result = runner.invoke("--clear-cache") assert_runner_result(result) + assert "CACHE [clear-all]" in result.output result = runner.invoke("list", "component-types") assert_runner_result(result) assert "CACHE [miss]" in result.output +def test_rebuild_component_registry_success(): + with ProxyRunner.test(verbose=True) as runner, example_code_location(runner): + result = runner.invoke("--rebuild-component-registry") + assert_runner_result(result) + + # Run it again and ensure it clears the previous entry + result = runner.invoke("--rebuild-component-registry") + assert_runner_result(result) + assert "CACHE [clear-key]" in result.output + + result = runner.invoke("list", "component-types") + assert_runner_result(result) + assert "CACHE [hit]" in result.output + + +def test_rebuild_component_registry_fails_outside_code_location(): + with ProxyRunner.test(verbose=True) as runner, runner.isolated_filesystem(): + result = runner.invoke("--rebuild-component-registry") + assert_runner_result(result, exit_0=False) + assert "This command must be run inside a Dagster code location" in result.output + + +def test_rebuild_component_registry_fails_with_subcommand(): + with ProxyRunner.test(verbose=True) as runner, example_code_location(runner): + result = runner.invoke("--rebuild-component-registry", "list", "component-types") + assert_runner_result(result, exit_0=False) + assert "Cannot specify --rebuild-component-registry with a subcommand." in result.output + + +def test_rebuild_component_registry_fails_with_clear_cache(): + with ProxyRunner.test(verbose=True) as runner, example_code_location(runner): + result = runner.invoke("--rebuild-component-registry", "--clear-cache") + assert_runner_result(result, exit_0=False) + assert "Cannot specify both --clear-cache and --rebuild-component-registry" in result.output + + +def test_rebuild_component_registry_fails_with_disabled_cache(): + with ProxyRunner.test(verbose=True) as runner, example_code_location(runner): + result = runner.invoke("--rebuild-component-registry", "--disable-cache") + assert_runner_result(result, exit_0=False) + assert "Cache is disabled" in result.output + + def test_cache_disabled(): with ( ProxyRunner.test(verbose=True, disable_cache=True) as runner, - isolated_example_code_location_bar(runner), + example_code_location(runner), ): result = runner.invoke("list", "component-types") assert_runner_result(result) diff --git a/python_modules/libraries/dagster-dg/dagster_dg_tests/utils.py b/python_modules/libraries/dagster-dg/dagster_dg_tests/utils.py index d3628b5e229ed..0d695570d5b87 100644 --- a/python_modules/libraries/dagster-dg/dagster_dg_tests/utils.py +++ b/python_modules/libraries/dagster-dg/dagster_dg_tests/utils.py @@ -23,7 +23,7 @@ def isolated_example_deployment_foo(runner: Union[CliRunner, "ProxyRunner"]) -> @contextmanager def isolated_example_code_location_bar( - runner: Union[CliRunner, "ProxyRunner"], in_deployment: bool = True + runner: Union[CliRunner, "ProxyRunner"], in_deployment: bool = True, skip_venv: bool = False ) -> Iterator[None]: runner = ProxyRunner(runner) if isinstance(runner, CliRunner) else runner dagster_git_repo_dir = str(discover_git_root(Path(__file__))) @@ -34,6 +34,7 @@ def isolated_example_code_location_bar( "code-location", "--use-editable-dagster", dagster_git_repo_dir, + *(["--skip-venv"] if skip_venv else []), "bar", ) with pushd("code_locations/bar"): @@ -45,6 +46,7 @@ def isolated_example_code_location_bar( "code-location", "--use-editable-dagster", dagster_git_repo_dir, + *(["--skip-venv"] if skip_venv else []), "bar", ) with pushd("bar"):