diff --git a/dbt_project.yml b/dbt_project.yml
index dd9a239a..4eb72ebe 100644
--- a/dbt_project.yml
+++ b/dbt_project.yml
@@ -90,3 +90,23 @@ vars:
 
   # -- Code complexity variables --
   comment_chars: ["--"]
+  token_costs: {
+    "and": 0.1,
+    "or": 0.1,
+    "when": 0.5,
+    "coalesce": 1,
+    "distinct": 1,
+    "greatest": 1,
+    "least": 1,
+    "group": 1,
+    "join": 1,
+    "order": 1,
+    "select": 1,
+    "where": 1,
+    "having": 2,
+    "flatten": 3,
+    "unnest": 3,
+    "pivot": 3,
+    "partition by": 3,
+    "qualify": 3,
+  }
diff --git a/docs/customization/overriding-variables.md b/docs/customization/overriding-variables.md
index e4e0aa30..41f64b25 100644
--- a/docs/customization/overriding-variables.md
+++ b/docs/customization/overriding-variables.md
@@ -92,6 +92,13 @@ vars:
   chained_views_threshold: 8
 ```
 
+## SQL code analysis
+
+| variable | description | default |
+| ----------- | ----------- | ----------- |
+| `comment_chars` | a list of strings used for inline comments | `["--"]` |
+| `token_costs` | a dictionary of SQL tokens (words) and associated complexity weight, used to estimate model complexity | see the `dbt_project.yml` file of the package |
+
 ## Execution
 
 | variable | description | default |
diff --git a/docs/customization/querying-columns.md b/docs/customization/querying-columns-names-and-descriptions.md
similarity index 74%
rename from docs/customization/querying-columns.md
rename to docs/customization/querying-columns-names-and-descriptions.md
index 31289d40..8ba64202 100644
--- a/docs/customization/querying-columns.md
+++ b/docs/customization/querying-columns-names-and-descriptions.md
@@ -1,6 +1,8 @@
-# Querying columns with SQL
+# Querying column names and descriptions with SQL
 
-The model `stg_columns` ([source](https://github.com/dbt-labs/dbt-project-evaluator/tree/main/models/staging/graph/stg_columns.sql)), created with the package, lists all the columns from all the dbt nodes (models, sources, tests, snapshots)
+The model `stg_columns` ([source](https://github.com/dbt-labs/dbt-project-evaluator/tree/main/models/staging/graph/stg_columns.sql)), created with the package, lists all the columns configured in all the dbt nodes (models, sources, tests, snapshots).
+
+It will not list the columns of the models that have not explicitly been added to the YAML files.
 
 You can use this model to help with questions such as:
 
diff --git a/docs/querying-the-dag.md b/docs/querying-the-dag.md
index 3865e5d6..0efa7b8f 100644
--- a/docs/querying-the-dag.md
+++ b/docs/querying-the-dag.md
@@ -17,6 +17,8 @@ Building additional models and snapshots on top of this model could allow:
 
 ## Getting insights on potential refactoring work
 
+- identifying models with a lot of lines of code
+- identifying the models with the highest level of complexity leveraging the column `sql_complexity` from the table `int_all_graph_resources`, based on the weights defined in the `token_costs` variable
 - looking at the longest "chains" of models in a project
 - reviewing models with many/few direct dependents
 - identifying potential bottlenecks
diff --git a/macros/calculate_number_lines.sql b/macros/calculate_number_lines.sql
new file mode 100644
index 00000000..493cd20b
--- /dev/null
+++ b/macros/calculate_number_lines.sql
@@ -0,0 +1,28 @@
+{# Entry point: dispatches to the adapter-specific implementation of calculate_number_lines. #}
+{% macro calculate_number_lines(node) %}
+    {{ return(adapter.dispatch('calculate_number_lines', 'dbt_project_evaluator')(node)) }}
+{% endmacro %}
+
+{#
+    Default implementation.
+    Returns the number of lines of the raw code of a model (number of newline characters + 1)
+    and 0 for any node whose resource_type is not 'model'.
+#}
+{% macro default__calculate_number_lines(node) %}
+
+    {% if node.resource_type == 'model' %}
+
+        {% if execute %}
+        {%- set model_raw_sql = node.raw_sql or node.raw_code -%}
+        {%- else -%}
+        {%- set model_raw_sql = '' -%}
+        {%- endif -%}
+
+        {# the "+ 1" has to be inside return(): return() exits the macro, so an addition placed after it is never applied #}
+        {{ return(model_raw_sql.count("\n") + 1) }}
+
+    {% endif %}
+
+    {{ return(0) }}
+
+{% endmacro %}
diff --git a/macros/calculate_sql_complexity.sql b/macros/calculate_sql_complexity.sql
new file mode 100644
index 00000000..0d0ffa1c
--- /dev/null
+++ b/macros/calculate_sql_complexity.sql
@@ -0,0 +1,49 @@
+{# Entry point: dispatches to the adapter-specific implementation of calculate_sql_complexity. #}
+{% macro calculate_sql_complexity(node) %}
+    {{ return(adapter.dispatch('calculate_sql_complexity', 'dbt_project_evaluator')(node)) }}
+{% endmacro %}
+
+{#
+    Default implementation.
+    For SQL models: removes the inline comments from the raw code, counts the occurrences of
+    each token of the `token_costs` variable and returns the sum of (occurrences * token cost).
+    Returns 0 for any other node.
+#}
+{% macro default__calculate_sql_complexity(node) %}
+
+    {% if node.resource_type == 'model' and node.language == 'sql' %}
+
+        {% if execute %}
+        {%- set model_raw_sql = node.raw_sql or node.raw_code -%}
+        {%- else -%}
+        {%- set model_raw_sql = '' -%}
+        {%- endif -%}
+
+        {%- set re = modules.re -%}
+        {%- set ns = namespace(complexity = 0) -%}
+
+        {# we remove the comments that start with -- , or other characters configured #}
+        {# the markers are escaped to be matched literally; with no marker configured, nothing is removed #}
+        {%- set escaped_comment_chars = [] -%}
+        {%- for comment_char in var('comment_chars') -%}
+            {%- do escaped_comment_chars.append(re.escape(comment_char)) -%}
+        {%- endfor -%}
+        {%- set comment_chars_match = "(" ~ escaped_comment_chars | join("|") ~ ").*" -%}
+        {%- set model_raw_sql_no_comments = re.sub(comment_chars_match, '', model_raw_sql) if escaped_comment_chars else model_raw_sql -%}
+
+        {%- for token, token_cost in var('token_costs').items() -%}
+
+            {# this is not 100% perfect but it checks more or less if the token exists as a word by itself or followed by "(" like for least()/greatest() #}
+            {%- set token_with_boundaries = "\\b" ~ token ~ "[\\t\\r\\n (]" -%}
+            {%- set all_regex_matches = re.findall(token_with_boundaries, model_raw_sql_no_comments, re.IGNORECASE) -%}
+            {%- set ns.complexity = ns.complexity + token_cost * (all_regex_matches | length) -%}
+
+        {%- endfor -%}
+
+        {{ return(ns.complexity) }}
+
+    {% endif %}
+
+    {{ return(0) }}
+
+{% endmacro %}
diff --git a/macros/unpack/get_node_values.sql b/macros/unpack/get_node_values.sql
index 558eb2ec..1bd5e340 100644
--- a/macros/unpack/get_node_values.sql
+++ b/macros/unpack/get_node_values.sql
@@ -11,6 +11,8 @@
     {%- for node in nodes_list -%}
 
         {%- set hard_coded_references = dbt_project_evaluator.find_all_hard_coded_references(node) -%}
+        {%- set number_lines = dbt_project_evaluator.calculate_number_lines(node) -%}
+        {%- set sql_complexity = dbt_project_evaluator.calculate_sql_complexity(node) -%}
         {%- set contract = node.contract.enforced if node.contract else false -%}
         {%- set exclude_node = dbt_project_evaluator.set_is_excluded(node, resource_type="node") -%}
 
@@ -40,6 +42,8 @@
                 "''" if not node.column_name else wrap_string_with_quotes(dbt.escape_single_quotes(node.column_name)),
                 wrap_string_with_quotes(node.meta | tojson),
                 wrap_string_with_quotes(dbt.escape_single_quotes(hard_coded_references)),
+                number_lines,
+                sql_complexity,
                 wrap_string_with_quotes(node.get('depends_on',{}).get('macros',[]) | tojson),
                 "cast(" ~ dbt_project_evaluator.is_not_empty_string(node.test_metadata) | trim ~ " as boolean)",
                 "cast(" ~ exclude_node ~ " as boolean)",
diff --git a/mkdocs.yml b/mkdocs.yml
index 4679af57..a9177b47 100644
--- a/mkdocs.yml
+++ b/mkdocs.yml
@@ -81,7 +81,7 @@ nav:
     - Configuring exceptions to the rules: customization/exceptions.md
     - Excluding packages and models/sources based on path: customization/excluding-packages-and-paths.md
     - Display issues in the logs: customization/issues-in-log.md
-    - Querying columns: customization/querying-columns.md
+    - Querying column names and descriptions: customization/querying-columns-names-and-descriptions.md
   - Run in CI Check: ci-check.md
   - Querying the DAG: querying-the-dag.md
   - Contributing: contributing.md
\ No newline at end of file
diff --git a/models/marts/core/int_all_graph_resources.sql b/models/marts/core/int_all_graph_resources.sql
index 6582b3eb..9ee98fba 100644
--- a/models/marts/core/int_all_graph_resources.sql
+++ b/models/marts/core/int_all_graph_resources.sql
@@ -112,6 +112,8 @@ joined as (
         unioned_with_calc.loader,
         unioned_with_calc.identifier,
         unioned_with_calc.hard_coded_references, -- NULL for non-model resources
+        unioned_with_calc.number_lines, -- NULL for non-model resources
+        unioned_with_calc.sql_complexity, -- NULL for non-model resources
         unioned_with_calc.is_excluded -- NULL for metrics and exposures
 
     from unioned_with_calc
diff --git a/models/staging/graph/stg_nodes.sql b/models/staging/graph/stg_nodes.sql
index 2cdd3489..71025190 100644
--- a/models/staging/graph/stg_nodes.sql
+++ b/models/staging/graph/stg_nodes.sql
@@ -39,6 +39,8 @@ select
     cast(null as {{ dbt.type_string() }}) as column_name,
     cast(null as {{ dbt.type_string() }}) as meta,
     cast(null as {{ dbt.type_string() }}) as hard_coded_references,
+    cast(null as {{ dbt.type_int() }}) as number_lines,
+    cast(null as {{ dbt.type_float() }}) as sql_complexity,
     cast(null as {{ dbt.type_string() }}) as macro_dependencies,
     cast(True as boolean) as is_generic_test,
     cast(True as boolean) as is_excluded