diff --git a/.github/ISSUE_TEMPLATE/bug.yml b/.github/ISSUE_TEMPLATE/bug.yml new file mode 100644 index 00000000..4978fafd --- /dev/null +++ b/.github/ISSUE_TEMPLATE/bug.yml @@ -0,0 +1,66 @@ +# See https://docs.github.com/en/communities/using-templates-to-encourage-useful-issues-and-pull-requests/syntax-for-issue-forms +# and https://docs.github.com/en/communities/using-templates-to-encourage-useful-issues-and-pull-requests/syntax-for-githubs-form-schema +name: Bug Report +description: Something is not working with Tempo +title: "[BUG]: " +labels: ["bug", "needs-triage"] +projects: ["databrickslabs/12"] +body: + - type: checkboxes + attributes: + label: Is there an existing issue for this? + description: Please search to see if an issue already exists for the bug you encountered. + options: + - label: I have searched the existing issues + required: true + - type: textarea + attributes: + label: Current Behavior + description: | + A concise description of what you're experiencing. + **Do not paste links to attachments with logs and/or images, as all issues with attachments will get deleted.** + Use the `Relevant log output` field to paste redacted log output without personal identifying information (PII). + You can Ctrl/Cmd+V the screenshot, which would appear as a rendered image if it doesn't contain any PII. + validations: + required: false + - type: textarea + attributes: + label: Expected Behavior + description: A concise description of what you expected to happen. + validations: + required: false + - type: textarea + attributes: + label: Steps To Reproduce + description: Steps to reproduce the behavior. + placeholder: | + 1. In this environment... + 1. With this config... + 1. Run '...' + 1. See error... + validations: + required: false + - type: dropdown + id: cloud + attributes: + label: Cloud + description: What cloud are you using? 
+ options: + - AWS + - Azure + - GCP + validations: + required: true + - type: textarea + id: version + attributes: + label: Version + description: What version of our software are you running? + validations: + required: true + - type: textarea + id: logs + attributes: + label: Relevant log output + description: Please copy and paste any relevant log output. This will be automatically formatted into code, so no need for backticks. + render: shell \ No newline at end of file diff --git a/.github/ISSUE_TEMPLATE/config.yml b/.github/ISSUE_TEMPLATE/config.yml new file mode 100644 index 00000000..41af3259 --- /dev/null +++ b/.github/ISSUE_TEMPLATE/config.yml @@ -0,0 +1,9 @@ +blank_issues_enabled: false +contact_links: + - name: General Databricks questions + url: https://help.databricks.com/ + about: Issues related to Databricks and not related to Tempo + + - name: Tempo Documentation + url: https://databrickslabs.github.io/tempo/ + about: Documentation about Tempo \ No newline at end of file diff --git a/.github/ISSUE_TEMPLATE/feature.yml b/.github/ISSUE_TEMPLATE/feature.yml new file mode 100644 index 00000000..7dcc0600 --- /dev/null +++ b/.github/ISSUE_TEMPLATE/feature.yml @@ -0,0 +1,33 @@ +# See https://docs.github.com/en/communities/using-templates-to-encourage-useful-issues-and-pull-requests/syntax-for-issue-forms +# and https://docs.github.com/en/communities/using-templates-to-encourage-useful-issues-and-pull-requests/syntax-for-githubs-form-schema +name: Feature Request +description: Something new needs to happen with Tempo +title: "[FEATURE]: " +labels: ["enhancement", "needs-triage"] +projects: ["databrickslabs/13"] +body: + - type: checkboxes + attributes: + label: Is there an existing issue for this? 
+ description: Please search to see if an issue already exists for the feature request you're willing to submit + options: + - label: I have searched the existing issues + required: true + - type: textarea + attributes: + label: Problem statement + description: A clear and concise description of what the problem is. Ex. I'm always frustrated when [...] + validations: + required: true + - type: textarea + attributes: + label: Proposed Solution + description: A clear and concise description of what you want to happen. + validations: + required: true + - type: textarea + attributes: + label: Additional Context + description: Add any other context, references or screenshots about the feature request here. + validations: + required: false diff --git a/.github/pull_request_template.md b/.github/pull_request_template.md new file mode 100644 index 00000000..504a1aeb --- /dev/null +++ b/.github/pull_request_template.md @@ -0,0 +1,29 @@ +## Changes + + +### Linked issues + + +Resolves #.. + +### Functionality + +- [ ] added relevant user documentation +- [ ] added a new Class method +- [ ] modified existing Class method: `...` +- [ ] added a new function +- [ ] modified existing function: `...` +- [ ] added a new test +- [ ] modified existing test: `...` +- [ ] added a new example +- [ ] modified existing example: `...` +- [ ] added a new utility +- [ ] modified existing utility: `...` + +### Tests + + +- [ ] manually tested +- [ ] added unit tests +- [ ] added integration tests +- [ ] verified on staging environment (screenshot attached) \ No newline at end of file diff --git a/.github/workflows/build-release.yml b/.github/workflows/build-release.yml new file mode 100644 index 00000000..877ecfd7 --- /dev/null +++ b/.github/workflows/build-release.yml @@ -0,0 +1,81 @@ +name: build-release + +on: + pull_request: + types: [opened, synchronize] + push: + branches: ['master'] + # Tag pushes must also trigger this workflow: the PyPI publish and docs deploy + # steps are gated on `startsWith(github.ref, 'refs/tags/v')`. + tags: ['v*'] + +jobs: + release: + runs-on: ubuntu-latest + environment: release + permissions: + # Used to 
authenticate to PyPI via OIDC and sign the release's artifacts with sigstore-python. + id-token: write + # Used to attach signing artifacts to the published release. + contents: write + + steps: + - uses: actions/checkout@v4 + with: + fetch-depth: 0 + fetch-tags: true + + - name: Set up Python + uses: actions/setup-python@v5 + with: + python-version: '3.10' + + - name: Install dependencies + run: | + python -m pip install --upgrade pip + python -m pip install tox + + - name: Build dist + working-directory: ./python + run: tox -e build-dist + + - name: Publish a Python distribution to PyPI + uses: pypa/gh-action-pypi-publish@release/v1 + if: startsWith(github.ref, 'refs/tags/v') + with: + packages-dir: python/dist/ + + docs: + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v4 + with: + fetch-depth: 0 + fetch-tags: true + + - name: Set up Python + uses: actions/setup-python@v5 + with: + python-version: '3.9' + + - name: Install dependencies + run: | + python -m pip install --upgrade pip + python -m pip install tox + + - name: Build docs + working-directory: ./python + run: tox -e build-docs + + - name: Upload artifacts + uses: actions/upload-artifact@v4 + with: + name: html-docs + path: docs/_build/html/ + + - name: Deploy 🚀 + uses: peaceiris/actions-gh-pages@v3 + if: startsWith(github.ref, 'refs/tags/v') + with: + github_token: ${{ secrets.GITHUB_TOKEN }} + publish_dir: docs/_build/html diff --git a/.github/workflows/codeql-analysis.yml b/.github/workflows/codeql-analysis.yml deleted file mode 100644 index ad042902..00000000 --- a/.github/workflows/codeql-analysis.yml +++ /dev/null @@ -1,70 +0,0 @@ -# For most projects, this workflow file will not need changing; you simply need -# to commit it to your repository. -# -# You may wish to alter this file to override the set of languages analyzed, -# or to provide custom queries or build logic. -# -# ******** NOTE ******** -# We have attempted to detect the languages in your repository. 
Please check -# the `language` matrix defined below to confirm you have the correct set of -# supported CodeQL languages. -# -name: "CodeQL" - -on: - push: - branches: [ master ] - pull_request: - # The branches below must be a subset of the branches above - branches: [ master ] - schedule: - - cron: '21 18 * * 3' - -jobs: - analyze: - name: Analyze - runs-on: ubuntu-latest - permissions: - actions: read - contents: read - security-events: write - - strategy: - fail-fast: false - matrix: - language: [ 'python' ] - # CodeQL supports [ 'cpp', 'csharp', 'go', 'java', 'javascript', 'python', 'ruby' ] - # Learn more about CodeQL language support at https://git.io/codeql-language-support - - steps: - - name: Checkout repository - uses: actions/checkout@v2 - - # Initializes the CodeQL tools for scanning. - - name: Initialize CodeQL - uses: github/codeql-action/init@v1 - with: - languages: ${{ matrix.language }} - # If you wish to specify custom queries, you can do so here or in a config file. - # By default, queries listed here will override any specified in a config file. - # Prefix the list here with "+" to use these queries and those in the config file. - # queries: ./path/to/local/query, your-org/your-repo/queries@main - - # Autobuild attempts to build any compiled languages (C/C++, C#, or Java). - # If this step fails, then you should remove it and run the build manually (see below) - - name: Autobuild - uses: github/codeql-action/autobuild@v1 - - # โ„น๏ธ Command-line programs to run using the OS shell. 
- # ๐Ÿ“š https://git.io/JvXDl - - # โœ๏ธ If the Autobuild fails above, remove it and uncomment the following three lines - # and modify them (or add more) to build your code if your project - # uses a compiled language - - #- run: | - # make bootstrap - # make release - - - name: Perform CodeQL Analysis - uses: github/codeql-action/analyze@v1 diff --git a/.github/workflows/docs.yml b/.github/workflows/docs.yml deleted file mode 100644 index a4158c26..00000000 --- a/.github/workflows/docs.yml +++ /dev/null @@ -1,42 +0,0 @@ - -name: docs - -on: - push: - tags: - - 'v*' # only release a versioned tag, such as v.X.Y.Z - -jobs: - build-docs: - runs-on: ${{ matrix.os }} - strategy: - matrix: - os: [ ubuntu-latest ] - env: - OS: ${{ matrix.os }} - PYTHON: '3.9' - steps: - - name: Checkout Code - uses: actions/checkout@v3 - with: - fetch-depth: '0' - - name: Copy Requirements - uses: canastro/copy-file-action@master - with: - source: "python/requirements.txt" - target: "docs/requirements.txt" - - name: Build HTML - uses: ammaraskar/sphinx-action@0.4 - with: - pre-build-command: "apt-get update -y && apt-get install -y git && git config --global --add safe.directory /github/workspace" - - name: Upload artifacts - uses: actions/upload-artifact@v1 - with: - name: html-docs - path: docs/_build/html/ - - name: Deploy ๐Ÿš€ - uses: peaceiris/actions-gh-pages@v3 - if: $${{ github.ref }} == 'refs/heads/master' - with: - github_token: ${{ secrets.GITHUB_TOKEN }} - publish_dir: docs/_build/html diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml deleted file mode 100644 index f374d26f..00000000 --- a/.github/workflows/release.yml +++ /dev/null @@ -1,48 +0,0 @@ -name: release - -on: - push: - tags: - - 'v*' # only release a versioned tag, such as v.X.Y.Z - -jobs: - release: - runs-on: ubuntu-latest - environment: release - permissions: - # Used to authenticate to PyPI via OIDC and sign the release's artifacts with sigstore-python. 
- id-token: write - # Used to attach signing artifacts to the published release. - contents: write - - steps: - - uses: actions/checkout@v1 - - - name: Set up Python - uses: actions/setup-python@v1 - with: - python-version: 3.9 - - - uses: actions/cache@v2 - id: cache - with: - path: ~/.cache/pip - key: ${{ runner.os }}-pip-${{ hashFiles('**/requirements.txt') }} - restore-keys: | - ${{ runner.os }}-pip-${{ hashFiles('**/requirements.txt') }} - ${{ runner.os }}-pip- - - name: Install pip - run: python -m pip install --upgrade pip - - - name: Install dependencies - working-directory: ./python - run: pip install -U -r requirements.txt - - - name: Build dist - working-directory: ./python - run: python setup.py clean bdist_wheel - - - name: Publish a Python distribution to PyPI - uses: pypa/gh-action-pypi-publish@release/v1 - with: - packages_dir: python/dist/ diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index 27d2d8a3..6c151e5c 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -1,78 +1,108 @@ -name: build +name: test on: - push: - branches: [ '*' ] pull_request: - branches: [ 'master' ] + types: [opened, synchronize] + push: + branches: ['master'] + workflow_dispatch: jobs: - black-linting: - runs-on: ubuntu-latest - name: Black Lint - steps: - - uses: actions/checkout@v2 - - uses: psf/black@stable - with: - options: "--check --verbose" - src: "./python" - version: "23.3.0" - flake8-lint: + lint-and-check: runs-on: ubuntu-latest - name: Flake8 Lint steps: - - name: Check out source repository - uses: actions/checkout@v2 - - name: Set up Python environment - uses: actions/setup-python@v2 - with: - python-version: "3.9" - - name: flake8 Lint - uses: py-actions/flake8@v2 - with: - args: "--config python/.flake8" - path: "./python" - type-checks: + - uses: actions/checkout@v4 + with: + fetch-depth: 0 + fetch-tags: true + - name: Set up Python + uses: actions/setup-python@v5 + with: + python-version: '3.10' + - name: 
Install dependencies + run: | + python -m pip install --upgrade pip + python -m pip install tox tox-gh-actions + - name: Execute tox envs + working-directory: ./python + run: tox -e lint -- --check --diff + - name: Execute tox envs + working-directory: ./python + run: tox -e type-check + + analyze: runs-on: ubuntu-latest - name: Type Checks + permissions: + actions: read + contents: read + security-events: write + strategy: + fail-fast: false + matrix: + language: [ 'python' ] + # CodeQL supports [ 'cpp', 'csharp', 'go', 'java', 'javascript', 'python', 'ruby' ] + # Learn more about CodeQL language support at https://git.io/codeql-language-support steps: - - uses: actions/checkout@v2 - - uses: actions/setup-python@v2 - with: - python-version: "3.9" - - name: Type check - working-directory: ./python - run: | - pip install tox - tox -e type-check + - name: Checkout repository + uses: actions/checkout@v4 + # Initializes the CodeQL tools for scanning. + - name: Initialize CodeQL + uses: github/codeql-action/init@v3 + with: + languages: ${{ matrix.language }} + # If you wish to specify custom queries, you can do so here or in a config file. + # By default, queries listed here will override any specified in a config file. + # Prefix the list here with "+" to use these queries and those in the config file. + # queries: ./path/to/local/query, your-org/your-repo/queries@main + # Autobuild attempts to build any compiled languages (C/C++, C#, or Java). + # If this step fails, then you should remove it and run the build manually (see below) + - name: Autobuild + uses: github/codeql-action/autobuild@v3 + # โ„น๏ธ Command-line programs to run using the OS shell. 
+ # ๐Ÿ“š https://git.io/JvXDl + # โœ๏ธ If the Autobuild fails above, remove it and uncomment the following three lines + # and modify them (or add more) to build your code if your project + # uses a compiled language + #- run: | + # make bootstrap + # make release + - name: Perform CodeQL Analysis + uses: github/codeql-action/analyze@v3 + test: - name: Build and Test Module - runs-on: ${{ matrix.os }} + needs: lint-and-check + runs-on: ubuntu-latest strategy: matrix: - os: [ubuntu-latest] - env: - OS: ${{ matrix.os }} - PYTHON: '3.9' + config: + - py: '3.9' + dbr: dbr113 + - py: '3.9' + dbr: dbr122 + - py: '3.10' + dbr: dbr133 + - py: '3.10' + dbr: dbr143 + fail-fast: false steps: - - uses: actions/checkout@master - - name: Setup Python - uses: actions/setup-python@master + - uses: actions/checkout@v4 with: - python-version: 3.9 - - name: Set Spark env + fetch-depth: 0 + fetch-tags: true + - name: Set up Python ${{ matrix.config.py }} + uses: actions/setup-python@v5 + with: + python-version: ${{ matrix.config.py }} + - name: Install dependencies run: | - export SPARK_LOCAL_IP=127.0.0.1 - export SPARK_SUBMIT_OPTS="--illegal-access=permit -Dio.netty.tryReflectionSetAccessible=true" - - name: Generate coverage report + python -m pip install --upgrade pip + python -m pip install tox + - name: Execute tox envs working-directory: ./python - run: | - python -I -m pip install 'coverage<8,>=7' pyspark==3.2.1 -r requirements.txt - coverage run -m unittest discover -s tests -p '*_tests.py' - coverage combine - coverage xml + run: tox -e ${{ matrix.config.dbr }},coverage-report - name: Publish test coverage - uses: codecov/codecov-action@v3 + uses: codecov/codecov-action@v4 with: fail_ci_if_error: true files: ./python/coverage.xml + token: ${{ secrets.CODECOV_TOKEN }} \ No newline at end of file diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 04bf3428..86fd4a69 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -12,18 +12,18 @@ Be sure to carefully follow the 
instructions to configure your shell environment Use `pyenv` to install the following Python versions for testing. ```bash -pyenv install 3.7 3.8 3.9 +pyenv install 3.8 3.9 3.10 ``` You will probably want to set one of these versions as your global Python version. This will be the version of Python that is used when you run `python` commands in your terminal. For example, to set Python 3.9 as your global Python version, run the following command: ```bash -pyenv global 3.9 +pyenv global 3.10 ``` Within the `tempo/python` folder, run the below command to create a `.python-version` file that will tell `pyenv` which Python version to use when running commands in this directory: ```bash -pyenv local 3.7 3.8 3.9 +pyenv local 3.8 3.9 3.10 ``` This allows `tox` to create virtual environments using any of the Python versions listed in the `.python-version` file. @@ -43,12 +43,9 @@ Run the following command in your terminal to create a virtual environment in th tox --devenv .venv -e {environment-name} ``` The `—devenv` flag tells `tox` to create a development environment, and `.venv` is the folder where the virtual environment will be created. -Pre-defined environments can be found within the `tox.ini` file for different Python versions and their corresponding PySpark version. They include: -- py37-pyspark300 -- py38-pyspark312 -- py38-pyspark321 -- py39-pyspark330 -- py39-pyspark332 + +## Environments we test +The environments we test against are defined within the `tox.ini` file, and the requirements for those environments are stored in `python/requirements`. The makeup of these environments is inspired by the [Databricks Runtime](https://docs.databricks.com/en/release-notes/runtime/index.html#) (hence the naming convention), but it's important to note that developing on Databricks is **not** a requirement. 
We're simply mimicking some of the different runtime versions because (a) we recognize that much of the user base uses `tempo` on Databricks and (b) it saves development time spent trying to build out test environments with different versions of Python and PySpark from scratch. ## Run tests locally for one or more environments You can run tests locally for one or more environments defined enviornments without setting up a development environment first. @@ -67,9 +64,10 @@ This will run tests for all listed environments. ### Run additional checks locally `tox` has special environments for additional checks that must be performed as part of the PR process. These include formatting, linting, type checking, etc. These environments are also defined in the `tox.ini`file and skip installing dependencies listed in the `requirements.txt` file and building the distribution when those are not required . They can be specified using the `-e` flag: -* format * lint * type-check +* build-dist +* build-docs * coverage-report # Code style & Standards diff --git a/docs/requirements.txt b/docs/requirements.txt new file mode 100644 index 00000000..b96f09f0 --- /dev/null +++ b/docs/requirements.txt @@ -0,0 +1,6 @@ +sphinx-autobuild +sphinx-copybutton +sphinx +sphinx-design +sphinx-panels +furo \ No newline at end of file diff --git a/python/install_cmd.txt b/python/install_cmd.txt new file mode 100644 index 00000000..057e3233 --- /dev/null +++ b/python/install_cmd.txt @@ -0,0 +1 @@ +pip install --no-binary pyarrow,pandas,scipy {opts} {packages} diff --git a/python/pyproject.toml b/python/pyproject.toml index d7a6d464..0de78f8f 100644 --- a/python/pyproject.toml +++ b/python/pyproject.toml @@ -1,2 +1,7 @@ [build-system] -requires = ["semver"] # PEP 518 - what is required to build +# PEP 518 - what is required to build this project +requires = [ + "semver>=3,<4", + "setuptools>=69,<70", + "wheel>=0.37,<1", +] diff --git a/python/requirements.txt b/python/requirements.txt deleted file mode 
100644 index 1a6844a9..00000000 --- a/python/requirements.txt +++ /dev/null @@ -1,19 +0,0 @@ -ipython==8.10.0 -numpy==1.24.3 -chispa==0.9.2 -pandas==1.5.2 -pyarrow==12.0.0 -python-dateutil==2.8.2 -pytz==2022.7.1 -scipy==1.10.1 -six==1.16.0 -wheel==0.38.4 -semver==2.13.0 -sphinx-autobuild==2021.3.14 -furo==2022.9.29 -sphinx-copybutton==0.5.1 -Sphinx==4.5.0 -sphinx-design==0.2.0 -sphinx-panels==0.6.0 -jsonref==1.1.0 -python-dateutil==2.8.2 diff --git a/python/requirements/dbr113.txt b/python/requirements/dbr113.txt new file mode 100644 index 00000000..a12535ff --- /dev/null +++ b/python/requirements/dbr113.txt @@ -0,0 +1,7 @@ +delta-spark~=2.1.0 +ipython~=7.32.0 +numpy~=1.20.3 +pandas~=1.3.4 +pyarrow~=7.0.0 +pyspark~=3.3.0 +scipy~=1.7.1 \ No newline at end of file diff --git a/python/requirements/dbr122.txt b/python/requirements/dbr122.txt new file mode 100644 index 00000000..73bd3071 --- /dev/null +++ b/python/requirements/dbr122.txt @@ -0,0 +1,7 @@ +delta-spark~=2.2.0 +ipython~=8.5.0 +numpy~=1.21.5 +pandas~=1.4.2 +pyarrow~=7.0.0 +pyspark~=3.3.2 +scipy~=1.7.3 \ No newline at end of file diff --git a/python/requirements/dbr133.txt b/python/requirements/dbr133.txt new file mode 100644 index 00000000..6eb67e61 --- /dev/null +++ b/python/requirements/dbr133.txt @@ -0,0 +1,7 @@ +delta-spark~=2.4.0 +ipython~=8.10.0 +numpy~=1.21.5 +pandas~=1.4.4 +pyarrow~=8.0.0 +pyspark~=3.4.1 +scipy~=1.9.1 \ No newline at end of file diff --git a/python/requirements/dbr143.txt b/python/requirements/dbr143.txt new file mode 100644 index 00000000..165cc0c7 --- /dev/null +++ b/python/requirements/dbr143.txt @@ -0,0 +1,7 @@ +delta-spark~=3.1.0 +ipython~=8.14.0 +numpy~=1.23.5 +pandas~=1.5.3 +pyarrow~=8.0.0 +pyspark~=3.5.0 +scipy~=1.10.0 \ No newline at end of file diff --git a/python/requirements/dev.txt b/python/requirements/dev.txt new file mode 100644 index 00000000..2fbed1d1 --- /dev/null +++ b/python/requirements/dev.txt @@ -0,0 +1,6 @@ +pip>=23,<24 +chispa>=0.10,<1 +coverage>=7,<8 
+jsonref>=1,<2 +packaging>=24,<25 +python-dateutil>=2,<3 \ No newline at end of file diff --git a/python/setup.py b/python/setup.py index a2d6d6a4..8ac0c757 100644 --- a/python/setup.py +++ b/python/setup.py @@ -21,7 +21,6 @@ long_description_content_type="text/markdown", url="https://databrickslabs.github.io/tempo/", packages=find_packages(where=".", include=["tempo"]), - install_requires=["ipython", "pandas", "scipy"], extras_require=dict(tests=["pytest"]), classifiers=[ "Programming Language :: Python :: 3", diff --git a/python/tempo/io.py b/python/tempo/io.py index f3466ef5..22fe4d8e 100644 --- a/python/tempo/io.py +++ b/python/tempo/io.py @@ -1,16 +1,14 @@ from __future__ import annotations import logging -import os from collections import deque from typing import Optional import pyspark.sql.functions as sfn +import tempo.tsdf as t_tsdf from pyspark.sql import SparkSession from pyspark.sql.utils import ParseException -import tempo.tsdf as t_tsdf - logger = logging.getLogger(__name__) @@ -31,12 +29,6 @@ def write( df = tsdf.df ts_col = tsdf.ts_col partitionCols = tsdf.partitionCols - if optimizationCols: - optimizationCols = optimizationCols + ["event_time"] - else: - optimizationCols = ["event_time"] - - useDeltaOpt = os.getenv("DATABRICKS_RUNTIME_VERSION") is not None view_df = df.withColumn("event_dt", sfn.to_date(sfn.col(ts_col))).withColumn( "event_time", @@ -52,11 +44,12 @@ def write( tabName ) - if useDeltaOpt: + if optimizationCols: try: spark.sql( "optimize {} zorder by {}".format( - tabName, "(" + ",".join(partitionCols + optimizationCols) + ")" + tabName, + "(" + ",".join(partitionCols + optimizationCols + [ts_col]) + ")", ) ) except ParseException as e: @@ -65,8 +58,3 @@ def write( e ) ) - else: - logger.warning( - "Delta optimizations attempted on a non-Databricks platform. " - "Switch to use Databricks Runtime to get optimization advantages." 
- ) diff --git a/python/tempo/tsdf.py b/python/tempo/tsdf.py index 8ae1d44f..f5c0a86f 100644 --- a/python/tempo/tsdf.py +++ b/python/tempo/tsdf.py @@ -8,8 +8,8 @@ import numpy as np import pandas as pd import pyspark.sql.functions as sfn -from IPython.core.display import HTML -from IPython.display import display as ipydisplay +from IPython.core.display import HTML # type: ignore +from IPython.display import display as ipydisplay # type: ignore from pyspark.sql import SparkSession from pyspark.sql.column import Column from pyspark.sql.dataframe import DataFrame @@ -65,9 +65,13 @@ def __init__( # Timestamp string matching then do some pattern matching to extract # the time stamp. if isinstance(df.schema[ts_col].dataType, StringType): # pragma: no cover - sample_ts = df.limit(1).collect()[0][0] + sample_ts = df.select(ts_col).limit(1).collect()[0][0] self.__validate_ts_string(sample_ts) - self.df = self.__add_double_ts().withColumnRenamed("double_ts", self.ts_col) + self.df = ( + self.__add_double_ts() + .drop(self.ts_col) + .withColumnRenamed("double_ts", self.ts_col) + ) """ Make sure DF is ordered by its respective ts_col and partition columns. @@ -77,6 +81,51 @@ def __init__( # Helper functions # + @staticmethod + def parse_nanos_timestamp( + df: DataFrame, + str_ts_col: str, + ts_fmt: str = "yyyy-MM-dd HH:mm:ss", + double_ts_col: Optional[str] = None, + parsed_ts_col: Optional[str] = None, + ) -> DataFrame: + """ + Parse a string timestamp column with nanosecond precision into a double timestamp column. 
+ + :param df: DataFrame containing the string timestamp column + :param str_ts_col: Name of the string timestamp column + :param ts_fmt: Format of the string timestamp column (default: "yyyy-MM-dd HH:mm:ss") + :param double_ts_col: Name of the double timestamp column to create, if None + the source string column will be overwritten + :param parsed_ts_col: Name of the parsed timestamp column to create, if None + no parsed timestamp column will be kept + + :return: DataFrame with the double timestamp column (plus helper columns `nanos`, `long_ts`) + """ + + # add a parsed timestamp column if requested + src_df = ( + df.withColumn(parsed_ts_col, sfn.to_timestamp(sfn.col(str_ts_col), ts_fmt)) + if parsed_ts_col + else df + ) + + return ( + src_df.withColumn( + "nanos", + sfn.when( + sfn.col(str_ts_col).contains("."), + sfn.concat(sfn.lit("0."), sfn.split(sfn.col(str_ts_col), r"\.")[1]), + ) + .otherwise(0) + .cast("double"), + ) + .withColumn("long_ts", sfn.unix_timestamp(str_ts_col, ts_fmt)) + .withColumn( + (double_ts_col or str_ts_col), sfn.col("long_ts") + sfn.col("nanos") + ) + ) + def __add_double_ts(self) -> DataFrame: """Add a double (epoch) version of the string timestamp out to nanos""" return ( @@ -338,15 +387,12 @@ def __getTimePartitions(self, tsPartitionVal: int, fraction: float = 0.1) -> "TS def select(self, *cols: Union[str, List[str]]) -> "TSDF": """ pyspark.sql.DataFrame.select() method's equivalent for TSDF objects - Parameters - ---------- - cols : str or list of strs - column names (string). - If one of the column names is '*', that column is expanded to include all columns - in the current :class:`TSDF`. - - Examples - -------- + + :param cols: str or list of strs column names (string). If one of the column names is '*', that + column is expanded to include all columns in the current :class:`TSDF`. + + ## Examples + .. 
code-block:: python tsdf.select('*').collect() [Row(age=2, name='Alice'), Row(age=5, name='Bob')] tsdf.select('name', 'age').collect() @@ -533,23 +579,22 @@ def show( """ pyspark.sql.DataFrame.show() method's equivalent for TSDF objects - Parameters - ---------- - n : int, optional - Number of rows to show. - truncate : bool or int, optional - If set to ``True``, truncate strings longer than 20 chars by default. - If set to a number greater than one, truncates long strings to length ``truncate`` + :param n: Number of rows to show. (default: 20) + :param truncate: If set to True, truncate strings longer than 20 chars by default. + If set to a number greater than one, truncates long strings to length truncate and align cells right. - vertical : bool, optional - If set to ``True``, print output rows vertically (one line - per column value). + :param vertical: If set to True, print output rows vertically (one line per column value). - Example to show usage - --------------------- + ## Example to show usage: + .. 
code-block:: python from pyspark.sql.functions import * - phone_accel_df = spark.read.format("csv").option("header", "true").load("dbfs:/home/tempo/Phones_accelerometer").withColumn("event_ts", (col("Arrival_Time").cast("double")/1000).cast("timestamp")).withColumn("x", col("x").cast("double")).withColumn("y", col("y").cast("double")).withColumn("z", col("z").cast("double")).withColumn("event_ts_dbl", col("event_ts").cast("double")) + phone_accel_df = spark.read.format("csv").option("header", "true").load("dbfs:/home/tempo/Phones_accelerometer") \n + .withColumn("event_ts", (col("Arrival_Time").cast("double")/1000).cast("timestamp")) \n + .withColumn("x", col("x").cast("double")) \n + .withColumn("y", col("y").cast("double")) \n + .withColumn("z", col("z").cast("double")) \n + .withColumn("event_ts_dbl", col("event_ts").cast("double")) from tempo import * @@ -557,7 +602,6 @@ def show( # Call show method here phone_accel_tsdf.show() - """ # validate k <= n if k > n: diff --git a/python/tempo/utils.py b/python/tempo/utils.py index d539da1b..a45a52e5 100644 --- a/python/tempo/utils.py +++ b/python/tempo/utils.py @@ -5,16 +5,14 @@ import warnings from typing import List, Optional, Union, overload -from IPython import get_ipython -from IPython.core.display import HTML -from IPython.display import display as ipydisplay -from pandas.core.frame import DataFrame as pandasDataFrame - import pyspark.sql.functions as sfn -from pyspark.sql.dataframe import DataFrame - import tempo.resample as t_resample import tempo.tsdf as t_tsdf +from IPython import get_ipython # type: ignore +from IPython.core.display import HTML # type: ignore +from IPython.display import display as ipydisplay # type: ignore +from pandas.core.frame import DataFrame as pandasDataFrame +from pyspark.sql.dataframe import DataFrame logger = logging.getLogger(__name__) IS_DATABRICKS = "DB_HOME" in os.environ.keys() @@ -139,13 +137,11 @@ def calculate_time_horizon( @overload -def display_html(df: pandasDataFrame) 
-> None: - ... +def display_html(df: pandasDataFrame) -> None: ... @overload -def display_html(df: DataFrame) -> None: - ... +def display_html(df: DataFrame) -> None: ... def display_html(df: Union[pandasDataFrame, DataFrame]) -> None: @@ -179,6 +175,61 @@ def get_display_df(tsdf: t_tsdf.TSDF, k: int) -> DataFrame: return tsdf.latest(k).df.orderBy(orderCols) +@overload +def display_improvised(obj: t_tsdf.TSDF) -> None: ... + + +@overload +def display_improvised(obj: pandasDataFrame) -> None: ... + + +@overload +def display_improvised(obj: DataFrame) -> None: ... + + +def display_improvised(obj: Union[t_tsdf.TSDF, pandasDataFrame, DataFrame]) -> None: + """ + Display ``obj`` through the ``method`` callable. + + A TSDF is first converted with ``get_display_df(obj, k=5)``; any other + object is passed to ``method`` unchanged. + + NOTE(review): ``method`` is not defined in this function; presumably it is + bound at module level in the Databricks branch further down -- confirm. + """ + if isinstance(obj, t_tsdf.TSDF): + method(get_display_df(obj, k=5)) + else: + method(obj) + + +@overload +def display_html_improvised(obj: Optional[t_tsdf.TSDF]) -> None: ... + + +@overload +def display_html_improvised(obj: Optional[pandasDataFrame]) -> None: ... + + +@overload +def display_html_improvised(obj: Optional[DataFrame]) -> None: ... + + +def display_html_improvised( + obj: Union[t_tsdf.TSDF, pandasDataFrame, DataFrame] +) -> None: + """ + Render ``obj`` with ``display_html``. + + A TSDF is first converted with ``get_display_df(obj, k=5)``; any other + object is passed to ``display_html`` unchanged. + """ + if isinstance(obj, t_tsdf.TSDF): + display_html(get_display_df(obj, k=5)) + else: + display_html(obj) + + ENV_CAN_RENDER_HTML = _is_capable_of_html_rendering() if ( @@ -191,48 +242,10 @@ def get_display_df(tsdf: t_tsdf.TSDF, k: int) -> DataFrame: # Under 'display' key in user_ns the original databricks display method is present # to know more refer: /databricks/python_shell/scripts/db_ipykernel_launcher.py - @overload - def display_improvised(obj: t_tsdf.TSDF) -> None: - ... - - @overload - def display_improvised(obj: pandasDataFrame) -> None: - ... - - @overload - def display_improvised(obj: DataFrame) -> None: - ... 
- - def display_improvised(obj: Union[t_tsdf.TSDF, pandasDataFrame, DataFrame]) -> None: - if isinstance(obj, t_tsdf.TSDF): - method(get_display_df(obj, k=5)) - else: - method(obj) - display = display_improvised elif ENV_CAN_RENDER_HTML: - @overload - def display_html_improvised(obj: Optional[t_tsdf.TSDF]) -> None: - ... - - @overload - def display_html_improvised(obj: Optional[pandasDataFrame]) -> None: - ... - - @overload - def display_html_improvised(obj: Optional[DataFrame]) -> None: - ... - - def display_html_improvised( - obj: Union[t_tsdf.TSDF, pandasDataFrame, DataFrame] - ) -> None: - if isinstance(obj, t_tsdf.TSDF): - display_html(get_display_df(obj, k=5)) - else: - display_html(obj) - display = display_html_improvised else: diff --git a/python/tests/as_of_join_tests.py b/python/tests/as_of_join_tests.py index 0b02c866..7a8a5165 100644 --- a/python/tests/as_of_join_tests.py +++ b/python/tests/as_of_join_tests.py @@ -6,13 +6,13 @@ class AsOfJoinTest(SparkTest): def test_asof_join(self): - """AS-OF Join with out a time-partition test""" + """AS-OF Join without a time-partition test""" # Construct dataframes - tsdf_left = self.get_data_as_tsdf("left") - tsdf_right = self.get_data_as_tsdf("right") - dfExpected = self.get_data_as_sdf("expected") - noRightPrefixdfExpected = self.get_data_as_sdf("expected_no_right_prefix") + tsdf_left = self.get_test_df_builder("left").as_tsdf() + tsdf_right = self.get_test_df_builder("right").as_tsdf() + df_expected = self.get_test_df_builder("expected").as_sdf() + no_right_prefixdf_expected = self.get_test_df_builder("expected_no_right_prefix").as_sdf() # perform the join joined_df = tsdf_left.asofJoin( @@ -23,24 +23,24 @@ def test_asof_join(self): ).df # joined dataframe should equal the expected dataframe - self.assertDataFrameEquality(joined_df, dfExpected) - self.assertDataFrameEquality(non_prefix_joined_df, noRightPrefixdfExpected) + self.assertDataFrameEquality(joined_df, df_expected) + 
self.assertDataFrameEquality(non_prefix_joined_df, no_right_prefixdf_expected) spark_sql_joined_df = tsdf_left.asofJoin( tsdf_right, left_prefix="left", right_prefix="right" ).df - self.assertDataFrameEquality(spark_sql_joined_df, dfExpected) + self.assertDataFrameEquality(spark_sql_joined_df, df_expected) def test_asof_join_skip_nulls_disabled(self): """AS-OF Join with skip nulls disabled""" # fetch test data - tsdf_left = self.get_data_as_tsdf("left") - tsdf_right = self.get_data_as_tsdf("right") - dfExpectedSkipNulls = self.get_data_as_sdf("expected_skip_nulls") - dfExpectedSkipNullsDisabled = self.get_data_as_sdf( + tsdf_left = self.get_test_df_builder("left").as_tsdf() + tsdf_right = self.get_test_df_builder("right").as_tsdf() + df_expected_skip_nulls = self.get_test_df_builder("expected_skip_nulls").as_sdf() + df_expected_skip_nulls_disabled = self.get_test_df_builder( "expected_skip_nulls_disabled" - ) + ).as_sdf() # perform the join with skip nulls enabled (default) joined_df = tsdf_left.asofJoin( @@ -48,7 +48,7 @@ def test_asof_join_skip_nulls_disabled(self): ).df # joined dataframe should equal the expected dataframe with nulls skipped - self.assertDataFrameEquality(joined_df, dfExpectedSkipNulls) + self.assertDataFrameEquality(joined_df, df_expected_skip_nulls) # perform the join with skip nulls disabled joined_df = tsdf_left.asofJoin( @@ -56,29 +56,29 @@ def test_asof_join_skip_nulls_disabled(self): ).df # joined dataframe should equal the expected dataframe without nulls skipped - self.assertDataFrameEquality(joined_df, dfExpectedSkipNullsDisabled) + self.assertDataFrameEquality(joined_df, df_expected_skip_nulls_disabled) def test_sequence_number_sort(self): """Skew AS-OF Join with Partition Window Test""" # fetch test data - tsdf_left = self.get_data_as_tsdf("left") - tsdf_right = self.get_data_as_tsdf("right") - dfExpected = self.get_data_as_sdf("expected") + tsdf_left = self.get_test_df_builder("left").as_tsdf() + tsdf_right = 
self.get_test_df_builder("right").as_tsdf() + df_expected = self.get_test_df_builder("expected").as_sdf() # perform the join joined_df = tsdf_left.asofJoin(tsdf_right, right_prefix="right").df # joined dataframe should equal the expected dataframe - self.assertDataFrameEquality(joined_df, dfExpected) + self.assertDataFrameEquality(joined_df, df_expected) def test_partitioned_asof_join(self): """AS-OF Join with a time-partition""" with self.assertLogs(level="WARNING") as warning_captured: # fetch test data - tsdf_left = self.get_data_as_tsdf("left") - tsdf_right = self.get_data_as_tsdf("right") - dfExpected = self.get_data_as_sdf("expected") + tsdf_left = self.get_test_df_builder("left").as_tsdf() + tsdf_right = self.get_test_df_builder("right").as_tsdf() + df_expected = self.get_test_df_builder("expected").as_sdf() joined_df = tsdf_left.asofJoin( tsdf_right, @@ -88,7 +88,7 @@ def test_partitioned_asof_join(self): fraction=0.1, ).df - self.assertDataFrameEquality(joined_df, dfExpected) + self.assertDataFrameEquality(joined_df, df_expected) self.assertEqual( warning_captured.output, [ @@ -103,15 +103,17 @@ def test_asof_join_nanos(self): """As of join with nanosecond timestamps""" # fetch test data - tsdf_left = self.get_data_as_tsdf("left") - tsdf_right = self.get_data_as_tsdf("right") - dfExpected = self.get_data_as_sdf("expected") + tsdf_left = self.get_test_df_builder("left").as_tsdf() + tsdf_right = self.get_test_df_builder("right").as_tsdf() + dfExpected = self.get_test_df_builder("expected").as_sdf() # perform join joined_df = tsdf_left.asofJoin( tsdf_right, left_prefix="left", right_prefix="right" ).df + joined_df.show() + # compare self.assertDataFrameEquality(joined_df, dfExpected) @@ -119,8 +121,8 @@ def test_asof_join_tolerance(self): """As of join with tolerance band""" # fetch test data - tsdf_left = self.get_data_as_tsdf("left") - tsdf_right = self.get_data_as_tsdf("right") + tsdf_left = self.get_test_df_builder("left").as_tsdf() + tsdf_right = 
self.get_test_df_builder("right").as_tsdf() tolerance_test_values = [None, 0, 5.5, 7, 10] for tolerance in tolerance_test_values: @@ -133,17 +135,17 @@ def test_asof_join_tolerance(self): ).df # compare - expected_tolerance = self.get_data_as_sdf(f"expected_tolerance_{tolerance}") + expected_tolerance = self.get_test_df_builder(f"expected_tolerance_{tolerance}").as_sdf() self.assertDataFrameEquality(joined_df, expected_tolerance) def test_asof_join_sql_join_opt_and_bytes_threshold(self): - """AS-OF Join with out a time-partition test""" + """AS-OF Join without a time-partition test""" with patch("tempo.tsdf.TSDF._TSDF__getBytesFromPlan", return_value=1000): # Construct dataframes - tsdf_left = self.get_data_as_tsdf("left") - tsdf_right = self.get_data_as_tsdf("right") - dfExpected = self.get_data_as_sdf("expected") - noRightPrefixdfExpected = self.get_data_as_sdf("expected_no_right_prefix") + tsdf_left = self.get_test_df_builder("left").as_tsdf() + tsdf_right = self.get_test_df_builder("right").as_tsdf() + df_expected = self.get_test_df_builder("expected").as_sdf() + no_right_prefixdf_expected = self.get_test_df_builder("expected_no_right_prefix").as_sdf() # perform the join joined_df = tsdf_left.asofJoin( @@ -154,13 +156,13 @@ def test_asof_join_sql_join_opt_and_bytes_threshold(self): ).df # joined dataframe should equal the expected dataframe - self.assertDataFrameEquality(joined_df, dfExpected) - self.assertDataFrameEquality(non_prefix_joined_df, noRightPrefixdfExpected) + self.assertDataFrameEquality(joined_df, df_expected) + self.assertDataFrameEquality(non_prefix_joined_df, no_right_prefixdf_expected) spark_sql_joined_df = tsdf_left.asofJoin( tsdf_right, left_prefix="left", right_prefix="right" ).df - self.assertDataFrameEquality(spark_sql_joined_df, dfExpected) + self.assertDataFrameEquality(spark_sql_joined_df, df_expected) # MAIN diff --git a/python/tests/base.py b/python/tests/base.py index 7da859c8..8538a1ce 100644 --- a/python/tests/base.py +++ 
b/python/tests/base.py @@ -1,20 +1,168 @@ import os -import re import unittest import warnings -from typing import Union +from typing import Union, Optional import jsonref -from chispa import assert_df_equality - import pyspark.sql.functions as sfn +from chispa import assert_df_equality +from delta.pip_utils import configure_spark_with_delta_pip from pyspark.sql import SparkSession from pyspark.sql.dataframe import DataFrame - from tempo.intervals import IntervalsDF from tempo.tsdf import TSDF +class TestDataFrameBuilder: + """ + A class to hold metadata about a Spark DataFrame + """ + + def __init__(self, spark: SparkSession, test_data: dict): + """ + :param spark: the SparkSession to use + :param test_data: a dictionary containing the test data & metadata + """ + self.spark = spark + self.__test_data = test_data + + # Spark DataFrame metadata + + @property + def df(self) -> dict: + """ + :return: the DataFrame component of the test data + """ + return self.__test_data["df"] + + @property + def df_schema(self) -> str: + """ + :return: the schema component of the test data + """ + return self.df["schema"] + + def df_data(self) -> list: + """ + :return: the data component of the test data + """ + return self.df["data"] + + # TSDF metadata + + @property + def tsdf_constructor(self) -> Optional[str]: + """ + :return: the name of the TSDF constructor to use + """ + return self.__test_data.get("tsdf_constructor", None) + + @property + def idf_construct(self) -> Optional[str]: + """ + :return: the name of the IntervalsDF constructor to use + """ + return self.__test_data.get("idf_constructor", None) + + @property + def tsdf(self) -> dict: + """ + :return: the timestamp index metadata component of the test data + """ + return self.__test_data["tsdf"] + + @property + def idf(self) -> dict: + """ + :return: the start and end timestamp index metadata component of the test data + """ + return self.__test_data["idf"] + + @property + def ts_schema(self) -> Optional[dict]: + """ 
+ :return: the timestamp index schema component of the test data + """ + return self.tsdf.get("ts_schema", None) + + @property + def ts_idx_class(self) -> str: + """ + :return: the timestamp index class component of the test data + """ + return self.ts_schema["ts_idx_class"] + + @property + def ts_col(self) -> str: + """ + :return: the timestamp column component of the test data + """ + return self.ts_schema["ts_col"] + + @property + def ts_idx(self) -> dict: + """ + :return: the timestamp index data component of the test data + """ + return self.ts_schema["ts_idx"] + + # Builder functions + + def as_sdf(self) -> DataFrame: + """ + Constructs a Spark Dataframe from the test data + """ + # build dataframe + df = self.spark.createDataFrame(self.df_data(), self.df_schema) + + # convert timestamp columns + if "ts_convert" in self.df: + for ts_col in self.df["ts_convert"]: + # handle nested columns + if "." in ts_col: + col, field = ts_col.split(".") + convert_field_expr = sfn.to_timestamp(sfn.col(col).getField(field)) + df = df.withColumn( + col, sfn.col(col).withField(field, convert_field_expr) + ) + else: + df = df.withColumn(ts_col, sfn.to_timestamp(ts_col)) + # convert date columns + if "date_convert" in self.df: + for date_col in self.df["date_convert"]: + # handle nested columns + if "." 
in date_col: + col, field = date_col.split(".") + convert_field_expr = sfn.to_timestamp(sfn.col(col).getField(field)) + df = df.withColumn( + col, sfn.col(col).withField(field, convert_field_expr) + ) + else: + df = df.withColumn(date_col, sfn.to_date(date_col)) + + return df + + def as_tsdf(self) -> TSDF: + """ + Constructs a TSDF from the test data + """ + sdf = self.as_sdf() + if self.tsdf_constructor is not None: + return getattr(TSDF, self.tsdf_constructor)(sdf, **self.tsdf) + else: + return TSDF(sdf, **self.tsdf) + + def as_idf(self) -> IntervalsDF: + """ + Constructs a IntervalsDF from the test data + """ + sdf = self.as_sdf() + if self.idf_construct is not None: + return getattr(IntervalsDF, self.idf_construct)(sdf, **self.idf) + else: + return IntervalsDF(self.as_sdf(), **self.idf) + + class SparkTest(unittest.TestCase): # # Fixtures @@ -28,9 +176,11 @@ class SparkTest(unittest.TestCase): def setUpClass(cls) -> None: # create and configure PySpark Session cls.spark = ( - SparkSession.builder.appName("unit-tests") - .config("spark.jars.packages", "io.delta:delta-core_2.12:1.1.0") - .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension") + configure_spark_with_delta_pip(SparkSession.builder.appName("unit-tests")) + .config( + "spark.sql.extensions", + "io.delta.sql.DeltaSparkSessionExtension", + ) .config( "spark.sql.catalog.spark_catalog", "org.apache.spark.sql.delta.catalog.DeltaCatalog", @@ -67,25 +217,6 @@ def tearDown(self) -> None: # Utility Functions # - def get_data_as_sdf(self, name: str, convert_ts_col=True): - td = self.test_data[name] - ts_cols = [] - if convert_ts_col and (td.get("ts_col", None) or td.get("other_ts_cols", [])): - ts_cols = [td["ts_col"]] if "ts_col" in td else [] - ts_cols.extend(td.get("other_ts_cols", [])) - return self.buildTestDF(td["schema"], td["data"], ts_cols) - - def get_data_as_tsdf(self, name: str, convert_ts_col=True): - df = self.get_data_as_sdf(name, convert_ts_col) - td = self.test_data[name] - 
tsdf = TSDF( - df, - ts_col=td["ts_col"], - partition_cols=td.get("partition_cols", None), - sequence_col=td.get("sequence_col", None), - ) - return tsdf - def get_data_as_idf(self, name: str, convert_ts_col=True): df = self.get_data_as_sdf(name, convert_ts_col) td = self.test_data[name] @@ -111,7 +242,8 @@ def __getTestDataFilePath(self, test_file_name: str) -> str: dir_path = "./tests" elif cwd != "tests": raise RuntimeError( - f"Cannot locate test data file {test_file_name}, running from dir {os.getcwd()}" + f"Cannot locate test data file {test_file_name}, running from dir" + f" {os.getcwd()}" ) # return appropriate path @@ -124,7 +256,7 @@ def __loadTestData(self, test_case_path: str) -> dict: :param test_case_path: string representation of the data path e.g. : "tsdf_tests.BasicTests.test_describe" :type test_case_path: str """ - file_name, class_name, func_name = test_case_path.split(".") + file_name, class_name, func_name = test_case_path.split(".")[-3:] # find our test data file test_data_file = self.__getTestDataFilePath(file_name) @@ -135,40 +267,11 @@ def __loadTestData(self, test_case_path: str) -> dict: # proces the data file with open(test_data_file, "r") as f: data_metadata_from_json = jsonref.load(f) - # warn if data not present - if class_name not in data_metadata_from_json: - warnings.warn(f"Could not load test data for {file_name}.{class_name}") - return {} - if func_name not in data_metadata_from_json[class_name]: - warnings.warn( - f"Could not load test data for {file_name}.{class_name}.{func_name}" - ) - return {} + # return the data return data_metadata_from_json[class_name][func_name] - def buildTestDF(self, schema, data, ts_cols=["event_ts"]): - """ - Constructs a Spark Dataframe from the given components - :param schema: the schema to use for the Dataframe - :param data: values to use for the Dataframe - :param ts_cols: list of column names to be converted to Timestamp values - :return: a Spark Dataframe, constructed from the given schema 
and values - """ - # build dataframe - df = self.spark.createDataFrame(data, schema) - - # check if ts_col follows standard timestamp format, then check if timestamp has micro/nanoseconds - for tsc in ts_cols: - ts_value = str(df.select(ts_cols).limit(1).collect()[0][0]) - ts_pattern = r"^\d{4}-\d{2}-\d{2}| \d{2}:\d{2}:\d{2}\.\d*$" - decimal_pattern = r"[.]\d+" - if re.match(ts_pattern, str(ts_value)) is not None: - if ( - re.search(decimal_pattern, ts_value) is None - or len(re.search(decimal_pattern, ts_value)[0]) <= 4 - ): - df = df.withColumn(tsc, sfn.to_timestamp(sfn.col(tsc))) - return df + def get_test_df_builder(self, name: str) -> TestDataFrameBuilder: + return TestDataFrameBuilder(self.spark, self.test_data[name]) # # Assertion Functions @@ -200,12 +303,10 @@ def assertSchemaContainsField(self, schema, field): # the attributes of the fields must be equal self.assertFieldsEqual(field, schema[field.name]) - @staticmethod def assertDataFrameEquality( - df1: Union[IntervalsDF, TSDF, DataFrame], - df2: Union[IntervalsDF, TSDF, DataFrame], - from_tsdf: bool = False, - from_idf: bool = False, + self, + df1: Union[TSDF, DataFrame, IntervalsDF], + df2: Union[TSDF, DataFrame, IntervalsDF], ignore_row_order: bool = False, ignore_column_order: bool = True, ignore_nullable: bool = True, @@ -215,14 +316,28 @@ def assertDataFrameEquality( That is, they have equivalent schemas, and both contain the same values """ - if from_tsdf or from_idf: + # handle TSDFs + if isinstance(df1, TSDF): + # df2 must also be a TSDF + self.assertIsInstance(df2, TSDF) + # get the underlying Spark DataFrames + df1 = df1.df + df2 = df2.df + + # Handle IDFs + if isinstance(df1, IntervalsDF): + # df2 must also be a IntervalsDF + self.assertIsInstance(df2, IntervalsDF) + # get the underlying Spark DataFrames df1 = df1.df df2 = df2.df + # handle DataFrames assert_df_equality( df1, df2, ignore_row_order=ignore_row_order, ignore_column_order=ignore_column_order, ignore_nullable=ignore_nullable, + 
ignore_metadata=True, ) diff --git a/python/tests/interpol_tests.py b/python/tests/interpol_tests.py index 0235a011..49754ee0 100644 --- a/python/tests/interpol_tests.py +++ b/python/tests/interpol_tests.py @@ -24,7 +24,7 @@ def test_validate_fill_method(self): ) def test_validate_col_exist_in_df(self): - input_df: DataFrame = self.get_data_as_sdf("input_data") + input_df: DataFrame = self.get_test_df_builder("init").as_sdf() self.assertRaises( ValueError, @@ -54,7 +54,7 @@ def test_validate_col_exist_in_df(self): ) def test_validate_col_target_cols_data_type(self): - input_df: DataFrame = self.get_data_as_sdf("input_data") + input_df: DataFrame = self.get_test_df_builder("init").as_sdf() self.assertRaises( TypeError, @@ -69,7 +69,7 @@ def test_fill_validation(self): """Test fill parameter is valid.""" # load test data - input_tsdf: TSDF = self.get_data_as_tsdf("input_data") + input_tsdf: TSDF = self.get_test_df_builder("init").as_tsdf() # interpolate self.assertRaises( @@ -89,7 +89,7 @@ def test_target_column_validation(self): """Test target columns exist in schema, and are of the right type (numeric).""" # load test data - input_tsdf: TSDF = self.get_data_as_tsdf("input_data") + input_tsdf: TSDF = self.get_test_df_builder("init").as_tsdf() # interpolate self.assertRaises( @@ -109,7 +109,7 @@ def test_partition_column_validation(self): """Test partition columns exist in schema.""" # load test data - input_tsdf: TSDF = self.get_data_as_tsdf("input_data") + input_tsdf: TSDF = self.get_test_df_builder("init").as_tsdf() # interpolate self.assertRaises( @@ -129,7 +129,7 @@ def test_ts_column_validation(self): """Test time series column exist in schema.""" # load test data - input_tsdf: TSDF = self.get_data_as_tsdf("input_data") + input_tsdf: TSDF = self.get_test_df_builder("init").as_tsdf() # interpolate self.assertRaises( @@ -154,8 +154,8 @@ def test_zero_fill_interpolation(self): """ # load test data - simple_input_tsdf: TSDF = 
self.get_data_as_tsdf("simple_input_data") - expected_df: DataFrame = self.get_data_as_sdf("expected_data") + simple_input_tsdf: TSDF = self.get_test_df_builder("simple_init").as_tsdf() + expected_df: DataFrame = self.get_test_df_builder("expected").as_sdf() # interpolate actual_df: DataFrame = self.interpolate_helper.interpolate( @@ -180,8 +180,8 @@ def test_zero_fill_interpolation_no_perform_checks(self): """ # load test data - simple_input_tsdf: TSDF = self.get_data_as_tsdf("simple_input_data") - expected_df: DataFrame = self.get_data_as_sdf("expected_data") + simple_input_tsdf: TSDF = self.get_test_df_builder("simple_init").as_tsdf() + expected_df: DataFrame = self.get_test_df_builder("expected").as_sdf() # interpolate actual_df: DataFrame = self.interpolate_helper.interpolate( @@ -207,8 +207,8 @@ def test_null_fill_interpolation(self): """ # load test data - simple_input_tsdf: TSDF = self.get_data_as_tsdf("simple_input_data") - expected_df: DataFrame = self.get_data_as_sdf("expected_data") + simple_input_tsdf: TSDF = self.get_test_df_builder("simple_init").as_tsdf() + expected_df: DataFrame = self.get_test_df_builder("expected").as_sdf() # interpolate actual_df: DataFrame = self.interpolate_helper.interpolate( @@ -234,8 +234,8 @@ def test_back_fill_interpolation(self): """ # load test data - simple_input_tsdf: TSDF = self.get_data_as_tsdf("simple_input_data") - expected_df: DataFrame = self.get_data_as_sdf("expected_data") + simple_input_tsdf: TSDF = self.get_test_df_builder("simple_init").as_tsdf() + expected_df: DataFrame = self.get_test_df_builder("expected").as_sdf() # interpolate actual_df: DataFrame = self.interpolate_helper.interpolate( @@ -261,8 +261,8 @@ def test_forward_fill_interpolation(self): """ # load test data - simple_input_tsdf: TSDF = self.get_data_as_tsdf("simple_input_data") - expected_df: DataFrame = self.get_data_as_sdf("expected_data") + simple_input_tsdf: TSDF = self.get_test_df_builder("simple_init").as_tsdf() + expected_df: DataFrame 
= self.get_test_df_builder("expected").as_sdf() # interpolate actual_df: DataFrame = self.interpolate_helper.interpolate( @@ -288,8 +288,8 @@ def test_linear_fill_interpolation(self): """ # load test data - simple_input_tsdf: TSDF = self.get_data_as_tsdf("simple_input_data") - expected_df: DataFrame = self.get_data_as_sdf("expected_data") + simple_input_tsdf: TSDF = self.get_test_df_builder("simple_init").as_tsdf() + expected_df: DataFrame = self.get_test_df_builder("expected").as_sdf() # interpolate actual_df: DataFrame = self.interpolate_helper.interpolate( @@ -313,8 +313,8 @@ def test_different_freq_abbreviations(self): """ # load test data - simple_input_tsdf: TSDF = self.get_data_as_tsdf("simple_input_data") - expected_df: DataFrame = self.get_data_as_sdf("expected_data") + simple_input_tsdf: TSDF = self.get_test_df_builder("simple_init").as_tsdf() + expected_df: DataFrame = self.get_test_df_builder("expected").as_sdf() # interpolate actual_df: DataFrame = self.interpolate_helper.interpolate( @@ -340,8 +340,8 @@ def test_show_interpolated(self): """ # load test data - simple_input_tsdf: TSDF = self.get_data_as_tsdf("simple_input_data") - expected_df: DataFrame = self.get_data_as_sdf("expected_data") + simple_input_tsdf: TSDF = self.get_test_df_builder("simple_init").as_tsdf() + expected_df: DataFrame = self.get_test_df_builder("expected").as_sdf() # interpolate actual_df: DataFrame = self.interpolate_helper.interpolate( @@ -358,7 +358,7 @@ def test_show_interpolated(self): self.assertDataFrameEquality(expected_df, actual_df, ignore_nullable=True) def test_validate_ts_col_data_type_is_not_timestamp(self): - input_df: DataFrame = self.get_data_as_sdf("input_data") + input_df: DataFrame = self.get_test_df_builder("init").as_sdf() self.assertRaises( ValueError, @@ -374,7 +374,7 @@ def test_interpolation_freq_is_none(self): """Test a ValueError is raised when freq is None.""" # load test data - simple_input_tsdf: TSDF = self.get_data_as_tsdf("input_data") + 
simple_input_tsdf: TSDF = self.get_test_df_builder("init").as_tsdf() # interpolate self.assertRaises( @@ -394,7 +394,7 @@ def test_interpolation_func_is_none(self): """Test a ValueError is raised when func is None.""" # load test data - simple_input_tsdf: TSDF = self.get_data_as_tsdf("input_data") + simple_input_tsdf: TSDF = self.get_test_df_builder("init").as_tsdf() # interpolate self.assertRaises( @@ -414,7 +414,7 @@ def test_interpolation_func_is_callable(self): """Test ValueError is raised when func is callable.""" # load test data - simple_input_tsdf: TSDF = self.get_data_as_tsdf("input_data") + simple_input_tsdf: TSDF = self.get_test_df_builder("init").as_tsdf() # interpolate self.assertRaises( @@ -434,7 +434,7 @@ def test_interpolation_freq_is_not_supported_type(self): """Test ValueError is raised when func is callable.""" # load test data - simple_input_tsdf: TSDF = self.get_data_as_tsdf("input_data") + simple_input_tsdf: TSDF = self.get_test_df_builder("init").as_tsdf() # interpolate self.assertRaises( @@ -459,8 +459,8 @@ def test_interpolation_using_default_tsdf_params(self): """ # load test data - simple_input_tsdf: TSDF = self.get_data_as_tsdf("simple_input_data") - expected_df: DataFrame = self.get_data_as_sdf("expected") + simple_input_tsdf: TSDF = self.get_test_df_builder("simple_init").as_tsdf() + expected_df: DataFrame = self.get_test_df_builder("expected").as_sdf() # interpolate actual_df: DataFrame = simple_input_tsdf.interpolate( @@ -475,8 +475,8 @@ def test_interpolation_using_custom_params(self): modified params.""" # Modify input DataFrame using different ts_col - simple_input_tsdf: TSDF = self.get_data_as_tsdf("simple_input_data") - expected_df: DataFrame = self.get_data_as_sdf("expected") + simple_input_tsdf: TSDF = self.get_test_df_builder("simple_init").as_tsdf() + expected_df: DataFrame = self.get_test_df_builder("expected").as_sdf() input_tsdf = TSDF( simple_input_tsdf.df.withColumnRenamed("event_ts", "other_ts_col"), @@ -501,7 +501,7 
@@ def test_tsdf_constructor_params_are_updated(self): interpolation.""" # load test data - simple_input_tsdf: TSDF = self.get_data_as_tsdf("simple_input_data") + simple_input_tsdf: TSDF = self.get_test_df_builder("simple_init").as_tsdf() actual_tsdf: TSDF = simple_input_tsdf.interpolate( ts_col="event_ts", @@ -520,8 +520,8 @@ def test_interpolation_on_sampled_data(self): """Verify interpolation can be chained with resample within the TSDF class""" # load test data - simple_input_tsdf: TSDF = self.get_data_as_tsdf("simple_input_data") - expected_df: DataFrame = self.get_data_as_sdf("expected") + simple_input_tsdf: TSDF = self.get_test_df_builder("simple_init").as_tsdf() + expected_df: DataFrame = self.get_test_df_builder("expected").as_sdf() actual_df: DataFrame = ( simple_input_tsdf.resample(freq="30 seconds", func="mean", fill=None) @@ -538,8 +538,8 @@ def test_defaults_with_resampled_df(self): # self.buildTestingDataFrame() # load test data - simple_input_tsdf = self.get_data_as_tsdf("simple_input_data") - expected_df: DataFrame = self.get_data_as_sdf("expected", convert_ts_col=True) + simple_input_tsdf = self.get_test_df_builder("simple_init").as_tsdf() + expected_df: DataFrame = self.get_test_df_builder("expected").as_sdf() actual_df: DataFrame = ( simple_input_tsdf.resample(freq="30 seconds", func="mean", fill=None) diff --git a/python/tests/intervals_tests.py b/python/tests/intervals_tests.py index ca0bde7a..805055c1 100644 --- a/python/tests/intervals_tests.py +++ b/python/tests/intervals_tests.py @@ -74,7 +74,7 @@ class IntervalsDFTests(SparkTest): ] def test_init_series_str(self): - df_input = self.get_data_as_sdf("input") + df_input = self.get_test_df_builder("init").as_sdf() idf = IntervalsDF(df_input, "start_ts", "end_ts", "series_1") @@ -91,7 +91,7 @@ def test_init_series_str(self): self.assertCountEqual(idf.metric_columns, ["metric_1", "metric_2"]) def test_init_series_comma_seperated_str(self): - df_input = self.get_data_as_sdf("input") + df_input = 
self.get_test_df_builder("init").as_sdf() idf = IntervalsDF(df_input, "start_ts", "end_ts", "series_1, series_2") @@ -108,7 +108,7 @@ def test_init_series_comma_seperated_str(self): self.assertCountEqual(idf.metric_columns, ["metric_1", "metric_2"]) def test_init_series_tuple(self): - df_input = self.get_data_as_sdf("input") + df_input = self.get_test_df_builder("init").as_sdf() idf = IntervalsDF(df_input, "start_ts", "end_ts", ("series_1",)) @@ -125,7 +125,7 @@ def test_init_series_tuple(self): self.assertCountEqual(idf.metric_columns, ["metric_1", "metric_2"]) def test_init_series_list(self): - df_input = self.get_data_as_sdf("input") + df_input = self.get_test_df_builder("init").as_sdf() idf = IntervalsDF(df_input, "start_ts", "end_ts", ["series_1"]) @@ -142,7 +142,7 @@ def test_init_series_list(self): self.assertCountEqual(idf.metric_columns, ["metric_1", "metric_2"]) def test_init_series_none(self): - df_input = self.get_data_as_sdf("input") + df_input = self.get_test_df_builder("init").as_sdf() idf = IntervalsDF(df_input, "start_ts", "end_ts", None) @@ -159,7 +159,7 @@ def test_init_series_none(self): self.assertCountEqual(idf.metric_columns, ["metric_1", "metric_2"]) def test_init_series_int(self): - df_input = self.get_data_as_sdf("input") + df_input = self.get_test_df_builder("init").as_sdf() self.assertRaises( ValueError, @@ -171,14 +171,12 @@ def test_init_series_int(self): ) def test_window_property(self): - df_input = self.get_data_as_sdf("input") - - idf = IntervalsDF(df_input, "start_ts", "end_ts", "series_1") + idf: IntervalsDF = self.get_test_df_builder("init").as_idf() self.assertIsInstance(idf.window, pyspark.sql.window.WindowSpec) def test_fromStackedMetrics_series_str(self): - df_input = self.get_data_as_sdf("input") + df_input = self.get_test_df_builder("init").as_sdf() self.assertRaises( ValueError, @@ -192,7 +190,7 @@ def test_fromStackedMetrics_series_str(self): ) def test_fromStackedMetrics_series_tuple(self): - df_input = 
self.get_data_as_sdf("input") + df_input = self.get_test_df_builder("init").as_sdf() self.assertRaises( ValueError, @@ -206,8 +204,8 @@ def test_fromStackedMetrics_series_tuple(self): ) def test_fromStackedMetrics_series_list(self): - df_input = self.get_data_as_sdf("input") - idf_expected = self.get_data_as_idf("expected") + df_input = self.get_test_df_builder("init").as_sdf() + idf_expected = self.get_test_df_builder("expected").as_idf() df_input = df_input.withColumn( "start_ts", f.to_timestamp("start_ts") @@ -224,11 +222,11 @@ def test_fromStackedMetrics_series_list(self): "metric_value", ) - self.assertDataFrameEquality(idf, idf_expected, from_idf=True) + self.assertDataFrameEquality(idf, idf_expected) def test_fromStackedMetrics_metric_names(self): - df_input = self.get_data_as_sdf("input") - idf_expected = self.get_data_as_idf("expected") + df_input = self.get_test_df_builder("init").as_sdf() + idf_expected = self.get_test_df_builder("expected").as_idf() df_input = df_input.withColumn( "start_ts", f.to_timestamp("start_ts") @@ -246,21 +244,21 @@ def test_fromStackedMetrics_metric_names(self): ["metric_1", "metric_2"], ) - self.assertDataFrameEquality(idf, idf_expected, from_idf=True) + self.assertDataFrameEquality(idf, idf_expected) def test_make_disjoint(self): - idf_input = self.get_data_as_idf("input") - idf_expected = self.get_data_as_idf("expected") + idf_input = self.get_test_df_builder("init").as_idf() + idf_expected = self.get_test_df_builder("expected").as_idf() idf_actual = idf_input.make_disjoint() self.assertDataFrameEquality( - idf_expected, idf_actual, from_idf=True, ignore_row_order=True + idf_expected, idf_actual, ignore_row_order=True ) def test_make_disjoint_contains_interval_already_disjoint(self): - idf_input = self.get_data_as_idf("input") - idf_expected = self.get_data_as_idf("expected") + idf_input = self.get_test_df_builder("init").as_idf() + idf_expected = self.get_test_df_builder("expected").as_idf() print("expected") 
print(idf_expected.df.toPandas()) @@ -269,72 +267,72 @@ def test_make_disjoint_contains_interval_already_disjoint(self): print(idf_actual) # self.assertDataFrameEquality( - # idf_expected, idf_actual, from_idf=True, ignore_row_order=True + # idf_expected, idf_actual, ignore_row_order=True # ) def test_make_disjoint_contains_intervals_equal(self): - idf_input = self.get_data_as_idf("input") - idf_expected = self.get_data_as_idf("expected") + idf_input = self.get_test_df_builder("init").as_idf() + idf_expected = self.get_test_df_builder("expected").as_idf() idf_actual = idf_input.make_disjoint() self.assertDataFrameEquality( - idf_expected, idf_actual, from_idf=True, ignore_row_order=True + idf_expected, idf_actual, ignore_row_order=True ) def test_make_disjoint_intervals_same_start(self): - idf_input = self.get_data_as_idf("input") - idf_expected = self.get_data_as_idf("expected") + idf_input = self.get_test_df_builder("init").as_idf() + idf_expected = self.get_test_df_builder("expected").as_idf() idf_actual = idf_input.make_disjoint() self.assertDataFrameEquality( - idf_expected, idf_actual, from_idf=True, ignore_row_order=True + idf_expected, idf_actual, ignore_row_order=True ) def test_make_disjoint_intervals_same_end(self): - idf_input = self.get_data_as_idf("input") - idf_expected = self.get_data_as_idf("expected") + idf_input = self.get_test_df_builder("init").as_idf() + idf_expected = self.get_test_df_builder("expected").as_idf() idf_actual = idf_input.make_disjoint() self.assertDataFrameEquality( - idf_expected, idf_actual, from_idf=True, ignore_row_order=True + idf_expected, idf_actual, ignore_row_order=True ) def test_make_disjoint_multiple_series(self): - idf_input = self.get_data_as_idf("input") - idf_expected = self.get_data_as_idf("expected") + idf_input = self.get_test_df_builder("init").as_idf() + idf_expected = self.get_test_df_builder("expected").as_idf() idf_actual = idf_input.make_disjoint() self.assertDataFrameEquality( - idf_expected, 
idf_actual, from_idf=True, ignore_row_order=True + idf_expected, idf_actual, ignore_row_order=True ) def test_make_disjoint_single_metric(self): - idf_input = self.get_data_as_idf("input") - idf_expected = self.get_data_as_idf("expected") + idf_input = self.get_test_df_builder("init").as_idf() + idf_expected = self.get_test_df_builder("expected").as_idf() idf_actual = idf_input.make_disjoint() self.assertDataFrameEquality( - idf_expected, idf_actual, from_idf=True, ignore_row_order=True + idf_expected, idf_actual, ignore_row_order=True ) def test_make_disjoint_interval_is_subset(self): - idf_input = self.get_data_as_idf("input") - idf_expected = self.get_data_as_idf("expected") + idf_input = self.get_test_df_builder("init").as_idf() + idf_expected = self.get_test_df_builder("expected").as_idf() idf_actual = idf_input.make_disjoint() self.assertDataFrameEquality( - idf_expected, idf_actual, from_idf=True, ignore_row_order=True + idf_expected, idf_actual, ignore_row_order=True ) def test_union_other_idf(self): - idf_input_1 = self.get_data_as_idf("input") - idf_input_2 = self.get_data_as_idf("input") + idf_input_1 = self.get_test_df_builder("init").as_idf() + idf_input_2 = self.get_test_df_builder("init").as_idf() count_idf_1 = idf_input_1.df.count() count_idf_2 = idf_input_2.df.count() @@ -346,21 +344,21 @@ def test_union_other_idf(self): self.assertEqual(count_idf_1 + count_idf_2, count_union) def test_union_other_df(self): - idf_input = self.get_data_as_idf("input") - df_input = self.get_data_as_sdf("input") + idf_input = self.get_test_df_builder("init").as_idf() + df_input = self.get_test_df_builder("init").as_sdf() self.assertRaises(TypeError, idf_input.union, df_input) def test_union_other_list_dicts(self): - idf_input = self.get_data_as_idf("input") + idf_input = self.get_test_df_builder("init").as_idf() self.assertRaises( TypeError, idf_input.union, IntervalsDFTests.union_tests_dict_input ) def test_unionByName_other_idf(self): - idf_input_1 = 
self.get_data_as_idf("input") - idf_input_2 = self.get_data_as_idf("input") + idf_input_1 = self.get_test_df_builder("init").as_idf() + idf_input_2 = self.get_test_df_builder("init").as_idf() count_idf_1 = idf_input_1.df.count() count_idf_2 = idf_input_2.df.count() @@ -372,41 +370,42 @@ def test_unionByName_other_idf(self): self.assertEqual(count_idf_1 + count_idf_2, count_union_by_name) def test_unionByName_other_df(self): - idf_input = self.get_data_as_idf("input") - df_input = self.get_data_as_sdf("input") + idf_input = self.get_test_df_builder("init").as_idf() + df_input = self.get_test_df_builder("init").as_sdf() self.assertRaises(TypeError, idf_input.unionByName, df_input) def test_unionByName_other_list_dicts(self): - idf_input = self.get_data_as_idf("input") + idf_input = self.get_test_df_builder("init").as_idf() self.assertRaises( TypeError, idf_input.unionByName, IntervalsDFTests.union_tests_dict_input ) def test_unionByName_extra_column(self): - idf_extra_col = self.get_data_as_idf("input_extra_col") - idf_input = self.get_data_as_idf("input") + idf_extra_col = self.get_test_df_builder("init_extra_col").as_idf() + idf_input = self.get_test_df_builder("init").as_idf() self.assertRaises(AnalysisException, idf_extra_col.unionByName, idf_input) def test_unionByName_other_extra_column(self): - idf_input = self.get_data_as_idf("input") - idf_extra_col = self.get_data_as_idf("input_extra_col") + idf_input = self.get_test_df_builder("init").as_idf() + idf_extra_col = self.get_test_df_builder("init_extra_col").as_idf() self.assertRaises(AnalysisException, idf_input.unionByName, idf_extra_col) def test_toDF(self): - idf_input = self.get_data_as_idf("input") - expected_df = self.get_data_as_sdf("input") + # NB: init is used for both since the expected df is the same + idf_input = self.get_test_df_builder("init").as_idf() + expected_df = self.get_test_df_builder("init").as_sdf() actual_df = idf_input.toDF() self.assertDataFrameEquality(actual_df, expected_df) def 
test_toDF_stack(self): - idf_input = self.get_data_as_idf("input") - expected_df = self.get_data_as_sdf("expected") + idf_input = self.get_test_df_builder("init").as_idf() + expected_df = self.get_test_df_builder("expected").as_sdf() expected_df = expected_df.withColumn( "start_ts", f.to_timestamp("start_ts") @@ -419,14 +418,14 @@ def test_toDF_stack(self): def test_make_disjoint_issue_268(self): # https://github.com/databrickslabs/tempo/issues/268 - idf_input = self.get_data_as_idf("input") - idf_expected = self.get_data_as_idf("expected") + idf_input = self.get_test_df_builder("init").as_idf() + idf_expected = self.get_test_df_builder("expected").as_idf() idf_actual = idf_input.make_disjoint() idf_actual.df.show(truncate=False) self.assertDataFrameEquality( - idf_expected, idf_actual, from_idf=True, ignore_row_order=True + idf_expected, idf_actual, ignore_row_order=True ) diff --git a/python/tests/io_tests.py b/python/tests/io_tests.py index 44b837e3..e3edad10 100644 --- a/python/tests/io_tests.py +++ b/python/tests/io_tests.py @@ -1,10 +1,12 @@ import logging -import os import unittest -from unittest import mock +from importlib.metadata import version +from packaging import version as pkg_version from tests.base import SparkTest +DELTA_VERSION = version("delta-spark") + class DeltaWriteTest(SparkTest): def test_write_to_delta_without_optimization_cols(self): @@ -13,7 +15,7 @@ def test_write_to_delta_without_optimization_cols(self): table_name = "my_table_no_optimization_col" # load test data - input_tsdf = self.get_data_as_tsdf("input_data") + input_tsdf = self.get_test_df_builder("init").as_tsdf() # test write to delta input_tsdf.write(self.spark, table_name) @@ -28,7 +30,7 @@ def test_write_to_delta_with_optimization_cols(self): table_name = "my_table_optimization_col" # load test data - input_tsdf = self.get_data_as_tsdf("input_data") + input_tsdf = self.get_test_df_builder("init").as_tsdf() # test write to delta input_tsdf.write(self.spark, table_name, 
["date"]) @@ -37,56 +39,37 @@ def test_write_to_delta_with_optimization_cols(self): # should be equal to the expected dataframe self.assertEqual(self.spark.table(table_name).count(), 7) - def test_write_to_delta_non_dbr_environment_logging(self): - """Test logging when writing""" - - table_name = "my_table_optimization_col" - - # load test data - input_tsdf = self.get_data_as_tsdf("input_data") - - with self.assertLogs(level="WARNING") as warning_captured: - # test write to delta - input_tsdf.write(self.spark, table_name, ["date"]) - - self.assertEqual(len(warning_captured.records), 1) - self.assertEqual( - warning_captured.output, - [ - "WARNING:tempo.io:" - "Delta optimizations attempted on a non-Databricks platform. " - "Switch to use Databricks Runtime to get optimization advantages." - ], - ) - - @mock.patch.dict(os.environ, {"DATABRICKS_RUNTIME_VERSION": "10.4"}) def test_write_to_delta_bad_dbr_environment_logging(self): """Test useDeltaOpt Exception""" table_name = "my_table_optimization_col_fails" # load test data - input_tsdf = self.get_data_as_tsdf("input_data") - - with self.assertLogs(level="ERROR") as error_captured: - # test write to delta - input_tsdf.write(self.spark, table_name, ["date"]) - - self.assertEqual(len(error_captured.records), 1) - print(error_captured.output) - self.assertEqual( - error_captured.output, - [ - "ERROR:tempo.io:" - "Delta optimizations attempted, but was not successful.\nError: \nmismatched input " - "'optimize' expecting {'(', 'ADD', 'ALTER', 'ANALYZE', 'CACHE', 'CLEAR', 'COMMENT', 'COMMIT', " - "'CREATE', 'DELETE', 'DESC', 'DESCRIBE', 'DFS', 'DROP', 'EXPLAIN', 'EXPORT', 'FROM', 'GRANT', " - "'IMPORT', 'INSERT', 'LIST', 'LOAD', 'LOCK', 'MAP', 'MERGE', 'MSCK', 'REDUCE', 'REFRESH', 'REPLACE', " - "'RESET', 'REVOKE', 'ROLLBACK', 'SELECT', 'SET', 'SHOW', 'START', 'TABLE', 'TRUNCATE', 'UNCACHE', " - "'UNLOCK', 'UPDATE', 'USE', 'VALUES', 'WITH'}(line 1, pos 0)\n\n== SQL ==\noptimize " - "my_table_optimization_col_fails zorder 
by (symbol,date,event_time)\n^^^\n" - ], - ) + input_tsdf = self.get_test_df_builder("init").as_tsdf() + + if pkg_version.parse(DELTA_VERSION) < pkg_version.parse("2.0.0"): + + with self.assertLogs(level="ERROR") as error_captured: + # should fail to run optimize + input_tsdf.write(self.spark, table_name, ["date"]) + + self.assertEqual(len(error_captured.records), 1) + print(error_captured.output) + self.assertEqual( + error_captured.output, + [ + "ERROR:tempo.io:" + "Delta optimizations attempted, but was not successful.\nError: \nmismatched input " + "'optimize' expecting {'(', 'ADD', 'ALTER', 'ANALYZE', 'CACHE', 'CLEAR', 'COMMENT', 'COMMIT', " + "'CREATE', 'DELETE', 'DESC', 'DESCRIBE', 'DFS', 'DROP', 'EXPLAIN', 'EXPORT', 'FROM', 'GRANT', " + "'IMPORT', 'INSERT', 'LIST', 'LOAD', 'LOCK', 'MAP', 'MERGE', 'MSCK', 'REDUCE', 'REFRESH', 'REPLACE', " + "'RESET', 'REVOKE', 'ROLLBACK', 'SELECT', 'SET', 'SHOW', 'START', 'TABLE', 'TRUNCATE', 'UNCACHE', " + "'UNLOCK', 'UPDATE', 'USE', 'VALUES', 'WITH'}(line 1, pos 0)\n\n== SQL ==\noptimize " + "my_table_optimization_col_fails zorder by (symbol,date,event_time)\n^^^\n" + ], + ) + else: + pass # MAIN diff --git a/python/tests/resample_tests.py b/python/tests/resample_tests.py index 0f41dcfe..accba3f7 100644 --- a/python/tests/resample_tests.py +++ b/python/tests/resample_tests.py @@ -12,23 +12,23 @@ class ResampleUnitTests(SparkTest): def test_appendAggKey_freq_is_none(self): - input_tsdf = self.get_data_as_tsdf("input_data") + input_tsdf = self.get_test_df_builder("init").as_tsdf() self.assertRaises(TypeError, _appendAggKey, input_tsdf) def test_appendAggKey_freq_microsecond(self): - input_tsdf = self.get_data_as_tsdf("input_data") + input_tsdf = self.get_test_df_builder("init").as_tsdf() - appendAggKey_tuple = _appendAggKey(input_tsdf, "1 MICROSECOND") - appendAggKey_tsdf = appendAggKey_tuple[0] + append_agg_key_tuple = _appendAggKey(input_tsdf, "1 MICROSECOND") + append_agg_key_tsdf = append_agg_key_tuple[0] - 
self.assertIsInstance(appendAggKey_tsdf, TSDF) - self.assertIn("agg_key", appendAggKey_tsdf.df.columns) - self.assertEqual(appendAggKey_tuple[1], "1") - self.assertEqual(appendAggKey_tuple[2], "microseconds") + self.assertIsInstance(append_agg_key_tsdf, TSDF) + self.assertIn("agg_key", append_agg_key_tsdf.df.columns) + self.assertEqual(append_agg_key_tuple[1], "1") + self.assertEqual(append_agg_key_tuple[2], "microseconds") def test_appendAggKey_freq_is_invalid(self): - input_tsdf = self.get_data_as_tsdf("input_data") + input_tsdf = self.get_test_df_builder("init").as_tsdf() self.assertRaises( ValueError, @@ -38,14 +38,14 @@ def test_appendAggKey_freq_is_invalid(self): ) def test_aggregate_floor(self): - input_tsdf = self.get_data_as_tsdf("input_data") - expected_data = self.get_data_as_sdf("expected_data") + input_tsdf = self.get_test_df_builder("init").as_tsdf() + expected_df = self.get_test_df_builder("expected").as_sdf() aggregate_df = aggregate(input_tsdf, "1 DAY", "floor") self.assertDataFrameEquality( aggregate_df, - expected_data, + expected_df, ) def test_aggregate_average(self): @@ -55,8 +55,8 @@ def test_aggregate_average(self): # is this intentional? 
# resample.py -> lines 86 to 87 # occurring in all `func` arguments but causing null values for "mean" - input_tsdf = self.get_data_as_tsdf("input_data") - expected_data = self.get_data_as_sdf("expected_data") + input_tsdf = self.get_test_df_builder("init").as_tsdf() + expected_df = self.get_test_df_builder("expected").as_sdf() # explicitly declaring metricCols to remove DATE so that test can pass for now aggregate_df = aggregate( @@ -65,67 +65,67 @@ def test_aggregate_average(self): self.assertDataFrameEquality( aggregate_df, - expected_data, + expected_df, ) def test_aggregate_min(self): - input_tsdf = self.get_data_as_tsdf("input_data") - expected_data = self.get_data_as_sdf("expected_data") + input_tsdf = self.get_test_df_builder("init").as_tsdf() + expected_df = self.get_test_df_builder("expected").as_sdf() aggregate_df = aggregate(input_tsdf, "1 DAY", "min") self.assertDataFrameEquality( aggregate_df, - expected_data, + expected_df, ) def test_aggregate_min_with_prefix(self): - input_tsdf = self.get_data_as_tsdf("input_data") - expected_data = self.get_data_as_sdf("expected_data") + input_tsdf = self.get_test_df_builder("init").as_tsdf() + expected_df = self.get_test_df_builder("expected").as_sdf() aggregate_df = aggregate(input_tsdf, "1 DAY", "min", prefix="min") self.assertDataFrameEquality( aggregate_df, - expected_data, + expected_df, ) def test_aggregate_min_with_fill(self): - input_tsdf = self.get_data_as_tsdf("input_data") - expected_data = self.get_data_as_sdf("expected_data") + input_tsdf = self.get_test_df_builder("init").as_tsdf() + expected_df = self.get_test_df_builder("expected").as_sdf() aggregate_df = aggregate(input_tsdf, "1 DAY", "min", fill=True) self.assertDataFrameEquality( aggregate_df, - expected_data, + expected_df, ) def test_aggregate_max(self): - input_tsdf = self.get_data_as_tsdf("input_data") - expected_data = self.get_data_as_sdf("expected_data") + input_tsdf = self.get_test_df_builder("init").as_tsdf() + expected_df = 
self.get_test_df_builder("expected").as_sdf() aggregate_df = aggregate(input_tsdf, "1 DAY", "max") self.assertDataFrameEquality( aggregate_df, - expected_data, + expected_df, ) def test_aggregate_ceiling(self): - input_tsdf = self.get_data_as_tsdf("input_data") - expected_data = self.get_data_as_sdf("expected_data") + input_tsdf = self.get_test_df_builder("init").as_tsdf() + expected_df = self.get_test_df_builder("expected").as_sdf() aggregate_df = aggregate(input_tsdf, "1 DAY", "ceil") self.assertDataFrameEquality( aggregate_df, - expected_data, + expected_df, ) def test_aggregate_invalid_func_arg(self): # TODO : we should not be hitting an UnboundLocalError - input_tsdf = self.get_data_as_tsdf("input_data") + input_tsdf = self.get_test_df_builder("init").as_tsdf() self.assertRaises(UnboundLocalError, aggregate, input_tsdf, "1 DAY", "average") diff --git a/python/tests/tsdf_tests.py b/python/tests/tsdf_tests.py index c36263e4..df25b462 100644 --- a/python/tests/tsdf_tests.py +++ b/python/tests/tsdf_tests.py @@ -18,7 +18,8 @@ class TSDFBaseTests(SparkTest): def test_TSDF_init(self): - tsdf_init = self.get_data_as_tsdf("init") + + tsdf_init = self.get_test_df_builder("init").as_tsdf() self.assertIsInstance(tsdf_init.df, DataFrame) self.assertEqual(tsdf_init.ts_col, "event_ts") @@ -29,7 +30,7 @@ def test_describe(self): """AS-OF Join without a time-partition test""" # Construct dataframes - tsdf_init = self.get_data_as_tsdf("init") + tsdf_init = self.get_test_df_builder("init").as_tsdf() # generate description dataframe res = tsdf_init.describe() @@ -57,7 +58,7 @@ def test_describe(self): ) def test__getSparkPlan(self): - init_tsdf = self.get_data_as_tsdf("init") + init_tsdf = self.get_test_df_builder("init").as_tsdf() plan = init_tsdf._TSDF__getSparkPlan(init_tsdf.df, self.spark) @@ -67,7 +68,7 @@ def test__getSparkPlan(self): self.assertIn("sizeInBytes", plan) def test__getBytesFromPlan(self): - init_tsdf = self.get_data_as_tsdf("init") + init_tsdf = 
self.get_test_df_builder("init").as_tsdf() _bytes = init_tsdf._TSDF__getBytesFromPlan(init_tsdf.df, self.spark) @@ -77,7 +78,7 @@ def test__getBytesFromPlan(self): def test__getBytesFromPlan_search_result_is_None(self, mock__getSparkPlan): mock__getSparkPlan.return_value = "will not match search value" - init_tsdf = self.get_data_as_tsdf("init") + init_tsdf = self.get_test_df_builder("init").as_tsdf() self.assertRaises( ValueError, @@ -90,7 +91,7 @@ def test__getBytesFromPlan_search_result_is_None(self, mock__getSparkPlan): def test__getBytesFromPlan_size_in_MiB(self, mock__getSparkPlan): mock__getSparkPlan.return_value = "' Statistics(sizeInBytes=1.0 MiB) '" - init_tsdf = self.get_data_as_tsdf("init") + init_tsdf = self.get_test_df_builder("init").as_tsdf() _bytes = init_tsdf._TSDF__getBytesFromPlan(init_tsdf.df, self.spark) expected = 1 * 1024 * 1024 @@ -101,7 +102,7 @@ def test__getBytesFromPlan_size_in_MiB(self, mock__getSparkPlan): def test__getBytesFromPlan_size_in_KiB(self, mock__getSparkPlan): mock__getSparkPlan.return_value = "' Statistics(sizeInBytes=1.0 KiB) '" - init_tsdf = self.get_data_as_tsdf("init") + init_tsdf = self.get_test_df_builder("init").as_tsdf() _bytes = init_tsdf._TSDF__getBytesFromPlan(init_tsdf.df, self.spark) @@ -111,7 +112,7 @@ def test__getBytesFromPlan_size_in_KiB(self, mock__getSparkPlan): def test__getBytesFromPlan_size_in_GiB(self, mock__getSparkPlan): mock__getSparkPlan.return_value = "' Statistics(sizeInBytes=1.0 GiB) '" - init_tsdf = self.get_data_as_tsdf("init") + init_tsdf = self.get_test_df_builder("init").as_tsdf() _bytes = init_tsdf._TSDF__getBytesFromPlan(init_tsdf.df, self.spark) @@ -130,7 +131,7 @@ def __tsdf_with_double_tscol(tsdf: TSDF) -> TSDF: return TSDF(with_double_tscol_df, tsdf.ts_col, tsdf.partitionCols) def test__add_double_ts(self): - init_tsdf = self.get_data_as_tsdf("init") + init_tsdf = self.get_test_df_builder("init").as_tsdf() df = init_tsdf._TSDF__add_double_ts() schema_string = 
df.schema.simpleString() @@ -165,12 +166,12 @@ def test__validate_ts_string_invalid(self): ) def test__validated_column_not_string(self): - init_df = self.get_data_as_tsdf("init").df + init_df = self.get_test_df_builder("init").as_sdf() self.assertRaises(TypeError, TSDF._TSDF__validated_column, init_df, 0) def test__validated_column_not_found(self): - init_df = self.get_data_as_tsdf("init").df + init_df = self.get_test_df_builder("init").as_sdf() self.assertRaises( ValueError, @@ -180,7 +181,7 @@ def test__validated_column_not_found(self): ) def test__validated_column(self): - init_df = self.get_data_as_tsdf("init").df + init_df = self.get_test_df_builder("init").as_sdf() self.assertEqual( TSDF._TSDF__validated_column(init_df, "symbol"), @@ -188,7 +189,7 @@ def test__validated_column(self): ) def test__validated_columns_string(self): - init_tsdf = self.get_data_as_tsdf("init") + init_tsdf = self.get_test_df_builder("init").as_tsdf() self.assertEqual( init_tsdf._TSDF__validated_columns(init_tsdf.df, "symbol"), @@ -196,7 +197,7 @@ def test__validated_columns_string(self): ) def test__validated_columns_none(self): - init_tsdf = self.get_data_as_tsdf("init") + init_tsdf = self.get_test_df_builder("init").as_tsdf() self.assertEqual( init_tsdf._TSDF__validated_columns(init_tsdf.df, None), @@ -204,7 +205,7 @@ def test__validated_columns_none(self): ) def test__validated_columns_tuple(self): - init_tsdf = self.get_data_as_tsdf("init") + init_tsdf = self.get_test_df_builder("init").as_tsdf() self.assertRaises( TypeError, @@ -214,7 +215,7 @@ def test__validated_columns_tuple(self): ) def test__validated_columns_list_multiple_elems(self): - init_tsdf = self.get_data_as_tsdf("init") + init_tsdf = self.get_test_df_builder("init").as_tsdf() self.assertEqual( init_tsdf._TSDF__validated_columns( @@ -225,19 +226,19 @@ def test__validated_columns_list_multiple_elems(self): ) def test__checkPartitionCols(self): - init_tsdf = self.get_data_as_tsdf("init") - right_tsdf = 
self.get_data_as_tsdf("right_tsdf") + init_tsdf = self.get_test_df_builder("init").as_tsdf() + right_tsdf = self.get_test_df_builder("right_tsdf").as_tsdf() self.assertRaises(ValueError, init_tsdf._TSDF__checkPartitionCols, right_tsdf) def test__validateTsColMatch(self): - init_tsdf = self.get_data_as_tsdf("init") - right_tsdf = self.get_data_as_tsdf("right_tsdf") + init_tsdf = self.get_test_df_builder("init").as_tsdf() + right_tsdf = self.get_test_df_builder("right_tsdf").as_tsdf() self.assertRaises(ValueError, init_tsdf._TSDF__validateTsColMatch, right_tsdf) def test__addPrefixToColumns_non_empty_string(self): - init_tsdf = self.get_data_as_tsdf("init") + init_tsdf = self.get_test_df_builder("init").as_tsdf() df = init_tsdf._TSDF__addPrefixToColumns(["event_ts"], "prefix").df @@ -246,7 +247,7 @@ def test__addPrefixToColumns_non_empty_string(self): self.assertIn("prefix_event_ts", schema_string) def test__addPrefixToColumns_empty_string(self): - init_tsdf = self.get_data_as_tsdf("init") + init_tsdf = self.get_test_df_builder("init").as_tsdf() df = init_tsdf._TSDF__addPrefixToColumns(["event_ts"], "").df @@ -256,7 +257,7 @@ def test__addPrefixToColumns_empty_string(self): self.assertIn(",event_ts", schema_string) def test__addColumnsFromOtherDF(self): - init_tsdf = self.get_data_as_tsdf("init") + init_tsdf = self.get_test_df_builder("init").as_tsdf() df = init_tsdf._TSDF__addColumnsFromOtherDF(["another_col"]).df @@ -265,8 +266,8 @@ def test__addColumnsFromOtherDF(self): self.assertIn("another_col", schema_string) def test__combineTSDF(self): - init1_tsdf = self.get_data_as_tsdf("init") - init2_tsdf = self.get_data_as_tsdf("init") + init1_tsdf = self.get_test_df_builder("init").as_tsdf() + init2_tsdf = self.get_test_df_builder("init").as_tsdf() union_tsdf = init1_tsdf._TSDF__combineTSDF(init2_tsdf, "combined_ts_col") df = union_tsdf.df @@ -281,51 +282,43 @@ def test__getLastRightRow(self): pass def test__getTimePartitions(self): - init_tsdf = 
self.get_data_as_tsdf("init") - expected_tsdf = self.get_data_as_tsdf("expected") + init_tsdf = self.get_test_df_builder("init").as_tsdf() + expected_tsdf = self.get_test_df_builder("expected").as_tsdf() actual_tsdf = init_tsdf._TSDF__getTimePartitions(10) - self.assertDataFrameEquality( - actual_tsdf, - expected_tsdf, - from_tsdf=True, - ) + self.assertDataFrameEquality(actual_tsdf, expected_tsdf) def test__getTimePartitions_with_fraction(self): - init_tsdf = self.get_data_as_tsdf("init") - expected_tsdf = self.get_data_as_tsdf("expected") + init_tsdf = self.get_test_df_builder("init").as_tsdf() + expected_tsdf = self.get_test_df_builder("expected").as_tsdf() actual_tsdf = init_tsdf._TSDF__getTimePartitions(10, 0.25) - self.assertDataFrameEquality( - actual_tsdf, - expected_tsdf, - from_tsdf=True, - ) + self.assertDataFrameEquality(actual_tsdf, expected_tsdf) def test_select_empty(self): # TODO: Can we narrow down to types of Exception? - init_tsdf = self.get_data_as_tsdf("init") + init_tsdf = self.get_test_df_builder("init").as_tsdf() self.assertRaises(Exception, init_tsdf.select) def test_select_only_required_cols(self): - init_tsdf = self.get_data_as_tsdf("init") + init_tsdf = self.get_test_df_builder("init").as_tsdf() tsdf = init_tsdf.select("event_ts", "symbol") self.assertEqual(tsdf.df.columns, ["event_ts", "symbol"]) def test_select_all_cols(self): - init_tsdf = self.get_data_as_tsdf("init") + init_tsdf = self.get_test_df_builder("init").as_tsdf() tsdf = init_tsdf.select("event_ts", "symbol", "trade_pr") self.assertEqual(tsdf.df.columns, ["event_ts", "symbol", "trade_pr"]) def test_show(self): - init_tsdf = self.get_data_as_tsdf("init") + init_tsdf = self.get_test_df_builder("init").as_tsdf() captured_output = StringIO() sys.stdout = captured_output @@ -350,7 +343,7 @@ def test_show(self): ) def test_show_n_5(self): - init_tsdf = self.get_data_as_tsdf("init") + init_tsdf = self.get_test_df_builder("init").as_tsdf() captured_output = StringIO() sys.stdout = 
captured_output @@ -373,14 +366,14 @@ def test_show_n_5(self): ) def test_show_k_gt_n(self): - init_tsdf = self.get_data_as_tsdf("init") + init_tsdf = self.get_test_df_builder("init").as_tsdf() captured_output = StringIO() sys.stdout = captured_output self.assertRaises(ValueError, init_tsdf.show, 5, 10) def test_show_truncate_false(self): - init_tsdf = self.get_data_as_tsdf("init") + init_tsdf = self.get_test_df_builder("init").as_tsdf() captured_output = StringIO() sys.stdout = captured_output @@ -405,7 +398,7 @@ def test_show_truncate_false(self): ) def test_show_vertical_true(self): - init_tsdf = self.get_data_as_tsdf("init") + init_tsdf = self.get_test_df_builder("init").as_tsdf() captured_output = StringIO() sys.stdout = captured_output @@ -450,7 +443,7 @@ def test_show_vertical_true(self): ) def test_show_vertical_true_n_5(self): - init_tsdf = self.get_data_as_tsdf("init") + init_tsdf = self.get_test_df_builder("init").as_tsdf() captured_output = StringIO() sys.stdout = captured_output @@ -484,7 +477,7 @@ def test_show_vertical_true_n_5(self): ) def test_show_truncate_false_vertical_true(self): - init_tsdf = self.get_data_as_tsdf("init") + init_tsdf = self.get_test_df_builder("init").as_tsdf() captured_output = StringIO() sys.stdout = captured_output @@ -532,20 +525,20 @@ def test_at_string_timestamp(self): """ Test of time-slicing at(..) function using a string timestamp """ - init_tsdf = self.get_data_as_tsdf("init") - expected_tsdf = self.get_data_as_tsdf("expected") + init_tsdf = self.get_test_df_builder("init").as_tsdf() + expected_tsdf = self.get_test_df_builder("expected").as_tsdf() target_ts = "2020-09-01 00:02:10" at_tsdf = init_tsdf.at(target_ts) - self.assertDataFrameEquality(at_tsdf, expected_tsdf, from_tsdf=True) + self.assertDataFrameEquality(at_tsdf, expected_tsdf) def test_at_numeric_timestamp(self): """ Test of time-slicint at(..) 
function using a numeric timestamp """ - init_tsdf = self.get_data_as_tsdf("init") - expected_tsdf = self.get_data_as_tsdf("expected") + init_tsdf = self.get_test_df_builder("init").as_tsdf() + expected_tsdf = self.get_test_df_builder("expected").as_tsdf() # test with numeric ts_col init_dbl_tsdf = self.__tsdf_with_double_tscol(init_tsdf) @@ -555,23 +548,23 @@ def test_at_numeric_timestamp(self): target_dbl = self.__timestamp_to_double(target_ts) at_dbl_tsdf = init_dbl_tsdf.at(target_dbl) - self.assertDataFrameEquality(at_dbl_tsdf, expected_dbl_tsdf, from_tsdf=True) + self.assertDataFrameEquality(at_dbl_tsdf, expected_dbl_tsdf) def test_before_string_timestamp(self): """ Test of time-slicing before(..) function """ - init_tsdf = self.get_data_as_tsdf("init") - expected_tsdf = self.get_data_as_tsdf("expected") + init_tsdf = self.get_test_df_builder("init").as_tsdf() + expected_tsdf = self.get_test_df_builder("expected").as_tsdf() target_ts = "2020-09-01 00:02:10" before_tsdf = init_tsdf.before(target_ts) - self.assertDataFrameEquality(before_tsdf, expected_tsdf, from_tsdf=True) + self.assertDataFrameEquality(before_tsdf, expected_tsdf) def test_before_numeric_timestamp(self): - init_tsdf = self.get_data_as_tsdf("init") - expected_tsdf = self.get_data_as_tsdf("expected") + init_tsdf = self.get_test_df_builder("init").as_tsdf() + expected_tsdf = self.get_test_df_builder("expected").as_tsdf() # test with numeric ts_col init_dbl_tsdf = self.__tsdf_with_double_tscol(init_tsdf) @@ -581,26 +574,26 @@ def test_before_numeric_timestamp(self): target_dbl = self.__timestamp_to_double(target_ts) before_dbl_tsdf = init_dbl_tsdf.before(target_dbl) - self.assertDataFrameEquality(before_dbl_tsdf, expected_dbl_tsdf, from_tsdf=True) + self.assertDataFrameEquality(before_dbl_tsdf, expected_dbl_tsdf) def test_atOrBefore_string_timestamp(self): """ Test of time-slicing atOrBefore(..) 
function """ - init_tsdf = self.get_data_as_tsdf("init") - expected_tsdf = self.get_data_as_tsdf("expected") + init_tsdf = self.get_test_df_builder("init").as_tsdf() + expected_tsdf = self.get_test_df_builder("expected").as_tsdf() target_ts = "2020-09-01 00:02:10" before_tsdf = init_tsdf.atOrBefore(target_ts) - self.assertDataFrameEquality(before_tsdf, expected_tsdf, from_tsdf=True) + self.assertDataFrameEquality(before_tsdf, expected_tsdf) def test_atOrBefore_numeric_timestamp(self): """ Test of time-slicing atOrBefore(..) function """ - init_tsdf = self.get_data_as_tsdf("init") - expected_tsdf = self.get_data_as_tsdf("expected") + init_tsdf = self.get_test_df_builder("init").as_tsdf() + expected_tsdf = self.get_test_df_builder("expected").as_tsdf() target_ts = "2020-09-01 00:02:10" @@ -611,26 +604,26 @@ def test_atOrBefore_numeric_timestamp(self): target_dbl = self.__timestamp_to_double(target_ts) before_dbl_tsdf = init_dbl_tsdf.atOrBefore(target_dbl) - self.assertDataFrameEquality(before_dbl_tsdf, expected_dbl_tsdf, from_tsdf=True) + self.assertDataFrameEquality(before_dbl_tsdf, expected_dbl_tsdf) def test_after_string_timestamp(self): """ Test of time-slicing after(..) function """ - init_tsdf = self.get_data_as_tsdf("init") - expected_tsdf = self.get_data_as_tsdf("expected") + init_tsdf = self.get_test_df_builder("init").as_tsdf() + expected_tsdf = self.get_test_df_builder("expected").as_tsdf() target_ts = "2020-09-01 00:02:10" after_tsdf = init_tsdf.after(target_ts) - self.assertDataFrameEquality(after_tsdf, expected_tsdf, from_tsdf=True) + self.assertDataFrameEquality(after_tsdf, expected_tsdf) def test_after_numeric_timestamp(self): """ Test of time-slicing after(..) 
function """ - init_tsdf = self.get_data_as_tsdf("init") - expected_tsdf = self.get_data_as_tsdf("expected") + init_tsdf = self.get_test_df_builder("init").as_tsdf() + expected_tsdf = self.get_test_df_builder("expected").as_tsdf() target_ts = "2020-09-01 00:02:10" @@ -641,26 +634,26 @@ def test_after_numeric_timestamp(self): target_dbl = self.__timestamp_to_double(target_ts) after_dbl_tsdf = init_dbl_tsdf.after(target_dbl) - self.assertDataFrameEquality(after_dbl_tsdf, expected_dbl_tsdf, from_tsdf=True) + self.assertDataFrameEquality(after_dbl_tsdf, expected_dbl_tsdf) def test_atOrAfter_string_timestamp(self): """ Test of time-slicing atOrAfter(..) function """ - init_tsdf = self.get_data_as_tsdf("init") - expected_tsdf = self.get_data_as_tsdf("expected") + init_tsdf = self.get_test_df_builder("init").as_tsdf() + expected_tsdf = self.get_test_df_builder("expected").as_tsdf() target_ts = "2020-09-01 00:02:10" after_tsdf = init_tsdf.atOrAfter(target_ts) - self.assertDataFrameEquality(after_tsdf, expected_tsdf, from_tsdf=True) + self.assertDataFrameEquality(after_tsdf, expected_tsdf) def test_atOrAfter_numeric_timestamp(self): """ Test of time-slicing atOrAfter(..) function """ - init_tsdf = self.get_data_as_tsdf("init") - expected_tsdf = self.get_data_as_tsdf("expected") + init_tsdf = self.get_test_df_builder("init").as_tsdf() + expected_tsdf = self.get_test_df_builder("expected").as_tsdf() target_ts = "2020-09-01 00:02:10" @@ -671,27 +664,27 @@ def test_atOrAfter_numeric_timestamp(self): target_dbl = self.__timestamp_to_double(target_ts) after_dbl_tsdf = init_dbl_tsdf.atOrAfter(target_dbl) - self.assertDataFrameEquality(after_dbl_tsdf, expected_dbl_tsdf, from_tsdf=True) + self.assertDataFrameEquality(after_dbl_tsdf, expected_dbl_tsdf) def test_between_string_timestamp(self): """ Test of time-slicing between(..) 
function """ - init_tsdf = self.get_data_as_tsdf("init") - expected_tsdf = self.get_data_as_tsdf("expected") + init_tsdf = self.get_test_df_builder("init").as_tsdf() + expected_tsdf = self.get_test_df_builder("expected").as_tsdf() ts1 = "2020-08-01 00:01:10" ts2 = "2020-09-01 00:18:00" between_tsdf = init_tsdf.between(ts1, ts2) - self.assertDataFrameEquality(between_tsdf, expected_tsdf, from_tsdf=True) + self.assertDataFrameEquality(between_tsdf, expected_tsdf) def test_between_numeric_timestamp(self): """ Test of time-slicing between(..) function """ - init_tsdf = self.get_data_as_tsdf("init") - expected_tsdf = self.get_data_as_tsdf("expected") + init_tsdf = self.get_test_df_builder("init").as_tsdf() + expected_tsdf = self.get_test_df_builder("expected").as_tsdf() ts1 = "2020-08-01 00:01:10" ts2 = "2020-09-01 00:18:00" @@ -705,28 +698,28 @@ def test_between_numeric_timestamp(self): between_dbl_tsdf = init_dbl_tsdf.between(ts1_dbl, ts2_dbl) self.assertDataFrameEquality( - between_dbl_tsdf, expected_dbl_tsdf, from_tsdf=True + between_dbl_tsdf, expected_dbl_tsdf ) def test_between_exclusive_string_timestamp(self): """ Test of time-slicing between(..) function """ - init_tsdf = self.get_data_as_tsdf("init") - expected_tsdf = self.get_data_as_tsdf("expected") + init_tsdf = self.get_test_df_builder("init").as_tsdf() + expected_tsdf = self.get_test_df_builder("expected").as_tsdf() ts1 = "2020-08-01 00:01:10" ts2 = "2020-09-01 00:18:00" between_tsdf = init_tsdf.between(ts1, ts2, inclusive=False) - self.assertDataFrameEquality(between_tsdf, expected_tsdf, from_tsdf=True) + self.assertDataFrameEquality(between_tsdf, expected_tsdf) def test_between_exclusive_numeric_timestamp(self): """ Test of time-slicing between(..) 
function """ - init_tsdf = self.get_data_as_tsdf("init") - expected_tsdf = self.get_data_as_tsdf("expected") + init_tsdf = self.get_test_df_builder("init").as_tsdf() + expected_tsdf = self.get_test_df_builder("expected").as_tsdf() ts1 = "2020-08-01 00:01:10" ts2 = "2020-09-01 00:18:00" @@ -740,26 +733,26 @@ def test_between_exclusive_numeric_timestamp(self): between_dbl_tsdf = init_dbl_tsdf.between(ts1_dbl, ts2_dbl, inclusive=False) self.assertDataFrameEquality( - between_dbl_tsdf, expected_dbl_tsdf, from_tsdf=True + between_dbl_tsdf, expected_dbl_tsdf ) def test_earliest_string_timestamp(self): """ Test of time-slicing earliest(..) function """ - init_tsdf = self.get_data_as_tsdf("init") - expected_tsdf = self.get_data_as_tsdf("expected") + init_tsdf = self.get_test_df_builder("init").as_tsdf() + expected_tsdf = self.get_test_df_builder("expected").as_tsdf() earliest_tsdf = init_tsdf.earliest(n=3) - self.assertDataFrameEquality(earliest_tsdf, expected_tsdf, from_tsdf=True) + self.assertDataFrameEquality(earliest_tsdf, expected_tsdf) def test_earliest_numeric_timestamp(self): """ Test of time-slicing earliest(..) function """ - init_tsdf = self.get_data_as_tsdf("init") - expected_tsdf = self.get_data_as_tsdf("expected") + init_tsdf = self.get_test_df_builder("init").as_tsdf() + expected_tsdf = self.get_test_df_builder("expected").as_tsdf() # test with numeric ts_col init_dbl_tsdf = self.__tsdf_with_double_tscol(init_tsdf) @@ -768,28 +761,28 @@ def test_earliest_numeric_timestamp(self): earliest_dbl_tsdf = init_dbl_tsdf.earliest(n=3) self.assertDataFrameEquality( - earliest_dbl_tsdf, expected_dbl_tsdf, from_tsdf=True + earliest_dbl_tsdf, expected_dbl_tsdf ) def test_latest_string_timestamp(self): """ Test of time-slicing latest(..) 
function """ - init_tsdf = self.get_data_as_tsdf("init") - expected_tsdf = self.get_data_as_tsdf("expected") + init_tsdf = self.get_test_df_builder("init").as_tsdf() + expected_tsdf = self.get_test_df_builder("expected").as_tsdf() latest_tsdf = init_tsdf.latest(n=3) self.assertDataFrameEquality( - latest_tsdf, expected_tsdf, ignore_row_order=True, from_tsdf=True + latest_tsdf, expected_tsdf, ignore_row_order=True ) def test_latest_numeric_timestamp(self): """ Test of time-slicing latest(..) function """ - init_tsdf = self.get_data_as_tsdf("init") - expected_tsdf = self.get_data_as_tsdf("expected") + init_tsdf = self.get_test_df_builder("init").as_tsdf() + expected_tsdf = self.get_test_df_builder("expected").as_tsdf() # test with numeric ts_col init_dbl_tsdf = self.__tsdf_with_double_tscol(init_tsdf) @@ -798,27 +791,27 @@ def test_latest_numeric_timestamp(self): latest_dbl_tsdf = init_dbl_tsdf.latest(n=3) self.assertDataFrameEquality( - latest_dbl_tsdf, expected_dbl_tsdf, ignore_row_order=True, from_tsdf=True + latest_dbl_tsdf, expected_dbl_tsdf, ignore_row_order=True ) def test_priorTo_string_timestamp(self): """ Test of time-slicing priorTo(..) function """ - init_tsdf = self.get_data_as_tsdf("init") - expected_tsdf = self.get_data_as_tsdf("expected") + init_tsdf = self.get_test_df_builder("init").as_tsdf() + expected_tsdf = self.get_test_df_builder("expected").as_tsdf() target_ts = "2020-09-01 00:02:00" prior_tsdf = init_tsdf.priorTo(target_ts) - self.assertDataFrameEquality(prior_tsdf, expected_tsdf, from_tsdf=True) + self.assertDataFrameEquality(prior_tsdf, expected_tsdf, ignore_column_order=True,) def test_priorTo_numeric_timestamp(self): """ Test of time-slicing priorTo(..) 
function """ - init_tsdf = self.get_data_as_tsdf("init") - expected_tsdf = self.get_data_as_tsdf("expected") + init_tsdf = self.get_test_df_builder("init").as_tsdf() + expected_tsdf = self.get_test_df_builder("expected").as_tsdf() target_ts = "2020-09-01 00:02:00" @@ -829,26 +822,26 @@ def test_priorTo_numeric_timestamp(self): target_dbl = self.__timestamp_to_double(target_ts) prior_dbl_tsdf = init_dbl_tsdf.priorTo(target_dbl) - self.assertDataFrameEquality(prior_dbl_tsdf, expected_dbl_tsdf, from_tsdf=True) + self.assertDataFrameEquality(prior_dbl_tsdf, expected_dbl_tsdf, ignore_column_order=True,) def test_subsequentTo_string_timestamp(self): """ Test of time-slicing subsequentTo(..) function """ - init_tsdf = self.get_data_as_tsdf("init") - expected_tsdf = self.get_data_as_tsdf("expected") + init_tsdf = self.get_test_df_builder("init").as_tsdf() + expected_tsdf = self.get_test_df_builder("expected").as_tsdf() target_ts = "2020-09-01 00:02:00" subsequent_tsdf = init_tsdf.subsequentTo(target_ts) - self.assertDataFrameEquality(subsequent_tsdf, expected_tsdf, from_tsdf=True) + self.assertDataFrameEquality(subsequent_tsdf, expected_tsdf) def test_subsequentTo_numeric_timestamp(self): """ Test of time-slicing subsequentTo(..) 
function """ - init_tsdf = self.get_data_as_tsdf("init") - expected_tsdf = self.get_data_as_tsdf("expected") + init_tsdf = self.get_test_df_builder("init").as_tsdf() + expected_tsdf = self.get_test_df_builder("expected").as_tsdf() target_ts = "2020-09-01 00:02:00" @@ -860,87 +853,84 @@ def test_subsequentTo_numeric_timestamp(self): subsequent_dbl_tsdf = init_dbl_tsdf.subsequentTo(target_dbl) self.assertDataFrameEquality( - subsequent_dbl_tsdf, expected_dbl_tsdf, from_tsdf=True + subsequent_dbl_tsdf, expected_dbl_tsdf ) def test__rowsBetweenWindow(self): - init_tsdf = self.get_data_as_tsdf("init") + init_tsdf = self.get_test_df_builder("init").as_tsdf() self.assertIsInstance(init_tsdf._TSDF__rowsBetweenWindow(1, 1), WindowSpec) def test_withPartitionCols(self): - init_tsdf = self.get_data_as_tsdf("init") + init_tsdf = self.get_test_df_builder("init").as_tsdf() actual_tsdf = init_tsdf.withPartitionCols(["symbol"]) self.assertEqual(init_tsdf.partitionCols, []) self.assertEqual(actual_tsdf.partitionCols, ["symbol"]) - def test_tsdf_interpolate(self): - ... 
- class FourierTransformTest(SparkTest): def test_fourier_transform(self): """Test of fourier transform functionality in TSDF objects""" # construct dataframes - tsdf_init = self.get_data_as_tsdf("init") - dfExpected = self.get_data_as_sdf("expected") + tsdf_init = self.get_test_df_builder("init").as_tsdf() + df_expected = self.get_test_df_builder("expected").as_sdf() # convert to TSDF result_tsdf = tsdf_init.fourier_transform(1, "val") # should be equal to the expected dataframe - self.assertDataFrameEquality(result_tsdf.df, dfExpected) + self.assertDataFrameEquality(result_tsdf.df, df_expected) def test_fourier_transform_valid_sequence_col_empty_partition_cols(self): """Test of fourier transform functionality in TSDF objects""" # construct dataframes - tsdf_init = self.get_data_as_tsdf("init") - dfExpected = self.get_data_as_sdf("expected") + tsdf_init = self.get_test_df_builder("init").as_tsdf() + df_expected = self.get_test_df_builder("expected").as_sdf() # convert to TSDF result_tsdf = tsdf_init.fourier_transform(1, "val") # should be equal to the expected dataframe - self.assertDataFrameEquality(result_tsdf.df, dfExpected) + self.assertDataFrameEquality(result_tsdf.df, df_expected) def test_fourier_transform_valid_sequence_col_valid_partition_cols(self): """Test of fourier transform functionality in TSDF objects""" # construct dataframes - tsdf_init = self.get_data_as_tsdf("init") - dfExpected = self.get_data_as_sdf("expected") + tsdf_init = self.get_test_df_builder("init").as_tsdf() + df_expected = self.get_test_df_builder("expected").as_sdf() # convert to TSDF result_tsdf = tsdf_init.fourier_transform(1, "val") # should be equal to the expected dataframe - self.assertDataFrameEquality(result_tsdf.df, dfExpected) + self.assertDataFrameEquality(result_tsdf.df, df_expected) def test_fourier_transform_no_sequence_col_empty_partition_cols(self): """Test of fourier transform functionality in TSDF objects""" # construct dataframes - tsdf_init = 
self.get_data_as_tsdf("init") - dfExpected = self.get_data_as_sdf("expected") + tsdf_init = self.get_test_df_builder("init").as_tsdf() + df_expected = self.get_test_df_builder("expected").as_sdf() # convert to TSDF result_tsdf = tsdf_init.fourier_transform(1, "val") # should be equal to the expected dataframe - self.assertDataFrameEquality(result_tsdf.df, dfExpected) + self.assertDataFrameEquality(result_tsdf.df, df_expected) class RangeStatsTest(SparkTest): def test_range_stats(self): - """Test of range stats for 20 minute rolling window""" + """Test of range stats for 20-minute rolling window""" # construct dataframes - tsdf_init = self.get_data_as_tsdf("init") - dfExpected = self.get_data_as_sdf("expected") + tsdf_init = self.get_test_df_builder("init").as_tsdf() + df_expected = self.get_test_df_builder("expected").as_sdf() # convert to TSDF @@ -961,7 +951,7 @@ def test_range_stats(self): ) # cast to decimal with precision in cents for simplicity - dfExpected = dfExpected.select( + df_expected = df_expected.select( sfn.col("symbol"), sfn.col("event_ts"), sfn.col("mean_trade_pr").cast("decimal(5, 2)"), @@ -974,14 +964,14 @@ def test_range_stats(self): ) # should be equal to the expected dataframe - self.assertDataFrameEquality(featured_df, dfExpected) + self.assertDataFrameEquality(featured_df, df_expected) def test_group_stats(self): """Test of range stats for 20 minute rolling window""" # construct dataframes - tsdf_init = self.get_data_as_tsdf("init") - dfExpected = self.get_data_as_sdf("expected") + tsdf_init = self.get_test_df_builder("init").as_tsdf() + df_expected = self.get_test_df_builder("expected").as_sdf() # using lookback of 20 minutes featured_df = tsdf_init.withGroupedStats(freq="1 min").df @@ -999,7 +989,7 @@ def test_group_stats(self): ) # cast to decimal with precision in cents for simplicity - dfExpected = dfExpected.select( + df_expected = df_expected.select( sfn.col("symbol"), sfn.col("event_ts"), sfn.col("mean_trade_pr").cast("decimal(5, 
2)"), @@ -1011,7 +1001,7 @@ def test_group_stats(self): ) # should be equal to the expected dataframe - self.assertDataFrameEquality(featured_df, dfExpected) + self.assertDataFrameEquality(featured_df, df_expected) class ResampleTest(SparkTest): @@ -1019,10 +1009,10 @@ def test_resample(self): """Test of range stats for 20 minute rolling window""" # construct dataframes - tsdf_input = self.get_data_as_tsdf("input") - dfExpected = self.get_data_as_sdf("expected") - expected_30s_df = self.get_data_as_sdf("expected30m") - barsExpected = self.get_data_as_sdf("expectedbars") + tsdf_input = self.get_test_df_builder("input").as_tsdf() + df_expected = self.get_test_df_builder("expected").as_sdf() + expected_30s_df = self.get_test_df_builder("expected30m").as_sdf() + bars_expected = self.get_test_df_builder("expectedbars").as_sdf() # 1 minute aggregation featured_df = tsdf_input.resample(freq="min", func="floor", prefix="floor").df @@ -1036,33 +1026,33 @@ def test_resample(self): ).df # should be equal to the expected dataframe - self.assertDataFrameEquality(featured_df, dfExpected) + self.assertDataFrameEquality(featured_df, df_expected) self.assertDataFrameEquality(resample_30m, expected_30s_df) # test bars summary - self.assertDataFrameEquality(bars, barsExpected) + self.assertDataFrameEquality(bars, bars_expected) def test_resample_millis(self): """Test of resampling for millisecond windows""" # construct dataframes - tsdf_init = self.get_data_as_tsdf("init") - dfExpected = self.get_data_as_sdf("expectedms") + tsdf_init = self.get_test_df_builder("init").as_tsdf() + df_expected = self.get_test_df_builder("expectedms").as_sdf() # 30 minute aggregation resample_ms = tsdf_init.resample(freq="ms", func="mean").df.withColumn( "trade_pr", sfn.round(sfn.col("trade_pr"), 2) ) - self.assertDataFrameEquality(resample_ms, dfExpected) + self.assertDataFrameEquality(resample_ms, df_expected) def test_upsample(self): - """Test of range stats for 20 minute rolling window""" + """Test 
of range stats for 20-minute rolling window""" # construct dataframes - tsdf_input = self.get_data_as_tsdf("input") - expected_30s_df = self.get_data_as_sdf("expected30m") - barsExpected = self.get_data_as_sdf("expectedbars") + tsdf_input = self.get_test_df_builder("input").as_tsdf() + expected_30s_df = self.get_test_df_builder("expected30m").as_sdf() + bars_expected = self.get_test_df_builder("expectedbars").as_sdf() resample_30m = tsdf_input.resample( freq="5 minutes", func="mean", fill=True @@ -1085,7 +1075,7 @@ def test_upsample(self): self.assertDataFrameEquality(upsampled, expected_30s_df) # test bars summary - self.assertDataFrameEquality(bars, barsExpected) + self.assertDataFrameEquality(bars, bars_expected) class ExtractStateIntervalsTest(SparkTest): @@ -1093,8 +1083,8 @@ class ExtractStateIntervalsTest(SparkTest): def test_eq_0(self): # construct dataframes - input_tsdf: TSDF = self.get_data_as_tsdf("input") - expected_df: DataFrame = self.get_data_as_sdf("expected") + input_tsdf: TSDF = self.get_test_df_builder("input").as_tsdf() + expected_df: DataFrame = self.get_test_df_builder("expected").as_sdf() # call extractStateIntervals method intervals_eq_1_df: DataFrame = input_tsdf.extractStateIntervals( @@ -1110,8 +1100,8 @@ def test_eq_0(self): def test_eq_1(self): # construct dataframes - input_tsdf: TSDF = self.get_data_as_tsdf("input") - expected_df: DataFrame = self.get_data_as_sdf("expected") + input_tsdf: TSDF = self.get_test_df_builder("input").as_tsdf() + expected_df: DataFrame = self.get_test_df_builder("expected").as_sdf() # call extractStateIntervals method intervals_eq_1_df: DataFrame = input_tsdf.extractStateIntervals( @@ -1127,8 +1117,8 @@ def test_eq_1(self): def test_ne_0(self): # construct dataframes - input_tsdf: TSDF = self.get_data_as_tsdf("input") - expected_df: DataFrame = self.get_data_as_sdf("expected") + input_tsdf: TSDF = self.get_test_df_builder("input").as_tsdf() + expected_df: DataFrame = 
self.get_test_df_builder("expected").as_sdf() # call extractStateIntervals method intervals_ne_0_df: DataFrame = input_tsdf.extractStateIntervals( @@ -1144,8 +1134,8 @@ def test_ne_0(self): def test_ne_1(self): # construct dataframes - input_tsdf: TSDF = self.get_data_as_tsdf("input") - expected_df: DataFrame = self.get_data_as_sdf("expected") + input_tsdf: TSDF = self.get_test_df_builder("input").as_tsdf() + expected_df: DataFrame = self.get_test_df_builder("expected").as_sdf() # call extractStateIntervals method intervals_ne_0_df: DataFrame = input_tsdf.extractStateIntervals( @@ -1161,8 +1151,8 @@ def test_ne_1(self): def test_gt_0(self): # construct dataframes - input_tsdf: TSDF = self.get_data_as_tsdf("input") - expected_df: DataFrame = self.get_data_as_sdf("expected") + input_tsdf: TSDF = self.get_test_df_builder("input").as_tsdf() + expected_df: DataFrame = self.get_test_df_builder("expected").as_sdf() # call extractStateIntervals method intervals_gt_df: DataFrame = input_tsdf.extractStateIntervals( @@ -1173,8 +1163,8 @@ def test_gt_0(self): def test_gt_1(self): # construct dataframes - input_tsdf: TSDF = self.get_data_as_tsdf("input") - expected_df: DataFrame = self.get_data_as_sdf("expected") + input_tsdf: TSDF = self.get_test_df_builder("input").as_tsdf() + expected_df: DataFrame = self.get_test_df_builder("expected").as_sdf() # call extractStateIntervals method intervals_gt_df: DataFrame = input_tsdf.extractStateIntervals( @@ -1185,8 +1175,8 @@ def test_gt_1(self): def test_lt_0(self): # construct dataframes - input_tsdf: TSDF = self.get_data_as_tsdf("input") - expected_df: DataFrame = self.get_data_as_sdf("expected") + input_tsdf: TSDF = self.get_test_df_builder("input").as_tsdf() + expected_df: DataFrame = self.get_test_df_builder("expected").as_sdf() # call extractStateIntervals method intervals_lt_df: DataFrame = input_tsdf.extractStateIntervals( @@ -1198,8 +1188,8 @@ def test_lt_0(self): def test_lt_1(self): # construct dataframes - input_tsdf: TSDF 
= self.get_data_as_tsdf("input") - expected_df: DataFrame = self.get_data_as_sdf("expected") + input_tsdf: TSDF = self.get_test_df_builder("input").as_tsdf() + expected_df: DataFrame = self.get_test_df_builder("expected").as_sdf() # call extractStateIntervals method intervals_lt_df: DataFrame = input_tsdf.extractStateIntervals( @@ -1211,8 +1201,8 @@ def test_lt_1(self): def test_gte_0(self): # construct dataframes - input_tsdf: TSDF = self.get_data_as_tsdf("input") - expected_df: DataFrame = self.get_data_as_sdf("expected") + input_tsdf: TSDF = self.get_test_df_builder("input").as_tsdf() + expected_df: DataFrame = self.get_test_df_builder("expected").as_sdf() # call extractStateIntervals method intervals_gt_df: DataFrame = input_tsdf.extractStateIntervals( @@ -1223,8 +1213,8 @@ def test_gte_0(self): def test_gte_1(self): # construct dataframes - input_tsdf: TSDF = self.get_data_as_tsdf("input") - expected_df: DataFrame = self.get_data_as_sdf("expected") + input_tsdf: TSDF = self.get_test_df_builder("input").as_tsdf() + expected_df: DataFrame = self.get_test_df_builder("expected").as_sdf() # call extractStateIntervals method intervals_gt_df: DataFrame = input_tsdf.extractStateIntervals( @@ -1235,8 +1225,8 @@ def test_gte_1(self): def test_lte_0(self): # construct dataframes - input_tsdf: TSDF = self.get_data_as_tsdf("input") - expected_df: DataFrame = self.get_data_as_sdf("expected") + input_tsdf: TSDF = self.get_test_df_builder("input").as_tsdf() + expected_df: DataFrame = self.get_test_df_builder("expected").as_sdf() # call extractStateIntervals method intervals_lte_df: DataFrame = input_tsdf.extractStateIntervals( @@ -1248,8 +1238,8 @@ def test_lte_0(self): def test_lte_1(self): # construct dataframes - input_tsdf: TSDF = self.get_data_as_tsdf("input") - expected_df: DataFrame = self.get_data_as_sdf("expected") + input_tsdf: TSDF = self.get_test_df_builder("input").as_tsdf() + expected_df: DataFrame = self.get_test_df_builder("expected").as_sdf() # call 
extractStateIntervals method intervals_lte_df: DataFrame = input_tsdf.extractStateIntervals( @@ -1261,8 +1251,8 @@ def test_lte_1(self): def test_threshold_fn(self): # construct dataframes - input_tsdf: TSDF = self.get_data_as_tsdf("input") - expected_df: DataFrame = self.get_data_as_sdf("expected") + input_tsdf: TSDF = self.get_test_df_builder("input").as_tsdf() + expected_df: DataFrame = self.get_test_df_builder("expected").as_sdf() # threshold state function def threshold_fn(a: Column, b: Column) -> Column: @@ -1278,8 +1268,8 @@ def threshold_fn(a: Column, b: Column) -> Column: def test_null_safe_eq_0(self): # construct dataframes - input_tsdf: TSDF = self.get_data_as_tsdf("input") - expected_df: DataFrame = self.get_data_as_sdf("expected") + input_tsdf: TSDF = self.get_test_df_builder("input").as_tsdf() + expected_df: DataFrame = self.get_test_df_builder("expected").as_sdf() intervals_eq_df: DataFrame = input_tsdf.extractStateIntervals( "metric_1", "metric_2", "metric_3", state_definition="<=>" @@ -1292,8 +1282,8 @@ def test_null_safe_eq_0(self): def test_null_safe_eq_1(self): # construct dataframes - input_tsdf: TSDF = self.get_data_as_tsdf("input") - expected_df: DataFrame = self.get_data_as_sdf("expected") + input_tsdf: TSDF = self.get_test_df_builder("input").as_tsdf() + expected_df: DataFrame = self.get_test_df_builder("expected").as_sdf() intervals_eq_df: DataFrame = input_tsdf.extractStateIntervals( "metric_1", "metric_2", "metric_3", state_definition="<=>" @@ -1306,8 +1296,8 @@ def test_null_safe_eq_1(self): def test_adjacent_intervals(self): # construct dataframes - input_tsdf: TSDF = self.get_data_as_tsdf("input") - expected_df: DataFrame = self.get_data_as_sdf("expected") + input_tsdf: TSDF = self.get_test_df_builder("input").as_tsdf() + expected_df: DataFrame = self.get_test_df_builder("expected").as_sdf() intervals_eq_df: DataFrame = input_tsdf.extractStateIntervals( "metric_1", "metric_2", "metric_3" @@ -1318,7 +1308,7 @@ def 
test_adjacent_intervals(self): def test_invalid_state_definition_str(self): # construct dataframes - input_tsdf: TSDF = self.get_data_as_tsdf("input") + input_tsdf: TSDF = self.get_test_df_builder("input").as_tsdf() try: input_tsdf.extractStateIntervals( @@ -1329,7 +1319,7 @@ def test_invalid_state_definition_str(self): def test_invalid_state_definition_type(self): # construct dataframes - input_tsdf: TSDF = self.get_data_as_tsdf("input") + input_tsdf: TSDF = self.get_test_df_builder("input").as_tsdf() try: input_tsdf.extractStateIntervals( diff --git a/python/tests/unit_test_data/as_of_join_tests.json b/python/tests/unit_test_data/as_of_join_tests.json index 0b7bba7e..6c183b8b 100644 --- a/python/tests/unit_test_data/as_of_join_tests.json +++ b/python/tests/unit_test_data/as_of_join_tests.json @@ -1,15 +1,20 @@ { "__SharedData": { "shared_left": { - "schema": "symbol string, event_ts string, trade_pr float", - "ts_col": "event_ts", - "partition_cols": ["symbol"], - "data": [ - ["S1", "2020-08-01 00:00:10", 349.21], - ["S1", "2020-08-01 00:01:12", 351.32], - ["S1", "2020-09-01 00:02:10", 361.1], - ["S1", "2020-09-01 00:19:12", 362.1] - ] + "tsdf": { + "ts_col": "event_ts", + "partition_cols": ["symbol"] + }, + "df": { + "schema": "symbol string, event_ts string, trade_pr float", + "ts_convert": ["event_ts"], + "data": [ + ["S1", "2020-08-01 00:00:10", 349.21], + ["S1", "2020-08-01 00:01:12", 351.32], + ["S1", "2020-09-01 00:02:10", 361.1], + ["S1", "2020-09-01 00:19:12", 362.1] + ] + } }, "test_asof_expected_data": [ ["S1", "2020-08-01 00:00:10", 349.21, "2020-08-01 00:00:01", 345.11, 351.12], @@ -24,32 +29,45 @@ "$ref": "#/__SharedData/shared_left" }, "right": { - "schema": "symbol string, event_ts string, bid_pr float, ask_pr float", - "ts_col": "event_ts", - "partition_cols": ["symbol"], - "data": [ - ["S1", "2020-08-01 00:00:01", 345.11, 351.12], - ["S1", "2020-08-01 00:01:05", 348.10, 353.13], - ["S1", "2020-09-01 00:02:01", 358.93, 365.12], - ["S1", 
"2020-09-01 00:15:01", 359.21, 365.31] - ] + "tsdf": { + "ts_col": "event_ts", + "partition_cols": ["symbol"] + }, + "df": { + "schema": "symbol string, event_ts string, bid_pr float, ask_pr float", + "ts_convert": ["event_ts"], + "data": [ + ["S1", "2020-08-01 00:00:01", 345.11, 351.12], + ["S1", "2020-08-01 00:01:05", 348.10, 353.13], + ["S1", "2020-09-01 00:02:01", 358.93, 365.12], + ["S1", "2020-09-01 00:15:01", 359.21, 365.31] + ] + } }, "expected": { - "schema": "symbol string, left_event_ts string, left_trade_pr float, right_event_ts string, right_bid_pr float, right_ask_pr float", - "ts_col": "left_event_ts", - "partition_cols": ["symbol"], - "other_ts_cols": ["right_event_ts"], - "data": { - "$ref": "#/__SharedData/test_asof_expected_data" + "tsdf": { + "ts_col": "left_event_ts", + "partition_cols": ["symbol"] + }, + "df": { + "schema": "symbol string, left_event_ts string, left_trade_pr float, right_event_ts string, right_bid_pr float, right_ask_pr float", + "ts_convert": ["left_event_ts", "right_event_ts"], + "data": { + "$ref": "#/__SharedData/test_asof_expected_data" + } } }, "expected_no_right_prefix": { - "schema": "symbol string, left_event_ts string, left_trade_pr float, event_ts string, bid_pr float, ask_pr float", - "ts_col": "left_event_ts", - "partition_cols": ["symbol"], - "other_ts_cols": ["event_ts"], - "data": { - "$ref": "#/__SharedData/test_asof_expected_data" + "tsdf": { + "ts_col": "left_event_ts", + "partition_cols": ["symbol"] + }, + "df": { + "schema": "symbol string, left_event_ts string, left_trade_pr float, event_ts string, bid_pr float, ask_pr float", + "ts_convert": ["left_event_ts", "event_ts"], + "data": { + "$ref": "#/__SharedData/test_asof_expected_data" + } } } }, @@ -58,158 +76,210 @@ "$ref": "#/__SharedData/shared_left" }, "right": { - "schema": "symbol string, event_ts string, bid_pr float, ask_pr float", - "ts_col": "event_ts", - "partition_cols": ["symbol"], - "data": [ - ["S1", "2020-08-01 00:00:01", 345.11, 351.12], 
- ["S1", "2020-08-01 00:01:05", null, 353.13], - ["S1", "2020-09-01 00:02:01", null, null], - ["S1", "2020-09-01 00:15:01", 359.21, 365.31] - ] + "tsdf": { + "ts_col": "event_ts", + "partition_cols": ["symbol"] + }, + "df": { + "schema": "symbol string, event_ts string, bid_pr float, ask_pr float", + "ts_convert": ["event_ts"], + "data": [ + ["S1", "2020-08-01 00:00:01", 345.11, 351.12], + ["S1", "2020-08-01 00:01:05", null, 353.13], + ["S1", "2020-09-01 00:02:01", null, null], + ["S1", "2020-09-01 00:15:01", 359.21, 365.31] + ] + } }, "expected_skip_nulls": { - "schema": "symbol string, left_event_ts string, left_trade_pr float, right_event_ts string, right_bid_pr float, right_ask_pr float", - "ts_col": "left_event_ts", - "partition_cols": ["symbol"], - "other_ts_cols": ["right_event_ts"], - "data": [ - ["S1", "2020-08-01 00:00:10", 349.21, "2020-08-01 00:00:01", 345.11, 351.12], - ["S1", "2020-08-01 00:01:12", 351.32, "2020-08-01 00:01:05", 345.11, 353.13], - ["S1", "2020-09-01 00:02:10", 361.1, "2020-09-01 00:02:01", 345.11, 353.13], - ["S1", "2020-09-01 00:19:12", 362.1, "2020-09-01 00:15:01", 359.21, 365.31] - ] + "tsdf": { + "ts_col": "left_event_ts", + "partition_cols": ["symbol"] + }, + "df": { + "schema": "symbol string, left_event_ts string, left_trade_pr float, right_event_ts string, right_bid_pr float, right_ask_pr float", + "ts_convert": ["left_event_ts", "right_event_ts"], + "data": [ + ["S1", "2020-08-01 00:00:10", 349.21, "2020-08-01 00:00:01", 345.11, 351.12], + ["S1", "2020-08-01 00:01:12", 351.32, "2020-08-01 00:01:05", 345.11, 353.13], + ["S1", "2020-09-01 00:02:10", 361.1, "2020-09-01 00:02:01", 345.11, 353.13], + ["S1", "2020-09-01 00:19:12", 362.1, "2020-09-01 00:15:01", 359.21, 365.31] + ] + } }, "expected_skip_nulls_disabled": { - "schema": "symbol string, left_event_ts string, left_trade_pr float, right_event_ts string, right_bid_pr float, right_ask_pr float", - "ts_col": "left_event_ts", - "partition_cols": ["symbol"], - "other_ts_cols": 
["right_event_ts"], - "data": [ - ["S1", "2020-08-01 00:00:10", 349.21, "2020-08-01 00:00:01", 345.11, 351.12], - ["S1", "2020-08-01 00:01:12", 351.32, "2020-08-01 00:01:05", null, 353.13], - ["S1", "2020-09-01 00:02:10", 361.1, "2020-09-01 00:02:01", null, null], - ["S1", "2020-09-01 00:19:12", 362.1, "2020-09-01 00:15:01", 359.21, 365.31] - ] + "tsdf": { + "ts_col": "left_event_ts", + "partition_cols": ["symbol"] + }, + "df": { + "schema": "symbol string, left_event_ts string, left_trade_pr float, right_event_ts string, right_bid_pr float, right_ask_pr float", + "ts_convert": ["left_event_ts", "right_event_ts"], + "data": [ + ["S1", "2020-08-01 00:00:10", 349.21, "2020-08-01 00:00:01", 345.11, 351.12], + ["S1", "2020-08-01 00:01:12", 351.32, "2020-08-01 00:01:05", null, 353.13], + ["S1", "2020-09-01 00:02:10", 361.1, "2020-09-01 00:02:01", null, null], + ["S1", "2020-09-01 00:19:12", 362.1, "2020-09-01 00:15:01", 359.21, 365.31] + ] + } } }, "test_sequence_number_sort": { "left": { - "schema": "symbol string, event_ts string, trade_pr float, trade_id int", - "ts_col": "event_ts", - "partition_cols": ["symbol"], - "data": [ - ["S1", "2020-08-01 00:00:10", 349.21, 1], - ["S1", "2020-08-01 00:00:10", 350.21, 5], - ["S1", "2020-08-01 00:01:12", 351.32, 2], - ["S1", "2020-09-01 00:02:10", 361.1, 3], - ["S1", "2020-09-01 00:19:12", 362.1, 4] - ] + "tsdf": { + "ts_col": "event_ts", + "partition_cols": ["symbol"] + }, + "df": { + "schema": "symbol string, event_ts string, trade_pr float, trade_id int", + "ts_convert": ["event_ts"], + "data": [ + ["S1", "2020-08-01 00:00:10", 349.21, 1], + ["S1", "2020-08-01 00:00:10", 350.21, 5], + ["S1", "2020-08-01 00:01:12", 351.32, 2], + ["S1", "2020-09-01 00:02:10", 361.1, 3], + ["S1", "2020-09-01 00:19:12", 362.1, 4] + ] + } }, "right": { - "schema": "symbol string, event_ts string, bid_pr float, ask_pr float, seq_nb long", - "ts_col": "event_ts", - "partition_cols": ["symbol"], - "sequence_col": "seq_nb", - "data": [ - ["S1", 
"2020-08-01 00:00:01", 345.11, 351.12, 1], - ["S1", "2020-08-01 00:00:10", 19.11, 20.12, 1], - ["S1", "2020-08-01 00:01:05", 348.10, 1000.13, 3], - ["S1", "2020-08-01 00:01:05", 348.10, 100.13, 2], - ["S1", "2020-09-01 00:02:01", 358.93, 365.12, 4], - ["S1", "2020-09-01 00:15:01", 359.21, 365.31, 5] - ] + "tsdf": { + "ts_col": "event_ts", + "partition_cols": ["symbol"], + "sequence_col": "seq_nb" + }, + "df": { + "schema": "symbol string, event_ts string, bid_pr float, ask_pr float, seq_nb long", + "ts_convert": ["event_ts"], + "data": [ + ["S1", "2020-08-01 00:00:01", 345.11, 351.12, 1], + ["S1", "2020-08-01 00:00:10", 19.11, 20.12, 1], + ["S1", "2020-08-01 00:01:05", 348.10, 1000.13, 3], + ["S1", "2020-08-01 00:01:05", 348.10, 100.13, 2], + ["S1", "2020-09-01 00:02:01", 358.93, 365.12, 4], + ["S1", "2020-09-01 00:15:01", 359.21, 365.31, 5] + ] + } }, "expected": { - "schema": "symbol string, event_ts string, trade_pr float, trade_id int, right_event_ts string, right_bid_pr float, right_ask_pr float, right_seq_nb long", - "ts_col": "event_ts", - "partition_cols": ["symbol"], - "other_ts_cols": ["right_event_ts"], - "data": [ - ["S1", "2020-08-01 00:00:10", 349.21, 1, "2020-08-01 00:00:10", 19.11, 20.12, 1], - ["S1", "2020-08-01 00:00:10", 350.21, 5, "2020-08-01 00:00:10", 19.11, 20.12, 1], - ["S1", "2020-08-01 00:01:12", 351.32, 2, "2020-08-01 00:01:05", 348.10, 1000.13, 3], - ["S1", "2020-09-01 00:02:10", 361.1, 3, "2020-09-01 00:02:01", 358.93, 365.12, 4], - ["S1", "2020-09-01 00:19:12", 362.1, 4, "2020-09-01 00:15:01", 359.21, 365.31, 5] - ] + "tsdf": { + "ts_col": "event_ts", + "partition_cols": ["symbol"] + }, + "df": { + "schema": "symbol string, event_ts string, trade_pr float, trade_id int, right_event_ts string, right_bid_pr float, right_ask_pr float, right_seq_nb long", + "ts_convert": ["event_ts", "right_event_ts"], + "data": [ + ["S1", "2020-08-01 00:00:10", 349.21, 1, "2020-08-01 00:00:10", 19.11, 20.12, 1], + ["S1", "2020-08-01 00:00:10",
350.21, 5, "2020-08-01 00:00:10", 19.11, 20.12, 1], + ["S1", "2020-08-01 00:01:12", 351.32, 2, "2020-08-01 00:01:05", 348.10, 1000.13, 3], + ["S1", "2020-09-01 00:02:10", 361.1, 3, "2020-09-01 00:02:01", 358.93, 365.12, 4], + ["S1", "2020-09-01 00:19:12", 362.1, 4, "2020-09-01 00:15:01", 359.21, 365.31, 5] + ] + } } }, "test_partitioned_asof_join": { "left": { - "schema": "symbol string, event_ts string, trade_pr float", - "ts_col": "event_ts", - "partition_cols": ["symbol"], - "data": [ - ["S1", "2020-08-01 00:00:02", 349.21], - ["S1", "2020-08-01 00:00:08", 351.32], - ["S1", "2020-08-01 00:00:11", 361.12], - ["S1", "2020-08-01 00:00:18", 364.31], - ["S1", "2020-08-01 00:00:19", 362.94], - ["S1", "2020-08-01 00:00:21", 364.27], - ["S1", "2020-08-01 00:00:23", 367.36] - ] + "tsdf": { + "ts_col": "event_ts", + "partition_cols": ["symbol"] + }, + "df": { + "schema": "symbol string, event_ts string, trade_pr float", + "ts_convert": ["event_ts"], + "data": [ + ["S1", "2020-08-01 00:00:02", 349.21], + ["S1", "2020-08-01 00:00:08", 351.32], + ["S1", "2020-08-01 00:00:11", 361.12], + ["S1", "2020-08-01 00:00:18", 364.31], + ["S1", "2020-08-01 00:00:19", 362.94], + ["S1", "2020-08-01 00:00:21", 364.27], + ["S1", "2020-08-01 00:00:23", 367.36] + ] + } }, "right": { - "schema": "symbol string, event_ts string, bid_pr float, ask_pr float", - "ts_col": "event_ts", - "partition_cols": ["symbol"], - "data": [ - ["S1", "2020-08-01 00:00:01", 345.11, 351.12], - ["S1", "2020-08-01 00:00:09", 348.10, 353.13], - ["S1", "2020-08-01 00:00:12", 358.93, 365.12], - ["S1", "2020-08-01 00:00:19", 359.21, 365.31] - ] + "tsdf": { + "ts_col": "event_ts", + "partition_cols": ["symbol"] + }, + "df": { + "schema": "symbol string, event_ts string, bid_pr float, ask_pr float", + "ts_convert": ["event_ts"], + "data": [ + ["S1", "2020-08-01 00:00:01", 345.11, 351.12], + ["S1", "2020-08-01 00:00:09", 348.10, 353.13], + ["S1", "2020-08-01 00:00:12", 358.93, 365.12], + ["S1", "2020-08-01 00:00:19", 
359.21, 365.31] + ] + } }, "expected": { - "schema": "symbol string, left_event_ts string, left_trade_pr float, right_event_ts string, right_bid_pr float, right_ask_pr float", - "ts_col": "left_event_ts", - "partition_cols": ["symbol"], - "other_ts_cols": ["right_event_ts"], - "data": [ - ["S1", "2020-08-01 00:00:02", 349.21, "2020-08-01 00:00:01", 345.11, 351.12], - ["S1", "2020-08-01 00:00:08", 351.32, "2020-08-01 00:00:01", 345.11, 351.12], - ["S1", "2020-08-01 00:00:11", 361.12, "2020-08-01 00:00:09", 348.10, 353.13], - ["S1", "2020-08-01 00:00:18", 364.31, "2020-08-01 00:00:12", 358.93, 365.12], - ["S1", "2020-08-01 00:00:19", 362.94, "2020-08-01 00:00:19", 359.21, 365.31], - ["S1", "2020-08-01 00:00:21", 364.27, "2020-08-01 00:00:19", 359.21, 365.31], - ["S1", "2020-08-01 00:00:23", 367.36, "2020-08-01 00:00:19", 359.21, 365.31] - ] + "tsdf": { + "ts_col": "left_event_ts", + "partition_cols": ["symbol"] + }, + "df": { + "schema": "symbol string, left_event_ts string, left_trade_pr float, right_event_ts string, right_bid_pr float, right_ask_pr float", + "ts_convert": ["left_event_ts", "right_event_ts"], + "data": [ + ["S1", "2020-08-01 00:00:02", 349.21, "2020-08-01 00:00:01", 345.11, 351.12], + ["S1", "2020-08-01 00:00:08", 351.32, "2020-08-01 00:00:01", 345.11, 351.12], + ["S1", "2020-08-01 00:00:11", 361.12, "2020-08-01 00:00:09", 348.10, 353.13], + ["S1", "2020-08-01 00:00:18", 364.31, "2020-08-01 00:00:12", 358.93, 365.12], + ["S1", "2020-08-01 00:00:19", 362.94, "2020-08-01 00:00:19", 359.21, 365.31], + ["S1", "2020-08-01 00:00:21", 364.27, "2020-08-01 00:00:19", 359.21, 365.31], + ["S1", "2020-08-01 00:00:23", 367.36, "2020-08-01 00:00:19", 359.21, 365.31] + ] + } } }, "test_asof_join_nanos": { "left": { - "schema": "symbol string, event_ts string, trade_pr float", - "ts_col": "event_ts", - "partition_cols": ["symbol"], - "data": [ - ["S1", "2022-01-01 09:59:59.123456789", 349.21], - ["S1", "2022-01-01 10:00:00.123456788", 351.32], - ["S1", "2022-01-01 
10:00:00.123456789", 361.12], - ["S1", "2022-01-01 10:00:01.123456789", 364.31] - ] + "tsdf": { + "ts_col": "event_ts", + "partition_cols": ["symbol"] + }, + "df": { + "schema": "symbol string, event_ts string, trade_pr float", + "data": [ + ["S1", "2020-08-01 00:00:10.123456789", 349.21], + ["S1", "2020-08-01 00:01:12.123456789", 351.32], + ["S1", "2020-09-01 00:02:10.123456789", 361.1], + ["S1", "2020-09-01 00:19:12.123456789", 362.1] + ] + } }, "right": { - "schema": "symbol string, event_ts string, bid_pr float, ask_pr float", - "ts_col": "event_ts", - "partition_cols": ["symbol"], - "data": [ - ["S1", "2022-01-01 10:00:00.1234567", 345.11, 351.12], - ["S1", "2022-01-01 10:00:00.12345671", 348.10, 353.13], - ["S1", "2022-01-01 10:00:00.12345675", 358.93, 365.12], - ["S1", "2022-01-01 10:00:00.12345677", 358.91, 365.33], - ["S1", "2022-01-01 10:00:01.10000001", 359.21, 365.31] - ] + "tsdf": { + "ts_col": "event_ts", + "partition_cols": ["symbol"] + }, + "df": { + "schema": "symbol string, event_ts string, bid_pr float, ask_pr float", + "data": [ + ["S1", "2020-08-01 00:00:01.123456789", 345.11, 351.12], + ["S1", "2020-08-01 00:01:05.123456789", 348.10, 353.13], + ["S1", "2020-09-01 00:02:01.123456789", 358.93, 365.12], + ["S1", "2020-09-01 00:15:01.123456789", 359.21, 365.31] + ] + } }, "expected": { - "schema": "symbol string, left_event_ts string, left_trade_pr float, right_event_ts string, right_ask_pr float, right_bid_pr float", - "ts_col": "left_event_ts", - "partition_cols": ["symbol"], - "data": [ - ["S1", "2022-01-01 09:59:59.123456789", 349.21, null, null, null], - ["S1", "2022-01-01 10:00:00.123456788", 351.32, "2022-01-01 10:00:00.12345677", 365.33, 358.91], - ["S1", "2022-01-01 10:00:00.123456789", 361.12, "2022-01-01 10:00:00.12345677", 365.33, 358.91], - ["S1", "2022-01-01 10:00:01.123456789", 364.31, "2022-01-01 10:00:01.10000001", 365.31, 359.21] - ] + "tsdf": { + "ts_col": "left_event_ts", + "partition_cols": ["symbol"] + }, + "df": { + 
"schema": "symbol string, left_event_ts double, left_trade_pr float, right_event_ts double, right_bid_pr float, right_ask_pr float", + "data": [ + ["S1", 1.5962400101234567E9, 349.21, 1.5962400011234567E9, 345.11, 351.12], + ["S1", 1.5962400721234567E9, 351.32, 1.5962400651234567E9, 348.10, 353.13], + ["S1", 1.5989185301234567E9, 361.1, 1.5989185211234567E9, 358.93, 365.12], + ["S1", 1.5989195521234567E9, 362.1, 1.5989193011234567E9, 359.21, 365.31] + ] + } } }, "test_asof_join_tolerance": { @@ -217,76 +287,101 @@ "$ref": "#/__SharedData/shared_left" }, "right": { - "schema": "symbol string, event_ts string, bid_pr float, ask_pr float", - "ts_col": "event_ts", - "partition_cols": ["symbol"], - "data": [ - ["S1", "2020-08-01 00:00:01", 345.11, 351.12], - ["S1", "2020-08-01 00:00:10", 345.22, 351.33], - ["S1", "2020-08-01 00:01:05", 348.10, 353.13], - ["S1", "2020-09-01 00:02:01", 358.93, 365.12], - ["S1", "2020-09-01 00:15:01", 359.21, 365.31] - ] + "tsdf": { + "ts_col": "event_ts", + "partition_cols": ["symbol"] + }, + "df": { + "schema": "symbol string, event_ts string, bid_pr float, ask_pr float", + "ts_convert": ["event_ts"], + "data": [ + ["S1", "2020-08-01 00:00:01", 345.11, 351.12], + ["S1", "2020-08-01 00:00:10", 345.22, 351.33], + ["S1", "2020-08-01 00:01:05", 348.10, 353.13], + ["S1", "2020-09-01 00:02:01", 358.93, 365.12], + ["S1", "2020-09-01 00:15:01", 359.21, 365.31] + ] + } }, "expected_tolerance_None": { - "schema": "symbol string, left_event_ts string, left_trade_pr float, right_event_ts string, right_bid_pr float, right_ask_pr float", - "ts_col": "left_event_ts", - "partition_cols": ["symbol"], - "other_ts_cols": ["right_event_ts"], - "data": [ - ["S1", "2020-08-01 00:00:10", 349.21, "2020-08-01 00:00:10", 345.22, 351.33], - ["S1", "2020-08-01 00:01:12", 351.32, "2020-08-01 00:01:05", 348.10, 353.13], - ["S1", "2020-09-01 00:02:10", 361.1, "2020-09-01 00:02:01", 358.93, 365.12], - ["S1", "2020-09-01 00:19:12", 362.1, "2020-09-01 00:15:01", 359.21, 
365.31] - ] + "tsdf": { + "ts_col": "left_event_ts", + "partition_cols": ["symbol"] + }, + "df": { + "schema": "symbol string, left_event_ts string, left_trade_pr float, right_event_ts string, right_bid_pr float, right_ask_pr float", + "ts_convert": ["left_event_ts", "right_event_ts"], + "data": [ + ["S1", "2020-08-01 00:00:10", 349.21, "2020-08-01 00:00:10", 345.22, 351.33], + ["S1", "2020-08-01 00:01:12", 351.32, "2020-08-01 00:01:05", 348.10, 353.13], + ["S1", "2020-09-01 00:02:10", 361.1, "2020-09-01 00:02:01", 358.93, 365.12], + ["S1", "2020-09-01 00:19:12", 362.1, "2020-09-01 00:15:01", 359.21, 365.31] + ] + } }, "expected_tolerance_0": { - "schema": "symbol string, left_event_ts string, left_trade_pr float, right_event_ts string, right_bid_pr float, right_ask_pr float", - "ts_col": "left_event_ts", - "partition_cols": ["symbol"], - "other_ts_cols": ["right_event_ts"], - "data": [ - ["S1", "2020-08-01 00:00:10", 349.21, "2020-08-01 00:00:10", 345.22, 351.33], - ["S1", "2020-08-01 00:01:12", 351.32, null, null, null], - ["S1", "2020-09-01 00:02:10", 361.1, null, null, null], - ["S1", "2020-09-01 00:19:12", 362.1, null, null, null] - ] + "tsdf": { + "ts_col": "left_event_ts", + "partition_cols": ["symbol"] + }, + "df": { + "schema": "symbol string, left_event_ts string, left_trade_pr float, right_event_ts string, right_bid_pr float, right_ask_pr float", + "ts_convert": ["left_event_ts", "right_event_ts"], + "data": [ + ["S1", "2020-08-01 00:00:10", 349.21, "2020-08-01 00:00:10", 345.22, 351.33], + ["S1", "2020-08-01 00:01:12", 351.32, null, null, null], + ["S1", "2020-09-01 00:02:10", 361.1, null, null, null], + ["S1", "2020-09-01 00:19:12", 362.1, null, null, null] + ] + } }, "expected_tolerance_5.5": { - "schema": "symbol string, left_event_ts string, left_trade_pr float, right_event_ts string, right_bid_pr float, right_ask_pr float", - "ts_col": "left_event_ts", - "partition_cols": ["symbol"], - "other_ts_cols": ["right_event_ts"], - "data": [ - ["S1", 
"2020-08-01 00:00:10", 349.21, "2020-08-01 00:00:10", 345.22, 351.33], - ["S1", "2020-08-01 00:01:12", 351.32, null, null, null], - ["S1", "2020-09-01 00:02:10", 361.1, null, null, null], - ["S1", "2020-09-01 00:19:12", 362.1, null, null, null] - ] + "tsdf": { + "ts_col": "left_event_ts", + "partition_cols": ["symbol"] + }, + "df": { + "schema": "symbol string, left_event_ts string, left_trade_pr float, right_event_ts string, right_bid_pr float, right_ask_pr float", + "ts_convert": ["left_event_ts", "right_event_ts"], + "data": [ + ["S1", "2020-08-01 00:00:10", 349.21, "2020-08-01 00:00:10", 345.22, 351.33], + ["S1", "2020-08-01 00:01:12", 351.32, null, null, null], + ["S1", "2020-09-01 00:02:10", 361.1, null, null, null], + ["S1", "2020-09-01 00:19:12", 362.1, null, null, null] + ] + } }, "expected_tolerance_7": { - "schema": "symbol string, left_event_ts string, left_trade_pr float, right_event_ts string, right_bid_pr float, right_ask_pr float", - "ts_col": "left_event_ts", - "partition_cols": ["symbol"], - "other_ts_cols": ["right_event_ts"], - "data": [ - ["S1", "2020-08-01 00:00:10", 349.21, "2020-08-01 00:00:10", 345.22, 351.33], - ["S1", "2020-08-01 00:01:12", 351.32, "2020-08-01 00:01:05", 348.10, 353.13], - ["S1", "2020-09-01 00:02:10", 361.1, null, null, null], - ["S1", "2020-09-01 00:19:12", 362.1, null, null, null] - ] + "tsdf": { + "ts_col": "left_event_ts", + "partition_cols": ["symbol"] + }, + "df": { + "schema": "symbol string, left_event_ts string, left_trade_pr float, right_event_ts string, right_bid_pr float, right_ask_pr float", + "ts_convert": ["left_event_ts", "right_event_ts"], + "data": [ + ["S1", "2020-08-01 00:00:10", 349.21, "2020-08-01 00:00:10", 345.22, 351.33], + ["S1", "2020-08-01 00:01:12", 351.32, "2020-08-01 00:01:05", 348.10, 353.13], + ["S1", "2020-09-01 00:02:10", 361.1, null, null, null], + ["S1", "2020-09-01 00:19:12", 362.1, null, null, null] + ] + } }, "expected_tolerance_10": { - "schema": "symbol string, left_event_ts 
string, left_trade_pr float, right_event_ts string, right_bid_pr float, right_ask_pr float", - "ts_col": "left_event_ts", - "partition_cols": ["symbol"], - "other_ts_cols": ["right_event_ts"], - "data": [ - ["S1", "2020-08-01 00:00:10", 349.21, "2020-08-01 00:00:10", 345.22, 351.33], - ["S1", "2020-08-01 00:01:12", 351.32, "2020-08-01 00:01:05", 348.10, 353.13], - ["S1", "2020-09-01 00:02:10", 361.1, "2020-09-01 00:02:01", 358.93, 365.12], - ["S1", "2020-09-01 00:19:12", 362.1, null, null, null] - ] + "tsdf": { + "ts_col": "left_event_ts", + "partition_cols": ["symbol"] + }, + "df": { + "schema": "symbol string, left_event_ts string, left_trade_pr float, right_event_ts string, right_bid_pr float, right_ask_pr float", + "ts_convert": ["left_event_ts", "right_event_ts"], + "data": [ + ["S1", "2020-08-01 00:00:10", 349.21, "2020-08-01 00:00:10", 345.22, 351.33], + ["S1", "2020-08-01 00:01:12", 351.32, "2020-08-01 00:01:05", 348.10, 353.13], + ["S1", "2020-09-01 00:02:10", 361.1, "2020-09-01 00:02:01", 358.93, 365.12], + ["S1", "2020-09-01 00:19:12", 362.1, null, null, null] + ] + } } }, "test_asof_join_sql_join_opt_and_bytes_threshold": { diff --git a/python/tests/unit_test_data/interpol_tests.json b/python/tests/unit_test_data/interpol_tests.json index ebea1a81..0f30061d 100644 --- a/python/tests/unit_test_data/interpol_tests.json +++ b/python/tests/unit_test_data/interpol_tests.json @@ -1,144 +1,17 @@ { "__SharedData": { - "input_data": { - "schema": "partition_a string, partition_b string, event_ts string, value_a float, value_b float", - "ts_col": "event_ts", - "partition_cols": [ - "partition_a", - "partition_b" - ], - "data": [ - [ - "A", - "A-1", - "2020-01-01 00:01:10", - 349.21, - null - ], - [ - "A", - "A-1", - "2020-01-01 00:02:03", - null, - 4.0 - ], - [ - "A", - "A-2", - "2020-01-01 00:01:15", - 340.21, - 9.0 - ], - [ - "B", - "B-1", - "2020-01-01 00:01:15", - 362.1, - 4.0 - ], - [ - "A", - "A-2", - "2020-01-01 00:01:17", - 353.32, - 8.0 - ], - [ - "B", - 
"B-2", - "2020-01-01 00:02:14", - null, - 6.0 - ], - [ - "A", - "A-1", - "2020-01-01 00:03:02", - 351.32, - 7.0 - ], - [ - "B", - "B-2", - "2020-01-01 00:01:12", - 361.1, - 5.0 - ] - ] - }, - "simple_input_data": { - "schema": "partition_a string, partition_b string, event_ts string, value_a float, value_b float", - "ts_col": "event_ts", - "partition_cols": [ - "partition_a", - "partition_b" - ], - "data": [ - [ - "A", - "A-1", - "2020-01-01 00:00:10", - 0.0, - null - ], - [ - "A", - "A-1", - "2020-01-01 00:01:10", - 2.0, - 2.0 - ], - [ - "A", - "A-1", - "2020-01-01 00:01:32", - null, - null - ], - [ - "A", - "A-1", - "2020-01-01 00:02:03", - null, - null - ], - [ - "A", - "A-1", - "2020-01-01 00:03:32", - null, - 7.0 - ], - [ - "A", - "A-1", - "2020-01-01 00:04:12", - 8.0, - 8.0 - ], - [ - "A", - "A-1", - "2020-01-01 00:05:31", - 11.0, - null - ] - ] - } - }, - "InterpolationUnitTest": { - "test_validate_col_exist_in_df": { - "input_data": { - "$ref": "#/__SharedData/input_data" - } - }, - "test_validate_col_target_cols_data_type": { - "input_data": { - "schema": "partition_a string, partition_b string, event_ts string, string_target string, float_target float", + "init": { + "tsdf": { "ts_col": "event_ts", "partition_cols": [ "partition_a", "partition_b" + ] + }, + "df": { + "schema": "partition_a string, partition_b string, event_ts string, value_a float, value_b float", + "ts_convert": [ + "event_ts" ], "data": [ [ @@ -200,1405 +73,1563 @@ ] } }, - "test_fill_validation": { - "input_data": { - "$ref": "#/__SharedData/input_data" - } - }, - "test_target_column_validation": { - "input_data": { - "$ref": "#/__SharedData/input_data" - } - }, - "test_partition_column_validation": { - "input_data": { - "$ref": "#/__SharedData/input_data" - } - }, - "test_ts_column_validation": { - "input_data": { - "$ref": "#/__SharedData/input_data" - } - }, - "test_zero_fill_interpolation": { - "simple_input_data": { - "$ref": "#/__SharedData/simple_input_data" - }, - 
"expected_data": { - "schema": "partition_a string, partition_b string, event_ts string, value_a double, value_b double, is_ts_interpolated boolean, is_interpolated_value_a boolean, is_interpolated_value_b boolean", + "simple_init": { + "tsdf": { "ts_col": "event_ts", "partition_cols": [ "partition_a", "partition_b" + ] + }, + "df": { + "schema": "partition_a string, partition_b string, event_ts string, value_a float, value_b float", + "ts_convert": [ + "event_ts" ], "data": [ [ "A", "A-1", - "2020-01-01 00:00:00", - 0.0, - 0.0, - false, - false, - true - ], - [ - "A", - "A-1", - "2020-01-01 00:00:30", - 0.0, + "2020-01-01 00:00:10", 0.0, - true, - true, - true + null ], [ "A", "A-1", - "2020-01-01 00:01:00", - 2.0, + "2020-01-01 00:01:10", 2.0, - false, - false, - false - ], - [ - "A", - "A-1", - "2020-01-01 00:01:30", - 0.0, - 0.0, - false, - true, - true - ], - [ - "A", - "A-1", - "2020-01-01 00:02:00", - 0.0, - 0.0, - false, - true, - true + 2.0 ], [ "A", "A-1", - "2020-01-01 00:02:30", - 0.0, - 0.0, - true, - true, - true + "2020-01-01 00:01:32", + null, + null ], [ "A", "A-1", - "2020-01-01 00:03:00", - 0.0, - 0.0, - true, - true, - true + "2020-01-01 00:02:03", + null, + null ], [ "A", "A-1", - "2020-01-01 00:03:30", - 0.0, - 7.0, - false, - true, - false + "2020-01-01 00:03:32", + null, + 7.0 ], [ "A", "A-1", - "2020-01-01 00:04:00", + "2020-01-01 00:04:12", 8.0, - 8.0, - false, - false, - false - ], - [ - "A", - "A-1", - "2020-01-01 00:04:30", - 0.0, - 0.0, - true, - true, - true - ], - [ - "A", - "A-1", - "2020-01-01 00:05:00", - 0.0, - 0.0, - true, - true, - true + 8.0 ], [ "A", "A-1", - "2020-01-01 00:05:30", + "2020-01-01 00:05:31", 11.0, - 0.0, - false, - false, - true + null ] ] } + } + }, + "InterpolationUnitTest": { + "test_is_resampled_type": { + "init": { + "$ref": "#/__SharedData/init" + } + }, + "test_validate_fill_method": { + "init": { + "$ref": "#/__SharedData/init" + } + }, + "test_validate_col_exist_in_df": { + "init": { + "$ref": 
"#/__SharedData/init" + } + }, + "test_validate_col_target_cols_data_type": { + "init": { + "df": { + "schema": "partition_a string, partition_b string, event_ts string, string_target string, float_target float", + "ts_convert": [ + "event_ts" + ], + "data": [ + [ + "A", + "A-1", + "2020-01-01 00:01:10", + 349.21, + null + ], + [ + "A", + "A-1", + "2020-01-01 00:02:03", + null, + 4.0 + ], + [ + "A", + "A-2", + "2020-01-01 00:01:15", + 340.21, + 9.0 + ], + [ + "B", + "B-1", + "2020-01-01 00:01:15", + 362.1, + 4.0 + ], + [ + "A", + "A-2", + "2020-01-01 00:01:17", + 353.32, + 8.0 + ], + [ + "B", + "B-2", + "2020-01-01 00:02:14", + null, + 6.0 + ], + [ + "A", + "A-1", + "2020-01-01 00:03:02", + 351.32, + 7.0 + ], + [ + "B", + "B-2", + "2020-01-01 00:01:12", + 361.1, + 5.0 + ] + ] + } + } + }, + "test_fill_validation": { + "init": { + "$ref": "#/__SharedData/init" + } + }, + "test_target_column_validation": { + "init": { + "$ref": "#/__SharedData/init" + } + }, + "test_partition_column_validation": { + "init": { + "$ref": "#/__SharedData/init" + } + }, + "test_ts_column_validation": { + "init": { + "$ref": "#/__SharedData/init" + } + }, + "test_zero_fill_interpolation": { + "simple_init": { + "$ref": "#/__SharedData/simple_init" + }, + "expected": { + "tsdf": { + "ts_col": "event_ts", + "partition_cols": [ + "partition_a", + "partition_b" + ] + }, + "df": { + "schema": "partition_a string, partition_b string, event_ts string, value_a double, value_b double, is_ts_interpolated boolean, is_interpolated_value_a boolean, is_interpolated_value_b boolean", + "ts_convert": [ + "event_ts" + ], + "data": [ + [ + "A", + "A-1", + "2020-01-01 00:00:00", + 0.0, + 0.0, + false, + false, + true + ], + [ + "A", + "A-1", + "2020-01-01 00:00:30", + 0.0, + 0.0, + true, + true, + true + ], + [ + "A", + "A-1", + "2020-01-01 00:01:00", + 2.0, + 2.0, + false, + false, + false + ], + [ + "A", + "A-1", + "2020-01-01 00:01:30", + 0.0, + 0.0, + false, + true, + true + ], + [ + "A", + "A-1", + 
"2020-01-01 00:02:00", + 0.0, + 0.0, + false, + true, + true + ], + [ + "A", + "A-1", + "2020-01-01 00:02:30", + 0.0, + 0.0, + true, + true, + true + ], + [ + "A", + "A-1", + "2020-01-01 00:03:00", + 0.0, + 0.0, + true, + true, + true + ], + [ + "A", + "A-1", + "2020-01-01 00:03:30", + 0.0, + 7.0, + false, + true, + false + ], + [ + "A", + "A-1", + "2020-01-01 00:04:00", + 8.0, + 8.0, + false, + false, + false + ], + [ + "A", + "A-1", + "2020-01-01 00:04:30", + 0.0, + 0.0, + true, + true, + true + ], + [ + "A", + "A-1", + "2020-01-01 00:05:00", + 0.0, + 0.0, + true, + true, + true + ], + [ + "A", + "A-1", + "2020-01-01 00:05:30", + 11.0, + 0.0, + false, + false, + true + ] + ] + } + } }, "test_zero_fill_interpolation_no_perform_checks": { - "simple_input_data": { - "$ref": "#/__SharedData/simple_input_data" + "simple_init": { + "$ref": "#/__SharedData/simple_init" }, - "expected_data": { - "$ref": "#/InterpolationUnitTest/test_zero_fill_interpolation/expected_data" + "expected": { + "$ref": "#/InterpolationUnitTest/test_zero_fill_interpolation/expected" } }, "test_null_fill_interpolation": { - "simple_input_data": { - "$ref": "#/__SharedData/simple_input_data" + "simple_init": { + "$ref": "#/__SharedData/simple_init" }, - "expected_data": { - "schema": "partition_a string, partition_b string, event_ts string, value_a double, value_b double, is_ts_interpolated boolean, is_interpolated_value_a boolean, is_interpolated_value_b boolean", - "ts_col": "event_ts", - "partition_cols": [ - "partition_a", - "partition_b" - ], - "data": [ - [ - "A", - "A-1", - "2020-01-01 00:00:00", - 0.0, - null, - false, - false, - true - ], - [ - "A", - "A-1", - "2020-01-01 00:00:30", - null, - null, - true, - true, - true - ], - [ - "A", - "A-1", - "2020-01-01 00:01:00", - 2.0, - 2.0, - false, - false, - false - ], - [ - "A", - "A-1", - "2020-01-01 00:01:30", - null, - null, - false, - true, - true - ], - [ - "A", - "A-1", - "2020-01-01 00:02:00", - null, - null, - false, - true, - true - 
], - [ - "A", - "A-1", - "2020-01-01 00:02:30", - null, - null, - true, - true, - true - ], - [ - "A", - "A-1", - "2020-01-01 00:03:00", - null, - null, - true, - true, - true - ], - [ - "A", - "A-1", - "2020-01-01 00:03:30", - null, - 7.0, - false, - true, - false - ], - [ - "A", - "A-1", - "2020-01-01 00:04:00", - 8.0, - 8.0, - false, - false, - false - ], - [ - "A", - "A-1", - "2020-01-01 00:04:30", - null, - null, - true, - true, - true - ], - [ - "A", - "A-1", - "2020-01-01 00:05:00", - null, - null, - true, - true, - true - ], - [ - "A", - "A-1", - "2020-01-01 00:05:30", - 11.0, - null, - false, - false, - true + "expected": { + "df": { + "schema": "partition_a string, partition_b string, event_ts string, value_a double, value_b double, is_ts_interpolated boolean, is_interpolated_value_a boolean, is_interpolated_value_b boolean", + "ts_convert": [ + "event_ts" + ], + "data": [ + [ + "A", + "A-1", + "2020-01-01 00:00:00", + 0.0, + null, + false, + false, + true + ], + [ + "A", + "A-1", + "2020-01-01 00:00:30", + null, + null, + true, + true, + true + ], + [ + "A", + "A-1", + "2020-01-01 00:01:00", + 2.0, + 2.0, + false, + false, + false + ], + [ + "A", + "A-1", + "2020-01-01 00:01:30", + null, + null, + false, + true, + true + ], + [ + "A", + "A-1", + "2020-01-01 00:02:00", + null, + null, + false, + true, + true + ], + [ + "A", + "A-1", + "2020-01-01 00:02:30", + null, + null, + true, + true, + true + ], + [ + "A", + "A-1", + "2020-01-01 00:03:00", + null, + null, + true, + true, + true + ], + [ + "A", + "A-1", + "2020-01-01 00:03:30", + null, + 7.0, + false, + true, + false + ], + [ + "A", + "A-1", + "2020-01-01 00:04:00", + 8.0, + 8.0, + false, + false, + false + ], + [ + "A", + "A-1", + "2020-01-01 00:04:30", + null, + null, + true, + true, + true + ], + [ + "A", + "A-1", + "2020-01-01 00:05:00", + null, + null, + true, + true, + true + ], + [ + "A", + "A-1", + "2020-01-01 00:05:30", + 11.0, + null, + false, + false, + true + ] ] - ] + } } }, 
"test_back_fill_interpolation": { - "simple_input_data": { - "$ref": "#/__SharedData/simple_input_data" + "simple_init": { + "$ref": "#/__SharedData/simple_init" }, - "expected_data": { - "schema": "partition_a string, partition_b string, event_ts string, value_a double, value_b double, is_ts_interpolated boolean, is_interpolated_value_a boolean, is_interpolated_value_b boolean", - "ts_col": "event_ts", - "partition_cols": [ - "partition_a", - "partition_b" - ], - "data": [ - [ - "A", - "A-1", - "2020-01-01 00:00:00", - 0.0, - 2.0, - false, - false, - true - ], - [ - "A", - "A-1", - "2020-01-01 00:00:30", - 2.0, - 2.0, - true, - true, - true - ], - [ - "A", - "A-1", - "2020-01-01 00:01:00", - 2.0, - 2.0, - false, - false, - false - ], - [ - "A", - "A-1", - "2020-01-01 00:01:30", - 8.0, - 7.0, - false, - true, - true - ], - [ - "A", - "A-1", - "2020-01-01 00:02:00", - 8.0, - 7.0, - false, - true, - true - ], - [ - "A", - "A-1", - "2020-01-01 00:02:30", - 8.0, - 7.0, - true, - true, - true - ], - [ - "A", - "A-1", - "2020-01-01 00:03:00", - 8.0, - 7.0, - true, - true, - true - ], - [ - "A", - "A-1", - "2020-01-01 00:03:30", - 8.0, - 7.0, - false, - true, - false - ], - [ - "A", - "A-1", - "2020-01-01 00:04:00", - 8.0, - 8.0, - false, - false, - false - ], - [ - "A", - "A-1", - "2020-01-01 00:04:30", - 11.0, - null, - true, - true, - true - ], - [ - "A", - "A-1", - "2020-01-01 00:05:00", - 11.0, - null, - true, - true, - true - ], - [ - "A", - "A-1", - "2020-01-01 00:05:30", - 11.0, - null, - false, - false, - true - ] - ] - } - }, - "test_forward_fill_interpolation": { - "simple_input_data": { - "$ref": "#/__SharedData/simple_input_data" - }, - "expected_data": { - "schema": "partition_a string, partition_b string, event_ts string, value_a double, value_b double, is_ts_interpolated boolean, is_interpolated_value_a boolean, is_interpolated_value_b boolean", - "ts_col": "event_ts", - "partition_cols": [ - "partition_a", - "partition_b" - ], - "data": [ - [ - "A", - 
"A-1", - "2020-01-01 00:00:00", - 0.0, - null, - false, - false, - true - ], - [ - "A", - "A-1", - "2020-01-01 00:00:30", - 0.0, - null, - true, - true, - true - ], - [ - "A", - "A-1", - "2020-01-01 00:01:00", - 2.0, - 2.0, - false, - false, - false - ], - [ - "A", - "A-1", - "2020-01-01 00:01:30", - 2.0, - 2.0, - false, - true, - true - ], - [ - "A", - "A-1", - "2020-01-01 00:02:00", - 2.0, - 2.0, - false, - true, - true - ], - [ - "A", - "A-1", - "2020-01-01 00:02:30", - 2.0, - 2.0, - true, - true, - true - ], - [ - "A", - "A-1", - "2020-01-01 00:03:00", - 2.0, - 2.0, - true, - true, - true - ], - [ - "A", - "A-1", - "2020-01-01 00:03:30", - 2.0, - 7.0, - false, - true, - false - ], - [ - "A", - "A-1", - "2020-01-01 00:04:00", - 8.0, - 8.0, - false, - false, - false - ], - [ - "A", - "A-1", - "2020-01-01 00:04:30", - 8.0, - 8.0, - true, - true, - true - ], - [ - "A", - "A-1", - "2020-01-01 00:05:00", - 8.0, - 8.0, - true, - true, - true - ], - [ - "A", - "A-1", - "2020-01-01 00:05:30", - 11.0, - 8.0, - false, - false, - true - ] - ] - } - }, - "test_linear_fill_interpolation": { - "simple_input_data": { - "$ref": "#/__SharedData/simple_input_data" - }, - "expected_data": { - "schema": "partition_a string, partition_b string, event_ts string, value_a double, value_b double, is_ts_interpolated boolean, is_interpolated_value_a boolean, is_interpolated_value_b boolean", - "ts_col": "event_ts", - "partition_cols": [ - "partition_a", - "partition_b" - ], - "data": [ - [ - "A", - "A-1", - "2020-01-01 00:00:00", - 0.0, - null, - false, - false, - true - ], - [ - "A", - "A-1", - "2020-01-01 00:00:30", - 1.0, - null, - true, - true, - true - ], - [ - "A", - "A-1", - "2020-01-01 00:01:00", - 2.0, - 2.0, - false, - false, - false - ], - [ - "A", - "A-1", - "2020-01-01 00:01:30", - 3.0, - 3.0, - false, - true, - true - ], - [ - "A", - "A-1", - "2020-01-01 00:02:00", - 4.0, - 4.0, - false, - true, - true - ], - [ - "A", - "A-1", - "2020-01-01 00:02:30", - 5.0, - 5.0, - true, - 
true, - true - ], - [ - "A", - "A-1", - "2020-01-01 00:03:00", - 6.0, - 6.0, - true, - true, - true - ], - [ - "A", - "A-1", - "2020-01-01 00:03:30", - 7.0, - 7.0, - false, - true, - false - ], - [ - "A", - "A-1", - "2020-01-01 00:04:00", - 8.0, - 8.0, - false, - false, - false - ], - [ - "A", - "A-1", - "2020-01-01 00:04:30", - 9.0, - null, - true, - true, - true - ], - [ - "A", - "A-1", - "2020-01-01 00:05:00", - 10.0, - null, - true, - true, - true - ], - [ - "A", - "A-1", - "2020-01-01 00:05:30", - 11.0, - null, - false, - false, - true - ] - ] - } - }, - "test_different_freq_abbreviations": { - "simple_input_data": { - "$ref": "#/__SharedData/simple_input_data" - }, - "expected_data": { - "schema": "partition_a string, partition_b string, event_ts string, value_a double, value_b double, is_ts_interpolated boolean, is_interpolated_value_a boolean, is_interpolated_value_b boolean", - "ts_col": "event_ts", - "partition_cols": [ - "partition_a", - "partition_b" - ], - "data": [ - [ - "A", - "A-1", - "2020-01-01 00:00:00", - 0.0, - null, - false, - false, - true - ], - [ - "A", - "A-1", - "2020-01-01 00:00:30", - 1.0, - null, - true, - true, - true - ], - [ - "A", - "A-1", - "2020-01-01 00:01:00", - 2.0, - 2.0, - false, - false, - false - ], - [ - "A", - "A-1", - "2020-01-01 00:01:30", - 3.0, - 3.0, - false, - true, - true - ], - [ - "A", - "A-1", - "2020-01-01 00:02:00", - 4.0, - 4.0, - false, - true, - true - ], - [ - "A", - "A-1", - "2020-01-01 00:02:30", - 5.0, - 5.0, - true, - true, - true - ], - [ - "A", - "A-1", - "2020-01-01 00:03:00", - 6.0, - 6.0, - true, - true, - true - ], - [ - "A", - "A-1", - "2020-01-01 00:03:30", - 7.0, - 7.0, - false, - true, - false - ], - [ - "A", - "A-1", - "2020-01-01 00:04:00", - 8.0, - 8.0, - false, - false, - false - ], - [ - "A", - "A-1", - "2020-01-01 00:04:30", - 9.0, - null, - true, - true, - true - ], - [ - "A", - "A-1", - "2020-01-01 00:05:00", - 10.0, - null, - true, - true, - true - ], - [ - "A", - "A-1", - 
"2020-01-01 00:05:30", - 11.0, - null, - false, - false, - true - ] - ] - } - }, - "test_show_interpolated": { - "simple_input_data": { - "$ref": "#/__SharedData/simple_input_data" - }, - "expected_data": { - "schema": "partition_a string, partition_b string, event_ts string, value_a double, value_b double", - "ts_col": "event_ts", - "partition_cols": [ - "partition_a", - "partition_b" - ], - "data": [ - [ - "A", - "A-1", - "2020-01-01 00:00:00", - 0.0, - null - ], - [ - "A", - "A-1", - "2020-01-01 00:00:30", - 1.0, - null - ], - [ - "A", - "A-1", - "2020-01-01 00:01:00", - 2.0, - 2.0 - ], - [ - "A", - "A-1", - "2020-01-01 00:01:30", - 3.0, - 3.0 - ], - [ - "A", - "A-1", - "2020-01-01 00:02:00", - 4.0, - 4.0 - ], - [ - "A", - "A-1", - "2020-01-01 00:02:30", - 5.0, - 5.0 - ], - [ - "A", - "A-1", - "2020-01-01 00:03:00", - 6.0, - 6.0 - ], - [ - "A", - "A-1", - "2020-01-01 00:03:30", - 7.0, - 7.0 - ], - [ - "A", - "A-1", - "2020-01-01 00:04:00", - 8.0, - 8.0 - ], - [ - "A", - "A-1", - "2020-01-01 00:04:30", - 9.0, - null - ], - [ - "A", - "A-1", - "2020-01-01 00:05:00", - 10.0, - null - ], - [ - "A", - "A-1", - "2020-01-01 00:05:30", - 11.0, - null - ] - ] - } - }, - "test_validate_ts_col_data_type_is_not_timestamp": { - "input_data": { - "$ref": "#/__SharedData/input_data" - } - }, - "test_interpolation_freq_is_none": { - "input_data": { - "$ref": "#/__SharedData/input_data" - } - }, - "test_interpolation_func_is_none": { - "input_data": { - "$ref": "#/__SharedData/input_data" - } - }, - "test_interpolation_func_is_callable": { - "input_data": { - "$ref": "#/__SharedData/input_data" - } - }, - "test_interpolation_freq_is_not_supported_type": { - "input_data": { - "$ref": "#/__SharedData/input_data" - } - } - }, - "InterpolationIntegrationTest": { - "test_interpolation_using_default_tsdf_params": { - "input_data": { - "$ref": "#/__SharedData/input_data" - }, - "simple_input_data": { - "$ref": "#/__SharedData/simple_input_data" - }, - "expected": { - "schema": 
"partition_a string, partition_b string, event_ts string, value_a double, value_b double", - "ts_col": "event_ts", - "partition_cols": [ - "partition_a", - "partition_b" - ], - "data": [ - [ - "A", - "A-1", - "2020-01-01 00:00:00", - 0.0, - null - ], - [ - "A", - "A-1", - "2020-01-01 00:00:30", - 1.0, - null - ], - [ - "A", - "A-1", - "2020-01-01 00:01:00", - 2.0, - 2.0 - ], - [ - "A", - "A-1", - "2020-01-01 00:01:30", - 3.0, - 3.0 - ], - [ - "A", - "A-1", - "2020-01-01 00:02:00", - 4.0, - 4.0 - ], - [ - "A", - "A-1", - "2020-01-01 00:02:30", - 5.0, - 5.0 - ], - [ - "A", - "A-1", - "2020-01-01 00:03:00", - 6.0, - 6.0 - ], - [ - "A", - "A-1", - "2020-01-01 00:03:30", - 7.0, - 7.0 - ], - [ - "A", - "A-1", - "2020-01-01 00:04:00", - 8.0, - 8.0 - ], - [ - "A", - "A-1", - "2020-01-01 00:04:30", - 9.0, - null - ], - [ - "A", - "A-1", - "2020-01-01 00:05:00", - 10.0, - null - ], - [ - "A", - "A-1", - "2020-01-01 00:05:30", - 11.0, - null + "expected": { + "df": { + "schema": "partition_a string, partition_b string, event_ts string, value_a double, value_b double, is_ts_interpolated boolean, is_interpolated_value_a boolean, is_interpolated_value_b boolean", + "ts_convert": [ + "event_ts" + ], + "data": [ + [ + "A", + "A-1", + "2020-01-01 00:00:00", + 0.0, + 2.0, + false, + false, + true + ], + [ + "A", + "A-1", + "2020-01-01 00:00:30", + 2.0, + 2.0, + true, + true, + true + ], + [ + "A", + "A-1", + "2020-01-01 00:01:00", + 2.0, + 2.0, + false, + false, + false + ], + [ + "A", + "A-1", + "2020-01-01 00:01:30", + 8.0, + 7.0, + false, + true, + true + ], + [ + "A", + "A-1", + "2020-01-01 00:02:00", + 8.0, + 7.0, + false, + true, + true + ], + [ + "A", + "A-1", + "2020-01-01 00:02:30", + 8.0, + 7.0, + true, + true, + true + ], + [ + "A", + "A-1", + "2020-01-01 00:03:00", + 8.0, + 7.0, + true, + true, + true + ], + [ + "A", + "A-1", + "2020-01-01 00:03:30", + 8.0, + 7.0, + false, + true, + false + ], + [ + "A", + "A-1", + "2020-01-01 00:04:00", + 8.0, + 8.0, + false, + false, + 
false + ], + [ + "A", + "A-1", + "2020-01-01 00:04:30", + 11.0, + null, + true, + true, + true + ], + [ + "A", + "A-1", + "2020-01-01 00:05:00", + 11.0, + null, + true, + true, + true + ], + [ + "A", + "A-1", + "2020-01-01 00:05:30", + 11.0, + null, + false, + false, + true + ] ] - ] + } + } + }, + "test_forward_fill_interpolation": { + "simple_init": { + "$ref": "#/__SharedData/simple_init" + }, + "expected": { + "df": { + "schema": "partition_a string, partition_b string, event_ts string, value_a double, value_b double, is_ts_interpolated boolean, is_interpolated_value_a boolean, is_interpolated_value_b boolean", + "ts_convert": [ + "event_ts" + ], + "data": [ + [ + "A", + "A-1", + "2020-01-01 00:00:00", + 0.0, + null, + false, + false, + true + ], + [ + "A", + "A-1", + "2020-01-01 00:00:30", + 0.0, + null, + true, + true, + true + ], + [ + "A", + "A-1", + "2020-01-01 00:01:00", + 2.0, + 2.0, + false, + false, + false + ], + [ + "A", + "A-1", + "2020-01-01 00:01:30", + 2.0, + 2.0, + false, + true, + true + ], + [ + "A", + "A-1", + "2020-01-01 00:02:00", + 2.0, + 2.0, + false, + true, + true + ], + [ + "A", + "A-1", + "2020-01-01 00:02:30", + 2.0, + 2.0, + true, + true, + true + ], + [ + "A", + "A-1", + "2020-01-01 00:03:00", + 2.0, + 2.0, + true, + true, + true + ], + [ + "A", + "A-1", + "2020-01-01 00:03:30", + 2.0, + 7.0, + false, + true, + false + ], + [ + "A", + "A-1", + "2020-01-01 00:04:00", + 8.0, + 8.0, + false, + false, + false + ], + [ + "A", + "A-1", + "2020-01-01 00:04:30", + 8.0, + 8.0, + true, + true, + true + ], + [ + "A", + "A-1", + "2020-01-01 00:05:00", + 8.0, + 8.0, + true, + true, + true + ], + [ + "A", + "A-1", + "2020-01-01 00:05:30", + 11.0, + 8.0, + false, + false, + true + ] + ] + } + } + }, + "test_linear_fill_interpolation": { + "simple_init": { + "$ref": "#/__SharedData/simple_init" + }, + "expected": { + "df": { + "schema": "partition_a string, partition_b string, event_ts string, value_a double, value_b double, is_ts_interpolated 
boolean, is_interpolated_value_a boolean, is_interpolated_value_b boolean", + "ts_convert": [ + "event_ts" + ], + "data": [ + [ + "A", + "A-1", + "2020-01-01 00:00:00", + 0.0, + null, + false, + false, + true + ], + [ + "A", + "A-1", + "2020-01-01 00:00:30", + 1.0, + null, + true, + true, + true + ], + [ + "A", + "A-1", + "2020-01-01 00:01:00", + 2.0, + 2.0, + false, + false, + false + ], + [ + "A", + "A-1", + "2020-01-01 00:01:30", + 3.0, + 3.0, + false, + true, + true + ], + [ + "A", + "A-1", + "2020-01-01 00:02:00", + 4.0, + 4.0, + false, + true, + true + ], + [ + "A", + "A-1", + "2020-01-01 00:02:30", + 5.0, + 5.0, + true, + true, + true + ], + [ + "A", + "A-1", + "2020-01-01 00:03:00", + 6.0, + 6.0, + true, + true, + true + ], + [ + "A", + "A-1", + "2020-01-01 00:03:30", + 7.0, + 7.0, + false, + true, + false + ], + [ + "A", + "A-1", + "2020-01-01 00:04:00", + 8.0, + 8.0, + false, + false, + false + ], + [ + "A", + "A-1", + "2020-01-01 00:04:30", + 9.0, + null, + true, + true, + true + ], + [ + "A", + "A-1", + "2020-01-01 00:05:00", + 10.0, + null, + true, + true, + true + ], + [ + "A", + "A-1", + "2020-01-01 00:05:30", + 11.0, + null, + false, + false, + true + ] + ] + } + } + }, + "test_different_freq_abbreviations": { + "simple_init": { + "$ref": "#/__SharedData/simple_init" + }, + "expected": { + "df": { + "schema": "partition_a string, partition_b string, event_ts string, value_a double, value_b double, is_ts_interpolated boolean, is_interpolated_value_a boolean, is_interpolated_value_b boolean", + "ts_convert": [ + "event_ts" + ], + "data": [ + [ + "A", + "A-1", + "2020-01-01 00:00:00", + 0.0, + null, + false, + false, + true + ], + [ + "A", + "A-1", + "2020-01-01 00:00:30", + 1.0, + null, + true, + true, + true + ], + [ + "A", + "A-1", + "2020-01-01 00:01:00", + 2.0, + 2.0, + false, + false, + false + ], + [ + "A", + "A-1", + "2020-01-01 00:01:30", + 3.0, + 3.0, + false, + true, + true + ], + [ + "A", + "A-1", + "2020-01-01 00:02:00", + 4.0, + 4.0, + 
false, + true, + true + ], + [ + "A", + "A-1", + "2020-01-01 00:02:30", + 5.0, + 5.0, + true, + true, + true + ], + [ + "A", + "A-1", + "2020-01-01 00:03:00", + 6.0, + 6.0, + true, + true, + true + ], + [ + "A", + "A-1", + "2020-01-01 00:03:30", + 7.0, + 7.0, + false, + true, + false + ], + [ + "A", + "A-1", + "2020-01-01 00:04:00", + 8.0, + 8.0, + false, + false, + false + ], + [ + "A", + "A-1", + "2020-01-01 00:04:30", + 9.0, + null, + true, + true, + true + ], + [ + "A", + "A-1", + "2020-01-01 00:05:00", + 10.0, + null, + true, + true, + true + ], + [ + "A", + "A-1", + "2020-01-01 00:05:30", + 11.0, + null, + false, + false, + true + ] + ] + } + } + }, + "test_show_interpolated": { + "simple_init": { + "$ref": "#/__SharedData/simple_init" + }, + "expected": { + "df": { + "schema": "partition_a string, partition_b string, event_ts string, value_a double, value_b double", + "ts_convert": [ + "event_ts" + ], + "data": [ + [ + "A", + "A-1", + "2020-01-01 00:00:00", + 0.0, + null + ], + [ + "A", + "A-1", + "2020-01-01 00:00:30", + 1.0, + null + ], + [ + "A", + "A-1", + "2020-01-01 00:01:00", + 2.0, + 2.0 + ], + [ + "A", + "A-1", + "2020-01-01 00:01:30", + 3.0, + 3.0 + ], + [ + "A", + "A-1", + "2020-01-01 00:02:00", + 4.0, + 4.0 + ], + [ + "A", + "A-1", + "2020-01-01 00:02:30", + 5.0, + 5.0 + ], + [ + "A", + "A-1", + "2020-01-01 00:03:00", + 6.0, + 6.0 + ], + [ + "A", + "A-1", + "2020-01-01 00:03:30", + 7.0, + 7.0 + ], + [ + "A", + "A-1", + "2020-01-01 00:04:00", + 8.0, + 8.0 + ], + [ + "A", + "A-1", + "2020-01-01 00:04:30", + 9.0, + null + ], + [ + "A", + "A-1", + "2020-01-01 00:05:00", + 10.0, + null + ], + [ + "A", + "A-1", + "2020-01-01 00:05:30", + 11.0, + null + ] + ] + } + } + }, + "test_validate_ts_col_data_type_is_not_timestamp": { + "init": { + "$ref": "#/__SharedData/init" + } + }, + "test_interpolation_freq_is_none": { + "init": { + "$ref": "#/__SharedData/init" + } + }, + "test_interpolation_func_is_none": { + "init": { + "$ref": "#/__SharedData/init" + } 
+ }, + "test_interpolation_func_is_callable": { + "init": { + "$ref": "#/__SharedData/init" + } + }, + "test_interpolation_freq_is_not_supported_type": { + "init": { + "$ref": "#/__SharedData/init" + } + } + }, + "InterpolationIntegrationTest": { + "test_interpolation_using_default_tsdf_params": { + "init": { + "$ref": "#/__SharedData/init" + }, + "simple_init": { + "$ref": "#/__SharedData/simple_init" + }, + "expected": { + "df": { + "schema": "partition_a string, partition_b string, event_ts string, value_a double, value_b double", + "ts_convert": [ + "event_ts" + ], + "data": [ + [ + "A", + "A-1", + "2020-01-01 00:00:00", + 0.0, + null + ], + [ + "A", + "A-1", + "2020-01-01 00:00:30", + 1.0, + null + ], + [ + "A", + "A-1", + "2020-01-01 00:01:00", + 2.0, + 2.0 + ], + [ + "A", + "A-1", + "2020-01-01 00:01:30", + 3.0, + 3.0 + ], + [ + "A", + "A-1", + "2020-01-01 00:02:00", + 4.0, + 4.0 + ], + [ + "A", + "A-1", + "2020-01-01 00:02:30", + 5.0, + 5.0 + ], + [ + "A", + "A-1", + "2020-01-01 00:03:00", + 6.0, + 6.0 + ], + [ + "A", + "A-1", + "2020-01-01 00:03:30", + 7.0, + 7.0 + ], + [ + "A", + "A-1", + "2020-01-01 00:04:00", + 8.0, + 8.0 + ], + [ + "A", + "A-1", + "2020-01-01 00:04:30", + 9.0, + null + ], + [ + "A", + "A-1", + "2020-01-01 00:05:00", + 10.0, + null + ], + [ + "A", + "A-1", + "2020-01-01 00:05:30", + 11.0, + null + ] + ] + } } }, "test_interpolation_using_custom_params": { - "input_data": { - "$ref": "#/__SharedData/input_data" + "init": { + "$ref": "#/__SharedData/init" }, - "simple_input_data": { - "$ref": "#/__SharedData/simple_input_data" + "simple_init": { + "$ref": "#/__SharedData/simple_init" }, "expected": { - "schema": "partition_a string, partition_b string, other_ts_col string, value_a double, is_ts_interpolated boolean, is_interpolated_value_a boolean", - "ts_col": "other_ts_col", - "partition_cols": [ - "partition_a", - "partition_b" - ], - "data": [ - [ - "A", - "A-1", - "2020-01-01 00:00:00", - 0.0, - false, - false - ], - [ - "A", - 
"A-1", - "2020-01-01 00:00:30", - 1.0, - true, - true - ], - [ - "A", - "A-1", - "2020-01-01 00:01:00", - 2.0, - false, - false - ], - [ - "A", - "A-1", - "2020-01-01 00:01:30", - 3.0, - false, - true - ], - [ - "A", - "A-1", - "2020-01-01 00:02:00", - 4.0, - false, - true - ], - [ - "A", - "A-1", - "2020-01-01 00:02:30", - 5.0, - true, - true - ], - [ - "A", - "A-1", - "2020-01-01 00:03:00", - 6.0, - true, - true - ], - [ - "A", - "A-1", - "2020-01-01 00:03:30", - 7.0, - false, - true - ], - [ - "A", - "A-1", - "2020-01-01 00:04:00", - 8.0, - false, - false - ], - [ - "A", - "A-1", - "2020-01-01 00:04:30", - 9.0, - true, - true - ], - [ - "A", - "A-1", - "2020-01-01 00:05:00", - 10.0, - true, - true - ], - [ - "A", - "A-1", - "2020-01-01 00:05:30", - 11.0, - false, - false + "df": { + "schema": "partition_a string, partition_b string, other_ts_col string, value_a double, is_ts_interpolated boolean, is_interpolated_value_a boolean", + "ts_convert": [ + "other_ts_col" + ], + "data": [ + [ + "A", + "A-1", + "2020-01-01 00:00:00", + 0.0, + false, + false + ], + [ + "A", + "A-1", + "2020-01-01 00:00:30", + 1.0, + true, + true + ], + [ + "A", + "A-1", + "2020-01-01 00:01:00", + 2.0, + false, + false + ], + [ + "A", + "A-1", + "2020-01-01 00:01:30", + 3.0, + false, + true + ], + [ + "A", + "A-1", + "2020-01-01 00:02:00", + 4.0, + false, + true + ], + [ + "A", + "A-1", + "2020-01-01 00:02:30", + 5.0, + true, + true + ], + [ + "A", + "A-1", + "2020-01-01 00:03:00", + 6.0, + true, + true + ], + [ + "A", + "A-1", + "2020-01-01 00:03:30", + 7.0, + false, + true + ], + [ + "A", + "A-1", + "2020-01-01 00:04:00", + 8.0, + false, + false + ], + [ + "A", + "A-1", + "2020-01-01 00:04:30", + 9.0, + true, + true + ], + [ + "A", + "A-1", + "2020-01-01 00:05:00", + 10.0, + true, + true + ], + [ + "A", + "A-1", + "2020-01-01 00:05:30", + 11.0, + false, + false + ] ] - ] + } } }, "test_interpolation_on_sampled_data": { - "input_data": { - "$ref": "#/__SharedData/input_data" + "init": { + 
"$ref": "#/__SharedData/init" }, - "simple_input_data": { - "$ref": "#/__SharedData/simple_input_data" + "simple_init": { + "$ref": "#/__SharedData/simple_init" }, "expected": { - "schema": "partition_a string, partition_b string, event_ts string, value_a double, is_ts_interpolated boolean, is_interpolated_value_a boolean", - "ts_col": "event_ts", - "partition_cols": [ - "partition_a", - "partition_b" - ], - "data": [ - [ - "A", - "A-1", - "2020-01-01 00:00:00", - 0.0, - false, - false - ], - [ - "A", - "A-1", - "2020-01-01 00:00:30", - 1.0, - true, - true - ], - [ - "A", - "A-1", - "2020-01-01 00:01:00", - 2.0, - false, - false - ], - [ - "A", - "A-1", - "2020-01-01 00:01:30", - 3.0, - false, - true - ], - [ - "A", - "A-1", - "2020-01-01 00:02:00", - 4.0, - false, - true - ], - [ - "A", - "A-1", - "2020-01-01 00:02:30", - 5.0, - true, - true - ], - [ - "A", - "A-1", - "2020-01-01 00:03:00", - 6.0, - true, - true - ], - [ - "A", - "A-1", - "2020-01-01 00:03:30", - 7.0, - false, - true - ], - [ - "A", - "A-1", - "2020-01-01 00:04:00", - 8.0, - false, - false - ], - [ - "A", - "A-1", - "2020-01-01 00:04:30", - 9.0, - true, - true - ], - [ - "A", - "A-1", - "2020-01-01 00:05:00", - 10.0, - true, - true - ], - [ - "A", - "A-1", - "2020-01-01 00:05:30", - 11.0, - false, - false + "df": { + "schema": "partition_a string, partition_b string, event_ts string, value_a double, is_ts_interpolated boolean, is_interpolated_value_a boolean", + "ts_convert": [ + "event_ts" + ], + "data": [ + [ + "A", + "A-1", + "2020-01-01 00:00:00", + 0.0, + false, + false + ], + [ + "A", + "A-1", + "2020-01-01 00:00:30", + 1.0, + true, + true + ], + [ + "A", + "A-1", + "2020-01-01 00:01:00", + 2.0, + false, + false + ], + [ + "A", + "A-1", + "2020-01-01 00:01:30", + 3.0, + false, + true + ], + [ + "A", + "A-1", + "2020-01-01 00:02:00", + 4.0, + false, + true + ], + [ + "A", + "A-1", + "2020-01-01 00:02:30", + 5.0, + true, + true + ], + [ + "A", + "A-1", + "2020-01-01 00:03:00", + 6.0, + true, + 
true + ], + [ + "A", + "A-1", + "2020-01-01 00:03:30", + 7.0, + false, + true + ], + [ + "A", + "A-1", + "2020-01-01 00:04:00", + 8.0, + false, + false + ], + [ + "A", + "A-1", + "2020-01-01 00:04:30", + 9.0, + true, + true + ], + [ + "A", + "A-1", + "2020-01-01 00:05:00", + 10.0, + true, + true + ], + [ + "A", + "A-1", + "2020-01-01 00:05:30", + 11.0, + false, + false + ] ] - ] + } } }, "test_defaults_with_resampled_df": { - "input_data": { - "$ref": "#/__SharedData/input_data" + "init": { + "$ref": "#/__SharedData/init" }, - "simple_input_data": { - "$ref": "#/__SharedData/simple_input_data" + "simple_init": { + "$ref": "#/__SharedData/simple_init" }, "expected": { - "schema": "partition_a string, partition_b string, event_ts string, value_a double, value_b double", - "ts_col": "event_ts", - "partition_cols": [ - "partition_a", - "partition_b" - ], - "data": [ - [ - "A", - "A-1", - "2020-01-01 00:00:00", - 0.0, - null - ], - [ - "A", - "A-1", - "2020-01-01 00:00:30", - 0.0, - null - ], - [ - "A", - "A-1", - "2020-01-01 00:01:00", - 2.0, - 2.0 - ], - [ - "A", - "A-1", - "2020-01-01 00:01:30", - 2.0, - 2.0 - ], - [ - "A", - "A-1", - "2020-01-01 00:02:00", - 2.0, - 2.0 - ], - [ - "A", - "A-1", - "2020-01-01 00:02:30", - 2.0, - 2.0 - ], - [ - "A", - "A-1", - "2020-01-01 00:03:00", - 2.0, - 2.0 - ], - [ - "A", - "A-1", - "2020-01-01 00:03:30", - 2.0, - 7.0 - ], - [ - "A", - "A-1", - "2020-01-01 00:04:00", - 8.0, - 8.0 - ], - [ - "A", - "A-1", - "2020-01-01 00:04:30", - 8.0, - 8.0 - ], - [ - "A", - "A-1", - "2020-01-01 00:05:00", - 8.0, - 8.0 - ], - [ - "A", - "A-1", - "2020-01-01 00:05:30", - 11.0, - 8.0 + "df": { + "schema": "partition_a string, partition_b string, event_ts string, value_a double, value_b double", + "ts_convert": [ + "event_ts" + ], + "data": [ + [ + "A", + "A-1", + "2020-01-01 00:00:00", + 0.0, + null + ], + [ + "A", + "A-1", + "2020-01-01 00:00:30", + 0.0, + null + ], + [ + "A", + "A-1", + "2020-01-01 00:01:00", + 2.0, + 2.0 + ], + [ + "A", + 
"A-1", + "2020-01-01 00:01:30", + 2.0, + 2.0 + ], + [ + "A", + "A-1", + "2020-01-01 00:02:00", + 2.0, + 2.0 + ], + [ + "A", + "A-1", + "2020-01-01 00:02:30", + 2.0, + 2.0 + ], + [ + "A", + "A-1", + "2020-01-01 00:03:00", + 2.0, + 2.0 + ], + [ + "A", + "A-1", + "2020-01-01 00:03:30", + 2.0, + 7.0 + ], + [ + "A", + "A-1", + "2020-01-01 00:04:00", + 8.0, + 8.0 + ], + [ + "A", + "A-1", + "2020-01-01 00:04:30", + 8.0, + 8.0 + ], + [ + "A", + "A-1", + "2020-01-01 00:05:00", + 8.0, + 8.0 + ], + [ + "A", + "A-1", + "2020-01-01 00:05:30", + 11.0, + 8.0 + ] ] - ] + } } }, "test_tsdf_constructor_params_are_updated": { - "simple_input_data": { - "$ref": "#/__SharedData/simple_input_data" + "simple_init": { + "$ref": "#/__SharedData/simple_init" } } } diff --git a/python/tests/unit_test_data/intervals_tests.json b/python/tests/unit_test_data/intervals_tests.json index 722ddbec..22b01a96 100644 --- a/python/tests/unit_test_data/intervals_tests.json +++ b/python/tests/unit_test_data/intervals_tests.json @@ -1,66 +1,22 @@ { "__SharedData": { "init": { - "schema": "start_ts STRING NOT NULL, end_ts STRING NOT NULL, series_1 STRING NOT NULL, metric_1 INT, metric_2 INT", - "other_ts_cols": [ - "start_ts", - "end_ts" - ], - "start_ts": "start_ts", - "end_ts": "end_ts", - "series": [ - "series_1" - ], - "data": [ - [ - "2020-08-01 00:00:09", - "2020-08-01 00:00:14", - "v1", - 5, - null - ], - [ - "2020-08-01 00:00:09", - "2020-08-01 00:00:11", - "v1", - null, - 0 - ], - [ - "2020-08-01 00:00:11", - "2020-08-01 00:00:12", - "v1", - null, - 4 - ] - ] - } - }, - "IntervalsDFTests": { - "test_init_series_str": { - "input": { - "$ref": "#/__SharedData/init" - } - }, - "test_init_series_comma_seperated_str": { - "input": { - "schema": "start_ts STRING NOT NULL, end_ts STRING NOT NULL, series_1 STRING NOT NULL, series_2 STRING NOT NULL, metric_1 INT, metric_2 INT", - "other_ts_cols": [ - "start_ts", - "end_ts" - ], + "idf": { "start_ts": "start_ts", "end_ts": "end_ts", - "series": [ - 
"series_1", - "series_2" + "series_ids": ["series_1"] + }, + "df": { + "schema": "start_ts STRING NOT NULL, end_ts STRING NOT NULL, series_1 STRING NOT NULL, metric_1 INT, metric_2 INT", + "ts_convert": [ + "start_ts", + "end_ts" ], "data": [ [ "2020-08-01 00:00:09", "2020-08-01 00:00:14", "v1", - "v2", 5, null ], @@ -68,7 +24,6 @@ "2020-08-01 00:00:09", "2020-08-01 00:00:11", "v1", - "v2", null, 0 ], @@ -76,351 +31,380 @@ "2020-08-01 00:00:11", "2020-08-01 00:00:12", "v1", - "v2", null, 4 ] ] } + } + }, + "IntervalsDFTests": { + "test_init_series_str": { + "init": { + "$ref": "#/__SharedData/init" + } + }, + "test_init_series_comma_seperated_str": { + "init": { + "df": { + "schema": "start_ts STRING NOT NULL, end_ts STRING NOT NULL, series_1 STRING NOT NULL, series_2 STRING NOT NULL, metric_1 INT, metric_2 INT", + "ts_convert": [ + "start_ts", + "end_ts" + ], + "data": [ + [ + "2020-08-01 00:00:09", + "2020-08-01 00:00:14", + "v1", + "v2", + 5, + null + ], + [ + "2020-08-01 00:00:09", + "2020-08-01 00:00:11", + "v1", + "v2", + null, + 0 + ], + [ + "2020-08-01 00:00:11", + "2020-08-01 00:00:12", + "v1", + "v2", + null, + 4 + ] + ] + } + } }, "test_init_series_tuple": { - "input": { + "init": { "$ref": "#/__SharedData/init" } }, "test_init_series_list": { - "input": { + "init": { "$ref": "#/__SharedData/init" } }, "test_init_series_none": { - "input": { - "$ref": "#/__SharedData/init" + "init": { + "idf": { + "start_ts": "start_ts", + "end_ts": "end_ts", + "series_ids": [] + }, + "df": { + "$ref": "#/__SharedData/init/df" + } } }, "test_init_series_int": { - "input": { + "init": { "$ref": "#/__SharedData/init" } }, "test_window_property": { - "input": { + "init": { "$ref": "#/__SharedData/init" } }, "test_init_metric_none": { - "input": { + "init": { "$ref": "#/__SharedData/init" } }, "test_fromStackedMetrics_series_str": { - "input": { - "schema": "start_ts STRING NOT NULL, end_ts STRING NOT NULL, series_1 STRING NOT NULL, metric_name STRING NOT NULL, metric_value 
INT NOT NULL", - "data": [ - [ - "2020-08-01 00:00:09", - "2020-08-01 00:00:14", - "v1", - "metric_1", - 5 - ], - [ - "2020-08-01 00:00:09", - "2020-08-01 00:00:11", - "v1", - "metric_2", - 0 - ], - [ - "2020-08-01 00:00:11", - "2020-08-01 00:00:12", - "v1", - "metric_2", - 4 + "init": { + "df": { + "schema": "start_ts STRING NOT NULL, end_ts STRING NOT NULL, series_1 STRING NOT NULL, metric_name STRING NOT NULL, metric_value INT NOT NULL", + "data": [ + [ + "2020-08-01 00:00:09", + "2020-08-01 00:00:14", + "v1", + "metric_1", + 5 + ], + [ + "2020-08-01 00:00:09", + "2020-08-01 00:00:11", + "v1", + "metric_2", + 0 + ], + [ + "2020-08-01 00:00:11", + "2020-08-01 00:00:12", + "v1", + "metric_2", + 4 + ] ] - ] + } } }, "test_fromStackedMetrics_series_tuple": { - "input": { - "$ref": "#/IntervalsDFTests/test_fromStackedMetrics_series_str/input" + "init": { + "$ref": "#/IntervalsDFTests/test_fromStackedMetrics_series_str/init" } }, "test_fromStackedMetrics_series_list": { - "input": { - "$ref": "#/IntervalsDFTests/test_fromStackedMetrics_series_str/input" + "init": { + "$ref": "#/IntervalsDFTests/test_fromStackedMetrics_series_str/init" }, "expected": { - "schema": "start_ts STRING NOT NULL, end_ts STRING NOT NULL, series_1 STRING NOT NULL, metric_1 INT, metric_2 INT", - "other_ts_cols": [ - "start_ts", - "end_ts" - ], - "start_ts": "start_ts", - "end_ts": "end_ts", - "series": [ - "series_1" - ], - "data": [ - [ - "2020-08-01 00:00:09", - "2020-08-01 00:00:14", - "v1", - 5, - null - ], - [ - "2020-08-01 00:00:09", - "2020-08-01 00:00:11", - "v1", - null, - 0 + "idf": { + "start_ts": "start_ts", + "end_ts": "end_ts", + "series_ids": ["series_1"] + }, + "df": { + "schema": "start_ts STRING NOT NULL, end_ts STRING NOT NULL, series_1 STRING NOT NULL, metric_1 INT, metric_2 INT", + "ts_convert": [ + "start_ts", + "end_ts" ], - [ - "2020-08-01 00:00:11", - "2020-08-01 00:00:12", - "v1", - null, - 4 + "data": [ + [ + "2020-08-01 00:00:09", + "2020-08-01 00:00:14", + "v1", + 
5, + null + ], + [ + "2020-08-01 00:00:09", + "2020-08-01 00:00:11", + "v1", + null, + 0 + ], + [ + "2020-08-01 00:00:11", + "2020-08-01 00:00:12", + "v1", + null, + 4 + ] ] - ] + } } }, "test_fromStackedMetrics_metric_names": { - "input": { - "$ref": "#/IntervalsDFTests/test_fromStackedMetrics_series_list/input" + "init": { + "$ref": "#/IntervalsDFTests/test_fromStackedMetrics_series_list/init" }, "expected": { "$ref": "#/IntervalsDFTests/test_fromStackedMetrics_series_list/expected" } }, "test_make_disjoint": { - "input": { - "schema": { - "$ref": "#/__SharedData/init/schema" - }, - "other_ts_cols": { - "$ref": "#/__SharedData/init/other_ts_cols" - }, - "start_ts": { - "$ref": "#/__SharedData/init/start_ts" - }, - "end_ts": { - "$ref": "#/__SharedData/init/end_ts" - }, - "series": { - "$ref": "#/__SharedData/init/series" - }, - "data": [ - [ - "2020-08-01 00:00:10", - "2020-08-01 00:00:14", - "v1", - 5, - null - ], - [ - "2020-08-01 00:00:09", - "2020-08-01 00:00:11", - "v1", - null, - 0 + "init": { + "idf": { + "$ref": "#/__SharedData/init/idf" + }, + "df": { + "schema": { + "$ref": "#/__SharedData/init/df/schema" + }, + "ts_convert": { + "$ref": "#/__SharedData/init/df/ts_convert" + }, + "data": [ + [ + "2020-08-01 00:00:10", + "2020-08-01 00:00:14", + "v1", + 5, + null + ], + [ + "2020-08-01 00:00:09", + "2020-08-01 00:00:11", + "v1", + null, + 0 + ] ] - ] + } }, "expected": { - "schema": { - "$ref": "#/__SharedData/init/schema" - }, - "other_ts_cols": { - "$ref": "#/__SharedData/init/other_ts_cols" - }, - "start_ts": { - "$ref": "#/__SharedData/init/start_ts" - }, - "end_ts": { - "$ref": "#/__SharedData/init/end_ts" - }, - "series": { - "$ref": "#/__SharedData/init/series" - }, - "data": [ - [ - "2020-08-01 00:00:09", - "2020-08-01 00:00:10", - "v1", - null, - 0 - ], - [ - "2020-08-01 00:00:10", - "2020-08-01 00:00:11", - "v1", - 5, - 0 - ], - [ - "2020-08-01 00:00:11", - "2020-08-01 00:00:14", - "v1", - 5, - null + "idf": { + "$ref": 
"#/__SharedData/init/idf" + }, + "df": { + "schema": { + "$ref": "#/__SharedData/init/df/schema" + }, + "ts_convert": { + "$ref": "#/__SharedData/init/df/ts_convert" + }, + "data": [ + [ + "2020-08-01 00:00:09", + "2020-08-01 00:00:10", + "v1", + null, + 0 + ], + [ + "2020-08-01 00:00:10", + "2020-08-01 00:00:11", + "v1", + 5, + 0 + ], + [ + "2020-08-01 00:00:11", + "2020-08-01 00:00:14", + "v1", + 5, + null + ] ] - ] + } } }, "test_make_disjoint_contains_interval_already_disjoint": { - "input": { - "schema": { - "$ref": "#/__SharedData/init/schema" - }, - "other_ts_cols": { - "$ref": "#/__SharedData/init/other_ts_cols" - }, - "start_ts": { - "$ref": "#/__SharedData/init/start_ts" - }, - "end_ts": { - "$ref": "#/__SharedData/init/end_ts" - }, - "series": { - "$ref": "#/__SharedData/init/series" - }, - "data": [ - [ - "2020-08-01 00:00:10", - "2020-08-01 00:00:13", - "v1", - 5, - null - ], - [ - "2020-08-01 00:00:09", - "2020-08-01 00:00:12", - "v1", - null, - 0 - ], - [ - "2020-08-01 00:00:13", - "2020-08-01 00:00:14", - "v1", - null, - 4 + "init": { + "idf": { + "$ref": "#/__SharedData/init/idf" + }, + "df": { + "schema": { + "$ref": "#/__SharedData/init/df/schema" + }, + "ts_convert": { + "$ref": "#/__SharedData/init/df/ts_convert" + }, + "data": [ + [ + "2020-08-01 00:00:10", + "2020-08-01 00:00:13", + "v1", + 5, + null + ], + [ + "2020-08-01 00:00:09", + "2020-08-01 00:00:12", + "v1", + null, + 0 + ], + [ + "2020-08-01 00:00:13", + "2020-08-01 00:00:14", + "v1", + null, + 4 + ] ] - ] + } }, "expected": { - "schema": { - "$ref": "#/__SharedData/init/schema" - }, - "other_ts_cols": { - "$ref": "#/__SharedData/init/other_ts_cols" - }, - "start_ts": { - "$ref": "#/__SharedData/init/start_ts" - }, - "end_ts": { - "$ref": "#/__SharedData/init/end_ts" - }, - "series": { - "$ref": "#/__SharedData/init/series" - }, - "data": [ - [ - "2020-08-01 00:00:09", - "2020-08-01 00:00:10", - "v1", - null, - 0 - ], - [ - "2020-08-01 00:00:10", - "2020-08-01 00:00:12", - "v1", - 5, 
- 0 - ], - [ - "2020-08-01 00:00:12", - "2020-08-01 00:00:13", - "v1", - 5, - null - ], - [ - "2020-08-01 00:00:13", - "2020-08-01 00:00:14", - "v1", - null, - 4 + "idf": { + "$ref": "#/__SharedData/init/idf" + }, + "df": { + "schema": { + "$ref": "#/__SharedData/init/df/schema" + }, + "ts_convert": { + "$ref": "#/__SharedData/init/df/ts_convert" + }, + "data": [ + [ + "2020-08-01 00:00:09", + "2020-08-01 00:00:10", + "v1", + null, + 0 + ], + [ + "2020-08-01 00:00:10", + "2020-08-01 00:00:12", + "v1", + 5, + 0 + ], + [ + "2020-08-01 00:00:12", + "2020-08-01 00:00:13", + "v1", + 5, + null + ], + [ + "2020-08-01 00:00:13", + "2020-08-01 00:00:14", + "v1", + null, + 4 + ] ] - ] + } } }, "test_make_disjoint_contains_intervals_equal": { - "input": { - "schema": { - "$ref": "#/__SharedData/init/schema" - }, - "other_ts_cols": { - "$ref": "#/__SharedData/init/other_ts_cols" - }, - "start_ts": { - "$ref": "#/__SharedData/init/start_ts" - }, - "end_ts": { - "$ref": "#/__SharedData/init/end_ts" - }, - "series": { - "$ref": "#/__SharedData/init/series" - }, - "data": [ - [ - "2020-08-01 00:00:10", - "2020-08-01 00:00:13", - "v1", - 5, - null - ], - [ - "2020-08-01 00:00:09", - "2020-08-01 00:00:12", - "v1", - null, - 0 - ], - [ - "2020-08-01 00:00:13", - "2020-08-01 00:00:14", - "v1", - null, - 4 - ], - [ - "2020-08-01 00:00:13", - "2020-08-01 00:00:14", - "v1", - 7, - null + "init": { + "idf": { + "$ref": "#/__SharedData/init/idf" + }, + "df": { + "schema": { + "$ref": "#/__SharedData/init/df/schema" + }, + "ts_convert": { + "$ref": "#/__SharedData/init/df/ts_convert" + }, + "data": [ + [ + "2020-08-01 00:00:10", + "2020-08-01 00:00:13", + "v1", + 5, + null + ], + [ + "2020-08-01 00:00:09", + "2020-08-01 00:00:12", + "v1", + null, + 0 + ], + [ + "2020-08-01 00:00:13", + "2020-08-01 00:00:14", + "v1", + null, + 4 + ], + [ + "2020-08-01 00:00:13", + "2020-08-01 00:00:14", + "v1", + 7, + null + ] ] - ] + } }, "expected": { - "schema": { - "$ref": "#/__SharedData/init/schema" - 
}, - "other_ts_cols": { - "$ref": "#/__SharedData/init/other_ts_cols" + "idf": { + "$ref": "#/__SharedData/init/idf" }, - "start_ts": { - "$ref": "#/__SharedData/init/start_ts" - }, - "end_ts": { - "$ref": "#/__SharedData/init/end_ts" + "df": { + "schema": { + "$ref": "#/__SharedData/init/df/schema" }, - "series": { - "$ref": "#/__SharedData/init/series" + "ts_convert": { + "$ref": "#/__SharedData/init/df/ts_convert" }, "data": [ [ @@ -452,637 +436,612 @@ 4 ] ] + } } }, "test_make_disjoint_intervals_same_start": { - "input": { - "schema": { - "$ref": "#/__SharedData/init/schema" - }, - "other_ts_cols": { - "$ref": "#/__SharedData/init/other_ts_cols" - }, - "start_ts": { - "$ref": "#/__SharedData/init/start_ts" - }, - "end_ts": { - "$ref": "#/__SharedData/init/end_ts" - }, - "series": { - "$ref": "#/__SharedData/init/series" - }, - "data": [ - [ - "2020-08-01 00:00:09", - "2020-08-01 00:00:14", - "v1", - 5, - null - ], - [ - "2020-08-01 00:00:09", - "2020-08-01 00:00:11", - "v1", - null, - 0 + "init": { + "idf": { + "$ref": "#/__SharedData/init/idf" + }, + "df": { + "schema": { + "$ref": "#/__SharedData/init/df/schema" + }, + "ts_convert": { + "$ref": "#/__SharedData/init/df/ts_convert" + }, + "data": [ + [ + "2020-08-01 00:00:09", + "2020-08-01 00:00:14", + "v1", + 5, + null + ], + [ + "2020-08-01 00:00:09", + "2020-08-01 00:00:11", + "v1", + null, + 0 + ] ] - ] + } }, "expected": { - "schema": { - "$ref": "#/__SharedData/init/schema" - }, - "other_ts_cols": { - "$ref": "#/__SharedData/init/other_ts_cols" - }, - "start_ts": { - "$ref": "#/__SharedData/init/start_ts" - }, - "end_ts": { - "$ref": "#/__SharedData/init/end_ts" - }, - "series": { - "$ref": "#/__SharedData/init/series" - }, - "data": [ - [ - "2020-08-01 00:00:09", - "2020-08-01 00:00:11", - "v1", - 5, - 0 - ], - [ - "2020-08-01 00:00:11", - "2020-08-01 00:00:14", - "v1", - 5, - null + "idf": { + "$ref": "#/__SharedData/init/idf" + }, + "df": { + "schema": { + "$ref": "#/__SharedData/init/df/schema" + }, 
+ "ts_convert": { + "$ref": "#/__SharedData/init/df/ts_convert" + }, + "data": [ + [ + "2020-08-01 00:00:09", + "2020-08-01 00:00:11", + "v1", + 5, + 0 + ], + [ + "2020-08-01 00:00:11", + "2020-08-01 00:00:14", + "v1", + 5, + null + ] ] - ] + } } }, "test_make_disjoint_intervals_same_end": { - "input": { - "schema": { - "$ref": "#/__SharedData/init/schema" - }, - "other_ts_cols": { - "$ref": "#/__SharedData/init/other_ts_cols" - }, - "start_ts": { - "$ref": "#/__SharedData/init/start_ts" - }, - "end_ts": { - "$ref": "#/__SharedData/init/end_ts" - }, - "series": { - "$ref": "#/__SharedData/init/series" - }, - "data": [ - [ - "2020-08-01 00:00:09", - "2020-08-01 00:00:14", - "v1", - 5, - null - ], - [ - "2020-08-01 00:00:11", - "2020-08-01 00:00:14", - "v1", - null, - 0 + "init": { + "idf": { + "$ref": "#/__SharedData/init/idf" + }, + "df": { + "schema": { + "$ref": "#/__SharedData/init/df/schema" + }, + "ts_convert": { + "$ref": "#/__SharedData/init/df/ts_convert" + }, + "data": [ + [ + "2020-08-01 00:00:09", + "2020-08-01 00:00:14", + "v1", + 5, + null + ], + [ + "2020-08-01 00:00:11", + "2020-08-01 00:00:14", + "v1", + null, + 0 + ] ] - ] + } }, "expected": { - "schema": { - "$ref": "#/__SharedData/init/schema" - }, - "other_ts_cols": { - "$ref": "#/__SharedData/init/other_ts_cols" - }, - "start_ts": { - "$ref": "#/__SharedData/init/start_ts" - }, - "end_ts": { - "$ref": "#/__SharedData/init/end_ts" - }, - "series": { - "$ref": "#/__SharedData/init/series" - }, - "data": [ - [ - "2020-08-01 00:00:09", - "2020-08-01 00:00:11", - "v1", - 5, - null - ], - [ - "2020-08-01 00:00:11", - "2020-08-01 00:00:14", - "v1", - 5, - 0 + "idf": { + "$ref": "#/__SharedData/init/idf" + }, + "df": { + "schema": { + "$ref": "#/__SharedData/init/df/schema" + }, + "ts_convert": { + "$ref": "#/__SharedData/init/df/ts_convert" + }, + "data": [ + [ + "2020-08-01 00:00:09", + "2020-08-01 00:00:11", + "v1", + 5, + null + ], + [ + "2020-08-01 00:00:11", + "2020-08-01 00:00:14", + "v1", + 5, 
+ 0 + ] ] - ] + } } }, "test_make_disjoint_multiple_series": { - "input": { - "schema": "start_ts STRING NOT NULL, end_ts STRING NOT NULL, series_1 STRING NOT NULL, series_2 STRING NOT NULL, metric_1 INT, metric_2 INT", - "other_ts_cols": { - "$ref": "#/__SharedData/init/other_ts_cols" - }, - "start_ts": { - "$ref": "#/__SharedData/init/start_ts" - }, - "end_ts": { - "$ref": "#/__SharedData/init/end_ts" + "init": { + "idf": { + "start_ts": { + "$ref": "#/__SharedData/init/idf/start_ts" + }, + "end_ts": { + "$ref": "#/__SharedData/init/idf/end_ts" + }, + "series_ids": [ + "series_1", + "series_2" + ] }, - "series": [ - "series_1", - "series_2" - ], - "data": [ - [ - "2020-08-01 00:00:10", - "2020-08-01 00:00:14", - "v1", - "foo", - 5, - null - ], - [ - "2020-08-01 00:00:09", - "2020-08-01 00:00:14", - "v1", - "bar", - 3, - 2 - ], - [ - "2020-08-01 00:00:09", - "2020-08-01 00:00:11", - "v1", - "foo", - null, - 0 - ], - [ - "2020-08-01 00:00:10", - "2020-08-01 00:00:13", - "v2", - "foo", - 5, - null - ], - [ - "2020-08-01 00:00:09", - "2020-08-01 00:00:12", - "v2", - "foo", - null, - 0 - ], - [ - "2020-08-01 00:00:13", - "2020-08-01 00:00:14", - "v2", - "foo", - null, - 4 - ], - [ - "2020-08-01 00:00:13", - "2020-08-01 00:00:14", - "v2", - "foo", - 6, - 3 + "df": { + "schema": "start_ts STRING NOT NULL, end_ts STRING NOT NULL, series_1 STRING NOT NULL, series_2 STRING NOT NULL, metric_1 INT, metric_2 INT", + "ts_convert": { + "$ref": "#/__SharedData/init/df/ts_convert" + }, + "data": [ + [ + "2020-08-01 00:00:10", + "2020-08-01 00:00:14", + "v1", + "foo", + 5, + null + ], + [ + "2020-08-01 00:00:09", + "2020-08-01 00:00:14", + "v1", + "bar", + 3, + 2 + ], + [ + "2020-08-01 00:00:09", + "2020-08-01 00:00:11", + "v1", + "foo", + null, + 0 + ], + [ + "2020-08-01 00:00:10", + "2020-08-01 00:00:13", + "v2", + "foo", + 5, + null + ], + [ + "2020-08-01 00:00:09", + "2020-08-01 00:00:12", + "v2", + "foo", + null, + 0 + ], + [ + "2020-08-01 00:00:13", + "2020-08-01 00:00:14", 
+ "v2", + "foo", + null, + 4 + ], + [ + "2020-08-01 00:00:13", + "2020-08-01 00:00:14", + "v2", + "foo", + 6, + 3 + ] ] - ] + } }, "expected": { - "schema": { - "$ref": "#/IntervalsDFTests/test_make_disjoint_multiple_series/input/schema" - }, - "other_ts_cols": { - "$ref": "#/__SharedData/init/other_ts_cols" - }, - "start_ts": { - "$ref": "#/__SharedData/init/start_ts" - }, - "end_ts": { - "$ref": "#/__SharedData/init/end_ts" - }, - "series": { - "$ref": "#/IntervalsDFTests/test_make_disjoint_multiple_series/input/series" - }, - "data": [ - [ - "2020-08-01 00:00:09", - "2020-08-01 00:00:14", - "v1", - "bar", - 3, - 2 - ], - [ - "2020-08-01 00:00:09", - "2020-08-01 00:00:10", - "v1", - "foo", - null, - 0 - ], - [ - "2020-08-01 00:00:10", - "2020-08-01 00:00:11", - "v1", - "foo", - 5, - 0 - ], - [ - "2020-08-01 00:00:11", - "2020-08-01 00:00:14", - "v1", - "foo", - 5, - null - ], - [ - "2020-08-01 00:00:09", - "2020-08-01 00:00:10", - "v2", - "foo", - null, - 0 - ], - [ - "2020-08-01 00:00:10", - "2020-08-01 00:00:12", - "v2", - "foo", - 5, - 0 - ], - [ - "2020-08-01 00:00:12", - "2020-08-01 00:00:13", - "v2", - "foo", - 5, - null - ], - [ - "2020-08-01 00:00:13", - "2020-08-01 00:00:14", - "v2", - "foo", - 6, - 4 + "idf": { + "start_ts": { + "$ref": "#/__SharedData/init/idf/start_ts" + }, + "end_ts": { + "$ref": "#/__SharedData/init/idf/end_ts" + }, + "series_ids": { + "$ref": "#/IntervalsDFTests/test_make_disjoint_multiple_series/init/idf/series_ids" + } + }, + "df": { + "schema": { + "$ref": "#/IntervalsDFTests/test_make_disjoint_multiple_series/init/df/schema" + }, + "ts_convert": { + "$ref": "#/__SharedData/init/df/ts_convert" + }, + "data": [ + [ + "2020-08-01 00:00:09", + "2020-08-01 00:00:14", + "v1", + "bar", + 3, + 2 + ], + [ + "2020-08-01 00:00:09", + "2020-08-01 00:00:10", + "v1", + "foo", + null, + 0 + ], + [ + "2020-08-01 00:00:10", + "2020-08-01 00:00:11", + "v1", + "foo", + 5, + 0 + ], + [ + "2020-08-01 00:00:11", + "2020-08-01 00:00:14", + "v1", + 
"foo", + 5, + null + ], + [ + "2020-08-01 00:00:09", + "2020-08-01 00:00:10", + "v2", + "foo", + null, + 0 + ], + [ + "2020-08-01 00:00:10", + "2020-08-01 00:00:12", + "v2", + "foo", + 5, + 0 + ], + [ + "2020-08-01 00:00:12", + "2020-08-01 00:00:13", + "v2", + "foo", + 5, + null + ], + [ + "2020-08-01 00:00:13", + "2020-08-01 00:00:14", + "v2", + "foo", + 6, + 4 + ] ] - ] + } } }, "test_make_disjoint_single_metric": { - "input": { - "schema": "start_ts STRING NOT NULL, end_ts STRING NOT NULL, series_1 STRING NOT NULL, metric_1 INT", - "other_ts_cols": { - "$ref": "#/__SharedData/init/other_ts_cols" - }, - "start_ts": { - "$ref": "#/__SharedData/init/start_ts" - }, - "end_ts": { - "$ref": "#/__SharedData/init/end_ts" - }, - "series": { - "$ref": "#/__SharedData/init/series" - }, - "data": [ - [ - "2020-08-01 00:00:11", - "2020-08-01 00:00:14", - "v1", - 5 - ], - [ - "2020-08-01 00:00:09", - "2020-08-01 00:00:11", - "v1", - 4 + "init": { + "idf": { + "$ref": "#/__SharedData/init/idf" + }, + "df": { + "schema": "start_ts STRING NOT NULL, end_ts STRING NOT NULL, series_1 STRING NOT NULL, metric_1 INT", + "ts_convert": { + "$ref": "#/__SharedData/init/df/ts_convert" + }, + "data": [ + [ + "2020-08-01 00:00:11", + "2020-08-01 00:00:14", + "v1", + 5 + ], + [ + "2020-08-01 00:00:09", + "2020-08-01 00:00:11", + "v1", + 4 + ] ] - ] + } }, "expected": { - "schema": { - "$ref": "#/IntervalsDFTests/test_make_disjoint_single_metric/input/schema" - }, - "other_ts_cols": { - "$ref": "#/__SharedData/init/other_ts_cols" - }, - "start_ts": { - "$ref": "#/__SharedData/init/start_ts" - }, - "end_ts": { - "$ref": "#/__SharedData/init/end_ts" - }, - "series": { - "$ref": "#/__SharedData/init/series" - }, - "data": { - "$ref": "#/IntervalsDFTests/test_make_disjoint_single_metric/input/data" + "idf": { + "$ref": "#/__SharedData/init/idf" + }, + "df": { + "schema": { + "$ref": "#/IntervalsDFTests/test_make_disjoint_single_metric/init/df/schema" + }, + "ts_convert": { + "$ref": 
"#/__SharedData/init/df/ts_convert" + }, + "data": { + "$ref": "#/IntervalsDFTests/test_make_disjoint_single_metric/init/df/data" + } } } }, "test_make_disjoint_interval_is_subset": { - "input": { - "schema": { - "$ref": "#/__SharedData/init/schema" - }, - "other_ts_cols": { - "$ref": "#/__SharedData/init/other_ts_cols" - }, - "start_ts": { - "$ref": "#/__SharedData/init/start_ts" - }, - "end_ts": { - "$ref": "#/__SharedData/init/end_ts" - }, - "series": { - "$ref": "#/__SharedData/init/series" - }, - "data": [ - [ - "2020-08-01 00:00:09", - "2020-08-01 00:00:14", - "v1", - 5, - null - ], - [ - "2020-08-01 00:00:10", - "2020-08-01 00:00:11", - "v1", - null, - 0 + "init": { + "idf": { + "$ref": "#/__SharedData/init/idf" + }, + "df": { + "schema": { + "$ref": "#/__SharedData/init/df/schema" + }, + "ts_convert": { + "$ref": "#/__SharedData/init/df/ts_convert" + }, + "data": [ + [ + "2020-08-01 00:00:09", + "2020-08-01 00:00:14", + "v1", + 5, + null + ], + [ + "2020-08-01 00:00:10", + "2020-08-01 00:00:11", + "v1", + null, + 0 + ] ] - ] + } }, "expected": { - "schema": { - "$ref": "#/__SharedData/init/schema" - }, - "other_ts_cols": { - "$ref": "#/__SharedData/init/other_ts_cols" - }, - "start_ts": { - "$ref": "#/__SharedData/init/start_ts" - }, - "end_ts": { - "$ref": "#/__SharedData/init/end_ts" - }, - "series": { - "$ref": "#/__SharedData/init/series" - }, - "data": [ - [ - "2020-08-01 00:00:09", - "2020-08-01 00:00:10", - "v1", - 5, - null - ], - [ - "2020-08-01 00:00:10", - "2020-08-01 00:00:11", - "v1", - 5, - 0 - ], - [ - "2020-08-01 00:00:11", - "2020-08-01 00:00:14", - "v1", - 5, - null + "idf": { + "$ref": "#/__SharedData/init/idf" + }, + "df": { + "schema": { + "$ref": "#/__SharedData/init/df/schema" + }, + "ts_convert": { + "$ref": "#/__SharedData/init/df/ts_convert" + }, + "data": [ + [ + "2020-08-01 00:00:09", + "2020-08-01 00:00:10", + "v1", + 5, + null + ], + [ + "2020-08-01 00:00:10", + "2020-08-01 00:00:11", + "v1", + 5, + 0 + ], + [ + "2020-08-01 
00:00:11", + "2020-08-01 00:00:14", + "v1", + 5, + null + ] ] - ] + } } }, "test_union_other_idf": { - "input": { + "init": { "$ref": "#/__SharedData/init" } }, "test_union_other_df": { - "input": { + "init": { "$ref": "#/__SharedData/init" } }, "test_union_other_list_dicts": { - "input": { + "init": { "$ref": "#/__SharedData/init" } }, "test_unionByName_other_idf": { - "input": { + "init": { "$ref": "#/__SharedData/init" } }, "test_unionByName_other_df": { - "input": { + "init": { "$ref": "#/__SharedData/init" } }, "test_unionByName_other_list_dicts": { - "input": { + "init": { "$ref": "#/__SharedData/init" } }, "test_unionByName_extra_column": { - "input": { + "init": { "$ref": "#/__SharedData/init" }, - "input_extra_col": { - "schema": "start_ts STRING NOT NULL, end_ts STRING NOT NULL, series_1 STRING NOT NULL, metric_1 INT, metric_2 INT, metric_3 INT", - "other_ts_cols": [ - "start_ts", - "end_ts" - ], - "start_ts": "start_ts", - "end_ts": "end_ts", - "series": [ - "series_1" - ], - "data": [ - [ - "2020-08-01 00:00:09", - "2020-08-01 00:00:14", - "v1", - 5, - null, - 1 + "init_extra_col": { + "idf": { + "$ref": "#/__SharedData/init/idf" + }, + "df": { + "schema": "start_ts STRING NOT NULL, end_ts STRING NOT NULL, series_1 STRING NOT NULL, metric_1 INT, metric_2 INT, metric_3 INT", + "ts_convert": [ + "start_ts", + "end_ts" ], - [ - "2020-08-01 00:00:09", - "2020-08-01 00:00:11", - "v1", - null, - 0, - 2 - ], - [ - "2020-08-01 00:00:09", - "2020-08-01 00:00:12", - "v1", - null, - 4, - 3 + "data": [ + [ + "2020-08-01 00:00:09", + "2020-08-01 00:00:14", + "v1", + 5, + null, + 1 + ], + [ + "2020-08-01 00:00:09", + "2020-08-01 00:00:11", + "v1", + null, + 0, + 2 + ], + [ + "2020-08-01 00:00:09", + "2020-08-01 00:00:12", + "v1", + null, + 4, + 3 + ] ] - ] + } } }, "test_unionByName_other_extra_column": { - "input": { + "init": { "$ref": "#/__SharedData/init" }, - "input_extra_col": { - "$ref": "#/IntervalsDFTests/test_unionByName_extra_column/input_extra_col" + 
"init_extra_col": { + "$ref": "#/IntervalsDFTests/test_unionByName_extra_column/init_extra_col" } }, "test_toDF": { - "input": { + "init": { "$ref": "#/__SharedData/init" } }, "test_toDF_stack": { - "input": { + "init": { "$ref": "#/IntervalsDFTests/test_fromStackedMetrics_series_list/expected" }, "expected": { - "$ref": "#/IntervalsDFTests/test_fromStackedMetrics_series_list/input" + "$ref": "#/IntervalsDFTests/test_fromStackedMetrics_series_list/init" } }, "test_make_disjoint_issue_268": { - "input": { - "schema": "start_timestamp STRING NOT NULL, end_timestamp STRING NOT NULL, id STRING NOT NULL, s1 INT, s2 INT, s3 INT, s4 INT", - "other_ts_cols": [ - "start_timestamp", - "end_timestamp" - ], - "start_ts": "start_timestamp", - "end_ts": "end_timestamp", - "series": [ - "id" - ], - "data": [ - [ - "2020-08-01 00:00:14", - "2020-08-01 00:00:17", - "id123", - null, - 1, - null, - null + "init": { + "idf": { + "start_ts": "start_timestamp", + "end_ts": "end_timestamp", + "series_ids": ["id"] + }, + "df": { + "schema": "start_timestamp STRING NOT NULL, end_timestamp STRING NOT NULL, id STRING NOT NULL, s1 INT, s2 INT, s3 INT, s4 INT", + "ts_convert": [ + "start_timestamp", + "end_timestamp" ], - [ - "2020-08-01 00:00:14", - "2020-08-01 00:00:16", - "id123", - null, - null, - null, - 1 - ], - [ - "2020-08-01 00:00:09", - "2020-08-01 00:00:11", - "id123", - 1, - null, - null, - null - ], - [ - "2020-08-01 00:00:10", - "2020-08-01 00:00:16", - "id123", - 1, - null, - null, - null - ], - [ - "2020-08-01 00:00:14", - "2020-08-01 00:00:21", - "id123", - null, - null, - 1, - null + "data": [ + [ + "2020-08-01 00:00:14", + "2020-08-01 00:00:17", + "id123", + null, + 1, + null, + null + ], + [ + "2020-08-01 00:00:14", + "2020-08-01 00:00:16", + "id123", + null, + null, + null, + 1 + ], + [ + "2020-08-01 00:00:09", + "2020-08-01 00:00:11", + "id123", + 1, + null, + null, + null + ], + [ + "2020-08-01 00:00:10", + "2020-08-01 00:00:16", + "id123", + 1, + null, + null, + null + 
], + [ + "2020-08-01 00:00:14", + "2020-08-01 00:00:21", + "id123", + null, + null, + 1, + null + ] ] - ] + } }, "expected": { - "schema": { - "$ref": "#/IntervalsDFTests/test_make_disjoint_issue_268/input/schema" - }, - "other_ts_cols": { - "$ref": "#/IntervalsDFTests/test_make_disjoint_issue_268/input/other_ts_cols" - }, - "start_ts": { - "$ref": "#/IntervalsDFTests/test_make_disjoint_issue_268/input/start_ts" - }, - "end_ts": { - "$ref": "#/IntervalsDFTests/test_make_disjoint_issue_268/input/end_ts" - }, - "series": { - "$ref": "#/IntervalsDFTests/test_make_disjoint_issue_268/input/series" - }, - "data": [ - [ - "2020-08-01 00:00:09", - "2020-08-01 00:00:14", - "id123", - 1, - null, - null, - null - ], - [ - "2020-08-01 00:00:14", - "2020-08-01 00:00:16", - "id123", - 1, - 1, - 1, - 1 - ], - [ - "2020-08-01 00:00:16", - "2020-08-01 00:00:17", - "id123", - null, - 1, - 1, - null - ], - [ - "2020-08-01 00:00:17", - "2020-08-01 00:00:21", - "id123", - null, - null, - 1, - null + "idf": { + "$ref": "#/IntervalsDFTests/test_make_disjoint_issue_268/init/idf" + }, + "df": { + "schema": { + "$ref": "#/IntervalsDFTests/test_make_disjoint_issue_268/init/df/schema" + }, + "ts_convert": { + "$ref": "#/IntervalsDFTests/test_make_disjoint_issue_268/init/df/ts_convert" + }, + "data": [ + [ + "2020-08-01 00:00:09", + "2020-08-01 00:00:14", + "id123", + 1, + null, + null, + null + ], + [ + "2020-08-01 00:00:14", + "2020-08-01 00:00:16", + "id123", + 1, + 1, + 1, + 1 + ], + [ + "2020-08-01 00:00:16", + "2020-08-01 00:00:17", + "id123", + null, + 1, + 1, + null + ], + [ + "2020-08-01 00:00:17", + "2020-08-01 00:00:21", + "id123", + null, + null, + 1, + null + ] ] - ] + } } } } diff --git a/python/tests/unit_test_data/io_tests.json b/python/tests/unit_test_data/io_tests.json index f8bc9904..0321bd14 100644 --- a/python/tests/unit_test_data/io_tests.json +++ b/python/tests/unit_test_data/io_tests.json @@ -1,83 +1,86 @@ { "__SharedData": { - "input_data": { - "schema": "symbol 
string, date string, event_ts string, trade_pr float, trade_pr_2 float", - "ts_col": "event_ts", - "partition_cols": [ - "symbol" - ], - "data": [ - [ - "S1", - "SAME_DT", - "2020-08-01 00:00:10", - 349.21, - 10.0 - ], - [ - "S1", - "SAME_DT", - "2020-08-01 00:00:11", - 340.21, - 9.0 - ], - [ - "S1", - "SAME_DT", - "2020-08-01 00:01:12", - 353.32, - 8.0 - ], - [ - "S1", - "SAME_DT", - "2020-08-01 00:01:13", - 351.32, - 7.0 - ], - [ - "S1", - "SAME_DT", - "2020-08-01 00:01:14", - 350.32, - 6.0 - ], - [ - "S1", - "SAME_DT", - "2020-09-01 00:01:12", - 361.1, - 5.0 - ], - [ - "S1", - "SAME_DT", - "2020-09-01 00:19:12", - 362.1, - 4.0 + "init": { + "tsdf": { + "ts_col": "event_ts", + "partition_cols": ["symbol"] + }, + "df": { + "schema": "symbol string, date string, event_ts string, trade_pr float, trade_pr_2 float", + "ts_convert": ["event_ts"], + "data": [ + [ + "S1", + "SAME_DT", + "2020-08-01 00:00:10", + 349.21, + 10.0 + ], + [ + "S1", + "SAME_DT", + "2020-08-01 00:00:11", + 340.21, + 9.0 + ], + [ + "S1", + "SAME_DT", + "2020-08-01 00:01:12", + 353.32, + 8.0 + ], + [ + "S1", + "SAME_DT", + "2020-08-01 00:01:13", + 351.32, + 7.0 + ], + [ + "S1", + "SAME_DT", + "2020-08-01 00:01:14", + 350.32, + 6.0 + ], + [ + "S1", + "SAME_DT", + "2020-09-01 00:01:12", + 361.1, + 5.0 + ], + [ + "S1", + "SAME_DT", + "2020-09-01 00:19:12", + 362.1, + 4.0 + ] ] - ] + } } }, "DeltaWriteTest": { "test_write_to_delta_without_optimization_cols": { - "input_data": { - "$ref": "#/__SharedData/input_data" + "init": { + "$ref": "#/__SharedData/init" } }, "test_write_to_delta_with_optimization_cols": { - "input_data": { - "$ref": "#/__SharedData/input_data" + "init": { + "$ref": "#/__SharedData/init" } }, "test_write_to_delta_non_dbr_environment_logging": { - "input_data": { - "$ref": "#/__SharedData/input_data" + "init": { + "$ref": "#/__SharedData/init" } }, "test_write_to_delta_bad_dbr_environment_logging": { - "input_data": { - "$ref": "#/__SharedData/input_data" + "init": { + "$ref": 
"#/__SharedData/init" } } } diff --git a/python/tests/unit_test_data/json-fixer.ipynb b/python/tests/unit_test_data/json-fixer.ipynb new file mode 100644 index 00000000..7c5a5cb1 --- /dev/null +++ b/python/tests/unit_test_data/json-fixer.ipynb @@ -0,0 +1,287 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 20, + "metadata": {}, + "outputs": [], + "source": [ + "import json\n", + "\n", + "with open('./resample_tests.json', 'r') as file:\n", + " before = json.load(file)" + ] + }, + { + "cell_type": "code", + "execution_count": 21, + "metadata": {}, + "outputs": [], + "source": [ + "def update_dict(dictionary, key, value):\n", + " if value is not None:\n", + " dictionary[key] = value" + ] + }, + { + "cell_type": "code", + "execution_count": 22, + "metadata": {}, + "outputs": [], + "source": [ + "after = {}\n", + "for i in before.keys(): # i is test class\n", + " if i == \"__SharedData\":\n", + " continue\n", + " after[i] = {}\n", + " for j in before[i].keys(): # j is test method\n", + " after[i][j] = {}\n", + " for k in before[i][j].keys(): # input, expected, etc.\n", + " tsdf = {}\n", + " update_dict(tsdf, \"ts_col\", before[i][j][k].get(\"ts_col\", None))\n", + " update_dict(tsdf, \"other_ts_cols\", before[i][j][k].get(\"other_ts_cols\", None))\n", + " update_dict(tsdf, \"partition_cols\", before[i][j][k].get(\"partition_cols\", None))\n", + " update_dict(tsdf, \"sequence_col\", before[i][j][k].get(\"sequence_col\", None))\n", + " update_dict(tsdf, \"start_ts\", before[i][j][k].get(\"start_ts\", None))\n", + " update_dict(tsdf, \"end_ts\", before[i][j][k].get(\"end_ts\", None))\n", + " update_dict(tsdf, \"series\", before[i][j][k].get(\"series\", None))\n", + " sdf = {}\n", + " update_dict(sdf, \"schema\", before[i][j][k].get(\"schema\", None))\n", + " update_dict(sdf, \"ts_convert\", before[i][j][k].get(\"ts_convert\", None))\n", + " update_dict(sdf, \"data\", before[i][j][k].get(\"data\", None))\n", + " after[i][j][k] = {\n", + " \"tsdf\": 
tsdf,\n", + " \"df\": sdf,\n", + " \"$ref\": before[i][j][k].get(\"$ref\", None)\n", + " }" + ] + }, + { + "cell_type": "code", + "execution_count": 23, + "metadata": {}, + "outputs": [], + "source": [ + "after_2 = {}\n", + "for i in before.keys(): # i is test class\n", + " if i != \"__SharedData\":\n", + " continue\n", + " after_2[i] = {}\n", + " for j in before[i].keys(): # j is test method\n", + " tsdf = {}\n", + " update_dict(tsdf, \"ts_col\", before[i][j].get(\"ts_col\", None))\n", + " update_dict(tsdf, \"other_ts_cols\", before[i][j].get(\"other_ts_cols\", None))\n", + " update_dict(tsdf, \"partition_cols\", before[i][j].get(\"partition_cols\", None))\n", + " update_dict(tsdf, \"sequence_col\", before[i][j].get(\"sequence_col\", None))\n", + " update_dict(tsdf, \"start_ts\", before[i][j].get(\"start_ts\", None))\n", + " update_dict(tsdf, \"end_ts\", before[i][j].get(\"end_ts\", None))\n", + " update_dict(tsdf, \"series\", before[i][j].get(\"series\", None))\n", + " sdf = {}\n", + " update_dict(sdf, \"schema\", before[i][j].get(\"schema\", None))\n", + " update_dict(sdf, \"ts_convert\", before[i][j].get(\"ts_convert\", None))\n", + " update_dict(sdf, \"data\", before[i][j].get(\"data\", None))\n", + " after_2[i][j] = {\n", + " \"tsdf\": tsdf,\n", + " \"df\": sdf,\n", + " }" + ] + }, + { + "cell_type": "code", + "execution_count": 24, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "{'__SharedData': {'input_data': {'tsdf': {'ts_col': 'event_ts',\n", + " 'partition_cols': ['symbol']},\n", + " 'df': {'schema': 'symbol string, date string, event_ts string, trade_pr float, trade_pr_2 float',\n", + " 'data': [['S1', 'SAME_DT', '2020-08-01 00:00:10', 349.21, 10.0],\n", + " ['S1', 'SAME_DT', '2020-08-01 00:00:11', 340.21, 9.0],\n", + " ['S1', 'SAME_DT', '2020-08-01 00:01:12', 353.32, 8.0],\n", + " ['S1', 'SAME_DT', '2020-08-01 00:01:13', 351.32, 7.0],\n", + " ['S1', 'SAME_DT', '2020-08-01 00:01:14', 350.32, 6.0],\n", + " ['S1', 'SAME_DT', 
'2020-09-01 00:01:12', 361.1, 5.0],\n", + " ['S1', 'SAME_DT', '2020-09-01 00:19:12', 362.1, 4.0]]}}}}" + ] + }, + "execution_count": 24, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "after_2" + ] + }, + { + "cell_type": "code", + "execution_count": 25, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "{'ResampleUnitTests': {'test_appendAggKey_freq_is_none': {'input_data': {'tsdf': {},\n", + " 'df': {},\n", + " '$ref': '#/__SharedData/input_data'}},\n", + " 'test_appendAggKey_freq_microsecond': {'input_data': {'tsdf': {},\n", + " 'df': {},\n", + " '$ref': '#/__SharedData/input_data'}},\n", + " 'test_appendAggKey_freq_is_invalid': {'input_data': {'tsdf': {},\n", + " 'df': {},\n", + " '$ref': '#/__SharedData/input_data'}},\n", + " 'test_aggregate_floor': {'input_data': {'tsdf': {},\n", + " 'df': {},\n", + " '$ref': '#/__SharedData/input_data'},\n", + " 'expected_data': {'tsdf': {'ts_col': 'event_ts',\n", + " 'partition_cols': ['symbol']},\n", + " 'df': {'schema': 'symbol string, event_ts string, date string, trade_pr float, trade_pr_2 float',\n", + " 'data': [['S1', '2020-08-01 00:00:00', 'SAME_DT', 349.21, 10.0],\n", + " ['S1', '2020-09-01 00:00:00', 'SAME_DT', 361.1, 5.0]]},\n", + " '$ref': None}},\n", + " 'test_aggregate_average': {'input_data': {'tsdf': {},\n", + " 'df': {},\n", + " '$ref': '#/__SharedData/input_data'},\n", + " 'expected_data': {'tsdf': {'ts_col': 'event_ts',\n", + " 'partition_cols': ['symbol']},\n", + " 'df': {'schema': 'symbol string, event_ts string, trade_pr double, trade_pr_2 double',\n", + " 'data': [['S1', '2020-08-01 00:00:00', 348.8760009765625, 8.0],\n", + " ['S1', '2020-09-01 00:00:00', 361.6000061035156, 4.5]]},\n", + " '$ref': None}},\n", + " 'test_aggregate_min': {'input_data': {'tsdf': {},\n", + " 'df': {},\n", + " '$ref': '#/__SharedData/input_data'},\n", + " 'expected_data': {'tsdf': {'ts_col': 'event_ts',\n", + " 'partition_cols': ['symbol']},\n", + " 'df': {'schema': 'symbol 
string, event_ts string, date string, trade_pr float, trade_pr_2 float',\n", + " 'data': [['S1', '2020-08-01 00:00:00', 'SAME_DT', 340.21, 6.0],\n", + " ['S1', '2020-09-01 00:00:00', 'SAME_DT', 361.1, 4.0]]},\n", + " '$ref': None}},\n", + " 'test_aggregate_min_with_prefix': {'input_data': {'tsdf': {},\n", + " 'df': {},\n", + " '$ref': '#/__SharedData/input_data'},\n", + " 'expected_data': {'tsdf': {'ts_col': 'event_ts',\n", + " 'partition_cols': ['symbol']},\n", + " 'df': {'schema': 'symbol string, event_ts string, min_date string, min_trade_pr float, min_trade_pr_2 float',\n", + " 'data': {'$ref': '#/ResampleUnitTests/test_aggregate_min/expected_data/data'}},\n", + " '$ref': None}},\n", + " 'test_aggregate_min_with_fill': {'input_data': {'tsdf': {},\n", + " 'df': {},\n", + " '$ref': '#/__SharedData/input_data'},\n", + " 'expected_data': {'tsdf': {'ts_col': 'event_ts',\n", + " 'partition_cols': ['symbol']},\n", + " 'df': {'schema': 'symbol string, event_ts string, date string, trade_pr float, trade_pr_2 float',\n", + " 'data': [['S1', '2020-08-01 00:00:00', 'SAME_DT', 340.21, 6.0],\n", + " ['S1', '2020-08-02 00:00:00', None, 0.0, 0.0],\n", + " ['S1', '2020-08-03 00:00:00', None, 0.0, 0.0],\n", + " ['S1', '2020-08-04 00:00:00', None, 0.0, 0.0],\n", + " ['S1', '2020-08-05 00:00:00', None, 0.0, 0.0],\n", + " ['S1', '2020-08-06 00:00:00', None, 0.0, 0.0],\n", + " ['S1', '2020-08-07 00:00:00', None, 0.0, 0.0],\n", + " ['S1', '2020-08-08 00:00:00', None, 0.0, 0.0],\n", + " ['S1', '2020-08-09 00:00:00', None, 0.0, 0.0],\n", + " ['S1', '2020-08-10 00:00:00', None, 0.0, 0.0],\n", + " ['S1', '2020-08-11 00:00:00', None, 0.0, 0.0],\n", + " ['S1', '2020-08-12 00:00:00', None, 0.0, 0.0],\n", + " ['S1', '2020-08-13 00:00:00', None, 0.0, 0.0],\n", + " ['S1', '2020-08-14 00:00:00', None, 0.0, 0.0],\n", + " ['S1', '2020-08-15 00:00:00', None, 0.0, 0.0],\n", + " ['S1', '2020-08-16 00:00:00', None, 0.0, 0.0],\n", + " ['S1', '2020-08-17 00:00:00', None, 0.0, 0.0],\n", + " ['S1', 
'2020-08-18 00:00:00', None, 0.0, 0.0],\n", + " ['S1', '2020-08-19 00:00:00', None, 0.0, 0.0],\n", + " ['S1', '2020-08-20 00:00:00', None, 0.0, 0.0],\n", + " ['S1', '2020-08-21 00:00:00', None, 0.0, 0.0],\n", + " ['S1', '2020-08-22 00:00:00', None, 0.0, 0.0],\n", + " ['S1', '2020-08-23 00:00:00', None, 0.0, 0.0],\n", + " ['S1', '2020-08-24 00:00:00', None, 0.0, 0.0],\n", + " ['S1', '2020-08-25 00:00:00', None, 0.0, 0.0],\n", + " ['S1', '2020-08-26 00:00:00', None, 0.0, 0.0],\n", + " ['S1', '2020-08-27 00:00:00', None, 0.0, 0.0],\n", + " ['S1', '2020-08-28 00:00:00', None, 0.0, 0.0],\n", + " ['S1', '2020-08-29 00:00:00', None, 0.0, 0.0],\n", + " ['S1', '2020-08-30 00:00:00', None, 0.0, 0.0],\n", + " ['S1', '2020-08-31 00:00:00', None, 0.0, 0.0],\n", + " ['S1', '2020-09-01 00:00:00', 'SAME_DT', 361.1, 4.0]]},\n", + " '$ref': None}},\n", + " 'test_aggregate_max': {'input_data': {'tsdf': {},\n", + " 'df': {},\n", + " '$ref': '#/__SharedData/input_data'},\n", + " 'expected_data': {'tsdf': {'ts_col': 'event_ts',\n", + " 'partition_cols': ['symbol']},\n", + " 'df': {'schema': 'symbol string, event_ts string, date string, trade_pr float, trade_pr_2 float',\n", + " 'data': [['S1', '2020-08-01 00:00:00', 'SAME_DT', 353.32, 10.0],\n", + " ['S1', '2020-09-01 00:00:00', 'SAME_DT', 362.1, 5.0]]},\n", + " '$ref': None}},\n", + " 'test_aggregate_ceiling': {'input_data': {'tsdf': {},\n", + " 'df': {},\n", + " '$ref': '#/__SharedData/input_data'},\n", + " 'expected_data': {'tsdf': {'ts_col': 'event_ts',\n", + " 'partition_cols': ['symbol']},\n", + " 'df': {'schema': 'symbol string, event_ts string, date string, trade_pr float, trade_pr_2 float',\n", + " 'data': [['S1', '2020-08-01 00:00:00', 'SAME_DT', 350.32, 6.0],\n", + " ['S1', '2020-09-01 00:00:00', 'SAME_DT', 362.1, 4.0]]},\n", + " '$ref': None}},\n", + " 'test_aggregate_invalid_func_arg': {'input_data': {'tsdf': {},\n", + " 'df': {},\n", + " '$ref': '#/__SharedData/input_data'},\n", + " 'expected_data': {'tsdf': {'ts_col': 
'event_ts',\n", + " 'partition_cols': ['symbol']},\n", + " 'df': {'schema': 'symbol string, event_ts string, date string, trade_pr float, trade_pr_2 float',\n", + " 'data': [['S1', '2020-07-31 20:00:00', 'SAME_DT', 348.88, 8.0],\n", + " ['S1', '2020-08-31 20:00:00', 'SAME_DT', 361.6, 4.5]]},\n", + " '$ref': None}}},\n", + " '__SharedData': {'input_data': {'tsdf': {'ts_col': 'event_ts',\n", + " 'partition_cols': ['symbol']},\n", + " 'df': {'schema': 'symbol string, date string, event_ts string, trade_pr float, trade_pr_2 float',\n", + " 'data': [['S1', 'SAME_DT', '2020-08-01 00:00:10', 349.21, 10.0],\n", + " ['S1', 'SAME_DT', '2020-08-01 00:00:11', 340.21, 9.0],\n", + " ['S1', 'SAME_DT', '2020-08-01 00:01:12', 353.32, 8.0],\n", + " ['S1', 'SAME_DT', '2020-08-01 00:01:13', 351.32, 7.0],\n", + " ['S1', 'SAME_DT', '2020-08-01 00:01:14', 350.32, 6.0],\n", + " ['S1', 'SAME_DT', '2020-09-01 00:01:12', 361.1, 5.0],\n", + " ['S1', 'SAME_DT', '2020-09-01 00:19:12', 362.1, 4.0]]}}}}" + ] + }, + "execution_count": 25, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "combined = after | after_2\n", + "combined" + ] + }, + { + "metadata": {}, + "cell_type": "code", + "outputs": [], + "execution_count": null, + "source": "" + } + ], + "metadata": { + "kernelspec": { + "display_name": ".venv142", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.13" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/python/tests/unit_test_data/resample_tests.json b/python/tests/unit_test_data/resample_tests.json index 19b22acb..cd429e04 100644 --- a/python/tests/unit_test_data/resample_tests.json +++ b/python/tests/unit_test_data/resample_tests.json @@ -1,498 +1,556 @@ { "__SharedData": { - "input_data": { - 
"schema": "symbol string, date string, event_ts string, trade_pr float, trade_pr_2 float", - "ts_col": "event_ts", - "partition_cols": [ - "symbol" - ], - "data": [ - [ - "S1", - "SAME_DT", - "2020-08-01 00:00:10", - 349.21, - 10.0 - ], - [ - "S1", - "SAME_DT", - "2020-08-01 00:00:11", - 340.21, - 9.0 - ], - [ - "S1", - "SAME_DT", - "2020-08-01 00:01:12", - 353.32, - 8.0 - ], - [ - "S1", - "SAME_DT", - "2020-08-01 00:01:13", - 351.32, - 7.0 - ], - [ - "S1", - "SAME_DT", - "2020-08-01 00:01:14", - 350.32, - 6.0 - ], - [ - "S1", - "SAME_DT", - "2020-09-01 00:01:12", - 361.1, - 5.0 - ], - [ - "S1", - "SAME_DT", - "2020-09-01 00:19:12", - 362.1, - 4.0 - ] - ] - } - }, - "ResampleUnitTests": { - "test_appendAggKey_freq_is_none": { - "input_data": { - "$ref": "#/__SharedData/input_data" - } - }, - "test_appendAggKey_freq_microsecond": { - "input_data": { - "$ref": "#/__SharedData/input_data" - } - }, - "test_appendAggKey_freq_is_invalid": { - "input_data": { - "$ref": "#/__SharedData/input_data" - } - }, - "test_aggregate_floor": { - "input_data": { - "$ref": "#/__SharedData/input_data" - }, - "expected_data": { - "schema": "symbol string, event_ts string, date string, trade_pr float, trade_pr_2 float", + "init": { + "tsdf": { "ts_col": "event_ts", "partition_cols": [ "symbol" + ] + }, + "df": { + "schema": "symbol string, date string, event_ts string, trade_pr float, trade_pr_2 float", + "ts_convert": [ + "event_ts" ], "data": [ [ "S1", - "2020-08-01 00:00:00", "SAME_DT", + "2020-08-01 00:00:10", 349.21, 10.0 ], [ "S1", - "2020-09-01 00:00:00", "SAME_DT", - 361.1, - 5.0 - ] - ] - } - }, - "test_aggregate_average": { - "input_data": { - "$ref": "#/__SharedData/input_data" - }, - "expected_data": { - "schema": "symbol string, event_ts string, trade_pr double, trade_pr_2 double", - "ts_col": "event_ts", - "partition_cols": [ - "symbol" - ], - "data": [ + "2020-08-01 00:00:11", + 340.21, + 9.0 + ], [ "S1", - "2020-08-01 00:00:00", - 348.8760009765625, + "SAME_DT", + 
"2020-08-01 00:01:12", + 353.32, 8.0 ], [ "S1", - "2020-09-01 00:00:00", - 361.6000061035156, - 4.5 - ] - ] - } - }, - "test_aggregate_min": { - "input_data": { - "$ref": "#/__SharedData/input_data" - }, - "expected_data": { - "schema": "symbol string, event_ts string, date string, trade_pr float, trade_pr_2 float", - "ts_col": "event_ts", - "partition_cols": [ - "symbol" - ], - "data": [ + "SAME_DT", + "2020-08-01 00:01:13", + 351.32, + 7.0 + ], [ "S1", - "2020-08-01 00:00:00", "SAME_DT", - 340.21, + "2020-08-01 00:01:14", + 350.32, 6.0 ], [ "S1", - "2020-09-01 00:00:00", "SAME_DT", + "2020-09-01 00:01:12", 361.1, + 5.0 + ], + [ + "S1", + "SAME_DT", + "2020-09-01 00:19:12", + 362.1, 4.0 ] ] } + } + }, + "ResampleUnitTests": { + "test_appendAggKey_freq_is_none": { + "init": { + "$ref": "#/__SharedData/init" + } + }, + "test_appendAggKey_freq_microsecond": { + "init": { + "$ref": "#/__SharedData/init" + } + }, + "test_appendAggKey_freq_is_invalid": { + "init": { + "$ref": "#/__SharedData/init" + } + }, + "test_aggregate_floor": { + "init": { + "$ref": "#/__SharedData/init" + }, + "expected": { + "tsdf": { + "$ref": "#/__SharedData/init/tsdf" + }, + "df": { + "schema": "symbol string, event_ts string, date string, trade_pr float, trade_pr_2 float", + "ts_convert": [ + "event_ts" + ], + "data": [ + [ + "S1", + "2020-08-01 00:00:00", + "SAME_DT", + 349.21, + 10.0 + ], + [ + "S1", + "2020-09-01 00:00:00", + "SAME_DT", + 361.1, + 5.0 + ] + ] + } + } + }, + "test_aggregate_average": { + "init": { + "$ref": "#/__SharedData/init" + }, + "expected": { + "tsdf": { + "$ref": "#/__SharedData/init/tsdf" + }, + "df": { + "schema": "symbol string, event_ts string, trade_pr double, trade_pr_2 double", + "ts_convert": [ + "event_ts" + ], + "data": [ + [ + "S1", + "2020-08-01 00:00:00", + 348.8760009765625, + 8.0 + ], + [ + "S1", + "2020-09-01 00:00:00", + 361.6000061035156, + 4.5 + ] + ] + } + } + }, + "test_aggregate_min": { + "init": { + "$ref": "#/__SharedData/init" + }, + 
"expected": { + "tsdf": { + "$ref": "#/__SharedData/init/tsdf" + }, + "df": { + "schema": { + "$ref": "#/ResampleUnitTests/test_aggregate_floor/expected/df/schema" + }, + "ts_convert": [ + "event_ts" + ], + "data": [ + [ + "S1", + "2020-08-01 00:00:00", + "SAME_DT", + 340.21, + 6.0 + ], + [ + "S1", + "2020-09-01 00:00:00", + "SAME_DT", + 361.1, + 4.0 + ] + ] + } + } }, "test_aggregate_min_with_prefix": { - "input_data": { - "$ref": "#/__SharedData/input_data" + "init": { + "$ref": "#/__SharedData/init" }, - "expected_data": { - "schema": "symbol string, event_ts string, min_date string, min_trade_pr float, min_trade_pr_2 float", - "ts_col": "event_ts", - "partition_cols": [ - "symbol" - ], - "data": { - "$ref": "#/ResampleUnitTests/test_aggregate_min/expected_data/data" + "expected": { + "tsdf": { + "$ref": "#/__SharedData/init/tsdf" + }, + "df": { + "schema": "symbol string, event_ts string, min_date string, min_trade_pr float, min_trade_pr_2 float", + "ts_convert": [ + "event_ts" + ], + "data": { + "$ref": "#/ResampleUnitTests/test_aggregate_min/expected/df/data" + } } } }, "test_aggregate_min_with_fill": { - "input_data": { - "$ref": "#/__SharedData/input_data" + "init": { + "$ref": "#/__SharedData/init" }, - "expected_data": { - "schema": "symbol string, event_ts string, date string, trade_pr float, trade_pr_2 float", - "ts_col": "event_ts", - "partition_cols": [ - "symbol" - ], - "data": [ - [ - "S1", - "2020-08-01 00:00:00", - "SAME_DT", - 340.21, - 6.0 - ], - [ - "S1", - "2020-08-02 00:00:00", - null, - 0.0, - 0.0 - ], - [ - "S1", - "2020-08-03 00:00:00", - null, - 0.0, - 0.0 - ], - [ - "S1", - "2020-08-04 00:00:00", - null, - 0.0, - 0.0 - ], - [ - "S1", - "2020-08-05 00:00:00", - null, - 0.0, - 0.0 - ], - [ - "S1", - "2020-08-06 00:00:00", - null, - 0.0, - 0.0 - ], - [ - "S1", - "2020-08-07 00:00:00", - null, - 0.0, - 0.0 - ], - [ - "S1", - "2020-08-08 00:00:00", - null, - 0.0, - 0.0 - ], - [ - "S1", - "2020-08-09 00:00:00", - null, - 0.0, - 0.0 - ], - [ - 
"S1", - "2020-08-10 00:00:00", - null, - 0.0, - 0.0 - ], - [ - "S1", - "2020-08-11 00:00:00", - null, - 0.0, - 0.0 - ], - [ - "S1", - "2020-08-12 00:00:00", - null, - 0.0, - 0.0 - ], - [ - "S1", - "2020-08-13 00:00:00", - null, - 0.0, - 0.0 - ], - [ - "S1", - "2020-08-14 00:00:00", - null, - 0.0, - 0.0 - ], - [ - "S1", - "2020-08-15 00:00:00", - null, - 0.0, - 0.0 - ], - [ - "S1", - "2020-08-16 00:00:00", - null, - 0.0, - 0.0 - ], - [ - "S1", - "2020-08-17 00:00:00", - null, - 0.0, - 0.0 - ], - [ - "S1", - "2020-08-18 00:00:00", - null, - 0.0, - 0.0 - ], - [ - "S1", - "2020-08-19 00:00:00", - null, - 0.0, - 0.0 - ], - [ - "S1", - "2020-08-20 00:00:00", - null, - 0.0, - 0.0 - ], - [ - "S1", - "2020-08-21 00:00:00", - null, - 0.0, - 0.0 - ], - [ - "S1", - "2020-08-22 00:00:00", - null, - 0.0, - 0.0 - ], - [ - "S1", - "2020-08-23 00:00:00", - null, - 0.0, - 0.0 - ], - [ - "S1", - "2020-08-24 00:00:00", - null, - 0.0, - 0.0 - ], - [ - "S1", - "2020-08-25 00:00:00", - null, - 0.0, - 0.0 - ], - [ - "S1", - "2020-08-26 00:00:00", - null, - 0.0, - 0.0 - ], - [ - "S1", - "2020-08-27 00:00:00", - null, - 0.0, - 0.0 - ], - [ - "S1", - "2020-08-28 00:00:00", - null, - 0.0, - 0.0 - ], - [ - "S1", - "2020-08-29 00:00:00", - null, - 0.0, - 0.0 - ], - [ - "S1", - "2020-08-30 00:00:00", - null, - 0.0, - 0.0 - ], - [ - "S1", - "2020-08-31 00:00:00", - null, - 0.0, - 0.0 - ], - [ - "S1", - "2020-09-01 00:00:00", - "SAME_DT", - 361.1, - 4.0 + "expected": { + "tsdf": { + "$ref": "#/__SharedData/init/tsdf" + }, + "df": { + "schema": { + "$ref": "#/ResampleUnitTests/test_aggregate_min/expected/df/schema" + }, + "ts_convert": [ + "event_ts" + ], + "data": [ + [ + "S1", + "2020-08-01 00:00:00", + "SAME_DT", + 340.21, + 6.0 + ], + [ + "S1", + "2020-08-02 00:00:00", + null, + 0.0, + 0.0 + ], + [ + "S1", + "2020-08-03 00:00:00", + null, + 0.0, + 0.0 + ], + [ + "S1", + "2020-08-04 00:00:00", + null, + 0.0, + 0.0 + ], + [ + "S1", + "2020-08-05 00:00:00", + null, + 0.0, + 0.0 + ], + [ + "S1", + 
"2020-08-06 00:00:00", + null, + 0.0, + 0.0 + ], + [ + "S1", + "2020-08-07 00:00:00", + null, + 0.0, + 0.0 + ], + [ + "S1", + "2020-08-08 00:00:00", + null, + 0.0, + 0.0 + ], + [ + "S1", + "2020-08-09 00:00:00", + null, + 0.0, + 0.0 + ], + [ + "S1", + "2020-08-10 00:00:00", + null, + 0.0, + 0.0 + ], + [ + "S1", + "2020-08-11 00:00:00", + null, + 0.0, + 0.0 + ], + [ + "S1", + "2020-08-12 00:00:00", + null, + 0.0, + 0.0 + ], + [ + "S1", + "2020-08-13 00:00:00", + null, + 0.0, + 0.0 + ], + [ + "S1", + "2020-08-14 00:00:00", + null, + 0.0, + 0.0 + ], + [ + "S1", + "2020-08-15 00:00:00", + null, + 0.0, + 0.0 + ], + [ + "S1", + "2020-08-16 00:00:00", + null, + 0.0, + 0.0 + ], + [ + "S1", + "2020-08-17 00:00:00", + null, + 0.0, + 0.0 + ], + [ + "S1", + "2020-08-18 00:00:00", + null, + 0.0, + 0.0 + ], + [ + "S1", + "2020-08-19 00:00:00", + null, + 0.0, + 0.0 + ], + [ + "S1", + "2020-08-20 00:00:00", + null, + 0.0, + 0.0 + ], + [ + "S1", + "2020-08-21 00:00:00", + null, + 0.0, + 0.0 + ], + [ + "S1", + "2020-08-22 00:00:00", + null, + 0.0, + 0.0 + ], + [ + "S1", + "2020-08-23 00:00:00", + null, + 0.0, + 0.0 + ], + [ + "S1", + "2020-08-24 00:00:00", + null, + 0.0, + 0.0 + ], + [ + "S1", + "2020-08-25 00:00:00", + null, + 0.0, + 0.0 + ], + [ + "S1", + "2020-08-26 00:00:00", + null, + 0.0, + 0.0 + ], + [ + "S1", + "2020-08-27 00:00:00", + null, + 0.0, + 0.0 + ], + [ + "S1", + "2020-08-28 00:00:00", + null, + 0.0, + 0.0 + ], + [ + "S1", + "2020-08-29 00:00:00", + null, + 0.0, + 0.0 + ], + [ + "S1", + "2020-08-30 00:00:00", + null, + 0.0, + 0.0 + ], + [ + "S1", + "2020-08-31 00:00:00", + null, + 0.0, + 0.0 + ], + [ + "S1", + "2020-09-01 00:00:00", + "SAME_DT", + 361.1, + 4.0 + ] ] - ] + } } }, "test_aggregate_max": { - "input_data": { - "$ref": "#/__SharedData/input_data" + "init": { + "$ref": "#/__SharedData/init" }, - "expected_data": { - "schema": "symbol string, event_ts string, date string, trade_pr float, trade_pr_2 float", - "ts_col": "event_ts", - "partition_cols": [ - 
"symbol" - ], - "data": [ - [ - "S1", - "2020-08-01 00:00:00", - "SAME_DT", - 353.32, - 10.0 - ], - [ - "S1", - "2020-09-01 00:00:00", - "SAME_DT", - 362.1, - 5.0 + "expected": { + "tsdf": { + "$ref": "#/__SharedData/init/tsdf" + }, + "df": { + "schema": { + "$ref": "#/ResampleUnitTests/test_aggregate_floor/expected/df/schema" + }, + "ts_convert": [ + "event_ts" + ], + "data": [ + [ + "S1", + "2020-08-01 00:00:00", + "SAME_DT", + 353.32, + 10.0 + ], + [ + "S1", + "2020-09-01 00:00:00", + "SAME_DT", + 362.1, + 5.0 + ] ] - ] + } } }, "test_aggregate_ceiling": { - "input_data": { - "$ref": "#/__SharedData/input_data" + "init": { + "$ref": "#/__SharedData/init" }, - "expected_data": { - "schema": "symbol string, event_ts string, date string, trade_pr float, trade_pr_2 float", - "ts_col": "event_ts", - "partition_cols": [ - "symbol" - ], - "data": [ - [ - "S1", - "2020-08-01 00:00:00", - "SAME_DT", - 350.32, - 6.0 - ], - [ - "S1", - "2020-09-01 00:00:00", - "SAME_DT", - 362.1, - 4.0 + "expected": { + "tsdf": { + "$ref": "#/__SharedData/init/tsdf" + }, + "df": { + "schema": { + "$ref": "#/ResampleUnitTests/test_aggregate_floor/expected/df/schema" + }, + "ts_convert": [ + "event_ts" + ], + "data": [ + [ + "S1", + "2020-08-01 00:00:00", + "SAME_DT", + 350.32, + 6.0 + ], + [ + "S1", + "2020-09-01 00:00:00", + "SAME_DT", + 362.1, + 4.0 + ] ] - ] + } } }, "test_aggregate_invalid_func_arg": { - "input_data": { - "$ref": "#/__SharedData/input_data" + "init": { + "$ref": "#/__SharedData/init" }, - "expected_data": { - "schema": "symbol string, event_ts string, date string, trade_pr float, trade_pr_2 float", - "ts_col": "event_ts", - "partition_cols": [ - "symbol" - ], - "data": [ - [ - "S1", - "2020-07-31 20:00:00", - "SAME_DT", - 348.88, - 8.0 - ], - [ - "S1", - "2020-08-31 20:00:00", - "SAME_DT", - 361.60, - 4.5 + "expected": { + "tsdf": { + "$ref": "#/__SharedData/init/tsdf" + }, + "df": { + "schema": { + "$ref": "#/ResampleUnitTests/test_aggregate_floor/expected/df/schema" + 
}, + "data": [ + [ + "S1", + "2020-07-31 20:00:00", + "SAME_DT", + 348.88, + 8.0 + ], + [ + "S1", + "2020-08-31 20:00:00", + "SAME_DT", + 361.6, + 4.5 + ] ] - ] + } } - } + }, + "test_check_allowable_freq_none": {}, + "test_check_allowable_freq_microsecond": {}, + "test_check_allowable_freq_millisecond": {}, + "test_check_allowable_freq_second": {}, + "test_check_allowable_freq_minute": {}, + "test_check_allowable_freq_hour": {}, + "test_check_allowable_freq_day": {}, + "test_check_allowable_freq_no_interval": {}, + "test_check_allowable_freq_exception_not_in_allowable_freqs": {}, + "test_check_allowable_freq_exception": {}, + "test_validate_func_exists_type_error": {}, + "test_validate_func_exists_value_error": {} } } \ No newline at end of file diff --git a/python/tests/unit_test_data/tsdf_tests.json b/python/tests/unit_test_data/tsdf_tests.json index 3cf1482a..99386d93 100644 --- a/python/tests/unit_test_data/tsdf_tests.json +++ b/python/tests/unit_test_data/tsdf_tests.json @@ -1,53 +1,58 @@ { "__SharedData": { "temp_slice_init_data": { - "schema": "symbol string, event_ts string, trade_pr float", - "ts_col": "event_ts", - "partition_cols": [ - "symbol" - ], - "data": [ - [ - "S1", - "2020-08-01 00:00:10", - 349.21 - ], - [ - "S1", - "2020-08-01 00:01:12", - 351.32 - ], - [ - "S1", - "2020-09-01 00:02:10", - 361.1 - ], - [ - "S1", - "2020-09-01 00:19:12", - 362.1 - ], - [ - "S2", - "2020-08-01 00:01:10", - 743.01 - ], - [ - "S2", - "2020-08-01 00:01:24", - 751.92 - ], - [ - "S2", - "2020-09-01 00:02:10", - 761.10 - ], - [ - "S2", - "2020-09-01 00:20:42", - 762.33 + "tsdf": { + "ts_col": "event_ts", + "partition_cols": [ + "symbol" + ] + }, + "df": { + "schema": "symbol string, event_ts string, trade_pr float", + "ts_convert": ["event_ts"], + "data": [ + [ + "S1", + "2020-08-01 00:00:10", + 349.21 + ], + [ + "S1", + "2020-08-01 00:01:12", + 351.32 + ], + [ + "S1", + "2020-09-01 00:02:10", + 361.1 + ], + [ + "S1", + "2020-09-01 00:19:12", + 362.1 + ], + [ + "S2", 
+ "2020-08-01 00:01:10", + 743.01 + ], + [ + "S2", + "2020-08-01 00:01:24", + 751.92 + ], + [ + "S2", + "2020-09-01 00:02:10", + 761.10 + ], + [ + "S2", + "2020-09-01 00:20:42", + 762.33 + ] ] - ] + } } }, "TSDFBaseTests": { @@ -61,6 +66,31 @@ "$ref": "#/__SharedData/temp_slice_init_data" } }, + "test__validate_ts_string_valid": { + "init": { + "$ref": "#/__SharedData/temp_slice_init_data" + } + }, + "test__validate_ts_string_alt_format_valid": { + "init": { + "$ref": "#/__SharedData/temp_slice_init_data" + } + }, + "test__validate_ts_string_with_microseconds_valid": { + "init": { + "$ref": "#/__SharedData/temp_slice_init_data" + } + }, + "test__validate_ts_string_alt_format_with_microseconds_valid": { + "init": { + "$ref": "#/__SharedData/temp_slice_init_data" + } + }, + "test__validate_ts_string_invalid": { + "init": { + "$ref": "#/__SharedData/temp_slice_init_data" + } + }, "test__validated_column_not_string": { "init": { "$ref": "#/__SharedData/temp_slice_init_data" @@ -101,18 +131,25 @@ "$ref": "#/__SharedData/temp_slice_init_data" }, "right_tsdf": { - "schema": "symbol string, event_ts string, trade_pr float", - "ts_col": "event_ts", - "partition_cols": [ - "event_ts" - ], - "data": [ - [ - "S1", - "2020-08-01 00:00:10", - 349.21 + "tsdf": { + "ts_col": "event_ts", + "partition_cols": [ + "event_ts" ] - ] + }, + "df": { + "schema": "symbol string, event_ts string, trade_pr float", + "ts_convert": [ + "event_ts" + ], + "data": [ + [ + "S1", + "2020-08-01 00:00:10", + 349.21 + ] + ] + } } }, "test__validateTsColMatch": { @@ -120,18 +157,22 @@ "$ref": "#/__SharedData/temp_slice_init_data" }, "right_tsdf": { - "schema": "symbol string, event_ts int, trade_pr float", - "ts_col": "event_ts", - "partition_cols": [ - "symbol" - ], - "data": [ - [ - "S1", - 1596240010, - 349.21 + "tsdf": { + "ts_col": "event_ts", + "partition_cols": [ + "symbol" ] - ] + }, + "df": { + "schema": "symbol string, event_ts int, trade_pr float", + "data": [ + [ + "S1", + 1596240010, + 
349.21 + ] + ] + } } }, "test__addPrefixToColumns_non_empty_string": { @@ -164,69 +205,76 @@ "$ref": "#/__SharedData/temp_slice_init_data" }, "expected": { - "schema": "symbol string, event_ts string, trade_pr float, ts_partition int, is_original int", - "ts_col": "event_ts", - "partition_cols": [ - "symbol" - ], - "data": [ - [ - "S1", - "2020-08-01 00:00:10", - 349.21, - 1596240010, - 1 - ], - [ - "S1", - "2020-08-01 00:01:12", - 351.32, - 1596240070, - 1 - ], - [ - "S1", - "2020-09-01 00:02:10", - 361.1, - 1598918530, - 1 - ], - [ - "S1", - "2020-09-01 00:19:12", - 362.1, - 1598919550, - 1 - ], - [ - "S2", - "2020-08-01 00:01:10", - 743.01, - 1596240070, - 1 - ], - [ - "S2", - "2020-08-01 00:01:24", - 751.92, - 1596240080, - 1 - ], - [ - "S2", - "2020-09-01 00:02:10", - 761.1, - 1598918530, - 1 - ], - [ - "S2", - "2020-09-01 00:20:42", - 762.33, - 1598919640, - 1 + "tsdf": { + "ts_col": "event_ts", + "partition_cols": [ + "symbol" ] - ] + }, + "df": { + "schema": "symbol string, event_ts string, trade_pr float, ts_partition int, is_original int", + "ts_convert": [ + "event_ts" + ], + "data": [ + [ + "S1", + "2020-08-01 00:00:10", + 349.21, + 1596240010, + 1 + ], + [ + "S1", + "2020-08-01 00:01:12", + 351.32, + 1596240070, + 1 + ], + [ + "S1", + "2020-09-01 00:02:10", + 361.1, + 1598918530, + 1 + ], + [ + "S1", + "2020-09-01 00:19:12", + 362.1, + 1598919550, + 1 + ], + [ + "S2", + "2020-08-01 00:01:10", + 743.01, + 1596240070, + 1 + ], + [ + "S2", + "2020-08-01 00:01:24", + 751.92, + 1596240080, + 1 + ], + [ + "S2", + "2020-09-01 00:02:10", + 761.1, + 1598918530, + 1 + ], + [ + "S2", + "2020-09-01 00:20:42", + 762.33, + 1598919640, + 1 + ] + ] + } } }, "test__getTimePartitions_with_fraction": { @@ -234,69 +282,76 @@ "$ref": "#/__SharedData/temp_slice_init_data" }, "expected": { - "schema": "symbol string, event_ts string, trade_pr float, ts_partition int, is_original int", - "ts_col": "event_ts", - "partition_cols": [ - "symbol" - ], - "data": [ - [ - "S1", - 
"2020-08-01 00:00:10", - 349.21, - 1596240010, - 1 - ], - [ - "S1", - "2020-08-01 00:01:12", - 351.32, - 1596240070, - 1 - ], - [ - "S1", - "2020-09-01 00:02:10", - 361.1, - 1598918530, - 1 - ], - [ - "S1", - "2020-09-01 00:19:12", - 362.1, - 1598919550, - 1 - ], - [ - "S2", - "2020-08-01 00:01:10", - 743.01, - 1596240070, - 1 - ], - [ - "S2", - "2020-08-01 00:01:24", - 751.92, - 1596240080, - 1 - ], - [ - "S2", - "2020-09-01 00:02:10", - 761.1, - 1598918530, - 1 - ], - [ - "S2", - "2020-09-01 00:20:42", - 762.33, - 1598919640, - 1 + "tsdf": { + "ts_col": "event_ts", + "partition_cols": [ + "symbol" ] - ] + }, + "df": { + "schema": "symbol string, event_ts string, trade_pr float, ts_partition int, is_original int", + "ts_convert": [ + "event_ts" + ], + "data": [ + [ + "S1", + "2020-08-01 00:00:10", + 349.21, + 1596240010, + 1 + ], + [ + "S1", + "2020-08-01 00:01:12", + 351.32, + 1596240070, + 1 + ], + [ + "S1", + "2020-09-01 00:02:10", + 361.1, + 1598918530, + 1 + ], + [ + "S1", + "2020-09-01 00:19:12", + 362.1, + 1598919550, + 1 + ], + [ + "S2", + "2020-08-01 00:01:10", + 743.01, + 1596240070, + 1 + ], + [ + "S2", + "2020-08-01 00:01:24", + 751.92, + 1596240080, + 1 + ], + [ + "S2", + "2020-09-01 00:02:10", + 761.1, + 1598918530, + 1 + ], + [ + "S2", + "2020-09-01 00:20:42", + 762.33, + 1598919640, + 1 + ] + ] + } } }, "test_select_empty": { @@ -351,10 +406,16 @@ }, "test_describe": { "init": { + "tsdf": { + "ts_col": "event_ts", + "partition_cols": [ + "symbol" + ] + }, + "df": { "schema": "symbol string, event_ts string, trade_pr float", - "ts_col": "event_ts", - "partition_cols": [ - "symbol" + "ts_convert": [ + "event_ts" ], "data": [ [ @@ -378,6 +439,7 @@ 362.1 ] ] + } } }, "test__getSparkPlan": { @@ -387,33 +449,40 @@ }, "test__getBytesFromPlan": { "init": { - "schema": "symbol string, event_ts string, trade_pr float", - "ts_col": "event_ts", - "partition_cols": [ - "symbol" - ], - "data": [ - [ - "S1", - "2020-08-01 00:00:10", - 349.21 - ], - [ - "S1", - 
"2020-08-01 00:01:12", - 351.32 - ], - [ - "S1", - "2020-09-01 00:02:10", - 361.1 - ], - [ - "S1", - "2020-09-01 00:19:12", - 362.1 + "tsdf": { + "ts_col": "event_ts", + "partition_cols": [ + "symbol" ] - ] + }, + "df": { + "schema": "symbol string, event_ts string, trade_pr float", + "ts_convert": [ + "event_ts" + ], + "data": [ + [ + "S1", + "2020-08-01 00:00:10", + 349.21 + ], + [ + "S1", + "2020-08-01 00:01:12", + 351.32 + ], + [ + "S1", + "2020-09-01 00:02:10", + 361.1 + ], + [ + "S1", + "2020-09-01 00:19:12", + 362.1 + ] + ] + } } }, "test__getBytesFromPlan_search_result_is_None": { @@ -441,23 +510,30 @@ "$ref": "#/__SharedData/temp_slice_init_data" }, "expected": { - "schema": "symbol string, event_ts string, trade_pr float", - "ts_col": "event_ts", - "partition_cols": [ - "symbol" - ], - "data": [ - [ - "S1", - "2020-09-01 00:02:10", - 361.1 - ], - [ - "S2", - "2020-09-01 00:02:10", - 761.10 + "tsdf": { + "ts_col": "event_ts", + "partition_cols": [ + "symbol" ] - ] + }, + "df": { + "schema": "symbol string, event_ts string, trade_pr float", + "ts_convert": [ + "event_ts" + ], + "data": [ + [ + "S1", + "2020-09-01 00:02:10", + 361.1 + ], + [ + "S2", + "2020-09-01 00:02:10", + 761.10 + ] + ] + } } }, "test_at_numeric_timestamp": { @@ -473,33 +549,38 @@ "$ref": "#/__SharedData/temp_slice_init_data" }, "expected": { - "schema": "symbol string, event_ts string, trade_pr float", - "ts_col": "event_ts", - "partition_cols": [ - "symbol" - ], - "data": [ - [ - "S1", - "2020-08-01 00:00:10", - 349.21 - ], - [ - "S1", - "2020-08-01 00:01:12", - 351.32 - ], - [ - "S2", - "2020-08-01 00:01:10", - 743.01 - ], - [ - "S2", - "2020-08-01 00:01:24", - 751.92 + "tsdf": { + "ts_col": "event_ts", + "partition_cols": [ + "symbol" ] - ] + }, + "df": { + "schema": "symbol string, event_ts string, trade_pr float", + "ts_convert": ["event_ts"], + "data": [ + [ + "S1", + "2020-08-01 00:00:10", + 349.21 + ], + [ + "S1", + "2020-08-01 00:01:12", + 351.32 + ], + [ + "S2", + "2020-08-01 
00:01:10", + 743.01 + ], + [ + "S2", + "2020-08-01 00:01:24", + 751.92 + ] + ] + } } }, "test_before_numeric_timestamp": { @@ -515,43 +596,50 @@ "$ref": "#/__SharedData/temp_slice_init_data" }, "expected": { - "schema": "symbol string, event_ts string, trade_pr float", - "ts_col": "event_ts", - "partition_cols": [ - "symbol" - ], - "data": [ - [ - "S1", - "2020-08-01 00:00:10", - 349.21 - ], - [ - "S1", - "2020-08-01 00:01:12", - 351.32 - ], - [ - "S1", - "2020-09-01 00:02:10", - 361.1 - ], - [ - "S2", - "2020-08-01 00:01:10", - 743.01 - ], - [ - "S2", - "2020-08-01 00:01:24", - 751.92 - ], - [ - "S2", - "2020-09-01 00:02:10", - 761.10 + "tsdf": { + "ts_col": "event_ts", + "partition_cols": [ + "symbol" ] - ] + }, + "df": { + "schema": "symbol string, event_ts string, trade_pr float", + "ts_convert": [ + "event_ts" + ], + "data": [ + [ + "S1", + "2020-08-01 00:00:10", + 349.21 + ], + [ + "S1", + "2020-08-01 00:01:12", + 351.32 + ], + [ + "S1", + "2020-09-01 00:02:10", + 361.1 + ], + [ + "S2", + "2020-08-01 00:01:10", + 743.01 + ], + [ + "S2", + "2020-08-01 00:01:24", + 751.92 + ], + [ + "S2", + "2020-09-01 00:02:10", + 761.10 + ] + ] + } } }, "test_atOrBefore_numeric_timestamp": { @@ -567,23 +655,28 @@ "$ref": "#/__SharedData/temp_slice_init_data" }, "expected": { - "schema": "symbol string, event_ts string, trade_pr float", - "ts_col": "event_ts", - "partition_cols": [ - "symbol" - ], - "data": [ - [ - "S1", - "2020-09-01 00:19:12", - 362.1 - ], - [ - "S2", - "2020-09-01 00:20:42", - 762.33 + "tsdf": { + "ts_col": "event_ts", + "partition_cols": [ + "symbol" ] - ] + }, + "df": { + "schema": "symbol string, event_ts string, trade_pr float", + "ts_convert": ["event_ts"], + "data": [ + [ + "S1", + "2020-09-01 00:19:12", + 362.1 + ], + [ + "S2", + "2020-09-01 00:20:42", + 762.33 + ] + ] + } } }, "test_after_numeric_timestamp": { @@ -599,33 +692,40 @@ "$ref": "#/__SharedData/temp_slice_init_data" }, "expected": { - "schema": "symbol string, event_ts string, trade_pr 
float", - "ts_col": "event_ts", - "partition_cols": [ - "symbol" - ], - "data": [ - [ - "S1", - "2020-09-01 00:02:10", - 361.1 - ], - [ - "S1", - "2020-09-01 00:19:12", - 362.1 - ], - [ - "S2", - "2020-09-01 00:02:10", - 761.10 - ], - [ - "S2", - "2020-09-01 00:20:42", - 762.33 + "tsdf": { + "ts_col": "event_ts", + "partition_cols": [ + "symbol" + ] + }, + "df": { + "schema": "symbol string, event_ts string, trade_pr float", + "ts_convert": [ + "event_ts" + ], + "data": [ + [ + "S1", + "2020-09-01 00:02:10", + 361.1 + ], + [ + "S1", + "2020-09-01 00:19:12", + 362.1 + ], + [ + "S2", + "2020-09-01 00:02:10", + 761.10 + ], + [ + "S2", + "2020-09-01 00:20:42", + 762.33 + ] ] - ] + } } }, "test_atOrAfter_numeric_timestamp": { @@ -641,38 +741,45 @@ "$ref": "#/__SharedData/temp_slice_init_data" }, "expected": { - "schema": "symbol string, event_ts string, trade_pr float", - "ts_col": "event_ts", - "partition_cols": [ - "symbol" - ], - "data": [ - [ - "S1", - "2020-08-01 00:01:12", - 351.32 - ], - [ - "S1", - "2020-09-01 00:02:10", - 361.1 - ], - [ - "S2", - "2020-08-01 00:01:10", - 743.01 - ], - [ - "S2", - "2020-08-01 00:01:24", - 751.92 - ], - [ - "S2", - "2020-09-01 00:02:10", - 761.10 + "tsdf": { + "ts_col": "event_ts", + "partition_cols": [ + "symbol" ] - ] + }, + "df": { + "schema": "symbol string, event_ts string, trade_pr float", + "ts_convert": [ + "event_ts" + ], + "data": [ + [ + "S1", + "2020-08-01 00:01:12", + 351.32 + ], + [ + "S1", + "2020-09-01 00:02:10", + 361.1 + ], + [ + "S2", + "2020-08-01 00:01:10", + 743.01 + ], + [ + "S2", + "2020-08-01 00:01:24", + 751.92 + ], + [ + "S2", + "2020-09-01 00:02:10", + 761.10 + ] + ] + } } }, "test_between_numeric_timestamp": { @@ -688,33 +795,41 @@ "$ref": "#/__SharedData/temp_slice_init_data" }, "expected": { - "schema": "symbol string, event_ts string, trade_pr float", - "ts_col": "event_ts", - "partition_cols": [ - "symbol" - ], - "data": [ - [ - "S1", - "2020-08-01 00:01:12", - 351.32 - ], - [ - "S1", - 
"2020-09-01 00:02:10", - 361.1 - ], - [ - "S2", - "2020-08-01 00:01:24", - 751.92 - ], - [ - "S2", - "2020-09-01 00:02:10", - 761.10 + "tsdf": { + "ts_col": "event_ts", + "partition_cols": [ + "symbol" ] - ] + }, + "df": { + "schema": "symbol string, event_ts string, trade_pr float", + "ts_convert": ["event_ts"], + "partition_cols": [ + "symbol" + ], + "data": [ + [ + "S1", + "2020-08-01 00:01:12", + 351.32 + ], + [ + "S1", + "2020-09-01 00:02:10", + 361.1 + ], + [ + "S2", + "2020-08-01 00:01:24", + 751.92 + ], + [ + "S2", + "2020-09-01 00:02:10", + 761.10 + ] + ] + } } }, "test_between_exclusive_numeric_timestamp": { @@ -730,43 +845,48 @@ "$ref": "#/__SharedData/temp_slice_init_data" }, "expected": { - "schema": "symbol string, event_ts string, trade_pr float", - "ts_col": "event_ts", - "partition_cols": [ - "symbol" - ], - "data": [ - [ - "S1", - "2020-08-01 00:00:10", - 349.21 - ], - [ - "S1", - "2020-08-01 00:01:12", - 351.32 - ], - [ - "S1", - "2020-09-01 00:02:10", - 361.1 - ], - [ - "S2", - "2020-08-01 00:01:10", - 743.01 - ], - [ - "S2", - "2020-08-01 00:01:24", - 751.92 - ], - [ - "S2", - "2020-09-01 00:02:10", - 761.10 + "tsdf": { + "ts_col": "event_ts", + "partition_cols": [ + "symbol" ] - ] + }, + "df": { + "schema": "symbol string, event_ts string, trade_pr float", + "ts_convert": ["event_ts"], + "data": [ + [ + "S1", + "2020-08-01 00:00:10", + 349.21 + ], + [ + "S1", + "2020-08-01 00:01:12", + 351.32 + ], + [ + "S1", + "2020-09-01 00:02:10", + 361.1 + ], + [ + "S2", + "2020-08-01 00:01:10", + 743.01 + ], + [ + "S2", + "2020-08-01 00:01:24", + 751.92 + ], + [ + "S2", + "2020-09-01 00:02:10", + 761.10 + ] + ] + } } }, "test_earliest_numeric_timestamp": { @@ -782,43 +902,53 @@ "$ref": "#/__SharedData/temp_slice_init_data" }, "expected": { - "schema": "symbol string, event_ts string, trade_pr float", - "ts_col": "event_ts", - "partition_cols": [ - "symbol" - ], - "data": [ - [ - "S1", - "2020-08-01 00:01:12", - 351.32 - ], - [ - "S1", - "2020-09-01 
00:02:10", - 361.1 - ], - [ - "S1", - "2020-09-01 00:19:12", - 362.1 - ], - [ - "S2", - "2020-08-01 00:01:24", - 751.92 - ], - [ - "S2", - "2020-09-01 00:02:10", - 761.10 - ], - [ - "S2", - "2020-09-01 00:20:42", - 762.33 + "tsdf": { + "ts_col": "event_ts", + "partition_cols": [ + "symbol" ] - ] + }, + "df": { + "schema": "symbol string, event_ts string, trade_pr float", + "ts_convert": [ + "event_ts" + ], + "partition_cols": [ + "symbol" + ], + "data": [ + [ + "S1", + "2020-08-01 00:01:12", + 351.32 + ], + [ + "S1", + "2020-09-01 00:02:10", + 361.1 + ], + [ + "S1", + "2020-09-01 00:19:12", + 362.1 + ], + [ + "S2", + "2020-08-01 00:01:24", + 751.92 + ], + [ + "S2", + "2020-09-01 00:02:10", + 761.10 + ], + [ + "S2", + "2020-09-01 00:20:42", + 762.33 + ] + ] + } } }, "test_latest_numeric_timestamp": { @@ -834,23 +964,30 @@ "$ref": "#/__SharedData/temp_slice_init_data" }, "expected": { - "schema": "symbol string, event_ts string, trade_pr float", - "ts_col": "event_ts", - "partition_cols": [ - "symbol" - ], - "data": [ - [ - "S1", - "2020-08-01 00:01:12", - 351.32 - ], - [ - "S2", - "2020-08-01 00:01:24", - 751.92 + "tsdf": { + "ts_col": "event_ts", + "partition_cols": [ + "symbol" ] - ] + }, + "df": { + "schema": "symbol string, event_ts string, trade_pr float", + "ts_convert": [ + "event_ts" + ], + "data": [ + [ + "S1", + "2020-08-01 00:01:12", + 351.32 + ], + [ + "S2", + "2020-08-01 00:01:24", + 751.92 + ] + ] + } } }, "test_priorTo_numeric_timestamp": { @@ -866,23 +1003,30 @@ "$ref": "#/__SharedData/temp_slice_init_data" }, "expected": { - "schema": "symbol string, event_ts string, trade_pr float", - "ts_col": "event_ts", - "partition_cols": [ - "symbol" - ], - "data": [ - [ - "S1", - "2020-09-01 00:02:10", - 361.1 - ], - [ - "S2", - "2020-09-01 00:02:10", - 761.10 + "tsdf": { + "ts_col": "event_ts", + "partition_cols": [ + "symbol" ] - ] + }, + "df": { + "schema": "symbol string, event_ts string, trade_pr float", + "ts_convert": [ + "event_ts" + ], + "data": [ + 
[ + "S1", + "2020-09-01 00:02:10", + 361.1 + ], + [ + "S2", + "2020-09-01 00:02:10", + 761.10 + ] + ] + } } }, "test_subsequentTo_numeric_timestamp": { @@ -900,860 +1044,990 @@ }, "test_withPartitionCols": { "init": { - "schema": "symbol string, event_ts string, trade_pr float", - "ts_col": "event_ts", - "data": { - "$ref": "#/__SharedData/temp_slice_init_data/data" + "tsdf": { + "ts_col": "event_ts" + }, + "df": { + "schema": "symbol string, event_ts string, trade_pr float", + "ts_convert": [ + "event_ts" + ], + "data": { + "$ref": "#/__SharedData/temp_slice_init_data/df/data" + } + } + } + }, + "test_tsdf_interpolate": { + "init": { + "tsdf": { + "ts_col": "event_ts" + }, + "df": { + "schema": "symbol string, event_ts string, trade_pr float", + "ts_convert": [ + "event_ts" + ], + "data": { + "$ref": "#/__SharedData/temp_slice_init_data/df/data" + } } + }, + "expected": { + "tsdf": { + "ts_col": "event_ts" + }, + "df": { + "schema": "event_ts string, trade_pr float", + "ts_convert": [ + "event_ts" + ], + "data": [ + ["2020-09-01 00:20:38", 0.0], + ["2020-09-01 00:20:39", 0.0], + ["2020-09-01 00:20:40", 0.0], + ["2020-09-01 00:20:41", 0.0], + ["2020-09-01 00:20:42", 762.33] + ] + } + } } }, "FourierTransformTest": { "test_fourier_transform": { "init": { - "schema": "group string, time long, val double", - "ts_col": "time", - "partition_cols": [ - "group" - ], - "data": [ - [ - "Emissions", - 1949, - 2206.690829 - ], - [ - "Emissions", - 1950, - 2382.046176 - ], - [ - "Emissions", - 1951, - 2526.687327 - ], - [ - "Emissions", - 1952, - 2473.373964 - ], - [ - "WindGen", - 1980, - 0.0 - ], - [ - "WindGen", - 1981, - 0.0 - ], - [ - "WindGen", - 1982, - 0.0 - ], - [ - "WindGen", - 1983, - 0.029667962 + "tsdf": { + "ts_col": "time", + "partition_cols": ["group"] + }, + "df": { + "schema": "group string, time long, val double", + "ts_convert": [ + "time" + ], + "data": [ + [ + "Emissions", + 1949, + 2206.690829 + ], + [ + "Emissions", + 1950, + 2382.046176 + ], + [ + 
"Emissions", + 1951, + 2526.687327 + ], + [ + "Emissions", + 1952, + 2473.373964 + ], + [ + "WindGen", + 1980, + 0.0 + ], + [ + "WindGen", + 1981, + 0.0 + ], + [ + "WindGen", + 1982, + 0.0 + ], + [ + "WindGen", + 1983, + 0.029667962 + ] ] - ] + } }, "expected": { - "schema": "group string, time long, val double, freq double, ft_real double, ft_imag double", - "ts_col": "time", - "partition_cols": [ - "group" - ], - "data": [ - [ - "Emissions", - 1949, - 2206.690829, - 0.0, - 9588.798296, - -0.0 - ], - [ - "Emissions", - 1950, - 2382.046176, - 0.25, - -319.996498, - 91.32778800000006 - ], - [ - "Emissions", - 1951, - 2526.687327, - -0.5, - -122.0419839999995, - -0.0 - ], - [ - "Emissions", - 1952, - 2473.373964, - -0.25, - -319.996498, - -91.32778800000006 - ], - [ - "WindGen", - 1980, - 0.0, - 0.0, - 0.029667962, - -0.0 - ], - [ - "WindGen", - 1981, - 0.0, - 0.25, - 0.0, - 0.029667962 - ], - [ - "WindGen", - 1982, - 0.0, - -0.5, - -0.029667962, - -0.0 - ], - [ - "WindGen", - 1983, - 0.029667962, - -0.25, - 0.0, - -0.029667962 + "tsdf": { + "ts_col": "time", + "partition_cols": ["group"] + }, + "df": { + "schema": "group string, time long, val double, freq double, ft_real double, ft_imag double", + "ts_convert": ["time"], + "data": [ + [ + "Emissions", + 1949, + 2206.690829, + 0.0, + 9588.798296, + -0.0 + ], + [ + "Emissions", + 1950, + 2382.046176, + 0.25, + -319.996498, + 91.32778800000006 + ], + [ + "Emissions", + 1951, + 2526.687327, + -0.5, + -122.0419839999995, + -0.0 + ], + [ + "Emissions", + 1952, + 2473.373964, + -0.25, + -319.996498, + -91.32778800000006 + ], + [ + "WindGen", + 1980, + 0.0, + 0.0, + 0.029667962, + -0.0 + ], + [ + "WindGen", + 1981, + 0.0, + 0.25, + 0.0, + 0.029667962 + ], + [ + "WindGen", + 1982, + 0.0, + -0.5, + -0.029667962, + -0.0 + ], + [ + "WindGen", + 1983, + 0.029667962, + -0.25, + 0.0, + -0.029667962 + ] ] - ] + } } }, "test_fourier_transform_no_sequence_col_empty_partition_cols": { "init": { - "schema": { - "$ref": 
"#/FourierTransformTest/test_fourier_transform/init/schema" + "tsdf": { + "ts_col": "time", + "partition_cols": [] }, - "ts_col": "time", - "partition_cols": [], - "data": { - "$ref": "#/FourierTransformTest/test_fourier_transform/init/data" + "df": { + "schema": { + "$ref": "#/FourierTransformTest/test_fourier_transform/init/df/schema" + }, + "ts_convert": ["time"], + "data": { + "$ref": "#/FourierTransformTest/test_fourier_transform/init/df/data" + } } }, "expected": { - "schema": "time long, val double, freq double, ft_real double, ft_imag double", - "ts_col": "time", - "data": [ - [ - 1949, - 2206.690829, - 0.0, - 9588.827963962001, - -0.0 - ], - [ - 1950, - 2382.046176, - 0.125, - 2142.1333092115465, - -5959.966855086621 - ], - [ - 1951, - 2526.687327, - 0.25, - -319.996498, - 91.35745596200013 - ], - [ - 1952, - 2473.373964, - 0.375, - 2271.2483487884538, - -906.5922010866211 - ], - [ - 1980, - 0.0, - -0.5, - -122.07165196199912, - -0.0 - ], - [ - 1981, - 0.0, - -0.375, - 2271.2483487884538, - 906.5922010866211 - ], - [ - 1982, - 0.0, - -0.25, - -319.996498, - -91.35745596200013 - ], - [ - 1983, - 0.029667962, - -0.125, - 2142.1333092115465, - 5959.966855086621 + "tsdf": { + "ts_col": "time", + "partition_cols": [] + }, + "df": { + "schema": "time long, val double, freq double, ft_real double, ft_imag double", + "ts_convert": [ + "time" + ], + "data": [ + [ + 1949, + 2206.690829, + 0.0, + 9588.827963962001, + -0.0 + ], + [ + 1950, + 2382.046176, + 0.125, + 2142.1333092115465, + -5959.966855086621 + ], + [ + 1951, + 2526.687327, + 0.25, + -319.996498, + 91.35745596200013 + ], + [ + 1952, + 2473.373964, + 0.375, + 2271.2483487884538, + -906.5922010866211 + ], + [ + 1980, + 0.0, + -0.5, + -122.07165196199912, + -0.0 + ], + [ + 1981, + 0.0, + -0.375, + 2271.2483487884538, + 906.5922010866211 + ], + [ + 1982, + 0.0, + -0.25, + -319.996498, + -91.35745596200013 + ], + [ + 1983, + 0.029667962, + -0.125, + 2142.1333092115465, + 5959.966855086621 + ] ] - ] + } } }, 
"test_fourier_transform_valid_sequence_col_empty_partition_cols": { "init": { - "schema": "sequence int, time long, val double", - "ts_col": "time", - "sequence_col": "sequence", - "partition_cols": [], - "data": [ - [ - 1, - 1949, - 2206.690829 - ], - [ - 2, - 1950, - 2382.046176 - ], - [ - 3, - 1951, - 2526.687327 - ], - [ - 4, - 1952, - 2473.373964 - ], - [ - 5, - 1980, - 0.0 - ], - [ - 6, - 1981, - 0.0 - ], - [ - 7, - 1982, - 0.0 - ], - [ - 8, - 1983, - 0.029667962 + "tsdf": { + "ts_col": "time", + "sequence_col": "sequence", + "partition_cols": [] + }, + "df": { + "schema": "sequence int, time long, val double", + "ts_convert": ["time"], + "data": [ + [ + 1, + 1949, + 2206.690829 + ], + [ + 2, + 1950, + 2382.046176 + ], + [ + 3, + 1951, + 2526.687327 + ], + [ + 4, + 1952, + 2473.373964 + ], + [ + 5, + 1980, + 0.0 + ], + [ + 6, + 1981, + 0.0 + ], + [ + 7, + 1982, + 0.0 + ], + [ + 8, + 1983, + 0.029667962 + ] ] - ] + } }, "expected": { - "schema": "sequence int, time long, val double, freq double, ft_real double, ft_imag double", - "ts_col": "time", - "partition_cols": [], - "data": [ - [ - 1, - 1949, - 2206.690829, - 0.0, - 9588.827963962001, - 0.0 - ], - [ - 2, - 1950, - 2382.046176, - 0.125, - 2142.1333092115465, - -5959.966855086621 - ], - [ - 3, - 1951, - 2526.687327, - 0.25, - -319.996498, - 91.35745596200013 - ], - [ - 4, - 1952, - 2473.373964, - 0.375, - 2271.2483487884538, - -906.5922010866211 - ], - [ - 5, - 1980, - 0.0, - -0.5, - -122.07165196199912, - -0.0 - ], - [ - 6, - 1981, - 0.0, - -0.375, - 2271.2483487884538, - 906.5922010866211 - ], - [ - 7, - 1982, - 0.0, - -0.25, - -319.996498, - -91.35745596200013 - ], - [ - 8, - 1983, - 0.029667962, - -0.125, - 2142.1333092115465, - 5959.966855086621 + "tsdf": { + "ts_col": "time", + "partition_cols": [] + }, + "df": { + "schema": "sequence int, time long, val double, freq double, ft_real double, ft_imag double", + "ts_convert": [ + "time" + ], + "data": [ + [ + 1, + 1949, + 2206.690829, + 0.0, + 
9588.827963962001, + 0.0 + ], + [ + 2, + 1950, + 2382.046176, + 0.125, + 2142.1333092115465, + -5959.966855086621 + ], + [ + 3, + 1951, + 2526.687327, + 0.25, + -319.996498, + 91.35745596200013 + ], + [ + 4, + 1952, + 2473.373964, + 0.375, + 2271.2483487884538, + -906.5922010866211 + ], + [ + 5, + 1980, + 0.0, + -0.5, + -122.07165196199912, + -0.0 + ], + [ + 6, + 1981, + 0.0, + -0.375, + 2271.2483487884538, + 906.5922010866211 + ], + [ + 7, + 1982, + 0.0, + -0.25, + -319.996498, + -91.35745596200013 + ], + [ + 8, + 1983, + 0.029667962, + -0.125, + 2142.1333092115465, + 5959.966855086621 + ] ] - ] + } } }, "test_fourier_transform_valid_sequence_col_valid_partition_cols": { "init": { - "schema": "group string, sequence int, time long, val double", - "ts_col": "time", - "sequence_col": "sequence", - "partition_cols": [ - "group" - ], - "data": [ - [ - "Emissions", - 1, - 1949, - 2206.690829 - ], - [ - "Emissions", - 2, - 1950, - 2382.046176 - ], - [ - "Emissions", - 3, - 1951, - 2526.687327 - ], - [ - "Emissions", - 4, - 1952, - 2473.373964 - ], - [ - "WindGen", - 1, - 1980, - 0.0 - ], - [ - "WindGen", - 2, - 1981, - 0.0 - ], - [ - "WindGen", - 3, - 1982, - 0.0 - ], - [ - "WindGen", - 4, - 1983, - 0.029667962 + "tsdf": { + "ts_col": "time", + "sequence_col": "sequence", + "partition_cols": ["group"] + }, + "df": { + "schema": "group string, sequence int, time long, val double", + "ts_convert": ["time"], + "data": [ + [ + "Emissions", + 1, + 1949, + 2206.690829 + ], + [ + "Emissions", + 2, + 1950, + 2382.046176 + ], + [ + "Emissions", + 3, + 1951, + 2526.687327 + ], + [ + "Emissions", + 4, + 1952, + 2473.373964 + ], + [ + "WindGen", + 1, + 1980, + 0.0 + ], + [ + "WindGen", + 2, + 1981, + 0.0 + ], + [ + "WindGen", + 3, + 1982, + 0.0 + ], + [ + "WindGen", + 4, + 1983, + 0.029667962 + ] ] - ] + } }, "expected": { - "schema": "group string, sequence int, time long, val double, freq double, ft_real double, ft_imag double", - "ts_col": "time", - "partition_cols": [ - "group" 
- ], - "data": [ - [ - "Emissions", - 1, - 1949, - 2206.690829, - 0.0, - 9588.798296, - 0.0 - ], - [ - "Emissions", - 2, - 1950, - 2382.046176, - 0.25, - -319.996498, - 91.32778800000006 - ], - [ - "Emissions", - 3, - 1951, - 2526.687327, - -0.5, - -122.0419839999995, - 0.0 - ], - [ - "Emissions", - 4, - 1952, - 2473.373964, - -0.25, - -319.996498, - -91.32778800000006 - ], - [ - "WindGen", - 1, - 1980, - 0.0, - 0.0, - 0.029667962, - 0.0 - ], - [ - "WindGen", - 2, - 1981, - 0.0, - 0.25, - 0.0, - 0.029667962 - ], - [ - "WindGen", - 3, - 1982, - 0.0, - -0.5, - -0.029667962, - -0.0 - ], - [ - "WindGen", - 4, - 1983, - 0.029667962, - -0.25, - 0.0, - -0.029667962 + "tsdf": { + "ts_col": "time", + "partition_cols": ["group"] + }, + "df": { + "schema": "group string, sequence int, time long, val double, freq double, ft_real double, ft_imag double", + "ts_convert": [ + "time" + ], + "data": [ + [ + "Emissions", + 1, + 1949, + 2206.690829, + 0.0, + 9588.798296, + 0.0 + ], + [ + "Emissions", + 2, + 1950, + 2382.046176, + 0.25, + -319.996498, + 91.32778800000006 + ], + [ + "Emissions", + 3, + 1951, + 2526.687327, + -0.5, + -122.0419839999995, + 0.0 + ], + [ + "Emissions", + 4, + 1952, + 2473.373964, + -0.25, + -319.996498, + -91.32778800000006 + ], + [ + "WindGen", + 1, + 1980, + 0.0, + 0.0, + 0.029667962, + 0.0 + ], + [ + "WindGen", + 2, + 1981, + 0.0, + 0.25, + 0.0, + 0.029667962 + ], + [ + "WindGen", + 3, + 1982, + 0.0, + -0.5, + -0.029667962, + -0.0 + ], + [ + "WindGen", + 4, + 1983, + 0.029667962, + -0.25, + 0.0, + -0.029667962 + ] ] - ] + } } } }, "RangeStatsTest": { "test_range_stats": { "init": { - "schema": "symbol string, event_ts string, trade_pr float", - "ts_col": "event_ts", - "partition_cols": [ - "symbol" + "tsdf": { + "ts_col": "event_ts", + "partition_cols": ["symbol"] + }, + "df": { + "schema": "symbol string, event_ts string, trade_pr float", + "ts_convert": ["event_ts"], + "data": [ + [ + "S1", + "2020-08-01 00:00:10", + 349.21 + ], + [ + "S1", + 
"2020-08-01 00:01:12", + 351.32 + ], + [ + "S1", + "2020-09-01 00:02:10", + 361.1 + ], + [ + "S1", + "2020-09-01 00:19:12", + 362.1 + ] + ] + } + }, + "expected": { + "tsdf": { + "ts_col": "event_ts", + "partition_cols": ["symbol"] + }, + "df": { + "schema": "symbol string, event_ts string, mean_trade_pr float, count_trade_pr long, min_trade_pr float, max_trade_pr float, sum_trade_pr float, stddev_trade_pr float, zscore_trade_pr float", + "ts_convert": ["event_ts"], + "data": [ + [ + "S1", + "2020-08-01 00:00:10", + 349.21, + 1, + 349.21, + 349.21, + 349.21, + null, + null + ], + [ + "S1", + "2020-08-01 00:01:12", + 350.26, + 2, + 349.21, + 351.32, + 700.53, + 1.49, + 0.71 + ], + [ + "S1", + "2020-09-01 00:02:10", + 361.1, + 1, + 361.1, + 361.1, + 361.1, + null, + null + ], + [ + "S1", + "2020-09-01 00:19:12", + 361.6, + 2, + 361.1, + 362.1, + 723.2, + 0.71, + 0.71 + ] + ] + } + } + }, + "test_group_stats": { + "init": { + "tsdf": { + "ts_col": "event_ts", + "partition_cols": ["symbol"] + }, + "df": { + "schema": "symbol string, event_ts string, trade_pr float, index integer", + "ts_convert": ["event_ts"], + "data": [ + [ + "S1", + "2020-08-01 00:00:10", + 349.21, + 1 + ], + [ + "S1", + "2020-08-01 00:00:33", + 351.32, + 1 + ], + [ + "S1", + "2020-09-01 00:02:10", + 361.1, + 1 + ], + [ + "S1", + "2020-09-01 00:02:49", + 362.1, + 1 + ] + ] + } + }, + "expected": { + "tsdf": { + "ts_col": "event_ts", + "partition_cols": ["symbol"] + }, + "df": { + "schema": "symbol string, event_ts string, mean_trade_pr float, count_trade_pr long, min_trade_pr float, max_trade_pr float, sum_trade_pr float, stddev_trade_pr float, mean_index integer, count_index integer, min_index integer, max_index integer, sum_index integer, stddev_index integer", + "ts_convert": ["event_ts"], + "data": [ + [ + "S1", + "2020-08-01 00:00:00", + 350.26, + 2, + 349.21, + 351.32, + 700.53, + 1.49, + 1, + 2, + 1, + 1, + 2, + 0 + ], + [ + "S1", + "2020-09-01 00:02:00", + 361.6, + 2, + 361.1, + 362.1, + 
723.2, + 0.71, + 1, + 2, + 1, + 1, + 2, + 0 + ] + ] + } + } + } + }, + "ResampleTest": { + "test_resample": { + "input": { + "tsdf": { + "ts_col": "event_ts", + "partition_cols": ["symbol"] + }, + "df": { + "schema": "symbol string, date string, event_ts string, trade_pr float, trade_pr_2 float", + "ts_convert": [ + "event_ts" + ], + "data": [ + [ + "S1", + "SAME_DT", + "2020-08-01 00:00:10", + 349.21, + 10.0 + ], + [ + "S1", + "SAME_DT", + "2020-08-01 00:00:11", + 340.21, + 9.0 + ], + [ + "S1", + "SAME_DT", + "2020-08-01 00:01:12", + 353.32, + 8.0 + ], + [ + "S1", + "SAME_DT", + "2020-08-01 00:01:13", + 351.32, + 7.0 + ], + [ + "S1", + "SAME_DT", + "2020-08-01 00:01:14", + 350.32, + 6.0 + ], + [ + "S1", + "SAME_DT", + "2020-09-01 00:01:12", + 361.1, + 5.0 + ], + [ + "S1", + "SAME_DT", + "2020-09-01 00:19:12", + 362.1, + 4.0 + ] + ] + } + }, + "expected": { + "tsdf": { + "ts_col": "event_ts", + "partition_cols": [ + "symbol" + ] + }, + "df": { + "schema": "symbol string, event_ts string, floor_trade_pr float, floor_date string, floor_trade_pr_2 float", + "ts_convert": [ + "event_ts" ], "data": [ [ "S1", - "2020-08-01 00:00:10", - 349.21 + "2020-08-01 00:00:00", + 349.21, + "SAME_DT", + 10.0 ], [ "S1", - "2020-08-01 00:01:12", - 351.32 + "2020-08-01 00:01:00", + 353.32, + "SAME_DT", + 8.0 ], [ "S1", - "2020-09-01 00:02:10", - 361.1 + "2020-09-01 00:01:00", + 361.1, + "SAME_DT", + 5.0 ], [ "S1", - "2020-09-01 00:19:12", - 362.1 - ] - ] - }, - "expected": { - "schema": "symbol string, event_ts string, mean_trade_pr float, count_trade_pr long, min_trade_pr float, max_trade_pr float, sum_trade_pr float, stddev_trade_pr float, zscore_trade_pr float", - "ts_col": "event_ts", - "partition_cols": [ - "symbol" - ], - "data": [ - [ - "S1", - "2020-08-01 00:00:10", - 349.21, - 1, - 349.21, - 349.21, - 349.21, - null, - null - ], - [ - "S1", - "2020-08-01 00:01:12", - 350.26, - 2, - 349.21, - 351.32, - 700.53, - 1.49, - 0.71 - ], - [ - "S1", - "2020-09-01 00:02:10", - 361.1, - 
1, - 361.1, - 361.1, - 361.1, - null, - null - ], - [ - "S1", - "2020-09-01 00:19:12", - 361.6, - 2, - 361.1, - 362.1, - 723.2, - 0.71, - 0.71 - ] - ] - } - }, - "test_group_stats": { - "init": { - "schema": "symbol string, event_ts string, trade_pr float, index integer", - "ts_col": "event_ts", - "partition_cols": [ - "symbol" - ], - "data": [ - [ - "S1", - "2020-08-01 00:00:10", - 349.21, - 1 - ], - [ - "S1", - "2020-08-01 00:00:33", - 351.32, - 1 - ], - [ - "S1", - "2020-09-01 00:02:10", - 361.1, - 1 - ], - [ - "S1", - "2020-09-01 00:02:49", - 362.1, - 1 - ] - ] - }, - "expected": { - "schema": "symbol string, event_ts string, mean_trade_pr float, count_trade_pr long, min_trade_pr float, max_trade_pr float, sum_trade_pr float, stddev_trade_pr float, mean_index integer, count_index integer, min_index integer, max_index integer, sum_index integer, stddev_index integer", - "ts_col": "event_ts", - "partition_cols": [ - "symbol" - ], - "data": [ - [ - "S1", - "2020-08-01 00:00:00", - 350.26, - 2, - 349.21, - 351.32, - 700.53, - 1.49, - 1, - 2, - 1, - 1, - 2, - 0 - ], - [ - "S1", - "2020-09-01 00:02:00", - 361.6, - 2, - 361.1, - 362.1, - 723.2, - 0.71, - 1, - 2, - 1, - 1, - 2, - 0 - ] - ] - } - } - }, - "ResampleTest": { - "test_resample": { - "input": { - "schema": "symbol string, date string, event_ts string, trade_pr float, trade_pr_2 float", - "ts_col": "event_ts", - "partition_cols": [ - "symbol" - ], - "data": [ - [ - "S1", - "SAME_DT", - "2020-08-01 00:00:10", - 349.21, - 10.0 - ], - [ - "S1", - "SAME_DT", - "2020-08-01 00:00:11", - 340.21, - 9.0 - ], - [ - "S1", - "SAME_DT", - "2020-08-01 00:01:12", - 353.32, - 8.0 - ], - [ - "S1", - "SAME_DT", - "2020-08-01 00:01:13", - 351.32, - 7.0 - ], - [ - "S1", - "SAME_DT", - "2020-08-01 00:01:14", - 350.32, - 6.0 - ], - [ - "S1", - "SAME_DT", - "2020-09-01 00:01:12", - 361.1, - 5.0 - ], - [ - "S1", - "SAME_DT", - "2020-09-01 00:19:12", - 362.1, - 4.0 - ] - ] - }, - "expected": { - "schema": "symbol string, event_ts 
string, floor_trade_pr float, floor_date string, floor_trade_pr_2 float", - "ts_col": "event_ts", - "partition_cols": [ - "symbol" - ], - "data": [ - [ - "S1", - "2020-08-01 00:00:00", - 349.21, - "SAME_DT", - 10.0 - ], - [ - "S1", - "2020-08-01 00:01:00", - 353.32, - "SAME_DT", - 8.0 - ], - [ - "S1", - "2020-09-01 00:01:00", - 361.1, - "SAME_DT", - 5.0 - ], - [ - "S1", - "2020-09-01 00:19:00", - 362.1, - "SAME_DT", - 4.0 + "2020-09-01 00:19:00", + 362.1, + "SAME_DT", + 4.0 ] ] + } }, "expected30m": { - "schema": "symbol string, event_ts string, date double, trade_pr double, trade_pr_2 double", - "ts_col": "event_ts", - "partition_cols": [ - "symbol" - ], - "data": [ - [ - "S1", - "2020-08-01 00:00:00", - null, - 348.88, - 8.0 - ], - [ - "S1", - "2020-09-01 00:00:00", - null, - 361.1, - 5.0 - ], - [ - "S1", - "2020-09-01 00:15:00", - null, - 362.1, - 4.0 + "tsdf": { + "ts_col": "event_ts", + "partition_cols": [ + "symbol" ] - ] + }, + "df": { + "schema": "symbol string, event_ts string, date double, trade_pr double, trade_pr_2 double", + "ts_convert": [ + "event_ts" + ], + "data": [ + [ + "S1", + "2020-08-01 00:00:00", + null, + 348.88, + 8.0 + ], + [ + "S1", + "2020-09-01 00:00:00", + null, + 361.1, + 5.0 + ], + [ + "S1", + "2020-09-01 00:15:00", + null, + 362.1, + 4.0 + ] + ] + } }, "expectedbars": { - "schema": "symbol string, event_ts string, close_trade_pr float, close_trade_pr_2 float, high_trade_pr float, high_trade_pr_2 float, low_trade_pr float, low_trade_pr_2 float, open_trade_pr float, open_trade_pr_2 float", - "ts_col": "event_ts", - "partition_cols": [ - "symbol" - ], - "data": [ - [ - "S1", - "2020-08-01 00:00:00", - 340.21, - 9.0, - 349.21, - 10.0, - 340.21, - 9.0, - 349.21, - 10.0 - ], - [ - "S1", - "2020-08-01 00:01:00", - 350.32, - 6.0, - 353.32, - 8.0, - 350.32, - 6.0, - 353.32, - 8.0 - ], - [ - "S1", - "2020-09-01 00:01:00", - 361.1, - 5.0, - 361.1, - 5.0, - 361.1, - 5.0, - 361.1, - 5.0 - ], - [ - "S1", - "2020-09-01 00:19:00", - 362.1, - 4.0, - 
362.1, - 4.0, - 362.1, - 4.0, - 362.1, - 4.0 + "tsdf": { + "ts_col": "event_ts", + "partition_cols": [ + "symbol" ] - ] + }, + "df": { + "schema": "symbol string, event_ts string, close_trade_pr float, close_trade_pr_2 float, high_trade_pr float, high_trade_pr_2 float, low_trade_pr float, low_trade_pr_2 float, open_trade_pr float, open_trade_pr_2 float", + "ts_convert": [ + "event_ts" + ], + "data": [ + [ + "S1", + "2020-08-01 00:00:00", + 340.21, + 9.0, + 349.21, + 10.0, + 340.21, + 9.0, + 349.21, + 10.0 + ], + [ + "S1", + "2020-08-01 00:01:00", + 350.32, + 6.0, + 353.32, + 8.0, + 350.32, + 6.0, + 353.32, + 8.0 + ], + [ + "S1", + "2020-09-01 00:01:00", + 361.1, + 5.0, + 361.1, + 5.0, + 361.1, + 5.0, + 361.1, + 5.0 + ], + [ + "S1", + "2020-09-01 00:19:00", + 362.1, + 4.0, + 362.1, + 4.0, + 362.1, + 4.0, + 362.1, + 4.0 + ] + ] + } } }, "test_resample_millis": { "init": { - "schema": "symbol string, date string, event_ts string, trade_pr float, trade_pr_2 float", - "ts_col": "event_ts", - "partition_cols": [ - "symbol" - ], - "data": [ - [ - "S1", - "SAME_DT", - "2020-08-01 00:00:10.12345", - 349.21, - 10.0 - ], - [ - "S1", - "SAME_DT", - "2020-08-01 00:00:10.123", - 340.21, - 9.0 - ], - [ - "S1", - "SAME_DT", - "2020-08-01 00:00:10.124", - 353.32, - 8.0 + "tsdf": { + "ts_col": "event_ts", + "partition_cols": ["symbol"] + }, + "df": { + "schema": "symbol string, date string, event_ts string, trade_pr float, trade_pr_2 float", + "ts_convert": [ + "event_ts" + ], + "data": [ + [ + "S1", + "SAME_DT", + "2020-08-01 00:00:10.12345", + 349.21, + 10.0 + ], + [ + "S1", + "SAME_DT", + "2020-08-01 00:00:10.123", + 340.21, + 9.0 + ], + [ + "S1", + "SAME_DT", + "2020-08-01 00:00:10.124", + 353.32, + 8.0 + ] ] - ] + } }, "expectedms": { + "tsdf": { + "ts_col": "event_ts", + "partition_cols": [ + "symbol" + ] + }, + "df": { "schema": "symbol string, event_ts string, date double, trade_pr double, trade_pr_2 double", - "ts_col": "event_ts", - "partition_cols": [ - "symbol" - ], + 
"ts_convert": ["event_ts"], "data": [ [ "S1", @@ -1770,1986 +2044,2097 @@ 8.0 ] ] + } } }, "test_upsample": { "input": { - "schema": "symbol string, date string, event_ts string, trade_pr float, trade_pr_2 float", - "ts_col": "event_ts", - "partition_cols": [ - "symbol" - ], - "data": [ - [ - "S1", - "SAME_DT", - "2020-08-01 00:00:10", - 349.21, - 10.0 - ], - [ - "S1", - "SAME_DT", - "2020-08-01 00:00:11", - 340.21, - 9.0 - ], - [ - "S1", - "SAME_DT", - "2020-08-01 00:01:12", - 353.32, - 8.0 - ], - [ - "S1", - "SAME_DT", - "2020-08-01 00:01:13", - 351.32, - 7.0 - ], - [ - "S1", - "SAME_DT", - "2020-08-01 00:01:14", - 350.32, - 6.0 - ], - [ - "S1", - "SAME_DT", - "2020-09-01 00:01:12", - 361.1, - 5.0 - ], - [ - "S1", - "SAME_DT", - "2020-09-01 00:19:12", - 362.1, - 4.0 + "tsdf": { + "ts_col": "event_ts", + "partition_cols": ["symbol"] + }, + "df": { + "schema": "symbol string, date string, event_ts string, trade_pr float, trade_pr_2 float", + "ts_convert": [ + "event_ts" + ], + "data": [ + [ + "S1", + "SAME_DT", + "2020-08-01 00:00:10", + 349.21, + 10.0 + ], + [ + "S1", + "SAME_DT", + "2020-08-01 00:00:11", + 340.21, + 9.0 + ], + [ + "S1", + "SAME_DT", + "2020-08-01 00:01:12", + 353.32, + 8.0 + ], + [ + "S1", + "SAME_DT", + "2020-08-01 00:01:13", + 351.32, + 7.0 + ], + [ + "S1", + "SAME_DT", + "2020-08-01 00:01:14", + 350.32, + 6.0 + ], + [ + "S1", + "SAME_DT", + "2020-09-01 00:01:12", + 361.1, + 5.0 + ], + [ + "S1", + "SAME_DT", + "2020-09-01 00:19:12", + 362.1, + 4.0 + ] ] - ] + } }, "expected": { - "schema": "symbol string, event_ts string, floor_trade_pr float, floor_date string, floor_trade_pr_2 float", - "ts_col": "event_ts", - "partition_cols": [ - "symbol" - ], - "data": [ - [ - "S1", - "2020-08-01 00:00:00", - 349.21, - "SAME_DT", - 10.0 - ], - [ - "S1", - "2020-08-01 00:01:00", - 353.32, - "SAME_DT", - 8.0 - ], - [ - "S1", - "2020-09-01 00:01:00", - 361.1, - "SAME_DT", - 5.0 - ], - [ - "S1", - "2020-09-01 00:19:00", - 362.1, - "SAME_DT", - 4.0 + "tsdf": { 
+ "ts_col": "event_ts", + "partition_cols": [ + "symbol" ] - ] + }, + "df": { + "schema": "symbol string, event_ts string, floor_trade_pr float, floor_date string, floor_trade_pr_2 float", + "ts_convert": [ + "event_ts" + ], + "data": [ + [ + "S1", + "2020-08-01 00:00:00", + 349.21, + "SAME_DT", + 10.0 + ], + [ + "S1", + "2020-08-01 00:01:00", + 353.32, + "SAME_DT", + 8.0 + ], + [ + "S1", + "2020-09-01 00:01:00", + 361.1, + "SAME_DT", + 5.0 + ], + [ + "S1", + "2020-09-01 00:19:00", + 362.1, + "SAME_DT", + 4.0 + ] + ] + } }, "expected30m": { - "schema": "symbol string, event_ts string, date double, trade_pr double, trade_pr_2 double", - "ts_col": "event_ts", - "partition_cols": [ - "symbol" - ], - "data": [ - [ - "S1", - "2020-08-01 00:00:00", - 0.0, - 348.88, - 8.0 - ], - [ - "S1", - "2020-08-01 00:05:00", - 0.0, - 0.0, - 0.0 - ], - [ - "S1", - "2020-09-01 00:00:00", - 0.0, - 361.1, - 5.0 - ], - [ - "S1", - "2020-09-01 00:15:00", - 0.0, - 362.1, - 4.0 + "tsdf": { + "ts_col": "event_ts", + "partition_cols": [ + "symbol" ] - ] + }, + "df": { + "schema": "symbol string, event_ts string, date double, trade_pr double, trade_pr_2 double", + "ts_convert": [ + "event_ts" + ], + "data": [ + [ + "S1", + "2020-08-01 00:00:00", + 0.0, + 348.88, + 8.0 + ], + [ + "S1", + "2020-08-01 00:05:00", + 0.0, + 0.0, + 0.0 + ], + [ + "S1", + "2020-09-01 00:00:00", + 0.0, + 361.1, + 5.0 + ], + [ + "S1", + "2020-09-01 00:15:00", + 0.0, + 362.1, + 4.0 + ] + ] + } }, "expectedbars": { - "schema": "symbol string, event_ts string, close_trade_pr float, close_trade_pr_2 float, high_trade_pr float, high_trade_pr_2 float, low_trade_pr float, low_trade_pr_2 float, open_trade_pr float, open_trade_pr_2 float", - "ts_col": "event_ts", - "partition_cols": [ - "symbol" - ], - "data": [ - [ - "S1", - "2020-08-01 00:00:00", - 340.21, - 9.0, - 349.21, - 10.0, - 340.21, - 9.0, - 349.21, - 10.0 - ], - [ - "S1", - "2020-08-01 00:01:00", - 350.32, - 6.0, - 353.32, - 8.0, - 350.32, - 6.0, - 353.32, - 8.0 - ], - 
[ - "S1", - "2020-09-01 00:01:00", - 361.1, - 5.0, - 361.1, - 5.0, - 361.1, - 5.0, - 361.1, - 5.0 - ], - [ - "S1", - "2020-09-01 00:19:00", - 362.1, - 4.0, - 362.1, - 4.0, - 362.1, - 4.0, - 362.1, - 4.0 + "tsdf": { + "ts_col": "event_ts", + "partition_cols": [ + "symbol" ] - ] + }, + "df": { + "schema": "symbol string, event_ts string, close_trade_pr float, close_trade_pr_2 float, high_trade_pr float, high_trade_pr_2 float, low_trade_pr float, low_trade_pr_2 float, open_trade_pr float, open_trade_pr_2 float", + "ts_convert": [ + "event_ts" + ], + "data": [ + [ + "S1", + "2020-08-01 00:00:00", + 340.21, + 9.0, + 349.21, + 10.0, + 340.21, + 9.0, + 349.21, + 10.0 + ], + [ + "S1", + "2020-08-01 00:01:00", + 350.32, + 6.0, + 353.32, + 8.0, + 350.32, + 6.0, + 353.32, + 8.0 + ], + [ + "S1", + "2020-09-01 00:01:00", + 361.1, + 5.0, + 361.1, + 5.0, + 361.1, + 5.0, + 361.1, + 5.0 + ], + [ + "S1", + "2020-09-01 00:19:00", + 362.1, + 4.0, + 362.1, + 4.0, + 362.1, + 4.0, + 362.1, + 4.0 + ] + ] + } } } }, "ExtractStateIntervalsTest": { "test_eq_0": { "input": { - "schema": "event_ts STRING NOT NULL, identifier_1 STRING NOT NULL, identifier_2 STRING NOT NULL, identifier_3 STRING NOT NULL, metric_1 FLOAT NOT NULL, metric_2 FLOAT NOT NULL, metric_3 FLOAT NOT NULL", - "ts_col": "event_ts", - "partition_cols": [ - "identifier_1", - "identifier_2", - "identifier_3" - ], - "data": [ - [ - "2020-08-01 00:00:09", - "v1", - "foo", - "bar", - 4.1, - 4.1, - 4.1 - ], - [ - "2020-08-01 00:00:10", - "v1", - "foo", - "bar", - 4.1, - 4.1, - 4.1 - ], - [ - "2020-08-01 00:00:11", - "v1", - "foo", - "bar", - 5.0, - 5.0, - 5.0 - ], - [ - "2020-08-01 00:01:12", - "v1", - "foo", - "bar", - 10.7, - 10.7, - 10.7 - ], - [ - "2020-08-01 00:01:13", - "v1", - "foo", - "bar", - 10.7, - 10.7, - 10.7 - ], - [ - "2020-08-01 00:01:14", - "v1", - "foo", - "bar", - 10.7, - 10.7, - 10.7 - ], - [ - "2020-08-01 00:01:15", - "v1", - "foo", - "bar", - 42.3, - 42.3, - 42.3 - ], - [ - "2020-08-01 00:01:16", - "v1", - 
"foo", - "bar", - 37.6, - 37.6, - 37.6 - ], - [ - "2020-08-01 00:01:17", - "v1", - "foo", - "bar", - 61.5, - 61.5, - 61.5 - ], - [ - "2020-09-01 00:01:12", - "v1", - "foo", - "bar", - 28.9, - 28.9, - 28.9 - ], - [ - "2020-09-01 00:19:12", - "v1", - "foo", - "bar", - 0.1, - 0.1, - 0.1 + "tsdf": { + "ts_col": "event_ts", + "partition_cols": ["identifier_1", "identifier_2", "identifier_3"] + }, + "df": { + "schema": "event_ts STRING NOT NULL, identifier_1 STRING NOT NULL, identifier_2 STRING NOT NULL, identifier_3 STRING NOT NULL, metric_1 FLOAT NOT NULL, metric_2 FLOAT NOT NULL, metric_3 FLOAT NOT NULL", + "ts_convert": [ + "event_ts" + ], + "data": [ + [ + "2020-08-01 00:00:09", + "v1", + "foo", + "bar", + 4.1, + 4.1, + 4.1 + ], + [ + "2020-08-01 00:00:10", + "v1", + "foo", + "bar", + 4.1, + 4.1, + 4.1 + ], + [ + "2020-08-01 00:00:11", + "v1", + "foo", + "bar", + 5.0, + 5.0, + 5.0 + ], + [ + "2020-08-01 00:01:12", + "v1", + "foo", + "bar", + 10.7, + 10.7, + 10.7 + ], + [ + "2020-08-01 00:01:13", + "v1", + "foo", + "bar", + 10.7, + 10.7, + 10.7 + ], + [ + "2020-08-01 00:01:14", + "v1", + "foo", + "bar", + 10.7, + 10.7, + 10.7 + ], + [ + "2020-08-01 00:01:15", + "v1", + "foo", + "bar", + 42.3, + 42.3, + 42.3 + ], + [ + "2020-08-01 00:01:16", + "v1", + "foo", + "bar", + 37.6, + 37.6, + 37.6 + ], + [ + "2020-08-01 00:01:17", + "v1", + "foo", + "bar", + 61.5, + 61.5, + 61.5 + ], + [ + "2020-09-01 00:01:12", + "v1", + "foo", + "bar", + 28.9, + 28.9, + 28.9 + ], + [ + "2020-09-01 00:19:12", + "v1", + "foo", + "bar", + 0.1, + 0.1, + 0.1 + ] ] - ] + } }, "expected": { - "schema": "start_ts STRING NOT NULL, end_ts STRING NOT NULL,identifier_1 STRING NOT NULL,identifier_2 STRING NOT NULL,identifier_3 STRING NOT NULL", - "other_ts_cols": [ - "start_ts", - "end_ts" - ], - "data": [ - [ - "2020-08-01 00:00:09", - "2020-08-01 00:00:10", - "v1", - "foo", - "bar" - ], - [ - "2020-08-01 00:01:12", - "2020-08-01 00:01:14", - "v1", - "foo", - "bar" + "df": { + "schema": "start_ts 
STRING NOT NULL, end_ts STRING NOT NULL,identifier_1 STRING NOT NULL,identifier_2 STRING NOT NULL,identifier_3 STRING NOT NULL", + "ts_convert": [ + "start_ts", + "end_ts" + ], + "data": [ + [ + "2020-08-01 00:00:09", + "2020-08-01 00:00:10", + "v1", + "foo", + "bar" + ], + [ + "2020-08-01 00:01:12", + "2020-08-01 00:01:14", + "v1", + "foo", + "bar" + ] ] - ] + } } }, "test_eq_1": { "input": { - "schema": "event_ts STRING NOT NULL, identifier_1 STRING NOT NULL, identifier_2 STRING NOT NULL, identifier_3 STRING NOT NULL, metric_1 FLOAT, metric_2 FLOAT, metric_3 FLOAT", - "ts_col": "event_ts", - "partition_cols": [ - "identifier_1", - "identifier_2", - "identifier_3" - ], - "data": [ - [ - "2020-08-01 00:00:09", - "v1", - "foo", - "bar", - 4.1, - null, - 4.1 - ], - [ - "2020-08-01 00:00:10", - "v1", - "foo", - "bar", - 4.1, - null, - 4.1 - ], - [ - "2020-08-01 00:00:11", - "v1", - "foo", - "bar", - 5.0, - 5.0, - 5.0 - ], - [ - "2020-08-01 00:01:12", - "v1", - "foo", - "bar", - 10.7, - 10.7, - 10.7 - ], - [ - "2020-08-01 00:01:13", - "v1", - "foo", - "bar", - 10.7, - 10.7, - 10.7 - ], - [ - "2020-08-01 00:01:14", - "v1", - "foo", - "bar", - 10.7, - 10.7, - null - ], - [ - "2020-08-01 00:01:15", - "v1", - "foo", - "bar", - 42.3, - 42.3, - 42.3 - ], - [ - "2020-08-01 00:01:16", - "v1", - "foo", - "bar", - 37.6, - 37.6, - 37.6 - ], - [ - "2020-08-01 00:01:17", - "v1", - "foo", - "bar", - 61.5, - 61.5, - 61.5 - ], - [ - "2020-09-01 00:01:12", - "v1", - "foo", - "bar", - 28.9, - 28.9, - 28.9 - ], - [ - "2020-09-01 00:19:12", - "v1", - "foo", - "bar", - 0.1, - 0.1, - 0.1 + "tsdf": { + "ts_col": "event_ts", + "partition_cols": ["identifier_1", "identifier_2", "identifier_3"] + }, + "df": { + "schema": "event_ts STRING NOT NULL, identifier_1 STRING NOT NULL, identifier_2 STRING NOT NULL, identifier_3 STRING NOT NULL, metric_1 FLOAT, metric_2 FLOAT, metric_3 FLOAT", + "ts_convert": ["event_ts"], + "data": [ + [ + "2020-08-01 00:00:09", + "v1", + "foo", + "bar", + 4.1, + null, 
+ 4.1 + ], + [ + "2020-08-01 00:00:10", + "v1", + "foo", + "bar", + 4.1, + null, + 4.1 + ], + [ + "2020-08-01 00:00:11", + "v1", + "foo", + "bar", + 5.0, + 5.0, + 5.0 + ], + [ + "2020-08-01 00:01:12", + "v1", + "foo", + "bar", + 10.7, + 10.7, + 10.7 + ], + [ + "2020-08-01 00:01:13", + "v1", + "foo", + "bar", + 10.7, + 10.7, + 10.7 + ], + [ + "2020-08-01 00:01:14", + "v1", + "foo", + "bar", + 10.7, + 10.7, + null + ], + [ + "2020-08-01 00:01:15", + "v1", + "foo", + "bar", + 42.3, + 42.3, + 42.3 + ], + [ + "2020-08-01 00:01:16", + "v1", + "foo", + "bar", + 37.6, + 37.6, + 37.6 + ], + [ + "2020-08-01 00:01:17", + "v1", + "foo", + "bar", + 61.5, + 61.5, + 61.5 + ], + [ + "2020-09-01 00:01:12", + "v1", + "foo", + "bar", + 28.9, + 28.9, + 28.9 + ], + [ + "2020-09-01 00:19:12", + "v1", + "foo", + "bar", + 0.1, + 0.1, + 0.1 + ] ] - ] + } }, "expected": { - "schema": "start_ts STRING NOT NULL, end_ts STRING NOT NULL,identifier_1 STRING NOT NULL,identifier_2 STRING NOT NULL,identifier_3 STRING NOT NULL", - "other_ts_cols": [ - "start_ts", - "end_ts" - ], - "data": [ - [ - "2020-08-01 00:01:12", - "2020-08-01 00:01:13", - "v1", - "foo", - "bar" + "df": { + "schema": "start_ts STRING NOT NULL, end_ts STRING NOT NULL,identifier_1 STRING NOT NULL,identifier_2 STRING NOT NULL,identifier_3 STRING NOT NULL", + "ts_convert": [ + "start_ts", + "end_ts" + ], + "data": [ + [ + "2020-08-01 00:01:12", + "2020-08-01 00:01:13", + "v1", + "foo", + "bar" + ] ] - ] + } } }, "test_ne_0": { "input": { - "schema": "event_ts STRING NOT NULL, identifier_1 STRING NOT NULL, identifier_2 STRING NOT NULL, identifier_3 STRING NOT NULL, metric_1 FLOAT NOT NULL, metric_2 FLOAT NOT NULL, metric_3 FLOAT NOT NULL", - "ts_col": "event_ts", - "partition_cols": [ - "identifier_1", - "identifier_2", - "identifier_3" - ], - "data": [ - [ - "2020-08-01 00:00:09", - "v1", - "foo", - "bar", - 4.1, - 4.1, - 4.1 - ], - [ - "2020-08-01 00:00:10", - "v1", - "foo", - "bar", - 4.1, - 4.1, - 4.1 - ], - [ - "2020-08-01 
00:00:11", - "v1", - "foo", - "bar", - 5.0, - 5.0, - 5.0 - ], - [ - "2020-08-01 00:01:12", - "v1", - "foo", - "bar", - 10.7, - 10.7, - 10.7 - ], - [ - "2020-08-01 00:01:13", - "v1", - "foo", - "bar", - 10.7, - 10.7, - 10.7 - ], - [ - "2020-08-01 00:01:14", - "v1", - "foo", - "bar", - 10.7, - 10.7, - 10.7 - ], - [ - "2020-08-01 00:01:15", - "v1", - "foo", - "bar", - 42.3, - 42.3, - 42.3 - ], - [ - "2020-08-01 00:01:16", - "v1", - "foo", - "bar", - 37.6, - 37.6, - 37.6 - ], - [ - "2020-08-01 00:01:17", - "v1", - "foo", - "bar", - 61.5, - 61.5, - 61.5 - ], - [ - "2020-09-01 00:01:12", - "v1", - "foo", - "bar", - 28.9, - 28.9, - 28.9 - ], - [ - "2020-09-01 00:19:12", - "v1", - "foo", - "bar", - 0.1, - 0.1, - 0.1 + "tsdf": { + "ts_col": "event_ts", + "partition_cols": ["identifier_1", "identifier_2", "identifier_3"] + }, + "df": { + "schema": "event_ts STRING NOT NULL, identifier_1 STRING NOT NULL, identifier_2 STRING NOT NULL, identifier_3 STRING NOT NULL, metric_1 FLOAT NOT NULL, metric_2 FLOAT NOT NULL, metric_3 FLOAT NOT NULL", + "ts_convert": [ + "event_ts" + ], + "data": [ + [ + "2020-08-01 00:00:09", + "v1", + "foo", + "bar", + 4.1, + 4.1, + 4.1 + ], + [ + "2020-08-01 00:00:10", + "v1", + "foo", + "bar", + 4.1, + 4.1, + 4.1 + ], + [ + "2020-08-01 00:00:11", + "v1", + "foo", + "bar", + 5.0, + 5.0, + 5.0 + ], + [ + "2020-08-01 00:01:12", + "v1", + "foo", + "bar", + 10.7, + 10.7, + 10.7 + ], + [ + "2020-08-01 00:01:13", + "v1", + "foo", + "bar", + 10.7, + 10.7, + 10.7 + ], + [ + "2020-08-01 00:01:14", + "v1", + "foo", + "bar", + 10.7, + 10.7, + 10.7 + ], + [ + "2020-08-01 00:01:15", + "v1", + "foo", + "bar", + 42.3, + 42.3, + 42.3 + ], + [ + "2020-08-01 00:01:16", + "v1", + "foo", + "bar", + 37.6, + 37.6, + 37.6 + ], + [ + "2020-08-01 00:01:17", + "v1", + "foo", + "bar", + 61.5, + 61.5, + 61.5 + ], + [ + "2020-09-01 00:01:12", + "v1", + "foo", + "bar", + 28.9, + 28.9, + 28.9 + ], + [ + "2020-09-01 00:19:12", + "v1", + "foo", + "bar", + 0.1, + 0.1, + 0.1 + ] ] - ] + 
} }, "expected": { - "schema": "start_ts STRING NOT NULL, end_ts STRING NOT NULL,identifier_1 STRING NOT NULL,identifier_2 STRING NOT NULL,identifier_3 STRING NOT NULL", - "other_ts_cols": [ - "start_ts", - "end_ts" - ], - "data": [ - [ - "2020-08-01 00:00:10", - "2020-08-01 00:01:12", - "v1", - "foo", - "bar" - ], - [ - "2020-08-01 00:01:14", - "2020-09-01 00:19:12", - "v1", - "foo", - "bar" + "df": { + "schema": "start_ts STRING NOT NULL, end_ts STRING NOT NULL,identifier_1 STRING NOT NULL,identifier_2 STRING NOT NULL,identifier_3 STRING NOT NULL", + "ts_convert": [ + "start_ts", + "end_ts" + ], + "data": [ + [ + "2020-08-01 00:00:10", + "2020-08-01 00:01:12", + "v1", + "foo", + "bar" + ], + [ + "2020-08-01 00:01:14", + "2020-09-01 00:19:12", + "v1", + "foo", + "bar" + ] ] - ] + } } }, "test_ne_1": { "input": { - "schema": "event_ts STRING NOT NULL, identifier_1 STRING NOT NULL, identifier_2 STRING NOT NULL, identifier_3 STRING NOT NULL, metric_1 FLOAT NOT NULL, metric_2 FLOAT NOT NULL, metric_3 FLOAT NOT NULL", - "ts_col": "event_ts", - "partition_cols": [ - "identifier_1", - "identifier_2", - "identifier_3" - ], - "data": [ - [ - "2020-08-01 00:00:09", - "v1", - "foo", - "bar", - 4.1, - 4.1, - 4.1 - ], - [ - "2020-08-01 00:00:10", - "v1", - "foo", - "bar", - 4.1, - 4.0, - 4.2 - ], - [ - "2020-08-01 00:00:11", - "v1", - "foo", - "bar", - 4.3, - 4.1, - 4.7 + "tsdf": { + "ts_col": "event_ts", + "partition_cols": ["identifier_1", "identifier_2", "identifier_3"] + }, + "df": { + "schema": "event_ts STRING NOT NULL, identifier_1 STRING NOT NULL, identifier_2 STRING NOT NULL, identifier_3 STRING NOT NULL, metric_1 FLOAT NOT NULL, metric_2 FLOAT NOT NULL, metric_3 FLOAT NOT NULL", + "ts_convert": [ + "event_ts" + ], + "data": [ + [ + "2020-08-01 00:00:09", + "v1", + "foo", + "bar", + 4.1, + 4.1, + 4.1 + ], + [ + "2020-08-01 00:00:10", + "v1", + "foo", + "bar", + 4.1, + 4.0, + 4.2 + ], + [ + "2020-08-01 00:00:11", + "v1", + "foo", + "bar", + 4.3, + 4.1, + 4.7 + ] ] - ] 
+ } }, "expected": { - "schema": "start_ts STRING NOT NULL, end_ts STRING NOT NULL,identifier_1 STRING NOT NULL,identifier_2 STRING NOT NULL,identifier_3 STRING NOT NULL", - "other_ts_cols": [ - "start_ts", - "end_ts" - ], - "data": [ - [ - "2020-08-01 00:00:10", - "2020-08-01 00:00:11", - "v1", - "foo", - "bar" + "df": { + "schema": "start_ts STRING NOT NULL, end_ts STRING NOT NULL,identifier_1 STRING NOT NULL,identifier_2 STRING NOT NULL,identifier_3 STRING NOT NULL", + "ts_convert": [ + "start_ts", + "end_ts" + ], + "data": [ + [ + "2020-08-01 00:00:10", + "2020-08-01 00:00:11", + "v1", + "foo", + "bar" + ] ] - ] + } } }, "test_gt_0": { "input": { - "schema": "event_ts STRING NOT NULL, identifier_1 STRING NOT NULL, identifier_2 STRING NOT NULL, identifier_3 STRING NOT NULL, metric_1 FLOAT NOT NULL, metric_2 FLOAT NOT NULL, metric_3 FLOAT NOT NULL", - "ts_col": "event_ts", - "partition_cols": [ - "identifier_1", - "identifier_2", - "identifier_3" - ], - "data": [ - [ - "2020-08-01 00:00:09", - "v1", - "foo", - "bar", - 4.1, - 4.1, - 4.1 - ], - [ - "2020-08-01 00:00:10", - "v1", - "foo", - "bar", - 4.1, - 4.1, - 4.1 - ], - [ - "2020-08-01 00:00:11", - "v1", - "foo", - "bar", - 5.0, - 5.0, - 5.0 - ], - [ - "2020-08-01 00:01:12", - "v1", - "foo", - "bar", - 10.7, - 10.7, - 10.7 - ], - [ - "2020-08-01 00:01:13", - "v1", - "foo", - "bar", - 10.7, - 10.7, - 10.7 - ], - [ - "2020-08-01 00:01:14", - "v1", - "foo", - "bar", - 10.7, - 10.7, - 10.7 - ], - [ - "2020-08-01 00:01:15", - "v1", - "foo", - "bar", - 42.3, - 42.3, - 42.3 - ], - [ - "2020-08-01 00:01:16", - "v1", - "foo", - "bar", - 37.6, - 37.6, - 37.6 - ], - [ - "2020-08-01 00:01:17", - "v1", - "foo", - "bar", - 61.5, - 61.5, - 61.5 - ], - [ - "2020-09-01 00:01:12", - "v1", - "foo", - "bar", - 28.9, - 28.9, - 28.9 - ], - [ - "2020-09-01 00:19:12", - "v1", - "foo", - "bar", - 0.1, - 0.1, - 0.1 + "tsdf": { + "ts_col": "event_ts", + "partition_cols": ["identifier_1", "identifier_2", "identifier_3"] + }, + "df": { + 
"schema": "event_ts STRING NOT NULL, identifier_1 STRING NOT NULL, identifier_2 STRING NOT NULL, identifier_3 STRING NOT NULL, metric_1 FLOAT NOT NULL, metric_2 FLOAT NOT NULL, metric_3 FLOAT NOT NULL", + "ts_convert": [ + "event_ts" + ], + "data": [ + [ + "2020-08-01 00:00:09", + "v1", + "foo", + "bar", + 4.1, + 4.1, + 4.1 + ], + [ + "2020-08-01 00:00:10", + "v1", + "foo", + "bar", + 4.1, + 4.1, + 4.1 + ], + [ + "2020-08-01 00:00:11", + "v1", + "foo", + "bar", + 5.0, + 5.0, + 5.0 + ], + [ + "2020-08-01 00:01:12", + "v1", + "foo", + "bar", + 10.7, + 10.7, + 10.7 + ], + [ + "2020-08-01 00:01:13", + "v1", + "foo", + "bar", + 10.7, + 10.7, + 10.7 + ], + [ + "2020-08-01 00:01:14", + "v1", + "foo", + "bar", + 10.7, + 10.7, + 10.7 + ], + [ + "2020-08-01 00:01:15", + "v1", + "foo", + "bar", + 42.3, + 42.3, + 42.3 + ], + [ + "2020-08-01 00:01:16", + "v1", + "foo", + "bar", + 37.6, + 37.6, + 37.6 + ], + [ + "2020-08-01 00:01:17", + "v1", + "foo", + "bar", + 61.5, + 61.5, + 61.5 + ], + [ + "2020-09-01 00:01:12", + "v1", + "foo", + "bar", + 28.9, + 28.9, + 28.9 + ], + [ + "2020-09-01 00:19:12", + "v1", + "foo", + "bar", + 0.1, + 0.1, + 0.1 + ] ] - ] + } }, "expected": { - "schema": "start_ts STRING NOT NULL, end_ts STRING NOT NULL,identifier_1 STRING NOT NULL,identifier_2 STRING NOT NULL,identifier_3 STRING NOT NULL", - "other_ts_cols": [ - "start_ts", - "end_ts" - ], - "data": [ - [ - "2020-08-01 00:00:10", - "2020-08-01 00:01:12", - "v1", - "foo", - "bar" - ], - [ - "2020-08-01 00:01:14", - "2020-08-01 00:01:15", - "v1", - "foo", - "bar" - ], - [ - "2020-08-01 00:01:16", - "2020-08-01 00:01:17", - "v1", - "foo", - "bar" + "df": { + "schema": "start_ts STRING NOT NULL, end_ts STRING NOT NULL,identifier_1 STRING NOT NULL,identifier_2 STRING NOT NULL,identifier_3 STRING NOT NULL", + "ts_convert": [ + "start_ts", + "end_ts" + ], + "data": [ + [ + "2020-08-01 00:00:10", + "2020-08-01 00:01:12", + "v1", + "foo", + "bar" + ], + [ + "2020-08-01 00:01:14", + "2020-08-01 00:01:15", + 
"v1", + "foo", + "bar" + ], + [ + "2020-08-01 00:01:16", + "2020-08-01 00:01:17", + "v1", + "foo", + "bar" + ] ] - ] + } } }, "test_gt_1": { - "input": { - "schema": "event_ts STRING NOT NULL, identifier_1 STRING NOT NULL, identifier_2 STRING NOT NULL, identifier_3 STRING NOT NULL, metric_1 FLOAT NOT NULL, metric_2 FLOAT NOT NULL, metric_3 FLOAT NOT NULL", - "ts_col": "event_ts", - "partition_cols": [ - "identifier_1", - "identifier_2", - "identifier_3" - ], - "data": [ - [ - "2020-08-01 00:00:09", - "v1", - "foo", - "bar", - 4.3, - 4.1, - 4.7 - ], - [ - "2020-08-01 00:00:10", - "v1", - "foo", - "bar", - 4.4, - 4.0, - 4.6 - ], - [ - "2020-08-01 00:00:11", - "v1", - "foo", - "bar", - 4.5, - 4.1, - 4.7 - ] - ] - }, - "expected": { - "schema": "start_ts STRING NOT NULL, end_ts STRING NOT NULL,identifier_1 STRING NOT NULL,identifier_2 STRING NOT NULL,identifier_3 STRING NOT NULL", - "other_ts_cols": [ - "start_ts", - "end_ts" - ], - "data": [ - [ - "2020-08-01 00:00:10", - "2020-08-01 00:00:11", - "v1", - "foo", - "bar" - ] - ] - } - }, - "test_lt_0": { - "input": { - "schema": "event_ts STRING NOT NULL, identifier_1 STRING NOT NULL, identifier_2 STRING NOT NULL, identifier_3 STRING NOT NULL, metric_1 FLOAT NOT NULL, metric_2 FLOAT NOT NULL, metric_3 FLOAT NOT NULL", - "ts_col": "event_ts", - "partition_cols": [ - "identifier_1", - "identifier_2", - "identifier_3" - ], - "data": [ - [ - "2020-08-01 00:00:09", - "v1", - "foo", - "bar", - 4.1, - 4.1, - 4.1 - ], - [ - "2020-08-01 00:00:10", - "v1", - "foo", - "bar", - 4.1, - 4.1, - 4.1 - ], - [ - "2020-08-01 00:00:11", - "v1", - "foo", - "bar", - 5.0, - 5.0, - 5.0 - ], - [ - "2020-08-01 00:01:12", - "v1", - "foo", - "bar", - 10.7, - 10.7, - 10.7 - ], - [ - "2020-08-01 00:01:13", - "v1", - "foo", - "bar", - 10.7, - 10.7, - 10.7 - ], - [ - "2020-08-01 00:01:14", - "v1", - "foo", - "bar", - 10.7, - 10.7, - 10.7 - ], - [ - "2020-08-01 00:01:15", - "v1", - "foo", - "bar", - 42.3, - 42.3, - 42.3 - ], - [ - "2020-08-01 
00:01:16", - "v1", - "foo", - "bar", - 37.6, - 37.6, - 37.6 - ], - [ - "2020-08-01 00:01:17", - "v1", - "foo", - "bar", - 61.5, - 61.5, - 61.5 - ], - [ - "2020-09-01 00:01:12", - "v1", - "foo", - "bar", - 28.9, - 28.9, - 28.9 - ], - [ - "2020-09-01 00:19:12", - "v1", - "foo", - "bar", - 0.1, - 0.1, - 0.1 - ] - ] - }, - "expected": { - "schema": "start_ts STRING NOT NULL, end_ts STRING NOT NULL,identifier_1 STRING NOT NULL,identifier_2 STRING NOT NULL,identifier_3 STRING NOT NULL", - "other_ts_cols": [ - "start_ts", - "end_ts" - ], - "data": [ - [ - "2020-08-01 00:01:15", - "2020-08-01 00:01:16", - "v1", - "foo", - "bar" - ], - [ - "2020-08-01 00:01:17", - "2020-09-01 00:19:12", - "v1", - "foo", - "bar" - ] - ] - } - }, - "test_lt_1": { - "input": { - "schema": "event_ts STRING NOT NULL, identifier_1 STRING NOT NULL, identifier_2 STRING NOT NULL, identifier_3 STRING NOT NULL, metric_1 FLOAT NOT NULL, metric_2 FLOAT NOT NULL, metric_3 FLOAT NOT NULL", - "ts_col": "event_ts", - "partition_cols": [ - "identifier_1", - "identifier_2", - "identifier_3" - ], - "data": [ - [ - "2020-08-01 00:00:09", - "v1", - "foo", - "bar", - 4.3, - 4.1, - 4.7 - ], - [ - "2020-08-01 00:00:10", - "v1", - "foo", - "bar", - 4.2, - 4.2, - 4.8 - ], - [ - "2020-08-01 00:00:11", - "v1", - "foo", - "bar", - 4.1, - 4.1, - 4.7 - ] - ] - }, - "expected": { - "schema": "start_ts STRING NOT NULL, end_ts STRING NOT NULL,identifier_1 STRING NOT NULL,identifier_2 STRING NOT NULL,identifier_3 STRING NOT NULL", - "other_ts_cols": [ - "start_ts", - "end_ts" - ], - "data": [ - [ - "2020-08-01 00:00:10", - "2020-08-01 00:00:11", - "v1", - "foo", - "bar" - ] - ] - } - }, - "test_gte_0": { - "input": { - "schema": "event_ts STRING NOT NULL, identifier_1 STRING NOT NULL, identifier_2 STRING NOT NULL, identifier_3 STRING NOT NULL, metric_1 FLOAT NOT NULL, metric_2 FLOAT NOT NULL, metric_3 FLOAT NOT NULL", - "ts_col": "event_ts", - "partition_cols": [ - "identifier_1", - "identifier_2", - "identifier_3" - ], - 
"data": [ - [ - "2020-08-01 00:00:09", - "v1", - "foo", - "bar", - 4.1, - 4.1, - 4.1 - ], - [ - "2020-08-01 00:00:10", - "v1", - "foo", - "bar", - 4.1, - 4.1, - 4.1 - ], - [ - "2020-08-01 00:00:11", - "v1", - "foo", - "bar", - 5.0, - 5.0, - 5.0 - ], - [ - "2020-08-01 00:01:12", - "v1", - "foo", - "bar", - 10.7, - 10.7, - 10.7 - ], - [ - "2020-08-01 00:01:13", - "v1", - "foo", - "bar", - 10.7, - 10.7, - 10.7 - ], - [ - "2020-08-01 00:01:14", - "v1", - "foo", - "bar", - 10.7, - 10.7, - 10.7 - ], - [ - "2020-08-01 00:01:15", - "v1", - "foo", - "bar", - 42.3, - 42.3, - 42.3 - ], - [ - "2020-08-01 00:01:16", - "v1", - "foo", - "bar", - 37.6, - 37.6, - 37.6 - ], - [ - "2020-08-01 00:01:17", - "v1", - "foo", - "bar", - 61.5, - 61.5, - 61.5 - ], - [ - "2020-09-01 00:01:12", - "v1", - "foo", - "bar", - 28.9, - 28.9, - 28.9 - ], - [ - "2020-09-01 00:19:12", - "v1", - "foo", - "bar", - 0.1, - 0.1, - 0.1 - ] - ] - }, - "expected": { - "schema": "start_ts STRING NOT NULL, end_ts STRING NOT NULL,identifier_1 STRING NOT NULL,identifier_2 STRING NOT NULL,identifier_3 STRING NOT NULL", - "other_ts_cols": [ - "start_ts", - "end_ts" - ], - "data": [ - [ - "2020-08-01 00:00:09", - "2020-08-01 00:01:15", - "v1", - "foo", - "bar" - ], - [ - "2020-08-01 00:01:16", - "2020-08-01 00:01:17", - "v1", - "foo", - "bar" - ] - ] - } - }, - "test_gte_1": { - "input": { - "schema": "event_ts STRING NOT NULL, identifier_1 STRING NOT NULL, identifier_2 STRING NOT NULL, identifier_3 STRING NOT NULL, metric_1 FLOAT NOT NULL, metric_2 FLOAT NOT NULL, metric_3 FLOAT NOT NULL", - "ts_col": "event_ts", - "partition_cols": [ - "identifier_1", - "identifier_2", - "identifier_3" - ], - "data": [ - [ - "2020-08-01 00:00:09", - "v1", - "foo", - "bar", - 4.3, - 4.1, - 4.7 - ], - [ - "2020-08-01 00:00:10", - "v1", - "foo", - "bar", - 4.4, - 4.0, - 4.6 - ], - [ - "2020-08-01 00:00:11", - "v1", - "foo", - "bar", - 4.5, - 4.0, - 4.7 - ] - ] - }, - "expected": { - "schema": "start_ts STRING NOT NULL, end_ts STRING 
NOT NULL,identifier_1 STRING NOT NULL,identifier_2 STRING NOT NULL,identifier_3 STRING NOT NULL", - "other_ts_cols": [ - "start_ts", - "end_ts" - ], - "data": [ - [ - "2020-08-01 00:00:10", - "2020-08-01 00:00:11", - "v1", - "foo", - "bar" - ] - ] - } - }, - "test_lte_0": { - "input": { - "schema": "event_ts STRING NOT NULL, identifier_1 STRING NOT NULL, identifier_2 STRING NOT NULL, identifier_3 STRING NOT NULL, metric_1 FLOAT NOT NULL, metric_2 FLOAT NOT NULL, metric_3 FLOAT NOT NULL", - "ts_col": "event_ts", - "partition_cols": [ - "identifier_1", - "identifier_2", - "identifier_3" - ], - "data": [ - [ - "2020-08-01 00:00:09", - "v1", - "foo", - "bar", - 4.1, - 4.1, - 4.1 - ], - [ - "2020-08-01 00:00:10", - "v1", - "foo", - "bar", - 4.1, - 4.1, - 4.1 - ], - [ - "2020-08-01 00:00:11", - "v1", - "foo", - "bar", - 5.0, - 5.0, - 5.0 - ], - [ - "2020-08-01 00:01:12", - "v1", - "foo", - "bar", - 10.7, - 10.7, - 10.7 - ], - [ - "2020-08-01 00:01:13", - "v1", - "foo", - "bar", - 10.7, - 10.7, - 10.7 - ], - [ - "2020-08-01 00:01:14", - "v1", - "foo", - "bar", - 10.7, - 10.7, - 10.7 - ], - [ - "2020-08-01 00:01:15", - "v1", - "foo", - "bar", - 42.3, - 42.3, - 42.3 - ], - [ - "2020-08-01 00:01:16", - "v1", - "foo", - "bar", - 37.6, - 37.6, - 37.6 - ], - [ - "2020-08-01 00:01:17", - "v1", - "foo", - "bar", - 61.5, - 61.5, - 61.5 - ], - [ - "2020-09-01 00:01:12", - "v1", - "foo", - "bar", - 28.9, - 28.9, - 28.9 - ], - [ - "2020-09-01 00:19:12", - "v1", - "foo", - "bar", - 0.1, - 0.1, - 0.1 - ] - ] - }, - "expected": { - "schema": "start_ts STRING NOT NULL, end_ts STRING NOT NULL,identifier_1 STRING NOT NULL,identifier_2 STRING NOT NULL,identifier_3 STRING NOT NULL", - "other_ts_cols": [ - "start_ts", - "end_ts" - ], - "data": [ - [ - "2020-08-01 00:00:09", - "2020-08-01 00:00:10", - "v1", - "foo", - "bar" - ], - [ - "2020-08-01 00:01:12", - "2020-08-01 00:01:14", - "v1", - "foo", - "bar" - ], - [ - "2020-08-01 00:01:15", - "2020-08-01 00:01:16", - "v1", - "foo", - "bar" - ], 
- [ - "2020-08-01 00:01:17", - "2020-09-01 00:19:12", - "v1", - "foo", - "bar" - ] - ] - } - }, - "test_lte_1": { - "input": { - "schema": "event_ts STRING NOT NULL, identifier_1 STRING NOT NULL, identifier_2 STRING NOT NULL, identifier_3 STRING NOT NULL, metric_1 FLOAT NOT NULL, metric_2 FLOAT NOT NULL, metric_3 FLOAT NOT NULL", - "ts_col": "event_ts", - "partition_cols": [ - "identifier_1", - "identifier_2", - "identifier_3" - ], - "data": [ - [ - "2020-08-01 00:00:09", - "v1", - "foo", - "bar", - 4.3, - 4.1, - 4.7 - ], - [ - "2020-08-01 00:00:10", - "v1", - "foo", - "bar", - 4.2, - 4.2, - 4.8 - ], - [ - "2020-08-01 00:00:11", - "v1", - "foo", - "bar", - 4.1, - 4.2, - 4.7 - ] - ] - }, - "expected": { - "schema": "start_ts STRING NOT NULL, end_ts STRING NOT NULL,identifier_1 STRING NOT NULL,identifier_2 STRING NOT NULL,identifier_3 STRING NOT NULL", - "other_ts_cols": [ - "start_ts", - "end_ts" - ], - "data": [ - [ - "2020-08-01 00:00:10", - "2020-08-01 00:00:11", - "v1", - "foo", - "bar" - ] - ] - } - }, - "test_threshold_fn": { - "input": { - "schema": "event_ts STRING NOT NULL, identifier_1 STRING NOT NULL, identifier_2 STRING NOT NULL, identifier_3 STRING NOT NULL, metric_1 FLOAT NOT NULL, metric_2 FLOAT NOT NULL, metric_3 FLOAT NOT NULL", - "ts_col": "event_ts", - "partition_cols": [ - "identifier_1", - "identifier_2", - "identifier_3" - ], - "data": [ - [ - "2020-08-01 00:00:09", - "v1", - "foo", - "bar", - 4.1, - 4.1, - 4.1 - ], - [ - "2020-08-01 00:00:10", - "v1", - "foo", - "bar", - 4.1, - 4.1, - 4.1 - ], - [ - "2020-08-01 00:00:11", - "v1", - "foo", - "bar", - 5.0, - 5.0, - 5.0 - ], - [ - "2020-08-01 00:01:12", - "v1", - "foo", - "bar", - 10.7, - 10.7, - 10.7 - ], - [ - "2020-08-01 00:01:13", - "v1", - "foo", - "bar", - 10.7, - 10.7, - 10.7 - ], - [ - "2020-08-01 00:01:14", - "v1", - "foo", - "bar", - 10.7, - 10.7, - 10.7 - ], - [ - "2020-08-01 00:01:15", - "v1", - "foo", - "bar", - 42.3, - 42.3, - 42.3 - ], - [ - "2020-08-01 00:01:16", - "v1", - "foo", 
- "bar", - 37.6, - 37.6, - 37.6 - ], - [ - "2020-08-01 00:01:17", - "v1", - "foo", - "bar", - 61.5, - 61.5, - 61.5 - ], - [ - "2020-09-01 00:01:12", - "v1", - "foo", - "bar", - 28.9, - 28.9, - 28.9 - ], - [ - "2020-09-01 00:19:12", - "v1", - "foo", - "bar", - 0.1, - 0.1, - 0.1 - ] - ] - }, - "expected": { - "schema": "start_ts: STRING, end_ts: STRING, identifier_1 STRING NOT NULL, identifier_2 STRING NOT NULL ,identifier_3 STRING NOT NULL", - "other_ts_cols": [ - "start_ts", - "end_ts" - ], - "data": [ - [ - "2020-08-01 00:00:09", - "2020-08-01 00:00:10", - "v1", - "foo", - "bar" - ], - [ - "2020-08-01 00:01:12", - "2020-08-01 00:01:14", - "v1", - "foo", - "bar" - ] - ] - } - }, - "test_null_safe_eq_0": { - "input": { - "schema": "event_ts STRING NOT NULL, identifier_1 STRING NOT NULL, identifier_2 STRING NOT NULL, identifier_3 STRING NOT NULL, metric_1 FLOAT, metric_2 FLOAT, metric_3 FLOAT", - "ts_col": "event_ts", - "partition_cols": [ - "identifier_1", - "identifier_2", - "identifier_3" - ], - "data": [ - [ - "2020-08-01 00:00:09", - "v1", - "foo", - "bar", - 4.1, - null, - 4.1 - ], - [ - "2020-08-01 00:00:10", - "v1", - "foo", - "bar", - 4.1, - null, - 4.1 - ], - [ - "2020-08-01 00:00:11", - "v1", - "foo", - "bar", - 5.0, - 5.0, - 5.0 - ], - [ - "2020-08-01 00:01:12", - "v1", - "foo", - "bar", - null, - 10.7, - 10.7 - ], - [ - "2020-08-01 00:01:13", - "v1", - "foo", - "bar", - null, - 10.7, - 10.7 - ], - [ - "2020-08-01 00:01:14", - "v1", - "foo", - "bar", - null, - 10.7, - 10.7 - ], - [ - "2020-08-01 00:01:15", - "v1", - "foo", - "bar", - 42.3, - 42.3, - 42.3 - ], - [ - "2020-08-01 00:01:16", - "v1", - "foo", - "bar", - 37.6, - 37.6, - 37.6 - ], - [ - "2020-08-01 00:01:17", - "v1", - "foo", - "bar", - 61.5, - 61.5, - 61.5 - ], - [ - "2020-09-01 00:01:12", - "v1", - "foo", - "bar", - 28.9, - 28.9, - 28.9 - ], - [ - "2020-09-01 00:19:12", - "v1", - "foo", - "bar", - 0.1, - 0.1, - 0.1 + "input": { + "tsdf": { + "ts_col": "event_ts", + "partition_cols": 
["identifier_1", "identifier_2", "identifier_3"] + }, + "df": { + "schema": "event_ts STRING NOT NULL, identifier_1 STRING NOT NULL, identifier_2 STRING NOT NULL, identifier_3 STRING NOT NULL, metric_1 FLOAT NOT NULL, metric_2 FLOAT NOT NULL, metric_3 FLOAT NOT NULL", + "ts_convert": [ + "event_ts" + ], + "data": [ + [ + "2020-08-01 00:00:09", + "v1", + "foo", + "bar", + 4.3, + 4.1, + 4.7 + ], + [ + "2020-08-01 00:00:10", + "v1", + "foo", + "bar", + 4.4, + 4.0, + 4.6 + ], + [ + "2020-08-01 00:00:11", + "v1", + "foo", + "bar", + 4.5, + 4.1, + 4.7 + ] ] - ] + } + }, + "expected": { + "df": { + "schema": "start_ts STRING NOT NULL, end_ts STRING NOT NULL,identifier_1 STRING NOT NULL,identifier_2 STRING NOT NULL,identifier_3 STRING NOT NULL", + "ts_convert": [ + "start_ts", + "end_ts" + ], + "data": [ + [ + "2020-08-01 00:00:10", + "2020-08-01 00:00:11", + "v1", + "foo", + "bar" + ] + ] + } + } + }, + "test_lt_0": { + "input": { + "tsdf": { + "ts_col": "event_ts", + "partition_cols": ["identifier_1", "identifier_2", "identifier_3"] + }, + "df": { + "schema": "event_ts STRING NOT NULL, identifier_1 STRING NOT NULL, identifier_2 STRING NOT NULL, identifier_3 STRING NOT NULL, metric_1 FLOAT NOT NULL, metric_2 FLOAT NOT NULL, metric_3 FLOAT NOT NULL", + "ts_convert": [ + "event_ts" + ], + "data": [ + [ + "2020-08-01 00:00:09", + "v1", + "foo", + "bar", + 4.1, + 4.1, + 4.1 + ], + [ + "2020-08-01 00:00:10", + "v1", + "foo", + "bar", + 4.1, + 4.1, + 4.1 + ], + [ + "2020-08-01 00:00:11", + "v1", + "foo", + "bar", + 5.0, + 5.0, + 5.0 + ], + [ + "2020-08-01 00:01:12", + "v1", + "foo", + "bar", + 10.7, + 10.7, + 10.7 + ], + [ + "2020-08-01 00:01:13", + "v1", + "foo", + "bar", + 10.7, + 10.7, + 10.7 + ], + [ + "2020-08-01 00:01:14", + "v1", + "foo", + "bar", + 10.7, + 10.7, + 10.7 + ], + [ + "2020-08-01 00:01:15", + "v1", + "foo", + "bar", + 42.3, + 42.3, + 42.3 + ], + [ + "2020-08-01 00:01:16", + "v1", + "foo", + "bar", + 37.6, + 37.6, + 37.6 + ], + [ + "2020-08-01 00:01:17", + 
"v1", + "foo", + "bar", + 61.5, + 61.5, + 61.5 + ], + [ + "2020-09-01 00:01:12", + "v1", + "foo", + "bar", + 28.9, + 28.9, + 28.9 + ], + [ + "2020-09-01 00:19:12", + "v1", + "foo", + "bar", + 0.1, + 0.1, + 0.1 + ] + ] + } + }, + "expected": { + "df": { + "schema": "start_ts STRING NOT NULL, end_ts STRING NOT NULL,identifier_1 STRING NOT NULL,identifier_2 STRING NOT NULL,identifier_3 STRING NOT NULL", + "ts_convert": [ + "start_ts", + "end_ts" + ], + "data": [ + [ + "2020-08-01 00:01:15", + "2020-08-01 00:01:16", + "v1", + "foo", + "bar" + ], + [ + "2020-08-01 00:01:17", + "2020-09-01 00:19:12", + "v1", + "foo", + "bar" + ] + ] + } + } + }, + "test_lt_1": { + "input": { + "tsdf": { + "ts_col": "event_ts", + "partition_cols": ["identifier_1", "identifier_2", "identifier_3"] + }, + "df": { + "schema": "event_ts STRING NOT NULL, identifier_1 STRING NOT NULL, identifier_2 STRING NOT NULL, identifier_3 STRING NOT NULL, metric_1 FLOAT NOT NULL, metric_2 FLOAT NOT NULL, metric_3 FLOAT NOT NULL", + "ts_convert": [ + "event_ts" + ], + "data": [ + [ + "2020-08-01 00:00:09", + "v1", + "foo", + "bar", + 4.3, + 4.1, + 4.7 + ], + [ + "2020-08-01 00:00:10", + "v1", + "foo", + "bar", + 4.2, + 4.2, + 4.8 + ], + [ + "2020-08-01 00:00:11", + "v1", + "foo", + "bar", + 4.1, + 4.1, + 4.7 + ] + ] + } + }, + "expected": { + "df": { + "schema": "start_ts STRING NOT NULL, end_ts STRING NOT NULL,identifier_1 STRING NOT NULL,identifier_2 STRING NOT NULL,identifier_3 STRING NOT NULL", + "ts_convert": [ + "start_ts", + "end_ts" + ], + "data": [ + [ + "2020-08-01 00:00:10", + "2020-08-01 00:00:11", + "v1", + "foo", + "bar" + ] + ] + } + } + }, + "test_gte_0": { + "input": { + "tsdf": { + "ts_col": "event_ts", + "partition_cols": ["identifier_1", "identifier_2", "identifier_3"] + }, + "df": { + "schema": "event_ts STRING NOT NULL, identifier_1 STRING NOT NULL, identifier_2 STRING NOT NULL, identifier_3 STRING NOT NULL, metric_1 FLOAT NOT NULL, metric_2 FLOAT NOT NULL, metric_3 FLOAT NOT NULL", + 
"ts_convert": [ + "event_ts" + ], + "data": [ + [ + "2020-08-01 00:00:09", + "v1", + "foo", + "bar", + 4.1, + 4.1, + 4.1 + ], + [ + "2020-08-01 00:00:10", + "v1", + "foo", + "bar", + 4.1, + 4.1, + 4.1 + ], + [ + "2020-08-01 00:00:11", + "v1", + "foo", + "bar", + 5.0, + 5.0, + 5.0 + ], + [ + "2020-08-01 00:01:12", + "v1", + "foo", + "bar", + 10.7, + 10.7, + 10.7 + ], + [ + "2020-08-01 00:01:13", + "v1", + "foo", + "bar", + 10.7, + 10.7, + 10.7 + ], + [ + "2020-08-01 00:01:14", + "v1", + "foo", + "bar", + 10.7, + 10.7, + 10.7 + ], + [ + "2020-08-01 00:01:15", + "v1", + "foo", + "bar", + 42.3, + 42.3, + 42.3 + ], + [ + "2020-08-01 00:01:16", + "v1", + "foo", + "bar", + 37.6, + 37.6, + 37.6 + ], + [ + "2020-08-01 00:01:17", + "v1", + "foo", + "bar", + 61.5, + 61.5, + 61.5 + ], + [ + "2020-09-01 00:01:12", + "v1", + "foo", + "bar", + 28.9, + 28.9, + 28.9 + ], + [ + "2020-09-01 00:19:12", + "v1", + "foo", + "bar", + 0.1, + 0.1, + 0.1 + ] + ] + } }, "expected": { + "df": { "schema": "start_ts STRING NOT NULL, end_ts STRING NOT NULL,identifier_1 STRING NOT NULL,identifier_2 STRING NOT NULL,identifier_3 STRING NOT NULL", - "other_ts_cols": [ + "ts_convert": [ "start_ts", "end_ts" ], "data": [ [ "2020-08-01 00:00:09", - "2020-08-01 00:00:10", + "2020-08-01 00:01:15", "v1", "foo", "bar" ], [ - "2020-08-01 00:01:12", - "2020-08-01 00:01:14", + "2020-08-01 00:01:16", + "2020-08-01 00:01:17", "v1", "foo", "bar" ] ] + } + } + }, + "test_gte_1": { + "input": { + "tsdf": { + "ts_col": "event_ts", + "partition_cols": ["identifier_1", "identifier_2", "identifier_3"] + }, + "df": { + "schema": "event_ts STRING NOT NULL, identifier_1 STRING NOT NULL, identifier_2 STRING NOT NULL, identifier_3 STRING NOT NULL, metric_1 FLOAT NOT NULL, metric_2 FLOAT NOT NULL, metric_3 FLOAT NOT NULL", + "ts_convert": [ + "event_ts" + ], + "data": [ + [ + "2020-08-01 00:00:09", + "v1", + "foo", + "bar", + 4.3, + 4.1, + 4.7 + ], + [ + "2020-08-01 00:00:10", + "v1", + "foo", + "bar", + 4.4, + 4.0, + 4.6 + 
], + [ + "2020-08-01 00:00:11", + "v1", + "foo", + "bar", + 4.5, + 4.0, + 4.7 + ] + ] + } + }, + "expected": { + "df": { + "schema": "start_ts STRING NOT NULL, end_ts STRING NOT NULL,identifier_1 STRING NOT NULL,identifier_2 STRING NOT NULL,identifier_3 STRING NOT NULL", + "ts_convert": [ + "start_ts", + "end_ts" + ], + "data": [ + [ + "2020-08-01 00:00:10", + "2020-08-01 00:00:11", + "v1", + "foo", + "bar" + ] + ] + } + } + }, + "test_lte_0": { + "input": { + "tsdf": { + "ts_col": "event_ts", + "partition_cols": ["identifier_1", "identifier_2", "identifier_3"] + }, + "df": { + "schema": "event_ts STRING NOT NULL, identifier_1 STRING NOT NULL, identifier_2 STRING NOT NULL, identifier_3 STRING NOT NULL, metric_1 FLOAT NOT NULL, metric_2 FLOAT NOT NULL, metric_3 FLOAT NOT NULL", + "ts_convert": [ + "event_ts" + ], + "data": [ + [ + "2020-08-01 00:00:09", + "v1", + "foo", + "bar", + 4.1, + 4.1, + 4.1 + ], + [ + "2020-08-01 00:00:10", + "v1", + "foo", + "bar", + 4.1, + 4.1, + 4.1 + ], + [ + "2020-08-01 00:00:11", + "v1", + "foo", + "bar", + 5.0, + 5.0, + 5.0 + ], + [ + "2020-08-01 00:01:12", + "v1", + "foo", + "bar", + 10.7, + 10.7, + 10.7 + ], + [ + "2020-08-01 00:01:13", + "v1", + "foo", + "bar", + 10.7, + 10.7, + 10.7 + ], + [ + "2020-08-01 00:01:14", + "v1", + "foo", + "bar", + 10.7, + 10.7, + 10.7 + ], + [ + "2020-08-01 00:01:15", + "v1", + "foo", + "bar", + 42.3, + 42.3, + 42.3 + ], + [ + "2020-08-01 00:01:16", + "v1", + "foo", + "bar", + 37.6, + 37.6, + 37.6 + ], + [ + "2020-08-01 00:01:17", + "v1", + "foo", + "bar", + 61.5, + 61.5, + 61.5 + ], + [ + "2020-09-01 00:01:12", + "v1", + "foo", + "bar", + 28.9, + 28.9, + 28.9 + ], + [ + "2020-09-01 00:19:12", + "v1", + "foo", + "bar", + 0.1, + 0.1, + 0.1 + ] + ] + } + }, + "expected": { + "df": { + "schema": "start_ts STRING NOT NULL, end_ts STRING NOT NULL,identifier_1 STRING NOT NULL,identifier_2 STRING NOT NULL,identifier_3 STRING NOT NULL", + "ts_convert": [ + "start_ts", + "end_ts" + ], + "data": [ + [ + 
"2020-08-01 00:00:09", + "2020-08-01 00:00:10", + "v1", + "foo", + "bar" + ], + [ + "2020-08-01 00:01:12", + "2020-08-01 00:01:14", + "v1", + "foo", + "bar" + ], + [ + "2020-08-01 00:01:15", + "2020-08-01 00:01:16", + "v1", + "foo", + "bar" + ], + [ + "2020-08-01 00:01:17", + "2020-09-01 00:19:12", + "v1", + "foo", + "bar" + ] + ] + } + } + }, + "test_lte_1": { + "input": { + "tsdf": { + "ts_col": "event_ts", + "partition_cols": ["identifier_1", "identifier_2", "identifier_3"] + }, + "df": { + "schema": "event_ts STRING NOT NULL, identifier_1 STRING NOT NULL, identifier_2 STRING NOT NULL, identifier_3 STRING NOT NULL, metric_1 FLOAT NOT NULL, metric_2 FLOAT NOT NULL, metric_3 FLOAT NOT NULL", + "ts_convert": [ + "event_ts" + ], + "data": [ + [ + "2020-08-01 00:00:09", + "v1", + "foo", + "bar", + 4.3, + 4.1, + 4.7 + ], + [ + "2020-08-01 00:00:10", + "v1", + "foo", + "bar", + 4.2, + 4.2, + 4.8 + ], + [ + "2020-08-01 00:00:11", + "v1", + "foo", + "bar", + 4.1, + 4.2, + 4.7 + ] + ] + } + }, + "expected": { + "df": { + "schema": "start_ts STRING NOT NULL, end_ts STRING NOT NULL,identifier_1 STRING NOT NULL,identifier_2 STRING NOT NULL,identifier_3 STRING NOT NULL", + "ts_convert": [ + "start_ts", + "end_ts" + ], + "data": [ + [ + "2020-08-01 00:00:10", + "2020-08-01 00:00:11", + "v1", + "foo", + "bar" + ] + ] + } + } + }, + "test_threshold_fn": { + "input": { + "tsdf": { + "ts_col": "event_ts", + "partition_cols": ["identifier_1", "identifier_2", "identifier_3"] + }, + "df": { + "schema": "event_ts STRING NOT NULL, identifier_1 STRING NOT NULL, identifier_2 STRING NOT NULL, identifier_3 STRING NOT NULL, metric_1 FLOAT NOT NULL, metric_2 FLOAT NOT NULL, metric_3 FLOAT NOT NULL", + "ts_convert": [ + "event_ts" + ], + "data": [ + [ + "2020-08-01 00:00:09", + "v1", + "foo", + "bar", + 4.1, + 4.1, + 4.1 + ], + [ + "2020-08-01 00:00:10", + "v1", + "foo", + "bar", + 4.1, + 4.1, + 4.1 + ], + [ + "2020-08-01 00:00:11", + "v1", + "foo", + "bar", + 5.0, + 5.0, + 5.0 + ], + [ + 
"2020-08-01 00:01:12", + "v1", + "foo", + "bar", + 10.7, + 10.7, + 10.7 + ], + [ + "2020-08-01 00:01:13", + "v1", + "foo", + "bar", + 10.7, + 10.7, + 10.7 + ], + [ + "2020-08-01 00:01:14", + "v1", + "foo", + "bar", + 10.7, + 10.7, + 10.7 + ], + [ + "2020-08-01 00:01:15", + "v1", + "foo", + "bar", + 42.3, + 42.3, + 42.3 + ], + [ + "2020-08-01 00:01:16", + "v1", + "foo", + "bar", + 37.6, + 37.6, + 37.6 + ], + [ + "2020-08-01 00:01:17", + "v1", + "foo", + "bar", + 61.5, + 61.5, + 61.5 + ], + [ + "2020-09-01 00:01:12", + "v1", + "foo", + "bar", + 28.9, + 28.9, + 28.9 + ], + [ + "2020-09-01 00:19:12", + "v1", + "foo", + "bar", + 0.1, + 0.1, + 0.1 + ] + ] + } + }, + "expected": { + "df": { + "schema": "start_ts: STRING, end_ts: STRING, identifier_1 STRING NOT NULL, identifier_2 STRING NOT NULL ,identifier_3 STRING NOT NULL", + "ts_convert": [ + "start_ts", + "end_ts" + ], + "data": [ + [ + "2020-08-01 00:00:09", + "2020-08-01 00:00:10", + "v1", + "foo", + "bar" + ], + [ + "2020-08-01 00:01:12", + "2020-08-01 00:01:14", + "v1", + "foo", + "bar" + ] + ] + } + } + }, + "test_null_safe_eq_0": { + "input": { + "tsdf": { + "ts_col": "event_ts", + "partition_cols": ["identifier_1", "identifier_2", "identifier_3"] + }, + "df": { + "schema": "event_ts STRING NOT NULL, identifier_1 STRING NOT NULL, identifier_2 STRING NOT NULL, identifier_3 STRING NOT NULL, metric_1 FLOAT, metric_2 FLOAT, metric_3 FLOAT", + "ts_convert": [ + "event_ts" + ], + "data": [ + [ + "2020-08-01 00:00:09", + "v1", + "foo", + "bar", + 4.1, + null, + 4.1 + ], + [ + "2020-08-01 00:00:10", + "v1", + "foo", + "bar", + 4.1, + null, + 4.1 + ], + [ + "2020-08-01 00:00:11", + "v1", + "foo", + "bar", + 5.0, + 5.0, + 5.0 + ], + [ + "2020-08-01 00:01:12", + "v1", + "foo", + "bar", + null, + 10.7, + 10.7 + ], + [ + "2020-08-01 00:01:13", + "v1", + "foo", + "bar", + null, + 10.7, + 10.7 + ], + [ + "2020-08-01 00:01:14", + "v1", + "foo", + "bar", + null, + 10.7, + 10.7 + ], + [ + "2020-08-01 00:01:15", + "v1", + "foo", + 
"bar", + 42.3, + 42.3, + 42.3 + ], + [ + "2020-08-01 00:01:16", + "v1", + "foo", + "bar", + 37.6, + 37.6, + 37.6 + ], + [ + "2020-08-01 00:01:17", + "v1", + "foo", + "bar", + 61.5, + 61.5, + 61.5 + ], + [ + "2020-09-01 00:01:12", + "v1", + "foo", + "bar", + 28.9, + 28.9, + 28.9 + ], + [ + "2020-09-01 00:19:12", + "v1", + "foo", + "bar", + 0.1, + 0.1, + 0.1 + ] + ] + } + }, + "expected": { + "df": { + "schema": "start_ts STRING NOT NULL, end_ts STRING NOT NULL,identifier_1 STRING NOT NULL,identifier_2 STRING NOT NULL,identifier_3 STRING NOT NULL", + "ts_convert": [ + "start_ts", + "end_ts" + ], + "data": [ + [ + "2020-08-01 00:00:09", + "2020-08-01 00:00:10", + "v1", + "foo", + "bar" + ], + [ + "2020-08-01 00:01:12", + "2020-08-01 00:01:14", + "v1", + "foo", + "bar" + ] + ] + } } }, "test_null_safe_eq_1": { "input": { - "schema": "event_ts STRING NOT NULL, identifier_1 STRING NOT NULL, identifier_2 STRING NOT NULL, identifier_3 STRING NOT NULL, metric_1 FLOAT, metric_2 FLOAT, metric_3 FLOAT", - "ts_col": "event_ts", - "partition_cols": [ - "identifier_1", - "identifier_2", - "identifier_3" - ], - "data": [ - [ - "2020-08-01 00:00:09", - "v1", - "foo", - "bar", - 4.1, - null, - 4.1 - ], - [ - "2020-08-01 00:00:10", - "v1", - "foo", - "bar", - 4.1, - 4.1, - null - ], - [ - "2020-08-01 00:00:11", - "v1", - "foo", - "bar", - 5.0, - 5.0, - 5.0 - ], - [ - "2020-08-01 00:01:12", - "v1", - "foo", - "bar", - null, - 10.7, - 10.7 - ], - [ - "2020-08-01 00:01:13", - "v1", - "foo", - "bar", - null, - 10.7, - 10.7 - ], - [ - "2020-08-01 00:01:14", - "v1", - "foo", - "bar", - 10.7, - null, - 10.7 - ], - [ - "2020-08-01 00:01:15", - "v1", - "foo", - "bar", - 42.3, - 42.3, - 42.3 - ], - [ - "2020-08-01 00:01:16", - "v1", - "foo", - "bar", - 37.6, - 37.6, - 37.6 - ], - [ - "2020-08-01 00:01:17", - "v1", - "foo", - "bar", - 61.5, - 61.5, - 61.5 - ], - [ - "2020-09-01 00:01:12", - "v1", - "foo", - "bar", - 28.9, - 28.9, - 28.9 - ], - [ - "2020-09-01 00:19:12", - "v1", - "foo", - 
"bar", - 0.1, - 0.1, - 0.1 + "tsdf": { + "ts_col": "event_ts", + "partition_cols": ["identifier_1", "identifier_2", "identifier_3"] + }, + "df": { + "schema": "event_ts STRING NOT NULL, identifier_1 STRING NOT NULL, identifier_2 STRING NOT NULL, identifier_3 STRING NOT NULL, metric_1 FLOAT, metric_2 FLOAT, metric_3 FLOAT", + "ts_convert": [ + "event_ts" + ], + "data": [ + [ + "2020-08-01 00:00:09", + "v1", + "foo", + "bar", + 4.1, + null, + 4.1 + ], + [ + "2020-08-01 00:00:10", + "v1", + "foo", + "bar", + 4.1, + 4.1, + null + ], + [ + "2020-08-01 00:00:11", + "v1", + "foo", + "bar", + 5.0, + 5.0, + 5.0 + ], + [ + "2020-08-01 00:01:12", + "v1", + "foo", + "bar", + null, + 10.7, + 10.7 + ], + [ + "2020-08-01 00:01:13", + "v1", + "foo", + "bar", + null, + 10.7, + 10.7 + ], + [ + "2020-08-01 00:01:14", + "v1", + "foo", + "bar", + 10.7, + null, + 10.7 + ], + [ + "2020-08-01 00:01:15", + "v1", + "foo", + "bar", + 42.3, + 42.3, + 42.3 + ], + [ + "2020-08-01 00:01:16", + "v1", + "foo", + "bar", + 37.6, + 37.6, + 37.6 + ], + [ + "2020-08-01 00:01:17", + "v1", + "foo", + "bar", + 61.5, + 61.5, + 61.5 + ], + [ + "2020-09-01 00:01:12", + "v1", + "foo", + "bar", + 28.9, + 28.9, + 28.9 + ], + [ + "2020-09-01 00:19:12", + "v1", + "foo", + "bar", + 0.1, + 0.1, + 0.1 + ] ] - ] + } }, "expected": { - "schema": "start_ts STRING NOT NULL, end_ts STRING NOT NULL,identifier_1 STRING NOT NULL,identifier_2 STRING NOT NULL,identifier_3 STRING NOT NULL", - "other_ts_cols": [ - "start_ts", - "end_ts" - ], - "data": [ - [ - "2020-08-01 00:01:12", - "2020-08-01 00:01:13", - "v1", - "foo", - "bar" + "df": { + "schema": "start_ts STRING NOT NULL, end_ts STRING NOT NULL,identifier_1 STRING NOT NULL,identifier_2 STRING NOT NULL,identifier_3 STRING NOT NULL", + "ts_convert": [ + "start_ts", + "end_ts" + ], + "data": [ + [ + "2020-08-01 00:01:12", + "2020-08-01 00:01:13", + "v1", + "foo", + "bar" + ] ] - ] + } } }, "test_adjacent_intervals": { "input": { - "schema": "event_ts STRING NOT NULL, 
identifier_1 STRING NOT NULL, identifier_2 STRING NOT NULL, identifier_3 STRING NOT NULL, metric_1 FLOAT, metric_2 FLOAT, metric_3 FLOAT", - "ts_col": "event_ts", - "partition_cols": [ - "identifier_1", - "identifier_2", - "identifier_3" - ], - "data": [ - [ - "2020-08-01 00:00:09", - "v1", - "foo", - "bar", - 4.1, - 4.1, - 4.1 - ], - [ - "2020-08-01 00:00:10", - "v1", - "foo", - "bar", - 4.1, - 4.1, - 4.1 - ], - [ - "2020-08-01 00:00:11", - "v1", - "foo", - "bar", - 5.0, - 5.0, - 5.0 - ], - [ - "2020-08-01 00:00:12", - "v1", - "foo", - "bar", - 5.0, - 5.0, - 5.0 - ], - [ - "2020-08-01 00:01:12", - "v1", - "foo", - "bar", - 10.7, - 10.7, - 10.7 - ], - [ - "2020-08-01 00:01:13", - "v1", - "foo", - "bar", - 10.7, - 10.7, - 10.7 - ], - [ - "2020-08-01 00:01:14", - "v1", - "foo", - "bar", - 10.7, - 10.7, - 10.7 + "tsdf": { + "ts_col": "event_ts", + "partition_cols": ["identifier_1", "identifier_2", "identifier_3"] + }, + "df": { + "schema": "event_ts STRING NOT NULL, identifier_1 STRING NOT NULL, identifier_2 STRING NOT NULL, identifier_3 STRING NOT NULL, metric_1 FLOAT, metric_2 FLOAT, metric_3 FLOAT", + "ts_convert": [ + "event_ts" + ], + "data": [ + [ + "2020-08-01 00:00:09", + "v1", + "foo", + "bar", + 4.1, + 4.1, + 4.1 + ], + [ + "2020-08-01 00:00:10", + "v1", + "foo", + "bar", + 4.1, + 4.1, + 4.1 + ], + [ + "2020-08-01 00:00:11", + "v1", + "foo", + "bar", + 5.0, + 5.0, + 5.0 + ], + [ + "2020-08-01 00:00:12", + "v1", + "foo", + "bar", + 5.0, + 5.0, + 5.0 + ], + [ + "2020-08-01 00:01:12", + "v1", + "foo", + "bar", + 10.7, + 10.7, + 10.7 + ], + [ + "2020-08-01 00:01:13", + "v1", + "foo", + "bar", + 10.7, + 10.7, + 10.7 + ], + [ + "2020-08-01 00:01:14", + "v1", + "foo", + "bar", + 10.7, + 10.7, + 10.7 + ] ] - ] + } }, "expected": { - "schema": "start_ts STRING NOT NULL, end_ts STRING NOT NULL,identifier_1 STRING NOT NULL,identifier_2 STRING NOT NULL,identifier_3 STRING NOT NULL", - "other_ts_cols": [ - "start_ts", - "end_ts" - ], - "data": [ - [ - "2020-08-01 
00:00:09", - "2020-08-01 00:00:10", - "v1", - "foo", - "bar" - ], - [ - "2020-08-01 00:00:11", - "2020-08-01 00:00:12", - "v1", - "foo", - "bar" - ], - [ - "2020-08-01 00:01:12", - "2020-08-01 00:01:14", - "v1", - "foo", - "bar" + "df": { + "schema": "start_ts STRING NOT NULL, end_ts STRING NOT NULL,identifier_1 STRING NOT NULL,identifier_2 STRING NOT NULL,identifier_3 STRING NOT NULL", + "ts_convert": [ + "start_ts", + "end_ts" + ], + "data": [ + [ + "2020-08-01 00:00:09", + "2020-08-01 00:00:10", + "v1", + "foo", + "bar" + ], + [ + "2020-08-01 00:00:11", + "2020-08-01 00:00:12", + "v1", + "foo", + "bar" + ], + [ + "2020-08-01 00:01:12", + "2020-08-01 00:01:14", + "v1", + "foo", + "bar" + ] ] - ] + } } }, "test_invalid_state_definition_str": { "input": { - "schema": "event_ts STRING NOT NULL, identifier_1 STRING NOT NULL, identifier_2 STRING NOT NULL, identifier_3 STRING NOT NULL, metric_1 FLOAT NOT NULL, metric_2 FLOAT NOT NULL, metric_3 FLOAT NOT NULL", - "ts_col": "event_ts", - "partition_cols": [ - "identifier_1", - "identifier_2", - "identifier_3" - ], - "data": [ - [ - "2020-08-01 00:00:09", - "v1", - "foo", - "bar", - 4.1, - 4.1, - 4.1 + "tsdf": { + "ts_col": "event_ts", + "partition_cols": ["identifier_1", "identifier_2", "identifier_3"] + }, + "df": { + "schema": "event_ts STRING NOT NULL, identifier_1 STRING NOT NULL, identifier_2 STRING NOT NULL, identifier_3 STRING NOT NULL, metric_1 FLOAT NOT NULL, metric_2 FLOAT NOT NULL, metric_3 FLOAT NOT NULL", + "ts_convert": [ + "event_ts" + ], + "data": [ + [ + "2020-08-01 00:00:09", + "v1", + "foo", + "bar", + 4.1, + 4.1, + 4.1 + ] ] - ] + } } }, "test_invalid_state_definition_type": { "input": { - "schema": "event_ts STRING NOT NULL, identifier_1 STRING NOT NULL, identifier_2 STRING NOT NULL, identifier_3 STRING NOT NULL, metric_1 FLOAT NOT NULL, metric_2 FLOAT NOT NULL, metric_3 FLOAT NOT NULL", - "ts_col": "event_ts", - "partition_cols": [ - "identifier_1", - "identifier_2", - "identifier_3" - ], - "data": 
[ - [ - "2020-08-01 00:00:09", - "v1", - "foo", - "bar", - 4.1, - 4.1, - 4.1 + "tsdf": { + "ts_col": "event_ts", + "partition_cols": ["identifier_1", "identifier_2", "identifier_3"] + }, + "df": { + "schema": "event_ts STRING NOT NULL, identifier_1 STRING NOT NULL, identifier_2 STRING NOT NULL, identifier_3 STRING NOT NULL, metric_1 FLOAT NOT NULL, metric_2 FLOAT NOT NULL, metric_3 FLOAT NOT NULL", + "ts_convert": [ + "event_ts" + ], + "data": [ + [ + "2020-08-01 00:00:09", + "v1", + "foo", + "bar", + 4.1, + 4.1, + 4.1 + ] ] - ] + } } } } diff --git a/python/tests/unit_test_data/utils_tests.json b/python/tests/unit_test_data/utils_tests.json index d279dffb..727ce41f 100644 --- a/python/tests/unit_test_data/utils_tests.json +++ b/python/tests/unit_test_data/utils_tests.json @@ -1,314 +1,345 @@ { "__SharedData": { - "init_data": { - "schema": "symbol string, event_ts string, trade_pr float", - "ts_col": "event_ts", - "partition_cols": [ - "symbol" - ], - "data": [ - [ - "S1", - "2020-08-01 00:00:10", - 349.21 - ], - [ - "S1", - "2020-08-01 00:01:12", - 351.32 - ], - [ - "S1", - "2020-09-01 00:02:10", - 361.1 - ], - [ - "S1", - "2020-09-01 00:19:12", - 362.1 - ], - [ - "S2", - "2020-08-01 00:01:10", - 743.01 - ], - [ - "S2", - "2020-08-01 00:01:24", - 751.92 - ], - [ - "S2", - "2020-09-01 00:02:10", - 761.10 - ], - [ - "S2", - "2020-09-01 00:20:42", - 762.33 - ] - ] - } - }, - "UtilsTest": { - "test_calculate_time_horizon": { - "simple_input": { - "schema": "partition_a string, partition_b string, event_ts string, value_a float, value_b float", + "init": { + "tsdf": { "ts_col": "event_ts", "partition_cols": [ - "partition_a", - "partition_b" + "symbol" + ] + }, + "df": { + "schema": "symbol string, event_ts string, trade_pr float", + "ts_convert": [ + "event_ts" ], "data": [ [ - "A", - "A-1", - "2020-01-01 00:00:10", - 0.0, - null - ], - [ - "A", - "A-1", - "2020-01-01 00:01:10", - 2.0, - 2.0 - ], - [ - "A", - "A-1", - "2020-01-01 00:01:32", - null, - null - ], - [ - 
"A", - "A-1", - "2020-01-01 00:02:03", - null, - null - ], - [ - "A", - "A-1", - "2020-01-01 00:03:32", - null, - 7.0 - ], - [ - "A", - "A-1", - "2020-01-01 00:04:12", - 8.0, - 8.0 - ], - [ - "A", - "A-1", - "2020-01-01 00:05:31", - 11.0, - null - ], - [ - "A", - "A-2", - "2020-01-01 00:00:10", - 0.0, - null - ], - [ - "A", - "A-2", - "2020-01-01 00:01:10", - 2.0, - 2.0 - ], - [ - "A", - "A-2", - "2020-01-01 00:01:32", - null, - null - ], - [ - "A", - "A-2", - "2020-01-01 00:02:03", - null, - null + "S1", + "2020-08-01 00:00:10", + 349.21 ], [ - "A", - "A-2", - "2020-01-01 00:04:12", - 8.0, - 8.0 + "S1", + "2020-08-01 00:01:12", + 351.32 ], [ - "A", - "A-2", - "2020-01-01 00:05:31", - 11.0, - null + "S1", + "2020-09-01 00:02:10", + 361.1 ], [ - "B", - "A-2", - "2020-01-01 00:01:10", - 2.0, - 2.0 + "S1", + "2020-09-01 00:19:12", + 362.1 ], [ - "B", - "A-2", - "2020-01-01 00:01:32", - null, - null + "S2", + "2020-08-01 00:01:10", + 743.01 ], [ - "B", - "A-2", - "2020-01-01 00:02:03", - null, - null + "S2", + "2020-08-01 00:01:24", + 751.92 ], [ - "B", - "A-2", - "2020-01-01 00:03:32", - null, - 7.0 + "S2", + "2020-09-01 00:02:10", + 761.10 ], [ - "B", - "A-2", - "2020-01-01 00:04:12", - 8.0, - 8.0 + "S2", + "2020-09-01 00:20:42", + 762.33 ] ] } + } + }, + "UtilsTest": { + "test_display": {}, + "test_calculate_time_horizon": { + "init": { + "tsdf": { + "ts_col": "event_ts", + "partition_cols": [ + "partition_a", + "partition_b" + ] + }, + "df": { + "schema": "partition_a string, partition_b string, event_ts string, value_a float, value_b float", + "ts_convert": [ + "event_ts" + ], + "data": [ + [ + "A", + "A-1", + "2020-01-01 00:00:10", + 0.0, + null + ], + [ + "A", + "A-1", + "2020-01-01 00:01:10", + 2.0, + 2.0 + ], + [ + "A", + "A-1", + "2020-01-01 00:01:32", + null, + null + ], + [ + "A", + "A-1", + "2020-01-01 00:02:03", + null, + null + ], + [ + "A", + "A-1", + "2020-01-01 00:03:32", + null, + 7.0 + ], + [ + "A", + "A-1", + "2020-01-01 00:04:12", + 8.0, + 8.0 + 
], + [ + "A", + "A-1", + "2020-01-01 00:05:31", + 11.0, + null + ], + [ + "A", + "A-2", + "2020-01-01 00:00:10", + 0.0, + null + ], + [ + "A", + "A-2", + "2020-01-01 00:01:10", + 2.0, + 2.0 + ], + [ + "A", + "A-2", + "2020-01-01 00:01:32", + null, + null + ], + [ + "A", + "A-2", + "2020-01-01 00:02:03", + null, + null + ], + [ + "A", + "A-2", + "2020-01-01 00:04:12", + 8.0, + 8.0 + ], + [ + "A", + "A-2", + "2020-01-01 00:05:31", + 11.0, + null + ], + [ + "B", + "A-2", + "2020-01-01 00:01:10", + 2.0, + 2.0 + ], + [ + "B", + "A-2", + "2020-01-01 00:01:32", + null, + null + ], + [ + "B", + "A-2", + "2020-01-01 00:02:03", + null, + null + ], + [ + "B", + "A-2", + "2020-01-01 00:03:32", + null, + 7.0 + ], + [ + "B", + "A-2", + "2020-01-01 00:04:12", + 8.0, + 8.0 + ] + ] + } + } }, "test_display_html_TSDF": { "init": { - "$ref": "#/__SharedData/init_data" + "$ref": "#/__SharedData/init" } }, "test_display_html_dataframe": { "init": { - "$ref": "#/__SharedData/init_data" + "$ref": "#/__SharedData/init" } }, "test_display_html_pandas_dataframe": { "init": { - "$ref": "#/__SharedData/init_data" + "$ref": "#/__SharedData/init" } }, "test_display_unavailable": { "init": { - "$ref": "#/__SharedData/init_data" + "$ref": "#/__SharedData/init" } }, "test_get_display_df": { "init": { - "$ref": "#/__SharedData/init_data" + "$ref": "#/__SharedData/init" }, "expected": { - "schema": "symbol string, event_ts string, trade_pr float", - "ts_col": "event_ts", - "partition_cols": [ - "symbol" - ], - "data": [ - [ - "S1", - "2020-09-01 00:02:10", - 361.1 - ], - [ - "S1", - "2020-09-01 00:19:12", - 362.1 - ], - [ - "S2", - "2020-09-01 00:02:10", - 761.1 + "tsdf": { + "ts_col": "event_ts", + "partition_cols": [ + "symbol" + ] + }, + "df": { + "schema": "symbol string, event_ts string, trade_pr float", + "ts_convert": [ + "event_ts" ], - [ - "S2", - "2020-09-01 00:20:42", - 762.33 + "data": [ + [ + "S1", + "2020-09-01 00:02:10", + 361.1 + ], + [ + "S1", + "2020-09-01 00:19:12", + 362.1 + ], + 
[ + "S2", + "2020-09-01 00:02:10", + 761.1 + ], + [ + "S2", + "2020-09-01 00:20:42", + 762.33 + ] ] - ] + } } }, "test_get_display_df_sequence_col": { "init": { - "schema": "symbol string, secondary_symbol string, event_ts string, trade_pr float", - "ts_col": "event_ts", - "partition_cols": [ - "symbol" - ], - "sequence_col": "secondary_symbol", - "data": [ - [ - "S1", - "t1", - "2020-08-01 00:00:10", - 349.21 + "tsdf": { + "ts_col": "event_ts", + "partition_cols": [ + "symbol" ], - [ - "S1", - "t1", - "2020-08-01 00:01:12", - 351.32 - ], - [ - "S1", - "t2", - "2020-09-01 00:02:10", - 361.1 - ], - [ - "S1", - "t3", - "2020-09-01 00:19:12", - 362.1 - ], - [ - "S2", - "t1", - "2020-08-01 00:01:10", - 743.01 - ], - [ - "S2", - "t2", - "2020-08-01 00:01:24", - 751.92 - ], - [ - "S2", - "t2", - "2020-09-01 00:02:10", - 761.10 - ], - [ - "S2", - "t2", - "2020-09-01 00:20:42", - 762.33 + "sequence_col": "secondary_symbol" + }, + "df": { + "schema": "symbol string, secondary_symbol string, event_ts string, trade_pr float", + "ts_convert": ["event_ts"], + "data": [ + [ + "S1", + "t1", + "2020-08-01 00:00:10", + 349.21 + ], + [ + "S1", + "t1", + "2020-08-01 00:01:12", + 351.32 + ], + [ + "S1", + "t2", + "2020-09-01 00:02:10", + 361.1 + ], + [ + "S1", + "t3", + "2020-09-01 00:19:12", + 362.1 + ], + [ + "S2", + "t1", + "2020-08-01 00:01:10", + 743.01 + ], + [ + "S2", + "t2", + "2020-08-01 00:01:24", + 751.92 + ], + [ + "S2", + "t2", + "2020-09-01 00:02:10", + 761.10 + ], + [ + "S2", + "t2", + "2020-09-01 00:20:42", + 762.33 + ] ] - ] + } }, "expected": { + "tsdf": { + "ts_col": "event_ts", + "partition_cols": [ + "symbol" + ], + "sequence_col": "secondary_symbol" + }, + "df": { "schema": "symbol string, secondary_symbol string, event_ts string, trade_pr float", - "ts_col": "event_ts", - "partition_cols": [ - "symbol" - ], - "sequence_col": "secondary_symbol", + "ts_convert": ["event_ts"], "data": [ [ "S1", @@ -335,6 +366,7 @@ 762.33 ] ] + } } } } diff --git 
a/python/tests/utils_tests.py b/python/tests/utils_tests.py index 6e634047..2839ee04 100644 --- a/python/tests/utils_tests.py +++ b/python/tests/utils_tests.py @@ -1,7 +1,7 @@ import sys import unittest from io import StringIO -from unittest import mock +from unittest.mock import patch, create_autospec, MagicMock from tempo.utils import * # noqa: F403 from tests.tsdf_tests import SparkTest @@ -20,17 +20,17 @@ def test_display(self): else: self.assertEqual(id(display), id(display_unavailable)) - @mock.patch.dict(os.environ, {"TZ": "UTC"}) + @patch.dict(os.environ, {"TZ": "UTC"}) def test_calculate_time_horizon(self): """Test calculate time horizon warning and number of expected output rows""" # fetch test data - simple_input_tsdf = self.get_data_as_tsdf("simple_input") + tsdf = self.get_test_df_builder("init").as_tsdf() with warnings.catch_warnings(record=True) as w: calculate_time_horizon( - simple_input_tsdf.df, - simple_input_tsdf.ts_col, + tsdf.df, + tsdf.ts_col, "30 seconds", ["partition_a", "partition_b"], ) @@ -49,10 +49,10 @@ def test_calculate_time_horizon(self): assert warning_message.strip() == str(w[-1].message).strip() def test_display_html_TSDF(self): - init_tsdf = self.get_data_as_tsdf("init") + tsdf = self.get_test_df_builder("init").as_tsdf() with self.assertLogs(level="ERROR") as error_captured: - display_html(init_tsdf) + display_html(tsdf) self.assertEqual(len(error_captured.records), 1) self.assertEqual( @@ -61,11 +61,11 @@ def test_display_html_TSDF(self): ) def test_display_html_dataframe(self): - init_tsdf = self.get_data_as_tsdf("init") + sdf = self.get_test_df_builder("init").as_sdf() captured_output = StringIO() sys.stdout = captured_output - display_html(init_tsdf.df) + display_html(sdf) self.assertEqual( captured_output.getvalue(), ( @@ -87,8 +87,8 @@ def test_display_html_dataframe(self): ) def test_display_html_pandas_dataframe(self): - init_tsdf = self.get_data_as_tsdf("init") - pandas_dataframe = init_tsdf.df.toPandas() + sdf = 
self.get_test_df_builder("init").as_sdf() + pandas_dataframe = sdf.toPandas() captured_output = StringIO() sys.stdout = captured_output @@ -120,18 +120,18 @@ def test_display_unavailable(self): ) def test_get_display_df(self): - init_tsdf = self.get_data_as_tsdf("init") - expected_df = self.get_data_as_sdf("expected") + init = self.get_test_df_builder("init").as_tsdf() + expected_df = self.get_test_df_builder("expected").as_sdf() - actual_df = get_display_df(init_tsdf, 2) + actual_df = get_display_df(init, 2) self.assertDataFrameEquality(actual_df, expected_df) def test_get_display_df_sequence_col(self): - init_tsdf = self.get_data_as_tsdf("init") - expected_df = self.get_data_as_sdf("expected") + init = self.get_test_df_builder("init").as_tsdf() + expected_df = self.get_test_df_builder("expected").as_sdf() - actual_df = get_display_df(init_tsdf, 2) + actual_df = get_display_df(init, 2) self.assertDataFrameEquality(actual_df, expected_df) diff --git a/python/tox.ini b/python/tox.ini index d6af2f91..3d236e34 100644 --- a/python/tox.ini +++ b/python/tox.ini @@ -2,57 +2,41 @@ requires = tox>4,<5 virtualenv>20,<21 - wheel>=0.38,<1 -isolated_build = true +isolated_build = True envlist = - format - lint - type-check - build-dist ; Mirror Supported LTS DBR versions here: https://docs.databricks.com/release-notes/runtime/ ; Use correct PySpark version based on Python version present in env name - py37-pyspark300, - py38-pyspark{312,321}, - py39-pyspark{330,332} + dbr{113,122,133,143} + coverage-report skip_missing_interpreters = true - [testenv] description = run the tests under {envname} package = wheel wheel_build_env = .pkg setenv = COVERAGE_FILE = .coverage.{envname} +basepython = + dbr143: py310 + dbr133: py310 + dbr122: py39 + dbr113: py39 deps = - pyspark300: pyspark==3.0.0 - pyspark312: pyspark==3.1.2 - pyspark321: pyspark==3.2.1 - pyspark330: pyspark==3.3.0 - pyspark332: pyspark==3.3.2 - coverage>=7,<8 - -rrequirements.txt + -rrequirements/dev.txt + 
-rrequirements/{envname}.txt commands = - coverage --version + coverage erase coverage run -m unittest discover -s tests -p '*_tests.py' -[testenv:format] -description = run formatters -skipsdist = true -skip_install = true -deps = - black -commands = - black {toxinidir} - [testenv:lint] description = run linters skipsdist = true skip_install = true deps = flake8 - black + black==24.4.1 commands = - black --check {toxinidir}/tempo + black {posargs} {toxinidir}/tempo flake8 --config {toxinidir}/.flake8 {toxinidir}/tempo [testenv:type-check] @@ -62,24 +46,28 @@ skip_install = true deps = mypy>=1,<2 pandas-stubs>=2,<3 - types-pytz>=2023,<2024 - -rrequirements.txt + numpy + types-openpyxl commands = - mypy {toxinidir}/tempo + mypy --install-types {toxinidir}/tempo [testenv:build-dist] description = build distribution skip_install = true deps = build + semver commands = - python -m build --sdist --wheel {posargs: {toxinidir}} + python setup.py clean bdist_wheel -[testenv:cov-init] -setenv = - COVERAGE_FILE = .coverage +[testenv:build-docs] +description = build documentation +allowlist_externals = make +deps = + -r ../docs/requirements.txt + semver commands = - coverage erase + make --directory ../docs html [testenv:coverage-report] description = combine coverage data and generate reports @@ -89,7 +77,6 @@ skip_install = true setenv = COVERAGE_FILE = .coverage commands = - coverage --version coverage combine coverage report -m coverage xml