From b9f990d842012ea5b0572ee6fbd85ce1ed1d49b3 Mon Sep 17 00:00:00 2001 From: Dmitry Sorokin <40151847+DimedS@users.noreply.github.com> Date: Thu, 17 Aug 2023 18:09:12 +0100 Subject: [PATCH] fix(datasets): Revert the optional dependencies back to setup.py (#310) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Move optional dependencies for kedro-datasets to setup.py Signed-off-by: Dmitry Sorokin Co-authored-by: Juan Luis Cano Rodríguez --- kedro-datasets/pyproject.toml | 246 ++-------------------------------- kedro-datasets/setup.py | 210 +++++++++++++++++++++++++++++ 2 files changed, 222 insertions(+), 234 deletions(-) create mode 100644 kedro-datasets/setup.py diff --git a/kedro-datasets/pyproject.toml b/kedro-datasets/pyproject.toml index 90b177f8d..31c66c227 100644 --- a/kedro-datasets/pyproject.toml +++ b/kedro-datasets/pyproject.toml @@ -5,238 +5,16 @@ build-backend = "setuptools.build_meta" [project] name = "kedro-datasets" authors = [ - {name = "Kedro"} + {name = "Kedro"} ] description = "Kedro-Datasets is where you can find all of Kedro's data connectors." requires-python = ">=3.7, <3.11" license = {text = "Apache Software License (Apache 2.0)"} dependencies = [ - "kedro>=0.16", - "lazy_loader", -] -dynamic = ["readme", "version"] - -[project.optional-dependencies] -# Base dependencies -hdfs-base = ["hdfs>=2.5.8, <3.0"] -pandas-base = ["pandas>=1.3, <3.0"] -plotly-base = ["plotly>=4.8.0, <6.0"] -s3fs-base = ["s3fs>=0.3.0, <0.5"] -spark-base = ["pyspark>=2.2, <4.0"] - - -# Datasets dependencies -api = ["kedro-datasets[api.APIDataSet]"] -"api.APIDataSet" = ["requests~=2.20"] - -biosequence = ["kedro-datasets[biosequence.BioSequenceDataSet]"] -"biosequence.BioSequenceDataSet" = ["biopython~=1.73"] - -dask = ["kedro-datasets[dask.ParquetDataSet]"] -"dask.ParquetDataSet" = ["dask[complete]>=2021.10", "triad>=0.6.7, <1.0"] - -databricks = ["kedro-datasets[databricks.ManagedTableDataSet]"] -"databricks.ManagedTableDataSet" = ["kedro-datasets[spark-base,pandas-base]", "delta-spark~=1.2.1"] - -geopandas = ["kedro-datasets[geopandas.GeoJSONDataSet]"] -"geopandas.GeoJSONDataSet" = ["geopandas>=0.6.0, <1.0", "pyproj~=3.0"] - -holoviews = ["kedro-datasets[holoviews.HoloviewsWriter]"] -"holoviews.HoloviewsWriter" = ["holoviews~=1.13.0"] - -matplotlib = ["kedro-datasets[matplotlib.MatplotlibWriter]"] -"matplotlib.MatplotlibWriter" = ["matplotlib>=3.0.3, <4.0"] - -networkx = ["kedro-datasets[networkx.NetworkXDataSet]"] -"networkx.NetworkXDataSet" = ["networkx~=2.4"] - -pandas = [ - """kedro-datasets[\ - pandas.CSVDataSet,\ - pandas.ExcelDataSet,\ - pandas.DeltaTableDataSet,\ - pandas.FeatherDataSet,\ - pandas.GenericDataSet,\ - pandas.GBQTableDataSet,\ - pandas.GBQQueryDataSet,\ - pandas.HDFDataSet,\ - pandas.JSONDataSet,\ - pandas.ParquetDataSet,\ - pandas.SQLTableDataSet,\ - pandas.SQLQueryDataSet,\ - pandas.XMLDataSet\ - ]""" -] -"pandas.CSVDataSet" = ["kedro-datasets[pandas-base]"] -"pandas.ExcelDataSet" = ["kedro-datasets[pandas-base]", "openpyxl>=3.0.6, <4.0"] -"pandas.DeltaTableDataSet" = ["kedro-datasets[pandas-base]", "deltalake>=0.10.0"] -"pandas.FeatherDataSet" = ["kedro-datasets[pandas-base]"] -"pandas.GenericDataSet" = ["kedro-datasets[pandas-base]"] -"pandas.GBQTableDataSet" = [ - "kedro-datasets[pandas-base]", - "pandas-gbq>=0.12.0, <0.18.0" -] -"pandas.GBQQueryDataSet" = [ - "kedro-datasets[pandas-base]", - "pandas-gbq>=0.12.0, <0.18.0" -] -"pandas.HDFDataSet" = [ - "kedro-datasets[pandas-base]", - "tables~=3.6.0; platform_system == 'Windows'", - "tables~=3.6; platform_system != 'Windows'" -] -"pandas.JSONDataSet" = ["kedro-datasets[pandas-base]"] -"pandas.ParquetDataSet" = ["kedro-datasets[pandas-base]", "pyarrow>=6.0"] -"pandas.SQLTableDataSet" = ["kedro-datasets[pandas-base]", "SQLAlchemy>=1.4, <3.0"] -"pandas.SQLQueryDataSet" = [ - "kedro-datasets[pandas-base]", - "SQLAlchemy>=1.4, <3.0", - "pyodbc~=4.0" -] -"pandas.XMLDataSet" = ["kedro-datasets[pandas-base]", "lxml~=4.6"] - -pickle = ["kedro-datasets[pickle.PickleDataSet]"] -"pickle.PickleDataSet" = ["compress-pickle[lz4]~=2.1.0"] - -pillow = ["kedro-datasets[pillow.ImageDataSet]"] -"pillow.ImageDataSet" = ["Pillow~=9.0"] - -plotly = ["kedro-datasets[plotly.PlotlyDataSet,plotly.JSONDataSet]"] -"plotly.JSONDataSet" = ["kedro-datasets[plotly-base]"] -"plotly.PlotlyDataSet" = ["kedro-datasets[pandas-base,plotly-base]"] - -polars = ["kedro-datasets[polars.CSVDataSet]"] -"polars.CSVDataSet" = ["polars~=0.17.0"] - -redis = ["kedro-datasets[redis.PickleDataSet]"] -"redis.PickleDataSet" = ["redis~=4.1"] - -snowflake = ["kedro-datasets[snowflake.SnowparkTableDataSet]"] -"snowflake.SnowparkTableDataSet" = [ - "snowflake-snowpark-python~=1.0.0; python_version == '3.8'", - "pyarrow~=8.0" -] - -spark = [ - "kedro-datasets[spark.DeltaTableDataSet,spark.SparkDataSet,spark.SparkHiveDataSet,spark.SparkJDBCDataSet]" -] -"spark.DeltaTableDataSet" = [ - "kedro-datasets[spark-base,hdfs-base,s3fs-base]", - "delta-spark>=1.0, <3.0" -] -"spark.SparkDataSet" = ["kedro-datasets[spark-base,hdfs-base,s3fs-base]"] -"spark.SparkHiveDataSet" = ["kedro-datasets[spark-base,hdfs-base,s3fs-base]"] -"spark.SparkJDBCDataSet" = ["kedro-datasets[spark-base,hdfs-base,s3fs-base]"] - -svmlight = ["kedro-datasets[svmlight.SVMLightDataSet]"] -"svmlight.SVMLightDataSet" = ["scikit-learn~=1.0.2", "scipy~=1.7.3"] - -tensorflow = ["kedro-datasets[tensorflow.TensorFlowModelDataSet]"] -"tensorflow.TensorFlowModelDataSet" = [ - # currently only TensorFlow V2 supported for saving and loading. - # V1 requires HDF5 and serialises differently - "tensorflow~=2.0; platform_system != 'Darwin' or platform_machine != 'arm64'", - # https://developer.apple.com/metal/tensorflow-plugin/ - "tensorflow-macos~=2.0; platform_system == 'Darwin' and platform_machine == 'arm64'" -] - -video = ["kedro-datasets[video.VideoDataSet]"] -"video.VideoDataSet" = ["opencv-python~=4.5.5.64"] - -yaml = ["kedro-datasets[yaml.YAMLDataSet]"] -"yaml.YAMLDataSet" = ["kedro-datasets[pandas-base]", "PyYAML>=4.2, <7.0"] - -all = [ - """kedro-datasets[\ - api,biosequence,dask,databricks,\ - geopandas,holoviews,matplotlib,\ - networkx,pickle,pillow,plotly,\ - polars,snowflake,redis,spark,svmlight,\ - tensorflow,video,yaml\ - ]""" -] -docs = [ - # docutils>=0.17 changed the HTML - # see https://github.com/readthedocs/sphinx_rtd_theme/issues/1115 - "docutils==0.16", - "sphinx~=5.3.0", - "sphinx_rtd_theme==1.2.0", - # Regression on sphinx-autodoc-typehints 1.21 - # that creates some problematic docstrings - "sphinx-autodoc-typehints==1.20.2", - "sphinx_copybutton==0.3.1", - "sphinx-notfound-page", - "ipykernel>=5.3, <7.0", - "sphinxcontrib-mermaid~=0.7.1", - "myst-parser~=1.0.0", - "Jinja2<3.1.0" -] -test = [ - "adlfs>=2021.7.1, <=2022.2", - "bandit>=1.6.2, <2.0", - "behave==1.2.6", - "biopython~=1.73", - "blacken-docs==1.9.2", - "black~=22.0", - "compress-pickle[lz4]~=1.2.0", - "coverage[toml]", - "dask[complete]", - "delta-spark~=1.2.1", - "deltalake>=0.10.0", - # 1.2.0 has a bug that breaks some of our tests: https://github.com/delta-io/delta/issues/1070 - "dill~=0.3.1", - "filelock>=3.4.0, <4.0", - "gcsfs>=2021.4, <=2022.1", - "geopandas>=0.6.0, <1.0", - "hdfs>=2.5.8, <3.0", - "holoviews~=1.13.0", - "import-linter[toml]==1.2.6", - "ipython>=7.31.1, <8.0", - "Jinja2<3.1.0", - "joblib>=0.14", - "jupyterlab~=3.0", - "jupyter~=1.0", - "lxml~=4.6", - "matplotlib>=3.0.3, <3.4; python_version < '3.10'", # 3.4.0 breaks holoviews - "matplotlib>=3.5, <3.6; python_version == '3.10'", - "memory_profiler>=0.50.0, <1.0", - "moto==1.3.7; python_version < '3.10'", - "moto==3.0.4; python_version == '3.10'", - "networkx~=2.4", - "opencv-python~=4.5.5.64", - "openpyxl>=3.0.3, <4.0", - "pandas-gbq>=0.12.0, <0.18.0", - "pandas>=1.3, <2", # 1.3 for read_xml/to_xml, <2 for compatibility with Spark < 3.4 - "Pillow~=9.0", - "plotly>=4.8.0, <6.0", - "polars~=0.15.13", - "pre-commit>=2.9.2, <3.0", # The hook `mypy` requires pre-commit version 2.9.2. - "psutil==5.8.0", - "pyarrow~=8.0", - "pylint>=2.5.2, <3.0", - "pyodbc~=4.0.35", - "pyproj~=3.0", - "pyspark>=2.2, <4.0", - "pytest-cov~=3.0", - "pytest-mock>=1.7.1, <2.0", - "pytest-xdist[psutil]~=2.2.1", - "pytest~=7.2", - "redis~=4.1", - "requests-mock~=1.6", - "requests~=2.20", - "s3fs>=0.3.0, <0.5", # Needs to be at least 0.3.0 to make use of `cachable` attribute on S3FileSystem. - "scikit-learn~=1.0.2", - "scipy~=1.7.3", - "snowflake-snowpark-python~=1.0.0; python_version == '3.8'", - "SQLAlchemy>=1.4, <3.0", - # The `Inspector.has_table()` method replaces the `Engine.has_table()` method in version 1.4. - "tables~=3.7", - "tensorflow-macos~=2.0; platform_system == 'Darwin' and platform_machine == 'arm64'", - "tensorflow~=2.0; platform_system != 'Darwin' or platform_machine != 'arm64'", - "triad>=0.6.7, <1.0", - "trufflehog~=2.1", - "xlsxwriter~=1.0" + "kedro>=0.16", + "lazy_loader", ] +dynamic = ["readme", "version", "optional-dependencies"] [project.urls] Source = "https://github.com/kedro-org/kedro-plugins/tree/main/kedro-datasets" @@ -257,18 +35,18 @@ profile = "black" [tool.pylint.master] ignore = "CVS" load-plugins = [ - "pylint.extensions.docparams", - "pylint.extensions.no_self_use" + "pylint.extensions.docparams", + "pylint.extensions.no_self_use" ] extension-pkg-whitelist = "cv2" unsafe-load-any-extension = false [tool.pylint.messages_control] disable = [ - "ungrouped-imports", - "duplicate-code", - "too-many-instance-attributes", - "too-few-public-methods", # https://github.com/pylint-dev/pylint/issues/8865 + "ungrouped-imports", + "duplicate-code", + "too-many-instance-attributes", + "too-few-public-methods", # https://github.com/pylint-dev/pylint/issues/8865 ] enable = ["useless-suppression"] @@ -283,8 +61,8 @@ indent-string = " " [tool.pylint.miscellaneous] notes = [ - "FIXME", - "XXX" + "FIXME", + "XXX" ] [tool.pylint.design] diff --git a/kedro-datasets/setup.py b/kedro-datasets/setup.py new file mode 100644 index 000000000..63cd364cc --- /dev/null +++ b/kedro-datasets/setup.py @@ -0,0 +1,210 @@ +from itertools import chain + +from setuptools import setup + +# at least 1.3 to be able to use XMLDataSet and pandas integration with fsspec +PANDAS = "pandas>=1.3, <3.0" +SPARK = "pyspark>=2.2, <4.0" +HDFS = "hdfs>=2.5.8, <3.0" +S3FS = "s3fs>=0.3.0, <0.5" +POLARS = "polars~=0.17.0" +DELTA = "delta-spark~=1.2.1" + + +def _collect_requirements(requires): + return sorted(set(chain.from_iterable(requires.values()))) + + +api_require = {"api.APIDataSet": ["requests~=2.20"]} +biosequence_require = {"biosequence.BioSequenceDataSet": ["biopython~=1.73"]} +dask_require = { + "dask.ParquetDataSet": ["dask[complete]>=2021.10", "triad>=0.6.7, <1.0"] +} +databricks_require = {"databricks.ManagedTableDataSet": [SPARK, PANDAS, DELTA]} +geopandas_require = { + "geopandas.GeoJSONDataSet": ["geopandas>=0.6.0, <1.0", "pyproj~=3.0"] +} +holoviews_require = {"holoviews.HoloviewsWriter": ["holoviews~=1.13.0"]} +matplotlib_require = {"matplotlib.MatplotlibWriter": ["matplotlib>=3.0.3, <4.0"]} +networkx_require = {"networkx.NetworkXDataSet": ["networkx~=2.4"]} +pandas_require = { + "pandas.CSVDataSet": [PANDAS], + "pandas.ExcelDataSet": [PANDAS, "openpyxl>=3.0.6, <4.0"], + "pandas.DeltaTableDataSet": [PANDAS, "deltalake>=0.10.0"], + "pandas.FeatherDataSet": [PANDAS], + "pandas.GBQTableDataSet": [PANDAS, "pandas-gbq>=0.12.0, <0.18.0"], + "pandas.GBQQueryDataSet": [PANDAS, "pandas-gbq>=0.12.0, <0.18.0"], + "pandas.HDFDataSet": [ + PANDAS, + "tables~=3.6.0; platform_system == 'Windows'", + "tables~=3.6; platform_system != 'Windows'", + ], + "pandas.JSONDataSet": [PANDAS], + "pandas.ParquetDataSet": [PANDAS, "pyarrow>=6.0"], + "pandas.SQLTableDataSet": [PANDAS, "SQLAlchemy>=1.4, <3.0"], + "pandas.SQLQueryDataSet": [PANDAS, "SQLAlchemy>=1.4, <3.0", "pyodbc~=4.0"], + "pandas.XMLDataSet": [PANDAS, "lxml~=4.6"], + "pandas.GenericDataSet": [PANDAS], +} +pickle_require = {"pickle.PickleDataSet": ["compress-pickle[lz4]~=2.1.0"]} +pillow_require = {"pillow.ImageDataSet": ["Pillow~=9.0"]} +plotly_require = { + "plotly.PlotlyDataSet": [PANDAS, "plotly>=4.8.0, <6.0"], + "plotly.JSONDataSet": ["plotly>=4.8.0, <6.0"], +} +polars_require = {"polars.CSVDataSet": [POLARS]} +redis_require = {"redis.PickleDataSet": ["redis~=4.1"]} +snowflake_require = { + "snowflake.SnowparkTableDataSet": [ + "snowflake-snowpark-python~=1.0.0", + "pyarrow~=8.0", + ] +} +spark_require = { + "spark.SparkDataSet": [SPARK, HDFS, S3FS], + "spark.SparkHiveDataSet": [SPARK, HDFS, S3FS], + "spark.SparkJDBCDataSet": [SPARK, HDFS, S3FS], + "spark.DeltaTableDataSet": [SPARK, HDFS, S3FS, "delta-spark>=1.0, <3.0"], +} +svmlight_require = {"svmlight.SVMLightDataSet": ["scikit-learn~=1.0.2", "scipy~=1.7.3"]} +tensorflow_require = { + "tensorflow.TensorFlowModelDataSet": [ + # currently only TensorFlow V2 supported for saving and loading. + # V1 requires HDF5 and serialises differently + "tensorflow~=2.0; platform_system != 'Darwin' or platform_machine != 'arm64'", + # https://developer.apple.com/metal/tensorflow-plugin/ + "tensorflow-macos~=2.0; platform_system == 'Darwin' and platform_machine == 'arm64'", + ] +} +video_require = {"video.VideoDataSet": ["opencv-python~=4.5.5.64"]} +yaml_require = {"yaml.YAMLDataSet": [PANDAS, "PyYAML>=4.2, <7.0"]} + +extras_require = { + "api": _collect_requirements(api_require), + "biosequence": _collect_requirements(biosequence_require), + "dask": _collect_requirements(dask_require), + "databricks": _collect_requirements(databricks_require), + "geopandas": _collect_requirements(geopandas_require), + "holoviews": _collect_requirements(holoviews_require), + "matplotlib": _collect_requirements(matplotlib_require), + "networkx": _collect_requirements(networkx_require), + "pandas": _collect_requirements(pandas_require), + "pickle": _collect_requirements(pickle_require), + "pillow": _collect_requirements(pillow_require), + "plotly": _collect_requirements(plotly_require), + "polars": _collect_requirements(polars_require), + "redis": _collect_requirements(redis_require), + "snowflake": _collect_requirements(snowflake_require), + "spark": _collect_requirements(spark_require), + "svmlight": _collect_requirements(svmlight_require), + "tensorflow": _collect_requirements(tensorflow_require), + "video": _collect_requirements(video_require), + "yaml": _collect_requirements(yaml_require), + **api_require, + **biosequence_require, + **dask_require, + **databricks_require, + **geopandas_require, + **holoviews_require, + **matplotlib_require, + **networkx_require, + **pandas_require, + **pickle_require, + **pillow_require, + **plotly_require, + **polars_require, + **snowflake_require, + **spark_require, + **svmlight_require, + **tensorflow_require, + **video_require, + **yaml_require, +} + +extras_require["all"] = _collect_requirements(extras_require) +extras_require["docs"] = [ + # docutils>=0.17 changed the HTML + # see https://github.com/readthedocs/sphinx_rtd_theme/issues/1115 + "docutils==0.16", + "sphinx~=5.3.0", + "sphinx_rtd_theme==1.2.0", + # Regression on sphinx-autodoc-typehints 1.21 + # that creates some problematic docstrings + "sphinx-autodoc-typehints==1.20.2", + "sphinx_copybutton==0.3.1", + "sphinx-notfound-page", + "ipykernel>=5.3, <7.0", + "sphinxcontrib-mermaid~=0.7.1", + "myst-parser~=1.0.0", + "Jinja2<3.1.0", +] +extras_require["test"] = [ + "adlfs>=2021.7.1, <=2022.2", + "bandit>=1.6.2, <2.0", + "behave==1.2.6", + "biopython~=1.73", + "blacken-docs==1.9.2", + "black~=22.0", + "compress-pickle[lz4]~=1.2.0", + "coverage[toml]", + "dask[complete]", + "delta-spark~=1.2.1", + # 1.2.0 has a bug that breaks some of our tests: https://github.com/delta-io/delta/issues/1070 + "deltalake>=0.10.0", + "dill~=0.3.1", + "filelock>=3.4.0, <4.0", + "gcsfs>=2021.4, <=2022.1", + "geopandas>=0.6.0, <1.0", + "hdfs>=2.5.8, <3.0", + "holoviews~=1.13.0", + "import-linter[toml]==1.2.6", + "ipython>=7.31.1, <8.0", + "Jinja2<3.1.0", + "joblib>=0.14", + "jupyterlab~=3.0", + "jupyter~=1.0", + "lxml~=4.6", + "matplotlib>=3.0.3, <3.4; python_version < '3.10'", # 3.4.0 breaks holoviews + "matplotlib>=3.5, <3.6; python_version == '3.10'", + "memory_profiler>=0.50.0, <1.0", + "moto==1.3.7; python_version < '3.10'", + "moto==3.0.4; python_version == '3.10'", + "networkx~=2.4", + "opencv-python~=4.5.5.64", + "openpyxl>=3.0.3, <4.0", + "pandas-gbq>=0.12.0, <0.18.0", + "pandas>=1.3, <2", # 1.3 for read_xml/to_xml, <2 for compatibility with Spark < 3.4 + "Pillow~=9.0", + "plotly>=4.8.0, <6.0", + "polars~=0.15.13", + "pre-commit>=2.9.2, <3.0", # The hook `mypy` requires pre-commit version 2.9.2. + "psutil==5.8.0", + "pyarrow~=8.0", + "pylint>=2.5.2, <3.0", + "pyodbc~=4.0.35", + "pyproj~=3.0", + "pyspark>=2.2, <4.0", + "pytest-cov~=3.0", + "pytest-mock>=1.7.1, <2.0", + "pytest-xdist[psutil]~=2.2.1", + "pytest~=7.2", + "redis~=4.1", + "requests-mock~=1.6", + "requests~=2.20", + "s3fs>=0.3.0, <0.5", # Needs to be at least 0.3.0 to make use of `cachable` attribute on S3FileSystem. + "scikit-learn~=1.0.2", + "scipy~=1.7.3", + "snowflake-snowpark-python~=1.0.0; python_version == '3.8'", + "SQLAlchemy>=1.4, <3.0", + # The `Inspector.has_table()` method replaces the `Engine.has_table()` method in version 1.4. + "tables~=3.7", + "tensorflow-macos~=2.0; platform_system == 'Darwin' and platform_machine == 'arm64'", + "tensorflow~=2.0; platform_system != 'Darwin' or platform_machine != 'arm64'", + "triad>=0.6.7, <1.0", + "trufflehog~=2.1", + "xlsxwriter~=1.0", +] + +setup( + extras_require=extras_require, +)