diff --git a/docs/_toc.yml b/docs/_toc.yml index 38309dbcfc..9cd039b879 100644 --- a/docs/_toc.yml +++ b/docs/_toc.yml @@ -39,7 +39,7 @@ parts: - caption: User Reference chapters: - - file: source/reference/evaql + - file: source/reference/evaql title: Query Language sections: - file: source/reference/evaql/load_csv @@ -65,25 +65,26 @@ parts: - file: source/reference/api title: Python API - + - file: source/reference/rest_api title: REST API - file: source/reference/databases/index title: Data Sources - sections: + sections: - file: source/reference/databases/postgres - file: source/reference/databases/sqlite - file: source/reference/databases/mysql - file: source/reference/databases/mariadb - file: source/reference/databases/clickhouse - file: source/reference/databases/github + - file: source/reference/databases/arxiv - file: source/reference/databases/snowflake - file: source/reference/databases/hackernews - file: source/reference/vector_databases/index title: Vector Databases - sections: + sections: - file: source/reference/vector_databases/faiss - file: source/reference/vector_databases/chromadb - file: source/reference/vector_databases/qdrant @@ -106,9 +107,9 @@ parts: - file: source/reference/ai/hf title: Hugging Face - file: source/reference/ai/openai - title: OpenAI + title: OpenAI - file: source/reference/ai/yolo - title: YOLO + title: YOLO - file: source/reference/ai/stablediffusion title: Stable Diffusion @@ -117,7 +118,7 @@ parts: - file: source/reference/optimizations title: Optimizations - + # - file: source/reference/io # title: IO Descriptors diff --git a/docs/source/reference/databases/arxiv.rst b/docs/source/reference/databases/arxiv.rst new file mode 100644 index 0000000000..a949cf0c9c --- /dev/null +++ b/docs/source/reference/databases/arxiv.rst @@ -0,0 +1,53 @@ +Arxiv +========== + +The connection to Arxiv is based on the `Arxiv <https://pypi.org/project/arxiv/>`_ library.
+ +Dependency +---------- + +* Arxiv + + +Parameters +---------- + +Required: + +* ``query`` is the search query in the Arxiv repository. For example, Nuclear Physics. +* ``max_results`` is the max number of results to display. For example, 10. + +Create Connection +----------------- + +.. code-block:: text + + CREATE DATABASE arxiv_data WITH ENGINE = 'arxiv', PARAMETERS = { + "query": "Nuclear Physics", + "max_results": "10" + }; + +Supported Tables +---------------- + +* ``search_results``: Lists the relevant articles in the Arxiv repository. Check `table_column_info.py <https://github.com/georgia-tech-db/evadb/blob/staging/evadb/third_party/databases/arxiv/table_column_info.py>`_ for all the available columns in the table. + +.. code-block:: sql + + SELECT * FROM arxiv_data.search_results; + +Here is the query output: + +.. code-block:: + + +---------------------------------------------------+-----+---------------------------------------------+ + | search_results.title | ... | search_results.doi | + |---------------------------------------------------|-----|---------------------------------------------| + | Nuclear Symmetry Energy Extracted from Laborat... | ... | 10.1080/10619127.2017.1388681 | + | Neutrino astrophysics and its connections to n... | ... | 10.1088/1742-6596/1056/1/012060 | + | ... | ... | ... | + +---------------------------------------------------+-----+---------------------------------------------+ + +.. note:: + + Looking for another table from Arxiv? You can add a table mapping in `arxiv_handler.py <https://github.com/georgia-tech-db/evadb/blob/staging/evadb/third_party/databases/arxiv/arxiv_handler.py>`_, or simply raise a `Feature Request <https://github.com/georgia-tech-db/evadb/issues/new/choose>`_. diff --git a/evadb/third_party/databases/arxiv/__init__.py b/evadb/third_party/databases/arxiv/__init__.py new file mode 100644 index 0000000000..8e2f02ef4e --- /dev/null +++ b/evadb/third_party/databases/arxiv/__init__.py @@ -0,0 +1,15 @@ +# coding=utf-8 +# Copyright 2018-2023 EvaDB +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License.
class ArxivHandler(DBHandler):
    """
    EvaDB data-source handler that exposes arXiv search results as a table.

    A single table, ``search_results``, is supported. Its rows come from one
    ``arxiv.Search`` built from the ``query`` and ``max_results`` parameters
    given when the handler is created.
    """

    def __init__(self, name: str, **kwargs):
        """
        Initialize the handler.
        Args:
            name (str): name of the DB handler instance
            **kwargs: arbitrary keyword arguments for establishing the connection.
                ``query`` is the search string (defaults to ``""``) and
                ``max_results`` is passed through ``int()`` (defaults to ``0``).
        """
        super().__init__(name, **kwargs)
        self.query = kwargs.get("query", "")
        # Parameters may arrive as strings (e.g. "10"), hence the int() coercion.
        # NOTE(review): how the arxiv client treats max_results=0 is not visible
        # here -- confirm that the default is the intended one.
        self.max_results = int(kwargs.get("max_results", 0))

    @property
    def supported_table(self):
        """
        Mapping of table name to its column schema and row generator.

        A new mapping (and a new generator object) is built on every access.
        The generator body only runs once it is iterated, so merely reading
        this property does not call ``self.connection.results``.
        Returns:
            dict: ``{"search_results": {"columns": ..., "generator": ...}}``
        """

        def _arxiv_generator():
            search = arxiv.Search(query=self.query, max_results=self.max_results)
            for result in self.connection.results(search):
                # One dict per search result, keyed by the declared column names.
                yield {
                    column_name: getattr(result, column_name)
                    for column_name, _ in ARXIV_COLUMNS
                }

        return {
            "search_results": {
                "columns": ARXIV_COLUMNS,
                "generator": _arxiv_generator(),
            },
        }

    def connect(self):
        """
        Set up the connection required by the handler.
        Returns:
            DBHandlerStatus: ``status=True`` once an ``arxiv.Client`` has been
            created, otherwise ``status=False`` with the exception text.
        """
        try:
            self.connection = arxiv.Client()
            return DBHandlerStatus(status=True)
        except Exception as e:
            return DBHandlerStatus(status=False, error=str(e))

    def disconnect(self):
        """
        Close any existing connections.

        This is a no-op: the handler does not close or clear ``self.connection``.
        """

    def check_connection(self) -> DBHandlerStatus:
        """
        Check connection to the handler.
        Returns:
            DBHandlerStatus
        """
        if self.connection:
            return DBHandlerStatus(status=True)
        return DBHandlerStatus(status=False, error="Not connected to the database.")

    def get_tables(self) -> DBHandlerResponse:
        """
        Return the list of tables in the database.
        Returns:
            DBHandlerResponse: ``data`` is a DataFrame with a single
            ``table_name`` column, or ``None`` with ``error`` set on failure.
        """
        if not self.connection:
            # Same wording as check_connection/get_columns/select.
            return DBHandlerResponse(data=None, error="Not connected to the database.")

        try:
            tables_df = pd.DataFrame(
                list(self.supported_table.keys()), columns=["table_name"]
            )
            return DBHandlerResponse(data=tables_df)
        except Exception as e:
            return DBHandlerResponse(data=None, error=str(e))

    def get_columns(self, table_name: str) -> DBHandlerResponse:
        """
        Returns the list of columns for the given table.
        Args:
            table_name (str): name of the table whose columns are to be retrieved.
        Returns:
            DBHandlerResponse: ``data`` is a DataFrame with ``name`` and ``dtype``
            columns, or ``None`` with ``error`` set when the handler is not
            connected or the table is unknown.
        """
        if not self.connection:
            return DBHandlerResponse(data=None, error="Not connected to the database.")
        try:
            tables = self.supported_table
            if table_name not in tables:
                # Report unknown tables explicitly instead of surfacing the bare
                # KeyError text, matching the message used by select().
                return DBHandlerResponse(
                    data=None,
                    error="{} is not supported or does not exist.".format(table_name),
                )
            columns_df = pd.DataFrame(
                tables[table_name]["columns"], columns=["name", "dtype"]
            )
            return DBHandlerResponse(data=columns_df)
        except Exception as e:
            return DBHandlerResponse(data=None, error=str(e))

    def select(self, table_name: str) -> DBHandlerResponse:
        """
        Returns a generator that yields the data from the given table.
        Args:
            table_name (str): name of the table whose data is to be retrieved.
        Returns:
            DBHandlerResponse: ``data_generator`` yields one dict per row;
            ``error`` is set when the handler is not connected or the table
            is unknown.
        """
        if not self.connection:
            return DBHandlerResponse(data=None, error="Not connected to the database.")
        try:
            # Read the property once: every access builds a new mapping and generator.
            tables = self.supported_table
            if table_name not in tables:
                return DBHandlerResponse(
                    data=None,
                    error="{} is not supported or does not exist.".format(table_name),
                )
            # TODO: Projection column trimming optimization opportunity
            return DBHandlerResponse(
                data=None,
                data_generator=tables[table_name]["generator"],
            )
        except Exception as e:
            return DBHandlerResponse(data=None, error=str(e))
# Schema of the arXiv ``search_results`` table: one ``[column_name, python_type]``
# pair per column, in output order. Each name is also used as the attribute
# name read from an arXiv result object.
# NOTE(review): ``published``/``updated`` are declared as ``str``; confirm that
# matches the value type the arxiv library actually returns.
ARXIV_COLUMNS = [
    [column_name, column_type]
    for column_name, column_type in (
        ("title", str),
        ("entry_id", str),
        ("published", str),
        ("updated", str),
        ("summary", str),
        ("authors", object),
        ("comment", str),
        ("primary_category", str),
        ("journal_ref", str),
        ("doi", str),
    )
]
@pytest.mark.notparallel
class ArxivDataSourceTest(unittest.TestCase):
    """Integration test for the ``arxiv`` data-source engine."""

    def setUp(self):
        self.evadb = get_evadb_for_testing()
        # reset the catalog manager before running each test
        self.evadb.catalog().reset()

    def tearDown(self):
        # Drop the database created by the test so later tests start clean.
        execute_query_fetch_all(self.evadb, "DROP DATABASE IF EXISTS arxiv_data;")

    @pytest.mark.skip(
        reason="Need https://github.com/georgia-tech-db/evadb/pull/1280 for a cost-based rebatch optimization"
    )
    @pytest.mark.xfail(reason="Flaky testcase due to `bad request` error message")
    def test_should_run_select_query_in_arxiv(self):
        """Create an arxiv database, select from it, and check row count and columns."""
        # Create database.
        params = {
            "query": "Nuclear Physics",
            "max_results": "10",
        }
        query = f"""CREATE DATABASE arxiv_data
                    WITH ENGINE = "arxiv",
                    PARAMETERS = {params};"""
        execute_query_fetch_all(self.evadb, query)

        query = "SELECT * FROM arxiv_data.search_results LIMIT 10;"
        batch = execute_query_fetch_all(self.evadb, query)
        self.assertEqual(len(batch), 10)
        # Result columns are expected to be prefixed with the table name.
        expected_column = [f"search_results.{col}" for col, _ in ARXIV_COLUMNS]
        self.assertEqual(batch.columns, expected_column)