From 614d67698eb0c5814325491109f58ac4c2b06937 Mon Sep 17 00:00:00 2001 From: Aayush Acharya Date: Fri, 29 Dec 2023 22:34:13 +0545 Subject: [PATCH 1/4] feat: arXiv datasource addition, closes #1161 --- docs/source/reference/databases/arxiv.rst | 53 +++++++ evadb/third_party/databases/arxiv/__init__.py | 15 ++ .../databases/arxiv/arxiv_handler.py | 149 ++++++++++++++++++ .../databases/arxiv/table_column_info.py | 27 ++++ evadb/third_party/databases/interface.py | 2 + 5 files changed, 246 insertions(+) create mode 100644 docs/source/reference/databases/arxiv.rst create mode 100644 evadb/third_party/databases/arxiv/__init__.py create mode 100644 evadb/third_party/databases/arxiv/arxiv_handler.py create mode 100644 evadb/third_party/databases/arxiv/table_column_info.py diff --git a/docs/source/reference/databases/arxiv.rst b/docs/source/reference/databases/arxiv.rst new file mode 100644 index 0000000000..a949cf0c9c --- /dev/null +++ b/docs/source/reference/databases/arxiv.rst @@ -0,0 +1,53 @@ +Arxiv +========== + +The connection to Arxiv is based on the `Arxiv <https://github.com/lukasschwab/arxiv.py>`_ library. + +Dependency +---------- + +* Arxiv + + +Parameters +---------- + +Required: + +* ``query`` is the search query in the Arxiv repository. For example, Nuclear Physics. +* ``max_results`` is the maximum number of results to return. For example, 10. + +Create Connection +----------------- + +.. code-block:: text + + CREATE DATABASE arxiv_data WITH ENGINE = 'arxiv', PARAMETERS = { + "query": "Nuclear Physics", + "max_results": "10" + }; + +Supported Tables +---------------- + +* ``search_results``: Lists the relevant articles in the arxiv repository. Check `table_column_info.py <https://github.com/georgia-tech-db/evadb/blob/staging/evadb/third_party/databases/arxiv/table_column_info.py>`_ for all the available columns in the table. + +.. code-block:: sql + + SELECT * FROM arxiv_data.search_results; + +Here is the query output: + +.. code-block:: + + +---------------------------------------------------+-----+---------------------------------------------+ + | search_results.title | ... 
| search_results.doi | + |---------------------------------------------------|-----|---------------------------------------------| + | Nuclear Symmetry Energy Extracted from Laborat... | ... | 10.1080/10619127.2017.1388681 | + | Neutrino astrophysics and its connections to n... | ... | 10.1088/1742-6596/1056/1/012060 | + | ... | ... | ... | + +---------------------------------------------------+-----+---------------------------------------------+ + +.. note:: + + Looking for another table from Arxiv? You can add a table mapping in `arxiv_handler.py <https://github.com/georgia-tech-db/evadb/blob/staging/evadb/third_party/databases/arxiv/arxiv_handler.py>`_, or simply raise a `Feature Request <https://github.com/georgia-tech-db/evadb/issues/new/choose>`_. diff --git a/evadb/third_party/databases/arxiv/__init__.py b/evadb/third_party/databases/arxiv/__init__.py new file mode 100644 index 0000000000..7324d51f90 --- /dev/null +++ b/evadb/third_party/databases/arxiv/__init__.py @@ -0,0 +1,15 @@ +# coding=utf-8 +# Copyright 2018-2023 EvaDB +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""arxiv integration""" \ No newline at end of file diff --git a/evadb/third_party/databases/arxiv/arxiv_handler.py b/evadb/third_party/databases/arxiv/arxiv_handler.py new file mode 100644 index 0000000000..7670f82ea9 --- /dev/null +++ b/evadb/third_party/databases/arxiv/arxiv_handler.py @@ -0,0 +1,149 @@ +# coding=utf-8 +# Copyright 2018-2023 EvaDB +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import arxiv +import pandas as pd + +from evadb.third_party.databases.arxiv.table_column_info import ARXIV_COLUMNS +from evadb.third_party.databases.types import ( + DBHandler, + DBHandlerResponse, + DBHandlerStatus, +) + + +class ArxivHandler(DBHandler): + def __init__(self, name: str, **kwargs): + """ + Initialize the handler. + Args: + name (str): name of the DB handler instance + **kwargs: arbitrary keyword arguments for establishing the connection. + """ + super().__init__(name, **kwargs) + self.query=kwargs.get("query","") + self.max_results=int(kwargs.get("max_results",0)) + + @property + def supported_table(self): + def _arxiv_generator(): + for eachRow in self.connection.results(arxiv.Search( + query=self.query, + max_results=self.max_results + )): + yield { + property_name: getattr(eachRow, property_name) + for property_name, _ in ARXIV_COLUMNS + } + + mapping = { + "search_results": { + "columns": ARXIV_COLUMNS, + "generator": _arxiv_generator(), + }, + } + return mapping + + + + def connect(self): + """ + Set up the connection required by the handler. + Returns: + DBHandlerStatus + """ + try: + self.connection=arxiv.Client() + return DBHandlerStatus(status=True) + except Exception as e: + return DBHandlerStatus(status=False, error=str(e)) + + def disconnect(self): + """ + Close any existing connections. + """ + pass + + def check_connection(self) -> DBHandlerStatus: + """ + Check connection to the handler. 
+ Returns: + DBHandlerStatus + """ + if self.connection: + return DBHandlerStatus(status=True) + else: + return DBHandlerStatus(status=False, error="Not connected to the database.") + + def get_tables(self) -> DBHandlerResponse: + """ + Return the list of tables in the database. + Returns: + DBHandlerResponse + """ + if not self.connection: + return DBHandlerResponse(data=None, error="Not connected to the internet.") + + try: + tables_df = pd.DataFrame( + list(self.supported_table.keys()), columns=["table_name"] + ) + return DBHandlerResponse(data=tables_df) + except Exception as e: + return DBHandlerResponse(data=None, error=str(e)) + + def get_columns(self, table_name: str) -> DBHandlerResponse: + """ + Returns the list of columns for the given table. + Args: + table_name (str): name of the table whose columns are to be retrieved. + Returns: + DBHandlerResponse + """ + if not self.connection: + return DBHandlerResponse(data=None, error="Not connected to the database.") + try: + columns_df = pd.DataFrame( + self.supported_table[table_name]["columns"], columns=["name", "dtype"] + ) + return DBHandlerResponse(data=columns_df) + except Exception as e: + return DBHandlerResponse(data=None, error=str(e)) + + + + def select(self, table_name: str) -> DBHandlerResponse: + """ + Returns a generator that yields the data from the given table. + Args: + table_name (str): name of the table whose data is to be retrieved. 
+ Returns: + DBHandlerResponse + """ + if not self.connection: + return DBHandlerResponse(data=None, error="Not connected to the database.") + try: + if table_name not in self.supported_table: + return DBHandlerResponse( + data=None, + error="{} is not supported or does not exist.".format(table_name), + ) + # TODO: Projection column trimming optimization opportunity + return DBHandlerResponse( + data=None, + data_generator=self.supported_table[table_name]["generator"], + ) + except Exception as e: + return DBHandlerResponse(data=None, error=str(e)) + \ No newline at end of file diff --git a/evadb/third_party/databases/arxiv/table_column_info.py b/evadb/third_party/databases/arxiv/table_column_info.py new file mode 100644 index 0000000000..66b47b5144 --- /dev/null +++ b/evadb/third_party/databases/arxiv/table_column_info.py @@ -0,0 +1,27 @@ +# coding=utf-8 +# Copyright 2018-2023 EvaDB +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +ARXIV_COLUMNS=[ + ["title",str], + ["entry_id",str], + ["published",str], + ["updated",str], + ["summary",str], + ["authors",object], + ["comment",str], + ["primary_category",str], + ["journal_ref",str], + ["doi",str], +] \ No newline at end of file diff --git a/evadb/third_party/databases/interface.py b/evadb/third_party/databases/interface.py index cacb4110f1..eacb41c016 100644 --- a/evadb/third_party/databases/interface.py +++ b/evadb/third_party/databases/interface.py @@ -48,6 +48,8 @@ def _get_database_handler(engine: str, **kwargs): return mod.SnowFlakeDbHandler(engine, **kwargs) elif engine == "github": return mod.GithubHandler(engine, **kwargs) + elif engine == "arxiv": + return mod.ArxivHandler(engine,**kwargs) elif engine == "hackernews": return mod.HackernewsSearchHandler(engine, **kwargs) elif engine == "slack": From 4cabf08848080cc7bad30e3813ff271e94a460fa Mon Sep 17 00:00:00 2001 From: Aayush Acharya Date: Fri, 29 Dec 2023 22:39:40 +0545 Subject: [PATCH 2/4] fix: included arxiv in toctreeg --- docs/_toc.yml | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/docs/_toc.yml b/docs/_toc.yml index 38309dbcfc..9cd039b879 100644 --- a/docs/_toc.yml +++ b/docs/_toc.yml @@ -39,7 +39,7 @@ parts: - caption: User Reference chapters: - - file: source/reference/evaql + - file: source/reference/evaql title: Query Language sections: - file: source/reference/evaql/load_csv @@ -65,25 +65,26 @@ parts: - file: source/reference/api title: Python API - + - file: source/reference/rest_api title: REST API - file: source/reference/databases/index title: Data Sources - sections: + sections: - file: source/reference/databases/postgres - file: source/reference/databases/sqlite - file: source/reference/databases/mysql - file: source/reference/databases/mariadb - file: source/reference/databases/clickhouse - file: source/reference/databases/github + - file: source/reference/databases/arxiv - file: source/reference/databases/snowflake - file: 
source/reference/databases/hackernews - file: source/reference/vector_databases/index title: Vector Databases - sections: + sections: - file: source/reference/vector_databases/faiss - file: source/reference/vector_databases/chromadb - file: source/reference/vector_databases/qdrant @@ -106,9 +107,9 @@ parts: - file: source/reference/ai/hf title: Hugging Face - file: source/reference/ai/openai - title: OpenAI + title: OpenAI - file: source/reference/ai/yolo - title: YOLO + title: YOLO - file: source/reference/ai/stablediffusion title: Stable Diffusion @@ -117,7 +118,7 @@ parts: - file: source/reference/optimizations title: Optimizations - + # - file: source/reference/io # title: IO Descriptors From 85e60534d392dd52b285a66c4b0df1a873bb0e0c Mon Sep 17 00:00:00 2001 From: Aayush Acharya Date: Fri, 29 Dec 2023 22:54:53 +0545 Subject: [PATCH 3/4] chore: formatted arxiv third party db code --- evadb/third_party/databases/arxiv/__init__.py | 2 +- .../databases/arxiv/arxiv_handler.py | 26 +++++++------------ .../databases/arxiv/table_column_info.py | 24 ++++++++--------- evadb/third_party/databases/interface.py | 2 +- 4 files changed, 24 insertions(+), 30 deletions(-) diff --git a/evadb/third_party/databases/arxiv/__init__.py b/evadb/third_party/databases/arxiv/__init__.py index 7324d51f90..8e2f02ef4e 100644 --- a/evadb/third_party/databases/arxiv/__init__.py +++ b/evadb/third_party/databases/arxiv/__init__.py @@ -12,4 +12,4 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
-"""arxiv integration""" \ No newline at end of file +"""arxiv integration""" diff --git a/evadb/third_party/databases/arxiv/arxiv_handler.py b/evadb/third_party/databases/arxiv/arxiv_handler.py index 7670f82ea9..b64107a8e4 100644 --- a/evadb/third_party/databases/arxiv/arxiv_handler.py +++ b/evadb/third_party/databases/arxiv/arxiv_handler.py @@ -32,16 +32,15 @@ def __init__(self, name: str, **kwargs): **kwargs: arbitrary keyword arguments for establishing the connection. """ super().__init__(name, **kwargs) - self.query=kwargs.get("query","") - self.max_results=int(kwargs.get("max_results",0)) + self.query = kwargs.get("query", "") + self.max_results = int(kwargs.get("max_results", 0)) @property def supported_table(self): def _arxiv_generator(): - for eachRow in self.connection.results(arxiv.Search( - query=self.query, - max_results=self.max_results - )): + for eachRow in self.connection.results( + arxiv.Search(query=self.query, max_results=self.max_results) + ): yield { property_name: getattr(eachRow, property_name) for property_name, _ in ARXIV_COLUMNS @@ -55,8 +54,6 @@ def _arxiv_generator(): } return mapping - - def connect(self): """ Set up the connection required by the handler. @@ -64,17 +61,17 @@ def connect(self): DBHandlerStatus """ try: - self.connection=arxiv.Client() + self.connection = arxiv.Client() return DBHandlerStatus(status=True) except Exception as e: return DBHandlerStatus(status=False, error=str(e)) - + def disconnect(self): """ Close any existing connections. """ pass - + def check_connection(self) -> DBHandlerStatus: """ Check connection to the handler. @@ -85,7 +82,7 @@ def check_connection(self) -> DBHandlerStatus: return DBHandlerStatus(status=True) else: return DBHandlerStatus(status=False, error="Not connected to the database.") - + def get_tables(self) -> DBHandlerResponse: """ Return the list of tables in the database. 
@@ -102,7 +99,7 @@ def get_tables(self) -> DBHandlerResponse: return DBHandlerResponse(data=tables_df) except Exception as e: return DBHandlerResponse(data=None, error=str(e)) - + def get_columns(self, table_name: str) -> DBHandlerResponse: """ Returns the list of columns for the given table. @@ -120,8 +117,6 @@ def get_columns(self, table_name: str) -> DBHandlerResponse: return DBHandlerResponse(data=columns_df) except Exception as e: return DBHandlerResponse(data=None, error=str(e)) - - def select(self, table_name: str) -> DBHandlerResponse: """ @@ -146,4 +141,3 @@ def select(self, table_name: str) -> DBHandlerResponse: ) except Exception as e: return DBHandlerResponse(data=None, error=str(e)) - \ No newline at end of file diff --git a/evadb/third_party/databases/arxiv/table_column_info.py b/evadb/third_party/databases/arxiv/table_column_info.py index 66b47b5144..49041314b4 100644 --- a/evadb/third_party/databases/arxiv/table_column_info.py +++ b/evadb/third_party/databases/arxiv/table_column_info.py @@ -13,15 +13,15 @@ # See the License for the specific language governing permissions and # limitations under the License. 
-ARXIV_COLUMNS=[ - ["title",str], - ["entry_id",str], - ["published",str], - ["updated",str], - ["summary",str], - ["authors",object], - ["comment",str], - ["primary_category",str], - ["journal_ref",str], - ["doi",str], -] \ No newline at end of file +ARXIV_COLUMNS = [ + ["title", str], + ["entry_id", str], + ["published", str], + ["updated", str], + ["summary", str], + ["authors", object], + ["comment", str], + ["primary_category", str], + ["journal_ref", str], + ["doi", str], +] diff --git a/evadb/third_party/databases/interface.py b/evadb/third_party/databases/interface.py index eacb41c016..49b8aed44a 100644 --- a/evadb/third_party/databases/interface.py +++ b/evadb/third_party/databases/interface.py @@ -49,7 +49,7 @@ def _get_database_handler(engine: str, **kwargs): elif engine == "github": return mod.GithubHandler(engine, **kwargs) elif engine == "arxiv": - return mod.ArxivHandler(engine,**kwargs) + return mod.ArxivHandler(engine, **kwargs) elif engine == "hackernews": return mod.HackernewsSearchHandler(engine, **kwargs) elif engine == "slack": From e0c584684cfad8f0f846522ed57af6eb083af754 Mon Sep 17 00:00:00 2001 From: Aayush Acharya Date: Fri, 29 Dec 2023 23:17:51 +0545 Subject: [PATCH 4/4] feat: added arxiv datasource test case --- .../long/test_arxiv_datasource.py | 59 +++++++++++++++++++ 1 file changed, 59 insertions(+) create mode 100644 test/integration_tests/long/test_arxiv_datasource.py diff --git a/test/integration_tests/long/test_arxiv_datasource.py b/test/integration_tests/long/test_arxiv_datasource.py new file mode 100644 index 0000000000..fdeb86b8be --- /dev/null +++ b/test/integration_tests/long/test_arxiv_datasource.py @@ -0,0 +1,59 @@ +# coding=utf-8 +# Copyright 2018-2023 EvaDB +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import unittest +from test.util import get_evadb_for_testing + +import pytest + +from evadb.server.command_handler import execute_query_fetch_all +from evadb.third_party.databases.arxiv.table_column_info import ARXIV_COLUMNS + + +@pytest.mark.notparallel +class ArxivDataSourceTest(unittest.TestCase): + def setUp(self): + self.evadb = get_evadb_for_testing() + # reset the catalog manager before running each test + self.evadb.catalog().reset() + + def tearDown(self): + execute_query_fetch_all(self.evadb, "DROP DATABASE IF EXISTS arxiv_data;") + + @pytest.mark.skip( + reason="Need https://github.com/georgia-tech-db/evadb/pull/1280 for a cost-based rebatch optimization" + ) + @pytest.mark.xfail(reason="Flaky testcase due to `bad request` error message") + def test_should_run_select_query_in_arxiv(self): + # Create database. + params = { + "query": "Nuclear Physics", + "max_results": "10", + } + query = f"""CREATE DATABASE arxiv_data + WITH ENGINE = "arxiv", + PARAMETERS = {params};""" + execute_query_fetch_all(self.evadb, query) + + query = "SELECT * FROM arxiv_data.search_results LIMIT 10;" + batch = execute_query_fetch_all(self.evadb, query) + self.assertEqual(len(batch), 10) + expected_column = list( + ["search_results.{}".format(col) for col, _ in ARXIV_COLUMNS] + ) + self.assertEqual(batch.columns, expected_column) + + +if __name__ == "__main__": + unittest.main()