From 0d90ff321a21e83cc4618238e991cee43f586433 Mon Sep 17 00:00:00 2001
From: Max Wang
Date: Thu, 9 Nov 2023 15:45:25 -0500
Subject: [PATCH 1/3] Update to become the Benchmark Runner

---
 .github/workflows/pypi.yml | 55 ++++++++++++++++++++++++++++++++++++++
 benchmarks/__init__.py     |  1 +
 benchmarks/cli/eval.py     | 46 ++++++++++++++++++++++++-------
 benchmarks/cli/fetch.py    |  7 ++---
 benchmarks/cli/score.py    |  7 +++--
 benchmarks/main.py         | 36 +++++++++++++++++++++++++
 benchmarks/request.py      | 32 +++++++++++-----------
 setup.py                   | 12 ++++++++-
 8 files changed, 163 insertions(+), 33 deletions(-)
 create mode 100644 .github/workflows/pypi.yml
 create mode 100644 benchmarks/main.py

diff --git a/.github/workflows/pypi.yml b/.github/workflows/pypi.yml
new file mode 100644
index 0000000..d506497
--- /dev/null
+++ b/.github/workflows/pypi.yml
@@ -0,0 +1,55 @@
+name: pypi
+
+on:
+  push:
+    tags:
+      - v*
+
+jobs:
+  build:
+    name: Build Python package
+    runs-on: ubuntu-latest
+    permissions:
+      id-token: write
+    steps:
+      - name: Check out the repo
+        uses: actions/checkout@v4
+        with:
+          ref: ${{ github.event.release.target_commitish }}
+      - name: Set up Python 3.9
+        uses: actions/setup-python@v4
+        with:
+          python-version: 3.9
+      - name: Install build
+        run: >-
+          python -m
+          pip install
+          build
+          --user
+      - name: Build a binary wheel and a source tarball
+        run: >-
+          python -m
+          build
+      - name: Store the distribution packages
+        uses: actions/upload-artifact@v3
+        with:
+          name: python-package-distributions
+          path: dist/
+  publish-to-pypi:
+    name: Publish Python package to PyPI
+    needs:
+      - build
+    runs-on: ubuntu-latest
+    environment:
+      name: pypi
+      url: https://pypi.org/p/benchmarks-runner
+    permissions:
+      id-token: write
+    steps:
+      - name: Download all the dists
+        uses: actions/download-artifact@v3
+        with:
+          name: python-package-distributions
+          path: dist/
+      - name: Publish distribution 📦 to PyPI
+        uses: pypa/gh-action-pypi-publish@release/v1
diff --git a/benchmarks/__init__.py b/benchmarks/__init__.py
index e69de29..aca69e3 100644
--- a/benchmarks/__init__.py
+++ b/benchmarks/__init__.py
@@ -0,0 +1 @@
+from .main import run_benchmarks
\ No newline at end of file
diff --git a/benchmarks/cli/eval.py b/benchmarks/cli/eval.py
index f5ab7c8..8735b67 100644
--- a/benchmarks/cli/eval.py
+++ b/benchmarks/cli/eval.py
@@ -1,4 +1,6 @@
 from argparse import ArgumentParser
+import base64
+import io
 from matplotlib import pyplot as plt
 from pathlib import Path
 import os
@@ -17,40 +19,63 @@
     'Mean Reciprocal Rank\t': 'mean_reciprocal_rank'
 }
 
-def evaluate_ara_results(results_dir, args):
+def evaluate_ara_results(
+    benchmark,
+    results_dir,
+    k: int = 20,
+    save_plots: bool = False,
+    save_json: bool = False,
+):
     results = evaluate_results(
-        args.benchmark,
+        benchmark,
         results_dir,
-        k=args.k
+        k=k,
     )
+    imgs = {}
 
-    if args.plots:
+    if save_plots:
         plots_dir = Path(results_dir)
         assert plots_dir.exists(), f"{plots_dir} does not exist."
 
         results.plot_precision()
         plt.gcf().savefig(plots_dir / 'precision.png')
+        with io.BytesIO() as buffer:
+            plt.gcf().savefig(buffer, format="png")
+            buffer.seek(0)
+            imgs["precision"] = base64.b64encode(buffer.read()).decode()
 
         results.plot_recall()
         plt.gcf().savefig(plots_dir / 'recall.png')
+        with io.BytesIO() as buffer:
+            plt.gcf().savefig(buffer, format="png")
+            buffer.seek(0)
+            imgs["recall"] = base64.b64encode(buffer.read()).decode()
 
         results.plot_mAP()
         plt.gcf().savefig(plots_dir / 'mAP.png')
+        with io.BytesIO() as buffer:
+            plt.gcf().savefig(buffer, format="png")
+            buffer.seek(0)
+            imgs["mAP"] = base64.b64encode(buffer.read()).decode()
 
         results.plot_top_k_accuracy()
         plt.gcf().savefig(plots_dir / 'top_k_accuracy.png')
+        with io.BytesIO() as buffer:
+            plt.gcf().savefig(buffer, format="png")
+            buffer.seek(0)
+            imgs["top_k_accuracy"] = base64.b64encode(buffer.read()).decode()
 
     ks = [1, 5, 10, 20, 50, 100, 200, 500]
-    while ks[-1] >= args.k:
+    while ks[-1] >= k:
         ks.pop()
-    ks.append(args.k)
+    ks.append(k)
 
-    if args.json:
+    if save_json:
         results.to_json(results_dir)
 
     output = [
         '',
-        "Benchmark: {}".format(args.benchmark),
+        "Benchmark: {}".format(benchmark),
         "Results Directory: {}\n".format(results_dir),
         "\t\t\t{}".format('\t'.join(['k={}'.format(k) for k in ks]))
     ]
@@ -69,6 +94,7 @@ def evaluate_ara_results(results_dir, args):
         output.append('')
 
     print("\n".join(output))
+    return results.output_dict, imgs
 
 def main():
     parser = ArgumentParser(description="Run a benchmark on a set of results.")
@@ -109,6 +135,6 @@ def main():
         # evaluate results for each ARA
         for ara in [ara for ara in os.listdir(args.results_dir) if os.path.isdir(args.results_dir)]:
             results_dir = os.path.join(args.results_dir, ara)
-            evaluate_ara_results(results_dir, args)
+            evaluate_ara_results(args.benchmark, results_dir, args.k, args.plots, args.json)
     else:
-        evaluate_ara_results(args.results_dir, args)
+        evaluate_ara_results(args.benchmark, args.results_dir, args.k, args.plots, args.json)
diff --git a/benchmarks/cli/fetch.py b/benchmarks/cli/fetch.py
index fe21621..1c1e926 100644
--- a/benchmarks/cli/fetch.py
+++ b/benchmarks/cli/fetch.py
@@ -1,4 +1,5 @@
 from argparse import ArgumentParser
+import asyncio
 
 from benchmarks.request import fetch_results
 
@@ -38,13 +39,13 @@ def main():
     )
     args = parser.parse_args()
 
-    fetch_results(
+    output_dir = asyncio.run(fetch_results(
         args.benchmark,
         args.target,
         args.results_dir,
         overwrite=args.overwrite,
         scored=not args.unscored,
         num_concurrent_requests=args.n
-    )
-
+    ))
+    print(f"Results saved to: {output_dir}")
diff --git a/benchmarks/cli/score.py b/benchmarks/cli/score.py
index 44f5513..b98f50f 100644
--- a/benchmarks/cli/score.py
+++ b/benchmarks/cli/score.py
@@ -1,4 +1,5 @@
 from argparse import ArgumentParser
+import asyncio
 
 from benchmarks.request import score_results
 
@@ -28,11 +29,9 @@ def main():
     )
     args = parser.parse_args()
 
-    score_results(
+    asyncio.run(score_results(
         args.unscored_results_dir,
         args.target,
         args.scored_results_dir,
         num_concurrent_requests=args.n
-    )
-
-
+    ))
diff --git a/benchmarks/main.py b/benchmarks/main.py
new file mode 100644
index 0000000..a8eac3d
--- /dev/null
+++ b/benchmarks/main.py
@@ -0,0 +1,36 @@
+"""Main Benchmarks Test Runner entry."""
+import asyncio
+import os
+import tempfile
+
+from benchmarks.request import fetch_results
+from benchmarks.cli.eval import evaluate_ara_results
+
+
+async def run_benchmarks(
+    benchmark: str,
+    target: str,
+):
+    """Run benchmark tests."""
+    with tempfile.TemporaryDirectory() as tmpdir:
+        output_dir = await fetch_results(benchmark, target, tmpdir)
+        output_dict = {}
+        output_imgs = {}
+        if target == 'ars':
+            # evaluate results for each ARA
+            for ara in [ara for ara in os.listdir(output_dir) if os.path.isdir(os.path.join(output_dir, ara))]:
+                results_dir = os.path.join(output_dir, ara)
+                ara_output_dict, ara_imgs = evaluate_ara_results(benchmark, results_dir, save_plots=True)
+                output_dict[ara], output_imgs[ara] = ara_output_dict, ara_imgs
+        else:
+            output_dict, imgs = evaluate_ara_results(benchmark, output_dir, save_plots=True)
+            output_imgs[target] = imgs
+
+    return output_dict, output_imgs
+
+
+if __name__ == "__main__":
+    asyncio.run(run_benchmarks(
+        "ameliorates",
+        "aragorn",
+    ))
diff --git a/benchmarks/request.py b/benchmarks/request.py
index 5382ed4..76cd266 100644
--- a/benchmarks/request.py
+++ b/benchmarks/request.py
@@ -18,7 +18,7 @@
 # double the ARS timeout, just in case. The ARS should set all queries to error after 5 mins
 MAX_QUERY_TIME = os.getenv("MAX_QUERY_TIME", 600)
 
-def fetch_results(
+async def fetch_results(
     benchmark: str,
     target: str,
     results_dir: str,
@@ -82,7 +82,7 @@ def fetch_results(
     Path(output_dir).mkdir(parents=True, exist_ok=True)
 
     if target == "ars":
-        send_requests_to_ars(
+        await send_requests_to_ars(
            uids,
            messages,
            url,
@@ -93,7 +93,7 @@ def fetch_results(
 
 
     else:
-        send_requests_store_results(
+        await send_requests_store_results(
            uids,
            messages,
            url,
@@ -101,9 +101,11 @@ def fetch_results(
            num_concurrent_requests,
            progress,
         )
+
+    return output_dir
 
 
-def send_requests_to_ars(
+async def send_requests_to_ars(
     uids: Sequence[str],
     messages: Sequence[dict],
     url: str,
@@ -116,7 +118,7 @@
         send_request_to_ars(uid, msg, url, output_dir, pbar)
         for uid, msg in zip(uids, messages)
     ]
-    asyncio.run(gather(*coroutines, limit=num_concurrent_requests))
+    await gather(*coroutines, limit=num_concurrent_requests)
     if pbar is not None:
         pbar.close()
 
@@ -190,18 +192,18 @@
     response = await send_request(uid, f"{url}/messages/{parent_pk}", msg, request_type="get")
     merged_pk = response.get("fields", {}).get("merged_version")
     if merged_pk is None:
-        raise Exception("Failed to get the ARS merged message.")
-
-    merged_message = await send_request(uid, f"{url}/messages/{merged_pk}", msg, request_type="get")
-    Path(os.path.join(output_dir, "ars")).mkdir(parents=True, exist_ok=True)
-    with open(os.path.join(output_dir, "ars", f"{uid}.json"), "w") as file:
-        json.dump(merged_message, file)
+        print(f"Failed to get the ARS merged message from pk: {parent_pk}.")
+    else:
+        merged_message = await send_request(uid, f"{url}/messages/{merged_pk}", msg, request_type="get")
+        Path(os.path.join(output_dir, "ars")).mkdir(parents=True, exist_ok=True)
+        with open(os.path.join(output_dir, "ars", f"{uid}.json"), "w") as file:
+            json.dump(merged_message, file)
 
     if pbar:
         pbar.update()
 
 
-def score_results(
+async def score_results(
     unscored_results_dir: str,
     target: str,
     scored_results_dir: str,
@@ -241,7 +243,7 @@
         message['workflow'] = workflow
         messages.append(message)
 
-    send_requests_store_results(
+    await send_requests_store_results(
         uids,
         messages,
         url,
@@ -250,7 +252,7 @@
         output_dir,
         num_concurrent_requests,
         progress
     )
 
-def send_requests_store_results(
+async def send_requests_store_results(
     uids: Sequence[str],
     messages: Sequence[dict],
     url: str,
@@ -263,7 +265,7 @@
         send_request_store_result(uid, msg, url, output_dir, pbar)
         for uid, msg in zip(uids, messages)
     ]
-    asyncio.run(gather(*coroutines, limit=num_concurrent_requests))
+    await gather(*coroutines, limit=num_concurrent_requests)
     if pbar is not None:
         pbar.close()
diff --git a/setup.py b/setup.py
index 1f0c989..03c3e90 100644
--- a/setup.py
+++ b/setup.py
@@ -1,15 +1,25 @@
 from setuptools import find_packages, setup
 
+with open("README.md", encoding="utf-8") as readme_file:
+    readme = readme_file.read()
+
 setup(
     name='benchmarks',
     version='0.1.0',
+    author="Max Wang",
+    author_email="max@covar.com",
+    url="https://github.com/TranslatorSRI/Benchmarks",
+    description="Translator Benchmarks Runner",
+    long_description_content_type="text/markdown",
+    long_description=readme,
+    include_package_data=True,
     packages=find_packages(),
     install_requires=[
         'httpx',
         'matplotlib',
         'numpy',
         'requests',
-        'tqdm'
+        'tqdm',
     ],
     entry_points={
         'console_scripts': [

From ece9598de2933a133a93a0111b0a44b6689c8da1 Mon Sep 17 00:00:00 2001
From: Max Wang
Date: Fri, 10 Nov 2023 09:41:05 -0500
Subject: [PATCH 2/3] Rename main package to benchmark-runner and update readme

---
 README.md                                            | 11 +++++++++++
 {benchmarks => benchmarks_runner}/__init__.py        |  0
 {benchmarks => benchmarks_runner}/cli/__init__.py    |  0
 {benchmarks => benchmarks_runner}/cli/eval.py        |  2 +-
 {benchmarks => benchmarks_runner}/cli/fetch.py       |  2 +-
 {benchmarks => benchmarks_runner}/cli/score.py       |  2 +-
 {benchmarks => benchmarks_runner}/eval.py            |  2 +-
 {benchmarks => benchmarks_runner}/main.py            |  4 ++--
 {benchmarks => benchmarks_runner}/request.py         |  0
 {benchmarks => benchmarks_runner}/utils/__init__.py  |  0
 {benchmarks => benchmarks_runner}/utils/asyncio.py   |  0
 {benchmarks => benchmarks_runner}/utils/benchmark.py |  2 +-
 {benchmarks => benchmarks_runner}/utils/constants.py |  0
 {benchmarks => benchmarks_runner}/utils/normalize.py |  0
 setup.py                                             |  2 +-
 15 files changed, 19 insertions(+), 8 deletions(-)
 rename {benchmarks => benchmarks_runner}/__init__.py (100%)
 rename {benchmarks => benchmarks_runner}/cli/__init__.py (100%)
 rename {benchmarks => benchmarks_runner}/cli/eval.py (98%)
 rename {benchmarks => benchmarks_runner}/cli/fetch.py (96%)
 rename {benchmarks => benchmarks_runner}/cli/score.py (94%)
 rename {benchmarks => benchmarks_runner}/eval.py (99%)
 rename {benchmarks => benchmarks_runner}/main.py (90%)
 rename {benchmarks => benchmarks_runner}/request.py (100%)
 rename {benchmarks => benchmarks_runner}/utils/__init__.py (100%)
 rename {benchmarks => benchmarks_runner}/utils/asyncio.py (100%)
 rename {benchmarks => benchmarks_runner}/utils/benchmark.py (99%)
 rename {benchmarks => benchmarks_runner}/utils/constants.py (100%)
 rename {benchmarks => benchmarks_runner}/utils/normalize.py (100%)

diff --git a/README.md b/README.md
index e71caa6..37a7835 100644
--- a/README.md
+++ b/README.md
@@ -63,3 +63,14 @@ _Requires python 3.9._
 - Install dependencies: `pip install -r requirements.txt`
 - Start the frontend server: `python server.py`
 - Open in your browser
+
+## Benchmark Runner
+The benchmarks can be installed from pypi and used as part of the Translator-wide automated testing.
+- `pip install benchmarks-runner`
+To run benchmarks:
+```python
+from benchmarks_runner import run_benchmarks
+
+run_benchmarks(benchmark, target)
+```
+where `benchmark` is the name of a benchmark specified in config/benchmarks.json and `target` is the name of a target specified in config/targets.json.
diff --git a/benchmarks/__init__.py b/benchmarks_runner/__init__.py
similarity index 100%
rename from benchmarks/__init__.py
rename to benchmarks_runner/__init__.py
diff --git a/benchmarks/cli/__init__.py b/benchmarks_runner/cli/__init__.py
similarity index 100%
rename from benchmarks/cli/__init__.py
rename to benchmarks_runner/cli/__init__.py
diff --git a/benchmarks/cli/eval.py b/benchmarks_runner/cli/eval.py
similarity index 98%
rename from benchmarks/cli/eval.py
rename to benchmarks_runner/cli/eval.py
index 8735b67..725da26 100644
--- a/benchmarks/cli/eval.py
+++ b/benchmarks_runner/cli/eval.py
@@ -6,7 +6,7 @@
 import os
 
 
-from benchmarks.eval import evaluate_results
+from benchmarks_runner.eval import evaluate_results
 
 metrics_at_k = {
     'Precision @ k\t\t': 'precision_at_k',
diff --git a/benchmarks/cli/fetch.py b/benchmarks_runner/cli/fetch.py
similarity index 96%
rename from benchmarks/cli/fetch.py
rename to benchmarks_runner/cli/fetch.py
index 1c1e926..135665a 100644
--- a/benchmarks/cli/fetch.py
+++ b/benchmarks_runner/cli/fetch.py
@@ -1,7 +1,7 @@
 from argparse import ArgumentParser
 import asyncio
 
-from benchmarks.request import fetch_results
+from benchmarks_runner.request import fetch_results
 
 
 def main():
diff --git a/benchmarks/cli/score.py b/benchmarks_runner/cli/score.py
similarity index 94%
rename from benchmarks/cli/score.py
rename to benchmarks_runner/cli/score.py
index b98f50f..e629559 100644
--- a/benchmarks/cli/score.py
+++ b/benchmarks_runner/cli/score.py
@@ -1,7 +1,7 @@
 from argparse import ArgumentParser
 import asyncio
 
-from benchmarks.request import score_results
+from benchmarks_runner.request import score_results
 
 
 def main():
diff --git a/benchmarks/eval.py b/benchmarks_runner/eval.py
similarity index 99%
rename from benchmarks/eval.py
rename to benchmarks_runner/eval.py
index f0f70cb..8b42407 100644
--- a/benchmarks/eval.py
+++ b/benchmarks_runner/eval.py
@@ -11,7 +11,7 @@
 from matplotlib import pyplot as plt
 from matplotlib.axes import Axes
 
-from benchmarks.utils.benchmark import benchmark_ground_truth
+from benchmarks_runner.utils.benchmark import benchmark_ground_truth
 
 
 def evaluate_results(
diff --git a/benchmarks/main.py b/benchmarks_runner/main.py
similarity index 90%
rename from benchmarks/main.py
rename to benchmarks_runner/main.py
index a8eac3d..ff094c1 100644
--- a/benchmarks/main.py
+++ b/benchmarks_runner/main.py
@@ -3,8 +3,8 @@
 import os
 import tempfile
 
-from benchmarks.request import fetch_results
-from benchmarks.cli.eval import evaluate_ara_results
+from benchmarks_runner.request import fetch_results
+from benchmarks_runner.cli.eval import evaluate_ara_results
 
 
 async def run_benchmarks(
diff --git a/benchmarks/request.py b/benchmarks_runner/request.py
similarity index 100%
rename from benchmarks/request.py
rename to benchmarks_runner/request.py
diff --git a/benchmarks/utils/__init__.py b/benchmarks_runner/utils/__init__.py
similarity index 100%
rename from benchmarks/utils/__init__.py
rename to benchmarks_runner/utils/__init__.py
diff --git a/benchmarks/utils/asyncio.py b/benchmarks_runner/utils/asyncio.py
similarity index 100%
rename from benchmarks/utils/asyncio.py
rename to benchmarks_runner/utils/asyncio.py
diff --git a/benchmarks/utils/benchmark.py b/benchmarks_runner/utils/benchmark.py
similarity index 99%
rename from benchmarks/utils/benchmark.py
rename to benchmarks_runner/utils/benchmark.py
index abb65a3..fb2750b 100644
--- a/benchmarks/utils/benchmark.py
+++ b/benchmarks_runner/utils/benchmark.py
@@ -5,7 +5,7 @@
 from copy import deepcopy
 from typing import Dict, List, Sequence, Tuple
 
-from benchmarks.utils.constants import BENCHMARKS, CONFIG_DIR
+from benchmarks_runner.utils.constants import BENCHMARKS, CONFIG_DIR
 
 from .normalize import get_normalizer
 
diff --git a/benchmarks/utils/constants.py b/benchmarks_runner/utils/constants.py
similarity index 100%
rename from benchmarks/utils/constants.py
rename to benchmarks_runner/utils/constants.py
diff --git a/benchmarks/utils/normalize.py b/benchmarks_runner/utils/normalize.py
similarity index 100%
rename from benchmarks/utils/normalize.py
rename to benchmarks_runner/utils/normalize.py
diff --git a/setup.py b/setup.py
index 03c3e90..48c138a 100644
--- a/setup.py
+++ b/setup.py
@@ -4,7 +4,7 @@
     readme = readme_file.read()
 
 setup(
-    name='benchmarks',
+    name='benchmarks-runner',
     version='0.1.0',
     author="Max Wang",
     author_email="max@covar.com",

From 7577b1a7b2d0b72762231e7aa235b8b3d767bfc3 Mon Sep 17 00:00:00 2001
From: Max Wang
Date: Mon, 13 Nov 2023 08:49:10 -0500
Subject: [PATCH 3/3] Fix issues from Andrew's feedback

---
 README.md        | 15 ++++++++++++++-
 requirements.txt |  1 +
 setup.py         |  6 +++---
 3 files changed, 18 insertions(+), 4 deletions(-)

diff --git a/README.md b/README.md
index 37a7835..059975b 100644
--- a/README.md
+++ b/README.md
@@ -69,8 +69,21 @@ The benchmarks can be installed from pypi and used as part of the Translator-wid
 - `pip install benchmarks-runner`
 To run benchmarks:
 ```python
+import asyncio
 from benchmarks_runner import run_benchmarks
 
-run_benchmarks(benchmark, target)
+output = asyncio.run(run_benchmarks(benchmark, target))
 ```
 where `benchmark` is the name of a benchmark specified in config/benchmarks.json and `target` is the name of a target specified in config/targets.json.
+
+### Sample Output
+```
+Benchmark: GTRx
+Results Directory: /tmp/tmpaf10m9_q/GTRx/bte/2023-11-10_13-03-11
+                        k=1      k=5      k=10     k=20
+Precision @ k           0.0000   0.0500   0.0250   0.0125
+Recall @ k              0.0000   0.2500   0.2500   0.2500
+mAP @ k                 0.0000   0.0833   0.0833   0.0833
+Top-k Accuracy          0.0000   0.2500   0.2500   0.2500
+Mean Reciprocal Rank    0.08333333333333333
+```
diff --git a/requirements.txt b/requirements.txt
index 2d5dc23..171ccfc 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -3,6 +3,7 @@ httpx
 matplotlib
 numpy
 python-dotenv
+reasoner_pydantic
 requests
 tqdm
 uvicorn
diff --git a/setup.py b/setup.py
index 48c138a..54dfcba 100644
--- a/setup.py
+++ b/setup.py
@@ -24,11 +24,11 @@
     entry_points={
         'console_scripts': [
             'benchmarks_eval = '
-            'benchmarks.cli.eval:main',
+            'benchmarks_runner.cli.eval:main',
             'benchmarks_fetch = '
-            'benchmarks.cli.fetch:main',
+            'benchmarks_runner.cli.fetch:main',
             'benchmarks_score = '
-            'benchmarks.cli.score:main',
+            'benchmarks_runner.cli.score:main',
         ]
     }
 )
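A minimal usage sketch for the package these patches publish: it assumes the `(metrics, images)` return shape of `run_benchmarks` in `benchmarks_runner/main.py` (a metrics dict plus base64-encoded plot PNGs keyed by ARA or target name), reuses the `"ameliorates"` / `"aragorn"` pair from that module's `__main__` block, and writes the decoded plots to an illustrative `plots/` directory.

```python
import asyncio
import base64
from pathlib import Path

from benchmarks_runner import run_benchmarks


async def main():
    # "ameliorates" / "aragorn" mirror the __main__ example in benchmarks_runner/main.py;
    # any pair from config/benchmarks.json and config/targets.json should work the same way.
    metrics, images = await run_benchmarks("ameliorates", "aragorn")
    print(metrics)

    # `images` maps an ARA name (or the target) to {plot_name: base64-encoded PNG},
    # with plot names such as "precision", "recall", "mAP", "top_k_accuracy".
    out = Path("plots")  # illustrative output directory, not part of the patches
    out.mkdir(exist_ok=True)
    for source, plots in images.items():
        for name, encoded in plots.items():
            (out / f"{source}_{name}.png").write_bytes(base64.b64decode(encoded))


if __name__ == "__main__":
    asyncio.run(main())
```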