From 0d90ff321a21e83cc4618238e991cee43f586433 Mon Sep 17 00:00:00 2001
From: Max Wang
Date: Thu, 9 Nov 2023 15:45:25 -0500
Subject: [PATCH 1/3] Update to become the Benchmark Runner

---
 .github/workflows/pypi.yml | 55 ++++++++++++++++++++++++++++++++++++++
 benchmarks/__init__.py     |  1 +
 benchmarks/cli/eval.py     | 46 ++++++++++++++++++++++++-------
 benchmarks/cli/fetch.py    |  7 ++---
 benchmarks/cli/score.py    |  7 +++--
 benchmarks/main.py         | 36 +++++++++++++++++++++++++
 benchmarks/request.py      | 32 +++++++++++-----------
 setup.py                   | 12 ++++++++-
 8 files changed, 163 insertions(+), 33 deletions(-)
 create mode 100644 .github/workflows/pypi.yml
 create mode 100644 benchmarks/main.py

diff --git a/.github/workflows/pypi.yml b/.github/workflows/pypi.yml
new file mode 100644
index 0000000..d506497
--- /dev/null
+++ b/.github/workflows/pypi.yml
@@ -0,0 +1,55 @@
+name: pypi
+
+on:
+  push:
+    tags:
+      - v*
+
+jobs:
+  build:
+    name: Build Python package
+    runs-on: ubuntu-latest
+    permissions:
+      id-token: write
+    steps:
+      - name: Check out the repo
+        uses: actions/checkout@v4
+        with:
+          ref: ${{ github.event.release.target_commitish }}
+      - name: Set up Python 3.9
+        uses: actions/setup-python@v4
+        with:
+          python-version: 3.9
+      - name: Install build
+        run: >-
+          python -m
+          pip install
+          build
+          --user
+      - name: Build a binary wheel and a source tarball
+        run: >-
+          python -m
+          build
+      - name: Store the distribution packages
+        uses: actions/upload-artifact@v3
+        with:
+          name: python-package-distributions
+          path: dist/
+  publish-to-pypi:
+    name: Publish Python package to PyPI
+    needs:
+      - build
+    runs-on: ubuntu-latest
+    environment:
+      name: pypi
+      url: https://pypi.org/p/benchmarks-runner
+    permissions:
+      id-token: write
+    steps:
+      - name: Download all the dists
+        uses: actions/download-artifact@v3
+        with:
+          name: python-package-distributions
+          path: dist/
+      - name: Publish distribution 📦 to PyPI
+        uses: pypa/gh-action-pypi-publish@release/v1
diff --git a/benchmarks/__init__.py b/benchmarks/__init__.py
index e69de29..aca69e3 100644
--- a/benchmarks/__init__.py
+++ b/benchmarks/__init__.py
@@ -0,0 +1 @@
+from .main import run_benchmarks
\ No newline at end of file
diff --git a/benchmarks/cli/eval.py b/benchmarks/cli/eval.py
index f5ab7c8..8735b67 100644
--- a/benchmarks/cli/eval.py
+++ b/benchmarks/cli/eval.py
@@ -1,4 +1,6 @@
 from argparse import ArgumentParser
+import base64
+import io
 from matplotlib import pyplot as plt
 from pathlib import Path
 import os
@@ -17,40 +19,63 @@
     'Mean Reciprocal Rank\t': 'mean_reciprocal_rank'
 }
 
-def evaluate_ara_results(results_dir, args):
+def evaluate_ara_results(
+    benchmark,
+    results_dir,
+    k: int = 20,
+    save_plots: bool = False,
+    save_json: bool = False,
+):
     results = evaluate_results(
-        args.benchmark,
+        benchmark,
         results_dir,
-        k=args.k
+        k=k,
     )
+    imgs = {}
 
-    if args.plots:
+    if save_plots:
         plots_dir = Path(results_dir)
         assert plots_dir.exists(), f"{plots_dir} does not exist."
 
         results.plot_precision()
         plt.gcf().savefig(plots_dir / 'precision.png')
+        with io.BytesIO() as buffer:
+            plt.gcf().savefig(buffer, format="png")
+            buffer.seek(0)
+            imgs["precision"] = base64.b64encode(buffer.read()).decode()
 
         results.plot_recall()
         plt.gcf().savefig(plots_dir / 'recall.png')
+        with io.BytesIO() as buffer:
+            plt.gcf().savefig(buffer, format="png")
+            buffer.seek(0)
+            imgs["recall"] = base64.b64encode(buffer.read()).decode()
 
         results.plot_mAP()
         plt.gcf().savefig(plots_dir / 'mAP.png')
+        with io.BytesIO() as buffer:
+            plt.gcf().savefig(buffer, format="png")
+            buffer.seek(0)
+            imgs["mAP"] = base64.b64encode(buffer.read()).decode()
 
         results.plot_top_k_accuracy()
         plt.gcf().savefig(plots_dir / 'top_k_accuracy.png')
+        with io.BytesIO() as buffer:
+            plt.gcf().savefig(buffer, format="png")
+            buffer.seek(0)
+            imgs["top_k_accuracy"] = base64.b64encode(buffer.read()).decode()
 
     ks = [1, 5, 10, 20, 50, 100, 200, 500]
-    while ks[-1] >= args.k:
+    while ks[-1] >= k:
         ks.pop()
-    ks.append(args.k)
+    ks.append(k)
 
-    if args.json:
+    if save_json:
         results.to_json(results_dir)
 
     output = [
         '',
-        "Benchmark: {}".format(args.benchmark),
+        "Benchmark: {}".format(benchmark),
         "Results Directory: {}\n".format(results_dir),
         "\t\t\t{}".format('\t'.join(['k={}'.format(k) for k in ks]))
     ]
@@ -69,6 +94,7 @@ def evaluate_ara_results(results_dir, args):
         output.append('')
 
     print("\n".join(output))
+    return results.output_dict, imgs
 
 def main():
     parser = ArgumentParser(description="Run a benchmark on a set of results.")
@@ -109,6 +135,6 @@ def main():
         # evaluate results for each ARA
         for ara in [ara for ara in os.listdir(args.results_dir) if os.path.isdir(args.results_dir)]:
             results_dir = os.path.join(args.results_dir, ara)
-            evaluate_ara_results(results_dir, args)
+            evaluate_ara_results(args.benchmark, results_dir, args.k, args.plots, args.json)
     else:
-        evaluate_ara_results(args.results_dir, args)
+        evaluate_ara_results(args.benchmark, args.results_dir, args.k, args.plots, args.json)
diff --git a/benchmarks/cli/fetch.py b/benchmarks/cli/fetch.py
index fe21621..1c1e926 100644
--- a/benchmarks/cli/fetch.py
+++ b/benchmarks/cli/fetch.py
@@ -1,4 +1,5 @@
 from argparse import ArgumentParser
+import asyncio
 
 from benchmarks.request import fetch_results
 
@@ -38,13 +39,13 @@ def main():
     )
     args = parser.parse_args()
 
-    fetch_results(
+    output_dir = asyncio.run(fetch_results(
         args.benchmark,
         args.target,
         args.results_dir,
         overwrite=args.overwrite,
         scored=not args.unscored,
         num_concurrent_requests=args.n
-    )
-
+    ))
+    print(f"Results saved to: {output_dir}")
diff --git a/benchmarks/cli/score.py b/benchmarks/cli/score.py
index 44f5513..b98f50f 100644
--- a/benchmarks/cli/score.py
+++ b/benchmarks/cli/score.py
@@ -1,4 +1,5 @@
 from argparse import ArgumentParser
+import asyncio
 
 from benchmarks.request import score_results
 
@@ -28,11 +29,9 @@ def main():
     )
     args = parser.parse_args()
 
-    score_results(
+    asyncio.run(score_results(
         args.unscored_results_dir,
         args.target,
         args.scored_results_dir,
         num_concurrent_requests=args.n
-    )
-
-
+    ))
diff --git a/benchmarks/main.py b/benchmarks/main.py
new file mode 100644
index 0000000..a8eac3d
--- /dev/null
+++ b/benchmarks/main.py
@@ -0,0 +1,36 @@
+"""Main Benchmarks Test Runner entry."""
+import asyncio
+import os
+import tempfile
+
+from benchmarks.request import fetch_results
+from benchmarks.cli.eval import evaluate_ara_results
+
+
+async def run_benchmarks(
+    benchmark: str,
+    target: str,
+):
+    """Run benchmark tests."""
+    with tempfile.TemporaryDirectory() as tmpdir:
+        output_dir = await fetch_results(benchmark, target, tmpdir)
+        output_dict = {}
+        output_imgs = {}
+        if target == 'ars':
+            # evaluate results for each ARA
+            for ara in [ara for ara in os.listdir(output_dir) if os.path.isdir(os.path.join(output_dir, ara))]:
+                results_dir = os.path.join(output_dir, ara)
+                ara_output_dict, ara_imgs = evaluate_ara_results(benchmark, results_dir, save_plots=True)
+                output_dict[ara], output_imgs[ara] = ara_output_dict, ara_imgs
+        else:
+            output_dict, imgs = evaluate_ara_results(benchmark, output_dir, save_plots=True)
+            output_imgs[target] = imgs
+
+    return output_dict, output_imgs
+
+
+if __name__ == "__main__":
+    asyncio.run(run_benchmarks(
+        "ameliorates",
+        "aragorn",
+    ))
diff --git a/benchmarks/request.py b/benchmarks/request.py
index 5382ed4..76cd266 100644
--- a/benchmarks/request.py
+++ b/benchmarks/request.py
@@ -18,7 +18,7 @@
 # double the ARS timeout, just in case. The ARS should set all queries to error after 5 mins
 MAX_QUERY_TIME = os.getenv("MAX_QUERY_TIME", 600)
 
-def fetch_results(
+async def fetch_results(
     benchmark: str,
     target: str,
     results_dir: str,
@@ -82,7 +82,7 @@ def fetch_results(
     Path(output_dir).mkdir(parents=True, exist_ok=True)
 
     if target == "ars":
-        send_requests_to_ars(
+        await send_requests_to_ars(
            uids,
            messages,
            url,
@@ -93,7 +93,7 @@ def fetch_results(
 
 
     else:
-        send_requests_store_results(
+        await send_requests_store_results(
            uids,
            messages,
            url,
@@ -101,9 +101,11 @@ def fetch_results(
            num_concurrent_requests,
            progress,
         )
+
+    return output_dir
 
 
-def send_requests_to_ars(
+async def send_requests_to_ars(
     uids: Sequence[str],
     messages: Sequence[dict],
     url: str,
@@ -116,7 +118,7 @@
         send_request_to_ars(uid, msg, url, output_dir, pbar)
         for uid, msg in zip(uids, messages)
     ]
-    asyncio.run(gather(*coroutines, limit=num_concurrent_requests))
+    await gather(*coroutines, limit=num_concurrent_requests)
     if pbar is not None:
         pbar.close()
 
@@ -190,18 +192,18 @@
     response = await send_request(uid, f"{url}/messages/{parent_pk}", msg, request_type="get")
     merged_pk = response.get("fields", {}).get("merged_version")
     if merged_pk is None:
-        raise Exception("Failed to get the ARS merged message.")
-
-    merged_message = await send_request(uid, f"{url}/messages/{merged_pk}", msg, request_type="get")
-    Path(os.path.join(output_dir, "ars")).mkdir(parents=True, exist_ok=True)
-    with open(os.path.join(output_dir, "ars", f"{uid}.json"), "w") as file:
-        json.dump(merged_message, file)
+        print(f"Failed to get the ARS merged message from pk: {parent_pk}.")
+    else:
+        merged_message = await send_request(uid, f"{url}/messages/{merged_pk}", msg, request_type="get")
+        Path(os.path.join(output_dir, "ars")).mkdir(parents=True, exist_ok=True)
+        with open(os.path.join(output_dir, "ars", f"{uid}.json"), "w") as file:
+            json.dump(merged_message, file)
 
     if pbar:
         pbar.update()
 
 
-def score_results(
+async def score_results(
     unscored_results_dir: str,
     target: str,
     scored_results_dir: str,
@@ -241,7 +243,7 @@
         message['workflow'] = workflow
         messages.append(message)
 
-    send_requests_store_results(
+    await send_requests_store_results(
         uids,
         messages,
         url,
@@ -250,7 +252,7 @@
         output_dir,
         num_concurrent_requests,
         progress
     )
 
-def send_requests_store_results(
+async def send_requests_store_results(
     uids: Sequence[str],
     messages: Sequence[dict],
     url: str,
@@ -263,7 +265,7 @@
         send_request_store_result(uid, msg, url, output_dir, pbar)
         for uid, msg in zip(uids, messages)
     ]
-    asyncio.run(gather(*coroutines, limit=num_concurrent_requests))
+    await gather(*coroutines, limit=num_concurrent_requests)
     if pbar is not None:
         pbar.close()
diff --git a/setup.py b/setup.py
index 1f0c989..03c3e90 100644
--- a/setup.py
+++ b/setup.py
@@ -1,15 +1,25 @@
 from setuptools import find_packages, setup
 
+with open("README.md", encoding="utf-8") as readme_file:
+    readme = readme_file.read()
+
 setup(
     name='benchmarks',
     version='0.1.0',
+    author="Max Wang",
+    author_email="max@covar.com",
+    url="https://github.com/TranslatorSRI/Benchmarks",
+    description="Translator Benchmarks Runner",
+    long_description_content_type="text/markdown",
+    long_description=readme,
+    include_package_data=True,
     packages=find_packages(),
     install_requires=[
         'httpx',
         'matplotlib',
         'numpy',
         'requests',
-        'tqdm'
+        'tqdm',
     ],
     entry_points={
         'console_scripts': [

From ece9598de2933a133a93a0111b0a44b6689c8da1 Mon Sep 17 00:00:00 2001
From: Max Wang
Date: Fri, 10 Nov 2023 09:41:05 -0500
Subject: [PATCH 2/3] Rename main package to benchmark-runner and update readme

---
 README.md                                            | 11 +++++++++++
 {benchmarks => benchmarks_runner}/__init__.py        |  0
 {benchmarks => benchmarks_runner}/cli/__init__.py    |  0
 {benchmarks => benchmarks_runner}/cli/eval.py        |  2 +-
 {benchmarks => benchmarks_runner}/cli/fetch.py       |  2 +-
 {benchmarks => benchmarks_runner}/cli/score.py       |  2 +-
 {benchmarks => benchmarks_runner}/eval.py            |  2 +-
 {benchmarks => benchmarks_runner}/main.py            |  4 ++--
 {benchmarks => benchmarks_runner}/request.py         |  0
 {benchmarks => benchmarks_runner}/utils/__init__.py  |  0
 {benchmarks => benchmarks_runner}/utils/asyncio.py   |  0
 {benchmarks => benchmarks_runner}/utils/benchmark.py |  2 +-
 {benchmarks => benchmarks_runner}/utils/constants.py |  0
 {benchmarks => benchmarks_runner}/utils/normalize.py |  0
 setup.py                                             |  2 +-
 15 files changed, 19 insertions(+), 8 deletions(-)
 rename {benchmarks => benchmarks_runner}/__init__.py (100%)
 rename {benchmarks => benchmarks_runner}/cli/__init__.py (100%)
 rename {benchmarks => benchmarks_runner}/cli/eval.py (98%)
 rename {benchmarks => benchmarks_runner}/cli/fetch.py (96%)
 rename {benchmarks => benchmarks_runner}/cli/score.py (94%)
 rename {benchmarks => benchmarks_runner}/eval.py (99%)
 rename {benchmarks => benchmarks_runner}/main.py (90%)
 rename {benchmarks => benchmarks_runner}/request.py (100%)
 rename {benchmarks => benchmarks_runner}/utils/__init__.py (100%)
 rename {benchmarks => benchmarks_runner}/utils/asyncio.py (100%)
 rename {benchmarks => benchmarks_runner}/utils/benchmark.py (99%)
 rename {benchmarks => benchmarks_runner}/utils/constants.py (100%)
 rename {benchmarks => benchmarks_runner}/utils/normalize.py (100%)

diff --git a/README.md b/README.md
index e71caa6..37a7835 100644
--- a/README.md
+++ b/README.md
@@ -63,3 +63,14 @@ _Requires python 3.9._
 - Install dependencies: `pip install -r requirements.txt`
 - Start the frontend server: `python server.py`
 - Open in your browser
+
+## Benchmark Runner
+The benchmarks can be installed from pypi and used as part of the Translator-wide automated testing.
+- `pip install benchmarks-runner`
+To run benchmarks:
+```python
+from benchmarks_runner import run_benchmarks
+
+run_benchmarks(benchmark, target)
+```
+where `benchmark` is the name of a benchmark specified in config/benchmarks.json and `target` is the name of a target specified in config/targets.json.
diff --git a/benchmarks/__init__.py b/benchmarks_runner/__init__.py
similarity index 100%
rename from benchmarks/__init__.py
rename to benchmarks_runner/__init__.py
diff --git a/benchmarks/cli/__init__.py b/benchmarks_runner/cli/__init__.py
similarity index 100%
rename from benchmarks/cli/__init__.py
rename to benchmarks_runner/cli/__init__.py
diff --git a/benchmarks/cli/eval.py b/benchmarks_runner/cli/eval.py
similarity index 98%
rename from benchmarks/cli/eval.py
rename to benchmarks_runner/cli/eval.py
index 8735b67..725da26 100644
--- a/benchmarks/cli/eval.py
+++ b/benchmarks_runner/cli/eval.py
@@ -6,7 +6,7 @@
 import os
 
 
-from benchmarks.eval import evaluate_results
+from benchmarks_runner.eval import evaluate_results
 
 metrics_at_k = {
     'Precision @ k\t\t': 'precision_at_k',
diff --git a/benchmarks/cli/fetch.py b/benchmarks_runner/cli/fetch.py
similarity index 96%
rename from benchmarks/cli/fetch.py
rename to benchmarks_runner/cli/fetch.py
index 1c1e926..135665a 100644
--- a/benchmarks/cli/fetch.py
+++ b/benchmarks_runner/cli/fetch.py
@@ -1,7 +1,7 @@
 from argparse import ArgumentParser
 import asyncio
 
-from benchmarks.request import fetch_results
+from benchmarks_runner.request import fetch_results
 
 
 def main():
diff --git a/benchmarks/cli/score.py b/benchmarks_runner/cli/score.py
similarity index 94%
rename from benchmarks/cli/score.py
rename to benchmarks_runner/cli/score.py
index b98f50f..e629559 100644
--- a/benchmarks/cli/score.py
+++ b/benchmarks_runner/cli/score.py
@@ -1,7 +1,7 @@
 from argparse import ArgumentParser
 import asyncio
 
-from benchmarks.request import score_results
+from benchmarks_runner.request import score_results
 
 
 def main():
diff --git a/benchmarks/eval.py b/benchmarks_runner/eval.py
similarity index 99%
rename from benchmarks/eval.py
rename to benchmarks_runner/eval.py
index f0f70cb..8b42407 100644
--- a/benchmarks/eval.py
+++ b/benchmarks_runner/eval.py
@@ -11,7 +11,7 @@
 from matplotlib import pyplot as plt
 from matplotlib.axes import Axes
 
-from benchmarks.utils.benchmark import benchmark_ground_truth
+from benchmarks_runner.utils.benchmark import benchmark_ground_truth
 
 
 def evaluate_results(
diff --git a/benchmarks/main.py b/benchmarks_runner/main.py
similarity index 90%
rename from benchmarks/main.py
rename to benchmarks_runner/main.py
index a8eac3d..ff094c1 100644
--- a/benchmarks/main.py
+++ b/benchmarks_runner/main.py
@@ -3,8 +3,8 @@
 import os
 import tempfile
 
-from benchmarks.request import fetch_results
-from benchmarks.cli.eval import evaluate_ara_results
+from benchmarks_runner.request import fetch_results
+from benchmarks_runner.cli.eval import evaluate_ara_results
 
 
 async def run_benchmarks(
diff --git a/benchmarks/request.py b/benchmarks_runner/request.py
similarity index 100%
rename from benchmarks/request.py
rename to benchmarks_runner/request.py
diff --git a/benchmarks/utils/__init__.py b/benchmarks_runner/utils/__init__.py
similarity index 100%
rename from benchmarks/utils/__init__.py
rename to benchmarks_runner/utils/__init__.py
diff --git a/benchmarks/utils/asyncio.py b/benchmarks_runner/utils/asyncio.py
similarity index 100%
rename from benchmarks/utils/asyncio.py
rename to benchmarks_runner/utils/asyncio.py
diff --git a/benchmarks/utils/benchmark.py b/benchmarks_runner/utils/benchmark.py
similarity index 99%
rename from benchmarks/utils/benchmark.py
rename to benchmarks_runner/utils/benchmark.py
index abb65a3..fb2750b 100644
--- a/benchmarks/utils/benchmark.py
+++ b/benchmarks_runner/utils/benchmark.py
@@ -5,7 +5,7 @@
 from copy import deepcopy
 from typing import Dict, List, Sequence, Tuple
 
-from benchmarks.utils.constants import BENCHMARKS, CONFIG_DIR
+from benchmarks_runner.utils.constants import BENCHMARKS, CONFIG_DIR
 
 from .normalize import get_normalizer
 
diff --git a/benchmarks/utils/constants.py b/benchmarks_runner/utils/constants.py
similarity index 100%
rename from benchmarks/utils/constants.py
rename to benchmarks_runner/utils/constants.py
diff --git a/benchmarks/utils/normalize.py b/benchmarks_runner/utils/normalize.py
similarity index 100%
rename from benchmarks/utils/normalize.py
rename to benchmarks_runner/utils/normalize.py
diff --git a/setup.py b/setup.py
index 03c3e90..48c138a 100644
--- a/setup.py
+++ b/setup.py
@@ -4,7 +4,7 @@
     readme = readme_file.read()
 
 setup(
-    name='benchmarks',
+    name='benchmarks-runner',
     version='0.1.0',
     author="Max Wang",
     author_email="max@covar.com",

From 7577b1a7b2d0b72762231e7aa235b8b3d767bfc3 Mon Sep 17 00:00:00 2001
From: Max Wang
Date: Mon, 13 Nov 2023 08:49:10 -0500
Subject: [PATCH 3/3] Fix issues from Andrew's feedback

---
 README.md        | 15 ++++++++++++++-
 requirements.txt |  1 +
 setup.py         |  6 +++---
 3 files changed, 18 insertions(+), 4 deletions(-)

diff --git a/README.md b/README.md
index 37a7835..059975b 100644
--- a/README.md
+++ b/README.md
@@ -69,8 +69,21 @@ The benchmarks can be installed from pypi and used as part of the Translator-wid
 - `pip install benchmarks-runner`
 To run benchmarks:
 ```python
+import asyncio
 from benchmarks_runner import run_benchmarks
 
-run_benchmarks(benchmark, target)
+output = asyncio.run(run_benchmarks(benchmark, target))
 ```
 where `benchmark` is the name of a benchmark specified in config/benchmarks.json and `target` is the name of a target specified in config/targets.json.
+
+### Sample Output
+```
+Benchmark: GTRx
+Results Directory: /tmp/tmpaf10m9_q/GTRx/bte/2023-11-10_13-03-11
+                        k=1      k=5      k=10     k=20
+Precision @ k           0.0000   0.0500   0.0250   0.0125
+Recall @ k              0.0000   0.2500   0.2500   0.2500
+mAP @ k                 0.0000   0.0833   0.0833   0.0833
+Top-k Accuracy          0.0000   0.2500   0.2500   0.2500
+Mean Reciprocal Rank    0.08333333333333333
+```
diff --git a/requirements.txt b/requirements.txt
index 2d5dc23..171ccfc 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -3,6 +3,7 @@ httpx
 matplotlib
 numpy
 python-dotenv
+reasoner_pydantic
 requests
 tqdm
 uvicorn
diff --git a/setup.py b/setup.py
index 48c138a..54dfcba 100644
--- a/setup.py
+++ b/setup.py
@@ -24,11 +24,11 @@
     entry_points={
         'console_scripts': [
             'benchmarks_eval = '
-            'benchmarks.cli.eval:main',
+            'benchmarks_runner.cli.eval:main',
             'benchmarks_fetch = '
-            'benchmarks.cli.fetch:main',
+            'benchmarks_runner.cli.fetch:main',
             'benchmarks_score = '
-            'benchmarks.cli.score:main',
+            'benchmarks_runner.cli.score:main',
         ]
     }
 )
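A minimal usage sketch for the package these patches publish: it assumes the `(metrics, images)` return shape of `run_benchmarks` in `benchmarks_runner/main.py` (a metrics dict plus base64-encoded plot PNGs keyed by ARA or target name), reuses the `"ameliorates"` / `"aragorn"` pair from that module's `__main__` block, and writes the decoded plots to an illustrative `plots/` directory.

```python
import asyncio
import base64
from pathlib import Path

from benchmarks_runner import run_benchmarks


async def main():
    # "ameliorates" / "aragorn" mirror the __main__ example in benchmarks_runner/main.py;
    # any pair from config/benchmarks.json and config/targets.json should work the same way.
    metrics, images = await run_benchmarks("ameliorates", "aragorn")
    print(metrics)

    # `images` maps an ARA name (or the target) to {plot_name: base64-encoded PNG},
    # with plot names such as "precision", "recall", "mAP", "top_k_accuracy".
    out = Path("plots")  # illustrative output directory, not part of the patches
    out.mkdir(exist_ok=True)
    for source, plots in images.items():
        for name, encoded in plots.items():
            (out / f"{source}_{name}.png").write_bytes(base64.b64decode(encoded))


if __name__ == "__main__":
    asyncio.run(main())
```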