Benchmark runner #28

Merged · 3 commits · Nov 15, 2023
55 changes: 55 additions & 0 deletions .github/workflows/pypi.yml
@@ -0,0 +1,55 @@
name: pypi

on:
  push:
    tags:
      - v*

jobs:
  build:
    name: Build Python package
    runs-on: ubuntu-latest
    permissions:
      id-token: write
    steps:
      - name: Check out the repo
        uses: actions/checkout@v4
        with:
          ref: ${{ github.event.release.target_commitish }}
      - name: Set up Python 3.9
        uses: actions/setup-python@v4
        with:
          python-version: 3.9
      - name: Install build
        run: >-
          python -m
          pip install
          build
          --user
      - name: Build a binary wheel and a source tarball
        run: >-
          python -m
          build
      - name: Store the distribution packages
        uses: actions/upload-artifact@v3
        with:
          name: python-package-distributions
          path: dist/
  publish-to-pypi:
    name: Publish Python package to PyPI
    needs:
      - build
    runs-on: ubuntu-latest
    environment:
      name: pypi
      url: https://pypi.org/p/benchmarks-runner
    permissions:
      id-token: write
    steps:
      - name: Download all the dists
        uses: actions/download-artifact@v3
        with:
          name: python-package-distributions
          path: dist/
      - name: Publish distribution 📦 to PyPI
        uses: pypa/gh-action-pypi-publish@release/v1
24 changes: 24 additions & 0 deletions README.md
@@ -63,3 +63,27 @@ _Requires python 3.9._
- Install dependencies: `pip install -r requirements.txt`
- Start the frontend server: `python server.py`
- Open in your browser

## Benchmark Runner
The benchmarks can be installed from PyPI and run as part of Translator-wide automated testing.
- Install: `pip install benchmarks-runner`

To run benchmarks:
```python
import asyncio
from benchmarks_runner import run_benchmarks

output = asyncio.run(run_benchmarks(<benchmark>, <target>))
```
where `<benchmark>` is the name of a benchmark defined in `config/benchmarks.json` and `<target>` is a target defined in `config/targets.json`.
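For example, using the benchmark and target from the sample output below (assuming both names are defined in those config files):
```python
import asyncio

from benchmarks_runner import run_benchmarks

# returns the computed metrics along with base64-encoded plot images
output = asyncio.run(run_benchmarks("GTRx", "bte"))
```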

### Sample Output
```
Benchmark: GTRx
Results Directory: /tmp/tmpaf10m9_q/GTRx/bte/2023-11-10_13-03-11
k=1 k=5 k=10 k=20
Precision @ k 0.0000 0.0500 0.0250 0.0125
Recall @ k 0.0000 0.2500 0.2500 0.2500
mAP @ k 0.0000 0.0833 0.0833 0.0833
Top-k Accuracy 0.0000 0.2500 0.2500 0.2500
Mean Reciprocal Rank 0.08333333333333333
```
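Besides printing this summary, `run_benchmarks` returns the evaluation metrics together with base64-encoded PNG plots. A minimal sketch for saving those plots to disk, assuming a non-ARS target such as `bte` (so the returned image mapping goes directly from plot name to base64 string):
```python
import asyncio
import base64

from benchmarks_runner import run_benchmarks

# unpack the metrics dict and the plot images returned by the runner
output_dict, imgs = asyncio.run(run_benchmarks("GTRx", "bte"))

# decode each base64-encoded PNG (precision, recall, mAP, top_k_accuracy) and write it out
for name, encoded in imgs.items():
    with open(f"{name}.png", "wb") as plot_file:
        plot_file.write(base64.b64decode(encoded))
```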
Empty file removed benchmarks/utils/__init__.py
Empty file.
1 change: 1 addition & 0 deletions benchmarks_runner/__init__.py
@@ -0,0 +1 @@
from .main import run_benchmarks
File renamed without changes.
48 changes: 37 additions & 11 deletions benchmarks/cli/eval.py → benchmarks_runner/cli/eval.py
@@ -1,10 +1,12 @@
from argparse import ArgumentParser
import base64
import io
from matplotlib import pyplot as plt
from pathlib import Path
import os


from benchmarks.eval import evaluate_results
from benchmarks_runner.eval import evaluate_results

metrics_at_k = {
'Precision @ k\t\t': 'precision_at_k',
@@ -17,40 +19,63 @@
'Mean Reciprocal Rank\t': 'mean_reciprocal_rank'
}

def evaluate_ara_results(results_dir, args):
def evaluate_ara_results(
benchmark,
results_dir,
k: int = 20,
save_plots: bool = False,
save_json: bool = False,
):
results = evaluate_results(
args.benchmark,
benchmark,
results_dir,
k=args.k
k=k,
)
imgs = {}

if args.plots:
if save_plots:
plots_dir = Path(results_dir)
assert plots_dir.exists(), f"{plots_dir} does not exist."

results.plot_precision()
plt.gcf().savefig(plots_dir / 'precision.png')
with io.BytesIO() as buffer:
plt.gcf().savefig(buffer, format="png")
buffer.seek(0)
imgs["precision"] = base64.b64encode(buffer.read()).decode()

results.plot_recall()
plt.gcf().savefig(plots_dir / 'recall.png')
with io.BytesIO() as buffer:
plt.gcf().savefig(buffer, format="png")
buffer.seek(0)
imgs["recall"] = base64.b64encode(buffer.read()).decode()

results.plot_mAP()
plt.gcf().savefig(plots_dir / 'mAP.png')
with io.BytesIO() as buffer:
plt.gcf().savefig(buffer, format="png")
buffer.seek(0)
imgs["mAP"] = base64.b64encode(buffer.read()).decode()

results.plot_top_k_accuracy()
plt.gcf().savefig(plots_dir / 'top_k_accuracy.png')
with io.BytesIO() as buffer:
plt.gcf().savefig(buffer, format="png")
buffer.seek(0)
imgs["top_k_accuracy"] = base64.b64encode(buffer.read()).decode()

ks = [1, 5, 10, 20, 50, 100, 200, 500]
while ks[-1] >= args.k:
while ks[-1] >= k:
ks.pop()
ks.append(args.k)
ks.append(k)

if args.json:
if save_json:
results.to_json(results_dir)

output = [
'',
"Benchmark: {}".format(args.benchmark),
"Benchmark: {}".format(benchmark),
"Results Directory: {}\n".format(results_dir),
"\t\t\t{}".format('\t'.join(['k={}'.format(k) for k in ks]))
]
Expand All @@ -69,6 +94,7 @@ def evaluate_ara_results(results_dir, args):
output.append('')

print("\n".join(output))
return results.output_dict, imgs

def main():
parser = ArgumentParser(description="Run a benchmark on a set of results.")
@@ -109,6 +135,6 @@ def main():
# evaluate results for each ARA
for ara in [ara for ara in os.listdir(args.results_dir) if os.path.isdir(args.results_dir)]:
results_dir = os.path.join(args.results_dir, ara)
evaluate_ara_results(results_dir, args)
evaluate_ara_results(args.benchmark, results_dir, args.k, args.plots, args.json)
else:
evaluate_ara_results(args.results_dir, args)
evaluate_ara_results(args.benchmark, args.results_dir, args.k, args.plots, args.json)
9 changes: 5 additions & 4 deletions benchmarks/cli/fetch.py → benchmarks_runner/cli/fetch.py
@@ -1,6 +1,7 @@
from argparse import ArgumentParser
import asyncio

from benchmarks.request import fetch_results
from benchmarks_runner.request import fetch_results


def main():
@@ -38,13 +39,13 @@ def main():
)
args = parser.parse_args()

fetch_results(
output_dir = asyncio.run(fetch_results(
args.benchmark,
args.target,
args.results_dir,
overwrite=args.overwrite,
scored=not args.unscored,
num_concurrent_requests=args.n
)

))

print(f"Results saved to: {output_dir}")
9 changes: 4 additions & 5 deletions benchmarks/cli/score.py → benchmarks_runner/cli/score.py
@@ -1,6 +1,7 @@
from argparse import ArgumentParser
import asyncio

from benchmarks.request import score_results
from benchmarks_runner.request import score_results


def main():
@@ -28,11 +29,9 @@
)
args = parser.parse_args()

score_results(
asyncio.run(score_results(
args.unscored_results_dir,
args.target,
args.scored_results_dir,
num_concurrent_requests=args.n
)


))
2 changes: 1 addition & 1 deletion benchmarks/eval.py → benchmarks_runner/eval.py
@@ -11,7 +11,7 @@
from matplotlib import pyplot as plt
from matplotlib.axes import Axes

from benchmarks.utils.benchmark import benchmark_ground_truth
from benchmarks_runner.utils.benchmark import benchmark_ground_truth


def evaluate_results(
36 changes: 36 additions & 0 deletions benchmarks_runner/main.py
@@ -0,0 +1,36 @@
"""Main Benchmarks Test Runner entry."""
import asyncio
import os
import tempfile

from benchmarks_runner.request import fetch_results
from benchmarks_runner.cli.eval import evaluate_ara_results


async def run_benchmarks(
benchmark: str,
target: str,
):
"""Run benchmark tests."""
with tempfile.TemporaryDirectory() as tmpdir:
output_dir = await fetch_results(benchmark, target, tmpdir)
output_dict = {}
output_imgs = {}
if target == 'ars':
# evaluate results for each ARA
for ara in [ara for ara in os.listdir(output_dir) if os.path.isdir(output_dir)]:
results_dir = os.path.join(output_dir, ara)
ara_output_dict, ara_imgs = evaluate_ara_results(benchmark, results_dir, save_plots=True)
output_imgs[ara] = ara_imgs
else:
output_dict, imgs = evaluate_ara_results(benchmark, output_dir, save_plots=True)
output_imgs[target] = imgs

return output_dict, imgs


if __name__ == "__main__":
asyncio.run(run_benchmarks(
"ameliorates",
"aragorn",
))
32 changes: 17 additions & 15 deletions benchmarks/request.py → benchmarks_runner/request.py
@@ -18,7 +18,7 @@
# double the ARS timeout, just in case. The ARS should set all queries to error after 5 mins
MAX_QUERY_TIME = os.getenv("MAX_QUERY_TIME", 600)

def fetch_results(
async def fetch_results(
benchmark: str,
target: str,
results_dir: str,
@@ -82,7 +82,7 @@ def fetch_results(
Path(output_dir).mkdir(parents=True, exist_ok=True)

if target == "ars":
send_requests_to_ars(
await send_requests_to_ars(
uids,
messages,
url,
@@ -93,17 +93,19 @@

else:

send_requests_store_results(
await send_requests_store_results(
uids,
messages,
url,
output_dir,
num_concurrent_requests,
progress,
)

return output_dir


def send_requests_to_ars(
async def send_requests_to_ars(
uids: Sequence[str],
messages: Sequence[dict],
url: str,
@@ -116,7 +118,7 @@ def send_requests_to_ars(
send_request_to_ars(uid, msg, url, output_dir, pbar)
for uid, msg in zip(uids, messages)
]
asyncio.run(gather(*coroutines, limit=num_concurrent_requests))
await gather(*coroutines, limit=num_concurrent_requests)

if pbar is not None:
pbar.close()
@@ -190,18 +192,18 @@ async def send_request_to_ars(
response = await send_request(uid, f"{url}/messages/{parent_pk}", msg, request_type="get")
merged_pk = response.get("fields", {}).get("merged_version")
if merged_pk is None:
raise Exception("Failed to get the ARS merged message.")

merged_message = await send_request(uid, f"{url}/messages/{merged_pk}", msg, request_type="get")
Path(os.path.join(output_dir, "ars")).mkdir(parents=True, exist_ok=True)
with open(os.path.join(output_dir, "ars", f"{uid}.json"), "w") as file:
json.dump(merged_message, file)
print(f"Failed to get the ARS merged message from pk: {parent_pk}.")
else:
merged_message = await send_request(uid, f"{url}/messages/{merged_pk}", msg, request_type="get")
Path(os.path.join(output_dir, "ars")).mkdir(parents=True, exist_ok=True)
with open(os.path.join(output_dir, "ars", f"{uid}.json"), "w") as file:
json.dump(merged_message, file)

if pbar:
pbar.update()


def score_results(
async def score_results(
unscored_results_dir: str,
target: str,
scored_results_dir: str,
@@ -241,7 +243,7 @@ def score_results(
message['workflow'] = workflow
messages.append(message)

send_requests_store_results(
await send_requests_store_results(
uids,
messages,
url,
@@ -250,7 +252,7 @@
progress
)

def send_requests_store_results(
async def send_requests_store_results(
uids: Sequence[str],
messages: Sequence[dict],
url: str,
@@ -263,7 +265,7 @@ def send_requests_store_results(
send_request_store_result(uid, msg, url, output_dir, pbar)
for uid, msg in zip(uids, messages)
]
asyncio.run(gather(*coroutines, limit=num_concurrent_requests))
await gather(*coroutines, limit=num_concurrent_requests)

if pbar is not None:
pbar.close()
File renamed without changes.
File renamed without changes.
@@ -5,7 +5,7 @@
from copy import deepcopy
from typing import Dict, List, Sequence, Tuple

from benchmarks.utils.constants import BENCHMARKS, CONFIG_DIR
from benchmarks_runner.utils.constants import BENCHMARKS, CONFIG_DIR

from .normalize import get_normalizer

File renamed without changes.
File renamed without changes.