diff --git a/bench/generation/README.md b/bench/generation/README.md index 3d42e2b7..a6f61b5b 100644 --- a/bench/generation/README.md +++ b/bench/generation/README.md @@ -8,7 +8,7 @@ This repository contains scripts to evaluate the performances of quantized model A `evaluate_model.py` utility script is also provided to evaluate the metrics on a specific model for several quantization configurations, and output the result to a `png` barchart and/or a `json` file. -The paragraphs below display results for some popular models. +The paragraphs below display results for some popular models on an NVIDIA A100 GPU. ## facebook/opt-125m @@ -137,3 +137,19 @@ The paragraphs below display results for some popular models. + +## google/gemma-2b + +
+
+ google-gemma-2b Lambada prediction accuracy +
+
+
+ +
+
+ google-gemma-2b Lambada prediction accuracy +
+
+
diff --git a/bench/generation/charts/HuggingFaceH4-zephyr-7b-beta_Perplexity.png b/bench/generation/charts/HuggingFaceH4-zephyr-7b-beta_Perplexity.png index d6464e5f..037ca265 100644 Binary files a/bench/generation/charts/HuggingFaceH4-zephyr-7b-beta_Perplexity.png and b/bench/generation/charts/HuggingFaceH4-zephyr-7b-beta_Perplexity.png differ diff --git a/bench/generation/charts/NousResearch-Llama-2-7b-hf_Latency__ms_.png b/bench/generation/charts/NousResearch-Llama-2-7b-hf_Latency__ms_.png index 29df7e91..2a2bffa9 100644 Binary files a/bench/generation/charts/NousResearch-Llama-2-7b-hf_Latency__ms_.png and b/bench/generation/charts/NousResearch-Llama-2-7b-hf_Latency__ms_.png differ diff --git a/bench/generation/charts/NousResearch-Llama-2-7b-hf_Perplexity.png b/bench/generation/charts/NousResearch-Llama-2-7b-hf_Perplexity.png index cc8b0c53..cf8b32f5 100644 Binary files a/bench/generation/charts/NousResearch-Llama-2-7b-hf_Perplexity.png and b/bench/generation/charts/NousResearch-Llama-2-7b-hf_Perplexity.png differ diff --git a/bench/generation/charts/facebook-opt-1.3b_Latency__ms_.png b/bench/generation/charts/facebook-opt-1.3b_Latency__ms_.png index a4b91190..bdda0626 100644 Binary files a/bench/generation/charts/facebook-opt-1.3b_Latency__ms_.png and b/bench/generation/charts/facebook-opt-1.3b_Latency__ms_.png differ diff --git a/bench/generation/charts/facebook-opt-1.3b_Perplexity.png b/bench/generation/charts/facebook-opt-1.3b_Perplexity.png index db84a649..eab87e6c 100644 Binary files a/bench/generation/charts/facebook-opt-1.3b_Perplexity.png and b/bench/generation/charts/facebook-opt-1.3b_Perplexity.png differ diff --git a/bench/generation/charts/facebook-opt-125m_Accuracy.png b/bench/generation/charts/facebook-opt-125m_Accuracy.png index 826fff7f..d8a7b259 100644 Binary files a/bench/generation/charts/facebook-opt-125m_Accuracy.png and b/bench/generation/charts/facebook-opt-125m_Accuracy.png differ diff --git 
a/bench/generation/charts/facebook-opt-125m_Latency__ms_.png b/bench/generation/charts/facebook-opt-125m_Latency__ms_.png index 96bbfa95..50396cbd 100644 Binary files a/bench/generation/charts/facebook-opt-125m_Latency__ms_.png and b/bench/generation/charts/facebook-opt-125m_Latency__ms_.png differ diff --git a/bench/generation/charts/facebook-opt-125m_Perplexity.png b/bench/generation/charts/facebook-opt-125m_Perplexity.png index afa75010..c2dd4ebf 100644 Binary files a/bench/generation/charts/facebook-opt-125m_Perplexity.png and b/bench/generation/charts/facebook-opt-125m_Perplexity.png differ diff --git a/bench/generation/charts/facebook-opt-350m_Latency__ms_.png b/bench/generation/charts/facebook-opt-350m_Latency__ms_.png index 6c718039..d977a8c8 100644 Binary files a/bench/generation/charts/facebook-opt-350m_Latency__ms_.png and b/bench/generation/charts/facebook-opt-350m_Latency__ms_.png differ diff --git a/bench/generation/charts/facebook-opt-350m_Perplexity.png b/bench/generation/charts/facebook-opt-350m_Perplexity.png index 9459defe..8d5c2dde 100644 Binary files a/bench/generation/charts/facebook-opt-350m_Perplexity.png and b/bench/generation/charts/facebook-opt-350m_Perplexity.png differ diff --git a/bench/generation/charts/google-gemma-2b_Accuracy.png b/bench/generation/charts/google-gemma-2b_Accuracy.png new file mode 100644 index 00000000..aa073fc2 Binary files /dev/null and b/bench/generation/charts/google-gemma-2b_Accuracy.png differ diff --git a/bench/generation/charts/google-gemma-2b_Latency__ms_.png b/bench/generation/charts/google-gemma-2b_Latency__ms_.png new file mode 100644 index 00000000..66ad0665 Binary files /dev/null and b/bench/generation/charts/google-gemma-2b_Latency__ms_.png differ diff --git a/bench/generation/charts/google-gemma-2b_Perplexity.png b/bench/generation/charts/google-gemma-2b_Perplexity.png new file mode 100644 index 00000000..37863ad5 Binary files /dev/null and b/bench/generation/charts/google-gemma-2b_Perplexity.png 
differ diff --git a/bench/generation/charts/princeton-nlp-Sheared-LLaMA-1.3B_Accuracy.png b/bench/generation/charts/princeton-nlp-Sheared-LLaMA-1.3B_Accuracy.png index 3be77939..da77164b 100644 Binary files a/bench/generation/charts/princeton-nlp-Sheared-LLaMA-1.3B_Accuracy.png and b/bench/generation/charts/princeton-nlp-Sheared-LLaMA-1.3B_Accuracy.png differ diff --git a/bench/generation/charts/princeton-nlp-Sheared-LLaMA-1.3B_Latency__ms_.png b/bench/generation/charts/princeton-nlp-Sheared-LLaMA-1.3B_Latency__ms_.png new file mode 100644 index 00000000..3ea1a3ea Binary files /dev/null and b/bench/generation/charts/princeton-nlp-Sheared-LLaMA-1.3B_Latency__ms_.png differ diff --git a/bench/generation/charts/princeton-nlp-Sheared-LLaMA-1.3B_Perplexity.png b/bench/generation/charts/princeton-nlp-Sheared-LLaMA-1.3B_Perplexity.png index b6e3531b..5f18b387 100644 Binary files a/bench/generation/charts/princeton-nlp-Sheared-LLaMA-1.3B_Perplexity.png and b/bench/generation/charts/princeton-nlp-Sheared-LLaMA-1.3B_Perplexity.png differ