From c6fe64e5acb9e8f8216b1a07717bbc62338aa511 Mon Sep 17 00:00:00 2001
From: Marcel Wilnicki
Date: Thu, 22 Aug 2024 14:40:42 +0200
Subject: [PATCH 1/8] first commit, refresh code, add x86 dockerfile, update readme

---
 benchmarks/README.md          | 51 ++++++++++++++---------------------
 benchmarks/run.py             | 27 ++++++++++---------
 benchmarks/run.sh             |  2 +-
 benchmarks/utils/Dockerfile   |  7 +++++
 benchmarks/utils/benchmark.py |  6 ++---
 5 files changed, 46 insertions(+), 47 deletions(-)
 create mode 100644 benchmarks/utils/Dockerfile

diff --git a/benchmarks/README.md b/benchmarks/README.md
index ee922e0..28393c9 100644
--- a/benchmarks/README.md
+++ b/benchmarks/README.md
@@ -1,39 +1,28 @@
-# Running benchmark
+![Ampere AI](https://ampereaimodelzoo.s3.eu-central-1.amazonaws.com/ampere_logo_®_primary_stacked_rgb.png "Ampere AI")
+# Wrapper for multi-process / batched benchmark of llama.cpp
-This benchmarking tool runs multi-process, throughput-oriented benchmark of Ampere optimized llama.cpp using arbitrary model(s) provided by the user.
-The benchmarking script spawns multiple parallel streams of token generation using llama.cpp and provides user with aggregate metrics of both prompt eval and token generation stages.
-Underneath, the _batched-bench_ script from upstream llama.cpp project is being used in an unaltered form.
-The script orchestrates the benchmark inside Docker container from the outside environment, **therefore this script should not be run inside Docker container.**
-## Setup
-Few dependencies need to be installed first. On Debian-based systems you can use the setup script.
+## ARM
+Instructions assume you have a Debian-based OS
 ```bash
 sudo bash setup_deb.sh
+# vim download_models.sh # uncomment / add models you want to download
+bash download_models.sh
+# vim run.sh # modify run.sh
+nohup sudo bash run.sh
 ```
+Benchmarks will take a few hours in the default setting, going over various combinations of n_proc x n_threads x batch_size x prompt_size x model_size 😵‍💫
+After they complete you will find .csv files with results in the benchmarks directory of this repo.
-## Downloading models
-Any GGUF model is expected to work, if you experience troubles running your network of choice please raise an [issue](https://github.com/AmpereComputingAI/llama.cpp/issues/new/choose).
-Benchmarking script expects models to be placed under _**llama.cpp/benchmarks/models**_ dir.
+## x86
+Instructions assume you have a Debian-based OS
 ```bash
-mkdir -p models
-huggingface-cli download QuantFactory/Meta-Llama-3-8B-Instruct-GGUF Meta-Llama-3-8B-Instruct.Q8_0.gguf --local-dir models --local-dir-use-symlinks False
-```
-
-## Benchmark
-Provide run.py Python script with following arguments:
-- -m, filename(s) of model(s) that should be available under _**llama.cpp/benchmarks/models**_ directory, multiple models can be provided
-- -t, threadpool(s) per single process, e.g., if there are 20 threads available on the system, if -t 10 is provided, 2 parallel processes will be spawned, each using 10 threads;
- multiple threadpools can be provided and they will be treated as separate cases to benchmark
-- -b, batch size(s) to benchmark, meaning separate token generation streams handled as a single batch; multiple batch sizes can be provided and they will be treated as separate cases to benchmark
-- -p, prompt size(s) to benchmark, size of an input prompt; multiple prompt sizes can be provided and they will be treated as separate cases to benchmark
-- -r, thread-range, e.g., on an 80-thread system, it should be input as 0-79, unless user wants to use just a subset of available threads, say 16-63 (48 threads indexed 16<>63)
-```bash
-python3 run.py -m Meta-Llama-3-8B-Instruct.Q8_0.gguf -t 10 16 32 40 64 80 -b 1 2 4 8 16 32 64 -p 512 -r 0-79
-```
+sudo bash setup_deb.sh
+# vim download_models.sh # uncomment / add models you want to download
+bash download_models.sh
 
-## Quick run on 80t OCI A1 system
-```bash
-bash setup_deb.sh # works on Debian-based systems
-bash download_models.sh # uncomment preferred models in the file, by default llama3 q8_0 will be downloaded
-bash run.sh # modify to adjust number of threads available and other parameters
-```
+
+cd utils
+sudo docker build -t llama_x86 .
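+# note: utils/Dockerfile (added later in this patch) builds upstream llama.cpp
+# at tag b3615 from source and keeps only the llama-batched-bench binary in /llm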
+# vim run.sh # modify run.sh +nohup sudo bash run.sh +``` \ No newline at end of file diff --git a/benchmarks/run.py b/benchmarks/run.py index f58e421..f8d4a25 100644 --- a/benchmarks/run.py +++ b/benchmarks/run.py @@ -11,19 +11,18 @@ def get_file_dir(): return os.path.dirname(os.path.realpath(__file__)) -def docker_init(): - tag = "amperecomputingai/llama.cpp:1.2.3" - if subprocess.run( - ["docker", "pull", tag]).returncode != 0: - print("Docker pull process failed!") - sys.exit(1) +def docker_init(docker_image): + # if subprocess.run( + # ["docker", "pull", docker_image]).returncode != 0: + # print("Docker pull process failed!") + # sys.exit(1) container_name = "llama_benchmark" subprocess.run(["docker", "rm", "-f", container_name]) memory = (psutil.virtual_memory().total >> 30) - 30 # leave 30GB for OS assert memory > 10, "less than 10GB of memory available on the system for llama.cpp" if subprocess.run( ["docker", "run", "--privileged=true", "--name", container_name, "-d", "-m", f"{str(memory)}g", "-v", - f"{get_file_dir()}:/runner", "--entrypoint", "/bin/bash", "-it", tag]).returncode != 0: + f"{get_file_dir()}:/runner", "--entrypoint", "/bin/bash", "-it", docker_image]).returncode != 0: print("Docker run process failed!") sys.exit(1) return container_name @@ -52,7 +51,8 @@ def docker_start(): def benchmark(docker_container_name, args): num_available_threads = len(parse_threads_range(args.threads_range)) if num_available_threads < max(args.num_threads): - print(f"Requested number of threads ({max(args.num_threads)}) exceeds threads available ({num_available_threads})") + print( + f"Requested number of threads ({max(args.num_threads)}) exceeds threads available ({num_available_threads})") sys.exit(1) docker_restart(docker_container_name) @@ -63,11 +63,11 @@ def benchmark(docker_container_name, args): num_processes = int(num_available_threads / num_threads) case = f"{num_processes} x {num_threads} [proc x threads], bs = {batch_size}" print(f"\nRunning {case}") - + cmd = (f"cd /runner; python3 utils/benchmark.py -m models/{model} -n {str(num_processes)} " f"-t {str(num_threads)} -b {str(batch_size)} -p {str(prompt_size)} -r {args.threads_range}") cmd = ["docker", "exec", "-i", docker_container_name, "bash", "-c", cmd] - + print(f"Executing: {' '.join(cmd)}") success = False start = time.time() @@ -90,6 +90,9 @@ def parse_args(): parser.add_argument("-m", "--model_names", type=str, required=True, nargs="+", help="model names, e.g. 
'Meta-Llama-3-8B-Instruct.Q8_0.gguf'")
+    parser.add_argument("-d", "--docker_image",
+                        type=str, required=True,
+                        help="Docker image to use for benchmarking")
     parser.add_argument("-t", "--num_threads",
                         type=int, required=True, nargs="+",
                         help="number of threads per process to use")
@@ -111,8 +114,8 @@
 def main():
     args = parse_args()
-    benchmark(docker_init(), args)
+    benchmark(docker_init(args.docker_image), args)
 
 
 if __name__ == "__main__":
-    main()
+    main()
\ No newline at end of file
diff --git a/benchmarks/run.sh b/benchmarks/run.sh
index 996749e..eab7203 100644
--- a/benchmarks/run.sh
+++ b/benchmarks/run.sh
@@ -1,4 +1,4 @@
 set -e
 
-python3 run.py -m Meta-Llama-3-8B-Instruct.Q8_0.gguf -t 10 16 32 40 64 80 -b 1 2 4 8 16 32 64 -p 512 -r 0-79
+python3 run.py -m Meta-Llama-3-8B-Instruct.Q8_0.gguf -t 8 12 16 24 48 -b 1 2 4 8 16 32 -p 128 -r 0-47 -d llama.cpp:latest
 
 rm -f /tmp/log_power
diff --git a/benchmarks/utils/Dockerfile b/benchmarks/utils/Dockerfile
new file mode 100644
index 0000000..310814c
--- /dev/null
+++ b/benchmarks/utils/Dockerfile
@@ -0,0 +1,7 @@
+FROM ubuntu:22.04
+ARG DEBIAN_FRONTEND=noninteractive
+RUN apt-get update -y && apt-get install -y build-essential cmake vim wget git numactl libopenblas-dev pkg-config python3 python3-pip libnuma-dev clang
+RUN mkdir /workspace
+RUN mkdir /llm
+RUN cd /workspace && git clone -b b3615 https://github.com/ggerganov/llama.cpp.git && cd llama.cpp && make -j && mv /workspace/llama.cpp/llama-batched-bench /llm/
+RUN rm -R /workspace
\ No newline at end of file
diff --git a/benchmarks/utils/benchmark.py b/benchmarks/utils/benchmark.py
index 141e7ec..2e1b738 100644
--- a/benchmarks/utils/benchmark.py
+++ b/benchmarks/utils/benchmark.py
@@ -117,8 +117,8 @@ def main():
     for n in range(args.num_processes):
         logfile = f"{logs_dir}/log_{n}"
         cmd = ["numactl", f"--physcpubind={gen_threads_config(args.num_threads, n)}",
-               "/llm/batched-bench", args.model, str(args.kv_cache), "2048", "512", "0", "0", "0", str(args.prompt_size), str(TOKENS),
-               str(args.batch_size), str(args.num_threads)]
+               "/llm/llama-batched-bench", "-m", args.model, "-c", str(args.kv_cache), "-b", "2048", "-ub", "512", "-npp", str(args.prompt_size), "-ntg", str(TOKENS),
+               "-npl", str(args.batch_size), "-t", str(args.num_threads), "-tb", str(args.num_threads), "-td", str(args.num_threads)]
         current_subprocesses.append(
             subprocess.Popen(cmd, stdout=open(logfile, 'wb'), stderr=open(logfile, 'wb')))
     start = time.time()
@@ -130,4 +130,4 @@
 
 
 if __name__ == "__main__":
-    main()
+    main()
\ No newline at end of file
From a6e4b32a3ca695695bd0cfe2110218f584e700df Mon Sep 17 00:00:00 2001
From: Marcel Wilnicki
Date: Thu, 22 Aug 2024 14:55:31 +0200
Subject: [PATCH 2/8] wip

---
 benchmarks/README.md | 14 +++++++++-----
 benchmarks/run.sh    |  2 +-
 2 files changed, 10 insertions(+), 6 deletions(-)

diff --git a/benchmarks/README.md b/benchmarks/README.md
index 28393c9..208f17d 100644
--- a/benchmarks/README.md
+++ b/benchmarks/README.md
@@ -5,24 +5,28 @@
 ## ARM
 Instructions assume you have a Debian-based OS
 ```bash
+cd benchmarks
 sudo bash setup_deb.sh
 # vim download_models.sh # uncomment / add models you want to download
 bash download_models.sh
-# vim run.sh # modify run.sh
+# vim run.sh # modify run.sh (provide the name of the docker image, threads, batch sizes etc.)
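+# for reference, the stock run.sh at this point in the series runs:
+#   python3 run.py -m Meta-Llama-3-8B-Instruct.Q8_0.gguf -t 8 12 16 24 48 -b 1 2 4 8 16 32 -p 128 -r 0-47 -d amperecomputingai/llama.cpp:latest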
 nohup sudo bash run.sh
 ```
-Benchmarks will take a few hours in the default setting, going over various combinations of n_proc x n_threads x batch_size x prompt_size x model_size 😵‍💫
-After they complete you will find .csv files with results in the benchmarks directory of this repo.
 
 ## x86
 Instructions assume you have a Debian-based OS
 ```bash
+cd benchmarks
 sudo bash setup_deb.sh
 # vim download_models.sh # uncomment / add models you want to download
 bash download_models.sh
 
 cd utils
 sudo docker build -t llama_x86 .
+cd ..
-# vim run.sh # modify run.sh
+# vim run.sh # modify run.sh (provide the name of the docker image, threads, batch sizes etc.)
 nohup sudo bash run.sh
-```
\ No newline at end of file
+```
+
+Benchmarks will take a few hours in the default setting, going over various combinations of n_proc x n_threads x batch_size x prompt_size x model_size 😵‍💫
+After they complete you will find .csv files with results in the benchmarks directory of this repo.
\ No newline at end of file
diff --git a/benchmarks/run.sh b/benchmarks/run.sh
index eab7203..6286c60 100644
--- a/benchmarks/run.sh
+++ b/benchmarks/run.sh
@@ -1,4 +1,4 @@
 set -e
 
-python3 run.py -m Meta-Llama-3-8B-Instruct.Q8_0.gguf -t 8 12 16 24 48 -b 1 2 4 8 16 32 -p 128 -r 0-47 -d llama.cpp:latest
+python3 run.py -m Meta-Llama-3-8B-Instruct.Q8_0.gguf -t 8 12 16 24 48 -b 1 2 4 8 16 32 -p 128 -r 0-47 -d amperecomputingai/llama.cpp:latest
 
 rm -f /tmp/log_power
From 4bf2b534760ee91b9592c5d8cfc582d2c06df8ed Mon Sep 17 00:00:00 2001
From: Marcel Wilnicki
Date: Thu, 22 Aug 2024 20:36:10 +0200
Subject: [PATCH 3/8] wip

---
 benchmarks/download_models.sh | 3 +++
 benchmarks/run.sh             | 2 +-
 2 files changed, 4 insertions(+), 1 deletion(-)

diff --git a/benchmarks/download_models.sh b/benchmarks/download_models.sh
index dbb3329..89942f1 100644
--- a/benchmarks/download_models.sh
+++ b/benchmarks/download_models.sh
@@ -9,3 +9,6 @@ mkdir -p $SCRIPT_DIR/models
 #huggingface-cli download TheBloke/Llama-2-70B-GGUF llama-2-70b.Q4_K_M.gguf --local-dir $SCRIPT_DIR/models --local-dir-use-symlinks False
 huggingface-cli download QuantFactory/Meta-Llama-3-8B-Instruct-GGUF Meta-Llama-3-8B-Instruct.Q8_0.gguf --local-dir $SCRIPT_DIR/models --local-dir-use-symlinks False
 #huggingface-cli download QuantFactory/Meta-Llama-3-8B-Instruct-GGUF Meta-Llama-3-8B-Instruct.Q4_K_M.gguf --local-dir $SCRIPT_DIR/models --local-dir-use-symlinks False
+
+wget -P models https://ampereaimodelzoo.s3.eu-central-1.amazonaws.com/gguf/Meta-Llama-3-8B-Instruct.Q4_K_4.gguf
+wget -P models https://ampereaimodelzoo.s3.eu-central-1.amazonaws.com/gguf/Meta-Llama-3-8B-Instruct.Q8R16.gguf
\ No newline at end of file
diff --git a/benchmarks/run.sh b/benchmarks/run.sh
index 6286c60..157f5b1 100644
--- a/benchmarks/run.sh
+++ b/benchmarks/run.sh
@@ -1,4 +1,4 @@
 set -e
 
-python3 run.py -m Meta-Llama-3-8B-Instruct.Q8_0.gguf -t 8 12 16 24 48 -b 1 2 4 8 16 32 -p 128 -r 0-47 -d amperecomputingai/llama.cpp:latest
+python3 run.py -m Meta-Llama-3-8B-Instruct.Q4_K_4.gguf -t 128 -b 1 32 -p 128 -r 0-127 -d amperecomputingai/llama.cpp:latest
 
 rm -f /tmp/log_power
From 8ffab2e275c8e5596177fb5016fb8125ab373496 Mon Sep 17 00:00:00 2001
From: Marcel Wilnicki
Date: Thu, 22 Aug 2024 21:29:06 +0200
Subject: [PATCH 4/8] wip

---
 benchmarks/README.md | 20 ++++++++++++++++++--
 benchmarks/run.sh    |  3 +--
 2 files changed, 19 insertions(+), 4 deletions(-)

diff --git a/benchmarks/README.md b/benchmarks/README.md
index 208f17d..3308bce 100644
--- a/benchmarks/README.md
+++ b/benchmarks/README.md
@@ -28,5 +28,21 @@
 cd ..
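 # run.sh passes the docker image to run.py via its -d flag; for the locally
 # built x86 image above that would be -d llama_x86:latest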
nohup sudo bash run.sh ``` -Benchmarks will take few hours in default setting, going over various combinations of n_proc x n_threads x batch_size x prompt_size x model_size 😵‍💫 -After they complete you will find .csv files with results in the benchmarks directory of this repo. \ No newline at end of file +Benchmarks will take a moment in default setting. +After they complete you will find .csv files with results in the benchmarks directory of this repo. + +### results on Altra Max with 128 threads on 1 process: + +#### Meta-Llama-3-8B-Instruct.Q4_K_4.gguf + +| Batch Size | total token generation capability, tps | +|------------|----------------------------------------| +| 1 | 26.13 | +| 32 | 102.85 | + +#### Meta-Llama-3-8B-Instruct.Q8R16.gguf + +| Batch Size | total token generation capability, tps | +|------------|----------------------------------------| +| 1 | 18.37 | +| 32 | 121.19 | \ No newline at end of file diff --git a/benchmarks/run.sh b/benchmarks/run.sh index 157f5b1..a7c59a8 100644 --- a/benchmarks/run.sh +++ b/benchmarks/run.sh @@ -1,4 +1,3 @@ set -e -python3 run.py -m Meta-Llama-3-8B-Instruct.Q4_K_4.gguf -t 128 -b 1 32 -p 128 -r 0-127 -d amperecomputingai/llama.cpp:latest -rm -f /tmp/log_power +python3 run.py -m Meta-Llama-3-8B-Instruct.Q4_K_4.gguf Meta-Llama-3-8B-Instruct.Q8R16.gguf -t 128 -b 1 32 -p 128 -r 0-127 -d amperecomputingai/llama.cpp:latest From 55831a2246758d25f7f1a35a7bdabea95764463f Mon Sep 17 00:00:00 2001 From: Marcel Wilnicki Date: Mon, 26 Aug 2024 23:42:11 +0200 Subject: [PATCH 5/8] wip --- ...-3-8B-Instruct.Q4_K_4.gguf@PP128@TG256.csv | 48 +++++++++++++++++++ benchmarks/README.md | 18 ++++--- 2 files changed, 56 insertions(+), 10 deletions(-) create mode 100644 benchmarks/Meta-Llama-3-8B-Instruct.Q4_K_4.gguf@PP128@TG256.csv diff --git a/benchmarks/Meta-Llama-3-8B-Instruct.Q4_K_4.gguf@PP128@TG256.csv b/benchmarks/Meta-Llama-3-8B-Instruct.Q4_K_4.gguf@PP128@TG256.csv new file mode 100644 index 0000000..6e750ed --- /dev/null +++ b/benchmarks/Meta-Llama-3-8B-Instruct.Q4_K_4.gguf@PP128@TG256.csv @@ -0,0 +1,48 @@ +n_proc,n_threads,batch_size,prompt_size,output_tokens,pp_throughput_tps,pp_avg_latency_sec,tg_throughput_tps,tg_avg_latency_sec,pp+tg_throughput_tps,concurrency +16,8,1,128,256,396.38615284214126,5.166812500000001,53.9083856577509,0.29706616210937503,70.1353850368713,16 +10,12,1,128,256,370.02208672228636,3.4593000000000003,60.55562550646733,0.16513789062499998,83.72397252807151,10 +8,16,1,128,256,384.57009450522344,2.66275,52.60371538348852,0.15346875000000004,61.203753511445825,8 +5,24,1,128,256,350.31630996146095,1.827,48.527500826418034,0.10303984375,67.38497174744674,5 +4,32,1,128,256,363.9239005060785,1.407,43.38138992258577,0.09220800781249999,60.9620574694396,4 +2,48,1,128,256,269.9215189310567,0.9484999999999999,44.06198301912035,0.045390625000000004,61.102713024106926,2 +2,64,1,128,256,321.4985100042571,0.7965,38.03764799362936,0.052583984375000004,53.44839585218178,2 +1,128,1,128,256,217.31748726655348,0.589,26.0905014268243,0.038328125,36.91952696856072,1 +16,8,2,128,256,418.19975883865374,9.794625000000002,110.64770308944307,0.28933642578125,139.80794611568743,32 +10,12,2,128,256,383.6810120647905,6.672500000000001,123.93800740714173,0.16137148437499998,159.67731875168928,20 +8,16,2,128,256,405.32717604607586,5.052875,95.94191098030699,0.16694531250000003,120.60774999018491,16 +5,24,2,128,256,370.0741462149616,3.4588,106.02687031013078,0.09431953125,138.05004314063848,10 
+4,32,2,128,256,388.9014496996768,2.63325,89.76305574034251,0.0891298828125,119.70541246152048,8 +2,48,2,128,256,291.0064463943994,1.7595,69.88091980884391,0.057240234375000004,93.4932132205247,4 +2,64,2,128,256,357.67939408136175,1.4315,68.79871778810667,0.058140625,94.06577255190152,4 +1,128,2,128,256,268.6253934942288,0.953,47.66337739713275,0.0419609375,65.66908935442497,2 +16,8,4,128,256,417.81774681469346,19.6070625,198.07566817111712,0.32324731445312505,230.91884578161554,64 +10,12,4,128,256,370.82495128900973,13.809199999999999,200.5339634743387,0.199467578125,235.79981578139393,40 +8,16,4,128,256,410.30508573974623,9.982875,169.34048025895996,0.18912255859375002,198.97017390459538,32 +5,24,4,128,256,366.9544687265254,6.9768,180.1851591265522,0.11099921875,215.5971029139296,20 +4,32,4,128,256,392.2466482549231,5.2215,150.15846988520602,0.10655761718750001,187.8726722319053,16 +2,48,4,128,256,284.5256753784404,3.599,115.78471872089322,0.06909375000000001,144.2185812872635,8 +2,64,4,128,256,359.8034102940944,2.846,118.77514277579678,0.06735546875000001,152.3280606932117,8 +1,128,4,128,256,281.3186813186813,1.82,77.25969518635884,0.0517734375,101.89730662067136,4 +16,8,8,128,256,414.18260946754066,39.55800000000001,262.8345920588023,0.48716674804687504,287.23702664796633,128 +10,12,8,128,256,369.413996605541,27.72,247.47420827069718,0.32326640625,277.6798546519511,80 +8,16,8,128,256,406.2895840736078,20.163125,249.33713209113648,0.25785986328125,250.1603200293156,64 +5,24,8,128,256,362.30700183382214,14.131800000000002,224.36780736740155,0.17828125,255.81666472361474,40 +4,32,8,128,256,391.70590920021067,10.457,218.82873225516573,0.146515625,241.2439139312078,32 +2,48,8,128,256,292.82435876636123,6.994,155.60832128208014,0.10282226562499999,184.3273730949238,16 +2,64,8,128,256,359.96800722648663,5.689500000000001,172.949889329783,0.09251562499999999,207.99620840245097,16 +1,128,8,128,256,279.93439037725534,3.658,112.24377945851147,0.0712734375,140.24835646457268,8 +16,8,16,128,256,394.31195681747676,83.102625,220.8452042927132,1.159221923828125,255.87073197395068,256 +10,12,16,128,256,354.8340487251948,57.71869999999999,220.08028218556274,0.7270101562500001,251.46215584596305,160 +8,16,16,128,256,386.3011539159702,42.413250000000005,242.90285753371558,0.5270434570312499,271.0428798305982,128 +5,24,16,128,256,343.2492894093842,29.8336,219.71721397054426,0.3641046875,249.1160919913069,80 +4,32,16,128,256,368.9281498405472,22.2055,223.26757403084713,0.28668554687500003,254.80559875583202,64 +2,48,16,128,256,271.58209471225365,15.082,159.5917106959114,0.20051171875,184.9293421824913,32 +2,64,16,128,256,335.98882759548496,12.190999999999999,184.51845046619877,0.17343750000000002,215.52223099184425,32 +1,128,16,128,256,258.9127686472819,7.91,122.06824616301594,0.13107421875,148.17315808513203,16 +10,12,32,128,256,322.83991537277467,126.8774,151.25965729387573,2.1155941406250003,183.3408680939443,320 +8,16,32,128,256,346.5517507716242,94.55537500000003,176.17688788074838,1.453111328125,209.7223585231709,256 +5,24,32,128,256,308.451153580514,66.4,169.41941024156043,0.94440390625,198.67935144644,160 +4,32,32,128,256,326.4712519372915,50.186,175.4579368054558,0.7295205078125,207.09879664273433,128 +2,48,32,128,256,236.30887564930254,34.6665,127.3553249285536,0.50253125,150.41496315518887,64 +2,64,32,128,256,289.23018367609586,28.3245,146.60042212942176,0.436591796875,174.04482844091922,64 +1,128,32,128,256,219.80144888650386,18.635,101.9019541988531,0.31402734375,124.08862318986931,32 \ No 
newline at end of file diff --git a/benchmarks/README.md b/benchmarks/README.md index 3308bce..bbf757f 100644 --- a/benchmarks/README.md +++ b/benchmarks/README.md @@ -1,4 +1,3 @@ -![Ampere AI](https://ampereaimodelzoo.s3.eu-central-1.amazonaws.com/ampere_logo_®_primary_stacked_rgb.png "Ampere AI") # Wrapper for multi-process / batched benchmark of llama.cpp @@ -31,18 +30,17 @@ nohup sudo bash run.sh Benchmarks will take a moment in default setting. After they complete you will find .csv files with results in the benchmarks directory of this repo. -### results on Altra Max with 128 threads on 1 process: +### results on Altra Max #### Meta-Llama-3-8B-Instruct.Q4_K_4.gguf -| Batch Size | total token generation capability, tps | -|------------|----------------------------------------| -| 1 | 26.13 | -| 32 | 102.85 | +| n_proc | n_threads | batch_size | prompt_size | output_tokens | total token generation capability, tps | +|--------|-----------|------------|-------------|---------------|----------------------------------------| +| 16 | 8 | 8 | 128 | 256 | 262.8345921 | + #### Meta-Llama-3-8B-Instruct.Q8R16.gguf -| Batch Size | total token generation capability, tps | -|------------|----------------------------------------| -| 1 | 18.37 | -| 32 | 121.19 | \ No newline at end of file +| n_proc | n_threads | batch_size | prompt_size | output_tokens | total token generation capability, tps | +|--------|-----------|------------|-------------|---------------|----------------------------------------| +| 10 | 12 | 16 | 128 | 256 | 294.2275261 | \ No newline at end of file From 3b8a0abbe19e811341f8deeed9539d53ae889368 Mon Sep 17 00:00:00 2001 From: Marcel Wilnicki Date: Mon, 26 Aug 2024 23:42:49 +0200 Subject: [PATCH 6/8] wip --- benchmarks/README.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/benchmarks/README.md b/benchmarks/README.md index bbf757f..84f672c 100644 --- a/benchmarks/README.md +++ b/benchmarks/README.md @@ -36,11 +36,11 @@ After they complete you will find .csv files with results in the benchmarks dire | n_proc | n_threads | batch_size | prompt_size | output_tokens | total token generation capability, tps | |--------|-----------|------------|-------------|---------------|----------------------------------------| -| 16 | 8 | 8 | 128 | 256 | 262.8345921 | +| 16 | 8 | 8 | 128 | 256 | 262.83 | #### Meta-Llama-3-8B-Instruct.Q8R16.gguf | n_proc | n_threads | batch_size | prompt_size | output_tokens | total token generation capability, tps | |--------|-----------|------------|-------------|---------------|----------------------------------------| -| 10 | 12 | 16 | 128 | 256 | 294.2275261 | \ No newline at end of file +| 10 | 12 | 16 | 128 | 256 | 294.23 | \ No newline at end of file From f110e3d70966de27f33fccd89a238f866a597eca Mon Sep 17 00:00:00 2001 From: Marcel Wilnicki Date: Mon, 26 Aug 2024 23:46:30 +0200 Subject: [PATCH 7/8] wip --- benchmarks/README.md | 1 + 1 file changed, 1 insertion(+) diff --git a/benchmarks/README.md b/benchmarks/README.md index 84f672c..06717d7 100644 --- a/benchmarks/README.md +++ b/benchmarks/README.md @@ -31,6 +31,7 @@ Benchmarks will take a moment in default setting. After they complete you will find .csv files with results in the benchmarks directory of this repo. ### results on Altra Max +the results were gathered using amperecomputingai/llama.cpp:1.2.6 image with aio optimizations on an Altra Max. 
 #### Meta-Llama-3-8B-Instruct.Q4_K_4.gguf
From 863c1d47e7d0f0be958e32933e0f79a8adbf6664 Mon Sep 17 00:00:00 2001
From: Marcel Wilnicki
Date: Tue, 27 Aug 2024 11:48:23 +0200
Subject: [PATCH 8/8] remove run.sh

---
 benchmarks/README.md          | 23 ++++++++++++++++++-----
 benchmarks/download_models.sh |  2 +-
 benchmarks/run.sh             |  3 ---
 3 files changed, 19 insertions(+), 9 deletions(-)
 delete mode 100644 benchmarks/run.sh

diff --git a/benchmarks/README.md b/benchmarks/README.md
index 06717d7..ec19eb3 100644
--- a/benchmarks/README.md
+++ b/benchmarks/README.md
@@ -8,8 +8,8 @@ cd benchmarks
 sudo bash setup_deb.sh
 # vim download_models.sh # uncomment / add models you want to download
 bash download_models.sh
-# vim run.sh # modify run.sh (provide the name of the docker image, threads, batch sizes etc.)
-nohup sudo bash run.sh
+# quick run
+sudo python3 run.py -m Meta-Llama-3-8B-Instruct.Q4_K_4.gguf Meta-Llama-3-8B-Instruct.Q8R16.gguf -t 128 -b 1 -p 128 -r 0-127 -d amperecomputingai/llama.cpp:latest
 ```
 
 ## x86
@@ -23,8 +23,8 @@ bash download_models.sh
 cd utils
 sudo docker build -t llama_x86 .
 cd ..
-# vim run.sh # modify run.sh (provide the name of the docker image, threads, batch sizes etc.)
-nohup sudo bash run.sh
+# quick run
+python3 run.py -m Meta-Llama-3-8B-Instruct.Q4_K_M.gguf Meta-Llama-3-8B-Instruct.Q8_0.gguf -t 128 -b 1 -p 128 -r 0-127 -d llama_x86:latest
 ```
 
 Benchmarks will take a moment in default setting.
@@ -44,4 +44,17 @@
 | n_proc | n_threads | batch_size | prompt_size | output_tokens | total token generation capability, tps |
 |--------|-----------|------------|-------------|---------------|----------------------------------------|
-| 10 | 12 | 16 | 128 | 256 | 294.23 |
\ No newline at end of file
+| 10 | 12 | 16 | 128 | 256 | 294.23 |
+
+
+## run.py options
+Provide the run.py Python script with the following arguments:
+- -m, filename(s) of model(s) that should be available under _**llama.cpp/benchmarks/models**_ directory; multiple models can be provided
+- -t, threadpool(s) per single process, e.g., if there are 20 threads available on the system and -t 10 is provided, 2 parallel processes will be spawned, each using 10 threads;
+ multiple threadpools can be provided and they will be treated as separate cases to benchmark
+- -b, batch size(s) to benchmark, meaning separate token generation streams handled as a single batch; multiple batch sizes can be provided and they will be treated as separate cases to benchmark
+- -p, prompt size(s) to benchmark, i.e. the size of an input prompt; multiple prompt sizes can be provided and they will be treated as separate cases to benchmark
+- -r, thread-range, e.g., on an 80-thread system, it should be input as 0-79, unless the user wants to use just a subset of the available threads, say 16-63 (48 threads indexed 16<>63)
+```bash
+python3 run.py -m Meta-Llama-3-8B-Instruct.Q8_0.gguf -t 10 16 32 40 64 80 -b 1 2 4 8 16 32 64 -p 512 -r 0-79
+```
diff --git a/benchmarks/download_models.sh b/benchmarks/download_models.sh
index 89942f1..f5bfc59 100644
--- a/benchmarks/download_models.sh
+++ b/benchmarks/download_models.sh
@@ -8,7 +8,7 @@ mkdir -p $SCRIPT_DIR/models
 #huggingface-cli download TheBloke/Llama-2-13B-GGUF llama-2-13b.Q8_0.gguf --local-dir $SCRIPT_DIR/models --local-dir-use-symlinks False
 #huggingface-cli download TheBloke/Llama-2-70B-GGUF llama-2-70b.Q4_K_M.gguf --local-dir $SCRIPT_DIR/models --local-dir-use-symlinks False
 huggingface-cli download QuantFactory/Meta-Llama-3-8B-Instruct-GGUF Meta-Llama-3-8B-Instruct.Q8_0.gguf --local-dir $SCRIPT_DIR/models --local-dir-use-symlinks False
-#huggingface-cli download QuantFactory/Meta-Llama-3-8B-Instruct-GGUF Meta-Llama-3-8B-Instruct.Q4_K_M.gguf --local-dir $SCRIPT_DIR/models --local-dir-use-symlinks False
+huggingface-cli download QuantFactory/Meta-Llama-3-8B-Instruct-GGUF Meta-Llama-3-8B-Instruct.Q4_K_M.gguf --local-dir $SCRIPT_DIR/models --local-dir-use-symlinks False
 
 wget -P models https://ampereaimodelzoo.s3.eu-central-1.amazonaws.com/gguf/Meta-Llama-3-8B-Instruct.Q4_K_4.gguf
 wget -P models https://ampereaimodelzoo.s3.eu-central-1.amazonaws.com/gguf/Meta-Llama-3-8B-Instruct.Q8R16.gguf
\ No newline at end of file
diff --git a/benchmarks/run.sh b/benchmarks/run.sh
deleted file mode 100644
index a7c59a8..0000000
--- a/benchmarks/run.sh
+++ /dev/null
@@ -1,3 +0,0 @@
-set -e
-
-python3 run.py -m Meta-Llama-3-8B-Instruct.Q4_K_4.gguf Meta-Llama-3-8B-Instruct.Q8R16.gguf -t 128 -b 1 32 -p 128 -r 0-127 -d amperecomputingai/llama.cpp:latest
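Since patches 5-7 above pick the best rows out of the generated result CSVs by hand, here is a minimal sketch (plain Python 3, standard library only — an illustration, not code from the repo) of how the same selection can be automated; the filename follows the `<model>@PP<prompt_size>@TG<output_tokens>.csv` pattern and the column names match the CSV added in patch 5:

```python
import csv

def best_tg_config(path):
    # return the CSV row with the highest total token generation throughput
    with open(path, newline="") as f:
        rows = list(csv.DictReader(f))
    return max(rows, key=lambda r: float(r["tg_throughput_tps"]))

best = best_tg_config("Meta-Llama-3-8B-Instruct.Q4_K_4.gguf@PP128@TG256.csv")
print(f'{best["n_proc"]} proc x {best["n_threads"]} threads, batch {best["batch_size"]}: '
      f'{float(best["tg_throughput_tps"]):.2f} tps')
# for the CSV above this prints: 16 proc x 8 threads, batch 8: 262.83 tps
```

And a rough sketch of the -r / thread-range expansion and per-process CPU pinning described in the run.py options above — an assumption for illustration, since the repo's actual parse_threads_range and gen_threads_config implementations are not shown in these patches (and the repo's gen_threads_config takes only the thread count and process index):

```python
def parse_threads_range(threads_range):
    # "0-79" -> [0, 1, ..., 79]
    first, last = (int(x) for x in threads_range.split("-"))
    return list(range(first, last + 1))

def gen_threads_config(cpus, num_threads, proc_idx):
    # contiguous slice of cpu ids for process proc_idx,
    # formatted for numactl --physcpubind=...
    chunk = cpus[proc_idx * num_threads:(proc_idx + 1) * num_threads]
    return ",".join(map(str, chunk))

cpus = parse_threads_range("0-79")      # 80 available hardware threads
assert len(cpus) // 10 == 8             # -t 10 -> 8 parallel processes
print(gen_threads_config(cpus, 10, 1))  # -> "10,11,12,13,14,15,16,17,18,19"
```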