From c6fe64e5acb9e8f8216b1a07717bbc62338aa511 Mon Sep 17 00:00:00 2001
From: Marcel Wilnicki
Date: Thu, 22 Aug 2024 14:40:42 +0200
Subject: [PATCH 1/8] first commit, refresh code, add x86 dockerfile, update readme

---
 benchmarks/README.md          | 51 ++++++++++++++---------------------
 benchmarks/run.py             | 27 ++++++++++---------
 benchmarks/run.sh             |  2 +-
 benchmarks/utils/Dockerfile   |  7 +++++
 benchmarks/utils/benchmark.py |  6 ++---
 5 files changed, 46 insertions(+), 47 deletions(-)
 create mode 100644 benchmarks/utils/Dockerfile

diff --git a/benchmarks/README.md b/benchmarks/README.md
index ee922e0..28393c9 100644
--- a/benchmarks/README.md
+++ b/benchmarks/README.md
@@ -1,39 +1,28 @@
-# Running benchmark
+![Ampere AI](https://ampereaimodelzoo.s3.eu-central-1.amazonaws.com/ampere_logo_®_primary_stacked_rgb.png "Ampere AI")
+# Wrapper for multi-process / batched benchmark of llama.cpp
-This benchmarking tool runs multi-process, throughput-oriented benchmark of Ampere optimized llama.cpp using arbitrary model(s) provided by the user.
-The benchmarking script spawns multiple parallel streams of token generation using llama.cpp and provides user with aggregate metrics of both prompt eval and token generation stages.
-Underneath, the _batched-bench_ script from upstream llama.cpp project is being used in an unaltered form.
-The script orchestrates the benchmark inside Docker container from the outside environment, **therefore this script should not be run inside Docker container.**
-## Setup
-Few dependencies need to be installed first. On Debian-based systems you can use the setup script.
+## ARM
+Instructions assume you have a Debian-based OS
 ```bash
 sudo bash setup_deb.sh
+# vim download_models.sh # uncomment / add models you want to download
+bash download_models.sh
+# vim run.sh # modify run.sh
+nohup sudo bash run.sh
 ```
+Benchmarks will take a few hours in the default setting, going over various combinations of n_proc x n_threads x batch_size x prompt_size x model_size 😵‍💫
+After they complete you will find .csv files with results in the benchmarks directory of this repo.
-## Downloading models
-Any GGUF model is expected to work, if you experience troubles running your network of choice please raise an [issue](https://github.com/AmpereComputingAI/llama.cpp/issues/new/choose).
-Benchmarking script expects models to be placed under _**llama.cpp/benchmarks/models**_ dir.
+## x86
+Instructions assume you have a Debian-based OS
 ```bash
-mkdir -p models
-huggingface-cli download QuantFactory/Meta-Llama-3-8B-Instruct-GGUF Meta-Llama-3-8B-Instruct.Q8_0.gguf --local-dir models --local-dir-use-symlinks False
-```
-
-## Benchmark
-Provide run.py Python script with following arguments:
-- -m, filename(s) of model(s) that should be available under _**llama.cpp/benchmarks/models**_ directory, multiple models can be provided
-- -t, threadpool(s) per single process, e.g., if there are 20 threads available on the system, if -t 10 is provided, 2 parallel processes will be spawned, each using 10 threads;
- multiple threadpools can be provided and they will be treated as separate cases to benchmark
-- -b, batch size(s) to benchmark, meaning separate token generation streams handled as a single batch; multiple batch sizes can be provided and they will be treated as separate cases to benchmark
-- -p, prompt size(s) to benchmark, size of an input prompt; multiple prompt sizes can be provided and they will be treated as separate cases to benchmark
-- -r, thread-range, e.g., on an 80-thread system, it should be input as 0-79, unless user wants to use just a subset of available threads, say 16-63 (48 threads indexed 16<>63)
-```bash
-python3 run.py -m Meta-Llama-3-8B-Instruct.Q8_0.gguf -t 10 16 32 40 64 80 -b 1 2 4 8 16 32 64 -p 512 -r 0-79
-```
+sudo bash setup_deb.sh
+# vim download_models.sh # uncomment / add models you want to download
+bash download_models.sh
 
-## Quick run on 80t OCI A1 system
-```bash
-bash setup_deb.sh # works on Debian-based systems
-bash download_models.sh # uncomment preferred models in the file, by default llama3 q8_0 will be downloaded
-bash run.sh # modify to adjust number of threads available and other parameters
-```
+
+cd utils
+sudo docker build -t llama_x86 .
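+# note: utils/Dockerfile (added later in this patch) builds upstream llama.cpp
+# at tag b3615 from source and keeps only the llama-batched-bench binary in /llm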
+# vim run.sh # modify run.sh +nohup sudo bash run.sh +``` \ No newline at end of file diff --git a/benchmarks/run.py b/benchmarks/run.py index f58e421..f8d4a25 100644 --- a/benchmarks/run.py +++ b/benchmarks/run.py @@ -11,19 +11,18 @@ def get_file_dir(): return os.path.dirname(os.path.realpath(__file__)) -def docker_init(): - tag = "amperecomputingai/llama.cpp:1.2.3" - if subprocess.run( - ["docker", "pull", tag]).returncode != 0: - print("Docker pull process failed!") - sys.exit(1) +def docker_init(docker_image): + # if subprocess.run( + # ["docker", "pull", docker_image]).returncode != 0: + # print("Docker pull process failed!") + # sys.exit(1) container_name = "llama_benchmark" subprocess.run(["docker", "rm", "-f", container_name]) memory = (psutil.virtual_memory().total >> 30) - 30 # leave 30GB for OS assert memory > 10, "less than 10GB of memory available on the system for llama.cpp" if subprocess.run( ["docker", "run", "--privileged=true", "--name", container_name, "-d", "-m", f"{str(memory)}g", "-v", - f"{get_file_dir()}:/runner", "--entrypoint", "/bin/bash", "-it", tag]).returncode != 0: + f"{get_file_dir()}:/runner", "--entrypoint", "/bin/bash", "-it", docker_image]).returncode != 0: print("Docker run process failed!") sys.exit(1) return container_name @@ -52,7 +51,8 @@ def docker_start(): def benchmark(docker_container_name, args): num_available_threads = len(parse_threads_range(args.threads_range)) if num_available_threads < max(args.num_threads): - print(f"Requested number of threads ({max(args.num_threads)}) exceeds threads available ({num_available_threads})") + print( + f"Requested number of threads ({max(args.num_threads)}) exceeds threads available ({num_available_threads})") sys.exit(1) docker_restart(docker_container_name) @@ -63,11 +63,11 @@ def benchmark(docker_container_name, args): num_processes = int(num_available_threads / num_threads) case = f"{num_processes} x {num_threads} [proc x threads], bs = {batch_size}" print(f"\nRunning {case}") - + cmd = (f"cd /runner; python3 utils/benchmark.py -m models/{model} -n {str(num_processes)} " f"-t {str(num_threads)} -b {str(batch_size)} -p {str(prompt_size)} -r {args.threads_range}") cmd = ["docker", "exec", "-i", docker_container_name, "bash", "-c", cmd] - + print(f"Executing: {' '.join(cmd)}") success = False start = time.time() @@ -90,6 +90,9 @@ def parse_args(): parser.add_argument("-m", "--model_names", type=str, required=True, nargs="+", help="model names, e.g. 
'Meta-Llama-3-8B-Instruct.Q8_0.gguf'")
+    parser.add_argument("-d", "--docker_image",
+                        type=str, required=True,
+                        help="Docker image to use for benchmarking")
     parser.add_argument("-t", "--num_threads",
                         type=int, required=True, nargs="+",
                         help="number of threads per process to use")
@@ -111,8 +114,8 @@
 def main():
     args = parse_args()
-    benchmark(docker_init(), args)
+    benchmark(docker_init(args.docker_image), args)
 
 
 if __name__ == "__main__":
-    main()
+    main()
\ No newline at end of file
diff --git a/benchmarks/run.sh b/benchmarks/run.sh
index 996749e..eab7203 100644
--- a/benchmarks/run.sh
+++ b/benchmarks/run.sh
@@ -1,4 +1,4 @@
 set -e
 
-python3 run.py -m Meta-Llama-3-8B-Instruct.Q8_0.gguf -t 10 16 32 40 64 80 -b 1 2 4 8 16 32 64 -p 512 -r 0-79
+python3 run.py -m Meta-Llama-3-8B-Instruct.Q8_0.gguf -t 8 12 16 24 48 -b 1 2 4 8 16 32 -p 128 -r 0-47 -d llama.cpp:latest
 
 rm -f /tmp/log_power
diff --git a/benchmarks/utils/Dockerfile b/benchmarks/utils/Dockerfile
new file mode 100644
index 0000000..310814c
--- /dev/null
+++ b/benchmarks/utils/Dockerfile
@@ -0,0 +1,7 @@
+FROM ubuntu:22.04
+ARG DEBIAN_FRONTEND=noninteractive
+RUN apt-get update -y && apt-get install -y build-essential cmake vim wget git numactl libopenblas-dev pkg-config python3 python3-pip libnuma-dev clang
+RUN mkdir /workspace
+RUN mkdir /llm
+RUN cd /workspace && git clone -b b3615 https://github.com/ggerganov/llama.cpp.git && cd llama.cpp && make -j && mv /workspace/llama.cpp/llama-batched-bench /llm/
+RUN rm -R /workspace
\ No newline at end of file
diff --git a/benchmarks/utils/benchmark.py b/benchmarks/utils/benchmark.py
index 141e7ec..2e1b738 100644
--- a/benchmarks/utils/benchmark.py
+++ b/benchmarks/utils/benchmark.py
@@ -117,8 +117,8 @@ def main():
     for n in range(args.num_processes):
         logfile = f"{logs_dir}/log_{n}"
         cmd = ["numactl", f"--physcpubind={gen_threads_config(args.num_threads, n)}",
-               "/llm/batched-bench", args.model, str(args.kv_cache), "2048", "512", "0", "0", "0", str(args.prompt_size), str(TOKENS),
-               str(args.batch_size), str(args.num_threads)]
+               "/llm/llama-batched-bench", "-m", args.model, "-c", str(args.kv_cache), "-b", "2048", "-ub", "512", "-npp", str(args.prompt_size), "-ntg", str(TOKENS),
+               "-npl", str(args.batch_size), "-t", str(args.num_threads), "-tb", str(args.num_threads), "-td", str(args.num_threads)]
         current_subprocesses.append(
             subprocess.Popen(cmd, stdout=open(logfile, 'wb'), stderr=open(logfile, 'wb')))
     start = time.time()
@@ -130,4 +130,4 @@
 
 
 if __name__ == "__main__":
-    main()
+    main()
\ No newline at end of file
From a6e4b32a3ca695695bd0cfe2110218f584e700df Mon Sep 17 00:00:00 2001
From: Marcel Wilnicki
Date: Thu, 22 Aug 2024 14:55:31 +0200
Subject: [PATCH 2/8] wip

---
 benchmarks/README.md | 14 +++++++++-----
 benchmarks/run.sh    |  2 +-
 2 files changed, 10 insertions(+), 6 deletions(-)

diff --git a/benchmarks/README.md b/benchmarks/README.md
index 28393c9..208f17d 100644
--- a/benchmarks/README.md
+++ b/benchmarks/README.md
@@ -5,24 +5,28 @@
 ## ARM
 Instructions assume you have a Debian-based OS
 ```bash
+cd benchmarks
 sudo bash setup_deb.sh
 # vim download_models.sh # uncomment / add models you want to download
 bash download_models.sh
-# vim run.sh # modify run.sh
+# vim run.sh # modify run.sh (provide the name of the docker image, threads, batch sizes etc.)
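+# for reference, the stock run.sh at this point in the series runs:
+#   python3 run.py -m Meta-Llama-3-8B-Instruct.Q8_0.gguf -t 8 12 16 24 48 -b 1 2 4 8 16 32 -p 128 -r 0-47 -d amperecomputingai/llama.cpp:latest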
 nohup sudo bash run.sh
 ```
-Benchmarks will take a few hours in the default setting, going over various combinations of n_proc x n_threads x batch_size x prompt_size x model_size 😵‍💫
-After they complete you will find .csv files with results in the benchmarks directory of this repo.
 
 ## x86
 Instructions assume you have a Debian-based OS
 ```bash
+cd benchmarks
 sudo bash setup_deb.sh
 # vim download_models.sh # uncomment / add models you want to download
 bash download_models.sh
 
 cd utils
 sudo docker build -t llama_x86 .
+cd ..
-# vim run.sh # modify run.sh
+# vim run.sh # modify run.sh (provide the name of the docker image, threads, batch sizes etc.)
 nohup sudo bash run.sh
-```
\ No newline at end of file
+```
+
+Benchmarks will take a few hours in the default setting, going over various combinations of n_proc x n_threads x batch_size x prompt_size x model_size 😵‍💫
+After they complete you will find .csv files with results in the benchmarks directory of this repo.
\ No newline at end of file
diff --git a/benchmarks/run.sh b/benchmarks/run.sh
index eab7203..6286c60 100644
--- a/benchmarks/run.sh
+++ b/benchmarks/run.sh
@@ -1,4 +1,4 @@
 set -e
 
-python3 run.py -m Meta-Llama-3-8B-Instruct.Q8_0.gguf -t 8 12 16 24 48 -b 1 2 4 8 16 32 -p 128 -r 0-47 -d llama.cpp:latest
+python3 run.py -m Meta-Llama-3-8B-Instruct.Q8_0.gguf -t 8 12 16 24 48 -b 1 2 4 8 16 32 -p 128 -r 0-47 -d amperecomputingai/llama.cpp:latest
 
 rm -f /tmp/log_power
From 4bf2b534760ee91b9592c5d8cfc582d2c06df8ed Mon Sep 17 00:00:00 2001
From: Marcel Wilnicki
Date: Thu, 22 Aug 2024 20:36:10 +0200
Subject: [PATCH 3/8] wip

---
 benchmarks/download_models.sh | 3 +++
 benchmarks/run.sh             | 2 +-
 2 files changed, 4 insertions(+), 1 deletion(-)

diff --git a/benchmarks/download_models.sh b/benchmarks/download_models.sh
index dbb3329..89942f1 100644
--- a/benchmarks/download_models.sh
+++ b/benchmarks/download_models.sh
@@ -9,3 +9,6 @@ mkdir -p $SCRIPT_DIR/models
 #huggingface-cli download TheBloke/Llama-2-70B-GGUF llama-2-70b.Q4_K_M.gguf --local-dir $SCRIPT_DIR/models --local-dir-use-symlinks False
 huggingface-cli download QuantFactory/Meta-Llama-3-8B-Instruct-GGUF Meta-Llama-3-8B-Instruct.Q8_0.gguf --local-dir $SCRIPT_DIR/models --local-dir-use-symlinks False
 #huggingface-cli download QuantFactory/Meta-Llama-3-8B-Instruct-GGUF Meta-Llama-3-8B-Instruct.Q4_K_M.gguf --local-dir $SCRIPT_DIR/models --local-dir-use-symlinks False
+
+wget -P models https://ampereaimodelzoo.s3.eu-central-1.amazonaws.com/gguf/Meta-Llama-3-8B-Instruct.Q4_K_4.gguf
+wget -P models https://ampereaimodelzoo.s3.eu-central-1.amazonaws.com/gguf/Meta-Llama-3-8B-Instruct.Q8R16.gguf
\ No newline at end of file
diff --git a/benchmarks/run.sh b/benchmarks/run.sh
index 6286c60..157f5b1 100644
--- a/benchmarks/run.sh
+++ b/benchmarks/run.sh
@@ -1,4 +1,4 @@
 set -e
 
-python3 run.py -m Meta-Llama-3-8B-Instruct.Q8_0.gguf -t 8 12 16 24 48 -b 1 2 4 8 16 32 -p 128 -r 0-47 -d amperecomputingai/llama.cpp:latest
+python3 run.py -m Meta-Llama-3-8B-Instruct.Q4_K_4.gguf -t 128 -b 1 32 -p 128 -r 0-127 -d amperecomputingai/llama.cpp:latest
 
 rm -f /tmp/log_power
From 8ffab2e275c8e5596177fb5016fb8125ab373496 Mon Sep 17 00:00:00 2001
From: Marcel Wilnicki
Date: Thu, 22 Aug 2024 21:29:06 +0200
Subject: [PATCH 4/8] wip

---
 benchmarks/README.md | 20 ++++++++++++++++++--
 benchmarks/run.sh    |  3 +--
 2 files changed, 19 insertions(+), 4 deletions(-)

diff --git a/benchmarks/README.md b/benchmarks/README.md
index 208f17d..3308bce 100644
--- a/benchmarks/README.md
+++ b/benchmarks/README.md
@@ -28,5 +28,21 @@
 cd ..
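 # run.sh passes the docker image to run.py via its -d flag; for the locally
 # built x86 image above that would be -d llama_x86:latest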
nohup sudo bash run.sh ``` -Benchmarks will take few hours in default setting, going over various combinations of n_proc x n_threads x batch_size x prompt_size x model_size 😵‍💫 -After they complete you will find .csv files with results in the benchmarks directory of this repo. \ No newline at end of file +Benchmarks will take a moment in default setting. +After they complete you will find .csv files with results in the benchmarks directory of this repo. + +### results on Altra Max with 128 threads on 1 process: + +#### Meta-Llama-3-8B-Instruct.Q4_K_4.gguf + +| Batch Size | total token generation capability, tps | +|------------|----------------------------------------| +| 1 | 26.13 | +| 32 | 102.85 | + +#### Meta-Llama-3-8B-Instruct.Q8R16.gguf + +| Batch Size | total token generation capability, tps | +|------------|----------------------------------------| +| 1 | 18.37 | +| 32 | 121.19 | \ No newline at end of file diff --git a/benchmarks/run.sh b/benchmarks/run.sh index 157f5b1..a7c59a8 100644 --- a/benchmarks/run.sh +++ b/benchmarks/run.sh @@ -1,4 +1,3 @@ set -e -python3 run.py -m Meta-Llama-3-8B-Instruct.Q4_K_4.gguf -t 128 -b 1 32 -p 128 -r 0-127 -d amperecomputingai/llama.cpp:latest -rm -f /tmp/log_power +python3 run.py -m Meta-Llama-3-8B-Instruct.Q4_K_4.gguf Meta-Llama-3-8B-Instruct.Q8R16.gguf -t 128 -b 1 32 -p 128 -r 0-127 -d amperecomputingai/llama.cpp:latest From 55831a2246758d25f7f1a35a7bdabea95764463f Mon Sep 17 00:00:00 2001 From: Marcel Wilnicki Date: Mon, 26 Aug 2024 23:42:11 +0200 Subject: [PATCH 5/8] wip --- ...-3-8B-Instruct.Q4_K_4.gguf@PP128@TG256.csv | 48 +++++++++++++++++++ benchmarks/README.md | 18 ++++--- 2 files changed, 56 insertions(+), 10 deletions(-) create mode 100644 benchmarks/Meta-Llama-3-8B-Instruct.Q4_K_4.gguf@PP128@TG256.csv diff --git a/benchmarks/Meta-Llama-3-8B-Instruct.Q4_K_4.gguf@PP128@TG256.csv b/benchmarks/Meta-Llama-3-8B-Instruct.Q4_K_4.gguf@PP128@TG256.csv new file mode 100644 index 0000000..6e750ed --- /dev/null +++ b/benchmarks/Meta-Llama-3-8B-Instruct.Q4_K_4.gguf@PP128@TG256.csv @@ -0,0 +1,48 @@ +n_proc,n_threads,batch_size,prompt_size,output_tokens,pp_throughput_tps,pp_avg_latency_sec,tg_throughput_tps,tg_avg_latency_sec,pp+tg_throughput_tps,concurrency +16,8,1,128,256,396.38615284214126,5.166812500000001,53.9083856577509,0.29706616210937503,70.1353850368713,16 +10,12,1,128,256,370.02208672228636,3.4593000000000003,60.55562550646733,0.16513789062499998,83.72397252807151,10 +8,16,1,128,256,384.57009450522344,2.66275,52.60371538348852,0.15346875000000004,61.203753511445825,8 +5,24,1,128,256,350.31630996146095,1.827,48.527500826418034,0.10303984375,67.38497174744674,5 +4,32,1,128,256,363.9239005060785,1.407,43.38138992258577,0.09220800781249999,60.9620574694396,4 +2,48,1,128,256,269.9215189310567,0.9484999999999999,44.06198301912035,0.045390625000000004,61.102713024106926,2 +2,64,1,128,256,321.4985100042571,0.7965,38.03764799362936,0.052583984375000004,53.44839585218178,2 +1,128,1,128,256,217.31748726655348,0.589,26.0905014268243,0.038328125,36.91952696856072,1 +16,8,2,128,256,418.19975883865374,9.794625000000002,110.64770308944307,0.28933642578125,139.80794611568743,32 +10,12,2,128,256,383.6810120647905,6.672500000000001,123.93800740714173,0.16137148437499998,159.67731875168928,20 +8,16,2,128,256,405.32717604607586,5.052875,95.94191098030699,0.16694531250000003,120.60774999018491,16 +5,24,2,128,256,370.0741462149616,3.4588,106.02687031013078,0.09431953125,138.05004314063848,10 
+4,32,2,128,256,388.9014496996768,2.63325,89.76305574034251,0.0891298828125,119.70541246152048,8 +2,48,2,128,256,291.0064463943994,1.7595,69.88091980884391,0.057240234375000004,93.4932132205247,4 +2,64,2,128,256,357.67939408136175,1.4315,68.79871778810667,0.058140625,94.06577255190152,4 +1,128,2,128,256,268.6253934942288,0.953,47.66337739713275,0.0419609375,65.66908935442497,2 +16,8,4,128,256,417.81774681469346,19.6070625,198.07566817111712,0.32324731445312505,230.91884578161554,64 +10,12,4,128,256,370.82495128900973,13.809199999999999,200.5339634743387,0.199467578125,235.79981578139393,40 +8,16,4,128,256,410.30508573974623,9.982875,169.34048025895996,0.18912255859375002,198.97017390459538,32 +5,24,4,128,256,366.9544687265254,6.9768,180.1851591265522,0.11099921875,215.5971029139296,20 +4,32,4,128,256,392.2466482549231,5.2215,150.15846988520602,0.10655761718750001,187.8726722319053,16 +2,48,4,128,256,284.5256753784404,3.599,115.78471872089322,0.06909375000000001,144.2185812872635,8 +2,64,4,128,256,359.8034102940944,2.846,118.77514277579678,0.06735546875000001,152.3280606932117,8 +1,128,4,128,256,281.3186813186813,1.82,77.25969518635884,0.0517734375,101.89730662067136,4 +16,8,8,128,256,414.18260946754066,39.55800000000001,262.8345920588023,0.48716674804687504,287.23702664796633,128 +10,12,8,128,256,369.413996605541,27.72,247.47420827069718,0.32326640625,277.6798546519511,80 +8,16,8,128,256,406.2895840736078,20.163125,249.33713209113648,0.25785986328125,250.1603200293156,64 +5,24,8,128,256,362.30700183382214,14.131800000000002,224.36780736740155,0.17828125,255.81666472361474,40 +4,32,8,128,256,391.70590920021067,10.457,218.82873225516573,0.146515625,241.2439139312078,32 +2,48,8,128,256,292.82435876636123,6.994,155.60832128208014,0.10282226562499999,184.3273730949238,16 +2,64,8,128,256,359.96800722648663,5.689500000000001,172.949889329783,0.09251562499999999,207.99620840245097,16 +1,128,8,128,256,279.93439037725534,3.658,112.24377945851147,0.0712734375,140.24835646457268,8 +16,8,16,128,256,394.31195681747676,83.102625,220.8452042927132,1.159221923828125,255.87073197395068,256 +10,12,16,128,256,354.8340487251948,57.71869999999999,220.08028218556274,0.7270101562500001,251.46215584596305,160 +8,16,16,128,256,386.3011539159702,42.413250000000005,242.90285753371558,0.5270434570312499,271.0428798305982,128 +5,24,16,128,256,343.2492894093842,29.8336,219.71721397054426,0.3641046875,249.1160919913069,80 +4,32,16,128,256,368.9281498405472,22.2055,223.26757403084713,0.28668554687500003,254.80559875583202,64 +2,48,16,128,256,271.58209471225365,15.082,159.5917106959114,0.20051171875,184.9293421824913,32 +2,64,16,128,256,335.98882759548496,12.190999999999999,184.51845046619877,0.17343750000000002,215.52223099184425,32 +1,128,16,128,256,258.9127686472819,7.91,122.06824616301594,0.13107421875,148.17315808513203,16 +10,12,32,128,256,322.83991537277467,126.8774,151.25965729387573,2.1155941406250003,183.3408680939443,320 +8,16,32,128,256,346.5517507716242,94.55537500000003,176.17688788074838,1.453111328125,209.7223585231709,256 +5,24,32,128,256,308.451153580514,66.4,169.41941024156043,0.94440390625,198.67935144644,160 +4,32,32,128,256,326.4712519372915,50.186,175.4579368054558,0.7295205078125,207.09879664273433,128 +2,48,32,128,256,236.30887564930254,34.6665,127.3553249285536,0.50253125,150.41496315518887,64 +2,64,32,128,256,289.23018367609586,28.3245,146.60042212942176,0.436591796875,174.04482844091922,64 +1,128,32,128,256,219.80144888650386,18.635,101.9019541988531,0.31402734375,124.08862318986931,32 \ No 
newline at end of file diff --git a/benchmarks/README.md b/benchmarks/README.md index 3308bce..bbf757f 100644 --- a/benchmarks/README.md +++ b/benchmarks/README.md @@ -1,4 +1,3 @@ -![Ampere AI](https://ampereaimodelzoo.s3.eu-central-1.amazonaws.com/ampere_logo_®_primary_stacked_rgb.png "Ampere AI") # Wrapper for multi-process / batched benchmark of llama.cpp @@ -31,18 +30,17 @@ nohup sudo bash run.sh Benchmarks will take a moment in default setting. After they complete you will find .csv files with results in the benchmarks directory of this repo. -### results on Altra Max with 128 threads on 1 process: +### results on Altra Max #### Meta-Llama-3-8B-Instruct.Q4_K_4.gguf -| Batch Size | total token generation capability, tps | -|------------|----------------------------------------| -| 1 | 26.13 | -| 32 | 102.85 | +| n_proc | n_threads | batch_size | prompt_size | output_tokens | total token generation capability, tps | +|--------|-----------|------------|-------------|---------------|----------------------------------------| +| 16 | 8 | 8 | 128 | 256 | 262.8345921 | + #### Meta-Llama-3-8B-Instruct.Q8R16.gguf -| Batch Size | total token generation capability, tps | -|------------|----------------------------------------| -| 1 | 18.37 | -| 32 | 121.19 | \ No newline at end of file +| n_proc | n_threads | batch_size | prompt_size | output_tokens | total token generation capability, tps | +|--------|-----------|------------|-------------|---------------|----------------------------------------| +| 10 | 12 | 16 | 128 | 256 | 294.2275261 | \ No newline at end of file From 3b8a0abbe19e811341f8deeed9539d53ae889368 Mon Sep 17 00:00:00 2001 From: Marcel Wilnicki Date: Mon, 26 Aug 2024 23:42:49 +0200 Subject: [PATCH 6/8] wip --- benchmarks/README.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/benchmarks/README.md b/benchmarks/README.md index bbf757f..84f672c 100644 --- a/benchmarks/README.md +++ b/benchmarks/README.md @@ -36,11 +36,11 @@ After they complete you will find .csv files with results in the benchmarks dire | n_proc | n_threads | batch_size | prompt_size | output_tokens | total token generation capability, tps | |--------|-----------|------------|-------------|---------------|----------------------------------------| -| 16 | 8 | 8 | 128 | 256 | 262.8345921 | +| 16 | 8 | 8 | 128 | 256 | 262.83 | #### Meta-Llama-3-8B-Instruct.Q8R16.gguf | n_proc | n_threads | batch_size | prompt_size | output_tokens | total token generation capability, tps | |--------|-----------|------------|-------------|---------------|----------------------------------------| -| 10 | 12 | 16 | 128 | 256 | 294.2275261 | \ No newline at end of file +| 10 | 12 | 16 | 128 | 256 | 294.23 | \ No newline at end of file From f110e3d70966de27f33fccd89a238f866a597eca Mon Sep 17 00:00:00 2001 From: Marcel Wilnicki Date: Mon, 26 Aug 2024 23:46:30 +0200 Subject: [PATCH 7/8] wip --- benchmarks/README.md | 1 + 1 file changed, 1 insertion(+) diff --git a/benchmarks/README.md b/benchmarks/README.md index 84f672c..06717d7 100644 --- a/benchmarks/README.md +++ b/benchmarks/README.md @@ -31,6 +31,7 @@ Benchmarks will take a moment in default setting. After they complete you will find .csv files with results in the benchmarks directory of this repo. ### results on Altra Max +the results were gathered using amperecomputingai/llama.cpp:1.2.6 image with aio optimizations on an Altra Max. 
 #### Meta-Llama-3-8B-Instruct.Q4_K_4.gguf
From 863c1d47e7d0f0be958e32933e0f79a8adbf6664 Mon Sep 17 00:00:00 2001
From: Marcel Wilnicki
Date: Tue, 27 Aug 2024 11:48:23 +0200
Subject: [PATCH 8/8] remove run.sh

---
 benchmarks/README.md          | 23 ++++++++++++++++++-----
 benchmarks/download_models.sh |  2 +-
 benchmarks/run.sh             |  3 ---
 3 files changed, 19 insertions(+), 9 deletions(-)
 delete mode 100644 benchmarks/run.sh

diff --git a/benchmarks/README.md b/benchmarks/README.md
index 06717d7..ec19eb3 100644
--- a/benchmarks/README.md
+++ b/benchmarks/README.md
@@ -8,8 +8,8 @@ cd benchmarks
 sudo bash setup_deb.sh
 # vim download_models.sh # uncomment / add models you want to download
 bash download_models.sh
-# vim run.sh # modify run.sh (provide the name of the docker image, threads, batch sizes etc.)
-nohup sudo bash run.sh
+# quick run
+sudo python3 run.py -m Meta-Llama-3-8B-Instruct.Q4_K_4.gguf Meta-Llama-3-8B-Instruct.Q8R16.gguf -t 128 -b 1 -p 128 -r 0-127 -d amperecomputingai/llama.cpp:latest
 ```
 
 ## x86
@@ -23,8 +23,8 @@ bash download_models.sh
 cd utils
 sudo docker build -t llama_x86 .
 cd ..
-# vim run.sh # modify run.sh (provide the name of the docker image, threads, batch sizes etc.)
-nohup sudo bash run.sh
+# quick run
+python3 run.py -m Meta-Llama-3-8B-Instruct.Q4_K_M.gguf Meta-Llama-3-8B-Instruct.Q8_0.gguf -t 128 -b 1 -p 128 -r 0-127 -d llama_x86:latest
 ```
 
 Benchmarks will take a moment in default setting.
@@ -44,4 +44,17 @@
 | n_proc | n_threads | batch_size | prompt_size | output_tokens | total token generation capability, tps |
 |--------|-----------|------------|-------------|---------------|----------------------------------------|
-| 10 | 12 | 16 | 128 | 256 | 294.23 |
\ No newline at end of file
+| 10 | 12 | 16 | 128 | 256 | 294.23 |
+
+
+## run.py options
+Provide the run.py Python script with the following arguments:
+- -m, filename(s) of model(s) that should be available under _**llama.cpp/benchmarks/models**_ directory; multiple models can be provided
+- -t, threadpool(s) per single process, e.g., if there are 20 threads available on the system and -t 10 is provided, 2 parallel processes will be spawned, each using 10 threads;
+ multiple threadpools can be provided and they will be treated as separate cases to benchmark
+- -b, batch size(s) to benchmark, meaning separate token generation streams handled as a single batch; multiple batch sizes can be provided and they will be treated as separate cases to benchmark
+- -p, prompt size(s) to benchmark, i.e. the size of an input prompt; multiple prompt sizes can be provided and they will be treated as separate cases to benchmark
+- -r, thread-range, e.g., on an 80-thread system, it should be input as 0-79, unless the user wants to use just a subset of the available threads, say 16-63 (48 threads indexed 16<>63)
+```bash
+python3 run.py -m Meta-Llama-3-8B-Instruct.Q8_0.gguf -t 10 16 32 40 64 80 -b 1 2 4 8 16 32 64 -p 512 -r 0-79
+```
diff --git a/benchmarks/download_models.sh b/benchmarks/download_models.sh
index 89942f1..f5bfc59 100644
--- a/benchmarks/download_models.sh
+++ b/benchmarks/download_models.sh
@@ -8,7 +8,7 @@ mkdir -p $SCRIPT_DIR/models
 #huggingface-cli download TheBloke/Llama-2-13B-GGUF llama-2-13b.Q8_0.gguf --local-dir $SCRIPT_DIR/models --local-dir-use-symlinks False
 #huggingface-cli download TheBloke/Llama-2-70B-GGUF llama-2-70b.Q4_K_M.gguf --local-dir $SCRIPT_DIR/models --local-dir-use-symlinks False
 huggingface-cli download QuantFactory/Meta-Llama-3-8B-Instruct-GGUF Meta-Llama-3-8B-Instruct.Q8_0.gguf --local-dir $SCRIPT_DIR/models --local-dir-use-symlinks False
-#huggingface-cli download QuantFactory/Meta-Llama-3-8B-Instruct-GGUF Meta-Llama-3-8B-Instruct.Q4_K_M.gguf --local-dir $SCRIPT_DIR/models --local-dir-use-symlinks False
+huggingface-cli download QuantFactory/Meta-Llama-3-8B-Instruct-GGUF Meta-Llama-3-8B-Instruct.Q4_K_M.gguf --local-dir $SCRIPT_DIR/models --local-dir-use-symlinks False
 
 wget -P models https://ampereaimodelzoo.s3.eu-central-1.amazonaws.com/gguf/Meta-Llama-3-8B-Instruct.Q4_K_4.gguf
 wget -P models https://ampereaimodelzoo.s3.eu-central-1.amazonaws.com/gguf/Meta-Llama-3-8B-Instruct.Q8R16.gguf
\ No newline at end of file
diff --git a/benchmarks/run.sh b/benchmarks/run.sh
deleted file mode 100644
index a7c59a8..0000000
--- a/benchmarks/run.sh
+++ /dev/null
@@ -1,3 +0,0 @@
-set -e
-
-python3 run.py -m Meta-Llama-3-8B-Instruct.Q4_K_4.gguf Meta-Llama-3-8B-Instruct.Q8R16.gguf -t 128 -b 1 32 -p 128 -r 0-127 -d amperecomputingai/llama.cpp:latest
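Since patches 5-7 above pick the best rows out of the generated result CSVs by hand, here is a minimal sketch (plain Python 3, standard library only — an illustration, not code from the repo) of how the same selection can be automated; the filename follows the `<model>@PP<prompt_size>@TG<output_tokens>.csv` pattern and the column names match the CSV added in patch 5:

```python
import csv

def best_tg_config(path):
    # return the CSV row with the highest total token generation throughput
    with open(path, newline="") as f:
        rows = list(csv.DictReader(f))
    return max(rows, key=lambda r: float(r["tg_throughput_tps"]))

best = best_tg_config("Meta-Llama-3-8B-Instruct.Q4_K_4.gguf@PP128@TG256.csv")
print(f'{best["n_proc"]} proc x {best["n_threads"]} threads, batch {best["batch_size"]}: '
      f'{float(best["tg_throughput_tps"]):.2f} tps')
# for the CSV above this prints: 16 proc x 8 threads, batch 8: 262.83 tps
```

And a rough sketch of the -r / thread-range expansion and per-process CPU pinning described in the run.py options above — an assumption for illustration, since the repo's actual parse_threads_range and gen_threads_config implementations are not shown in these patches (and the repo's gen_threads_config takes only the thread count and process index):

```python
def parse_threads_range(threads_range):
    # "0-79" -> [0, 1, ..., 79]
    first, last = (int(x) for x in threads_range.split("-"))
    return list(range(first, last + 1))

def gen_threads_config(cpus, num_threads, proc_idx):
    # contiguous slice of cpu ids for process proc_idx,
    # formatted for numactl --physcpubind=...
    chunk = cpus[proc_idx * num_threads:(proc_idx + 1) * num_threads]
    return ",".join(map(str, chunk))

cpus = parse_threads_range("0-79")      # 80 available hardware threads
assert len(cpus) // 10 == 8             # -t 10 -> 8 parallel processes
print(gen_threads_config(cpus, 10, 1))  # -> "10,11,12,13,14,15,16,17,18,19"
```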