Skip to content

Commit 70591b9

Browse files
authored
Add datasets and allow to choose them (#259)
* Use 2 small cohere datasets * Use choice inputs
1 parent e3100e6 commit 70591b9

File tree

10 files changed

+102
-11
lines changed

10 files changed

+102
-11
lines changed

.github/workflows/actions/create-inventory/action.yaml

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -39,3 +39,9 @@ runs:
3939
EOL
4040
4141
mv inventory.ini ansible/playbooks/inventory.ini
42+
- name: Prepare datasets.yml
43+
shell: bash
44+
run: |
45+
apk add yq
46+
echo -e "datasets:\n" > ansible/playbooks/group_vars/datasets.yml
47+
yq -p json -o yaml datasets/datasets.json >> ansible/playbooks/group_vars/datasets.yml

.github/workflows/continuous-benchmark-hnsw.yaml

Lines changed: 24 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,21 @@ name: Continuous Benchmark Hnsw Indexing
22

33
on:
44
workflow_dispatch:
5+
inputs:
6+
dataset_name:
7+
description: 'First dataset name for transform benchmark'
8+
required: false
9+
type: choice
10+
options:
11+
- 'cohere-wiki-100k-no-filters'
12+
- 'laion-small-clip-no-filters-1'
13+
dataset_2_name:
14+
description: 'Second dataset name for transform benchmark'
15+
required: false
16+
type: choice
17+
options:
18+
- 'cohere-wiki-100k-no-filters-2'
19+
- 'laion-small-clip-no-filters-2'
520
schedule:
621
# Run every day at 3am
722
- cron: "0 3 * * *"
@@ -30,7 +45,10 @@ jobs:
3045
- name: Run bench
3146
id: hnsw-indexing-update
3247
run: |
33-
cd ansible/playbooks && ansible-playbook playbook-hnsw-index.yml --extra-vars "bench=update"
48+
cd ansible/playbooks && ansible-playbook playbook-hnsw-index.yml --extra-vars "
49+
bench=update
50+
dataset_name=dbpedia-openai-100K-1536-angular
51+
"
3452
3553
runTransformHealingBenchmark:
3654
runs-on: ubuntu-latest
@@ -50,4 +68,8 @@ jobs:
5068
- name: Run bench
5169
id: hnsw-indexing-transform
5270
run: |
53-
cd ansible/playbooks && ansible-playbook playbook-hnsw-index.yml --extra-vars "bench=transform"
71+
cd ansible/playbooks && ansible-playbook playbook-hnsw-index.yml --extra-vars "
72+
bench=transform
73+
dataset_name=${{ inputs.dataset_name || 'laion-small-clip-no-filters-1' }}
74+
dataset_2_name=${{ inputs.dataset_2_name || 'laion-small-clip-no-filters-2' }}
75+
"

ansible/README.md

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,7 @@
33
### Prerequisites
44
* ssh keys (to connect to the remote machines)
55
* inventory.ini (to define the actual machine on which the benchmark is run)
6+
* datasets.yml (to define the datasets used in the benchmark)
67

78
Add inventory.ini in [ansible/playbooks/](playbooks) with the following content:
89
```ini
@@ -13,6 +14,12 @@ benchmark-machine ansible_host=${YOUR_SERVER_IP} ansible_user=${YOUR_USER}
1314
benchmark-db ansible_host=${YOUR_SERVER_IP} ansible_user=${YOUR_USER}
1415
```
1516

17+
Convert [datasets/datasets.json](../datasets/datasets.json) into datasets.yml in [ansible/playbooks/group_vars](playbooks/group_vars).
18+
You can use `yq` to do this. Note that the YAML file must start with a `datasets:` line, because the command below appends the converted entries to it. From the [ansible](.) directory, run:
19+
```bash
20+
yq -p json -o=yaml ../datasets/datasets.json >> playbooks/group_vars/datasets.yml
21+
```
22+
1623
### Run ansible inside Docker
1724
Ensure the ssh keys are properly mounted into the container.
1825

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,3 @@
1+
# You can populate this file by running the following command from the project root (requires yq to be installed):
2+
# yq -p json -o=yaml datasets/datasets.json >> ansible/playbooks/group_vars/datasets.yml
3+
datasets:

ansible/playbooks/group_vars/hnsw-indexing-transform.yml

Lines changed: 1 addition & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,10 +1,8 @@
11
qdrant_python_client_version: "1.14.0"
22
logging_dir: "/tmp/logs"
33
working_dir: "/tmp/experiments"
4-
dataset_url: "https://storage.googleapis.com/ann-filtered-benchmark/datasets/laion-small-clip-no-filters-1.tgz"
4+
# Default dataset values (can be overridden via --extra-vars)
55
dataset_name: "laion-small-clip-no-filters-1"
6-
dataset_dim: "512"
7-
dataset_2_url: "https://storage.googleapis.com/ann-filtered-benchmark/datasets/laion-small-clip-no-filters-2.tgz"
86
dataset_2_name: "laion-small-clip-no-filters-2"
97
servers:
108
- name: "qdrant"

ansible/playbooks/group_vars/hnsw-indexing-update.yml

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,9 +1,8 @@
11
qdrant_python_client_version: "1.14.0"
22
logging_dir: "/tmp/logs"
33
working_dir: "/tmp/experiments"
4-
dataset_url: "https://storage.googleapis.com/ann-filtered-benchmark/datasets/dbpedia_openai_100K.tgz"
4+
# Default dataset value (can be overridden via --extra-vars)
55
dataset_name: "dbpedia_openai_100K"
6-
dataset_dim: "1536"
76
servers:
87
- name: "qdrant"
98
registry: "ghcr.io"

ansible/playbooks/playbook-hnsw-index.yml

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,9 @@
66
- name: Load common variables
77
include_vars: "group_vars/hnsw-indexing-{{ bench | default('update') }}.yml"
88

9+
- name: Load datasets variables
10+
include_vars: "group_vars/datasets.yml"
11+
912
- name: Ensure necessary packages are installed
1013
ansible.builtin.package:
1114
name: "{{ item }}"

ansible/playbooks/roles/run-hnsw-indexing-transform/tasks/main.yml

Lines changed: 16 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -13,14 +13,22 @@
1313
- "{{ working_dir }}/data/{{ dataset_name }}"
1414
- "{{ working_dir }}/data/{{ dataset_2_name }}"
1515

16+
- name: Get dataset info for first dataset
17+
ansible.builtin.set_fact:
18+
dataset_1_info: "{{ datasets | selectattr('name', 'equalto', dataset_name) | first }}"
19+
20+
- name: Get dataset info for second dataset
21+
ansible.builtin.set_fact:
22+
dataset_2_info: "{{ datasets | selectattr('name', 'equalto', dataset_2_name) | first }}"
23+
1624
- name: Check if the dataset archive already exists
1725
ansible.builtin.stat:
1826
path: "{{ working_dir }}/data/{{ dataset_name }}.tgz"
1927
register: archive_stat
2028

2129
- name: Download the archive
2230
ansible.builtin.get_url:
23-
url: "{{ dataset_url }}"
31+
url: "{{ dataset_1_info.link }}"
2432
dest: "{{ working_dir }}/data/{{ dataset_name }}.tgz"
2533
when: not archive_stat.stat.exists
2634

@@ -45,7 +53,7 @@
4553

4654
- name: Download the second archive
4755
ansible.builtin.get_url:
48-
url: "{{ dataset_2_url }}"
56+
url: "{{ dataset_2_info.link }}"
4957
dest: "{{ working_dir }}/data/{{ dataset_2_name }}.tgz"
5058
when: not archive_2_stat.stat.exists
5159

@@ -63,6 +71,11 @@
6371
owner: "{{ ansible_user }}"
6472
when: dest_2_dir_contents.matched == 0
6573

74+
- name: Set dataset dimensions
75+
ansible.builtin.set_fact:
76+
dataset_dim: "{{ dataset_1_info.vector_size }}"
77+
dataset_2_dim: "{{ dataset_2_info.vector_size }}"
78+
6679
- name: Prepare and execute the benchmark
6780
ansible.builtin.include_role:
68-
name: run-hnsw-indexing-common
81+
name: run-hnsw-indexing-common

ansible/playbooks/roles/run-hnsw-indexing-update/tasks/main.yml

Lines changed: 9 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -12,14 +12,18 @@
1212
- "{{ working_dir }}/data"
1313
- "{{ working_dir }}/data/{{ dataset_name }}"
1414

15+
- name: Get dataset info
16+
ansible.builtin.set_fact:
17+
dataset_info: "{{ datasets | selectattr('name', 'equalto', dataset_name) | first }}"
18+
1519
- name: Check if the dataset archive already exists
1620
ansible.builtin.stat:
1721
path: "{{ working_dir }}/data/{{ dataset_name }}.tgz"
1822
register: archive_stat
1923

2024
- name: Download the archive
2125
ansible.builtin.get_url:
22-
url: "{{ dataset_url }}"
26+
url: "{{ dataset_info.link }}"
2327
dest: "{{ working_dir }}/data/{{ dataset_name }}.tgz"
2428
when: not archive_stat.stat.exists
2529

@@ -37,6 +41,10 @@
3741
owner: "{{ ansible_user }}"
3842
when: dest_dir_contents.matched == 0
3943

44+
- name: Set dataset dimension
45+
ansible.builtin.set_fact:
46+
dataset_dim: "{{ dataset_info.vector_size }}"
47+
4048
- name: Prepare and execute the benchmark
4149
ansible.builtin.include_role:
4250
name: run-hnsw-indexing-common

datasets/datasets.json

Lines changed: 32 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -363,6 +363,38 @@
363363
"link": "https://storage.googleapis.com/ann-filtered-benchmark/datasets/cohere-wiki-50m-test-only.tgz",
364364
"path": "cohere-wiki-50m/cohere_wiki_50m"
365365
},
366+
{
367+
"name": "cohere-wiki-100k-no-filters",
368+
"vector_size": 768,
369+
"distance": "cosine",
370+
"type": "tar",
371+
"link": "https://storage.googleapis.com/ann-filtered-benchmark/datasets/cohere-wiki-100k-no-filters.tgz",
372+
"path": "cohere-wiki-100k/cohere_wiki_100k_no_filters"
373+
},
374+
{
375+
"name": "cohere-wiki-100k-no-filters-2",
376+
"vector_size": 768,
377+
"distance": "cosine",
378+
"type": "tar",
379+
"link": "https://storage.googleapis.com/ann-filtered-benchmark/datasets/cohere-wiki-100k-no-filters-2.tgz",
380+
"path": "cohere-wiki-100k/cohere_wiki_100k_no_filters_2"
381+
},
382+
{
383+
"name": "laion-small-clip-no-filters-1",
384+
"vector_size": 512,
385+
"distance": "cosine",
386+
"type": "tar",
387+
"path": "laion-small-clip/laion-small-clip-no-filters-1",
388+
"link": "https://storage.googleapis.com/ann-filtered-benchmark/datasets/laion-small-clip-no-filters-1.tgz"
389+
},
390+
{
391+
"name": "laion-small-clip-no-filters-2",
392+
"vector_size": 512,
393+
"distance": "cosine",
394+
"type": "tar",
395+
"path": "laion-small-clip/laion-small-clip-no-filters-2",
396+
"link": "https://storage.googleapis.com/ann-filtered-benchmark/datasets/laion-small-clip-no-filters-2.tgz"
397+
},
366398
{
367399
"name": "cohere-wiki-1m",
368400
"vector_size": 768,

0 commit comments

Comments
 (0)