Skip to content

Commit d782c39

Browse files
authored
Merge pull request #157 from nathan-weinberg/large-runner
ci: add large-size E2E CI job
2 parents 57551d8 + 9ea4ee6 commit d782c39

File tree

2 files changed

+240
-1
lines changed

2 files changed

+240
-1
lines changed
Lines changed: 237 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,237 @@
1+
# SPDX-License-Identifier: Apache-2.0

# End-to-end test workflow that runs on a temporary, self-hosted EC2 runner.
#
# Three jobs run in sequence:
#   1. start-large-ec2-runner  launches the EC2 instance and registers it as a runner
#   2. e2e-large-test          installs instructlab + this library and runs the e2e script
#   3. stop-large-ec2-runner   terminates the instance, whatever the test result was
name: E2E (NVIDIA L40S x4)

on:
  schedule:
    - cron: '0 16 * * *'  # Runs at 4PM UTC every day
  workflow_dispatch:
    inputs:
      pr_or_branch:
        description: 'pull request number or branch name'
        required: true
        default: 'main'

jobs:
  start-large-ec2-runner:
    runs-on: ubuntu-latest
    # Exposed so the test job can target the runner and the stop job can terminate it.
    outputs:
      label: ${{ steps.start-ec2-runner.outputs.label }}
      ec2-instance-id: ${{ steps.start-ec2-runner.outputs.ec2-instance-id }}
    steps:
      - name: Configure AWS credentials
        uses: aws-actions/configure-aws-credentials@e3dd6a429d7300a6a4c196c26e071d42e0343502 # v4.0.2
        with:
          aws-access-key-id: ${{ secrets.AWS_ACCESS_KEY_ID }}
          aws-secret-access-key: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
          aws-region: ${{ secrets.AWS_REGION }}

      - name: Start EC2 runner
        id: start-ec2-runner
        uses: machulav/ec2-github-runner@fcfb31a5760dad1314a64a0e172b78ec6fc8a17e # v2.3.6
        with:
          mode: start
          github-token: ${{ secrets.GH_PERSONAL_ACCESS_TOKEN }}
          ec2-image-id: ami-01a89eee1adde309c
          ec2-instance-type: g6e.12xlarge
          subnet-id: subnet-024298cefa3bedd61
          security-group-id: sg-06300447c4a5fbef3
          iam-role-name: instructlab-ci-runner
          # JSON list of tags applied to the instance; folded into a single line by `>`.
          aws-resource-tags: >
            [
              {"Key": "Name", "Value": "instructlab-ci-github-large-runner"},
              {"Key": "GitHubRepository", "Value": "${{ github.repository }}"},
              {"Key": "GitHubRef", "Value": "${{ github.ref }}"},
              {"Key": "GitHubPR", "Value": "${{ github.event.number }}"}
            ]

  e2e-large-test:
    needs:
      - start-large-ec2-runner
    # Run on the runner that the previous job just registered.
    runs-on: ${{ needs.start-large-ec2-runner.outputs.label }}

    # Needed by the `gh pr comment` steps below.
    permissions:
      pull-requests: write

    steps:
      - name: Install Packages
        run: |
          cat /etc/os-release
          sudo dnf install -y gcc gcc-c++ make git python3.11 python3.11-devel

      - name: Checkout instructlab/instructlab
        uses: actions/checkout@eef61447b9ff4aafe5dcd4e0bbf5d482be7e7871 # v4.2.1
        with:
          repository: "instructlab/instructlab"
          path: "instructlab"
          # https://github.com/actions/checkout/issues/249
          fetch-depth: 0

      - name: Checkout instructlab/eval
        uses: actions/checkout@eef61447b9ff4aafe5dcd4e0bbf5d482be7e7871 # v4.2.1
        with:
          repository: "instructlab/eval"
          path: "eval"
          # https://github.com/actions/checkout/issues/249
          fetch-depth: 0

      - name: Determine if pr_or_branch is a PR number
        id: check_pr
        env:
          # The user-supplied input is handed to the script through the
          # environment instead of being expanded into the script text, so it
          # can never be interpreted as shell code.
          # Defaults to 'main' if not set (e.g. on scheduled runs).
          PR_OR_BRANCH: ${{ github.event.inputs.pr_or_branch || 'main' }}
        run: |
          # An all-digit value is treated as a PR number, anything else as a branch name.
          if [[ "$PR_OR_BRANCH" =~ ^[0-9]+$ ]]; then
            echo "is_pr=true" >> "$GITHUB_OUTPUT"
          else
            echo "is_pr=false" >> "$GITHUB_OUTPUT"
          fi
          echo "pr_or_branch=$PR_OR_BRANCH" >> "$GITHUB_OUTPUT"

      - name: Check if gh cli is installed
        id: gh_cli
        run: |
          if command -v gh &> /dev/null ; then
            echo "gh_cli_installed=true" >> "$GITHUB_OUTPUT"
          else
            echo "gh_cli_installed=false" >> "$GITHUB_OUTPUT"
          fi

      - name: Install gh CLI
        if: steps.gh_cli.outputs.gh_cli_installed == 'false'
        run: |
          sudo dnf install 'dnf-command(config-manager)' -y
          sudo dnf config-manager --add-repo https://cli.github.com/packages/rpm/gh-cli.repo
          sudo dnf install gh --repo gh-cli -y

      - name: test gh CLI
        run: |
          gh --version

      - name: set default repo
        working-directory: ./eval
        run: |
          gh repo set-default ${{ github.server_url }}/${{ github.repository }}
        env:
          GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}

      - name: Add comment to PR
        if: steps.check_pr.outputs.is_pr == 'true'
        working-directory: ./eval
        run: |
          gh pr comment "$PR_OR_BRANCH" -b "${{ github.workflow }} workflow launched on this PR: [View run](${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }})"
        env:
          GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
          PR_OR_BRANCH: ${{ steps.check_pr.outputs.pr_or_branch }}

      - name: Fetch and checkout PR
        if: steps.check_pr.outputs.is_pr == 'true'
        working-directory: ./eval
        run: |
          gh pr checkout "$PR_OR_BRANCH"
        env:
          GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
          PR_OR_BRANCH: ${{ steps.check_pr.outputs.pr_or_branch }}

      - name: Checkout branch
        if: steps.check_pr.outputs.is_pr == 'false'
        working-directory: ./eval
        run: |
          # Quoted and read from the environment: the branch name is free-form user input.
          git checkout "$PR_OR_BRANCH"
        env:
          PR_OR_BRANCH: ${{ steps.check_pr.outputs.pr_or_branch }}

      - name: Install ilab
        working-directory: ./instructlab
        run: |
          export CUDA_HOME="/usr/local/cuda"
          export LD_LIBRARY_PATH="$LD_LIBRARY_PATH:/usr/local/cuda/lib64:/usr/local/cuda/extras/CUPTI/lib64"
          export PATH="$PATH:$CUDA_HOME/bin"
          python3.11 -m venv --upgrade-deps venv
          . venv/bin/activate
          nvidia-smi
          python3.11 -m pip cache remove llama_cpp_python

          CMAKE_ARGS="-DLLAMA_CUDA=on" python3.11 -m pip install .

          # https://github.com/instructlab/instructlab/issues/1821
          # install with Torch and build dependencies installed
          python3.11 -m pip install packaging wheel setuptools-scm
          # '.[cuda]' is quoted so the shell never treats the brackets as a glob pattern.
          python3.11 -m pip install '.[cuda]' -r requirements-vllm-cuda.txt

      - name: Update instructlab-eval library
        working-directory: ./eval
        run: |
          # Reuse the virtualenv created by the "Install ilab" step.
          . ../instructlab/venv/bin/activate
          pip install .
          pip install '.[cuda]'

      - name: Check disk
        run: |
          df -h

      - name: Run e2e test
        working-directory: ./instructlab
        env:
          HF_TOKEN: ${{ secrets.HF_TOKEN }}
        run: |
          . venv/bin/activate
          ./scripts/e2e-ci.sh -l

      # Result reporting: PR runs get a PR comment, branch runs get a Slack message.
      - name: Add comment to PR if the workflow failed
        if: failure() && steps.check_pr.outputs.is_pr == 'true'
        working-directory: ./eval
        run: |
          gh pr comment "$PR_OR_BRANCH" -b "e2e workflow failed on this PR: [View run](${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}), please investigate."
        env:
          GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
          PR_OR_BRANCH: ${{ steps.check_pr.outputs.pr_or_branch }}

      - name: Add comment to PR if the workflow succeeded
        if: success() && steps.check_pr.outputs.is_pr == 'true'
        working-directory: ./eval
        run: |
          gh pr comment "$PR_OR_BRANCH" -b "e2e workflow succeeded on this PR: [View run](${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}), congrats!"
        env:
          GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
          PR_OR_BRANCH: ${{ steps.check_pr.outputs.pr_or_branch }}

      - name: Post job results to Slack if the workflow failed
        if: failure() && steps.check_pr.outputs.is_pr == 'false'
        id: slack-report-failure
        uses: slackapi/slack-github-action@37ebaef184d7626c5f204ab8d3baff4262dd30f0 # v1.27.0
        with:
          # Slack channel id, channel name, or user id to post message.
          # See also: https://api.slack.com/methods/chat.postMessage#channels
          # You can pass in multiple channels to post to by providing a comma-delimited list of channel IDs.
          channel-id: 'e2e-ci-results'
          slack-message: "*e2e-nvidia-l40s-x4* job in *${{ github.repository }}* running on branch `${{ steps.check_pr.outputs.pr_or_branch }}` completed *with failures* :meow_sad-rain: | ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}"
        env:
          SLACK_BOT_TOKEN: ${{ secrets.SON_OF_JEEVES_TOKEN }}

      - name: Post job results to Slack if the workflow succeeded
        if: success() && steps.check_pr.outputs.is_pr == 'false'
        id: slack-report-success
        uses: slackapi/slack-github-action@37ebaef184d7626c5f204ab8d3baff4262dd30f0 # v1.27.0
        with:
          # Slack channel id, channel name, or user id to post message.
          # See also: https://api.slack.com/methods/chat.postMessage#channels
          # You can pass in multiple channels to post to by providing a comma-delimited list of channel IDs.
          channel-id: 'e2e-ci-results'
          slack-message: "*e2e-nvidia-l40s-x4* job in *${{ github.repository }}* running on branch `${{ steps.check_pr.outputs.pr_or_branch }}` completed *successfully* :meow_party: | ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}"
        env:
          SLACK_BOT_TOKEN: ${{ secrets.SON_OF_JEEVES_TOKEN }}

  stop-large-ec2-runner:
    needs:
      - start-large-ec2-runner
      - e2e-large-test
    runs-on: ubuntu-latest
    # always(): tear the instance down even when the test job failed or was cancelled.
    # The instance-id check skips this job when the start job never produced an
    # instance, in which case there is nothing to terminate.
    if: ${{ always() && needs.start-large-ec2-runner.outputs.ec2-instance-id != '' }}
    steps:
      - name: Configure AWS credentials
        uses: aws-actions/configure-aws-credentials@e3dd6a429d7300a6a4c196c26e071d42e0343502 # v4.0.2
        with:
          aws-access-key-id: ${{ secrets.AWS_ACCESS_KEY_ID }}
          aws-secret-access-key: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
          aws-region: ${{ secrets.AWS_REGION }}

      - name: Stop EC2 runner
        uses: machulav/ec2-github-runner@fcfb31a5760dad1314a64a0e172b78ec6fc8a17e # v2.3.6
        with:
          mode: stop
          github-token: ${{ secrets.GH_PERSONAL_ACCESS_TOKEN }}
          label: ${{ needs.start-large-ec2-runner.outputs.label }}
          ec2-instance-id: ${{ needs.start-large-ec2-runner.outputs.ec2-instance-id }}

README.md

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,11 +1,13 @@
11
# eval
22

33
![Lint](https://github.com/instructlab/eval/actions/workflows/lint.yml/badge.svg?branch=main)
4-
![`e2e-nvidia-a10g-x1.yaml` on `main`](https://github.com/instructlab/eval/actions/workflows/e2e-nvidia-a10g-x1.yml/badge.svg?branch=main)
54
![Build](https://github.com/instructlab/eval/actions/workflows/pypi.yaml/badge.svg?branch=main)
65
![Release](https://img.shields.io/github/v/release/instructlab/eval)
76
![License](https://img.shields.io/github/license/instructlab/eval)
87

8+
![`e2e-nvidia-a10g-x1.yml` on `main`](https://github.com/instructlab/eval/actions/workflows/e2e-nvidia-a10g-x1.yml/badge.svg?branch=main)
9+
![`e2e-nvidia-l40s-x4.yml` on `main`](https://github.com/instructlab/eval/actions/workflows/e2e-nvidia-l40s-x4.yml/badge.svg?branch=main)
10+
911
Python Library for Evaluation
1012

1113
## What is Evaluation?

0 commit comments

Comments
 (0)