-
Notifications
You must be signed in to change notification settings - Fork 2
132 lines (117 loc) · 3.95 KB
/
cv-sync.yml
File metadata and controls
132 lines (117 loc) · 3.95 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
# Manually dispatched workflow that runs tools/cv-explorer/scripts/sync.ts
# for one Data Collective dataset, on a runner chosen at dispatch time.
name: "CV: Dataset Sync"

on:
  workflow_dispatch:
    inputs:
      # Free-form value; forwarded to sync.ts as --dataset-id.
      dataset_id:
        description: "Data Collective dataset ID (from URL on datacollective.mozillafoundation.org)"
        required: true
        type: string
      # Forwarded to sync.ts as --split.
      split:
        description: "Dataset split (or 'all' to sync everything)"
        required: true
        default: "all"
        type: choice
        options:
          - all
          - validated
          - train
          - dev
          - test
          - invalidated
          - other
      # Forwarded to sync.ts as --r2-concurrency. Choice options are strings,
      # so the numbers stay quoted.
      r2_concurrency:
        description: "Number of concurrent R2 uploads"
        required: false
        default: "32"
        type: choice
        options:
          - "8"
          - "16"
          - "32"
          - "64"
          - "128"
          - "256"
      # Used verbatim as the sync job's `runs-on` label.
      runner:
        description: "GitHub Actions runner"
        required: false
        default: "cv-sync"
        type: choice
        options:
          - cv-sync
          - ubuntu-latest
          - ubuntu-latest-m
          - ubuntu-latest-l
      # When true, the sync step appends --force to the sync.ts command line.
      force:
        description: "Force re-sync even if already synced"
        required: false
        default: false
        type: boolean
jobs:
  # Installs the script dependencies and runs sync.ts with the dispatch inputs.
  sync:
    runs-on: ${{ inputs.runner }}
    # Least privilege: the steps below only check out the repository.
    # NOTE(review): assumes sync.ts does not push or call the GitHub API via
    # the checkout credentials — confirm before merging.
    permissions:
      contents: read
    steps:
      - uses: actions/checkout@v6

      - uses: actions/setup-node@v4
        with:
          node-version: 22

      - name: Install dependencies
        run: npm ci
        working-directory: tools/cv-explorer/scripts

      - name: Sync dataset
        # Dispatch inputs reach the script through environment variables
        # instead of being expanded into the shell text, so a crafted
        # dataset_id cannot inject commands.
        # FORCE_FLAG is either "--force" or empty; the ${VAR:+...} expansion
        # drops the argument entirely when it is empty.
        run: |
          npx tsx sync.ts \
            --dataset-id "$INPUT_DATASET_ID" \
            --split "$INPUT_SPLIT" \
            --r2-concurrency "$INPUT_R2_CONCURRENCY" \
            ${FORCE_FLAG:+"$FORCE_FLAG"}
        working-directory: tools/cv-explorer/scripts
        env:
          INPUT_DATASET_ID: ${{ inputs.dataset_id }}
          INPUT_SPLIT: ${{ inputs.split }}
          INPUT_R2_CONCURRENCY: ${{ inputs.r2_concurrency }}
          FORCE_FLAG: ${{ inputs.force == true && '--force' || '' }}
          DATACOLLECTIVE_API_KEY: ${{ secrets.DATACOLLECTIVE_API_KEY }}
          CLOUDFLARE_ACCOUNT_ID: ${{ vars.CLOUDFLARE_ACCOUNT_ID }}
          CLOUDFLARE_API_TOKEN: ${{ secrets.CLOUDFLARE_API_TOKEN }}
          D1_DATABASE_ID: ${{ secrets.CV_EXPLORER_D1_ID }}
          R2_ACCESS_KEY_ID: ${{ secrets.R2_ACCESS_KEY_ID }}
          R2_SECRET_ACCESS_KEY: ${{ secrets.R2_SECRET_ACCESS_KEY }}
          R2_BUCKET_NAME: ${{ vars.CV_EXPLORER_R2_BUCKET }}

  # Deletes every VM named cv-sync-* in the configured resource group, plus
  # the NSG / public IP / VNET named after each VM, then writes a run summary.
  # NOTE(review): with plain `needs: sync` this job is skipped when sync
  # fails; when re-enabling, consider whether the VM should also be removed
  # after a failed sync (e.g. `if: always()`).
  cleanup:
    runs-on: ubuntu-latest
    needs: sync
    if: false  # Disabled during debugging — re-enable when stable
    steps:
      - name: Azure Login
        uses: azure/login@v3
        with:
          creds: ${{ secrets.AZURE_CREDENTIALS }}

      - name: Delete runner VM and associated resources
        uses: azure/cli@v3
        with:
          inlineScript: |
            set -euo pipefail
            RG="${{ vars.AZURE_RESOURCE_GROUP }}"
            # Find VMs with the cv-sync prefix
            VMS=$(az vm list --resource-group "$RG" --query "[?starts_with(name, 'cv-sync-')].name" -o tsv)
            for VM in $VMS; do
              echo "Deleting VM: $VM"
              az vm delete \
                --resource-group "$RG" \
                --name "$VM" \
                --yes \
                --force-deletion true
              # Delete leftover networking resources (Azure auto-names these with suffixes)
              echo "Deleting NSG: ${VM}NSG"
              az network nsg delete --resource-group "$RG" --name "${VM}NSG" 2>/dev/null || true
              echo "Deleting Public IP: ${VM}PublicIP"
              az network public-ip delete --resource-group "$RG" --name "${VM}PublicIP" 2>/dev/null || true
              echo "Deleting VNET: ${VM}VNET"
              az network vnet delete --resource-group "$RG" --name "${VM}VNET" 2>/dev/null || true
            done
            echo "Cleanup complete."

      - name: Job summary
        # Inputs are read from the environment: variable values expanded in an
        # unquoted heredoc are inserted as text and are not re-evaluated by
        # the shell. The blank lines keep the closing sentence out of the
        # Markdown table.
        run: |
          cat >> "$GITHUB_STEP_SUMMARY" <<EOF
          ## Dataset Sync Complete

          | Setting | Value |
          |---------|-------|
          | **Dataset ID** | \`${INPUT_DATASET_ID}\` |
          | **Split** | \`${INPUT_SPLIT}\` |

          Runner VM has been cleaned up.
          EOF
        env:
          INPUT_DATASET_ID: ${{ inputs.dataset_id }}
          INPUT_SPLIT: ${{ inputs.split }}