# Source: wavekat-lab/.github/workflows/cv-sync.yml (wavekat/wavekat-lab, branch main)
# Manually dispatched workflow: the operator picks a dataset, a split, the
# upload concurrency and the runner, and the sync job below does the rest.
name: 'CV: Dataset Sync'

on:
  workflow_dispatch:
    inputs:
      # Identifier of the dataset to sync; the only free-form input.
      dataset_id:
        required: true
        description: >-
          Data Collective dataset ID
          (from URL on datacollective.mozillafoundation.org)

      # Which split to sync; "all" is the default.
      split:
        type: choice
        required: true
        default: 'all'
        description: "Dataset split (or 'all' to sync everything)"
        options:
          - all
          - validated
          - train
          - dev
          - test
          - invalidated
          - other

      # Passed to the sync script as --r2-concurrency. Values stay quoted so
      # they remain strings rather than YAML integers.
      r2_concurrency:
        type: choice
        required: false
        default: '32'
        description: 'Number of concurrent R2 uploads'
        options:
          - '8'
          - '16'
          - '32'
          - '64'
          - '128'
          - '256'

      # Label used verbatim as the sync job's runs-on value.
      runner:
        type: choice
        required: false
        default: 'cv-sync'
        description: 'GitHub Actions runner'
        options:
          - cv-sync
          - ubuntu-latest
          - ubuntu-latest-m
          - ubuntu-latest-l

      # When true, the sync script is invoked with --force.
      force:
        type: boolean
        required: false
        default: false
        description: 'Force re-sync even if already synced'

jobs:
  # Runs tools/cv-explorer/scripts/sync.ts for the requested dataset/split on
  # the runner selected at dispatch time.
  sync:
    runs-on: ${{ inputs.runner }}
    steps:
      - uses: actions/checkout@v6

      - uses: actions/setup-node@v4
        with:
          node-version: 22

      - name: Install dependencies
        run: npm ci
        working-directory: tools/cv-explorer/scripts

      # Dispatch inputs reach the script through INPUT_* environment variables
      # instead of being expanded into the script text by the expression
      # engine. A dataset_id containing quotes, `$(...)` or backticks is then
      # passed to sync.ts as plain data and cannot run as shell code.
      - name: Sync dataset
        run: |
          # INPUT_FORCE_FLAG is deliberately unquoted: it is either "--force"
          # or empty, and an empty value must expand to no argument at all.
          npx tsx sync.ts \
            --dataset-id "$INPUT_DATASET_ID" \
            --split "$INPUT_SPLIT" \
            --r2-concurrency "$INPUT_R2_CONCURRENCY" \
            $INPUT_FORCE_FLAG
        working-directory: tools/cv-explorer/scripts
        env:
          # Workflow inputs (untrusted text as far as the shell is concerned).
          INPUT_DATASET_ID: ${{ inputs.dataset_id }}
          INPUT_SPLIT: ${{ inputs.split }}
          INPUT_R2_CONCURRENCY: ${{ inputs.r2_concurrency }}
          INPUT_FORCE_FLAG: ${{ inputs.force == true && '--force' || '' }}
          # Credentials and resource identifiers made available to sync.ts.
          DATACOLLECTIVE_API_KEY: ${{ secrets.DATACOLLECTIVE_API_KEY }}
          CLOUDFLARE_ACCOUNT_ID: ${{ vars.CLOUDFLARE_ACCOUNT_ID }}
          CLOUDFLARE_API_TOKEN: ${{ secrets.CLOUDFLARE_API_TOKEN }}
          D1_DATABASE_ID: ${{ secrets.CV_EXPLORER_D1_ID }}
          R2_ACCESS_KEY_ID: ${{ secrets.R2_ACCESS_KEY_ID }}
          R2_SECRET_ACCESS_KEY: ${{ secrets.R2_SECRET_ACCESS_KEY }}
          R2_BUCKET_NAME: ${{ vars.CV_EXPLORER_R2_BUCKET }}

  # Deletes every Azure VM named "cv-sync-*" in the configured resource group,
  # plus its NSG / public IP / VNET, then writes a job summary.
  # NOTE(review): with only `needs: sync`, this job is skipped when sync fails.
  # When re-enabling, consider `if: always()` so the VM is also removed after a
  # failed run.
  cleanup:
    runs-on: ubuntu-latest
    needs: sync
    if: false # Disabled during debugging — re-enable when stable
    steps:
      - name: Azure Login
        uses: azure/login@v3
        with:
          creds: ${{ secrets.AZURE_CREDENTIALS }}

      - name: Delete runner VM and associated resources
        uses: azure/cli@v3
        with:
          inlineScript: |
            set -euo pipefail

            RG="${{ vars.AZURE_RESOURCE_GROUP }}"

            # Find VMs with the cv-sync prefix
            VMS=$(az vm list --resource-group "$RG" --query "[?starts_with(name, 'cv-sync-')].name" -o tsv)

            for VM in $VMS; do
              echo "Deleting VM: $VM"
              az vm delete \
                --resource-group "$RG" \
                --name "$VM" \
                --yes \
                --force-deletion true

              # Delete leftover networking resources (Azure auto-names these with suffixes)
              # Best-effort: errors are silenced so a resource that is already
              # gone does not abort the loop under `set -e`.
              echo "Deleting NSG: ${VM}NSG"
              az network nsg delete --resource-group "$RG" --name "${VM}NSG" 2>/dev/null || true
              echo "Deleting Public IP: ${VM}PublicIP"
              az network public-ip delete --resource-group "$RG" --name "${VM}PublicIP" 2>/dev/null || true
              echo "Deleting VNET: ${VM}VNET"
              az network vnet delete --resource-group "$RG" --name "${VM}VNET" 2>/dev/null || true
            done

            echo "Cleanup complete."

      # Inputs are read from environment variables: the heredoc below is
      # unquoted, so text expanded straight into the script could execute as
      # a command substitution. Shell variable values are not re-evaluated.
      - name: Job summary
        env:
          INPUT_DATASET_ID: ${{ inputs.dataset_id }}
          INPUT_SPLIT: ${{ inputs.split }}
        run: |
          cat >> "$GITHUB_STEP_SUMMARY" <<EOF
          ## Dataset Sync Complete

          | Setting | Value |
          |---------|-------|
          | **Dataset ID** | \`${INPUT_DATASET_ID}\` |
          | **Split** | \`${INPUT_SPLIT}\` |

          Runner VM has been cleaned up.
          EOF