From c0c29efdd906ee5bdd4b4154a3750ce91f818eef Mon Sep 17 00:00:00 2001
From: Trevor Bedford <trevor@bedford.io>
Date: Sat, 9 Mar 2024 15:22:26 -0800
Subject: [PATCH 1/3] Include profile for 21L open builds

This commit completely copies the logic in nextstrain_profiles/nextstrain-gisaid-21L/ to nextstrain_profiles/nextstrain-open-21L/. The specific gisaid-vs-open differences were derived by diffing the existing nextstrain-gisaid profile against the existing nextstrain-open profile.

In addition, this commit:
1. Fixes reference traits and frequencies for nextstrain-gisaid-21L
2. Switches from Nextstrain_clade, eg "21H (Mu)", to clade_nextstrain, eg "21H" during prefilter.
---
 .../nextstrain-gisaid-21L/builds.yaml         |    7 +
 .../nextstrain-gisaid-21L/exclude-clades.tsv  |   32 +-
 .../nextstrain-gisaid-21L/prefilter.smk       |    2 +-
 .../nextstrain-open-21L/builds.yaml           | 1100 +++++++++++++++++
 .../nextstrain-open-21L/config.yaml           |   12 +
 .../nextstrain-open-21L/exclude-clades.tsv    |   25 +
 .../nextstrain-open-21L/include.txt           |    1 +
 .../nextstrain_description.md                 |   39 +
 .../nextstrain-open-21L/prefilter.smk         |  103 ++
 9 files changed, 1304 insertions(+), 17 deletions(-)
 create mode 100644 nextstrain_profiles/nextstrain-open-21L/builds.yaml
 create mode 100644 nextstrain_profiles/nextstrain-open-21L/config.yaml
 create mode 100644 nextstrain_profiles/nextstrain-open-21L/exclude-clades.tsv
 create mode 100644 nextstrain_profiles/nextstrain-open-21L/include.txt
 create mode 100644 nextstrain_profiles/nextstrain-open-21L/nextstrain_description.md
 create mode 100644 nextstrain_profiles/nextstrain-open-21L/prefilter.smk

diff --git a/nextstrain_profiles/nextstrain-gisaid-21L/builds.yaml b/nextstrain_profiles/nextstrain-gisaid-21L/builds.yaml
index 24519a426..2ebdc5372 100644
--- a/nextstrain_profiles/nextstrain-gisaid-21L/builds.yaml
+++ b/nextstrain_profiles/nextstrain-gisaid-21L/builds.yaml
@@ -889,6 +889,9 @@ refine:
 # if different traits should be reconstructed for some builds, specify here
 # otherwise the default trait config in defaults/parameters.yaml will used
 traits:
+  reference:
+    sampling_bias_correction: 2.5
+    columns: ["region"]
   global_1m:
     sampling_bias_correction: 2.5
     columns: ["region"]
@@ -979,6 +982,10 @@ traits:
 # narrow_bandwidth = 0.019 or 7 days for "1m" and "2m"
 # narrow_bandwidth = 0.038 or 14 days for "6m" and "all-time"
 frequencies:
+  reference:
+    min_date: "2022-01-01"
+    narrow_bandwidth: 0.038
+    recent_days_to_censor: 7
   global_1m:
     min_date: "1M"
     narrow_bandwidth: 0.019
diff --git a/nextstrain_profiles/nextstrain-gisaid-21L/exclude-clades.tsv b/nextstrain_profiles/nextstrain-gisaid-21L/exclude-clades.tsv
index 6306b6449..58126031f 100644
--- a/nextstrain_profiles/nextstrain-gisaid-21L/exclude-clades.tsv
+++ b/nextstrain_profiles/nextstrain-gisaid-21L/exclude-clades.tsv
@@ -5,21 +5,21 @@ clade
 20B
 20C
 20D
-20E (EU1)
+20E
 20F
 20G
-20H (Beta, V2)
-20I (Alpha, V1)
-20J (Gamma, V3)
-21A (Delta)
-21B (Kappa)
-21C (Epsilon)
-21D (Eta)
-21E (Theta)
-21F (Iota)
-21G (Lambda)
-21H (Mu)
-21I (Delta)
-21J (Delta)
-21K (Omicron)
-21M (Omicron)
+20H
+20I
+20J
+21A
+21B
+21C
+21D
+21E
+21F
+21G
+21H
+21I
+21J
+21K
+21M
diff --git a/nextstrain_profiles/nextstrain-gisaid-21L/prefilter.smk b/nextstrain_profiles/nextstrain-gisaid-21L/prefilter.smk
index 23a38ee8d..5422e6ec6 100644
--- a/nextstrain_profiles/nextstrain-gisaid-21L/prefilter.smk
+++ b/nextstrain_profiles/nextstrain-gisaid-21L/prefilter.smk
@@ -49,7 +49,7 @@ rule gisaid_21L_metadata:
             --exclude \
             --filter-file {input.exclude_clades:q} \
             --key-fields clade \
-            --data-fields Nextstrain_clade \
+            --data-fields clade_nextstrain \
         | sed 1d \
         | zstd -T$(({threads} - 2)) \
         >> {output.metadata:q}
diff --git a/nextstrain_profiles/nextstrain-open-21L/builds.yaml b/nextstrain_profiles/nextstrain-open-21L/builds.yaml
new file mode 100644
index 000000000..60768b519
--- /dev/null
+++ b/nextstrain_profiles/nextstrain-open-21L/builds.yaml
@@ -0,0 +1,1100 @@
+auspice_json_prefix: ncov_open_21L
+
+# Define custom rules for pre- or post-standard workflow processing of data.
+custom_rules:
+  - workflow/snakemake_rules/export_for_nextstrain.smk
+  - nextstrain_profiles/nextstrain-open-21L/prefilter.smk
+
+# These parameters are only used by the `export_for_nextstrain` rule and shouldn't need to be modified.
+# To modify the s3 _source_ bucket, specify this directly in the `inputs` section of the config.
+# P.S. These are intentionally set as top-level keys as this allows command-line overrides.
+S3_DST_BUCKET: "nextstrain-data/files/ncov/open"
+S3_DST_COMPRESSION: "xz"
+S3_DST_ORIGINS: ["open"]
+
+# Deploy and Slack options are related to Nextstrain live builds and don't need to be modified for local builds
+deploy_url: s3://nextstrain-data
+slack_token: ~
+slack_channel: "#ncov-genbank-updates"
+
+genes: ["ORF1a", "ORF1b", "S", "ORF3a", "E", "M", "ORF6", "ORF7a", "ORF7b", "ORF8", "N", "ORF9b"]
+use_nextalign: true
+include_hcov19_prefix: false
+
+files:
+  # This file is produced by a custom clades_21L rule in our prefiltering rules
+  # for this build.
+  clades: "results/clades_21L.tsv"
+  include: "nextstrain_profiles/nextstrain-open-21L/include.txt"
+  description: "nextstrain_profiles/nextstrain-open-21L/nextstrain_description.md"
+
+inputs:
+  - name: open
+    # These two files are produced by our custom open_21L prefiltering rules
+    # for this build.
+    metadata: "results/open_21L_metadata.tsv.zst"
+    aligned: "results/open_21L_aligned.fasta.zst"
+    skip_sanitize_metadata: true
+
+# Define locations for which builds should be created.
+# For each build we specify a subsampling scheme via an explicit key.
+# These subsampling schemes are defined at the bottom of this file.
+# (They override the defaults)
+# North America, Oceania and Asia (with separate China and India buckets) are subsampled at the "division" level
+# Africa, Europe and South America are subsampled at the "country" level
+#
+# Auspice config is specified in rule auspice_config in export_for_nextstrain.smk
+builds:
+  reference:
+    subsampling_scheme: nextstrain_reference
+    title: Evolution SARS-CoV-2 relative to clade 21L reference virus with clade-focused subsampling
+  global_1m:
+    subsampling_scheme: nextstrain_global_1m
+    title: Evolution SARS-CoV-2 relative to clade 21L reference virus with subsampling focused globally over the past month
+  global_2m:
+    subsampling_scheme: nextstrain_global_2m
+    title: Evolution SARS-CoV-2 relative to clade 21L reference virus with subsampling focused globally over the past 2 months
+  global_6m:
+    subsampling_scheme: nextstrain_global_6m
+    title: Evolution SARS-CoV-2 relative to clade 21L reference virus with subsampling focused globally over the past 6 months
+  global_all-time:
+    subsampling_scheme: nextstrain_global_all_time
+    title: Evolution SARS-CoV-2 relative to clade 21L reference virus with subsampling focused globally since pandemic start
+  africa_1m:
+    subsampling_scheme: nextstrain_region_grouped_by_country_1m
+    region: Africa
+    title: Evolution SARS-CoV-2 relative to clade 21L reference virus with subsampling focused on Africa over the past month
+  africa_2m:
+    subsampling_scheme: nextstrain_region_grouped_by_country_2m
+    region: Africa
+    title: Evolution SARS-CoV-2 relative to clade 21L reference virus with subsampling focused on Africa over the past 2 months
+  africa_6m:
+    subsampling_scheme: nextstrain_region_grouped_by_country_6m
+    region: Africa
+    title: Evolution SARS-CoV-2 relative to clade 21L reference virus with subsampling focused on Africa over the past 6 months
+  africa_all-time:
+    subsampling_scheme: nextstrain_region_grouped_by_country_all_time
+    region: Africa
+    title: Evolution SARS-CoV-2 relative to clade 21L reference virus with subsampling focused on Africa since pandemic start
+  asia_1m:
+    subsampling_scheme: nextstrain_region_asia_grouped_by_division_1m
+    region: Asia
+    title: Evolution SARS-CoV-2 relative to clade 21L reference virus with subsampling focused on Asia over the past month
+  asia_2m:
+    subsampling_scheme: nextstrain_region_asia_grouped_by_division_2m
+    region: Asia
+    title: Evolution SARS-CoV-2 relative to clade 21L reference virus with subsampling focused on Asia over the past 2 months
+  asia_6m:
+    subsampling_scheme: nextstrain_region_asia_grouped_by_division_6m
+    region: Asia
+    title: Evolution SARS-CoV-2 relative to clade 21L reference virus with subsampling focused on Asia over the past 6 months
+  asia_all-time:
+    subsampling_scheme: nextstrain_region_asia_grouped_by_division_all_time
+    region: Asia
+    title: Evolution SARS-CoV-2 relative to clade 21L reference virus with subsampling focused on Asia since pandemic start
+  europe_1m:
+    subsampling_scheme: nextstrain_region_grouped_by_country_1m
+    region: Europe
+    title: Evolution SARS-CoV-2 relative to clade 21L reference virus with subsampling focused on Europe over the past month
+  europe_2m:
+    subsampling_scheme: nextstrain_region_grouped_by_country_2m
+    region: Europe
+    title: Evolution SARS-CoV-2 relative to clade 21L reference virus with subsampling focused on Europe over the past 2 months
+  europe_6m:
+    subsampling_scheme: nextstrain_region_grouped_by_country_6m
+    region: Europe
+    title: Evolution SARS-CoV-2 relative to clade 21L reference virus with subsampling focused on Europe over the past 6 months
+  europe_all-time:
+    subsampling_scheme: nextstrain_region_grouped_by_country_all_time
+    region: Europe
+    title: Evolution SARS-CoV-2 relative to clade 21L reference virus with subsampling focused on Europe since pandemic start
+  north-america_1m:
+    subsampling_scheme: nextstrain_region_grouped_by_division_1m
+    region: North America
+    title: Evolution SARS-CoV-2 relative to clade 21L reference virus with subsampling focused on North America over the past month
+  north-america_2m:
+    subsampling_scheme: nextstrain_region_grouped_by_division_2m
+    region: North America
+    title: Evolution SARS-CoV-2 relative to clade 21L reference virus with subsampling focused on North America over the past 2 months
+  north-america_6m:
+    subsampling_scheme: nextstrain_region_grouped_by_division_6m
+    region: North America
+    title: Evolution SARS-CoV-2 relative to clade 21L reference virus with subsampling focused on North America over the past 6 months
+  north-america_all-time:
+    subsampling_scheme: nextstrain_region_grouped_by_division_all_time
+    region: North America
+    title: Evolution SARS-CoV-2 relative to clade 21L reference virus with subsampling focused on North America since pandemic start
+  oceania_1m:
+    subsampling_scheme: nextstrain_region_grouped_by_division_1m
+    region: Oceania
+    title: Evolution SARS-CoV-2 relative to clade 21L reference virus with subsampling focused on Oceania over the past month
+  oceania_2m:
+    subsampling_scheme: nextstrain_region_grouped_by_division_2m
+    region: Oceania
+    title: Evolution SARS-CoV-2 relative to clade 21L reference virus with subsampling focused on Oceania over the past 2 months
+  oceania_6m:
+    subsampling_scheme: nextstrain_region_grouped_by_division_6m
+    region: Oceania
+    title: Evolution SARS-CoV-2 relative to clade 21L reference virus with subsampling focused on Oceania over the past 6 months
+  oceania_all-time:
+    subsampling_scheme: nextstrain_region_grouped_by_division_all_time
+    region: Oceania
+    title: Evolution SARS-CoV-2 relative to clade 21L reference virus with subsampling focused on Oceania since pandemic start
+  south-america_1m:
+    subsampling_scheme: nextstrain_region_grouped_by_country_1m
+    region: South America
+    title: Evolution SARS-CoV-2 relative to clade 21L reference virus with subsampling focused on South America over the past month
+  south-america_2m:
+    subsampling_scheme: nextstrain_region_grouped_by_country_2m
+    region: South America
+    title: Evolution SARS-CoV-2 relative to clade 21L reference virus with subsampling focused on South America over the past 2 months
+  south-america_6m:
+    subsampling_scheme: nextstrain_region_grouped_by_country_6m
+    region: South America
+    title: Evolution SARS-CoV-2 relative to clade 21L reference virus with subsampling focused on South America over the past 6 months
+  south-america_all-time:
+    subsampling_scheme: nextstrain_region_grouped_by_country_all_time
+    region: South America
+    title: Evolution SARS-CoV-2 relative to clade 21L reference virus with subsampling focused on South America since pandemic start
+
+# remove sequences without division label in US and sequences from prior to clade 21L
+filter:
+  exclude_where: "division='USA'"
+  min_date: "2022-01-01"
+
+subsampling:
+
+  # Custom subsampling logic for group by clade
+  nextstrain_reference:
+    clades:
+      group_by: "Nextstrain_clade"
+      max_sequences: 300
+
+  # Custom subsampling logic for regions over 1m
+  # Grouping by division for North America and Oceania
+  # 4000 total
+  # 4:1 ratio of recent to early
+  # 4:1 ratio of focal to context
+  nextstrain_region_grouped_by_division_1m:
+    # Early focal samples for region
+    focal_early:
+      group_by: "division year month"
+      max_sequences: 640
+      max_date: "--max-date 1M"
+      exclude: "--exclude-where 'region!={region}'"
+    # Early contextual samples from the rest of the world
+    context_early:
+      group_by: "country year month"
+      max_sequences: 160
+      max_date: "--max-date 1M"
+      exclude: "--exclude-where 'region={region}'"
+    # Recent focal samples for region
+    focal_recent:
+      group_by: "division week"
+      max_sequences: 2560
+      min_date: "--min-date 1M"
+      exclude: "--exclude-where 'region!={region}'"
+    # Recent contextual samples from the rest of the world
+    context_recent:
+      group_by: "country week"
+      max_sequences: 640
+      min_date: "--min-date 1M"
+      exclude: "--exclude-where 'region={region}'"
+
+  # Custom subsampling logic for regions over 2m
+  # Grouping by division for North America and Oceania
+  # 4000 total
+  # 4:1 ratio of recent to early
+  # 4:1 ratio of focal to context
+  nextstrain_region_grouped_by_division_2m:
+    # Early focal samples for region
+    focal_early:
+      group_by: "division year month"
+      max_sequences: 640
+      max_date: "--max-date 2M"
+      exclude: "--exclude-where 'region!={region}'"
+    # Early contextual samples from the rest of the world
+    context_early:
+      group_by: "country year month"
+      max_sequences: 160
+      max_date: "--max-date 2M"
+      exclude: "--exclude-where 'region={region}'"
+    # Recent focal samples for region
+    focal_recent:
+      group_by: "division week"
+      max_sequences: 2560
+      min_date: "--min-date 2M"
+      exclude: "--exclude-where 'region!={region}'"
+    # Recent contextual samples from the rest of the world
+    context_recent:
+      group_by: "country week"
+      max_sequences: 640
+      min_date: "--min-date 2M"
+      exclude: "--exclude-where 'region={region}'"
+
+  # Custom subsampling logic for regions over 6m
+  # Grouping by division for North America and Oceania
+  # 4000 total
+  # 4:1 ratio of recent to early
+  # 4:1 ratio of focal to context
+  nextstrain_region_grouped_by_division_6m:
+    # Early focal samples for region
+    focal_early:
+      group_by: "division year month"
+      max_sequences: 640
+      max_date: "--max-date 6M"
+      exclude: "--exclude-where 'region!={region}'"
+    # Early contextual samples from the rest of the world
+    context_early:
+      group_by: "country year month"
+      max_sequences: 160
+      max_date: "--max-date 6M"
+      exclude: "--exclude-where 'region={region}'"
+    # Recent focal samples for region
+    focal_recent:
+      group_by: "division year month"
+      max_sequences: 2560
+      min_date: "--min-date 6M"
+      exclude: "--exclude-where 'region!={region}'"
+    # Recent contextual samples from the rest of the world
+    context_recent:
+      group_by: "country year month"
+      max_sequences: 640
+      min_date: "--min-date 6M"
+      exclude: "--exclude-where 'region={region}'"
+
+  # Custom subsampling logic for regions over all-time
+  # Grouping by division for North America and Oceania
+  # 4000 total
+  # 4:1 ratio of focal to context
+  nextstrain_region_grouped_by_division_all_time:
+    # Focal samples for region
+    focal:
+      group_by: "division year month"
+      max_sequences: 3200
+      exclude: "--exclude-where 'region!={region}'"
+    # Contextual samples from the rest of the world
+    context:
+      group_by: "country year month"
+      max_sequences: 800
+      exclude: "--exclude-where 'region={region}'"
+
+  # Custom subsampling logic for region Asia over 1m
+  # Grouping by division
+  # Separating three buckets for China, India and elsewhere
+  # 4375 total
+  # 4:1 ratio of recent to early
+  # 4:1 ratio of focal to context
+  # 3:2:2 proportions of Asia, China, India
+  nextstrain_region_asia_grouped_by_division_1m:
+    # Early focal samples for Asia
+    asia_early:
+      group_by: "division year month"
+      max_sequences: 300
+      max_date: "--max-date 1M"
+      exclude: "--exclude-where 'region!=Asia' 'country=China' 'country=India'"
+    # Early focal samples for China
+    china_early:
+      group_by: "division year month"
+      max_sequences: 200
+      max_date: "--max-date 1M"
+      exclude: "--exclude-where 'country!=China'"
+    # Early focal samples for India
+    india_early:
+      group_by: "division year month"
+      max_sequences: 200
+      max_date: "--max-date 1M"
+      exclude: "--exclude-where 'country!=India'"
+    # Early contextual samples from the rest of the world
+    context_early:
+      group_by: "country year month"
+      max_sequences: 175
+      max_date: "--max-date 1M"
+      exclude: "--exclude-where 'region=Asia'"
+    # Recent focal samples for Asia
+    asia_recent:
+      group_by: "division year month"
+      max_sequences: 1200
+      min_date: "--min-date 1M"
+      exclude: "--exclude-where 'region!=Asia' 'country=China' 'country=India'"
+    # Recent focal samples for China (lower date bound, so keyed as min_date like asia_recent)
+    china_recent:
+      group_by: "division year month"
+      max_sequences: 800
+      min_date: "--min-date 1M"
+      exclude: "--exclude-where 'country!=China'"
+    # Recent focal samples for India (lower date bound, so keyed as min_date like asia_recent)
+    india_recent:
+      group_by: "division year month"
+      max_sequences: 800
+      min_date: "--min-date 1M"
+      exclude: "--exclude-where 'country!=India'"
+    # Recent contextual samples from the rest of the world
+    context_recent:
+      group_by: "country year month"
+      max_sequences: 700
+      min_date: "--min-date 1M"
+      exclude: "--exclude-where 'region=Asia'"
+
+  # Custom subsampling logic for region Asia over 2m
+  # Grouping by division
+  # Separating three buckets for China, India and elsewhere
+  # 4375 total
+  # 4:1 ratio of recent to early
+  # 4:1 ratio of focal to context
+  # 3:2:2 proportions of Asia, China, India
+  nextstrain_region_asia_grouped_by_division_2m:
+    # Early focal samples for Asia
+    asia_early:
+      group_by: "division year month"
+      max_sequences: 300
+      max_date: "--max-date 2M"
+      exclude: "--exclude-where 'region!=Asia' 'country=China' 'country=India'"
+    # Early focal samples for China
+    china_early:
+      group_by: "division year month"
+      max_sequences: 200
+      max_date: "--max-date 2M"
+      exclude: "--exclude-where 'country!=China'"
+    # Early focal samples for India
+    india_early:
+      group_by: "division year month"
+      max_sequences: 200
+      max_date: "--max-date 2M"
+      exclude: "--exclude-where 'country!=India'"
+    # Early contextual samples from the rest of the world
+    context_early:
+      group_by: "country year month"
+      max_sequences: 175
+      max_date: "--max-date 2M"
+      exclude: "--exclude-where 'region=Asia'"
+    # Recent focal samples for Asia
+    asia_recent:
+      group_by: "division year month"
+      max_sequences: 1200
+      min_date: "--min-date 2M"
+      exclude: "--exclude-where 'region!=Asia' 'country=China' 'country=India'"
+    # Recent focal samples for China (lower date bound, so keyed as min_date like asia_recent)
+    china_recent:
+      group_by: "division year month"
+      max_sequences: 800
+      min_date: "--min-date 2M"
+      exclude: "--exclude-where 'country!=China'"
+    # Recent focal samples for India (lower date bound, so keyed as min_date like asia_recent)
+    india_recent:
+      group_by: "division year month"
+      max_sequences: 800
+      min_date: "--min-date 2M"
+      exclude: "--exclude-where 'country!=India'"
+    # Recent contextual samples from the rest of the world
+    context_recent:
+      group_by: "country year month"
+      max_sequences: 700
+      min_date: "--min-date 2M"
+      exclude: "--exclude-where 'region=Asia'"
+
+  # Custom subsampling logic for region Asia over 6m
+  # Grouping by division
+  # Separating three buckets for China, India and elsewhere
+  # 4375 total
+  # 4:1 ratio of recent to early
+  # 4:1 ratio of focal to context
+  # 3:2:2 proportions of Asia, China, India
+  nextstrain_region_asia_grouped_by_division_6m:
+    # Early focal samples for Asia
+    asia_early:
+      group_by: "division year month"
+      max_sequences: 300
+      max_date: "--max-date 6M"
+      exclude: "--exclude-where 'region!=Asia' 'country=China' 'country=India'"
+    # Early focal samples for China
+    china_early:
+      group_by: "division year month"
+      max_sequences: 200
+      max_date: "--max-date 6M"
+      exclude: "--exclude-where 'country!=China'"
+    # Early focal samples for India
+    india_early:
+      group_by: "division year month"
+      max_sequences: 200
+      max_date: "--max-date 6M"
+      exclude: "--exclude-where 'country!=India'"
+    # Early contextual samples from the rest of the world
+    context_early:
+      group_by: "country year month"
+      max_sequences: 175
+      max_date: "--max-date 6M"
+      exclude: "--exclude-where 'region=Asia'"
+    # Recent focal samples for Asia
+    asia_recent:
+      group_by: "division year month"
+      max_sequences: 1200
+      min_date: "--min-date 6M"
+      exclude: "--exclude-where 'region!=Asia' 'country=China' 'country=India'"
+    # Recent focal samples for China (lower date bound, so keyed as min_date like asia_recent)
+    china_recent:
+      group_by: "division year month"
+      max_sequences: 800
+      min_date: "--min-date 6M"
+      exclude: "--exclude-where 'country!=China'"
+    # Recent focal samples for India (lower date bound, so keyed as min_date like asia_recent)
+    india_recent:
+      group_by: "division year month"
+      max_sequences: 800
+      min_date: "--min-date 6M"
+      exclude: "--exclude-where 'country!=India'"
+    # Recent contextual samples from the rest of the world
+    context_recent:
+      group_by: "country year month"
+      max_sequences: 700
+      min_date: "--min-date 6M"
+      exclude: "--exclude-where 'region=Asia'"
+
+  # Custom subsampling logic for region Asia over all-time
+  # Grouping by division
+  # Separating three buckets for China, India and elsewhere
+  # 4375 total
+  # 4:1 ratio of focal to context
+  # 3:2:2 proportions of Asia, China, India
+  nextstrain_region_asia_grouped_by_division_all_time:
+    # Focal samples for Asia
+    asia:
+      group_by: "division year month"
+      max_sequences: 1500
+      exclude: "--exclude-where 'region!=Asia' 'country=China' 'country=India'"
+    # Focal samples for China
+    china:
+      group_by: "division year month"
+      max_sequences: 1000
+      exclude: "--exclude-where 'country!=China'"
+    # Focal samples for India
+    india:
+      group_by: "division year month"
+      max_sequences: 1000
+      exclude: "--exclude-where 'country!=India'"
+    # Contextual samples from the rest of the world (everything outside Asia)
+    context:
+      group_by: "country year month"
+      max_sequences: 875
+      exclude: "--exclude-where 'region=Asia'"
+
+  # Custom subsampling logic for regions over 1m
+  # Grouping by country for Africa, Europe and South America
+  # 4000 total
+  # 4:1 ratio of recent to early
+  # 4:1 ratio of focal to context
+  nextstrain_region_grouped_by_country_1m:
+    # Early focal samples for region
+    focal_early:
+      group_by: "country year month"
+      max_sequences: 640
+      max_date: "--max-date 1M"
+      exclude: "--exclude-where 'region!={region}'"
+    # Early contextual samples from the rest of the world
+    context_early:
+      group_by: "country year month"
+      max_sequences: 160
+      max_date: "--max-date 1M"
+      exclude: "--exclude-where 'region={region}'"
+    # Recent focal samples for region
+    focal_recent:
+      group_by: "country week"
+      max_sequences: 2560
+      min_date: "--min-date 1M"
+      exclude: "--exclude-where 'region!={region}'"
+    # Recent contextual samples from the rest of the world
+    context_recent:
+      group_by: "country week"
+      max_sequences: 640
+      min_date: "--min-date 1M"
+      exclude: "--exclude-where 'region={region}'"
+
+  # Custom subsampling logic for regions over 2m
+  # Grouping by country for Africa, Europe and South America
+  # 4000 total
+  # 4:1 ratio of recent to early
+  # 4:1 ratio of focal to context
+  nextstrain_region_grouped_by_country_2m:
+    # Early focal samples for region
+    focal_early:
+      group_by: "country year month"
+      max_sequences: 640
+      max_date: "--max-date 2M"
+      exclude: "--exclude-where 'region!={region}'"
+    # Early contextual samples from the rest of the world
+    context_early:
+      group_by: "country year month"
+      max_sequences: 160
+      max_date: "--max-date 2M"
+      exclude: "--exclude-where 'region={region}'"
+    # Recent focal samples for region
+    focal_recent:
+      group_by: "country week"
+      max_sequences: 2560
+      min_date: "--min-date 2M"
+      exclude: "--exclude-where 'region!={region}'"
+    # Recent contextual samples from the rest of the world
+    context_recent:
+      group_by: "country week"
+      max_sequences: 640
+      min_date: "--min-date 2M"
+      exclude: "--exclude-where 'region={region}'"
+
+  # Custom subsampling logic for regions over 6m
+  # Grouping by country for Africa, Europe and South America
+  # 4000 total
+  # 4:1 ratio of recent to early
+  # 4:1 ratio of focal to context
+  nextstrain_region_grouped_by_country_6m:
+    # Early focal samples for region
+    focal_early:
+      group_by: "country year month"
+      max_sequences: 640
+      max_date: "--max-date 6M"
+      exclude: "--exclude-where 'region!={region}'"
+    # Early contextual samples from the rest of the world
+    context_early:
+      group_by: "country year month"
+      max_sequences: 160
+      max_date: "--max-date 6M"
+      exclude: "--exclude-where 'region={region}'"
+    # Recent focal samples for region
+    focal_recent:
+      group_by: "country year month"
+      max_sequences: 2560
+      min_date: "--min-date 6M"
+      exclude: "--exclude-where 'region!={region}'"
+    # Recent contextual samples from the rest of the world
+    context_recent:
+      group_by: "country year month"
+      max_sequences: 640
+      min_date: "--min-date 6M"
+      exclude: "--exclude-where 'region={region}'"
+
+  # Custom subsampling logic for regions over all-time
+  # Grouping by country for Africa, Europe and South America
+  # 4000 total
+  # 4:1 ratio of focal to context
+  nextstrain_region_grouped_by_country_all_time:
+    # Focal samples for region
+    focal:
+      group_by: "country year month"
+      max_sequences: 3200
+      exclude: "--exclude-where 'region!={region}'"
+    # Contextual samples from the rest of the world
+    context:
+      group_by: "country year month"
+      max_sequences: 800
+      exclude: "--exclude-where 'region={region}'"
+
+  # Custom subsampling logic for global region over 1m
+  # 5150 total (expect ~3400)
+  # 4:1 ratio of recent to early
+  # all eight regions equal except Oceania at 20%
+  nextstrain_global_1m:
+    africa_early:
+      group_by: "country year month"
+      max_sequences: 150
+      max_date: "--max-date 1M"
+      exclude: "--exclude-where 'region!=Africa'"
+    asia_early:
+      group_by: "country year month"
+      max_sequences: 200
+      max_date: "--max-date 1M"
+      exclude: "--exclude-where 'region!=Asia' 'country=China' 'country=India'"
+    china_early:
+      group_by: "division year month"
+      max_sequences: 175
+      max_date: "--max-date 1M"
+      exclude: "--exclude-where 'country!=China'"
+    europe_early:
+      group_by: "country year month"
+      max_sequences: 125
+      max_date: "--max-date 1M"
+      exclude: "--exclude-where 'region!=Europe'"
+    india_early:
+      group_by: "division year month"
+      max_sequences: 175
+      max_date: "--max-date 1M"
+      exclude: "--exclude-where 'country!=India'"
+    north_america_early:
+      group_by: "division year month"
+      max_sequences: 100
+      max_date: "--max-date 1M"
+      exclude: "--exclude-where 'region!=North America'"
+    south_america_early:
+      group_by: "country year month"
+      max_sequences: 90
+      max_date: "--max-date 1M"
+      exclude: "--exclude-where 'region!=South America'"
+    oceania_early:
+      group_by: "division year month"
+      max_sequences: 15
+      max_date: "--max-date 1M"
+      exclude: "--exclude-where 'region!=Oceania'"
+    africa_recent:
+      group_by: "country week"
+      max_sequences: 600
+      min_date: "--min-date 1M"
+      exclude: "--exclude-where 'region!=Africa'"
+    asia_recent:
+      group_by: "country week"
+      max_sequences: 800
+      min_date: "--min-date 1M"
+      exclude: "--exclude-where 'region!=Asia' 'country=China' 'country=India'"
+    china_recent:
+      group_by: "division week"
+      max_sequences: 700
+      min_date: "--min-date 1M"
+      exclude: "--exclude-where 'country!=China'"
+    europe_recent:
+      group_by: "country week"
+      max_sequences: 500
+      min_date: "--min-date 1M"
+      exclude: "--exclude-where 'region!=Europe'"
+    india_recent:
+      group_by: "division week"
+      max_sequences: 700
+      min_date: "--min-date 1M"
+      exclude: "--exclude-where 'country!=India'"
+    north_america_recent:
+      group_by: "division week"
+      max_sequences: 400
+      min_date: "--min-date 1M"
+      exclude: "--exclude-where 'region!=North America'"
+    south_america_recent:
+      group_by: "country week"
+      max_sequences: 360
+      min_date: "--min-date 1M"
+      exclude: "--exclude-where 'region!=South America'"
+    oceania_recent:
+      group_by: "division week"
+      max_sequences: 60
+      min_date: "--min-date 1M"
+      exclude: "--exclude-where 'region!=Oceania'"
+
+  # Custom subsampling logic for global region over 2m
+  # 5150 total (expect ~3400)
+  # 4:1 ratio of recent to early
+  # all eight regions equal except Oceania at 20%
+  nextstrain_global_2m:
+    africa_early:
+      group_by: "country year month"
+      max_sequences: 150
+      max_date: "--max-date 2M"
+      exclude: "--exclude-where 'region!=Africa'"
+    asia_early:
+      group_by: "country year month"
+      max_sequences: 200
+      max_date: "--max-date 2M"
+      exclude: "--exclude-where 'region!=Asia' 'country=China' 'country=India'"
+    china_early:
+      group_by: "division year month"
+      max_sequences: 175
+      max_date: "--max-date 2M"
+      exclude: "--exclude-where 'country!=China'"
+    europe_early:
+      group_by: "country year month"
+      max_sequences: 125
+      max_date: "--max-date 2M"
+      exclude: "--exclude-where 'region!=Europe'"
+    india_early:
+      group_by: "division year month"
+      max_sequences: 175
+      max_date: "--max-date 2M"
+      exclude: "--exclude-where 'country!=India'"
+    north_america_early:
+      group_by: "division year month"
+      max_sequences: 100
+      max_date: "--max-date 2M"
+      exclude: "--exclude-where 'region!=North America'"
+    south_america_early:
+      group_by: "country year month"
+      max_sequences: 90
+      max_date: "--max-date 2M"
+      exclude: "--exclude-where 'region!=South America'"
+    oceania_early:
+      group_by: "division year month"
+      max_sequences: 15
+      max_date: "--max-date 2M"
+      exclude: "--exclude-where 'region!=Oceania'"
+    africa_recent:
+      group_by: "country week"
+      max_sequences: 600
+      min_date: "--min-date 2M"
+      exclude: "--exclude-where 'region!=Africa'"
+    asia_recent:
+      group_by: "country week"
+      max_sequences: 800
+      min_date: "--min-date 2M"
+      exclude: "--exclude-where 'region!=Asia' 'country=China' 'country=India'"
+    china_recent:
+      group_by: "division week"
+      max_sequences: 700
+      min_date: "--min-date 2M"
+      exclude: "--exclude-where 'country!=China'"
+    europe_recent:
+      group_by: "country week"
+      max_sequences: 500
+      min_date: "--min-date 2M"
+      exclude: "--exclude-where 'region!=Europe'"
+    india_recent:
+      group_by: "division week"
+      max_sequences: 700
+      min_date: "--min-date 2M"
+      exclude: "--exclude-where 'country!=India'"
+    north_america_recent:
+      group_by: "division week"
+      max_sequences: 400
+      min_date: "--min-date 2M"
+      exclude: "--exclude-where 'region!=North America'"
+    south_america_recent:
+      group_by: "country week"
+      max_sequences: 360
+      min_date: "--min-date 2M"
+      exclude: "--exclude-where 'region!=South America'"
+    oceania_recent:
+      group_by: "division week"
+      max_sequences: 60
+      min_date: "--min-date 2M"
+      exclude: "--exclude-where 'region!=Oceania'"
+
+  # Custom subsampling logic for global region over 6m
+  # 5150 total (expect ~3400)
+  # 4:1 ratio of recent to early
+  # all eight regions equal except Oceania at 20%
+  nextstrain_global_6m:
+    africa_early:
+      group_by: "country year month"
+      max_sequences: 150
+      max_date: "--max-date 6M"
+      exclude: "--exclude-where 'region!=Africa'"
+    asia_early:
+      group_by: "country year month"
+      max_sequences: 200
+      max_date: "--max-date 6M"
+      exclude: "--exclude-where 'region!=Asia' 'country=China' 'country=India'"
+    china_early:
+      group_by: "division year month"
+      max_sequences: 175
+      max_date: "--max-date 6M"
+      exclude: "--exclude-where 'country!=China'"
+    europe_early:
+      group_by: "country year month"
+      max_sequences: 125
+      max_date: "--max-date 6M"
+      exclude: "--exclude-where 'region!=Europe'"
+    india_early:
+      group_by: "division year month"
+      max_sequences: 175
+      max_date: "--max-date 6M"
+      exclude: "--exclude-where 'country!=India'"
+    north_america_early:
+      group_by: "division year month"
+      max_sequences: 100
+      max_date: "--max-date 6M"
+      exclude: "--exclude-where 'region!=North America'"
+    south_america_early:
+      group_by: "country year month"
+      max_sequences: 90
+      max_date: "--max-date 6M"
+      exclude: "--exclude-where 'region!=South America'"
+    oceania_early:
+      group_by: "division year month"
+      max_sequences: 15
+      max_date: "--max-date 6M"
+      exclude: "--exclude-where 'region!=Oceania'"
+    africa_recent:
+      group_by: "country year month"
+      max_sequences: 600
+      min_date: "--min-date 6M"
+      exclude: "--exclude-where 'region!=Africa'"
+    asia_recent:
+      group_by: "country year month"
+      max_sequences: 800
+      min_date: "--min-date 6M"
+      exclude: "--exclude-where 'region!=Asia' 'country=China' 'country=India'"
+    china_recent:
+      group_by: "division year month"
+      max_sequences: 700
+      min_date: "--min-date 6M"
+      exclude: "--exclude-where 'country!=China'"
+    europe_recent:
+      group_by: "country year month"
+      max_sequences: 500
+      min_date: "--min-date 6M"
+      exclude: "--exclude-where 'region!=Europe'"
+    india_recent:
+      group_by: "division year month"
+      max_sequences: 700
+      min_date: "--min-date 6M"
+      exclude: "--exclude-where 'country!=India'"
+    north_america_recent:
+      group_by: "division year month"
+      max_sequences: 400
+      min_date: "--min-date 6M"
+      exclude: "--exclude-where 'region!=North America'"
+    south_america_recent:
+      group_by: "country year month"
+      max_sequences: 360
+      min_date: "--min-date 6M"
+      exclude: "--exclude-where 'region!=South America'"
+    oceania_recent:
+      group_by: "division year month"
+      max_sequences: 60
+      min_date: "--min-date 6M"
+      exclude: "--exclude-where 'region!=Oceania'"
+
+  # Custom subsampling logic for global region over all-time
+  # 5150 total (expect ~3200)
+  # all eight regions equal except Oceania at 20%
+  nextstrain_global_all_time:
+    africa:
+      group_by: "country year month"
+      max_sequences: 750
+      exclude: "--exclude-where 'region!=Africa'"
+    asia:
+      group_by: "country year month"
+      max_sequences: 1000
+      exclude: "--exclude-where 'region!=Asia' 'country=China' 'country=India'"
+    china:
+      group_by: "division year month"
+      max_sequences: 875
+      exclude: "--exclude-where 'country!=China'"
+    europe:
+      group_by: "country year month"
+      max_sequences: 625
+      exclude: "--exclude-where 'region!=Europe'"
+    india:
+      group_by: "division year month"
+      max_sequences: 875
+      exclude: "--exclude-where 'country!=India'"
+    north_america:
+      group_by: "division year month"
+      max_sequences: 500
+      exclude: "--exclude-where 'region!=North America'"
+    south_america:
+      group_by: "country year month"
+      max_sequences: 450
+      exclude: "--exclude-where 'region!=South America'"
+    oceania:
+      group_by: "division year month"
+      max_sequences: 75
+      exclude: "--exclude-where 'region!=Oceania'"
+
+# Root to clade 21L
+refine:
+  root: "21L"
+
+# if different traits should be reconstructed for some builds, specify here
+# otherwise the default trait config in defaults/parameters.yaml will be used
+traits:
+  reference:
+    sampling_bias_correction: 2.5
+    columns: ["region"]
+  global_1m:
+    sampling_bias_correction: 2.5
+    columns: ["region"]
+  global_2m:
+    sampling_bias_correction: 2.5
+    columns: ["region"]
+  global_6m:
+    sampling_bias_correction: 2.5
+    columns: ["region"]
+  global_all-time:
+    sampling_bias_correction: 2.5
+    columns: ["region"]
+  africa_1m:
+    sampling_bias_correction: 2.5
+    columns: ["country"]
+  africa_2m:
+    sampling_bias_correction: 2.5
+    columns: ["country"]
+  africa_6m:
+    sampling_bias_correction: 2.5
+    columns: ["country"]
+  africa_all-time:
+    sampling_bias_correction: 2.5
+    columns: ["country"]
+  asia_1m:
+    sampling_bias_correction: 2.5
+    columns: ["country"]
+  asia_2m:
+    sampling_bias_correction: 2.5
+    columns: ["country"]
+  asia_6m:
+    sampling_bias_correction: 2.5
+    columns: ["country"]
+  asia_all-time:
+    sampling_bias_correction: 2.5
+    columns: ["country"]
+  europe_1m:
+    sampling_bias_correction: 2.5
+    columns: ["country"]
+  europe_2m:
+    sampling_bias_correction: 2.5
+    columns: ["country"]
+  europe_6m:
+    sampling_bias_correction: 2.5
+    columns: ["country"]
+  europe_all-time:
+    sampling_bias_correction: 2.5
+    columns: ["country"]
+  north-america_1m:
+    sampling_bias_correction: 2.5
+    columns: ["division"]
+  north-america_2m:
+    sampling_bias_correction: 2.5
+    columns: ["division"]
+  north-america_6m:
+    sampling_bias_correction: 2.5
+    columns: ["division"]
+  north-america_all-time:
+    sampling_bias_correction: 2.5
+    columns: ["division"]
+  oceania_1m:
+    sampling_bias_correction: 2.5
+    columns: ["division"]
+  oceania_2m:
+    sampling_bias_correction: 2.5
+    columns: ["division"]
+  oceania_6m:
+    sampling_bias_correction: 2.5
+    columns: ["division"]
+  oceania_all-time:
+    sampling_bias_correction: 2.5
+    columns: ["division"]
+  south-america_1m:
+    sampling_bias_correction: 2.5
+    columns: ["country"]
+  south-america_2m:
+    sampling_bias_correction: 2.5
+    columns: ["country"]
+  south-america_6m:
+    sampling_bias_correction: 2.5
+    columns: ["country"]
+  south-america_all-time:
+    sampling_bias_correction: 2.5
+    columns: ["country"]
+
+# Define frequencies parameters
+# Target frequencies to "reference", "1m", "2m", "6m" and "all-time" builds
+# narrow_bandwidth = 0.019 or 7 days for "1m" and "2m"
+# narrow_bandwidth = 0.038 or 14 days for "reference", "6m" and "all-time"
+frequencies:
+  reference:  # same values as the "all-time" entries in this mapping
+    min_date: "2022-01-01"
+    narrow_bandwidth: 0.038
+    recent_days_to_censor: 7
+  global_1m:
+    min_date: "1M"
+    narrow_bandwidth: 0.019
+    recent_days_to_censor: 7
+  global_2m:
+    min_date: "2M"
+    narrow_bandwidth: 0.019
+    recent_days_to_censor: 7
+  global_6m:
+    min_date: "6M"
+    narrow_bandwidth: 0.038
+    recent_days_to_censor: 7
+  global_all-time:
+    min_date: "2022-01-01"
+    narrow_bandwidth: 0.038
+    recent_days_to_censor: 7
+  africa_1m:
+    min_date: "1M"
+    narrow_bandwidth: 0.019
+    recent_days_to_censor: 7
+  africa_2m:
+    min_date: "2M"
+    narrow_bandwidth: 0.019
+    recent_days_to_censor: 7
+  africa_6m:
+    min_date: "6M"
+    narrow_bandwidth: 0.038
+    recent_days_to_censor: 7
+  africa_all-time:
+    min_date: "2022-01-01"
+    narrow_bandwidth: 0.038
+    recent_days_to_censor: 7
+  asia_1m:
+    min_date: "1M"
+    narrow_bandwidth: 0.019
+    recent_days_to_censor: 7
+  asia_2m:
+    min_date: "2M"
+    narrow_bandwidth: 0.019
+    recent_days_to_censor: 7
+  asia_6m:
+    min_date: "6M"
+    narrow_bandwidth: 0.038
+    recent_days_to_censor: 7
+  asia_all-time:
+    min_date: "2022-01-01"
+    narrow_bandwidth: 0.038
+    recent_days_to_censor: 7
+  europe_1m:
+    min_date: "1M"
+    narrow_bandwidth: 0.019
+    recent_days_to_censor: 7
+  europe_2m:
+    min_date: "2M"
+    narrow_bandwidth: 0.019
+    recent_days_to_censor: 7
+  europe_6m:
+    min_date: "6M"
+    narrow_bandwidth: 0.038
+    recent_days_to_censor: 7
+  europe_all-time:
+    min_date: "2022-01-01"
+    narrow_bandwidth: 0.038
+    recent_days_to_censor: 7
+  north-america_1m:
+    min_date: "1M"
+    narrow_bandwidth: 0.019
+    recent_days_to_censor: 7
+  north-america_2m:
+    min_date: "2M"
+    narrow_bandwidth: 0.019
+    recent_days_to_censor: 7
+  north-america_6m:
+    min_date: "6M"
+    narrow_bandwidth: 0.038
+    recent_days_to_censor: 7
+  north-america_all-time:
+    min_date: "2022-01-01"
+    narrow_bandwidth: 0.038
+    recent_days_to_censor: 7
+  oceania_1m:
+    min_date: "1M"
+    narrow_bandwidth: 0.019
+    recent_days_to_censor: 7
+  oceania_2m:
+    min_date: "2M"
+    narrow_bandwidth: 0.019
+    recent_days_to_censor: 7
+  oceania_6m:
+    min_date: "6M"
+    narrow_bandwidth: 0.038
+    recent_days_to_censor: 7
+  oceania_all-time:
+    min_date: "2022-01-01"
+    narrow_bandwidth: 0.038
+    recent_days_to_censor: 7
+  south-america_1m:
+    min_date: "1M"
+    narrow_bandwidth: 0.019
+    recent_days_to_censor: 7
+  south-america_2m:
+    min_date: "2M"
+    narrow_bandwidth: 0.019
+    recent_days_to_censor: 7
+  south-america_6m:
+    min_date: "6M"
+    narrow_bandwidth: 0.038
+    recent_days_to_censor: 7
+  south-america_all-time:
+    min_date: "2022-01-01"
+    narrow_bandwidth: 0.038
+    recent_days_to_censor: 7
diff --git a/nextstrain_profiles/nextstrain-open-21L/config.yaml b/nextstrain_profiles/nextstrain-open-21L/config.yaml
new file mode 100644
index 000000000..64646a638
--- /dev/null
+++ b/nextstrain_profiles/nextstrain-open-21L/config.yaml
@@ -0,0 +1,12 @@
+configfile:
+  - defaults/parameters.yaml
+  - nextstrain_profiles/nextstrain-open-21L/builds.yaml
+
+cores: 8
+keep-going: false
+printshellcmds: true
+show-failed-logs: true
+restart-times: 2
+reason: true
+stats: stats.json
+set-threads: tree=4
diff --git a/nextstrain_profiles/nextstrain-open-21L/exclude-clades.tsv b/nextstrain_profiles/nextstrain-open-21L/exclude-clades.tsv
new file mode 100644
index 000000000..58126031f
--- /dev/null
+++ b/nextstrain_profiles/nextstrain-open-21L/exclude-clades.tsv
@@ -0,0 +1,25 @@
+clade
+19A
+19B
+20A
+20B
+20C
+20D
+20E
+20F
+20G
+20H
+20I
+20J
+21A
+21B
+21C
+21D
+21E
+21F
+21G
+21H
+21I
+21J
+21K
+21M
diff --git a/nextstrain_profiles/nextstrain-open-21L/include.txt b/nextstrain_profiles/nextstrain-open-21L/include.txt
new file mode 100644
index 000000000..80fab45a6
--- /dev/null
+++ b/nextstrain_profiles/nextstrain-open-21L/include.txt
@@ -0,0 +1 @@
+21L
diff --git a/nextstrain_profiles/nextstrain-open-21L/nextstrain_description.md b/nextstrain_profiles/nextstrain-open-21L/nextstrain_description.md
new file mode 100644
index 000000000..5b90b40f9
--- /dev/null
+++ b/nextstrain_profiles/nextstrain-open-21L/nextstrain_description.md
@@ -0,0 +1,39 @@
+Compiled Nextstrain SARS-CoV-2 resources are available at [nextstrain.org/sars-cov-2](https://nextstrain.org/sars-cov-2/). Follow [@nextstrain](https://twitter.com/nextstrain) for updates.
+
+This phylogeny shows evolutionary relationships of SARS-CoV-2 viruses relative to the clade 21L (Pango lineage BA.2) reference virus. Although the genetic relationships among sampled viruses are generally quite clear, there is considerable uncertainty surrounding estimates of specific transmission dates and in reconstruction of geographic spread. Please be aware that specific inferred geographic transmission patterns and temporal estimates are only a hypothesis.
+
+There are millions of complete SARS-CoV-2 genomes available on open databases and this number increases every day. This visualization can only handle ~4000 genomes in a single view for performance and legibility reasons. Because of this we subsample available genome data for our analysis views. We provision multiple views to focus subsampling with different reference viruses, different geographic regions and different time periods. These views are available through the "Dataset" dropdown on the left or by clicking on the following links:
+
+**Using ancestral Wuhan viruses as phylogeny root**
+&nbsp;            | past 1 month                                                               | past 2 months                                                              | past 6 months                                                              | all time
+----------------- | -------------------------------------------------------------------------- | -------------------------------------------------------------------------- | -------------------------------------------------------------------------- | --------------------------------------------------------------------------------------
+**global**        | [global/1m](/ncov/open/global/1m)                                        | [global/2m](/ncov/open/global/2m)                                        | [global/6m](/ncov/open/global/6m)                                        | [global/all-time](/ncov/open/global/all-time)                                        |
+**Africa**        | [africa/1m](/ncov/open/africa/1m?f_region=Africa)                        | [africa/2m](/ncov/open/africa/2m?f_region=Africa)                        | [africa/6m](/ncov/open/africa/6m?f_region=Africa)                        | [africa/all-time](/ncov/open/africa/all-time?f_region=Africa)                        |
+**Asia**          | [asia/1m](/ncov/open/asia/1m?f_region=Asia)                              | [asia/2m](/ncov/open/asia/2m?f_region=Asia)                              | [asia/6m](/ncov/open/asia/6m?f_region=Asia)                              | [asia/all-time](/ncov/open/asia/all-time?f_region=Asia)                              |
+**Europe**        | [europe/1m](/ncov/open/europe/1m?f_region=Europe)                        | [europe/2m](/ncov/open/europe/2m?f_region=Europe)                        | [europe/6m](/ncov/open/europe/6m?f_region=Europe)                        | [europe/all-time](/ncov/open/europe/all-time?f_region=Europe)                        |
+**North America** | [north-america/1m](/ncov/open/north-america/1m?f_region=North%20America) | [north-america/2m](/ncov/open/north-america/2m?f_region=North%20America) | [north-america/6m](/ncov/open/north-america/6m?f_region=North%20America) | [north-america/all-time](/ncov/open/north-america/all-time?f_region=North%20America) |
+**Oceania**       | [oceania/1m](/ncov/open/oceania/1m?f_region=Oceania)                     | [oceania/2m](/ncov/open/oceania/2m?f_region=Oceania)                     | [oceania/6m](/ncov/open/oceania/6m?f_region=Oceania)                     | [oceania/all-time](/ncov/open/oceania/all-time?f_region=Oceania)                     |
+**South America** | [south-america/1m](/ncov/open/south-america/1m?f_region=South%20America) | [south-america/2m](/ncov/open/south-america/2m?f_region=South%20America) | [south-america/6m](/ncov/open/south-america/6m?f_region=South%20America) | [south-america/all-time](/ncov/open/south-america/all-time?f_region=South%20America) |
+
+**Using clade 21L (lineage BA.2) as phylogeny root**
+&nbsp;            | past 1 month                                                                       | past 2 months                                                                      | past 6 months                                                                      | all time
+----------------- | ---------------------------------------------------------------------------------- | ---------------------------------------------------------------------------------- | ---------------------------------------------------------------------------------- | ----------------------------------------------------------------------------------------------
+**global**        | [21L/global/1m](/ncov/open/21L/global/1m)                                        | [21L/global/2m](/ncov/open/21L/global/2m)                                        | [21L/global/6m](/ncov/open/21L/global/6m)                                        | [21L/global/all-time](/ncov/open/21L/global/all-time)                                        |
+**Africa**        | [21L/africa/1m](/ncov/open/21L/africa/1m?f_region=Africa)                        | [21L/africa/2m](/ncov/open/21L/africa/2m?f_region=Africa)                        | [21L/africa/6m](/ncov/open/21L/africa/6m?f_region=Africa)                        | [21L/africa/all-time](/ncov/open/21L/africa/all-time?f_region=Africa)                        |
+**Asia**          | [21L/asia/1m](/ncov/open/21L/asia/1m?f_region=Asia)                              | [21L/asia/2m](/ncov/open/21L/asia/2m?f_region=Asia)                              | [21L/asia/6m](/ncov/open/21L/asia/6m?f_region=Asia)                              | [21L/asia/all-time](/ncov/open/21L/asia/all-time?f_region=Asia)                              |
+**Europe**        | [21L/europe/1m](/ncov/open/21L/europe/1m?f_region=Europe)                        | [21L/europe/2m](/ncov/open/21L/europe/2m?f_region=Europe)                        | [21L/europe/6m](/ncov/open/21L/europe/6m?f_region=Europe)                        | [21L/europe/all-time](/ncov/open/21L/europe/all-time?f_region=Europe)                        |
+**North America** | [21L/north-america/1m](/ncov/open/21L/north-america/1m?f_region=North%20America) | [21L/north-america/2m](/ncov/open/21L/north-america/2m?f_region=North%20America) | [21L/north-america/6m](/ncov/open/21L/north-america/6m?f_region=North%20America) | [21L/north-america/all-time](/ncov/open/21L/north-america/all-time?f_region=North%20America) |
+**Oceania**       | [21L/oceania/1m](/ncov/open/21L/oceania/1m?f_region=Oceania)                     | [21L/oceania/2m](/ncov/open/21L/oceania/2m?f_region=Oceania)                     | [21L/oceania/6m](/ncov/open/21L/oceania/6m?f_region=Oceania)                     | [21L/oceania/all-time](/ncov/open/21L/oceania/all-time?f_region=Oceania)                     |
+**South America** | [21L/south-america/1m](/ncov/open/21L/south-america/1m?f_region=South%20America) | [21L/south-america/2m](/ncov/open/21L/south-america/2m?f_region=South%20America) | [21L/south-america/6m](/ncov/open/21L/south-america/6m?f_region=South%20America) | [21L/south-america/all-time](/ncov/open/21L/south-america/all-time?f_region=South%20America) |
+
+Site numbering and genome structure use [Wuhan-Hu-1/2019](https://www.ncbi.nlm.nih.gov/nuccore/MN908947) as reference. The phylogeny is rooted relative to the clade 21L (Pango lineage BA.2) reference virus. Temporal resolution assumes a nucleotide substitution rate of 8 &times; 10^-4 subs per site per year. Mutational fitness is calculated using results from [Obermeyer et al](https://doi.org/10.1126/science.abm1208). Immune escape vs BA.2 is estimated using the [RBD antibody escape calculator](https://jbloomlab.github.io/SARS2-RBD-escape-calc/) maintained by Jesse Bloom. Full details on bioinformatic processing can be found [here](https://github.com/nextstrain/ncov).
+
+We gratefully acknowledge the authors, originating and submitting laboratories of the genetic sequences and metadata for sharing their work in open databases. Please note that although data generators have generously shared data in an open fashion, that does not mean there should be free license to publish on this data. Data generators should be cited where possible and collaborations should be sought in some circumstances. Please try to avoid scooping someone else's work. Reach out if uncertain. An attribution table is available by clicking on "Download Data" at the bottom of the page and then clicking on "Strain Metadata" in the resulting dialog box.
+
+To maximize the utility and visibility of these generously shared data, [we provide preprocessed files that can serve as a starting point for additional analyses](https://docs.nextstrain.org/projects/ncov/en/latest/reference/remote_inputs.html).
+
+#### Reusing code or images
+
+All source code for Auspice, the visualization tool, is freely available under the terms of the [GNU Affero General Public License 3.0](https://github.com/nextstrain/auspice/blob/HEAD/LICENSE.txt).
+
+Screenshots may be used under a [CC-BY-4.0 license](https://creativecommons.org/licenses/by/4.0/) and attribution to nextstrain.org must be provided. A high-quality download option is available by clicking the **DOWNLOAD DATA** button at the bottom of the page and selecting **SCREENSHOT (SVG)**.
diff --git a/nextstrain_profiles/nextstrain-open-21L/prefilter.smk b/nextstrain_profiles/nextstrain-open-21L/prefilter.smk
new file mode 100644
index 000000000..623f110dc
--- /dev/null
+++ b/nextstrain_profiles/nextstrain-open-21L/prefilter.smk
@@ -0,0 +1,103 @@
+rule clades_21L:
+    input:
+        clades = "defaults/clades.tsv",
+        exclude_clades = "nextstrain_profiles/nextstrain-open-21L/exclude-clades.tsv",  # rows matching its "clade" column are dropped; NOTE(review): confirm clade names use the same format as defaults/clades.tsv
+    output:
+        clades = "results/clades_21L.tsv",  # expanded clade definitions minus the excluded clades
+    log: "logs/clades_21L.txt"  # receives the shell block's stderr (exec 2>)
+    benchmark: "benchmarks/clades_21L.txt"
+    conda: config["conda_environment"]
+    shell:
+        r"""
+        exec 2> {log:q}
+
+          ./scripts/expand-clade-definitions {input.clades:q} \
+        | tsv-join \
+            --header \
+            --exclude \
+            --filter-file {input.exclude_clades:q} \
+            --key-fields clade \
+        > {output.clades:q}
+        """
+
+
+rule open_21L_metadata:
+    input:
+        references = "data/references_metadata.tsv",  # passed to tsv-cast-header together with the metadata; presumably recast to the metadata's header - verify against the script
+        metadata = path_or_url("s3://nextstrain-data/files/ncov/open/metadata.tsv.zst", keep_local=True),
+        exclude_clades = "nextstrain_profiles/nextstrain-open-21L/exclude-clades.tsv",  # its "clade" column is matched against the metadata's "clade_nextstrain" column
+    output:
+        metadata = "results/open_21L_metadata.tsv.zst",  # written in two zstd passes: tsv-cast-header output (>), then filtered metadata without its header row (>>)
+    log: "logs/open_21L_metadata.txt"  # receives the shell block's stderr (exec 2>)
+    benchmark: "benchmarks/open_21L_metadata.txt"
+    conda: config["conda_environment"]
+    threads: 8  # the second zstd call is given threads - 2
+    shell:
+        r"""
+        exec 2> {log:q}
+
+        ./scripts/tsv-cast-header \
+            <(unzstd < {input.metadata:q}) \
+            {input.references:q} \
+        | zstd \
+        > {output.metadata:q}
+
+        < {input.metadata:q} \
+          unzstd \
+        | tsv-join \
+            --header \
+            --exclude \
+            --filter-file {input.exclude_clades:q} \
+            --key-fields clade \
+            --data-fields clade_nextstrain \
+        | sed 1d \
+        | zstd -T$(({threads} - 2)) \
+        >> {output.metadata:q}
+        """
+
+
+rule open_21L_strains:
+    input:
+        metadata = "results/open_21L_metadata.tsv.zst",
+    output:
+        strains = "results/open_21L_strains.txt",  # values of the "strain" column only, header line removed
+    log: "logs/open_21L_strains.txt"  # receives the shell block's stderr (exec 2>)
+    benchmark: "benchmarks/open_21L_strains.txt"
+    conda: config["conda_environment"]
+    shell:
+        r"""
+        exec 2> {log:q}
+
+        < {input.metadata:q} \
+          unzstd \
+        | tsv-select --header -f strain \
+        | sed 1d \
+        > {output.strains:q}
+        """
+
+
+rule open_21L_aligned:
+    input:
+        references = "data/references_sequences.fasta",  # compressed first, as its own zstd stream
+        aligned = path_or_url("s3://nextstrain-data/files/ncov/open/aligned.fasta.zst", keep_local=True),
+        strains = "results/open_21L_strains.txt",  # names handed to seqkit grep --by-name
+    output:
+        aligned = "results/open_21L_aligned.fasta.zst",  # references (>) followed by the selected aligned sequences (>>)
+    log: "logs/open_21L_aligned.txt"  # receives the shell block's stderr (exec 2>)
+    benchmark: "benchmarks/open_21L_aligned.txt"
+    conda: config["conda_environment"]
+    threads: 8  # the second zstd call is given threads - 2
+    shell:
+        r"""
+        exec 2> {log:q}
+
+        < {input.references:q} \
+          zstd \
+        > {output.aligned:q}
+
+        < {input.aligned:q} \
+          unzstd \
+        | seqkit grep --by-name -f {input.strains:q} \
+        | zstd -T$(({threads} - 2)) \
+        >> {output.aligned:q}
+        """

From d49972dcbae33c24eacf109dff6f5b2aba16bede Mon Sep 17 00:00:00 2001
From: Trevor Bedford <trevor@bedford.io>
Date: Sat, 9 Mar 2024 15:30:45 -0800
Subject: [PATCH 2/3] Include GitHub Action for open 21L rebuild

---
 .github/workflows/rebuild-open-21L.yml | 90 ++++++++++++++++++++++++++
 1 file changed, 90 insertions(+)
 create mode 100644 .github/workflows/rebuild-open-21L.yml

diff --git a/.github/workflows/rebuild-open-21L.yml b/.github/workflows/rebuild-open-21L.yml
new file mode 100644
index 000000000..d375bcb5d
--- /dev/null
+++ b/.github/workflows/rebuild-open-21L.yml
@@ -0,0 +1,90 @@
+name: Rebuild open 21L phylogenetic datasets
+
+on:
+  # This workflow can be triggered from repository_dispatch events,
+  # for instance, after the appropriate preprocessing actions have completed
+  repository_dispatch:
+    types:
+      - rebuild
+      - open/rebuild
+      - genbank/rebuild
+  # Manually triggered using GitHub's UI
+  workflow_dispatch:
+    inputs:
+      trial_name:
+        description: "Short name for this trial build, for prefixing the uploaded data and results files. WARNING: without this we will overwrite files in s3://nextstrain-data/files/ncov/open/ and the trees on nextstrain.org/ncov/open/..."
+        required: false
+      image:
+        description: 'Specific container image to use for build (will override the default of "nextstrain build")'
+        required: false
+
+env:
+  TRIAL_NAME: ${{ github.event.inputs.trial_name }}  # optional workflow_dispatch input; when non-empty the launch step uses its trial config
+  NEXTSTRAIN_DOCKER_IMAGE: ${{ github.event.inputs.image }}  # optional workflow_dispatch input "image"
+
+jobs:
+  open-21L:
+    runs-on: ubuntu-22.04
+    steps:
+    - uses: actions/checkout@v4
+
+    - uses: nextstrain/.github/actions/setup-nextstrain-cli@master
+
+    - name: Launch build  # submits the detached AWS Batch build and records the launch output in build-launch.log
+      run: |
+        set -x -o pipefail  # without pipefail the exit status of `tee` would hide a failed launch
+
+        declare -a config
+        config+=(build_date=\'$(date +'%Y-%m-%d')\')
+        if [[ "$TRIAL_NAME" ]]; then
+          config+=(
+            S3_DST_BUCKET=nextstrain-staging/files/ncov/open/trial/"$TRIAL_NAME"
+            deploy_url=s3://nextstrain-staging/
+            auspice_json_prefix=ncov_open_21L_trial_"$TRIAL_NAME"
+          )
+        else
+          config+=(slack_token="$SLACK_TOKEN")
+        fi
+
+        nextstrain build \
+          --aws-batch \
+          --detach \
+          --cpus 72 \
+          --memory 140GiB \
+          . \
+            deploy \
+            upload \
+            --config "${config[@]}" \
+            --profile nextstrain_profiles/nextstrain-open-21L \
+            --set-threads tree=8 \
+        |& tee build-launch.log
+      env:
+        AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_ID }}
+        AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
+        SLACK_TOKEN: ${{ secrets.SLACK_TOKEN }}
+
+    - name: Build info  # prints where outputs will appear and how to reach the launched job
+      run: |
+        if [[ "$TRIAL_NAME" ]]; then
+          echo "--> Trial name is: $TRIAL_NAME"
+          echo
+          echo "--> When completed, the following will be available:"
+          echo "build files: s3://nextstrain-staging/files/ncov/open/trial/$TRIAL_NAME/"
+          echo "nextstrain URLs: https://nextstrain.org/staging/ncov/open/21L/trial/$TRIAL_NAME/REGION_NAME/TIME_SPAN"
+          echo "e.g. https://nextstrain.org/staging/ncov/open/21L/trial/$TRIAL_NAME/global/all-time"
+        else
+          echo "--> open 21L phylogenetic analysis rebuilding on AWS"
+          echo
+          echo "--> When completed, the following will be updated:"
+          echo "build files: s3://nextstrain-data/files/ncov/open/REGION_NAME"
+          echo "nextstrain URLs: https://nextstrain.org/ncov/open/21L/REGION_NAME/TIME_SPAN"
+          echo "e.g. https://nextstrain.org/ncov/open/21L/global/all-time"
+        fi
+        echo
+        echo "--> You can attach to this AWS job via:"
+        tail -n1 build-launch.log  # last line of the launch output written by tee in the launch step
+        echo
+        JOBID=$( tail -n1 build-launch.log | sed -E 's/.+attach ([-a-f0-9]+).+/\1/' )  # the hex-and-dash token that follows "attach" on that line
+        echo "--> View this job in the AWS console via"
+        echo "    https://console.aws.amazon.com/batch/home?region=us-east-1#jobs/detail/${JOBID}"
+        echo

From 9ff576ee35a3bf0c7ab8511b1cfc8953ca302b72 Mon Sep 17 00:00:00 2001
From: Trevor Bedford <trevor@bedford.io>
Date: Sat, 9 Mar 2024 15:37:45 -0800
Subject: [PATCH 3/3] Update change log to include open 21L

---
 docs/src/reference/change_log.md | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/docs/src/reference/change_log.md b/docs/src/reference/change_log.md
index 08c030d3c..02be644c0 100644
--- a/docs/src/reference/change_log.md
+++ b/docs/src/reference/change_log.md
@@ -5,6 +5,8 @@ We also use this change log to document new features that maintain backward comp
 
 ## New features since last version update
 
+- 9 March 2024: Include 21L-focused builds for open data, eg https://nextstrain.org/ncov/open/21L/global/6m. [PR 1101](https://github.com/nextstrain/ncov/pull/1101)
+
 - 31 January 2024: Remove RBD-level related rules and files since this feature has been broken since May 2023 and is no longer relevant. [PR 1097](https://github.com/nextstrain/ncov/pull/1097)
 
 - 30 January 2024: Fix RBD-level coloring by updating clade label and clade parsing. [PR 1094](https://github.com/nextstrain/ncov/pull/1094)