From a53be64d88469d688a5bbf5b4a75781b995cf7a6 Mon Sep 17 00:00:00 2001 From: james hadfield Date: Thu, 27 Nov 2025 14:19:04 +1300 Subject: [PATCH 1/2] [curate apply-geolocation-rules] allow '?' values Allows '?' to be used interchangeably with the empty value for the purposes of geographic remappings. See the added tests for real-life occurrences which this commit fixes. In our default rules files there are n=5181 rules where only the final value (location) is empty and these rules can now apply to metadata values which set "location='?'". There are 44 rules with empty division & location and one rule (!) with empty country, division & location. --- augur/curate/apply_geolocation_rules.py | 9 +++ .../question-mark-values.t | 60 +++++++++++++++++++ 2 files changed, 69 insertions(+) create mode 100644 tests/functional/curate/cram/apply-geolocation-rules/question-mark-values.t diff --git a/augur/curate/apply_geolocation_rules.py b/augur/curate/apply_geolocation_rules.py index 97176e0a7..d3b41040a 100644 --- a/augur/curate/apply_geolocation_rules.py +++ b/augur/curate/apply_geolocation_rules.py @@ -73,6 +73,15 @@ def load_geolocation_rules(geolocation_rules_file, case_sensitive): geolocation_rules[raw[0]][raw[1]][raw[2]][raw[3]] = annot + # We want to match '?' values to the empty string as well to allow them to be used as empty values + # but if they are used it's expected they are used for all "empty" fields + if raw[1]=='' and raw[2]=='' and raw[3]=='': + geolocation_rules[raw[0]]['?']['?']['?'] = annot + elif raw[2]=='' and raw[3]=='': + geolocation_rules[raw[0]][raw[1]]['?']['?'] = annot + elif raw[3]=='': + geolocation_rules[raw[0]][raw[1]][raw[2]]['?'] = annot + return geolocation_rules diff --git a/tests/functional/curate/cram/apply-geolocation-rules/question-mark-values.t b/tests/functional/curate/cram/apply-geolocation-rules/question-mark-values.t new file mode 100644 index 000000000..6a1ddbe77 --- /dev/null +++ b/tests/functional/curate/cram/apply-geolocation-rules/question-mark-values.t @@ -0,0 +1,60 @@ +Setup + + $ export AUGUR="${AUGUR:-$TESTDIR/../../../../../bin/augur}" + +We sometimes use question marks '?' as an unknown field. `augur curate apply-record-annotations` +special-cases these so that they aren't interpreted as a valid value. + +This test uses real data from seasonal-flu where this bug was first observed + +Rules taken directly from `augur/data/geolocation_rules.tsv` as at c8181c7 + + $ cat >rules.tsv <<~~ + > Asia/Bangladesh/Jashore/ Asia/Bangladesh/Khulna/Jashore + > Asia/Bangladesh/Jashore/* Asia/Bangladesh/Khulna/* + > ~~ + +Fist check that if the division is EMPTY then the first rule is applied, as we expect + + $ echo '{"region": "Asia", "country": "Bangladesh", "division": "Jashore", "location": ""}' \ + > | ${AUGUR} curate apply-geolocation-rules --geolocation-rules rules.tsv + {"region": "Asia", "country": "Bangladesh", "division": "Khulna", "location": "Jashore"} + +but if the location is a "?", which is the case in our seasonal-flu ingest pipeline we want to use the first rule too +rather than interpreting "?" as a valid location to preserve + + + $ echo '{"region": "Asia", "country": "Bangladesh", "division": "Jashore", "location": "?"}' \ + > | ${AUGUR} curate apply-geolocation-rules --geolocation-rules rules.tsv + {"region": "Asia", "country": "Bangladesh", "division": "Khulna", "location": "Jashore"} + + +Edge case: preserve '?' in the output if we don't replace it via a rule + + $ echo '{"region": "Asia", "country": "X", "division": "?", "location": "?"}' \ + > | ${AUGUR} curate apply-geolocation-rules --geolocation-rules rules.tsv + {"region": "Asia", "country": "X", "division": "?", "location": "?"} + + +This works for multiple fields in a work-backwards fashion + + $ cat >rules.tsv <<~~ + > North America/USA Alabama// North America/USA/Alabama/ + > ~~ + + $ echo '{"region": "North America", "country": "USA Alabama", "division": "", "location": ""}' \ + > | ${AUGUR} curate apply-geolocation-rules --geolocation-rules rules.tsv + {"region": "North America", "country": "USA", "division": "Alabama", "location": ""} + + $ echo '{"region": "North America", "country": "USA Alabama", "division": "?", "location": "?"}' \ + > | ${AUGUR} curate apply-geolocation-rules --geolocation-rules rules.tsv + {"region": "North America", "country": "USA", "division": "Alabama", "location": ""} + +NOTE: but division=? and location=empty string (or vice-versa) doesn't work - it's expected that the missing-value is used consistently + $ echo '{"region": "North America", "country": "USA Alabama", "division": "?", "location": ""}' \ + > | ${AUGUR} curate apply-geolocation-rules --geolocation-rules rules.tsv + {"region": "North America", "country": "USA Alabama", "division": "?", "location": ""} + + $ echo '{"region": "North America", "country": "USA Alabama", "division": "", "location": "?"}' \ + > | ${AUGUR} curate apply-geolocation-rules --geolocation-rules rules.tsv + {"region": "North America", "country": "USA Alabama", "division": "", "location": "?"} From e5fcd693a15630295a2a388421659cdd31328e31 Mon Sep 17 00:00:00 2001 From: james hadfield Date: Thu, 27 Nov 2025 15:04:33 +1300 Subject: [PATCH 2/2] [tests] don't use default geolocation rules These didn't affect the test results (mainly as we used dummy rules which never conflicted) but the intention of these tests is to only use the rules defined in the actual test. (The --no-default-rules flag was added in c0575c68206f106a1030e6a72dc3f3a3402d4713) --- augur/curate/apply_geolocation_rules.py | 7 ++++--- .../cram/apply-geolocation-rules/cyclic-error.t | 4 ++-- .../cram/apply-geolocation-rules/empty-rule.t | 4 ++-- .../general-rule-fields.t | 12 ++++++------ .../general-rule-precedence.t | 6 +++--- .../apply-geolocation-rules/ignore-comments.t | 2 +- .../invalid-rule-warnings.t | 2 +- .../last-duplicate-rule-applied.t | 2 +- .../question-mark-values.t | 17 +++++++++-------- .../rule-case-sensitivity.t | 6 +++--- .../rule-type-preference.t | 2 +- 11 files changed, 33 insertions(+), 31 deletions(-) diff --git a/augur/curate/apply_geolocation_rules.py b/augur/curate/apply_geolocation_rules.py index d3b41040a..aedd29cc5 100644 --- a/augur/curate/apply_geolocation_rules.py +++ b/augur/curate/apply_geolocation_rules.py @@ -75,12 +75,13 @@ def load_geolocation_rules(geolocation_rules_file, case_sensitive): # We want to match '?' values to the empty string as well to allow them to be used as empty values # but if they are used it's expected they are used for all "empty" fields + annot_using_question_marks = tuple(['?' if x=='' else x for x in annot]) if raw[1]=='' and raw[2]=='' and raw[3]=='': - geolocation_rules[raw[0]]['?']['?']['?'] = annot + geolocation_rules[raw[0]]['?']['?']['?'] = annot_using_question_marks elif raw[2]=='' and raw[3]=='': - geolocation_rules[raw[0]][raw[1]]['?']['?'] = annot + geolocation_rules[raw[0]][raw[1]]['?']['?'] = annot_using_question_marks elif raw[3]=='': - geolocation_rules[raw[0]][raw[1]][raw[2]]['?'] = annot + geolocation_rules[raw[0]][raw[1]][raw[2]]['?'] = annot_using_question_marks return geolocation_rules diff --git a/tests/functional/curate/cram/apply-geolocation-rules/cyclic-error.t b/tests/functional/curate/cram/apply-geolocation-rules/cyclic-error.t index f380a0d33..9287e2a24 100644 --- a/tests/functional/curate/cram/apply-geolocation-rules/cyclic-error.t +++ b/tests/functional/curate/cram/apply-geolocation-rules/cyclic-error.t @@ -13,13 +13,13 @@ Attempting to use the rules with no matches results in no changes. $ echo '{"region": "r_old", "country": "c_old", "division": "d_old", "location": "l_something_else"}' \ > | ${AUGUR} curate apply-geolocation-rules \ - > --geolocation-rules rules.tsv + > --no-default-rules --geolocation-rules rules.tsv {"region": "r_old", "country": "c_old", "division": "d_old", "location": "l_something_else"} Attempting to use the rules with a match results in an error. $ echo '{"region": "r_old", "country": "c_old", "division": "d_old", "location": "l_old"}' \ > | ${AUGUR} curate apply-geolocation-rules \ - > --geolocation-rules rules.tsv + > --no-default-rules --geolocation-rules rules.tsv ERROR: More than 1000 geolocation rules applied on the same entry ['r_old', 'c_old', 'd_old', 'l_old']. [2] diff --git a/tests/functional/curate/cram/apply-geolocation-rules/empty-rule.t b/tests/functional/curate/cram/apply-geolocation-rules/empty-rule.t index 336957edc..33d852d03 100644 --- a/tests/functional/curate/cram/apply-geolocation-rules/empty-rule.t +++ b/tests/functional/curate/cram/apply-geolocation-rules/empty-rule.t @@ -10,7 +10,7 @@ Test that an empty field can be loaded in the raw and annotated columns. $ echo '{"region": "r_raw", "country": "c_raw", "division": "", "location": ""}' \ > | ${AUGUR} curate apply-geolocation-rules \ - > --geolocation-rules rules.tsv + > --no-default-rules --geolocation-rules rules.tsv {"region": "r_annotated", "country": "c_annotated", "division": "", "location": ""} This is effectively an exact rule, so it will not apply to any record that does @@ -18,5 +18,5 @@ not match the empty string. $ echo '{"region": "r_raw", "country": "c_raw", "division": "d_raw", "location": ""}' \ > | ${AUGUR} curate apply-geolocation-rules \ - > --geolocation-rules rules.tsv + > --no-default-rules --geolocation-rules rules.tsv {"region": "r_raw", "country": "c_raw", "division": "d_raw", "location": ""} diff --git a/tests/functional/curate/cram/apply-geolocation-rules/general-rule-fields.t b/tests/functional/curate/cram/apply-geolocation-rules/general-rule-fields.t index af26c90e4..12519f39d 100644 --- a/tests/functional/curate/cram/apply-geolocation-rules/general-rule-fields.t +++ b/tests/functional/curate/cram/apply-geolocation-rules/general-rule-fields.t @@ -12,7 +12,7 @@ Field 1 $ echo '{"region": "a", "country": "b", "division": "c", "location": "d"}' \ > | ${AUGUR} curate apply-geolocation-rules \ - > --geolocation-rules rules.tsv + > --no-default-rules --geolocation-rules rules.tsv {"region": "a", "country": "2", "division": "3", "location": "4"} Field 2 @@ -23,7 +23,7 @@ Field 2 $ echo '{"region": "a", "country": "b", "division": "c", "location": "d"}' \ > | ${AUGUR} curate apply-geolocation-rules \ - > --geolocation-rules rules.tsv + > --no-default-rules --geolocation-rules rules.tsv {"region": "1", "country": "b", "division": "3", "location": "4"} Field 3 @@ -34,7 +34,7 @@ Field 3 $ echo '{"region": "a", "country": "b", "division": "c", "location": "d"}' \ > | ${AUGUR} curate apply-geolocation-rules \ - > --geolocation-rules rules.tsv + > --no-default-rules --geolocation-rules rules.tsv {"region": "1", "country": "2", "division": "c", "location": "4"} Field 4 @@ -45,7 +45,7 @@ Field 4 $ echo '{"region": "a", "country": "b", "division": "c", "location": "d"}' \ > | ${AUGUR} curate apply-geolocation-rules \ - > --geolocation-rules rules.tsv + > --no-default-rules --geolocation-rules rules.tsv {"region": "1", "country": "2", "division": "3", "location": "d"} Fields 2,3 @@ -56,7 +56,7 @@ Fields 2,3 $ echo '{"region": "a", "country": "b", "division": "c", "location": "d"}' \ > | ${AUGUR} curate apply-geolocation-rules \ - > --geolocation-rules rules.tsv + > --no-default-rules --geolocation-rules rules.tsv {"region": "1", "country": "b", "division": "c", "location": "4"} Fields 1,2,3 @@ -67,5 +67,5 @@ Fields 1,2,3 $ echo '{"region": "a", "country": "b", "division": "c", "location": "d"}' \ > | ${AUGUR} curate apply-geolocation-rules \ - > --geolocation-rules rules.tsv + > --no-default-rules --geolocation-rules rules.tsv {"region": "a", "country": "b", "division": "c", "location": "4"} diff --git a/tests/functional/curate/cram/apply-geolocation-rules/general-rule-precedence.t b/tests/functional/curate/cram/apply-geolocation-rules/general-rule-precedence.t index df3bb984c..6bf0d207b 100644 --- a/tests/functional/curate/cram/apply-geolocation-rules/general-rule-precedence.t +++ b/tests/functional/curate/cram/apply-geolocation-rules/general-rule-precedence.t @@ -13,15 +13,15 @@ more generic wildcard matches. $ echo '{"region": "a", "country": "b", "division": "c", "location": "x"}' \ > | ${AUGUR} curate apply-geolocation-rules \ - > --geolocation-rules rules.tsv + > --no-default-rules --geolocation-rules rules.tsv {"region": "1", "country": "2", "division": "3", "location": "x"} $ echo '{"region": "a", "country": "b", "division": "x", "location": "x"}' \ > | ${AUGUR} curate apply-geolocation-rules \ - > --geolocation-rules rules.tsv + > --no-default-rules --geolocation-rules rules.tsv {"region": "1", "country": "2", "division": "x", "location": "x"} $ echo '{"region": "a", "country": "x", "division": "x", "location": "x"}' \ > | ${AUGUR} curate apply-geolocation-rules \ - > --geolocation-rules rules.tsv + > --no-default-rules --geolocation-rules rules.tsv {"region": "1", "country": "x", "division": "x", "location": "x"} diff --git a/tests/functional/curate/cram/apply-geolocation-rules/ignore-comments.t b/tests/functional/curate/cram/apply-geolocation-rules/ignore-comments.t index 850e0b1ea..af480fa41 100644 --- a/tests/functional/curate/cram/apply-geolocation-rules/ignore-comments.t +++ b/tests/functional/curate/cram/apply-geolocation-rules/ignore-comments.t @@ -11,5 +11,5 @@ Test that comments are ignored. $ echo '{"region": "r_raw", "country": "c_raw", "division": "d_raw", "location": "l_raw"}' \ > | ${AUGUR} curate apply-geolocation-rules \ - > --geolocation-rules rules.tsv + > --no-default-rules --geolocation-rules rules.tsv {"region": "r_annotated", "country": "c_annotated", "division": "d_annotated", "location": "l_annotated"} diff --git a/tests/functional/curate/cram/apply-geolocation-rules/invalid-rule-warnings.t b/tests/functional/curate/cram/apply-geolocation-rules/invalid-rule-warnings.t index 9e5f94187..b6b1e8564 100644 --- a/tests/functional/curate/cram/apply-geolocation-rules/invalid-rule-warnings.t +++ b/tests/functional/curate/cram/apply-geolocation-rules/invalid-rule-warnings.t @@ -14,7 +14,7 @@ Valid rule matches are still applied. $ echo '{"region": "r_old", "country": "c_old", "division": "d_old", "location": "l_old"}' \ > | ${AUGUR} curate apply-geolocation-rules \ - > --geolocation-rules rules.tsv + > --no-default-rules --geolocation-rules rules.tsv WARNING: Could not decode geolocation rule 'r_old/c_old/d_old/l_old\n'. Please make sure rules are formatted as 'region/country/division/locationregion/country/division/location'. WARNING: Could not decode the annotated geolocation 'r_new/c_new/d_new'. Please make sure it is formatted as 'region/country/division/location'. WARNING: Could not decode the raw geolocation 'r_old/c_old/d_old'. Please make sure it is formatted as 'region/country/division/location'. diff --git a/tests/functional/curate/cram/apply-geolocation-rules/last-duplicate-rule-applied.t b/tests/functional/curate/cram/apply-geolocation-rules/last-duplicate-rule-applied.t index 056974e84..7da5a238f 100644 --- a/tests/functional/curate/cram/apply-geolocation-rules/last-duplicate-rule-applied.t +++ b/tests/functional/curate/cram/apply-geolocation-rules/last-duplicate-rule-applied.t @@ -14,5 +14,5 @@ It applies the last rules that matched. $ echo '{"region": "r_raw", "country": "c_raw", "division": "d_raw", "location": "l_raw"}' \ > | ${AUGUR} curate apply-geolocation-rules \ - > --geolocation-rules rules.tsv + > --no-default-rules --geolocation-rules rules.tsv {"region": "r_annotated_custom", "country": "c_annotated_custom", "division": "d_annotated_custom", "location": "l_annotated_custom"} diff --git a/tests/functional/curate/cram/apply-geolocation-rules/question-mark-values.t b/tests/functional/curate/cram/apply-geolocation-rules/question-mark-values.t index 6a1ddbe77..896cffb40 100644 --- a/tests/functional/curate/cram/apply-geolocation-rules/question-mark-values.t +++ b/tests/functional/curate/cram/apply-geolocation-rules/question-mark-values.t @@ -17,7 +17,7 @@ Rules taken directly from `augur/data/geolocation_rules.tsv` as at c8181c7 Fist check that if the division is EMPTY then the first rule is applied, as we expect $ echo '{"region": "Asia", "country": "Bangladesh", "division": "Jashore", "location": ""}' \ - > | ${AUGUR} curate apply-geolocation-rules --geolocation-rules rules.tsv + > | ${AUGUR} curate apply-geolocation-rules --no-default-rules --geolocation-rules rules.tsv {"region": "Asia", "country": "Bangladesh", "division": "Khulna", "location": "Jashore"} but if the location is a "?", which is the case in our seasonal-flu ingest pipeline we want to use the first rule too @@ -25,14 +25,14 @@ rather than interpreting "?" as a valid location to preserve $ echo '{"region": "Asia", "country": "Bangladesh", "division": "Jashore", "location": "?"}' \ - > | ${AUGUR} curate apply-geolocation-rules --geolocation-rules rules.tsv + > | ${AUGUR} curate apply-geolocation-rules --no-default-rules --geolocation-rules rules.tsv {"region": "Asia", "country": "Bangladesh", "division": "Khulna", "location": "Jashore"} Edge case: preserve '?' in the output if we don't replace it via a rule $ echo '{"region": "Asia", "country": "X", "division": "?", "location": "?"}' \ - > | ${AUGUR} curate apply-geolocation-rules --geolocation-rules rules.tsv + > | ${AUGUR} curate apply-geolocation-rules --no-default-rules --geolocation-rules rules.tsv {"region": "Asia", "country": "X", "division": "?", "location": "?"} @@ -43,18 +43,19 @@ This works for multiple fields in a work-backwards fashion > ~~ $ echo '{"region": "North America", "country": "USA Alabama", "division": "", "location": ""}' \ - > | ${AUGUR} curate apply-geolocation-rules --geolocation-rules rules.tsv + > | ${AUGUR} curate apply-geolocation-rules --no-default-rules --geolocation-rules rules.tsv {"region": "North America", "country": "USA", "division": "Alabama", "location": ""} +Note that the corrected location is '?', not '' which the rule may suggest -- we attempt to preserve the empty-string value which is used $ echo '{"region": "North America", "country": "USA Alabama", "division": "?", "location": "?"}' \ - > | ${AUGUR} curate apply-geolocation-rules --geolocation-rules rules.tsv - {"region": "North America", "country": "USA", "division": "Alabama", "location": ""} + > | ${AUGUR} curate apply-geolocation-rules --no-default-rules --geolocation-rules rules.tsv + {"region": "North America", "country": "USA", "division": "Alabama", "location": "?"} NOTE: but division=? and location=empty string (or vice-versa) doesn't work - it's expected that the missing-value is used consistently $ echo '{"region": "North America", "country": "USA Alabama", "division": "?", "location": ""}' \ - > | ${AUGUR} curate apply-geolocation-rules --geolocation-rules rules.tsv + > | ${AUGUR} curate apply-geolocation-rules --no-default-rules --geolocation-rules rules.tsv {"region": "North America", "country": "USA Alabama", "division": "?", "location": ""} $ echo '{"region": "North America", "country": "USA Alabama", "division": "", "location": "?"}' \ - > | ${AUGUR} curate apply-geolocation-rules --geolocation-rules rules.tsv + > | ${AUGUR} curate apply-geolocation-rules --no-default-rules --geolocation-rules rules.tsv {"region": "North America", "country": "USA Alabama", "division": "", "location": "?"} diff --git a/tests/functional/curate/cram/apply-geolocation-rules/rule-case-sensitivity.t b/tests/functional/curate/cram/apply-geolocation-rules/rule-case-sensitivity.t index de63fa05c..5668d3d32 100644 --- a/tests/functional/curate/cram/apply-geolocation-rules/rule-case-sensitivity.t +++ b/tests/functional/curate/cram/apply-geolocation-rules/rule-case-sensitivity.t @@ -12,20 +12,20 @@ Rule matching is case-insensitive and the output matches the casing of the annot $ echo '{"region": "North America", "country": "USA", "division": "CA", "location": "Los Angeles"}' \ > | ${AUGUR} curate apply-geolocation-rules \ - > --geolocation-rules rules.tsv + > --no-default-rules --geolocation-rules rules.tsv {"region": "North America", "country": "USA", "division": "California", "location": "Los Angeles"} Rule matching is case-insensitive by default, so raw values with mismatched casing will still get updated. $ echo '{"region": "North America", "country": "USA", "division": "Ca", "location": "Los Angeles"}' \ > | ${AUGUR} curate apply-geolocation-rules \ - > --geolocation-rules rules.tsv + > --no-default-rules --geolocation-rules rules.tsv {"region": "North America", "country": "USA", "division": "California", "location": "Los Angeles"} Using the `--case-sensitive` flag will make rule matching case-sensitive. $ echo '{"region": "North America", "country": "USA", "division": "Ca", "location": "Los Angeles"}' \ > | ${AUGUR} curate apply-geolocation-rules \ - > --geolocation-rules rules.tsv \ + > --no-default-rules --geolocation-rules rules.tsv \ > --case-sensitive {"region": "North America", "country": "USA", "division": "Ca", "location": "Los Angeles"} diff --git a/tests/functional/curate/cram/apply-geolocation-rules/rule-type-preference.t b/tests/functional/curate/cram/apply-geolocation-rules/rule-type-preference.t index 83a30aa6e..a0059187e 100644 --- a/tests/functional/curate/cram/apply-geolocation-rules/rule-type-preference.t +++ b/tests/functional/curate/cram/apply-geolocation-rules/rule-type-preference.t @@ -11,5 +11,5 @@ Test that matching rules are used over general rules. $ echo '{"region": "a", "country": "b", "division": "c", "location": "d"}' \ > | ${AUGUR} curate apply-geolocation-rules \ - > --geolocation-rules rules.tsv + > --no-default-rules --geolocation-rules rules.tsv {"region": "1", "country": "2", "division": "3", "location": "4"}