diff --git a/augur/curate/apply_geolocation_rules.py b/augur/curate/apply_geolocation_rules.py index 97176e0a7..aedd29cc5 100644 --- a/augur/curate/apply_geolocation_rules.py +++ b/augur/curate/apply_geolocation_rules.py @@ -73,6 +73,16 @@ def load_geolocation_rules(geolocation_rules_file, case_sensitive): geolocation_rules[raw[0]][raw[1]][raw[2]][raw[3]] = annot + # We want to match '?' values to the empty string as well to allow them to be used as empty values + # but if they are used it's expected they are used for all "empty" fields + annot_using_question_marks = tuple(['?' if x=='' else x for x in annot]) + if raw[1]=='' and raw[2]=='' and raw[3]=='': + geolocation_rules[raw[0]]['?']['?']['?'] = annot_using_question_marks + elif raw[2]=='' and raw[3]=='': + geolocation_rules[raw[0]][raw[1]]['?']['?'] = annot_using_question_marks + elif raw[3]=='': + geolocation_rules[raw[0]][raw[1]][raw[2]]['?'] = annot_using_question_marks + return geolocation_rules diff --git a/tests/functional/curate/cram/apply-geolocation-rules/cyclic-error.t b/tests/functional/curate/cram/apply-geolocation-rules/cyclic-error.t index f380a0d33..9287e2a24 100644 --- a/tests/functional/curate/cram/apply-geolocation-rules/cyclic-error.t +++ b/tests/functional/curate/cram/apply-geolocation-rules/cyclic-error.t @@ -13,13 +13,13 @@ Attempting to use the rules with no matches results in no changes. $ echo '{"region": "r_old", "country": "c_old", "division": "d_old", "location": "l_something_else"}' \ > | ${AUGUR} curate apply-geolocation-rules \ - > --geolocation-rules rules.tsv + > --no-default-rules --geolocation-rules rules.tsv {"region": "r_old", "country": "c_old", "division": "d_old", "location": "l_something_else"} Attempting to use the rules with a match results in an error. $ echo '{"region": "r_old", "country": "c_old", "division": "d_old", "location": "l_old"}' \ > | ${AUGUR} curate apply-geolocation-rules \ - > --geolocation-rules rules.tsv + > --no-default-rules --geolocation-rules rules.tsv ERROR: More than 1000 geolocation rules applied on the same entry ['r_old', 'c_old', 'd_old', 'l_old']. [2] diff --git a/tests/functional/curate/cram/apply-geolocation-rules/empty-rule.t b/tests/functional/curate/cram/apply-geolocation-rules/empty-rule.t index 336957edc..33d852d03 100644 --- a/tests/functional/curate/cram/apply-geolocation-rules/empty-rule.t +++ b/tests/functional/curate/cram/apply-geolocation-rules/empty-rule.t @@ -10,7 +10,7 @@ Test that an empty field can be loaded in the raw and annotated columns. $ echo '{"region": "r_raw", "country": "c_raw", "division": "", "location": ""}' \ > | ${AUGUR} curate apply-geolocation-rules \ - > --geolocation-rules rules.tsv + > --no-default-rules --geolocation-rules rules.tsv {"region": "r_annotated", "country": "c_annotated", "division": "", "location": ""} This is effectively an exact rule, so it will not apply to any record that does @@ -18,5 +18,5 @@ not match the empty string. $ echo '{"region": "r_raw", "country": "c_raw", "division": "d_raw", "location": ""}' \ > | ${AUGUR} curate apply-geolocation-rules \ - > --geolocation-rules rules.tsv + > --no-default-rules --geolocation-rules rules.tsv {"region": "r_raw", "country": "c_raw", "division": "d_raw", "location": ""} diff --git a/tests/functional/curate/cram/apply-geolocation-rules/general-rule-fields.t b/tests/functional/curate/cram/apply-geolocation-rules/general-rule-fields.t index af26c90e4..12519f39d 100644 --- a/tests/functional/curate/cram/apply-geolocation-rules/general-rule-fields.t +++ b/tests/functional/curate/cram/apply-geolocation-rules/general-rule-fields.t @@ -12,7 +12,7 @@ Field 1 $ echo '{"region": "a", "country": "b", "division": "c", "location": "d"}' \ > | ${AUGUR} curate apply-geolocation-rules \ - > --geolocation-rules rules.tsv + > --no-default-rules --geolocation-rules rules.tsv {"region": "a", "country": "2", "division": "3", "location": "4"} Field 2 @@ -23,7 +23,7 @@ Field 2 $ echo '{"region": "a", "country": "b", "division": "c", "location": "d"}' \ > | ${AUGUR} curate apply-geolocation-rules \ - > --geolocation-rules rules.tsv + > --no-default-rules --geolocation-rules rules.tsv {"region": "1", "country": "b", "division": "3", "location": "4"} Field 3 @@ -34,7 +34,7 @@ Field 3 $ echo '{"region": "a", "country": "b", "division": "c", "location": "d"}' \ > | ${AUGUR} curate apply-geolocation-rules \ - > --geolocation-rules rules.tsv + > --no-default-rules --geolocation-rules rules.tsv {"region": "1", "country": "2", "division": "c", "location": "4"} Field 4 @@ -45,7 +45,7 @@ Field 4 $ echo '{"region": "a", "country": "b", "division": "c", "location": "d"}' \ > | ${AUGUR} curate apply-geolocation-rules \ - > --geolocation-rules rules.tsv + > --no-default-rules --geolocation-rules rules.tsv {"region": "1", "country": "2", "division": "3", "location": "d"} Fields 2,3 @@ -56,7 +56,7 @@ Fields 2,3 $ echo '{"region": "a", "country": "b", "division": "c", "location": "d"}' \ > | ${AUGUR} curate apply-geolocation-rules \ - > --geolocation-rules rules.tsv + > --no-default-rules --geolocation-rules rules.tsv {"region": "1", "country": "b", "division": "c", "location": "4"} Fields 1,2,3 @@ -67,5 +67,5 @@ Fields 1,2,3 $ echo '{"region": "a", "country": "b", "division": "c", "location": "d"}' \ > | ${AUGUR} curate apply-geolocation-rules \ - > --geolocation-rules rules.tsv + > --no-default-rules --geolocation-rules rules.tsv {"region": "a", "country": "b", "division": "c", "location": "4"} diff --git a/tests/functional/curate/cram/apply-geolocation-rules/general-rule-precedence.t b/tests/functional/curate/cram/apply-geolocation-rules/general-rule-precedence.t index df3bb984c..6bf0d207b 100644 --- a/tests/functional/curate/cram/apply-geolocation-rules/general-rule-precedence.t +++ b/tests/functional/curate/cram/apply-geolocation-rules/general-rule-precedence.t @@ -13,15 +13,15 @@ more generic wildcard matches. $ echo '{"region": "a", "country": "b", "division": "c", "location": "x"}' \ > | ${AUGUR} curate apply-geolocation-rules \ - > --geolocation-rules rules.tsv + > --no-default-rules --geolocation-rules rules.tsv {"region": "1", "country": "2", "division": "3", "location": "x"} $ echo '{"region": "a", "country": "b", "division": "x", "location": "x"}' \ > | ${AUGUR} curate apply-geolocation-rules \ - > --geolocation-rules rules.tsv + > --no-default-rules --geolocation-rules rules.tsv {"region": "1", "country": "2", "division": "x", "location": "x"} $ echo '{"region": "a", "country": "x", "division": "x", "location": "x"}' \ > | ${AUGUR} curate apply-geolocation-rules \ - > --geolocation-rules rules.tsv + > --no-default-rules --geolocation-rules rules.tsv {"region": "1", "country": "x", "division": "x", "location": "x"} diff --git a/tests/functional/curate/cram/apply-geolocation-rules/ignore-comments.t b/tests/functional/curate/cram/apply-geolocation-rules/ignore-comments.t index 850e0b1ea..af480fa41 100644 --- a/tests/functional/curate/cram/apply-geolocation-rules/ignore-comments.t +++ b/tests/functional/curate/cram/apply-geolocation-rules/ignore-comments.t @@ -11,5 +11,5 @@ Test that comments are ignored. $ echo '{"region": "r_raw", "country": "c_raw", "division": "d_raw", "location": "l_raw"}' \ > | ${AUGUR} curate apply-geolocation-rules \ - > --geolocation-rules rules.tsv + > --no-default-rules --geolocation-rules rules.tsv {"region": "r_annotated", "country": "c_annotated", "division": "d_annotated", "location": "l_annotated"} diff --git a/tests/functional/curate/cram/apply-geolocation-rules/invalid-rule-warnings.t b/tests/functional/curate/cram/apply-geolocation-rules/invalid-rule-warnings.t index 9e5f94187..b6b1e8564 100644 --- a/tests/functional/curate/cram/apply-geolocation-rules/invalid-rule-warnings.t +++ b/tests/functional/curate/cram/apply-geolocation-rules/invalid-rule-warnings.t @@ -14,7 +14,7 @@ Valid rule matches are still applied. $ echo '{"region": "r_old", "country": "c_old", "division": "d_old", "location": "l_old"}' \ > | ${AUGUR} curate apply-geolocation-rules \ - > --geolocation-rules rules.tsv + > --no-default-rules --geolocation-rules rules.tsv WARNING: Could not decode geolocation rule 'r_old/c_old/d_old/l_old\n'. Please make sure rules are formatted as 'region/country/division/locationregion/country/division/location'. WARNING: Could not decode the annotated geolocation 'r_new/c_new/d_new'. Please make sure it is formatted as 'region/country/division/location'. WARNING: Could not decode the raw geolocation 'r_old/c_old/d_old'. Please make sure it is formatted as 'region/country/division/location'. diff --git a/tests/functional/curate/cram/apply-geolocation-rules/last-duplicate-rule-applied.t b/tests/functional/curate/cram/apply-geolocation-rules/last-duplicate-rule-applied.t index 056974e84..7da5a238f 100644 --- a/tests/functional/curate/cram/apply-geolocation-rules/last-duplicate-rule-applied.t +++ b/tests/functional/curate/cram/apply-geolocation-rules/last-duplicate-rule-applied.t @@ -14,5 +14,5 @@ It applies the last rules that matched. $ echo '{"region": "r_raw", "country": "c_raw", "division": "d_raw", "location": "l_raw"}' \ > | ${AUGUR} curate apply-geolocation-rules \ - > --geolocation-rules rules.tsv + > --no-default-rules --geolocation-rules rules.tsv {"region": "r_annotated_custom", "country": "c_annotated_custom", "division": "d_annotated_custom", "location": "l_annotated_custom"} diff --git a/tests/functional/curate/cram/apply-geolocation-rules/question-mark-values.t b/tests/functional/curate/cram/apply-geolocation-rules/question-mark-values.t new file mode 100644 index 000000000..896cffb40 --- /dev/null +++ b/tests/functional/curate/cram/apply-geolocation-rules/question-mark-values.t @@ -0,0 +1,61 @@ +Setup + + $ export AUGUR="${AUGUR:-$TESTDIR/../../../../../bin/augur}" + +We sometimes use question marks '?' as an unknown field. `augur curate apply-record-annotations` +special-cases these so that they aren't interpreted as a valid value. + +This test uses real data from seasonal-flu where this bug was first observed + +Rules taken directly from `augur/data/geolocation_rules.tsv` as at c8181c7 + + $ cat >rules.tsv <<~~ + > Asia/Bangladesh/Jashore/ Asia/Bangladesh/Khulna/Jashore + > Asia/Bangladesh/Jashore/* Asia/Bangladesh/Khulna/* + > ~~ + +Fist check that if the division is EMPTY then the first rule is applied, as we expect + + $ echo '{"region": "Asia", "country": "Bangladesh", "division": "Jashore", "location": ""}' \ + > | ${AUGUR} curate apply-geolocation-rules --no-default-rules --geolocation-rules rules.tsv + {"region": "Asia", "country": "Bangladesh", "division": "Khulna", "location": "Jashore"} + +but if the location is a "?", which is the case in our seasonal-flu ingest pipeline we want to use the first rule too +rather than interpreting "?" as a valid location to preserve + + + $ echo '{"region": "Asia", "country": "Bangladesh", "division": "Jashore", "location": "?"}' \ + > | ${AUGUR} curate apply-geolocation-rules --no-default-rules --geolocation-rules rules.tsv + {"region": "Asia", "country": "Bangladesh", "division": "Khulna", "location": "Jashore"} + + +Edge case: preserve '?' in the output if we don't replace it via a rule + + $ echo '{"region": "Asia", "country": "X", "division": "?", "location": "?"}' \ + > | ${AUGUR} curate apply-geolocation-rules --no-default-rules --geolocation-rules rules.tsv + {"region": "Asia", "country": "X", "division": "?", "location": "?"} + + +This works for multiple fields in a work-backwards fashion + + $ cat >rules.tsv <<~~ + > North America/USA Alabama// North America/USA/Alabama/ + > ~~ + + $ echo '{"region": "North America", "country": "USA Alabama", "division": "", "location": ""}' \ + > | ${AUGUR} curate apply-geolocation-rules --no-default-rules --geolocation-rules rules.tsv + {"region": "North America", "country": "USA", "division": "Alabama", "location": ""} + +Note that the corrected location is '?', not '' which the rule may suggest -- we attempt to preserve the empty-string value which is used + $ echo '{"region": "North America", "country": "USA Alabama", "division": "?", "location": "?"}' \ + > | ${AUGUR} curate apply-geolocation-rules --no-default-rules --geolocation-rules rules.tsv + {"region": "North America", "country": "USA", "division": "Alabama", "location": "?"} + +NOTE: but division=? and location=empty string (or vice-versa) doesn't work - it's expected that the missing-value is used consistently + $ echo '{"region": "North America", "country": "USA Alabama", "division": "?", "location": ""}' \ + > | ${AUGUR} curate apply-geolocation-rules --no-default-rules --geolocation-rules rules.tsv + {"region": "North America", "country": "USA Alabama", "division": "?", "location": ""} + + $ echo '{"region": "North America", "country": "USA Alabama", "division": "", "location": "?"}' \ + > | ${AUGUR} curate apply-geolocation-rules --no-default-rules --geolocation-rules rules.tsv + {"region": "North America", "country": "USA Alabama", "division": "", "location": "?"} diff --git a/tests/functional/curate/cram/apply-geolocation-rules/rule-case-sensitivity.t b/tests/functional/curate/cram/apply-geolocation-rules/rule-case-sensitivity.t index de63fa05c..5668d3d32 100644 --- a/tests/functional/curate/cram/apply-geolocation-rules/rule-case-sensitivity.t +++ b/tests/functional/curate/cram/apply-geolocation-rules/rule-case-sensitivity.t @@ -12,20 +12,20 @@ Rule matching is case-insensitive and the output matches the casing of the annot $ echo '{"region": "North America", "country": "USA", "division": "CA", "location": "Los Angeles"}' \ > | ${AUGUR} curate apply-geolocation-rules \ - > --geolocation-rules rules.tsv + > --no-default-rules --geolocation-rules rules.tsv {"region": "North America", "country": "USA", "division": "California", "location": "Los Angeles"} Rule matching is case-insensitive by default, so raw values with mismatched casing will still get updated. $ echo '{"region": "North America", "country": "USA", "division": "Ca", "location": "Los Angeles"}' \ > | ${AUGUR} curate apply-geolocation-rules \ - > --geolocation-rules rules.tsv + > --no-default-rules --geolocation-rules rules.tsv {"region": "North America", "country": "USA", "division": "California", "location": "Los Angeles"} Using the `--case-sensitive` flag will make rule matching case-sensitive. $ echo '{"region": "North America", "country": "USA", "division": "Ca", "location": "Los Angeles"}' \ > | ${AUGUR} curate apply-geolocation-rules \ - > --geolocation-rules rules.tsv \ + > --no-default-rules --geolocation-rules rules.tsv \ > --case-sensitive {"region": "North America", "country": "USA", "division": "Ca", "location": "Los Angeles"} diff --git a/tests/functional/curate/cram/apply-geolocation-rules/rule-type-preference.t b/tests/functional/curate/cram/apply-geolocation-rules/rule-type-preference.t index 83a30aa6e..a0059187e 100644 --- a/tests/functional/curate/cram/apply-geolocation-rules/rule-type-preference.t +++ b/tests/functional/curate/cram/apply-geolocation-rules/rule-type-preference.t @@ -11,5 +11,5 @@ Test that matching rules are used over general rules. $ echo '{"region": "a", "country": "b", "division": "c", "location": "d"}' \ > | ${AUGUR} curate apply-geolocation-rules \ - > --geolocation-rules rules.tsv + > --no-default-rules --geolocation-rules rules.tsv {"region": "1", "country": "2", "division": "3", "location": "4"}