From 0adb79b937a69a85921bf292471c913552b8ddb4 Mon Sep 17 00:00:00 2001 From: m-maryia Date: Tue, 7 Oct 2025 10:28:51 +0200 Subject: [PATCH 1/4] add JP parsing rules without model, handle the absence of model --- model/countries/JP/JP-metadata.yaml | 2 + model/countries/JP/JP-parsing-rules.yaml | 87 +++++++++++++++++++ model/main.py | 29 ++++--- model/modules/formatting/formatting_module.py | 4 +- model/modules/parsing/parsing_module.py | 2 +- model/renderer.py | 5 +- 6 files changed, 111 insertions(+), 18 deletions(-) create mode 100644 model/countries/JP/JP-metadata.yaml create mode 100644 model/countries/JP/JP-parsing-rules.yaml diff --git a/model/countries/JP/JP-metadata.yaml b/model/countries/JP/JP-metadata.yaml new file mode 100644 index 0000000..f1daa1e --- /dev/null +++ b/model/countries/JP/JP-metadata.yaml @@ -0,0 +1,2 @@ +country: JP +flag: 🇯🇵 diff --git a/model/countries/JP/JP-parsing-rules.yaml b/model/countries/JP/JP-parsing-rules.yaml new file mode 100644 index 0000000..3c63e12 --- /dev/null +++ b/model/countries/JP/JP-parsing-rules.yaml @@ -0,0 +1,87 @@ +regex_definitions: + # Regular expression to match optional zip prefix. + kZipOptionalPrefixRe: + regex_fragment: (?:〒\s*) + + # Regular expression to match 3-digit zip prefix. + kZipPrefixValueRe: + regex_fragment: (?:[0-90-9]{3}) + + # Regular expression pattern to match the separator between + # zip code prefix and suffix. + kZipCodeSeparatorsRe: + regex_fragment: (?:[\s--]+) + + # Regular expression to match 4-digit zip suffix. + kZipSuffixValueRe: + regex_fragment: (?:[0-90-9]{4}) + +capture_definitions: + # Returns an expression to parse `postal-code` into `postal-code-prefix` + # and `postal-code-suffix`, separator is optional, skip optional prefix. + ParsePostalCodeOptionalSeparatorOptionalPrefixExpression: + capture: + output: postal-code + parts: + - no_capture: + parts: [ {regex_reference: kZipOptionalPrefixRe} ] + quantifier: MATCH_OPTIONAL + - capture: + output: postal-code-prefix + parts: [ {regex_reference: kZipPrefixValueRe} ] + - no_capture: + parts: + - separator: {regex_reference: kZipCodeSeparatorsRe} + quantifier: MATCH_OPTIONAL + - capture: + output: postal-code-suffix + parts: [ {regex_reference: kZipSuffixValueRe} ] + +parsing_definitions: + postal-code: + decomposition: + capture_reference: ParsePostalCodeOptionalSeparatorOptionalPrefixExpression + +test_parsing_definitions: +- id: "Zip code with separator" + type: postal-code + input: "163-8001" + output: + postal-code-prefix: "163" + postal-code-suffix: "8001" +- id: "Zip code without separator" + type: postal-code + input: "1638001" + output: + postal-code-prefix: "163" + postal-code-suffix: "8001" +- id: "Full-width zip code with separator" + type: postal-code + input: "163-8001" + output: + postal-code-prefix: "163" + postal-code-suffix: "8001" +- id: "Zip code with optional prefix and separator" + type: postal-code + input: "〒163-8001" + output: + postal-code-prefix: "163" + postal-code-suffix: "8001" +- id: "Zip code with optional prefix, space, and separator" + type: postal-code + input: "〒 163-8001" + output: + postal-code-prefix: "163" + postal-code-suffix: "8001" +- id: "Full-width zip code with optional prefix" + type: postal-code + input: "〒163-8001" + output: + postal-code-prefix: "163" + postal-code-suffix: "8001" +- id: "Zip code with optional prefix and no separator" + type: postal-code + input: "〒1638001" + output: + postal-code-prefix: "163" + postal-code-suffix: "8001" diff --git a/model/main.py b/model/main.py index bbb5c97..7deb819 100644 --- a/model/main.py +++ b/model/main.py @@ -127,20 +127,21 @@ def country_of_path(path: Path) -> str: after_token_index += new_after_token_index content += after_token_index - all_token_content = "" - for token in renderer.get_model(country).pre_order_only_uniques(): - token_content = "" - for module in modules: - if new_token_conent := module.render_token_details( - country, token.id, renderer): - token_content += new_token_conent - token_content = renderer.wrap_token_details(token.id, - renderer.get_model(country), - token_content) - all_token_content += token_content - if all_token_content: - all_token_content = renderer.wrap_all_token_details(all_token_content) - content += all_token_content + model = renderer.get_model(country) + if model is not None: + all_token_content = "" + for token in model.pre_order_only_uniques(): + token_content = "" + for module in modules: + if new_token_conent := module.render_token_details( + country, token.id, renderer): + token_content += new_token_conent + token_content = renderer.wrap_token_details(token.id, model, + token_content) + all_token_content += token_content + if all_token_content: + all_token_content = renderer.wrap_all_token_details(all_token_content) + content += all_token_content epilogue = "" for module in modules: diff --git a/model/modules/formatting/formatting_module.py b/model/modules/formatting/formatting_module.py index 2caa902..096e908 100644 --- a/model/modules/formatting/formatting_module.py +++ b/model/modules/formatting/formatting_module.py @@ -192,6 +192,8 @@ def apply_formatting(self, country: str, token_id: str, data: dict, return str(data[token_id]) model = renderer.get_model(country) + if model is None: + return "" token = model.find_token(token_id) if not token or token.is_atomic_token() or not token.children: @@ -290,7 +292,7 @@ def collect_details_for_example_addresses(self, country: str, }) return { 'examples': collected_details, - 'model': renderer.country_data[country]['model'] + 'model': renderer.country_data[country].get('model') } def render_after_token_index(self, country: str, diff --git a/model/modules/parsing/parsing_module.py b/model/modules/parsing/parsing_module.py index afd5305..051874d 100644 --- a/model/modules/parsing/parsing_module.py +++ b/model/modules/parsing/parsing_module.py @@ -174,7 +174,7 @@ def observe_file(self, path: Path, renderer: Renderer): engine.prune_output_types(all_removed_tokens) model = renderer.get_model(country) - if not engine.validate(model): + if model is not None and not engine.validate(model): return if 'test_regex_definitions' in yaml: diff --git a/model/renderer.py b/model/renderer.py index cbd8af8..2bb01c2 100644 --- a/model/renderer.py +++ b/model/renderer.py @@ -24,6 +24,7 @@ class Renderer: countries = [] vendor_extension_extra_pages: List[ExtraPage] = [] + LEGACY_COUNTRT_CODE = "XX" def __init__(self, output_dir: Optional[str] = None, @@ -38,8 +39,8 @@ def add_country(self, country: str) -> None: # Put "global" first. self.countries.sort(key=lambda c: "" if c == "global" else c) - def get_model(self, country: str) -> Any: - return self.country_data[country]["model"] + def get_model(self, country: str) -> Optional[Any]: + return self.country_data[country].get("model") def set_model(self, country: str, model: Any): self.country_data[country]["model"] = model From 53b267bad065925140d174976091df2a8e60a14b Mon Sep 17 00:00:00 2001 From: m-maryia Date: Tue, 7 Oct 2025 10:28:51 +0200 Subject: [PATCH 2/4] add JP parsing rules without model, handle the absence of model --- model/renderer.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/model/renderer.py b/model/renderer.py index 2bb01c2..291b63d 100644 --- a/model/renderer.py +++ b/model/renderer.py @@ -24,7 +24,7 @@ class Renderer: countries = [] vendor_extension_extra_pages: List[ExtraPage] = [] - LEGACY_COUNTRT_CODE = "XX" + LEGACY_COUNTRY_CODE = "XX" def __init__(self, output_dir: Optional[str] = None, From 1941df35e16bd1aa4d626729d6bc7d824f64ead7 Mon Sep 17 00:00:00 2001 From: m-maryia Date: Fri, 10 Oct 2025 14:38:26 +0200 Subject: [PATCH 3/4] remove is not None --- model/main.py | 2 +- model/modules/formatting/formatting_module.py | 2 +- model/modules/parsing/parsing_module.py | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/model/main.py b/model/main.py index 7deb819..f4071fb 100644 --- a/model/main.py +++ b/model/main.py @@ -128,7 +128,7 @@ def country_of_path(path: Path) -> str: content += after_token_index model = renderer.get_model(country) - if model is not None: + if model: all_token_content = "" for token in model.pre_order_only_uniques(): token_content = "" diff --git a/model/modules/formatting/formatting_module.py b/model/modules/formatting/formatting_module.py index 096e908..c436972 100644 --- a/model/modules/formatting/formatting_module.py +++ b/model/modules/formatting/formatting_module.py @@ -192,7 +192,7 @@ def apply_formatting(self, country: str, token_id: str, data: dict, return str(data[token_id]) model = renderer.get_model(country) - if model is None: + if not model: return "" token = model.find_token(token_id) diff --git a/model/modules/parsing/parsing_module.py b/model/modules/parsing/parsing_module.py index 051874d..4483a79 100644 --- a/model/modules/parsing/parsing_module.py +++ b/model/modules/parsing/parsing_module.py @@ -174,7 +174,7 @@ def observe_file(self, path: Path, renderer: Renderer): engine.prune_output_types(all_removed_tokens) model = renderer.get_model(country) - if model is not None and not engine.validate(model): + if model and not engine.validate(model): return if 'test_regex_definitions' in yaml: From b6f62d9663560e2657e5e67843704bb0e02a041a Mon Sep 17 00:00:00 2001 From: m-maryia Date: Fri, 10 Oct 2025 14:56:37 +0200 Subject: [PATCH 4/4] remove legacy country code constant --- model/renderer.py | 1 - 1 file changed, 1 deletion(-) diff --git a/model/renderer.py b/model/renderer.py index 291b63d..b1b1a8b 100644 --- a/model/renderer.py +++ b/model/renderer.py @@ -24,7 +24,6 @@ class Renderer: countries = [] vendor_extension_extra_pages: List[ExtraPage] = [] - LEGACY_COUNTRY_CODE = "XX" def __init__(self, output_dir: Optional[str] = None,