diff --git a/model/countries/JP/JP-metadata.yaml b/model/countries/JP/JP-metadata.yaml new file mode 100644 index 0000000..f1daa1e --- /dev/null +++ b/model/countries/JP/JP-metadata.yaml @@ -0,0 +1,2 @@ +country: JP +flag: 🇯🇵 diff --git a/model/countries/JP/JP-parsing-rules.yaml b/model/countries/JP/JP-parsing-rules.yaml new file mode 100644 index 0000000..3c63e12 --- /dev/null +++ b/model/countries/JP/JP-parsing-rules.yaml @@ -0,0 +1,87 @@ +regex_definitions: + # Regular expression to match optional zip prefix. + kZipOptionalPrefixRe: + regex_fragment: (?:〒\s*) + + # Regular expression to match 3-digit zip prefix. + kZipPrefixValueRe: + regex_fragment: (?:[0-90-9]{3}) + + # Regular expression pattern to match the separator between + # zip code prefix and suffix. + kZipCodeSeparatorsRe: + regex_fragment: (?:[\s--]+) + + # Regular expression to match 4-digit zip suffix. + kZipSuffixValueRe: + regex_fragment: (?:[0-90-9]{4}) + +capture_definitions: + # Returns an expression to parse `postal-code` into `postal-code-prefix` + # and `postal-code-suffix`, separator is optional, skip optional prefix. 
+ ParsePostalCodeOptionalSeparatorOptionalPrefixExpression: + capture: + output: postal-code + parts: + - no_capture: + parts: [ {regex_reference: kZipOptionalPrefixRe} ] + quantifier: MATCH_OPTIONAL + - capture: + output: postal-code-prefix + parts: [ {regex_reference: kZipPrefixValueRe} ] + - no_capture: + parts: + - separator: {regex_reference: kZipCodeSeparatorsRe} + quantifier: MATCH_OPTIONAL + - capture: + output: postal-code-suffix + parts: [ {regex_reference: kZipSuffixValueRe} ] + +parsing_definitions: + postal-code: + decomposition: + capture_reference: ParsePostalCodeOptionalSeparatorOptionalPrefixExpression + +test_parsing_definitions: +- id: "Zip code with separator" + type: postal-code + input: "163-8001" + output: + postal-code-prefix: "163" + postal-code-suffix: "8001" +- id: "Zip code without separator" + type: postal-code + input: "1638001" + output: + postal-code-prefix: "163" + postal-code-suffix: "8001" +- id: "Full-width zip code with separator" + type: postal-code + input: "163-8001" + output: + postal-code-prefix: "163" + postal-code-suffix: "8001" +- id: "Zip code with optional prefix and separator" + type: postal-code + input: "〒163-8001" + output: + postal-code-prefix: "163" + postal-code-suffix: "8001" +- id: "Zip code with optional prefix, space, and separator" + type: postal-code + input: "〒 163-8001" + output: + postal-code-prefix: "163" + postal-code-suffix: "8001" +- id: "Full-width zip code with optional prefix" + type: postal-code + input: "〒163-8001" + output: + postal-code-prefix: "163" + postal-code-suffix: "8001" +- id: "Zip code with optional prefix and no separator" + type: postal-code + input: "〒1638001" + output: + postal-code-prefix: "163" + postal-code-suffix: "8001" diff --git a/model/main.py b/model/main.py index bbb5c97..f4071fb 100644 --- a/model/main.py +++ b/model/main.py @@ -127,20 +127,21 @@ def country_of_path(path: Path) -> str: after_token_index += new_after_token_index content += after_token_index - 
all_token_content = "" - for token in renderer.get_model(country).pre_order_only_uniques(): - token_content = "" - for module in modules: - if new_token_conent := module.render_token_details( - country, token.id, renderer): - token_content += new_token_conent - token_content = renderer.wrap_token_details(token.id, - renderer.get_model(country), - token_content) - all_token_content += token_content - if all_token_content: - all_token_content = renderer.wrap_all_token_details(all_token_content) - content += all_token_content + model = renderer.get_model(country) + if model: + all_token_content = "" + for token in model.pre_order_only_uniques(): + token_content = "" + for module in modules: + if new_token_conent := module.render_token_details( + country, token.id, renderer): + token_content += new_token_conent + token_content = renderer.wrap_token_details(token.id, model, + token_content) + all_token_content += token_content + if all_token_content: + all_token_content = renderer.wrap_all_token_details(all_token_content) + content += all_token_content epilogue = "" for module in modules: diff --git a/model/modules/formatting/formatting_module.py b/model/modules/formatting/formatting_module.py index 2caa902..c436972 100644 --- a/model/modules/formatting/formatting_module.py +++ b/model/modules/formatting/formatting_module.py @@ -192,6 +192,8 @@ def apply_formatting(self, country: str, token_id: str, data: dict, return str(data[token_id]) model = renderer.get_model(country) + if not model: + return "" token = model.find_token(token_id) if not token or token.is_atomic_token() or not token.children: @@ -290,7 +292,7 @@ def collect_details_for_example_addresses(self, country: str, }) return { 'examples': collected_details, - 'model': renderer.country_data[country]['model'] + 'model': renderer.country_data[country].get('model') } def render_after_token_index(self, country: str, diff --git a/model/modules/parsing/parsing_module.py b/model/modules/parsing/parsing_module.py 
index afd5305..4483a79 100644 --- a/model/modules/parsing/parsing_module.py +++ b/model/modules/parsing/parsing_module.py @@ -174,7 +174,7 @@ def observe_file(self, path: Path, renderer: Renderer): engine.prune_output_types(all_removed_tokens) model = renderer.get_model(country) - if not engine.validate(model): + if model and not engine.validate(model): return if 'test_regex_definitions' in yaml: diff --git a/model/renderer.py b/model/renderer.py index cbd8af8..b1b1a8b 100644 --- a/model/renderer.py +++ b/model/renderer.py @@ -38,8 +38,8 @@ def add_country(self, country: str) -> None: # Put "global" first. self.countries.sort(key=lambda c: "" if c == "global" else c) - def get_model(self, country: str) -> Any: - return self.country_data[country]["model"] + def get_model(self, country: str) -> Optional[Any]: + return self.country_data[country].get("model") def set_model(self, country: str, model: Any): self.country_data[country]["model"] = model