From 8c6f61af02e93a4fa09edf98720ae2d8af536ffd Mon Sep 17 00:00:00 2001 From: David Baines Date: Fri, 27 Mar 2026 14:04:27 +0000 Subject: [PATCH 1/8] Update find_by_iso to write config.yml files --- silnlp/common/find_by_iso.py | 233 ++++++++++++++++++++++---------- silnlp/common/find_by_iso2.py | 244 ++++++++++++++++++++++++++++++++++ 2 files changed, 406 insertions(+), 71 deletions(-) create mode 100644 silnlp/common/find_by_iso2.py diff --git a/silnlp/common/find_by_iso.py b/silnlp/common/find_by_iso.py index a47effa8..a02feac9 100644 --- a/silnlp/common/find_by_iso.py +++ b/silnlp/common/find_by_iso.py @@ -1,12 +1,14 @@ import argparse import json import logging -from pathlib import Path -from typing import Dict, List, Set, Tuple, Union import sys +from pathlib import Path +from typing import Dict, List, Set, Tuple + +import yaml from .environment import SIL_NLP_ENV -from .iso_info import NLLB_ISO_SET, ALT_ISO +from .iso_info import ALT_ISO, NLLB_ISO_SET IsoCode = str IsoCodeList = List[IsoCode] @@ -14,6 +16,29 @@ LANGUAGE_FAMILY_FILE = SIL_NLP_ENV.assets_dir / "languageFamilies.json" + +def is_file_pattern(input_str: str) -> bool: + """Check if the input string contains a hyphen, indicating it's a filename pattern.""" + return "-" in input_str + + +def split_input_list(input_list: List[str]) -> Tuple[List[str], List[str]]: + """Split input list into ISO codes and file patterns.""" + iso_codes = [] + files = [] + for item in input_list: + if is_file_pattern(item): + files.append(item) + else: + iso_codes.append(item) + return iso_codes, files + + +def get_stem_name(file_path: Path) -> str: + """Get the stem name without path or extension.""" + return file_path.stem + + def load_language_data(file_path: Path) -> Tuple[Dict, Dict, Dict]: try: with open(file_path, "r", encoding="utf-8") as file: @@ -54,7 +79,7 @@ def find_related_isocodes( for iso_code in iso_codes: if iso_code in language_data: lang_info = language_data[iso_code] -# logger.info(f"{iso_code}: {lang_info['Name']}, {lang_info['Country']}, {lang_info['Family']}") + # logger.info(f"{iso_code}: {lang_info['Name']}, {lang_info['Country']}, {lang_info['Family']}") iso_set.update(country_data.get(lang_info["Country"], [])) iso_set.update(family_data.get(lang_info["Family"], [])) @@ -64,10 +89,10 @@ def find_related_isocodes( def get_files_by_iso(isocodes: IsoCodeList, scripture_dir: Path) -> List[Path]: return [ - file for file in scripture_dir.glob('*.txt') - if any(file.stem.startswith(isocode + '-') for isocode in isocodes) + file for file in scripture_dir.glob("*.txt") if any(file.stem.startswith(isocode + "-") for isocode in isocodes) ] + def split_files_by_projects(files: List[Path], projects_dir: Path) -> Tuple[Dict[Path, Path], Dict[Path, Path]]: existing_projects = {} missing_projects = {} @@ -85,90 +110,156 @@ def split_files_by_projects(files: List[Path], projects_dir: Path) -> Tuple[Dict def get_equivalent_isocodes(iso_codes: List[str]) -> Set[str]: return {code for iso_code in iso_codes for code in (iso_code, ALT_ISO.get_alternative(iso_code)) if code} + +def resolve_config_path(config_folder: Path) -> Path: + """Resolve config folder path relative to experiments directory if not absolute.""" + if not config_folder.is_absolute(): + return SIL_NLP_ENV.mt_experiments_dir / config_folder + return config_folder + + +def create_alignment_config(source_files: List[Path], target_files: List[str]) -> dict: + """Create the alignment configuration dictionary.""" + config = { + "data": { + "aligner": "fast_align", + "corpus_pairs": [ + { + "type": "train", + "src": [get_stem_name(f) for f in source_files], + "trg": target_files, + "mapping": "many_to_many", + "test_size": 0, + "val_size": 0, + } + ], + "tokenize": False, + } + } + return config + + +def write_or_print_config(config: dict, config_folder: Path = None): + """Write config to file or print to terminal.""" + if config_folder: + config_folder = Path(config_folder) + if not config_folder.is_absolute(): + config_folder = SIL_NLP_ENV.mt_experiments_dir / config_folder + config_folder.mkdir(parents=True, exist_ok=True) + config_path = config_folder / "config.yml" + with open(config_path, "w") as f: + yaml.dump(config, f, default_flow_style=False, sort_keys=False) + return str(config_path) + else: + return yaml.dump(config, default_flow_style=False, sort_keys=False) + + def main(): - parser = argparse.ArgumentParser(description="Find related ISO language codes.") - parser.add_argument("iso_codes", nargs="+", help="ISO codes to find related languages for") - parser.add_argument("--scripture-dir", type=Path, default=Path(SIL_NLP_ENV.mt_scripture_dir), help="Directory containing scripture files") - parser.add_argument("--all-related", action='store_true', help="List all related scriptures without filtering to those that are part of NLLB") - parser.add_argument("--no-related", action='store_true', help="Only list scriptures in the specified languages and not in related languages") + parser = argparse.ArgumentParser(description="Find related ISO language codes and create alignment config.") + parser.add_argument("inputs", nargs="+", help="ISO codes or file patterns (e.g., 'fra' or 'en-NIV')") + parser.add_argument( + "--scripture-dir", + type=Path, + default=Path(SIL_NLP_ENV.mt_scripture_dir), + help="Directory containing scripture files", + ) + parser.add_argument( + "--all-related", + action="store_true", + help="List all related scriptures without filtering to those that are part of NLLB", + ) + parser.add_argument( + "--no-related", + action="store_true", + help="Only list scriptures in the specified languages and not in related languages", + ) parser.add_argument("--output", type=Path, help="Output to the specified file.") + parser.add_argument("--target-files", nargs="+", help="List of target files in format -") + parser.add_argument( + "--config-folder", + type=Path, + help="Folder to write the config.yml file (absolute or relative to mt_experiments_dir)", + ) args = parser.parse_args() - # Create a custom logger + # Setup logging logger = logging.getLogger(__name__) - #logger.basicConfig() - - # Set the global logging level - logger.setLevel(logging.INFO) - - formatter = logging.Formatter('%(message)s') + logger.setLevel(logging.INFO) + formatter = logging.Formatter("%(message)s") if args.output: - # Create handler for the file output. file_handler = logging.FileHandler(args.output) file_handler.setFormatter(formatter) logger.addHandler(file_handler) else: - # Create handler for the console output. console_handler = logging.StreamHandler(sys.stdout) console_handler.setFormatter(formatter) logger.addHandler(console_handler) - - language_data, country_data, family_data = load_language_data(LANGUAGE_FAMILY_FILE) - projects_dir = SIL_NLP_ENV.pt_projects_dir - scripture_dir = Path(args.scripture_dir) + # Split inputs into ISO codes and file patterns + iso_codes, file_patterns = split_input_list(args.inputs) - if not language_data: - logging.error("Failed to load language data.") + source_files = [] + if iso_codes: + # Load language data and process ISO codes + language_data, country_data, family_data = load_language_data(LANGUAGE_FAMILY_FILE) + if not language_data: + logging.error("Failed to load language data.") + return + + iso_codes = get_equivalent_isocodes(iso_codes) + + if args.no_related: + codes_to_find = list(iso_codes) + logger.info(f"\nConsidering only the specified iso codes and their equivalents: {codes_to_find}") + else: + codes_to_find = find_related_isocodes(list(iso_codes), language_data, country_data, family_data) + logger.info(f"\nFound {len(codes_to_find)} related languages:\n{codes_to_find}.") + + if not args.all_related: + codes_to_find = [iso for iso in codes_to_find if iso in NLLB_ISO_SET] + logger.info(f"\nFound {len(codes_to_find)} specified or related languages in NLLB:\n{codes_to_find}") + else: + logger.info(f"\nFound {len(codes_to_find)} specified or related languages:\n{codes_to_find}") + + # Get all possible codes and find matching files + all_possible_codes = get_equivalent_isocodes(codes_to_find) + source_files.extend(get_files_by_iso(all_possible_codes, args.scripture_dir)) + + # Add files from file patterns + if file_patterns: + pattern_files = [args.scripture_dir / f"{pattern}.txt" for pattern in file_patterns] + existing_files = [f for f in pattern_files if f.exists()] + source_files.extend(existing_files) + if len(existing_files) < len(pattern_files): + missing = set(file_patterns) - set(get_stem_name(f) for f in existing_files) + logger.warning(f"Could not find these files: {missing}") + + if not source_files: + logger.error("\nCouldn't find any Scripture files.") return - - # Get equivalent ISO codes for input - iso_codes = get_equivalent_isocodes(args.iso_codes) - - if args.no_related: - - # Option 2: No files in related languages, only equivalent ISO codes - codes_to_find = list(iso_codes) - logger.info(f"\nConsidering only the specified iso codes and their equivalents. {codes_to_find}") - + + # Use target files from command line or file patterns from inputs + target_files = args.target_files if args.target_files else file_patterns + + # Create and output configuration + config = create_alignment_config(source_files, target_files) + result = write_or_print_config(config, args.config_folder) + + if args.config_folder: + logger.info(f"\nCreated alignment configuration in: {result}") else: - # Find related ISO codes - codes_to_find = find_related_isocodes(list(iso_codes), language_data, country_data, family_data) - logger.info(f"\nFound {len(codes_to_find)} related languages:\n{codes_to_find}.") - - if not args.all_related: - # Option 3 (default): Filter to NLLB languages - codes_to_find = [iso for iso in codes_to_find if iso in NLLB_ISO_SET] - logger.info(f"\nFound {len(codes_to_find)} specified or related languages in NLLB:\n{codes_to_find}") - # Option 1: All related files (no filtering) is handled by not applying the NLLB filter - else: - logger.info(f"\nFound {len(codes_to_find)} specified or related languages:\n{codes_to_find}") - - # Get all possible 2 and 3 letter codes for the related languages - all_possible_codes = get_equivalent_isocodes(codes_to_find) - - # Find files matching the codes - files = get_files_by_iso(all_possible_codes, scripture_dir) - existing_projects, missing_projects = split_files_by_projects(files, projects_dir) - - # Display results - if existing_projects: - logger.info(f"\nThese {len(existing_projects)} files have a corresponding project folder:") - for file, project in existing_projects.items(): - logger.info(f"{file.stem}, {project}") - logger.info("") - if missing_projects: - logger.info(f"\nThese {len(missing_projects)} files don't have a corresponding project folder:") - for file, _ in missing_projects.items(): - logger.info(f"{file.stem}") - logger.info(f"\nAll the files:") - for file in files: - logger.info(f" - {file.stem}") + logger.info("\nAlignment configuration:") + logger.info(result) + + logger.info(f"\nSource files found: {len(source_files)}") + for file in source_files: + logger.info(f" - {get_stem_name(file)}") + logger.info(f"\nTarget files: {len(target_files)}") + for file in target_files: + logger.info(f" - {file}") - if not files: - logger.info("\nCouldn't find any Scripture files in these languages.") if __name__ == "__main__": - main() \ No newline at end of file + main() diff --git a/silnlp/common/find_by_iso2.py b/silnlp/common/find_by_iso2.py new file mode 100644 index 00000000..7e4c3c2a --- /dev/null +++ b/silnlp/common/find_by_iso2.py @@ -0,0 +1,244 @@ +import argparse +import json +import logging +from pathlib import Path +from typing import Dict, List, Set, Tuple, Union +import sys +import yaml + +from .environment import SIL_NLP_ENV +from .iso_info import NLLB_ISO_SET, ALT_ISO + +IsoCode = str +IsoCodeList = List[IsoCode] +IsoCodeSet = Set[IsoCode] + +LANGUAGE_FAMILY_FILE = SIL_NLP_ENV.assets_dir / "languageFamilies.json" + +def is_file_pattern(input_str: str) -> bool: + """Check if the input string contains a hyphen, indicating it's a filename pattern.""" + return '-' in input_str + +def split_input_list(input_list: List[str]) -> Tuple[List[str], List[str]]: + """Split input list into ISO codes and file patterns.""" + iso_codes = [] + files = [] + for item in input_list: + if is_file_pattern(item): + files.append(item) + else: + iso_codes.append(item) + return iso_codes, files + +def get_stem_name(file_path: Path) -> str: + """Get the stem name without path or extension.""" + return file_path.stem + + +def load_language_data(file_path: Path) -> Tuple[Dict, Dict, Dict]: + try: + with open(file_path, "r", encoding="utf-8") as file: + raw_data = json.load(file) + except FileNotFoundError: + logging.error(f"File not found: {file_path}") + return {}, {}, {} + except json.JSONDecodeError: + logging.error(f"Error decoding JSON from file: {file_path}") + return {}, {}, {} + + language_data = {} + country_data = {} + family_data = {} + + for lang in raw_data: + iso = lang["isoCode"] + country = lang["langCountry"] + family = lang["languageFamily"] + + language_data[iso] = { + "Name": lang["language"], + "Country": country, + "Family": family, + } + + country_data.setdefault(country, []).append(iso) + family_data.setdefault(family, []).append(iso) + + return language_data, country_data, family_data + + +def find_related_isocodes( + iso_codes: IsoCodeList, language_data: Dict, country_data: Dict, family_data: Dict +) -> IsoCodeList: + iso_set = set(iso_codes) + + for iso_code in iso_codes: + if iso_code in language_data: + lang_info = language_data[iso_code] +# logger.info(f"{iso_code}: {lang_info['Name']}, {lang_info['Country']}, {lang_info['Family']}") + + iso_set.update(country_data.get(lang_info["Country"], [])) + iso_set.update(family_data.get(lang_info["Family"], [])) + + return sorted(iso_set) + + +def get_files_by_iso(isocodes: IsoCodeList, scripture_dir: Path) -> List[Path]: + return [ + file for file in scripture_dir.glob('*.txt') + if any(file.stem.startswith(isocode + '-') for isocode in isocodes) + ] + +def split_files_by_projects(files: List[Path], projects_dir: Path) -> Tuple[Dict[Path, Path], Dict[Path, Path]]: + existing_projects = {} + missing_projects = {} + + for file in files: + project = projects_dir / file.stem.split("-")[1] + if project.is_dir(): + existing_projects[file] = project + else: + missing_projects[file] = project + + return existing_projects, missing_projects + + +def get_equivalent_isocodes(iso_codes: List[str]) -> Set[str]: + return {code for iso_code in iso_codes for code in (iso_code, ALT_ISO.get_alternative(iso_code)) if code} + +def resolve_config_path(config_folder: Path) -> Path: + """Resolve config folder path relative to experiments directory if not absolute.""" + if not config_folder.is_absolute(): + return SIL_NLP_ENV.mt_experiments_dir / config_folder + return config_folder + +def create_alignment_config(source_files: List[Path], target_files: List[str]) -> dict: + """Create the alignment configuration dictionary.""" + config = { + 'data': { + 'aligner': 'fast_align', + 'corpus_pairs': [{ + 'type': 'train', + 'src': [get_stem_name(f) for f in source_files], + 'trg': target_files, + 'mapping': 'many_to_many', + 'test_size': 0, + 'val_size': 0 + }], + 'tokenize': False + } + } + return config + +def write_or_print_config(config: dict, config_folder: Path = None): + """Write config to file or print to terminal.""" + if config_folder: + config_folder = Path(config_folder) + if not config_folder.is_absolute(): + config_folder = SIL_NLP_ENV.mt_experiments_dir / config_folder + config_folder.mkdir(parents=True, exist_ok=True) + config_path = config_folder / 'config.yml' + with open(config_path, 'w') as f: + yaml.dump(config, f, default_flow_style=False, sort_keys=False) + return str(config_path) + else: + return yaml.dump(config, default_flow_style=False, sort_keys=False) + +def main(): + parser = argparse.ArgumentParser(description="Find related ISO language codes and create alignment config.") + parser.add_argument("inputs", nargs="+", + help="ISO codes or file patterns (e.g., 'fra' or 'en-NIV')") + parser.add_argument("--scripture-dir", type=Path, + default=Path(SIL_NLP_ENV.mt_scripture_dir), + help="Directory containing scripture files") + parser.add_argument("--all-related", action='store_true', + help="List all related scriptures without filtering to those that are part of NLLB") + parser.add_argument("--no-related", action='store_true', + help="Only list scriptures in the specified languages and not in related languages") + parser.add_argument("--output", type=Path, help="Output to the specified file.") + parser.add_argument("--target-files", nargs="+", + help="List of target files in format -") + parser.add_argument("--config-folder", type=Path, + help="Folder to write the config.yml file (absolute or relative to mt_experiments_dir)") + + args = parser.parse_args() + + # Setup logging + logger = logging.getLogger(__name__) + logger.setLevel(logging.INFO) + formatter = logging.Formatter('%(message)s') + + if args.output: + file_handler = logging.FileHandler(args.output) + file_handler.setFormatter(formatter) + logger.addHandler(file_handler) + else: + console_handler = logging.StreamHandler(sys.stdout) + console_handler.setFormatter(formatter) + logger.addHandler(console_handler) + + # Split inputs into ISO codes and file patterns + iso_codes, file_patterns = split_input_list(args.inputs) + + source_files = [] + if iso_codes: + # Load language data and process ISO codes + language_data, country_data, family_data = load_language_data(LANGUAGE_FAMILY_FILE) + if not language_data: + logging.error("Failed to load language data.") + return + + iso_codes = get_equivalent_isocodes(iso_codes) + + if args.no_related: + codes_to_find = list(iso_codes) + logger.info(f"\nConsidering only the specified iso codes and their equivalents: {codes_to_find}") + else: + codes_to_find = find_related_isocodes(list(iso_codes), language_data, country_data, family_data) + logger.info(f"\nFound {len(codes_to_find)} related languages:\n{codes_to_find}.") + + if not args.all_related: + codes_to_find = [iso for iso in codes_to_find if iso in NLLB_ISO_SET] + logger.info(f"\nFound {len(codes_to_find)} specified or related languages in NLLB:\n{codes_to_find}") + else: + logger.info(f"\nFound {len(codes_to_find)} specified or related languages:\n{codes_to_find}") + + # Get all possible codes and find matching files + all_possible_codes = get_equivalent_isocodes(codes_to_find) + source_files.extend(get_files_by_iso(all_possible_codes, args.scripture_dir)) + + # Add files from file patterns + if file_patterns: + pattern_files = [args.scripture_dir / f"{pattern}.txt" for pattern in file_patterns] + existing_files = [f for f in pattern_files if f.exists()] + source_files.extend(existing_files) + if len(existing_files) < len(pattern_files): + missing = set(file_patterns) - set(get_stem_name(f) for f in existing_files) + logger.warning(f"Could not find these files: {missing}") + + if not source_files: + logger.error("\nCouldn't find any Scripture files.") + return + + # Use target files from command line or file patterns from inputs + target_files = args.target_files if args.target_files else file_patterns + + # Create and output configuration + config = create_alignment_config(source_files, target_files) + result = write_or_print_config(config, args.config_folder) + + if args.config_folder: + logger.info(f"\nCreated alignment configuration in: {result}") + else: + logger.info("\nAlignment configuration:") + logger.info(result) + + logger.info(f"\nSource files found: {len(source_files)}") + for file in source_files: + logger.info(f" - {get_stem_name(file)}") + logger.info(f"\nTarget files: {len(target_files)}") + for file in target_files: + logger.info(f" - {file}") + +if __name__ == "__main__": + main() \ No newline at end of file From 86d5aea53f4a09dd991413f51f145bfe11704500 Mon Sep 17 00:00:00 2001 From: David Baines Date: Fri, 27 Mar 2026 15:19:56 +0000 Subject: [PATCH 2/8] WIP --- silnlp/common/find_by_iso.py | 58 ++++++++++++++++++++++-------------- 1 file changed, 35 insertions(+), 23 deletions(-) diff --git a/silnlp/common/find_by_iso.py b/silnlp/common/find_by_iso.py index a02feac9..65fb7d0e 100644 --- a/silnlp/common/find_by_iso.py +++ b/silnlp/common/find_by_iso.py @@ -139,14 +139,10 @@ def create_alignment_config(source_files: List[Path], target_files: List[str]) - return config -def write_or_print_config(config: dict, config_folder: Path = None): +def write_or_print_config(config: dict, config_file: Path = None): """Write config to file or print to terminal.""" - if config_folder: - config_folder = Path(config_folder) - if not config_folder.is_absolute(): - config_folder = SIL_NLP_ENV.mt_experiments_dir / config_folder - config_folder.mkdir(parents=True, exist_ok=True) - config_path = config_folder / "config.yml" + if config_file: + config_file = Path(config_file) with open(config_path, "w") as f: yaml.dump(config, f, default_flow_style=False, sort_keys=False) return str(config_path) @@ -154,6 +150,32 @@ def write_or_print_config(config: dict, config_folder: Path = None): return yaml.dump(config, default_flow_style=False, sort_keys=False) +def config_path(output_path: str) -> Path: + output_folder = Path(output_path) + + if output_folder.is_absolute(): + target = output_folder + if target.parent == target.anchor: + raise argparse.ArgumentTypeError(f"Absolute path '{p}' is too shallow. Will not create folders in root or experiments.") + else: + if len(output_folder.parts) < 2: + raise argparse.ArgumentTypeError( + f"Relative path '{output_folder}' must include a subfolder inside Experiments (e.g. typically: 'country/analyze' or 'country/language/analyze')." + ) + target = (SIL_NLP_ENV.mt_experiments_dir / output_folder).resolve() + try: + target.parent.mkdir(parents=False, exist_ok=True) + except PermissionError: + raise argparse.ArgumentTypeError(f"Permission denied creating directory: {target.parent}") + + return target + + + + p = Path(output_path) + return p if p.is_absolute() else SIL_NLP_ENV.mt_experiments_dir / p + + def main(): parser = argparse.ArgumentParser(description="Find related ISO language codes and create alignment config.") parser.add_argument("inputs", nargs="+", help="ISO codes or file patterns (e.g., 'fra' or 'en-NIV')") @@ -173,13 +195,8 @@ def main(): action="store_true", help="Only list scriptures in the specified languages and not in related languages", ) - parser.add_argument("--output", type=Path, help="Output to the specified file.") - parser.add_argument("--target-files", nargs="+", help="List of target files in format -") - parser.add_argument( - "--config-folder", - type=Path, - help="Folder to write the config.yml file (absolute or relative to mt_experiments_dir)", - ) + parser.add_argument("--targets", nargs="+", help="List of target files in format -") + parser.add_argument("--config-folder", type=config_path, help=f"Existing folder, or folder relative to the Experiments folder: {SIL_NLP_ENV.mt_experiments_dir}") args = parser.parse_args() @@ -188,14 +205,9 @@ def main(): logger.setLevel(logging.INFO) formatter = logging.Formatter("%(message)s") - if args.output: - file_handler = logging.FileHandler(args.output) - file_handler.setFormatter(formatter) - logger.addHandler(file_handler) - else: - console_handler = logging.StreamHandler(sys.stdout) - console_handler.setFormatter(formatter) - logger.addHandler(console_handler) + console_handler = logging.StreamHandler(sys.stdout) + console_handler.setFormatter(formatter) + logger.addHandler(console_handler) # Split inputs into ISO codes and file patterns iso_codes, file_patterns = split_input_list(args.inputs) @@ -241,7 +253,7 @@ def main(): return # Use target files from command line or file patterns from inputs - target_files = args.target_files if args.target_files else file_patterns + target_files = args.targets if args.targets else file_patterns # Create and output configuration config = create_alignment_config(source_files, target_files) From c354e73384f98dd419d3ae05e987c7e8e4087139 Mon Sep 17 00:00:00 2001 From: David Baines Date: Fri, 27 Mar 2026 15:28:20 +0000 Subject: [PATCH 3/8] WIP2 --- silnlp/common/find_by_iso.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/silnlp/common/find_by_iso.py b/silnlp/common/find_by_iso.py index 65fb7d0e..4ec1cb80 100644 --- a/silnlp/common/find_by_iso.py +++ b/silnlp/common/find_by_iso.py @@ -140,7 +140,7 @@ def create_alignment_config(source_files: List[Path], target_files: List[str]) - def write_or_print_config(config: dict, config_file: Path = None): - """Write config to file or print to terminal.""" + """Write config to file or print to terminal.""" if config_file: config_file = Path(config_file) with open(config_path, "w") as f: From 562e302b035467573e9ac18c34ae5f0cedd8a948 Mon Sep 17 00:00:00 2001 From: David Baines Date: Fri, 27 Mar 2026 16:14:46 +0000 Subject: [PATCH 4/8] WIP3 --- silnlp/common/find_by_iso.py | 67 +++++++++++++++++------------------- 1 file changed, 32 insertions(+), 35 deletions(-) diff --git a/silnlp/common/find_by_iso.py b/silnlp/common/find_by_iso.py index 4ec1cb80..1c644ddf 100644 --- a/silnlp/common/find_by_iso.py +++ b/silnlp/common/find_by_iso.py @@ -139,41 +139,36 @@ def create_alignment_config(source_files: List[Path], target_files: List[str]) - return config -def write_or_print_config(config: dict, config_file: Path = None): - """Write config to file or print to terminal.""" - if config_file: - config_file = Path(config_file) - with open(config_path, "w") as f: - yaml.dump(config, f, default_flow_style=False, sort_keys=False) - return str(config_path) +def resolve_config_file(output_path: str) -> Path: + "Resolve config folder/file path, creating the config folder if needed." + path = Path(output_path) + if path.suffix: + config_file_name = path.name + folder_part = path.parent + if config_file_name != 'config.yml': + resp = input(f"Warning: filename '{config_file_name}' is not 'config.yml'. Use 'config.yml' instead? [Y/n] ") + if resp.strip().lower() != 'n': config_file_name = 'config.yml' else: - return yaml.dump(config, default_flow_style=False, sort_keys=False) - - -def config_path(output_path: str) -> Path: - output_folder = Path(output_path) - - if output_folder.is_absolute(): - target = output_folder - if target.parent == target.anchor: - raise argparse.ArgumentTypeError(f"Absolute path '{p}' is too shallow. Will not create folders in root or experiments.") + config_file_name = 'config.yml' + folder_part = path + + if folder_part.is_absolute(): + config_folder = folder_part + if config_folder.parent == Path(config_folder.anchor): + raise argparse.ArgumentTypeError(f"Absolute path '{output_path}' is too shallow. Will not create folders in root.") + try: config_folder.mkdir(parents=False, exist_ok=True) + except FileNotFoundError: + raise argparse.ArgumentTypeError(f"Parent directory does not exist: {config_folder.parent}") else: - if len(output_folder.parts) < 2: + if len(folder_part.parts) < 2: raise argparse.ArgumentTypeError( - f"Relative path '{output_folder}' must include a subfolder inside Experiments (e.g. typically: 'country/analyze' or 'country/language/analyze')." - ) - target = (SIL_NLP_ENV.mt_experiments_dir / output_folder).resolve() - try: - target.parent.mkdir(parents=False, exist_ok=True) - except PermissionError: - raise argparse.ArgumentTypeError(f"Permission denied creating directory: {target.parent}") + f"Relative path '{folder_part}' must include a subfolder inside Experiments (e.g. 'country/analyze').") + config_folder = (SIL_NLP_ENV.mt_experiments_dir / folder_part).resolve() + try: config_folder.mkdir(parents=True, exist_ok=True) + except PermissionError: + raise argparse.ArgumentTypeError(f"Permission denied creating directory: {config_folder}") - return target - - - - p = Path(output_path) - return p if p.is_absolute() else SIL_NLP_ENV.mt_experiments_dir / p + return config_folder / config_file_name def main(): @@ -196,7 +191,7 @@ def main(): help="Only list scriptures in the specified languages and not in related languages", ) parser.add_argument("--targets", nargs="+", help="List of target files in format -") - parser.add_argument("--config-folder", type=config_path, help=f"Existing folder, or folder relative to the Experiments folder: {SIL_NLP_ENV.mt_experiments_dir}") + parser.add_argument("--config-folder", help=f"Existing folder, or folder relative to the Experiments folder: {SIL_NLP_ENV.mt_experiments_dir}") args = parser.parse_args() @@ -257,13 +252,15 @@ def main(): # Create and output configuration config = create_alignment_config(source_files, target_files) - result = write_or_print_config(config, args.config_folder) if args.config_folder: - logger.info(f"\nCreated alignment configuration in: {result}") + config_file = resolve_config_file(args.config_folder) + with open(config_file, "w") as f: + yaml.dump(config, f, default_flow_style=False, sort_keys=False) + logger.info(f"\nCreated alignment configuration in: {config_file}") else: logger.info("\nAlignment configuration:") - logger.info(result) + logger.info(yaml.dump(config, default_flow_style=False, sort_keys=False)) logger.info(f"\nSource files found: {len(source_files)}") for file in source_files: From 27907a645375bf936bde196df9a12348a625b3dc Mon Sep 17 00:00:00 2001 From: David Baines Date: Fri, 27 Mar 2026 16:41:08 +0000 Subject: [PATCH 5/8] WIP4 --- silnlp/common/find_by_iso.py | 41 ++++++++++++++++++------------------ 1 file changed, 20 insertions(+), 21 deletions(-) diff --git a/silnlp/common/find_by_iso.py b/silnlp/common/find_by_iso.py index 1c644ddf..447933be 100644 --- a/silnlp/common/find_by_iso.py +++ b/silnlp/common/find_by_iso.py @@ -111,22 +111,15 @@ def get_equivalent_isocodes(iso_codes: List[str]) -> Set[str]: return {code for iso_code in iso_codes for code in (iso_code, ALT_ISO.get_alternative(iso_code)) if code} -def resolve_config_path(config_folder: Path) -> Path: - """Resolve config folder path relative to experiments directory if not absolute.""" - if not config_folder.is_absolute(): - return SIL_NLP_ENV.mt_experiments_dir / config_folder - return config_folder - - def create_alignment_config(source_files: List[Path], target_files: List[str]) -> dict: """Create the alignment configuration dictionary.""" config = { "data": { - "aligner": "fast_align", + "aligner": "eflomal", "corpus_pairs": [ { "type": "train", - "src": [get_stem_name(f) for f in source_files], + "src": source_files, "trg": target_files, "mapping": "many_to_many", "test_size": 0, @@ -154,8 +147,9 @@ def resolve_config_file(output_path: str) -> Path: if folder_part.is_absolute(): config_folder = folder_part - if config_folder.parent == Path(config_folder.anchor): - raise argparse.ArgumentTypeError(f"Absolute path '{output_path}' is too shallow. Will not create folders in root.") + if not config_folder.parent.exists(): + raise argparse.ArgumentTypeError( + f"Parent directory does not exist: {config_folder.parent}") try: config_folder.mkdir(parents=False, exist_ok=True) except FileNotFoundError: raise argparse.ArgumentTypeError(f"Parent directory does not exist: {config_folder.parent}") @@ -191,7 +185,7 @@ def main(): help="Only list scriptures in the specified languages and not in related languages", ) parser.add_argument("--targets", nargs="+", help="List of target files in format -") - parser.add_argument("--config-folder", help=f"Existing folder, or folder relative to the Experiments folder: {SIL_NLP_ENV.mt_experiments_dir}") + parser.add_argument("--config-folder", type=resolve_config_file, help=f"Existing folder, or folder relative to the Experiments folder: {SIL_NLP_ENV.mt_experiments_dir}") args = parser.parse_args() @@ -247,14 +241,19 @@ def main(): logger.error("\nCouldn't find any Scripture files.") return - # Use target files from command line or file patterns from inputs + # Use target files from command line or file patterns from inputs target_files = args.targets if args.targets else file_patterns + targets = sorted([target_file for target_file in set(target_files)]) + + # Filter out targets from the source list keep only unique source and targets. + sources = [get_stem_name(f) for f in source_files if get_stem_name(f) not in target_files] + sources = sorted([source for source in set(sources)]) # Create and output configuration - config = create_alignment_config(source_files, target_files) + config = create_alignment_config(sources, targets) if args.config_folder: - config_file = resolve_config_file(args.config_folder) + config_file = args.config_folder with open(config_file, "w") as f: yaml.dump(config, f, default_flow_style=False, sort_keys=False) logger.info(f"\nCreated alignment configuration in: {config_file}") @@ -262,12 +261,12 @@ def main(): logger.info("\nAlignment configuration:") logger.info(yaml.dump(config, default_flow_style=False, sort_keys=False)) - logger.info(f"\nSource files found: {len(source_files)}") - for file in source_files: - logger.info(f" - {get_stem_name(file)}") - logger.info(f"\nTarget files: {len(target_files)}") - for file in target_files: - logger.info(f" - {file}") + logger.info(f"\nSource files found: {len(sources)}") + for source in sources: + logger.info(f" - {source}") + logger.info(f"\nTarget files: {len(targets)}") + for target in targets: + logger.info(f" - {target}") if __name__ == "__main__": From 73c47355bdc52aacb4e27c9073fbf0749e90d3d2 Mon Sep 17 00:00:00 2001 From: David Baines Date: Fri, 27 Mar 2026 16:50:04 +0000 Subject: [PATCH 6/8] Updated find_by_iso to write config.yml files and filter the targets from the sources --- silnlp/common/find_by_iso.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/silnlp/common/find_by_iso.py b/silnlp/common/find_by_iso.py index 447933be..3763b3f5 100644 --- a/silnlp/common/find_by_iso.py +++ b/silnlp/common/find_by_iso.py @@ -147,12 +147,12 @@ def resolve_config_file(output_path: str) -> Path: if folder_part.is_absolute(): config_folder = folder_part - if not config_folder.parent.exists(): + if not config_folder.parent.is_dir(): raise argparse.ArgumentTypeError( f"Parent directory does not exist: {config_folder.parent}") - try: config_folder.mkdir(parents=False, exist_ok=True) - except FileNotFoundError: - raise argparse.ArgumentTypeError(f"Parent directory does not exist: {config_folder.parent}") + try: config_folder.mkdir(parents=True, exist_ok=True) + except PermissionError: + raise argparse.ArgumentTypeError(f"Permission denied creating directory: {config_folder}") else: if len(folder_part.parts) < 2: raise argparse.ArgumentTypeError( From f82ce337ee559cbf4d3d7cafcc430f86af32647f Mon Sep 17 00:00:00 2001 From: David Baines Date: Fri, 27 Mar 2026 16:53:08 +0000 Subject: [PATCH 7/8] Remove temp find_by_iso2.py --- silnlp/common/find_by_iso2.py | 244 ---------------------------------- 1 file changed, 244 deletions(-) delete mode 100644 silnlp/common/find_by_iso2.py diff --git a/silnlp/common/find_by_iso2.py b/silnlp/common/find_by_iso2.py deleted file mode 100644 index 7e4c3c2a..00000000 --- a/silnlp/common/find_by_iso2.py +++ /dev/null @@ -1,244 +0,0 @@ -import argparse -import json -import logging -from pathlib import Path -from typing import Dict, List, Set, Tuple, Union -import sys -import yaml - -from .environment import SIL_NLP_ENV -from .iso_info import NLLB_ISO_SET, ALT_ISO - -IsoCode = str -IsoCodeList = List[IsoCode] -IsoCodeSet = Set[IsoCode] - -LANGUAGE_FAMILY_FILE = SIL_NLP_ENV.assets_dir / "languageFamilies.json" - -def is_file_pattern(input_str: str) -> bool: - """Check if the input string contains a hyphen, indicating it's a filename pattern.""" - return '-' in input_str - -def split_input_list(input_list: List[str]) -> Tuple[List[str], List[str]]: - """Split input list into ISO codes and file patterns.""" - iso_codes = [] - files = [] - for item in input_list: - if is_file_pattern(item): - files.append(item) - else: - iso_codes.append(item) - return iso_codes, files - -def get_stem_name(file_path: Path) -> str: - """Get the stem name without path or extension.""" - return file_path.stem - - -def load_language_data(file_path: Path) -> Tuple[Dict, Dict, Dict]: - try: - with open(file_path, "r", encoding="utf-8") as file: - raw_data = json.load(file) - except FileNotFoundError: - logging.error(f"File not found: {file_path}") - return {}, {}, {} - except json.JSONDecodeError: - logging.error(f"Error decoding JSON from file: {file_path}") - return {}, {}, {} - - language_data = {} - country_data = {} - family_data = {} - - for lang in raw_data: - iso = lang["isoCode"] - country = lang["langCountry"] - family = lang["languageFamily"] - - language_data[iso] = { - "Name": lang["language"], - "Country": country, - "Family": family, - } - - country_data.setdefault(country, []).append(iso) - family_data.setdefault(family, []).append(iso) - - return language_data, country_data, family_data - - -def find_related_isocodes( - iso_codes: IsoCodeList, language_data: Dict, country_data: Dict, family_data: Dict -) -> IsoCodeList: - iso_set = set(iso_codes) - - for iso_code in iso_codes: - if iso_code in language_data: - lang_info = language_data[iso_code] -# logger.info(f"{iso_code}: {lang_info['Name']}, {lang_info['Country']}, {lang_info['Family']}") - - iso_set.update(country_data.get(lang_info["Country"], [])) - iso_set.update(family_data.get(lang_info["Family"], [])) - - return sorted(iso_set) - - -def get_files_by_iso(isocodes: IsoCodeList, scripture_dir: Path) -> List[Path]: - return [ - file for file in scripture_dir.glob('*.txt') - if any(file.stem.startswith(isocode + '-') for isocode in isocodes) - ] - -def split_files_by_projects(files: List[Path], projects_dir: Path) -> Tuple[Dict[Path, Path], Dict[Path, Path]]: - existing_projects = {} - missing_projects = {} - - for file in files: - project = projects_dir / file.stem.split("-")[1] - if project.is_dir(): - existing_projects[file] = project - else: - missing_projects[file] = project - - return existing_projects, missing_projects - - -def get_equivalent_isocodes(iso_codes: List[str]) -> Set[str]: - return {code for iso_code in iso_codes for code in (iso_code, ALT_ISO.get_alternative(iso_code)) if code} - -def resolve_config_path(config_folder: Path) -> Path: - """Resolve config folder path relative to experiments directory if not absolute.""" - if not config_folder.is_absolute(): - return SIL_NLP_ENV.mt_experiments_dir / config_folder - return config_folder - -def create_alignment_config(source_files: List[Path], target_files: List[str]) -> dict: - """Create the alignment configuration dictionary.""" - config = { - 'data': { - 'aligner': 'fast_align', - 'corpus_pairs': [{ - 'type': 'train', - 'src': [get_stem_name(f) for f in source_files], - 'trg': target_files, - 'mapping': 'many_to_many', - 'test_size': 0, - 'val_size': 0 - }], - 'tokenize': False - } - } - return config - -def write_or_print_config(config: dict, config_folder: Path = None): - """Write config to file or print to terminal.""" - if config_folder: - config_folder = Path(config_folder) - if not config_folder.is_absolute(): - config_folder = SIL_NLP_ENV.mt_experiments_dir / config_folder - config_folder.mkdir(parents=True, exist_ok=True) - config_path = config_folder / 'config.yml' - with open(config_path, 'w') as f: - yaml.dump(config, f, default_flow_style=False, sort_keys=False) - return str(config_path) - else: - return yaml.dump(config, default_flow_style=False, sort_keys=False) - -def main(): - parser = argparse.ArgumentParser(description="Find related ISO language codes and create alignment config.") - parser.add_argument("inputs", nargs="+", - help="ISO codes or file patterns (e.g., 'fra' or 'en-NIV')") - parser.add_argument("--scripture-dir", type=Path, - default=Path(SIL_NLP_ENV.mt_scripture_dir), - help="Directory containing scripture files") - parser.add_argument("--all-related", action='store_true', - help="List all related scriptures without filtering to those that are part of NLLB") - parser.add_argument("--no-related", action='store_true', - help="Only list scriptures in the specified languages and not in related languages") - parser.add_argument("--output", type=Path, help="Output to the specified file.") - parser.add_argument("--target-files", nargs="+", - help="List of target files in format -") - parser.add_argument("--config-folder", type=Path, - help="Folder to write the config.yml file (absolute or relative to mt_experiments_dir)") - - args = parser.parse_args() - - # Setup logging - logger = logging.getLogger(__name__) - logger.setLevel(logging.INFO) - formatter = logging.Formatter('%(message)s') - - if args.output: - file_handler = logging.FileHandler(args.output) - file_handler.setFormatter(formatter) - logger.addHandler(file_handler) - else: - console_handler = logging.StreamHandler(sys.stdout) - console_handler.setFormatter(formatter) - logger.addHandler(console_handler) - - # Split inputs into ISO codes and file patterns - iso_codes, file_patterns = split_input_list(args.inputs) - - source_files = [] - if iso_codes: - # Load language data and process ISO codes - language_data, country_data, family_data = load_language_data(LANGUAGE_FAMILY_FILE) - if not language_data: - logging.error("Failed to load language data.") - return - - iso_codes = get_equivalent_isocodes(iso_codes) - - if args.no_related: - codes_to_find = list(iso_codes) - logger.info(f"\nConsidering only the specified iso codes and their equivalents: {codes_to_find}") - else: - codes_to_find = find_related_isocodes(list(iso_codes), language_data, country_data, family_data) - logger.info(f"\nFound {len(codes_to_find)} related languages:\n{codes_to_find}.") - - if not args.all_related: - codes_to_find = [iso for iso in codes_to_find if iso in NLLB_ISO_SET] - logger.info(f"\nFound {len(codes_to_find)} specified or related languages in NLLB:\n{codes_to_find}") - else: - logger.info(f"\nFound {len(codes_to_find)} specified or related languages:\n{codes_to_find}") - - # Get all possible codes and find matching files - all_possible_codes = get_equivalent_isocodes(codes_to_find) - source_files.extend(get_files_by_iso(all_possible_codes, args.scripture_dir)) - - # Add files from file patterns - if file_patterns: - pattern_files = [args.scripture_dir / f"{pattern}.txt" for pattern in file_patterns] - existing_files = [f for f in pattern_files if f.exists()] - source_files.extend(existing_files) - if len(existing_files) < len(pattern_files): - missing = set(file_patterns) - set(get_stem_name(f) for f in existing_files) - logger.warning(f"Could not find these files: {missing}") - - if not source_files: - logger.error("\nCouldn't find any Scripture files.") - return - - # Use target files from command line or file patterns from inputs - target_files = args.target_files if args.target_files else file_patterns - - # Create and output configuration - config = create_alignment_config(source_files, target_files) - result = write_or_print_config(config, args.config_folder) - - if args.config_folder: - logger.info(f"\nCreated alignment configuration in: {result}") - else: - logger.info("\nAlignment configuration:") - logger.info(result) - - logger.info(f"\nSource files found: {len(source_files)}") - for file in source_files: - logger.info(f" - {get_stem_name(file)}") - logger.info(f"\nTarget files: {len(target_files)}") - for file in target_files: - logger.info(f" - {file}") - -if __name__ == "__main__": - main() \ No newline at end of file From ff7d66d4dc639402c8d024aadad26c0cba578f84 Mon Sep 17 00:00:00 2001 From: David Baines Date: Fri, 27 Mar 2026 16:55:26 +0000 Subject: [PATCH 8/8] Remove comments --- silnlp/common/find_by_iso.py | 12 ------------ 1 file changed, 12 deletions(-) diff --git a/silnlp/common/find_by_iso.py b/silnlp/common/find_by_iso.py index 3763b3f5..3c3353b8 100644 --- a/silnlp/common/find_by_iso.py +++ b/silnlp/common/find_by_iso.py @@ -79,8 +79,6 @@ def find_related_isocodes( for iso_code in iso_codes: if iso_code in language_data: lang_info = language_data[iso_code] - # logger.info(f"{iso_code}: {lang_info['Name']}, {lang_info['Country']}, {lang_info['Family']}") - iso_set.update(country_data.get(lang_info["Country"], [])) iso_set.update(family_data.get(lang_info["Family"], [])) @@ -189,7 +187,6 @@ def main(): args = parser.parse_args() - # Setup logging logger = logging.getLogger(__name__) logger.setLevel(logging.INFO) formatter = logging.Formatter("%(message)s") @@ -198,12 +195,10 @@ def main(): console_handler.setFormatter(formatter) logger.addHandler(console_handler) - # Split inputs into ISO codes and file patterns iso_codes, file_patterns = split_input_list(args.inputs) source_files = [] if iso_codes: - # Load language data and process ISO codes language_data, country_data, family_data = load_language_data(LANGUAGE_FAMILY_FILE) if not language_data: logging.error("Failed to load language data.") @@ -224,11 +219,9 @@ def main(): else: logger.info(f"\nFound {len(codes_to_find)} specified or related languages:\n{codes_to_find}") - # Get all possible codes and find matching files all_possible_codes = get_equivalent_isocodes(codes_to_find) source_files.extend(get_files_by_iso(all_possible_codes, args.scripture_dir)) - # Add files from file patterns if file_patterns: pattern_files = [args.scripture_dir / f"{pattern}.txt" for pattern in file_patterns] existing_files = [f for f in pattern_files if f.exists()] @@ -241,15 +234,10 @@ def main(): logger.error("\nCouldn't find any Scripture files.") return - # Use target files from command line or file patterns from inputs target_files = args.targets if args.targets else file_patterns targets = sorted([target_file for target_file in set(target_files)]) - - # Filter out targets from the source list keep only unique source and targets. sources = [get_stem_name(f) for f in source_files if get_stem_name(f) not in target_files] sources = sorted([source for source in set(sources)]) - - # Create and output configuration config = create_alignment_config(sources, targets) if args.config_folder: