diff --git a/src/openpecha/bdrc_utils.py b/src/openpecha/bdrc_utils.py deleted file mode 100644 index 7ec1032c..00000000 --- a/src/openpecha/bdrc_utils.py +++ /dev/null @@ -1,51 +0,0 @@ -from pathlib import Path -from typing import Any, Dict, Optional - -from openpecha.utils import read_json - - -def extract_metadata_for_work(work_path: Path) -> Dict[str, Any]: - metadata = {} - ocr_import_info = read_json(work_path / "ocr_import_info.json") - metadata["ocr_import_info"] = ocr_import_info - buda_data = read_json(work_path / "buda_data.json") - metadata["buda_data"] = buda_data - - return metadata - - -def format_metadata_for_op_api(metadata: Dict[str, Any]) -> Dict[str, Any]: - """ - Formats BDRC metadata into a structure suitable for the OpenPecha API. - Excludes 'author' and 'title' keys if their corresponding values are None. - - Args: - metadata: A dictionary containing the raw BDRC metadata. - - Returns: - A dictionary with the formatted metadata. - """ - buda_data = metadata.get("buda_data", {}).get("source_metadata", {}) - ocr_info = metadata.get("ocr_import_info", {}) - - formatted_data: Dict[str, Any] = { - "source_type": "bdrc", - "bdrc": metadata, - "document_id": ocr_info.get("bdrc_scan_id"), - "language": ( - buda_data.get("languages", [None])[0] - if buda_data.get("languages") - else None - ), - "source_url": buda_data.get("id"), - } - - author: Optional[str] = buda_data.get("author") - if author: - formatted_data["author"] = {"bo": author} - - title: Optional[str] = buda_data.get("title") - if title: - formatted_data["title"] = {"bo": title} - - return formatted_data diff --git a/src/openpecha/pecha/parsers/__init__.py b/src/openpecha/pecha/parsers/__init__.py index 2348ce55..11eebfa1 100644 --- a/src/openpecha/pecha/parsers/__init__.py +++ b/src/openpecha/pecha/parsers/__init__.py @@ -1,110 +1,10 @@ -from abc import ABC, abstractmethod -from pathlib import Path -from typing import Any, Dict, List, Tuple - -from openpecha.config import PECHAS_PATH, get_logger -from openpecha.exceptions import MetaDataValidationError -from openpecha.pecha import Pecha, annotation_path from openpecha.pecha.annotations import BaseAnnotation from openpecha.pecha.blupdate import DiffMatchPatch -from openpecha.pecha.layer import AnnotationType -from openpecha.pecha.metadata import InitialCreationType, PechaMetaData - -logger = get_logger(__name__) - - -class DocxBaseParser(ABC): - @property - def name(self): - return self.__class__.__name__ - - @abstractmethod - def parse( - self, - input: str | Path, - annotation_type: AnnotationType, - metadata: Dict, - output_path: Path = PECHAS_PATH, - ) -> Tuple[Pecha, annotation_path]: - raise NotImplementedError - - def create_pecha( - self, base: str, output_path: Path, metadata: Dict, pecha_id: str | None - ) -> Pecha: - pecha = Pecha.create(output_path, pecha_id) - pecha.set_base(base) - - try: - pecha_metadata = PechaMetaData( - id=pecha.id, - parser=self.name, - **metadata, - bases={}, - initial_creation_type=InitialCreationType.google_docx, - ) - except Exception as e: - logger.error(f"The metadata given was not valid. {str(e)}") - raise MetaDataValidationError( - f"[Error] The metadata given was not valid. {str(e)}" - ) - else: - pecha.set_metadata(pecha_metadata.to_dict()) - - return pecha - - def add_segmentation_layer( - self, pecha: Pecha, anns: List[BaseAnnotation], ann_type: AnnotationType - ) -> annotation_path: +from typing import List - basename = list(pecha.bases.keys())[0] - layer, layer_path = pecha.add_layer(basename, ann_type) - for ann in anns: - pecha.add_annotation(layer, ann, ann_type) - layer.save() +from openpecha.config import get_logger - return str(layer_path.relative_to(pecha.layer_path)) - - -class BaseParser(ABC): - @property - def name(self): - return self.__class__.__name__ - - @abstractmethod - def parse( - self, - input: Any, - metadata: Dict, - output_path: Path = PECHAS_PATH, - ): - raise NotImplementedError - - -class DummyParser(BaseParser): - @property - def name(self): - return self.__class__.__name__ - - def parse( - self, - input: Any, - metadata: Dict, - output_path: Path = PECHAS_PATH, - ) -> Pecha: - raise NotImplementedError - - -class OCRBaseParser(ABC): - @property - def name(self): - return self.__class__.__name__ - - @abstractmethod - def parse( - self, - dataprovider: Any, - ) -> Pecha: - raise NotImplementedError +logger = get_logger(__name__) def update_coords( diff --git a/src/openpecha/pecha/parsers/docx/__init__.py b/src/openpecha/pecha/parsers/docx/__init__.py deleted file mode 100644 index 1b7a0446..00000000 --- a/src/openpecha/pecha/parsers/docx/__init__.py +++ /dev/null @@ -1,78 +0,0 @@ -from pathlib import Path -from typing import Any, Dict, List, Tuple - -from openpecha.config import get_logger -from openpecha.exceptions import ParseNotReadyForThisAnnotation -from openpecha.pecha import Pecha, annotation_path -from openpecha.pecha.layer import AnnotationType -from openpecha.pecha.parsers.docx.commentary.simple import DocxSimpleCommentaryParser -from openpecha.pecha.parsers.docx.root import DocxRootParser - -logger = get_logger(__name__) - - -class DocxParser: - def is_commentary_pecha(self, metadatas: List[Any]) -> bool: - """Checks if the given metadata corresponds to a commentary Pecha. - - Args: - metadatas (List[Dict]): List of dictionaries containing metadata of the Pecha. - - Returns: - bool: True if the Pecha is a commentary, otherwise False. - """ - for metadata in metadatas: - if metadata.type == "commentary": - return True - return False - - def parse( - self, - docx_file: str | Path, - annotation_type: AnnotationType | str, - metadatas: List[Any], - pecha_id: str | None = None, - ) -> Tuple[Pecha, annotation_path]: - """Parses a DOCX file and generates a Pecha object based on its type. - - Args: - docx_file (str | Path): Path to the DOCX file to be parsed. - metadatas (List[Dict]): List of dictionaries, where each dictionary - contains metadata of the Pecha. - output_path (Path): - pecha_id (str | None, optional): Pecha ID to be assigned. Defaults to None. - - Returns: - Pecha: Pecha object. - """ - - # Accept both str and AnnotationType, convert str to AnnotationType - if isinstance(annotation_type, str): - try: - annotation_type = AnnotationType(annotation_type) - except ValueError: - raise ParseNotReadyForThisAnnotation( - f"Invalid annotation type: {annotation_type}" - ) - - is_commentary = self.is_commentary_pecha(metadatas) - - # Convert metadata: MetadataModel to Dict - metadata = metadatas[0].model_dump() - - if is_commentary: - pecha, annotation_path = DocxSimpleCommentaryParser().parse( - input=docx_file, - annotation_type=annotation_type, - metadata=metadata, - pecha_id=pecha_id, - ) - return (pecha, annotation_path) - else: - pecha, annotation_path = DocxRootParser().parse( - input=docx_file, - annotation_type=annotation_type, - metadata=metadata, - pecha_id=pecha_id, - ) - return (pecha, annotation_path) diff --git a/src/openpecha/pecha/parsers/docx/annotation.py b/src/openpecha/pecha/parsers/docx/annotation.py deleted file mode 100644 index 0cd88db7..00000000 --- a/src/openpecha/pecha/parsers/docx/annotation.py +++ /dev/null @@ -1,89 +0,0 @@ -from pathlib import Path -from typing import Any, List, Tuple - -from stam import AnnotationStore - -from openpecha.config import get_logger -from openpecha.exceptions import ParseNotReadyForThisAnnotation -from openpecha.pecha import Pecha, annotation_path, get_anns -from openpecha.pecha.layer import AnnotationType -from openpecha.pecha.parsers import update_coords -from openpecha.pecha.parsers.docx.commentary.simple import DocxSimpleCommentaryParser -from openpecha.pecha.parsers.docx.footnote import DocxFootnoteParser -from openpecha.pecha.parsers.docx.root import DocxRootParser -from openpecha.pecha.pecha_types import is_root_related_pecha - -pecha_id = str - -logger = get_logger(__name__) - - -class DocxAnnotationParser: - def __init__(self): - pass - - def add_annotation( - self, - pecha: Pecha, - type: AnnotationType | str, - docx_file: Path, - metadatas: List[Any], - ) -> Tuple[Pecha, annotation_path]: - - # Accept both str and AnnotationType, convert str to AnnotationType - if isinstance(type, str): - try: - type = AnnotationType(type) - except ValueError: - raise ParseNotReadyForThisAnnotation(f"Invalid annotation type: {type}") - - if type not in [ - AnnotationType.ALIGNMENT, - AnnotationType.SEGMENTATION, - AnnotationType.FOOTNOTE, - ]: - raise ParseNotReadyForThisAnnotation( - f"Parser is not ready for the annotation type: {type}" - ) - - new_basename = list(pecha.bases.keys())[0] - new_base = pecha.get_base(new_basename) - - if type == AnnotationType.FOOTNOTE: - footnote_parser = DocxFootnoteParser() - annotation_path = footnote_parser.parse(pecha, docx_file) - return (pecha, annotation_path) - - elif is_root_related_pecha(metadatas): - parser = DocxRootParser() - anns, old_base = parser.extract_anns(docx_file, AnnotationType.SEGMENTATION) - - updated_anns = update_coords(anns, old_base, new_base) - logger.info(f"Updated Coordinate: {updated_anns}") - - annotation_path = parser.add_segmentation_layer(pecha, updated_anns, type) - anns = get_anns( - AnnotationStore(file=str(pecha.layer_path / annotation_path)) - ) - logger.info(f"New Updated Annotations: {anns}") - - logger.info( - f"Alignment Annotation is successfully added to Pecha {pecha.id}" - ) - return (pecha, annotation_path) - - else: - commentary_parser = DocxSimpleCommentaryParser() - ( - anns, - old_base, - ) = commentary_parser.extract_anns(docx_file, type) - - updated_coords = update_coords(anns, old_base, new_base) - annotation_path = commentary_parser.add_segmentation_layer( - pecha, updated_coords, type - ) - logger.info( - f"Alignment Annotation is successfully added to Pecha {pecha.id}" - ) - return (pecha, annotation_path) diff --git a/src/openpecha/pecha/parsers/docx/commentary/__init__.py b/src/openpecha/pecha/parsers/docx/commentary/__init__.py deleted file mode 100644 index e69de29b..00000000 diff --git a/src/openpecha/pecha/parsers/docx/commentary/complex.py b/src/openpecha/pecha/parsers/docx/commentary/complex.py deleted file mode 100644 index a52b5df1..00000000 --- a/src/openpecha/pecha/parsers/docx/commentary/complex.py +++ /dev/null @@ -1,325 +0,0 @@ -import re -from pathlib import Path -from typing import Any, Dict, List, Optional - -from docx import Document -from docx.shared import RGBColor - -from openpecha.config import PECHAS_PATH -from openpecha.pecha import Pecha -from openpecha.pecha.annotations import AlignmentAnnotation, SapcheAnnotation, span -from openpecha.pecha.layer import AnnotationType -from openpecha.pecha.metadata import InitialCreationType -from openpecha.pecha.parsers import BaseParser -from openpecha.utils import parse_alignment_index - - -class DocxComplexCommentaryParser(BaseParser): - def __init__(self, root_path: Optional[str] = None): - self.root_path = root_path - self.commentary_segment_splitter = "\n\n" - self.meaning_segment_anns: List[AlignmentAnnotation] = [] - self.sapche_anns: List[SapcheAnnotation] = [] - self.temp_state = { - "meaning_segment": {"anns": [], "char_diff": 0}, - "sapche": {"anns": [], "char_diff": 0}, - } - self.base = "" - self.metadata: Dict[str, Any] = {} - - def normalize_text(self, text: str): - text = self.normalize_whitespaces(text) - text = self.normalize_newlines(text) - return text - - @staticmethod - def normalize_whitespaces(text: str): - """ - If there are spaces or tab between newlines, it will be removed. - """ - return re.sub(r"\n[\s\t]+\n", "\n\n", text) - - @staticmethod - def normalize_newlines(text: str): - """ - If there are more than 2 newlines continuously, it will replace it with 2 newlines. - """ - return re.sub(r"\n{3,}", "\n\n", text) - - def parse( - self, - input: Path, - metadata: Dict, - output_path: Path = PECHAS_PATH, - ) -> Pecha: - - # Clean up class attributes - self.meaning_segment_anns = [] - self.base = "" - - metadata["root_path"] = self.root_path - self.metadata = metadata - self.parse_commentary(input) - - pecha = self.create_pecha(output_path) - return pecha - - def prepare_doc(self, input: Path): - """ - Input: a docx file - Process: Prepare the doc for parsing - """ - - def format_paragraphs(paragraphs: List[Dict[str, Any]]) -> Dict[str, Any]: - """ - Paragraphs is a text with styles. - Each line in docx file is a paragraph. - We have to combine the text and the styles - """ - formatted_paras = [] - para_texts = [] - for para in paragraphs: - para_texts.append(para["text"].strip()) - style_texts = [] - styles = [] - for para_style in para["styles"]: - para_text = para_style.text - style_texts.append(para_text) - styles.append(para_style.font) - if style_texts: - style_texts[0] = style_texts[0].lstrip() - style_texts[-1] = style_texts[-1].rstrip() - formatted_paras.append({"texts": style_texts, "styles": styles}) - res = {"text": "\n".join(para_texts), "styles": formatted_paras} - return res - - # Parse the document - docs = Document(input) - - formatted_docs = [] - last_doc_data: List[Dict[str, Any]] = [] - - for doc in docs.paragraphs: - if doc.text.strip() == "": - if last_doc_data: - formatted_docs.append(format_paragraphs(last_doc_data)) - last_doc_data = [] - else: - last_doc_data.append({"text": doc.text, "styles": doc.runs}) - - # Handle remaining paragraphs after the loop - if last_doc_data: - formatted_docs.append(format_paragraphs(last_doc_data)) - - return formatted_docs - - @staticmethod - def update_doc(doc: Dict[str, Any], char_diff: int): - """ - Updates the document by removing characters up to the given char_diff. - Args: - doc (Dict[str, Any]): The document to update, containing text and styles. - char_diff (int): The number of characters to remove from the beginning of the text. - Returns: - Dict[str, Any]: The updated document. - """ - # Update the main text field - doc["text"] = doc["text"][char_diff:] - - # Extract the first style's texts and styles - styles = doc["styles"][0] - texts = styles["texts"] - style_meta = styles["styles"] - - char_count = 0 - for idx, text_chunk in enumerate(texts): - if char_count >= char_diff or char_count + len(text_chunk) == char_diff: - doc["styles"][0]["styles"] = style_meta[idx + 1 :] - doc["styles"][0]["texts"] = texts[idx + 1 :] - break - - if char_count + len(text_chunk) > char_diff: - doc["styles"][0]["styles"] = style_meta[idx:] - doc["styles"][0]["texts"] = [ - text_chunk[char_diff - char_count :] - ] + texts[idx + 1 :] - break - char_count += len(text_chunk) - - return doc - - def add_commentary_meaning_ann( - self, doc: Dict[str, Any], index: int, char_count: int - ): - segment = doc["text"] - match = re.match(r"^([\d\-,]+) ", segment) - updated_segment = segment - if match: - alignment_index = match.group(1) - segment = segment.replace(alignment_index, "") - doc = self.update_doc(doc, len(alignment_index) + 1) - updated_segment = segment.strip() - curr_segment_ann = AlignmentAnnotation( - span=span(start=char_count, end=char_count + len(updated_segment)), - index=index, - alignment_index=parse_alignment_index(alignment_index), - ) - else: - curr_segment_ann = AlignmentAnnotation( - span=span(start=char_count, end=char_count + len(segment)), - index=index, - alignment_index=[], - ) - - self.temp_state["meaning_segment"]["anns"].append(curr_segment_ann) # type: ignore - return doc - - def add_sapche_ann(self, doc: Dict[str, Any], char_count: int): - """ - Extract and process sapche annotations (in Fuchsia/Pink color) from the provided docx file structure. - Args: - doc (Dict[str, Any]): The document structure containing styles and text. - char_count (int): The initial character count for span calculation. - Returns: - str: The updated segment text after processing annotations. - """ - inner_char_count = 0 - sapche_anns: List[SapcheAnnotation] = [] - for doc_style in doc["styles"]: - for idx in range(len(doc_style["texts"])): - if doc_style["styles"][idx].color.rgb == RGBColor(0xFF, 0x00, 0x00): - match = re.match(r"([\d\.]+)\s", doc_style["texts"][idx]) - if match: - # Extract sapche number and store the char length to update the previous ann spans - sapche_number = match.group(1) - doc_style["texts"][idx] = doc_style["texts"][idx].replace( - f"{sapche_number} ", "" - ) - self.temp_state["sapche"]["char_diff"] += len(sapche_number) # type: ignore - - start = char_count + inner_char_count - end = start + len(doc_style["texts"][idx]) - sapche_anns.append( - SapcheAnnotation( - span=span(start=start, end=end), - sapche_number=sapche_number, - ) - ) - # If the sapche number is not needed, use the following code in future - # else: - # start = char_count + inner_char_count - # end = start + len(doc_style["texts"][idx]) - # sapche_anns.append( - # { - # AnnotationType.sapche.value: { - # "start": start, - # "end": end, - # } - # } - # ) - inner_char_count += len(doc_style["texts"][idx]) - inner_char_count += 1 # for newline - - formatted_anns = self.merge_anns(sapche_anns, AnnotationType.SAPCHE) - self.temp_state["sapche"]["anns"].extend(formatted_anns) # type: ignore - updated_segment = "\n".join( - ["".join(doc_style["texts"]) for doc_style in doc["styles"]] - ) - return updated_segment - - @staticmethod - def merge_anns( - anns: List[SapcheAnnotation], ann_layer: AnnotationType - ) -> List[SapcheAnnotation]: - """ - Merge overlapping or consecutive sapche annotations. - Args: - annotations (List[Dict[str, Any]]): Eg: List of sapche annotations. - - Returns: - List[Dict[str, Any]]: Merged annotations. - """ - formatted_anns: List[SapcheAnnotation] = [] - last_ann: Optional[SapcheAnnotation] = None - for ann in anns: - if last_ann is None: - last_ann = ann - continue - if ann.span.start != last_ann.span.end: - formatted_anns.append(last_ann) - last_ann = ann - else: - last_ann[ann_layer.value].span.end = ann.span.end - - if last_ann: - formatted_anns.append(last_ann) - return formatted_anns - - def update_ann_spans(self): - """ - Update the spans of the meaning_segment and sapche annotations. - """ - if self.temp_state["meaning_segment"]["anns"]: - meaning_segment_ann = self.temp_state["meaning_segment"]["anns"][0] # type: ignore - meaning_segment_ann.span.end -= self.temp_state["sapche"]["char_diff"] - self.meaning_segment_anns.append(meaning_segment_ann) - - self.sapche_anns.extend(self.temp_state["sapche"]["anns"]) # type: ignore - - self.temp_state = { - "meaning_segment": {"anns": [], "char_diff": 0}, - "sapche": {"anns": [], "char_diff": 0}, - } - - def parse_commentary(self, input: Path): - """ - Input: a docx file - Process: - Parse and record the commentary annotations in self.meaning_segment_anns, - - Save the cleaned base text in self.base - """ - formatted_docs = self.prepare_doc(input) - - char_count = 0 - base_texts = [] - for index, doc in enumerate(formatted_docs): - segment = doc["text"] - if not segment: - continue - - doc = self.add_commentary_meaning_ann(doc, index, char_count) - updated_segment = self.add_sapche_ann(doc, char_count) - - self.update_ann_spans() - - base_texts.append(updated_segment) - char_count += len(updated_segment) - char_count += 2 # for two newlines - - self.base = "\n\n".join(base_texts) - - def create_pecha(self, output_path: Path) -> Pecha: - pecha = Pecha.create(output_path) - basename = pecha.set_base(self.base) - - # Add meaning_segment layer - meaning_segment_layer, _ = pecha.add_layer(basename, AnnotationType.ALIGNMENT) - for ann in self.meaning_segment_anns: - pecha.add_annotation(meaning_segment_layer, ann, AnnotationType.ALIGNMENT) - meaning_segment_layer.save() - - # Add sapche layer - sapche_layer, _ = pecha.add_layer(basename, AnnotationType.SAPCHE) - for ann in self.sapche_anns: - pecha.add_annotation(sapche_layer, ann, AnnotationType.SAPCHE) - sapche_layer.save() - - pecha.set_metadata( - { - "id": pecha.id, - "parser": self.name, - "initial_creation_type": InitialCreationType.google_docx, - **self.metadata, - } - ) - - return pecha diff --git a/src/openpecha/pecha/parsers/docx/commentary/simple.py b/src/openpecha/pecha/parsers/docx/commentary/simple.py deleted file mode 100644 index 5770c40a..00000000 --- a/src/openpecha/pecha/parsers/docx/commentary/simple.py +++ /dev/null @@ -1,117 +0,0 @@ -import re -from pathlib import Path -from typing import Any, Dict, List, Tuple - -from openpecha.config import PECHAS_PATH, get_logger -from openpecha.exceptions import FileNotFoundError -from openpecha.pecha import Pecha, annotation_path -from openpecha.pecha.annotations import ( - AlignmentAnnotation, - BaseAnnotation, - SegmentationAnnotation, - span, -) -from openpecha.pecha.layer import AnnotationType -from openpecha.pecha.parsers import DocxBaseParser -from openpecha.pecha.parsers.docx.utils import extract_numbered_list -from openpecha.utils import parse_alignment_index - -logger = get_logger(__name__) - - -class DocxSimpleCommentaryParser(DocxBaseParser): - def __init__(self): - self.root_alignment_index_regex = r"^([\d\-,]+)\s(.*)" - - def extract_segmentation_anns(self, numbered_text: Dict[str, str]): - anns = [] - base = "" - char_count = 0 - - for index, segment in numbered_text.items(): - anns.append( - SegmentationAnnotation( - span=span(start=char_count, end=char_count + len(segment)), - index=index, - ) - ) - base += f"{segment}\n" - char_count += len(segment) + 1 - - return (anns, base) - - def extract_alignment_anns(self, numbered_text: Dict[str, str]): - anns = [] - base = "" - char_count = 0 - - for index, segment in numbered_text.items(): - match = re.match(self.root_alignment_index_regex, segment) - - alignment_indices: str = match.group(1) if match else index - - segment = match.group(2) if match else segment - - anns.append( - AlignmentAnnotation( - span=span(start=char_count, end=char_count + len(segment)), - index=index, - alignment_index=parse_alignment_index(alignment_indices), - ) - ) - base += f"{segment}\n" - - char_count += len(segment) + 1 - - return (anns, base) - - def extract_anns( - self, docx_file: Path, annotation_type: AnnotationType - ) -> Tuple[List[BaseAnnotation], str]: - """ - Extract text from docx and calculate coordinates for segments. - """ - numbered_text = extract_numbered_list(docx_file) - - if annotation_type == AnnotationType.SEGMENTATION: - return self.extract_segmentation_anns(numbered_text) - - elif annotation_type == AnnotationType.ALIGNMENT: - return self.extract_alignment_anns(numbered_text) - - else: - raise NotImplementedError( - f"Annotation type {annotation_type} is not supported to extract segmentation." - ) - - def parse( - self, - input: str | Path, - annotation_type: AnnotationType, - metadata: Dict[str, Any], - output_path: Path = PECHAS_PATH, - pecha_id: str | None = None, - ) -> Tuple[Pecha, annotation_path]: - """ - Parse a docx file and create a pecha. - Steps: - 1. Extract text and calculate coordinates - 2. Extract segmentation annotations - 3. Initialize pecha with annotations and metadata - """ - input = Path(input) - if not input.exists(): - logger.error(f"The input docx file {str(input)} does not exist.") - raise FileNotFoundError( - f"[Error] The input docx file '{str(input)}' does not exist." - ) - - output_path.mkdir(parents=True, exist_ok=True) - - anns, base = self.extract_anns(input, annotation_type) - - pecha = self.create_pecha(base, output_path, metadata, pecha_id) - annotation_path = self.add_segmentation_layer(pecha, anns, annotation_type) - - logger.info(f"Pecha {pecha.id} is created successfully.") - return (pecha, annotation_path) diff --git a/src/openpecha/pecha/parsers/docx/footnote.py b/src/openpecha/pecha/parsers/docx/footnote.py deleted file mode 100644 index 3536d97c..00000000 --- a/src/openpecha/pecha/parsers/docx/footnote.py +++ /dev/null @@ -1,99 +0,0 @@ -import re -from pathlib import Path -from typing import Dict, List, Tuple - -from openpecha.config import get_logger -from openpecha.pecha import Pecha -from openpecha.pecha.annotations import BaseAnnotation, FootnoteAnnotation, span -from openpecha.pecha.layer import AnnotationType -from openpecha.pecha.parsers import update_coords -from openpecha.pecha.parsers.docx.utils import read_docx - -logger = get_logger(__name__) - - -class DocxFootnoteParser: - def __init__(self): - self.footnote_number = r"----footnote(\d+)----" - self.footnote_content = r"footnote(\d+)\)[\t\s]+(.+)" - - def get_footnote_contents(self, text: str) -> Tuple[str, Dict[int, str]]: - """ - Extract and remove footnote contents from text. - """ - matches = re.findall(self.footnote_content, text) - footnote_contents: Dict[int, str] = {} - - for match in matches: - footnote_number = int(match[0]) + 1 # footnote number starts from 0 - footnote_content = match[1] - footnote_contents[footnote_number] = footnote_content - - text = re.sub(self.footnote_content, "", text) - logger.info(f"Footnote content successfully extracted: {footnote_contents}") - return (text, footnote_contents) - - def get_footnote_spans( - self, text: str, footnote_contents: Dict[int, str] - ) -> Tuple[str, Dict[int, Tuple[int, int]]]: - matches = re.finditer(self.footnote_number, text) - footnote_spans: Dict[int, Tuple[int, int]] = {} - - offset = 0 - - for match in matches: - footnote_number = int(match.group(1)) + 1 # footnote number starts from 0 - if footnote_number in footnote_contents: - start_pos = match.start() - offset - footnote_spans[footnote_number] = (start_pos, start_pos) - - offset += match.end() - match.start() - - text = re.sub(self.footnote_number, "", text) - logger.info(f"Footnote spans successfully extracted: {footnote_spans}") - return (text, footnote_spans) - - def create_footnote_annotations( - self, - footnote_spans: Dict[int, Tuple[int, int]], - footnote_contents: Dict[int, str], - ) -> List[FootnoteAnnotation]: - return [ - FootnoteAnnotation( - index=footnote_number, - span=span(start=Span[0], end=Span[1]), - note=footnote_contents[footnote_number], - ) - for footnote_number, Span in footnote_spans.items() - ] - - def add_footnote_layer( - self, pecha: Pecha, anns: List[BaseAnnotation], ann_type: AnnotationType - ): - - basename = list(pecha.bases.keys())[0] - layer, layer_path = pecha.add_layer(basename, ann_type) - for ann in anns: - pecha.add_annotation(layer, ann, ann_type) - layer.save() - - return str(layer_path.relative_to(pecha.layer_path)) - - def parse(self, pecha: Pecha, input: str | Path) -> str: - logger.info(f"Parsing footnote annotation for {pecha.id}") - text = read_docx(docx_file=input, ignore_footnotes=False) - text, footnote_contents = self.get_footnote_contents(text) - text, footnote_spans = self.get_footnote_spans(text, footnote_contents) - - anns: List[FootnoteAnnotation] = self.create_footnote_annotations( - footnote_spans, footnote_contents - ) - - new_base = pecha.get_base(list(pecha.bases.keys())[0]) - anns = update_coords(anns, text, new_base) - - annotation_path: str = self.add_footnote_layer( - pecha, anns, AnnotationType.FOOTNOTE - ) - logger.info(f"Footnote annotation successfully added to Pecha {pecha.id}") - return annotation_path diff --git a/src/openpecha/pecha/parsers/docx/root/__init__.py b/src/openpecha/pecha/parsers/docx/root/__init__.py deleted file mode 100644 index 3aa017ce..00000000 --- a/src/openpecha/pecha/parsers/docx/root/__init__.py +++ /dev/null @@ -1,116 +0,0 @@ -import re -from pathlib import Path -from typing import Dict, List, Tuple - -from openpecha.config import PECHAS_PATH, get_logger -from openpecha.exceptions import FileNotFoundError -from openpecha.pecha import Pecha, annotation_path -from openpecha.pecha.annotations import ( - AlignmentAnnotation, - BaseAnnotation, - SegmentationAnnotation, - span, -) -from openpecha.pecha.layer import AnnotationType -from openpecha.pecha.metadata import InitialCreationType, PechaMetaData -from openpecha.pecha.parsers import DocxBaseParser -from openpecha.pecha.parsers.docx.utils import extract_numbered_list - -logger = get_logger(__name__) - - -class DocxRootParser(DocxBaseParser): - def extract_segmentation_anns( - self, numbered_text: Dict[str, str] - ) -> Tuple[List[SegmentationAnnotation], str]: - """ - Extract text from docx and calculate coordinates for segments. - """ - anns = [] - base = "" - char_count = 0 - - for index, segment in numbered_text.items(): - anns.append( - SegmentationAnnotation( - span=span(start=char_count, end=char_count + len(segment)), - index=index, - ) - ) - base += f"{segment}\n" - char_count += len(segment) + 1 - - return (anns, base) - - def extract_alignment_anns(self, numbered_text: Dict[str, str]): - """ - Extract text from docx and calculate coordinates for segments. - """ - anns = [] - base = "" - char_count = 0 - - for index, segment in numbered_text.items(): - anns.append( - AlignmentAnnotation( - span=span(start=char_count, end=char_count + len(segment)), - index=index, - alignment_index=[int(index)], - ) - ) - base += f"{segment}\n" - char_count += len(segment) + 1 - - return (anns, base) - - def extract_anns( - self, docx_file: Path, annotation_type: AnnotationType - ) -> Tuple[List[BaseAnnotation], str]: - """ - Extract text from docx and calculate coordinates for segments. - """ - numbered_text = extract_numbered_list(docx_file) - - if annotation_type == AnnotationType.SEGMENTATION: - return self.extract_segmentation_anns(numbered_text) - - elif annotation_type == AnnotationType.ALIGNMENT: - return self.extract_alignment_anns(numbered_text) - - else: - raise NotImplementedError( - f"Annotation type {annotation_type} is not supported to extract segmentation." - ) - - def parse( - self, - input: str | Path, - annotation_type: AnnotationType, - metadata: Dict, - output_path: Path = PECHAS_PATH, - pecha_id: str | None = None, - ) -> Tuple[Pecha, annotation_path]: - """ - Parse a docx file and create a pecha. - Steps: - 1. Extract text and calculate coordinates - 2. Extract segmentation annotations - 3. Initialize pecha with annotations and metadata - """ - input = Path(input) - if not input.exists(): - logger.error(f"The input docx file {str(input)} does not exist.") - raise FileNotFoundError( - f"[Error] The input docx file '{str(input)}' does not exist." - ) - - output_path.mkdir(parents=True, exist_ok=True) - - # anns, base = self.extract_segmentation_anns(input, annotation_type) - anns, base = self.extract_anns(input, annotation_type) - - pecha = self.create_pecha(base, output_path, metadata, pecha_id) - annotation_path = self.add_segmentation_layer(pecha, anns, annotation_type) - - logger.info(f"Pecha {pecha.id} is created successfully.") - return (pecha, annotation_path) diff --git a/src/openpecha/pecha/parsers/docx/update.py b/src/openpecha/pecha/parsers/docx/update.py deleted file mode 100644 index 737d7150..00000000 --- a/src/openpecha/pecha/parsers/docx/update.py +++ /dev/null @@ -1,41 +0,0 @@ -from pathlib import Path -from typing import Any, List -from unittest.mock import patch - -from openpecha.pecha import Pecha -from openpecha.pecha.layer import AnnotationType -from openpecha.pecha.parsers.docx.annotation import DocxAnnotationParser - - -class DocxAnnotationUpdate: - def __init__(self): - self.parser = DocxAnnotationParser() - - def extract_layer_name(self, layer_path: str) -> str: - return Path(layer_path).stem - - def extract_layer_id(self, layer_path: str) -> str: - layer_name = self.extract_layer_name(layer_path) - return layer_name.split("-")[-1] - - def extract_layer_enum(self, layer_path: str) -> AnnotationType: - layer_name = self.extract_layer_name(layer_path) - return AnnotationType(layer_name.split("-")[0]) - - def update_annotation( - self, - pecha: Pecha, - annotation_path: str, - docx_file: Path, - metadatas: List[Any], - ) -> Pecha: - type = self.extract_layer_enum(annotation_path) - layer_id = self.extract_layer_id(annotation_path) - - with patch("openpecha.pecha.get_layer_id") as mock_layer_id: - mock_layer_id.return_value = layer_id - updated_pecha, _ = self.parser.add_annotation( - pecha, type, docx_file, metadatas - ) - - return updated_pecha diff --git a/src/openpecha/pecha/parsers/docx/utils.py b/src/openpecha/pecha/parsers/docx/utils.py deleted file mode 100644 index 64ff4438..00000000 --- a/src/openpecha/pecha/parsers/docx/utils.py +++ /dev/null @@ -1,101 +0,0 @@ -import re -from pathlib import Path -from typing import Dict - -from docx2python import docx2python - -from openpecha.config import get_logger -from openpecha.exceptions import EmptyFileError - -logger = get_logger(__name__) - - -def normalize_whitespaces(text: str): - """ - If there are spaces or tab between newlines, it will be removed. - """ - return re.sub(r"\n[\s\t]+\n", "\n\n", text) - - -def normalize_newlines(text: str): - """ - If there are more than 2 newlines continuously, it will replace it with 2 newlines. - """ - return re.sub(r"\n{3,}", "\n\n", text) - - -def normalize_text(text: str): - text = normalize_whitespaces(text) - text = normalize_newlines(text) - text = text.strip() - return text - - -def read_docx(docx_file: str | Path, ignore_footnotes: bool = True) -> str: - """ - Read docx file as text. - """ - text = docx2python(docx_file).text - if not text: - logger.warning( - f"The docx file {str(docx_file)} is empty or contains only whitespace." - ) - raise EmptyFileError( - f"[Error] The document '{str(docx_file)}' is empty or contains only whitespace." - ) - - text = normalize_text(text) - if ignore_footnotes: - text = remove_footnote(text) - - logger.info(f"Text extracted from docx file: {text}") - return text - - -def remove_footnote(text: str) -> str: - """ - Input: text extracted from docx file - Output: text without footnote - """ - - # Remove footnote numbers - text = re.sub(r"----footnote\d+----", "", text) - - # Remove footnote content - parts = text.split("\n\n") - res = [] - for part in parts: - # Use regex to check if part starts with 'footnote' followed by digits - if not re.match(r"^footnote\d+\)", part.strip()): - res.append(part) - text = "\n\n".join(res) - return text - - -def extract_numbered_list(docx_file: str | Path) -> Dict[str, str]: - """ - Extract number list from the docx file. - - Example Output:> - { - '1': 'དབུ་མ་དགོངས་པ་རབ་གསལ་ལེའུ་དྲུག་པ་བདེན་གཉིས་སོ་སོའི་ངོ་བོ་བཤད་པ།། ', - '2': 'གསུམ་པ་ལ་གཉིས། ཀུན་རྫོབ་ཀྱི་བདེན་པ་བཤད་པ་དང་། ', - '3': 'དེས་གང་ལ་སྒྲིབ་ན་ཡང་དག་ཀུན་རྫོབ་འདོད་ཅེས་པས་ཡང་དག་པའི་དོན་ལ་སྒྲིབ་པས་ཀུན་རྫོབ་བམ་སྒྲིབ་བྱེད་དུ་འདོད་ཅེས་པ་སྟེ། །', - ... - } - """ - text = read_docx(docx_file) - - number_list_regex = r"^(\d+)\)\t(.*)" - - res: Dict[str, str] = {} - for para_text in text.split("\n\n"): - match = re.match(number_list_regex, para_text) - if match: - number = match.group(1) - text = match.group(2) - res[number] = text - - logger.info(f"Numbered List extracted from the docx file: {res}") - - return res diff --git a/src/openpecha/pecha/parsers/pedurma.py b/src/openpecha/pecha/parsers/pedurma.py deleted file mode 100644 index fbea5538..00000000 --- a/src/openpecha/pecha/parsers/pedurma.py +++ /dev/null @@ -1,214 +0,0 @@ -import re -from pathlib import Path -from typing import Dict, List - -from botok.tokenizers.chunktokenizer import ChunkTokenizer - -from openpecha.config import PECHAS_PATH -from openpecha.pecha import Pecha -from openpecha.pecha.annotations import PedurmaAnnotation, SegmentationAnnotation, span -from openpecha.pecha.layer import AnnotationType -from openpecha.pecha.parsers import BaseParser -from openpecha.utils import read_json - - -class PedurmaParser(BaseParser): - def __init__(self): - self.ann_regex = r"(\(\d+\) <.+?>)" - self.pagination_regex = r"\d+-\d+" - self.base_text = "" - self.pedurma_anns = [] - self.meaning_segment_anns = [] - - def get_base_text(self, text: str): - text = re.sub(self.ann_regex, "", text) - text = text.replace(":", "") - return text - - def parse( - self, - input: str, - metadata: Dict | Path, - output_path: Path = PECHAS_PATH, - ) -> Pecha: - - # Remove pagination - input = re.sub(self.pagination_regex, "", input) - self.base_text = self.get_base_text(input) - - # Normalize newlines with # - input = input.replace("\n", "#") - char_walker = 0 - # Split the text into chunks with anns regex - chunks = re.split(self.ann_regex, input) - prev_chunk = chunks[0] - self.pedurma_anns = [] - for chunk in chunks: - if re.search(self.ann_regex, chunk): - ann = get_annotation(prev_chunk, chunk, char_walker) - self.pedurma_anns.append(ann) - else: - clean_chunk = chunk.replace(":", "") - char_walker += len(clean_chunk) - prev_chunk = chunk - - input = input.replace("#", "\n") - - # Segment Annotation - char_walker = 0 - self.meaning_segment_anns = [] - for index, line in enumerate(self.base_text.splitlines()): - segment_ann = SegmentationAnnotation( - span=span(start=char_walker, end=char_walker + len(line)), index=index - ) - self.meaning_segment_anns.append(segment_ann) - char_walker += len(line) - char_walker += 1 # Add because of new line - - # Create a pecha - pecha = Pecha.create(output_path) - - basename = pecha.set_base(self.base_text) - - # Add Durchen Layer - durchen_layer, _ = pecha.add_layer(basename, AnnotationType.DURCHEN) - for ann in self.pedurma_anns: - pecha.add_annotation(durchen_layer, ann, AnnotationType.DURCHEN) - - durchen_layer.save() - - # Add Segment Layer - segment_layer, _ = pecha.add_layer(basename, AnnotationType.SEGMENTATION) - for ann in self.meaning_segment_anns: - pecha.add_annotation(segment_layer, ann, AnnotationType.SEGMENTATION) - - segment_layer.save() - # Set metadata - if isinstance(metadata, Path): - metadata = read_json(metadata) - - assert isinstance(metadata, dict) - metadata = modify_metadata(metadata) - pecha.set_metadata({"id": pecha.id, "parser": self.name, **metadata}) # noqa - - return pecha - - -def modify_metadata(metadata: Dict) -> Dict: - modified_metadata = { - k: v - for k, v in metadata.items() - if k not in ["title_bo", "alt_title_bo", "author_en", "author_bo"] - } - modified_metadata["title"] = { - "title_bo": metadata["title_bo"], - "alt_title_bo": metadata["alt_title_bo"], - } - modified_metadata["author"] = { - "author_en": metadata["author_en"], - "author_bo": metadata["author_bo"], - } - return modified_metadata - - -def get_annotation( - prev_chunk: str, note_chunk: str, char_walker: int -) -> PedurmaAnnotation: - span_text = get_span_text(prev_chunk, note_chunk) - start = char_walker - len(span_text) - end = char_walker - - return PedurmaAnnotation(span=span(start=start, end=end), note=note_chunk) - - -def get_span_text(prev_chunk: str, note_chunk: str): - """ - Input: text chunk - Process: Extract span text where the syllable/words have variations - if ':' is present, extract the text after ':' - else extract the last syllable - Output: span text - - Example: - Input: །དེ་བཞིན་ཉོན་མོངས་རྣམ་ Output: རྣམ་ - Input: ཆོས་ཀྱི་དབྱིངས་སུ་བསྟོད་པ། :འཕགས་པ་འཇམ་ Output: འཕགས་པ་འཇམ་ - """ - span_text = "" - if "+" in note_chunk: - return span_text - if ":" in prev_chunk: - match = re.search(":.*", prev_chunk) - if match: - span_text = match.group( - 0 - ) # Use group(0) to safely access the matched string - else: - syls = get_syls(prev_chunk) - if syls: - span_text = syls[-1] - if span_text == "#": - span_text = syls[-2] - span_text = span_text.replace("#", "\n") - span_text = span_text.replace(":", "") - return span_text - - -def get_syls(text: str): - """ - Split the text into syllables - """ - tokenizer = ChunkTokenizer(text) - - tokens = tokenizer.tokenize() - syls: List = [] - syl_walker = 0 - for token in tokens: - token_string = token[1] - if is_shad(token_string): - try: - syls[syl_walker - 1] += token_string - except: # noqa - syls.append(token_string) - syl_walker += 1 - else: - syls.append(token_string) - syl_walker += 1 - return syls - - -def is_shad(text): - shads = ["། །", "།", "།།", "། "] - if text in shads: - return True - return False - - -""" -PEDURMA PREPROCESSING -""" - - -def filter_pedurma_ann(text: str): - # remove <«སྣར་»«པེ་»བསྒྲུབ་པའི་> from text - text = re.sub(r"<[\u0F00-\u0FFF\s«»]+>", "", text) - # remove numbering (28) from text - text = re.sub(r"\(\d+\)", "", text) - - return text - - -def split_by_shad(text: str): - return text.split("།") - - -def preprocess_pedurma_text(text: str): - from bo_sent_tokenizer import segment - from fast_antx.core import transfer - - filtered_text = filter_pedurma_ann(text) - tokenized_text = segment(filtered_text, keep_non_bo_and_symbols=True) - - annotations = [["note_transfer", r"(\(\d+\) <[\u0F00-\u0FFF\s«»]+>)"]] - preprocessed_text = transfer(text, annotations, tokenized_text, output="txt") - - return preprocessed_text diff --git a/src/openpecha/pecha/pecha_types.py b/src/openpecha/pecha/pecha_types.py deleted file mode 100644 index 2dbc12d0..00000000 --- a/src/openpecha/pecha/pecha_types.py +++ /dev/null @@ -1,124 +0,0 @@ -from enum import Enum -from typing import Any, Dict, List - -from openpecha.config import get_logger -from openpecha.pecha import Pecha -from openpecha.pecha.annotations import AnnotationModel - -logger = get_logger(__name__) - - -class PechaType(Enum): - """ - Pecha Type for Serializer to determine the type of Pecha. - """ - - root_pecha = "root_pecha" - root_translation_pecha = "root_translation_pecha" - - commentary_pecha = "commentary_pecha" - commentary_translation_pecha = "commentary_translation_pecha" - - prealigned_root_translation_pecha = "prealigned_root_translation_pecha" - - prealigned_commentary_pecha = "prealigned_commentary_pecha" - prealigned_commentary_translation_pecha = "prealigned_commentary_translation_pecha" - - -def get_aligned_id(ann_models: List[AnnotationModel], annotation_path: str): - """ - Get the alignment id from List of AnnotationModel - """ - for ann_model in ann_models: - if annotation_path == ann_model.path: - aligned_to = ann_model.aligned_to - if aligned_to and aligned_to.alignment_id: - return aligned_to.alignment_id - return None - - -def get_pecha_type( - pechas: List[Pecha], - metadatas: List[Any], - annotations: Dict[str, List[AnnotationModel]], - annotation_path: str, -) -> PechaType: - is_commentary = is_commentary_pecha(metadatas) - is_translation = is_translation_pecha(metadatas) - - if is_commentary: - if is_translation: - if has_version_of(pechas, annotations, annotation_path): - return PechaType.prealigned_commentary_translation_pecha - return PechaType.commentary_translation_pecha - if has_version_of(pechas, annotations, annotation_path): - return PechaType.prealigned_commentary_pecha - - return PechaType.commentary_pecha - else: - if is_translation: - if has_version_of(pechas, annotations, annotation_path): - return PechaType.prealigned_root_translation_pecha - return PechaType.root_translation_pecha - return PechaType.root_pecha - - -def is_commentary_pecha(metadatas: List[Any]) -> bool: - """ - Pecha can be i) Root Pecha ii) Commentary Pecha - Output: True if Commentary Pecha, False otherwise - """ - for metadata in metadatas: - if metadata.type == "commentary": - return True - return False - - -def is_translation_pecha(metadatas: List[Any]) -> bool: - """ - Return - True if i) Translation of Root Pecha ii) Translation of Commentary Pecha - False otherwise - """ - if metadatas[0].type == "translation": - return True - return False - - -def has_version_of( - pechas: List[Pecha], - annotations: Dict[str, List[AnnotationModel]], - annotation_path: str, -) -> bool: - """ - Return - True: If the pecha points to an alignment annotation layer of Root Pecha - False: otherwise - """ - root_pecha = pechas[-1] - parent_pecha = pechas[-2] - - logger.info(f"Annotations: {annotations}") - logger.info(f"Root Pecha Annotations: {annotations[root_pecha.id]}") - logger.info(f"Commentary Pecha Annotations: {annotations[parent_pecha.id]}") - - if len(annotations.keys()) == 3: - annotation_path = get_aligned_id(annotations[pechas[0].id], annotation_path) - - associated_root_alignment_id = get_aligned_id( - annotations[parent_pecha.id], annotation_path - ) - - if associated_root_alignment_id.split("/")[1].startswith("alignment"): - return True - return False - - -def is_root_related_pecha(metadatas: List[Any]) -> bool: - """ - Returns True if the pecha type is root-related. - """ - for metadata in metadatas: - if metadata.type == "commentary": - return False - return True diff --git a/src/openpecha/utils.py b/src/openpecha/utils.py index e87cbf81..7084992e 100644 --- a/src/openpecha/utils.py +++ b/src/openpecha/utils.py @@ -1,12 +1,10 @@ import csv import json -import math import os from contextlib import contextmanager from pathlib import Path from typing import Dict, List -from openpecha.config import NO_OF_CHAPTER_SEGMENT from openpecha.exceptions import FileNotFoundError @@ -24,94 +22,6 @@ def cwd(path): os.chdir(prev_cwd) -def get_text_direction_with_lang(lang): - # Left-to-Right (LTR) languages - ltr_languages = [ - "bo", # Tibetan - "dz", # Dzongkha - "en", # English - "es", # Spanish - "fr", # French - "hi", # Hindi - "ja", # Japanese - "ko", # Korean - "mn", # Mongolian - "mr", # Marathi - "ms", # Malay - "ne", # Nepali - "pt", # Portuguese - "ru", # Russian - "sw", # Swahili - "th", # Thai - "vi", # Vietnamese - "zh", # Chinese (both Simplified and Traditional) - ] - - # Right-to-Left (RTL) languages - rtl_languages = ["ar", "he"] # Arabic # Hebrew - - if lang in ltr_languages: - return "ltr" - elif lang in rtl_languages: - return "rtl" - else: - # Default to LTR if language is unknown - return "ltr" - - -def parse_alignment_index(root_mapping) -> List[int]: - """ - Parse the root_mapping into List of Integers. - Examples:> - Input: 1 Output: [1] - Input: 1,2,3,4,5 Output: [1,2,3,4,5] - Input: 1-3 Output: [1,2,3] - Input: 1-3,5-7 Output: [1,2,3,5,6,7] - """ - root_mapping = root_mapping.replace(" ", "").strip() - root_mapping_list = [] - for mapping in root_mapping.split(","): - if "-" in mapping: - start, end = mapping.split("-") - root_mapping_list.extend(list(range(int(start), int(end) + 1))) - else: - root_mapping_list.append(int(mapping)) - return root_mapping_list - - -def chunk_strings(strings: List[str], chunk_size=NO_OF_CHAPTER_SEGMENT): - """ - Splits a list of strings into smaller lists of at most chunk_size elements each. - - Args: - strings (list of str): The list of strings to be chunked. - chunk_size (int): The maximum size of each chunk. - - Returns: - list of list of str: A list of lists, where each inner list contains up to chunk_size elements. - """ - return [strings[i : i + chunk_size] for i in range(0, len(strings), chunk_size)] - - -def get_chapter_for_segment( - segment_num: int, no_of_chapter_segment: int = NO_OF_CHAPTER_SEGMENT -) -> int: - """ - For commentary pecha, get the chapter number from the segment number(root mapping). - """ - return math.ceil(segment_num / no_of_chapter_segment) - - -def adjust_segment_num_for_chapter( - segment_num: int, no_of_chapter_segment: int = NO_OF_CHAPTER_SEGMENT -) -> int: - return ( - segment_num % no_of_chapter_segment - if segment_num % no_of_chapter_segment != 0 - else no_of_chapter_segment - ) - - def read_csv(file_path) -> List[List[str]]: with open(file_path, newline="", encoding="utf-8") as file: reader = csv.reader(file) diff --git a/tests/alignment/translation_transfer/data/expected_serialized_translation_with_display.json b/tests/alignment/translation_transfer/data/expected_serialized_translation_with_display.json deleted file mode 100644 index a55ce87e..00000000 --- a/tests/alignment/translation_transfer/data/expected_serialized_translation_with_display.json +++ /dev/null @@ -1,12 +0,0 @@ -[ - "The Heart of the Perfection of Wisdom of the Blessed Mother.", - "In Sanskrit: Bhagavatī Prajñāpāramitā Hṛdaya.", - "In Tibetan: The Heart of the Perfection of Wisdom of the Blessed Mother.", - "Thus have I heard at one time.", - "At one time, I heard these words.\nThe Blessed One was dwelling at Vulture Peak Mountain in Rājagṛha, \ntogether with a great assembly of monks and a great assembly of bodhisattvas.", - "At that time, the Blessed One entered the meditative absorption on the teaching called “Profound Illumination.”", - "At that time also, the bodhisattva-mahāsattva noble Avalokiteśvara looked upon the practice of the profound perfection of wisdom.", - "He saw that the five aggregates are empty of inherent existence.", - "Then, through the power of the Buddha, the venerable Śāriputra spoke these words to the bodhisattva-mahāsattva noble Avalokiteśvara", - "How should any son of noble family train who wishes to practice the profound perfection of wisdom?" -] \ No newline at end of file diff --git a/tests/alignment/translation_transfer/data/root/I2B2E5268/base/44AC.txt b/tests/alignment/translation_transfer/data/root/I2B2E5268/base/44AC.txt deleted file mode 100644 index f9213f3d..00000000 --- a/tests/alignment/translation_transfer/data/root/I2B2E5268/base/44AC.txt +++ /dev/null @@ -1,10 +0,0 @@ -བཅོམ་ལྡན་འདས་མ་ཤེས་རབ་ཀྱི་ཕ་རོལ་ཏུ་ཕྱིན་པའི་སྙིང་པོ། ། -༄༅། །​རྒྱ་གར་སྐད་དུ། བྷ་ག་བ་ཏི་པྲ་ཛྙ་པ་ར་མི་ཏཱྀ་ཧྲད་ཡ། -བོད་སྐད་དུ། བཅོམ་ལྡན་འདས་མ་ཤེས་རབ་ཀྱི་ཕ་རོལ་ཏུ་ཕྱིན་པའི་སྙིང་པོ། -བམ་པོ་གཅིག་གོ །​ -​འདི་སྐད་བདག་གིས་ཐོས་པ་དུས་གཅིག་ན། བཅོམ་ལྡན་འདས་རྒྱལ་པོའི་ཁབ་བྱ་རྒོད་ཕུང་པོའི་རི་ལ་དགེ་སློང་གི་དགེ་འདུན་ཆེན་པོ་དང་། ​བྱང་ཆུབ་སེམས་དཔའི་དགེ་འདུན་ཆེན་པོ་དང་ཐབས་ཅིག་ཏུ་བཞུགས་ཏེ། -དེའི་ཚེ་བཅོམ་ལྡན་འདས་ཟབ་མོ་སྣང་བ་ཞེས་བྱ་བའི་ཆོས་ཀྱི་རྣམ་གྲངས་ཀྱི་ཏིང་ངེ་འཛིན་ལ་སྙོམས་པར་ཞུགས་སོ། ། -​ཡང་དེའི་ཚེ་བྱང་ཆུབ་སེམས་དཔའ་སེམས་དཔའ་ཆེན་པོ་འཕགས་པ་སྤྱན་རས་གཟིགས་དབང་ཕྱུག་ཤེས་རབ་ཀྱི་ཕ་རོལ་ཏུ་ཟབ་མོའི་སྤྱོད་པ་ཉིད་ལ་རྣམ་པར་བལྟ་ཞིང་། -ཕུང་པོ་ལྔ་པོ་དེ་དག་ལ་ཡང་རང་བཞིན་གྱིས་སྟོང་པར་རྣམ་པར་བལྟའོ། ། -​དེ་ནས་སངས་རྒྱས་ཀྱི་མཐུས། ​ཚེ་དང་ལྡན་པ་ཤཱ་རིའི་བུས་བྱང་ཆུབ་སེམས་དཔའ་ཆེན་པོ་འཕགས་པ་སྤྱན་རས་གཟིགས་དབང་ཕྱུག་ལ་འདི་སྐད་ཅེས་སྨྲས་སོ། །​ -རིགས་ཀྱི་བུ་གང་ལ་ལ་ཤེས་རབ་ཀྱི་ཕ་རོལ་ཏུ་ཕྱིན་པ་ཟབ་མོའི་སྤྱོད་པ་སྤྱད་པར་འདོད་པ་དེས་ཇི་ལྟར་བསླབ་པར་བྱ། diff --git a/tests/alignment/translation_transfer/data/root/I2B2E5268/layers/44AC/alignment-148C.json b/tests/alignment/translation_transfer/data/root/I2B2E5268/layers/44AC/alignment-148C.json deleted file mode 100644 index 357b9772..00000000 --- a/tests/alignment/translation_transfer/data/root/I2B2E5268/layers/44AC/alignment-148C.json +++ /dev/null @@ -1,693 +0,0 @@ -{ - "@type": "AnnotationStore", - "@id": "I2B2E5268", - "resources": [ - { - "@type": "TextResource", - "@id": "44AC", - "@include": "../../base/44AC.txt" - } - ], - "annotationsets": [ - { - "@type": "AnnotationDataSet", - "@id": "segmentation_annotation", - "keys": [ - { - "@type": "DataKey", - "@id": "index" - }, - { - "@type": "DataKey", - "@id": "alignment_index" - }, - { - "@type": "DataKey", - "@id": "segmentation_type" - } - ], - "data": [ - { - "@type": "AnnotationData", - "@id": "018EDEF38F", - "key": "index", - "value": { - "@type": "Int", - "value": 1 - } - }, - { - "@type": "AnnotationData", - "@id": "4804515E81", - "key": "alignment_index", - "value": { - "@type": "List", - "value": [ - { - "@type": "Int", - "value": 1 - } - ] - } - }, - { - "@type": "AnnotationData", - "@id": "D72CCD4474", - "key": "segmentation_type", - "value": { - "@type": "String", - "value": "alignment" - } - }, - { - "@type": "AnnotationData", - "@id": "A0BF615B0D", - "key": "index", - "value": { - "@type": "Int", - "value": 2 - } - }, - { - "@type": "AnnotationData", - "@id": "E796670CC9", - "key": "alignment_index", - "value": { - "@type": "List", - "value": [ - { - "@type": "Int", - "value": 2 - } - ] - } - }, - { - "@type": "AnnotationData", - "@id": "ACBE155145", - "key": "index", - "value": { - "@type": "Int", - "value": 3 - } - }, - { - "@type": "AnnotationData", - "@id": "A507801DC1", - "key": "alignment_index", - "value": { - "@type": "List", - "value": [ - { - "@type": "Int", - "value": 3 - } - ] - } - }, - { - "@type": "AnnotationData", - "@id": "11990004FF", - "key": "index", - "value": { - "@type": "Int", - "value": 4 - } - }, - { - "@type": "AnnotationData", - "@id": "447471AF64", - "key": "alignment_index", - "value": { - "@type": "List", - "value": [ - { - "@type": "Int", - "value": 4 - } - ] - } - }, - { - "@type": "AnnotationData", - "@id": "5F6B9512FE", - "key": "index", - "value": { - "@type": "Int", - "value": 5 - } - }, - { - "@type": "AnnotationData", - "@id": "DBD7ADDC7F", - "key": "alignment_index", - "value": { - "@type": "List", - "value": [ - { - "@type": "Int", - "value": 5 - } - ] - } - }, - { - "@type": "AnnotationData", - "@id": "E5E688FE4F", - "key": "index", - "value": { - "@type": "Int", - "value": 6 - } - }, - { - "@type": "AnnotationData", - "@id": "7B557B7BC7", - "key": "alignment_index", - "value": { - "@type": "List", - "value": [ - { - "@type": "Int", - "value": 6 - } - ] - } - }, - { - "@type": "AnnotationData", - "@id": "6FC26B22F0", - "key": "index", - "value": { - "@type": "Int", - "value": 7 - } - }, - { - "@type": "AnnotationData", - "@id": "F1938BBADB", - "key": "alignment_index", - "value": { - "@type": "List", - "value": [ - { - "@type": "Int", - "value": 7 - } - ] - } - }, - { - "@type": "AnnotationData", - "@id": "05864D1680", - "key": "index", - "value": { - "@type": "Int", - "value": 8 - } - }, - { - "@type": "AnnotationData", - "@id": "A22A085AF0", - "key": "alignment_index", - "value": { - "@type": "List", - "value": [ - { - "@type": "Int", - "value": 8 - } - ] - } - }, - { - "@type": "AnnotationData", - "@id": "BA428DDF4F", - "key": "index", - "value": { - "@type": "Int", - "value": 9 - } - }, - { - "@type": "AnnotationData", - "@id": "C6DC9CF664", - "key": "alignment_index", - "value": { - "@type": "List", - "value": [ - { - "@type": "Int", - "value": 9 - } - ] - } - }, - { - "@type": "AnnotationData", - "@id": "86C4550875", - "key": "index", - "value": { - "@type": "Int", - "value": 10 - } - }, - { - "@type": "AnnotationData", - "@id": "8E8088E0E1", - "key": "alignment_index", - "value": { - "@type": "List", - "value": [ - { - "@type": "Int", - "value": 10 - } - ] - } - }, - { - "@type": "AnnotationData", - "@id": "ECDBC6C8BA", - "key": "index", - "value": { - "@type": "Int", - "value": 11 - } - }, - { - "@type": "AnnotationData", - "@id": "808560B64D", - "key": "alignment_index", - "value": { - "@type": "List", - "value": [ - { - "@type": "Int", - "value": 11 - } - ] - } - } - ] - } - ], - "annotations": [ - { - "@type": "Annotation", - "@id": "4CF3C46A76", - "target": { - "@type": "TextSelector", - "resource": "44AC", - "offset": { - "@type": "Offset", - "begin": { - "@type": "BeginAlignedCursor", - "value": 0 - }, - "end": { - "@type": "BeginAlignedCursor", - "value": 54 - } - } - }, - "data": [ - { - "@type": "AnnotationData", - "@id": "018EDEF38F", - "set": "segmentation_annotation" - }, - { - "@type": "AnnotationData", - "@id": "4804515E81", - "set": "segmentation_annotation" - }, - { - "@type": "AnnotationData", - "@id": "D72CCD4474", - "set": "segmentation_annotation" - } - ] - }, - { - "@type": "Annotation", - "@id": "20D24729C9", - "target": { - "@type": "TextSelector", - "resource": "44AC", - "offset": { - "@type": "Offset", - "begin": { - "@type": "BeginAlignedCursor", - "value": 55 - }, - "end": { - "@type": "BeginAlignedCursor", - "value": 109 - } - } - }, - "data": [ - { - "@type": "AnnotationData", - "@id": "A0BF615B0D", - "set": "segmentation_annotation" - }, - { - "@type": "AnnotationData", - "@id": "E796670CC9", - "set": "segmentation_annotation" - }, - { - "@type": "AnnotationData", - "@id": "D72CCD4474", - "set": "segmentation_annotation" - } - ] - }, - { - "@type": "Annotation", - "@id": "37E443680E", - "target": { - "@type": "TextSelector", - "resource": "44AC", - "offset": { - "@type": "Offset", - "begin": { - "@type": "BeginAlignedCursor", - "value": 110 - }, - "end": { - "@type": "BeginAlignedCursor", - "value": 174 - } - } - }, - "data": [ - { - "@type": "AnnotationData", - "@id": "ACBE155145", - "set": "segmentation_annotation" - }, - { - "@type": "AnnotationData", - "@id": "A507801DC1", - "set": "segmentation_annotation" - }, - { - "@type": "AnnotationData", - "@id": "D72CCD4474", - "set": "segmentation_annotation" - } - ] - }, - { - "@type": "Annotation", - "@id": "71339F132F", - "target": { - "@type": "TextSelector", - "resource": "44AC", - "offset": { - "@type": "Offset", - "begin": { - "@type": "BeginAlignedCursor", - "value": 175 - }, - "end": { - "@type": "BeginAlignedCursor", - "value": 191 - } - } - }, - "data": [ - { - "@type": "AnnotationData", - "@id": "11990004FF", - "set": "segmentation_annotation" - }, - { - "@type": "AnnotationData", - "@id": "447471AF64", - "set": "segmentation_annotation" - }, - { - "@type": "AnnotationData", - "@id": "D72CCD4474", - "set": "segmentation_annotation" - } - ] - }, - { - "@type": "Annotation", - "@id": "0B6BA5B099", - "target": { - "@type": "TextSelector", - "resource": "44AC", - "offset": { - "@type": "Offset", - "begin": { - "@type": "BeginAlignedCursor", - "value": 193 - }, - "end": { - "@type": "BeginAlignedCursor", - "value": 226 - } - } - }, - "data": [ - { - "@type": "AnnotationData", - "@id": "5F6B9512FE", - "set": "segmentation_annotation" - }, - { - "@type": "AnnotationData", - "@id": "DBD7ADDC7F", - "set": "segmentation_annotation" - }, - { - "@type": "AnnotationData", - "@id": "D72CCD4474", - "set": "segmentation_annotation" - } - ] - }, - { - "@type": "Annotation", - "@id": "27712A888F", - "target": { - "@type": "TextSelector", - "resource": "44AC", - "offset": { - "@type": "Offset", - "begin": { - "@type": "BeginAlignedCursor", - "value": 227 - }, - "end": { - "@type": "BeginAlignedCursor", - "value": 366 - } - } - }, - "data": [ - { - "@type": "AnnotationData", - "@id": "E5E688FE4F", - "set": "segmentation_annotation" - }, - { - "@type": "AnnotationData", - "@id": "7B557B7BC7", - "set": "segmentation_annotation" - }, - { - "@type": "AnnotationData", - "@id": "D72CCD4474", - "set": "segmentation_annotation" - } - ] - }, - { - "@type": "Annotation", - "@id": "D614670030", - "target": { - "@type": "TextSelector", - "resource": "44AC", - "offset": { - "@type": "Offset", - "begin": { - "@type": "BeginAlignedCursor", - "value": 367 - }, - "end": { - "@type": "BeginAlignedCursor", - "value": 465 - } - } - }, - "data": [ - { - "@type": "AnnotationData", - "@id": "6FC26B22F0", - "set": "segmentation_annotation" - }, - { - "@type": "AnnotationData", - "@id": "F1938BBADB", - "set": "segmentation_annotation" - }, - { - "@type": "AnnotationData", - "@id": "D72CCD4474", - "set": "segmentation_annotation" - } - ] - }, - { - "@type": "Annotation", - "@id": "44DAAFD6DB", - "target": { - "@type": "TextSelector", - "resource": "44AC", - "offset": { - "@type": "Offset", - "begin": { - "@type": "BeginAlignedCursor", - "value": 467 - }, - "end": { - "@type": "BeginAlignedCursor", - "value": 599 - } - } - }, - "data": [ - { - "@type": "AnnotationData", - "@id": "05864D1680", - "set": "segmentation_annotation" - }, - { - "@type": "AnnotationData", - "@id": "A22A085AF0", - "set": "segmentation_annotation" - }, - { - "@type": "AnnotationData", - "@id": "D72CCD4474", - "set": "segmentation_annotation" - } - ] - }, - { - "@type": "Annotation", - "@id": "3401496706", - "target": { - "@type": "TextSelector", - "resource": "44AC", - "offset": { - "@type": "Offset", - "begin": { - "@type": "BeginAlignedCursor", - "value": 600 - }, - "end": { - "@type": "BeginAlignedCursor", - "value": 660 - } - } - }, - "data": [ - { - "@type": "AnnotationData", - "@id": "BA428DDF4F", - "set": "segmentation_annotation" - }, - { - "@type": "AnnotationData", - "@id": "C6DC9CF664", - "set": "segmentation_annotation" - }, - { - "@type": "AnnotationData", - "@id": "D72CCD4474", - "set": "segmentation_annotation" - } - ] - }, - { - "@type": "Annotation", - "@id": "CAC129D04C", - "target": { - "@type": "TextSelector", - "resource": "44AC", - "offset": { - "@type": "Offset", - "begin": { - "@type": "BeginAlignedCursor", - "value": 662 - }, - "end": { - "@type": "BeginAlignedCursor", - "value": 791 - } - } - }, - "data": [ - { - "@type": "AnnotationData", - "@id": "86C4550875", - "set": "segmentation_annotation" - }, - { - "@type": "AnnotationData", - "@id": "8E8088E0E1", - "set": "segmentation_annotation" - }, - { - "@type": "AnnotationData", - "@id": "D72CCD4474", - "set": "segmentation_annotation" - } - ] - }, - { - "@type": "Annotation", - "@id": "E46998BE6F", - "target": { - "@type": "TextSelector", - "resource": "44AC", - "offset": { - "@type": "Offset", - "begin": { - "@type": "BeginAlignedCursor", - "value": 792 - }, - "end": { - "@type": "BeginAlignedCursor", - "value": 891 - } - } - }, - "data": [ - { - "@type": "AnnotationData", - "@id": "ECDBC6C8BA", - "set": "segmentation_annotation" - }, - { - "@type": "AnnotationData", - "@id": "808560B64D", - "set": "segmentation_annotation" - }, - { - "@type": "AnnotationData", - "@id": "D72CCD4474", - "set": "segmentation_annotation" - } - ] - } - ] -} \ No newline at end of file diff --git a/tests/alignment/translation_transfer/data/root/I2B2E5268/layers/44AC/segmentation-EB5B.json b/tests/alignment/translation_transfer/data/root/I2B2E5268/layers/44AC/segmentation-EB5B.json deleted file mode 100644 index c46d828f..00000000 --- a/tests/alignment/translation_transfer/data/root/I2B2E5268/layers/44AC/segmentation-EB5B.json +++ /dev/null @@ -1,440 +0,0 @@ -{ - "@type": "AnnotationStore", - "@id": "I2B2E5268", - "resources": [ - { - "@type": "TextResource", - "@id": "44AC", - "@include": "../../base/44AC.txt" - } - ], - "annotationsets": [ - { - "@type": "AnnotationDataSet", - "@id": "segmentation_annotation", - "keys": [ - { - "@type": "DataKey", - "@id": "index" - }, - { - "@type": "DataKey", - "@id": "segmentation_type" - } - ], - "data": [ - { - "@type": "AnnotationData", - "@id": "F904E898AE", - "key": "index", - "value": { - "@type": "Int", - "value": 1 - } - }, - { - "@type": "AnnotationData", - "@id": "F2BDE4D444", - "key": "segmentation_type", - "value": { - "@type": "String", - "value": "segmentation" - } - }, - { - "@type": "AnnotationData", - "@id": "247C9A822A", - "key": "index", - "value": { - "@type": "Int", - "value": 2 - } - }, - { - "@type": "AnnotationData", - "@id": "509CD12DFD", - "key": "index", - "value": { - "@type": "Int", - "value": 3 - } - }, - { - "@type": "AnnotationData", - "@id": "88A3F99E6E", - "key": "index", - "value": { - "@type": "Int", - "value": 4 - } - }, - { - "@type": "AnnotationData", - "@id": "30967FB200", - "key": "index", - "value": { - "@type": "Int", - "value": 5 - } - }, - { - "@type": "AnnotationData", - "@id": "BFF8F2BAEE", - "key": "index", - "value": { - "@type": "Int", - "value": 6 - } - }, - { - "@type": "AnnotationData", - "@id": "922D9F7D56", - "key": "index", - "value": { - "@type": "Int", - "value": 7 - } - }, - { - "@type": "AnnotationData", - "@id": "40A000F194", - "key": "index", - "value": { - "@type": "Int", - "value": 8 - } - }, - { - "@type": "AnnotationData", - "@id": "D10535D387", - "key": "index", - "value": { - "@type": "Int", - "value": 9 - } - }, - { - "@type": "AnnotationData", - "@id": "9F9864847C", - "key": "index", - "value": { - "@type": "Int", - "value": 10 - } - } - ] - } - ], - "annotations": [ - { - "@type": "Annotation", - "@id": "19DDCF909D", - "target": { - "@type": "TextSelector", - "resource": "44AC", - "offset": { - "@type": "Offset", - "begin": { - "@type": "BeginAlignedCursor", - "value": 0 - }, - "end": { - "@type": "BeginAlignedCursor", - "value": 54 - } - } - }, - "data": [ - { - "@type": "AnnotationData", - "@id": "F904E898AE", - "set": "segmentation_annotation" - }, - { - "@type": "AnnotationData", - "@id": "F2BDE4D444", - "set": "segmentation_annotation" - } - ] - }, - { - "@type": "Annotation", - "@id": "324FA88F84", - "target": { - "@type": "TextSelector", - "resource": "44AC", - "offset": { - "@type": "Offset", - "begin": { - "@type": "BeginAlignedCursor", - "value": 55 - }, - "end": { - "@type": "BeginAlignedCursor", - "value": 109 - } - } - }, - "data": [ - { - "@type": "AnnotationData", - "@id": "247C9A822A", - "set": "segmentation_annotation" - }, - { - "@type": "AnnotationData", - "@id": "F2BDE4D444", - "set": "segmentation_annotation" - } - ] - }, - { - "@type": "Annotation", - "@id": "19D1D11EDC", - "target": { - "@type": "TextSelector", - "resource": "44AC", - "offset": { - "@type": "Offset", - "begin": { - "@type": "BeginAlignedCursor", - "value": 110 - }, - "end": { - "@type": "BeginAlignedCursor", - "value": 174 - } - } - }, - "data": [ - { - "@type": "AnnotationData", - "@id": "509CD12DFD", - "set": "segmentation_annotation" - }, - { - "@type": "AnnotationData", - "@id": "F2BDE4D444", - "set": "segmentation_annotation" - } - ] - }, - { - "@type": "Annotation", - "@id": "9A90D40319", - "target": { - "@type": "TextSelector", - "resource": "44AC", - "offset": { - "@type": "Offset", - "begin": { - "@type": "BeginAlignedCursor", - "value": 175 - }, - "end": { - "@type": "BeginAlignedCursor", - "value": 191 - } - } - }, - "data": [ - { - "@type": "AnnotationData", - "@id": "88A3F99E6E", - "set": "segmentation_annotation" - }, - { - "@type": "AnnotationData", - "@id": "F2BDE4D444", - "set": "segmentation_annotation" - } - ] - }, - { - "@type": "Annotation", - "@id": "2D48DA6339", - "target": { - "@type": "TextSelector", - "resource": "44AC", - "offset": { - "@type": "Offset", - "begin": { - "@type": "BeginAlignedCursor", - "value": 192 - }, - "end": { - "@type": "BeginAlignedCursor", - "value": 366 - } - } - }, - "data": [ - { - "@type": "AnnotationData", - "@id": "30967FB200", - "set": "segmentation_annotation" - }, - { - "@type": "AnnotationData", - "@id": "F2BDE4D444", - "set": "segmentation_annotation" - } - ] - }, - { - "@type": "Annotation", - "@id": "407B4CB44D", - "target": { - "@type": "TextSelector", - "resource": "44AC", - "offset": { - "@type": "Offset", - "begin": { - "@type": "BeginAlignedCursor", - "value": 367 - }, - "end": { - "@type": "BeginAlignedCursor", - "value": 465 - } - } - }, - "data": [ - { - "@type": "AnnotationData", - "@id": "BFF8F2BAEE", - "set": "segmentation_annotation" - }, - { - "@type": "AnnotationData", - "@id": "F2BDE4D444", - "set": "segmentation_annotation" - } - ] - }, - { - "@type": "Annotation", - "@id": "C3A7459EE7", - "target": { - "@type": "TextSelector", - "resource": "44AC", - "offset": { - "@type": "Offset", - "begin": { - "@type": "BeginAlignedCursor", - "value": 466 - }, - "end": { - "@type": "BeginAlignedCursor", - "value": 599 - } - } - }, - "data": [ - { - "@type": "AnnotationData", - "@id": "922D9F7D56", - "set": "segmentation_annotation" - }, - { - "@type": "AnnotationData", - "@id": "F2BDE4D444", - "set": "segmentation_annotation" - } - ] - }, - { - "@type": "Annotation", - "@id": "4346023B45", - "target": { - "@type": "TextSelector", - "resource": "44AC", - "offset": { - "@type": "Offset", - "begin": { - "@type": "BeginAlignedCursor", - "value": 600 - }, - "end": { - "@type": "BeginAlignedCursor", - "value": 660 - } - } - }, - "data": [ - { - "@type": "AnnotationData", - "@id": "40A000F194", - "set": "segmentation_annotation" - }, - { - "@type": "AnnotationData", - "@id": "F2BDE4D444", - "set": "segmentation_annotation" - } - ] - }, - { - "@type": "Annotation", - "@id": "B84AED3440", - "target": { - "@type": "TextSelector", - "resource": "44AC", - "offset": { - "@type": "Offset", - "begin": { - "@type": "BeginAlignedCursor", - "value": 661 - }, - "end": { - "@type": "BeginAlignedCursor", - "value": 791 - } - } - }, - "data": [ - { - "@type": "AnnotationData", - "@id": "D10535D387", - "set": "segmentation_annotation" - }, - { - "@type": "AnnotationData", - "@id": "F2BDE4D444", - "set": "segmentation_annotation" - } - ] - }, - { - "@type": "Annotation", - "@id": "13EE1303BB", - "target": { - "@type": "TextSelector", - "resource": "44AC", - "offset": { - "@type": "Offset", - "begin": { - "@type": "BeginAlignedCursor", - "value": 792 - }, - "end": { - "@type": "BeginAlignedCursor", - "value": 891 - } - } - }, - "data": [ - { - "@type": "AnnotationData", - "@id": "9F9864847C", - "set": "segmentation_annotation" - }, - { - "@type": "AnnotationData", - "@id": "F2BDE4D444", - "set": "segmentation_annotation" - } - ] - } - ] -} \ No newline at end of file diff --git a/tests/alignment/translation_transfer/data/root/I2B2E5268/metadata.json b/tests/alignment/translation_transfer/data/root/I2B2E5268/metadata.json deleted file mode 100644 index 0def165d..00000000 --- a/tests/alignment/translation_transfer/data/root/I2B2E5268/metadata.json +++ /dev/null @@ -1,41 +0,0 @@ -{ - "id": "I2B2E5268", - "title": { - "BO": "དགོངས་པ་རབ་གསལ་ལས་སེམས་བསྐྱེད་དྲུག་པ། ཤོ་ལོ་ཀ ༡-༦༤", - "EN": "Illuminating the Intent Chapter 6, verses 1 to 64" - }, - "author": { - "BO": "ཙོང་ཁ་པ་བློ་བཟང་གྲགས་པ།", - "EN": "Tsongkhapa Lobzang Drakpa" - }, - "imported": "2025-06-04T11:11:17.380001", - "source": "http://purl.bdrc.io/resource/WA1NLM688", - "toolkit_version": "2.1.13", - "parser": "DocxRootParser", - "initial_creation_type": "google_docx", - "language": "bo", - "source_metadata": { - "title_short": { - "BO": "དགོངས་པ་རབ་གསལ་ལས་སེམས་བསྐྱེད་དྲུག་པ། ཤོ་ལོ་ཀ ༡-༦༤" - }, - "title_long_clean": { - "BO": "དགོངས་པ་རབ་གསལ་ལས་སེམས་བསྐྱེད་དྲུག་པ། ཤོ་ལོ་ཀ ༡-༦༤" - }, - "is_commentary_of": { - "BO": "དབུ་མ་འཇུག་པ་ལས་སེམས་བསྐྱེད་དྲུག་པ། ཤོ་ལོ་ཀ ༡-༦༤" - } - }, - "bases": {}, - "copyright": { - "status": "Unknown", - "notice": "", - "info_url": null - }, - "licence": "Unknown", - "legacy_id": null, - "source_file": null, - "ocr_import_info": null, - "statistics": null, - "quality": null, - "last_modified": "2025-06-04T11:11:17.379998" -} \ No newline at end of file diff --git a/tests/alignment/translation_transfer/data/serialized_translation.json b/tests/alignment/translation_transfer/data/serialized_translation.json deleted file mode 100644 index a55ce87e..00000000 --- a/tests/alignment/translation_transfer/data/serialized_translation.json +++ /dev/null @@ -1,12 +0,0 @@ -[ - "The Heart of the Perfection of Wisdom of the Blessed Mother.", - "In Sanskrit: Bhagavatī Prajñāpāramitā Hṛdaya.", - "In Tibetan: The Heart of the Perfection of Wisdom of the Blessed Mother.", - "Thus have I heard at one time.", - "At one time, I heard these words.\nThe Blessed One was dwelling at Vulture Peak Mountain in Rājagṛha, \ntogether with a great assembly of monks and a great assembly of bodhisattvas.", - "At that time, the Blessed One entered the meditative absorption on the teaching called “Profound Illumination.”", - "At that time also, the bodhisattva-mahāsattva noble Avalokiteśvara looked upon the practice of the profound perfection of wisdom.", - "He saw that the five aggregates are empty of inherent existence.", - "Then, through the power of the Buddha, the venerable Śāriputra spoke these words to the bodhisattva-mahāsattva noble Avalokiteśvara", - "How should any son of noble family train who wishes to practice the profound perfection of wisdom?" -] \ No newline at end of file diff --git a/tests/alignment/translation_transfer/data/translation/I9248F287/base/BE59.txt b/tests/alignment/translation_transfer/data/translation/I9248F287/base/BE59.txt deleted file mode 100644 index dfa91929..00000000 --- a/tests/alignment/translation_transfer/data/translation/I9248F287/base/BE59.txt +++ /dev/null @@ -1,12 +0,0 @@ -The Heart of the Perfection of Wisdom of the Blessed Mother. -In Sanskrit: Bhagavatī Prajñāpāramitā Hṛdaya. -In Tibetan: The Heart of the Perfection of Wisdom of the Blessed Mother. -Thus have I heard at one time. -At one time, I heard these words. -The Blessed One was dwelling at Vulture Peak Mountain in Rājagṛha, -together with a great assembly of monks and a great assembly of bodhisattvas. -At that time, the Blessed One entered the meditative absorption on the teaching called “Profound Illumination.” -At that time also, the bodhisattva-mahāsattva noble Avalokiteśvara looked upon the practice of the profound perfection of wisdom. -He saw that the five aggregates are empty of inherent existence. -Then, through the power of the Buddha, the venerable Śāriputra spoke these words to the bodhisattva-mahāsattva noble Avalokiteśvara -How should any son of noble family train who wishes to practice the profound perfection of wisdom? diff --git a/tests/alignment/translation_transfer/data/translation/I9248F287/layers/BE59/alignment-044E.json b/tests/alignment/translation_transfer/data/translation/I9248F287/layers/BE59/alignment-044E.json deleted file mode 100644 index bc224e99..00000000 --- a/tests/alignment/translation_transfer/data/translation/I9248F287/layers/BE59/alignment-044E.json +++ /dev/null @@ -1,693 +0,0 @@ -{ - "@type": "AnnotationStore", - "@id": "I9248F287", - "resources": [ - { - "@type": "TextResource", - "@id": "BE59", - "@include": "../../base/BE59.txt" - } - ], - "annotationsets": [ - { - "@type": "AnnotationDataSet", - "@id": "segmentation_annotation", - "keys": [ - { - "@type": "DataKey", - "@id": "index" - }, - { - "@type": "DataKey", - "@id": "alignment_index" - }, - { - "@type": "DataKey", - "@id": "segmentation_type" - } - ], - "data": [ - { - "@type": "AnnotationData", - "@id": "332822E09F", - "key": "index", - "value": { - "@type": "Int", - "value": 1 - } - }, - { - "@type": "AnnotationData", - "@id": "2D00056BBB", - "key": "alignment_index", - "value": { - "@type": "List", - "value": [ - { - "@type": "Int", - "value": 1 - } - ] - } - }, - { - "@type": "AnnotationData", - "@id": "6B6796356B", - "key": "segmentation_type", - "value": { - "@type": "String", - "value": "alignment" - } - }, - { - "@type": "AnnotationData", - "@id": "4ED1B529C0", - "key": "index", - "value": { - "@type": "Int", - "value": 2 - } - }, - { - "@type": "AnnotationData", - "@id": "1F70515313", - "key": "alignment_index", - "value": { - "@type": "List", - "value": [ - { - "@type": "Int", - "value": 2 - } - ] - } - }, - { - "@type": "AnnotationData", - "@id": "2B6CB333AA", - "key": "index", - "value": { - "@type": "Int", - "value": 3 - } - }, - { - "@type": "AnnotationData", - "@id": "F89212B44E", - "key": "alignment_index", - "value": { - "@type": "List", - "value": [ - { - "@type": "Int", - "value": 3 - } - ] - } - }, - { - "@type": "AnnotationData", - "@id": "C121F4DEAB", - "key": "index", - "value": { - "@type": "Int", - "value": 4 - } - }, - { - "@type": "AnnotationData", - "@id": "AA49DB284B", - "key": "alignment_index", - "value": { - "@type": "List", - "value": [ - { - "@type": "Int", - "value": 4 - } - ] - } - }, - { - "@type": "AnnotationData", - "@id": "69D692D20D", - "key": "index", - "value": { - "@type": "Int", - "value": 5 - } - }, - { - "@type": "AnnotationData", - "@id": "F75EF7A6B3", - "key": "alignment_index", - "value": { - "@type": "List", - "value": [ - { - "@type": "Int", - "value": 5 - } - ] - } - }, - { - "@type": "AnnotationData", - "@id": "14E8CC1F2E", - "key": "index", - "value": { - "@type": "Int", - "value": 6 - } - }, - { - "@type": "AnnotationData", - "@id": "465F28F0F6", - "key": "alignment_index", - "value": { - "@type": "List", - "value": [ - { - "@type": "Int", - "value": 6 - } - ] - } - }, - { - "@type": "AnnotationData", - "@id": "FACAB0A11B", - "key": "index", - "value": { - "@type": "Int", - "value": 7 - } - }, - { - "@type": "AnnotationData", - "@id": "995690DC68", - "key": "alignment_index", - "value": { - "@type": "List", - "value": [ - { - "@type": "Int", - "value": 7 - } - ] - } - }, - { - "@type": "AnnotationData", - "@id": "E5FEFE430C", - "key": "index", - "value": { - "@type": "Int", - "value": 8 - } - }, - { - "@type": "AnnotationData", - "@id": "E855F543CE", - "key": "alignment_index", - "value": { - "@type": "List", - "value": [ - { - "@type": "Int", - "value": 8 - } - ] - } - }, - { - "@type": "AnnotationData", - "@id": "E0C94D8C24", - "key": "index", - "value": { - "@type": "Int", - "value": 9 - } - }, - { - "@type": "AnnotationData", - "@id": "39FEAA6EA2", - "key": "alignment_index", - "value": { - "@type": "List", - "value": [ - { - "@type": "Int", - "value": 9 - } - ] - } - }, - { - "@type": "AnnotationData", - "@id": "B39B8BE3C8", - "key": "index", - "value": { - "@type": "Int", - "value": 10 - } - }, - { - "@type": "AnnotationData", - "@id": "66FE0C222B", - "key": "alignment_index", - "value": { - "@type": "List", - "value": [ - { - "@type": "Int", - "value": 10 - } - ] - } - }, - { - "@type": "AnnotationData", - "@id": "EDE6FCDAED", - "key": "index", - "value": { - "@type": "Int", - "value": 11 - } - }, - { - "@type": "AnnotationData", - "@id": "697BCB4494", - "key": "alignment_index", - "value": { - "@type": "List", - "value": [ - { - "@type": "Int", - "value": 11 - } - ] - } - } - ] - } - ], - "annotations": [ - { - "@type": "Annotation", - "@id": "FE0BEEB4B9", - "target": { - "@type": "TextSelector", - "resource": "BE59", - "offset": { - "@type": "Offset", - "begin": { - "@type": "BeginAlignedCursor", - "value": 0 - }, - "end": { - "@type": "BeginAlignedCursor", - "value": 60 - } - } - }, - "data": [ - { - "@type": "AnnotationData", - "@id": "332822E09F", - "set": "segmentation_annotation" - }, - { - "@type": "AnnotationData", - "@id": "2D00056BBB", - "set": "segmentation_annotation" - }, - { - "@type": "AnnotationData", - "@id": "6B6796356B", - "set": "segmentation_annotation" - } - ] - }, - { - "@type": "Annotation", - "@id": "2D740F1727", - "target": { - "@type": "TextSelector", - "resource": "BE59", - "offset": { - "@type": "Offset", - "begin": { - "@type": "BeginAlignedCursor", - "value": 61 - }, - "end": { - "@type": "BeginAlignedCursor", - "value": 106 - } - } - }, - "data": [ - { - "@type": "AnnotationData", - "@id": "4ED1B529C0", - "set": "segmentation_annotation" - }, - { - "@type": "AnnotationData", - "@id": "1F70515313", - "set": "segmentation_annotation" - }, - { - "@type": "AnnotationData", - "@id": "6B6796356B", - "set": "segmentation_annotation" - } - ] - }, - { - "@type": "Annotation", - "@id": "BC5DBCDF47", - "target": { - "@type": "TextSelector", - "resource": "BE59", - "offset": { - "@type": "Offset", - "begin": { - "@type": "BeginAlignedCursor", - "value": 107 - }, - "end": { - "@type": "BeginAlignedCursor", - "value": 179 - } - } - }, - "data": [ - { - "@type": "AnnotationData", - "@id": "2B6CB333AA", - "set": "segmentation_annotation" - }, - { - "@type": "AnnotationData", - "@id": "F89212B44E", - "set": "segmentation_annotation" - }, - { - "@type": "AnnotationData", - "@id": "6B6796356B", - "set": "segmentation_annotation" - } - ] - }, - { - "@type": "Annotation", - "@id": "478ED007C8", - "target": { - "@type": "TextSelector", - "resource": "BE59", - "offset": { - "@type": "Offset", - "begin": { - "@type": "BeginAlignedCursor", - "value": 180 - }, - "end": { - "@type": "BeginAlignedCursor", - "value": 210 - } - } - }, - "data": [ - { - "@type": "AnnotationData", - "@id": "C121F4DEAB", - "set": "segmentation_annotation" - }, - { - "@type": "AnnotationData", - "@id": "AA49DB284B", - "set": "segmentation_annotation" - }, - { - "@type": "AnnotationData", - "@id": "6B6796356B", - "set": "segmentation_annotation" - } - ] - }, - { - "@type": "Annotation", - "@id": "CC9CB389CA", - "target": { - "@type": "TextSelector", - "resource": "BE59", - "offset": { - "@type": "Offset", - "begin": { - "@type": "BeginAlignedCursor", - "value": 211 - }, - "end": { - "@type": "BeginAlignedCursor", - "value": 244 - } - } - }, - "data": [ - { - "@type": "AnnotationData", - "@id": "69D692D20D", - "set": "segmentation_annotation" - }, - { - "@type": "AnnotationData", - "@id": "F75EF7A6B3", - "set": "segmentation_annotation" - }, - { - "@type": "AnnotationData", - "@id": "6B6796356B", - "set": "segmentation_annotation" - } - ] - }, - { - "@type": "Annotation", - "@id": "DDBBB6D24F", - "target": { - "@type": "TextSelector", - "resource": "BE59", - "offset": { - "@type": "Offset", - "begin": { - "@type": "BeginAlignedCursor", - "value": 245 - }, - "end": { - "@type": "BeginAlignedCursor", - "value": 390 - } - } - }, - "data": [ - { - "@type": "AnnotationData", - "@id": "14E8CC1F2E", - "set": "segmentation_annotation" - }, - { - "@type": "AnnotationData", - "@id": "465F28F0F6", - "set": "segmentation_annotation" - }, - { - "@type": "AnnotationData", - "@id": "6B6796356B", - "set": "segmentation_annotation" - } - ] - }, - { - "@type": "Annotation", - "@id": "DCE3A3DDBC", - "target": { - "@type": "TextSelector", - "resource": "BE59", - "offset": { - "@type": "Offset", - "begin": { - "@type": "BeginAlignedCursor", - "value": 391 - }, - "end": { - "@type": "BeginAlignedCursor", - "value": 502 - } - } - }, - "data": [ - { - "@type": "AnnotationData", - "@id": "FACAB0A11B", - "set": "segmentation_annotation" - }, - { - "@type": "AnnotationData", - "@id": "995690DC68", - "set": "segmentation_annotation" - }, - { - "@type": "AnnotationData", - "@id": "6B6796356B", - "set": "segmentation_annotation" - } - ] - }, - { - "@type": "Annotation", - "@id": "743A44B367", - "target": { - "@type": "TextSelector", - "resource": "BE59", - "offset": { - "@type": "Offset", - "begin": { - "@type": "BeginAlignedCursor", - "value": 503 - }, - "end": { - "@type": "BeginAlignedCursor", - "value": 632 - } - } - }, - "data": [ - { - "@type": "AnnotationData", - "@id": "E5FEFE430C", - "set": "segmentation_annotation" - }, - { - "@type": "AnnotationData", - "@id": "E855F543CE", - "set": "segmentation_annotation" - }, - { - "@type": "AnnotationData", - "@id": "6B6796356B", - "set": "segmentation_annotation" - } - ] - }, - { - "@type": "Annotation", - "@id": "8818AB8652", - "target": { - "@type": "TextSelector", - "resource": "BE59", - "offset": { - "@type": "Offset", - "begin": { - "@type": "BeginAlignedCursor", - "value": 633 - }, - "end": { - "@type": "BeginAlignedCursor", - "value": 697 - } - } - }, - "data": [ - { - "@type": "AnnotationData", - "@id": "E0C94D8C24", - "set": "segmentation_annotation" - }, - { - "@type": "AnnotationData", - "@id": "39FEAA6EA2", - "set": "segmentation_annotation" - }, - { - "@type": "AnnotationData", - "@id": "6B6796356B", - "set": "segmentation_annotation" - } - ] - }, - { - "@type": "Annotation", - "@id": "D8D2DDA020", - "target": { - "@type": "TextSelector", - "resource": "BE59", - "offset": { - "@type": "Offset", - "begin": { - "@type": "BeginAlignedCursor", - "value": 698 - }, - "end": { - "@type": "BeginAlignedCursor", - "value": 829 - } - } - }, - "data": [ - { - "@type": "AnnotationData", - "@id": "B39B8BE3C8", - "set": "segmentation_annotation" - }, - { - "@type": "AnnotationData", - "@id": "66FE0C222B", - "set": "segmentation_annotation" - }, - { - "@type": "AnnotationData", - "@id": "6B6796356B", - "set": "segmentation_annotation" - } - ] - }, - { - "@type": "Annotation", - "@id": "F81ED67FFD", - "target": { - "@type": "TextSelector", - "resource": "BE59", - "offset": { - "@type": "Offset", - "begin": { - "@type": "BeginAlignedCursor", - "value": 830 - }, - "end": { - "@type": "BeginAlignedCursor", - "value": 928 - } - } - }, - "data": [ - { - "@type": "AnnotationData", - "@id": "EDE6FCDAED", - "set": "segmentation_annotation" - }, - { - "@type": "AnnotationData", - "@id": "697BCB4494", - "set": "segmentation_annotation" - }, - { - "@type": "AnnotationData", - "@id": "6B6796356B", - "set": "segmentation_annotation" - } - ] - } - ] -} \ No newline at end of file diff --git a/tests/alignment/translation_transfer/data/translation/I9248F287/layers/BE59/segmentation-60A8.json b/tests/alignment/translation_transfer/data/translation/I9248F287/layers/BE59/segmentation-60A8.json deleted file mode 100644 index 7b5e433b..00000000 --- a/tests/alignment/translation_transfer/data/translation/I9248F287/layers/BE59/segmentation-60A8.json +++ /dev/null @@ -1,520 +0,0 @@ -{ - "@type": "AnnotationStore", - "@id": "I9248F287", - "resources": [ - { - "@type": "TextResource", - "@id": "BE59", - "@include": "../../base/BE59.txt" - } - ], - "annotationsets": [ - { - "@type": "AnnotationDataSet", - "@id": "segmentation_annotation", - "keys": [ - { - "@type": "DataKey", - "@id": "index" - }, - { - "@type": "DataKey", - "@id": "segmentation_type" - } - ], - "data": [ - { - "@type": "AnnotationData", - "@id": "5C8AE48E2F", - "key": "index", - "value": { - "@type": "Int", - "value": 1 - } - }, - { - "@type": "AnnotationData", - "@id": "3DBBE022B7", - "key": "segmentation_type", - "value": { - "@type": "String", - "value": "segmentation" - } - }, - { - "@type": "AnnotationData", - "@id": "DBE43E2994", - "key": "index", - "value": { - "@type": "Int", - "value": 2 - } - }, - { - "@type": "AnnotationData", - "@id": "4B23C60F0C", - "key": "index", - "value": { - "@type": "Int", - "value": 3 - } - }, - { - "@type": "AnnotationData", - "@id": "FFF664F1A9", - "key": "index", - "value": { - "@type": "Int", - "value": 4 - } - }, - { - "@type": "AnnotationData", - "@id": "B2DEFF1F92", - "key": "index", - "value": { - "@type": "Int", - "value": 5 - } - }, - { - "@type": "AnnotationData", - "@id": "8D34D587D8", - "key": "index", - "value": { - "@type": "Int", - "value": 6 - } - }, - { - "@type": "AnnotationData", - "@id": "A835A08B6C", - "key": "index", - "value": { - "@type": "Int", - "value": 7 - } - }, - { - "@type": "AnnotationData", - "@id": "112A59C8DA", - "key": "index", - "value": { - "@type": "Int", - "value": 8 - } - }, - { - "@type": "AnnotationData", - "@id": "F8B1BD691F", - "key": "index", - "value": { - "@type": "Int", - "value": 9 - } - }, - { - "@type": "AnnotationData", - "@id": "4FFA90419F", - "key": "index", - "value": { - "@type": "Int", - "value": 10 - } - }, - { - "@type": "AnnotationData", - "@id": "47A80FC410", - "key": "index", - "value": { - "@type": "Int", - "value": 11 - } - }, - { - "@type": "AnnotationData", - "@id": "6FFF544B75", - "key": "index", - "value": { - "@type": "Int", - "value": 12 - } - } - ] - } - ], - "annotations": [ - { - "@type": "Annotation", - "@id": "3667D0B6B6", - "target": { - "@type": "TextSelector", - "resource": "BE59", - "offset": { - "@type": "Offset", - "begin": { - "@type": "BeginAlignedCursor", - "value": 0 - }, - "end": { - "@type": "BeginAlignedCursor", - "value": 60 - } - } - }, - "data": [ - { - "@type": "AnnotationData", - "@id": "5C8AE48E2F", - "set": "segmentation_annotation" - }, - { - "@type": "AnnotationData", - "@id": "3DBBE022B7", - "set": "segmentation_annotation" - } - ] - }, - { - "@type": "Annotation", - "@id": "71E11E1D51", - "target": { - "@type": "TextSelector", - "resource": "BE59", - "offset": { - "@type": "Offset", - "begin": { - "@type": "BeginAlignedCursor", - "value": 61 - }, - "end": { - "@type": "BeginAlignedCursor", - "value": 106 - } - } - }, - "data": [ - { - "@type": "AnnotationData", - "@id": "DBE43E2994", - "set": "segmentation_annotation" - }, - { - "@type": "AnnotationData", - "@id": "3DBBE022B7", - "set": "segmentation_annotation" - } - ] - }, - { - "@type": "Annotation", - "@id": "31F944DD99", - "target": { - "@type": "TextSelector", - "resource": "BE59", - "offset": { - "@type": "Offset", - "begin": { - "@type": "BeginAlignedCursor", - "value": 107 - }, - "end": { - "@type": "BeginAlignedCursor", - "value": 179 - } - } - }, - "data": [ - { - "@type": "AnnotationData", - "@id": "4B23C60F0C", - "set": "segmentation_annotation" - }, - { - "@type": "AnnotationData", - "@id": "3DBBE022B7", - "set": "segmentation_annotation" - } - ] - }, - { - "@type": "Annotation", - "@id": "986590066F", - "target": { - "@type": "TextSelector", - "resource": "BE59", - "offset": { - "@type": "Offset", - "begin": { - "@type": "BeginAlignedCursor", - "value": 180 - }, - "end": { - "@type": "BeginAlignedCursor", - "value": 210 - } - } - }, - "data": [ - { - "@type": "AnnotationData", - "@id": "FFF664F1A9", - "set": "segmentation_annotation" - }, - { - "@type": "AnnotationData", - "@id": "3DBBE022B7", - "set": "segmentation_annotation" - } - ] - }, - { - "@type": "Annotation", - "@id": "081A0B1C6D", - "target": { - "@type": "TextSelector", - "resource": "BE59", - "offset": { - "@type": "Offset", - "begin": { - "@type": "BeginAlignedCursor", - "value": 211 - }, - "end": { - "@type": "BeginAlignedCursor", - "value": 244 - } - } - }, - "data": [ - { - "@type": "AnnotationData", - "@id": "B2DEFF1F92", - "set": "segmentation_annotation" - }, - { - "@type": "AnnotationData", - "@id": "3DBBE022B7", - "set": "segmentation_annotation" - } - ] - }, - { - "@type": "Annotation", - "@id": "64254C4EF6", - "target": { - "@type": "TextSelector", - "resource": "BE59", - "offset": { - "@type": "Offset", - "begin": { - "@type": "BeginAlignedCursor", - "value": 245 - }, - "end": { - "@type": "BeginAlignedCursor", - "value": 312 - } - } - }, - "data": [ - { - "@type": "AnnotationData", - "@id": "8D34D587D8", - "set": "segmentation_annotation" - }, - { - "@type": "AnnotationData", - "@id": "3DBBE022B7", - "set": "segmentation_annotation" - } - ] - }, - { - "@type": "Annotation", - "@id": "4740F0FAAF", - "target": { - "@type": "TextSelector", - "resource": "BE59", - "offset": { - "@type": "Offset", - "begin": { - "@type": "BeginAlignedCursor", - "value": 313 - }, - "end": { - "@type": "BeginAlignedCursor", - "value": 390 - } - } - }, - "data": [ - { - "@type": "AnnotationData", - "@id": "A835A08B6C", - "set": "segmentation_annotation" - }, - { - "@type": "AnnotationData", - "@id": "3DBBE022B7", - "set": "segmentation_annotation" - } - ] - }, - { - "@type": "Annotation", - "@id": "D354B16CDD", - "target": { - "@type": "TextSelector", - "resource": "BE59", - "offset": { - "@type": "Offset", - "begin": { - "@type": "BeginAlignedCursor", - "value": 391 - }, - "end": { - "@type": "BeginAlignedCursor", - "value": 502 - } - } - }, - "data": [ - { - "@type": "AnnotationData", - "@id": "112A59C8DA", - "set": "segmentation_annotation" - }, - { - "@type": "AnnotationData", - "@id": "3DBBE022B7", - "set": "segmentation_annotation" - } - ] - }, - { - "@type": "Annotation", - "@id": "1CCC77B6CC", - "target": { - "@type": "TextSelector", - "resource": "BE59", - "offset": { - "@type": "Offset", - "begin": { - "@type": "BeginAlignedCursor", - "value": 503 - }, - "end": { - "@type": "BeginAlignedCursor", - "value": 632 - } - } - }, - "data": [ - { - "@type": "AnnotationData", - "@id": "F8B1BD691F", - "set": "segmentation_annotation" - }, - { - "@type": "AnnotationData", - "@id": "3DBBE022B7", - "set": "segmentation_annotation" - } - ] - }, - { - "@type": "Annotation", - "@id": "BD8AD4CC11", - "target": { - "@type": "TextSelector", - "resource": "BE59", - "offset": { - "@type": "Offset", - "begin": { - "@type": "BeginAlignedCursor", - "value": 633 - }, - "end": { - "@type": "BeginAlignedCursor", - "value": 697 - } - } - }, - "data": [ - { - "@type": "AnnotationData", - "@id": "4FFA90419F", - "set": "segmentation_annotation" - }, - { - "@type": "AnnotationData", - "@id": "3DBBE022B7", - "set": "segmentation_annotation" - } - ] - }, - { - "@type": "Annotation", - "@id": "874D76944B", - "target": { - "@type": "TextSelector", - "resource": "BE59", - "offset": { - "@type": "Offset", - "begin": { - "@type": "BeginAlignedCursor", - "value": 698 - }, - "end": { - "@type": "BeginAlignedCursor", - "value": 829 - } - } - }, - "data": [ - { - "@type": "AnnotationData", - "@id": "47A80FC410", - "set": "segmentation_annotation" - }, - { - "@type": "AnnotationData", - "@id": "3DBBE022B7", - "set": "segmentation_annotation" - } - ] - }, - { - "@type": "Annotation", - "@id": "499B2C4950", - "target": { - "@type": "TextSelector", - "resource": "BE59", - "offset": { - "@type": "Offset", - "begin": { - "@type": "BeginAlignedCursor", - "value": 830 - }, - "end": { - "@type": "BeginAlignedCursor", - "value": 928 - } - } - }, - "data": [ - { - "@type": "AnnotationData", - "@id": "6FFF544B75", - "set": "segmentation_annotation" - }, - { - "@type": "AnnotationData", - "@id": "3DBBE022B7", - "set": "segmentation_annotation" - } - ] - } - ] -} \ No newline at end of file diff --git a/tests/alignment/translation_transfer/data/translation/I9248F287/metadata.json b/tests/alignment/translation_transfer/data/translation/I9248F287/metadata.json deleted file mode 100644 index 766bafd3..00000000 --- a/tests/alignment/translation_transfer/data/translation/I9248F287/metadata.json +++ /dev/null @@ -1,42 +0,0 @@ -{ - "id": "I9248F287", - "title": { - "BO": "དགོངས་པ་རབ་གསལ་ལས་སེམས་བསྐྱེད་དྲུག་པ། ཤོ་ལོ་ཀ ༡-༦༤", - "EN": "Illuminating the Intent Chapter 6, verses 1 to 64" - }, - "author": { - "BO": "ཙོང་ཁ་པ་བློ་བཟང་གྲགས་པ།", - "EN": "Tsongkhapa Lobzang Drakpa" - }, - "imported": "2025-06-05T04:11:03.779463", - "source": "http://purl.bdrc.io/resource/WA1NLM688", - "toolkit_version": "2.1.13", - "parser": "DocxRootParser", - "initial_creation_type": "google_docx", - "language": "en", - "source_metadata": { - "title_short": { - "BO": "དགོངས་པ་རབ་གསལ་ལས་སེམས་བསྐྱེད་དྲུག་པ། ཤོ་ལོ་ཀ ༡-༦༤", - "EN": "Illuminating the Intent Chapter 6, verses 1 to 64" - }, - "title_long_clean": { - "BO": "དགོངས་པ་རབ་གསལ་ལས་སེམས་བསྐྱེད་དྲུག་པ། ཤོ་ལོ་ཀ ༡-༦༤" - }, - "is_commentary_of": { - "BO": "དབུ་མ་འཇུག་པ་ལས་སེམས་བསྐྱེད་དྲུག་པ། ཤོ་ལོ་ཀ ༡-༦༤" - } - }, - "bases": {}, - "copyright": { - "status": "Unknown", - "notice": "", - "info_url": null - }, - "licence": "Unknown", - "legacy_id": null, - "source_file": null, - "ocr_import_info": null, - "statistics": null, - "quality": null, - "last_modified": "2025-06-05T04:11:03.779460" -} \ No newline at end of file diff --git a/tests/pecha/__init__.py b/tests/pecha/__init__.py index 543833e9..e69de29b 100644 --- a/tests/pecha/__init__.py +++ b/tests/pecha/__init__.py @@ -1,240 +0,0 @@ -from pathlib import Path - -from openpecha.pecha import Pecha -from openpecha.pecha.annotations import AnnotationModel, PechaAlignment -from openpecha.pecha.layer import AnnotationType - - -class DummyMetadataModel: - def __init__(self, **args): - for k, v in args.items(): - setattr(self, k, v) - - -class SharedPechaSetup: - def setup_pechas(self): - self.root_pecha_path = Path( - "tests/alignment/commentary_transfer/data/root/I6556B464" - ) - self.root_translation_pecha_path = Path("tests/pecha/data/I5003D420") - self.commentary_pecha_path = Path( - "tests/alignment/commentary_transfer/data/commentary/I015AFFA7" - ) - self.commentary_translation_pecha_path = Path("tests/pecha/data/ICFCF1CDC") - self.prealigned_commentary_translation_pecha_path = Path( - "tests/pecha/data/IF5944957" - ) - - self.root_pecha = Pecha.from_path(self.root_pecha_path) - self.root_translation_pecha = Pecha.from_path(self.root_translation_pecha_path) - self.commentary_pecha = Pecha.from_path(self.commentary_pecha_path) - self.commentary_translation_pecha = Pecha.from_path( - self.commentary_translation_pecha_path - ) - self.prealigned_commentary_translation_pecha = Pecha.from_path( - self.prealigned_commentary_translation_pecha_path - ) - - self.root_pecha_metadata = DummyMetadataModel( - **{ - "type": "root", - "parent": None, - **self.root_pecha.metadata.to_dict(), - } - ) - self.root_pecha_annotations = [ - AnnotationModel( - pecha_id="I6556B464", - type=AnnotationType.SEGMENTATION, - document_id="d2", - path="B5FE/segmentation-4FD1.json", - title="དབུ་མ་འཇུག་པ་ལས་སེམས་བསྐྱེད་དྲུག་པ། ཤོ་ལོ་ཀ ༡-༦༤ segmentation", - aligned_to=None, - ), - AnnotationModel( - pecha_id="I6556B464", - type=AnnotationType.ALIGNMENT, - document_id="d2", - path="B5FE/alignment-6707.json", - title="དབུ་མ་འཇུག་པ་ལས་སེམས་བསྐྱེད་དྲུག་པ། ཤོ་ལོ་ཀ ༡-༦༤ alignment", - aligned_to=None, - ), - ] - - self.root_translation_pecha_metadata = DummyMetadataModel( - **{ - "type": "translation", - "parent": "I6556B464", - **self.root_translation_pecha.metadata.to_dict(), - } - ) - self.root_translation_pecha_annotations = [ - AnnotationModel( - pecha_id="I5003D420", - type=AnnotationType.ALIGNMENT, - document_id="d3", - path="9813/alignment-AE0B.json", - title="དགོངས་པ་རབ་གསལ་ལས་སེམས་བསྐྱེད་དྲུག་པ། ཤོ་ལོ་ཀ ༡-༦༤ translation 1", - aligned_to=PechaAlignment( - pecha_id="IE60BBDE8", alignment_id="B8B3/segmentation-74F4.json" - ), - ) - ] - - self.commentary_pecha_metadata = DummyMetadataModel( - **{ - "type": "commentary", - "parent": "IE60BBDE8", - **self.commentary_pecha.metadata.to_dict(), - } - ) - self.commentary_pecha_annotations = [ - AnnotationModel( - pecha_id="I015AFFA7", - type=AnnotationType.ALIGNMENT, - document_id="d4", - path="B014/alignment-2127.json", - title="དགོངས་པ་རབ་གསལ་ལས་སེམས་བསྐྱེད་དྲུག་པ། ཤོ་ལོ་ཀ ༡-༦༤ commentary", - aligned_to=PechaAlignment( - pecha_id="IE60BBDE8", alignment_id="B8B3/segmentation-74F4.json" - ), - ) - ] - - self.commentary_translation_pecha_metadata = DummyMetadataModel( - **{ - "type": "translation", - "parent": "I6944984E", - **self.commentary_translation_pecha.metadata.to_dict(), - } - ) - self.commentary_translation_pecha_annotations = [ - AnnotationModel( - pecha_id="ICFCF1CDC", - type=AnnotationType.ALIGNMENT, - document_id="d4", - path="EB60/alignment-6786.json", - title="དགོངས་པ་རབ་གསལ་ལས་སེམས་བསྐྱེད་དྲུག་པ། ཤོ་ལོ་ཀ ༡-༦༤ commentary", - aligned_to=PechaAlignment( - pecha_id="I015AFFA7", alignment_id="B014/alignment-2127.json" - ), - ) - ] - - self.prealigned_root_translation_pecha_metadata = DummyMetadataModel( - **{ - "type": "translation", - "parent": "IE60BBDE8", - **self.root_translation_pecha.metadata.to_dict(), - } - ) - self.prealigned_root_translation_pecha_annotations = [ - AnnotationModel( - pecha_id="I62E00D78", - type=AnnotationType.ALIGNMENT, - document_id="d3", - path="D93E/alignment-0216.json", - title="དགོངས་པ་རབ་གསལ་ལས་སེམས་བསྐྱེད་དྲུག་པ། ཤོ་ལོ་ཀ ༡-༦༤ translation 1", - aligned_to=PechaAlignment( - pecha_id="IE60BBDE8", alignment_id="B8B3/alignment-F81A.json" - ), - ) - ] - - self.prealigned_root_translation_segmentation_pecha_metadata = ( - DummyMetadataModel( - **{ - "type": "translation", - "parent": "IE60BBDE8", - **self.root_translation_pecha.metadata.to_dict(), - } - ) - ) - self.prealigned_root_translation_segmentation_pecha_annotations = [ - AnnotationModel( - pecha_id="I62E00D78", - type=AnnotationType.SEGMENTATION, - document_id="d3", - path="D93E/segmentation-2143.json", - title="དགོངས་པ་རབ་གསལ་ལས་སེམས་བསྐྱེད་དྲུག་པ། ཤོ་ལོ་ཀ ༡-༦༤ segmentation", - aligned_to=PechaAlignment(pecha_id="IE60BBDE8", alignment_id=None), - ), - AnnotationModel( - pecha_id="I62E00D78", - type=AnnotationType.ALIGNMENT, - document_id="d3", - path="D93E/alignment-0216.json", - title="དགོངས་པ་རབ་གསལ་ལས་སེམས་བསྐྱེད་དྲུག་པ། ཤོ་ལོ་ཀ ༡-༦༤ translation 1", - aligned_to=PechaAlignment( - pecha_id="IE60BBDE8", alignment_id="B8B3/alignment-F81A.json" - ), - ), - ] - - self.prealigned_commentary_pecha_metadata = DummyMetadataModel( - **{ - "type": "commentary", - "parent": "IE60BBDE8", - **self.commentary_pecha.metadata.to_dict(), - } - ) - self.prealigned_commentary_pecha_annotations = [ - AnnotationModel( - pecha_id="I6944984E", - type=AnnotationType.ALIGNMENT, - document_id="d4", - path="E949/alignment-2F29.json", - title="དགོངས་པ་རབ་གསལ་ལས་སེམས་བསྐྱེད་དྲུག་པ། ཤོ་ལོ་ཀ ༡-༦༤ commentary", - aligned_to=PechaAlignment( - pecha_id="IE60BBDE8", alignment_id="B8B3/alignment-F81A.json" - ), - ) - ] - - self.prealigned_commentary_segmentation_pecha_metadata = DummyMetadataModel( - **{ - "type": "commentary", - "parent": "IE60BBDE8", - **self.commentary_pecha.metadata.to_dict(), - } - ) - self.prealigned_commentary_segmentation_pecha_annotations = [ - AnnotationModel( - pecha_id="I6944984E", - type=AnnotationType.SEGMENTATION, - document_id="d4", - path="E949/segmentation-2134.json", - title="དགོངས་པ་རབ་གསལ་ལས་སེམས་བསྐྱེད་དྲུག་པ། ཤོ་ལོ་ཀ ༡-༦༤ commentary", - aligned_to=PechaAlignment(pecha_id="IE60BBDE8", alignment_id=None), - ), - AnnotationModel( - pecha_id="I6944984E", - type=AnnotationType.ALIGNMENT, - document_id="d4", - path="E949/alignment-2F29.json", - title="དགོངས་པ་རབ་གསལ་ལས་སེམས་བསྐྱེད་དྲུག་པ། ཤོ་ལོ་ཀ ༡-༦༤ commentary", - aligned_to=PechaAlignment( - pecha_id="IE60BBDE8", alignment_id="B8B3/alignment-F81A.json" - ), - ), - ] - - self.prealigned_commentary_translation_pecha_metadata = DummyMetadataModel( - **{ - "type": "translation", - "parent": "I6944984E", - **self.prealigned_commentary_translation_pecha.metadata.to_dict(), - } - ) - self.prealigned_commentary_translation_pecha_annotations = [ - AnnotationModel( - pecha_id="IF5944957", - type=AnnotationType.ALIGNMENT, - document_id="d4", - path="0DCE/alignment-8B56.json", - title="དགོངས་པ་རབ་གསལ་ལས་སེམས་བསྐྱེད་དྲུག་པ། ཤོ་ལོ་ཀ ༡-༦༤ commentary translation", - aligned_to=PechaAlignment( - pecha_id="I6944984E", alignment_id="E949/alignment-2F29.json" - ), - ) - ] diff --git a/tests/pecha/metadata/data/W24767/buda_data.json b/tests/pecha/metadata/data/W24767/buda_data.json deleted file mode 100644 index 0baf1eda..00000000 --- a/tests/pecha/metadata/data/W24767/buda_data.json +++ /dev/null @@ -1,35 +0,0 @@ -{ - "source_metadata": { - "id": "http://purl.bdrc.io/resource/W24767", - "status": "http://purl.bdrc.io/admindata/StatusReleased", - "access": "http://purl.bdrc.io/admindata/AccessOpen", - "reproduction_of": "http://purl.bdrc.io/resource/MW24767", - "copyright_status": "http://purl.bdrc.io/resource/CopyrightPublicDomain", - "title": "ལམ་འབྲས་གཞུང་རྡོ་རྗེའི་ཚིག་རྐང་གི་རྣམ་འགྲེལ་བཅུ་གཅིག", - "author": "ས་ཆེན་ཀུན་དགའ་སྙིང་པོ།", - "languages": [ - "bo" - ] - }, - "image_groups": { - "I3852": { - "id": "http://purl.bdrc.io/resource/I3852", - "total_pages": 542, - "volume_number": 1, - "volume_pages_bdrc_intro": 2 - }, - "I3853": { - "id": "http://purl.bdrc.io/resource/I3853", - "total_pages": 504, - "volume_number": 2, - "volume_pages_bdrc_intro": 2 - }, - "I3854": { - "id": "http://purl.bdrc.io/resource/I3854", - "total_pages": 502, - "volume_number": 3, - "volume_pages_bdrc_intro": 2 - } - } - } - \ No newline at end of file diff --git a/tests/pecha/metadata/data/W24767/ocr_import_info.json b/tests/pecha/metadata/data/W24767/ocr_import_info.json deleted file mode 100644 index 65fd9e2b..00000000 --- a/tests/pecha/metadata/data/W24767/ocr_import_info.json +++ /dev/null @@ -1,13 +0,0 @@ -{ - "source": "bdrc", - "software": "google_books", - "batch": "batch_2022", - "expected_default_language": "bo", - "bdrc_scan_id": "W24767", - "ocr_info": { - "timestamp": "1977-04-22T06:00:00Z", - "prop1": "value 1", - "prop2": "value 2" - } - } - \ No newline at end of file diff --git a/tests/pecha/metadata/data/expected_extracted_metadata.json b/tests/pecha/metadata/data/expected_extracted_metadata.json deleted file mode 100644 index 28cef68b..00000000 --- a/tests/pecha/metadata/data/expected_extracted_metadata.json +++ /dev/null @@ -1,48 +0,0 @@ -{ - "ocr_import_info": { - "source": "bdrc", - "software": "google_books", - "batch": "batch_2022", - "expected_default_language": "bo", - "bdrc_scan_id": "W24767", - "ocr_info": { - "timestamp": "1977-04-22T06:00:00Z", - "prop1": "value 1", - "prop2": "value 2" - } - }, - "buda_data": { - "source_metadata": { - "id": "http://purl.bdrc.io/resource/W24767", - "status": "http://purl.bdrc.io/admindata/StatusReleased", - "access": "http://purl.bdrc.io/admindata/AccessOpen", - "reproduction_of": "http://purl.bdrc.io/resource/MW24767", - "copyright_status": "http://purl.bdrc.io/resource/CopyrightPublicDomain", - "title": "ལམ་འབྲས་གཞུང་རྡོ་རྗེའི་ཚིག་རྐང་གི་རྣམ་འགྲེལ་བཅུ་གཅིག", - "author": "ས་ཆེན་ཀུན་དགའ་སྙིང་པོ།", - "languages": [ - "bo" - ] - }, - "image_groups": { - "I3852": { - "id": "http://purl.bdrc.io/resource/I3852", - "total_pages": 542, - "volume_number": 1, - "volume_pages_bdrc_intro": 2 - }, - "I3853": { - "id": "http://purl.bdrc.io/resource/I3853", - "total_pages": 504, - "volume_number": 2, - "volume_pages_bdrc_intro": 2 - }, - "I3854": { - "id": "http://purl.bdrc.io/resource/I3854", - "total_pages": 502, - "volume_number": 3, - "volume_pages_bdrc_intro": 2 - } - } - } -} \ No newline at end of file diff --git a/tests/pecha/metadata/data/expected_formatted_metadata.json b/tests/pecha/metadata/data/expected_formatted_metadata.json deleted file mode 100644 index adc1c2aa..00000000 --- a/tests/pecha/metadata/data/expected_formatted_metadata.json +++ /dev/null @@ -1,60 +0,0 @@ -{ - "source_type": "bdrc", - "bdrc": { - "ocr_import_info": { - "source": "bdrc", - "software": "google_books", - "batch": "batch_2022", - "expected_default_language": "bo", - "bdrc_scan_id": "W24767", - "ocr_info": { - "timestamp": "1977-04-22T06:00:00Z", - "prop1": "value 1", - "prop2": "value 2" - } - }, - "buda_data": { - "source_metadata": { - "id": "http://purl.bdrc.io/resource/W24767", - "status": "http://purl.bdrc.io/admindata/StatusReleased", - "access": "http://purl.bdrc.io/admindata/AccessOpen", - "reproduction_of": "http://purl.bdrc.io/resource/MW24767", - "copyright_status": "http://purl.bdrc.io/resource/CopyrightPublicDomain", - "title": "\u0f63\u0f58\u0f0b\u0f60\u0f56\u0fb2\u0f66\u0f0b\u0f42\u0f5e\u0f74\u0f44\u0f0b\u0f62\u0fa1\u0f7c\u0f0b\u0f62\u0f97\u0f7a\u0f60\u0f72\u0f0b\u0f5a\u0f72\u0f42\u0f0b\u0f62\u0f90\u0f44\u0f0b\u0f42\u0f72\u0f0b\u0f62\u0fa3\u0f58\u0f0b\u0f60\u0f42\u0fb2\u0f7a\u0f63\u0f0b\u0f56\u0f45\u0f74\u0f0b\u0f42\u0f45\u0f72\u0f42", - "author": "\u0f66\u0f0b\u0f46\u0f7a\u0f53\u0f0b\u0f40\u0f74\u0f53\u0f0b\u0f51\u0f42\u0f60\u0f0b\u0f66\u0f99\u0f72\u0f44\u0f0b\u0f54\u0f7c\u0f0d", - "languages": [ - "bo" - ] - }, - "image_groups": { - "I3852": { - "id": "http://purl.bdrc.io/resource/I3852", - "total_pages": 542, - "volume_number": 1, - "volume_pages_bdrc_intro": 2 - }, - "I3853": { - "id": "http://purl.bdrc.io/resource/I3853", - "total_pages": 504, - "volume_number": 2, - "volume_pages_bdrc_intro": 2 - }, - "I3854": { - "id": "http://purl.bdrc.io/resource/I3854", - "total_pages": 502, - "volume_number": 3, - "volume_pages_bdrc_intro": 2 - } - } - } - }, - "document_id": "W24767", - "language": "bo", - "source_url": "http://purl.bdrc.io/resource/W24767", - "author": { - "bo": "\u0f66\u0f0b\u0f46\u0f7a\u0f53\u0f0b\u0f40\u0f74\u0f53\u0f0b\u0f51\u0f42\u0f60\u0f0b\u0f66\u0f99\u0f72\u0f44\u0f0b\u0f54\u0f7c\u0f0d" - }, - "title": { - "bo": "\u0f63\u0f58\u0f0b\u0f60\u0f56\u0fb2\u0f66\u0f0b\u0f42\u0f5e\u0f74\u0f44\u0f0b\u0f62\u0fa1\u0f7c\u0f0b\u0f62\u0f97\u0f7a\u0f60\u0f72\u0f0b\u0f5a\u0f72\u0f42\u0f0b\u0f62\u0f90\u0f44\u0f0b\u0f42\u0f72\u0f0b\u0f62\u0fa3\u0f58\u0f0b\u0f60\u0f42\u0fb2\u0f7a\u0f63\u0f0b\u0f56\u0f45\u0f74\u0f0b\u0f42\u0f45\u0f72\u0f42" - } -} \ No newline at end of file diff --git a/tests/pecha/metadata/data/input_metadata.json b/tests/pecha/metadata/data/input_metadata.json deleted file mode 100644 index 58d283ef..00000000 --- a/tests/pecha/metadata/data/input_metadata.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "title": "Dummy Title", - "author": "Dummy Author", - "source": "Dummy Source", - "initial_creation_type": "ebook", - "language": "bo" -} diff --git a/tests/pecha/metadata/data/pecha_metadata.json b/tests/pecha/metadata/data/pecha_metadata.json deleted file mode 100644 index 1b34e5d2..00000000 --- a/tests/pecha/metadata/data/pecha_metadata.json +++ /dev/null @@ -1,19 +0,0 @@ -{ - "id": "I363FFF6C", - "title": "Dummy Title", - "author": "Dummy Author", - "imported": "2024-10-10T13:35:20.963029", - "source": "Dummy Source", - "toolkit_version": "0.0.1", - "parser": "ChonjukChapterParser", - "initial_creation_type": "ebook", - "language": "bo", - "source_metadata": {}, - "bases": {}, - "copyright": { - "status": "Unknown", - "notice": "", - "info_url": null - }, - "licence": "Unknown" - } \ No newline at end of file diff --git a/tests/pecha/metadata/test_metadata.py b/tests/pecha/metadata/test_metadata.py deleted file mode 100644 index adc01e6a..00000000 --- a/tests/pecha/metadata/test_metadata.py +++ /dev/null @@ -1,231 +0,0 @@ -import json -from datetime import datetime -from pathlib import Path -from unittest import TestCase - -from openpecha import ids -from openpecha.bdrc_utils import extract_metadata_for_work, format_metadata_for_op_api -from openpecha.pecha.metadata import ( - Copyright, - CopyrightStatus, - InitialCreationType, - InitialPechaMetadata, - LicenseType, - PechaMetaData, -) -from openpecha.pecha.parsers import DummyParser -from openpecha.utils import read_json - - -class TestPechaMetadata(TestCase): - def test_create_instance(self): - """ - Create an instance of PechaMetaData from raw metadata. - """ - file = Path(__file__).parent / "data" / "input_metadata.json" - with open(file) as f: - metadata = json.load(f) - pecha_metadata = PechaMetaData(parser=DummyParser().name, **metadata) - - toolkit_version = pecha_metadata.toolkit_version - self.assertEqual(len((toolkit_version).split(".")), 3) - self.assertIsInstance(pecha_metadata, PechaMetaData) - - def test_load(self): - """ - Create an instance of PechaMetaData from a processed metadata file. - """ - file = Path(__file__).parent / "data" / "pecha_metadata.json" - with open(file) as f: - metadata = json.load(f) - - pecha_metadata = PechaMetaData(**metadata) - self.assertIsInstance(pecha_metadata, PechaMetaData) - - def test_toolkit_version(self): - """Test when toolkit_version is provided in input""" - file = Path(__file__).parent / "data" / "input_metadata.json" - with open(file) as f: - metadata = json.load(f) - - pecha_metadata = PechaMetaData(parser=DummyParser().name, **metadata) - self.assertIsInstance(pecha_metadata.toolkit_version, str) - - def test_base_pecha_metadata_model(self): - imported_at = datetime.fromisoformat("2020-01-01T00:00:00") - last_modified_at = datetime.fromisoformat("2020-01-01T00:00:00") - - metadata = PechaMetaData( - id=ids.get_initial_pecha_id(), - source="https://library.bdrc.io", - source_file="https://library.bdrc.io/text.json", - initial_creation_type=InitialCreationType.ocr, - imported=imported_at, - last_modified=last_modified_at, - parser=DummyParser().name, - source_metadata={ - "id": "bdr:W1PD90121", - "title": "མའོ་རྫོང་གི་ས་ཆའི་མིང་བཏུས།", - "author": "author name", - }, - base={ - "f3c9": { - "id": "I1PD90137", - "title": "Volume 1 of mao wen qiang zu zi zhi xian di ming lu", - "total_pages": 220, - "order": 1, - "base_file": "f3c9.tx", - } - }, - ) - - self.assertEqual(metadata.imported, imported_at) - self.assertEqual(metadata.last_modified, last_modified_at) - self.assertEqual( - metadata.initial_creation_type.value, InitialCreationType.ocr.value - ) - self.assertTrue(metadata.id.startswith("I")) - self.assertEqual(len(metadata.id), 9) - - def test_initial_pecha_metadata(self): - metadata = InitialPechaMetadata( - initial_creation_type=InitialCreationType.ocr, - statistics={"ocr_word_median_confidence_index": 0.9}, - bases={ - "id": "529C", - "source_metadata": { - "image_group_id": "I3CN8548", - "title": "", - "total_pages": 62, - }, - "order": 1, - "base_file": "529C.txt", - "statistics": { - "ocr_word_median_confidence_index": 0.9, - }, - }, - parser=DummyParser().name, # Add parser field. - ) - - self.assertEqual( - metadata.initial_creation_type.value, InitialCreationType.ocr.value - ) - self.assertTrue(metadata.id.startswith("I")) - - self.assertIsNotNone(metadata.statistics) - self.assertEqual(metadata.statistics["ocr_word_median_confidence_index"], 0.9) - self.assertIsNotNone(metadata.bases) - self.assertEqual(metadata.bases["id"], "529C") - - def test_pecha_copyright(self): - copyright_status = CopyrightStatus.COPYRIGHTED - - copyright = Copyright( - status=copyright_status, - notice="Copyright 2022 OpenPecha", - info_url="https://dev.openpecha.org", - ) - - metadata = InitialPechaMetadata( - initial_creation_type=InitialCreationType.ocr, - copyright=copyright, - parser=DummyParser().name, # Add parser field. - ) - - self.assertIsNotNone(metadata.copyright) - self.assertEqual(metadata.copyright.status, copyright_status) - - def test_pecha_licence(self): - license_type = LicenseType.CC_BY_NC_SA - - metadata = InitialPechaMetadata( - initial_creation_type=InitialCreationType.ocr, - license=license_type, - parser=DummyParser().name, # Add parser field. - ) - - self.assertIsNotNone(metadata.license) - self.assertEqual(metadata.license, license_type) - - def test_extract_metadata_for_work(self): - metadata = extract_metadata_for_work(Path(__file__).parent / "data" / "W24767") - expected_metadata = read_json( - Path(__file__).parent / "data" / "expected_extracted_metadata.json" - ) - self.assertIsNotNone(metadata) - self.assertIsInstance(metadata, dict) - - if metadata and expected_metadata: # check both metadata and expected_metadata - if metadata.get("ocr_import_info") and expected_metadata.get( - "ocr_import_info" - ): - self.assertEqual( - metadata.get("ocr_import_info"), - expected_metadata.get("ocr_import_info"), - ) - if metadata.get("buda_data") and expected_metadata.get("buda_data"): - self.assertEqual( - metadata.get("buda_data"), expected_metadata.get("buda_data") - ) - - def test_format_metadata_for_op_api(self): - """Test that BDRC metadata is correctly formatted for OpenPecha API.""" - # Load test input data - metadata = extract_metadata_for_work(Path(__file__).parent / "data" / "W24767") - - # Call the function under test - formatted_data = format_metadata_for_op_api(metadata) - - # Load expected output - expected_formatted_metadata = read_json( - Path(__file__).parent / "data" / "expected_formatted_metadata.json" - ) - - self.assertIsNotNone(formatted_data) - self.assertIsInstance(formatted_data, dict) - - if formatted_data and expected_formatted_metadata: - if formatted_data.get("bdrc") and expected_formatted_metadata.get("bdrc"): - self.assertEqual( - formatted_data.get("bdrc"), expected_formatted_metadata.get("bdrc") - ) - if formatted_data.get("author") and expected_formatted_metadata.get( - "author" - ): - self.assertEqual( - formatted_data.get("author"), - expected_formatted_metadata.get("author"), - ) - if formatted_data.get("document_id") and expected_formatted_metadata.get( - "document_id" - ): - self.assertEqual( - formatted_data.get("document_id"), - expected_formatted_metadata.get("document_id"), - ) - if formatted_data.get("language") and expected_formatted_metadata.get( - "language" - ): - self.assertEqual( - formatted_data.get("language"), - expected_formatted_metadata.get("language"), - ) - if formatted_data.get("long_title") and expected_formatted_metadata.get( - "long_title" - ): - self.assertEqual( - formatted_data.get("long_title"), - expected_formatted_metadata.get("long_title"), - ) - if formatted_data.get("source_url") and expected_formatted_metadata.get( - "source_url" - ): - self.assertEqual( - formatted_data.get("source_url"), - expected_formatted_metadata.get("source_url"), - ) - if formatted_data.get("title") and expected_formatted_metadata.get("title"): - self.assertEqual( - formatted_data.get("title"), - expected_formatted_metadata.get("title"), - ) diff --git a/tests/pecha/parser/docx/__init__.py b/tests/pecha/parser/docx/__init__.py deleted file mode 100644 index e69de29b..00000000 diff --git a/tests/pecha/parser/docx/annotation/__init__.py b/tests/pecha/parser/docx/annotation/__init__.py deleted file mode 100644 index e69de29b..00000000 diff --git a/tests/pecha/parser/docx/annotation/data/commentary_pecha/expected_new_anns.json b/tests/pecha/parser/docx/annotation/data/commentary_pecha/expected_new_anns.json deleted file mode 100644 index 188703a7..00000000 --- a/tests/pecha/parser/docx/annotation/data/commentary_pecha/expected_new_anns.json +++ /dev/null @@ -1,87 +0,0 @@ -[ - { - "index": 1, - "alignment_index": [ - 2 - ], - "segmentation_type": "alignment", - "text": "གཟུང་བ་མེད་པར་ཞེས་བྱ་བ་ལ་སོགས་པ་གསུངས་ཏེ། " - }, - { - "index": 2, - "alignment_index": [ - 2 - ], - "segmentation_type": "alignment", - "text": "འཆད་པར་འགྱུར་བའི་རིགས་པས་ཞེས་བྱ་བ་ནི་ཅི་ལྟར་རླུང་གིས་ཞེས་བྱ་བ་ལ་སོགས་པའོ། " - }, - { - "index": 3, - "alignment_index": [ - 2, - 3 - ], - "segmentation_type": "alignment", - "text": "ཐ་སྙད་ཙམ་དུ་ཡོད་པ་དོན་དམ་པར་ཡོད་པ་མ་ཡིན་པ་དེ་ལ་དངོས་པོ་བརྟགས་པར་ཡོད་པ་ཞེས་བྱ་ལ། " - }, - { - "index": 4, - "alignment_index": [ - 4 - ], - "segmentation_type": "alignment", - "text": "ཕྱི་རོལ་གྱི་དོན་མེད་ན་ཞེས་བྱ་བ་ནི་ཤེས་པ་ལས་ཐ་དད་དུ་གྱུར་པའི་སྔོན་པོ་ལ་སོགས་པའི་དོན་མེད་ནའོ།" - }, - { - "index": 5, - "alignment_index": [ - 5 - ], - "segmentation_type": "alignment", - "text": "དེ་བསམ་བྱ་ཞེས་བྱ་བ་ནི་རྨི་ལམ་ན་དོན་མེད་པའི་ཤེས་པ་ཡོད་པ་ཡིན་ནོ་ཞེས་བྱ་བ་དེ་ཡང་བསམ་པར་བྱའོ། \n།ཡང་དེ་ཅི་ཞིག་ཅེ་ན་ཞེས་བྱ་བ་ནི། བསམ་པར་བྱ་བ་གང་ཞིག་ཡིན། །ཞེས་བྱ་བའོ། །" - }, - { - "index": 6, - "alignment_index": [ - 6, - 7, - 8 - ], - "segmentation_type": "alignment", - "text": "གང་ཚེ་ཞེས་བྱ་བ་ལ་སོགས་པ་ནི་དབུ་མ་གང་གི་ཚེ་རྨི་ལམ་ན་ཡང་སེམས་མེད་པ་དེའི་ཚེ་རྨི་ལམ་གྱི་དཔེ་དེ་མ་གྲུབ་པ་ཡིན་ནོ་ཞེས་བྱ་བའི་ཐ་ཚིག་གོ། " - }, - { - "index": 7, - "alignment_index": [ - 6, - 7, - 8 - ], - "segmentation_type": "alignment", - "text": "།དེ་ཉིད་བསྟན་པར་བྱ་བའི་ཕྱིར། དེ་ཚེ་ཁྱོད་ཀྱི་དཔེ་ཡོད་མིན། །ཞེས་བྱ་བ་གསུངས་ཏེ། " - }, - { - "index": 8, - "alignment_index": [ - 8 - ], - "segmentation_type": "alignment", - "text": "གཉིད་སད་པའི་གནས་སྐབས་ན་དྲན་པའི་སྒོ་ནས་གལ་ཏེ་རྨི་ལམ་གྱི་གནས་སྐབས་ན་རྣམ་པར་ཤེས་པ་ཡོད་པར་ཁས་ལེན་ན། " - }, - { - "index": 9, - "alignment_index": [ - 8 - ], - "segmentation_type": "alignment", - "text": "ཅི་ལྟར་ཞེས་བྱ་བ་ལ་སོགས་པ་གསུངས་ཏེ། ཅི་ལྟར་ཉམས་སུ་མྱོང་བ་ལ་ནི་ངས་མཐོང་ངོ་སྙམ་པའི་དྲན་པ་ཡོད་པ་དེ་བཞིན་དུ་ཡུལ་ཡང་ཁས་བླང་བར་བྱ་དགོས་སོ། \n།ཡང་ན་རྣམ་པར་ཤེས་པ་ཡང་ཡོད་པ་མ་ཡིན་ནོ་ཞེས་བྱ་བ་ནི་གལ་ཏེ་ཡུལ་དྲན་དུ་ཟིན་ཀྱང་ཡུལ་ཁས་མི་ལེན་ན་རྣམ་པར་ཤེས་པ་དྲན་ཡང་རྣམ་པར་ཤེས་པ་མེད་པར་ཁས་བླང་བའི་ཕྱིར་ཁོ་བོའི་ཕྱོགས་གྲུབ་པ་ཡིན་ནོ། " - }, - { - "index": 10, - "alignment_index": [ - 10 - ], - "segmentation_type": "alignment", - "text": "གཉིད་ཀྱིས་ཞེས་བྱ་བ་ལ་སོགས་པ་གསུངས་ཏེ། གཉིད་ལོག་པས་མིག་དང༌། རྣ་བ་དང༌། སྣ་དང༌།" - } -] \ No newline at end of file diff --git "a/tests/pecha/parser/docx/annotation/data/commentary_pecha/\340\275\221\340\275\202\340\275\274\340\275\204\340\275\246\340\274\213\340\275\224\340\274\213\340\275\242\340\275\226\340\274\213\340\275\202\340\275\246\340\275\243\340\274\213\340\275\243\340\275\246\340\274\213\340\275\246\340\275\272\340\275\230\340\275\246\340\274\213\340\275\226\340\275\246\340\276\220\340\276\261\340\275\272\340\275\221\340\274\213\340\275\221\340\276\262\340\275\264\340\275\202\340\274\213\340\275\224\340\274\215 \340\275\244\340\275\274\340\274\213\340\275\243\340\275\274\340\274\213\340\275\200 \340\274\241-\340\274\246\340\274\244 _commentary segmentation 1.docx" "b/tests/pecha/parser/docx/annotation/data/commentary_pecha/\340\275\221\340\275\202\340\275\274\340\275\204\340\275\246\340\274\213\340\275\224\340\274\213\340\275\242\340\275\226\340\274\213\340\275\202\340\275\246\340\275\243\340\274\213\340\275\243\340\275\246\340\274\213\340\275\246\340\275\272\340\275\230\340\275\246\340\274\213\340\275\226\340\275\246\340\276\220\340\276\261\340\275\272\340\275\221\340\274\213\340\275\221\340\276\262\340\275\264\340\275\202\340\274\213\340\275\224\340\274\215 \340\275\244\340\275\274\340\274\213\340\275\243\340\275\274\340\274\213\340\275\200 \340\274\241-\340\274\246\340\274\244 _commentary segmentation 1.docx" deleted file mode 100644 index 7d88f14c..00000000 Binary files "a/tests/pecha/parser/docx/annotation/data/commentary_pecha/\340\275\221\340\275\202\340\275\274\340\275\204\340\275\246\340\274\213\340\275\224\340\274\213\340\275\242\340\275\226\340\274\213\340\275\202\340\275\246\340\275\243\340\274\213\340\275\243\340\275\246\340\274\213\340\275\246\340\275\272\340\275\230\340\275\246\340\274\213\340\275\226\340\275\246\340\276\220\340\276\261\340\275\272\340\275\221\340\274\213\340\275\221\340\276\262\340\275\264\340\275\202\340\274\213\340\275\224\340\274\215 \340\275\244\340\275\274\340\274\213\340\275\243\340\275\274\340\274\213\340\275\200 \340\274\241-\340\274\246\340\274\244 _commentary segmentation 1.docx" and /dev/null differ diff --git a/tests/pecha/parser/docx/annotation/data/root_display_pecha/expected_new_anns.json b/tests/pecha/parser/docx/annotation/data/root_display_pecha/expected_new_anns.json deleted file mode 100644 index 3b3d1366..00000000 --- a/tests/pecha/parser/docx/annotation/data/root_display_pecha/expected_new_anns.json +++ /dev/null @@ -1,62 +0,0 @@ -[ - { - "index": 1, - "segmentation_type": "alignment", - "text": "བུ་མ་འཇུག་པ་ལས་སེམས་བསྐྱེད་དྲུག་པ། ཤོ་ལོ་ཀ ༡-༦༤ མངོན་དུ་ཕྱོགས་པར་མཉམ་བཞག་སེམས་གནས་ཏེ། " - }, - { - "index": 2, - "segmentation_type": "alignment", - "text": "།རྫོགས་པའི་སངས་རྒྱས་ཆོས་ལ་མངོན་ཕྱོགས་ཤིང༌། །" - }, - { - "index": 3, - "segmentation_type": "alignment", - "text": "འདི་བརྟེན་འབྱུང་བའི་དེ་ཉིད་མཐོང་བ་དེས། །" - }, - { - "index": 4, - "segmentation_type": "alignment", - "text": "ཤེས་རབ་གནས་པས་འགོག་པ་ཐོབ་པར་འགྱུར། །" - }, - { - "index": 5, - "segmentation_type": "alignment", - "text": "ཇི་ལྟར་ལོང་བའི་ཚོགས་ཀུན་བདེ་བླག་ཏུ། །མིག་ལྡན་སྐྱེས་བུ་གཅིག་གིས་འདོད་པ་ཡི། །ཡུལ་དུ་འཁྲིད་པ་དེ་བཞིན་འདིར་ཡང་བློས། །མིག་ཉམས་ཡོན་ཏན་བླངས་ཏེ་རྒྱལ་ཉིད་འགྲོ། །" - }, - { - "index": 6, - "segmentation_type": "alignment", - "text": "ཇི་ལྟར་དེ་ཡིས་ཆེས་ཟབ་ཆོས་རྟོགས་པ། །ལུང་དང་གཞན་ཡང་རིགས་པས་ཡིན་པས་ན། །དེ་ལྟར་འཕགས་པ་ཀླུ་སྒྲུབ་གཞུང་ལུགས་ལས། །ཇི་ལྟར་གནས་པའི་ལུགས་བཞིན་བརྗོད་པར་བྱ། །" - }, - { - "index": 7, - "segmentation_type": "alignment", - "text": "སོ་སོ་སྐྱེ་བོའི་དུས་ནའང་སྟོང་པ་ཉིད་ཐོས་ནས། །ནང་དུ་རབ་ཏུ་དགའ་བ་ཡང་དང་ཡང་དུ་འབྱུང༌། །རབ་ཏུ་དགའ་བ་ལས་བྱུང་མཆི་མས་མིག་བརླན་ཞིང༌། །ལུས་ཀྱི་བ་སྤུ་ལྡང་པར་འགྱུར་པ་གང་ཡིན་པ། །" - }, - { - "index": 8, - "segmentation_type": "alignment", - "text": "དེ་ལ་རྫོགས་པའི་སངས་རྒྱས་བློ་ཡི་ས་བོན་ཡོད། །དེ་ཉིད་ཉེ་བར་བསྟན་པའི་སྣོད་ནི་དེ་ཡིན་ཏེ། །དེ་ལ་དམ་པའི་དོན་གྱི་བདེན་པ་བསྟན་པར་བྱ། །དེ་ལ་དེ་ཡི་རྗེས་སུ་འགྲོ་བའི་ཡོན་ཏན་འབྱུང༌། །" - }, - { - "index": 9, - "segmentation_type": "alignment", - "text": "རྟག་ཏུ་ཚུལ་ཁྲིམས་ཡང་དག་བླངས་ནས་གནས་པར་འགྱུར། །སྦྱིན་པ་གཏོང་བར་འགྱུར་ཞིང་སྙིང་རྗེ་བསྟེན་པར་བྱེད། །བཟོད་པ་སྒོམ་བྱེད་དེ་ཡི་དགེ་བའང་བྱང་ཆུབ་ཏུ། །འགྲོ་བ་དགྲོལ་བར་བྱ་ཕྱིར་ཡོངས་སུ་བསྔོ་བྱེད་ཅིང༌། །" - }, - { - "index": 10, - "segmentation_type": "alignment", - "text": "རྫོགས་པའི་བྱང་ཆུབ་སེམས་དཔའ་རྣམས་ལ་གུས་པར་བྱེད། །ཟབ་ཅིང་རྒྱ་ཆེའི་ཚུལ་ལ་མཁས་པའི་སྐྱེ་བོས་ནི། །རིམ་གྱིས་རབ་ཏུ་དགའ་བའི་ས་ནི་འཐོབ་འགྱུར་བས། །དེ་ནི་དོན་དུ་གཉེར་བས་ལམ་འདི་མཉན་པར་གྱིས། །" - }, - { - "index": 11, - "segmentation_type": "alignment", - "text": "དེ་ཉིད་དེ་ལས་འབྱུང་མིན་གཞན་དག་ལས་ལྟ་ག་ལ་ཞིག །གཉིས་ཀ་ལས་ཀྱང་མ་ཡིན་རྒྱུ་མེད་པར་ནི་ག་ལ་ཡོད། །དེ་ནི་དེ་ལས་འབྱུང་ན་ཡོན་ཏན་འགའ་ཡང་ཡོད་མ་ཡིན། །སྐྱེས་པར་གྱུར་པ་སླར་ཡང་སྐྱེ་བར་རིགས་པའང་མ་ཡིན་ཉིད། །" - }, - { - "index": 12, - "segmentation_type": "alignment", - "text": "སྐྱེས་ཟིན་སླར་ཡང་སྐྱེ་བར་ཡོངས་སུ་རྟོག་པར་འགྱུར་ན་ནི། །མྱུ་གུ་ལ་སོགས་རྣམས་ཀྱི་སྐྱེ་བ་འདིར་རྙེད་མི་འགྱུར་ཞིང༌། །ས་བོན་སྲིད་མཐར་ཐུག་པར་རབ་ཏུ་སྐྱེ་བ་ཉིད་དུ་འགྱུར། །ཇི་ལྟར་དེ་ཉིད་ཀྱིས་དེ་རྣམ་པར་འཇིག་པར་བྱེད་པར་འགྱུར། །" - } -] \ No newline at end of file diff --git "a/tests/pecha/parser/docx/annotation/data/root_display_pecha/\340\275\221\340\275\202\340\275\274\340\275\204\340\275\246\340\274\213\340\275\224\340\274\213\340\275\242\340\275\226\340\274\213\340\275\202\340\275\246\340\275\243\340\274\213\340\275\243\340\275\246\340\274\213\340\275\246\340\275\272\340\275\230\340\275\246\340\274\213\340\275\226\340\275\246\340\276\220\340\276\261\340\275\272\340\275\221\340\274\213\340\275\221\340\276\262\340\275\264\340\275\202\340\274\213\340\275\224\340\274\215 \340\275\244\340\275\274\340\274\213\340\275\243\340\275\274\340\274\213\340\275\200 \340\274\241-\340\274\246\340\274\244 segmentation 1.docx" "b/tests/pecha/parser/docx/annotation/data/root_display_pecha/\340\275\221\340\275\202\340\275\274\340\275\204\340\275\246\340\274\213\340\275\224\340\274\213\340\275\242\340\275\226\340\274\213\340\275\202\340\275\246\340\275\243\340\274\213\340\275\243\340\275\246\340\274\213\340\275\246\340\275\272\340\275\230\340\275\246\340\274\213\340\275\226\340\275\246\340\276\220\340\276\261\340\275\272\340\275\221\340\274\213\340\275\221\340\276\262\340\275\264\340\275\202\340\274\213\340\275\224\340\274\215 \340\275\244\340\275\274\340\274\213\340\275\243\340\275\274\340\274\213\340\275\200 \340\274\241-\340\274\246\340\274\244 segmentation 1.docx" deleted file mode 100644 index 4435ab65..00000000 Binary files "a/tests/pecha/parser/docx/annotation/data/root_display_pecha/\340\275\221\340\275\202\340\275\274\340\275\204\340\275\246\340\274\213\340\275\224\340\274\213\340\275\242\340\275\226\340\274\213\340\275\202\340\275\246\340\275\243\340\274\213\340\275\243\340\275\246\340\274\213\340\275\246\340\275\272\340\275\230\340\275\246\340\274\213\340\275\226\340\275\246\340\276\220\340\276\261\340\275\272\340\275\221\340\274\213\340\275\221\340\276\262\340\275\264\340\275\202\340\274\213\340\275\224\340\274\215 \340\275\244\340\275\274\340\274\213\340\275\243\340\275\274\340\274\213\340\275\200 \340\274\241-\340\274\246\340\274\244 segmentation 1.docx" and /dev/null differ diff --git a/tests/pecha/parser/docx/annotation/test_docx_ann_parser.py b/tests/pecha/parser/docx/annotation/test_docx_ann_parser.py deleted file mode 100644 index 51decdae..00000000 --- a/tests/pecha/parser/docx/annotation/test_docx_ann_parser.py +++ /dev/null @@ -1,90 +0,0 @@ -from pathlib import Path -from unittest import TestCase - -from stam import AnnotationStore - -from openpecha.pecha import get_anns -from openpecha.pecha.layer import AnnotationType -from openpecha.pecha.parsers.docx.annotation import DocxAnnotationParser -from openpecha.utils import read_json -from tests.pecha import SharedPechaSetup - - -class TestDocxAnnotationParser(TestCase, SharedPechaSetup): - def setUp(self): - self.setup_pechas() - self.parser = DocxAnnotationParser() - self.root_pecha_backup = { - f: f.read_bytes() for f in self.root_pecha_path.glob("**/*") if f.is_file() - } - self.commentary_pecha_backup = { - f: f.read_bytes() - for f in self.commentary_pecha_path.glob("**/*") - if f.is_file() - } - - def test_root_pecha(self): - type = AnnotationType.ALIGNMENT - docx_file = Path( - "tests/pecha/parser/docx/annotation/data/root_display_pecha/དགོངས་པ་རབ་གསལ་ལས་སེམས་བསྐྱེད་དྲུག་པ། ཤོ་ལོ་ཀ ༡-༦༤ segmentation 1.docx" - ) - metadatas = [self.root_pecha_metadata] - - pecha, layer_name = self.parser.add_annotation( - self.root_pecha, type, docx_file, metadatas - ) - layer_path = pecha.layer_path / layer_name - new_anns = get_anns(AnnotationStore(file=str(layer_path))) - expected_new_anns = read_json( - Path( - "tests/pecha/parser/docx/annotation/data/root_display_pecha/expected_new_anns.json" - ) - ) - - assert new_anns == expected_new_anns - - def test_commentary_pecha(self): - type = AnnotationType.ALIGNMENT - docx_file = Path( - "tests/pecha/parser/docx/annotation/data/commentary_pecha/དགོངས་པ་རབ་གསལ་ལས་སེམས་བསྐྱེད་དྲུག་པ། ཤོ་ལོ་ཀ ༡-༦༤ _commentary segmentation 1.docx" - ) - metadatas = [ - self.commentary_pecha_metadata, - self.root_pecha_metadata, - ] - - pecha, layer_name = self.parser.add_annotation( - self.commentary_pecha, - type, - docx_file, - metadatas, - ) - layer_path = pecha.layer_path / layer_name - - new_anns = get_anns(AnnotationStore(file=str(layer_path))) - expected_new_anns = read_json( - Path( - "tests/pecha/parser/docx/annotation/data/commentary_pecha/expected_new_anns.json" - ) - ) - - assert new_anns == expected_new_anns - - def tearDown(self) -> None: - # Revert all original files - for f, content in self.root_pecha_backup.items(): - f.write_bytes(content) - - # Remove any new files that weren't in the original backup - for f in self.root_pecha_path.glob("**/*"): - if f.is_file() and f not in self.root_pecha_backup: - f.unlink() - - # Revert all original files - for f, content in self.commentary_pecha_backup.items(): - f.write_bytes(content) - - # Remove any new files that weren't in the original backup - for f in self.commentary_pecha_path.glob("**/*"): - if f.is_file() and f not in self.commentary_pecha_backup: - f.unlink() diff --git a/tests/pecha/parser/docx/commentary/complex/data/bo/Tibetan Commentary text test 2.docx b/tests/pecha/parser/docx/commentary/complex/data/bo/Tibetan Commentary text test 2.docx deleted file mode 100644 index 3ed39661..00000000 Binary files a/tests/pecha/parser/docx/commentary/complex/data/bo/Tibetan Commentary text test 2.docx and /dev/null differ diff --git a/tests/pecha/parser/docx/commentary/complex/data/bo/metadata.json b/tests/pecha/parser/docx/commentary/complex/data/bo/metadata.json deleted file mode 100644 index 0ac35c3a..00000000 --- a/tests/pecha/parser/docx/commentary/complex/data/bo/metadata.json +++ /dev/null @@ -1,17 +0,0 @@ -{ - "author": {"bo": "བཀྲ་ཤིས་ཚེ་རིང༌།"}, - "usage_title": "དབུ་མ་འཇུག་པ། སེམས་བསྐྱེད་དྲུག་པ། - ཤོ་ལོ་ག་༢༩-༦༤", - "title_short": { - "bo": "རྡོ་རྗེ་གཅོད་པ་བོད་ཡིག་འགྲེལ་པ་དེབ། ༢", "en": "Vajra Cutter Tibetan Commentary Book 2" - }, - "title_long_clean":{ - "bo": "འཕགས་པ་ཤེས་རབ་ཀྱི་ཕ་རོལ་ཏུ་ཕྱིན་པ་རྡོ་རྗེ་གཅོད་པ་ཞེས་བྱ་བ།" - }, - "title_alt_1":{ - "bo": "འཕགས་པ་ཤེས་རབ་ཀྱི་ཕ་རོལད་ཏུ་ཕྱིནད་པ་རྡོ་རྗེ་གཅོད་པ་ཞེས་བྱ་བ་ཐེག་པ་ཆེན་པོའི་མདོ།" - }, - "title_alt_2":{ - "bo": "རྡོ་རྗེ་སུམ་བརྒྱ་པའམ་རྡོ་རྗེ་གཅོད་པ།" - }, - "language": "bo" -} \ No newline at end of file diff --git a/tests/pecha/parser/docx/commentary/complex/data/en/English aligned Commentary Text 2.docx b/tests/pecha/parser/docx/commentary/complex/data/en/English aligned Commentary Text 2.docx deleted file mode 100644 index f3658cbb..00000000 Binary files a/tests/pecha/parser/docx/commentary/complex/data/en/English aligned Commentary Text 2.docx and /dev/null differ diff --git a/tests/pecha/parser/docx/commentary/complex/data/en/metadata.json b/tests/pecha/parser/docx/commentary/complex/data/en/metadata.json deleted file mode 100644 index 091a53b2..00000000 --- a/tests/pecha/parser/docx/commentary/complex/data/en/metadata.json +++ /dev/null @@ -1,16 +0,0 @@ -{ - "author": {"bo": "Tashi Tsering"}, - "title_short": { - "bo": "རྡོ་རྗེ་གཅོད་པ་དབྱིན་ཡིག་འགྲེལ་པ་དེབ། ༢", "en": "Vajra Cutter Tibetan Commentary Book 2" - }, - "title_long_clean":{ - "bo": "འཕགས་པ་ཤེས་རབ་ཀྱི་ཕ་རོལ་ཏུ་ཕྱིན་པ་རྡོ་རྗེ་གཅོད་པ་ཞེས་བྱ་བ།" - }, - "title_alt_1":{ - "bo": "འཕགས་པ་ཤེས་རབ་ཀྱི་ཕ་རོལད་ཏུ་ཕྱིནད་པ་རྡོ་རྗེ་གཅོད་པ་ཞེས་བྱ་བ་ཐེག་པ་ཆེན་པོའི་མདོ།" - }, - "title_alt_2":{ - "bo": "རྡོ་རྗེ་སུམ་བརྒྱ་པའམ་རྡོ་རྗེ་གཅོད་པ།" - }, - "language": "en" -} \ No newline at end of file diff --git a/tests/pecha/parser/docx/commentary/complex/data/zh/Chinese aligned Commentary Text 1.docx b/tests/pecha/parser/docx/commentary/complex/data/zh/Chinese aligned Commentary Text 1.docx deleted file mode 100644 index 5752f0e6..00000000 Binary files a/tests/pecha/parser/docx/commentary/complex/data/zh/Chinese aligned Commentary Text 1.docx and /dev/null differ diff --git a/tests/pecha/parser/docx/commentary/complex/data/zh/metadata.json b/tests/pecha/parser/docx/commentary/complex/data/zh/metadata.json deleted file mode 100644 index 82d5bb44..00000000 --- a/tests/pecha/parser/docx/commentary/complex/data/zh/metadata.json +++ /dev/null @@ -1,11 +0,0 @@ -{ - "author": {"bo": "སྟོན་པ་བཅོམ་ལྡན་འདས།", "zh": "丹增楚杜"}, - "title_short": { - "bo": "རྡོ་རྗེ་གཅོད་པ་རྒྱ་ནག་གི་ཡིག་གི་འགྲེལ་པ་དེབ། ༡", "en": "Vajra Cutter Chinese Commentary Book 1", "zh": "中文对齐测试" - }, - "title_long_clean":{ - "bo": "འཕགས་པ་ཤེས་རབ་ཀྱི་ཕ་རོལ་ཏུ་ཕྱིན་པ་རྡོ་རྗེ་གཅོད་པ་ཞེས་བྱ་བ།" - }, - - "language": "zh" -} \ No newline at end of file diff --git a/tests/pecha/parser/docx/commentary/complex/test_complex_commentary_parser.py b/tests/pecha/parser/docx/commentary/complex/test_complex_commentary_parser.py deleted file mode 100644 index f67c407d..00000000 --- a/tests/pecha/parser/docx/commentary/complex/test_complex_commentary_parser.py +++ /dev/null @@ -1,74 +0,0 @@ -import tempfile -from pathlib import Path -from unittest import TestCase - -from openpecha.pecha.annotations import SapcheAnnotation, span -from openpecha.pecha.parsers.docx.commentary.complex import DocxComplexCommentaryParser -from openpecha.utils import read_json - - -class TestDocxComplexCommentaryParser(TestCase): - def setUp(self): - self.DATA_DIR = Path(__file__).parent / "data" - - def test_parser_on_bo_commentary(self): - input = self.DATA_DIR / "bo/Tibetan Commentary text test 2.docx" - - metadata = read_json(self.DATA_DIR / "bo/metadata.json") - - parser = DocxComplexCommentaryParser( - root_path="opf_id/layers/basename/layer_file.json" - ) - - with tempfile.TemporaryDirectory() as tmpdirname: - output_path = Path(tmpdirname) - output_path.mkdir(parents=True, exist_ok=True) - parser.parse(input, metadata, output_path) - expected_sapche_anns = [ - SapcheAnnotation(span=span(start=102, end=124), sapche_number="1."), - SapcheAnnotation(span=span(start=126, end=166), sapche_number="1.1."), - SapcheAnnotation(span=span(start=2122, end=2153), sapche_number="1.2."), - SapcheAnnotation(span=span(start=2816, end=2856), sapche_number="1.3."), - ] - - assert parser.sapche_anns == expected_sapche_anns - - def test_parser_on_en_commentary(self): - input = self.DATA_DIR / "en/English aligned Commentary Text 2.docx" - metadata = read_json(self.DATA_DIR / "en/metadata.json") - - parser = DocxComplexCommentaryParser( - root_path="opf_id/layers/basename/layer_file.json" - ) - with tempfile.TemporaryDirectory() as tmpdirname: - output_path = Path(tmpdirname) - output_path.mkdir(parents=True, exist_ok=True) - parser.parse(input, metadata, output_path) - expected_sapche_anns = [ - SapcheAnnotation(span=span(start=124, end=164), sapche_number="1."), - SapcheAnnotation(span=span(start=166, end=238), sapche_number="1.1."), - ] - - assert parser.sapche_anns == expected_sapche_anns - - def test_parser_on_zh_commentary(self): - input = self.DATA_DIR / "zh/Chinese aligned Commentary Text 1.docx" - metadata = read_json(self.DATA_DIR / "zh/metadata.json") - - parser = DocxComplexCommentaryParser( - root_path="opf_id/layers/basename/layer_file.json" - ) - with tempfile.TemporaryDirectory() as tmpdirname: - output_path = Path(tmpdirname) - output_path.mkdir(parents=True, exist_ok=True) - parser.parse(input, metadata, output_path) - - expected_sapche_anns = [ - SapcheAnnotation(span=span(start=251, end=253), sapche_number="1."), - SapcheAnnotation(span=span(start=316, end=322), sapche_number="2."), - SapcheAnnotation(span=span(start=324, end=330), sapche_number="2.1"), - SapcheAnnotation(span=span(start=397, end=403), sapche_number="2.1.1"), - SapcheAnnotation(span=span(start=731, end=737), sapche_number="3."), - ] - - assert parser.sapche_anns == expected_sapche_anns diff --git a/tests/pecha/parser/docx/commentary/simple/data/metadata.json b/tests/pecha/parser/docx/commentary/simple/data/metadata.json deleted file mode 100644 index bcf4a6fc..00000000 --- a/tests/pecha/parser/docx/commentary/simple/data/metadata.json +++ /dev/null @@ -1,17 +0,0 @@ -{ - "author": {"bo": "ཟླ་བ་གྲགས་པ།", "en": "Candrak Irti"}, - "source": "http://purl.bdrc.io/resource/WA1NLM688", - "usage_title": "དབུ་མ་འཇུག་པ། སེམས་བསྐྱེད་དྲུག་པ། - ཤོ་ལོ་ག་༢༩-༦༤", - "title_short": { - "bo": "དབུ་མ་ལ་འཇུག་པ། སེམས་བསྐྱེད་དྲུག་པ། - ཤོ་ལོ་ག་༢༩-༦༤" - }, - "title_long_clean":{ - "bo": "དབུ་མ་ལ་འཇུག་པའི་ཚིག་ལེའུར་བྱས་པ། སེམས་བསྐྱེད་དྲུག་པ། - ཤོ་ལོ་ག་༢༩-༦༤" - }, - "title_alt_1":{ - "bo": "དབུ་མ་ལ་འཇུག་པ་སོགས། སེམས་བསྐྱེད་དྲུག་པ། - ཤོ་ལོ་ག་༢༩-༦༤" - }, - "type": "commentary", - "parent": "I54C654B4", - "language": "bo" -} \ No newline at end of file diff --git "a/tests/pecha/parser/docx/commentary/simple/data/\340\275\221\340\275\226\340\275\264\340\274\213\340\275\230\340\274\213_bo_commentary.docx" "b/tests/pecha/parser/docx/commentary/simple/data/\340\275\221\340\275\226\340\275\264\340\274\213\340\275\230\340\274\213_bo_commentary.docx" deleted file mode 100644 index 0e79f7d2..00000000 Binary files "a/tests/pecha/parser/docx/commentary/simple/data/\340\275\221\340\275\226\340\275\264\340\274\213\340\275\230\340\274\213_bo_commentary.docx" and /dev/null differ diff --git a/tests/pecha/parser/docx/commentary/simple/test_simple_commentary_parser.py b/tests/pecha/parser/docx/commentary/simple/test_simple_commentary_parser.py deleted file mode 100644 index 520419f8..00000000 --- a/tests/pecha/parser/docx/commentary/simple/test_simple_commentary_parser.py +++ /dev/null @@ -1,83 +0,0 @@ -import tempfile -from pathlib import Path -from unittest import TestCase, mock -from unittest.mock import patch - -from openpecha.pecha import Pecha -from openpecha.pecha.annotations import AlignmentAnnotation, span -from openpecha.pecha.layer import AnnotationType -from openpecha.pecha.parsers.docx.commentary.simple import DocxSimpleCommentaryParser -from openpecha.utils import read_json - - -class TestDocxSimpleCommentaryParser(TestCase): - def setUp(self): - self.data_dir = Path(__file__).parent / "data" - self.input = self.data_dir / "དབུ་མ་_bo_commentary.docx" - self.metadata = read_json(self.data_dir / "metadata.json") - self.expected_anns = [ - AlignmentAnnotation( - span=span(start=0, end=65, errors=None), - metadata=None, - index=1, - alignment_index=[1], - ), - AlignmentAnnotation( - span=span(start=66, end=330, errors=None), - metadata=None, - index=2, - alignment_index=[2], - ), - AlignmentAnnotation( - span=span(start=331, end=758, errors=None), - metadata=None, - index=3, - alignment_index=[2, 3], - ), - AlignmentAnnotation( - span=span(start=759, end=1075, errors=None), - metadata=None, - index=4, - alignment_index=[3, 4, 5], - ), - AlignmentAnnotation( - span=span(start=1076, end=1470, errors=None), - metadata=None, - index=5, - alignment_index=[2, 4, 5], - ), - ] - self.expected_base = "དབུ་མ་དགོངས་པ་རབ་གསལ་ལེའུ་དྲུག་པ་བདེན་གཉིས་སོ་སོའི་ངོ་བོ་བཤད་པ།། \nགསུམ་པ་ལ་གཉིས། ཀུན་རྫོབ་ཀྱི་བདེན་པ་བཤད་པ་དང་། དོན་དམ་པའི་བདེན་པ་བཤད་པའོ། །དང་པོ་ལ་གསུམ། ཀུན་རྫོབ་པ་གང་གི་ངོར་བདེན་ལ་གང་གི་ངོར་མི་བདེན་པ་དང་། ཀུན་རྫོབ་ཙམ་དེ་གང་ཟག་གསུམ་ལ་སྣང་བ་དང་མི་སྣང་བའི་ཚུལ་དང་། སོ་སྐྱེ་དང་འཕགས་པ་ལ་ལྟོས་ཏེ་དོན་དམ་པ་དང་ཀུན་རྫོབ་ཏུ་འགྱུར་ཚུལ་ལོ། \nདེས་གང་ལ་སྒྲིབ་ན་ཡང་དག་ཀུན་རྫོབ་འདོད་ཅེས་པས་ཡང་དག་པའི་དོན་ལ་སྒྲིབ་པས་ཀུན་རྫོབ་བམ་སྒྲིབ་བྱེད་དུ་འདོད་ཅེས་པ་སྟེ། ཡང་ལོག་གཉིས་ཀྱི་ནང་ནས་ཡང་དག་ཀུན་རྫོབ་ཏུ་སྟོན་པ་མིན་ནོ། །རྐང་པ་དང་པོས་བསྟན་པའི་ཀུན་རྫོབ་དང་། རྐང་པ་ཕྱི་མ་གཉིས་ཀྱིས་བསྟན་པའི་ཀུན་རྫོབ་གཉིས་གཅིག་ཏུ་མི་བྱ་སྟེ། དང་པོ་ནི། རང་གིས་དངོས་པོ་རྣམས་སྐྱེ་བ་སོགས་སུ་གང་དུ་ཁས་ལེན་པའི་ཀུན་རྫོབ་ཡིན་ལ། ཕྱི་མ་ནི་དངོས་པོ་རྣམས་གང་གི་ངོར་བདེན་པའི་བདེན་འཛིན་གྱི་ཀུན་རྫོབ་ཡིན་པའི་ཕྱིར་རོ། །\nཀུན་རྫོབ་བདེན་འཛིན་དེའི་མཐུས་སྔོན་པོ་ལ་སོགས་པ་གང་ཞིག རང་བཞིན་གྱིས་གྲུབ་པ་མེད་བཞིན་དུ་དེར་སྣང་བར་བཅོས་པའི་བཅོས་མ་སེམས་ཅན་རྣམས་ལ་བདེན་པར་སྣང་བ་དེ་ནི། སྔར་བཤད་པའི་འཇིག་རྟེན་གྱི་ཕྱིན་ཅི་ལོག་གི་ཀུན་རྫོབ་པ་དེའི་ངོར་བདེན་པས་འཇིག་རྟེན་གྱི་ཀུན་རྫོབ་ཀྱི་བདེན་པ་ཞེས་ཐུབ་པ་དེས་གསུངས་ཏེ། གསུངས་ཚུལ་ནི་སྔར་གྱི་མདོ་དེར་གསུངས་པའོ། །\nགང་ཟག་གསུམ་པོ་གང་གི་ངོར་མི་བདེན་པའི་རྟོག་པས་བཅོས་པས་བཅོས་མར་གྱུར་པའི་དངོས་པོ་ནི་དེའི་ཀུན་རྫོབ་པའི་ངོར་མི་བདེན་པས་ཀུན་རྫོབ་ཙམ་ཞེས་བྱའོ། །རྟེན་འབྱུང་གཟུགས་བརྙན་དང་བྲག་ཆ་སོགས་ཅུང་ཟད་ཅིག་ནི་བརྫུན་ཡང་མ་རིག་པ་དང་ལྡན་པ་རྣམས་ལ་སྣང་ལ། སྔོན་པོ་ལ་སོགས་པའི་གཟུགས་དང་སེམས་དང་ཚོར་བ་སོགས་ཅུང་ཟད་ཅིག་ནི་བདེན་པར་སྣང་སྟེ། ཆོས་རྣམས་ཀྱི་ཡིན་ལུགས་ཀྱི་རང་བཞིན་ནི་མ་རིག་པ་དང་ལྡན་པ་རྣམས་ལ་རྣམ་པ་ཐམས་ཅད་དུ་མི་སྣང་ངོ་། །\n" - - def test_extract_commentary_segments(self): - parser = DocxSimpleCommentaryParser() - anns, base = parser.extract_anns(self.input, AnnotationType.ALIGNMENT) - - assert ( - anns == self.expected_anns - ), "NumberedList Commentary Parser failed parsing commentary segments properly." - assert ( - base == self.expected_base - ), "NumberedList Commentary failed preparing base text properly." - - def test_create_pecha(self): - parser = DocxSimpleCommentaryParser() - with tempfile.TemporaryDirectory() as tempdir, mock.patch( - "openpecha.pecha.parsers.docx.commentary.simple.DocxSimpleCommentaryParser.extract_anns" - ) as mock_extract_commentary_segments_anns, patch( - "openpecha.pecha.get_base_id" - ) as mock_get_base_id, patch( - "openpecha.pecha.get_layer_id" - ) as mock_get_layer_id: - mock_extract_commentary_segments_anns.return_value = ( - self.expected_anns, - self.expected_base, - ) - mock_get_base_id.return_value = "B001" - mock_get_layer_id.return_value = "L001" - - pecha, layer_name = parser.parse( - self.input, AnnotationType.SEGMENTATION, self.metadata, Path(tempdir) - ) - assert isinstance(pecha, Pecha) - assert layer_name == "B001/segmentation-L001.json" diff --git a/tests/pecha/parser/docx/docx_parser/test_docx_parser.py b/tests/pecha/parser/docx/docx_parser/test_docx_parser.py deleted file mode 100644 index e48c3606..00000000 --- a/tests/pecha/parser/docx/docx_parser/test_docx_parser.py +++ /dev/null @@ -1,106 +0,0 @@ -from typing import Any -from unittest import TestCase - -from openpecha.pecha.parsers.docx import DocxParser -from tests.pecha import DummyMetadataModel - -extra_fields = { - "author": {"en": "DPO and Claude-3-5-sonnet-20241022"}, - "document_id": "1vgnfCQH3yaWPDaMDFXT_5GhlG0M9kEra0mxkDX46VLE", - "language": "en", - "long_title": { - "en": "Illuminating the Intent Chapter 6, verses 1 to 64 Literal Translation, Monlam AI, February 2025" - }, - "title": { - "bo": "\u0f51\u0f42\u0f7c\u0f44\u0f66\u0f0b\u0f54\u0f0b\u0f62\u0f56\u0f0b\u0f42\u0f66\u0f63\u0f0b\u0f63\u0f66\u0f0b\u0f66\u0f7a\u0f58\u0f66\u0f0b\u0f56\u0f66\u0f90\u0fb1\u0f7a\u0f51\u0f0b\u0f51\u0fb2\u0f74\u0f42\u0f0b\u0f54\u0f0d \u0f64\u0f7c\u0f0b\u0f63\u0f7c\u0f0b\u0f40 \u0f21 \u0f53\u0f66\u0f0b \u0f26\u0f24", - "en": "Illuminating the Intent Chapter 6", - }, - "usage_title": {"en": "Illuminating the Intent Chapter 6"}, -} - -MetadataType = Any - - -class TestDocxParser(TestCase): - def setUp(self): - self.parser = DocxParser() - - def test_root_pecha(self): - # this is the root pecha - - metadatas: list[MetadataType] = [ - DummyMetadataModel( - **{ - "type": "root", - "parent": None, - **extra_fields, - } - ) - ] - assert not self.parser.is_commentary_pecha(metadatas) - - def test_root_translation_pecha(self): - # translation of root pecha - metadatas: list[MetadataType] = [ - DummyMetadataModel( - **{ - "type": "translation", - "parent": "P0001", - **extra_fields, - } - ), - DummyMetadataModel( - **{ - "type": "root", - "parent": None, - **extra_fields, - } - ), - ] - assert not self.parser.is_commentary_pecha(metadatas) - - def test_commentary_pecha(self): - metadatas: list[MetadataType] = [ - DummyMetadataModel( - **{ - "type": "commentary", - "parent": "P0001", - **extra_fields, - } - ), - DummyMetadataModel( - **{ - "type": "root", - "parent": None, - **extra_fields, - } - ), - ] - assert self.parser.is_commentary_pecha(metadatas) - - def test_commentary_translation_pecha(self): - # translation of commentary pecha - metadatas: list[MetadataType] = [ - DummyMetadataModel( - **{ - "type": "translation", - "parent": "P0001", - **extra_fields, - } - ), - DummyMetadataModel( - **{ - "type": "commentary", - "parent": "P0002", - **extra_fields, - } - ), - DummyMetadataModel( - **{ - "type": "root", - "parent": None, - **extra_fields, - } - ), - ] - assert self.parser.is_commentary_pecha(metadatas) diff --git a/tests/pecha/parser/docx/footnote/data/I926CCA43/base/B4A6.txt b/tests/pecha/parser/docx/footnote/data/I926CCA43/base/B4A6.txt deleted file mode 100644 index 6db31110..00000000 --- a/tests/pecha/parser/docx/footnote/data/I926CCA43/base/B4A6.txt +++ /dev/null @@ -1,18 +0,0 @@ - - 菩薩戒品釋 - 功德光論師造 - 解說菩薩戒品。 - 頂禮一切佛菩薩! - 問:「云何所說『具四功德自性尸羅,應知即是妙善(淨戒)』?」 - 依彼而言,說「能利自(他)」等文。 - 其中「利益」謂善行。 - 「安樂」謂無惱害。 - 「哀憫」,謂如以諸善及無惱害行哀憫對方。 - 「義利」謂希求義利及具有義利,凡所有欲求及無罪。 - 「利益安樂故」謂住於善及無惱害行。 - 「人」謂刹帝利等,彼等中多數,由於佛陀出世、善說正法、善建立僧伽,當成極多利益、安樂。 - 彼等亦由利益、安樂自己後,而哀憫世間, - 彼等於他人作如是念:「(他們)具足利益安樂,復何妙哉!」 - 他人亦作是念:「我等亦得如是,亦何其妙哉!」 - 是故,說「令得義利、利益、安樂故。」 - 「諸人天等」謂不能通達及成辦彼等之義利故。 diff --git a/tests/pecha/parser/docx/footnote/data/I926CCA43/layers/B4A6/segmentation-39E1.json b/tests/pecha/parser/docx/footnote/data/I926CCA43/layers/B4A6/segmentation-39E1.json deleted file mode 100644 index f86aeafd..00000000 --- a/tests/pecha/parser/docx/footnote/data/I926CCA43/layers/B4A6/segmentation-39E1.json +++ /dev/null @@ -1,760 +0,0 @@ -{ - "@type": "AnnotationStore", - "@id": "I926CCA43", - "resources": [ - { - "@type": "TextResource", - "@id": "B4A6", - "@include": "../../base/B4A6.txt" - } - ], - "annotationsets": [ - { - "@type": "AnnotationDataSet", - "@id": "segmentation_annotation", - "keys": [ - { - "@type": "DataKey", - "@id": "index" - }, - { - "@type": "DataKey", - "@id": "segmentation_type" - } - ], - "data": [ - { - "@type": "AnnotationData", - "@id": "73DED77327", - "key": "index", - "value": { - "@type": "Int", - "value": 1 - } - }, - { - "@type": "AnnotationData", - "@id": "DA80D9999D", - "key": "segmentation_type", - "value": { - "@type": "String", - "value": "segmentation" - } - }, - { - "@type": "AnnotationData", - "@id": "4CD00DA4DC", - "key": "index", - "value": { - "@type": "Int", - "value": 2 - } - }, - { - "@type": "AnnotationData", - "@id": "FF312157C0", - "key": "index", - "value": { - "@type": "Int", - "value": 3 - } - }, - { - "@type": "AnnotationData", - "@id": "3C00FCC92F", - "key": "index", - "value": { - "@type": "Int", - "value": 4 - } - }, - { - "@type": "AnnotationData", - "@id": "3431F03B38", - "key": "index", - "value": { - "@type": "Int", - "value": 5 - } - }, - { - "@type": "AnnotationData", - "@id": "F4B6EDB096", - "key": "index", - "value": { - "@type": "Int", - "value": 6 - } - }, - { - "@type": "AnnotationData", - "@id": "408A8019A4", - "key": "index", - "value": { - "@type": "Int", - "value": 7 - } - }, - { - "@type": "AnnotationData", - "@id": "8B2052E266", - "key": "index", - "value": { - "@type": "Int", - "value": 8 - } - }, - { - "@type": "AnnotationData", - "@id": "AD772CA1BB", - "key": "index", - "value": { - "@type": "Int", - "value": 9 - } - }, - { - "@type": "AnnotationData", - "@id": "A89C82853A", - "key": "index", - "value": { - "@type": "Int", - "value": 10 - } - }, - { - "@type": "AnnotationData", - "@id": "E2FE035AF8", - "key": "index", - "value": { - "@type": "Int", - "value": 11 - } - }, - { - "@type": "AnnotationData", - "@id": "789FFB94FB", - "key": "index", - "value": { - "@type": "Int", - "value": 12 - } - }, - { - "@type": "AnnotationData", - "@id": "E520B90474", - "key": "index", - "value": { - "@type": "Int", - "value": 13 - } - }, - { - "@type": "AnnotationData", - "@id": "EB96C1B7B3", - "key": "index", - "value": { - "@type": "Int", - "value": 14 - } - }, - { - "@type": "AnnotationData", - "@id": "D9ECBD77D8", - "key": "index", - "value": { - "@type": "Int", - "value": 15 - } - }, - { - "@type": "AnnotationData", - "@id": "E9849CD25A", - "key": "index", - "value": { - "@type": "Int", - "value": 16 - } - }, - { - "@type": "AnnotationData", - "@id": "DD32F049D9", - "key": "index", - "value": { - "@type": "Int", - "value": 17 - } - }, - { - "@type": "AnnotationData", - "@id": "C3B3A87C3C", - "key": "index", - "value": { - "@type": "Int", - "value": 18 - } - } - ] - } - ], - "annotations": [ - { - "@type": "Annotation", - "@id": "4CB1628A30", - "target": { - "@type": "TextSelector", - "resource": "B4A6", - "offset": { - "@type": "Offset", - "begin": { - "@type": "BeginAlignedCursor", - "value": 0 - }, - "end": { - "@type": "BeginAlignedCursor", - "value": 1 - } - } - }, - "data": [ - { - "@type": "AnnotationData", - "@id": "73DED77327", - "set": "segmentation_annotation" - }, - { - "@type": "AnnotationData", - "@id": "DA80D9999D", - "set": "segmentation_annotation" - } - ] - }, - { - "@type": "Annotation", - "@id": "9488B21508", - "target": { - "@type": "TextSelector", - "resource": "B4A6", - "offset": { - "@type": "Offset", - "begin": { - "@type": "BeginAlignedCursor", - "value": 2 - }, - "end": { - "@type": "BeginAlignedCursor", - "value": 8 - } - } - }, - "data": [ - { - "@type": "AnnotationData", - "@id": "4CD00DA4DC", - "set": "segmentation_annotation" - }, - { - "@type": "AnnotationData", - "@id": "DA80D9999D", - "set": "segmentation_annotation" - } - ] - }, - { - "@type": "Annotation", - "@id": "242366CC4C", - "target": { - "@type": "TextSelector", - "resource": "B4A6", - "offset": { - "@type": "Offset", - "begin": { - "@type": "BeginAlignedCursor", - "value": 9 - }, - "end": { - "@type": "BeginAlignedCursor", - "value": 16 - } - } - }, - "data": [ - { - "@type": "AnnotationData", - "@id": "FF312157C0", - "set": "segmentation_annotation" - }, - { - "@type": "AnnotationData", - "@id": "DA80D9999D", - "set": "segmentation_annotation" - } - ] - }, - { - "@type": "Annotation", - "@id": "99237F10FF", - "target": { - "@type": "TextSelector", - "resource": "B4A6", - "offset": { - "@type": "Offset", - "begin": { - "@type": "BeginAlignedCursor", - "value": 17 - }, - "end": { - "@type": "BeginAlignedCursor", - "value": 25 - } - } - }, - "data": [ - { - "@type": "AnnotationData", - "@id": "3C00FCC92F", - "set": "segmentation_annotation" - }, - { - "@type": "AnnotationData", - "@id": "DA80D9999D", - "set": "segmentation_annotation" - } - ] - }, - { - "@type": "Annotation", - "@id": "BDB5AD4A49", - "target": { - "@type": "TextSelector", - "resource": "B4A6", - "offset": { - "@type": "Offset", - "begin": { - "@type": "BeginAlignedCursor", - "value": 26 - }, - "end": { - "@type": "BeginAlignedCursor", - "value": 35 - } - } - }, - "data": [ - { - "@type": "AnnotationData", - "@id": "3431F03B38", - "set": "segmentation_annotation" - }, - { - "@type": "AnnotationData", - "@id": "DA80D9999D", - "set": "segmentation_annotation" - } - ] - }, - { - "@type": "Annotation", - "@id": "5EC28FF27C", - "target": { - "@type": "TextSelector", - "resource": "B4A6", - "offset": { - "@type": "Offset", - "begin": { - "@type": "BeginAlignedCursor", - "value": 36 - }, - "end": { - "@type": "BeginAlignedCursor", - "value": 67 - } - } - }, - "data": [ - { - "@type": "AnnotationData", - "@id": "F4B6EDB096", - "set": "segmentation_annotation" - }, - { - "@type": "AnnotationData", - "@id": "DA80D9999D", - "set": "segmentation_annotation" - } - ] - }, - { - "@type": "Annotation", - "@id": "585439843C", - "target": { - "@type": "TextSelector", - "resource": "B4A6", - "offset": { - "@type": "Offset", - "begin": { - "@type": "BeginAlignedCursor", - "value": 68 - }, - "end": { - "@type": "BeginAlignedCursor", - "value": 86 - } - } - }, - "data": [ - { - "@type": "AnnotationData", - "@id": "408A8019A4", - "set": "segmentation_annotation" - }, - { - "@type": "AnnotationData", - "@id": "DA80D9999D", - "set": "segmentation_annotation" - } - ] - }, - { - "@type": "Annotation", - "@id": "FEDC0DCBBF", - "target": { - "@type": "TextSelector", - "resource": "B4A6", - "offset": { - "@type": "Offset", - "begin": { - "@type": "BeginAlignedCursor", - "value": 87 - }, - "end": { - "@type": "BeginAlignedCursor", - "value": 98 - } - } - }, - "data": [ - { - "@type": "AnnotationData", - "@id": "8B2052E266", - "set": "segmentation_annotation" - }, - { - "@type": "AnnotationData", - "@id": "DA80D9999D", - "set": "segmentation_annotation" - } - ] - }, - { - "@type": "Annotation", - "@id": "435A449636", - "target": { - "@type": "TextSelector", - "resource": "B4A6", - "offset": { - "@type": "Offset", - "begin": { - "@type": "BeginAlignedCursor", - "value": 99 - }, - "end": { - "@type": "BeginAlignedCursor", - "value": 109 - } - } - }, - "data": [ - { - "@type": "AnnotationData", - "@id": "AD772CA1BB", - "set": "segmentation_annotation" - }, - { - "@type": "AnnotationData", - "@id": "DA80D9999D", - "set": "segmentation_annotation" - } - ] - }, - { - "@type": "Annotation", - "@id": "3AD2BD9D34", - "target": { - "@type": "TextSelector", - "resource": "B4A6", - "offset": { - "@type": "Offset", - "begin": { - "@type": "BeginAlignedCursor", - "value": 110 - }, - "end": { - "@type": "BeginAlignedCursor", - "value": 131 - } - } - }, - "data": [ - { - "@type": "AnnotationData", - "@id": "A89C82853A", - "set": "segmentation_annotation" - }, - { - "@type": "AnnotationData", - "@id": "DA80D9999D", - "set": "segmentation_annotation" - } - ] - }, - { - "@type": "Annotation", - "@id": "06E40BC993", - "target": { - "@type": "TextSelector", - "resource": "B4A6", - "offset": { - "@type": "Offset", - "begin": { - "@type": "BeginAlignedCursor", - "value": 132 - }, - "end": { - "@type": "BeginAlignedCursor", - "value": 157 - } - } - }, - "data": [ - { - "@type": "AnnotationData", - "@id": "E2FE035AF8", - "set": "segmentation_annotation" - }, - { - "@type": "AnnotationData", - "@id": "DA80D9999D", - "set": "segmentation_annotation" - } - ] - }, - { - "@type": "Annotation", - "@id": "8A01475217", - "target": { - "@type": "TextSelector", - "resource": "B4A6", - "offset": { - "@type": "Offset", - "begin": { - "@type": "BeginAlignedCursor", - "value": 158 - }, - "end": { - "@type": "BeginAlignedCursor", - "value": 176 - } - } - }, - "data": [ - { - "@type": "AnnotationData", - "@id": "789FFB94FB", - "set": "segmentation_annotation" - }, - { - "@type": "AnnotationData", - "@id": "DA80D9999D", - "set": "segmentation_annotation" - } - ] - }, - { - "@type": "Annotation", - "@id": "1AD534F124", - "target": { - "@type": "TextSelector", - "resource": "B4A6", - "offset": { - "@type": "Offset", - "begin": { - "@type": "BeginAlignedCursor", - "value": 177 - }, - "end": { - "@type": "BeginAlignedCursor", - "value": 221 - } - } - }, - "data": [ - { - "@type": "AnnotationData", - "@id": "E520B90474", - "set": "segmentation_annotation" - }, - { - "@type": "AnnotationData", - "@id": "DA80D9999D", - "set": "segmentation_annotation" - } - ] - }, - { - "@type": "Annotation", - "@id": "3BE27478E9", - "target": { - "@type": "TextSelector", - "resource": "B4A6", - "offset": { - "@type": "Offset", - "begin": { - "@type": "BeginAlignedCursor", - "value": 222 - }, - "end": { - "@type": "BeginAlignedCursor", - "value": 242 - } - } - }, - "data": [ - { - "@type": "AnnotationData", - "@id": "EB96C1B7B3", - "set": "segmentation_annotation" - }, - { - "@type": "AnnotationData", - "@id": "DA80D9999D", - "set": "segmentation_annotation" - } - ] - }, - { - "@type": "Annotation", - "@id": "ADFBD48EA8", - "target": { - "@type": "TextSelector", - "resource": "B4A6", - "offset": { - "@type": "Offset", - "begin": { - "@type": "BeginAlignedCursor", - "value": 243 - }, - "end": { - "@type": "BeginAlignedCursor", - "value": 272 - } - } - }, - "data": [ - { - "@type": "AnnotationData", - "@id": "D9ECBD77D8", - "set": "segmentation_annotation" - }, - { - "@type": "AnnotationData", - "@id": "DA80D9999D", - "set": "segmentation_annotation" - } - ] - }, - { - "@type": "Annotation", - "@id": "11D5BB711B", - "target": { - "@type": "TextSelector", - "resource": "B4A6", - "offset": { - "@type": "Offset", - "begin": { - "@type": "BeginAlignedCursor", - "value": 273 - }, - "end": { - "@type": "BeginAlignedCursor", - "value": 296 - } - } - }, - "data": [ - { - "@type": "AnnotationData", - "@id": "E9849CD25A", - "set": "segmentation_annotation" - }, - { - "@type": "AnnotationData", - "@id": "DA80D9999D", - "set": "segmentation_annotation" - } - ] - }, - { - "@type": "Annotation", - "@id": "884518C694", - "target": { - "@type": "TextSelector", - "resource": "B4A6", - "offset": { - "@type": "Offset", - "begin": { - "@type": "BeginAlignedCursor", - "value": 297 - }, - "end": { - "@type": "BeginAlignedCursor", - "value": 316 - } - } - }, - "data": [ - { - "@type": "AnnotationData", - "@id": "DD32F049D9", - "set": "segmentation_annotation" - }, - { - "@type": "AnnotationData", - "@id": "DA80D9999D", - "set": "segmentation_annotation" - } - ] - }, - { - "@type": "Annotation", - "@id": "04407D070E", - "target": { - "@type": "TextSelector", - "resource": "B4A6", - "offset": { - "@type": "Offset", - "begin": { - "@type": "BeginAlignedCursor", - "value": 317 - }, - "end": { - "@type": "BeginAlignedCursor", - "value": 339 - } - } - }, - "data": [ - { - "@type": "AnnotationData", - "@id": "C3B3A87C3C", - "set": "segmentation_annotation" - }, - { - "@type": "AnnotationData", - "@id": "DA80D9999D", - "set": "segmentation_annotation" - } - ] - } - ] -} \ No newline at end of file diff --git a/tests/pecha/parser/docx/footnote/data/I926CCA43/metadata.json b/tests/pecha/parser/docx/footnote/data/I926CCA43/metadata.json deleted file mode 100644 index e2f1d8b1..00000000 --- a/tests/pecha/parser/docx/footnote/data/I926CCA43/metadata.json +++ /dev/null @@ -1,27 +0,0 @@ -{ - "id": "I926CCA43", - "title": "བོད་སྐད་རིགས་པའི་རྒྱ་གར་རིགས་པ།", - "author": "Claude", - "imported": "2025-06-03T05:44:27.010514", - "source": "https://www.google.com", - "toolkit_version": "2.1.13", - "parser": "DocxRootParser", - "initial_creation_type": "google_docx", - "language": "bo", - "type": "root", - "parent":null, - "source_metadata": {}, - "bases": {}, - "copyright": { - "status": "Unknown", - "notice": "", - "info_url": null - }, - "licence": "Unknown", - "legacy_id": null, - "source_file": null, - "ocr_import_info": null, - "statistics": null, - "quality": null, - "last_modified": "2025-06-03T05:44:27.010508" -} \ No newline at end of file diff --git a/tests/pecha/parser/docx/footnote/test_footnote_parser.py b/tests/pecha/parser/docx/footnote/test_footnote_parser.py deleted file mode 100644 index e887fd61..00000000 --- a/tests/pecha/parser/docx/footnote/test_footnote_parser.py +++ /dev/null @@ -1,162 +0,0 @@ -from pathlib import Path -from unittest import TestCase, mock - -from stam import AnnotationStore - -from openpecha.pecha import Pecha, get_anns -from openpecha.pecha.annotations import FootnoteAnnotation, span -from openpecha.pecha.layer import AnnotationType -from openpecha.pecha.parsers import update_coords -from openpecha.pecha.parsers.docx.footnote import DocxFootnoteParser -from openpecha.pecha.parsers.docx.utils import read_docx - - -class TestFootnoteParser(TestCase): - def setUp(self): - self.FOOTNOTE_DIR = Path("tests/pecha/parser/docx/utils/data/footnote") - self.ONE_PAGE_DIR = self.FOOTNOTE_DIR / "one_page" - self.one_page_footnote = self.ONE_PAGE_DIR / "one_page_footnote.docx" - - self.parser = DocxFootnoteParser() - - self.expected_footnote_contents = { - 1: "功德光論師,世親菩薩弟子,極善巧毗奈耶,出生於秣搜羅國婆羅門家,幼時 熟習外道宗義及明處,後於家鄉出家並受具足戒,依止世親菩薩,修習大小乘 藏,及聲聞十八部派一切宗義,弟子約五千比丘。住世說約四百年,是否屬實, 猶存疑惑,然而確是住世很久。後圓寂於自己的故鄉。著作頗豐,譯為藏文廣 知者,有《菩薩地所分佈施次第第九之上注釋》、《菩薩地戒品釋》,此二者 由那措譯師及香智軍二位譯為藏文,存於論典經品「ཡི」函中。《毗奈耶根本經》 及其自釋、《毗奈耶行持一百零一竭摩法》,在八世紀末,赤松德贊時,由覺 柔譯師及噶榮戒源二位譯為藏文,存於論典經品「ཟུ」、「འུ」、「ཡུ」。", - 2: "《瑜伽師地論》中《本地分》菩薩地之菩薩戒品。", - 3: "《菩薩地戒品》說:「云何菩薩自性戒?謂若略說具四功德,當知是名菩薩自性戒。何等為四?一、從他正受。二、善淨意樂。三、犯已還淨。四、深敬專念無有違犯。」", - } - self.expected_text_without_footnote_content = "1)\t \n\n2)\t 菩薩戒品釋\n\n3)\t 功德光----footnote0----論師造\n\n4)\t 解說菩薩戒品----footnote1----。\n\n5)\t 頂禮一切佛菩薩!\n\n6)\t 問:「云何所說『具四功德自性尸羅----footnote2----,應知即是妙善(淨戒)』?」\n\n7)\t 依彼而言,說「能利自(他)」等文。\n\n8)\t 其中「利益」謂善行。\n\n9)\t 「安樂」謂無惱害。\n\n10)\t 「哀憫」,謂如以諸善及無惱害行哀憫對方。\n\n11)\t 「義利」謂希求義利及具有義利,凡所有欲求及無罪。\n\n12)\t 「利益安樂故」謂住於善及無惱害行。\n\n13)\t 「人」謂刹帝利等,彼等中多數,由於佛陀出世、善說正法、善建立僧伽,當成極多利益、安樂。\n\n14)\t 彼等亦由利益、安樂自己後,而哀憫世間,\n\n15)\t 彼等於他人作如是念:「(他們)具足利益安樂,復何妙哉!」\n\n16)\t 他人亦作是念:「我等亦得如是,亦何其妙哉!」\n\n17)\t 是故,說「令得義利、利益、安樂故。」\n\n18)\t 「諸人天等」謂不能通達及成辦彼等之義利故。\n\n\n\n\n\n" - - self.expected_footnote_spans = {1: (24, 24), 2: (39, 39), 3: (76, 76)} - self.expected_text_without_footnote_spans = "1)\t \n\n2)\t 菩薩戒品釋\n\n3)\t 功德光論師造\n\n4)\t 解說菩薩戒品。\n\n5)\t 頂禮一切佛菩薩!\n\n6)\t 問:「云何所說『具四功德自性尸羅,應知即是妙善(淨戒)』?」\n\n7)\t 依彼而言,說「能利自(他)」等文。\n\n8)\t 其中「利益」謂善行。\n\n9)\t 「安樂」謂無惱害。\n\n10)\t 「哀憫」,謂如以諸善及無惱害行哀憫對方。\n\n11)\t 「義利」謂希求義利及具有義利,凡所有欲求及無罪。\n\n12)\t 「利益安樂故」謂住於善及無惱害行。\n\n13)\t 「人」謂刹帝利等,彼等中多數,由於佛陀出世、善說正法、善建立僧伽,當成極多利益、安樂。\n\n14)\t 彼等亦由利益、安樂自己後,而哀憫世間,\n\n15)\t 彼等於他人作如是念:「(他們)具足利益安樂,復何妙哉!」\n\n16)\t 他人亦作是念:「我等亦得如是,亦何其妙哉!」\n\n17)\t 是故,說「令得義利、利益、安樂故。」\n\n18)\t 「諸人天等」謂不能通達及成辦彼等之義利故。\n\n\n\n\n\n" - - self.expected_annotations = [ - FootnoteAnnotation( - index=1, - span=span(start=24, end=24), - note=self.expected_footnote_contents[1], - ), - FootnoteAnnotation( - index=2, - span=span(start=39, end=39), - note=self.expected_footnote_contents[2], - ), - FootnoteAnnotation( - index=3, - span=span(start=76, end=76), - note=self.expected_footnote_contents[3], - ), - ] - - self.expected_updated_anns = [ - FootnoteAnnotation( - index=1, - span=span(start=13, end=13, errors=None), - metadata=None, - note="功德光論師,世親菩薩弟子,極善巧毗奈耶,出生於秣搜羅國婆羅門家,幼時 熟習外道宗義及明處,後於家鄉出家並受具足戒,依止世親菩薩,修習大小乘 藏,及聲聞十八部派一切宗義,弟子約五千比丘。住世說約四百年,是否屬實, 猶存疑惑,然而確是住世很久。後圓寂於自己的故鄉。著作頗豐,譯為藏文廣 知者,有《菩薩地所分佈施次第第九之上注釋》、《菩薩地戒品釋》,此二者 由那措譯師及香智軍二位譯為藏文,存於論典經品「ཡི」函中。《毗奈耶根本經》 及其自釋、《毗奈耶行持一百零一竭摩法》,在八世紀末,赤松德贊時,由覺 柔譯師及噶榮戒源二位譯為藏文,存於論典經品「ཟུ」、「འུ」、「ཡུ」。", - ), - FootnoteAnnotation( - index=2, - span=span(start=24, end=24, errors=None), - metadata=None, - note="《瑜伽師地論》中《本地分》菩薩地之菩薩戒品。", - ), - FootnoteAnnotation( - index=3, - span=span(start=53, end=53, errors=None), - metadata=None, - note="《菩薩地戒品》說:「云何菩薩自性戒?謂若略說具四功德,當知是名菩薩自性戒。何等為四?一、從他正受。二、善淨意樂。三、犯已還淨。四、深敬專念無有違犯。」", - ), - ] - - self.pecha = Pecha.from_path( - Path("tests/pecha/parser/docx/footnote/data/I926CCA43") - ) - self.pecha_backup = { - f: f.read_bytes() for f in self.pecha.pecha_path.glob("**/*") if f.is_file() - } - - def test_get_footnote_contents(self): - text = read_docx(self.one_page_footnote, ignore_footnotes=False) - ( - text_without_footnote_content, - footnote_contents, - ) = self.parser.get_footnote_contents(text) - self.assertEqual(footnote_contents, self.expected_footnote_contents) - self.assertEqual( - text_without_footnote_content, self.expected_text_without_footnote_content - ) - - def test_get_footnote_spans(self): - text_without_footnote_spans, footnote_spans = self.parser.get_footnote_spans( - self.expected_text_without_footnote_content, self.expected_footnote_contents - ) - - self.assertEqual(footnote_spans, self.expected_footnote_spans) - self.assertEqual( - text_without_footnote_spans, self.expected_text_without_footnote_spans - ) - - def test_create_footnote_annotations(self): - annotations = self.parser.create_footnote_annotations( - self.expected_footnote_spans, self.expected_footnote_contents - ) - print(annotations) - self.assertEqual(annotations, self.expected_annotations) - - def test_update_coords(self): - new_base = self.pecha.get_base(list(self.pecha.bases.keys())[0]) - updated_anns = update_coords( - self.expected_annotations, - self.expected_text_without_footnote_spans, - new_base, - ) - self.assertEqual(updated_anns, self.expected_updated_anns) - - @mock.patch("openpecha.pecha.get_layer_id") - def test_create_footnote_layer(self, mock_get_layer_id): - mock_get_layer_id.return_value = "D93E" - annotation_path = self.parser.add_footnote_layer( - self.pecha, self.expected_updated_anns, AnnotationType.FOOTNOTE - ) - self.assertEqual(annotation_path, "B4A6/footnote-D93E.json") - - anns = get_anns( - ann_store=AnnotationStore( - file=str(self.pecha.layer_path / annotation_path) - ), - include_span=True, - ) - expected_anns = [ - { - "index": 1, - "note": "功德光論師,世親菩薩弟子,極善巧毗奈耶,出生於秣搜羅國婆羅門家,幼時 熟習外道宗義及明處,後於家鄉出家並受具足戒,依止世親菩薩,修習大小乘 藏,及聲聞十八部派一切宗義,弟子約五千比丘。住世說約四百年,是否屬實, 猶存疑惑,然而確是住世很久。後圓寂於自己的故鄉。著作頗豐,譯為藏文廣 知者,有《菩薩地所分佈施次第第九之上注釋》、《菩薩地戒品釋》,此二者 由那措譯師及香智軍二位譯為藏文,存於論典經品「ཡི」函中。《毗奈耶根本經》 及其自釋、《毗奈耶行持一百零一竭摩法》,在八世紀末,赤松德贊時,由覺 柔譯師及噶榮戒源二位譯為藏文,存於論典經品「ཟུ」、「འུ」、「ཡུ」。", - "structure_type": "footnote", - "text": "", - "span": {"start": 13, "end": 13}, - }, - { - "index": 2, - "note": "《瑜伽師地論》中《本地分》菩薩地之菩薩戒品。", - "structure_type": "footnote", - "text": "", - "span": {"start": 24, "end": 24}, - }, - { - "index": 3, - "note": "《菩薩地戒品》說:「云何菩薩自性戒?謂若略說具四功德,當知是名菩薩自性戒。何等為四?一、從他正受。二、善淨意樂。三、犯已還淨。四、深敬專念無有違犯。」", - "structure_type": "footnote", - "text": "", - "span": {"start": 53, "end": 53}, - }, - ] - self.assertEqual(anns, expected_anns) - - def tearDown(self): - # Revert all original files - for f, content in self.pecha_backup.items(): - f.write_bytes(content) - - # Remove any new files that weren't in the original backup - for f in self.pecha.pecha_path.glob("**/*"): - if f.is_file() and f not in self.pecha_backup: - f.unlink() diff --git a/tests/pecha/parser/docx/root/data/bo/entering_middle_way.docx b/tests/pecha/parser/docx/root/data/bo/entering_middle_way.docx deleted file mode 100644 index 3463c2ee..00000000 Binary files a/tests/pecha/parser/docx/root/data/bo/entering_middle_way.docx and /dev/null differ diff --git a/tests/pecha/parser/docx/root/data/bo/metadata.json b/tests/pecha/parser/docx/root/data/bo/metadata.json deleted file mode 100644 index 69703c9c..00000000 --- a/tests/pecha/parser/docx/root/data/bo/metadata.json +++ /dev/null @@ -1,19 +0,0 @@ -{ - "author": {"bo": "སྟོན་པ་བཅོམ་ལྡན་འདས།"}, - "title_short": { - "bo": "ཚོད་ལྟ་དཔེ་དེབ་", - "en": "English aligned test" - }, - "title_long_clean":{ - "bo": "འཕགས་པ་ཤེས་རབ་ཀྱི་ཕ་རོལ་ཏུ་ཕྱིན་པ་རྡོ་རྗེ་གཅོད་པ་ཞེས་བྱ་བ།" - }, - "title_alt_1":{ - "bo": "འཕགས་པ་ཤེས་རབ་ཀྱི་ཕ་རོལད་ཏུ་ཕྱིནད་པ་རྡོ་རྗེ་གཅོད་པ་ཞེས་བྱ་བ་ཐེག་པ་ཆེན་པོའི་མདོ།" - }, - "title_alt_2":{ - "bo": "རྡོ་རྗེ་སུམ་བརྒྱ་པའམ་རྡོ་རྗེ་གཅོད་པ།" - }, - "language": "bo", - "type": "root", - "parent": null -} \ No newline at end of file diff --git a/tests/pecha/parser/docx/root/data/en/entering the middle way english.docx b/tests/pecha/parser/docx/root/data/en/entering the middle way english.docx deleted file mode 100644 index dd2ec544..00000000 Binary files a/tests/pecha/parser/docx/root/data/en/entering the middle way english.docx and /dev/null differ diff --git a/tests/pecha/parser/docx/root/data/en/metadata.json b/tests/pecha/parser/docx/root/data/en/metadata.json deleted file mode 100644 index c8bab944..00000000 --- a/tests/pecha/parser/docx/root/data/en/metadata.json +++ /dev/null @@ -1,20 +0,0 @@ -{ - "author": {"bo": "སྟོན་པ་བཅོམ་ལྡན་འདས།", "en": "Buddha"}, - "title_short": { - "bo": "ཚོད་ལྟ་དཔེ་དེབ་", - "en": "English aligned test" - }, - "title_long_clean":{ - "bo": "འཕགས་པ་ཤེས་རབ་ཀྱི་ཕ་རོལ་ཏུ་ཕྱིན་པ་རྡོ་རྗེ་གཅོད་པ་ཞེས་བྱ་བ།", - "en": "Test Root English aligned with Tibetan" - }, - "title_alt_1":{ - "bo": "འཕགས་པ་ཤེས་རབ་ཀྱི་ཕ་རོལད་ཏུ་ཕྱིནད་པ་རྡོ་རྗེ་གཅོད་པ་ཞེས་བྱ་བ་ཐེག་པ་ཆེན་པོའི་མདོ།" - }, - "title_alt_2":{ - "bo": "རྡོ་རྗེ་སུམ་བརྒྱ་པའམ་རྡོ་རྗེ་གཅོད་པ།" - }, - "language": "en", - "type": "translation", - "parent": "IB66C26FB" -} \ No newline at end of file diff --git a/tests/pecha/parser/docx/root/test_root_parser.py b/tests/pecha/parser/docx/root/test_root_parser.py deleted file mode 100644 index d4ee3542..00000000 --- a/tests/pecha/parser/docx/root/test_root_parser.py +++ /dev/null @@ -1,123 +0,0 @@ -import tempfile -from pathlib import Path -from unittest import TestCase -from unittest.mock import patch - -from openpecha.pecha import Pecha -from openpecha.pecha.annotations import SegmentationAnnotation, span -from openpecha.pecha.layer import AnnotationType -from openpecha.pecha.parsers.docx.root import DocxRootParser -from openpecha.utils import read_json - - -class TestDocxRootParser(TestCase): - def setUp(self): - self.DATA_DIR = Path(__file__).parent / "data" - self.parser = DocxRootParser() - - def test_bo_google_doc_translation_parser(self): - bo_docx_file = self.DATA_DIR / "bo/entering_middle_way.docx" - metadata = read_json(self.DATA_DIR / "bo/metadata.json") - - expected_anns = [ - SegmentationAnnotation( - span=span(start=0, end=41, errors=None), metadata=None, index=1 - ), - SegmentationAnnotation( - span=span(start=42, end=200, errors=None), metadata=None, index=2 - ), - SegmentationAnnotation( - span=span(start=201, end=353, errors=None), metadata=None, index=3 - ), - SegmentationAnnotation( - span=span(start=354, end=500, errors=None), metadata=None, index=4 - ), - SegmentationAnnotation( - span=span(start=501, end=667, errors=None), metadata=None, index=5 - ), - ] - expected_base = "དབུ་མ་ལུ་འཇུག་པ་ལས། སེམས་བསྐྱེད་པ་དྲུག་པ།\nམངོན་དུ་ཕྱོགས་པར་མཉམ་བཞག་སེམས་གནས་ཏེ། །རྫོགས་པའི་སངས་རྒྱས་ཆོས་ལ་མངོན་ཕྱོགས་ཤིང༌། །འདི་བརྟེན་འབྱུང་བའི་དེ་ཉིད་མཐོང་བ་དེས། །ཤེས་རབ་གནས་པས་འགོག་པ་ཐོབ་པར་འགྱུར། །\nཇི་ལྟར་ལོང་བའི་ཚོགས་ཀུན་བདེ་བླག་ཏུ། །མིག་ལྡན་སྐྱེས་བུ་གཅིག་གིས་འདོད་པ་ཡི། །ཡུལ་དུ་འཁྲིད་པ་དེ་བཞིན་འདིར་ཡང་བློས། །མིག་ཉམས་ཡོན་ཏན་བླངས་ཏེ་རྒྱལ་ཉིད་འགྲོ། །\nཇི་ལྟར་དེ་ཡིས་ཆེས་ཟབ་ཆོས་རྟོགས་པ། །ལུང་དང་གཞན་ཡང་རིགས་པས་ཡིན་པས་ན། །དེ་ལྟར་འཕགས་པ་ཀླུ་སྒྲུབ་གཞུང་ལུགས་ལས། །ཇི་ལྟར་གནས་པའི་ལུགས་བཞིན་བརྗོད་པར་བྱ། །\nསོ་སོ་སྐྱེ་བོའི་དུས་ནའང་སྟོང་པ་ཉིད་ཐོས་ནས། །ནང་དུ་རབ་ཏུ་དགའ་བ་ཡང་དང་ཡང་དུ་འབྱུང༌། །རབ་ཏུ་དགའ་བ་ལས་བྱུང་མཆི་མས་མིག་བརླན་ཞིང༌། །ལུས་ཀྱི་བ་སྤུ་ལྡང་པར་འགྱུར་པ་གང་ཡིན་པ། །\n" - - anns, base = self.parser.extract_anns(bo_docx_file, AnnotationType.SEGMENTATION) - - assert ( - anns == expected_anns - ), "TestDocxRootParser failed extract segmentation coordinates for bo data." - assert ( - base == expected_base - ), "TestDocxRootParser failed preparing base text properly for bo data" - - with tempfile.TemporaryDirectory() as tmpdirname, patch( - "openpecha.pecha.parsers.docx.root.DocxRootParser.extract_segmentation_anns" - ) as mock_extract_root_idx, patch( - "openpecha.pecha.get_base_id" - ) as mock_get_base_id, patch( - "openpecha.pecha.get_layer_id" - ) as mock_get_layer_id: - OUTPUT_DIR = Path(tmpdirname) - mock_extract_root_idx.return_value = ( - expected_anns, - expected_base, - ) - mock_get_base_id.return_value = "B001" - mock_get_layer_id.return_value = "L001" - pecha, layer_name = self.parser.parse( - bo_docx_file, AnnotationType.SEGMENTATION, metadata, OUTPUT_DIR - ) - - assert isinstance(pecha, Pecha) - assert layer_name == "B001/segmentation-L001.json" - - def test_en_google_doc_translation_parser(self): - en_docx_file = self.DATA_DIR / "en" / "entering the middle way english.docx" - metadata = read_json(self.DATA_DIR / "en" / "metadata.json") - - expected_anns = [ - SegmentationAnnotation( - span=span(start=0, end=50, errors=None), metadata=None, index=1 - ), - SegmentationAnnotation( - span=span(start=51, end=281, errors=None), metadata=None, index=2 - ), - SegmentationAnnotation( - span=span(start=282, end=500, errors=None), metadata=None, index=3 - ), - SegmentationAnnotation( - span=span(start=501, end=707, errors=None), metadata=None, index=4 - ), - SegmentationAnnotation( - span=span(start=708, end=907, errors=None), metadata=None, index=5 - ), - ] - expected_base = '"From the Madhyamakavatara, Sixth Mind Generation"\n"When the mind rests in meditative equipoise, directly oriented Towards the qualities of the fully enlightened Buddha, And through this, sees the reality of dependent origination, Through abiding in wisdom, one attains cessation."\n"Just as a single person with eyes Can easily lead an entire group of blind people To their desired destination, likewise here too, intelligence Takes hold of the qualities lacking sight and proceeds to enlightenment."\n"Just as one understands these profound teachings Through scripture and through reasoning as well, Similarly, following the tradition of Noble Nagarjuna\'s writings, I shall explain things just as they are."\n"Even while still an ordinary being, upon hearing about emptiness, Great joy arises again and again within. From this supreme joy, tears moisten one\'s eyes, And the hairs of one\'s body stand on end."\n' - - anns, base = self.parser.extract_anns(en_docx_file, AnnotationType.SEGMENTATION) - - assert ( - anns == expected_anns - ), "TestDocxRootParser failed extract segmentation coordinates for en data." - assert ( - base == expected_base - ), "TestDocxRootParser failed preparing base text properly for en data" - - with tempfile.TemporaryDirectory() as tmpdirname, patch( - "openpecha.pecha.parsers.docx.root.DocxRootParser.extract_segmentation_anns" - ) as mock_extract_root_idx, patch( - "openpecha.pecha.get_base_id" - ) as mock_get_base_id, patch( - "openpecha.pecha.get_layer_id" - ) as mock_get_layer_id: - OUTPUT_DIR = Path(tmpdirname) - mock_extract_root_idx.return_value = ( - expected_anns, - expected_base, - ) - mock_get_base_id.return_value = "B002" - mock_get_layer_id.return_value = "L002" - - pecha, layer_name = self.parser.parse( - en_docx_file, AnnotationType.SEGMENTATION, metadata, OUTPUT_DIR - ) - - assert isinstance(pecha, Pecha) - assert layer_name == "B002/segmentation-L002.json" diff --git a/tests/pecha/parser/docx/update/__init__.py b/tests/pecha/parser/docx/update/__init__.py deleted file mode 100644 index e69de29b..00000000 diff --git a/tests/pecha/parser/docx/update/data/commentary/new_anns.json b/tests/pecha/parser/docx/update/data/commentary/new_anns.json deleted file mode 100644 index b55c3525..00000000 --- a/tests/pecha/parser/docx/update/data/commentary/new_anns.json +++ /dev/null @@ -1,62 +0,0 @@ -[ - { - "index": 1, - "alignment_index": [2], - "segmentation_type": "alignment", - "text": "གཟུང་བ་མེད་པར་ཞེས་བྱ་བ་ལ་སོགས་པ་གསུངས་ཏེ། " - }, - { - "index": 2, - "alignment_index": [2], - "segmentation_type": "alignment", - "text": "འཆད་པར་འགྱུར་བའི་རིགས་པས་ཞེས་བྱ་བ་ནི་ཅི་ལྟར་རླུང་གིས་ཞེས་བྱ་བ་ལ་སོགས་པའོ། " - }, - { - "index": 3, - "alignment_index": [2,3], - "segmentation_type": "alignment", - "text": "ཐ་སྙད་ཙམ་དུ་ཡོད་པ་དོན་དམ་པར་ཡོད་པ་མ་ཡིན་པ་དེ་ལ་དངོས་པོ་བརྟགས་པར་ཡོད་པ་ཞེས་བྱ་ལ། " - }, - { - "index": 4, - "alignment_index": [4], - "segmentation_type": "alignment", - "text": "ཕྱི་རོལ་གྱི་དོན་མེད་ན་ཞེས་བྱ་བ་ནི་ཤེས་པ་ལས་ཐ་དད་དུ་གྱུར་པའི་སྔོན་པོ་ལ་སོགས་པའི་དོན་མེད་ནའོ།" - }, - { - "index": 5, - "alignment_index": [5], - "segmentation_type": "alignment", - "text": "དེ་བསམ་བྱ་ཞེས་བྱ་བ་ནི་རྨི་ལམ་ན་དོན་མེད་པའི་ཤེས་པ་ཡོད་པ་ཡིན་ནོ་ཞེས་བྱ་བ་དེ་ཡང་བསམ་པར་བྱའོ། \n།ཡང་དེ་ཅི་ཞིག་ཅེ་ན་ཞེས་བྱ་བ་ནི། བསམ་པར་བྱ་བ་གང་ཞིག་ཡིན། །ཞེས་བྱ་བའོ། །" - }, - { - "index": 6, - "alignment_index": [6,7,8], - "segmentation_type": "alignment", - "text": "གང་ཚེ་ཞེས་བྱ་བ་ལ་སོགས་པ་ནི་དབུ་མ་གང་གི་ཚེ་རྨི་ལམ་ན་ཡང་སེམས་མེད་པ་དེའི་ཚེ་རྨི་ལམ་གྱི་དཔེ་དེ་མ་གྲུབ་པ་ཡིན་ནོ་ཞེས་བྱ་བའི་ཐ་ཚིག་གོ། " - }, - { - "index": 7, - "alignment_index": [6,7,8], - "segmentation_type": "alignment", - "text": "།དེ་ཉིད་བསྟན་པར་བྱ་བའི་ཕྱིར། དེ་ཚེ་ཁྱོད་ཀྱི་དཔེ་ཡོད་མིན། །ཞེས་བྱ་བ་གསུངས་ཏེ། " - }, - { - "index": 8, - "alignment_index": [8], - "segmentation_type": "alignment", - "text": "གཉིད་སད་པའི་གནས་སྐབས་ན་དྲན་པའི་སྒོ་ནས་གལ་ཏེ་རྨི་ལམ་གྱི་གནས་སྐབས་ན་རྣམ་པར་ཤེས་པ་ཡོད་པར་ཁས་ལེན་ན། " - }, - { - "index": 9, - "alignment_index": [8], - "segmentation_type": "alignment", - "text": "ཅི་ལྟར་ཞེས་བྱ་བ་ལ་སོགས་པ་གསུངས་ཏེ། ཅི་ལྟར་ཉམས་སུ་མྱོང་བ་ལ་ནི་ངས་མཐོང་ངོ་སྙམ་པའི་དྲན་པ་ཡོད་པ་དེ་བཞིན་དུ་ཡུལ་ཡང་ཁས་བླང་བར་བྱ་དགོས་སོ། \n།ཡང་ན་རྣམ་པར་ཤེས་པ་ཡང་ཡོད་པ་མ་ཡིན་ནོ་ཞེས་བྱ་བ་ནི་གལ་ཏེ་ཡུལ་དྲན་དུ་ཟིན་ཀྱང་ཡུལ་ཁས་མི་ལེན་ན་རྣམ་པར་ཤེས་པ་དྲན་ཡང་རྣམ་པར་ཤེས་པ་མེད་པར་ཁས་བླང་བའི་ཕྱིར་ཁོ་བོའི་ཕྱོགས་གྲུབ་པ་ཡིན་ནོ། " - }, - { - "index": 10, - "alignment_index": [10], - "segmentation_type": "alignment", - "text": "གཉིད་ཀྱིས་ཞེས་བྱ་བ་ལ་སོགས་པ་གསུངས་ཏེ། གཉིད་ལོག་པས་མིག་དང༌། རྣ་བ་དང༌། སྣ་དང༌།" - } -] \ No newline at end of file diff --git a/tests/pecha/parser/docx/update/data/commentary/old_anns.json b/tests/pecha/parser/docx/update/data/commentary/old_anns.json deleted file mode 100644 index d7ccd6fd..00000000 --- a/tests/pecha/parser/docx/update/data/commentary/old_anns.json +++ /dev/null @@ -1,56 +0,0 @@ -[ - { - "index": 1, - "alignment_index": [1], - "segmentation_type": "alignment", - "text": "གཟུང་བ་མེད་པར་ཞེས་བྱ་བ་ལ་སོགས་པ་གསུངས་ཏེ། " - }, - { - "index": 2, - "alignment_index": [2], - "segmentation_type": "alignment", - "text": "འཆད་པར་འགྱུར་བའི་རིགས་པས་ཞེས་བྱ་བ་ནི་ཅི་ལྟར་རླུང་གིས་ཞེས་བྱ་བ་ལ་སོགས་པའོ། " - }, - { - "index": 3, - "alignment_index": [3], - "segmentation_type": "alignment", - "text": "ཐ་སྙད་ཙམ་དུ་ཡོད་པ་དོན་དམ་པར་ཡོད་པ་མ་ཡིན་པ་དེ་ལ་དངོས་པོ་བརྟགས་པར་ཡོད་པ་ཞེས་བྱ་ལ། " - }, - { - "index": 4, - "alignment_index": [4], - "segmentation_type": "alignment", - "text": "ཕྱི་རོལ་གྱི་དོན་མེད་ན་ཞེས་བྱ་བ་ནི་ཤེས་པ་ལས་ཐ་དད་དུ་གྱུར་པའི་སྔོན་པོ་ལ་སོགས་པའི་དོན་མེད་ནའོ།" - }, - { - "index": 5, - "alignment_index": [5], - "segmentation_type": "alignment", - "text": "དེ་བསམ་བྱ་ཞེས་བྱ་བ་ནི་རྨི་ལམ་ན་དོན་མེད་པའི་ཤེས་པ་ཡོད་པ་ཡིན་ནོ་ཞེས་བྱ་བ་དེ་ཡང་བསམ་པར་བྱའོ། \n།ཡང་དེ་ཅི་ཞིག་ཅེ་ན་ཞེས་བྱ་བ་ནི། བསམ་པར་བྱ་བ་གང་ཞིག་ཡིན། །ཞེས་བྱ་བའོ། །" - }, - { - "index": 6, - "alignment_index": [6], - "segmentation_type": "alignment", - "text": "གང་ཚེ་ཞེས་བྱ་བ་ལ་སོགས་པ་ནི་དབུ་མ་གང་གི་ཚེ་རྨི་ལམ་ན་ཡང་སེམས་མེད་པ་དེའི་ཚེ་རྨི་ལམ་གྱི་དཔེ་དེ་མ་གྲུབ་པ་ཡིན་ནོ་ཞེས་བྱ་བའི་ཐ་ཚིག་གོ། \n།དེ་ཉིད་བསྟན་པར་བྱ་བའི་ཕྱིར། དེ་ཚེ་ཁྱོད་ཀྱི་དཔེ་ཡོད་མིན། །ཞེས་བྱ་བ་གསུངས་ཏེ། " - }, - { - "index": 7, - "alignment_index": [7], - "segmentation_type": "alignment", - "text": "གཉིད་སད་པའི་གནས་སྐབས་ན་དྲན་པའི་སྒོ་ནས་གལ་ཏེ་རྨི་ལམ་གྱི་གནས་སྐབས་ན་རྣམ་པར་ཤེས་པ་ཡོད་པར་ཁས་ལེན་ན། " - }, - { - "index": 8, - "alignment_index": [8], - "segmentation_type": "alignment", - "text": "ཅི་ལྟར་ཞེས་བྱ་བ་ལ་སོགས་པ་གསུངས་ཏེ། ཅི་ལྟར་ཉམས་སུ་མྱོང་བ་ལ་ནི་ངས་མཐོང་ངོ་སྙམ་པའི་དྲན་པ་ཡོད་པ་དེ་བཞིན་དུ་ཡུལ་ཡང་ཁས་བླང་བར་བྱ་དགོས་སོ། \n།ཡང་ན་རྣམ་པར་ཤེས་པ་ཡང་ཡོད་པ་མ་ཡིན་ནོ་ཞེས་བྱ་བ་ནི་གལ་ཏེ་ཡུལ་དྲན་དུ་ཟིན་ཀྱང་ཡུལ་ཁས་མི་ལེན་ན་རྣམ་པར་ཤེས་པ་དྲན་ཡང་རྣམ་པར་ཤེས་པ་མེད་པར་ཁས་བླང་བའི་ཕྱིར་ཁོ་བོའི་ཕྱོགས་གྲུབ་པ་ཡིན་ནོ། " - }, - { - "index": 9, - "alignment_index": [9], - "segmentation_type": "alignment", - "text": "གཉིད་ཀྱིས་ཞེས་བྱ་བ་ལ་སོགས་པ་གསུངས་ཏེ། གཉིད་ལོག་པས་མིག་དང༌། རྣ་བ་དང༌། སྣ་དང༌།" - } -] \ No newline at end of file diff --git a/tests/pecha/parser/docx/update/data/root/new_anns.json b/tests/pecha/parser/docx/update/data/root/new_anns.json deleted file mode 100644 index 2accfa10..00000000 --- a/tests/pecha/parser/docx/update/data/root/new_anns.json +++ /dev/null @@ -1,62 +0,0 @@ -[ - { - "index": 1, - "segmentation_type": "segmentation", - "text": "བུ་མ་འཇུག་པ་ལས་སེམས་བསྐྱེད་དྲུག་པ། ཤོ་ལོ་ཀ ༡-༦༤ མངོན་དུ་ཕྱོགས་པར་མཉམ་བཞག་སེམས་གནས་ཏེ། " - }, - { - "index": 2, - "segmentation_type": "segmentation", - "text": "།རྫོགས་པའི་སངས་རྒྱས་ཆོས་ལ་མངོན་ཕྱོགས་ཤིང༌། །" - }, - { - "index": 3, - "segmentation_type": "segmentation", - "text": "འདི་བརྟེན་འབྱུང་བའི་དེ་ཉིད་མཐོང་བ་དེས། །" - }, - { - "index": 4, - "segmentation_type": "segmentation", - "text": "ཤེས་རབ་གནས་པས་འགོག་པ་ཐོབ་པར་འགྱུར། །" - }, - { - "index": 5, - "segmentation_type": "segmentation", - "text": "ཇི་ལྟར་ལོང་བའི་ཚོགས་ཀུན་བདེ་བླག་ཏུ། །མིག་ལྡན་སྐྱེས་བུ་གཅིག་གིས་འདོད་པ་ཡི། །ཡུལ་དུ་འཁྲིད་པ་དེ་བཞིན་འདིར་ཡང་བློས། །མིག་ཉམས་ཡོན་ཏན་བླངས་ཏེ་རྒྱལ་ཉིད་འགྲོ། །" - }, - { - "index": 6, - "segmentation_type": "segmentation", - "text": "ཇི་ལྟར་དེ་ཡིས་ཆེས་ཟབ་ཆོས་རྟོགས་པ། །ལུང་དང་གཞན་ཡང་རིགས་པས་ཡིན་པས་ན། །དེ་ལྟར་འཕགས་པ་ཀླུ་སྒྲུབ་གཞུང་ལུགས་ལས། །ཇི་ལྟར་གནས་པའི་ལུགས་བཞིན་བརྗོད་པར་བྱ། །" - }, - { - "index": 7, - "segmentation_type": "segmentation", - "text": "སོ་སོ་སྐྱེ་བོའི་དུས་ནའང་སྟོང་པ་ཉིད་ཐོས་ནས། །ནང་དུ་རབ་ཏུ་དགའ་བ་ཡང་དང་ཡང་དུ་འབྱུང༌། །རབ་ཏུ་དགའ་བ་ལས་བྱུང་མཆི་མས་མིག་བརླན་ཞིང༌། །ལུས་ཀྱི་བ་སྤུ་ལྡང་པར་འགྱུར་པ་གང་ཡིན་པ། །" - }, - { - "index": 8, - "segmentation_type": "segmentation", - "text": "དེ་ལ་རྫོགས་པའི་སངས་རྒྱས་བློ་ཡི་ས་བོན་ཡོད། །དེ་ཉིད་ཉེ་བར་བསྟན་པའི་སྣོད་ནི་དེ་ཡིན་ཏེ། །དེ་ལ་དམ་པའི་དོན་གྱི་བདེན་པ་བསྟན་པར་བྱ། །དེ་ལ་དེ་ཡི་རྗེས་སུ་འགྲོ་བའི་ཡོན་ཏན་འབྱུང༌། །" - }, - { - "index": 9, - "segmentation_type": "segmentation", - "text": "རྟག་ཏུ་ཚུལ་ཁྲིམས་ཡང་དག་བླངས་ནས་གནས་པར་འགྱུར། །སྦྱིན་པ་གཏོང་བར་འགྱུར་ཞིང་སྙིང་རྗེ་བསྟེན་པར་བྱེད། །བཟོད་པ་སྒོམ་བྱེད་དེ་ཡི་དགེ་བའང་བྱང་ཆུབ་ཏུ། །འགྲོ་བ་དགྲོལ་བར་བྱ་ཕྱིར་ཡོངས་སུ་བསྔོ་བྱེད་ཅིང༌། །" - }, - { - "index": 10, - "segmentation_type": "segmentation", - "text": "རྫོགས་པའི་བྱང་ཆུབ་སེམས་དཔའ་རྣམས་ལ་གུས་པར་བྱེད། །ཟབ་ཅིང་རྒྱ་ཆེའི་ཚུལ་ལ་མཁས་པའི་སྐྱེ་བོས་ནི། །རིམ་གྱིས་རབ་ཏུ་དགའ་བའི་ས་ནི་འཐོབ་འགྱུར་བས། །དེ་ནི་དོན་དུ་གཉེར་བས་ལམ་འདི་མཉན་པར་གྱིས། །" - }, - { - "index": 11, - "segmentation_type": "segmentation", - "text": "དེ་ཉིད་དེ་ལས་འབྱུང་མིན་གཞན་དག་ལས་ལྟ་ག་ལ་ཞིག །གཉིས་ཀ་ལས་ཀྱང་མ་ཡིན་རྒྱུ་མེད་པར་ནི་ག་ལ་ཡོད། །དེ་ནི་དེ་ལས་འབྱུང་ན་ཡོན་ཏན་འགའ་ཡང་ཡོད་མ་ཡིན། །སྐྱེས་པར་གྱུར་པ་སླར་ཡང་སྐྱེ་བར་རིགས་པའང་མ་ཡིན་ཉིད། །" - }, - { - "index": 12, - "segmentation_type": "segmentation", - "text": "སྐྱེས་ཟིན་སླར་ཡང་སྐྱེ་བར་ཡོངས་སུ་རྟོག་པར་འགྱུར་ན་ནི། །མྱུ་གུ་ལ་སོགས་རྣམས་ཀྱི་སྐྱེ་བ་འདིར་རྙེད་མི་འགྱུར་ཞིང༌། །ས་བོན་སྲིད་མཐར་ཐུག་པར་རབ་ཏུ་སྐྱེ་བ་ཉིད་དུ་འགྱུར། །ཇི་ལྟར་དེ་ཉིད་ཀྱིས་དེ་རྣམ་པར་འཇིག་པར་བྱེད་པར་འགྱུར། །" - } -] \ No newline at end of file diff --git a/tests/pecha/parser/docx/update/data/root/old_anns.json b/tests/pecha/parser/docx/update/data/root/old_anns.json deleted file mode 100644 index af86f063..00000000 --- a/tests/pecha/parser/docx/update/data/root/old_anns.json +++ /dev/null @@ -1,47 +0,0 @@ -[ - { - "index": 1, - "segmentation_type": "segmentation", - "text": "བུ་མ་འཇུག་པ་ལས་སེམས་བསྐྱེད་དྲུག་པ། ཤོ་ལོ་ཀ ༡-༦༤ མངོན་དུ་ཕྱོགས་པར་མཉམ་བཞག་སེམས་གནས་ཏེ། །རྫོགས་པའི་སངས་རྒྱས་ཆོས་ལ་མངོན་ཕྱོགས་ཤིང༌། །འདི་བརྟེན་འབྱུང་བའི་དེ་ཉིད་མཐོང་བ་དེས། །ཤེས་རབ་གནས་པས་འགོག་པ་ཐོབ་པར་འགྱུར། །" - }, - { - "index": 2, - "segmentation_type": "segmentation", - "text": "ཇི་ལྟར་ལོང་བའི་ཚོགས་ཀུན་བདེ་བླག་ཏུ། །མིག་ལྡན་སྐྱེས་བུ་གཅིག་གིས་འདོད་པ་ཡི། །ཡུལ་དུ་འཁྲིད་པ་དེ་བཞིན་འདིར་ཡང་བློས། །མིག་ཉམས་ཡོན་ཏན་བླངས་ཏེ་རྒྱལ་ཉིད་འགྲོ། །" - }, - { - "index": 3, - "segmentation_type": "segmentation", - "text": "ཇི་ལྟར་དེ་ཡིས་ཆེས་ཟབ་ཆོས་རྟོགས་པ། །ལུང་དང་གཞན་ཡང་རིགས་པས་ཡིན་པས་ན། །དེ་ལྟར་འཕགས་པ་ཀླུ་སྒྲུབ་གཞུང་ལུགས་ལས། །ཇི་ལྟར་གནས་པའི་ལུགས་བཞིན་བརྗོད་པར་བྱ། །" - }, - { - "index": 4, - "segmentation_type": "segmentation", - "text": "སོ་སོ་སྐྱེ་བོའི་དུས་ནའང་སྟོང་པ་ཉིད་ཐོས་ནས། །ནང་དུ་རབ་ཏུ་དགའ་བ་ཡང་དང་ཡང་དུ་འབྱུང༌། །རབ་ཏུ་དགའ་བ་ལས་བྱུང་མཆི་མས་མིག་བརླན་ཞིང༌། །ལུས་ཀྱི་བ་སྤུ་ལྡང་པར་འགྱུར་པ་གང་ཡིན་པ། །" - }, - { - "index": 5, - "segmentation_type": "segmentation", - "text": "དེ་ལ་རྫོགས་པའི་སངས་རྒྱས་བློ་ཡི་ས་བོན་ཡོད། །དེ་ཉིད་ཉེ་བར་བསྟན་པའི་སྣོད་ནི་དེ་ཡིན་ཏེ། །དེ་ལ་དམ་པའི་དོན་གྱི་བདེན་པ་བསྟན་པར་བྱ། །དེ་ལ་དེ་ཡི་རྗེས་སུ་འགྲོ་བའི་ཡོན་ཏན་འབྱུང༌། །" - }, - { - "index": 6, - "segmentation_type": "segmentation", - "text": "རྟག་ཏུ་ཚུལ་ཁྲིམས་ཡང་དག་བླངས་ནས་གནས་པར་འགྱུར། །སྦྱིན་པ་གཏོང་བར་འགྱུར་ཞིང་སྙིང་རྗེ་བསྟེན་པར་བྱེད། །བཟོད་པ་སྒོམ་བྱེད་དེ་ཡི་དགེ་བའང་བྱང་ཆུབ་ཏུ། །འགྲོ་བ་དགྲོལ་བར་བྱ་ཕྱིར་ཡོངས་སུ་བསྔོ་བྱེད་ཅིང༌། །" - }, - { - "index": 7, - "segmentation_type": "segmentation", - "text": "རྫོགས་པའི་བྱང་ཆུབ་སེམས་དཔའ་རྣམས་ལ་གུས་པར་བྱེད། །ཟབ་ཅིང་རྒྱ་ཆེའི་ཚུལ་ལ་མཁས་པའི་སྐྱེ་བོས་ནི། །རིམ་གྱིས་རབ་ཏུ་དགའ་བའི་ས་ནི་འཐོབ་འགྱུར་བས། །དེ་ནི་དོན་དུ་གཉེར་བས་ལམ་འདི་མཉན་པར་གྱིས། །" - }, - { - "index": 8, - "segmentation_type": "segmentation", - "text": "དེ་ཉིད་དེ་ལས་འབྱུང་མིན་གཞན་དག་ལས་ལྟ་ག་ལ་ཞིག །གཉིས་ཀ་ལས་ཀྱང་མ་ཡིན་རྒྱུ་མེད་པར་ནི་ག་ལ་ཡོད། །དེ་ནི་དེ་ལས་འབྱུང་ན་ཡོན་ཏན་འགའ་ཡང་ཡོད་མ་ཡིན། །སྐྱེས་པར་གྱུར་པ་སླར་ཡང་སྐྱེ་བར་རིགས་པའང་མ་ཡིན་ཉིད། །" - }, - { - "index": 9, - "segmentation_type": "segmentation", - "text": "སྐྱེས་ཟིན་སླར་ཡང་སྐྱེ་བར་ཡོངས་སུ་རྟོག་པར་འགྱུར་ན་ནི། །མྱུ་གུ་ལ་སོགས་རྣམས་ཀྱི་སྐྱེ་བ་འདིར་རྙེད་མི་འགྱུར་ཞིང༌། །ས་བོན་སྲིད་མཐར་ཐུག་པར་རབ་ཏུ་སྐྱེ་བ་ཉིད་དུ་འགྱུར། །ཇི་ལྟར་དེ་ཉིད་ཀྱིས་དེ་རྣམ་པར་འཇིག་པར་བྱེད་པར་འགྱུར། །" - } -] \ No newline at end of file diff --git a/tests/pecha/parser/docx/update/test_docx_update.py b/tests/pecha/parser/docx/update/test_docx_update.py deleted file mode 100644 index 743045f1..00000000 --- a/tests/pecha/parser/docx/update/test_docx_update.py +++ /dev/null @@ -1,100 +0,0 @@ -from pathlib import Path -from unittest import TestCase - -from stam import AnnotationStore - -from openpecha.pecha import get_anns -from openpecha.pecha.parsers.docx.update import DocxAnnotationUpdate -from openpecha.utils import read_json -from tests.pecha import SharedPechaSetup - - -class TestDocxAnnotationUpdate(TestCase, SharedPechaSetup): - def setUp(self) -> None: - self.setup_pechas() - - self.root_pecha_backup = { - f: f.read_bytes() for f in self.root_pecha_path.glob("**/*") if f.is_file() - } - self.commentary_pecha_backup = { - f: f.read_bytes() - for f in self.commentary_pecha_path.glob("**/*") - if f.is_file() - } - - def test_root_pecha(self): - updater = DocxAnnotationUpdate() - annotation_path = "B5FE/segmentation-4FD1.json" - docx_file = Path( - "tests/pecha/parser/docx/annotation/data/root_display_pecha/དགོངས་པ་རབ་གསལ་ལས་སེམས་བསྐྱེད་དྲུག་པ། ཤོ་ལོ་ཀ ༡-༦༤ segmentation 1.docx" - ) - metadatas = [self.root_pecha_metadata] - - full_ann_path = self.root_pecha.layer_path / annotation_path - old_anns = get_anns(AnnotationStore(file=str(full_ann_path))) - expected_old_anns = read_json( - "tests/pecha/parser/docx/update/data/root/old_anns.json" - ) - assert ( - old_anns == expected_old_anns - ), "Old annotations do not match in Root Pecha Segmentation Layer Update" - - updater.update_annotation( - self.root_pecha, annotation_path, docx_file, metadatas - ) - - updated_anns = get_anns(AnnotationStore(file=str(full_ann_path))) - expected_new_anns = read_json( - "tests/pecha/parser/docx/update/data/root/new_anns.json" - ) - assert ( - updated_anns == expected_new_anns - ), "New annotations do not match in Root Pecha Segmentation Layer Update" - - def test_commentary_pecha(self): - updater = DocxAnnotationUpdate() - annotation_path = "B014/alignment-2127.json" - docx_file = Path( - "tests/pecha/parser/docx/annotation/data/commentary_pecha/དགོངས་པ་རབ་གསལ་ལས་སེམས་བསྐྱེད་དྲུག་པ། ཤོ་ལོ་ཀ ༡-༦༤ _commentary segmentation 1.docx" - ) - metadatas = [self.commentary_pecha_metadata, self.root_pecha_metadata] - - full_ann_path = self.commentary_pecha.layer_path / annotation_path - old_anns = get_anns(AnnotationStore(file=str(full_ann_path))) - expected_old_anns = read_json( - "tests/pecha/parser/docx/update/data/commentary/old_anns.json" - ) - assert ( - old_anns == expected_old_anns - ), "Old annotations do not match in Commentary Pecha Segmentation Layer Update" - - updater.update_annotation( - self.commentary_pecha, annotation_path, docx_file, metadatas - ) - - updated_anns = get_anns(AnnotationStore(file=str(full_ann_path))) - expected_updated_anns = read_json( - "tests/pecha/parser/docx/update/data/commentary/new_anns.json" - ) - assert ( - updated_anns == expected_updated_anns - ), "New annotations do not match in Commentary Pecha Segmentation Layer Update" - - def tearDown(self) -> None: - # Revert all original files - for f, content in self.root_pecha_backup.items(): - f.write_bytes(content) - - # Remove any new files that weren't in the original backup - for f in self.root_pecha_path.glob("**/*"): - if f.is_file() and f not in self.root_pecha_backup: - f.unlink() - - # Revert all original files - for f, content in self.commentary_pecha_backup.items(): - f.write_bytes(content) - - # Remove any new files that weren't in the original backup - for f in self.commentary_pecha_path.glob("**/*"): - if f.is_file() and f not in self.commentary_pecha_backup: - f.unlink() diff --git a/tests/pecha/parser/docx/utils/data/footnote/one_page/after_one_page.txt b/tests/pecha/parser/docx/utils/data/footnote/one_page/after_one_page.txt deleted file mode 100644 index cf1f7877..00000000 --- a/tests/pecha/parser/docx/utils/data/footnote/one_page/after_one_page.txt +++ /dev/null @@ -1,35 +0,0 @@ -1) - -2) 菩薩戒品釋 - -3) 功德光論師造 - -4) 解說菩薩戒品。 - -5) 頂禮一切佛菩薩! - -6) 問:「云何所說『具四功德自性尸羅,應知即是妙善(淨戒)』?」 - -7) 依彼而言,說「能利自(他)」等文。 - -8) 其中「利益」謂善行。 - -9) 「安樂」謂無惱害。 - -10) 「哀憫」,謂如以諸善及無惱害行哀憫對方。 - -11) 「義利」謂希求義利及具有義利,凡所有欲求及無罪。 - -12) 「利益安樂故」謂住於善及無惱害行。 - -13) 「人」謂刹帝利等,彼等中多數,由於佛陀出世、善說正法、善建立僧伽,當成極多利益、安樂。 - -14) 彼等亦由利益、安樂自己後,而哀憫世間, - -15) 彼等於他人作如是念:「(他們)具足利益安樂,復何妙哉!」 - -16) 他人亦作是念:「我等亦得如是,亦何其妙哉!」 - -17) 是故,說「令得義利、利益、安樂故。」 - -18) 「諸人天等」謂不能通達及成辦彼等之義利故。 \ No newline at end of file diff --git a/tests/pecha/parser/docx/utils/data/footnote/one_page/before_one_page.txt b/tests/pecha/parser/docx/utils/data/footnote/one_page/before_one_page.txt deleted file mode 100644 index eba2d528..00000000 --- a/tests/pecha/parser/docx/utils/data/footnote/one_page/before_one_page.txt +++ /dev/null @@ -1,41 +0,0 @@ -1) - -2) 菩薩戒品釋 - -3) 功德光----footnote0----論師造 - -4) 解說菩薩戒品----footnote1----。 - -5) 頂禮一切佛菩薩! - -6) 問:「云何所說『具四功德自性尸羅----footnote2----,應知即是妙善(淨戒)』?」 - -7) 依彼而言,說「能利自(他)」等文。 - -8) 其中「利益」謂善行。 - -9) 「安樂」謂無惱害。 - -10) 「哀憫」,謂如以諸善及無惱害行哀憫對方。 - -11) 「義利」謂希求義利及具有義利,凡所有欲求及無罪。 - -12) 「利益安樂故」謂住於善及無惱害行。 - -13) 「人」謂刹帝利等,彼等中多數,由於佛陀出世、善說正法、善建立僧伽,當成極多利益、安樂。 - -14) 彼等亦由利益、安樂自己後,而哀憫世間, - -15) 彼等於他人作如是念:「(他們)具足利益安樂,復何妙哉!」 - -16) 他人亦作是念:「我等亦得如是,亦何其妙哉!」 - -17) 是故,說「令得義利、利益、安樂故。」 - -18) 「諸人天等」謂不能通達及成辦彼等之義利故。 - -footnote0) 功德光論師,世親菩薩弟子,極善巧毗奈耶,出生於秣搜羅國婆羅門家,幼時 熟習外道宗義及明處,後於家鄉出家並受具足戒,依止世親菩薩,修習大小乘 藏,及聲聞十八部派一切宗義,弟子約五千比丘。住世說約四百年,是否屬實, 猶存疑惑,然而確是住世很久。後圓寂於自己的故鄉。著作頗豐,譯為藏文廣 知者,有《菩薩地所分佈施次第第九之上注釋》、《菩薩地戒品釋》,此二者 由那措譯師及香智軍二位譯為藏文,存於論典經品「ཡི」函中。《毗奈耶根本經》 及其自釋、《毗奈耶行持一百零一竭摩法》,在八世紀末,赤松德贊時,由覺 柔譯師及噶榮戒源二位譯為藏文,存於論典經品「ཟུ」、「འུ」、「ཡུ」。 - -footnote1) 《瑜伽師地論》中《本地分》菩薩地之菩薩戒品。 - -footnote2) 《菩薩地戒品》說:「云何菩薩自性戒?謂若略說具四功德,當知是名菩薩自性戒。何等為四?一、從他正受。二、善淨意樂。三、犯已還淨。四、深敬專念無有違犯。」 \ No newline at end of file diff --git a/tests/pecha/parser/docx/utils/data/footnote/one_page/one_page_footnote.docx b/tests/pecha/parser/docx/utils/data/footnote/one_page/one_page_footnote.docx deleted file mode 100644 index 3af9eb4b..00000000 Binary files a/tests/pecha/parser/docx/utils/data/footnote/one_page/one_page_footnote.docx and /dev/null differ diff --git a/tests/pecha/parser/docx/utils/data/footnote/one_page/~$e_page_footnote.docx b/tests/pecha/parser/docx/utils/data/footnote/one_page/~$e_page_footnote.docx deleted file mode 100644 index 7fcab66c..00000000 Binary files a/tests/pecha/parser/docx/utils/data/footnote/one_page/~$e_page_footnote.docx and /dev/null differ diff --git a/tests/pecha/parser/docx/utils/data/footnote/two_page/after_two_page.txt b/tests/pecha/parser/docx/utils/data/footnote/two_page/after_two_page.txt deleted file mode 100644 index 8eac18bf..00000000 --- a/tests/pecha/parser/docx/utils/data/footnote/two_page/after_two_page.txt +++ /dev/null @@ -1,37 +0,0 @@ -1) 聖梵天所問大乘經 - -2) 敬禮一切佛菩薩 - -3) 如是我聞:一時, - -4) 世尊在毘舍離大林重樓閣, - -5) 與大比丘眾萬二千人及諸菩薩摩訶薩俱。 - -6) 世尊於中夜,結跏趺坐,正念而住, - -7) 有尋有伺,定生喜樂,入於初禪。 - -8) 從彼定起,無尋無伺,定生喜樂,入第二禪。 - -9) 從彼定起,離於喜貪, - -10) 具平等捨,亦具正念, - -11) 住於妙樂,入第三禪。 - -12) 從彼定起,樂亦復斷: - -13) 前苦已斷,憂喜皆泯,故無苦樂, - -14) 具平等捨,正念清淨,入第四禪。 - -15) 復次,世尊從四禪起,入第三禪; - -16) 從三禪起,入第二禪; - -17) 從二禪起,入於初禪; - -18) 從初禪起,入於三昧,名曰:如實應機於諸有情普現色身。 - -19) 世尊既入如實應機於諸有情普現色身三昧頃,大放光明,威光妙德而住。[爾時,]娑婆世界主梵 \ No newline at end of file diff --git a/tests/pecha/parser/docx/utils/data/footnote/two_page/before_two_page.txt b/tests/pecha/parser/docx/utils/data/footnote/two_page/before_two_page.txt deleted file mode 100644 index ed0bb836..00000000 --- a/tests/pecha/parser/docx/utils/data/footnote/two_page/before_two_page.txt +++ /dev/null @@ -1,59 +0,0 @@ -1) 聖梵天所問大乘經 - -2) 敬禮一切佛菩薩 - -3) 如是我聞:一時, - -4) 世尊----footnote0----在毘舍離大林重樓閣----footnote1----, - -5) 與大比丘眾----footnote2----萬二千人及諸菩薩摩訶薩俱。 - -6) 世尊於中夜----footnote3----,結跏趺坐,正念而住, - -7) 有尋有伺----footnote4----,定生喜樂,入於初禪。----footnote5---- - -8) 從彼定起,無尋無伺,定生喜樂,入第二禪----footnote6----。 - -9) 從彼定起,離於喜貪----footnote7----, - -10) 具平等捨,亦具正念, - -11) 住於妙樂----footnote8----,入第三禪。 - -12) 從彼定起,樂亦復斷: - -13) 前苦已斷,憂喜皆泯,故無苦樂, - -14) 具平等捨,正念清淨,入第四禪。 - -15) 復次,世尊從四禪起,入第三禪; - -16) 從三禪起,入第二禪; - -17) 從二禪起,入於初禪; - -18) 從初禪起,入於三昧,名曰:如實應機於諸有情普現色身。 - -19) 世尊既入如實應機於諸有情普現色身三昧頃,大放光明,威光妙德----footnote9----而住。----footnote10----[爾時,]娑婆世界主梵 - -footnote0) 世尊:原文寫做 བཅོམ་ལྡན་འདས,亦音譯為薄伽梵(bhagavat),為佛陀十號之一,有多種意義。漢譯經典多取其中一義,譯為世尊。 - -footnote1) 大林重樓閣:原文寫做 ཚལ་ཆེན་པོའི་ཁང་པ་བརྩེགས་པའི་གནས。位於中印度毘舍離城北附近。此譯名見《高僧法顯傳》。 - -footnote2) 大比丘眾萬二千人:原文寫做 དགེ་སློང་ཁྲི་ཉིས་སྟོང་གི་དགེ་འདུན་ཆེན་པོ,「大比丘眾」(དགེ་སློང་གི་དགེ་འདུན་ཆེན་པོ)一詞常見於佛典,人數由五百至無量不等。關於此詞彙, 常見的解釋約有二種:一種解為「比丘萬二千人大眾(mahāsaṃghika / དགེ་ འདུན་ཆེན་པོ)」或「比丘大眾萬二千人」,另一則解為「聞法眾皆為大比丘」。 此處譯法順古。 - -footnote3) 中夜:原文寫做 མི་ཉལ་ཙམ,梵文做 praśāntarātriḥ,直譯為「約莫未寢之時」。《翻 譯名義大集》中將之譯為「未映之時」(未映義為天未破曉),並記為「極為 寧靜之夜」(མཚན་མོ་རབ་ཏུ་ཞི་བ)的同義詞(Mvyt: 。此詞在藏文大藏經中 的出現頻率並不高,若依據藏語文直觀的解讀方式,常有人將之解為「約莫將 寢未寢之時」,但在《大正藏》中,並未出現過「未映之時」一詞,或雖有「未 寢」「未眠」等詞,然其詞義及語境亦與此處有別。基於前述原因,按照字面 將之譯出的做法顯然有待商榷。經比照錄有此詞的藏本大藏經與《大正藏》可 知:《佛說首楞嚴三昧經.卷一》中,將之翻為「中夜半」(CBETA, T no. ,《方廣大莊嚴經.卷一.序品第一》中翻為「中夜」(CBETA, T no. ,《佛說月上女經》中則僅翻為「夜」一字(CBETA, T no. ;至 於其餘藏文藏經,僅於《華嚴經》尚有此詞,如《大方廣佛華嚴經.佛不思議 法品.第三十三》,但在漢譯《佛不思議法品》中,則未將該詞譯出。在此, 根據前述分析,參考《首楞嚴三昧經》與《方廣大莊嚴經》,將此詞彙譯為「中 夜」。 - -footnote4) 有尋有伺:尋伺為尋、伺二詞之合稱。尋係指粗大的分別,伺則指細微的分別。 相關解釋,可參考《中華佛教百科全書》「尋伺」條:「尋與伺二心所的併稱。 『尋』者,舊譯『覺』,為粗略推求諸法名義的思惟作用,通於定、散及無漏。 『伺』者,舊譯『觀』,乃細心伺察諸法名義的思惟作用,不遍於一切心,不 起於一切時,其性雖遲鈍,但深入推度名身等,與『尋』同有等起語言之作用。 二者皆攝於俱舍七十五法的不定地法、唯識百法的四不定。」 - -footnote5) 有尋有伺,定生喜樂,入於初禪:原文寫做 རྟོག་པ་དང་བཅས་པ། དཔྱོད་པ་དང་བཅས་པའི་ཏིང་ངེ་འཛིན་ལས་སྐྱེས་པའི་དགའ་བ་དང་བདེ་བ་ཅན་བསམ་གཏན་དང་པོ་ལ་སྙོམས་པར་ཞུགས,直譯為「平 等安住於帶有尋伺的三昧所生喜樂之初禪」。其中「定生喜樂」一詞,義為「由 三昧所生之喜樂」,漢文古譯多做此種譯法,今從古而譯,下同。特此說明。 又,漢譯佛經中,「定生喜樂」往往專指二禪境界,而初禪則多譯為「離生喜 樂」,義為「由『捨離五欲及諸罪』所生之喜樂」。由於原文已經敘明是由三 昧所生之喜樂,故雖與漢譯慣用譯法略有出入,此處仍將之譯為「定生喜樂」。 又,「禪」字原文寫做 བསམ་གཏན,梵文則做 dhyāna,音譯為禪那,簡稱禪;意 譯則為靜慮,義指心專注於某一事物的狀態。鳩摩羅什翻譯此字時,多做靜慮, 但有時也會譯為禪。玄奘翻譯時則一律譯為靜慮。為俾讀誦,此處採用鳩摩羅 什的譯法,將之譯為禪。此外,漢譯佛典中,多將「入於初禪」寫做「入初禪」 或「入初靜慮」。為俾讀誦並兼顧前後用語的一致性,此處採用權宜的譯法, 譯為「入於初禪」。 - -footnote6) 此句省略了主詞「世尊」。由於此段關於入出四禪境界的主詞皆為世尊,為俾 閱讀,特將入於第二、三四禪的主詞世尊一概略去不譯,不另行再加註解。特 此說明。 - -footnote7) 喜貪:原文寫做 དགའ་བའི་འདོད་ཆགས,梵文做 nandīrāgaḥ,為「喜中貪」之義。《阿 毘達磨俱舍論》卷三〈分別根品第二〉:「第三靜慮心悅安靜,離喜貪故唯名 樂根。」(CBETA Q T no. - -footnote8) 住於妙樂:原文寫做 བདེ་བར་གནས་པ,義為「住於樂中」。由於三禪所受之樂, 已離貪喜,所以將之譯為「妙樂」,以突出此樂於三界九地最上之義。 - -footnote9) 威光妙德:原文寫做 ལྷམ་མེ་ལྷན་ནེ་ལྷང་ངེ་བ,義指佛陀放光降伏天、人、阿修羅 的三種功德。此譯詞可見於玄奘譯《大般若波羅蜜多經.卷第四百一十八. 第二分超勝品第二十之二》及同經《卷第四百九十三.第三分善現品第三之 十二》:「以諸如來、應、正等覺三十二大士相、八十隨好所莊嚴身,非實有性, 是非有性故,諸如來、應、正等覺威光妙德普超一切世間天、人、阿素洛等, 最尊最勝。」(CBETA, T no. 西元 世紀印度論師 Damshtasena 所造《聖 般若十萬頌、二萬五千頌、八千頌廣說》對該詞語則做此解:「威光妙德:光 者,顯義,熾盛義;妙者,身色肌膚美好義;威者,勢力義;德者,形色美好 等義也。云何『普超,美好,威光妙德』?私意以為,此謂如來說法時,為令 聽者生敬心故,乃如是演示。彼時為令一切有情皆如是生心,因作是語也。以 光普超,故云美好;身色普超,故云為妙;威勢普超,故云為威;妙德普超, 故云為德。佛以光降伏梵天,以梵天自恃威光故;佛復以身色降伏諸天,以威 勢降伏魔類外道,以德降伏眾人也。」(德格版《丹珠爾》Vol. BDRC ID:W folio .. - -footnote10) 放 大 光 明, 威 光 妙 德 而 住: 原 文 寫 做 བཅོམ་ལྡན་འདས་ཤིན་ཏུ་ལྷམ་མེ་ལྷན་ནེ་ལྷང་ངེར་བཞུགས,直譯為「世尊放大光明,威光妙德而住」。因同段前文主詞亦為世尊,為顧及譯文文脈流暢,酌將主詞刪除,特此說明。 \ No newline at end of file diff --git a/tests/pecha/parser/docx/utils/data/footnote/two_page/two_page_footnote.docx b/tests/pecha/parser/docx/utils/data/footnote/two_page/two_page_footnote.docx deleted file mode 100644 index 3fba3ebc..00000000 Binary files a/tests/pecha/parser/docx/utils/data/footnote/two_page/two_page_footnote.docx and /dev/null differ diff --git a/tests/pecha/parser/docx/utils/test_remove_footnote.py b/tests/pecha/parser/docx/utils/test_remove_footnote.py deleted file mode 100644 index 83863975..00000000 --- a/tests/pecha/parser/docx/utils/test_remove_footnote.py +++ /dev/null @@ -1,31 +0,0 @@ -from pathlib import Path -from unittest import TestCase - -from openpecha.pecha.parsers.docx.utils import read_docx, remove_footnote - - -class TestRemoveFootNote(TestCase): - def setUp(self): - self.FOOTNOTE_DIR = Path(__file__).parent / "data" / "footnote" - self.ONE_PAGE_DIR = self.FOOTNOTE_DIR / "one_page" - self.TWO_PAGE_DIR = self.FOOTNOTE_DIR / "two_page" - self.one_page_footnote = self.ONE_PAGE_DIR / "one_page_footnote.docx" - self.two_page_footnote = self.TWO_PAGE_DIR / "two_page_footnote.docx" - - def test_remove_footnote_one_page(self): - text = read_docx(self.one_page_footnote, False) - expected_before = self.ONE_PAGE_DIR / "before_one_page.txt" - assert text == expected_before.read_text(encoding="utf-8").strip() - - text = remove_footnote(text) - expected_after = self.ONE_PAGE_DIR / "after_one_page.txt" - assert text == expected_after.read_text(encoding="utf-8").strip() - - def test_remove_footnote_two_page(self): - text = read_docx(self.two_page_footnote, False) - expected_before = self.TWO_PAGE_DIR / "before_two_page.txt" - assert text == expected_before.read_text(encoding="utf-8").strip() - - text = remove_footnote(text) - expected_after = self.TWO_PAGE_DIR / "after_two_page.txt" - assert text == expected_after.read_text(encoding="utf-8").strip() diff --git a/tests/pecha/parser/pedurma/data/expected_base.txt b/tests/pecha/parser/pedurma/data/expected_base.txt deleted file mode 100644 index 6e0dc710..00000000 --- a/tests/pecha/parser/pedurma/data/expected_base.txt +++ /dev/null @@ -1,7 +0,0 @@ -༄༅། ། -རྒྱ་གར་སྐད་དུ། དྷརྨ་དྷཱ་ཏུ་སྟ་བཾ། བོད་སྐད་དུ། ཆོས་ཀྱི་དབྱིངས་སུ་བསྟོད་པ། འཕགས་པ་འཇམ་དཔལ་གཞོན་ནུར་གྱུར་པ་ལ་ཕྱག་འཚལ་ལོ། །གང་ཞིག་ཀུན་དུ་མ་ཤེས་ན། ། -སྲིད་པ་གསུམ་དུ་རྣམ་འཁོར་བ། །སེམས་ཅན་ཀུན་ལ་ངེས་གནས་པའི། །ཆོས་ཀྱི་དབྱིངས་ལ་ཕྱག་འཚལ་འདུད། ། -གང་ཞིག་འཁོར་བའི་རྒྱུར་གྱུར་པ། །དེ་ཉིད་སྦྱང་བ་བྱས་པང་པོ་མི་་ལས། །དག་པ་དེ་ཉིད་མྱ་ངན་འདས། ། -ཆོས་ཀྱི་སྐུ་ཡང་དེ་ཉིད་དོ། །ཇི་ལྟར་འོ་མ་དང་འདྲེས་པས། །མར་གྱི་སྙིསྣང་བ། ། -དེ་བཞིན་ཉོན་མོངས་དང་འདྲེས་པས། །ཆོས་ཀྱི་དབྱིངས་ཀྱང་མི་མཐོང་ངོ་། །ཇི་ལྟར་འོ་མ་རྣམ་སྦྱངས་པས། ། -མར་གྱི་སྙིང་པོ་དྲི་མེད་འགྱུར། །དེ་བཞིན་ཉོན་མོངས་རྣམ་སྦྱངས་པས། །ཆོས་དབྱིངས་ཤིན་ཏུ་དྲི་མེད་འགྱུར། ། \ No newline at end of file diff --git a/tests/pecha/parser/pedurma/data/metadata.json b/tests/pecha/parser/pedurma/data/metadata.json deleted file mode 100644 index 82471c32..00000000 --- a/tests/pecha/parser/pedurma/data/metadata.json +++ /dev/null @@ -1 +0,0 @@ -{"title_bo": "རྩོད་པའི་རིགས་པའི་འགྲེལ་པ་དོན་རྣམ་པར་འབྱེད་པ་།", "alt_title_bo": "རྩོད་པའི་རིགས་པའི་འགྲེལ་པ་དོན་རྣམ་པར་འབྱེད་པ།", "text_id": "D4239", "author_en": "Shantarakshita", "author_bo": "མཁན་ཆེན་ཞི་བ་འཚོ་", "bdrc_person_id": "P5659", "initial_creation_type": "input", "language": "bo"} \ No newline at end of file diff --git a/tests/pecha/parser/pedurma/data/pedurma_hfml.txt b/tests/pecha/parser/pedurma/data/pedurma_hfml.txt deleted file mode 100644 index 01501a09..00000000 --- a/tests/pecha/parser/pedurma/data/pedurma_hfml.txt +++ /dev/null @@ -1,7 +0,0 @@ -༄༅། །(1) <«ཅོ་»«སྡེ་»«སྣར་»«པེ་»༄༅། །ཆོས་ཀྱི་དབྱིངས་སུ་བསྟོད་པ། ༄༅༅། །> -རྒྱ་གར་སྐད་དུ། དྷརྨ་དྷཱ་ཏུ་སྟ་བཾ། བོད་སྐད་དུ། ཆོས་ཀྱི་དབྱིངས་སུ་བསྟོད་པ། :འཕགས་པ་འཇམ་(3) <«སྣར་»«པེ་»འཇམ་>དཔལ་གཞོན་ནུར་གྱུར་པ་ལ་ཕྱག་འཚལ་ལོ། །གང་ཞིག་ཀུན་དུ་མ་ཤེས་ན། ། -སྲིད་པ་གསུམ་དུ་རྣམ་འཁོར་བ། །སེམས་ཅན་ཀུན་ལ་ངེས་གནས་པའི། །ཆོས་ཀྱི་དབྱིངས་ལ་ཕྱག་འཚལ་འདུད།(4) <«སྣར་»«པེ་»ལོ།> ། -གང་ཞིག་འཁོར་བའི་རྒྱུར་གྱུར་པ། །དེ་ཉིད་སྦྱང་བ་བྱས་པང་པོ་མི་་ལས། །དག་པ་དེ་ཉིད་མྱ་ངན་འདས། ། -ཆོས་ཀྱི་སྐུ་ཡང་དེ་ཉིད་དོ། །ཇི་ལྟར་འོ་མ་དང་འདྲེས་པས། །མར་གྱི་སྙིསྣང་བ། ། -དེ་བཞིན་ཉོན་མོངས་དང་འདྲེས་པས། །ཆོས་ཀྱི་དབྱིངས་ཀྱང་མི་མཐོང་ངོ་། །ཇི་ལྟར་འོ་མ་རྣམ་སྦྱངས་པས། ། -མར་གྱི་སྙིང་པོ་དྲི་མེད་འགྱུར། །དེ་བཞིན་ཉོན་མོངས་རྣམ་(5) <«སྣར་»«པེ་»རྣམས་>སྦྱངས་པས། །ཆོས་དབྱིངས་ཤིན་ཏུ་དྲི་མེད་འགྱུར། ། \ No newline at end of file diff --git a/tests/pecha/parser/pedurma/preprocess_input/expected_output.txt b/tests/pecha/parser/pedurma/preprocess_input/expected_output.txt deleted file mode 100644 index 93a922ef..00000000 --- a/tests/pecha/parser/pedurma/preprocess_input/expected_output.txt +++ /dev/null @@ -1,3 +0,0 @@ -ཡང་འདིར་:རྟོག་པ་(9) <«པེ་»རྟོགས་པ་> ཡང་གང་ལ་མངོན་པར་འདོད་ཅིང་གང་དང་བྲལ་བའི་ཤེས་པ་མངོན་སུམ་ཡིན་ཞེ་ན། -:རྟོག་པ་(10) <«པེ་»རྟོགས་པ་> མངོན་པར་བརྗོད་ཅན་གྱི།། -ཤེས་པ་ཞེས་བྱ་བ་སྨོས་ཏེ། diff --git a/tests/pecha/parser/pedurma/preprocess_input/input.txt b/tests/pecha/parser/pedurma/preprocess_input/input.txt deleted file mode 100644 index fd070462..00000000 --- a/tests/pecha/parser/pedurma/preprocess_input/input.txt +++ /dev/null @@ -1 +0,0 @@ -ཡང་འདིར་:རྟོག་པ་(9) <«པེ་»རྟོགས་པ་> ཡང་གང་ལ་མངོན་པར་འདོད་ཅིང་གང་དང་བྲལ་བའི་ཤེས་པ་མངོན་སུམ་ཡིན་ཞེ་ན།:རྟོག་པ་(10) <«པེ་»རྟོགས་པ་> མངོན་པར་བརྗོད་ཅན་གྱི།།ཤེས་པ་ཞེས་བྱ་བ་སྨོས་ཏེ། \ No newline at end of file diff --git a/tests/pecha/parser/pedurma/test_pedurma.py b/tests/pecha/parser/pedurma/test_pedurma.py deleted file mode 100644 index b884e9e9..00000000 --- a/tests/pecha/parser/pedurma/test_pedurma.py +++ /dev/null @@ -1,55 +0,0 @@ -import tempfile -from pathlib import Path - -from openpecha.pecha.parsers.pedurma import PedurmaParser -from openpecha.utils import read_json - - -def test_pedurma(): - data = Path(__file__).parent / "data" - pedurmafile = data / "pedurma_hfml.txt" - pedurma_text = pedurmafile.read_text(encoding="utf-8") - - metadata_file = data / "metadata.json" - metadata = read_json(metadata_file) - - parser = PedurmaParser() - - with tempfile.TemporaryDirectory() as tmpdirname: - output_path = Path(tmpdirname) - parser.parse(pedurma_text, metadata=metadata, output_path=output_path) - - expected_base = (data / "expected_base.txt").read_text(encoding="utf-8") - assert parser.base_text == expected_base - - # Checking extracted pedurma annotations - expected_span_texts = ["༄༅། །", "འཕགས་པ་འཇམ་", "འདུད།", "རྣམ་"] - for ann, expected_span in zip(parser.pedurma_anns, expected_span_texts): - start, end = ann.span.start, ann.span.end - assert parser.base_text[start:end] == expected_span - - expected_ann_notes = [ - "(1) <«ཅོ་»«སྡེ་»«སྣར་»«པེ་»༄༅། །ཆོས་ཀྱི་དབྱིངས་སུ་བསྟོད་པ། ༄༅༅། །>", - "(3) <«སྣར་»«པེ་»འཇམ་>", - "(4) <«སྣར་»«པེ་»ལོ།>", - "(5) <«སྣར་»«པེ་»རྣམས་>", - ] - - for ann, expected_note in zip(parser.pedurma_anns, expected_ann_notes): - assert ann.note == expected_note - - # Checking extracted meaning segment annotations - expected_meaning_segments = [ - "༄༅། །", - "རྒྱ་གར་སྐད་དུ། དྷརྨ་དྷཱ་ཏུ་སྟ་བཾ། བོད་སྐད་དུ། ཆོས་ཀྱི་དབྱིངས་སུ་བསྟོད་པ། འཕགས་པ་འཇམ་དཔལ་གཞོན་ནུར་གྱུར་པ་ལ་ཕྱག་འཚལ་ལོ། །གང་ཞིག་ཀུན་དུ་མ་ཤེས་ན། །", - "སྲིད་པ་གསུམ་དུ་རྣམ་འཁོར་བ། །སེམས་ཅན་ཀུན་ལ་ངེས་གནས་པའི། །ཆོས་ཀྱི་དབྱིངས་ལ་ཕྱག་འཚལ་འདུད། །", - "གང་ཞིག་འཁོར་བའི་རྒྱུར་གྱུར་པ། །དེ་ཉིད་སྦྱང་བ་བྱས་པང་པོ་མི་་ལས། །དག་པ་དེ་ཉིད་མྱ་ངན་འདས། །", - "ཆོས་ཀྱི་སྐུ་ཡང་དེ་ཉིད་དོ། །ཇི་ལྟར་འོ་མ་དང་འདྲེས་པས། །མར་གྱི་སྙིསྣང་བ། །", - "དེ་བཞིན་ཉོན་མོངས་དང་འདྲེས་པས། །ཆོས་ཀྱི་དབྱིངས་ཀྱང་མི་མཐོང་ངོ་། །ཇི་ལྟར་འོ་མ་རྣམ་སྦྱངས་པས། །", - "མར་གྱི་སྙིང་པོ་དྲི་མེད་འགྱུར། །དེ་བཞིན་ཉོན་མོངས་རྣམ་སྦྱངས་པས། །ཆོས་དབྱིངས་ཤིན་ཏུ་དྲི་མེད་འགྱུར། །", - ] - for ann, expected_segment in zip( - parser.meaning_segment_anns, expected_meaning_segments - ): - start, end = ann.span.start, ann.span.end - assert parser.base_text[start:end] == expected_segment diff --git a/tests/pecha/parser/pedurma/test_pedurma_preprocess.py b/tests/pecha/parser/pedurma/test_pedurma_preprocess.py deleted file mode 100644 index 6552d0ce..00000000 --- a/tests/pecha/parser/pedurma/test_pedurma_preprocess.py +++ /dev/null @@ -1,13 +0,0 @@ -from pathlib import Path - -from openpecha.pecha.parsers.pedurma import preprocess_pedurma_text - - -def test_pedurma_preprocess(): - DATA_DIR = Path(__file__).parent / "preprocess_input" - - input_text = (DATA_DIR / "input.txt").read_text(encoding="utf-8") - output = preprocess_pedurma_text(input_text) - - expected_output = (DATA_DIR / "expected_output.txt").read_text(encoding="utf-8") - assert output == expected_output diff --git a/tests/pecha/test_pecha_types.py b/tests/pecha/test_pecha_types.py deleted file mode 100644 index 449c3638..00000000 --- a/tests/pecha/test_pecha_types.py +++ /dev/null @@ -1,135 +0,0 @@ -from typing import List -from unittest import TestCase - -from openpecha.pecha import Pecha -from openpecha.pecha.annotations import AnnotationModel -from openpecha.pecha.pecha_types import PechaType, get_pecha_type -from tests.pecha import DummyMetadataModel, SharedPechaSetup - - -class TestPechaType(TestCase, SharedPechaSetup): - def setUp(self): - self.setup_pechas() - - def test_root_pecha(self): - pechas: list[Pecha] = [self.root_pecha] - metadatas: list[DummyMetadataModel] = [self.root_pecha_metadata] - annotations: dict[str, List[AnnotationModel]] = { - self.root_pecha.id: self.root_pecha_annotations - } - annotation_path = "B8B3/segmentation-74F4.json" - - assert ( - get_pecha_type(pechas, metadatas, annotations, annotation_path) - == PechaType.root_pecha - ) - - def test_root_translation_pecha(self): - pechas: list[Pecha] = [self.root_translation_pecha, self.root_pecha] - metadatas: list[DummyMetadataModel] = [ - self.root_translation_pecha_metadata, - self.root_pecha_metadata, - ] - annotations: dict[str, List[AnnotationModel]] = { - self.root_pecha.id: self.root_pecha_annotations, - self.root_translation_pecha.id: self.root_translation_pecha_annotations, - } - annotation_path = "9813/alignment-AE0B.json" - - assert ( - get_pecha_type(pechas, metadatas, annotations, annotation_path) - == PechaType.root_translation_pecha - ) - - def test_commentary_pecha(self): - pechas: list[Pecha] = [self.commentary_pecha, self.root_pecha] - metadatas: list[DummyMetadataModel] = [ - self.commentary_pecha_metadata, - self.root_pecha_metadata, - ] - annotations: dict[str, List[AnnotationModel]] = { - self.root_pecha.id: self.root_pecha_annotations, - self.commentary_pecha.id: self.commentary_pecha_annotations, - } - annotation_path = "B014/alignment-2127.json" - - assert ( - get_pecha_type(pechas, metadatas, annotations, annotation_path) - == PechaType.commentary_pecha - ) - - def test_commentary_translation_pecha(self): - pechas: list[Pecha] = [ - self.commentary_translation_pecha, - self.commentary_pecha, - self.root_pecha, - ] - metadatas: list[DummyMetadataModel] = [ - self.commentary_translation_pecha_metadata, - self.commentary_pecha_metadata, - self.root_pecha_metadata, - ] - annotations: dict[str, List[AnnotationModel]] = { - self.root_pecha.id: self.root_pecha_annotations, - self.commentary_pecha.id: self.commentary_pecha_annotations, - self.commentary_translation_pecha.id: self.commentary_translation_pecha_annotations, - } - annotation_path = "EB60/alignment-6786.json" - assert ( - get_pecha_type(pechas, metadatas, annotations, annotation_path) - == PechaType.commentary_translation_pecha - ) - - def test_prealigned_root_translation_pecha(self): - pechas: list[Pecha] = [self.root_translation_pecha, self.root_pecha] - metadatas: list[DummyMetadataModel] = [ - self.prealigned_root_translation_pecha_metadata, - self.root_pecha_metadata, - ] - annotations: dict[str, List[AnnotationModel]] = { - self.root_pecha.id: self.root_pecha_annotations, - self.root_translation_pecha.id: self.prealigned_root_translation_pecha_annotations, - } - annotation_path = "D93E/alignment-0216.json" - assert ( - get_pecha_type(pechas, metadatas, annotations, annotation_path) - == PechaType.prealigned_root_translation_pecha - ) - - def test_prealigned_commentary_pecha(self): - pechas: list[Pecha] = [self.commentary_pecha, self.root_pecha] - metadatas: list[DummyMetadataModel] = [ - self.prealigned_commentary_pecha_metadata, - self.root_pecha_metadata, - ] - annotations: dict[str, List[AnnotationModel]] = { - self.root_pecha.id: self.root_pecha_annotations, - self.commentary_pecha.id: self.prealigned_commentary_pecha_annotations, - } - annotation_path = "E949/alignment-2F29.json" - assert ( - get_pecha_type(pechas, metadatas, annotations, annotation_path) - == PechaType.prealigned_commentary_pecha - ) - - def test_prealigned_commentary_translation_pecha(self): - pechas: list[Pecha] = [ - self.prealigned_commentary_translation_pecha, - self.commentary_pecha, - self.root_pecha, - ] - metadatas: list[DummyMetadataModel] = [ - self.prealigned_commentary_translation_pecha_metadata, - self.prealigned_commentary_pecha_metadata, - self.root_pecha_metadata, - ] - annotations: dict[str, List[AnnotationModel]] = { - self.root_pecha.id: self.root_pecha_annotations, - self.commentary_pecha.id: self.prealigned_commentary_pecha_annotations, - self.prealigned_commentary_translation_pecha.id: self.prealigned_commentary_translation_pecha_annotations, - } - annotation_path = "0DCE/alignment-8B56.json" - assert ( - get_pecha_type(pechas, metadatas, annotations, annotation_path) - == PechaType.prealigned_commentary_translation_pecha - ) diff --git a/tests/test_utils.py b/tests/test_utils.py deleted file mode 100644 index c0e75014..00000000 --- a/tests/test_utils.py +++ /dev/null @@ -1,96 +0,0 @@ -from openpecha.utils import ( - adjust_segment_num_for_chapter, - chunk_strings, - get_chapter_for_segment, - parse_alignment_index, -) - - -def test_parse_root_mapping(): - input = "1" - assert parse_alignment_index(input) == [1] - - input = "1,2,3,4" - assert parse_alignment_index(input) == [1, 2, 3, 4] - - input = "1-4" - assert parse_alignment_index(input) == [1, 2, 3, 4] - - input = "1-4,5-8" - assert parse_alignment_index(input) == [1, 2, 3, 4, 5, 6, 7, 8] - - -def test_chunk_strings(): - # Less than chunk_size - strings = ["1", "2", "3", "4", "5"] - chunk_size = 10 - expected = [["1", "2", "3", "4", "5"]] - assert chunk_strings(strings, chunk_size) == expected - - # More than chunk_size - strings = ["1", "2", "3", "4", "5", "6", "7", "8", "9", "10"] - chunk_size = 3 - expected = [["1", "2", "3"], ["4", "5", "6"], ["7", "8", "9"], ["10"]] - assert chunk_strings(strings, chunk_size) == expected - - # Equal to chunk_size - strings = ["1", "2", "3", "4", "5"] - chunk_size = 5 - expected = [["1", "2", "3", "4", "5"]] - assert chunk_strings(strings, chunk_size) == expected - - # Empty list - strings = [] - chunk_size = 5 - expected = [] - assert chunk_strings(strings, chunk_size) == expected - - # Evenly divisible - strings = ["1", "2", "3", "4", "5", "6", "7", "8", "9", "10"] - chunk_size = 2 - expected = [["1", "2"], ["3", "4"], ["5", "6"], ["7", "8"], ["9", "10"]] - assert chunk_strings(strings, chunk_size) == expected - - -def test_get_chapter_num_from_segment_num(): - segment_num = 1 - no_of_chapter_segment = 100 - assert get_chapter_for_segment(segment_num, no_of_chapter_segment) == 1 - - segment_num = 100 - no_of_chapter_segment = 100 - assert get_chapter_for_segment(segment_num, no_of_chapter_segment) == 1 - - segment_num = 101 - no_of_chapter_segment = 100 - assert get_chapter_for_segment(segment_num, no_of_chapter_segment) == 2 - - segment_num = 200 - no_of_chapter_segment = 100 - assert get_chapter_for_segment(segment_num, no_of_chapter_segment) == 2 - - segment_num = 893 - no_of_chapter_segment = 100 - assert get_chapter_for_segment(segment_num, no_of_chapter_segment) == 9 - - -def test_process_segment_num_for_chapter(): - segment_num = 1 - no_of_chapter_segment = 100 - assert adjust_segment_num_for_chapter(segment_num, no_of_chapter_segment) == 1 - - segment_num = 100 - no_of_chapter_segment = 100 - assert adjust_segment_num_for_chapter(segment_num, no_of_chapter_segment) == 100 - - segment_num = 101 - no_of_chapter_segment = 100 - assert adjust_segment_num_for_chapter(segment_num, no_of_chapter_segment) == 1 - - segment_num = 200 - no_of_chapter_segment = 100 - assert adjust_segment_num_for_chapter(segment_num, no_of_chapter_segment) == 100 - - segment_num = 893 - no_of_chapter_segment = 100 - assert adjust_segment_num_for_chapter(segment_num, no_of_chapter_segment) == 93