diff --git a/machine/corpora/paratext_project_text_updater_base.py b/machine/corpora/paratext_project_text_updater_base.py
index 0e7bfdf..0a80c40 100644
--- a/machine/corpora/paratext_project_text_updater_base.py
+++ b/machine/corpora/paratext_project_text_updater_base.py
@@ -29,6 +29,7 @@ def __init__(
     def update_usfm(
         self,
         book_id: str,
+        chapters: Optional[Sequence[int]] = None,
         rows: Optional[Sequence[UpdateUsfmRow]] = None,
         full_name: Optional[str] = None,
         text_behavior: UpdateUsfmTextBehavior = UpdateUsfmTextBehavior.PREFER_EXISTING,
@@ -61,7 +62,7 @@ def update_usfm(
         )
         try:
             parse_usfm(usfm, handler, self._settings.stylesheet, self._settings.versification)
-            return handler.get_usfm(self._settings.stylesheet)
+            return handler.get_usfm(self._settings.stylesheet, chapters)
         except Exception as e:
             error_message = (
                 f"An error occurred while parsing the usfm for '{book_id}'"
diff --git a/machine/corpora/update_usfm_parser_handler.py b/machine/corpora/update_usfm_parser_handler.py
index 9d95850..4c187ac 100644
--- a/machine/corpora/update_usfm_parser_handler.py
+++ b/machine/corpora/update_usfm_parser_handler.py
@@ -334,27 +334,43 @@ def _end_embed_text(self, state: UsfmParserState, scripture_ref: ScriptureRef) -
         if embed_outside_of_block:
             self._end_update_block(state, [scripture_ref])
 
-    def get_usfm(self, stylesheet: Union[str, UsfmStylesheet] = "usfm.sty") -> str:
+    def get_usfm(
+        self, stylesheet: Union[str, UsfmStylesheet] = "usfm.sty", chapters: Optional[Sequence[int]] = None
+    ) -> str:
         if isinstance(stylesheet, str):
             stylesheet = UsfmStylesheet(stylesheet)
         tokenizer = UsfmTokenizer(stylesheet)
         tokens = list(self._tokens)
+        if chapters is not None:
+            tokens = self._get_incremental_draft_tokens(tokens, chapters)
         if len(self._remarks) > 0:
             remark_tokens: List[UsfmToken] = []
             for remark in self._remarks:
                 remark_tokens.append(UsfmToken(UsfmTokenType.PARAGRAPH, "rem"))
                 remark_tokens.append(UsfmToken(UsfmTokenType.TEXT, text=remark))
             if len(tokens) > 0:
-                index = 0
-                markers_to_skip = {"id", "ide", "rem"}
-                while tokens[index].marker in markers_to_skip:
-                    index += 1
-                    if len(tokens) > index and tokens[index].type == UsfmTokenType.TEXT:
-                        index += 1
-                for remark_token in reversed(remark_tokens):
-                    tokens.insert(index, remark_token)
+                for index, token in enumerate(tokens):
+                    if token.type == UsfmTokenType.CHAPTER:
+                        tokens[index + 1 : index + 1] = remark_tokens
         return tokenizer.detokenize(tokens)
 
+    def _get_incremental_draft_tokens(self, tokens: List[UsfmToken], chapters: Sequence[int]) -> List[UsfmToken]:
+        incremental_draft_tokens: List[UsfmToken] = []
+        in_chapter: bool = False
+        for index, token in enumerate(tokens):
+            if index == 0 and token.marker == "id":
+                incremental_draft_tokens.append(token)
+                continue
+            elif token.type == UsfmTokenType.CHAPTER:
+                if token.data and int(token.data) in chapters:
+                    in_chapter = True
+                    incremental_draft_tokens.append(token)
+                else:
+                    in_chapter = False
+            elif in_chapter:
+                incremental_draft_tokens.append(token)
+        return incremental_draft_tokens
+
     def _advance_rows(self, seg_scr_refs: Sequence[ScriptureRef]) -> Tuple[List[str], Optional[dict[str, object]]]:
         row_texts: List[str] = []
         row_metadata = None