From 7b065029cbd40bb57ec390ce275c7f26f600e0cc Mon Sep 17 00:00:00 2001
From: Graham Hukill
Date: Fri, 30 Jan 2026 11:37:56 -0500
Subject: [PATCH] Add --skip-embeddings flag to reindex-source

Why these changes are being introduced:
There might be times we want to re-index a source to Opensearch
without indexing embeddings as a secondary pass.

How this addresses that need:
Add --skip-embeddings flag to reindex-source CLI command.

Side effects of this change:
* None

Relevant ticket(s):
* None
---
 tests/test_cli.py | 39 +++++++++++++++++++++++++++++++++++++++
 tim/cli.py        | 44 ++++++++++++++++++++++++++++----------------
 2 files changed, 67 insertions(+), 16 deletions(-)

diff --git a/tests/test_cli.py b/tests/test_cli.py
index 9dc6ebb..25054d3 100644
--- a/tests/test_cli.py
+++ b/tests/test_cli.py
@@ -415,3 +415,42 @@ def test_reindex_source_success(
         "Reindex source complete: "
         f'{{"index": {json.dumps(mock_bulk_index())}' in caplog.text
     )
+
+
+@patch("tim.opensearch.create_index")
+@patch("tim.opensearch.promote_index")
+@patch("tim.opensearch.get_index_aliases")
+@patch("tim.opensearch.bulk_update")
+@patch("tim.opensearch.bulk_index")
+def test_reindex_source_skip_embeddings(
+    mock_bulk_index,
+    mock_bulk_update,
+    mock_get_index_aliases,
+    mock_promote_index,
+    mock_create_index,
+    caplog,
+    monkeypatch,
+    runner,
+):
+    monkeypatch.delenv("TIMDEX_OPENSEARCH_ENDPOINT", raising=False)
+    mock_get_index_aliases.return_value = ["alma", "all-current", "timdex"]
+    mock_bulk_index.return_value = {
+        "created": 1000,
+        "updated": 0,
+        "errors": 0,
+        "total": 1000,
+    }
+
+    result = runner.invoke(
+        main,
+        [
+            "reindex-source",
+            "--source",
+            "alma",
+            "--skip-embeddings",
+            "tests/fixtures/dataset",
+        ],
+    )
+    assert result.exit_code == EXIT_CODES["success"]
+    assert "Skipping embeddings update." in caplog.text
+    mock_bulk_update.assert_not_called()
diff --git a/tim/cli.py b/tim/cli.py
index 093a94d..4d6d9ff 100644
--- a/tim/cli.py
+++ b/tim/cli.py
@@ -430,6 +430,12 @@ def bulk_update_embeddings(
     help="Alias to promote the index to in addition to the primary alias. May "
     "be repeated to promote the index to multiple aliases at once.",
 )
+@click.option(
+    "--skip-embeddings",
+    is_flag=True,
+    default=False,
+    help="Skip the secondary update of documents with embeddings.",
+)
 @click.argument(
     "dataset_path",
     type=click.Path(),
@@ -441,6 +447,7 @@ def reindex_source(
     ctx: click.Context,
     source: str,
     alias: tuple[str],
+    skip_embeddings: bool,  # noqa: FBT001
     dataset_path: str,
 ) -> None:
     """Perform a full refresh for a source in Opensearch for all current records.
@@ -488,23 +495,28 @@ def reindex_source(
         logger.error(f"Bulk indexing failed: {exception}")  # noqa: TRY400
 
     # bulk index embeddings
-    logger.info("Reindexing embeddings.")
     update_results = {"updated": 0, "errors": 0, "total": 0}
-    embeddings = td.embeddings.read_dicts_iter(
-        table="current_embeddings",
-        columns=[
-            "timdex_record_id",
-            "embedding_strategy",
-            "embedding_object",
-        ],
-        source=source,
-        action="index",
-    )
-    embeddings_to_index = helpers.format_embeddings(embeddings)
-    try:
-        update_results.update(tim_os.bulk_update(client, index, embeddings_to_index))
-    except BulkOperationError as exception:
-        logger.error(f"Bulk update with embeddings failed: {exception}")  # noqa: TRY400
+    if skip_embeddings:
+        logger.info("Skipping embeddings update.")
+    else:
+        logger.info("Reindexing embeddings.")
+        embeddings = td.embeddings.read_dicts_iter(
+            table="current_embeddings",
+            columns=[
+                "timdex_record_id",
+                "embedding_strategy",
+                "embedding_object",
+            ],
+            source=source,
+            action="index",
+        )
+        embeddings_to_index = helpers.format_embeddings(embeddings)
+        try:
+            update_results.update(tim_os.bulk_update(client, index, embeddings_to_index))
+        except BulkOperationError as exception:
+            logger.error(  # noqa: TRY400
+                f"Bulk update with embeddings failed: {exception}"
+            )
 
     summary_results = {"index": index_results, "update": update_results}
     logger.info(f"Reindex source complete: {json.dumps(summary_results)}")