From b320e4bef04e025f21d39ba838093359c31924c7 Mon Sep 17 00:00:00 2001 From: Maciej Rek Date: Fri, 6 Mar 2026 10:41:21 +0100 Subject: [PATCH 1/4] [SNOW-3203938] Fix ai_parse_document_basic test --- tests/integ/test_dataframe_ai.py | 48 ++++++++++++++++++++++++++++---- 1 file changed, 43 insertions(+), 5 deletions(-) diff --git a/tests/integ/test_dataframe_ai.py b/tests/integ/test_dataframe_ai.py index b205cd93c0..84d07bdc9e 100644 --- a/tests/integ/test_dataframe_ai.py +++ b/tests/integ/test_dataframe_ai.py @@ -1070,8 +1070,49 @@ def test_dataframe_ai_transcribe_default_output_column(session, resources_path): assert all("start" in s and "end" in s for s in data["segments"]) -def test_dataframe_ai_parse_document_basic(session, resources_path): - """Test DataFrame.ai.parse_document OCR on a PDF document.""" +def test_dataframe_ai_parse_document_basic_legacy(session, resources_path): + """Test DataFrame.ai.parse_document OCR on a PDF document. + This is a test that covers legacy behaviour (pre error handling changes).""" + session.sql( + "ALTER SESSION SET AI_SQL_ERROR_HANDLING_USE_FAIL_ON_ERROR = FALSE" + ).collect() + try: + stage_name = Utils.random_stage_name() + _ = session.sql( + f"CREATE OR REPLACE TEMP STAGE {stage_name} ENCRYPTION = (TYPE = 'SNOWFLAKE_SSE')" + ).collect() + file_local = TestFiles(resources_path).test_doc_pdf + _ = session.file.put(file_local, f"@{stage_name}", auto_compress=False) + + df = session.create_dataframe( + [[f"@{stage_name}/doc.pdf"]], schema=["file_path"] + ) + + result_df = df.ai.parse_document( + input_column=to_file(col("file_path")), + output_column="parsed", + mode="OCR", + ) + + assert result_df.columns == ["FILE_PATH", "PARSED"] + + results = result_df.collect() + data = json.loads(results[0]["PARSED"]) if results[0]["PARSED"] else {} + assert isinstance(data, dict) + assert "content" in data and isinstance(data["content"], str) + assert isinstance(data.get("metadata", {}), dict) + if "metadata" in data and "pageCount" in 
data["metadata"]: + assert data["metadata"].get("pageCount", 0) >= 3 + finally: + session.sql( + "ALTER SESSION SET AI_SQL_ERROR_HANDLING_USE_FAIL_ON_ERROR = TRUE" + ).collect() + + +def test_dataframe_ai_parse_document_basic_new_eh(session, resources_path): + """Test DataFrame.ai.parse_document OCR on a PDF document. + This is a test that covers legacy behaviour (post error handling changes, metadata is absent).""" + stage_name = Utils.random_stage_name() _ = session.sql( f"CREATE OR REPLACE TEMP STAGE {stage_name} ENCRYPTION = (TYPE = 'SNOWFLAKE_SSE')" @@ -1093,9 +1134,6 @@ def test_dataframe_ai_parse_document_basic(session, resources_path): data = json.loads(results[0]["PARSED"]) if results[0]["PARSED"] else {} assert isinstance(data, dict) assert "content" in data and isinstance(data["content"], str) - assert isinstance(data.get("metadata", {}), dict) - if "metadata" in data and "pageCount" in data["metadata"]: - assert data["metadata"].get("pageCount", 0) >= 3 def test_dataframe_ai_parse_document_default_output_column(session, resources_path): From 128cda43e87d6eba35cf04e409f0d0ba147ff07f Mon Sep 17 00:00:00 2001 From: Maciej Rek Date: Fri, 6 Mar 2026 10:49:57 +0100 Subject: [PATCH 2/4] Add Changelog entry --- CHANGELOG.md | 1 + 1 file changed, 1 insertion(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index 9fa0d9d747..df855daf68 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -8,6 +8,7 @@ - Fixed a bug in `Session.client_telemetry` that trace does not have snowflake style trace id. - Fixed a bug in `ai_complete` where `model_parameters` and `response_format` values containing single quotes would generate malformed SQL. 
+- Fixed a ai_parse_document test that failed with new error handling ## 1.47.0 (2026-03-05) From 5fe58535a67740d47202f712e260cfb3e7dde34a Mon Sep 17 00:00:00 2001 From: Maciej Rek Date: Fri, 6 Mar 2026 10:57:32 +0100 Subject: [PATCH 3/4] Adjust comments according to graphite-app suggestions --- tests/integ/test_dataframe_ai.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/integ/test_dataframe_ai.py b/tests/integ/test_dataframe_ai.py index 84d07bdc9e..d225d90cf2 100644 --- a/tests/integ/test_dataframe_ai.py +++ b/tests/integ/test_dataframe_ai.py @@ -1072,7 +1072,7 @@ def test_dataframe_ai_transcribe_default_output_column(session, resources_path): def test_dataframe_ai_parse_document_basic_legacy(session, resources_path): """Test DataFrame.ai.parse_document OCR on a PDF document. - This is a test that covers legacy behaviour (pre error handling changes).""" + This test covers legacy behavior (new error handling disabled for the session, metadata is present in response).""" session.sql( "ALTER SESSION SET AI_SQL_ERROR_HANDLING_USE_FAIL_ON_ERROR = FALSE" ).collect() @@ -1111,7 +1111,7 @@ def test_dataframe_ai_parse_document_basic_legacy(session, resources_path): def test_dataframe_ai_parse_document_basic_new_eh(session, resources_path): """Test DataFrame.ai.parse_document OCR on a PDF document. 
- This is a test that covers legacy behaviour (post error handling changes, metadata is absent).""" + This test covers new behavior (post error handling changes, metadata is absent in response).""" stage_name = Utils.random_stage_name() _ = session.sql( From 6c6df01280f1a36556b4742f9d39b377e34d9cd0 Mon Sep 17 00:00:00 2001 From: Maciej Rek Date: Mon, 9 Mar 2026 09:19:17 +0100 Subject: [PATCH 4/4] Update CHANGELOG.md Co-authored-by: Jonathan Shi <149419494+sfc-gh-joshi@users.noreply.github.com> --- CHANGELOG.md | 1 - 1 file changed, 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index df855daf68..9fa0d9d747 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -8,7 +8,6 @@ - Fixed a bug in `Session.client_telemetry` that trace does not have snowflake style trace id. - Fixed a bug in `ai_complete` where `model_parameters` and `response_format` values containing single quotes would generate malformed SQL. -- Fixed a ai_parse_document test that failed with new error handling ## 1.47.0 (2026-03-05)