From e4fb9fafccb72f8d3e88c1896478a19c6878a720 Mon Sep 17 00:00:00 2001 From: jpcompartir Date: Thu, 4 Dec 2025 13:06:05 +0000 Subject: [PATCH 01/12] add version release to _pkgdown.yml update README to show how to tidy oai_complete build readme and move vignettes/ to articles/ for the .html --- README.Rmd | 11 ++++++++--- README.md | 11 ++++++++--- _pkgdown.yml | 2 ++ 3 files changed, 18 insertions(+), 6 deletions(-) diff --git a/README.Rmd b/README.Rmd index fe61b4a..b9e0a36 100644 --- a/README.Rmd +++ b/README.Rmd @@ -207,17 +207,22 @@ oai_complete_df( Complete a Data Frame of texts with schema: ```{r, eval = FALSE} -oai_complete_df( +df_output_w_schema <- oai_complete_df( df = review_data, text_var = review_text, id_var = review_id, system_prompt = "Classify the following review:", schema = sentiment_schema, key_name = "OPENAI_API_KEY", - output_dir = "completions_output", + output_dir = NULL, + # output_dir = "completions_output", chunk_size = 1000, concurrent_requests = 5 ) + +df_output_w_schema |> + dplyr::mutate(content = purrr::map(content, safely_from_json)) |> + tidyr::unnest_wider(content) ``` # Working with Output Files @@ -267,7 +272,7 @@ metadata$endpoint_url **Note:** Add output directories to `.gitignore` to avoid committing API responses and metadata. 
-Read the [LLM Providers Vignette](articles/llm_providers.html), and the [Structured Outputs Vignette](vignettes/structured_outputs_json_schema.html) for more information on common workflows with the OpenAI Chat Completions API [^1] +Read the [LLM Providers Vignette](articles/llm_providers.html), and the [Structured Outputs Vignette](articles/structured_outputs_json_schema.html) for more information on common workflows with the OpenAI Chat Completions API [^1] [^1]: Content pending implementation for Anthroic Messages API, Gemini API, and OpenAI Responses API diff --git a/README.md b/README.md index 0f9e141..7c38e8d 100644 --- a/README.md +++ b/README.md @@ -208,17 +208,22 @@ oai_complete_df( Complete a Data Frame of texts with schema: ``` r -oai_complete_df( +df_output_w_schema <- oai_complete_df( df = review_data, text_var = review_text, id_var = review_id, system_prompt = "Classify the following review:", schema = sentiment_schema, key_name = "OPENAI_API_KEY", - output_dir = "completions_output", + output_dir = NULL, + # output_dir = "completions_output", chunk_size = 1000, concurrent_requests = 5 ) + +df_output_w_schema |> + dplyr::mutate(content = purrr::map(content, safely_from_json)) |> + tidyr::unnest_wider(content) ``` # Working with Output Files @@ -275,7 +280,7 @@ responses and metadata. 
Read the [LLM Providers Vignette](articles/llm_providers.html), and the [Structured Outputs -Vignette](vignettes/structured_outputs_json_schema.html) for more +Vignette](articles/structured_outputs_json_schema.html) for more information on common workflows with the OpenAI Chat Completions API [^1] diff --git a/_pkgdown.yml b/_pkgdown.yml index 75b859a..98d756e 100644 --- a/_pkgdown.yml +++ b/_pkgdown.yml @@ -166,6 +166,8 @@ development: news: releases: + - text: "Version 0.2" + href: news/index.html#endpointr-02 - text: "Version 0.1.2" href: news/index.html#endpointr-012 - text: "Version 0.1.1" From 222a2ab1de2abe1a4e31695efa188f9abcb85a7f Mon Sep 17 00:00:00 2001 From: jpcompartir Date: Thu, 4 Dec 2025 15:04:49 +0000 Subject: [PATCH 02/12] add pre-PR checklist for maintainers (helpful if ever onboard anyone). give integrations an 01 to make it easier to remember incl. tip to give an out to prevent hallucinations in structured outputs vignette --- CONTRIBUTORS.md | 17 +++++++++++++++++ .../{integrations.qmd => 01_integrations.qmd} | 0 todos.qmd | 8 ++++---- vignettes/structured_outputs_json_schema.Rmd | 2 +- 4 files changed, 22 insertions(+), 5 deletions(-) rename dev_docs/{integrations.qmd => 01_integrations.qmd} (100%) diff --git a/CONTRIBUTORS.md b/CONTRIBUTORS.md index 628a962..bb06106 100644 --- a/CONTRIBUTORS.md +++ b/CONTRIBUTORS.md @@ -1,5 +1,22 @@ # EndpointR Development Guide +## Pre-PR Checklist + +- [ ] export necessary funcs and artefacts +- [ ] document with examples/full roxygen2 skeleton +- [ ] check DESCRIPTION for version bump or dependencies update +- [ ] update news.md +- [ ] update todos.md +- [ ] update \_pkgdown.yml (function reference, news etc.) +- [ ] run `spelling::spell_check_package()` +- [ ] `devtools::document()` +- [ ] run `testthat::test()` +- [ ] run `devtools::check()` +- [ ] run `pkgdown::build_site()` +- [ ] inspect site incl. 
vignettes +- [ ] PR +- [ ] Check CI/CD + ## Commands - Run all tests: `testthat::test()` diff --git a/dev_docs/integrations.qmd b/dev_docs/01_integrations.qmd similarity index 100% rename from dev_docs/integrations.qmd rename to dev_docs/01_integrations.qmd diff --git a/todos.qmd b/todos.qmd index fe7293d..65db2ae 100644 --- a/todos.qmd +++ b/todos.qmd @@ -7,14 +7,14 @@ - [ ] Support for Anthropic API - [ ] Batches - [ ] Messages (Completions) - - [ ] Structured Outputs + - [x] Structured Outputs - [ ] Support for Gemini API - [ ] Embeddings - [ ] Completions - [ ] Structured Outputs -- [ ] LLM Providers Vignette Updated -- [ ] Structured Outputs Vignette Updated -- [ ] Better error propagation throughout package (refactor, large) +- [x] LLM Providers Vignette Updated +- [x] Structured Outputs Vignette Updated +- [x] Better error propagation throughout package (refactor, large) Error reporting is somewhat annoying by default with httr2::req_perform() if we don't: diff --git a/vignettes/structured_outputs_json_schema.Rmd b/vignettes/structured_outputs_json_schema.Rmd index 3947b4e..6e8a54a 100644 --- a/vignettes/structured_outputs_json_schema.Rmd +++ b/vignettes/structured_outputs_json_schema.Rmd @@ -358,7 +358,7 @@ Schema design principles: - Use descriptive field names and descriptions - Set appropriate constraints (min/max values, required fields) -- Prefer enums over free text for categories +- Prefer enums over free text for categories but give the model outs, or it will hallucinate e.g. 
an 'other' option for when the document doesn't have what you're looking for - Nest objects logically for complex data - Validate some mock responses in advance From 6780ab5ba3d30d5ea9af98959404587b810826a7 Mon Sep 17 00:00:00 2001 From: jpcompartir Date: Thu, 4 Dec 2025 15:27:28 +0000 Subject: [PATCH 03/12] add ant_complete_text and .extract_ant_message_content start writing tests --- R/anthropic_messages.R | 97 ++++++++++++++++++++++++ tests/testthat/test-anthropic_messages.R | 8 ++ 2 files changed, 105 insertions(+) diff --git a/R/anthropic_messages.R b/R/anthropic_messages.R index e828609..239e9a1 100644 --- a/R/anthropic_messages.R +++ b/R/anthropic_messages.R @@ -156,6 +156,103 @@ ant_build_messages_request <- function( return(request) } +ant_complete_text <- function(text, + model = .ANT_DEFAULT_MODEL, + system_prompt = NULL, + schema = NULL, + temperature = 0, + max_tokens = 500L, + key_name = "ANTHROPIC_API_KEY", + endpoint_url = .ANT_MESSAGES_ENDPOINT, + max_retries = 5L, + tiemout = 30L, + tidy = TRUE) { + + # surface errors quickly here, before building any request + if (!rlang::is_scalar_character(text)) { + cli::cli_abort( + "{.arg text} must be a single string (a character vector of length 1)." + ) + } + + if (nchar(text) == 0) { + cli::cli_abort( + "{.arg text} must not be an empty string." 
+ ) + } + + + req <- ant_build_messages_request( + input = text, + model = model, + schema = schema, + temperature = temperature, + max_tokens = max_tokens, + endpoint_url = endpoint_url, + max_retries = max_retries, + timeout = 30L + ) + + tryCatch({ + response <- httr2::req_perform(req) + }, error = function(e) { + cli::cli_abort(c( + "Failed to generate completion", + "i" = "Text: {cli::cli_vec(text, list('vec-trunc' = 50, 'vec-sep' = ''))}", + "x" = "Error: {conditionMessage(e)}" + )) + }) + + if (httr2::resp_status(response) != 200) { + error_msg <- .extract_api_error(response) + cli::cli_abort(c( + "API request failed", + "x" = error_msg + )) + } + + content <- .extract_ant_message_content(response) + + if (!is.null(schema) && tidy && !is.na(content)) { + content <- tryCatch({ + parsed <- jsonlite::fromJSON(content, simplifyVector = FALSE) + if (!is.null(schema)) { + parsed <- validate_response(schema, content) + } + parsed + }, error = function(e) { + cli::cli_warn(c( + "Failed to parse structured output", + "i" = "Returning raw response", + "x" = conditionMessage(e) + )) + content + }) + } + + return(content) +} + + +#' Extract text content from Anthropic Messages API response +#' @keywords internal +.extract_ant_message_content <- function(resp) { + body <- httr2::resp_body_json(resp) + + # find the first text block + content_blocks <- body$content + + if (length(content_blocks) == 0) { + return(NA_character_) + } + + for (block in content_blocks) { + if (block$type == "text") { + return(block$text) + } + } + return(NA_character_) +} #' Convert json_schema S7 object to Anthropic output_format structure diff --git a/tests/testthat/test-anthropic_messages.R b/tests/testthat/test-anthropic_messages.R index bc7fddc..d3a78dd 100644 --- a/tests/testthat/test-anthropic_messages.R +++ b/tests/testthat/test-anthropic_messages.R @@ -75,6 +75,14 @@ test_that("ant_build_messages_request accepts endpointr_id and adds to headers", }) +test_that("ant_complete_text 
validates its inputs", { + expect_error(ant_complete_text(c(""), + "must not be an empty")) + + expect_error(ant_complete_text(c("hello", "bonjour"), + "must be a single string")) + +}) From 9e69516b66c2c3cf1068218953c0e41569fba07b Mon Sep 17 00:00:00 2001 From: jpcompartir Date: Thu, 4 Dec 2025 15:44:44 +0000 Subject: [PATCH 04/12] add ant_complete_text docs add some tests for .extract_ant_message_content --- R/anthropic_messages.R | 48 +++++++++++++++++++++++- tests/testthat/test-anthropic_messages.R | 33 +++++++++++++++- 2 files changed, 78 insertions(+), 3 deletions(-) diff --git a/R/anthropic_messages.R b/R/anthropic_messages.R index 239e9a1..c9443b4 100644 --- a/R/anthropic_messages.R +++ b/R/anthropic_messages.R @@ -36,7 +36,6 @@ #' @param endpoint_url Anthropic API endpoint URL #' @param timeout Request timeout in seconds #' @param max_retries Maximum number of retry attempts for failed requests - #' #' @return An httr2 request object #' @export @@ -156,6 +155,52 @@ ant_build_messages_request <- function( return(request) } +# ant_complete_text docs ---- +#' Generate a completion for a single text using Anthropic's Messages API +#' +#' @description +#' High-level function to generate a completion for a single text string. +#' Handles request creation, execution, and response processing with +#' optional structured output support. 
+#' +#' @param text Character string to send to the model +#' @param model Anthropic model to use (default: "claude-sonnet-4-5-20250929") +#' @param system_prompt Optional system prompt +#' @param schema Optional JSON schema for structured output +#' @param temperature Sampling temperature (0-1) +#' @param max_tokens Maximum tokens in response +#' @param key_name Environment variable name for API key +#' @param endpoint_url Anthropic API endpoint URL +#' @param max_retries Maximum retry attempts +#' @param timeout Request timeout in seconds +#' @param tidy Whether to parse structured output (default: TRUE) +#' +#' @return Character string with the model's response, or parsed JSON if schema provided +#' @export +#' +#' @examples +#' \dontrun{ +#' # simple completion +#' response <- ant_complete_text( +#' text = "Explain quantum computing in simple terms", +#' max_tokens = 500 +#' ) +#' +#' # with structured output +#' sentiment_schema <- create_json_schema( +#' name = "sentiment", +#' schema = schema_object( +#' sentiment = schema_enum(c("positive", "negative", "neutral")), +#' confidence = schema_number(minimum = 0, maximum = 1), +#' required = c("sentiment", "confidence") +#' ) +#' ) +#' result <- ant_complete_text( +#' text = "I love this product!", +#' schema = sentiment_schema +#' ) +#' } +# ant_complete_text docs ---- ant_complete_text <- function(text, model = .ANT_DEFAULT_MODEL, system_prompt = NULL, @@ -181,7 +226,6 @@ ant_complete_text <- function(text, ) } - req <- ant_build_messages_request( input = text, model = model, diff --git a/tests/testthat/test-anthropic_messages.R b/tests/testthat/test-anthropic_messages.R index d3a78dd..cc88fcd 100644 --- a/tests/testthat/test-anthropic_messages.R +++ b/tests/testthat/test-anthropic_messages.R @@ -65,6 +65,38 @@ test_that("ant_build_messages_request accepts schemas and formats properly with }) +test_that(".extract_ant_message_content covers basic edgecases and actually gets the text", { + mock_body <- list( + 
content = list( + list(type = "text", text = "Hello, world!") + ), + stop_reason = "end_turn" + ) + + mock_response <- httr2::response_json( + status_code = 200L, + body = mock_body + ) + + content <- .extract_ant_message_content(mock_response) + expect_equal(content, "Hello, world!") + + mock_empty_body <- list( + content = list(), + stop_reason = "end_turn" + ) + + mock_empty_response <- httr2::response_json( + status_code = 200L, + body = mock_empty_body + ) + + empty_content <- .extract_ant_message_content(mock_empty_response) + expect_true(is.na(empty_content)) + +}) + + test_that("ant_build_messages_request accepts endpointr_id and adds to headers", { req <- ant_build_messages_request( "Hello this a test", @@ -82,7 +114,6 @@ test_that("ant_complete_text validates its inputs", { expect_error(ant_complete_text(c("hello", "bonjour"), "must be a single string")) - }) From 413964540e6ba308e5af1bd95a8051edccfcc908 Mon Sep 17 00:00:00 2001 From: jpcompartir Date: Fri, 5 Dec 2025 10:27:58 +0000 Subject: [PATCH 05/12] add some sections to ant_messages.R as it becomes larger commit ant_complete_chunks with changes to response objects/successes/failures, parquet writing add schema to metadata tracking --- R/anthropic_messages.R | 220 ++++++++++++++++++++++++++++++++++++++++- 1 file changed, 215 insertions(+), 5 deletions(-) diff --git a/R/anthropic_messages.R b/R/anthropic_messages.R index c9443b4..7a7a9cd 100644 --- a/R/anthropic_messages.R +++ b/R/anthropic_messages.R @@ -4,12 +4,13 @@ .ANT_MESSAGES_ENDPOINT <- "https://api.anthropic.com/v1/messages" .ANT_DEFAULT_MODEL <- "claude-haiku-4-5" +# ant_build_messages_request ---- #' Build an Anthropic Messages API request #' #' @description #' Constructs an httr2 request object for Anthropic's Messages API. #' Handles message formatting, system prompts, and optional JSON schema -#' for structured outputs. When using strucutred outputs you must select the correct model. +#' for structured outputs. 
When using structured outputs you must select the correct model. #' #' #' @details @@ -154,8 +155,10 @@ ant_build_messages_request <- function( return(request) } +# ant_build_messages_request ---- -# ant_complete_text docs ---- + +# ant_complete_text ---- #' Generate a completion for a single text using Anthropic's Messages API #' #' @description @@ -200,7 +203,7 @@ ant_build_messages_request <- function( #' schema = sentiment_schema #' ) #' } -# ant_complete_text docs ---- + ant_complete_text <- function(text, model = .ANT_DEFAULT_MODEL, system_prompt = NULL, @@ -210,7 +213,7 @@ ant_complete_text <- function(text, key_name = "ANTHROPIC_API_KEY", endpoint_url = .ANT_MESSAGES_ENDPOINT, max_retries = 5L, - tiemout = 30L, + timeout = 30L, tidy = TRUE) { # surface errors quickly here, before building any request @@ -234,7 +237,7 @@ ant_complete_text <- function(text, max_tokens = max_tokens, endpoint_url = endpoint_url, max_retries = max_retries, - timeout = 30L + timeout = timeout ) tryCatch({ @@ -277,6 +280,213 @@ ant_complete_text <- function(text, return(content) } +# ant_complete_text ---- + +# ant_complete_chunks ---- + +ant_complete_chunks <- function(texts, + ids, + chunk_size = 5000L, + model = "claude-haiku-4-5", + system_prompt = NULL, + output_dir = "auto", + schema = NULL, + concurrent_requests = 5L, + temperature = 0, + max_tokens = 1024L, + max_retries = 5L, + timeout = 30L, + key_name = "ANTHROPIC_API_KEY", + endpoint_url = .ANT_MESSAGES_ENDPOINT, + id_col_name = "id") { + + stopifnot( + "texts must be a vector" = is.vector(texts), + "ids must be a vector" = is.vector(ids), + "texts and ids must be the same length" = length(texts) == length(ids), + "chunk_size must be a positive integer" = is.numeric(chunk_size) && chunk_size > 0 + ) + + output_dir <- .handle_output_directory(output_dir, base_dir_name = "ant_messages_chunks") + + if (!dir.exists(output_dir)) { + dir.create(output_dir, recursive = TRUE) + } + + # build the schema just once (and apply 
later) + if (!is.null(schema) && inherits(schema, "EndpointR::json_schema")) { + formatted_schema <- .ant_format_schema(schema) + } else { + formatted_schema <- schema + } + + chunk_data <- batch_vector(seq_along(texts), chunk_size) + n_chunks <- length(chunk_data$batch_indices) + + metadata <- list( + output_dir = output_dir, + endpoint_url = endpoint_url, + model = model, + schema = NULL, + has_system_prompt = !is.null(system_prompt), + chunk_size = chunk_size, + n_chunks = n_chunks, + n_texts = length(texts), + concurrent_requests = concurrent_requests, + timeout = timeout, + max_retries = max_retries, + max_tokens = max_tokens, + temperature = temperature, + key_name = key_name, + timestamp = Sys.time() + ) + + if (!is.null(formatted_schema)) { + metadata$schema <- jsonlite::toJSON(formatted_schema) |> + jsonlite::prettify() + } + + jsonlite::write_json( + metadata, + file.path(output_dir, "metadata.json"), + auto_unbox = TRUE, + pretty = TRUE + ) + + cli::cli_alert_info("Processing {length(texts)} text{?s} in {n_chunks} chunk{?s} of up to {chunk_size} each") + cli::cli_alert_info("Results will be saved as parquet files in {output_dir}") + + total_successes <- 0 + total_failures <- 0 + + # core processing logic + for (chunk_num in seq_along(chunk_data$batch_indices)) { + chunk_indices <- chunk_data$batch_indices[[chunk_num]] + chunk_texts <- texts[chunk_indices] + chunk_ids <- ids[chunk_indices] + + cli::cli_progress_message("Processing chunk {chunk_num}/{n_chunks} ({length(chunk_indices)} text{?s})") + + # within chunk reqs + requests <- purrr::map2( + .x = chunk_texts, + .y = chunk_ids, + .f = \(x, y) ant_build_messages_request( + input = x, + endpointr_id = y, + model = model, + temperature = temperature, + max_tokens = max_tokens, + schema = formatted_schema, + system_prompt = system_prompt, + key_name = key_name, + endpoint_url = endpoint_url, + max_retries = max_retries, + timeout = timeout + ) + ) + + is_valid_request <- purrr::map_lgl(requests, \(x) 
inherits(x, "httr2_request")) + valid_requests <- requests[is_valid_request] + + if (length(valid_requests) == 0) { + cli::cli_alert_warning("No valid request{?s} in chunk {chunk_num}, skipping") + next + } + + responses <- perform_requests_with_strategy( + valid_requests, + concurrent_requests = concurrent_requests, + progress = TRUE + ) + + is_response <- purrr::map_lgl(responses, inherits, "httr2_response") + response_objects <- responses[is_response] + error_objects <- responses[!is_response] + + is_success <- purrr::map_lgl(response_objects, \(x) httr2::resp_status(x) < 400) + successes <- response_objects[is_success] + http_failures <- response_objects[!is_success] + + failures <- c(http_failures, error_objects) + + n_successes <- length(successes) + n_failures <- length(failures) + total_successes <- total_successes + n_successes + total_failures <- total_failures + n_failures + + + chunk_results <- list() + + if (n_successes > 0) { + successes_ids <- purrr::map(successes, \(x) purrr::pluck(x, "request", "headers", "endpointr_id")) |> unlist() + successes_content <- purrr::map_chr(successes, .extract_ant_message_content) + + chunk_results$successes <- tibble::tibble( + !!id_col_name := successes_ids, + content = successes_content, + .error = FALSE, + .error_msg = NA_character_, + .status = NA_integer_, + .chunk = chunk_num + ) + } + + if (n_failures > 0) { + failures_ids <- purrr::map(failures, \(x) purrr::pluck(x, "request", "headers", "endpointr_id")) |> unlist() + failures_msgs <- purrr::map_chr(failures, \(x){ + if (inherits(x, "httr2_response")) { + .extract_api_error(x) + } else { + # error object - try to get resp from it + resp <- purrr::pluck(x, "resp") + if (!is.null(resp)) .extract_api_error(resp) else .extract_api_error(x, "Unknown error") + } + }) + failures_status <- purrr::map_int(failures, \(x){ + if (inherits(x, "httr2_response")) { + httr2::resp_status(x) + } else { + resp <- purrr::pluck(x, "resp") + if (!is.null(resp)) 
httr2::resp_status(resp) else NA_integer_ + } + }) + + chunk_results$failures <- tibble::tibble( + !!id_col_name := failures_ids, + content = NA_character_, + .error = TRUE, + .error_msg = failures_msgs, + .status = failures_status, + .chunk = chunk_num + ) + } + + chunk_df <- dplyr::bind_rows(chunk_results) + + if (nrow(chunk_df) > 0) { + chunk_file <- glue::glue("{output_dir}/chunk_{stringr::str_pad(chunk_num, 3, pad = '0')}.parquet") + arrow::write_parquet(chunk_df, chunk_file) + } + + cli::cli_alert_success("Chunk {chunk_num}: {n_successes} successful, {n_failures} failed") + + rm(requests, responses, successes, failures, chunk_results, chunk_df) + gc(verbose = FALSE) + } + + cli::cli_alert_info("Processing completed, there were {total_successes} successes\n and {total_failures} failures.") + + parquet_files <- list.files(output_dir, pattern = "\\.parquet$", full.names = TRUE) + final_results <- arrow::open_dataset(parquet_files, format = "parquet") |> + dplyr::collect() + + return(final_results) +} + +# ant_complete_chunks ---- + + #' Extract text content from Anthropic Messages API response #' @keywords internal From 907a5f7cfc4eb41fcd8243cad2cf2152cb924ba6 Mon Sep 17 00:00:00 2001 From: jpcompartir Date: Fri, 5 Dec 2025 10:37:39 +0000 Subject: [PATCH 06/12] add roxygen2 skeleton to ant_complete_chunks --- R/anthropic_messages.R | 39 ++++++++++++++++++++++++++++++++++++++- 1 file changed, 38 insertions(+), 1 deletion(-) diff --git a/R/anthropic_messages.R b/R/anthropic_messages.R index 7a7a9cd..0c47663 100644 --- a/R/anthropic_messages.R +++ b/R/anthropic_messages.R @@ -283,7 +283,44 @@ ant_complete_text <- function(text, # ant_complete_text ---- # ant_complete_chunks ---- - +#' Process text chunks through Anthropic's Messages API with batch file output +#' +#' @description +#' Processes large volumes of text through Anthropic's Messages API in +#' configurable chunks, writing results progressively to parquet files. 
+#' Handles concurrent requests, automatic retries, and structured outputs. +#' +#' @details +#' This function is designed for processing large text datasets. It divides +#' input into chunks, processes each chunk with concurrent API requests, and +#' writes results to disk to minimise memory usage and possibility of data loss. +#' +#' Results are written as parquet files in the specified output directory, +#' along with a metadata.json file containing processing parameters. +#' +#' When using the `output_dir =` argument, be careful that you select +#' a new directory if you do not wish to overwrite existing chunks. +#' If there is already a `chunks_001.parquet` file in the directory, +#' it will be overwritten. +#' +#' @param texts Character vector of texts to process +#' @param ids Vector of unique identifiers (same length as texts) +#' @param chunk_size Number of texts per chunk before writing to disk +#' @param model Anthropic model to use +#' @param system_prompt Optional system prompt (applied to all requests) +#' @param output_dir Directory for parquet chunks ("auto" generates timestamped dir) +#' @param schema Optional JSON schema for structured output +#' @param concurrent_requests Number of concurrent requests +#' @param temperature Sampling temperature +#' @param max_tokens Maximum tokens per response +#' @param max_retries Maximum retry attempts per request +#' @param timeout Request timeout in seconds +#' @param key_name Environment variable name for API key +#' @param endpoint_url Anthropic API endpoint URL +#' @param id_col_name Name for ID column in output +#' +#' @return A tibble with all results +#' @export ant_complete_chunks <- function(texts, ids, chunk_size = 5000L, From fd170b48bb95b07d443b97f69dc00ba3f619bdf8 Mon Sep 17 00:00:00 2001 From: jpcompartir Date: Fri, 5 Dec 2025 11:14:02 +0000 Subject: [PATCH 07/12] remove the cli_alert_warning in ant_build_messages_request as it's unnecessary, may be doing a lot of unnecessary processing by 
checking if it's the right type anyway... as caller funcs could/should do that, maybe. But then there's ugly coupling add ant_complete_df first pass --- NAMESPACE | 2 + R/anthropic_messages.R | 63 ++++++++++++++++++++- man/ant_build_messages_request.Rd | 2 +- man/ant_complete_chunks.Rd | 76 ++++++++++++++++++++++++++ man/ant_complete_text.Rd | 74 +++++++++++++++++++++++++ man/dot-extract_ant_message_content.Rd | 12 ++++ 6 files changed, 225 insertions(+), 4 deletions(-) create mode 100644 man/ant_complete_chunks.Rd create mode 100644 man/ant_complete_text.Rd create mode 100644 man/dot-extract_ant_message_content.Rd diff --git a/NAMESPACE b/NAMESPACE index bdc43fe..be4ddbb 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -1,6 +1,8 @@ # Generated by roxygen2: do not edit by hand export(ant_build_messages_request) +export(ant_complete_chunks) +export(ant_complete_text) export(create_json_schema) export(get_api_key) export(hf_build_request) diff --git a/R/anthropic_messages.R b/R/anthropic_messages.R index 0c47663..c6b475d 100644 --- a/R/anthropic_messages.R +++ b/R/anthropic_messages.R @@ -118,7 +118,6 @@ ant_build_messages_request <- function( if (inherits(schema, "EndpointR::json_schema")) { body$output_format <- .ant_format_schema(schema) } else if (is.list(schema)) { - cli::cli_alert_warning("Your {.arg schema} is a list, not an EndpointR json_schema") body$output_format <- schema } else { cli::cli_abort("{.arg chema} must be an EndpointR json_schema object or a list") @@ -279,7 +278,6 @@ ant_complete_text <- function(text, return(content) } - # ant_complete_text ---- # ant_complete_chunks ---- @@ -520,10 +518,69 @@ ant_complete_chunks <- function(texts, return(final_results) } - # ant_complete_chunks ---- +# ant_complete_df ---- +ant_complete_df <- function(df, + text_var, + id_var, + model = "claude-haiku-4-5", + output_dir = "auto", + system_prompt = NULL, + schema = NULL, + chunk_size = 5000L, + concurrent_requests = 5L, + max_retries = 5L, + timeout = 30, + 
temperature = 0, + max_tokens = 1024L, + key_name = "ANTHROPIC_API_KEY", + endpoint_url = .ANT_MESSAGES_ENDPOINT) { + + text_sym <- rlang::ensym(text_var) + id_sym <- rlang::ensym(id_var) + + stopifnot( + "df must be a data frame" = is.data.frame(df), + "df must not be empty" = nrow(df) > 0, + "text_var must exist in df" = rlang::as_name(text_sym) %in% names(df), + "id_var must exist in df" = rlang::as_name(id_sym) %in% names(df), + "model must be a character vector" = is.character(model), + "`chunk_size` must be a positive integer" = is.numeric(chunk_size) && chunk_size > 0 + ) + + output_dir <- .handle_output_directory(output_dir, base_dir_name = "ant_messages_chunks") + + text_vec <- dplyr::pull(df, !!text_sym) + id_vec <- dplyr::pull(df, !!id_sym) + + + id_col_name <- rlang::as_name(id_sym) # needed for preserving original col names in chunks func (which doesn't receive the id_var, but a vec of ids+texts) + + results <- ant_complete_chunks( + texts = text_vec, + ids = id_vec, + model = model, + system_prompt = system_prompt, + schema = schema, + chunk_size = chunk_size, + concurrent_requests = concurrent_requests, + max_retries = max_retries, + timeout = timeout, + temperature = temperature, + max_tokens = max_tokens, + key_name = key_name, + endpoint_url = endpoint_url, + output_dir = output_dir, + id_col_name = id_col_name + ) + + results <- dplyr::rename(results, !!id_sym := !!rlang::sym(id_col_name)) + + return(results) +} +# ant_complete_df ---- #' Extract text content from Anthropic Messages API response #' @keywords internal diff --git a/man/ant_build_messages_request.Rd b/man/ant_build_messages_request.Rd index c9d1686..a7d1b54 100644 --- a/man/ant_build_messages_request.Rd +++ b/man/ant_build_messages_request.Rd @@ -47,7 +47,7 @@ An httr2 request object \description{ Constructs an httr2 request object for Anthropic's Messages API. Handles message formatting, system prompts, and optional JSON schema -for structured outputs. 
When using strucutred outputs you must select the correct model. +for structured outputs. When using structured outputs you must select the correct model. } \details{ This function creates the HTTP request but does not execute it. For diff --git a/man/ant_complete_chunks.Rd b/man/ant_complete_chunks.Rd new file mode 100644 index 0000000..059de57 --- /dev/null +++ b/man/ant_complete_chunks.Rd @@ -0,0 +1,76 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/anthropic_messages.R +\name{ant_complete_chunks} +\alias{ant_complete_chunks} +\title{Process text chunks through Anthropic's Messages API with batch file output} +\usage{ +ant_complete_chunks( + texts, + ids, + chunk_size = 5000L, + model = "claude-haiku-4-5", + system_prompt = NULL, + output_dir = "auto", + schema = NULL, + concurrent_requests = 5L, + temperature = 0, + max_tokens = 1024L, + max_retries = 5L, + timeout = 30L, + key_name = "ANTHROPIC_API_KEY", + endpoint_url = .ANT_MESSAGES_ENDPOINT, + id_col_name = "id" +) +} +\arguments{ +\item{texts}{Character vector of texts to process} + +\item{ids}{Vector of unique identifiers (same length as texts)} + +\item{chunk_size}{Number of texts per chunk before writing to disk} + +\item{model}{Anthropic model to use} + +\item{system_prompt}{Optional system prompt (applied to all requests)} + +\item{output_dir}{Directory for parquet chunks ("auto" generates timestamped dir)} + +\item{schema}{Optional JSON schema for structured output} + +\item{concurrent_requests}{Number of concurrent requests} + +\item{temperature}{Sampling temperature} + +\item{max_tokens}{Maximum tokens per response} + +\item{max_retries}{Maximum retry attempts per request} + +\item{timeout}{Request timeout in seconds} + +\item{key_name}{Environment variable name for API key} + +\item{endpoint_url}{Anthropic API endpoint URL} + +\item{id_col_name}{Name for ID column in output} +} +\value{ +A tibble with all results +} +\description{ +Processes large volumes of text 
through Anthropic's Messages API in +configurable chunks, writing results progressively to parquet files. +Handles concurrent requests, automatic retries, and structured outputs. +} +\details{ +This function is designed for processing large text datasets. It divides +input into chunks, processes each chunk with concurrent API requests, and +writes results to disk to minimise memory usage and possibility of data loss. + +Results are written as parquet files in the specified output directory, +along with a metadata.json file containing processing parameters. + +When using the \verb{output_dir =} argument, be careful that you select +a new directory if you do not wish to overwrite existing chunks. +If there is already a \code{chunks_001.parquet} file in the directory, +it will be overwritten. +} diff --git a/man/ant_complete_text.Rd b/man/ant_complete_text.Rd new file mode 100644 index 0000000..5d0c6a7 --- /dev/null +++ b/man/ant_complete_text.Rd @@ -0,0 +1,74 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/anthropic_messages.R +\name{ant_complete_text} +\alias{ant_complete_text} +\title{Generate a completion for a single text using Anthropic's Messages API} +\usage{ +ant_complete_text( + text, + model = .ANT_DEFAULT_MODEL, + system_prompt = NULL, + schema = NULL, + temperature = 0, + max_tokens = 500L, + key_name = "ANTHROPIC_API_KEY", + endpoint_url = .ANT_MESSAGES_ENDPOINT, + max_retries = 5L, + timeout = 30L, + tidy = TRUE +) +} +\arguments{ +\item{text}{Character string to send to the model} + +\item{model}{Anthropic model to use (default: "claude-sonnet-4-5-20250929")} + +\item{system_prompt}{Optional system prompt} + +\item{schema}{Optional JSON schema for structured output} + +\item{temperature}{Sampling temperature (0-1)} + +\item{max_tokens}{Maximum tokens in response} + +\item{key_name}{Environment variable name for API key} + +\item{endpoint_url}{Anthropic API endpoint URL} + +\item{max_retries}{Maximum retry attempts} + 
+\item{timeout}{Request timeout in seconds} + +\item{tidy}{Whether to parse structured output (default: TRUE)} +} +\value{ +Character string with the model's response, or parsed JSON if schema provided +} +\description{ +High-level function to generate a completion for a single text string. +Handles request creation, execution, and response processing with +optional structured output support. +} +\examples{ +\dontrun{ + # simple completion + response <- ant_complete_text( + text = "Explain quantum computing in simple terms", + max_tokens = 500 + ) + + # with structured output + sentiment_schema <- create_json_schema( + name = "sentiment", + schema = schema_object( + sentiment = schema_enum(c("positive", "negative", "neutral")), + confidence = schema_number(minimum = 0, maximum = 1), + required = c("sentiment", "confidence") + ) + ) + result <- ant_complete_text( + text = "I love this product!", + schema = sentiment_schema + ) +} +} diff --git a/man/dot-extract_ant_message_content.Rd b/man/dot-extract_ant_message_content.Rd new file mode 100644 index 0000000..b2b4e76 --- /dev/null +++ b/man/dot-extract_ant_message_content.Rd @@ -0,0 +1,12 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/anthropic_messages.R +\name{.extract_ant_message_content} +\alias{.extract_ant_message_content} +\title{Extract text content from Anthropic Messages API response} +\usage{ +.extract_ant_message_content(resp) +} +\description{ +Extract text content from Anthropic Messages API response +} +\keyword{internal} From 2aeb0191acd733662c660e76e70fd54c27518ed6 Mon Sep 17 00:00:00 2001 From: jpcompartir Date: Fri, 5 Dec 2025 11:24:39 +0000 Subject: [PATCH 08/12] pass the schema itself to metadata then writing to JSON takes care of formatting (for prettier printing inside metadata.json), ugly newlines etc. 
everywhere otherwise write the system prompt to metadata as well, will be helpful for debugging --- R/anthropic_messages.R | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/R/anthropic_messages.R b/R/anthropic_messages.R index c6b475d..1621ba7 100644 --- a/R/anthropic_messages.R +++ b/R/anthropic_messages.R @@ -377,8 +377,7 @@ ant_complete_chunks <- function(texts, ) if (!is.null(formatted_schema)) { - metadata$schema <- jsonlite::toJSON(formatted_schema) |> - jsonlite::prettify() + metadata$schema <- formatted_schema } jsonlite::write_json( From 186355e068a220e3ae5376d7a8ac8566ecdbf5eb Mon Sep 17 00:00:00 2001 From: jpcompartir Date: Fri, 5 Dec 2025 11:33:21 +0000 Subject: [PATCH 09/12] add schema and system prompt to oai_complete_chunks metadata as well as system prompt to ant_complete_chunks --- R/anthropic_messages.R | 2 +- R/openai_completions.R | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/R/anthropic_messages.R b/R/anthropic_messages.R index 1621ba7..e4caea2 100644 --- a/R/anthropic_messages.R +++ b/R/anthropic_messages.R @@ -363,7 +363,7 @@ ant_complete_chunks <- function(texts, endpoint_url = endpoint_url, model = model, schema = NULL, - has_system_prompt = !is.null(system_prompt), + system_prompt = system_prompt, chunk_size = chunk_size, n_chunks = n_chunks, n_texts = length(texts), diff --git a/R/openai_completions.R b/R/openai_completions.R index 34da373..7ebc281 100644 --- a/R/openai_completions.R +++ b/R/openai_completions.R @@ -432,6 +432,8 @@ oai_complete_chunks <- function(texts, metadata <- list( model = model, endpoint_url = endpoint_url, + schema = schema, + system_prompt = system_prompt, chunk_size = chunk_size, n_texts = length(texts), concurrent_requests = concurrent_requests, @@ -442,8 +444,6 @@ oai_complete_chunks <- function(texts, output_dir = output_dir, key_name = key_name, n_chunks = n_chunks, - has_schema = !is.null(schema), - has_system_prompt = !is.null(system_prompt), timestamp = 
Sys.time() ) From c34aaf44424eae150d9ba38ed0e7030ba7328437 Mon Sep 17 00:00:00 2001 From: jpcompartir Date: Fri, 5 Dec 2025 11:41:48 +0000 Subject: [PATCH 10/12] add roxygen2 skeleton for ant_complete_df --- R/anthropic_messages.R | 38 ++++++++++++++++++++++++++++++++++++++ 1 file changed, 38 insertions(+) diff --git a/R/anthropic_messages.R b/R/anthropic_messages.R index e4caea2..8d1ab7c 100644 --- a/R/anthropic_messages.R +++ b/R/anthropic_messages.R @@ -520,6 +520,44 @@ ant_complete_chunks <- function(texts, # ant_complete_chunks ---- # ant_complete_df ---- +#' Process a data frame through Anthropic's Messages API +#' +#' @description +#' Takes a data frame with text inputs and processes each row through +#' Anthropic's Messages API using chunked processing. Results are written +#' progressively to parquet files and returned as a tibble. Supports +#' structured outputs via the schema = argument. +#' +#' @details +#' writes results to disk to minimise memory usage and possibility of data loss. +#' +#' Results are written as parquet files in the specified output directory, +#' along with a metadata.json file containing processing parameters. +#' +#' When using the `output_dir =` argument, be careful that you select +#' a new directory if you do not wish to overwrite existing chunks. +#' If there is already a `chunks_001.parquet` file in the directory, +#' it will be overwritten. 
+#' +#' @param df Data frame containing text to process +#' @param text_var Column name (unquoted) containing text inputs +#' @param id_var Column name (unquoted) for unique row identifiers +#' @param model Anthropic model to use +#' @param output_dir Directory for parquet chunks +#' @param system_prompt Optional system prompt +#' @param schema Optional JSON schema for structured output +#' @param chunk_size Number of texts per chunk +#' @param concurrent_requests Number of concurrent requests +#' @param max_retries Maximum retry attempts +#' @param timeout Request timeout in seconds +#' @param temperature Sampling temperature +#' @param max_tokens Maximum tokens per response +#' @param key_name Environment variable name for API key +#' @param endpoint_url Anthropic API endpoint URL +#' +#' @return A tibble with results +#' @export +#' ant_complete_df <- function(df, text_var, id_var, From f31ce30a85977506e702d95198e34ea381fbdb8c Mon Sep 17 00:00:00 2001 From: jpcompartir Date: Fri, 5 Dec 2025 11:59:50 +0000 Subject: [PATCH 11/12] fix some typos in package add spelling test switch lang to en-GB in DESCRIPTION add ant funcs to _pkgdown.yml --- DESCRIPTION | 4 +- README.Rmd | 2 +- README.md | 2 +- _pkgdown.yml | 3 + dev_docs/01_integrations.qmd | 128 +++++++++- dev_docs/ant_messages.qmd | 160 +++++++++++- tests/spelling.R | 3 + vignettes/embeddings_providers.Rmd | 2 +- vignettes/llm_providers.Rmd | 251 ++++++++++++++++++- vignettes/structured_outputs_json_schema.Rmd | 2 +- 10 files changed, 537 insertions(+), 20 deletions(-) create mode 100644 tests/spelling.R diff --git a/DESCRIPTION b/DESCRIPTION index f2fab13..8584875 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -8,7 +8,8 @@ License: MIT + file LICENSE Encoding: UTF-8 Roxygen: list(markdown = TRUE) RoxygenNote: 7.3.2 -Suggests: +Suggests: + spelling, broom, ggplot2, here, @@ -39,3 +40,4 @@ Depends: R (>= 3.5) LazyData: true URL: https://jpcompartir.github.io/EndpointR/ +Language: en-GB diff --git a/README.Rmd 
b/README.Rmd index b9e0a36..31989d6 100644 --- a/README.Rmd +++ b/README.Rmd @@ -147,7 +147,7 @@ hf_classify_df( dplyr::rename(!!!labelid_2class()) ``` -Read the [Hugging Face Inference Vignette](articles/hugging_face_inference.html) for more infromation on embedding and classifying using Dedicated Inference Endpoints and the Inference API from Hugging Face. +Read the [Hugging Face Inference Vignette](articles/hugging_face_inference.html) for more information on embedding and classifying using Dedicated Inference Endpoints and the Inference API from Hugging Face. ## OpenAI - Chat Completions API diff --git a/README.md b/README.md index 7c38e8d..10c035d 100644 --- a/README.md +++ b/README.md @@ -146,7 +146,7 @@ hf_classify_df( ``` Read the [Hugging Face Inference -Vignette](articles/hugging_face_inference.html) for more infromation on +Vignette](articles/hugging_face_inference.html) for more information on embedding and classifying using Dedicated Inference Endpoints and the Inference API from Hugging Face. 
diff --git a/_pkgdown.yml b/_pkgdown.yml index 98d756e..ad8acfe 100644 --- a/_pkgdown.yml +++ b/_pkgdown.yml @@ -92,6 +92,9 @@ reference: desc: "functions for working with Anthropic's Messages API" contents: - ant_build_messages_request + - ant_complete_text + - ant_complete_chunks + - ant_comple_df - title: "OpenAI Completions" diff --git a/dev_docs/01_integrations.qmd b/dev_docs/01_integrations.qmd index d4bc839..f85a047 100644 --- a/dev_docs/01_integrations.qmd +++ b/dev_docs/01_integrations.qmd @@ -7,6 +7,7 @@ format: html library(tidyverse) library(httr2) library(EndpointR) +library(jsonlite) n_tests <- 5 test_df <- tibble( @@ -17,7 +18,9 @@ test_df <- tibble( Space for integration tests (useful for not relying on unit tests when interacting with real APIs) -# oai embed +# OpenAI + +## oai embed ```{r, invalid_model_oai_embed} oai_embed_invalid_model <- oai_embed_df( @@ -64,7 +67,7 @@ oai_embed_success <- oai_embed_df( oai_embed_success ``` -# oai completions +## oai completions invalid API key for completions @@ -100,7 +103,9 @@ oai_complete_good_auth <- oai_complete_df( oai_complete_good_auth ``` -# hf embed +# Hugging Face + +## hf embed non-existent HuggingFace model @@ -170,3 +175,120 @@ hf_classify_wrong_task |> dplyr::select(id, .error, .error_msg, .status) # expect: .error = TRUE, error message should indicate task mismatch, .status = 400 ``` + +# Anthroic + +```{r, chunk_df_test_texts_ids} +ids <- c(paste0("id_", 1:10)) +texts <- c( + "The kettle whistled at 6 AM.", + "Machine learning models require substantial computational resources and careful hyperparameter tuning to achieve optimal performance on complex datasets with high dimensionality and temporal dependencies.", + "She bought milk.", + "Quantum entanglement remains one of the most counterintuitive phenomena in physics, suggesting that particles can influence each other instantaneously across arbitrary distances, challenging classical notions of locality and causality that governed physics 
for centuries.", + "It rained yesterday.", + "The algorithm iterated through 47 million records in under three minutes, filtering by date and category before aggregating results into hierarchical structures.", + "Dogs bark.", + "Brexit negotiations involved trade agreements, fisheries disputes, regulatory alignment across financial services, customs procedures, and residency rights for citizens living abroad, spanning four years of complex bilateral discussions.", + "Coffee is hot.", + "The startup pivoted twice before finding product-market fit in the enterprise SaaS space, eventually acquiring three smaller competitors and expanding to eighteen countries across three continents." +) + +word_summary_schema <- create_json_schema( + name = "summary", + schema = schema_object( + summary = schema_string(), + required = list("summary") + + ) +) +``` + +## chunks - ant_complete_chunks + +```{r, ant_complete_chunks} +chunks_test <- ant_complete_chunks( + texts = texts, + ids = ids, + id_col_name = "id_column", + output_dir = "test_dir/anthropic/chunks/no_schema", + model = "claude-haiku-4-5", + concurrent_requests = 10, + chunk_size = 10, + system_prompt = "Write a one-word summary of the input text", + key_name = "ANTHROPIC_API_KEY" +) +``` + +```{r} +chunks_text_w_schema <- ant_complete_chunks( + texts = texts, + ids = ids, + id_col_name = "id_column", + output_dir = "test_dir/anthropic/chunks/schema", + model = "claude-haiku-4-5", + concurrent_requests = 10, + chunk_size = 10, + system_prompt = "Write a one-word summary of the input text", + schema = word_summary_schema, + key_name = "ANTHROPIC_API_KEY" +) + +chunks_text_w_schema |> + mutate(content = purrr::map(content, \(x) safely_from_json(x))) |> + unnest_wider(content) +``` + +## df - ant_complete_df + +```{r, ant_complete_df} +df <- tibble( + ids = ids, + text = texts +) + +df_test <- df |> ant_complete_df( + text_var = text, + id_var = ids, + output_dir = "test_dir/anthropic/df/no_schema", + chunk_size = 10, + 
concurrent_requests = 10, + model = "claude-haiku-4-5", + system_prompt = "Write 1 word to describe the text", +) + +df_test +``` + +```{r} +df_w_schema <-df |> ant_complete_df( + text_var = text, + id_var = ids, + output_dir = "test_dir/anthropic/df/schema", + chunk_size = 10, + concurrent_requests = 10, + model = "claude-haiku-4-5", + system_prompt = "Write 1 word to describe the text", + schema = word_summary_schema +) + +df_w_schema |> + mutate(content = map(content, \(x) fromJSON(x))) |> + unnest_wider(content) +``` + +```{r} +df_no_system_prompt <- df |> + ant_complete_df( + text_var = text, + id_var = ids, + output_dir = "test_dir/anthropic/df/no_system_prompt", + chunk_size = 10, + concurrent_requests = 10, + model = "claude-haiku-4-5", + system_prompt = NULL, + max_tokens = 2, + schema = word_summary_schema +) + +df_no_system_prompt +``` diff --git a/dev_docs/ant_messages.qmd b/dev_docs/ant_messages.qmd index 43be43c..c7985a2 100644 --- a/dev_docs/ant_messages.qmd +++ b/dev_docs/ant_messages.qmd @@ -3,14 +3,19 @@ title: "ant_messages" format: html --- -# Structured Outputs - -HTTP 400 errors - ```{r} library(httr2) +library(purrr) library(jsonlite) +library(tidyr) +library(dplyr) +``` + +# Structured Outputs +HTTP 400 errors (discovery: due to structured outputs not being supported in certain models and I switched to default to Haiku for testing/\$\$\$) + +```{r} req <- ant_build_messages_request(input = "This is terrible, fin.") resp <- req_perform(req, verbosity = 1) @@ -104,10 +109,6 @@ resp |> resp_status() ## Nested Schemas ```{r} -library(httr2) -library(purrr) -library(jsonlite) -library(tidyr) absa_entities_schema <- create_json_schema( name = "entities", strict = TRUE, @@ -137,8 +138,6 @@ resp |> resp_body_json() |> purrr::pluck("content", 1, "text") |> fromJSON() |> pluck('entities') - - ``` ``` @@ -146,3 +145,144 @@ resp |> resp_body_json() |> 1 Apple positive 2 Microsoft negative ``` + +Now we run it again and expect an error, and that the 
error message will actually be viewable rather than a generic error message with the status code: + +```{r} +error_400 <- req <- ant_build_messages_request( + "Apple have been wonderful, Microsoft... not so much. And by not so much I mean pathetic.", + schema = absa_entities_schema +) +resp_400 <- req_perform(error_400) +.extract_api_error(resp_400) +resp_400 |> resp_body_json() +``` + +# text - ant_complete_text + +```{r} +ant_complete_text("What's going on dawg?") + +absa_test <- ant_complete_text("Apple have been wonderful, Microsoft... not so much. And by not so much I mean pathetic.", + model = "claude-sonnet-4-5", + system_prompt = "Follow the schema and help the user.", + schema = absa_entities_schema) + +absa_test$entities |> + bind_rows() +``` + +# chunks - ant_complete_chunks + +```{r, chunk_df_test_texts_ids} +ids <- c(paste0("id_", 1:10)) +texts <- c( + "The kettle whistled at 6 AM.", + "Machine learning models require substantial computational resources and careful hyperparameter tuning to achieve optimal performance on complex datasets with high dimensionality and temporal dependencies.", + "She bought milk.", + "Quantum entanglement remains one of the most counterintuitive phenomena in physics, suggesting that particles can influence each other instantaneously across arbitrary distances, challenging classical notions of locality and causality that governed physics for centuries.", + "It rained yesterday.", + "The algorithm iterated through 47 million records in under three minutes, filtering by date and category before aggregating results into hierarchical structures.", + "Dogs bark.", + "Brexit negotiations involved trade agreements, fisheries disputes, regulatory alignment across financial services, customs procedures, and residency rights for citizens living abroad, spanning four years of complex bilateral discussions.", + "Coffee is hot.", + "The startup pivoted twice before finding product-market fit in the enterprise SaaS space, eventually 
acquiring three smaller competitors and expanding to eighteen countries across three continents." +) + +word_summary_schema <- create_json_schema( + name = "summary", + schema = schema_object( + summary = schema_string(), + required = list("summary") + + ) +) +``` + +```{r, ant_complete_chunks} +chunks_test <- ant_complete_chunks( + texts = texts, + ids = ids, + id_col_name = "id_column", + output_dir = "test_dir/anthropic/chunks/no_schema", + model = "claude-haiku-4-5", + concurrent_requests = 10, + chunk_size = 10, + system_prompt = "Write a one-word summary of the input text", + key_name = "ANTHROPIC_API_KEY" +) +``` + +```{r} +chunks_text_w_schema <- ant_complete_chunks( + texts = texts, + ids = ids, + id_col_name = "id_column", + output_dir = "test_dir/anthropic/chunks/schema", + model = "claude-haiku-4-5", + concurrent_requests = 10, + chunk_size = 10, + system_prompt = "Write a one-word summary of the input text", + schema = word_summary_schema, + key_name = "ANTHROPIC_API_KEY" +) + +chunks_text_w_schema |> + mutate(content = purrr::map(content, \(x) safely_from_json(x))) |> + unnest_wider(content) +``` + +# df - ant_complete_df + +```{r, ant_complete_df} +df <- tibble( + ids = ids, + text = texts +) + +df_test <- df |> ant_complete_df( + text_var = text, + id_var = ids, + output_dir = "test_dir/anthropic/df/no_schema", + chunk_size = 10, + concurrent_requests = 10, + model = "claude-haiku-4-5", + system_prompt = "Write 1 word to describe the text", +) + +df_test +``` + +```{r} +df_w_schema <-df |> ant_complete_df( + text_var = text, + id_var = ids, + output_dir = "test_dir/anthropic/df/schema", + chunk_size = 10, + concurrent_requests = 10, + model = "claude-haiku-4-5", + system_prompt = "Write 1 word to describe the text", + schema = word_summary_schema +) + +df_w_schema |> + mutate(content = map(content, \(x) fromJSON(x))) |> + unnest_wider(content) +``` + +```{r} +df_no_system_prompt <- df |> + ant_complete_df( + text_var = text, + id_var = ids, + 
output_dir = "test_dir/anthropic/df/no_system_prompt", + chunk_size = 10, + concurrent_requests = 10, + model = "claude-haiku-4-5", + system_prompt = NULL, + max_tokens = 2, + schema = word_summary_schema +) + +df_no_system_prompt +``` diff --git a/tests/spelling.R b/tests/spelling.R new file mode 100644 index 0000000..6713838 --- /dev/null +++ b/tests/spelling.R @@ -0,0 +1,3 @@ +if(requireNamespace('spelling', quietly = TRUE)) + spelling::spell_check_test(vignettes = TRUE, error = FALSE, + skip_on_cran = TRUE) diff --git a/vignettes/embeddings_providers.Rmd b/vignettes/embeddings_providers.Rmd index 1287b91..2b69030 100644 --- a/vignettes/embeddings_providers.Rmd +++ b/vignettes/embeddings_providers.Rmd @@ -321,7 +321,7 @@ if (any(results$.error)) { - OpenAI: `text-embedding-3-small` with custom dimensions for flexibility 4. **Consider dedicated endpoints** for production Hugging Face deployments -> **TIP**: Check your organisation's tier on OpenAI, tier 5 organisatons can send many more requests than tier 1. [OpenAI Rate Limits](https://platform.openai.com/settings/organization/limits) +> **TIP**: Check your organisation's tier on OpenAI, tier 5 organisations can send many more requests than tier 1. [OpenAI Rate Limits](https://platform.openai.com/settings/organization/limits) ## Cost Optimisation diff --git a/vignettes/llm_providers.Rmd b/vignettes/llm_providers.Rmd index ff53a9c..f252921 100644 --- a/vignettes/llm_providers.Rmd +++ b/vignettes/llm_providers.Rmd @@ -420,9 +420,256 @@ The textual responses we get from LLM providers are difficult to deal with progr For detailed information on creating JSON schemas for structured outputs, see `vignette("structured_outputs_json_schema")`. 
-# Anthropic +# Anthropic - Quick Start - Messages API - Single Text -TBC +Before getting started with Anthropic: + +- First, get your API key from the [Anthropic Console](https://console.anthropic.com/) and store it as "ANTHROPIC_API_KEY" with `set_api_key()` +- Second, choose your model - EndpointR defaults to "claude-haiku-4-5" for cost efficiency, but you can use any Claude model +- Third, note that Anthropic's temperature range is 0-1 (unlike OpenAI's 0-2) + +To get a completion for a single text we can use the `ant_complete_text()` function: + +```{r} +set_api_key("ANTHROPIC_API_KEY") + +sentiment_system_prompt = "Analyse the sentiment of the given text." +text = "The weather has been absolutely fantastic this summer. I wish it could be like this every year, maybe I'll move to the South of France where they get 300 days of sunshine a year. Oh to dream." + +ant_complete_text( + text = text, + system_prompt = sentiment_system_prompt, + model = "claude-haiku-4-5" +) +``` + +Output: [1] "The sentiment of the text is positive and wistful. The author expresses happiness about the fantastic summer weather and daydreams about experiencing similar conditions year-round in the South of France." + +## Anthropic - Quick Start - Messages API - Data Frame of Texts + +We can input a data frame directly into the `ant_complete_df()` function: + +```{r} +review_df <- data.frame( + id = 1:5, + text = c( + "Absolutely fantastic service! The staff were incredibly helpful and friendly.", + "Terrible experience. Food was cold and the waiter was rude.", + "Pretty good overall, but nothing special. Average food and service.", + "Outstanding meal! Best restaurant I've been to in years. Highly recommend!", + "Disappointed with the long wait times. Food was okay when it finally arrived." 
+ ) +) + +ant_complete_df( + review_df, + text_var = text, + id_var = id, + output_dir = NULL, # leave this to 'auto' to have results written to a timestamped directory + system_prompt = sentiment_system_prompt, + concurrent_requests = 5, + chunk_size = 5 +) +``` + +```{=html} +
+ℹ Processing 5 texts in 1 chunk of up to 5 each
+ℹ Results will be saved as parquet files in [directory]
+Processing chunk 1/1 (5 texts)
+✔ Chunk 1: 5 successful, 0 failed
+ℹ Processing completed, there were 5 successes and 0 failures.
+# A tibble: 5 × 6
+     id content                                                .error .error_msg .status .chunk
+                                                                 
+1     1 "The sentiment is highly positive."                    FALSE  NA              NA      1
+2     2 "The sentiment is negative."                           FALSE  NA              NA      1
+3     3 "The sentiment is neutral with a slight positive lean." FALSE  NA              NA      1
+4     4 "The sentiment is extremely positive."                 FALSE  NA              NA      1
+5     5 "The sentiment is negative."                           FALSE  NA              NA      1
+>
+
+``` + +# Anthropic - Quick Start - Structured Outputs + +Anthropic supports [structured outputs](https://platform.claude.com/docs/en/build-with-claude/structured-outputs) (currently in beta) for the Claude 4.5 series models: `claude-haiku-4-5`, `claude-sonnet-4-5`, and `claude-opus-4-5`. Like with OpenAI, we can use a schema to ensure predictable, parseable responses: + +> **NOTE**: Structured outputs are in beta and may change faster than we can track in this documentation. Check the [official documentation](https://platform.claude.com/docs/en/build-with-claude/structured-outputs) for the latest requirements. + +```{r} +sentiment_schema <- create_json_schema( + name = "simple_sentiment_schema", + schema = schema_object( + sentiment = schema_string(description = "Sentiment classification", + enum = c("positive", "negative", "neutral")), + required = c("sentiment") + ) +) + +structured_df <- ant_complete_df( + review_df, + text_var = text, + id_var = id, + schema = sentiment_schema, + output_dir = NULL, + system_prompt = sentiment_system_prompt, + concurrent_requests = 5, + chunk_size = 5 +) +``` + +```{=html} +
+ℹ Processing 5 texts in 1 chunk of up to 5 each
+ℹ Results will be saved as parquet files in [directory]
+Processing chunk 1/1 (5 texts)
+✔ Chunk 1: 5 successful, 0 failed
+ℹ Processing completed, there were 5 successes and 0 failures.
+# A tibble: 5 × 6
+     id content                        .error .error_msg .status .chunk
+                                         
+1     1 "{\"sentiment\":\"positive\"}" FALSE  NA              NA      1
+2     2 "{\"sentiment\":\"negative\"}" FALSE  NA              NA      1
+3     3 "{\"sentiment\":\"neutral\"}"  FALSE  NA              NA      1
+4     4 "{\"sentiment\":\"positive\"}" FALSE  NA              NA      1
+5     5 "{\"sentiment\":\"negative\"}" FALSE  NA              NA      1
+>
+
+``` + +Now we extract the sentiment values: + +```{r} +structured_df |> + dplyr::mutate(content = purrr::map(content, ~safely_from_json(.x))) |> + tidyr::unnest_wider(content) +``` + +```{=html} +
+# A tibble: 5 × 6
+     id sentiment .error .error_msg .status .chunk
+                    
+1     1 positive  FALSE  NA              NA      1
+2     2 negative  FALSE  NA              NA      1
+3     3 neutral   FALSE  NA              NA      1
+4     4 positive  FALSE  NA              NA      1
+5     5 negative  FALSE  NA              NA      1
+
+``` + +# Anthropic - Under the Hood + +We looked at `ant_complete_text()` and `ant_complete_df()` which are high-level convenience functions, but let's understand what's happening under the hood. + +## Messages API + +Anthropic's Messages API is similar to OpenAI's Chat Completions API - it generates model responses from a conversation of messages. For EndpointR use cases, this is typically a single user-model interaction for a specific task. + +### Building Requests + +Let's create a request and inspect it: + +```{r} +sentiment_text <- "This is absolutely brilliant! I couldn't be happier with the results." + +sentiment_request <- ant_build_messages_request( + input = sentiment_text, + system_prompt = sentiment_system_prompt, + model = "claude-haiku-4-5" +) + +sentiment_request |> + req_dry_run() +``` + +Your generated HTTP request should look something like this: + +```{http} +POST /v1/messages HTTP/1.1 +Host: api.anthropic.com +User-Agent: EndpointR +Content-Type: application/json +x-api-key: [YOUR_API_KEY] +anthropic-version: 2023-06-01 + +{ + "model": "claude-haiku-4-5", + "messages": [ + { + "role": "user", + "content": "This is absolutely brilliant! I couldn't be happier with the results." + } + ], + "max_tokens": 500, + "temperature": 0, + "system": "Analyse the sentiment of the given text." 
+} +``` + +Key differences from OpenAI: - **Authentication**: Anthropic uses `x-api-key` header (not `Authorization: Bearer`) - **System prompts**: Passed as a separate `system` parameter (not in messages array) - **Temperature range**: 0-1 for Anthropic (vs 0-2 for OpenAI) - **Required parameter**: `max_tokens` is required for Anthropic requests + +### Structured Outputs with Anthropic + +Anthropic's structured outputs use the `output_format` parameter with JSON schemas: + +```{r} +structured_request <- ant_build_messages_request( + input = sentiment_text, + system_prompt = sentiment_system_prompt, + schema = sentiment_schema, + model = "claude-haiku-4-5" +) +``` + +When a schema is provided, the request automatically includes: - The `output_format` parameter with your schema - The `anthropic-beta` header for structured outputs support + +The schema is validated and formatted automatically by EndpointR's `create_json_schema()` helper. + +### Processing Multiple Texts + +Like with OpenAI, we can process multiple texts efficiently with concurrent requests: + +```{r} +classical_texts <- c( + "It is a truth universally acknowledged, that a single man in possession of a good fortune, must be in want of a wife.", + "All happy families are alike; each unhappy family is unhappy in its own way.", + "It was the best of times, it was the worst of times." +) + +# Build requests for each text +requests <- purrr::map( + classical_texts, + ~ant_build_messages_request( + input = .x, + system_prompt = "Classify the sentiment as positive, negative, neutral, or mixed." + ) +) + +# Send all requests concurrently +responses <- perform_requests_with_strategy( + requests, + concurrent_requests = 3 +) + +# Extract content from responses +responses |> + purrr::map_chr(~.extract_ant_message_content(.x)) +``` + +### Memory-Efficient Batch Processing + +The `ant_complete_chunks()` and `ant_complete_df()` functions handle large-scale processing by: + +1. 
Dividing input into chunks (default 5000 texts per chunk) +2. Processing each chunk with concurrent requests +3. Writing results to parquet files immediately +4. Tracking metadata including the full schema structure + +This approach minimises memory usage and prevents data loss if processing is interrupted. + +**Key advantage**: Anthropic's metadata includes the complete schema definition (not just a boolean flag), making it easy to reproduce results and debug issues. # Google diff --git a/vignettes/structured_outputs_json_schema.Rmd b/vignettes/structured_outputs_json_schema.Rmd index 6e8a54a..9df85c0 100644 --- a/vignettes/structured_outputs_json_schema.Rmd +++ b/vignettes/structured_outputs_json_schema.Rmd @@ -358,7 +358,7 @@ Schema design principles: - Use descriptive field names and descriptions - Set appropriate constraints (min/max values, required fields) -- Prefer enums over free text for categories but give the model outs, or it will hallcuinate e.g. an 'other' option for when the document doesn't have what you're looking for +- Prefer enums over free text for categories but give the model outs, or it will hallucinate e.g. 
an 'other' option for when the document doesn't have what you're looking for - Nest objects logically for complex data - Validate some mock responses in advance From 707f1b90156880fbb4d291cfc109b7b2c734c8bd Mon Sep 17 00:00:00 2001 From: jpcompartir Date: Fri, 5 Dec 2025 12:15:16 +0000 Subject: [PATCH 12/12] dump openai schema for metadata writing safely add webfake endpoints for ant functions like oai add tests for ant_messages based on oai funcs drop readr dependency --- DESCRIPTION | 1 - NAMESPACE | 1 + NEWS.md | 4 + R/openai_completions.R | 2 +- _pkgdown.yml | 2 +- man/ant_complete_df.Rd | 75 ++++++ tests/testthat/helper-webfake.R | 74 ++++++ tests/testthat/test-anthropic_messages.R | 293 ++++++++++++++++++++++- 8 files changed, 443 insertions(+), 9 deletions(-) create mode 100644 man/ant_complete_df.Rd diff --git a/DESCRIPTION b/DESCRIPTION index 8584875..356eafe 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -33,7 +33,6 @@ Imports: tibble, S7, jsonvalidate, - readr, arrow VignetteBuilder: knitr Depends: diff --git a/NAMESPACE b/NAMESPACE index be4ddbb..92850c2 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -2,6 +2,7 @@ export(ant_build_messages_request) export(ant_complete_chunks) +export(ant_complete_df) export(ant_complete_text) export(create_json_schema) export(get_api_key) diff --git a/NEWS.md b/NEWS.md index 74dad18..23bd895 100644 --- a/NEWS.md +++ b/NEWS.md @@ -4,6 +4,10 @@ - adds parquet writing to oai_complete_df and oai_embed_df - adds chunks func to oai_embed, and re-writes all batch -\> chunk logic - implements the Anthropic messages API with structured outputs (via BETA) +- adds `ant_complete_df()` and `ant_complete_chunks()` for batch/chunked processing with the Anthropic API, with parquet writing and metadata tracking +- metadata tracking now includes `schema` and `system_prompt` for both OpenAI and Anthropic chunked processing functions +- bug fix: S7 schema objects now correctly serialised to metadata.json (previously caused "No method asJSON S3 
class: S7_object" error) +- adds spelling test, sets language to en-GB in DESCRIPTION # EndpointR 0.1.2 diff --git a/R/openai_completions.R b/R/openai_completions.R index 7ebc281..9370cf7 100644 --- a/R/openai_completions.R +++ b/R/openai_completions.R @@ -432,7 +432,7 @@ oai_complete_chunks <- function(texts, metadata <- list( model = model, endpoint_url = endpoint_url, - schema = schema, + schema = dumped_schema, system_prompt = system_prompt, chunk_size = chunk_size, n_texts = length(texts), diff --git a/_pkgdown.yml b/_pkgdown.yml index ad8acfe..99b766b 100644 --- a/_pkgdown.yml +++ b/_pkgdown.yml @@ -94,7 +94,7 @@ reference: - ant_build_messages_request - ant_complete_text - ant_complete_chunks - - ant_comple_df + - ant_complete_df - title: "OpenAI Completions" diff --git a/man/ant_complete_df.Rd b/man/ant_complete_df.Rd new file mode 100644 index 0000000..de15fec --- /dev/null +++ b/man/ant_complete_df.Rd @@ -0,0 +1,75 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/anthropic_messages.R +\name{ant_complete_df} +\alias{ant_complete_df} +\title{Process a data frame through Anthropic's Messages API} +\usage{ +ant_complete_df( + df, + text_var, + id_var, + model = "claude-haiku-4-5", + output_dir = "auto", + system_prompt = NULL, + schema = NULL, + chunk_size = 5000L, + concurrent_requests = 5L, + max_retries = 5L, + timeout = 30, + temperature = 0, + max_tokens = 1024L, + key_name = "ANTHROPIC_API_KEY", + endpoint_url = .ANT_MESSAGES_ENDPOINT +) +} +\arguments{ +\item{df}{Data frame containing text to process} + +\item{text_var}{Column name (unquoted) containing text inputs} + +\item{id_var}{Column name (unquoted) for unique row identifiers} + +\item{model}{Anthropic model to use} + +\item{output_dir}{Directory for parquet chunks} + +\item{system_prompt}{Optional system prompt} + +\item{schema}{Optional JSON schema for structured output} + +\item{chunk_size}{Number of texts per chunk} + +\item{concurrent_requests}{Number of 
concurrent requests} + +\item{max_retries}{Maximum retry attempts} + +\item{timeout}{Request timeout in seconds} + +\item{temperature}{Sampling temperature} + +\item{max_tokens}{Maximum tokens per response} + +\item{key_name}{Environment variable name for API key} + +\item{endpoint_url}{Anthropic API endpoint URL} +} +\value{ +A tibble with results +} +\description{ +Takes a data frame with text inputs and processes each row through +Anthropic's Messages API using chunked processing. Results are written +progressively to parquet files and returned as a tibble. Supports +structured outputs via the schema = argument. +} +\details{ +writes results to disk to minimise memory usage and possibility of data loss. + +Results are written as parquet files in the specified output directory, +along with a metadata.json file containing processing parameters. + +When using the \verb{output_dir =} argument, be careful that you select +a new directory if you do not wish to overwrite existing chunks. +If there is already a \code{chunks_001.parquet} file in the directory, +it will be overwritten. +} diff --git a/tests/testthat/helper-webfake.R b/tests/testthat/helper-webfake.R index 47ea4d9..6b3a564 100644 --- a/tests/testthat/helper-webfake.R +++ b/tests/testthat/helper-webfake.R @@ -217,6 +217,80 @@ withr::local_envvar(HF_TEST_API_KEY = "fake-key") }) +# Anthropic Messages API mock endpoints +.app$post("/test_ant_schemaless", function(req, res) { + response_data <- list( + content = list( + list( + type = "text", + text = "This is a helpful response from Claude." 
+ ) + ), + stop_reason = "end_turn" + ) + + res$ + set_status(200L)$ + set_header("Content-Type", "application/json")$ + send_json(response_data) +}) + +.app$post("/test_ant_sentiment", function(req, res) { + sentiment_response <- list( + sentiment = "positive", + confidence = 0.85 + ) + + response_data <- list( + content = list( + list( + type = "text", + text = jsonlite::toJSON(sentiment_response, auto_unbox = TRUE) + ) + ), + stop_reason = "end_turn" + ) + + res$ + set_status(200L)$ + set_header("Content-Type", "application/json")$ + send_json(response_data) +}) + +.app$post("/test_ant_complete_df_review", function(req, res) { + response_data <- list( + content = list( + list( + type = "text", + text = "positive" + ) + ), + stop_reason = "end_turn" + ) + + res$ + set_status(200L)$ + set_header("Content-Type", "application/json")$ + send(jsonlite::toJSON(response_data, auto_unbox = TRUE)) +}) + +.app$post("/test_ant_complete_df_schema", function(req, res) { + response_data <- list( + content = list( + list( + type = "text", + text = '{"sentiment": "positive"}' + ) + ), + stop_reason = "end_turn" + ) + + res$ + set_status(200L)$ + set_header("Content-Type", "application/json")$ + send(jsonlite::toJSON(response_data, auto_unbox = TRUE)) +}) + server <- webfakes::local_app_process(.app) diff --git a/tests/testthat/test-anthropic_messages.R b/tests/testthat/test-anthropic_messages.R index cc88fcd..fbe70d9 100644 --- a/tests/testthat/test-anthropic_messages.R +++ b/tests/testthat/test-anthropic_messages.R @@ -74,7 +74,7 @@ test_that(".extract_ant_message_content covers basic edgecases and actually gets ) mock_response <- httr2::response_json( - status_code = 200L, + # status_code = 200L, body = mock_body ) @@ -87,7 +87,7 @@ test_that(".extract_ant_message_content covers basic edgecases and actually gets ) mock_empty_response <- httr2::response_json( - status_code = 200L, + # status_code = 200L, body = mock_empty_body ) @@ -109,11 +109,292 @@ 
test_that("ant_build_messages_request accepts endpointr_id and adds to headers", test_that("ant_complete_text validates its inputs", { - expect_error(ant_complete_text(c(""), - "must not be an empty")) + expect_error(ant_complete_text(c("")), + "must not be an empty") - expect_error(ant_complete_text(c("hello", "bonjour"), - "must be a single string")) + expect_error(ant_complete_text(c("hello", "bonjour")), + "must be a single string") +}) + + +test_that("ant_complete_text takes a single text and returns the response", { + + test_url <- server$url("/test_ant_schemaless") + + withr::with_envvar( + c("ANTHROPIC_API_KEY" = "test-key"), + { + response <- expect_no_error(ant_complete_text( + text = "Give me a helpful response", + endpoint_url = test_url + )) + } + ) + + expect_true(grepl(x = response, pattern = "helpful response")) +}) + + +test_that("ant_complete_text handles a schema appropriately", { + + test_url <- server$url("/test_ant_sentiment") + + sentiment_schema <- create_json_schema( + name = "sentiment_schema", + schema = schema_object( + sentiment = schema_enum(values = c("positive", "negative", "neutral"), type = "string"), + confidence = schema_number(minimum = 0, maximum = 1) + ) + ) + + withr::with_envvar( + c("ANTHROPIC_API_KEY" = "test-key"), + { + schema_response <- expect_no_error( + ant_complete_text( + text = "What a remarkable achievement", + system_prompt = "classify the sentiment of this text: ", + endpoint_url = test_url, + schema = sentiment_schema, + tidy = FALSE + )) + } + ) + + expect_no_error(validate_response(sentiment_schema, schema_response[[1]])) +}) + + +test_that("ant_complete_chunks processes chunks correctly", { + texts <- paste0("text", 1:6) + ids <- paste0("id", 1:length(texts)) + temp_dir <- withr::local_tempdir() + expected_cols <- c("id", "content", ".error", ".error_msg", ".status", ".chunk") + + withr::with_envvar( + c("ANTHROPIC_API_KEY" = "test-key"), + { + # Test with chunk_size = 2 + chunk_2 <- 
expect_no_error(ant_complete_chunks( + texts = texts, + ids = ids, + endpoint_url = server$url("/test_ant_complete_df_review"), + key_name = "ANTHROPIC_API_KEY", + chunk_size = 2, + concurrent_requests = 1, + output_dir = temp_dir + )) |> suppressMessages() + } + ) + + expect_setequal(unique(chunk_2$.chunk), c(1, 2, 3)) + expect_setequal(names(chunk_2), expected_cols) + expect_equal(nrow(chunk_2), 6) + + withr::with_envvar( + c("ANTHROPIC_API_KEY" = "test-key"), + { + # Test with chunk_size = 1 + chunk_1 <- expect_no_error(ant_complete_chunks( + texts = texts, + ids = ids, + endpoint_url = server$url("/test_ant_complete_df_review"), + key_name = "ANTHROPIC_API_KEY", + chunk_size = 1, + concurrent_requests = 1, + output_dir = temp_dir + )) |> suppressMessages() + } + ) + + expect_setequal(unique(chunk_1$.chunk), 1:6) + expect_equal(nrow(chunk_1), 6) +}) + + +test_that("ant_complete_df takes single row, multi-row data frames as inputs", { + expect_error(ant_complete_df("hello"), + regexp = "df must be") + + review_df <- get_review_df() + + endpoint_url <- server$url("/test_ant_complete_df_review") + + withr::with_envvar( + c("ANTHROPIC_API_KEY" = "test-key"), + { + successful_response <- expect_no_error( + ant_complete_df(review_df, + review_text, + id, + endpoint_url = endpoint_url, + concurrent_requests = 1, + max_retries = 1, + output_dir = NULL) + ) + } + ) + + expect_setequal(names(successful_response), + c("id", "content", ".error", ".error_msg", ".status", ".chunk")) + expect_setequal(unique(successful_response$content), "positive") + + withr::with_envvar( + c("ANTHROPIC_API_KEY" = "test-key"), + { + expect_message(object = + ant_complete_df(review_df, + review_text, + id, + endpoint_url = endpoint_url, + concurrent_requests = 1, + max_retries = 1, + output_dir = NULL), + regexp = "Processing 5 text" + ) + } + ) +}) + + +test_that("ant_complete_df works correctly with chunk processing", { + test_df <- data.frame( + id = paste0("id", 1:2), + text = c("text1", 
"text2"), + stringsAsFactors = FALSE + ) + output_dir <- withr::local_tempdir() + + withr::with_envvar( + c("ANTHROPIC_API_KEY" = "test-key"), + { + result <- expect_no_error( + ant_complete_df( + df = test_df, + text_var = text, + id_var = id, + endpoint_url = server$url("/test_ant_complete_df_review"), + key_name = "ANTHROPIC_API_KEY", + chunk_size = 1, + output_dir = output_dir + ) + ) |> suppressMessages() + } + ) + + expect_s3_class(result, "data.frame") + expect_equal(nrow(result), 2) + expect_true(all(c("id", "content", ".error", ".error_msg", ".chunk") %in% names(result))) + expect_equal(result$id, c("id1", "id2")) + expect_equal(result$content, c("positive", "positive")) + expect_equal(result$.error, c(FALSE, FALSE)) +}) + + +test_that("ant_complete_df works with different chunk sizes", { + test_df <- data.frame( + id = paste0("id", 1:4), + text = paste0("text", 1:4), + stringsAsFactors = FALSE + ) + temp_dir <- withr::local_tempdir() + + withr::with_envvar( + c("ANTHROPIC_API_KEY" = "test-key"), + { + result <- expect_no_error( + ant_complete_df( + df = test_df, + text_var = text, + id_var = id, + endpoint_url = server$url("/test_ant_complete_df_review"), + key_name = "ANTHROPIC_API_KEY", + chunk_size = 2, + concurrent_requests = 1, + output_dir = temp_dir + ) + ) |> suppressMessages() + } + ) + + expect_s3_class(result, "data.frame") + expect_equal(nrow(result), 4) + expect_true(all(c("id", ".chunk", ".error", ".error_msg") %in% names(result))) + expect_equal(result$.error, c(FALSE, FALSE, FALSE, FALSE)) + expect_setequal(unique(result$.chunk), c(1, 2)) +}) + + +test_that("ant_complete_df takes a schema as input", { + + sentiment_schema <- create_json_schema( + name = "sentiment_test", + schema = schema_object( + sentiment = schema_enum( + values = c("positive", "negative", "neutral"), + description = "Sentiment classification for the document", + type = "string" + ), + required = list("sentiment"), + additional_properties = FALSE + ) + ) + + review_df 
<- get_review_df() + + endpoint_url <- server$url("/test_ant_complete_df_schema") + + withr::with_envvar( + c("ANTHROPIC_API_KEY" = "test-key"), + { + successful_response <- expect_no_error( + ant_complete_df(review_df, + review_text, + id, + endpoint_url = endpoint_url, + concurrent_requests = 1, + max_retries = 1, + schema = sentiment_schema, + output_dir = NULL + ) + ) + } + ) + + expect_s3_class(successful_response, "data.frame") + expect_equal(nrow(successful_response), 5) + expect_true("content" %in% names(successful_response)) + expect_true(all(grepl("sentiment", successful_response$content))) +}) + + +test_that("ant_complete_df's input validation is working", { + + test_df <- data.frame( + id = c(1, 2), + text = c("positive text", "negative text"), + stringsAsFactors = FALSE + ) + + expect_error( + ant_complete_df(df = "not_a_dataframe", text_var = text, id_var = id, endpoint_url = "url", key_name = "key"), + "df must be a data frame" + ) + + expect_error( + ant_complete_df(df = data.frame(), text_var = text, id_var = id, endpoint_url = "url", key_name = "key"), + "df must not be empty" + ) + + expect_error( + ant_complete_df(df = test_df, text_var = text, id_var = id, chunk_size = "text", endpoint_url = "url", key_name = "key"), + "`chunk_size` must be a positive integer" + ) + + expect_error( + ant_complete_df(df = test_df, text_var = text, id_var = id, chunk_size = NULL, endpoint_url = "url", key_name = "key"), + "`chunk_size` must be a positive integer" + ) })