From c91ebecf549b3e1cbae54453bf2b63291ff140e2 Mon Sep 17 00:00:00 2001
From: jpcompartir
Date: Wed, 3 Dec 2025 11:12:52 +0000
Subject: [PATCH 01/56] fix the broken link (.Rmd not .html) in the README for
 structured outputs build readme md from Rmd

---
 README.Rmd | 2 +-
 README.md  | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/README.Rmd b/README.Rmd
index 22071ee..b8c1c0e 100644
--- a/README.Rmd
+++ b/README.Rmd
@@ -267,7 +267,7 @@ metadata$endpoint_url

 **Note:** Add output directories to `.gitignore` to avoid committing API responses and metadata.

-Read the [LLM Providers Vignette](articles/llm_providers.html), and the [Structured Outputs Vignette](vignettes/structured_outputs_json_schema.Rmd) for more information on common workflows with the OpenAI Chat Completions API [^1]
+Read the [LLM Providers Vignette](articles/llm_providers.html), and the [Structured Outputs Vignette](vignettes/structured_outputs_json_schema.html) for more information on common workflows with the OpenAI Chat Completions API [^1]

 [^1]: Content pending implementation for Anthropic Messages API, Gemini API, and OpenAI Responses API

diff --git a/README.md b/README.md
index e54bd32..f641d81 100644
--- a/README.md
+++ b/README.md
@@ -275,7 +275,7 @@ responses and metadata.

 Read the [LLM Providers Vignette](articles/llm_providers.html), and
 the [Structured Outputs
-Vignette](vignettes/structured_outputs_json_schema.Rmd) for more
+Vignette](vignettes/structured_outputs_json_schema.html) for more
 information on common workflows with the OpenAI Chat Completions API
 [^1]

From 5fc5249eae0c17fae1835578d381165e1b55d4f9 Mon Sep 17 00:00:00 2001
From: jpcompartir
Date: Wed, 3 Dec 2025 12:36:08 +0000
Subject: [PATCH 02/56] Start building Anthropic integration

---
 R/anthropic_messages.R                   | 58 ++++++++++++++++++++++++
 tests/testthat/test-anthropic_messages.R | 13 ++++++
 2 files changed, 71 insertions(+)
 create mode 100644 R/anthropic_messages.R
 create mode 100644 tests/testthat/test-anthropic_messages.R

diff --git a/R/anthropic_messages.R b/R/anthropic_messages.R
new file mode 100644
index 0000000..1602c6f
--- /dev/null
+++ b/R/anthropic_messages.R
@@ -0,0 +1,58 @@
# constants ----
.ANT_API_VERSION <- "2023-06-01"
.ANT_STRUCTURED_OUTPUTS_BETA <- "structured-outputs-2025-11-13"
.ANT_MESSAGES_ENDPOINT <- "https://api.anthropic.com/v1/messages"
.ANT_DEFAULT_MODEL <- "claude-haiku-4-5"

ant_build_messages_request <- function(
    input,
    endpointr_id = NULL,
    model = .ANT_DEFAULT_MODEL,
    temperature = 0,
    max_tokens = 500L,
    schema = NULL,
    system_prompt = NULL,
    key_name = "ANTHROPIC_API_KEY",
    endpoint_url = .ANT_MESSAGES_ENDPOINT,
    timeout = 30L,
    max_retries = 5L
  ) {
  # can't use `base_request()` from core.R because Anthropic uses different auth (x-api-key), so we add it as a header

  stopifnot(
    "input must be a non-empty character string" = is.character(input) && length(input) == 1 && nchar(input) > 0,
    "model must be a character string" = is.character(model) && length(model) == 1,
    "temperature must be numeric between 0 and 1" = is.numeric(temperature) && temperature >= 0 && temperature <= 1, # diff to OAI API
    "max_tokens must be a positive integer" = is.numeric(max_tokens) && max_tokens > 0)

  use_structured_outputs <- FALSE # flag for later control flow

  api_key <- get_api_key(key_name)

  messages <- list(
    list(role = "user", content = input)
  )

  body <- list(
    model = model,
    messages = messages,
    max_tokens = as.integer(max_tokens),
    temperature = temperature
  )

  # 
Anthropic API takes system_prompt as its own parameter, different to OAI where we concatenate + + if(!is.null(system_prompt)){ + if (!rlang::is_scalar_character(system_prompt)){ + cli::cli_abort("{.arg system_prompt} must be a {.cls character} of length 1, e.g. 'This is a valid system prompt'") + } + } + + + return( + list( + messages = messages, + body = body + ) + ) +} diff --git a/tests/testthat/test-anthropic_messages.R b/tests/testthat/test-anthropic_messages.R new file mode 100644 index 0000000..c0bc39b --- /dev/null +++ b/tests/testthat/test-anthropic_messages.R @@ -0,0 +1,13 @@ +test_that("Building messages requests works", { + + expect_error( + ant_build_messages_request(input = c("Vector", "input")), + "input must be a non-empty character string" + ) + + expect_error( + ant_build_messages_request(input = "User stuff", system_prompt = c("Vector", "Prompt")), "must be a " + ) + + +}) From 3f30251dc97aca5b5f15e060cc2440b5f2c7d3e3 Mon Sep 17 00:00:00 2001 From: jpcompartir Date: Wed, 3 Dec 2025 13:06:04 +0000 Subject: [PATCH 03/56] add ant schema conversion - make sure types are as expected for API add Jimbo's batch API quarto to dev_docs --- R/anthropic_messages.R | 17 + dev_docs/jimbo_oai_batch_api.qmd | 630 +++++++++++++++++++++++++++++++ 2 files changed, 647 insertions(+) create mode 100644 dev_docs/jimbo_oai_batch_api.qmd diff --git a/R/anthropic_messages.R b/R/anthropic_messages.R index 1602c6f..6eda9b5 100644 --- a/R/anthropic_messages.R +++ b/R/anthropic_messages.R @@ -56,3 +56,20 @@ ant_build_messages_request <- function( ) ) } + + + +#' Convert json_schema S7 object to Anthropic output_format structure +#' @keywords internal +.ant_format_schema <- function(schema) { + if (!inherits(schema, "EndpointR::json_schema")) { + cli::cli_abort("schema must be a json_schema object") + } + + # Anthropic uses output_format with type "json_schema" + # The schema goes directly in the "schema" field (not nested like OpenAI) + list( + type = "json_schema", + schema = schema@schema + ) +} diff --git a/dev_docs/jimbo_oai_batch_api.qmd b/dev_docs/jimbo_oai_batch_api.qmd new file mode 100644 index 0000000..6dfe0c2 --- /dev/null +++ b/dev_docs/jimbo_oai_batch_api.qmd @@ -0,0 +1,630 @@ +--- +title: "batch_api" +format: html +--- + +```{python} +import json +from openai import OpenAI +import pandas as pd +from IPython.display import Image, display +``` + +```{r} +reticulate::repl_python() + +OPENAI_API_KEY <- Sys.getenv("OPENAI_API_KEY") +``` + +```{python} +py_OPENAI_API_KEY = r.OPENAI_API_KEY + +client = OpenAI() +``` + +Loading data + +```{python} +dataset_path = "data/batch_trial.csv" + +df = pd.read_csv(dataset_path) +df.head() +``` + +Processing step + +```{python} +categorize_system_prompt = ''' +You are an internal tool that classifies online posts. + +Classify as TRUE only if the post explicitly discusses exercise as a deliberate activity AND explicitly states the REASON behind their exercise behaviour: +- A specific motivation/driver for exercising: WHY do they exercise? (e.g., losing weight, health problems, enjoying beautiful environments, dog needs walks, saving money on transport, mental health benefits, social connection) +- A specific barrier preventing exercise: WHAT stops them? (e.g., lack of facilities, weather, injury, cost, time constraints, caregiving responsibilities, safety concerns) + +Exercise includes: organised activities (gym, running, yoga, sports) or incidental physical activity performed regularly (walking for transport, active caregiving, dog walking). 
+ +Classify as FALSE if: +- Exercise verbs describe ordinary movement, not deliberate exercise +- Exercise is tangential or metaphorical +- The post is about someone else's experience, not the author's own +- Exercise is mentioned but NO specific reason is given +- The post shows enthusiasm for exercise but doesn't explain the underlying driver (e.g., "I love working out!", "Can't wait for gym time") +- The post discusses exercise planning, routines, or methods without explaining WHY they exercise or WHAT prevents them +- Any connection to exercise requires inference or multiple logical steps + +The motivation or barrier must be directly stated in relation to exercise - do not infer connections. + +Answer TRUE or FALSE, then provide a brief rationale. + +Examples: + +"I work in Seattle this week right next to this cool ass hotel. I love older architecture. I could walk around a city looking at cool buildings admiring them all day✨" - TRUE - Motivation to walk around an aesthetically pleasing environment + +"Fine, but I still want to learn to walk before I try to run, okay?" - FALSE - Metaphorical use of exercise verbs, not about actual exercise + +"Dentist will prescribe them but not walk in centre. I could see the emergency dentist but tbh I can't drive on these levels of codeine." - FALSE - "Walk in centre" is a facility type, not about exercise + +"After my spinal fusion I had to beg and cry for anything. I was a few days in with little to no sleep, I told them if I don't get something to help so I could sleep, I'd be walking upstairs to psych." - FALSE - "Walking upstairs" describes ordinary movement in a hospital, not exercise; no motivation or barrier to exercise is discussed + +"I can't afford a gym membership and it's too dark after work to run outside safely" - TRUE - Explicit barriers to exercise (cost and safety concerns) + +"Just walking to pick the kids up from school is my daily workout now" - TRUE - Walking as deliberate exercise with implicit motivation (combining caregiving with activity) + +"I want your workout split and I want to follow it like the Bible" - FALSE - Person engaged in exercise, but no information as to why they exercise or what prevents them + +"I ran for 4 hours today and my legs are killing me but I'm going to keep walking" - FALSE - Describes doing exercise with dedication but doesn't explain WHY they exercise or what motivates them to continue +''' + +exercise_schema = { + "type": "json_schema", + "json_schema": { + "name": "simple_schema", + "schema": { + "type": "object", + "properties": { + "exercise": { + "type": "boolean", + "description": "contains reference to motivation towards exercise or barriers faced stopping exercise" + }, + "reason": { + "type": "string", + "description": "brief rationale behind classification" + } + }, + "additionalProperties": False, + "required": [ + "exercise", + "reason" + ] + }, + "strict": True + } +} + +def get_categories(description): + response = client.chat.completions.create( + model="gpt-4.1-mini", + temperature=0.0, + # This is to enable JSON mode, making sure responses are valid json objects + response_format= exercise_schema, + messages=[ + { + "role": "system", + "content": categorize_system_prompt + }, + { + "role": "user", + "content": description + } + ], + ) + + return response.choices[0].message.content + +## test on a few examples +for _, row in df[:5].iterrows(): + description = row['text'] + result = get_categories(description) + print(f"TEXT: {description}\n\nRESULT: {result}") + 
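    # NB: because response_format is a json_schema, `result` is a JSON string;
    # parse it with json.loads(result) if you want a dict rather than text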
print("\n\n----------------------------\n\n") +``` + +Create batch task: + +```{python} +# Creating an array of json tasks + +tasks = [] + +for index, row in df.iterrows(): + + description = row['text'] + + task = { + "custom_id": str(row['radarly_id']), + "method": "POST", + "url": "/v1/chat/completions", + "body": { + # This is what you would have in your Chat Completions API call + "model": "gpt-4.1-mini", + "temperature": 0.0, + "response_format": exercise_schema, + "messages": [ + { + "role": "system", + "content": categorize_system_prompt + }, + { + "role": "user", + "content": description + } + ], + } + } + + tasks.append(task) + +tasks +``` + +Note: the request ID should be unique per batch. This is what you can use to match results to the initial input files, as requests will not be returned in the same order. + +```{python} +# Creating the file + +file_name = "data/batch_tasks_exercise.jsonl" + +with open(file_name, 'w') as file: + for obj in tasks: + file.write(json.dumps(obj) + '\n') +``` + +Uploading file + +```{python} +batch_file = client.files.create( + file=open(file_name, "rb"), + purpose="batch" +) + +print(batch_file) + +``` + +Create batch job + +```{python} +batch_job = client.batches.create( + input_file_id=batch_file.id, + endpoint="/v1/chat/completions", + completion_window="24h" +) +``` + +Checking batch status Note: this can take up to 24h, but it will usually be completed faster. + +You can continue checking until the status is 'completed'. + +```{python} +batch_job = client.batches.retrieve(batch_job.id) +print(batch_job) +``` + +Retrieving results + +```{python} +result_file_id = batch_job.output_file_id +result = client.files.content(result_file_id).content +``` + +```{python} +result_file_name = "data/batch_job_results.jsonl" + +with open(result_file_name, 'wb') as file: + file.write(result) +``` + +```{python} +# Loading data from saved file +results = [] +with open(result_file_name, 'r') as file: + for line in file: + # Parsing the JSON string into a dict and appending to the list of results + json_object = json.loads(line.strip()) + results.append(json_object) +``` + +```{python} +# Reading only the first results +for res in results[:5]: + task_id = res['custom_id'] + result = res['response']['body']['choices'][0]['message']['content'] + row = df[df['radarly_id'] == task_id].iloc[0] + description = row['text'] + radarly_id = row['radarly_id'] + print(f"RADARLY ID: {radarly_id}\nTEXT: {description}\n\nRESULT: {result}") + print("\n\n----------------------------\n\n") +``` + +Okay now I am going to try this on a sample of 10k different posts: + +```{python} +dataset_path = "data/batch_trial_v3.csv" +dataset_path = "data/batch_trial_v4.csv" +dataset_path = "data/lr_labelled_batch.csv" +dataset_path = "data/lr_labelled_batch_2.csv" + +df = pd.read_csv(dataset_path) +df.head() +len(df) +``` + +```{python} +categorize_system_prompt = ''' +You are an internal tool that classifies online posts relating to exercise. + +For each post, provide THREE classifications: + +1. Exercise: Does the post mention exercise in any form? (TRUE/FALSE) + - Include: organised activities (gym, running, yoga, sports, workout) or incidental physical activity performed regularly (walking for transport, active caregiving, dog walking) + - Exclude: ordinary movement, metaphorical uses, tangential mentions + +2. Motivation: Does the post explicitly state WHY they exercise or want to exercise? (TRUE/FALSE) + - Must explicitly state a reason: "because...", "so that...", "for my...", "to..." 
+ - Examples: losing weight, health problems, enjoying beautiful environments, dog needs walks, saving money, mental health benefits, social connection + - FALSE if: shows enthusiasm but no specific reason, or requires inference + +3. Barrier: Does the post explicitly state WHAT prevents or hinders them from exercising? (TRUE/FALSE) + - Must explicitly state an obstacle or challenge + - Examples: lack of facilities, weather, injury, cost, time constraints, caregiving responsibilities, safety concerns + - FALSE if requires inference + + +IMPORTANT: +- The post must be about the author's OWN experience (not someone else's) +- Motivations and barriers must be DIRECTLY STATED, not inferred +- "Implies motivation" or "suggests barrier" is NOT sufficient +- Simply doing exercise != stating motivation + +Output format: +Exercise: TRUE/FALSE +Motivation: TRUE/FALSE +Barrier: TRUE/FALSE + +Examples: + +"I work in Seattle this week right next to this cool ass hotel. I love older architecture. I could walk around a city looking at cool buildings admiring them all day✨" +Exercise: TRUE +Motivation: TRUE +Barrier: FALSE + +"Fine, but I still want to learn to walk before I try to run, okay?" +Exercise: FALSE +Motivation: FALSE +Barrier: FALSE + +"Dentist will prescribe them but not walk in centre. I could see the emergency dentist but tbh I can't drive on these levels of codeine." +Exercise: FALSE +Motivation: FALSE +Barrier: FALSE + +"After my spinal fusion I had to beg and cry for anything. I was a few days in with little to no sleep, I told them if I don't get something to help so I could sleep, I'd be walking upstairs to psych." +Exercise: FALSE +Motivation: FALSE +Barrier: FALSE + +"I can't afford a gym membership and it's too dark after work to run outside safely" +Exercise: TRUE +Motivation: FALSE +Barrier: TRUE + +"Just walking to pick the kids up from school is my daily workout now" +Exercise: TRUE +Motivation: TRUE +Barrier: FALSE + +"I want your workout split and I want to follow it like the Bible" +Exercise: TRUE +Motivation: FALSE +Barrier: FALSE + +"I ran for 4 hours today and my legs are killing me but I'm going to keep walking" +Exercise: TRUE +Motivation: FALSE +Barrier: FALSE +''' + +exercise_schema = { + "type": "json_schema", + "json_schema": { + "name": "simple_schema", + "schema": { + "type": "object", + "properties": { + "exercise": { + "type": "boolean", + "description": "contains reference to exercise" + }, + "motivation": { + "type": "boolean", + "description": "contains reference to motivation towards exercise " + }, + "barrier": { + "type": "boolean", + "description": "contains reference to barriers faced stopping exercise" + } + }, + "additionalProperties": False, + "required": [ + "exercise", + "motivation", + "barrier" + ] + }, + "strict": True + } +} +``` + +```{python} +def get_categories(description): + response = client.chat.completions.create( + model="gpt-4.1-mini", + temperature=0.0, + # This is to enable JSON mode, making sure responses are valid json objects + response_format= exercise_schema, + messages=[ + { + "role": "system", + "content": categorize_system_prompt + }, + { + "role": "user", + "content": description + } + ], + ) + + return response.choices[0].message.content + +## test on a few examples +for _, row in df[:10].iterrows(): + description = row['text'] + result = get_categories(description) + print(f"TEXT: {description}\n\nRESULT: {result}") + print("\n\n----------------------------\n\n") +``` + +Create batch task: + +```{python} +# Creating an 
array of json tasks + +tasks = [] + +for index, row in df.iterrows(): + + description = row['text'] + + task = { + "custom_id": str(row['radarly_id']), + "method": "POST", + "url": "/v1/chat/completions", + "body": { + # This is what you would have in your Chat Completions API call + "model": "gpt-4.1-mini", + "temperature": 0.0, + "response_format": exercise_schema, + "messages": [ + { + "role": "system", + "content": categorize_system_prompt + }, + { + "role": "user", + "content": description + } + ], + } + } + + tasks.append(task) + +# tasks +``` + +Note: the request ID should be unique per batch. This is what you can use to match results to the initial input files, as requests will not be returned in the same order. + +```{python} +# Creating the file + +file_name = "data/batch_tasks_exercise_v2.jsonl" #10k +file_name = "data/batch_tasks_exercise_v3.jsonl" #35k +file_name_2 = "data/batch_tasks_exercise_v4.jsonl" #35k + +file_name_3 = "data/new_data_batch_tasks_exercise_v1.jsonl" #50k +file_name_4 = "data/new_data_batch_tasks_exercise_v2.jsonl" #50k + +with open(file_name, 'w') as file: + for obj in tasks: + file.write(json.dumps(obj) + '\n') + +with open(file_name_2, 'w') as file: + for obj in tasks: + file.write(json.dumps(obj) + '\n') + + +with open(file_name_3, 'w') as file: + for obj in tasks: + file.write(json.dumps(obj) + '\n') + + +with open(file_name_4, 'w') as file: + for obj in tasks: + file.write(json.dumps(obj) + '\n') +``` + +Uploading file + +```{python} +batch_file = client.files.create( + file=open(file_name, "rb"), + purpose="batch" +) + +print(batch_file) + +batch_file_2 = client.files.create( + file=open(file_name_2, "rb"), + purpose="batch" +) + +print(batch_file_2) + +batch_file_3 = client.files.create( + file=open(file_name_3, "rb"), + purpose="batch" +) + +print(batch_file_3) + +batch_file_4 = client.files.create( + file=open(file_name_4, "rb"), + purpose="batch" +) + +print(batch_file_4) + +``` + +Create batch job + +```{python} +batch_job = client.batches.create( + input_file_id=batch_file.id, + endpoint="/v1/chat/completions", + completion_window="24h" +) + +batch_job_2 = client.batches.create( + input_file_id=batch_file_2.id, + endpoint="/v1/chat/completions", + completion_window="24h" +) + +batch_job_3 = client.batches.create( + input_file_id=batch_file_3.id, + endpoint="/v1/chat/completions", + completion_window="24h" +) + +batch_job_4 = client.batches.create( + input_file_id=batch_file_4.id, + endpoint="/v1/chat/completions", + completion_window="24h" +) +``` + +Checking batch status Note: this can take up to 24h, but it will usually be completed faster. + +You can continue checking until the status is 'completed'. 
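
A minimal polling sketch for this step (the helper name and the 60-second interval are our own choices, not part of the OpenAI client):

```{python}
import time

def wait_for_batch(job_id, poll_seconds=60):
    # re-retrieve the batch job until it reaches a terminal status
    terminal = {"completed", "failed", "expired", "cancelled"}
    while True:
        job = client.batches.retrieve(job_id)
        if job.status in terminal:
            return job
        time.sleep(poll_seconds)

# e.g. batch_job = wait_for_batch(batch_job.id)
```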
+ +```{python} +batch_job = client.batches.retrieve(batch_job.id) +print(batch_job) +batch_job.status +batch_job.request_counts + +batch_job_2 = client.batches.retrieve(batch_job_2.id) +print(batch_job_2) +batch_job_2.status +batch_job_2.request_counts + +batch_job_3 = client.batches.retrieve(batch_job_3.id) +print(batch_job_3) +batch_job_3.status +batch_job_3.request_counts + +batch_job_4 = client.batches.retrieve(batch_job_4.id) +print(batch_job_4) +batch_job_4.status +batch_job_4.request_counts +``` + +Retrieving results + +```{python} +result_file_id = batch_job.output_file_id +result = client.files.content(result_file_id).content + +result_file_id_2 = batch_job_2.output_file_id +result_2 = client.files.content(result_file_id_2).content + +result_file_id_3 = batch_job_3.output_file_id +result_3 = client.files.content(result_file_id_3).content + +result_file_id_4 = batch_job_4.output_file_id +result_4 = client.files.content(result_file_id_4).content +``` + +```{python} +result_file_name_2 = "data/batch_job_results_v3.jsonl" + +with open(result_file_name_2, 'wb') as file: + file.write(result) + +result_file_name_3 = "data/batch_job_results_v4.jsonl" + +with open(result_file_name_3, 'wb') as file: + file.write(result_2) + + +result_file_name_4 = "data/batch_job_results_v6.jsonl" + +with open(result_file_name_4, 'wb') as file: + file.write(result_3) + +result_file_name_5 = "data/batch_job_results_v5.jsonl" + +with open(result_file_name_5, 'wb') as file: + file.write(result_4) +``` + +```{python} +# Loading data from saved file +results = [] +with open(result_file_name_2, 'r') as file: + for line in file: + # Parsing the JSON string into a dict and appending to the list of results + json_object = json.loads(line.strip()) + results.append(json_object) + +results_2 = [] +with open(result_file_name_3, 'r') as file: + for line in file: + # Parsing the JSON string into a dict and appending to the list of results + json_object = json.loads(line.strip()) + results_2.append(json_object) +``` + +```{python} +# Reading only the first results +for res in results[:5]: + task_id = res['custom_id'] + result = res['response']['body']['choices'][0]['message']['content'] + row = df[df['radarly_id'] == task_id].iloc[0] + description = row['text'] + radarly_id = row['radarly_id'] + print(f"RADARLY ID: {radarly_id}\nTEXT: {description}\n\nRESULT: {result}") + print("\n\n----------------------------\n\n") + +# Reading only the first results +for res in results_2[:5]: + task_id = res['custom_id'] + result = res['response']['body']['choices'][0]['message']['content'] + row = df[df['radarly_id'] == task_id].iloc[0] + description = row['text'] + radarly_id = row['radarly_id'] + print(f"RADARLY ID: {radarly_id}\nTEXT: {description}\n\nRESULT: {result}") + print("\n\n----------------------------\n\n") +``` From f99c5d290e4e18d29a17da41e6ee640815cf91dd Mon Sep 17 00:00:00 2001 From: jpcompartir Date: Wed, 3 Dec 2025 13:14:02 +0000 Subject: [PATCH 04/56] add schema section to ant_build_messages_request --- R/anthropic_messages.R | 48 ++++++++++++++++++++++++++++++++++++------ 1 file changed, 42 insertions(+), 6 deletions(-) diff --git a/R/anthropic_messages.R b/R/anthropic_messages.R index 6eda9b5..6b39fa9 100644 --- a/R/anthropic_messages.R +++ b/R/anthropic_messages.R @@ -48,17 +48,53 @@ ant_build_messages_request <- function( } } + # + if(!is.null(schema)) { + use_structured_outputs <- TRUE + if (inherits(schema, "EndpointR::json_schema")) { + body$output_format <- .ant_format_schema(schema) + } else if 
(is.list(schema)) {
      cli::cli_alert_warning("Your {.arg schema} is a list, not an EndpointR json_schema")
      body$output_format <- schema
    } else {
      cli::cli_abort("{.arg schema} must be an EndpointR json_schema object or a list")
    }
  }

  # build the request with headers, auth, timeout, retries, backoff (incl. system prompt if applicable)
  request <- httr2::request(endpoint_url) |>
    httr2::req_user_agent("EndpointR") |>
    httr2::req_method("POST") |>
    httr2::req_headers(
      "Content-Type" = "application/json",
      "x-api-key" = api_key,
      "anthropic-version" = .ANT_API_VERSION
    ) |>
    httr2::req_timeout(timeout) |>
    httr2::req_retry(
      max_tries = max_retries,
      backoff = ~ 2 ^ .x,
      retry_on_failure = TRUE
    ) |>
    httr2::req_body_json(body)

  # if we did use structured outputs then we need to add the anthropic-beta header (this will be patched at some point I expect)

  if (use_structured_outputs) {
    request <- httr2::req_headers(request, "anthropic-beta" = .ANT_STRUCTURED_OUTPUTS_BETA)
  }

  if (!is.null(endpointr_id)) {
    request <- httr2::req_headers(request, endpointr_id = endpointr_id)
  }

  return(request)
}

From 66e77821183f292776e98e72f790f211e844ee44 Mon Sep 17 00:00:00 2001
From: jpcompartir
Date: Wed, 3 Dec 2025 16:02:54 +0000
Subject: [PATCH 05/56] add roxygen2 docs for ant_build_messages_request update
 tests export ant_build_messages_request add schema tests for
 ant_build_messages_request

---
 NAMESPACE                                |   1 +
 R/anthropic_messages.R                   |  62 +++++++++++++-
 dev_docs/ant_messages.qmd                | 102 +++++++++++++++++++++++
 man/ant_build_messages_request.Rd        |  90 ++++++++++++++++++++
 man/dot-ant_format_schema.Rd             |  12 +++
 tests/testthat/test-anthropic_messages.R |  39 ++++++++-
 6 files changed, 303 insertions(+), 3 deletions(-)
 create mode 100644 dev_docs/ant_messages.qmd
 create mode 100644 man/ant_build_messages_request.Rd
 create mode 100644 man/dot-ant_format_schema.Rd

diff --git a/NAMESPACE b/NAMESPACE
index 6346a46..c8bb882 100644
--- a/NAMESPACE
+++ b/NAMESPACE
@@ -1,5 +1,6 @@
 # Generated by roxygen2: do not edit by hand

+export(ant_build_messages_request)
 export(create_json_schema)
 export(get_api_key)
 export(hf_build_request)

diff --git a/R/anthropic_messages.R b/R/anthropic_messages.R
index 6b39fa9..adf0bdf 100644
--- a/R/anthropic_messages.R
+++ b/R/anthropic_messages.R
@@ -4,6 +4,65 @@
 .ANT_MESSAGES_ENDPOINT <- "https://api.anthropic.com/v1/messages"
 .ANT_DEFAULT_MODEL <- "claude-haiku-4-5"

#' Build an Anthropic Messages API request
#'
#' @description
#' Constructs an httr2 request object for Anthropic's Messages API.
#' Handles message formatting, system prompts, and optional JSON schema
#' for structured outputs.
#'
#' @details
#' This function creates the HTTP request but does not execute it. For
#' structured outputs, you must use a supported model (Claude Sonnet 4.5
#' or Opus 4.1) and the request will automatically include the required
#' beta header.
#'
#' The `schema` parameter accepts either:
#' - A `json_schema` S7 object created with `create_json_schema()`
#' - A raw list in Anthropic's `output_format` structure
#'
#' Unlike OpenAI, Anthropic uses `output_format` (not `response_format`)
#' and the schema structure differs slightly.
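#'
#' As a rough sketch (reusing the country/capital fields from the example
#' below), a raw list in Anthropic's shape looks like:
#' \preformatted{
#' list(
#'   type = "json_schema",
#'   schema = list(
#'     type = "object",
#'     properties = list(
#'       country = list(type = "string"),
#'       capital = list(type = "string")
#'     ),
#'     required = c("country", "capital"),
#'     additionalProperties = FALSE
#'   )
#' )
#' }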
+#' +#' @param input Text input to send to the model +#' @param endpointr_id An id that will persist through to response +#' @param model Anthropic model to use (default: "claude-haiku-4.5") +#' @param temperature Sampling temperature (0-2), higher values = more randomness +#' @param max_tokens Maximum tokens in response +#' @param schema Optional JSON schema for structured output (json_schema object or list) +#' @param system_prompt Optional system prompt +#' @param key_name Environment variable name for API key +#' @param endpoint_url Anthropic API endpoint URL +#' @param timeout Request timeout in seconds +#' @param max_retries Maximum number of retry attempts for failed requests + +#' +#' @return An httr2 request object +#' @export +#' +#' @examples +#' \dontrun{ +#' # simple request +#' req <- ant_build_messages_request( +#' input = "What is the capital of France?", +#' max_tokens = 100 +#' ) +#' +#' # with structured output +#' schema <- create_json_schema( +#' name = "capital_response", +#' schema = schema_object( +#' country = schema_string(), +#' capital = schema_string(), +#' required = c("country", "capital") +#' ) +#' ) +#' req <- ant_build_messages_request( +#' input = "What is the capital of France?", +#' schema = schema, +#' max_tokens = 100 +#' ) +#' } ant_build_messages_request <- function( input, endpointr_id = NULL, @@ -93,8 +152,6 @@ ant_build_messages_request <- function( - - #' Convert json_schema S7 object to Anthropic output_format structure #' @keywords internal .ant_format_schema <- function(schema) { @@ -109,3 +166,4 @@ ant_build_messages_request <- function( schema = schema@schema ) } + diff --git a/dev_docs/ant_messages.qmd b/dev_docs/ant_messages.qmd new file mode 100644 index 0000000..389ef1d --- /dev/null +++ b/dev_docs/ant_messages.qmd @@ -0,0 +1,102 @@ +--- +title: "ant_messages" +format: html +--- + +# Structured Outputs + +HTTP 400 errors + +```{r} +library(httr2) +library(jsonlite) + +req <- ant_build_messages_request(input = "This is terrible, fin.") + +resp <- req_perform(req, verbosity = 1) +resp |> resp_body_string() |> prettify() + + +schema <- create_json_schema( + name = "capital_response", + schema = schema_object( + country = schema_string(), + capital = schema_string(), + required = c("country", "capital") + ) +) + +structured_req <- ant_build_messages_request(input = "Well I would walk five hundred miles, to visit the noble city of Prague.", schema = schema, model = "claude-sonnet-4-5") + +structured_req |> req_dry_run() |> toJSON() |> jsonlite::flatten() + +structured_resp <- req_perform(structured_req) + +ant_schema <- .ant_format_schema(schema) +ant_structured_req <- ant_build_messages_request(input = "Well I would walk five hundred miles, to visit the noble city of Prague.", schema = ant_schema) + +setdiff(ant_structured_req |> req_dry_run(), structured_req |> req_dry_run()) + +ant_structured_resp <- req_perform(ant_structured_req) + +struc_dry <- req_dry_run(structured_req) +ant_dry <- req_dry_run(ant_structured_req) + +setdiff(struc_dry, ant_dry) +identical(struc_dry, ant_dry) + +``` + +``` +"output_format": { + "type": "json_schema", + "schema": { + "type": "object", + "properties": { + "name": {"type": "string"}, + "email": {"type": "string"}, + "plan_interest": {"type": "string"}, + "demo_requested": {"type": "boolean"} + }, + "required": ["name", "email", "plan_interest", "demo_requested"], + "additionalProperties": false + } +} +``` + +```{r} +list_schema <- list( + type = "json_schema", + schema = list( + type = "object", + 
      properties = list(
        country = list(type = "string"),
        capital = list(type = "string")
      ),
      required = c("country", "capital"),
      additionalProperties = FALSE
  )
)

```

```{r}
list_schema_req <- ant_build_messages_request("Prague is my favourite city, I went there when I visited The Czech Republic", schema = list_schema, model = "claude-sonnet-4-5")
list_schema_req |> req_dry_run()
list_schema_resp <- req_perform(list_schema_req, verbosity = 1)
```

If we want to surface the actual errors we'll need to re-write a bunch of the package. Thought about this earlier but wasn't sure. The structured outputs error with Haiku makes me think it's probably worth it.

```{r}
req_schema_haiku <- ant_build_messages_request("Prague is my favourite city, I went there when I visited The Czech Republic", schema = list_schema)

req_schema_haiku <- req_schema_haiku |>
  req_error(is_error = ~FALSE)

resp <- req_schema_haiku |>
  req_perform(verbosity = 1)

resp_body_json(resp)[["error"]][["message"]]
resp |> resp_status()
```

diff --git a/man/ant_build_messages_request.Rd b/man/ant_build_messages_request.Rd
new file mode 100644
index 0000000..b5caaba
--- /dev/null
+++ b/man/ant_build_messages_request.Rd
@@ -0,0 +1,90 @@
% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/anthropic_messages.R
\name{ant_build_messages_request}
\alias{ant_build_messages_request}
\title{Build an Anthropic Messages API request}
\usage{
ant_build_messages_request(
  input,
  endpointr_id = NULL,
  model = .ANT_DEFAULT_MODEL,
  temperature = 0,
  max_tokens = 500L,
  schema = NULL,
  system_prompt = NULL,
  key_name = "ANTHROPIC_API_KEY",
  endpoint_url = .ANT_MESSAGES_ENDPOINT,
  timeout = 30L,
  max_retries = 5L
)
}
\arguments{
\item{input}{Text input to send to the model}

\item{endpointr_id}{An id that will persist through to response}

\item{model}{Anthropic model to use (default: "claude-haiku-4-5")}

\item{temperature}{Sampling temperature (0-1), higher values = more randomness}

\item{max_tokens}{Maximum tokens in response}

\item{schema}{Optional JSON schema for structured output (json_schema object or list)}

\item{system_prompt}{Optional system prompt}

\item{key_name}{Environment variable name for API key}

\item{endpoint_url}{Anthropic API endpoint URL}

\item{timeout}{Request timeout in seconds}

\item{max_retries}{Maximum number of retry attempts for failed requests}
}
\value{
An httr2 request object
}
\description{
Constructs an httr2 request object for Anthropic's Messages API.
Handles message formatting, system prompts, and optional JSON schema
for structured outputs.
}
\details{
This function creates the HTTP request but does not execute it. For
structured outputs, you must use a supported model (Claude Sonnet 4.5
or Opus 4.1) and the request will automatically include the required
beta header.

The \code{schema} parameter accepts either:
\itemize{
\item A \code{json_schema} S7 object created with \code{create_json_schema()}
\item A raw list in Anthropic's \code{output_format} structure
}

Unlike OpenAI, Anthropic uses \code{output_format} (not \code{response_format})
and the schema structure differs slightly.
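
As a rough sketch (the same country/capital example as above), a raw list
in Anthropic's shape looks like:
\preformatted{
list(
  type = "json_schema",
  schema = list(
    type = "object",
    properties = list(
      country = list(type = "string"),
      capital = list(type = "string")
    ),
    required = c("country", "capital"),
    additionalProperties = FALSE
  )
)
}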
+} +\examples{ +\dontrun{ + # simple request + req <- ant_build_messages_request( + input = "What is the capital of France?", + max_tokens = 100 + ) + + # with structured output + schema <- create_json_schema( + name = "capital_response", + schema = schema_object( + country = schema_string(), + capital = schema_string(), + required = c("country", "capital") + ) + ) + req <- ant_build_messages_request( + input = "What is the capital of France?", + schema = schema, + max_tokens = 100 + ) +} +} diff --git a/man/dot-ant_format_schema.Rd b/man/dot-ant_format_schema.Rd new file mode 100644 index 0000000..23e0ace --- /dev/null +++ b/man/dot-ant_format_schema.Rd @@ -0,0 +1,12 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/anthropic_messages.R +\name{.ant_format_schema} +\alias{.ant_format_schema} +\title{Convert json_schema S7 object to Anthropic output_format structure} +\usage{ +.ant_format_schema(schema) +} +\description{ +Convert json_schema S7 object to Anthropic output_format structure +} +\keyword{internal} diff --git a/tests/testthat/test-anthropic_messages.R b/tests/testthat/test-anthropic_messages.R index c0bc39b..fef33a2 100644 --- a/tests/testthat/test-anthropic_messages.R +++ b/tests/testthat/test-anthropic_messages.R @@ -1,4 +1,4 @@ -test_that("Building messages requests works", { +test_that("Building Anthropic Messages Requests works with arguments and features", { expect_error( ant_build_messages_request(input = c("Vector", "input")), @@ -9,5 +9,42 @@ test_that("Building messages requests works", { ant_build_messages_request(input = "User stuff", system_prompt = c("Vector", "Prompt")), "must be a " ) + req <- expect_no_error( + ant_build_messages_request(input = "Test Input Alone") + ) + + expect_equal(req$headers$`Content-Type`, "application/json") + expect_equal(req$headers$`anthropic-version`, "2023-06-01") + expect_equal(req$body$data$messages[[1]][["content"]], "Test Input Alone") + + +}) + + +test_that("Anthropic Messages Requests with schemas look right", { + sentiment_schema <- create_json_schema( + name = "sent_schema", + schema = schema_object( + sentiment = schema_enum(values = c("positive", "negative", "neutral")), + required = list("sentiment"), + additional_properties = FALSE + ) + ) + + req_schema <-ant_build_messages_request( + "the UX of tensorflow was vastly inferior to Pytorch, hence the latter's dominance", + schema = sentiment_schema, + model = "claude-sonnet-4-5") + + + schema_data <- req_schema$body$data$output_format + expect_equal(schema_data$type, "json_schema") + + expect_equal(names(schema_data$schema$properties), "sentiment") + expect_equal(req_schema$headers$`anthropic-beta`, "structured-outputs-2025-11-13") + }) + + + From eff7ca24fac0fe5ff483541b490800a32a329a49 Mon Sep 17 00:00:00 2001 From: jpcompartir Date: Wed, 3 Dec 2025 16:25:49 +0000 Subject: [PATCH 06/56] add sonnet-4-5 instruction to docs for structures outputs (should add this to vignette too) add more tests to ant_build_messages_request --- R/anthropic_messages.R | 9 ++- man/ant_build_messages_request.Rd | 8 ++- tests/testthat/test-anthropic_messages.R | 34 ++++++++- todos.qmd | 92 +++++++++++++++++++++++- 4 files changed, 136 insertions(+), 7 deletions(-) diff --git a/R/anthropic_messages.R b/R/anthropic_messages.R index adf0bdf..d39d628 100644 --- a/R/anthropic_messages.R +++ b/R/anthropic_messages.R @@ -9,7 +9,8 @@ #' @description #' Constructs an httr2 request object for Anthropic's Messages API. 
#' Handles message formatting, system prompts, and optional JSON schema
-#' for structured outputs.
+#' for structured outputs. When using structured outputs you must select a model that supports them.
+#'
 #'
 #' @details
 #' This function creates the HTTP request but does not execute it. For
@@ -40,6 +41,7 @@
 #' @return An httr2 request object
 #' @export
 #'
+#' @seealso \url{https://platform.claude.com/docs/en/build-with-claude/structured-outputs}
 #' @examples
 #' \dontrun{
 #' # simple request
@@ -60,7 +62,8 @@
 #' req <- ant_build_messages_request(
 #'   input = "What is the capital of France?",
 #'   schema = schema,
-#'   max_tokens = 100
+#'   max_tokens = 100,
+#'   model = "claude-sonnet-4-5"
 #' )
 #' }
 ant_build_messages_request <- function(
@@ -105,6 +108,8 @@ ant_build_messages_request <- function(
     if (!rlang::is_scalar_character(system_prompt)){
       cli::cli_abort("{.arg system_prompt} must be a {.cls character} of length 1, e.g. 'This is a valid system prompt'")
     }
+
+    body$system <- system_prompt
   }
 
   #

diff --git a/man/ant_build_messages_request.Rd b/man/ant_build_messages_request.Rd
index b5caaba..c9d1686 100644
--- a/man/ant_build_messages_request.Rd
+++ b/man/ant_build_messages_request.Rd
@@ -47,7 +47,7 @@ An httr2 request object
 \description{
 Constructs an httr2 request object for Anthropic's Messages API.
 Handles message formatting, system prompts, and optional JSON schema
-for structured outputs.
+for structured outputs. When using structured outputs you must select a model that supports them.
 }
 \details{
 This function creates the HTTP request but does not execute it. For
@@ -84,7 +84,11 @@ and the schema structure differs slightly.
 req <- ant_build_messages_request(
     input = "What is the capital of France?",
     schema = schema,
-    max_tokens = 100
+    max_tokens = 100,
+    model = "claude-sonnet-4-5"
   )
 }
 }
+\seealso{
+\url{https://platform.claude.com/docs/en/build-with-claude/structured-outputs}
+}

diff --git a/tests/testthat/test-anthropic_messages.R b/tests/testthat/test-anthropic_messages.R
index fef33a2..bc7fddc 100644
--- a/tests/testthat/test-anthropic_messages.R
+++ b/tests/testthat/test-anthropic_messages.R
@@ -1,4 +1,4 @@
-test_that("Building Anthropic Messages Requests works with arguments and features", {
+test_that("ant_build_messages_request validates inputs and generates valid requests", {
 
   expect_error(
     ant_build_messages_request(input = c("Vector", "input")),
@@ -17,11 +17,30 @@
   expect_equal(req$headers$`anthropic-version`, "2023-06-01")
   expect_equal(req$body$data$messages[[1]][["content"]], "Test Input Alone")
 
  expect_equal(req$url, "https://api.anthropic.com/v1/messages")
  expect_equal(req$method, "POST")
  expect_equal(req$policies$retry_max_tries, 5)
  expect_equal(req$options$timeout_ms, 30000)

  expect_error(
    ant_build_messages_request("hello", temperature = 2),
    "temperature must be numeric between 0 and 1"
  )
})

test_that("ant_build_messages accepts a system_prompt and the request is formatted appropriately", {

  message <- "The 4th king of neverland was not Captain Hook"
  req <- ant_build_messages_request(message)

  expect_null(req$body$data$system)

  req_w_sys <- ant_build_messages_request(message, system_prompt = "Talk about all things Peter Pan only")

  expect_true(!is.null(req_w_sys$body$data$system))
})

-test_that("Anthropic Messages Requests with schemas look right", {
+test_that("ant_build_messages_request accepts schemas and formats properly with .ant_format_schema", {
  sentiment_schema <- create_json_schema(
    name = "sent_schema",
    schema = schema_object(
      sentiment = schema_enum(values = c("positive", "negative", "neutral")),
      required = list("sentiment"),
      additional_properties = FALSE
    )
  )

  req_schema <- ant_build_messages_request(
    "the UX of tensorflow was vastly inferior to Pytorch, hence the latter's dominance",
    schema = sentiment_schema,
    model = "claude-sonnet-4-5")


  schema_data <- req_schema$body$data$output_format
  expect_equal(schema_data$type, "json_schema")

  expect_equal(names(schema_data$schema$properties), "sentiment")
  expect_equal(req_schema$headers$`anthropic-beta`, "structured-outputs-2025-11-13")

})


test_that("ant_build_messages_request accepts endpointr_id and adds to headers", {
  req <- ant_build_messages_request(
    "Hello this a test",
    endpointr_id = "id_101"
  )

  expect_equal(req$headers$endpointr_id, "id_101")
})

diff --git a/todos.qmd b/todos.qmd
index 9fed3d6..b3696ed 100644
--- a/todos.qmd
+++ b/todos.qmd
@@ -1,6 +1,41 @@
 # EndpointR Hugging Face Embeddings Implementation Checklist
 
-# 0.1.2
+# Versions

## 0.2

- [ ] Support for Anthropic API
  - [ ] Batches
  - [ ] Messages (Completions)
  - [ ] Structured Outputs
- [ ] Support for Gemini API
  - [ ] Embeddings
  - [ ] Completions
  - [ ] Structured Outputs
- [ ] LLM Providers Vignette Updated
- [ ] Structured Outputs Vignette Updated

Error reporting is somewhat annoying by default with httr2::req_perform() if we don't:

```
response <- req |>
  httr2::req_error(is_error = ~ FALSE) |>
  httr2::req_perform()

if (httr2::resp_status(response) >= 400) {
  error_body <- tryCatch(
    httr2::resp_body_json(response),
    error = function(e) list(error = list(message = paste("HTTP", httr2::resp_status(response))))
  )

  cli::cli_abort(c(
    "API request failed ({error_body$error$type %||% 'unknown'})",
    "x" = error_body$error$message %||% paste("HTTP", httr2::resp_status(response))
  ))
}
```

## 0.1.2

- [x] tests passing following hf\_\* changes
- [x] max_length for hf functions - not applicable in hf_embed\_\* functions as they use HF's TEI which doesn't allow max_length
@@ -317,3 +352,58 @@ Schema -\> Type relationship? e.g. schema_number has its own method which coerce
- [x] Vignettes with real-world examples
- [x] Vignette re-working/refining based on JH feedback
- [ ] Optional Fields and Schemas

# Anthropic API

## Anthropic API - Messages

Anthropic Version of Completions API [docs](https://platform.claude.com/docs/en/api/overview)

- [ ] build request with headers and auth. They use x-api-key for auth
- [ ] `ant_complete_text()`
- [ ] `ant_complete_chunks()`
- [ ] `ant_complete_df()`
- [ ] Structured outputs [docs](https://platform.claude.com/docs/en/build-with-claude/structured-outputs) uses JSON schema similar to OAI "To use the feature, set the beta header structured-outputs-2025-11-13"

Default request looks like:

```
curl https://api.anthropic.com/v1/messages \
  -H 'Content-Type: application/json' \
  -H "X-Api-Key: $ANTHROPIC_API_KEY" \
  -d '{
    "max_tokens": 1024,
    "messages": [
      {
        "content": "Hello, world",
        "role": "user"
      }
    ],
    "model": "claude-sonnet-4-5-20250929"
  }'
```

Basic 200 response body:

```
{
  "id": "msg_01XFDUDYJgAACzvnptvVoYEL",
  "type": "message",
  "role": "assistant",
  "content": [
    {
      "type": "text",
      "text": "Hello!"
+ } + ], + "model": "claude-sonnet-4-5", + "stop_reason": "end_turn", + "stop_sequence": null, + "usage": { + "input_tokens": 12, + "output_tokens": 6 + } +} +``` + +## Anthropic API - Batches From a9ab70f8c59596fd90efead433de3e7e546a3347 Mon Sep 17 00:00:00 2001 From: jpcompartir Date: Wed, 3 Dec 2025 16:49:12 +0000 Subject: [PATCH 07/56] update chunks and df examples for oai module --- R/openai_completions.R | 88 +++++++++++++++++++++++++++++++++----- man/oai_complete_chunks.Rd | 44 ++++++++++++++++--- man/oai_complete_df.Rd | 39 +++++++++++++++++ 3 files changed, 154 insertions(+), 17 deletions(-) diff --git a/R/openai_completions.R b/R/openai_completions.R index 957d1be..4cfe474 100644 --- a/R/openai_completions.R +++ b/R/openai_completions.R @@ -358,18 +358,47 @@ oai_complete_text <- function(text, #' @export #' @examples #' \dontrun{ -#' # basic usage with automatic file naming: -#' -#' # large-scale processing with custom output file: - -#' #structured extraction with schema: +#' # Basic usage with automatic file naming +#' texts <- c("Great product!", "Awful service.", "Decent value.") +#' ids <- c("review_1", "review_2", "review_3") +#' +#' results <- oai_complete_chunks( +#' texts = texts, +#' ids = ids, +#' system_prompt = "Classify sentiment as positive, negative, or neutral." +#' ) +#' +#' # Large-scale processing with custom output file +#' results <- oai_complete_chunks( +#' texts = large_text_vector, +#' ids = seq_along(large_text_vector), +#' chunk_size = 2000, +#' concurrent_requests = 10, +#' output_file = "data/sentiment_results.csv" +#' ) +#' +#' # Structured extraction with schema +#' sentiment_schema <- create_json_schema( +#' name = "sentiment_analysis", +#' schema = schema_object( +#' sentiment = schema_string("positive, negative, or neutral"), +#' confidence = schema_number("confidence score between 0 and 1"), +#' required = list("sentiment", "confidence") +#' ) +#' ) #' +#' results <- oai_complete_chunks( +#' texts = texts, +#' ids = ids, +#' schema = sentiment_schema, +#' temperature = 0 +#' ) #' -#' # post-process structured results: -#' xx <- xx |> +#' # Post-process structured results +#' results |> #' dplyr::filter(!.error) |> -#' dplyr::mutate(parsed = map(content, ~jsonlite::fromJSON(.x))) |> -#' unnest_wider(parsed) +#' dplyr::mutate(parsed = purrr::map(content, safely_from_json)) |> +#' tidyr::unnest_wider(parsed) #' } # oai_complete_chunks docs ---- oai_complete_chunks <- function(texts, @@ -559,7 +588,46 @@ oai_complete_chunks <- function(texts, #' @export #' @examples #' \dontrun{ -#' +#' # Basic usage with a data frame +#' df <- tibble::tibble( +#' doc_id = 1:3, +#' text = c( +#' "I absolutely loved this product!", +#' "Terrible experience, would not recommend.", +#' "It was okay, nothing special." +#' ) +#' ) +#' +#' results <- oai_complete_df( +#' df = df, +#' text_var = text, +#' id_var = doc_id, +#' system_prompt = "Summarise the sentiment in one word." 
+#' ) +#' +#' # Structured extraction with schema +#' sentiment_schema <- create_json_schema( +#' name = "sentiment_analysis", +#' schema = schema_object( +#' sentiment = schema_string("positive, negative, or neutral"), +#' confidence = schema_number("confidence score between 0 and 1"), +#' required = list("sentiment", "confidence") +#' ) +#' ) +#' +#' results <- oai_complete_df( +#' df = df, +#' text_var = text, +#' id_var = doc_id, +#' schema = sentiment_schema, +#' temperature = 0 +#' ) +#' +#' # Post-process structured results +#' results |> +#' dplyr::filter(!.error) |> +#' dplyr::mutate(parsed = purrr::map(content, safely_from_json)) |> +#' tidyr::unnest_wider(parsed) #' } #oai_complete_df docs---- oai_complete_df <- function(df, diff --git a/man/oai_complete_chunks.Rd b/man/oai_complete_chunks.Rd index d433341..971cacb 100644 --- a/man/oai_complete_chunks.Rd +++ b/man/oai_complete_chunks.Rd @@ -86,16 +86,46 @@ your system resources and text sizes. } \examples{ \dontrun{ -# basic usage with automatic file naming: +# Basic usage with automatic file naming +texts <- c("Great product!", "Awful service.", "Decent value.") +ids <- c("review_1", "review_2", "review_3") + +results <- oai_complete_chunks( + texts = texts, + ids = ids, + system_prompt = "Classify sentiment as positive, negative, or neutral." +) -# large-scale processing with custom output file: -#structured extraction with schema: +# Large-scale processing with custom output file +results <- oai_complete_chunks( + texts = large_text_vector, + ids = seq_along(large_text_vector), + chunk_size = 2000, + concurrent_requests = 10, + output_file = "data/sentiment_results.csv" +) +# Structured extraction with schema +sentiment_schema <- create_json_schema( + name = "sentiment_analysis", + schema = schema_object( + sentiment = schema_string("positive, negative, or neutral"), + confidence = schema_number("confidence score between 0 and 1"), + required = list("sentiment", "confidence") + ) +) + +results <- oai_complete_chunks( + texts = texts, + ids = ids, + schema = sentiment_schema, + temperature = 0 +) -# post-process structured results: -xx <- xx |> +# Post-process structured results +results |> dplyr::filter(!.error) |> - dplyr::mutate(parsed = map(content, ~jsonlite::fromJSON(.x))) |> - unnest_wider(parsed) + dplyr::mutate(parsed = purrr::map(content, safely_from_json)) |> + tidyr::unnest_wider(parsed) } } diff --git a/man/oai_complete_df.Rd b/man/oai_complete_df.Rd index 15554b2..a75a8e4 100644 --- a/man/oai_complete_df.Rd +++ b/man/oai_complete_df.Rd @@ -87,6 +87,45 @@ allowing for easy filtering and retry logic on failures. } \examples{ \dontrun{ +# Basic usage with a data frame +df <- tibble::tibble( + doc_id = 1:3, + text = c( + "I absolutely loved this product!", + "Terrible experience, would not recommend.", + "It was okay, nothing special." + ) +) + +results <- oai_complete_df( + df = df, + text_var = text, + id_var = doc_id, + system_prompt = "Summarise the sentiment in one word." 
+) + +# Structured extraction with schema +sentiment_schema <- create_json_schema( + name = "sentiment_analysis", + schema = schema_object( + sentiment = schema_string("positive, negative, or neutral"), + confidence = schema_number("confidence score between 0 and 1"), + required = list("sentiment", "confidence") + ) +) + +results <- oai_complete_df( + df = df, + text_var = text, + id_var = doc_id, + schema = sentiment_schema, + temperature = 0 +) +# Post-process structured results +results |> + dplyr::filter(!.error) |> + dplyr::mutate(parsed = purrr::map(content, safely_from_json)) |> + tidyr::unnest_wider(parsed) } } From 4d6559ce00a270666dcee4a819f53c4a7e773e61 Mon Sep 17 00:00:00 2001 From: jpcompartir Date: Wed, 3 Dec 2025 21:53:45 +0000 Subject: [PATCH 08/56] stage dev_docs/ant_messages.qmd bump version update news and _pkgdown.yml --- DESCRIPTION | 2 +- NEWS.md | 5 ++++- _pkgdown.yml | 12 +++++++++- dev_docs/ant_messages.qmd | 46 +++++++++++++++++++++++++++++++++++++++ 4 files changed, 62 insertions(+), 3 deletions(-) diff --git a/DESCRIPTION b/DESCRIPTION index 628d86a..f2fab13 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,6 +1,6 @@ Package: EndpointR Title: Connects to various Machine Learning inference providers -Version: 0.1.2 +Version: 0.2 Authors@R: person("Jack", "Penzer", , "Jack.penzer@sharecreative.com", role = c("aut", "cre")) Description: EndpointR is a 'batteries included', open-source R package for connecting to various APIs for Machine Learning model predictions. EndpointR is built for company-specific use cases, so may not be useful to a wide audience. diff --git a/NEWS.md b/NEWS.md index c48bc6f..de3c2ea 100644 --- a/NEWS.md +++ b/NEWS.md @@ -1,3 +1,5 @@ +# EndpointR 0.2 + # EndpointR 0.1.2 - **File writing improvements**: `hf_embed_df()` and `hf_classify_df()` now write intermediate results as `.parquet` files to `output_dir` directories, similar to improvements in 0.1.1 for OpenAI functions @@ -9,6 +11,7 @@ - **Dependency update**: Package now depends on `arrow` for faster `.parquet` file writing and reading - **Metadata tracking**: Hugging Face functions that write to files (`hf_embed_df()`, `hf_classify_df()`, `hf_embed_chunks()`, `hf_classify_chunks()`) now write `metadata.json` to output directories containing: + - Endpoint URL and API key name used - Processing parameters (chunk_size, concurrent_requests, timeout, max_retries) - Inference parameters (truncate, max_length) @@ -18,6 +21,7 @@ - **max_length parameter**: Added `max_length` parameter to `hf_classify_df()` and `hf_classify_chunks()` for text truncation control. 
Note: `hf_embed_df()` handles truncation automatically via endpoint configuration (set `AUTO_TRUNCATE` in endpoint settings) - **New utility functions**: + - `hf_get_model_max_length()` - Retrieve maximum token length for a Hugging Face model - `hf_get_endpoint_info()` - Retrieve detailed information about a Hugging Face Inference Endpoint @@ -36,4 +40,3 @@ Initial BETA release, ships with: - Support for text completion using OpenAI models via the Chat Completions API - Support for embeddings with the OpenAI Embeddings API - Structured outputs via JSON schemas and validators - diff --git a/_pkgdown.yml b/_pkgdown.yml index aa2657d..6a5a621 100644 --- a/_pkgdown.yml +++ b/_pkgdown.yml @@ -88,6 +88,12 @@ reference: - hf_get_endpoint_info - hf_get_model_max_length +- title: "Anthropic Messages" + desc: "functions for working with Anthropic's Messages API" + contents: + - ant_build_messages_request + + - title: "OpenAI Completions" desc: "Functions for working with OpenAI's APIs including structured outputs" contents: @@ -159,8 +165,12 @@ development: news: releases: + - text: "Version 0.1.2" + href: news/index.html#endpointr-012 + - text: "Version 0.1.1" + href: news/index.html#endpointr-011 - text: "Version 0.1.0" - href: news/index.html + href: news/index.html#endpointr-010 footer: structure: diff --git a/dev_docs/ant_messages.qmd b/dev_docs/ant_messages.qmd index 389ef1d..43be43c 100644 --- a/dev_docs/ant_messages.qmd +++ b/dev_docs/ant_messages.qmd @@ -100,3 +100,49 @@ resp <- req_schema_haiku |> resp_body_json(resp)[["error"]][["message"]] resp |> resp_status() ``` + +## Nested Schemas + +```{r} +library(httr2) +library(purrr) +library(jsonlite) +library(tidyr) +absa_entities_schema <- create_json_schema( + name = "entities", + strict = TRUE, + description = "List of entity-sentiment", + schema = schema_object( + entities = schema_array( + schema_object( + entity = schema_string( + description = "Name of the named entity"), + sentiment = schema_string( + enum = c("positive", "negative", "neutral"), + "sentiment associated with the entity") + ) + ) + ) +) + +req <- ant_build_messages_request( + "Apple have been wonderful, Microsoft... not so much. And by not so much I mean pathetic.", + schema = absa_entities_schema, + model = "claude-sonnet-4-5" +) + +resp <- httr2::req_perform(req) + +resp |> resp_body_json() |> + purrr::pluck("content", 1, "text") |> + fromJSON() |> + pluck('entities') + + +``` + +``` + entity sentiment +1 Apple positive +2 Microsoft negative +``` From 5da258d9b0fcf6484787271fa44a749690c2e739 Mon Sep 17 00:00:00 2001 From: jpcompartir Date: Wed, 3 Dec 2025 22:42:16 +0000 Subject: [PATCH 09/56] start refactoring for the proper propagation of error messages --- R/core.R | 23 +++++++++++++++++++---- 1 file changed, 19 insertions(+), 4 deletions(-) diff --git a/R/core.R b/R/core.R index a1c649d..5f13c22 100644 --- a/R/core.R +++ b/R/core.R @@ -24,7 +24,8 @@ base_request <- function(endpoint_url, api_key){ httr2::req_user_agent("EndpointR") |> httr2::req_method("POST") |> httr2::req_headers("Content-Type" = "application/json") |> - httr2::req_auth_bearer_token(token = api_key) + httr2::req_auth_bearer_token(token = api_key) |> + httr2::req_error(is_error = ~ FALSE) # don't let httr2 auto-throw errors; we handle them ourselves for better error messages return(req) } @@ -34,22 +35,36 @@ base_request <- function(endpoint_url, api_key){ #' #' @description #' Wrapper around httr2::req_perform that handles errors gracefully. 
+#' Returns the response object directly - check status with httr2::resp_status(). #' #' @param request An httr2 request object #' -#' @return A list with components $result and $error +#' @return A list with components $result (httr2_response or NULL) and $error (NULL or condition) #' @export safely_perform_request <- function(request) { purrr::safely(httr2::req_perform)(request) } +#' Perform request and return response or error object +#' +#' @description +#' Performs a request and returns the response. Since req_error(is_error = ~ FALSE) +#' is set in base_request(), httr2 won't throw errors for HTTP status codes >= 400. +#' Instead, callers should check the response status with httr2::resp_status(). +#' +#' @param request An httr2 request object +#' +#' @return An httr2_response object (check status with resp_status()) or an error condition +#' @keywords internal perform_request_or_return_error <- function(request) { tryCatch({ response <- httr2::req_perform(request) - + # with req_error(is_error = ~ FALSE), we get responses even for HTTP errors + # callers should check status themselves return(response) }, error = function(e) { - cli::cli_alert_warning("Sequential request to {.url {request$url}} failed: {conditionMessage(e)}") + # this catches network errors, timeouts, etc. (not HTTP status errors) + cli::cli_alert_warning("Request to {.url {request$url}} failed: {conditionMessage(e)}") return(e) }) } From deb66074dff11b1428e1b20718836bc694acd962 Mon Sep 17 00:00:00 2001 From: jpcompartir Date: Wed, 3 Dec 2025 22:43:44 +0000 Subject: [PATCH 10/56] fix the process_response function to start dealing with errors add the .extract_api_error function --- R/core.R | 75 +++++++++++++++++++++++++++++++++++++++++++++++--------- 1 file changed, 63 insertions(+), 12 deletions(-) diff --git a/R/core.R b/R/core.R index 5f13c22..47e3ab1 100644 --- a/R/core.R +++ b/R/core.R @@ -161,21 +161,31 @@ perform_requests_with_strategy <- function(requests, #' ) #' } process_response <- function(resp, indices, tidy_func) { - #higher-order function for processing (takes function as inputs) + # higher-order function for processing (takes function as inputs) if (inherits(resp, "httr2_response")) { + # check if response is an error (status >= 400) + status <- httr2::resp_status(resp) + if (status >= 400) { + error_msg <- .extract_api_error(resp) + cli::cli_warn("Request failed with status {status}: {error_msg}") + return(.create_error_tibble(indices, error_msg)) + } + tryCatch({ result <- tidy_func(resp) result$original_index <- indices result$.error <- FALSE - result$.error_message <- NA_character_ + result$.error_msg <- NA_character_ return(result) }, error = function(e) { cli::cli_warn("Error processing response: {conditionMessage(e)}") return(.create_error_tibble(indices, conditionMessage(e))) }) } else { - cli::cli_warn("Request failed: {conditionMessage(resp)}") - return(.create_error_tibble(indices, "Request failed")) + # handle non-response objects (e.g., errors from network failures) + error_msg <- .extract_api_error(resp, "Request failed") + cli::cli_warn("Request failed: {error_msg}") + return(.create_error_tibble(indices, error_msg)) } } @@ -186,27 +196,68 @@ process_response <- function(resp, indices, tidy_func) { #' Ensures uniform error reporting across different failure modes. 
#' #' @param indices Vector of indices indicating original request positions -#' @param error_message Character string or condition object describing the error +#' @param error_msg Character string or condition object describing the error #' #' @return A tibble with columns: #' - original_index: Position in original request batch #' - .error: Always TRUE for error tibbles -#' - .error_message: Character description of the error +#' - .error_msg: Character description of the error #' #' @keywords internal -.create_error_tibble <- function(indices, error_message) { +.create_error_tibble <- function(indices, error_msg) { # for consistent outputs with safely function(s) - if (!is.character(error_message)) { - if (inherits(error_message, "condition")) { - error_message <- conditionMessage(error_message) + if (!is.character(error_msg)) { + if (inherits(error_msg, "condition")) { + error_msg <- conditionMessage(error_msg) } else { - error_message <- as.character(error_message) + error_msg <- as.character(error_msg) } } return(tibble::tibble( original_index = indices, .error = TRUE, - .error_message = error_message + .error_msg = error_msg )) } + + +#' Extract error message from an API response +#' +#' @description +#' Extracts a meaningful error message from an httr2 response object. +#' Handles different API response formats (OpenAI, Anthropic, HuggingFace). +#' +#' @param response An httr2_response object, error object, or other response type +#' @param fallback_message Message to return if extraction fails +#' +#' @return Character string containing the error message, or NA_character_ if response is successful +#' @keywords internal +.extract_api_error <- function(response, fallback_message = "Unknown error") { + # handle non-response objects (e.g., errors from network failures) + if (!inherits(response, "httr2_response")) { + if (inherits(response, "error") || inherits(response, "condition")) { + return(conditionMessage(response)) + } + return(as.character(fallback_message)) + } + + status <- httr2::resp_status(response) + if (status < 400) return(NA_character_) + + # try to extract error from response body - different APIs use different formats + + tryCatch({ + body <- httr2::resp_body_json(response) + # huggingface format: {"error": "..."} - check first as it's a string not a list + if (!is.null(body$error) && is.character(body$error)) return(body$error) + # openai format: {"error": {"message": "...", "type": "..."}} + if (!is.null(body$error) && is.list(body$error) && !is.null(body$error$message)) return(body$error$message) + # anthropic format: {"message": "..."} + if (!is.null(body$message)) return(body$message) + # fallback to status code + paste("HTTP", status) + }, error = function(e) { + paste("HTTP", status) + }) +} From d75d4069016b7e576c3e4090886725f519457bac Mon Sep 17 00:00:00 2001 From: jpcompartir Date: Wed, 3 Dec 2025 22:44:44 +0000 Subject: [PATCH 11/56] switch to .error_msg everywhere in package --- R/core.R | 10 +++++++--- R/data.R | 4 ++-- R/hf_classify.R | 2 +- R/hf_embed.R | 2 +- R/openai_embed.R | 10 +++++----- R/zzz.R | 2 +- man/df_embeddings_hf.Rd | 2 +- man/df_sentiment_classification_example.Rd | 2 +- man/dot-extract_api_error.Rd | 21 +++++++++++++++++++++ man/hf_perform_request.Rd | 8 +++++--- man/oai_embed_batch.Rd | 4 ++-- man/oai_embed_df.Rd | 2 +- man/process_response.Rd | 2 +- man/safely_perform_request.Rd | 3 ++- 14 files changed, 51 insertions(+), 23 deletions(-) create mode 100644 man/dot-extract_api_error.Rd diff --git a/R/core.R b/R/core.R index 
47e3ab1..f5c9130 100644 --- a/R/core.R +++ b/R/core.R @@ -77,13 +77,17 @@ perform_request_or_return_error <- function(request) { #' Automatically chooses sequential processing when concurrent_requests = 1 #' or when there's only one request. #' -#' @details returns responses in the order that requests were sent, and returns errors in a predictable format. +#' @details Returns responses in the order that requests were sent. +#' Since requests use req_error(is_error = ~ FALSE), HTTP error responses (status >= 400) +#' are returned as httr2_response objects rather than being thrown as errors. +#' Callers should check response status with httr2::resp_status() or use +#' httr2::resps_successes() / httr2::resps_failures() to categorise responses. #' #' @param requests List of httr2_request objects to perform #' @param concurrent_requests Integer specifying maximum number of simultaneous requests (default: 1) #' @param progress Logical indicating whether to show progress bar (default: TRUE) #' -#' @return List of httr2_response objects or error objects for failed requests +#' @return List of httr2_response objects (check status with resp_status()) or error objects for network failures #' @export #' @examples #' \dontrun{ @@ -147,7 +151,7 @@ perform_requests_with_strategy <- function(requests, #' @return A tibble with processed results or error information, including: #' - original_index: Position in original request batch #' - .error: Logical indicating if an error occurred -#' - .error_message: Character description of any error +#' - .error_msg: Character description of any error #' - Additional columns from tidy_func output #' #' @export diff --git a/R/data.R b/R/data.R index 14c731e..42b3869 100644 --- a/R/data.R +++ b/R/data.R @@ -28,7 +28,7 @@ #' \item{text}{Character; the original text that was embedded} #' \item{category}{Character; category classification of the text} #' \item{.error}{Logical; whether the embedding process failed} -#' \item{.error_message}{Character; error message if embedding failed (NA if successful)} +#' \item{.error_msg}{Character; error message if embedding failed (NA if successful)} #' \item{V1}{Numeric; embedding vector dimensions} #' \item{V2}{Numeric; embedding vector dimensions} #' \item{V3}{Numeric; embedding vector dimensions} @@ -815,7 +815,7 @@ #' \item{NEGATIVE}{Numeric; probability score for negative sentiment (0-1)} #' \item{POSITIVE}{Numeric; probability score for positive sentiment (0-1)} #' \item{.error}{Logical; whether the classification process failed} -#' \item{.error_message}{Character; error message if classification failed (NA if successful)} +#' \item{.error_msg}{Character; error message if classification failed (NA if successful)} #' } #' @source Generated using Hugging Face sentiment classification model via EndpointR functions "df_sentiment_classification_example" diff --git a/R/hf_classify.R b/R/hf_classify.R index e0e6304..432ff70 100644 --- a/R/hf_classify.R +++ b/R/hf_classify.R @@ -321,7 +321,7 @@ hf_classify_batch <- function(texts, result <- tidy_func(response$result) result$original_index <- batch_data$batch_indices[[1]] result$.error <- FALSE - result$.error_message <- NA_character_ + result$.error_msg <- NA_character_ }, error = function(e) { cli::cli_warn("Error in single batch request: {conditionMessage(e)}") return(.create_error_tibble(batch_data$batch_indices, conditionMessage(e))) diff --git a/R/hf_embed.R b/R/hf_embed.R index e82a875..2ad70c6 100644 --- a/R/hf_embed.R +++ b/R/hf_embed.R @@ -230,7 +230,7 @@ hf_embed_batch <- 
function(texts, result$original_index <- NULL # drop index now we're returning - result <- dplyr::relocate(result, c(`.error`, `.error_message`), .before = dplyr::all_of(relocate_col)) + result <- dplyr::relocate(result, c(`.error`, `.error_msg`), .before = dplyr::all_of(relocate_col)) return(result) } diff --git a/R/openai_embed.R b/R/openai_embed.R index c8eeef0..f64bcb1 100644 --- a/R/openai_embed.R +++ b/R/openai_embed.R @@ -312,7 +312,7 @@ oai_embed_text <- function(text, #' filled with NA values and marked with error information. #' #' The function returns a tibble with embedding columns (V1, V2, ..., Vn), -#' error tracking columns (.error, .error_message), and optionally the +#' error tracking columns (.error, .error_msg), and optionally the #' original texts. #' #' @param texts Vector or list of character strings to generate embeddings for @@ -331,7 +331,7 @@ oai_embed_text <- function(text, #' @return A tibble containing: #' - Embedding vectors as columns (V1, V2, ..., Vn) #' - .error: Logical column indicating if embedding failed -#' - .error_message: Character column with error details +#' - .error_msg: Character column with error details #' - text: Original texts (if include_texts = TRUE) #' @export #' @@ -466,7 +466,7 @@ oai_embed_batch <- function(texts, # 5. Adding Error Information to Data Frame ---- # add errors and messages to return df. FALSE and "" if no error. result$.error <- errors - result$.error_message <- error_msgs + result$.error_msg <- error_msgs n_failed <- sum(result$.error) n_succeeded <- n_texts - n_failed @@ -491,7 +491,7 @@ oai_embed_batch <- function(texts, } # 6. Relocating Cols and Returning ---- - result <- dplyr::relocate(result, c(.error, .error_message), .before = dplyr::all_of(relocate_col)) + result <- dplyr::relocate(result, c(.error, .error_msg), .before = dplyr::all_of(relocate_col)) return(result) } @@ -538,7 +538,7 @@ oai_embed_df <- function(df, text_var, id_var, model = "text-embedding-3-small", #' @param progress Whether to display a progress bar (default: TRUE) #' #' @return Original data frame with additional columns for embeddings (V1, V2, etc.), -#' plus .error and .error_message columns indicating any failures +#' plus .error and .error_msg columns indicating any failures #' #' @export #' diff --git a/R/zzz.R b/R/zzz.R index bb8cd44..db1fa09 100644 --- a/R/zzz.R +++ b/R/zzz.R @@ -1,5 +1,5 @@ utils::globalVariables(c(".embeddings", ".request", ".response", ".row_num", ".data", ".error", - ".error_message", "original_index", "text", ":=", ".row_id", "id", "label", "score", "verbose")) + ".error_msg", "original_index", "text", ":=", ".row_id", "id", "label", "score", "verbose")) .onLoad <- function(...) 
{ S7::methods_register() diff --git a/man/df_embeddings_hf.Rd b/man/df_embeddings_hf.Rd index ae7753b..e0bd787 100644 --- a/man/df_embeddings_hf.Rd +++ b/man/df_embeddings_hf.Rd @@ -11,7 +11,7 @@ A data frame with 3 rows and 773 variables: \item{text}{Character; the original text that was embedded} \item{category}{Character; category classification of the text} \item{.error}{Logical; whether the embedding process failed} -\item{.error_message}{Character; error message if embedding failed (NA if successful)} +\item{.error_msg}{Character; error message if embedding failed (NA if successful)} \item{V1}{Numeric; embedding vector dimensions} \item{V2}{Numeric; embedding vector dimensions} \item{V3}{Numeric; embedding vector dimensions} diff --git a/man/df_sentiment_classification_example.Rd b/man/df_sentiment_classification_example.Rd index 662137d..c4efb77 100644 --- a/man/df_sentiment_classification_example.Rd +++ b/man/df_sentiment_classification_example.Rd @@ -13,7 +13,7 @@ A data frame with 3 rows and 7 variables: \item{NEGATIVE}{Numeric; probability score for negative sentiment (0-1)} \item{POSITIVE}{Numeric; probability score for positive sentiment (0-1)} \item{.error}{Logical; whether the classification process failed} -\item{.error_message}{Character; error message if classification failed (NA if successful)} +\item{.error_msg}{Character; error message if classification failed (NA if successful)} } } \source{ diff --git a/man/dot-extract_api_error.Rd b/man/dot-extract_api_error.Rd new file mode 100644 index 0000000..5a5e1ab --- /dev/null +++ b/man/dot-extract_api_error.Rd @@ -0,0 +1,21 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/core.R +\name{.extract_api_error} +\alias{.extract_api_error} +\title{Extract error message from an API response} +\usage{ +.extract_api_error(response, fallback_message = "Unknown error") +} +\arguments{ +\item{response}{An httr2_response object, error object, or other response type} + +\item{fallback_message}{Message to return if extraction fails} +} +\value{ +Character string containing the error message, or NA_character_ if response is successful +} +\description{ +Extracts a meaningful error message from an httr2 response object. +Handles different API response formats (OpenAI, Anthropic, HuggingFace). +} +\keyword{internal} diff --git a/man/hf_perform_request.Rd b/man/hf_perform_request.Rd index 75648b2..8810a4d 100644 --- a/man/hf_perform_request.Rd +++ b/man/hf_perform_request.Rd @@ -9,13 +9,15 @@ hf_perform_request(request, ...) \arguments{ \item{request}{An httr2 request object created by hf_build_request} -\item{...}{ellipsis is sent to \code{httr2::req_perform}, e.g. for \code{path} and \code{verbosity}arguments.} +\item{...}{ellipsis is sent to \code{httr2::req_perform}, e.g. for \code{path} and \code{verbosity} arguments.} } \value{ -A httr2 response object +An httr2 response object. Check status with httr2::resp_status(). } \description{ -Performs a prepared request and returns the response +Performs a prepared request and returns the response. +Since requests use req_error(is_error = ~ FALSE), HTTP error responses +(status >= 400) are returned rather than thrown as errors. 
} \examples{ \dontrun{ diff --git a/man/oai_embed_batch.Rd b/man/oai_embed_batch.Rd index 5331d95..8a90b03 100644 --- a/man/oai_embed_batch.Rd +++ b/man/oai_embed_batch.Rd @@ -49,7 +49,7 @@ A tibble containing: \itemize{ \item Embedding vectors as columns (V1, V2, ..., Vn) \item .error: Logical column indicating if embedding failed -\item .error_message: Character column with error details +\item .error_msg: Character column with error details \item text: Original texts (if include_texts = TRUE) } } @@ -73,7 +73,7 @@ as failed, not all documents across all batches. Failed embeddings will be filled with NA values and marked with error information. The function returns a tibble with embedding columns (V1, V2, ..., Vn), -error tracking columns (.error, .error_message), and optionally the +error tracking columns (.error, .error_msg), and optionally the original texts. } \examples{ diff --git a/man/oai_embed_df.Rd b/man/oai_embed_df.Rd index 08cf47d..f7d90c4 100644 --- a/man/oai_embed_df.Rd +++ b/man/oai_embed_df.Rd @@ -46,7 +46,7 @@ oai_embed_df( } \value{ Original data frame with additional columns for embeddings (V1, V2, etc.), -plus .error and .error_message columns indicating any failures +plus .error and .error_msg columns indicating any failures } \description{ High-level function to generate embeddings for texts in a data frame using diff --git a/man/process_response.Rd b/man/process_response.Rd index ec9f819..4082549 100644 --- a/man/process_response.Rd +++ b/man/process_response.Rd @@ -18,7 +18,7 @@ A tibble with processed results or error information, including: \itemize{ \item original_index: Position in original request batch \item .error: Logical indicating if an error occurred -\item .error_message: Character description of any error +\item .error_msg: Character description of any error \item Additional columns from tidy_func output } } diff --git a/man/safely_perform_request.Rd b/man/safely_perform_request.Rd index 9800ea8..d72b436 100644 --- a/man/safely_perform_request.Rd +++ b/man/safely_perform_request.Rd @@ -10,8 +10,9 @@ safely_perform_request(request) \item{request}{An httr2 request object} } \value{ -A list with components $result and $error +A list with components $result (httr2_response or NULL) and $error (NULL or condition) } \description{ Wrapper around httr2::req_perform that handles errors gracefully. +Returns the response object directly - check status with httr2::resp_status(). } From 76d3fdb301e8139068784d117080524032d0d49b Mon Sep 17 00:00:00 2001 From: jpcompartir Date: Wed, 3 Dec 2025 22:45:42 +0000 Subject: [PATCH 12/56] finish .error_msg changes --- R/hf_inference.R | 8 +++++--- man/dot-create_error_tibble.Rd | 6 +++--- man/perform_request_or_return_error.Rd | 20 ++++++++++++++++++++ man/perform_requests_with_strategy.Rd | 8 ++++++-- tests/testthat/test-core.R | 2 +- tests/testthat/test-hf_classify.R | 2 +- tests/testthat/test-hf_embed.R | 6 +++--- vignettes/embeddings_providers.Rmd | 6 +++--- vignettes/hugging_face_inference.Rmd | 2 +- 9 files changed, 43 insertions(+), 17 deletions(-) create mode 100644 man/perform_request_or_return_error.Rd diff --git a/R/hf_inference.R b/R/hf_inference.R index 9d7e4fc..b26618e 100644 --- a/R/hf_inference.R +++ b/R/hf_inference.R @@ -231,12 +231,14 @@ #' Execute a single embedding request and process the response #' #' @description - #' Performs a prepared request and returns the response + #' Performs a prepared request and returns the response. 
+ #' Since requests use req_error(is_error = ~ FALSE), HTTP error responses + #' (status >= 400) are returned rather than thrown as errors. #' #' @param request An httr2 request object created by hf_build_request - #' @param ... ellipsis is sent to `httr2::req_perform`, e.g. for `path` and `verbosity`arguments. + #' @param ... ellipsis is sent to `httr2::req_perform`, e.g. for `path` and `verbosity` arguments. #' - #' @return A httr2 response object + #' @return An httr2 response object. Check status with httr2::resp_status(). #' @export #' #' @examples diff --git a/man/dot-create_error_tibble.Rd b/man/dot-create_error_tibble.Rd index e01adba..1ef9608 100644 --- a/man/dot-create_error_tibble.Rd +++ b/man/dot-create_error_tibble.Rd @@ -4,19 +4,19 @@ \alias{.create_error_tibble} \title{Create standardised error tibble for failed requests} \usage{ -.create_error_tibble(indices, error_message) +.create_error_tibble(indices, error_msg) } \arguments{ \item{indices}{Vector of indices indicating original request positions} -\item{error_message}{Character string or condition object describing the error} +\item{error_msg}{Character string or condition object describing the error} } \value{ A tibble with columns: \itemize{ \item original_index: Position in original request batch \item .error: Always TRUE for error tibbles -\item .error_message: Character description of the error +\item .error_msg: Character description of the error } } \description{ diff --git a/man/perform_request_or_return_error.Rd b/man/perform_request_or_return_error.Rd new file mode 100644 index 0000000..02d7cfe --- /dev/null +++ b/man/perform_request_or_return_error.Rd @@ -0,0 +1,20 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/core.R +\name{perform_request_or_return_error} +\alias{perform_request_or_return_error} +\title{Perform request and return response or error object} +\usage{ +perform_request_or_return_error(request) +} +\arguments{ +\item{request}{An httr2 request object} +} +\value{ +An httr2_response object (check status with resp_status()) or an error condition +} +\description{ +Performs a request and returns the response. Since req_error(is_error = ~ FALSE) +is set in base_request(), httr2 won't throw errors for HTTP status codes >= 400. +Instead, callers should check the response status with httr2::resp_status(). +} +\keyword{internal} diff --git a/man/perform_requests_with_strategy.Rd b/man/perform_requests_with_strategy.Rd index a2fda0a..80ce1e7 100644 --- a/man/perform_requests_with_strategy.Rd +++ b/man/perform_requests_with_strategy.Rd @@ -18,7 +18,7 @@ perform_requests_with_strategy( \item{progress}{Logical indicating whether to show progress bar (default: TRUE)} } \value{ -List of httr2_response objects or error objects for failed requests +List of httr2_response objects (check status with resp_status()) or error objects for network failures } \description{ Executes a list of HTTP requests either sequentially or in parallel. @@ -26,7 +26,11 @@ Automatically chooses sequential processing when concurrent_requests = 1 or when there's only one request. } \details{ -returns responses in the order that requests were sent, and returns errors in a predictable format. +Returns responses in the order that requests were sent. +Since requests use req_error(is_error = ~ FALSE), HTTP error responses (status >= 400) +are returned as httr2_response objects rather than being thrown as errors. 
+Callers should check response status with httr2::resp_status() or use +httr2::resps_successes() / httr2::resps_failures() to categorise responses. } \examples{ \dontrun{ diff --git a/tests/testthat/test-core.R b/tests/testthat/test-core.R index b49ea6c..7cecfbd 100644 --- a/tests/testthat/test-core.R +++ b/tests/testthat/test-core.R @@ -131,7 +131,7 @@ test_that("process_response handles batches of inputs when passed the correct ti single_batch <- expect_no_error(process_response(resp = mock_batch_response, indices = 1:3, tidy_func = tidy_batch_classification_response)) - expect_setequal(names(single_batch), c("positive", "negative", "neutral", "original_index", ".error", ".error_message")) + expect_setequal(names(single_batch), c("positive", "negative", "neutral", "original_index", ".error", ".error_msg")) expect_equal(nrow(single_batch), 3) # multi-batches diff --git a/tests/testthat/test-hf_classify.R b/tests/testthat/test-hf_classify.R index e0d89dc..a6a0aaa 100644 --- a/tests/testthat/test-hf_classify.R +++ b/tests/testthat/test-hf_classify.R @@ -143,7 +143,7 @@ test_that("hf_classify_batch processes a batch of texts and returns a tidied cla expect_equal(nrow(res), 4) - expect_setequal(names(res), c("positive", "negative", "neutral", ".error", ".error_message")) + expect_setequal(names(res), c("positive", "negative", "neutral", ".error", ".error_msg")) }) diff --git a/tests/testthat/test-hf_embed.R b/tests/testthat/test-hf_embed.R index 52a2df1..0c23d7e 100644 --- a/tests/testthat/test-hf_embed.R +++ b/tests/testthat/test-hf_embed.R @@ -60,7 +60,7 @@ test_that("hf_embed_batch works correctly with tidy_func parameter added", { expect_s3_class(result, "tbl_df") expect_equal(nrow(result), 2) - expect_true(all(c("V1", "V2", "V3", ".error", ".error_message") %in% names(result))) + expect_true(all(c("V1", "V2", "V3", ".error", ".error_msg") %in% names(result))) }) test_that("hf_embed_batch allows custom tidy_func", { @@ -104,7 +104,7 @@ test_that("hf_embed_df works correctly with real endpoint", { expect_s3_class(result, "data.frame") expect_equal(nrow(result), 2) - expect_true(all(c("id", "text", "V1", "V2", "V3", ".error", ".error_message") %in% names(result))) + expect_true(all(c("id", "text", "V1", "V2", "V3", ".error", ".error_msg") %in% names(result))) expect_equal(result$id, c(1, 2)) expect_equal(result$text, c("text1", "text2")) expect_equal(result$V1, c(0.1, 0.2)) @@ -134,6 +134,6 @@ test_that("hf_embed_df works with different batch sizes", { expect_s3_class(result, "data.frame") expect_equal(nrow(result), 2) - expect_true(all(c("id", "text", ".error", ".error_message") %in% names(result))) + expect_true(all(c("id", "text", ".error", ".error_msg") %in% names(result))) expect_equal(result$.error, c(FALSE, FALSE)) }) diff --git a/vignettes/embeddings_providers.Rmd b/vignettes/embeddings_providers.Rmd index 1a025bc..1287b91 100644 --- a/vignettes/embeddings_providers.Rmd +++ b/vignettes/embeddings_providers.Rmd @@ -126,7 +126,7 @@ batch_embeddings <- hf_embed_batch( glimpse(batch_embeddings[1,1:10 ]) # truncated for ease ``` -The result includes: - `text`: your original text - `.error` and `.error_message`: error tracking - `V1` to `V768`: the embedding dimensions +The result includes: - `text`: your original text - `.error` and `.error_msg`: error tracking - `V1` to `V768`: the embedding dimensions ## Data Frame Integration @@ -144,7 +144,7 @@ embedded_df <- hf_embed_df( ) # Original data + embeddings -names(embedded_df)[1:10] # shows: id, text, category, .error, .error_message, V1, 
V2... +names(embedded_df)[1:10] # shows: id, text, category, .error, .error_msg, V1, V2... embedded_df ``` @@ -298,7 +298,7 @@ results <- oai_embed_batch(texts = texts_to_embed) if (any(results$.error)) { failed <- results |> filter(.error) |> - select(text, .error_message) + select(text, .error_msg) print(failed) diff --git a/vignettes/hugging_face_inference.Rmd b/vignettes/hugging_face_inference.Rmd index 7b6aa08..eb2b2ce 100644 --- a/vignettes/hugging_face_inference.Rmd +++ b/vignettes/hugging_face_inference.Rmd @@ -112,7 +112,7 @@ The result includes: - `text`: your original text - `.error`: TRUE if something went wrong -- `.error_message`: what went wrong (if anything) +- `.error_msg`: what went wrong (if anything) - `V1` to `V384`: the embedding values ## Data Frame From 4e330dbe0ad2452281e4ecbcc9c791f624477750 Mon Sep 17 00:00:00 2001 From: jpcompartir Date: Wed, 3 Dec 2025 22:51:45 +0000 Subject: [PATCH 13/56] add tests for .extract_api_error with mocks push planning doc for refactor --- dev_docs/refactor_error_messages.qmd | 202 +++++++++++++++++++++++ tests/testthat/test-core.R | 237 +++++++++++++++++++++++++++ 2 files changed, 439 insertions(+) create mode 100644 dev_docs/refactor_error_messages.qmd diff --git a/dev_docs/refactor_error_messages.qmd b/dev_docs/refactor_error_messages.qmd new file mode 100644 index 0000000..fea52da --- /dev/null +++ b/dev_docs/refactor_error_messages.qmd @@ -0,0 +1,202 @@ +--- +title: "Error Handling Refactor Plan" +format: html +--- + +# EndpointR Error Handling Refactor Plan + +## Problem Statement + +1. **httr2 catches errors prematurely** - When API requests fail, httr2 throws errors before we can inspect the response body, making debugging difficult. Users see generic error messages instead of the detailed API error messages. + +2. **Inconsistent error column naming** - The package uses both `.error_msg` (in `openai_completions.R`) and `.error_message` (everywhere else), causing confusion and potential bugs. + +## Solution Overview + +### Part A: Prevent httr2 from auto-catching errors + +Add `httr2::req_error(is_error = ~ FALSE)` to request building functions so httr2 doesn't throw errors automatically. Then check response status manually and extract meaningful error messages from the API response body. + +### Part B: Standardize on `.error_msg` + +Rename all `.error_message` occurrences to `.error_msg` for consistency (shorter, cleaner). 
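+
+To make Part A concrete, here is a minimal sketch of the pattern it enables (the 429 body below is made up for illustration, and `httr2::response_json()` is only used to fabricate a response for testing):
+
+```r
+library(httr2)
+
+# a mocked rate-limit response, built the same way the test suite builds them
+mock_resp <- response_json(
+  status_code = 429L,
+  body = list(error = list(message = "Rate limit exceeded"))
+)
+
+# with req_error(is_error = ~ FALSE) on the request, responses like this are
+# returned rather than thrown, so the API's own message stays inspectable
+if (resp_status(mock_resp) >= 400) {
+  resp_body_json(mock_resp)$error$message
+}
+#> [1] "Rate limit exceeded"
+```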
+
+---
+
+## Detailed Implementation Plan
+
+### Phase 1: Create a centralised error handling utility
+
+**File: `R/core.R`**
+
+Add a new helper function to extract error messages from failed responses:
+
+```r
+.extract_api_error <- function(response, fallback_message = "Unknown error") {
+  # Handle different response types
+  if (!inherits(response, "httr2_response")) {
+    if (inherits(response, "error") || inherits(response, "condition")) {
+      return(conditionMessage(response))
+    }
+    return(as.character(fallback_message))
+  }
+
+  status <- httr2::resp_status(response)
+  if (status < 400) return(NA_character_)
+
+  # Try to extract error from response body
+  tryCatch({
+    body <- httr2::resp_body_json(response)
+    # OpenAI format: {"error": {"message": "..."}}
+    if (!is.null(body$error$message)) return(body$error$message)
+    # Anthropic format: {"message": "..."}
+    if (!is.null(body$message)) return(body$message)
+    # HuggingFace format: {"error": "..."}
+    if (!is.null(body$error)) return(body$error)
+    # Generic message with status
+    paste("HTTP", status)
+  }, error = function(e) {
+    paste("HTTP", status)
+  })
+}
+```
+
+### Phase 2: Modify request building functions
+
+Add `httr2::req_error(is_error = ~ FALSE)` to each request builder:
+
+| File | Function | Change |
+|------|----------|--------|
+| `R/core.R` | `base_request()` | Add `req_error(is_error = ~ FALSE)` to the pipe chain |
+| `R/hf_inference.R` | `hf_build_request()` | Already uses `base_request()`, will inherit |
+| `R/hf_inference.R` | `hf_build_request_batch()` | Already uses `base_request()`, will inherit |
+| `R/openai_completions.R` | `oai_build_completions_request()` | Already uses `base_request()`, will inherit |
+| `R/openai_embed.R` | `oai_build_embedding_request()` | Already uses `base_request()`, will inherit |
+| `R/openai_embed.R` | `oai_build_embedding_request_batch()` | Already uses `base_request()`, will inherit |
+
+**Key insight:** Adding `req_error()` to `base_request()` will cascade to all derived request functions.
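+
+A quick way to sanity-check the cascade during development (a sketch assuming `base_request()` is available in the session, e.g. after `devtools::load_all()`, and using a placeholder URL and key):
+
+```r
+req <- base_request("https://api.example.com", "fake-key")
+
+# the relaxed error policy lives on the request object itself, so every
+# builder that pipes off base_request() inherits it automatically
+"error_is_error" %in% names(req$policies)
+#> [1] TRUE
+```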
+ +### Phase 3: Update request performing functions + +Update functions that perform requests to check status manually: + +#### 3.1 `R/core.R` - `safely_perform_request()` +- Currently wraps with `purrr::safely()` +- Update to check response status and extract error messages + +#### 3.2 `R/core.R` - `perform_request_or_return_error()` +- Remove tryCatch (no longer needed since errors won't be thrown) +- Add status check and error extraction + +#### 3.3 `R/core.R` - `perform_requests_with_strategy()` +- For parallel requests, httr2 returns all responses +- Update to properly categorise successes/failures based on status code + +#### 3.4 `R/hf_inference.R` - `hf_perform_request()` +- Add status check after `req_perform()` +- Return error info if status >= 400 + +### Phase 4: Update high-level functions + +Update functions that call perform functions to handle the new error format: + +| File | Functions | +|------|-----------| +| `R/openai_completions.R` | `oai_complete_text()`, `oai_complete_chunks()`, `oai_complete_df()` | +| `R/openai_embed.R` | `oai_embed_text()`, `oai_embed_batch()`, `oai_embed_df()` | +| `R/hf_embed.R` | `hf_embed_text()`, `hf_embed_batch()`, `hf_embed_df()` | +| `R/hf_classify.R` | `hf_classify_text()`, `hf_classify_batch()`, `hf_classify_df()` | + +### Phase 5: Standardize error column naming + +Rename `.error_message` to `.error_msg` across the package: + +| File | Change | +|------|--------| +| `R/core.R` | Lines 135, 155, 179, 195 - `.error_message` -> `.error_msg` | +| `R/hf_embed.R` | Line 233 - `.error_message` -> `.error_msg` | +| `R/hf_classify.R` | Line 324 - `.error_message` -> `.error_msg` | +| `R/openai_embed.R` | Lines 315, 334, 469, 494, 541 - `.error_message` -> `.error_msg` | +| `R/zzz.R` | Line 2 - `.error_message` -> `.error_msg` in globalVariables | + +### Phase 6: Update tests and documentation + +| File | Changes Needed | +|-----------|----------------| +| `tests/testthat/test-openai_completions.R` | Already uses `.error_msg` - no changes | +| `tests/testthat/test-core.R` | Line 134: `.error_message` -> `.error_msg` | +| `tests/testthat/test-hf_embed.R` | Lines 63, 107, 137: `.error_message` -> `.error_msg` | +| `tests/testthat/test-hf_classify.R` | Line 146: `.error_message` -> `.error_msg` | +| `vignettes/*.Rmd` | Update examples to use `.error_msg` | +| `man/*.Rd` | Will regenerate with `devtools::document()` | + +--- + +## Files to Modify (Summary) + +1. **R/core.R** - Add `.extract_api_error()`, modify `base_request()`, update perform functions, rename `.error_message` -> `.error_msg` +2. **R/openai_completions.R** - Update error handling (already uses `.error_msg`) +3. **R/openai_embed.R** - Rename `.error_message` -> `.error_msg`, update error handling +4. **R/hf_inference.R** - Update `hf_perform_request()` error handling +5. **R/hf_embed.R** - Rename `.error_message` -> `.error_msg`, update error handling +6. **R/hf_classify.R** - Rename `.error_message` -> `.error_msg`, update error handling +7. **R/zzz.R** - Change globalVariables from `.error_message` to `.error_msg` +8. **tests/testthat/test-core.R** - Update expected column name +9. **tests/testthat/test-hf_embed.R** - Update expected column name +10. **tests/testthat/test-hf_classify.R** - Update expected column name +11. **vignettes/*.Rmd** - Update documentation examples +12. 
**man/*.Rd** - Will regenerate with roxygen2 + +--- + +## Risk Assessment + +| Risk | Mitigation | +|------|------------| +| Breaking existing workflows | Run full test suite after each phase | +| Response body structure varies by API | Handle multiple formats in `.extract_api_error()` | +| Edge cases with empty responses | Default fallback messages | +| Documentation sync | Regenerate docs with `devtools::document()` | + +--- + +## Implementation Order + +1. Add `.extract_api_error()` helper (Phase 1) +2. Add `req_error()` to `base_request()` (Phase 2) +3. Update `perform_requests_with_strategy()` (Phase 3) +4. Update individual perform functions (Phase 3) +5. Rename `.error_message` to `.error_msg` across package (Phase 5) +6. Update tests (Phase 6) +7. Run tests and fix any issues +8. Regenerate documentation with `devtools::document()` + +--- + +## Implementation Notes (Completed) + +### Bug fix in `.extract_api_error()` + +During testing, discovered that the HuggingFace error format check must come **before** the OpenAI format check. When `body$error` is a string (HuggingFace format), attempting `body$error$message` throws "$ operator is invalid for atomic vectors". + +Fixed order: +```r +# huggingface format: {"error": "..."} - check first as it's a string not a list +if (!is.null(body$error) && is.character(body$error)) return(body$error) +# openai format: {"error": {"message": "...", "type": "..."}} +if (!is.null(body$error) && is.list(body$error) && !is.null(body$error$message)) return(body$error$message) +``` + +### Tests added (40 new tests in test-core.R) + +- `base_request` includes `req_error` policy +- `.extract_api_error()` extracts OpenAI-format error messages (400, 401, 429) +- `.extract_api_error()` extracts HuggingFace-format error messages (500, 503) +- `.extract_api_error()` extracts Anthropic-format error messages (529) +- `.extract_api_error()` falls back to HTTP status when body parsing fails +- `.extract_api_error()` returns NA for successful responses +- `.extract_api_error()` handles non-response objects gracefully +- `.extract_api_error()` handles all common HTTP error status codes (400, 401, 403, 404, 429, 500, 502, 503, 529) +- `process_response()` handles HTTP error responses correctly +- `process_response()` handles mixed success and error batches +- `.create_error_tibble()` produces correct structure with `.error_msg` column diff --git a/tests/testthat/test-core.R b/tests/testthat/test-core.R index 7cecfbd..8006828 100644 --- a/tests/testthat/test-core.R +++ b/tests/testthat/test-core.R @@ -170,3 +170,240 @@ test_that(".create_erorr_tibble deals with indices and messages and outputs a ti expect_true(nrow(error_tib) ==2) }) + + +# error handling tests ---- + +test_that("base_request includes req_error to prevent auto-throwing on HTTP errors", { + # verify that req_error policy is set on base requests + req <- base_request("https://api.example.com", "fake_key") + + # the request should have an error policy (error_is_error) that doesn't auto-throw + expect_true("error_is_error" %in% names(req$policies)) +}) + +test_that(".extract_api_error extracts OpenAI-format error messages", { + # openai format: {"error": {"message": "...", "type": "..."}} + + mock_400 <- httr2::response_json( + status_code = 400L, + body = list(error = list( + message = "Invalid request: missing required parameter 'model'", + type = "invalid_request_error" + )) + ) + + error_msg <- .extract_api_error(mock_400) + expect_equal(error_msg, "Invalid request: missing required parameter 
'model'") + + mock_401 <- httr2::response_json( + status_code = 401L, + body = list(error = list( + message = "Incorrect API key provided", + type = "authentication_error" + )) + ) + + error_msg <- .extract_api_error(mock_401) + expect_equal(error_msg, "Incorrect API key provided") + + mock_429 <- httr2::response_json( + status_code = 429L, + body = list(error = list( + message = "Rate limit exceeded. Please retry after 60 seconds.", + type = "rate_limit_error" + )) + ) + + error_msg <- .extract_api_error(mock_429) + expect_equal(error_msg, "Rate limit exceeded. Please retry after 60 seconds.") +}) + +test_that(".extract_api_error extracts HuggingFace-format error messages", { + # huggingface format: {"error": "..."} + + mock_503 <- httr2::response_json( + status_code = 503L, + body = list(error = "Model is currently loading, please retry in 30 seconds") + ) + + error_msg <- .extract_api_error(mock_503) + expect_equal(error_msg, "Model is currently loading, please retry in 30 seconds") + + mock_500 <- httr2::response_json( + status_code = 500L, + body = list(error = "Internal server error") + ) + + error_msg <- .extract_api_error(mock_500) + expect_equal(error_msg, "Internal server error") +}) + +test_that(".extract_api_error extracts Anthropic-format error messages", { + # anthropic format: {"message": "..."} + + mock_529 <- httr2::response_json( + status_code = 529L, + body = list(message = "Anthropic API is temporarily overloaded") + ) + + error_msg <- .extract_api_error(mock_529) + expect_equal(error_msg, "Anthropic API is temporarily overloaded") +}) + +test_that(".extract_api_error falls back to HTTP status when body parsing fails", { + # response with non-json body or unexpected structure + + mock_502 <- httr2::response_json( + status_code = 502L, + body = list(unexpected_field = "something went wrong") + ) + + error_msg <- .extract_api_error(mock_502) + expect_equal(error_msg, "HTTP 502") + + mock_403 <- httr2::response_json( + status_code = 403L, + body = list() # empty body + ) + + error_msg <- .extract_api_error(mock_403) + expect_equal(error_msg, "HTTP 403") +}) + +test_that(".extract_api_error returns NA for successful responses", { + mock_200 <- httr2::response_json( + status_code = 200L, + body = list(data = "success") + ) + + error_msg <- .extract_api_error(mock_200) + expect_true(is.na(error_msg)) +}) + +test_that(".extract_api_error handles non-response objects gracefully", { + # error condition object + err <- simpleError("Connection timed out") + error_msg <- .extract_api_error(err) + expect_equal(error_msg, "Connection timed out") + + # generic object with fallback + error_msg <- .extract_api_error("not a response", fallback_message = "Unknown failure") + expect_equal(error_msg, "Unknown failure") + + # NULL input + error_msg <- .extract_api_error(NULL, fallback_message = "Request failed") + expect_equal(error_msg, "Request failed") +}) + +test_that(".extract_api_error handles all common HTTP error status codes", { + # test a range of common error codes + status_codes <- c(400L, 401L, 403L, 404L, 429L, 500L, 502L, 503L, 529L) + + for (code in status_codes) { + mock_resp <- httr2::response_json( + status_code = code, + body = list(error = list(message = paste("Error", code))) + ) + + error_msg <- .extract_api_error(mock_resp) + expect_equal(error_msg, paste("Error", code), info = paste("Failed for status code", code)) + } +}) + +test_that("process_response handles HTTP error responses correctly", { + # mock a 429 rate limit error + mock_429 <- httr2::response_json( + 
status_code = 429L, + body = list(error = list( + message = "Rate limit exceeded", + type = "rate_limit_error" + )) + ) + + # check warning is produced + expect_warning( + process_response(mock_429, indices = 1:3, tidy_func = tidy_classification_response), + "Request failed with status 429" + ) + + # capture result for assertions + result <- suppressWarnings( + process_response(mock_429, indices = 1:3, tidy_func = tidy_classification_response) + ) + + expect_true(all(result$.error)) + expect_equal(result$.error_msg[1], "Rate limit exceeded") + expect_equal(nrow(result), 3) + expect_true("original_index" %in% names(result)) + + # mock a 500 server error + mock_500 <- httr2::response_json( + status_code = 500L, + body = list(error = list(message = "Internal server error")) + ) + + # check warning is produced + expect_warning( + process_response(mock_500, indices = c(5, 6), tidy_func = tidy_classification_response), + "Request failed with status 500" + ) + + # capture result for assertions + result <- suppressWarnings( + process_response(mock_500, indices = c(5, 6), tidy_func = tidy_classification_response) + ) + + expect_true(all(result$.error)) + expect_equal(result$.error_msg[1], "Internal server error") + expect_equal(nrow(result), 2) +}) + +test_that("process_response handles mixed success and error batches", { + # this tests that we can process a mix of successful and failed responses + + # successful response + success_body <- list( + list( + list(label = "positive", score = 0.9), + list(label = "negative", score = 0.1) + ) + ) + + mock_success <- httr2::response_json( + status_code = 200L, + body = success_body + ) + + # error response + mock_error <- httr2::response_json( + status_code = 429L, + body = list(error = list(message = "Rate limited")) + ) + + responses <- list(mock_success, mock_error) + indices_list <- list(1, 2) + + # process both + results <- purrr::map2( + responses, + indices_list, + ~ suppressWarnings(process_response(.x, .y, tidy_func = tidy_classification_response)) + ) |> + purrr::list_rbind() + + expect_equal(nrow(results), 2) + expect_equal(sum(results$.error), 1) # one error + expect_equal(sum(!results$.error), 1) # one success +}) + +test_that(".create_error_tibble produces correct structure with .error_msg column", { + error_tib <- .create_error_tibble(c(1, 2, 3), "Something went wrong") + + expect_s3_class(error_tib, "tbl_df") + expect_equal(nrow(error_tib), 3) + expect_true(all(error_tib$.error)) + expect_true(all(error_tib$.error_msg == "Something went wrong")) + expect_equal(error_tib$original_index, c(1, 2, 3)) + expect_setequal(names(error_tib), c("original_index", ".error", ".error_msg")) +}) From cee7350ba0e5411560b8d8b68bc9e1cd0c4fef05 Mon Sep 17 00:00:00 2001 From: jpcompartir Date: Wed, 8 Oct 2025 13:37:27 +0100 Subject: [PATCH 14/56] add endpointr_id to hf_build_request in prep for hf_embed_chunks --- R/hf_inference.R | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/R/hf_inference.R b/R/hf_inference.R index b26618e..1685f10 100644 --- a/R/hf_inference.R +++ b/R/hf_inference.R @@ -14,6 +14,7 @@ #' @param input Character string to get a response for #' @param endpoint_url The URL of the Hugging Face Inference API endpoint #' @param key_name Name of the environment variable containing the API key + #' @param endpointr_id a unique identifier for EndpointR to keep track of the request #' @param parameters Advanced usage: parameters to pass to the API endpoint #' @param max_retries Maximum number of retry attempts for failed requests #' @param 
timeout Request timeout in seconds @@ -41,6 +42,7 @@ hf_build_request <- function(input, endpoint_url, key_name, + endpointr_id = NULL, parameters = list(), max_retries = 5, timeout = 10, @@ -68,6 +70,10 @@ backoff = ~ 2 ^ .x, # exponential backoff strategy retry_on_failure = TRUE) + if(!is.null(endpointr_id)) { + req <- httr2::req_headers(req, endpointr_id = endpointr_id) + } + return(req) } From 3036f6c3ec2e32a2bae0e20064c84b2777e5d863 Mon Sep 17 00:00:00 2001 From: jpcompartir Date: Thu, 9 Oct 2025 17:30:39 +0100 Subject: [PATCH 15/56] add the hf_embed_chunks function --- R/hf_embed.R | 145 +++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 145 insertions(+) diff --git a/R/hf_embed.R b/R/hf_embed.R index 2ad70c6..f0faa14 100644 --- a/R/hf_embed.R +++ b/R/hf_embed.R @@ -234,6 +234,151 @@ hf_embed_batch <- function(texts, return(result) } + +#' Embed text chunks through Hugging Face Inference Embedding Endpoints +#' +#' This function is capable of processing large volumes of text through Hugging Face's Inference Embedding Endpoints. Results are written in batches to a file, to avoid out of memory issues. +#' +#' +#' @param texts Character vector of texts to process +#' @param ids Vector of unique identifiers corresponding to each text (same length as texts) +#' @param endpoint_url Hugging Face Embedding Endpoint +#' @param output_file Path to .CSV file for results. "auto" generates the filename, location and is persistent across sessions. If NULL, generates timestamped filename. +#' @param chunk_size Number of texts to process in each batch (default: 5000) +#' @param concurrent_requests number of concurrent requests (default: 5) +#' @param max_retries aximum retry attempts per failed request (default: 5) +#' @param timeout Request timeout in seconds (default: 30) +#' @param key_name ame of environment variable containing the API key (default: +#' "HF_API_KEY") +#' +#' @returns +#' @export +#' +#' @examples +hf_embed_chunks <- function(texts, + ids, + endpoint_url, + output_file = "auto", + chunk_size = 5000L, + concurrent_requests = 5L, + max_retries = 5L, + timeout = 10L, + key_name = "HF_API_KEY") { + + # to batch_size or not to batch_size? As there were some problems with 1 item in the batch failing leading to the whole batch failing, we'll start without. 
+ # input validation ---- + stopifnot( + "texts must be a vector" = is.vector(texts), + "ids must be a vector" = is.vector(ids), + "texts and ids must be the same length" = length(texts) == length(ids), + "chunk_size must be a positive integer greater than 1" = is.numeric(chunk_size) && chunk_size > 0 + ) + + output_file = .handle_output_filename(output_file) + + batch_data <- batch_vector(seq_along(texts), chunk_size) + n_batches <- length(batch_data$batch_indices) + + cli::cli_alert_info("Processing {length(texts)} text{?s} in {n_batches} chunk{?s} of up to {chunk_size} rows each") + cli::cli_alert_info("Intermediate results will be saved to a .csv at {output_file}.") + + total_success <- 0 + total_failures <- 0 + + ## Batch Processing ---- + for (batch_num in seq_along(batch_data$batch_indices)) + { + batch_indices <- batch_data$batch_indices[[batch_num]] + batch_texts <- texts[batch_indices] + batch_ids <- ids[batch_indices] + + cli::cli_progress_message("Processing batch {batch_num}/{n_batches} ({length(batch_indices)} text{?s})") + + requests <- purrr::map2( + .x = batch_texts, + .y = batch_ids, + .f = \(x, y) hf_build_request( + input = x, + endpoint_url = endpoint_url, + endpointr_id = y, + key_name = key_name, + parameters = list(), + max_retries = max_retries, + timeout = timeout, + validate = FALSE + ) + ) + + is_valid_request <- purrr::map_lgl(requests, \(x) inherits(x, "httr2_request")) + valid_requests <- requests[is_valid_request] + + if (length(valid_requests) == 0) { + cli::cli_alert_warning("No valid request{?s} in batch {batch_num}, skipping") + } + + responses <- perform_requests_with_strategy( + valid_requests, + concurrent_requests = concurrent_requests, + progress = TRUE + ) + + successes <- httr2::resps_successes(responses) + failures <- httr2:::resps_failures(responses) + + n_successes <- length(successes) + n_failures <- length(failures) + total_success <- total_success + n_successes + total_failures <- total_failures + n_failures + + # within batch results ---- + batch_results <- list() + + if (length(successes) >0){ + successes_ids <- purrr::map(successes, \(x) purrr::pluck(x, "request", "headers", "endpointr_id")) |> unlist() + successes_content <- purrr::map(successes, tidy_embedding_response) |> + purrr::list_rbind() + + batch_results$successes <- tibble::tibble( + id = successes_ids, + .error = FALSE, + .error_msg = NA_character_, + .batch = batch_num + ) |> + dplyr::bind_cols(successes_content) + } + + if (length(failures) > 0) { + failures_ids <- purrr::map(failures, \(x) pluck(x, "request", "headers", "endpointr_id")) |> unlist() + failures_msgs <- purrr::map_chr(failures, \(x) purrr::pluck(x, "message", .default = "Unknown error")) + + batch_results$failures <- tibble::tibble( + id = failures_ids, + .error = TRUE, + .error_msg = failures_msgs, + .batch = batch_num + ) + } + + batch_df <- dplyr::bind_rows(batch_results) + + if (nrow(batch_df) > 0) { + if (batch_num == 1) { + # if we're in the first batch write to csv with headers (col names) + readr::write_csv(batch_df, output_file, append = FALSE) + } else { + # all other batches, append and don't use col names + readr::write_csv(batch_df, output_file, append = TRUE, col_names = FALSE)} + } + + cli::cli_alert_success("Batch {batch_num}: {n_successes} successful, {n_failures} failed") + } + + final_results <- readr::read_csv(output_file, show_col_types = FALSE) + + return(final_results) +} + + # hf_embed_df docs ---- #' Generate embeddings for texts in a data frame #' From 
983ff0123cabdb8587ceb9808ad71375ccc29453 Mon Sep 17 00:00:00 2001 From: jpcompartir Date: Thu, 9 Oct 2025 17:30:57 +0100 Subject: [PATCH 16/56] update hf_embed_df to use the hf_embed_chunks function for efficiency and refactor chunk function to avoid confusing batch_size naming --- R/hf_embed.R | 146 ++++++++++++++++++++++----------------------------- 1 file changed, 64 insertions(+), 82 deletions(-) diff --git a/R/hf_embed.R b/R/hf_embed.R index f0faa14..7964101 100644 --- a/R/hf_embed.R +++ b/R/hf_embed.R @@ -237,21 +237,28 @@ hf_embed_batch <- function(texts, #' Embed text chunks through Hugging Face Inference Embedding Endpoints #' -#' This function is capable of processing large volumes of text through Hugging Face's Inference Embedding Endpoints. Results are written in batches to a file, to avoid out of memory issues. +#' This function is capable of processing large volumes of text through Hugging Face's Inference Embedding Endpoints. Results are written in chunks to a file, to avoid out of memory issues. #' +#' @details This function processes texts in chunks, creating individual requests for each text +#' within a chunk. The chunk size determines how many texts are processed before writing results +#' to disk. Within each chunk, requests are sent with the specified level of concurrency. #' #' @param texts Character vector of texts to process #' @param ids Vector of unique identifiers corresponding to each text (same length as texts) #' @param endpoint_url Hugging Face Embedding Endpoint #' @param output_file Path to .CSV file for results. "auto" generates the filename, location and is persistent across sessions. If NULL, generates timestamped filename. -#' @param chunk_size Number of texts to process in each batch (default: 5000) -#' @param concurrent_requests number of concurrent requests (default: 5) -#' @param max_retries aximum retry attempts per failed request (default: 5) -#' @param timeout Request timeout in seconds (default: 30) -#' @param key_name ame of environment variable containing the API key (default: -#' "HF_API_KEY") +#' @param chunk_size Number of texts to process in each chunk before writing to disk (default: 5000) +#' @param concurrent_requests Number of concurrent requests (default: 5) +#' @param max_retries Maximum retry attempts per failed request (default: 5) +#' @param timeout Request timeout in seconds (default: 10) +#' @param key_name Name of environment variable containing the API key (default: "HF_API_KEY") #' -#' @returns +#' @return A tibble with columns: +#' - `id`: Original identifier from input +#' - `.error`: Logical indicating if request failed +#' - `.error_msg`: Error message if failed, NA otherwise +#' - `.chunk`: Chunk number for tracking +#' - Embedding columns (V1, V2, etc.) #' @export #' #' @examples @@ -265,7 +272,6 @@ hf_embed_chunks <- function(texts, timeout = 10L, key_name = "HF_API_KEY") { - # to batch_size or not to batch_size? As there were some problems with 1 item in the batch failing leading to the whole batch failing, we'll start without. 
# input validation ---- stopifnot( "texts must be a vector" = is.vector(texts), @@ -276,27 +282,27 @@ hf_embed_chunks <- function(texts, output_file = .handle_output_filename(output_file) - batch_data <- batch_vector(seq_along(texts), chunk_size) - n_batches <- length(batch_data$batch_indices) + chunk_data <- batch_vector(seq_along(texts), chunk_size) + n_chunks <- length(chunk_data$batch_indices) - cli::cli_alert_info("Processing {length(texts)} text{?s} in {n_batches} chunk{?s} of up to {chunk_size} rows each") + cli::cli_alert_info("Processing {length(texts)} text{?s} in {n_chunks} chunk{?s} of up to {chunk_size} each") cli::cli_alert_info("Intermediate results will be saved to a .csv at {output_file}.") total_success <- 0 total_failures <- 0 - ## Batch Processing ---- - for (batch_num in seq_along(batch_data$batch_indices)) + ## Chunk Processing ---- + for (chunk_num in seq_along(chunk_data$batch_indices)) { - batch_indices <- batch_data$batch_indices[[batch_num]] - batch_texts <- texts[batch_indices] - batch_ids <- ids[batch_indices] + chunk_indices <- chunk_data$batch_indices[[chunk_num]] + chunk_texts <- texts[chunk_indices] + chunk_ids <- ids[chunk_indices] - cli::cli_progress_message("Processing batch {batch_num}/{n_batches} ({length(batch_indices)} text{?s})") + cli::cli_progress_message("Processing chunk {chunk_num}/{n_chunks} ({length(chunk_indices)} text{?s})") requests <- purrr::map2( - .x = batch_texts, - .y = batch_ids, + .x = chunk_texts, + .y = chunk_ids, .f = \(x, y) hf_build_request( input = x, endpoint_url = endpoint_url, @@ -313,7 +319,8 @@ hf_embed_chunks <- function(texts, valid_requests <- requests[is_valid_request] if (length(valid_requests) == 0) { - cli::cli_alert_warning("No valid request{?s} in batch {batch_num}, skipping") + cli::cli_alert_warning("No valid request{?s} in chunk {chunk_num}, skipping") + next } responses <- perform_requests_with_strategy( @@ -330,47 +337,48 @@ hf_embed_chunks <- function(texts, total_success <- total_success + n_successes total_failures <- total_failures + n_failures - # within batch results ---- - batch_results <- list() + # within chunk results ---- + chunk_results <- list() - if (length(successes) >0){ + if (length(successes) > 0) { successes_ids <- purrr::map(successes, \(x) purrr::pluck(x, "request", "headers", "endpointr_id")) |> unlist() successes_content <- purrr::map(successes, tidy_embedding_response) |> purrr::list_rbind() - batch_results$successes <- tibble::tibble( + chunk_results$successes <- tibble::tibble( id = successes_ids, .error = FALSE, .error_msg = NA_character_, - .batch = batch_num + .chunk = chunk_num ) |> dplyr::bind_cols(successes_content) } if (length(failures) > 0) { - failures_ids <- purrr::map(failures, \(x) pluck(x, "request", "headers", "endpointr_id")) |> unlist() + failures_ids <- purrr::map(failures, \(x) purrr::pluck(x, "request", "headers", "endpointr_id")) |> unlist() failures_msgs <- purrr::map_chr(failures, \(x) purrr::pluck(x, "message", .default = "Unknown error")) - batch_results$failures <- tibble::tibble( + chunk_results$failures <- tibble::tibble( id = failures_ids, .error = TRUE, .error_msg = failures_msgs, - .batch = batch_num + .chunk = chunk_num ) } - batch_df <- dplyr::bind_rows(batch_results) + chunk_df <- dplyr::bind_rows(chunk_results) - if (nrow(batch_df) > 0) { - if (batch_num == 1) { - # if we're in the first batch write to csv with headers (col names) - readr::write_csv(batch_df, output_file, append = FALSE) + if (nrow(chunk_df) > 0) { + if (chunk_num == 1) { + # if 
we're in the first chunk write to csv with headers (col names) + readr::write_csv(chunk_df, output_file, append = FALSE) } else { - # all other batches, append and don't use col names - readr::write_csv(batch_df, output_file, append = TRUE, col_names = FALSE)} + # all other chunks, append and don't use col names + readr::write_csv(chunk_df, output_file, append = TRUE, col_names = FALSE) + } } - cli::cli_alert_success("Batch {batch_num}: {n_successes} successful, {n_failures} failed") + cli::cli_alert_success("Chunk {chunk_num}: {n_successes} successful, {n_failures} failed") } final_results <- readr::read_csv(output_file, show_col_types = FALSE) @@ -390,10 +398,10 @@ hf_embed_chunks <- function(texts, #' #' @param df A data frame containing texts to embed #' @param text_var Name of the column containing text to embed -#' @param endpoint_url The URL of the Hugging Face Inference API endpoint #' @param id_var Name of the column to use as ID +#' @param endpoint_url The URL of the Hugging Face Inference API endpoint #' @param key_name Name of the environment variable containing the API key -#' @param batch_size Number of texts to process in one batch (NULL for no batching) +#' @param chunk_size Number of texts to process in one batch (NULL for no batching) #' @param concurrent_requests Number of requests to send at once. Some APIs do not allow for multiple requests. #' @param max_retries Maximum number of retry attempts for failed requests. #' @param timeout Request timeout in seconds @@ -446,10 +454,11 @@ hf_embed_df <- function(df, id_var, endpoint_url, key_name, - batch_size = 8, - concurrent_requests = 1, - max_retries = 5, - timeout = 15, + output_file = "auto", + chunk_size = 8L, + concurrent_requests = 1L, + max_retries = 5L, + timeout = 15L, progress = TRUE) { text_sym <- rlang::ensym(text_var) @@ -457,61 +466,34 @@ hf_embed_df <- function(df, stopifnot( "df must be a data frame" = is.data.frame(df), - # "df must be a data frame with > 0 rows", nrow(df) > 0, + "df must not be empty" = nrow(df) > 0, + "text_var must exist in df" = rlang::as_name(text_sym) %in% names(df), + "id_var must exist in df" = rlang::as_name(id_sym) %in% names(df), "endpoint_url must be provided" = !is.null(endpoint_url) && nchar(endpoint_url) > 0, "concurrent_requests must be an integer" = is.numeric(concurrent_requests) && concurrent_requests > 0 ) - if (!rlang::as_string(text_sym) %in% names(df)) { - cli::cli_abort("Column {.code {rlang::as_string(text_sym)}} not found in data frame") - } - - if (!rlang::as_string(id_sym) %in% names(df)) { - cli::cli_abort("Column {.code {rlang::as_string(id_sym)}} not found in data frame") - } - - original_num_rows <- nrow(df) + output_file <- .handle_output_filename(output_file, + base_file_name = "hf_embeddings_batch") # refactoring to always use hf_embed_batch - if batch_size if one then it gets handled anyway, avoids branching and additional complexity. 
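+  # pull the text and id vectors from the df, then hand everything off to
+  # hf_embed_chunks(), which handles chunking, request concurrency, and
+  # writing results to output_file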
texts <- dplyr::pull(df, !!text_sym) indices <- dplyr::pull(df, !!id_sym) - batch_size <- if(is.null(batch_size) || batch_size <= 1) 1 else batch_size + chunk_size <- if(is.null(chunk_size) || chunk_size <= 1) 1 else chunk_size - embeddings_tbl <- hf_embed_batch( + results <- hf_embed_chunks( texts = texts, + ids = indices, endpoint_url = endpoint_url, key_name = key_name, - batch_size = batch_size, - include_texts = FALSE, + chunk_size = chunk_size, concurrent_requests = concurrent_requests, max_retries = max_retries, - timeout = timeout, - validate = FALSE, - relocate_col = 1 + timeout = timeout ) - df_with_row_id <- df |> dplyr::mutate(.row_id = dplyr::row_number()) # do we definitely want to copy this df? It could be large - - embeddings_tbl <- embeddings_tbl |> - dplyr::mutate(.row_id = dplyr::row_number()) - - result_df <- df_with_row_id |> - dplyr::left_join(embeddings_tbl, by = ".row_id") |> - dplyr::select(-.row_id) - - - # final sanity check and alert user if there's a mismatch - final_num_rows <- nrow(result_df) - - if(final_num_rows != original_num_rows){ - cli::cli_warn("Rows in original data frame and returned data frame do not match:") - cli::cli_bullets(text = c( - "Rows in original data frame: {original_num_rows}", - "Rows in returned data frame: {final_num_rows}" - )) - } - - return(result_df) + return(results) } + From e6f357195aa3ee59b5fa71acaa0ee537c3feeb41 Mon Sep 17 00:00:00 2001 From: jpcompartir Date: Fri, 10 Oct 2025 12:02:31 +0100 Subject: [PATCH 17/56] export hf_embed_chunks fix small bug in the calling of .handle_output_filename inside hf_embed_chunks/df update todos/news update _pkgdown.yml with hf_embed_chunks --- NAMESPACE | 1 + NEWS.md | 4 +++ R/hf_embed.R | 3 +-- _pkgdown.yml | 1 + man/hf_build_request.Rd | 3 +++ man/hf_embed_chunks.Rd | 55 +++++++++++++++++++++++++++++++++++++++++ man/hf_embed_df.Rd | 11 +++++---- todos.qmd | 13 ++++++++++ 8 files changed, 84 insertions(+), 7 deletions(-) create mode 100644 man/hf_embed_chunks.Rd diff --git a/NAMESPACE b/NAMESPACE index 7ce1c02..d761a6e 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -9,6 +9,7 @@ export(hf_classify_batch) export(hf_classify_df) export(hf_classify_text) export(hf_embed_batch) +export(hf_embed_chunks) export(hf_embed_df) export(hf_embed_text) export(hf_perform_request) diff --git a/NEWS.md b/NEWS.md index 109b006..882260d 100644 --- a/NEWS.md +++ b/NEWS.md @@ -1,3 +1,7 @@ +# Endpointr 0.1.2 + +- [ ] `hf_embed_df()`, `hf_classify_df()` improved to write to files similarly to the upgrades applied in 0.1qq.1 + # EndpointR 0.1.1 - `oai_complete_chunks()` function to better support for chunking/batching in `oai_complete_df()` diff --git a/R/hf_embed.R b/R/hf_embed.R index 7964101..f708f07 100644 --- a/R/hf_embed.R +++ b/R/hf_embed.R @@ -261,7 +261,6 @@ hf_embed_batch <- function(texts, #' - Embedding columns (V1, V2, etc.) 
#' @export #' -#' @examples hf_embed_chunks <- function(texts, ids, endpoint_url, @@ -280,7 +279,7 @@ hf_embed_chunks <- function(texts, "chunk_size must be a positive integer greater than 1" = is.numeric(chunk_size) && chunk_size > 0 ) - output_file = .handle_output_filename(output_file) + output_file = .handle_output_filename(output_file, base_file_name = "hf_embeddings_batch") chunk_data <- batch_vector(seq_along(texts), chunk_size) n_chunks <- length(chunk_data$batch_indices) diff --git a/_pkgdown.yml b/_pkgdown.yml index 042416c..c1c4629 100644 --- a/_pkgdown.yml +++ b/_pkgdown.yml @@ -61,6 +61,7 @@ reference: contents: - hf_embed_text - hf_embed_batch + - hf_embed_chunks - hf_embed_df - tidy_embedding_response diff --git a/man/hf_build_request.Rd b/man/hf_build_request.Rd index 534684b..5a9acb9 100644 --- a/man/hf_build_request.Rd +++ b/man/hf_build_request.Rd @@ -8,6 +8,7 @@ hf_build_request( input, endpoint_url, key_name, + endpointr_id = NULL, parameters = list(), max_retries = 5, timeout = 10, @@ -21,6 +22,8 @@ hf_build_request( \item{key_name}{Name of the environment variable containing the API key} +\item{endpointr_id}{a unique identifier for EndpointR to keep track of the request} + \item{parameters}{Advanced usage: parameters to pass to the API endpoint} \item{max_retries}{Maximum number of retry attempts for failed requests} diff --git a/man/hf_embed_chunks.Rd b/man/hf_embed_chunks.Rd new file mode 100644 index 0000000..ed307b9 --- /dev/null +++ b/man/hf_embed_chunks.Rd @@ -0,0 +1,55 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/hf_embed.R +\name{hf_embed_chunks} +\alias{hf_embed_chunks} +\title{Embed text chunks through Hugging Face Inference Embedding Endpoints} +\usage{ +hf_embed_chunks( + texts, + ids, + endpoint_url, + output_file = "auto", + chunk_size = 5000L, + concurrent_requests = 5L, + max_retries = 5L, + timeout = 10L, + key_name = "HF_API_KEY" +) +} +\arguments{ +\item{texts}{Character vector of texts to process} + +\item{ids}{Vector of unique identifiers corresponding to each text (same length as texts)} + +\item{endpoint_url}{Hugging Face Embedding Endpoint} + +\item{output_file}{Path to .CSV file for results. "auto" generates the filename, location and is persistent across sessions. If NULL, generates timestamped filename.} + +\item{chunk_size}{Number of texts to process in each chunk before writing to disk (default: 5000)} + +\item{concurrent_requests}{Number of concurrent requests (default: 5)} + +\item{max_retries}{Maximum retry attempts per failed request (default: 5)} + +\item{timeout}{Request timeout in seconds (default: 10)} + +\item{key_name}{Name of environment variable containing the API key (default: "HF_API_KEY")} +} +\value{ +A tibble with columns: +\itemize{ +\item \code{id}: Original identifier from input +\item \code{.error}: Logical indicating if request failed +\item \code{.error_msg}: Error message if failed, NA otherwise +\item \code{.chunk}: Chunk number for tracking +\item Embedding columns (V1, V2, etc.) +} +} +\description{ +This function is capable of processing large volumes of text through Hugging Face's Inference Embedding Endpoints. Results are written in chunks to a file, to avoid out of memory issues. +} +\details{ +This function processes texts in chunks, creating individual requests for each text +within a chunk. The chunk size determines how many texts are processed before writing results +to disk. Within each chunk, requests are sent with the specified level of concurrency. 
+} diff --git a/man/hf_embed_df.Rd b/man/hf_embed_df.Rd index 1104b4d..ef6e4da 100644 --- a/man/hf_embed_df.Rd +++ b/man/hf_embed_df.Rd @@ -10,10 +10,11 @@ hf_embed_df( id_var, endpoint_url, key_name, - batch_size = 8, - concurrent_requests = 1, - max_retries = 5, - timeout = 15, + output_file = "auto", + chunk_size = 8L, + concurrent_requests = 1L, + max_retries = 5L, + timeout = 15L, progress = TRUE ) } @@ -28,7 +29,7 @@ hf_embed_df( \item{key_name}{Name of the environment variable containing the API key} -\item{batch_size}{Number of texts to process in one batch (NULL for no batching)} +\item{chunk_size}{Number of texts to process in one batch (NULL for no batching)} \item{concurrent_requests}{Number of requests to send at once. Some APIs do not allow for multiple requests.} diff --git a/todos.qmd b/todos.qmd index 9b831ab..8a46920 100644 --- a/todos.qmd +++ b/todos.qmd @@ -1,5 +1,18 @@ # EndpointR Hugging Face Embeddings Implementation Checklist +# 0.1.2 + +- Update the `hf_embed_df()` and `hf_classify_df()` functions to output files for intermediate results +- Refactor to include an ID in the request building for hf\_ functions +- It's tempting to abstract the processing chunks + file writes code, but I think two separate batch loops is cleaner because whilst the hf_embed_df and oai_complete_df functions are sometimes structurally similar: + - oai_complete_df takes a schema, returns a complex result in cases or a single column + - hf_embed_df returns a tibble with many columns + - primary error handling needs are different - HF for the endpoint start up, oai for rate limits + +> Some duplication is fine + +# 0.1 + ## Prios: First: separation of concerns inside perform_requests_with_strategy - function shouldn't perform the requests and tidy them. Tidying should be done on the responses, but outside of the perform_requests_with_strategy. From 797a42ca8efe7b3e4105ab7be8ab6283f91a5e59 Mon Sep 17 00:00:00 2001 From: jpcompartir Date: Thu, 4 Dec 2025 10:12:18 +0000 Subject: [PATCH 18/56] fix merge conflicts in test-hf_embed.R --- R/hf_embed.R | 3 +- man/hf_embed_df.Rd | 2 ++ tests/testthat/test-hf_embed.R | 65 +++++++++++++++++++++++++++------- 3 files changed, 56 insertions(+), 14 deletions(-) diff --git a/R/hf_embed.R b/R/hf_embed.R index f708f07..da184d4 100644 --- a/R/hf_embed.R +++ b/R/hf_embed.R @@ -329,7 +329,7 @@ hf_embed_chunks <- function(texts, ) successes <- httr2::resps_successes(responses) - failures <- httr2:::resps_failures(responses) + failures <- httr2::resps_failures(responses) n_successes <- length(successes) n_failures <- length(failures) @@ -400,6 +400,7 @@ hf_embed_chunks <- function(texts, #' @param id_var Name of the column to use as ID #' @param endpoint_url The URL of the Hugging Face Inference API endpoint #' @param key_name Name of the environment variable containing the API key +#' @param output_file Path to .CSV file for results. "auto" generates the filename, location and is persistent across sessions. If NULL, generates timestamped filename. #' @param chunk_size Number of texts to process in one batch (NULL for no batching) #' @param concurrent_requests Number of requests to send at once. Some APIs do not allow for multiple requests. #' @param max_retries Maximum number of retry attempts for failed requests. 
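With `hf_embed_chunks()` now exported, a minimal usage sketch may be helpful; the endpoint URL and inputs below are illustrative rather than taken from the patch:

```r
# minimal sketch of the exported function at this point in the series:
# results stream to a CSV as each chunk completes, then are read back in full
texts <- c("first document", "second document", "third document")  # hypothetical inputs

results <- hf_embed_chunks(
  texts = texts,
  ids = seq_along(texts),                                  # one unique id per text
  endpoint_url = "https://my-endpoint.huggingface.cloud",  # hypothetical endpoint
  output_file = "embeddings.csv",                          # or "auto" for a generated name
  chunk_size = 2L,
  concurrent_requests = 1L,
  key_name = "HF_API_KEY"
)

# one row per input: id, .error, .error_msg, .chunk, then embedding columns V1, V2, ...
```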
diff --git a/man/hf_embed_df.Rd b/man/hf_embed_df.Rd index ef6e4da..8b4a227 100644 --- a/man/hf_embed_df.Rd +++ b/man/hf_embed_df.Rd @@ -29,6 +29,8 @@ hf_embed_df( \item{key_name}{Name of the environment variable containing the API key} +\item{output_file}{Path to .CSV file for results. "auto" generates the filename, location and is persistent across sessions. If NULL, generates timestamped filename.} + \item{chunk_size}{Number of texts to process in one batch (NULL for no batching)} \item{concurrent_requests}{Number of requests to send at once. Some APIs do not allow for multiple requests.} diff --git a/tests/testthat/test-hf_embed.R b/tests/testthat/test-hf_embed.R index 0c23d7e..5c930c0 100644 --- a/tests/testthat/test-hf_embed.R +++ b/tests/testthat/test-hf_embed.R @@ -84,32 +84,68 @@ test_that("hf_embed_batch allows custom tidy_func", { expect_equal(result$custom_col, c("custom_value", "custom_value")) }) +test_that("hf_embed_chunks replaces hf_embed_batch", { + texts <- paste0("text", 1:6) + ids <- paste0('id', 1:length(texts)) + temp_file <- tempfile(fileext = ".csv") + expected_cols <- c("id", ".error", ".error_msg", ".chunk", "V1", "V2", "V3") + + + chunk_2 <- expect_no_error(hf_embed_chunks( + texts = texts, + ids = ids, + endpoint_url = server$url("/test_embedding"), # use this as chunks are 1:1 request-row-response, not batches. I.e. don't need a chunk endpoint.= + key_name = "HF_TEST_API_KEY", + chunk_size = 2, + concurrent_requests =1, + output_file = temp_file + )) |> suppressMessages() + + expect_setequal(unique(chunk_2$`.chunk`), c(1, 2, 3)) + expect_setequal(names(chunk_2), expected_cols) + + chunk_1 <- expect_no_error(hf_embed_chunks( + texts = texts, + ids = ids, + endpoint_url = server$url("/test_embedding"), # use this as chunks are 1:1 request-row-response, not batches. I.e. 
don't need a chunk endpoint.= + key_name = "HF_TEST_API_KEY", + chunk_size = 1, + concurrent_requests =1, + output_file = temp_file + )) |> suppressMessages() + + expect_setequal(unique(chunk_1$`.chunk`), 1:6) + +}) + test_that("hf_embed_df works correctly with real endpoint", { test_df <- data.frame( id = c(1, 2), text = c("text1", "text2"), stringsAsFactors = FALSE ) + temp_file <- tempfile(fileext = ".csv") result <- expect_no_error( hf_embed_df( df = test_df, text_var = text, id_var = id, - endpoint_url = server$url("/test_df_embedding"), + endpoint_url = server$url("/test_embedding"), key_name = "HF_TEST_API_KEY", - batch_size = 2 + chunk_size = 2, + output_file = temp_file ) - ) + ) |> + suppressMessages() expect_s3_class(result, "data.frame") expect_equal(nrow(result), 2) - expect_true(all(c("id", "text", "V1", "V2", "V3", ".error", ".error_msg") %in% names(result))) + expect_true(all(c("id", "V1", "V2", "V3", ".error", ".error_msg", ".chunk") %in% names(result))) expect_equal(result$id, c(1, 2)) - expect_equal(result$text, c("text1", "text2")) - expect_equal(result$V1, c(0.1, 0.2)) - expect_equal(result$V2, c(0.2, 0.4)) - expect_equal(result$V3, c(0.3, 0.6)) + expect_equal(result$V1, c(0.1, 0.1), tolerance = 1e-7) + expect_equal(result$V2, c(0.2, 0.2), tolerance = 1e-7) + expect_equal(result$V3, c(0.3, 0.3), tolerance = 1e-7) expect_equal(result$.error, c(FALSE, FALSE)) }) @@ -119,21 +155,24 @@ test_that("hf_embed_df works with different batch sizes", { text = c("text1", "text2"), stringsAsFactors = FALSE ) + temp_file <- tempfile(fileext = ".csv") result <- expect_no_error( hf_embed_df( df = test_df, text_var = text, id_var = id, - endpoint_url = server$url("/test_df_embedding"), + endpoint_url = server$url("/test_embedding"), key_name = "HF_TEST_API_KEY", - batch_size = 1, - concurrent_requests = 1 + chunk_size = 1, + concurrent_requests = 1, + output_file = temp_file ) - ) + ) |> + suppressMessages() expect_s3_class(result, "data.frame") expect_equal(nrow(result), 2) - expect_true(all(c("id", "text", ".error", ".error_msg") %in% names(result))) + expect_true(all(c("id", ".chunk", ".error", ".error_msg") %in% names(result))) expect_equal(result$.error, c(FALSE, FALSE)) }) From b008a59e4a5e5947eada0b5cb38f2886a9c5a624 Mon Sep 17 00:00:00 2001 From: jpcompartir Date: Thu, 6 Nov 2025 10:48:56 +0000 Subject: [PATCH 19/56] Fix part of out-of-date docs and examples pass output_file to hf_embed_chunks from inside hf_embed_df to fix the filenmae issue --- R/hf_embed.R | 29 ++++++++++------------------- 1 file changed, 10 insertions(+), 19 deletions(-) diff --git a/R/hf_embed.R b/R/hf_embed.R index da184d4..66f5294 100644 --- a/R/hf_embed.R +++ b/R/hf_embed.R @@ -395,13 +395,15 @@ hf_embed_chunks <- function(texts, #' response processing, with options for batching & parallel execution. #' Setting the number of retries #' +#' Avoid risk of data loss by setting a low-ish chunk_size (e.g. 5,000, 10,000). +#' #' @param df A data frame containing texts to embed #' @param text_var Name of the column containing text to embed #' @param id_var Name of the column to use as ID #' @param endpoint_url The URL of the Hugging Face Inference API endpoint #' @param key_name Name of the environment variable containing the API key #' @param output_file Path to .CSV file for results. "auto" generates the filename, location and is persistent across sessions. If NULL, generates timestamped filename. 
-#' @param chunk_size Number of texts to process in one batch (NULL for no batching) +#' @param chunk_size The size of each chunk that will be processed and then written to a file. #' @param concurrent_requests Number of requests to send at once. Some APIs do not allow for multiple requests. #' @param max_retries Maximum number of retry attempts for failed requests. #' @param timeout Request timeout in seconds @@ -418,34 +420,22 @@ hf_embed_chunks <- function(texts, #' text = c("First example", "Second example", "Third example") #' ) #' -#' # Use parallel processing without batching -#' embeddings_df <- hf_embed_df( -#' df = df, -#' text_var = text, -#' endpoint_url = "https://my-endpoint.huggingface.cloud", -#' id_var = id, -#' parallel = TRUE, -#' batch_size = NULL -#' ) -#' #' # Use batching without parallel processing #' embeddings_df <- hf_embed_df( #' df = df, #' text_var = text, #' endpoint_url = "https://my-endpoint.huggingface.cloud", -#' id_var = id, -#' parallel = FALSE, -#' batch_size = 10 +#' id_var = id #' ) #' -#' # Use both batching and parallel processing +#' # Use both chunking and parallel processing #' embeddings_df <- hf_embed_df( #' df = df, #' text_var = text, #' endpoint_url = "https://my-endpoint.huggingface.cloud", #' id_var = id, -#' parallel = TRUE, -#' batch_size = 10 +#' chunk_size = 10000, +#' concurrent_requests = 50 #' ) #' } # hf_embed_df docs ---- @@ -455,7 +445,7 @@ hf_embed_df <- function(df, endpoint_url, key_name, output_file = "auto", - chunk_size = 8L, + chunk_size = 5000L, concurrent_requests = 1L, max_retries = 5L, timeout = 15L, @@ -490,7 +480,8 @@ hf_embed_df <- function(df, chunk_size = chunk_size, concurrent_requests = concurrent_requests, max_retries = max_retries, - timeout = timeout + timeout = timeout, + output_file = output_file ) return(results) From 6eabc13927017b6ce0dd3f02b096fd04123fda1c Mon Sep 17 00:00:00 2001 From: jpcompartir Date: Thu, 6 Nov 2025 14:17:31 +0000 Subject: [PATCH 20/56] add the .handle_output_dir to start replacing .handle_output_file --- R/hf_embed.R | 42 ++++++++++++++++++++++++------------------ R/utils.R | 15 +++++++++++++-- man/hf_embed_chunks.Rd | 4 ++-- man/hf_embed_df.Rd | 30 ++++++++++-------------------- 4 files changed, 49 insertions(+), 42 deletions(-) diff --git a/R/hf_embed.R b/R/hf_embed.R index 66f5294..accfb22 100644 --- a/R/hf_embed.R +++ b/R/hf_embed.R @@ -246,7 +246,7 @@ hf_embed_batch <- function(texts, #' @param texts Character vector of texts to process #' @param ids Vector of unique identifiers corresponding to each text (same length as texts) #' @param endpoint_url Hugging Face Embedding Endpoint -#' @param output_file Path to .CSV file for results. "auto" generates the filename, location and is persistent across sessions. If NULL, generates timestamped filename. 
+#' @param output_dir Path to directory for the .parquet chunks #' @param chunk_size Number of texts to process in each chunk before writing to disk (default: 5000) #' @param concurrent_requests Number of concurrent requests (default: 5) #' @param max_retries Maximum retry attempts per failed request (default: 5) @@ -264,7 +264,7 @@ hf_embed_batch <- function(texts, hf_embed_chunks <- function(texts, ids, endpoint_url, - output_file = "auto", + output_dir = "auto", chunk_size = 5000L, concurrent_requests = 5L, max_retries = 5L, @@ -279,13 +279,19 @@ hf_embed_chunks <- function(texts, "chunk_size must be a positive integer greater than 1" = is.numeric(chunk_size) && chunk_size > 0 ) - output_file = .handle_output_filename(output_file, base_file_name = "hf_embeddings_batch") + # output_file = .handle_output_filename(output_file, base_file_name = "hf_embeddings_batch") + + output_dir <- .handle_output_directory(output_dir, base_dir_name = "hf_embeddings_batch") + + if (!dir.exists(output_dir)) { + dir.create(output_dir, recursive = TRUE) + } chunk_data <- batch_vector(seq_along(texts), chunk_size) n_chunks <- length(chunk_data$batch_indices) cli::cli_alert_info("Processing {length(texts)} text{?s} in {n_chunks} chunk{?s} of up to {chunk_size} each") - cli::cli_alert_info("Intermediate results will be saved to a .csv at {output_file}.") + cli::cli_alert_info("Intermediate results will be saved as parquet files in {output_dir}") total_success <- 0 total_failures <- 0 @@ -367,20 +373,16 @@ hf_embed_chunks <- function(texts, chunk_df <- dplyr::bind_rows(chunk_results) - if (nrow(chunk_df) > 0) { - if (chunk_num == 1) { - # if we're in the first chunk write to csv with headers (col names) - readr::write_csv(chunk_df, output_file, append = FALSE) - } else { - # all other chunks, append and don't use col names - readr::write_csv(chunk_df, output_file, append = TRUE, col_names = FALSE) - } + if (nrow(chunk_df) > 0) { + chunk_file <- glue::glue("{output_dir}/chunk_{stringr::str_pad(chunk_num, 3, pad = '0')}.parquet") + arrow::write_parquet(chunk_df, chunk_file) } cli::cli_alert_success("Chunk {chunk_num}: {n_successes} successful, {n_failures} failed") } - final_results <- readr::read_csv(output_file, show_col_types = FALSE) + final_results <- arrow::open_dataset(output_dir, format = "parquet") |> + dplyr::collect() return(final_results) } @@ -402,7 +404,7 @@ hf_embed_chunks <- function(texts, #' @param id_var Name of the column to use as ID #' @param endpoint_url The URL of the Hugging Face Inference API endpoint #' @param key_name Name of the environment variable containing the API key -#' @param output_file Path to .CSV file for results. "auto" generates the filename, location and is persistent across sessions. If NULL, generates timestamped filename. +#' @param output_dir Path to directory for the .parquet chunks #' @param chunk_size The size of each chunk that will be processed and then written to a file. #' @param concurrent_requests Number of requests to send at once. Some APIs do not allow for multiple requests. #' @param max_retries Maximum number of retry attempts for failed requests. 
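Because each chunk now lands in its own parquet file, a partially completed run can be recovered by hand. A sketch, assuming the output directory already holds `chunk_001.parquet`, `chunk_002.parquet`, and so on (the directory name is invented):

```r
# sketch: reassemble partial results from an interrupted run
library(arrow)
library(dplyr)

chunk_files <- list.files(
  "hf_embeddings_batch_18112025_113648",  # hypothetical auto-generated directory
  pattern = "\\.parquet$",
  full.names = TRUE
)

partial_results <- open_dataset(chunk_files, format = "parquet") |>
  collect()

# ids already present in partial_results can be dropped before re-running the rest
```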
@@ -444,7 +446,7 @@ hf_embed_df <- function(df, id_var, endpoint_url, key_name, - output_file = "auto", + output_dir = "auto", chunk_size = 5000L, concurrent_requests = 1L, max_retries = 5L, @@ -463,8 +465,11 @@ hf_embed_df <- function(df, "concurrent_requests must be an integer" = is.numeric(concurrent_requests) && concurrent_requests > 0 ) - output_file <- .handle_output_filename(output_file, - base_file_name = "hf_embeddings_batch") + + output_dir <- .handle_output_directory(output_dir, + base_dir_name = "hf_embeddings_batch") + # output_file <- .handle_output_filename(output_file, + # base_file_name = "hf_embeddings_batch") # refactoring to always use hf_embed_batch - if batch_size if one then it gets handled anyway, avoids branching and additional complexity. texts <- dplyr::pull(df, !!text_sym) @@ -481,10 +486,11 @@ hf_embed_df <- function(df, concurrent_requests = concurrent_requests, max_retries = max_retries, timeout = timeout, - output_file = output_file + output_dir = output_dir ) return(results) } + diff --git a/R/utils.R b/R/utils.R index 8d9e75e..5f153f3 100644 --- a/R/utils.R +++ b/R/utils.R @@ -260,9 +260,20 @@ extract_field <- function(api_response, field_name) { return(x) } +# modifying the .handle_output_filename to work with .parquet #' @keywords internal -.append_tibble_class <- function(x) { - attr(x, "class") <- c("tbl_df", "tbl", "data.frame") +.handle_output_directory <- function(x, base_dir_name = "batch_processing_") { + if (is.null(x)) { + return(tempfile(pattern = base_dir_name)) + } + + if(identical(x, "auto")) { + timestamp <- format(Sys.time(), "%d%m%Y_%H%M%S") + output_dir <- glue::glue("{base_dir_name}_{timestamp}") + return(output_dir) + } + + # Accept directory path directly return(x) } diff --git a/man/hf_embed_chunks.Rd b/man/hf_embed_chunks.Rd index ed307b9..353a203 100644 --- a/man/hf_embed_chunks.Rd +++ b/man/hf_embed_chunks.Rd @@ -8,7 +8,7 @@ hf_embed_chunks( texts, ids, endpoint_url, - output_file = "auto", + output_dir = "auto", chunk_size = 5000L, concurrent_requests = 5L, max_retries = 5L, @@ -23,7 +23,7 @@ hf_embed_chunks( \item{endpoint_url}{Hugging Face Embedding Endpoint} -\item{output_file}{Path to .CSV file for results. "auto" generates the filename, location and is persistent across sessions. If NULL, generates timestamped filename.} +\item{output_dir}{Path to directory for the .parquet chunks} \item{chunk_size}{Number of texts to process in each chunk before writing to disk (default: 5000)} diff --git a/man/hf_embed_df.Rd b/man/hf_embed_df.Rd index 8b4a227..12ec8a8 100644 --- a/man/hf_embed_df.Rd +++ b/man/hf_embed_df.Rd @@ -10,8 +10,8 @@ hf_embed_df( id_var, endpoint_url, key_name, - output_file = "auto", - chunk_size = 8L, + output_dir = "auto", + chunk_size = 5000L, concurrent_requests = 1L, max_retries = 5L, timeout = 15L, @@ -29,9 +29,9 @@ hf_embed_df( \item{key_name}{Name of the environment variable containing the API key} -\item{output_file}{Path to .CSV file for results. "auto" generates the filename, location and is persistent across sessions. If NULL, generates timestamped filename.} +\item{output_dir}{Path to directory for the .parquet chunks} -\item{chunk_size}{Number of texts to process in one batch (NULL for no batching)} +\item{chunk_size}{The size of each chunk that will be processed and then written to a file.} \item{concurrent_requests}{Number of requests to send at once. Some APIs do not allow for multiple requests.} @@ -49,6 +49,8 @@ High-level function to generate embeddings for texts in a data frame. 
This function handles the entire process from request creation to response processing, with options for batching & parallel execution. Setting the number of retries + +Avoid risk of data loss by setting a low-ish chunk_size (e.g. 5,000, 10,000). } \examples{ \dontrun{ @@ -58,34 +60,22 @@ Setting the number of retries text = c("First example", "Second example", "Third example") ) - # Use parallel processing without batching - embeddings_df <- hf_embed_df( - df = df, - text_var = text, - endpoint_url = "https://my-endpoint.huggingface.cloud", - id_var = id, - parallel = TRUE, - batch_size = NULL - ) - # Use batching without parallel processing embeddings_df <- hf_embed_df( df = df, text_var = text, endpoint_url = "https://my-endpoint.huggingface.cloud", - id_var = id, - parallel = FALSE, - batch_size = 10 + id_var = id ) - # Use both batching and parallel processing + # Use both chunking and parallel processing embeddings_df <- hf_embed_df( df = df, text_var = text, endpoint_url = "https://my-endpoint.huggingface.cloud", id_var = id, - parallel = TRUE, - batch_size = 10 + chunk_size = 10000, + concurrent_requests = 50 ) } } From 37559e851e6b476a058cd17dec9f3d0890b7d106 Mon Sep 17 00:00:00 2001 From: jpcompartir Date: Tue, 18 Nov 2025 10:25:54 +0000 Subject: [PATCH 21/56] bump version and add arrow to deps --- DESCRIPTION | 5 +++-- todos.qmd | 34 ++++++++++++++++++++++++++++++++++ 2 files changed, 37 insertions(+), 2 deletions(-) diff --git a/DESCRIPTION b/DESCRIPTION index 419c468..628d86a 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,6 +1,6 @@ Package: EndpointR Title: Connects to various Machine Learning inference providers -Version: 0.1.1 +Version: 0.1.2 Authors@R: person("Jack", "Penzer", , "Jack.penzer@sharecreative.com", role = c("aut", "cre")) Description: EndpointR is a 'batteries included', open-source R package for connecting to various APIs for Machine Learning model predictions. EndpointR is built for company-specific use cases, so may not be useful to a wide audience. 
@@ -32,7 +32,8 @@ Imports: tibble, S7, jsonvalidate, - readr + readr, + arrow VignetteBuilder: knitr Depends: R (>= 3.5) diff --git a/todos.qmd b/todos.qmd index 8a46920..e65bd47 100644 --- a/todos.qmd +++ b/todos.qmd @@ -2,9 +2,43 @@ # 0.1.2 +similarly to the upgrades applied in 0.1.1: + +``` +- [ ] `hf_embed_df()` + - [x] write to files + - [x] parquet + - [x] fix output dir + - [x] chunk_size + - [ ] Update tests +- [ ] `hf_classify_df()` + - [ ] write to files + - [ ] parquet + - [ ] fix output dir + - [ ] chunk_size +- [ ] `oai_complete_df` + - [x] write to files + - [ ] parquet + - [x] chunk_size +- [ ] `oai_complete_chunks` + - [ ] write to files + - [ ] parquet + - [ ] chunk_size +- [ ] `oai_embed_df` + - [ ] write to files + - [ ] parquet + - [ ] chunk_size +``` + - Update the `hf_embed_df()` and `hf_classify_df()` functions to output files for intermediate results + + - use parquet not csv + - adds arrow to deps + - Refactor to include an ID in the request building for hf\_ functions + - It's tempting to abstract the processing chunks + file writes code, but I think two separate batch loops is cleaner because whilst the hf_embed_df and oai_complete_df functions are sometimes structurally similar: + - oai_complete_df takes a schema, returns a complex result in cases or a single column - hf_embed_df returns a tibble with many columns - primary error handling needs are different - HF for the endpoint start up, oai for rate limits From 340aa1f6ce43358ea5e7a788a797c2f33f4843f0 Mon Sep 17 00:00:00 2001 From: jpcompartir Date: Tue, 18 Nov 2025 11:36:48 +0000 Subject: [PATCH 22/56] add metadata.json to the output_dir - will be useful in the future for debugging people's code/errors (including my own) --- R/hf_embed.R | 21 ++++++++++++++++++++- 1 file changed, 20 insertions(+), 1 deletion(-) diff --git a/R/hf_embed.R b/R/hf_embed.R index accfb22..e9320b9 100644 --- a/R/hf_embed.R +++ b/R/hf_embed.R @@ -235,6 +235,7 @@ hf_embed_batch <- function(texts, } +# hf_embed_chunks docs ---- #' Embed text chunks through Hugging Face Inference Embedding Endpoints #' #' This function is capable of processing large volumes of text through Hugging Face's Inference Embedding Endpoints. Results are written in chunks to a file, to avoid out of memory issues. @@ -261,6 +262,7 @@ hf_embed_batch <- function(texts, #' - Embedding columns (V1, V2, etc.) 
#' @export #' +# hf_embed_chunks docs ---- hf_embed_chunks <- function(texts, ids, endpoint_url, @@ -290,6 +292,21 @@ hf_embed_chunks <- function(texts, chunk_data <- batch_vector(seq_along(texts), chunk_size) n_chunks <- length(chunk_data$batch_indices) + # write/store imoortant metadata in the output dir + metadata <- list( + endpoint_url = endpoint_url, + chunk_size = chunk_size, + n_texts = length(texts), + concurrent_requests = concurrent_requests, + timeout = timeout, + output_dir = output_dir, + key_name = key_name, + n_chunks = n_chunks, + timestamp = Sys.time() + ) + + jsonlite::write_json(metadata, file.path(output_dir, "metadata.json"), auto_unbox = TRUE, pretty = TRUE) + cli::cli_alert_info("Processing {length(texts)} text{?s} in {n_chunks} chunk{?s} of up to {chunk_size} each") cli::cli_alert_info("Intermediate results will be saved as parquet files in {output_dir}") @@ -381,7 +398,9 @@ hf_embed_chunks <- function(texts, cli::cli_alert_success("Chunk {chunk_num}: {n_successes} successful, {n_failures} failed") } - final_results <- arrow::open_dataset(output_dir, format = "parquet") |> + parquet_files <- list.files(output_dir, pattern = "\\.parquet$", full.names = TRUE) + + final_results <- arrow::open_dataset(parquet_files, format = "parquet") |> dplyr::collect() return(final_results) From 0179d55a7b08dccbcdd848d63dbc03c71ea26a9f Mon Sep 17 00:00:00 2001 From: jpcompartir Date: Tue, 18 Nov 2025 12:07:10 +0000 Subject: [PATCH 23/56] add final alert for hf_embed_df function reporting number of successes/failures --- R/hf_embed.R | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/R/hf_embed.R b/R/hf_embed.R index e9320b9..b46f7a0 100644 --- a/R/hf_embed.R +++ b/R/hf_embed.R @@ -305,7 +305,10 @@ hf_embed_chunks <- function(texts, timestamp = Sys.time() ) - jsonlite::write_json(metadata, file.path(output_dir, "metadata.json"), auto_unbox = TRUE, pretty = TRUE) + jsonlite::write_json(metadata, + file.path(output_dir, "metadata.json"), + auto_unbox = TRUE, + pretty = TRUE) cli::cli_alert_info("Processing {length(texts)} text{?s} in {n_chunks} chunk{?s} of up to {chunk_size} each") cli::cli_alert_info("Intermediate results will be saved as parquet files in {output_dir}") @@ -400,6 +403,7 @@ hf_embed_chunks <- function(texts, parquet_files <- list.files(output_dir, pattern = "\\.parquet$", full.names = TRUE) + cli::cli_alert_info("Processing completed, there were {total_success} successes\n and {total_failures} failures.") final_results <- arrow::open_dataset(parquet_files, format = "parquet") |> dplyr::collect() @@ -416,7 +420,7 @@ hf_embed_chunks <- function(texts, #' response processing, with options for batching & parallel execution. #' Setting the number of retries #' -#' Avoid risk of data loss by setting a low-ish chunk_size (e.g. 5,000, 10,000). +#' Avoid risk of data loss by setting a low-ish chunk_size (e.g. 5,000, 10,000). Each chunk is written to a `.parquet` file in the `output_dir=` directory, which also contains a `metadata.json` file which tracks important information such as the endpoint URL used. Be sure to check any output directories into .gitignore! 
#' #' @param df A data frame containing texts to embed #' @param text_var Name of the column containing text to embed From 4db719046f6bf1d395e44d1e8131eada2f93adec Mon Sep 17 00:00:00 2001 From: jpcompartir Date: Tue, 18 Nov 2025 16:55:14 +0000 Subject: [PATCH 24/56] start the hf_classify_chunks func --- .gitignore | 4 +--- R/hf_classify.R | 24 ++++++++++++++++++++++-- man/hf_embed_df.Rd | 2 +- 3 files changed, 24 insertions(+), 6 deletions(-) diff --git a/.gitignore b/.gitignore index 433a50a..c043856 100644 --- a/.gitignore +++ b/.gitignore @@ -36,12 +36,10 @@ rsconnect/ .Rproj.user inst/doc EndpointR.Rproj - *.html *_dev_files* - dev_docs/project_test_run.qmd docs - # testing /dev_docs/ artifacts *.csv +test_dir diff --git a/R/hf_classify.R b/R/hf_classify.R index 432ff70..0eea701 100644 --- a/R/hf_classify.R +++ b/R/hf_classify.R @@ -91,7 +91,7 @@ tidy_batch_classification_response <- function(response) { return(results) } -# hf_classify_docs ---- +# hf_classify_text_docs ---- #' Classify text using a Hugging Face Inference API endpoint #' #' @description @@ -153,7 +153,7 @@ tidy_batch_classification_response <- function(response) { #' tidy = FALSE #' ) #' } -# hf_classify_docs ---- +# hf_classify_text docs ---- hf_classify_text <- function(text, endpoint_url, key_name, @@ -206,6 +206,7 @@ hf_classify_text <- function(text, } +# hf_classify_batch docs ---- #' Classify multiple texts using Hugging Face Inference Endpoints #' #' @description @@ -261,6 +262,7 @@ hf_classify_text <- function(text, #' batch_size = 3 #' ) #' } +# hf_classify_batch docs ---- hf_classify_batch <- function(texts, endpoint_url, key_name, @@ -359,7 +361,25 @@ hf_classify_batch <- function(texts, return(result) } +# hf_classify_chunks docs ---- +#' @param output_dir Path to directory for the .parquet chunks +# hf_classify_chunks docs ---- +hf_classify_chunks <- function(texts, + ids, + endpoint_url, + ..., + tidy_func = tidy_classification_response, + output_dir = "auto", + chunk_size = 5000L, + concurrent_requests = 5L, + max_retries = 5L, + timeout = 30L, + include_texts = TRUE, + relocate_col = 2, + key_name = "HF_API_KEY" +) { +} #' Classify a data frame of texts using Hugging Face Inference Endpoints #' #' @description diff --git a/man/hf_embed_df.Rd b/man/hf_embed_df.Rd index 12ec8a8..ebd69e4 100644 --- a/man/hf_embed_df.Rd +++ b/man/hf_embed_df.Rd @@ -50,7 +50,7 @@ This function handles the entire process from request creation to response processing, with options for batching & parallel execution. Setting the number of retries -Avoid risk of data loss by setting a low-ish chunk_size (e.g. 5,000, 10,000). +Avoid risk of data loss by setting a low-ish chunk_size (e.g. 5,000, 10,000). Each chunk is written to a \code{.parquet} file in the \verb{output_dir=} directory, which also contains a \code{metadata.json} file which tracks important information such as the endpoint URL used. Be sure to check any output directories into .gitignore! 
}
\examples{
\dontrun{
  df <- data.frame(
    id = 1:3,
    text = c("First example", "Second example", "Third example")
  )

  # Use batching without parallel processing
  embeddings_df <- hf_embed_df(
    df = df,
    text_var = text,
    endpoint_url = "https://my-endpoint.huggingface.cloud",
    id_var = id
  )

  # Use both chunking and parallel processing
  embeddings_df <- hf_embed_df(
    df = df,
    text_var = text,
    endpoint_url = "https://my-endpoint.huggingface.cloud",
    id_var = id,
    chunk_size = 10000,
    concurrent_requests = 50
  )
}
}

From 0236ef568ebed1619f649945aec584da0387f3fe Mon Sep 17 00:00:00 2001
From: jpcompartir
Date: Tue, 18 Nov 2025 16:55:32 +0000
Subject: [PATCH 25/56] add input val to hf_classify_chunks

---
 R/hf_classify.R | 20 ++++++++++++++++++++
 1 file changed, 20 insertions(+)

diff --git a/R/hf_classify.R b/R/hf_classify.R
index 0eea701..a4efb14 100644
--- a/R/hf_classify.R
+++ b/R/hf_classify.R
@@ -379,6 +379,26 @@ hf_classify_chunks <- function(texts,
                                key_name = "HF_API_KEY"
 ) {

+  # input validation ----
+  if (length(texts) == 0) {
+    cli::cli_abort("Input 'texts' is empty or NULL.")
+  }
+
+  if (length(texts) == 1) {
+    cli::cli_abort("Function expects a batch of inputs, use `hf_classify_text` for single texts.")
+  }
+
+
+  stopifnot(
+    "Texts must be a list or vector" = is.vector(texts),
+    "batch_size must be a positive integer" = is.numeric(chunk_size) && chunk_size > 0 && chunk_size == as.integer(chunk_size),
+    "concurrent_requests must be a positive integer" = is.numeric(concurrent_requests) && concurrent_requests > 0 && concurrent_requests == as.integer(concurrent_requests),
+    "max_retries must be a positive integer" = is.numeric(max_retries) && max_retries >= 0 && max_retries == as.integer(max_retries),
+    "timeout must be a positive integer" = is.numeric(timeout) && timeout > 0,
+    "endpoint_url must be a non-empty string" = is.character(endpoint_url) && nchar(endpoint_url) > 0,
+    "key_name must be a non-empty string" = is.character(key_name) && nchar(key_name) > 0
+  )
+
 }
 #' Classify a data frame of texts using Hugging Face Inference Endpoints
 #'
 #' @description

From b2e7c35532b19d61ee39a09dcae90cabe8de3e62 Mon Sep 17 00:00:00 2001
From: jpcompartir
Date: Tue, 18 Nov 2025 16:56:00 +0000
Subject: [PATCH 26/56] metadata and alerts etc.
for hf_classify_chunks --- R/hf_classify.R | 33 +++++++++++++++++++++++++++++++++ 1 file changed, 33 insertions(+) diff --git a/R/hf_classify.R b/R/hf_classify.R index a4efb14..fa3a023 100644 --- a/R/hf_classify.R +++ b/R/hf_classify.R @@ -399,6 +399,39 @@ hf_classify_chunks <- function(texts, "key_name must be a non-empty string" = is.character(key_name) && nchar(key_name) > 0 ) + # core logic ---- + output_dir <- .handle_output_directory(output_dir, base_dir_name = "hf_classify_chunk") + + if (!dir.exists(output_dir)) { + dir.create(output_dir, recursive = TRUE) + } + + chunk_data <- batch_vector(seq_along(texts), chunk_size) + n_chunks <- length(chunk_data$batch_indices) + + metadata <- list( + endpoint_url = endpoint_url, + chunk_size = chunk_size, + n_texts = length(texts), + concurrent_requests = concurrent_requests, + timeout = timeout, + output_dir = output_dir, + key_name = key_name, + n_chunks = n_chunks, + timestamp = Sys.time() + ) + + jsonlite::write_json(metafata, + file.path(output_dir, "metadata.json"), + auto_unbox = TRUE, + pretty = TRUE) + + cli::cli_alert_info("Processing {length(texts)} text{?s} in {n_chunks} chunk{?s} of up to {chunk_size} rows per chunk") + cli::cli_alert_info("Intermediate results and metadata will be saved as .parquet files and .json in {output_dir}") + + # track global successes for failures for end-of-pipeline reporting + total_successes <- 0 + total_failure <- -0 } #' Classify a data frame of texts using Hugging Face Inference Endpoints #' From 33d3ceb1f2d37dbd02192d7b4b8b2f21bdc48564 Mon Sep 17 00:00:00 2001 From: jpcompartir Date: Tue, 18 Nov 2025 17:10:30 +0000 Subject: [PATCH 27/56] fix typos in hf_classify_chunks --- R/hf_classify.R | 108 ++++++++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 105 insertions(+), 3 deletions(-) diff --git a/R/hf_classify.R b/R/hf_classify.R index fa3a023..0a63f97 100644 --- a/R/hf_classify.R +++ b/R/hf_classify.R @@ -391,7 +391,9 @@ hf_classify_chunks <- function(texts, stopifnot( "Texts must be a list or vector" = is.vector(texts), - "batch_size must be a positive integer" = is.numeric(chunk_size) && chunk_size > 0 && chunk_size == as.integer(chunk_size), + "ids must be a vector" = is.vector(ids), + "texts and ids must be the same length" = length(texts) == length(ids), + "chunk_size must be a positive integer" = is.numeric(chunk_size) && chunk_size > 0 && chunk_size == as.integer(chunk_size), "concurrent_requests must be a positive integer" = is.numeric(concurrent_requests) && concurrent_requests > 0 && concurrent_requests == as.integer(concurrent_requests), "max_retries must be a positive integer" = is.numeric(max_retries) && max_retries >= 0 && max_retries == as.integer(max_retries), "timeout must be a positive integer" = is.numeric(timeout) && timeout > 0, @@ -421,7 +423,7 @@ hf_classify_chunks <- function(texts, timestamp = Sys.time() ) - jsonlite::write_json(metafata, + jsonlite::write_json(metadata, file.path(output_dir, "metadata.json"), auto_unbox = TRUE, pretty = TRUE) @@ -431,8 +433,107 @@ hf_classify_chunks <- function(texts, # track global successes for failures for end-of-pipeline reporting total_successes <- 0 - total_failure <- -0 + total_failures <- 0 + + for (chunk_num in seq_along(chunk_data$batch_indices)) { + chunk_indices <- chunk_data$batch_indices[[chunk_num]] + chunk_texts <- texts[chunk_indices] + chunk_ids <- ids[chunk_indices] + + cli::cli_progress_message("Classifying chunk {chunk_num}/{n_chunks} ({length(chunk_indices)} text{?s})") + + requests <- purrr::map2( + .x = 
chunk_texts, + .y = chunk_ids, + .f = \(x, y) hf_build_request( + input = x, + endpoint_url = endpoint_url, + endpointr_id = y, + key_name = key_name, + parameters = list(return_all_scores = TRUE), + max_retries = max_retries, + timeout = timeout, + validate = FALSE + ) + ) + + is_valid_request <- purrr::map_lgl(requests, \(x) inherits(x, "httr2_request")) + + valid_requests <- requests[is_valid_request] + + if (length(valid_requests) == 0) { + cli::cli_alert_warning("No valid request{?s} in chunk {chunk_num}, skipping") + next + } + + responses <- perform_requests_with_strategy( + valid_requests, + concurrent_requests = concurrent_requests, + progress = TRUE + ) + + chunk_successes <- httr2::resps_successes(responses) + chunk_failures <- httr2::resps_failures(responses) + + n_chunk_successes <- length(chunk_successes) + n_chunk_failures <- length(chunk_failures) + + total_successes <- total_successes + n_chunk_successes + total_failures <- total_failures + n_chunk_failures + + chunk_results <- list() + + if (n_chunk_successes > 0) { + + successes_ids <- purrr::map(chunk_successes, \(x) purrr::pluck(x, "request", "headers", "endpointr_id")) |> + unlist() + + successes_content <- purrr::map(chunk_successes, tidy_func) |> + purrr::list_rbind() + + chunk_results$successes <- tibble::tibble( + id = successes_ids, + .error = FALSE, + .error_msg = NA_character_, + .chunk = chunk_num + ) |> + dplyr::bind_cols(successes_content) + + } + + if (n_chunk_failures > 0) { + + failures_ids <- purrr::map(chunk_failures, \(x) purrr::pluck(x, "request", "headers", "endpointr_id")) |> unlist() + failures_msgs <- purrr::map_chr(chunk_failures, \(x) purrr::pluck(x, "message", .default = "Unknown error")) + + chunk_results$failures <- tibble::tibble( + id = failures_ids, + .error = TRUE, + .error_msg = failures_msgs, + .chunk = chunk_num + ) + } + + chunk_df <- dplyr::bind_rows(chunk_results) + + if (nrow(chunk_df) > 0) { + chunk_file <- glue::glue("{output_dir}/chunk_{stringr::str_pad(chunk_num, 3, pad = '0')}.parquet") + arrow::write_parquet(chunk_df, chunk_file) + } + + cli::cli_alert_success("Chunk {chunk_num}: {n_chunk_successes} successful, {n_chunk_failures} failed") + } + + parquet_files <- list.files(output_dir, pattern = "\\.parquet$", full.names = TRUE) + + cli::cli_alert_info("Processing completed, there were {total_successes} successes\n and {total_failures} failures.") + final_results <- arrow::open_dataset(parquet_files, format = "parquet") |> + dplyr::collect() + + return(final_results) } + +# hf_classify_df docs ---- #' Classify a data frame of texts using Hugging Face Inference Endpoints #' #' @description @@ -487,6 +588,7 @@ hf_classify_chunks <- function(texts, #' key_name = "API_KEY" #' ) #' } +# hf_classify_df docs ---- hf_classify_df <- function(df, text_var, id_var, From a0e8cc82d85380d0b3e2c25f8093fdc8133584fc Mon Sep 17 00:00:00 2001 From: jpcompartir Date: Wed, 19 Nov 2025 11:22:07 +0000 Subject: [PATCH 28/56] add texts to failures and successes for the hf_classify_chunks function --- .gitignore | 1 + NAMESPACE | 1 + R/hf_classify.R | 32 ++++++++++++++++++++++++---- man/hf_classify_chunks.Rd | 45 +++++++++++++++++++++++++++++++++++++++ 4 files changed, 75 insertions(+), 4 deletions(-) create mode 100644 man/hf_classify_chunks.Rd diff --git a/.gitignore b/.gitignore index c043856..c95a029 100644 --- a/.gitignore +++ b/.gitignore @@ -43,3 +43,4 @@ docs # testing /dev_docs/ artifacts *.csv test_dir +metadata_test_dir diff --git a/NAMESPACE b/NAMESPACE index d761a6e..edfa5b0 100644 --- 
a/NAMESPACE +++ b/NAMESPACE @@ -6,6 +6,7 @@ export(hf_build_request) export(hf_build_request_batch) export(hf_build_request_df) export(hf_classify_batch) +export(hf_classify_chunks) export(hf_classify_df) export(hf_classify_text) export(hf_embed_batch) diff --git a/R/hf_classify.R b/R/hf_classify.R index 0a63f97..016edef 100644 --- a/R/hf_classify.R +++ b/R/hf_classify.R @@ -362,8 +362,30 @@ hf_classify_batch <- function(texts, } # hf_classify_chunks docs ---- -#' @param output_dir Path to directory for the .parquet chunks + # hf_classify_chunks docs ---- + +#' Title +#' +#' @param texts Character vector of texts to classify +#' @param ids +#' @param endpoint_url +#' @param ... +#' @param tidy_func Function to process API responses, defaults to +#' `tidy_classification_response` +#' @param output_dir Path to directory for the .parquet chunks +#' @param chunk_size +#' @param concurrent_requests +#' @param max_retries +#' @param timeout +#' @param include_texts +#' @param relocate_col +#' @param key_name Name of environment variable containing the API key +#' +#' @returns +#' @export +#' +#' @examples hf_classify_chunks <- function(texts, ids, endpoint_url, @@ -374,8 +396,6 @@ hf_classify_chunks <- function(texts, concurrent_requests = 5L, max_retries = 5L, timeout = 30L, - include_texts = TRUE, - relocate_col = 2, key_name = "HF_API_KEY" ) { @@ -487,12 +507,13 @@ hf_classify_chunks <- function(texts, successes_ids <- purrr::map(chunk_successes, \(x) purrr::pluck(x, "request", "headers", "endpointr_id")) |> unlist() - + successes_texts <- purrr::map(chunk_successes, \(x) purrr::pluck(x, "request", "body", "data", "inputs")) |> unlist() successes_content <- purrr::map(chunk_successes, tidy_func) |> purrr::list_rbind() chunk_results$successes <- tibble::tibble( id = successes_ids, + text = successes_texts, .error = FALSE, .error_msg = NA_character_, .chunk = chunk_num @@ -504,10 +525,13 @@ hf_classify_chunks <- function(texts, if (n_chunk_failures > 0) { failures_ids <- purrr::map(chunk_failures, \(x) purrr::pluck(x, "request", "headers", "endpointr_id")) |> unlist() + failures_texts <- purrr::map_chr(chunk_failures, \(x) purrr::pluck(x, "request", "body", "data", "inputs")) |> unlist() failures_msgs <- purrr::map_chr(chunk_failures, \(x) purrr::pluck(x, "message", .default = "Unknown error")) + chunk_results$failures <- tibble::tibble( id = failures_ids, + text = failures_texts, .error = TRUE, .error_msg = failures_msgs, .chunk = chunk_num diff --git a/man/hf_classify_chunks.Rd b/man/hf_classify_chunks.Rd new file mode 100644 index 0000000..6fe9246 --- /dev/null +++ b/man/hf_classify_chunks.Rd @@ -0,0 +1,45 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/hf_classify.R +\name{hf_classify_chunks} +\alias{hf_classify_chunks} +\title{Title} +\usage{ +hf_classify_chunks( + texts, + ids, + endpoint_url, + ..., + tidy_func = tidy_classification_response, + output_dir = "auto", + chunk_size = 5000L, + concurrent_requests = 5L, + max_retries = 5L, + timeout = 30L, + key_name = "HF_API_KEY" +) +} +\arguments{ +\item{texts}{Character vector of texts to classify} + +\item{ids}{Vector of unique identifiers corresponding to each text (same length as texts)} + +\item{endpoint_url}{Hugging Face Embedding Endpoint} + +\item{tidy_func}{Function to process API responses, defaults to +\code{tidy_classification_response}} + +\item{output_dir}{Path to directory for the .parquet chunks} + +\item{chunk_size}{Number of texts to process in each chunk before writing to disk (default: 
5000)} + +\item{concurrent_requests}{Integer; number of concurrent requests (default: 5)} + +\item{max_retries}{Integer; maximum retry attempts (default: 5)} + +\item{timeout}{Numeric; request timeout in seconds (default: 20)} + +\item{key_name}{Name of environment variable containing the API key} +} +\description{ +Title +} From 8edc8cd757e3a894f3c1e0cda7faafd081060bf1 Mon Sep 17 00:00:00 2001 From: jpcompartir Date: Wed, 19 Nov 2025 14:46:53 +0000 Subject: [PATCH 29/56] add the hf_classify_dev (with encrypted endpoints) --- R/hf_embed.R | 4 +-- dev_docs/hf_classify_dev.qmd | 65 ++++++++++++++++++++++++++++++++++++ 2 files changed, 67 insertions(+), 2 deletions(-) diff --git a/R/hf_embed.R b/R/hf_embed.R index b46f7a0..12cc497 100644 --- a/R/hf_embed.R +++ b/R/hf_embed.R @@ -365,7 +365,7 @@ hf_embed_chunks <- function(texts, # within chunk results ---- chunk_results <- list() - if (length(successes) > 0) { + if(n_successes > 0) { successes_ids <- purrr::map(successes, \(x) purrr::pluck(x, "request", "headers", "endpointr_id")) |> unlist() successes_content <- purrr::map(successes, tidy_embedding_response) |> purrr::list_rbind() @@ -379,7 +379,7 @@ hf_embed_chunks <- function(texts, dplyr::bind_cols(successes_content) } - if (length(failures) > 0) { + if (n_failures > 0) { failures_ids <- purrr::map(failures, \(x) purrr::pluck(x, "request", "headers", "endpointr_id")) |> unlist() failures_msgs <- purrr::map_chr(failures, \(x) purrr::pluck(x, "message", .default = "Unknown error")) diff --git a/dev_docs/hf_classify_dev.qmd b/dev_docs/hf_classify_dev.qmd index 9dc61b5..a747644 100644 --- a/dev_docs/hf_classify_dev.qmd +++ b/dev_docs/hf_classify_dev.qmd @@ -485,3 +485,68 @@ hf_classify_df( concurrent_requests = 4 ) ``` + +# Writing to Files + +November, 2025 + +```{r} +hf_cl +``` + +# Truncation and Padding + +```{r} +library(tidyverse) +library(EndpointR) +library(arrow) + +x <- read_parquet("~/dev/projects/diageo/mltm/data/main/mltm_l3_trend_df.parquet") + +endpointr_test_url <- httr2::secret_decrypt("osXYmCFrfPR4NsIIOBOIwWzp8XdA77BVP5QKVwJb0jzHPUwgQ8NmZcHXrhvoYrR8a2QNXZjmwIHpW2X55Ivaa_y11RV5TgXOiD7aK3O8hbDoeQ", "ENDPOINTR_KEY") + +test_sent_url <- httr2::secret_decrypt("STCYZqL6e2yxfGaz5AWA80pnINnfX-47vsqEcxpm23IkxE1R8238Gtnp7oeRUQ6GxkF6jWUYYmiSjIh-Abo1cWOe23qUsZ7uSy5UE9BwvJ9oWg", "ENDPOINTR_KEY") +``` + +```{r} +should_pass <- x |> + filter(language == "Spanish (Español)") |> + slice(10:15) |> + mutate(message = str_sub(message, 1, 100)) |> + filter(message != "", !is.na(message)) + +pass_results <- hf_classify_chunks( + should_pass$message, + should_pass$universal_message_id, + tidy_func = tidy_classification_response, + endpoint_url = test_sent_url, + output_dir = "test_dir/test_classify/test_passes", + chunk_size = 5, + concurrent_requests = 5, + timeout = 60, + max_retries = 10 +) + + +``` + +```{r} +should_fail <- x |> + filter(language == "Arabic (ạlʿrbyẗ)") |> + slice(100:110) |> + # mutate(message = str_sub(message, 1, 180)) |> + filter(message != "", !is.na(message)) + +fail_results <- hf_classify_chunks( + should_fail$message, + should_fail$universal_message_id, + tidy_func = tidy_classification_response, + endpoint_url = test_sent_url, + output_dir = "test_dir/test_classify/test_failures", + chunk_size = 5, + concurrent_requests = 5, + timeout = 60, + max_retries = 10 +) + +``` From f23851f68b0afc2951a86fc65a3052ffcaa368f8 Mon Sep 17 00:00:00 2001 From: jpcompartir Date: Wed, 19 Nov 2025 15:27:49 +0000 Subject: [PATCH 30/56] add the hf_get_model_max_length function for 
amendments to hf_classify and later hf_embed

---
 R/utils.R | 34 ++++++++++++++++++++++++++++++++++
 1 file changed, 34 insertions(+)

diff --git a/R/utils.R b/R/utils.R
index 5f153f3..ec0fa3e 100644
--- a/R/utils.R
+++ b/R/utils.R
@@ -283,3 +283,37 @@ parse_oai_date <- function(date_string) {
   date <- as.Date(parsed_date)
   return(date)
 }
+
+
+
+#' Check the max number of tokens allowed for your inputs
+#'
+#' This function requires the model to have 'tokenizer_config.json' file with a
+#' `model_max_length` key, otherwise it will error.
+#'
+#' @param model_name name of the model e.g. 'sentence-transformers/mpnet-base-v2'
+#' @param api_key Your Hugging Face auth token
+#'
+#' @returns
+#' @export
+#'
+hf_get_model_max_length <- function(model_name, api_key = "HF_API_KEY") {
+  config_url <- glue::glue("https://huggingface.co/{model_name}/resolve/main/tokenizer_config.json")
+
+  use_api_key <- get_api_key(api_key)
+
+  req <- httr2::request(config_url)
+
+  if (!is.null(use_api_key)) {
+    req <- req |>
+      httr2::req_headers(Authorization = paste("Bearer", use_api_key))
+  }
+
+  response <- req |> httr2::req_perform()
+
+  tokenizer_config <- response |>
+    httr2::resp_body_string() |>
+    jsonlite::fromJSON()
+
+  return(tokenizer_config$model_max_length)
+}

From 6d984660e5479842a2233b4a54aeb2d6b222f346 Mon Sep 17 00:00:00 2001
From: jpcompartir
Date: Wed, 19 Nov 2025 16:29:02 +0000
Subject: [PATCH 31/56] write inference parameters to metadata in hf_embed_df
 add max_length to hf_embed_chunks and hf_embed_df

---
 R/hf_embed.R | 23 +++++++++++++++++------
 1 file changed, 17 insertions(+), 6 deletions(-)

diff --git a/R/hf_embed.R b/R/hf_embed.R
index 12cc497..79cc73b 100644
--- a/R/hf_embed.R
+++ b/R/hf_embed.R
@@ -266,6 +267,7 @@ hf_embed_chunks <- function(texts,
 hf_embed_chunks <- function(texts,
                             ids,
                             endpoint_url,
+                            max_length = 8192L,
                             output_dir = "auto",
                             chunk_size = 5000L,
                             concurrent_requests = 5L,
@@ -278,9 +279,12 @@ hf_embed_chunks <- function(texts,
     "texts must be a vector" = is.vector(texts),
     "ids must be a vector" = is.vector(ids),
     "texts and ids must be the same length" = length(texts) == length(ids),
-    "chunk_size must be a positive integer greater than 1" = is.numeric(chunk_size) && chunk_size > 0
+    "chunk_size must be a positive integer greater than 1" = is.numeric(chunk_size) && chunk_size > 0,
+    "max_length must be a positive integer greater than 1" = is.numeric(max_length) && max_length > 0
   )

+  max_length = as.integer(max_length) # type conversion to be extra safe as it's feeding to Py
+
   # output_file = .handle_output_filename(output_file, base_file_name = "hf_embeddings_batch")

   output_dir <- .handle_output_directory(output_dir, base_dir_name = "hf_embeddings_batch")
@@ -292,6 +296,9 @@
   chunk_data <- batch_vector(seq_along(texts), chunk_size)
   n_chunks <- length(chunk_data$batch_indices)

+  inference_parameters = list(truncation = TRUE,
+                              max_length = max_length)
+
   # write/store imoortant metadata in the output dir
   metadata <- list(
     endpoint_url = endpoint_url,
@@ -302,7 +309,8 @@
     output_dir = output_dir,
     key_name = key_name,
     n_chunks = n_chunks,
-    timestamp = Sys.time()
+    timestamp = Sys.time(),
+    inference_parameters = inference_parameters
   )

   jsonlite::write_json(metadata,
@@ -317,8 +325,8 @@
   total_failures <- 0

   ## Chunk Processing ----
-  for (chunk_num in seq_along(chunk_data$batch_indices))
-  {
+  for (chunk_num in seq_along(chunk_data$batch_indices)) {
+
     chunk_indices <-
chunk_data$batch_indices[[chunk_num]] chunk_texts <- texts[chunk_indices] chunk_ids <- ids[chunk_indices] @@ -333,7 +341,7 @@ hf_embed_chunks <- function(texts, endpoint_url = endpoint_url, endpointr_id = y, key_name = key_name, - parameters = list(), + parameters = list(inference_parameters), max_retries = max_retries, timeout = timeout, validate = FALSE @@ -427,6 +435,7 @@ hf_embed_chunks <- function(texts, #' @param id_var Name of the column to use as ID #' @param endpoint_url The URL of the Hugging Face Inference API endpoint #' @param key_name Name of the environment variable containing the API key +#' @param max_length The maximum number of tokens in the text variable. Beyond this cut-off everything is truncated. #' @param output_dir Path to directory for the .parquet chunks #' @param chunk_size The size of each chunk that will be processed and then written to a file. #' @param concurrent_requests Number of requests to send at once. Some APIs do not allow for multiple requests. @@ -469,6 +478,7 @@ hf_embed_df <- function(df, id_var, endpoint_url, key_name, + max_length = 8192L, output_dir = "auto", chunk_size = 5000L, concurrent_requests = 1L, @@ -509,7 +519,8 @@ hf_embed_df <- function(df, concurrent_requests = concurrent_requests, max_retries = max_retries, timeout = timeout, - output_dir = output_dir + output_dir = output_dir, + max_length = max_length ) return(results) From 7ed5fba52ddb6f560e2eeb09019e66da4f9abfa7 Mon Sep 17 00:00:00 2001 From: jpcompartir Date: Wed, 19 Nov 2025 16:29:55 +0000 Subject: [PATCH 32/56] add max_length parameter to hf_classify_df and write inference parameters to metadata --- NAMESPACE | 1 + NEWS.md | 17 +++++++- R/hf_classify.R | 49 +++++++++++++--------- R/utils.R | 2 +- dev_docs/hf_classify_dev.qmd | 74 ++++++++++++++++++++++++---------- man/hf_classify_chunks.Rd | 19 +++++++-- man/hf_embed_chunks.Rd | 1 + man/hf_embed_df.Rd | 3 ++ man/hf_get_model_max_length.Rd | 20 +++++++++ 9 files changed, 140 insertions(+), 46 deletions(-) create mode 100644 man/hf_get_model_max_length.Rd diff --git a/NAMESPACE b/NAMESPACE index edfa5b0..117efe3 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -13,6 +13,7 @@ export(hf_embed_batch) export(hf_embed_chunks) export(hf_embed_df) export(hf_embed_text) +export(hf_get_model_max_length) export(hf_perform_request) export(json_dump) export(json_schema) diff --git a/NEWS.md b/NEWS.md index 882260d..dd53966 100644 --- a/NEWS.md +++ b/NEWS.md @@ -1,6 +1,20 @@ # Endpointr 0.1.2 -- [ ] `hf_embed_df()`, `hf_classify_df()` improved to write to files similarly to the upgrades applied in 0.1qq.1 +- We extend the coverage of writing to files to the `hf_embed_df()`, `hf_classify_df()` functions and move to a chunk_size argument rather than batch_size. + +- [ ] `hf_embed_df()`, `hf_classify_df()` improved to write to files similarly to the upgrades applied in 0.1.1 + + - [ ] .parquet files + +- [ ] `oai_complete_df`, `oai_complete_chunks`, `oai_embed_df` all write to .parquet files + + - [ ] Move to chunk_size argument + +- The package takes a dependency on arrow over read_csv, this enables faster writing and reading of files once stored + +- max_length added to `hf_classify` and `hf_embed` functions + +- `hf_get_model_max_length()` function introduced to make it easier to set the max_length argument in `hf_*` functions. 
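A sketch of how the last two items are meant to combine: look the limit up once with `hf_get_model_max_length()` (patch 30 above), then pass it through as `max_length`. The model name and data frame here are illustrative:

```r
# query the tokenizer config for its model_max_length, then truncate to it
max_len <- hf_get_model_max_length("sentence-transformers/all-mpnet-base-v2")

embeddings <- hf_embed_df(
  df = my_df,                # hypothetical data frame with `id` and `text` columns
  text_var = text,
  id_var = id,
  endpoint_url = "https://my-endpoint.huggingface.cloud",  # hypothetical endpoint
  key_name = "HF_API_KEY",
  max_length = max_len       # inputs beyond this token count are truncated
)
```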
# EndpointR 0.1.1

@@ -15,3 +29,4 @@ Initial BETA release, ships with:
- Support for text completion using OpenAI models via the Chat Completions API
- Support for embeddings with the OpenAI Embeddings API
- Structured outputs via JSON schemas and validators
+
diff --git a/R/hf_classify.R b/R/hf_classify.R
index 016edef..744b241 100644
--- a/R/hf_classify.R
+++ b/R/hf_classify.R
@@ -362,34 +362,38 @@ hf_classify_batch <- function(texts,
}

# hf_classify_chunks docs ----
-
-# hf_classify_chunks docs ----
-
-#' Title
+#' Efficiently classify vectors of text in chunks
+#'
+#' TODO - description
+#'
+#' TODO - details
+#'
#'
#' @param texts Character vector of texts to classify
-#' @param ids
-#' @param endpoint_url
-#' @param ...
+#' @param ids Vector of unique identifiers corresponding to each text (same length as texts)
+#' @param endpoint_url Hugging Face Embedding Endpoint
+#' @param max_length The maximum number of tokens in the text variable. Beyond this cut-off everything is truncated.
#' @param tidy_func Function to process API responses, defaults to
#'   `tidy_classification_response`
#' @param output_dir Path to directory for the .parquet chunks
-#' @param chunk_size
-#' @param concurrent_requests
-#' @param max_retries
-#' @param timeout
-#' @param include_texts
-#' @param relocate_col
+#' @param chunk_size Number of texts to process in each chunk before writing to disk (default: 5000)
+#' @param concurrent_requests Integer; number of concurrent requests (default: 5)
+#' @param max_retries Integer; maximum retry attempts (default: 5)
+#' @param timeout Numeric; request timeout in seconds (default: 20)
#' @param key_name Name of environment variable containing the API key
#'
-#' @returns
+#' @returns A data frame of classified documents with successes and failures
#' @export
#'
#' @examples
+#' \dontrun{
+#' hf_classify_chunks(texts = c("I love this!", "This is awful."),
+#'                    ids = c("id1", "id2"),
+#'                    endpoint_url = "https://your-endpoint.endpoints.huggingface.cloud",
+#'                    key_name = "HF_API_KEY")
+#' }
# hf_classify_chunks docs ----
hf_classify_chunks <- function(texts,
                               ids,
                               endpoint_url,
-                               ...,
+                               max_length = 512L,
                               tidy_func = tidy_classification_response,
                               output_dir = "auto",
                               chunk_size = 5000L,
@@ -431,6 +435,10 @@ hf_classify_chunks <- function(texts,
  chunk_data <- batch_vector(seq_along(texts), chunk_size)
  n_chunks <- length(chunk_data$batch_indices)

+  inference_parameters = list(return_all_scores = TRUE,
+                              truncation = TRUE,
+                              max_length = max_length)
+
  metadata <- list(
    endpoint_url = endpoint_url,
    chunk_size = chunk_size,
@@ -440,7 +448,8 @@ hf_classify_chunks <- function(texts,
    output_dir = output_dir,
    key_name = key_name,
    n_chunks = n_chunks,
-    timestamp = Sys.time()
+    timestamp = Sys.time(),
+    inference_parameters = inference_parameters
  )

  jsonlite::write_json(metadata,
@@ -470,7 +479,7 @@ hf_classify_chunks <- function(texts,
          endpoint_url = endpoint_url,
          endpointr_id = y,
          key_name = key_name,
-          parameters = list(return_all_scores = TRUE),
+          parameters = inference_parameters,
          max_retries = max_retries,
          timeout = timeout,
          validate = FALSE
@@ -565,9 +574,9 @@ hf_classify_chunks <- function(texts,
#' endpoint and joins the results back to the original data frame.
#'
#' @details
-#' This function extracts texts from a specified column, classifies them using
-#' `hf_classify_batch()`, and joins the classification results back to the
-#' original data frame using a specified ID column.
+#' This function extracts texts and IDs from the specified columns, classifies them in chunks.
+#' It writes intermediate results to disk via
+#' `hf_classify_chunks()`, and then returns the combined chunks.
#'
#' The function preserves the original data frame structure and adds new
#' columns for classification scores. If the number of rows doesn't match
diff --git a/R/utils.R b/R/utils.R
index ec0fa3e..0726f6a 100644
--- a/R/utils.R
+++ b/R/utils.R
@@ -294,7 +294,7 @@ parse_oai_date <- function(date_string) {
#' @param model_name name of the model e.g. 'sentence-transformers/mpnet-base-v2'
#' @param api_key Your Hugging Face auth token
#'
-#' @returns
+#' @returns Integer value of the model_max_length from tokenizer config
#' @export
#'
diff --git a/dev_docs/hf_classify_dev.qmd b/dev_docs/hf_classify_dev.qmd
index a747644..5215135 100644
--- a/dev_docs/hf_classify_dev.qmd
+++ b/dev_docs/hf_classify_dev.qmd
@@ -494,7 +494,9 @@ November, 2025
hf_cl
```

-# Truncation and Padding
+# Truncation, padding, max_length
+
+We were getting some failures which seemed hard to pin down - was it Arabic, tokenisation issues, empty strings? No, it was token length. The test endpoint we're using has a max_model_length of 128. If we tried to feed in a string that tokenised to \> 128 tokens, then we got errors.

```{r}
library(tidyverse)
@@ -508,45 +510,75 @@ endpointr_test_url <- httr2::secret_decrypt("osXYmCFrfPR4NsIIOBOIwWzp8XdA77BVP5Q
test_sent_url <- httr2::secret_decrypt("STCYZqL6e2yxfGaz5AWA80pnINnfX-47vsqEcxpm23IkxE1R8238Gtnp7oeRUQ6GxkF6jWUYYmiSjIh-Abo1cWOe23qUsZ7uSy5UE9BwvJ9oWg", "ENDPOINTR_KEY")
```

+So we grab a bunch of strings that are guaranteed to fail if we don't do anything special:
+
+```{r}
+test_long_strings <- x |>
+  filter(language == "Spanish (Español)",
+         message != "",
+         !is.na(message)) |>
+  mutate(string_length = str_length(message)) |>
+  filter(string_length > 1000) |>
+  slice(10:15)
+  # mutate(message = str_sub(message, 1, 100)) |>
+```
+
+And then we make them pass, checking that the max_length argument is working as intended.
+
```{r}
-should_pass <- x |>
-  filter(language == "Spanish (Español)") |>
-  slice(10:15) |>
-  mutate(message = str_sub(message, 1, 100)) |>
-  filter(message != "", !is.na(message))
-
-pass_results <- hf_classify_chunks(
-  should_pass$message,
-  should_pass$universal_message_id,
+should_pass <- hf_classify_chunks(
+  test_long_strings$message,
+  test_long_strings$universal_message_id,
  tidy_func = tidy_classification_response,
  endpoint_url = test_sent_url,
  output_dir = "test_dir/test_classify/test_passes",
  chunk_size = 5,
  concurrent_requests = 5,
  timeout = 60,
-  max_retries = 10
+  max_retries = 10,
+  max_length = 128L
)
```

+```
+ℹ Processing 6 texts in 2 chunks of up to 5 rows per chunk
+ℹ Intermediate results and metadata will be saved as .parquet files and .json in test_dir/test_classify/test_passes
+ℹ Performing 5 requests in parallel (with 5 concurrent requests)...
+✔ Chunk 1: 5 successful, 0 failed
+ℹ Performing 1 request sequentially...
+✔ Chunk 2: 1 successful, 0 failed
+ℹ Processing completed, there were 6 successes
+and 0 failures.
+```
+
+And we make them fail, to confirm the argument is working as intended!
+
```{r}
-should_fail <- x |>
-  filter(language == "Arabic (ạlʿrbyẗ)") |>
-  slice(100:110) |>
-  # mutate(message = str_sub(message, 1, 180)) |>
-  filter(message != "", !is.na(message))
-
-fail_results <- hf_classify_chunks(
-  should_fail$message,
-  should_fail$universal_message_id,
+should_fail <- hf_classify_chunks(
+  test_long_strings$message,
+  test_long_strings$universal_message_id,
  tidy_func = tidy_classification_response,
  endpoint_url = test_sent_url,
  output_dir = "test_dir/test_classify/test_failures",
  chunk_size = 5,
  concurrent_requests = 5,
  timeout = 60,
-  max_retries = 10
+  max_retries = 10,
+  max_length = 512L
)
```
+
+```
+ℹ Processing 6 texts in 2 chunks of up to 5 rows per chunk
+ℹ Intermediate results and metadata will be saved as .parquet files and .json in test_dir/test_classify/test_failures
+ℹ Performing 5 requests in parallel (with 5 concurrent requests)...
+✔ Chunk 1: 0 successful, 5 failed
+ℹ Performing 1 request sequentially...
+! Sequential request to failed: HTTP 400 Bad Request.
+✔ Chunk 2: 0 successful, 1 failed
+ℹ Processing completed, there were 0 successes
+and 6 failures.
+```
diff --git a/man/hf_classify_chunks.Rd b/man/hf_classify_chunks.Rd
index 6fe9246..f100e9a 100644
--- a/man/hf_classify_chunks.Rd
+++ b/man/hf_classify_chunks.Rd
@@ -2,13 +2,13 @@
% Please edit documentation in R/hf_classify.R
\name{hf_classify_chunks}
\alias{hf_classify_chunks}
-\title{Title}
+\title{Efficiently classify vectors of text in chunks}
\usage{
hf_classify_chunks(
  texts,
  ids,
  endpoint_url,
-  ...,
+  max_length = 512L,
  tidy_func = tidy_classification_response,
  output_dir = "auto",
  chunk_size = 5000L,
@@ -25,6 +25,8 @@ hf_classify_chunks(

\item{endpoint_url}{Hugging Face Embedding Endpoint}

+\item{max_length}{The maximum number of tokens in the text variable. Beyond this cut-off everything is truncated.}
+
\item{tidy_func}{Function to process API responses, defaults to
\code{tidy_classification_response}}

@@ -40,6 +42,17 @@ hf_classify_chunks(

\item{key_name}{Name of environment variable containing the API key}
}
+\value{
+A data frame of classified documents with successes and failures
+}
\description{
-Title
+TODO - description
+}
+\details{
+TODO - details
+}
+\examples{
+\dontrun{
+hf_classify_chunks(texts = c("I love this!", "This is awful."),
+                   ids = c("id1", "id2"),
+                   endpoint_url = "https://your-endpoint.endpoints.huggingface.cloud",
+                   key_name = "HF_API_KEY")
+}
}
diff --git a/man/hf_embed_chunks.Rd b/man/hf_embed_chunks.Rd
index 353a203..26c1c3c 100644
--- a/man/hf_embed_chunks.Rd
+++ b/man/hf_embed_chunks.Rd
@@ -8,6 +8,7 @@ hf_embed_chunks(
  texts,
  ids,
  endpoint_url,
+  max_length = 8192L,
  output_dir = "auto",
  chunk_size = 5000L,
  concurrent_requests = 5L,
diff --git a/man/hf_embed_df.Rd b/man/hf_embed_df.Rd
index ebd69e4..ab8b234 100644
--- a/man/hf_embed_df.Rd
+++ b/man/hf_embed_df.Rd
@@ -10,6 +10,7 @@ hf_embed_df(
  id_var,
  endpoint_url,
  key_name,
+  max_length = 8192L,
  output_dir = "auto",
  chunk_size = 5000L,
  concurrent_requests = 1L,
@@ -29,6 +30,8 @@ hf_embed_df(

\item{key_name}{Name of the environment variable containing the API key}

+\item{max_length}{The maximum number of tokens in the text variable.
Beyond this cut-off everything is truncated.} + \item{output_dir}{Path to directory for the .parquet chunks} \item{chunk_size}{The size of each chunk that will be processed and then written to a file.} diff --git a/man/hf_get_model_max_length.Rd b/man/hf_get_model_max_length.Rd new file mode 100644 index 0000000..a7727ca --- /dev/null +++ b/man/hf_get_model_max_length.Rd @@ -0,0 +1,20 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/utils.R +\name{hf_get_model_max_length} +\alias{hf_get_model_max_length} +\title{Check the max number of tokens allowed for your inputs} +\usage{ +hf_get_model_max_length(model_name, api_key = "HF_API_KEY") +} +\arguments{ +\item{model_name}{name of the model e.g. 'sentence-transformers/mpnet-base-v2'} + +\item{api_key}{Your Hugging Face auth token} +} +\value{ +Integer value of the model_max_length from tokenizer config +} +\description{ +This function requires the model to have 'tokenizer_config.json' file with a +\code{model_max_length} key, otherwise it will error. +} From 512bc248d4aafcf75e1847e802619005df7ea2f0 Mon Sep 17 00:00:00 2001 From: jpcompartir Date: Wed, 19 Nov 2025 17:18:37 +0000 Subject: [PATCH 33/56] remove ... option for args passing in hf_classify_chunks/df move hf_classify_df over to hf_classify_chunks not hf_classify_batch remove old comments from hf_embed_df --- R/hf_classify.R | 70 +++++++++++++++++-------------------------- R/hf_embed.R | 4 +-- man/hf_classify_df.Rd | 29 ++++++++---------- 3 files changed, 41 insertions(+), 62 deletions(-) diff --git a/R/hf_classify.R b/R/hf_classify.R index 744b241..0f58310 100644 --- a/R/hf_classify.R +++ b/R/hf_classify.R @@ -440,16 +440,17 @@ hf_classify_chunks <- function(texts, max_length = max_length) metadata <- list( + output_dir = output_dir, endpoint_url = endpoint_url, + inference_parameters = inference_parameters, chunk_size = chunk_size, + n_chunks = n_chunks, n_texts = length(texts), concurrent_requests = concurrent_requests, timeout = timeout, - output_dir = output_dir, + max_retries = max_retries, key_name = key_name, - n_chunks = n_chunks, - timestamp = Sys.time(), - inference_parameters = inference_parameters + timestamp = Sys.time() ) jsonlite::write_json(metadata, @@ -590,7 +591,6 @@ hf_classify_chunks <- function(texts, #' @param id_var Column name to use as identifier for joining (unquoted) #' @param endpoint_url URL of the Hugging Face Inference API endpoint #' @param key_name Name of environment variable containing the API key -#' @param ... 
Additional arguments passed to request functions #' @param tidy_func Function to process API responses, defaults to #' `tidy_batch_classification_response` #' @param parameters List of parameters for the API endpoint, defaults to @@ -627,14 +627,13 @@ hf_classify_df <- function(df, id_var, endpoint_url, key_name, - ..., - tidy_func = tidy_batch_classification_response, - parameters = list(return_all_scores = TRUE), - batch_size = 4, + max_length = 512L, + output_dir = "auto", + tidy_func = tidy_classification_response, + chunk_size = 2500, concurrent_requests = 1, max_retries = 5, - timeout = 30, - progress = TRUE) { + timeout = 60) { # mirrors the hf_embed_df function @@ -645,47 +644,32 @@ hf_classify_df <- function(df, "df must be a data frame" = is.data.frame(df), "endpoint_url must be provided" = !is.null(endpoint_url) && nchar(endpoint_url) > 0, "concurrent_requests must be a number greater than 0" = is.numeric(concurrent_requests) && concurrent_requests > 0, - "batch_size must be a number greater than 0" = is.numeric(batch_size) && batch_size > 0 + "chunk_size must be a number greater than 0" = is.numeric(chunk_size) && chunk_size > 0 ) - original_num_rows <- nrow(df) # for final sanity check + output_dir <- .handle_output_directory(output_dir, base_dir_name = "hf_classification_chunks") # pull texts & ids into vectors for batch function text_vec <- dplyr::pull(df, !!text_sym) indices_vec <- dplyr::pull(df, !!id_sym) - batch_size <- if(is.null(batch_size) || batch_size <=1) 1 else batch_size - - classification_tbl <- hf_classify_batch(texts = text_vec, - endpoint_url = endpoint_url, - key_name = key_name, - tidy_func = tidy_func, - parameters = parameters, - batch_size = batch_size, - max_retries = max_retries, - timeout = timeout, - progress = TRUE, - concurrent_requests = concurrent_requests) - - - final_num_rows <- nrow(classification_tbl) - - if(final_num_rows == original_num_rows) { - classification_tbl <- classification_tbl |> dplyr::mutate(!!id_sym := indices_vec) - - df <- dplyr::left_join(df, classification_tbl) + chunk_size <- if(is.null(chunk_size) || chunk_size <=1) 1 else chunk_size - return(df) - } else { - cli::cli_warn("Rows in original data frame and returned data frame do not match:") - cli::cli_bullets(text = c( - "Rows in original data frame: {original_num_rows}", - "Rows in returned data frame: {final_num_rows}" - )) - cli::cli_alert_info("Returning table with all available response data") - return(classification_tbl) - } + results <- hf_classify_chunks( + texts = text_vec, + ids = indices_vec, + endpoint_url = endpoint_url, + max_length = max_length, + tidy_func = tidy_func, + chunk_size = chunk_size, + concurrent_requests = concurrent_requests, + max_retries = max_retries, + timeout = timeout, + key_name = key_name, + output_dir = output_dir + ) + return(results) } diff --git a/R/hf_embed.R b/R/hf_embed.R index 79cc73b..6be2db1 100644 --- a/R/hf_embed.R +++ b/R/hf_embed.R @@ -501,10 +501,8 @@ hf_embed_df <- function(df, output_dir <- .handle_output_directory(output_dir, base_dir_name = "hf_embeddings_batch") - # output_file <- .handle_output_filename(output_file, - # base_file_name = "hf_embeddings_batch") - # refactoring to always use hf_embed_batch - if batch_size if one then it gets handled anyway, avoids branching and additional complexity. 
+ texts <- dplyr::pull(df, !!text_sym) indices <- dplyr::pull(df, !!id_sym) diff --git a/man/hf_classify_df.Rd b/man/hf_classify_df.Rd index 77f9cb6..0a946b2 100644 --- a/man/hf_classify_df.Rd +++ b/man/hf_classify_df.Rd @@ -10,14 +10,13 @@ hf_classify_df( id_var, endpoint_url, key_name, - ..., - tidy_func = tidy_batch_classification_response, - parameters = list(return_all_scores = TRUE), - batch_size = 4, + max_length = 512L, + output_dir = "auto", + tidy_func = tidy_classification_response, + chunk_size = 2500, concurrent_requests = 1, max_retries = 5, - timeout = 30, - progress = TRUE + timeout = 60 ) } \arguments{ @@ -31,22 +30,20 @@ hf_classify_df( \item{key_name}{Name of environment variable containing the API key} -\item{...}{Additional arguments passed to request functions} - \item{tidy_func}{Function to process API responses, defaults to \code{tidy_batch_classification_response}} -\item{parameters}{List of parameters for the API endpoint, defaults to -\code{list(return_all_scores = TRUE)}} - -\item{batch_size}{Integer; number of texts per batch (default: 4)} - \item{concurrent_requests}{Integer; number of concurrent requests (default: 1)} \item{max_retries}{Integer; maximum retry attempts (default: 5)} \item{timeout}{Numeric; request timeout in seconds (default: 30)} +\item{parameters}{List of parameters for the API endpoint, defaults to +\code{list(return_all_scores = TRUE)}} + +\item{batch_size}{Integer; number of texts per batch (default: 4)} + \item{progress}{Logical; whether to show progress bar (default: TRUE)} } \value{ @@ -58,9 +55,9 @@ Classifies texts in a data frame column using a Hugging Face classification endpoint and joins the results back to the original data frame. } \details{ -This function extracts texts from a specified column, classifies them using -\code{hf_classify_batch()}, and joins the classification results back to the -original data frame using a specified ID column. +This function extracts texts and IDs from the specified columns, classifies them in chunks. +It writes +\code{hf_classify_chunks()}, and then returns all of the chu The function preserves the original data frame structure and adds new columns for classification scores. 
If the number of rows doesn't match

From 8a56718b71b00f43959b28afc286875553e59c74 Mon Sep 17 00:00:00 2001
From: jpcompartir
Date: Thu, 20 Nov 2025 12:00:21 +0000
Subject: [PATCH 34/56] Remove max_length from hf_embed_df and hf_classify_df
 - the solution is to turn on 'AUTO_TRUNCATE' in the setup of the endpoint
---
 R/hf_classify.R           |  6 ++-
 R/hf_embed.R              | 16 ++-----
 dev_docs/hf_embed_dev.qmd | 97 +++++++++++++++++++++++++++++++++++++++
 man/hf_embed_chunks.Rd    |  1 -
 man/hf_embed_df.Rd        |  3 --
 5 files changed, 106 insertions(+), 17 deletions(-)
 create mode 100644 dev_docs/hf_embed_dev.qmd

diff --git a/R/hf_classify.R b/R/hf_classify.R
index 0f58310..50b2547 100644
--- a/R/hf_classify.R
+++ b/R/hf_classify.R
@@ -425,7 +425,7 @@ hf_classify_chunks <- function(texts,
    "key_name must be a non-empty string" = is.character(key_name) && nchar(key_name) > 0
  )

-  # core logic ----
+  # Chunking setup and metadata ----
  output_dir <- .handle_output_directory(output_dir, base_dir_name = "hf_classify_chunk")

  if (!dir.exists(output_dir)) {
@@ -461,6 +461,9 @@ hf_classify_chunks <- function(texts,
  cli::cli_alert_info("Processing {length(texts)} text{?s} in {n_chunks} chunk{?s} of up to {chunk_size} rows per chunk")
  cli::cli_alert_info("Intermediate results and metadata will be saved as .parquet files and .json in {output_dir}")

+
+  # process chunks ----
+
  # track global successes for failures for end-of-pipeline reporting
  total_successes <- 0
  total_failures <- 0

diff --git a/R/hf_embed.R b/R/hf_embed.R
index 6be2db1..971bcf2 100644
--- a/R/hf_embed.R
+++ b/R/hf_embed.R
@@ -266,7 +266,6 @@ hf_embed_batch <- function(texts,
 hf_embed_chunks <- function(texts,
                             ids,
                             endpoint_url,
-                            max_length = 8192L,
                             output_dir = "auto",
                             chunk_size = 5000L,
                             concurrent_requests = 5L,
@@ -279,12 +278,9 @@ hf_embed_chunks <- function(texts,
    "texts must be a vector" = is.vector(texts),
    "ids must be a vector" = is.vector(ids),
    "texts and ids must be the same length" = length(texts) == length(ids),
-    "chunk_size must be a positive integer greater than 1" = is.numeric(chunk_size) && chunk_size > 0,
-    "max_length must be a positive integer greater than 1" = is.numeric(max_length) && max_length > 0
+    "chunk_size must be a positive integer greater than 1" = is.numeric(chunk_size) && chunk_size > 0
  )

-  max_length = as.integer(max_length) # type conversion to be extra safe as it's feeding to Py
-
  # output_file = .handle_output_filename(output_file, base_file_name = "hf_embeddings_batch")
  output_dir <- .handle_output_directory(output_dir,
                                         base_dir_name = "hf_embeddings_batch")
@@ -292,8 +292,7 @@ hf_embed_chunks <- function(texts,
  chunk_data <- batch_vector(seq_along(texts), chunk_size)
  n_chunks <- length(chunk_data$batch_indices)

-  inference_parameters = list(truncation = TRUE,
-                              max_length = max_length)
+  inference_parameters = list(truncate = TRUE) # text embeddings inference - TEI only takes truncate, not truncation and max_length like other inference endpoints!
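+  # for reference, the two APIs expect differently-shaped inference parameters
+  # (shapes taken from the request bodies built elsewhere in this package):
+  #   embeddings (TEI):  list(truncate = TRUE)
+  #   classification:    list(return_all_scores = TRUE, truncation = TRUE, max_length = max_length)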
# write/store imoortant metadata in the output dir metadata <- list( @@ -341,7 +336,7 @@ hf_embed_chunks <- function(texts, endpoint_url = endpoint_url, endpointr_id = y, key_name = key_name, - parameters = list(inference_parameters), + parameters = inference_parameters, max_retries = max_retries, timeout = timeout, validate = FALSE @@ -435,7 +430,6 @@ hf_embed_chunks <- function(texts, #' @param id_var Name of the column to use as ID #' @param endpoint_url The URL of the Hugging Face Inference API endpoint #' @param key_name Name of the environment variable containing the API key -#' @param max_length The maximum number of tokens in the text variable. Beyond this cut-off everything is truncated. #' @param output_dir Path to directory for the .parquet chunks #' @param chunk_size The size of each chunk that will be processed and then written to a file. #' @param concurrent_requests Number of requests to send at once. Some APIs do not allow for multiple requests. @@ -478,7 +472,6 @@ hf_embed_df <- function(df, id_var, endpoint_url, key_name, - max_length = 8192L, output_dir = "auto", chunk_size = 5000L, concurrent_requests = 1L, @@ -517,8 +510,7 @@ hf_embed_df <- function(df, concurrent_requests = concurrent_requests, max_retries = max_retries, timeout = timeout, - output_dir = output_dir, - max_length = max_length + output_dir = output_dir ) return(results) diff --git a/dev_docs/hf_embed_dev.qmd b/dev_docs/hf_embed_dev.qmd new file mode 100644 index 0000000..d03fb14 --- /dev/null +++ b/dev_docs/hf_embed_dev.qmd @@ -0,0 +1,97 @@ +--- +title: "hf_embed_dev" +format: html +--- + +```{r} +library(EndpointR) +library(arrow) +library(httr2) +library(purrr) +library(dplyr) +library(tidyr) +``` + +```{r} +test_embed_url <- secret_decrypt( + "KWqk_H6v58UIaJm6WPAekfrAD8LVAZGUmfZNysv4Ze48NxHJCYyOPWdqMn8_z-xU_MFCccR8Qm0_qUzqcSMB6QlphuHPtkHmYievFG1L4J6k0g", + "ENDPOINTR_KEY" +) + +test_data <- read_parquet("~/dev/projects/diageo/mltm/data/main/mltm_l3_trend_df.parquet") +``` + +```{r} +test_data <- test_data |> + slice(1:10000) |> + mutate(string_length = str_length(message)) + +test_long_strings <- test_data |> + filter(string_length > 2500) + +test_short_strings <- test_data |> + filter(string_length < 500) |> + slice(1:50) +``` + +```{r} +should_fail <- test_long_strings |> + hf_embed_df( + message, + universal_message_id, + test_embed_url, + "HF_API_KEY", + max_length = 8192L, + output_dir = "test_dir/test_embed/test_failures_long", + concurrent_requests = 5, + max_retries = 10L, + timeout = 60 + ) +``` + +```{r} +should_pass <- test_short_strings |> + hf_embed_df( + message, + universal_message_id, + test_embed_url, + "HF_API_KEY", + max_length = 8192L, + output_dir = "test_dir/test_embed/test_passes_short", + concurrent_requests = 5, + max_retries = 10L, + timeout = 60 + ) +``` + +The endpoint reports 512 as model_max_length but actually the inference endpoint asks for 256 + +```{r} +should_pass_string_truncation <- test_long_strings |> + mutate(message = str_trunc(message, 250, ellipsis = "")) |> + hf_embed_df( + message, + universal_message_id, + test_embed_url, + "HF_API_KEY", + max_length = 50, + output_dir = "test_dir/test_embed/test_pass_truncation", + concurrent_requests = 5, + max_retries = 10L, + timeout = 60 + ) +``` + +```{r} +should_pass_tokenisation <- test_long_strings |> + hf_embed_df( + message, + universal_message_id, + test_embed_url, + "HF_API_KEY", + output_dir = "test_dir/test_embed/test_pass_truncation", + concurrent_requests = 5, + max_retries = 10L, + timeout = 60 + ) +``` 
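+
+A quick spot-check of endpoint-side truncation (a sketch - it assumes the endpoint truncates, so a long string and a short prefix of it should embed to near-identical vectors):
+
+```{r}
+long_message <- test_long_strings$message[[1]]
+
+# single-text helper; if truncation happens server-side these two embeddings
+# should be close (not necessarily byte-identical)
+emb_full <- hf_embed_text(long_message, test_embed_url, "HF_API_KEY")
+emb_prefix <- hf_embed_text(substr(long_message, 1, 250), test_embed_url, "HF_API_KEY")
+
+all.equal(emb_full, emb_prefix, tolerance = 1e-3)
+```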
diff --git a/man/hf_embed_chunks.Rd b/man/hf_embed_chunks.Rd
index 26c1c3c..353a203 100644
--- a/man/hf_embed_chunks.Rd
+++ b/man/hf_embed_chunks.Rd
@@ -8,7 +8,6 @@ hf_embed_chunks(
  texts,
  ids,
  endpoint_url,
-  max_length = 8192L,
  output_dir = "auto",
  chunk_size = 5000L,
  concurrent_requests = 5L,
diff --git a/man/hf_embed_df.Rd b/man/hf_embed_df.Rd
index ab8b234..ebd69e4 100644
--- a/man/hf_embed_df.Rd
+++ b/man/hf_embed_df.Rd
@@ -10,7 +10,6 @@ hf_embed_df(
  id_var,
  endpoint_url,
  key_name,
-  max_length = 8192L,
  output_dir = "auto",
  chunk_size = 5000L,
  concurrent_requests = 1L,
@@ -30,8 +29,6 @@

\item{key_name}{Name of the environment variable containing the API key}

-\item{max_length}{The maximum number of tokens in the text variable. Beyond this cut-off everything is truncated.}
-
\item{output_dir}{Path to directory for the .parquet chunks}

\item{chunk_size}{The size of each chunk that will be processed and then written to a file.}

From 628233bfc15c681ed26d2ff7ea7e5573c834007c Mon Sep 17 00:00:00 2001
From: jpcompartir
Date: Thu, 20 Nov 2025 12:07:40 +0000
Subject: [PATCH 35/56] add the `hf_get_endpoint_info()` function to retrieve
 endpoint details
---
 NAMESPACE                    |  1 +
 R/utils.R                    | 22 ++++++++++++++++++++++
 dev_docs/hf_classify_dev.qmd | 22 ++++++++++++++++++++++
 dev_docs/hf_embed_dev.qmd    | 31 ++++++++++++++++++++++++++++---
 man/hf_get_endpoint_info.Rd  | 16 ++++++++++++++++
 5 files changed, 89 insertions(+), 3 deletions(-)
 create mode 100644 man/hf_get_endpoint_info.Rd

diff --git a/NAMESPACE b/NAMESPACE
index 117efe3..6346a46 100644
--- a/NAMESPACE
+++ b/NAMESPACE
@@ -13,6 +13,7 @@ export(hf_embed_batch)
export(hf_embed_chunks)
export(hf_embed_df)
export(hf_embed_text)
+export(hf_get_endpoint_info)
export(hf_get_model_max_length)
export(hf_perform_request)
export(json_dump)
diff --git a/R/utils.R b/R/utils.R
index 0726f6a..56a25a2 100644
--- a/R/utils.R
+++ b/R/utils.R
@@ -317,3 +317,25 @@ hf_get_model_max_length <- function(model_name, api_key = "HF_API_KEY") {

  return(tokenizer_config$model_max_length)
}
+
+
+#' Retrieve information about an endpoint
+#'
+#' @param endpoint_url Hugging Face Embedding Endpoint
+#' @param key_name Name of environment variable containing the API key (default: "HF_API_KEY")
+#'
+#' @returns
+#' @export
+#'
+hf_get_endpoint_info <- function(endpoint_url, key_name = "HF_API_KEY") {
+
+  info_endpoint_url <- glue::glue("{endpoint_url}/info")
+  api_key <- get_api_key(key_name)
+
+  info <- httr2::request(info_endpoint_url) |>
+    httr2::req_headers(Authorization = paste("Bearer", api_key)) |>
+    httr2::req_perform() |>
+    httr2::resp_body_json()
+
+  return(info)
+}
diff --git a/dev_docs/hf_classify_dev.qmd b/dev_docs/hf_classify_dev.qmd
index 5215135..db01c26 100644
--- a/dev_docs/hf_classify_dev.qmd
+++ b/dev_docs/hf_classify_dev.qmd
@@ -582,3 +582,25 @@ should_fail <- hf_classify_chunks(
ℹ Processing completed, there were 0 successes
and 6 failures.
``` + +Test the function over 10k data points + +```{r} + ten_k_results <- x |> + filter(message != "", + !is.na(message)) |> + slice(1:10000) |> + hf_classify_df( + text_var = message, + id_var = universal_message_id, + endpoint_url = test_sent_url, + key_name = "HF_API_KEY", + max_length = 128, + output_dir = "test_dir/test_classify/ten_thousand_rows", + tidy_func = tidy_classification_response, + chunk_size = 2500, + concurrent_requests = 15, + max_retries = 10L, + timeout = 60 + ) +``` diff --git a/dev_docs/hf_embed_dev.qmd b/dev_docs/hf_embed_dev.qmd index d03fb14..11b1529 100644 --- a/dev_docs/hf_embed_dev.qmd +++ b/dev_docs/hf_embed_dev.qmd @@ -10,6 +10,7 @@ library(httr2) library(purrr) library(dplyr) library(tidyr) +library(stringr) ``` ```{r} @@ -41,7 +42,7 @@ should_fail <- test_long_strings |> universal_message_id, test_embed_url, "HF_API_KEY", - max_length = 8192L, + # max_length = 8192L, output_dir = "test_dir/test_embed/test_failures_long", concurrent_requests = 5, max_retries = 10L, @@ -56,7 +57,7 @@ should_pass <- test_short_strings |> universal_message_id, test_embed_url, "HF_API_KEY", - max_length = 8192L, + # max_length = 8192L, output_dir = "test_dir/test_embed/test_passes_short", concurrent_requests = 5, max_retries = 10L, @@ -74,7 +75,7 @@ should_pass_string_truncation <- test_long_strings |> universal_message_id, test_embed_url, "HF_API_KEY", - max_length = 50, + # max_length = 50, output_dir = "test_dir/test_embed/test_pass_truncation", concurrent_requests = 5, max_retries = 10L, @@ -82,6 +83,8 @@ should_pass_string_truncation <- test_long_strings |> ) ``` +TEST Tokenisation is working + ```{r} should_pass_tokenisation <- test_long_strings |> hf_embed_df( @@ -95,3 +98,25 @@ should_pass_tokenisation <- test_long_strings |> timeout = 60 ) ``` + +```{r} +test_ten_k <- test_data |> + hf_embed_df( + message, + universal_message_id, + test_embed_url, + "HF_API_KEY", + output_dir = "test_dir/test_embed/test_ten_k", + concurrent_requests = 50, + max_retries = 10L, + timeout = 60 + ) +``` + +With AUTO_TRUNCATE: true, they do work (in the Endpoint configs) + +Getting model info - requires the API be up and running. 
+ +```{r} +hf_get_endpoint_info(test_embed_url) +``` diff --git a/man/hf_get_endpoint_info.Rd b/man/hf_get_endpoint_info.Rd new file mode 100644 index 0000000..0dce124 --- /dev/null +++ b/man/hf_get_endpoint_info.Rd @@ -0,0 +1,16 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/utils.R +\name{hf_get_endpoint_info} +\alias{hf_get_endpoint_info} +\title{Retrieve information about an endpoint} +\usage{ +hf_get_endpoint_info(endpoint_url, key_name = "HF_API_KEY") +} +\arguments{ +\item{endpoint_url}{Hugging Face Embedding Endpoint} + +\item{key_name}{Name of environment variable containing the API key (default: "HF_API_KEY")} +} +\description{ +Retrieve information about an endpoint +} From ee043281640e03cb0fd1c29460eacdb2aab5b8a2 Mon Sep 17 00:00:00 2001 From: jpcompartir Date: Thu, 20 Nov 2025 13:59:09 +0000 Subject: [PATCH 36/56] add @returns for hf_get_endpoint_info --- R/utils.R | 2 +- man/hf_get_endpoint_info.Rd | 3 +++ 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/R/utils.R b/R/utils.R index 56a25a2..bb4ef98 100644 --- a/R/utils.R +++ b/R/utils.R @@ -324,7 +324,7 @@ hf_get_model_max_length <- function(model_name, api_key = "HF_API_KEY") { #' @param endpoint_url Hugging Face Embedding Endpoint #' @param key_name Name of environment variable containing the API key (default: "HF_API_KEY") #' -#' @returns +#' @returns JSON of endpoint information #' @export #' hf_get_endpoint_info <- function(endpoint_url, key_name = "HF_API_KEY") { diff --git a/man/hf_get_endpoint_info.Rd b/man/hf_get_endpoint_info.Rd index 0dce124..a905169 100644 --- a/man/hf_get_endpoint_info.Rd +++ b/man/hf_get_endpoint_info.Rd @@ -11,6 +11,9 @@ hf_get_endpoint_info(endpoint_url, key_name = "HF_API_KEY") \item{key_name}{Name of environment variable containing the API key (default: "HF_API_KEY")} } +\value{ +JSON of endpoint information +} \description{ Retrieve information about an endpoint } From 4f110f19db58214ad6e0800ce49bde8d572c4582 Mon Sep 17 00:00:00 2001 From: jpcompartir Date: Thu, 20 Nov 2025 14:27:12 +0000 Subject: [PATCH 37/56] Update test_embed tests following changes to file writing and arguments --- tests/testthat/test-hf_embed.R | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/tests/testthat/test-hf_embed.R b/tests/testthat/test-hf_embed.R index 5c930c0..a63276b 100644 --- a/tests/testthat/test-hf_embed.R +++ b/tests/testthat/test-hf_embed.R @@ -87,7 +87,7 @@ test_that("hf_embed_batch allows custom tidy_func", { test_that("hf_embed_chunks replaces hf_embed_batch", { texts <- paste0("text", 1:6) ids <- paste0('id', 1:length(texts)) - temp_file <- tempfile(fileext = ".csv") + temp_dir <- withr::local_tempdir() expected_cols <- c("id", ".error", ".error_msg", ".chunk", "V1", "V2", "V3") @@ -98,7 +98,7 @@ test_that("hf_embed_chunks replaces hf_embed_batch", { key_name = "HF_TEST_API_KEY", chunk_size = 2, concurrent_requests =1, - output_file = temp_file + output_dir = temp_dir )) |> suppressMessages() expect_setequal(unique(chunk_2$`.chunk`), c(1, 2, 3)) @@ -111,7 +111,7 @@ test_that("hf_embed_chunks replaces hf_embed_batch", { key_name = "HF_TEST_API_KEY", chunk_size = 1, concurrent_requests =1, - output_file = temp_file + output_dir = temp_dir )) |> suppressMessages() expect_setequal(unique(chunk_1$`.chunk`), 1:6) @@ -120,11 +120,11 @@ test_that("hf_embed_chunks replaces hf_embed_batch", { test_that("hf_embed_df works correctly with real endpoint", { test_df <- data.frame( - id = c(1, 2), + id = paste0("id", 1:2), 
text = c("text1", "text2"),
    stringsAsFactors = FALSE
  )
-  temp_file <- tempfile(fileext = ".csv")
+  output_dir <- withr::local_tempdir()

  result <- expect_no_error(
    hf_embed_df(
      df = test_df,
      text_var = text,
      id_var = id,
      endpoint_url = server$url("/test_embedding"),
      key_name = "HF_TEST_API_KEY",
-      chunk_size = 2,
-      output_file = temp_file
+      chunk_size = 1,
+      output_dir = output_dir
    )
  ) |>
    suppressMessages()
@@ -142,7 +142,7 @@ test_that("hf_embed_df works correctly with real endpoint", {
  expect_s3_class(result, "data.frame")
  expect_equal(nrow(result), 2)
  expect_true(all(c("id", "V1", "V2", "V3", ".error", ".error_msg", ".chunk") %in% names(result)))
-  expect_equal(result$id, c(1, 2))
+  expect_equal(result$id, c("id1", "id2"))
  expect_equal(result$V1, c(0.1, 0.1), tolerance = 1e-7)
  expect_equal(result$V2, c(0.2, 0.2), tolerance = 1e-7)
  expect_equal(result$V3, c(0.3, 0.3), tolerance = 1e-7)
@@ -151,11 +151,11 @@
test_that("hf_embed_df works with different batch sizes", {
  test_df <- data.frame(
-    id = c(1, 2),
+    id = paste0("id", 1:2),
    text = c("text1", "text2"),
    stringsAsFactors = FALSE
  )
-  temp_file <- tempfile(fileext = ".csv")
+  temp_dir <- withr::local_tempdir()

  result <- expect_no_error(
    hf_embed_df(
      df = test_df,
      text_var = text,
      id_var = id,
@@ -166,7 +166,7 @@
      key_name = "HF_TEST_API_KEY",
      chunk_size = 1,
      concurrent_requests = 1,
-      output_file = temp_file
+      output_dir = temp_dir
    )
  ) |>
    suppressMessages()

From 09c476982f6f9cbe3369f6978dfd09a934cdfc20 Mon Sep 17 00:00:00 2001
From: jpcompartir
Date: Thu, 20 Nov 2025 14:38:48 +0000
Subject: [PATCH 38/56] Update README following changes to hf_*_df functions
 and move to chunks

Build README
Update NEWS.md
---
 NEWS.md    | 25 +++++++++------
 README.Rmd | 70 ++++++++++++++++++++++++++++++++++++++++++----
 README.md  | 75 ++++++++++++++++++++++++++++++++++++++++++++++++++----
 3 files changed, 151 insertions(+), 19 deletions(-)

diff --git a/NEWS.md b/NEWS.md
index dd53966..c48bc6f 100644
--- a/NEWS.md
+++ b/NEWS.md
@@ -1,20 +1,27 @@
-# Endpointr 0.1.2
+# EndpointR 0.1.2

-- We extend the coverage of writing to files to the `hf_embed_df()`, `hf_classify_df()` functions and move to a chunk_size argument rather than batch_size.
+- **File writing improvements**: `hf_embed_df()` and `hf_classify_df()` now write intermediate results as `.parquet` files to `output_dir` directories, similar to improvements in 0.1.1 for OpenAI functions -- [ ] `hf_embed_df()`, `hf_classify_df()` improved to write to files similarly to the upgrades applied in 0.1.1 +- **Parameter changes**: Moved from `batch_size` to `chunk_size` argument across `hf_embed_df()`, `hf_classify_df()`, and `oai_complete_df()` for consistency - - [ ] .parquet files +- **New chunking functions**: Introduced `hf_embed_chunks()` and `hf_classify_chunks()` for more efficient batch processing with better error handling -- [ ] `oai_complete_df`, `oai_complete_chunks`, `oai_embed_df` all write to .parquet files +- **Dependency update**: Package now depends on `arrow` for faster `.parquet` file writing and reading - - [ ] Move to chunk_size argument +- **Metadata tracking**: Hugging Face functions that write to files (`hf_embed_df()`, `hf_classify_df()`, `hf_embed_chunks()`, `hf_classify_chunks()`) now write `metadata.json` to output directories containing: + - Endpoint URL and API key name used + - Processing parameters (chunk_size, concurrent_requests, timeout, max_retries) + - Inference parameters (truncate, max_length) + - Timestamp and row counts + - Useful for debugging, reproducibility, and tracking which models/endpoints were used -- The package takes a dependency on arrow over read_csv, this enables faster writing and reading of files once stored +- **max_length parameter**: Added `max_length` parameter to `hf_classify_df()` and `hf_classify_chunks()` for text truncation control. Note: `hf_embed_df()` handles truncation automatically via endpoint configuration (set `AUTO_TRUNCATE` in endpoint settings) -- max_length added to `hf_classify` and `hf_embed` functions +- **New utility functions**: + - `hf_get_model_max_length()` - Retrieve maximum token length for a Hugging Face model + - `hf_get_endpoint_info()` - Retrieve detailed information about a Hugging Face Inference Endpoint -- `hf_get_model_max_length()` function introduced to make it easier to set the max_length argument in `hf_*` functions. 
+- **Improved reporting**: Chunked/batch processing functions now report total successes and failures at completion # EndpointR 0.1.1 diff --git a/README.Rmd b/README.Rmd index a9ff7b5..22071ee 100644 --- a/README.Rmd +++ b/README.Rmd @@ -93,8 +93,11 @@ hf_embed_df( id_var = review_id, endpoint_url = endpoint_url, key_name = "HF_API_KEY", + output_dir = "embeddings_output", # writes .parquet chunks to this directory + chunk_size = 5000, # process 5000 rows per chunk concurrent_requests = 2, - batch_size = 3 + max_retries = 5, + timeout = 15 ) ``` @@ -134,8 +137,12 @@ hf_classify_df( id_var = review_id, endpoint_url = sentiment_endpoint, key_name = "HF_API_KEY", - batch_size = 8, - concurrent_requests = 3 + max_length = 512, # truncate texts longer than 512 tokens + output_dir = "classification_output", # writes .parquet chunks to this directory + chunk_size = 2500, # process 2500 rows per chunk + concurrent_requests = 3, + max_retries = 5, + timeout = 60 ) |> dplyr::rename(!!!labelid_2class()) ``` @@ -189,7 +196,11 @@ oai_complete_df( id_var = review_id, system_prompt = "Classify the following review:", key_name = "OPENAI_API_KEY", - concurrent_requests = 5 # send 5 rows of data simultaneously + output_file = "completions_output.parquet", # writes results to this file + chunk_size = 1000, # process 1000 rows per chunk + concurrent_requests = 5, # send 5 rows of data simultaneously + max_retries = 5, + timeout = 30 ) ``` @@ -203,10 +214,59 @@ oai_complete_df( system_prompt = "Classify the following review:", schema = sentiment_schema, key_name = "OPENAI_API_KEY", - concurrent_requests = 5 # send 5 rows of data simultaneously + output_file = "completions_output.parquet", + chunk_size = 1000, + concurrent_requests = 5 ) ``` +# Working with Output Files + +## Reading Results from Disk + +Hugging Face functions (`hf_embed_df()`, `hf_classify_df()`) write intermediate results as `.parquet` files in the specified `output_dir`. To read all results back: + +```{r, eval = FALSE} +# List all parquet files (excludes metadata.json automatically) +parquet_files <- list.files("embeddings_output", + pattern = "\\.parquet$", + full.names = TRUE) + +# Read all chunks into a single data frame +results <- arrow::open_dataset(parquet_files, format = "parquet") |> + dplyr::collect() +``` + +## Understanding metadata.json + +Each Hugging Face output directory contains a `metadata.json` file that records: + +- `endpoint_url`: The API endpoint used +- `chunk_size`: Number of rows processed per chunk +- `n_texts`: Total number of texts processed +- `concurrent_requests`: Parallel request setting +- `timeout`: Request timeout in seconds +- `max_retries`: Maximum retry attempts +- `inference_parameters`: Model-specific parameters (e.g., truncate, max_length) +- `timestamp`: When the job was run +- `key_name`: Which API key was used + +This metadata is useful for: + +- Debugging failed runs +- Reproducing results with the same settings +- Tracking which endpoint/model was used +- Understanding performance characteristics + +```{r, eval = FALSE} +metadata <- jsonlite::read_json("embeddings_output/metadata.json") + +# check which endpoint was used +metadata$endpoint_url +``` + +**Note:** Add output directories to `.gitignore` to avoid committing API responses and metadata. 
+ Read the [LLM Providers Vignette](articles/llm_providers.html), and the [Structured Outputs Vignette](vignettes/structured_outputs_json_schema.Rmd) for more information on common workflows with the OpenAI Chat Completions API [^1] [^1]: Content pending implementation for Anthroic Messages API, Gemini API, and OpenAI Responses API diff --git a/README.md b/README.md index da1601a..e54bd32 100644 --- a/README.md +++ b/README.md @@ -89,8 +89,11 @@ hf_embed_df( id_var = review_id, endpoint_url = endpoint_url, key_name = "HF_API_KEY", + output_dir = "embeddings_output", # writes .parquet chunks to this directory + chunk_size = 5000, # process 5000 rows per chunk concurrent_requests = 2, - batch_size = 3 + max_retries = 5, + timeout = 15 ) ``` @@ -132,8 +135,12 @@ hf_classify_df( id_var = review_id, endpoint_url = sentiment_endpoint, key_name = "HF_API_KEY", - batch_size = 8, - concurrent_requests = 3 + max_length = 512, # truncate texts longer than 512 tokens + output_dir = "classification_output", # writes .parquet chunks to this directory + chunk_size = 2500, # process 2500 rows per chunk + concurrent_requests = 3, + max_retries = 5, + timeout = 60 ) |> dplyr::rename(!!!labelid_2class()) ``` @@ -190,7 +197,11 @@ oai_complete_df( id_var = review_id, system_prompt = "Classify the following review:", key_name = "OPENAI_API_KEY", - concurrent_requests = 5 # send 5 rows of data simultaneously + output_file = "completions_output.parquet", # writes results to this file + chunk_size = 1000, # process 1000 rows per chunk + concurrent_requests = 5, # send 5 rows of data simultaneously + max_retries = 5, + timeout = 30 ) ``` @@ -204,10 +215,64 @@ oai_complete_df( system_prompt = "Classify the following review:", schema = sentiment_schema, key_name = "OPENAI_API_KEY", - concurrent_requests = 5 # send 5 rows of data simultaneously + output_file = "completions_output.parquet", + chunk_size = 1000, + concurrent_requests = 5 ) ``` +# Working with Output Files + +## Reading Results from Disk + +Hugging Face functions (`hf_embed_df()`, `hf_classify_df()`) write +intermediate results as `.parquet` files in the specified `output_dir`. +To read all results back: + +``` r +# List all parquet files (excludes metadata.json automatically) +parquet_files <- list.files("embeddings_output", + pattern = "\\.parquet$", + full.names = TRUE) + +# Read all chunks into a single data frame +results <- arrow::open_dataset(parquet_files, format = "parquet") |> + dplyr::collect() +``` + +## Understanding metadata.json + +Each Hugging Face output directory contains a `metadata.json` file that +records: + +- `endpoint_url`: The API endpoint used +- `chunk_size`: Number of rows processed per chunk +- `n_texts`: Total number of texts processed +- `concurrent_requests`: Parallel request setting +- `timeout`: Request timeout in seconds +- `max_retries`: Maximum retry attempts +- `inference_parameters`: Model-specific parameters (e.g., truncate, + max_length) +- `timestamp`: When the job was run +- `key_name`: Which API key was used + +This metadata is useful for: + +- Debugging failed runs +- Reproducing results with the same settings +- Tracking which endpoint/model was used +- Understanding performance characteristics + +``` r +metadata <- jsonlite::read_json("embeddings_output/metadata.json") + +# check which endpoint was used +metadata$endpoint_url +``` + +**Note:** Add output directories to `.gitignore` to avoid committing API +responses and metadata. 
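+
+For example:
+
+```
+# .gitignore
+embeddings_output/
+classification_output/
+```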
+ Read the [LLM Providers Vignette](articles/llm_providers.html), and the [Structured Outputs Vignette](vignettes/structured_outputs_json_schema.Rmd) for more From 4a8becaf04727624fa57d450d5ae4c50a514e646 Mon Sep 17 00:00:00 2001 From: jpcompartir Date: Thu, 20 Nov 2025 14:47:44 +0000 Subject: [PATCH 39/56] add chunking and updated tests for hf_classify_*, similar to hf_embed_* --- tests/testthat/test-hf_classify.R | 116 +++++++++++++++++++++++++----- 1 file changed, 100 insertions(+), 16 deletions(-) diff --git a/tests/testthat/test-hf_classify.R b/tests/testthat/test-hf_classify.R index a6a0aaa..dd9b221 100644 --- a/tests/testthat/test-hf_classify.R +++ b/tests/testthat/test-hf_classify.R @@ -148,6 +148,102 @@ test_that("hf_classify_batch processes a batch of texts and returns a tidied cla }) +test_that("hf_classify_chunks processes chunks correctly", { + texts <- paste0("text", 1:6) + ids <- paste0("id", 1:length(texts)) + temp_dir <- withr::local_tempdir() + expected_cols <- c("id", "text", ".error", ".error_msg", ".chunk", "positive", "negative", "neutral") + + # Test with chunk_size = 2 + chunk_2 <- expect_no_error(hf_classify_chunks( + texts = texts, + ids = ids, + endpoint_url = server$url("/test_single_sentiment"), + key_name = "HF_TEST_API_KEY", + chunk_size = 2, + concurrent_requests = 1, + output_dir = temp_dir + )) |> suppressMessages() + + expect_setequal(unique(chunk_2$`.chunk`), c(1, 2, 3)) + expect_setequal(names(chunk_2), expected_cols) + expect_equal(nrow(chunk_2), 6) + + # Test with chunk_size = 1 + chunk_1 <- expect_no_error(hf_classify_chunks( + texts = texts, + ids = ids, + endpoint_url = server$url("/test_single_sentiment"), + key_name = "HF_TEST_API_KEY", + chunk_size = 1, + concurrent_requests = 1, + output_dir = temp_dir + )) |> suppressMessages() + + expect_setequal(unique(chunk_1$`.chunk`), 1:6) + expect_equal(nrow(chunk_1), 6) +}) + +test_that("hf_classify_df works correctly with chunk processing", { + test_df <- data.frame( + id = paste0("id", 1:2), + text = c("text1", "text2"), + stringsAsFactors = FALSE + ) + output_dir <- withr::local_tempdir() + + result <- expect_no_error( + hf_classify_df( + df = test_df, + text_var = text, + id_var = id, + endpoint_url = server$url("/test_single_sentiment"), + key_name = "HF_TEST_API_KEY", + chunk_size = 1, + output_dir = output_dir + ) + ) |> + suppressMessages() + + expect_s3_class(result, "data.frame") + expect_equal(nrow(result), 2) + expect_true(all(c("id", "positive", "negative", "neutral", ".error", ".error_msg", ".chunk") %in% names(result))) + expect_equal(result$id, c("id1", "id2")) + expect_equal(result$positive, c(0.9, 0.9), tolerance = 1e-7) + expect_equal(result$negative, c(0.05, 0.05), tolerance = 1e-7) + expect_equal(result$neutral, c(0.05, 0.05), tolerance = 1e-7) + expect_equal(result$.error, c(FALSE, FALSE)) +}) + +test_that("hf_classify_df works with different chunk sizes", { + test_df <- data.frame( + id = paste0("id", 1:4), + text = paste0("text", 1:4), + stringsAsFactors = FALSE + ) + temp_dir <- withr::local_tempdir() + + result <- expect_no_error( + hf_classify_df( + df = test_df, + text_var = text, + id_var = id, + endpoint_url = server$url("/test_single_sentiment"), + key_name = "HF_TEST_API_KEY", + chunk_size = 2, + concurrent_requests = 1, + output_dir = temp_dir + ) + ) |> + suppressMessages() + + expect_s3_class(result, "data.frame") + expect_equal(nrow(result), 4) + expect_true(all(c("id", ".chunk", ".error", ".error_msg") %in% names(result))) + expect_equal(result$.error, c(FALSE, 
FALSE, FALSE, FALSE))
  expect_setequal(unique(result$.chunk), c(1, 2))
})

test_that("hf_classify_df's input validation is working", {
  # safety net for changes
@@ -183,26 +279,14 @@ test_that("hf_classify_df's input validation is working", {
  )

  expect_error(
-    hf_classify_df(df = test_df, text_var = text_content, id_var = doc_id, endpoint_url = "url", key_name = "key", batch_size = "text"),
-    "batch_size must be a number greater than 0"
+    hf_classify_df(df = test_df, text_var = text_content, id_var = doc_id, endpoint_url = "url", key_name = "key", chunk_size = "text"),
+    "chunk_size must be a number greater than 0"
  )

  expect_error(
-    hf_classify_df(df = test_df, text_var = text_content, id_var = doc_id, endpoint_url = "url", key_name = "key", batch_size = NULL),
-    "batch_size must be a number greater than 0"
+    hf_classify_df(df = test_df, text_var = text_content, id_var = doc_id, endpoint_url = "url", key_name = "key", chunk_size = NULL),
+    "chunk_size must be a number greater than 0"
  )
})
-
-# test_that("hf_classify_df processes a data frame of texts and returns a data frame", {
-#
-#
-#   test_df <- data.frame(
-#     id = c(1, 2),
-#     text = c("positive text", "negative text"),
-#     stringsAsFactors = FALSE
-#   )
-#
-#
-# })

From b55148fb016955773da1916f8361d9ccd9edbbf0 Mon Sep 17 00:00:00 2001
From: jpcompartir
Date: Thu, 20 Nov 2025 14:50:35 +0000
Subject: [PATCH 40/56] add new functions to _pkgdown.yml and new section for
 HF utilities
---
 _pkgdown.yml |  8 +++++++-
 todos.qmd    | 20 ++++++++++++++------
 2 files changed, 21 insertions(+), 7 deletions(-)

diff --git a/_pkgdown.yml b/_pkgdown.yml
index c1c4629..aa2657d 100644
--- a/_pkgdown.yml
+++ b/_pkgdown.yml
@@ -70,6 +70,7 @@ reference:
  contents:
  - hf_classify_text
  - hf_classify_batch
+  - hf_classify_chunks
  - hf_classify_df
  - tidy_classification_response

@@ -81,6 +82,12 @@ reference:
  - hf_build_request_df
  - hf_perform_request

+- title: "Hugging Face Endpoint Utilities"
+  desc: "Helper functions for inspecting Hugging Face endpoints and models"
+  contents:
+  - hf_get_endpoint_info
+  - hf_get_model_max_length
+
- title: "OpenAI Completions"
  desc: "Functions for working with OpenAI's APIs including structured outputs"
  contents:
@@ -138,7 +145,6 @@ reference:
  - single_embedding_hf
  - df_embeddings_hf

-
authors:
  Jack Penzer:
    href: https://github.com/jpcompartir
diff --git a/todos.qmd b/todos.qmd
index e65bd47..9fed3d6 100644
--- a/todos.qmd
+++ b/todos.qmd
@@ -2,6 +2,11 @@

# 0.1.2

+- [x] tests passing following hf\_\* changes
+- [x] max_length for hf functions - not applicable in hf_embed\_\* functions as they use HF's TEI which doesn't allow max_length
+- [ ] documented
+- [x] Update README
+
similarly to the upgrades applied in 0.1.1:

```
@@ -10,12 +15,15 @@ similarly to the upgrades applied in 0.1.1:
  - [x] parquet
  - [x] fix output dir
  - [x] chunk_size
-  - [ ] Update tests
-- [ ] `hf_classify_df()`
-  - [ ] write to files
-  - [ ] parquet
-  - [ ] fix output dir
-  - [ ] chunk_size
+  - [x] update tests
+  - [ ] update docs
+- [x] `hf_classify_df()`
+  - [x] write to files
+  - [x] parquet
+  - [x] fix output dir
+  - [x] chunk_size
+  - [x] update tests
+  - [ ] update docs
- [ ] `oai_complete_df`
  - [x] write to files
  - [ ] parquet

From aa402f8431f06fc826aa5bd889377d57e3aefeba Mon Sep 17 00:00:00 2001
From: jpcompartir
Date: Thu, 20 Nov 2025 15:08:59 +0000
Subject: [PATCH 41/56] update the hugging_face_inference vignette in line
 with recent changes to the hf_*_df functions.
--- vignettes/hugging_face_inference.Rmd | 612 +++++++++++++++++++++++++-- 1 file changed, 574 insertions(+), 38 deletions(-) diff --git a/vignettes/hugging_face_inference.Rmd b/vignettes/hugging_face_inference.Rmd index eb2b2ce..68b945f 100644 --- a/vignettes/hugging_face_inference.Rmd +++ b/vignettes/hugging_face_inference.Rmd @@ -1,6 +1,6 @@ --- title: "Using Hugging Face Inference Endpoints" -output: +output: html_document: toc: true toc_float: true @@ -37,12 +37,13 @@ library(EndpointR) library(dplyr) library(httr2) library(tibble) +library(arrow) my_data <- tibble( id = 1:3, text = c( "Machine learning is fascinating", - "I love working with embeddings", + "I love working with embeddings", "Natural language processing is powerful" ), category = c("ML", "embeddings", "NLP") @@ -52,7 +53,7 @@ my_data <- tibble( Follow Hugging Face's [docs](https://huggingface.co/docs/hub/security-tokens) to generate a Hugging Face token, and then register it with EndpointR: ```{r, keys_and_urls} -set_api_key("HF_TEST_API_KEY") +set_api_key("HF_TEST_API_KEY") ``` # Choosing Your Service @@ -68,6 +69,138 @@ For this vignette, we'll use the Inference API. To switch to dedicated endpoints Go to [Hugging Face's models hub](https://huggingface.co/models) and fetch the Inference API's URL for the model you want to embed your data with. Not all models are available via the Hugging Face Inference API, if you need to use a model that is not available you may need to deploy a [Dedicated Inference Endpoint](https://huggingface.co/inference-endpoints/dedicated). +# Understanding the Function Hierarchy + +EndpointR provides four levels of functions for working with Hugging Face endpoints: + +## Single Text Functions + +- `hf_embed_text()` - Embed a single text +- `hf_classify_text()` - Classify a single text + +Use these for one-off requests or testing. + +## Batch Functions + +- `hf_embed_batch()` - Embed multiple texts in memory +- `hf_classify_batch()` - Classify multiple texts in memory + +Use these for small to medium datasets (<5000 texts) that fit in memory. Results are returned as a single data frame. + +## Chunk Functions (NEW in v0.1.2) + +- `hf_embed_chunks()` - Process large volumes with incremental file writing +- `hf_classify_chunks()` - Process large volumes with incremental file writing + +Use these for large datasets (>5000 texts). Results are written incrementally as `.parquet` files to avoid memory issues and provide safety against crashes. + +## Data Frame Functions + +- `hf_embed_df()` - Convenience wrapper that calls `hf_embed_chunks()` +- `hf_classify_df()` - Convenience wrapper that calls `hf_classify_chunks()` + +**Most users will use these.** They handle extraction from data frames and call the chunk functions internally. + +## Choosing the Right Function + +Use this decision tree: + +```{r} +# Single text? Use _text functions +if (n_texts == 1) { + result <- hf_embed_text(text, endpoint_url, key_name) + # or + result <- hf_classify_text(text, endpoint_url, key_name) +} + +# Small batch (<5000 texts) and want results in memory only? +if (n_texts < 5000 && !need_file_output) { + results <- hf_embed_batch(texts, endpoint_url, key_name, batch_size = 10) + # or + results <- hf_classify_batch(texts, endpoint_url, key_name, batch_size = 8) +} + +# Large dataset or want file output for safety? 
+# Use _df functions (they call _chunks internally) +if (n_texts >= 5000 || need_safety) { + results <- hf_embed_df(df, text, id, endpoint_url, key_name, + chunk_size = 5000, output_dir = "my_results") + # or + results <- hf_classify_df(df, text, id, endpoint_url, key_name, + chunk_size = 2500, output_dir = "my_results", + max_length = 512) +} +``` + +> **Recommendation**: For most production use cases, use `_df` functions even for smaller datasets. The safety of incremental file writing is worth it. + +# Key Differences: Embeddings vs Classification + +Understanding the differences between embedding and classification functions is crucial for effective use. + +## Text Truncation Handling + +**Embeddings** (`hf_embed_*`): + +- **NO** `max_length` parameter in the R functions +- Truncation is handled **at the endpoint level** +- For Dedicated Endpoints: Set `AUTO_TRUNCATE=true` in your endpoint's environment variables +- For Inference API: Truncation is typically handled automatically by the model +- Uses TEI (Text Embeddings Inference) which only accepts `truncate`, not `truncation` or `max_length` + +**Classification** (`hf_classify_*`): + +- **HAS** `max_length` parameter (default: `512L`) +- Truncation is controlled **in your R code** +- Texts longer than `max_length` tokens are truncated before classification +- Uses standard inference parameters: `truncation=TRUE` and `max_length` + +```{r} +# Embeddings - NO max_length parameter +hf_embed_df( + df = my_data, + text_var = text, + id_var = id, + endpoint_url = embed_url, + key_name = "HF_API_KEY" + # max_length not available - set AUTO_TRUNCATE in endpoint settings +) + +# Classification - max_length IS available +hf_classify_df( + df = my_data, + text_var = text, + id_var = id, + endpoint_url = classify_url, + key_name = "HF_API_KEY", + max_length = 512 # Control truncation here +) +``` + +## Inference Parameters Sent to API + +The functions send different parameters to the Hugging Face API: + +**Embeddings**: + +```json +{ + "truncate": true +} +``` + +**Classification**: + +```json +{ + "return_all_scores": true, + "truncation": true, + "max_length": 512 +} +``` + +These differences are handled automatically - you don't need to worry about them unless you're debugging API issues. Check `metadata.json` (see below) to see what parameters were used. + # Embeddings ## Single Text @@ -115,35 +248,137 @@ The result includes: - `.error_msg`: what went wrong (if anything) - `V1` to `V384`: the embedding values -## Data Frame +## Processing Data Frames with Chunk Writing + +Most commonly, you'll want to embed a column in a data frame. The `hf_embed_df()` function processes data in chunks and writes intermediate results to disk. + +### Understanding output_dir -Most commonly, you'll want to embed a column in a data frame: +Both `hf_embed_df()` and `hf_classify_df()` write intermediate results to disk as `.parquet` files. This provides: + +1. **Safety**: If your job crashes, you don't lose all progress +2. **Memory efficiency**: Large datasets don't overwhelm your RAM +3. 
**Reproducibility**: Metadata tracks exactly what parameters you used

```{r}
+# Basic usage - auto-generates output directory
embedding_result <- hf_embed_df(
  df = my_data,
  text_var = text,     # column with your text
  id_var = id,         # column with unique ids
  endpoint_url = embed_url,
-  key_name = "HF_API_KEY"
+  key_name = "HF_API_KEY",
+  output_dir = "auto",        # Creates "hf_embeddings_batch_TIMESTAMP"
+  chunk_size = 5000,          # Writes every 5000 rows
+  concurrent_requests = 2
+)
+
+# Custom output directory
+embedding_result <- hf_embed_df(
+  df = my_data,
+  text_var = text,
+  id_var = id,
+  endpoint_url = embed_url,
+  key_name = "HF_API_KEY",
+  output_dir = "my_embeddings_v1",  # Your custom directory name
+  chunk_size = 5000
)
```

-Check for errors:
+### Output Directory Structure
+
+After running `hf_embed_df()` or `hf_classify_df()`, you'll have:
+
+```
+my_embeddings_v1/
+├── chunk_001.parquet
+├── chunk_002.parquet
+├── chunk_003.parquet
+└── metadata.json
+```
+
+**IMPORTANT**: Add your output directories to `.gitignore`! These files contain API responses and can be large.
+
+```r
+# .gitignore
+hf_embeddings_batch_*/
+hf_classification_chunks_*/
+my_embeddings_v1/
+```
+
+### Reading Results from Disk
+
+If your R session crashes or you want to reload results later:

```{r}
-embedding_result |> count(.error)
+# List all parquet files (excludes metadata.json automatically)
+parquet_files <- list.files("my_embeddings_v1",
+                            pattern = "\\.parquet$",
+                            full.names = TRUE)
+
+# Read all chunks into a single data frame
+results <- arrow::open_dataset(parquet_files, format = "parquet") |>
+  dplyr::collect()
+
+# Check for any errors
+results |> count(.error)
+
+# Extract only successful embeddings
+successful <- results |> filter(.error == FALSE)
```

-Extract just the embeddings:
+### Understanding metadata.json
+
+The metadata file records everything about your processing job:

```{r}
-embeddings_only <- embedding_result |> select(V1:V384)
+metadata <- jsonlite::read_json("my_embeddings_v1/metadata.json")
+
+# Check which endpoint was used
+metadata$endpoint_url
+
+# See processing parameters
+metadata$chunk_size
+metadata$concurrent_requests
+metadata$timeout
+
+# See inference parameters (differs between embed and classify!)
+metadata$inference_parameters
+# For embeddings: {truncate: true}
+# For classification: {return_all_scores: true, truncation: true, max_length: 512}
+
+# Check when the job ran
+metadata$timestamp
+```
+
+This metadata is invaluable for:
+
+- Debugging why a job failed
+- Reproducing results with identical settings
+- Tracking which model/endpoint version was used
+- Understanding performance characteristics
+
+### Check for Errors
+
+Always verify your results:
+
+```{r}
+embedding_result |> count(.error)
+
+# View any failures
+failures <- embedding_result |>
+  filter(.error == TRUE) |>
+  select(id, text, .error_msg)
+
+# Extract just the embeddings for successful rows
+embeddings_only <- embedding_result |>
+  filter(.error == FALSE) |>
+  select(starts_with("V"))
```

# Classification

-Classification works the same way as embeddings, just with a different URL and output format. If neceessary, you can also provide a custom function for tidying the output.
+Classification works similarly to embeddings, but with a different URL, output format, and the additional `max_length` parameter for controlling text truncation.
## Single Text

@@ -157,7 +392,7 @@ sentiment <- hf_classify_text(
)
```

-## Data Frame
+## Processing Data Frames

```{r}
classification_result <- hf_classify_df(
@@ -165,24 +400,105 @@ classification_result <- hf_classify_df(
  text_var = text,
  id_var = id,
  endpoint_url = classify_url,
-  key_name = "HF_API_KEY"
+  key_name = "HF_API_KEY",
+  max_length = 512,  # Truncate texts longer than 512 tokens
+  output_dir = "my_classification_v1",
+  chunk_size = 2500,  # Smaller chunks for classification
+  concurrent_requests = 1,
+  timeout = 60  # Longer timeout for classification
)
```

The result includes:

-- Your original `id` column
+- Your original `id` and `text` columns
- Classification labels (e.g., POSITIVE, NEGATIVE)
- Confidence scores
-- Error tracking columns.
+- Error tracking columns (`.error`, `.error_msg`)
+- Chunk tracking (`.chunk`)
+
+> **NOTE**: Classification labels are model and task specific. Check the model card on Hugging Face for label mappings.
+
+## Renaming Classification Labels
+
+Many classification models use generic labels like `LABEL_0`, `LABEL_1`. You can rename these:
+
+```{r}
+# Create a mapping function
+labelid_2class <- function() {
+  return(list(
+    negative = "LABEL_0",
+    neutral = "LABEL_1",
+    positive = "LABEL_2"
+  ))
+}
+
+# Apply the mapping
+classification_result <- hf_classify_df(
+  df = my_data,
+  text_var = text,
+  id_var = id,
+  endpoint_url = classify_url,
+  key_name = "HF_API_KEY",
+  max_length = 512
+) |>
+  dplyr::rename(!!!labelid_2class())
+```
+
+# Utility Functions
+
+EndpointR provides utility functions to help you work with Hugging Face endpoints.
+
+## Get Model Token Limits
+
+Find out the maximum token length for a model:
+
+```{r}
+# Get the model's max token length from Hugging Face
+max_tokens <- hf_get_model_max_length(
+  model_name = "cardiffnlp/twitter-roberta-base-sentiment",
+  api_key = "HF_API_KEY"
+)
+
+# Use this to set max_length for classification
+hf_classify_df(
+  df = my_data,
+  text_var = text,
+  id_var = id,
+  endpoint_url = classify_url,
+  key_name = "HF_API_KEY",
+  max_length = max_tokens  # Use the model's actual limit
+)
+```
+
+This is especially useful when working with different models that have varying token limits (e.g., 512, 1024, 2048).
+
+## Get Endpoint Information
+
+Retrieve detailed information about your Dedicated Inference Endpoint:
+
+```{r}
+endpoint_info <- hf_get_endpoint_info(
+  endpoint_url = "https://your-endpoint.endpoints.huggingface.cloud",
+  key_name = "HF_API_KEY"
+)
-> **NOTE**: Classification labels are model and task specific.
+# Check endpoint configuration
+endpoint_info
+```
+
+This is useful for:
+
+- Checking endpoint status
+- Verifying model configuration
+- Understanding available features
+- Debugging connection issues

# Using Dedicated Endpoints

To use dedicated endpoints instead of the Inference API:

-1. Deploy your model to a dedicated endpoint (see Hugging Face docs)
+1. Deploy your model to a dedicated endpoint (see [Hugging Face docs](https://huggingface.co/docs/inference-endpoints))
2. Get your endpoint URL
3. Replace the URL in any function:

@@ -198,39 +514,259 @@ result <- hf_embed_text(
)
```

-> **Note**: Dedicated endpoints take 20-30 seconds to start if they're idle. Set `max_retries = 5` to give them time to wake up.
+> **Note**: Dedicated endpoints take 20-30 seconds to start if they're idle (cold start). Set `max_retries = 10` to give them time to wake up.
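+
+Before launching a long-running job against a dedicated endpoint, a quick health check can save a wasted run. A minimal sketch, assuming `dedicated_url` holds your endpoint URL and that `hf_get_endpoint_info()` (described above) errors when the endpoint is unreachable:
+
+```{r}
+# probe the endpoint once before submitting a large job
+info <- tryCatch(
+  hf_get_endpoint_info(
+    endpoint_url = dedicated_url,
+    key_name = "HF_API_KEY"
+  ),
+  error = function(e) NULL
+)
+
+if (is.null(info)) {
+  message("Endpoint unreachable - it may still be waking from idle, retry shortly")
+}
+```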
+ +## Setting AUTO_TRUNCATE for Embedding Endpoints + +For Dedicated Inference Endpoints running embedding models, you should enable automatic truncation: + +1. In your endpoint settings on Hugging Face +2. Add environment variable: `AUTO_TRUNCATE=true` +3. This handles long texts automatically at the endpoint level + +Without this, very long texts may cause "Payload too large" errors. + +# Tips and Best Practices + +## Performance Tuning + +- **Start conservative**: Begin with `chunk_size = 2500` and `concurrent_requests = 1` +- **Scale gradually**: Monitor for errors as you increase concurrency +- **Embeddings are faster**: You can often use higher concurrency for embeddings than classification +- **Watch your rate limits**: + - Inference API: Shared limits, reduce concurrency if you hit errors + - Dedicated Endpoints: Limited by hardware, not API rate limits + +## Memory Management + +- Use `chunk_size` to control memory usage +- Smaller chunks = more frequent disk writes = less memory needed +- For very large datasets (>100k rows), use `chunk_size = 1000-2500` + +```{r} +# For very large datasets +hf_embed_df( + df = large_data, + text_var = text, + id_var = id, + endpoint_url = embed_url, + key_name = "HF_API_KEY", + chunk_size = 1000, # Smaller chunks for memory efficiency + concurrent_requests = 1 +) +``` + +## Truncation Strategy + +**For Embeddings**: + +1. Set `AUTO_TRUNCATE=true` in your Dedicated Endpoint's environment variables +2. For Inference API, truncation is handled automatically by most models +3. Consider preprocessing very long texts before embedding (e.g., take first N characters) -# Tips +**For Classification**: -- Start with small batch sizes (3-5) and increase gradually -- The Inference API has rate limits - dedicated endpoints have hardware constraints, increase hardware for higher limits -- For production use, choose dedicated endpoints -- Check the [Improving Performance](improving_performance.html) vignette for speed tips +1. Use `hf_get_model_max_length()` to check the model's token limit +2. Set `max_length` appropriately (default 512 works for most models) +3. For documents longer than `max_length`, consider: + - Chunking documents and classifying each chunk + - Summarization before classification + - Using models with longer context windows + +```{r} +# Get model's actual max length +model_limit <- hf_get_model_max_length( + model_name = "distilbert/distilbert-base-uncased-finetuned-sst-2-english", + api_key = "HF_API_KEY" +) + +# Use 90% of the limit to be safe +safe_limit <- as.integer(model_limit * 0.9) + +hf_classify_df( + df = my_data, + text_var = text, + id_var = id, + endpoint_url = classify_url, + key_name = "HF_API_KEY", + max_length = safe_limit +) +``` + +## Error Recovery + +Always check for errors and consider retrying failures: + +```{r} +# Check results for errors +results |> count(.error) + +# Identify failed texts +failed <- results |> filter(.error == TRUE) +failed |> select(id, text, .error_msg) + +# Retry failed texts with adjusted parameters +retry_results <- hf_embed_batch( + texts = failed$text, + endpoint_url = embed_url, + key_name = "HF_API_KEY", + batch_size = 1, # One at a time for failures + timeout = 30, # Longer timeout + max_retries = 10 # More retries +) +``` + +## Production Recommendations + +1. **Always use output_dir**: Never rely solely on in-memory results for large jobs +2. **Monitor metadata**: Check `metadata.json` to verify your settings +3. **Add to .gitignore**: Keep API responses out of version control +4. 
**Use Dedicated Endpoints**: For production workloads, avoid the free Inference API +5. **Set appropriate timeouts**: Classification needs longer timeouts than embeddings +6. **Test with small samples**: Before processing 1M rows, test with 100 rows +7. **Monitor costs**: Track your Dedicated Endpoint usage on Hugging Face # Common Issues -**Rate limits**: Reduce batch size or add delays between requests +## "Payload too large" Errors + +**For Embeddings**: -**Model not available**: Not all models work with the Inference API. Check the model page or use dedicated endpoints. +- Not fixable in R code - must configure endpoint +- **Dedicated Endpoints**: Set `AUTO_TRUNCATE=true` in endpoint environment variables +- **Inference API**: Preprocess and truncate texts before sending -**Timeouts**: Increase `max_retries` or reduce batch size +```{r} +# Preprocessing approach for Inference API +my_data <- my_data |> + mutate(text = substr(text, 1, 5000)) # Limit to ~5000 characters +``` + +**For Classification**: + +- Reduce the `max_length` parameter + +```{r} +hf_classify_df( + df = my_data, + text_var = text, + id_var = id, + endpoint_url = classify_url, + key_name = "HF_API_KEY", + max_length = 256 # Reduce from default 512 +) +``` + +## Timeouts + +Classification takes longer than embeddings. Increase timeout if needed: + +```{r} +hf_classify_df( + df = my_data, + text_var = text, + id_var = id, + endpoint_url = classify_url, + key_name = "HF_API_KEY", + timeout = 120, # Increase from default 60 + max_retries = 10 +) +``` + +## Dedicated Endpoint Cold Starts + +Dedicated endpoints take 20-30 seconds to wake up from idle: + +```{r} +# Set higher max_retries to allow for cold start +hf_embed_df( + df = my_data, + text_var = text, + id_var = id, + endpoint_url = dedicated_url, + key_name = "HF_API_KEY", + max_retries = 10, # Give it time to wake up + timeout = 30 +) +``` + +The first chunk may fail or be slow, but subsequent chunks will be fast once the endpoint is warm. + +## Out of Memory Errors + +Reduce `chunk_size`: + +```{r} +# Instead of default 5000 +hf_embed_df( + df = large_data, + text_var = text, + id_var = id, + endpoint_url = embed_url, + key_name = "HF_API_KEY", + chunk_size = 1000, # Smaller chunks + concurrent_requests = 1 +) +``` + +## Rate Limit Errors + +**For Inference API**: + +- Reduce `concurrent_requests` to 1 +- Increase delays between requests (handled automatically by retries) + +```{r} +hf_embed_df( + df = my_data, + text_var = text, + id_var = id, + endpoint_url = embed_url, + key_name = "HF_API_KEY", + concurrent_requests = 1, # Sequential processing + max_retries = 10 # More retries with backoff +) +``` + +**For Dedicated Endpoints**: + +- Not typically rate-limited +- If you see errors, your hardware may be overwhelmed +- Reduce `concurrent_requests` or upgrade your endpoint hardware + +## Model Not Available + +Not all models work with the Inference API. Check the model page on Hugging Face. If the model isn't available via Inference API, you'll need to: + +1. Deploy a Dedicated Inference Endpoint +2. Use a different model that is available via Inference API +3. Run the model locally (outside of EndpointR) # Improving Performance -EndpointR's functions come with knobs and dials and you can turn to improve throughput and performance. Visit the [Improving Performance](articles/improving_performance.html) vignette for more information. +For detailed performance optimization strategies, visit the [Improving Performance](improving_performance.html) vignette. 
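+
+One practical pattern is to benchmark a small sample at increasing concurrency before committing to the full run - a rough sketch (the sample size and concurrency values are arbitrary choices; `my_data` and `embed_url` are from the setup earlier in this vignette):
+
+```{r}
+sample_df <- head(my_data, 100)
+
+for (n in c(1, 2, 5)) {
+  res <- hf_embed_df(
+    df = sample_df,
+    text_var = text,
+    id_var = id,
+    endpoint_url = embed_url,
+    key_name = "HF_API_KEY",
+    concurrent_requests = n
+  )
+  message("concurrent_requests = ", n, ": ", sum(res$.error), " error(s)")
+}
+```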
+ +Quick tips: + +- Increase `concurrent_requests` gradually while monitoring errors +- Use larger `chunk_size` values for faster processing (if memory allows) +- For Dedicated Endpoints, upgrade hardware for better throughput +- Use batch functions (`hf_embed_batch()`, `hf_classify_batch()`) for small datasets to avoid file I/O overhead # Appendix ## Comparison of Inference API vs Dedicated Inference Endpoints -| Feature | Inference API | Dedicated Inference Endpoints | -|----|----|----| -| **Accessibility** | Public, shared service | Private, dedicated hardware | -| **Cost** | Free (with paid tiers) | Paid service - rent specific hardware | -| **Hardware** | Shared computing resources | Dedicated hardware allocation | -| **Wait Times** | Variable, unknowable in advance | Predictable, minimal queuing, \~30s for first request | -| **Production Ready** | Not recommended for production | Recommended for production use | -| **Use Case** | Casual usage, testing, prototyping | Production applications, consistent performance | -| **Scalability** | Limited by shared resources | Scales with dedicated allocation | -| **Availability** | Subject to shared infrastructure limits | Guaranteed availability during rental period | -| **Model Coverage** | Commonly-used models, models selected by Hugging Face | Virtually all models on the Hub are available | +| Feature | Inference API | Dedicated Inference Endpoints | +|------------------------|------------------------------------------------|------------------------------------------| +| **Accessibility** | Public, shared service | Private, dedicated hardware | +| **Cost** | Free (with paid tiers) | Paid service - rent specific hardware | +| **Hardware** | Shared computing resources | Dedicated hardware allocation | +| **Wait Times** | Variable, unknowable in advance | Predictable, \~30s for cold start | +| **Production Ready** | Not recommended for production | Recommended for production use | +| **Use Case** | Casual usage, testing, prototyping | Production applications | +| **Scalability** | Limited by shared resources | Scales with dedicated allocation | +| **Availability** | Subject to shared infrastructure limits | Guaranteed availability during rental | +| **Model Coverage** | Commonly-used models, models selected by HF | Virtually all models on the Hub | +| **Truncation Control** | Limited (model-dependent) | Full control via environment variables | + From e5f36408b56d2169058ddaf32da87d72b28662bf Mon Sep 17 00:00:00 2001 From: jpcompartir Date: Thu, 20 Nov 2025 15:19:33 +0000 Subject: [PATCH 42/56] update roxygen2 docs for classify build rd files add comma in embed test add test/dev docs to .Rbuildignore --- .Rbuildignore | 3 ++- R/hf_classify.R | 22 +++++++++++++++------- man/hf_classify_chunks.Rd | 11 +++++++++-- man/hf_classify_df.Rd | 15 +++++++-------- tests/testthat/test-hf_embed.R | 2 +- 5 files changed, 34 insertions(+), 19 deletions(-) diff --git a/.Rbuildignore b/.Rbuildignore index 4ceb83f..d902ab7 100644 --- a/.Rbuildignore +++ b/.Rbuildignore @@ -2,7 +2,7 @@ ^EndpointR\.Rproj$ ^\.Rproj\.user$ todos.md -dev_docs/ +^dev_docs/ README\.Rmd CONTRIBUTORS\.md todos\.qmd @@ -11,3 +11,4 @@ todos\.qmd ^docs$ ^pkgdown$ ^\.github$ +^test_dir/ diff --git a/R/hf_classify.R b/R/hf_classify.R index 50b2547..d16d3d2 100644 --- a/R/hf_classify.R +++ b/R/hf_classify.R @@ -364,10 +364,19 @@ hf_classify_batch <- function(texts, # hf_classify_chunks docs ---- #' Efficiently classify vectors of text in chunks #' -#' TODO - description +#' @description +#' 
Classifies large batches of text using a Hugging Face classification endpoint. +#' Processes texts in chunks with concurrent requests, writes intermediate results +#' to disk as Parquet files, and returns a combined data frame of all classifications. +#' #' -#' TODO - details +#' @details +#' The function creates a metadata JSON file in `output_dir` containing processing +#' parameters and timestamps. Each chunk is saved as a separate Parquet file before +#' being combined into the final result. Use `output_dir = "auto"` to generate a +#' timestamped directory automatically. #' +#' For single text classification, use `hf_classify_text()` instead. #' #' @param texts Character vector of texts to classify #' @param ids Vector of unique identifiers corresponding to each text (same length as texts) @@ -595,15 +604,14 @@ hf_classify_chunks <- function(texts, #' @param id_var Column name to use as identifier for joining (unquoted) #' @param endpoint_url URL of the Hugging Face Inference API endpoint #' @param key_name Name of environment variable containing the API key +#' @param max_length The maximum number of tokens in the text variable. Beyond this cut-off everything is truncated. +#' @param output_dir Path to directory for the .parquet chunks #' @param tidy_func Function to process API responses, defaults to #' `tidy_batch_classification_response` -#' @param parameters List of parameters for the API endpoint, defaults to -#' `list(return_all_scores = TRUE)` -#' @param batch_size Integer; number of texts per batch (default: 4) +#' @param chunk_size Number of texts to process in each chunk before writing to disk (default: 5000) #' @param concurrent_requests Integer; number of concurrent requests (default: 1) #' @param max_retries Integer; maximum retry attempts (default: 5) #' @param timeout Numeric; request timeout in seconds (default: 30) -#' @param progress Logical; whether to show progress bar (default: TRUE) #' #' @return Original data frame with additional columns for classification scores, #' or classification results table if row counts don't match @@ -634,7 +642,7 @@ hf_classify_df <- function(df, max_length = 512L, output_dir = "auto", tidy_func = tidy_classification_response, - chunk_size = 2500, + chunk_size = 5000, concurrent_requests = 1, max_retries = 5, timeout = 60) { diff --git a/man/hf_classify_chunks.Rd b/man/hf_classify_chunks.Rd index f100e9a..5a59e9e 100644 --- a/man/hf_classify_chunks.Rd +++ b/man/hf_classify_chunks.Rd @@ -46,10 +46,17 @@ hf_classify_chunks( A data frame of classified documents with successes and failures } \description{ -TODO - description +Classifies large batches of text using a Hugging Face classification endpoint. +Processes texts in chunks with concurrent requests, writes intermediate results +to disk as Parquet files, and returns a combined data frame of all classifications. } \details{ -TODO - details +The function creates a metadata JSON file in \code{output_dir} containing processing +parameters and timestamps. Each chunk is saved as a separate Parquet file before +being combined into the final result. Use \code{output_dir = "auto"} to generate a +timestamped directory automatically. + +For single text classification, use \code{hf_classify_text()} instead. 
}
\examples{
\dontrun{
diff --git a/man/hf_classify_df.Rd b/man/hf_classify_df.Rd
index 0a946b2..e8eaca8 100644
--- a/man/hf_classify_df.Rd
+++ b/man/hf_classify_df.Rd
@@ -13,7 +13,7 @@ hf_classify_df(
  max_length = 512L,
  output_dir = "auto",
  tidy_func = tidy_classification_response,
-  chunk_size = 2500,
+  chunk_size = 5000,
  concurrent_requests = 1,
  max_retries = 5,
  timeout = 60
@@ -30,21 +30,20 @@ hf_classify_df(

\item{key_name}{Name of environment variable containing the API key}

+\item{max_length}{The maximum number of tokens in the text variable. Beyond this cut-off everything is truncated.}
+
+\item{output_dir}{Path to directory for the .parquet chunks}
+
\item{tidy_func}{Function to process API responses, defaults to
\code{tidy_batch_classification_response}}

+\item{chunk_size}{Number of texts to process in each chunk before writing to disk (default: 5000)}
+
\item{concurrent_requests}{Integer; number of concurrent requests (default: 1)}

\item{max_retries}{Integer; maximum retry attempts (default: 5)}

\item{timeout}{Numeric; request timeout in seconds (default: 30)}
-
-\item{parameters}{List of parameters for the API endpoint, defaults to
-\code{list(return_all_scores = TRUE)}}
-
-\item{batch_size}{Integer; number of texts per batch (default: 4)}
-
-\item{progress}{Logical; whether to show progress bar (default: TRUE)}
}
\value{
Original data frame with additional columns for classification scores,
diff --git a/tests/testthat/test-hf_embed.R b/tests/testthat/test-hf_embed.R
index a63276b..02ec038 100644
--- a/tests/testthat/test-hf_embed.R
+++ b/tests/testthat/test-hf_embed.R
@@ -151,7 +151,7 @@ test_that("hf_embed_df works correctly with real endpoint", {

test_that("hf_embed_df works with different batch sizes", {
  test_df <- data.frame(
-    id = c(paste0("id" 1:2)),
+    id = c(paste0("id", 1:2)),
    text = c("text1", "text2"),
    stringsAsFactors = FALSE
  )

From a79bba88e4fd8e9c67717f1ac02daa790eb20629 Mon Sep 17 00:00:00 2001
From: jpcompartir
Date: Thu, 20 Nov 2025 15:53:05 +0000
Subject: [PATCH 43/56] Re-factor such that chunks don't overwrite variable names - writes to files with the original variable names.

---
 R/hf_classify.R                      | 37 +++++++++++++++++++++-------
 R/hf_embed.R                         | 16 ++++++++----
 dev_docs/hf_classify_dev.qmd         | 24 +++++++++++++++---
 man/hf_classify_chunks.Rd            | 23 ++++++++++++++---
 man/hf_embed_chunks.Rd               |  7 ++++--
 tests/testthat/test-hf_embed.R       |  2 +-
 vignettes/hugging_face_inference.Rmd | 22 +++++++++++------
 7 files changed, 100 insertions(+), 31 deletions(-)

diff --git a/R/hf_classify.R b/R/hf_classify.R
index d16d3d2..efc7303 100644
--- a/R/hf_classify.R
+++ b/R/hf_classify.R
@@ -380,7 +380,7 @@ hf_classify_batch <- function(texts,
#'
#' @param texts Character vector of texts to classify
#' @param ids Vector of unique identifiers corresponding to each text (same length as texts)
-#' @param endpoint_url Hugging Face Embedding Endpoint
+#' @param endpoint_url Hugging Face Classification Endpoint
#' @param max_length The maximum number of tokens in the text variable. Beyond this cut-off everything is truncated.
#' @param tidy_func Function to process API responses, defaults to #' `tidy_classification_response` @@ -388,15 +388,26 @@ hf_classify_batch <- function(texts, #' @param chunk_size Number of texts to process in each chunk before writing to disk (default: 5000) #' @param concurrent_requests Integer; number of concurrent requests (default: 5) #' @param max_retries Integer; maximum retry attempts (default: 5) -#' @param timeout Numeric; request timeout in seconds (default: 20) +#' @param timeout Numeric; request timeout in seconds (default: 30) #' @param key_name Name of environment variable containing the API key +#' @param id_col_name Name for the ID column in output (default: "id"). When called from hf_classify_df(), this preserves the original column name. +#' @param text_col_name Name for the text column in output (default: "text"). When called from hf_classify_df(), this preserves the original column name. #' #' @returns A data frame of classified documents with successes and failures #' @export #' #' @examples #' \dontrun{ -#' 1+1 = 2 +#' # basic usage with vectors +#' texts <- c("I love this", "I hate this", "This is ok") +#' ids <- c("review_1", "review_2", "review_3") +#' +#' results <- hf_classify_chunks( +#' texts = texts, +#' ids = ids, +#' endpoint_url = "https://your-endpoint.huggingface.cloud", +#' key_name = "HF_API_KEY" +#' ) #' } # hf_classify_chunks docs ---- hf_classify_chunks <- function(texts, @@ -409,7 +420,9 @@ hf_classify_chunks <- function(texts, concurrent_requests = 5L, max_retries = 5L, timeout = 30L, - key_name = "HF_API_KEY" + key_name = "HF_API_KEY", + id_col_name = "id", + text_col_name = "text" ) { # input validation ---- @@ -534,8 +547,8 @@ hf_classify_chunks <- function(texts, purrr::list_rbind() chunk_results$successes <- tibble::tibble( - id = successes_ids, - text = successes_texts, + !!id_col_name := successes_ids, + !!text_col_name := successes_texts, .error = FALSE, .error_msg = NA_character_, .chunk = chunk_num @@ -552,8 +565,8 @@ hf_classify_chunks <- function(texts, chunk_results$failures <- tibble::tibble( - id = failures_ids, - text = failures_texts, + !!id_col_name := failures_ids, + !!text_col_name := failures_texts, .error = TRUE, .error_msg = failures_msgs, .chunk = chunk_num @@ -665,6 +678,10 @@ hf_classify_df <- function(df, text_vec <- dplyr::pull(df, !!text_sym) indices_vec <- dplyr::pull(df, !!id_sym) + # preserve original column names + id_col_name <- rlang::as_name(id_sym) + text_col_name <- rlang::as_name(text_sym) + chunk_size <- if(is.null(chunk_size) || chunk_size <=1) 1 else chunk_size results <- hf_classify_chunks( @@ -678,7 +695,9 @@ hf_classify_df <- function(df, max_retries = max_retries, timeout = timeout, key_name = key_name, - output_dir = output_dir + output_dir = output_dir, + id_col_name = id_col_name, + text_col_name = text_col_name ) return(results) diff --git a/R/hf_embed.R b/R/hf_embed.R index 971bcf2..a199b46 100644 --- a/R/hf_embed.R +++ b/R/hf_embed.R @@ -253,9 +253,10 @@ hf_embed_batch <- function(texts, #' @param max_retries Maximum retry attempts per failed request (default: 5) #' @param timeout Request timeout in seconds (default: 10) #' @param key_name Name of environment variable containing the API key (default: "HF_API_KEY") +#' @param id_col_name Name for the ID column in output (default: "id"). When called from hf_embed_df(), this preserves the original column name. 
#' #' @return A tibble with columns: -#' - `id`: Original identifier from input +#' - ID column (name specified by `id_col_name`): Original identifier from input #' - `.error`: Logical indicating if request failed #' - `.error_msg`: Error message if failed, NA otherwise #' - `.chunk`: Chunk number for tracking @@ -271,7 +272,8 @@ hf_embed_chunks <- function(texts, concurrent_requests = 5L, max_retries = 5L, timeout = 10L, - key_name = "HF_API_KEY") { + key_name = "HF_API_KEY", + id_col_name = "id") { # input validation ---- stopifnot( @@ -374,7 +376,7 @@ hf_embed_chunks <- function(texts, purrr::list_rbind() chunk_results$successes <- tibble::tibble( - id = successes_ids, + !!id_col_name := successes_ids, .error = FALSE, .error_msg = NA_character_, .chunk = chunk_num @@ -387,7 +389,7 @@ hf_embed_chunks <- function(texts, failures_msgs <- purrr::map_chr(failures, \(x) purrr::pluck(x, "message", .default = "Unknown error")) chunk_results$failures <- tibble::tibble( - id = failures_ids, + !!id_col_name := failures_ids, .error = TRUE, .error_msg = failures_msgs, .chunk = chunk_num @@ -499,6 +501,9 @@ hf_embed_df <- function(df, texts <- dplyr::pull(df, !!text_sym) indices <- dplyr::pull(df, !!id_sym) + # preserve original column name + id_col_name <- rlang::as_name(id_sym) + chunk_size <- if(is.null(chunk_size) || chunk_size <= 1) 1 else chunk_size results <- hf_embed_chunks( @@ -510,7 +515,8 @@ hf_embed_df <- function(df, concurrent_requests = concurrent_requests, max_retries = max_retries, timeout = timeout, - output_dir = output_dir + output_dir = output_dir, + id_col_name = id_col_name ) return(results) diff --git a/dev_docs/hf_classify_dev.qmd b/dev_docs/hf_classify_dev.qmd index db01c26..e034d6a 100644 --- a/dev_docs/hf_classify_dev.qmd +++ b/dev_docs/hf_classify_dev.qmd @@ -536,10 +536,10 @@ should_pass <- hf_classify_chunks( concurrent_requests = 5, timeout = 60, max_retries = 10, - max_length = 128L + max_length = 128L, + id_col_name = "universal_message_id", + text_col_name = ".text_col" ) - - ``` ``` @@ -604,3 +604,21 @@ Test the function over 10k data points timeout = 60 ) ``` + +```{r} +x |> + slice(1:10) |> + hf_classify_df( + message, + universal_message_id, + endpoint_url = test_sent_url, + key_name = "HF_API_KEY", + max_length = 128, + output_dir = "test_dir/test_classify/ten_thousand_rows", + tidy_func = tidy_classification_response, + chunk_size = 2500, + concurrent_requests = 15, + max_retries = 10L, + timeout = 60 + ) +``` diff --git a/man/hf_classify_chunks.Rd b/man/hf_classify_chunks.Rd index 5a59e9e..9fd875b 100644 --- a/man/hf_classify_chunks.Rd +++ b/man/hf_classify_chunks.Rd @@ -15,7 +15,9 @@ hf_classify_chunks( concurrent_requests = 5L, max_retries = 5L, timeout = 30L, - key_name = "HF_API_KEY" + key_name = "HF_API_KEY", + id_col_name = "id", + text_col_name = "text" ) } \arguments{ @@ -23,7 +25,7 @@ hf_classify_chunks( \item{ids}{Vector of unique identifiers corresponding to each text (same length as texts)} -\item{endpoint_url}{Hugging Face Embedding Endpoint} +\item{endpoint_url}{Hugging Face Classification Endpoint} \item{max_length}{The maximum number of tokens in the text variable. 
Beyond this cut-off everything is truncated.} @@ -38,9 +40,13 @@ hf_classify_chunks( \item{max_retries}{Integer; maximum retry attempts (default: 5)} -\item{timeout}{Numeric; request timeout in seconds (default: 20)} +\item{timeout}{Numeric; request timeout in seconds (default: 30)} \item{key_name}{Name of environment variable containing the API key} + +\item{id_col_name}{Name for the ID column in output (default: "id"). When called from hf_classify_df(), this preserves the original column name.} + +\item{text_col_name}{Name for the text column in output (default: "text"). When called from hf_classify_df(), this preserves the original column name.} } \value{ A data frame of classified documents with successes and failures @@ -60,6 +66,15 @@ For single text classification, use \code{hf_classify_text()} instead. } \examples{ \dontrun{ -1+1 = 2 +# basic usage with vectors +texts <- c("I love this", "I hate this", "This is ok") +ids <- c("review_1", "review_2", "review_3") + +results <- hf_classify_chunks( + texts = texts, + ids = ids, + endpoint_url = "https://your-endpoint.huggingface.cloud", + key_name = "HF_API_KEY" +) } } diff --git a/man/hf_embed_chunks.Rd b/man/hf_embed_chunks.Rd index 353a203..9920509 100644 --- a/man/hf_embed_chunks.Rd +++ b/man/hf_embed_chunks.Rd @@ -13,7 +13,8 @@ hf_embed_chunks( concurrent_requests = 5L, max_retries = 5L, timeout = 10L, - key_name = "HF_API_KEY" + key_name = "HF_API_KEY", + id_col_name = "id" ) } \arguments{ @@ -34,11 +35,13 @@ hf_embed_chunks( \item{timeout}{Request timeout in seconds (default: 10)} \item{key_name}{Name of environment variable containing the API key (default: "HF_API_KEY")} + +\item{id_col_name}{Name for the ID column in output (default: "id"). When called from hf_embed_df(), this preserves the original column name.} } \value{ A tibble with columns: \itemize{ -\item \code{id}: Original identifier from input +\item ID column (name specified by \code{id_col_name}): Original identifier from input \item \code{.error}: Logical indicating if request failed \item \code{.error_msg}: Error message if failed, NA otherwise \item \code{.chunk}: Chunk number for tracking diff --git a/tests/testthat/test-hf_embed.R b/tests/testthat/test-hf_embed.R index 02ec038..50c6a7c 100644 --- a/tests/testthat/test-hf_embed.R +++ b/tests/testthat/test-hf_embed.R @@ -166,7 +166,7 @@ test_that("hf_embed_df works with different batch sizes", { key_name = "HF_TEST_API_KEY", chunk_size = 1, concurrent_requests = 1, - output_dir = output_dir + output_dir = temp_dir ) ) |> suppressMessages() diff --git a/vignettes/hugging_face_inference.Rmd b/vignettes/hugging_face_inference.Rmd index 68b945f..f985801 100644 --- a/vignettes/hugging_face_inference.Rmd +++ b/vignettes/hugging_face_inference.Rmd @@ -71,7 +71,9 @@ Go to [Hugging Face's models hub](https://huggingface.co/models) and fetch the I # Understanding the Function Hierarchy -EndpointR provides four levels of functions for working with Hugging Face endpoints: +EndpointR provides four levels of functions for working with Hugging Face endpoints. + +> **KEY FEATURE**: The `*_df()` and `*_chunks()` functions preserve your original column names. If you pass a data frame with columns named `review_id` and `review_text`, those exact names will appear in the output and in the saved `.parquet` files. This makes it easy to join results back to your original data. 
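+
+A minimal sketch of what this looks like in practice (it assumes `classify_url` points at a classification endpoint, as set up elsewhere in this vignette):
+
+```{r}
+reviews <- tibble(
+  review_id = c("r1", "r2"),
+  review_text = c("Loved it", "Not for me")
+)
+
+res <- hf_classify_df(
+  df = reviews,
+  text_var = review_text,
+  id_var = review_id,
+  endpoint_url = classify_url,
+  key_name = "HF_API_KEY"
+)
+
+# the output keeps `review_id` and `review_text`, so joining back is direct
+res |>
+  filter(!.error) |>
+  left_join(reviews, by = c("review_id", "review_text"))
+```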
## Single Text Functions
@@ -365,10 +367,10 @@ Always verify your results:
```{r}
embedding_result |> count(.error)

-# View any failures
+# View any failures (column names match your original data frame)
failures <- embedding_result |>
  filter(.error == TRUE) |>
-  select(id, text, .error_msg)
+  select(id, .error_msg)

# Extract just the embeddings for successful rows
embeddings_only <- embedding_result |>
@@ -411,7 +413,7 @@ classification_result <- hf_classify_df(

The result includes:

-- Your original `id` and `text` columns
+- Your original ID and text columns (with their original names preserved)
- Classification labels (e.g., POSITIVE, NEGATIVE)
- Confidence scores
- Error tracking columns (`.error`, `.error_msg`)
- Chunk tracking (`.chunk`)
@@ -419,6 +421,8 @@ The result includes:

> **NOTE**: Classification labels are model and task specific. Check the model card on Hugging Face for label mappings.

+> **IMPORTANT**: The function preserves your original column names. If your data frame has `review_id` and `review_text`, those names will appear in the output, not generic `id` and `text`.
+
## Renaming Classification Labels

Many classification models use generic labels like `LABEL_0`, `LABEL_1`. You can rename these:
@@ -601,13 +605,17 @@ Always check for errors and consider retrying failures:
# Check results for errors
results |> count(.error)

-# Identify failed texts
+# Identify failed texts (column names match your input data frame)
failed <- results |> filter(.error == TRUE)
-failed |> select(id, text, .error_msg)
+
+# Note: Column names below will match your original data frame
+# If you used review_id and review_text, use those names instead
+failed |> select(id, .error_msg)

# Retry failed texts with adjusted parameters
+# Access text column by its actual name from your data
retry_results <- hf_embed_batch(
-  texts = failed$text,
+  texts = failed$text,  # Use your actual column name
  endpoint_url = embed_url,
  key_name = "HF_API_KEY",
  batch_size = 1,  # One at a time for failures

From 6690822c91098b9c0e9763ec2cbda0d5d4ab93da Mon Sep 17 00:00:00 2001
From: jpcompartir
Date: Wed, 3 Dec 2025 22:10:09 +0000
Subject: [PATCH 44/56] rephrase to chunking vocab

switch to output_dir = for oai_complete_chunks & oai_complete_df
add id_col_name to oai_complete_chunks
fix docs
---
 R/openai_completions.R | 191 ++++++++++++++++++++++++++---------------
 1 file changed, 123 insertions(+), 68 deletions(-)

diff --git a/R/openai_completions.R b/R/openai_completions.R
index 957d1be..8e318a6 100644
--- a/R/openai_completions.R
+++ b/R/openai_completions.R
@@ -312,7 +312,7 @@ oai_complete_text <- function(text,
#' Process text chunks through OpenAI's Chat Completions API with batch file output
#'
#' This function processes large volumes of text through OpenAI's Chat Completions API
-#' in configurable chunks, writing results progressively to a CSV file. It handles
+#' in configurable chunks, writing results progressively to parquet files. It handles
#' concurrent requests, automatic retries, and structured outputs while
#' managing memory efficiently for large-scale processing.
#'
#' @details This function is designed for processing large text datasets that may not
#' fit comfortably in memory. It divides the input into chunks, processes each chunk
#' with concurrent API requests, and writes results immediately to disk to minimise
#' memory usage.
#'
#' The function preserves data integrity by matching results to source texts through
#' the `ids` parameter.
Each chunk is processed independently with results written as +#' parquet files to the output directory, allowing for resumable processing if interrupted. #' #' When using structured outputs with a `schema`, responses are validated against -#' the JSON schema but stored as raw JSON strings in the output file. This allows +#' the JSON schema but stored as raw JSON strings in the output files. This allows #' for flexible post-processing without memory constraints during the API calls. #' #' The chunking strategy balances API efficiency with memory management. Larger #' `chunk_size` values reduce overhead but increase memory usage. Adjust based on #' your system resources and text sizes. #' +#' Avoid risk of data loss by setting a low-ish chunk_size (e.g. 5,000, 10,000). Each chunk is written to a `.parquet` file in the `output_dir=` directory, which also contains a `metadata.json` file which tracks important information such as the model and endpoint URL used. Be sure to add output directories to .gitignore! +#' #' @param texts Character vector of texts to process #' @param ids Vector of unique identifiers corresponding to each text (same length as texts) #' @param chunk_size Number of texts to process in each batch (default: 5000) #' @param model OpenAI model to use (default: "gpt-4.1-nano") #' @param system_prompt Optional system prompt applied to all requests -#' @param output_file Path to .CSV file for results. "auto" generates the filename, location and is persistent across sessions. If NULL, generates timestamped filename. +#' @param output_dir Path to directory for the .parquet chunks. "auto" generates a timestamped directory name. If NULL, uses a temporary directory. #' @param schema Optional JSON schema for structured output (json_schema object or list) #' @param concurrent_requests Integer; number of concurrent requests (default: 5) #' @param temperature Sampling temperature (0-2), lower = more deterministic (default: 0) @@ -347,29 +349,46 @@ oai_complete_text <- function(text, #' @param timeout Request timeout in seconds (default: 30) #' @param key_name Name of environment variable containing the API key (default: OPENAI_API_KEY) #' @param endpoint_url OpenAI API endpoint URL +#' @param id_col_name Name for the ID column in output (default: "id"). When called from oai_complete_df(), this preserves the original column name. 
#' #' @return A tibble containing all results with columns: -#' - `id`: Original identifier from input +#' - ID column (name specified by `id_col_name`): Original identifier from input #' - `content`: API response content (text or JSON string if schema used) #' - `.error`: Logical indicating if request failed #' - `.error_msg`: Error message if failed, NA otherwise -#' - `.batch`: Batch number for tracking +#' - `.chunk`: Chunk number for tracking #' #' @export #' @examples #' \dontrun{ -#' # basic usage with automatic file naming: -#' -#' # large-scale processing with custom output file: - -#' #structured extraction with schema: -#' +#' # basic usage with automatic directory naming: +#' result <- oai_complete_chunks( +#' texts = my_texts, +#' ids = my_ids, +#' model = "gpt-4.1-nano" +#' ) +#' +#' # large-scale processing with custom output directory: +#' result <- oai_complete_chunks( +#' texts = my_texts, +#' ids = my_ids, +#' output_dir = "my_results", +#' chunk_size = 10000 +#' ) +#' +#' # structured extraction with schema: +#' result <- oai_complete_chunks( +#' texts = my_texts, +#' ids = my_ids, +#' schema = my_schema, +#' temperature = 0 +#' ) #' #' # post-process structured results: -#' xx <- xx |> +#' processed <- result |> #' dplyr::filter(!.error) |> -#' dplyr::mutate(parsed = map(content, ~jsonlite::fromJSON(.x))) |> -#' unnest_wider(parsed) +#' dplyr::mutate(parsed = purrr::map(content, ~jsonlite::fromJSON(.x))) |> +#' tidyr::unnest_wider(parsed) #' } # oai_complete_chunks docs ---- oai_complete_chunks <- function(texts, @@ -377,7 +396,7 @@ oai_complete_chunks <- function(texts, chunk_size = 5000L, model = "gpt-4.1-nano", system_prompt = NULL, - output_file = "auto", + output_dir = "auto", schema = NULL, concurrent_requests = 5L, temperature = 0L, @@ -385,7 +404,8 @@ oai_complete_chunks <- function(texts, max_retries = 5L, timeout = 30L, key_name = "OPENAI_API_KEY", - endpoint_url = "https://api.openai.com/v1/chat/completions" + endpoint_url = "https://api.openai.com/v1/chat/completions", + id_col_name = "id" ) { # input validation ---- stopifnot( @@ -395,7 +415,11 @@ oai_complete_chunks <- function(texts, "chunk_size must be a positive integer greater than 1" = is.numeric(chunk_size) && chunk_size > 0 ) - output_file <- .handle_output_filename(output_file) + output_dir <- .handle_output_directory(output_dir, base_dir_name = "oai_completions_batch") + + if (!dir.exists(output_dir)) { + dir.create(output_dir, recursive = TRUE) + } # make sure we json_dump the schema here if necessary, so that we don't json_dump for every individual document if(!is.null(schema) && inherits(schema, "EndpointR::json_schema")) { @@ -405,27 +429,52 @@ oai_complete_chunks <- function(texts, } batch_data <- batch_vector(seq_along(texts), chunk_size) - n_batches <- length(batch_data$batch_indices) + n_chunks <- length(batch_data$batch_indices) - cli::cli_alert_info("Processing {length(texts)} text{?s} in {n_batches} chunk{?s} of up to {chunk_size} each") + # write metadata to track important information for debugging and reproducibility + metadata <- list( + model = model, + endpoint_url = endpoint_url, + chunk_size = chunk_size, + n_texts = length(texts), + concurrent_requests = concurrent_requests, + timeout = timeout, + max_retries = max_retries, + temperature = temperature, + max_tokens = max_tokens, + output_dir = output_dir, + key_name = key_name, + n_chunks = n_chunks, + has_schema = !is.null(schema), + has_system_prompt = !is.null(system_prompt), + timestamp = Sys.time() + ) + + 
jsonlite::write_json(metadata, + file.path(output_dir, "metadata.json"), + auto_unbox = TRUE, + pretty = TRUE) + + cli::cli_alert_info("Processing {length(texts)} text{?s} in {n_chunks} chunk{?s} of up to {chunk_size} each") + cli::cli_alert_info("Intermediate results will be saved as parquet files in {output_dir}") total_successes <- 0 total_failures <- 0 - ## batch processing ---- - for (batch_num in seq_along(batch_data$batch_indices)) + ## chunk processing ---- + for (chunk_num in seq_along(batch_data$batch_indices)) { - batch_indices <- batch_data$batch_indices[[batch_num]] - batch_texts <- texts[batch_indices] - batch_ids <- ids[batch_indices] + chunk_indices <- batch_data$batch_indices[[chunk_num]] + chunk_texts <- texts[chunk_indices] + chunk_ids <- ids[chunk_indices] - cli::cli_progress_message("Processing batch {batch_num}/{n_batches} ({length(batch_indices)} text{?s})") + cli::cli_progress_message("Processing chunk {chunk_num}/{n_chunks} ({length(chunk_indices)} text{?s})") - ## build batch requests ---- + ## build chunk requests ---- requests <- oai_build_completions_request_list( - inputs = batch_texts, + inputs = chunk_texts, model = model, temperature = temperature, max_tokens = max_tokens, @@ -435,20 +484,19 @@ oai_complete_chunks <- function(texts, endpoint_url = endpoint_url, max_retries = max_retries, timeout = timeout, - endpointr_ids = batch_ids + endpointr_ids = chunk_ids ) - # make sure we have some valid requests, or skip to the next iter + # make sure we have some valid requests, or skip to the next iteration is_valid_request <- purrr::map_lgl(requests, ~inherits(.x, "httr2_request")) valid_requests <- requests[is_valid_request] if (length(valid_requests) == 0) { - cli::cli_alert_warning("No valid request{?s} in batch {batch_num}, skipping") + cli::cli_alert_warning("No valid request{?s} in chunk {chunk_num}, skipping") next } - # perform batch requests ---- - # get chunk_size individual responses and then handle them + # perform chunk requests ---- responses <- perform_requests_with_strategy( valid_requests, concurrent_requests = concurrent_requests, @@ -463,20 +511,20 @@ oai_complete_chunks <- function(texts, total_successes <- total_successes + n_successes total_failures <- total_failures + n_failures - ## process batch responses ---- - # within batch results - batch_results <- list() + ## process chunk responses ---- + # within chunk results + chunk_results <- list() if (length(successes) > 0) { successes_ids <- purrr::map(successes, ~purrr::pluck(.x, "request", "headers", "endpointr_id")) |> unlist() successes_content <- purrr::map_chr(successes, .extract_successful_completion_content) - batch_results$successes <- tibble::tibble( - id = successes_ids, + chunk_results$successes <- tibble::tibble( + !!id_col_name := successes_ids, content = successes_content, .error = FALSE, .error_msg = NA_character_, - .batch = batch_num + .chunk = chunk_num ) } @@ -484,39 +532,33 @@ oai_complete_chunks <- function(texts, failures_ids <- purrr::map(failures, ~purrr::pluck(.x, "request", "headers", "endpointr_id")) |> unlist() failures_msgs <- purrr::map_chr(failures, ~purrr::pluck(.x, "message", .default = "Unknown error")) - batch_results$failures <- tibble::tibble( - id = failures_ids, + chunk_results$failures <- tibble::tibble( + !!id_col_name := failures_ids, content = NA_character_, .error = TRUE, .error_msg = failures_msgs, - .batch = batch_num + .chunk = chunk_num ) } - batch_df <- dplyr::bind_rows(batch_results) + chunk_df <- dplyr::bind_rows(chunk_results) - # 
if(!is.null(output_file)){ # skip writing if output_file = NULL - can't be NULL after .handle_output_filename - as if NULL we write to tmp file
-    if (nrow(batch_df) > 0) {
-      if (batch_num == 1) {
-        # if we're in the first batch write to csv with headers (col names)
-        readr::write_csv(batch_df, output_file, append = FALSE)
-      } else {
-        # all other batches, append and don't use col names
-        readr::write_csv(batch_df, output_file, append = TRUE, col_names = FALSE)
-      }
+    if (nrow(chunk_df) > 0) {
+      chunk_file <- glue::glue("{output_dir}/chunk_{stringr::str_pad(chunk_num, 3, pad = '0')}.parquet")
+      arrow::write_parquet(chunk_df, chunk_file)
    }
-  }

-    cli::cli_alert_success("Batch {batch_num}: {n_successes} successful, {n_failures} failed")
+    cli::cli_alert_success("Chunk {chunk_num}: {n_successes} successful, {n_failures} failed")

-    rm(requests, responses, successes, failures, batch_results, batch_df)
+    rm(requests, responses, successes, failures, chunk_results, chunk_df)
    gc(verbose = FALSE)
  }

-  cli::cli_alert_success("Completed processing: {total_successes} successful, {total_failures} failed")
+  parquet_files <- list.files(output_dir, pattern = "\\.parquet$", full.names = TRUE)

-  # retrieve all results from the output file (results for all batches) - this may still be too inefficient, and should perhaps write to duckdb(?)
-  final_results <- readr::read_csv(output_file, show_col_types = FALSE)
+  cli::cli_alert_info("Processing completed, there were {total_successes} successes and {total_failures} failures.")
+  final_results <- arrow::open_dataset(parquet_files, format = "parquet") |>
+    dplyr::collect()

  return(final_results)
}
@@ -535,8 +577,8 @@ oai_complete_chunks <- function(texts,
#' results matched to the original data through the `id_var` parameter.
#'
#' The chunking approach enables processing of large data frames without memory
-#' constraints. Results are written progressively to a CSV file (either specified
-#' or auto-generated) and then read back as the return value.
+#' constraints. Results are written progressively as parquet files (either to a specified
+#' directory or auto-generated) and then read back as the return value.
#'
#' When using structured outputs with a `schema`, responses are validated against
#' the JSON schema and stored as JSON strings. Post-processing may be needed to
#'
#' Failed requests are marked with `.error = TRUE` and include error messages,
#' allowing for easy filtering and retry logic on failures.
#'
+#' Avoid risk of data loss by setting a low-ish chunk_size (e.g. 5,000, 10,000). Each chunk is written to a `.parquet` file in the `output_dir=` directory, which also contains a `metadata.json` file. Be sure to add output directories to .gitignore!
+#' #' @param df Data frame containing text to process #' @param text_var Column name (unquoted) containing text inputs #' @param id_var Column name (unquoted) for unique row identifiers @@ -554,19 +598,29 @@ oai_complete_chunks <- function(texts, #' - `content`: API response content (text or JSON string if schema used) #' - `.error`: Logical indicating if request failed #' - `.error_msg`: Error message if failed, NA otherwise -#' - `.batch`: Batch number for tracking +#' - `.chunk`: Chunk number for tracking #' #' @export #' @examples #' \dontrun{ +#' df <- data.frame( +#' id = 1:100, +#' text = paste("Analyse this text:", 1:100) +#' ) #' +#' results <- oai_complete_df( +#' df = df, +#' text_var = text, +#' id_var = id, +#' model = "gpt-4.1-nano" +#' ) #' } #oai_complete_df docs---- oai_complete_df <- function(df, text_var, id_var, model = "gpt-4.1-nano", - output_file = "auto", + output_dir = "auto", system_prompt = NULL, schema = NULL, chunk_size = 1000, @@ -591,12 +645,14 @@ oai_complete_df <- function(df, "`chunk_size` must be a positive integer" = is.numeric(chunk_size) && chunk_size > 0 ) - - output_file <- .handle_output_filename(output_file, base_file_name = "oai_batch") + output_dir <- .handle_output_directory(output_dir, base_dir_name = "oai_completions_batch") text_vec <- dplyr::pull(df, !!text_sym) id_vec <- dplyr::pull(df, !!id_sym) + # preserve original column name + id_col_name <- rlang::as_name(id_sym) + results <- oai_complete_chunks( texts = text_vec, ids = id_vec, @@ -611,11 +667,10 @@ oai_complete_df <- function(df, max_tokens = max_tokens, key_name = key_name, endpoint_url = endpoint_url, - output_file = output_file + output_dir = output_dir, + id_col_name = id_col_name ) - results <- dplyr::rename(results, !!id_sym := id) - return(results) } From 6112078908d0f72561ba8fb6cf7b3e80e8463382 Mon Sep 17 00:00:00 2001 From: jpcompartir Date: Thu, 4 Dec 2025 10:14:12 +0000 Subject: [PATCH 45/56] refactor openai_embed.R for chunks --- R/openai_embed.R | 254 ++++++++++++++++++++++++++++++++++++++++++++--- 1 file changed, 241 insertions(+), 13 deletions(-) diff --git a/R/openai_embed.R b/R/openai_embed.R index f64bcb1..0e09e1d 100644 --- a/R/openai_embed.R +++ b/R/openai_embed.R @@ -497,8 +497,230 @@ oai_embed_batch <- function(texts, } -oai_embed_df <- function(df, text_var, id_var, model = "text-embedding-3-small", dimensions = NULL, batch_size = 1, concurrent_requests = 1, max_retries = 5, timeout = 20, endpoint_url = "https://api.openai.com/v1/embeddings", key_name = "OPENAI_API_KEY" ) { +# oai_embed_chunks docs ---- +#' Embed text chunks through OpenAI's Embeddings API +#' +#' This function processes large volumes of text through OpenAI's Embeddings API +#' in configurable chunks, writing results progressively to parquet files. It handles +#' concurrent requests, automatic retries, while managing memory efficiently for +#' large-scale processing. +#' +#' @details This function is designed for processing large text datasets that may not +#' fit comfortably in memory. It divides the input into chunks, processes each chunk +#' with concurrent API requests, and writes results immediately to disk to minimise +#' memory usage. +#' +#' The function preserves data integrity by matching results to source texts through +#' the `ids` parameter. Each chunk is processed independently with results written as +#' parquet files to the output directory. +#' +#' The chunking strategy balances API efficiency with memory management. 
Larger +#' `chunk_size` values reduce overhead but increase memory usage. Adjust based on +#' your system resources and text sizes. +#' +#' Avoid risk of data loss by setting a low-ish chunk_size (e.g. 5,000, 10,000). Each chunk is written to a `.parquet` file in the `output_dir=` directory, which also contains a `metadata.json` file which tracks important information such as the model and endpoint URL used. Be sure to add output directories to .gitignore! +#' +#' @param texts Character vector of texts to process +#' @param ids Vector of unique identifiers corresponding to each text (same length as texts) +#' @param model OpenAI embedding model to use (default: "text-embedding-3-small") +#' @param dimensions Number of embedding dimensions (default: 1536 for text-embedding-3-small) +#' @param output_dir Path to directory for the .parquet chunks. "auto" generates a timestamped directory name. If NULL, uses a temporary directory. +#' @param chunk_size Number of texts to process in each chunk before writing to disk (default: 5000) +#' @param concurrent_requests Number of concurrent requests (default: 5) +#' @param max_retries Maximum retry attempts per failed request (default: 5) +#' @param timeout Request timeout in seconds (default: 20) +#' @param endpoint_url OpenAI API endpoint URL (default: OpenAI's embedding endpoint) +#' @param key_name Name of environment variable containing the API key (default: "OPENAI_API_KEY") +#' @param id_col_name Name for the ID column in output (default: "id"). When called from oai_embed_df(), this preserves the original column name. +#' +#' @return A tibble with columns: +#' - ID column (name specified by `id_col_name`): Original identifier from input +#' - `.error`: Logical indicating if request failed +#' - `.error_msg`: Error message if failed, NA otherwise +#' - `.chunk`: Chunk number for tracking +#' - Embedding columns (V1, V2, etc.) 
+#' @export
+#'
+#' @examples
+#' \dontrun{
+#' # basic usage with automatic directory naming
+#' result <- oai_embed_chunks(
+#'   texts = my_texts,
+#'   ids = my_ids,
+#'   model = "text-embedding-3-small"
+#' )
+#'
+#' # large-scale processing with custom settings
+#' result <- oai_embed_chunks(
+#'   texts = my_texts,
+#'   ids = my_ids,
+#'   output_dir = "my_embeddings",
+#'   chunk_size = 10000,
+#'   dimensions = 512,
+#'   concurrent_requests = 10
+#' )
+#' }
+# oai_embed_chunks docs ----
+oai_embed_chunks <- function(texts,
+                             ids,
+                             model = "text-embedding-3-small",
+                             dimensions = 1536,
+                             output_dir = "auto",
+                             chunk_size = 5000L,
+                             concurrent_requests = 5L,
+                             max_retries = 5L,
+                             timeout = 20L,
+                             endpoint_url = "https://api.openai.com/v1/embeddings",
+                             key_name = "OPENAI_API_KEY",
+                             id_col_name = "id") {
+
+  # input validation ----
+  stopifnot(
+    "texts must be a vector" = is.vector(texts),
+    "ids must be a vector" = is.vector(ids),
+    "texts and ids must be the same length" = length(texts) == length(ids),
+    "chunk_size must be a positive integer" = is.numeric(chunk_size) && chunk_size > 0
+  )
+
+  output_dir <- .handle_output_directory(output_dir, base_dir_name = "oai_embeddings_batch")
+
+  if (!dir.exists(output_dir)) {
+    dir.create(output_dir, recursive = TRUE)
+  }
+
+  chunk_data <- batch_vector(seq_along(texts), chunk_size)
+  n_chunks <- length(chunk_data$batch_indices)
+
+  # write metadata to track important information for debugging and reproducibility
+  metadata <- list(
+    model = model,
+    endpoint_url = endpoint_url,
+    dimensions = dimensions,
+    chunk_size = chunk_size,
+    n_texts = length(texts),
+    concurrent_requests = concurrent_requests,
+    timeout = timeout,
+    max_retries = max_retries,
+    output_dir = output_dir,
+    key_name = key_name,
+    n_chunks = n_chunks,
+    timestamp = Sys.time()
+  )
+  jsonlite::write_json(metadata,
+                       file.path(output_dir, "metadata.json"),
+                       auto_unbox = TRUE,
+                       pretty = TRUE)
+
+  cli::cli_alert_info("Processing {length(texts)} text{?s} in {n_chunks} chunk{?s} of up to {chunk_size} each")
+  cli::cli_alert_info("Intermediate results will be saved as parquet files in {output_dir}")
+
+  total_successes <- 0
+  total_failures <- 0
+
+  ## chunk processing ----
+  for (chunk_num in seq_along(chunk_data$batch_indices)) {
+
+    chunk_indices <- chunk_data$batch_indices[[chunk_num]]
+    chunk_texts <- texts[chunk_indices]
+    chunk_ids <- ids[chunk_indices]
+
+    cli::cli_progress_message("Processing chunk {chunk_num}/{n_chunks} ({length(chunk_indices)} text{?s})")
+
+    ## build chunk requests ----
+    # use individual requests for each text rather than batching within request
+    requests <- purrr::map2(
+      .x = chunk_texts,
+      .y = chunk_ids,
+      .f = function(text, id) {
+        req <- oai_build_embedding_request(
+          input = text,
+          model = model,
+          dimensions = dimensions,
+          max_retries = max_retries,
+          timeout = timeout,
+          endpoint_url = endpoint_url,
+          key_name = key_name
+        )
+        # attach id to request headers for tracking
+        httr2::req_headers(req, endpointr_id = id)
+      }
+    )
+
+    # make sure we have some valid requests, or skip to the next iteration
+    is_valid_request <- purrr::map_lgl(requests, ~inherits(.x, "httr2_request"))
+    valid_requests <- requests[is_valid_request]
+
+    if (length(valid_requests) == 0) {
+      cli::cli_alert_warning("No valid request{?s} in chunk {chunk_num}, skipping")
+      next
+    }
+
+    # perform chunk requests ----
+    responses <- perform_requests_with_strategy(
+      valid_requests,
+      concurrent_requests = concurrent_requests,
+      progress = TRUE
+    )
+
+    successes <-
httr2::resps_successes(responses) + failures <- httr2::resps_failures(responses) + + n_successes <- length(successes) + n_failures <- length(failures) + total_successes <- total_successes + n_successes + total_failures <- total_failures + n_failures + + ## process chunk responses ---- + # within chunk results + chunk_results <- list() + + if (n_successes > 0) { + successes_ids <- purrr::map(successes, ~purrr::pluck(.x, "request", "headers", "endpointr_id")) |> unlist() + successes_content <- purrr::map(successes, tidy_oai_embedding) |> + purrr::list_rbind() + + chunk_results$successes <- tibble::tibble( + !!id_col_name := successes_ids, + .error = FALSE, + .error_msg = NA_character_, + .chunk = chunk_num + ) |> + dplyr::bind_cols(successes_content) + } + + if (n_failures > 0) { + failures_ids <- purrr::map(failures, ~purrr::pluck(.x, "request", "headers", "endpointr_id")) |> unlist() + failures_msgs <- purrr::map_chr(failures, ~purrr::pluck(.x, "message", .default = "Unknown error")) + + chunk_results$failures <- tibble::tibble( + !!id_col_name := failures_ids, + .error = TRUE, + .error_msg = failures_msgs, + .chunk = chunk_num + ) + } + + chunk_df <- dplyr::bind_rows(chunk_results) + + if (nrow(chunk_df) > 0) { + chunk_file <- glue::glue("{output_dir}/chunk_{stringr::str_pad(chunk_num, 3, pad = '0')}.parquet") + arrow::write_parquet(chunk_df, chunk_file) + } + + cli::cli_alert_success("Chunk {chunk_num}: {n_successes} successful, {n_failures} failed") + + rm(requests, responses, successes, failures, chunk_results, chunk_df) + gc(verbose = FALSE) + } + + parquet_files <- list.files(output_dir, pattern = "\\.parquet$", full.names = TRUE) + + cli::cli_alert_info("Processing completed, there were {total_successes} successes\n and {total_failures} failures.") + final_results <- arrow::open_dataset(parquet_files, format = "parquet") |> + dplyr::collect() + + return(final_results) } @@ -508,37 +730,43 @@ oai_embed_df <- function(df, text_var, id_var, model = "text-embedding-3-small", #' @description #' High-level function to generate embeddings for texts in a data frame using #' OpenAI's embedding API. This function handles the entire process from request -#' creation to response processing, with options for batching & concurrent requests. +#' creation to response processing, with options for chunking & concurrent requests. #' #' @details #' This function extracts texts from a specified column, generates embeddings using -#' `oai_embed_batch()`, and joins the results back to the original data frame using -#' a specified ID column. +#' `oai_embed_chunks()`, and returns the results matched to the original IDs. #' -#' The function preserves the original data frame structure and adds new columns -#' for embedding dimensions (V1, V2, ..., Vn). If the number of rows doesn't match -#' after processing (due to errors), it returns the results with a warning. +#' The chunking approach enables processing of large data frames without memory +#' constraints. Results are written progressively as parquet files (either to a specified +#' directory or auto-generated) and then read back as the return value. #' #' OpenAI's embedding API allows you to specify the number of dimensions for the -#' output embeddings, which can be useful for reducing memory usage, storage cost,s or matching +#' output embeddings, which can be useful for reducing memory usage, storage costs, or matching #' specific downstream requirements. The default is model-specific (1536 for #' text-embedding-3-small). 
\href{https://openai.com/index/new-embedding-models-and-api-updates/}{OpenAI Embedding Updates} #' +#' Avoid risk of data loss by setting a low-ish chunk_size (e.g. 5,000, 10,000). Each chunk is written to a `.parquet` file in the `output_dir=` directory, which also contains a `metadata.json` file. Be sure to add output directories to .gitignore! +#' #' @param df Data frame containing texts to embed #' @param text_var Column name (unquoted) containing texts to embed #' @param id_var Column name (unquoted) for unique row identifiers #' @param model OpenAI embedding model to use (default: "text-embedding-3-small") -#' @param dimensions Number of embedding dimensions (NULL uses model default) +#' @param dimensions Number of embedding dimensions (default: 1536) #' @param key_name Name of environment variable containing the API key -#' @param batch_size Number of texts to process in one batch (default: 10) +#' @param output_dir Path to directory for the .parquet chunks. "auto" generates a timestamped directory name. If NULL, uses a temporary directory. +#' @param chunk_size Number of texts to process in each chunk before writing to disk (default: 5000) #' @param concurrent_requests Number of concurrent requests (default: 1) #' @param max_retries Maximum retry attempts per request (default: 5) #' @param timeout Request timeout in seconds (default: 20) #' @param endpoint_url OpenAI API endpoint URL #' @param progress Whether to display a progress bar (default: TRUE) #' -#' @return Original data frame with additional columns for embeddings (V1, V2, etc.), -#' plus .error and .error_msg columns indicating any failures +#' @return A tibble with columns: +#' - ID column (preserves original column name): Original identifier from input +#' - `.error`: Logical indicating if request failed +#' - `.error_msg`: Error message if failed, NA otherwise +#' - `.chunk`: Chunk number for tracking +#' - Embedding columns (V1, V2, etc.) 
#' #' @export #' @@ -549,7 +777,7 @@ oai_embed_df <- function(df, text_var, id_var, model = "text-embedding-3-small", #' text = c("First example", "Second example", "Third example") #' ) #' -#' # Generate embeddings with default dimensions +#' # Generate embeddings with default settings #' embeddings_df <- oai_embed_df( #' df = df, #' text_var = text, From 3013cd0efefeb697f817ee0cc9a9267758cc97df Mon Sep 17 00:00:00 2001 From: jpcompartir Date: Wed, 3 Dec 2025 22:11:23 +0000 Subject: [PATCH 46/56] Move oai_embed_df to chunks func output_dir arg switch to chunk vocab --- R/openai_embed.R | 68 ++++++++++++++++-------------------------------- 1 file changed, 23 insertions(+), 45 deletions(-) diff --git a/R/openai_embed.R b/R/openai_embed.R index 0e09e1d..9acdcd4 100644 --- a/R/openai_embed.R +++ b/R/openai_embed.R @@ -789,8 +789,8 @@ oai_embed_chunks <- function(texts, #' df = df, #' text_var = text, #' id_var = id, -#' dimensions = 360, # smaller embeddings -#' batch_size = 5 +#' dimensions = 512, # smaller embeddings +#' chunk_size = 10000 #' ) #' #' # Use with concurrent requests for faster processing @@ -799,7 +799,7 @@ oai_embed_chunks <- function(texts, #' text_var = text, #' id_var = id, #' model = "text-embedding-3-large", -#' concurrent_requests = 3 +#' concurrent_requests = 5 #' ) #' } oai_embed_df <- function(df, @@ -808,10 +808,11 @@ oai_embed_df <- function(df, model = "text-embedding-3-small", dimensions = 1536, key_name = "OPENAI_API_KEY", - batch_size = 10, - concurrent_requests = 1, - max_retries = 5, - timeout = 20, + output_dir = "auto", + chunk_size = 5000L, + concurrent_requests = 1L, + max_retries = 5L, + timeout = 20L, endpoint_url = "https://api.openai.com/v1/embeddings", progress = TRUE) { @@ -820,60 +821,37 @@ oai_embed_df <- function(df, stopifnot( "df must be a data frame" = is.data.frame(df), + "df must not be empty" = nrow(df) > 0, + "text_var must exist in df" = rlang::as_name(text_sym) %in% names(df), + "id_var must exist in df" = rlang::as_name(id_sym) %in% names(df), "endpoint_url must be provided" = !is.null(endpoint_url) && nchar(endpoint_url) > 0, - "concurrent_requests must be a number greater than 0" = is.numeric(concurrent_requests) && concurrent_requests > 0, - "batch_size must be a number greater than 0" = is.numeric(batch_size) && batch_size > 0 + "concurrent_requests must be an integer" = is.numeric(concurrent_requests) && concurrent_requests > 0 ) - if (!rlang::as_string(text_sym) %in% names(df)) { - cli::cli_abort("Column {.code {rlang::as_string(text_sym)}} not found in data frame") - } - - if (!rlang::as_string(id_sym) %in% names(df)) { - cli::cli_abort("Column {.code {rlang::as_string(id_sym)}} not found in data frame") - } - - original_num_rows <- nrow(df) + output_dir <- .handle_output_directory(output_dir, base_dir_name = "oai_embeddings_batch") - # pull texts & ids into vectors for batch function texts <- dplyr::pull(df, !!text_sym) indices <- dplyr::pull(df, !!id_sym) - batch_size <- if(is.null(batch_size) || batch_size <= 1) 1 else batch_size + # preserve original column name + id_col_name <- rlang::as_name(id_sym) + + chunk_size <- if(is.null(chunk_size) || chunk_size <= 1) 1 else chunk_size - embeddings_tbl <- oai_embed_batch( + results <- oai_embed_chunks( texts = texts, + ids = indices, model = model, dimensions = dimensions, - batch_size = batch_size, + output_dir = output_dir, + chunk_size = chunk_size, concurrent_requests = concurrent_requests, max_retries = max_retries, timeout = timeout, endpoint_url = endpoint_url, key_name = 
key_name, - include_texts = FALSE, - relocate_col = 1 + id_col_name = id_col_name ) - df_with_row_id <- df |> dplyr::mutate(.row_id = dplyr::row_number()) - - embeddings_tbl <- embeddings_tbl |> - dplyr::mutate(.row_id = dplyr::row_number()) - - result_df <- df_with_row_id |> - dplyr::left_join(embeddings_tbl, by = ".row_id") |> - dplyr::select(-.row_id) - - # sanity check and alert user if there's a mismatch - final_num_rows <- nrow(result_df) - - if(final_num_rows != original_num_rows){ - cli::cli_warn("Rows in original data frame and returned data frame do not match:") - cli::cli_bullets(text = c( - "Rows in original data frame: {original_num_rows}", - "Rows in returned data frame: {final_num_rows}" - )) - } - - return(result_df) + return(results) } From 7d62a276ca6052eb4247826e9b9800b091a9a42f Mon Sep 17 00:00:00 2001 From: jpcompartir Date: Wed, 3 Dec 2025 22:16:44 +0000 Subject: [PATCH 47/56] fix tests affected by output_file -> output_dir changes add oai_embed_chunks to _pkgdown.yml update README and build --- README.Rmd | 4 +- README.md | 4 +- _pkgdown.yml | 1 + man/oai_complete_chunks.Rd | 52 +++++++++---- man/oai_complete_df.Rd | 24 ++++-- man/oai_embed_chunks.Rd | 98 ++++++++++++++++++++++++ tests/testthat/test-openai_completions.R | 12 +-- 7 files changed, 164 insertions(+), 31 deletions(-) create mode 100644 man/oai_embed_chunks.Rd diff --git a/README.Rmd b/README.Rmd index 22071ee..b5441c8 100644 --- a/README.Rmd +++ b/README.Rmd @@ -196,7 +196,7 @@ oai_complete_df( id_var = review_id, system_prompt = "Classify the following review:", key_name = "OPENAI_API_KEY", - output_file = "completions_output.parquet", # writes results to this file + output_dir = "completions_output", # writes .parquet chunks to this directory chunk_size = 1000, # process 1000 rows per chunk concurrent_requests = 5, # send 5 rows of data simultaneously max_retries = 5, @@ -214,7 +214,7 @@ oai_complete_df( system_prompt = "Classify the following review:", schema = sentiment_schema, key_name = "OPENAI_API_KEY", - output_file = "completions_output.parquet", + output_dir = "completions_output", chunk_size = 1000, concurrent_requests = 5 ) diff --git a/README.md b/README.md index e54bd32..12b1482 100644 --- a/README.md +++ b/README.md @@ -197,7 +197,7 @@ oai_complete_df( id_var = review_id, system_prompt = "Classify the following review:", key_name = "OPENAI_API_KEY", - output_file = "completions_output.parquet", # writes results to this file + output_dir = "completions_output", # writes .parquet chunks to this directory chunk_size = 1000, # process 1000 rows per chunk concurrent_requests = 5, # send 5 rows of data simultaneously max_retries = 5, @@ -215,7 +215,7 @@ oai_complete_df( system_prompt = "Classify the following review:", schema = sentiment_schema, key_name = "OPENAI_API_KEY", - output_file = "completions_output.parquet", + output_dir = "completions_output", chunk_size = 1000, concurrent_requests = 5 ) diff --git a/_pkgdown.yml b/_pkgdown.yml index aa2657d..0c24613 100644 --- a/_pkgdown.yml +++ b/_pkgdown.yml @@ -103,6 +103,7 @@ reference: - oai_build_embedding_request - oai_embed_text - oai_embed_batch + - oai_embed_chunks - oai_embed_df - tidy_oai_embedding - title: "JSON Schema for Structured Outputs" diff --git a/man/oai_complete_chunks.Rd b/man/oai_complete_chunks.Rd index d433341..60b1ccb 100644 --- a/man/oai_complete_chunks.Rd +++ b/man/oai_complete_chunks.Rd @@ -10,7 +10,7 @@ oai_complete_chunks( chunk_size = 5000L, model = "gpt-4.1-nano", system_prompt = NULL, - output_file = "auto", + 
output_dir = "auto", schema = NULL, concurrent_requests = 5L, temperature = 0L, @@ -18,7 +18,8 @@ oai_complete_chunks( max_retries = 5L, timeout = 30L, key_name = "OPENAI_API_KEY", - endpoint_url = "https://api.openai.com/v1/chat/completions" + endpoint_url = "https://api.openai.com/v1/chat/completions", + id_col_name = "id" ) } \arguments{ @@ -32,7 +33,7 @@ oai_complete_chunks( \item{system_prompt}{Optional system prompt applied to all requests} -\item{output_file}{Path to .CSV file for results. "auto" generates the filename, location and is persistent across sessions. If NULL, generates timestamped filename.} +\item{output_dir}{Path to directory for the .parquet chunks. "auto" generates a timestamped directory name. If NULL, uses a temporary directory.} \item{schema}{Optional JSON schema for structured output (json_schema object or list)} @@ -49,20 +50,22 @@ oai_complete_chunks( \item{key_name}{Name of environment variable containing the API key (default: OPENAI_API_KEY)} \item{endpoint_url}{OpenAI API endpoint URL} + +\item{id_col_name}{Name for the ID column in output (default: "id"). When called from oai_complete_df(), this preserves the original column name.} } \value{ A tibble containing all results with columns: \itemize{ -\item \code{id}: Original identifier from input +\item ID column (name specified by \code{id_col_name}): Original identifier from input \item \code{content}: API response content (text or JSON string if schema used) \item \code{.error}: Logical indicating if request failed \item \code{.error_msg}: Error message if failed, NA otherwise -\item \code{.batch}: Batch number for tracking +\item \code{.chunk}: Chunk number for tracking } } \description{ This function processes large volumes of text through OpenAI's Chat Completions API -in configurable chunks, writing results progressively to a CSV file. It handles +in configurable chunks, writing results progressively to parquet files. It handles concurrent requests, automatic retries, and structured outputs while managing memory efficiently for large-scale processing. } @@ -73,29 +76,48 @@ with concurrent API requests, and writes results immediately to disk to minimise memory usage. The function preserves data integrity by matching results to source texts through -the \code{ids} parameter. Each chunk is processed independently with results appended -to the output file, allowing for resumable processing if interrupted. +the \code{ids} parameter. Each chunk is processed independently with results written as +parquet files to the output directory, allowing for resumable processing if interrupted. When using structured outputs with a \code{schema}, responses are validated against -the JSON schema but stored as raw JSON strings in the output file. This allows +the JSON schema but stored as raw JSON strings in the output files. This allows for flexible post-processing without memory constraints during the API calls. The chunking strategy balances API efficiency with memory management. Larger \code{chunk_size} values reduce overhead but increase memory usage. Adjust based on your system resources and text sizes. + +Avoid risk of data loss by setting a low-ish chunk_size (e.g. 5,000, 10,000). Each chunk is written to a \code{.parquet} file in the \verb{output_dir=} directory, which also contains a \code{metadata.json} file which tracks important information such as the model and endpoint URL used. Be sure to add output directories to .gitignore! 
} \examples{ \dontrun{ -# basic usage with automatic file naming: +# basic usage with automatic directory naming: +result <- oai_complete_chunks( + texts = my_texts, + ids = my_ids, + model = "gpt-4.1-nano" +) -# large-scale processing with custom output file: -#structured extraction with schema: +# large-scale processing with custom output directory: +result <- oai_complete_chunks( + texts = my_texts, + ids = my_ids, + output_dir = "my_results", + chunk_size = 10000 +) +# structured extraction with schema: +result <- oai_complete_chunks( + texts = my_texts, + ids = my_ids, + schema = my_schema, + temperature = 0 +) # post-process structured results: -xx <- xx |> +processed <- result |> dplyr::filter(!.error) |> - dplyr::mutate(parsed = map(content, ~jsonlite::fromJSON(.x))) |> - unnest_wider(parsed) + dplyr::mutate(parsed = purrr::map(content, ~jsonlite::fromJSON(.x))) |> + tidyr::unnest_wider(parsed) } } diff --git a/man/oai_complete_df.Rd b/man/oai_complete_df.Rd index 15554b2..7679117 100644 --- a/man/oai_complete_df.Rd +++ b/man/oai_complete_df.Rd @@ -9,7 +9,7 @@ oai_complete_df( text_var, id_var, model = "gpt-4.1-nano", - output_file = "auto", + output_dir = "auto", system_prompt = NULL, schema = NULL, chunk_size = 1000, @@ -31,7 +31,7 @@ oai_complete_df( \item{model}{OpenAI model to use (default: "gpt-4.1-nano")} -\item{output_file}{Path to .CSV file for results. "auto" generates the filename, location and is persistent across sessions. If NULL, generates timestamped filename.} +\item{output_dir}{Path to directory for the .parquet chunks. "auto" generates a timestamped directory name. If NULL, uses a temporary directory.} \item{system_prompt}{Optional system prompt applied to all requests} @@ -59,7 +59,7 @@ A tibble with the original id column and additional columns: \item \code{content}: API response content (text or JSON string if schema used) \item \code{.error}: Logical indicating if request failed \item \code{.error_msg}: Error message if failed, NA otherwise -\item \code{.batch}: Batch number for tracking +\item \code{.chunk}: Chunk number for tracking } } \description{ @@ -75,8 +75,8 @@ processes texts in configurable chunks with concurrent API requests, and returns results matched to the original data through the \code{id_var} parameter. The chunking approach enables processing of large data frames without memory -constraints. Results are written progressively to a CSV file (either specified -or auto-generated) and then read back as the return value. +constraints. Results are written progressively as parquet files (either to a specified +directory or auto-generated) and then read back as the return value. When using structured outputs with a \code{schema}, responses are validated against the JSON schema and stored as JSON strings. Post-processing may be needed to @@ -84,9 +84,21 @@ unnest these into separate columns. Failed requests are marked with \code{.error = TRUE} and include error messages, allowing for easy filtering and retry logic on failures. + +Avoid risk of data loss by setting a low-ish chunk_size (e.g. 5,000, 10,000). Each chunk is written to a \code{.parquet} file in the \verb{output_dir=} directory, which also contains a \code{metadata.json} file. Be sure to add output directories to .gitignore! 
} \examples{ \dontrun{ - + df <- data.frame( + id = 1:100, + text = paste("Analyse this text:", 1:100) + ) + + results <- oai_complete_df( + df = df, + text_var = text, + id_var = id, + model = "gpt-4.1-nano" + ) } } diff --git a/man/oai_embed_chunks.Rd b/man/oai_embed_chunks.Rd new file mode 100644 index 0000000..ff15b36 --- /dev/null +++ b/man/oai_embed_chunks.Rd @@ -0,0 +1,98 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/openai_embed.R +\name{oai_embed_chunks} +\alias{oai_embed_chunks} +\title{Embed text chunks through OpenAI's Embeddings API} +\usage{ +oai_embed_chunks( + texts, + ids, + model = "text-embedding-3-small", + dimensions = 1536, + output_dir = "auto", + chunk_size = 5000L, + concurrent_requests = 5L, + max_retries = 5L, + timeout = 20L, + endpoint_url = "https://api.openai.com/v1/embeddings", + key_name = "OPENAI_API_KEY", + id_col_name = "id" +) +} +\arguments{ +\item{texts}{Character vector of texts to process} + +\item{ids}{Vector of unique identifiers corresponding to each text (same length as texts)} + +\item{model}{OpenAI embedding model to use (default: "text-embedding-3-small")} + +\item{dimensions}{Number of embedding dimensions (default: 1536 for text-embedding-3-small)} + +\item{output_dir}{Path to directory for the .parquet chunks. "auto" generates a timestamped directory name. If NULL, uses a temporary directory.} + +\item{chunk_size}{Number of texts to process in each chunk before writing to disk (default: 5000)} + +\item{concurrent_requests}{Number of concurrent requests (default: 5)} + +\item{max_retries}{Maximum retry attempts per failed request (default: 5)} + +\item{timeout}{Request timeout in seconds (default: 20)} + +\item{endpoint_url}{OpenAI API endpoint URL (default: OpenAI's embedding endpoint)} + +\item{key_name}{Name of environment variable containing the API key (default: "OPENAI_API_KEY")} + +\item{id_col_name}{Name for the ID column in output (default: "id"). When called from oai_embed_df(), this preserves the original column name.} +} +\value{ +A tibble with columns: +\itemize{ +\item ID column (name specified by \code{id_col_name}): Original identifier from input +\item \code{.error}: Logical indicating if request failed +\item \code{.error_msg}: Error message if failed, NA otherwise +\item \code{.chunk}: Chunk number for tracking +\item Embedding columns (V1, V2, etc.) +} +} +\description{ +This function processes large volumes of text through OpenAI's Embeddings API +in configurable chunks, writing results progressively to parquet files. It handles +concurrent requests, automatic retries, while managing memory efficiently for +large-scale processing. +} +\details{ +This function is designed for processing large text datasets that may not +fit comfortably in memory. It divides the input into chunks, processes each chunk +with concurrent API requests, and writes results immediately to disk to minimise +memory usage. + +The function preserves data integrity by matching results to source texts through +the \code{ids} parameter. Each chunk is processed independently with results written as +parquet files to the output directory. + +The chunking strategy balances API efficiency with memory management. Larger +\code{chunk_size} values reduce overhead but increase memory usage. Adjust based on +your system resources and text sizes. + +Avoid risk of data loss by setting a low-ish chunk_size (e.g. 5,000, 10,000). 
Each chunk is written to a \code{.parquet} file in the \verb{output_dir=} directory, which also contains a \code{metadata.json} file which tracks important information such as the model and endpoint URL used. Be sure to add output directories to .gitignore! +} +\examples{ +\dontrun{ + # basic usage with automatic directory naming + result <- oai_embed_chunks( + texts = my_texts, + ids = my_ids, + model = "text-embedding-3-small" + ) + + # large-scale processing with custom settings + result <- oai_embed_chunks( + texts = my_texts, + ids = my_ids, + output_dir = "my_embeddings", + chunk_size = 10000, + dimensions = 512, + concurrent_requests = 10 + ) +} +} diff --git a/tests/testthat/test-openai_completions.R b/tests/testthat/test-openai_completions.R index 3e8ed92..bb29a55 100644 --- a/tests/testthat/test-openai_completions.R +++ b/tests/testthat/test-openai_completions.R @@ -202,13 +202,13 @@ test_that("oai_complete_df takes single row, multi-row data frames as inputs", { endpoint_url = endpoint_url, concurrent_requests = 1, max_retries = 1, - output_file = NULL) + output_dir = NULL) ) ) expect_setequal(names(successful_response), - c("id", "content", ".error_msg", ".error", ".batch")) + c("id", "content", ".error", ".error_msg", ".chunk")) expect_setequal(unique(successful_response$content), "positive") withr::with_envvar( @@ -220,7 +220,7 @@ test_that("oai_complete_df takes single row, multi-row data frames as inputs", { endpoint_url = endpoint_url, concurrent_requests = 1, max_retries = 1, - output_file = NULL), + output_dir = NULL), regexp = "Performing 5 requests sequentially" ) @@ -236,7 +236,7 @@ test_that("oai_complete_df takes single row, multi-row data frames as inputs", { endpoint_url = endpoint_url, concurrent_requests = 5, max_retries = 1, - output_file = NULL), + output_dir = NULL), regexp = "with 5 concurrent requests" ) @@ -287,7 +287,7 @@ test_that("oai_complete_df takes a schema as input", { concurrent_requests = 1, max_retries = 1, schema = sentiment_schema, - output_file = NULL + output_dir = NULL ) ) @@ -357,7 +357,7 @@ test_that("oai_complete_df handles mixed validation success/failure", { concurrent_requests = 1, max_retries = 1, schema = sentiment_schema, - output_file = NULL + output_dir = NULL )} ) From 38e92c9c7e370bb09817ffc9d201b95e07cfaab2 Mon Sep 17 00:00:00 2001 From: jpcompartir Date: Thu, 4 Dec 2025 10:15:29 +0000 Subject: [PATCH 48/56] continue rebase --- NAMESPACE | 1 + man/oai_embed_df.Rd | 48 ++++++++++++++++++++++--------------- vignettes/llm_providers.Rmd | 16 ++++++------- 3 files changed, 38 insertions(+), 27 deletions(-) diff --git a/NAMESPACE b/NAMESPACE index 6346a46..5b2e951 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -25,6 +25,7 @@ export(oai_complete_chunks) export(oai_complete_df) export(oai_complete_text) export(oai_embed_batch) +export(oai_embed_chunks) export(oai_embed_df) export(oai_embed_text) export(perform_requests_with_strategy) diff --git a/man/oai_embed_df.Rd b/man/oai_embed_df.Rd index f7d90c4..ae62236 100644 --- a/man/oai_embed_df.Rd +++ b/man/oai_embed_df.Rd @@ -11,10 +11,11 @@ oai_embed_df( model = "text-embedding-3-small", dimensions = 1536, key_name = "OPENAI_API_KEY", - batch_size = 10, - concurrent_requests = 1, - max_retries = 5, - timeout = 20, + output_dir = "auto", + chunk_size = 5000L, + concurrent_requests = 1L, + max_retries = 5L, + timeout = 20L, endpoint_url = "https://api.openai.com/v1/embeddings", progress = TRUE ) @@ -28,11 +29,13 @@ oai_embed_df( \item{model}{OpenAI embedding model to use (default: 
"text-embedding-3-small")} -\item{dimensions}{Number of embedding dimensions (NULL uses model default)} +\item{dimensions}{Number of embedding dimensions (default: 1536)} \item{key_name}{Name of environment variable containing the API key} -\item{batch_size}{Number of texts to process in one batch (default: 10)} +\item{output_dir}{Path to directory for the .parquet chunks. "auto" generates a timestamped directory name. If NULL, uses a temporary directory.} + +\item{chunk_size}{Number of texts to process in each chunk before writing to disk (default: 5000)} \item{concurrent_requests}{Number of concurrent requests (default: 1)} @@ -45,27 +48,34 @@ oai_embed_df( \item{progress}{Whether to display a progress bar (default: TRUE)} } \value{ -Original data frame with additional columns for embeddings (V1, V2, etc.), -plus .error and .error_msg columns indicating any failures +A tibble with columns: +\itemize{ +\item ID column (preserves original column name): Original identifier from input +\item \code{.error}: Logical indicating if request failed +\item \code{.error_msg}: Error message if failed, NA otherwise +\item \code{.chunk}: Chunk number for tracking +\item Embedding columns (V1, V2, etc.) +} } \description{ High-level function to generate embeddings for texts in a data frame using OpenAI's embedding API. This function handles the entire process from request -creation to response processing, with options for batching & concurrent requests. +creation to response processing, with options for chunking & concurrent requests. } \details{ This function extracts texts from a specified column, generates embeddings using -\code{oai_embed_batch()}, and joins the results back to the original data frame using -a specified ID column. +\code{oai_embed_chunks()}, and returns the results matched to the original IDs. -The function preserves the original data frame structure and adds new columns -for embedding dimensions (V1, V2, ..., Vn). If the number of rows doesn't match -after processing (due to errors), it returns the results with a warning. +The chunking approach enables processing of large data frames without memory +constraints. Results are written progressively as parquet files (either to a specified +directory or auto-generated) and then read back as the return value. OpenAI's embedding API allows you to specify the number of dimensions for the -output embeddings, which can be useful for reducing memory usage, storage cost,s or matching +output embeddings, which can be useful for reducing memory usage, storage costs, or matching specific downstream requirements. The default is model-specific (1536 for text-embedding-3-small). \href{https://openai.com/index/new-embedding-models-and-api-updates/}{OpenAI Embedding Updates} + +Avoid risk of data loss by setting a low-ish chunk_size (e.g. 5,000, 10,000). Each chunk is written to a \code{.parquet} file in the \verb{output_dir=} directory, which also contains a \code{metadata.json} file. Be sure to add output directories to .gitignore! } \examples{ \dontrun{ @@ -74,7 +84,7 @@ text-embedding-3-small). \href{https://openai.com/index/new-embedding-models-and text = c("First example", "Second example", "Third example") ) - # Generate embeddings with default dimensions + # Generate embeddings with default settings embeddings_df <- oai_embed_df( df = df, text_var = text, @@ -86,8 +96,8 @@ text-embedding-3-small). 
\href{https://openai.com/index/new-embedding-models-and df = df, text_var = text, id_var = id, - dimensions = 360, # smaller embeddings - batch_size = 5 + dimensions = 512, # smaller embeddings + chunk_size = 10000 ) # Use with concurrent requests for faster processing @@ -96,7 +106,7 @@ text-embedding-3-small). \href{https://openai.com/index/new-embedding-models-and text_var = text, id_var = id, model = "text-embedding-3-large", - concurrent_requests = 3 + concurrent_requests = 5 ) } } diff --git a/vignettes/llm_providers.Rmd b/vignettes/llm_providers.Rmd index 381ed67..ff53a9c 100644 --- a/vignettes/llm_providers.Rmd +++ b/vignettes/llm_providers.Rmd @@ -85,7 +85,7 @@ oai_complete_df( review_df, text_var = text, id_var = id, - output_file = NULL, # leave this to 'auto' to have your results written to a file in your current working directory + output_dir = NULL, # leave this to 'auto' to have your results written to a directory in your current working directory system_prompt = sentiment_system_prompt, concurrent_requests = 2, chunk_size = 5 @@ -99,7 +99,7 @@ oai_complete_df( ✔ Batch 1: 5 successful, 0 failed ✔ Completed processing: 5 successful, 0 failed # A tibble: 5 × 5 - id content .error .error_msg .batch + id content .error .error_msg .chunk 1 1 "The sentiment of the text is highly positive." FALSE NA 1 @@ -131,9 +131,9 @@ structured_df <- oai_complete_df( text_var = text, id_var = id, schema = sentiment_schema, - output_file = NULL, + output_dir = NULL, system_prompt = sentiment_system_prompt, - concurrent_requests = 2, + concurrent_requests = 2, chunk_size = 5 ) @@ -146,7 +146,7 @@ structured_df <- oai_complete_df( ✔ Batch 1: 5 successful, 0 failed ✔ Completed processing: 5 successful, 0 failed # A tibble: 5 × 5 - id content .error .error_msg .batch + id content .error .error_msg .chunk 1 1 "{\"sentiment\":\"positive\"}" FALSE NA 1 @@ -169,7 +169,7 @@ structured_df |> ```{=html}
 # A tibble: 5 × 5
-     id sentiment .error .error_msg .batch
+     id sentiment .error .error_msg .chunk
                  
 1     1 positive  FALSE  NA              1
 2     2 negative  FALSE  NA              1
@@ -394,14 +394,14 @@ df_classical_texts <- tibble(
 )
 ```
 
-We have a function `oai_complete_df()` which takes a data frame, an id variable, and a text variable as mandatory inputs, and returns a data frame with columns: `id_var`, `text+var`, `.error_msg`, `.error`, `.batch`.
+We have a function `oai_complete_df()` which takes a data frame, an id variable, and a text variable as mandatory inputs, and returns a data frame with columns: `id_var`, `content`, `.error`, `.error_msg`, `.chunk`.
 
 ```{r}
 oai_complete_df(df_classical_texts, 
                 text_var = text, 
                 id_var = id,
                 concurrent_requests = 5,
-                output_file = NULL # set this to write to a temporary file, useful for documentation and testing.
+                output_dir = NULL # set this to write to a temporary directory, useful for documentation and testing.
                 )
 ```
 

From 4cd3e869b2987434a5e88acb21e350c74ccfacc3 Mon Sep 17 00:00:00 2001
From: jpcompartir 
Date: Thu, 4 Dec 2025 10:58:48 +0000
Subject: [PATCH 49/56] After refactoring it's clear we want to return .status
 alongside .error and .error_msg, for easier filtering. We can't simply fold
 the status into .error because R's single-typed vectors can't hold a numeric
 status alongside FALSE (when there is no error)

---
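A minimal sketch of the constraint this works around (plain R, nothing
package-specific): atomic vectors are single-typed, so a logical FALSE can't
sit alongside an integer status code without silent coercion -- hence .status
as its own integer column.

```r
x <- c(FALSE, 404L)   # logical + integer ...
class(x)              # ... coerces to "integer": FALSE becomes 0L
x
#> [1]   0 404

# so the result tibbles keep the types apart:
tibble::tibble(
  .error     = c(FALSE, TRUE),
  .error_msg = c(NA_character_, "Not Found"),
  .status    = c(NA_integer_, 404L)
)
```
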
 R/core.R               | 13 +++++++++----
 R/hf_classify.R        |  7 ++++++-
 R/hf_embed.R           |  6 ++++++
 R/openai_completions.R |  6 ++++++
 R/openai_embed.R       |  6 ++++++
 R/zzz.R                |  2 +-
 6 files changed, 34 insertions(+), 6 deletions(-)

diff --git a/R/core.R b/R/core.R
index f5c9130..a216123 100644
--- a/R/core.R
+++ b/R/core.R
@@ -172,7 +172,7 @@ process_response <- function(resp, indices, tidy_func) {
     if (status >= 400) {
       error_msg <- .extract_api_error(resp)
       cli::cli_warn("Request failed with status {status}: {error_msg}")
-      return(.create_error_tibble(indices, error_msg))
+      return(.create_error_tibble(indices, error_msg, status = status))
     }
 
     tryCatch({
@@ -180,6 +180,7 @@ process_response <- function(resp, indices, tidy_func) {
       result$original_index <- indices
       result$.error <- FALSE
       result$.error_msg <- NA_character_
+      result$.status <- NA_integer_
       return(result)
     }, error = function(e) {
       cli::cli_warn("Error processing response: {conditionMessage(e)}")
@@ -201,14 +202,17 @@ process_response <- function(resp, indices, tidy_func) {
 #'
 #' @param indices Vector of indices indicating original request positions
 #' @param error_msg Character string or condition object describing the error
+#' @param status HTTP status code (integer) or NA_integer_ for non-HTTP errors.
+#'   Defaults to NA_integer_.
 #'
 #' @return A tibble with columns:
 #'   - original_index: Position in original request batch
-#'   - .error: Always TRUE for error tibbles
+#'   - .error: TRUE for errors
 #'   - .error_msg: Character description of the error
+#'   - .status: HTTP status code (integer) or NA for non-HTTP errors
 #'
 #' @keywords internal
-.create_error_tibble <- function(indices, error_msg) {
+.create_error_tibble <- function(indices, error_msg, status = NA_integer_) {
   # for consistent outputs with safely function(s)
   if (!is.character(error_msg)) {
     if (inherits(error_msg, "condition")) {
@@ -221,7 +225,8 @@ process_response <- function(resp, indices, tidy_func) {
   return(tibble::tibble(
     original_index = indices,
     .error = TRUE,
-    .error_msg = error_msg
+    .error_msg = error_msg,
+    .status = status
   ))
 }
 
diff --git a/R/hf_classify.R b/R/hf_classify.R
index efc7303..28a9ba1 100644
--- a/R/hf_classify.R
+++ b/R/hf_classify.R
@@ -551,6 +551,7 @@ hf_classify_chunks <- function(texts,
         !!text_col_name := successes_texts,
         .error = FALSE,
         .error_msg = NA_character_,
+        .status = NA_integer_,
         .chunk = chunk_num
       ) |>
         dplyr::bind_cols(successes_content)
@@ -562,13 +563,17 @@ hf_classify_chunks <- function(texts,
       failures_ids <- purrr::map(chunk_failures, \(x) purrr::pluck(x, "request", "headers", "endpointr_id")) |>  unlist()
       failures_texts <- purrr::map_chr(chunk_failures, \(x) purrr::pluck(x, "request", "body", "data", "inputs")) |> unlist()
       failures_msgs <- purrr::map_chr(chunk_failures, \(x) purrr::pluck(x, "message", .default = "Unknown error"))
-
+      failures_status <- purrr::map_int(chunk_failures, \(x) {
+        resp <- purrr::pluck(x, "resp")
+        if (!is.null(resp)) httr2::resp_status(resp) else NA_integer_
+      })
 
       chunk_results$failures <- tibble::tibble(
         !!id_col_name := failures_ids,
         !!text_col_name := failures_texts,
         .error = TRUE,
         .error_msg = failures_msgs,
+        .status = failures_status,
         .chunk = chunk_num
       )
     }
diff --git a/R/hf_embed.R b/R/hf_embed.R
index a199b46..3d849b7 100644
--- a/R/hf_embed.R
+++ b/R/hf_embed.R
@@ -379,6 +379,7 @@ hf_embed_chunks <- function(texts,
         !!id_col_name := successes_ids,
         .error = FALSE,
         .error_msg = NA_character_,
+        .status = NA_integer_,
         .chunk = chunk_num
       ) |>
         dplyr::bind_cols(successes_content)
@@ -387,11 +388,16 @@ hf_embed_chunks <- function(texts,
     if (n_failures > 0) {
       failures_ids <- purrr::map(failures, \(x) purrr::pluck(x, "request", "headers", "endpointr_id")) |>  unlist()
       failures_msgs <- purrr::map_chr(failures, \(x) purrr::pluck(x, "message", .default = "Unknown error"))
+      failures_status <- purrr::map_int(failures, \(x) {
+        resp <- purrr::pluck(x, "resp")
+        if (!is.null(resp)) httr2::resp_status(resp) else NA_integer_
+      })
 
       chunk_results$failures <- tibble::tibble(
         !!id_col_name := failures_ids,
         .error = TRUE,
         .error_msg = failures_msgs,
+        .status = failures_status,
         .chunk = chunk_num
       )
     }
diff --git a/R/openai_completions.R b/R/openai_completions.R
index 8e318a6..e1e01f7 100644
--- a/R/openai_completions.R
+++ b/R/openai_completions.R
@@ -524,6 +524,7 @@ oai_complete_chunks <- function(texts,
         content = successes_content,
         .error = FALSE,
         .error_msg = NA_character_,
+        .status = NA_integer_,
         .chunk = chunk_num
       )
     }
@@ -531,12 +532,17 @@ oai_complete_chunks <- function(texts,
     if (length(failures) > 0) {
       failures_ids <- purrr::map(failures, ~purrr::pluck(.x, "request", "headers", "endpointr_id")) |> unlist()
       failures_msgs <- purrr::map_chr(failures, ~purrr::pluck(.x, "message", .default = "Unknown error"))
+      failures_status <- purrr::map_int(failures, ~{
+        resp <- purrr::pluck(.x, "resp")
+        if (!is.null(resp)) httr2::resp_status(resp) else NA_integer_
+      })
 
       chunk_results$failures <- tibble::tibble(
         !!id_col_name := failures_ids,
         content = NA_character_,
         .error = TRUE,
         .error_msg = failures_msgs,
+        .status = failures_status,
         .chunk = chunk_num
       )
     }
diff --git a/R/openai_embed.R b/R/openai_embed.R
index 9acdcd4..723daf8 100644
--- a/R/openai_embed.R
+++ b/R/openai_embed.R
@@ -684,6 +684,7 @@ oai_embed_chunks <- function(texts,
         !!id_col_name := successes_ids,
         .error = FALSE,
         .error_msg = NA_character_,
+        .status = NA_integer_,
         .chunk = chunk_num
       ) |>
         dplyr::bind_cols(successes_content)
@@ -692,11 +693,16 @@ oai_embed_chunks <- function(texts,
     if (n_failures > 0) {
       failures_ids <- purrr::map(failures, ~purrr::pluck(.x, "request", "headers", "endpointr_id")) |> unlist()
       failures_msgs <- purrr::map_chr(failures, ~purrr::pluck(.x, "message", .default = "Unknown error"))
+      failures_status <- purrr::map_int(failures, ~{
+        resp <- purrr::pluck(.x, "resp")
+        if (!is.null(resp)) httr2::resp_status(resp) else NA_integer_
+      })
 
       chunk_results$failures <- tibble::tibble(
         !!id_col_name := failures_ids,
         .error = TRUE,
         .error_msg = failures_msgs,
+        .status = failures_status,
         .chunk = chunk_num
       )
     }
diff --git a/R/zzz.R b/R/zzz.R
index db1fa09..a925ff3 100644
--- a/R/zzz.R
+++ b/R/zzz.R
@@ -1,5 +1,5 @@
 utils::globalVariables(c(".embeddings", ".request", ".response", ".row_num", ".data", ".error",
-                         ".error_msg", "original_index", "text", ":=", ".row_id", "id", "label", "score", "verbose"))
+                         ".error_msg", ".status", "original_index", "text", ":=", ".row_id", "id", "label", "score", "verbose"))
 
 .onLoad <- function(...) {
   S7::methods_register()

From 8bef2c60acb21faf699635ae5586b7af10e0dab8 Mon Sep 17 00:00:00 2001
From: jpcompartir 
Date: Thu, 4 Dec 2025 10:59:04 +0000
Subject: [PATCH 50/56] Continue refactor by updating tests for the .status
 expectation

---
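In outline, the updated expectation (testthat; `result` here stands in for the
tibble returned by the function under test):

```r
# result tibbles now carry .status next to .error / .error_msg
expect_setequal(
  names(result),
  c("id", "content", ".error", ".error_msg", ".status", ".chunk")
)
```
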
 tests/testthat/test-core.R               | 6 +++---
 tests/testthat/test-hf_classify.R        | 2 +-
 tests/testthat/test-hf_embed.R           | 2 +-
 tests/testthat/test-openai_completions.R | 2 +-
 4 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/tests/testthat/test-core.R b/tests/testthat/test-core.R
index 8006828..d3f7a85 100644
--- a/tests/testthat/test-core.R
+++ b/tests/testthat/test-core.R
@@ -131,7 +131,7 @@ test_that("process_response handles batches of inputs when passed the correct ti
   single_batch <- expect_no_error(process_response(resp = mock_batch_response,
                    indices = 1:3,
                    tidy_func = tidy_batch_classification_response))
-  expect_setequal(names(single_batch), c("positive", "negative", "neutral", "original_index", ".error", ".error_msg"))
+  expect_setequal(names(single_batch), c("positive", "negative", "neutral", "original_index", ".error", ".error_msg", ".status"))
   expect_equal(nrow(single_batch), 3)
 
   # multi-batches
@@ -160,7 +160,7 @@ test_that("process_response handles batches of inputs when passed the correct ti
     )
 
     expect_equal(nrow(processed_batch_results), 9)
-    expect_equal(ncol(processed_batch_results), 6)
+    expect_equal(ncol(processed_batch_results), 7)
 
 })
 
@@ -405,5 +405,5 @@ test_that(".create_error_tibble produces correct structure with .error_msg colum
   expect_true(all(error_tib$.error))
   expect_true(all(error_tib$.error_msg == "Something went wrong"))
   expect_equal(error_tib$original_index, c(1, 2, 3))
-  expect_setequal(names(error_tib), c("original_index", ".error", ".error_msg"))
+  expect_setequal(names(error_tib), c("original_index", ".error", ".error_msg", ".status"))
 })
diff --git a/tests/testthat/test-hf_classify.R b/tests/testthat/test-hf_classify.R
index dd9b221..0b66729 100644
--- a/tests/testthat/test-hf_classify.R
+++ b/tests/testthat/test-hf_classify.R
@@ -152,7 +152,7 @@ test_that("hf_classify_chunks processes chunks correctly", {
   texts <- paste0("text", 1:6)
   ids <- paste0("id", 1:length(texts))
   temp_dir <- withr::local_tempdir()
-  expected_cols <- c("id", "text", ".error", ".error_msg", ".chunk", "positive", "negative", "neutral")
+  expected_cols <- c("id", "text", ".error", ".error_msg", ".status", ".chunk", "positive", "negative", "neutral")
 
   # Test with chunk_size = 2
   chunk_2 <- expect_no_error(hf_classify_chunks(
diff --git a/tests/testthat/test-hf_embed.R b/tests/testthat/test-hf_embed.R
index 50c6a7c..e8e6399 100644
--- a/tests/testthat/test-hf_embed.R
+++ b/tests/testthat/test-hf_embed.R
@@ -88,7 +88,7 @@ test_that("hf_embed_chunks replaces hf_embed_batch", {
   texts <- paste0("text", 1:6)
   ids <- paste0('id', 1:length(texts))
   temp_dir <- withr::local_tempdir()
-  expected_cols <- c("id", ".error", ".error_msg", ".chunk", "V1", "V2", "V3")
+  expected_cols <- c("id", ".error", ".error_msg", ".status", ".chunk", "V1", "V2", "V3")
 
 
   chunk_2 <- expect_no_error(hf_embed_chunks(
diff --git a/tests/testthat/test-openai_completions.R b/tests/testthat/test-openai_completions.R
index bb29a55..97ad2d5 100644
--- a/tests/testthat/test-openai_completions.R
+++ b/tests/testthat/test-openai_completions.R
@@ -208,7 +208,7 @@ test_that("oai_complete_df takes single row, multi-row data frames as inputs", {
     )
 
   expect_setequal(names(successful_response),
-                  c("id", "content", ".error", ".error_msg", ".chunk"))
+                  c("id", "content", ".error", ".error_msg", ".status", ".chunk"))
   expect_setequal(unique(successful_response$content), "positive")
 
   withr::with_envvar(

From 09e08b70f9dea8d83e9741290f374e0dea0c35a1 Mon Sep 17 00:00:00 2001
From: jpcompartir 
Date: Thu, 4 Dec 2025 11:14:52 +0000
Subject: [PATCH 51/56] update .create_error_tibble.Rd

---
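For orientation, the documented return shape (a sketch; .create_error_tibble()
is an internal helper, so this assumes access via `:::`):

```r
EndpointR:::.create_error_tibble(
  indices   = 1:2,
  error_msg = "Request timed out",
  status    = 408L
)
#> # A tibble: 2 × 4
#>   original_index .error .error_msg        .status
#>            <int> <lgl>  <chr>               <int>
#> 1              1 TRUE   Request timed out      408
#> 2              2 TRUE   Request timed out      408
```
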
 man/dot-create_error_tibble.Rd | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

diff --git a/man/dot-create_error_tibble.Rd b/man/dot-create_error_tibble.Rd
index 1ef9608..1b55fcb 100644
--- a/man/dot-create_error_tibble.Rd
+++ b/man/dot-create_error_tibble.Rd
@@ -4,19 +4,23 @@
 \alias{.create_error_tibble}
 \title{Create standardised error tibble for failed requests}
 \usage{
-.create_error_tibble(indices, error_msg)
+.create_error_tibble(indices, error_msg, status = NA_integer_)
 }
 \arguments{
 \item{indices}{Vector of indices indicating original request positions}
 
 \item{error_msg}{Character string or condition object describing the error}
+
+\item{status}{HTTP status code (integer) or NA_integer_ for non-HTTP errors.
+Defaults to NA_integer_.}
 }
 \value{
 A tibble with columns:
 \itemize{
 \item original_index: Position in original request batch
-\item .error: Always TRUE for error tibbles
+\item .error: TRUE for errors
 \item .error_msg: Character description of the error
+\item .status: HTTP status code (integer) or NA for non-HTTP errors
 }
 }
 \description{

From 7cdc9f0af2c03409a8065913039125eeaa16fcb2 Mon Sep 17 00:00:00 2001
From: jpcompartir 
Date: Thu, 4 Dec 2025 11:17:45 +0000
Subject: [PATCH 52/56] Update news ahead of merge in feature branch and
 eventual version push

---
 NEWS.md | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/NEWS.md b/NEWS.md
index 24b4428..74dad18 100644
--- a/NEWS.md
+++ b/NEWS.md
@@ -1,5 +1,10 @@
 # EndpointR 0.2
 
+-   improved error message and status propagation: results now include .error, .error_msg (standardised across the package), and .status. The main change stops httr2 from swallowing errors before we can deal with them
+-   adds parquet writing to oai_complete_df and oai_embed_df
+-   adds a chunks function (oai_embed_chunks) for OpenAI embeddings, and rewrites all batch -\> chunk logic
+-   implements the Anthropic messages API with structured outputs (via BETA)
+
 # EndpointR 0.1.2
 
 -   **File writing improvements**: `hf_embed_df()` and `hf_classify_df()` now write intermediate results as `.parquet` files to `output_dir` directories, similar to improvements in 0.1.1 for OpenAI functions
@@ -40,4 +45,3 @@ Initial BETA release, ships with:
 -   Support for text completion using OpenAI models via the Chat Completions API
 -   Support for embeddings with the OpenAI Embeddings API
 -   Structured outputs via JSON schemas and validators
-

From c9e1db38c5f27496db2503cbac2f0b2633db06c3 Mon Sep 17 00:00:00 2001
From: jpcompartir 
Date: Thu, 4 Dec 2025 11:21:14 +0000
Subject: [PATCH 53/56] prevent httr2 from capturing error messages, as we've
 done in the refactor across oai_* funcs and hf_* funcs with the changes to
 basee_request (which ant doesn't use due to auth differences)

---
 R/anthropic_messages.R | 1 +
 1 file changed, 1 insertion(+)

diff --git a/R/anthropic_messages.R b/R/anthropic_messages.R
index d39d628..e828609 100644
--- a/R/anthropic_messages.R
+++ b/R/anthropic_messages.R
@@ -134,6 +134,7 @@ ant_build_messages_request <- function(
       "x-api-key" = api_key,
       "anthropic-version" = .ANT_API_VERSION
     ) |>
+    httr2::req_error(is_error = ~ FALSE) |> # don't let httr2 auto-throw errors; we handle them ourselves
     httr2::req_timeout(timeout) |>
     httr2::req_retry(
       max_tries = max_retries,

From 446d154c1958a132ad75bbda461cf83fdda0c65d Mon Sep 17 00:00:00 2001
From: jpcompartir 
Date: Thu, 4 Dec 2025 11:48:08 +0000
Subject: [PATCH 54/56] move funcs to .extract_api_error instead of manual
 plucking

---
 R/hf_classify.R        |  5 ++++-
 R/hf_embed.R           |  5 ++++-
 R/openai_completions.R | 15 ++++++---------
 R/openai_embed.R       |  5 ++++-
 4 files changed, 18 insertions(+), 12 deletions(-)

diff --git a/R/hf_classify.R b/R/hf_classify.R
index 28a9ba1..c95a2fc 100644
--- a/R/hf_classify.R
+++ b/R/hf_classify.R
@@ -562,7 +562,10 @@ hf_classify_chunks <- function(texts,
 
       failures_ids <- purrr::map(chunk_failures, \(x) purrr::pluck(x, "request", "headers", "endpointr_id")) |>  unlist()
       failures_texts <- purrr::map_chr(chunk_failures, \(x) purrr::pluck(x, "request", "body", "data", "inputs")) |> unlist()
-      failures_msgs <- purrr::map_chr(chunk_failures, \(x) purrr::pluck(x, "message", .default = "Unknown error"))
+      failures_msgs <- purrr::map_chr(chunk_failures, \(x) {
+        resp <- purrr::pluck(x, "resp")
+        if (!is.null(resp)) .extract_api_error(resp) else .extract_api_error(x, "Unknown error")
+      })
       failures_status <- purrr::map_int(chunk_failures, \(x) {
         resp <- purrr::pluck(x, "resp")
         if (!is.null(resp)) httr2::resp_status(resp) else NA_integer_
diff --git a/R/hf_embed.R b/R/hf_embed.R
index 3d849b7..bc109b5 100644
--- a/R/hf_embed.R
+++ b/R/hf_embed.R
@@ -387,7 +387,10 @@ hf_embed_chunks <- function(texts,
 
     if (n_failures > 0) {
       failures_ids <- purrr::map(failures, \(x) purrr::pluck(x, "request", "headers", "endpointr_id")) |>  unlist()
-      failures_msgs <- purrr::map_chr(failures, \(x) purrr::pluck(x, "message", .default = "Unknown error"))
+      failures_msgs <- purrr::map_chr(failures, \(x) {
+        resp <- purrr::pluck(x, "resp")
+        if (!is.null(resp)) .extract_api_error(resp) else .extract_api_error(x, "Unknown error")
+      })
       failures_status <- purrr::map_int(failures, \(x) {
         resp <- purrr::pluck(x, "resp")
         if (!is.null(resp)) httr2::resp_status(resp) else NA_integer_
diff --git a/R/openai_completions.R b/R/openai_completions.R
index 5d6d234..ec73c21 100644
--- a/R/openai_completions.R
+++ b/R/openai_completions.R
@@ -275,10 +275,7 @@ oai_complete_text <- function(text,
 
   # basic request sending with non-comprehensive error handling
   if (httr2::resp_status(response) != 200) {
-    error_msg <- tryCatch(
-      httr2::resp_body_json(response)$error$message,
-      error = function(e) paste("HTTP", httr2::resp_status(response))
-    )
+    error_msg <- .extract_api_error(response)
     cli::cli_abort(c(
       "API request failed",
       "x" = error_msg
@@ -531,7 +528,10 @@ oai_complete_chunks <- function(texts,
 
     if (length(failures) > 0) {
       failures_ids <- purrr::map(failures, ~purrr::pluck(.x, "request", "headers", "endpointr_id")) |> unlist()
-      failures_msgs <- purrr::map_chr(failures, ~purrr::pluck(.x, "message", .default = "Unknown error"))
+      failures_msgs <- purrr::map_chr(failures, ~{
+        resp <- purrr::pluck(.x, "resp")
+        if (!is.null(resp)) .extract_api_error(resp) else .extract_api_error(.x, "Unknown error")
+      })
       failures_status <- purrr::map_int(failures, ~{
         resp <- purrr::pluck(.x, "resp")
         if (!is.null(resp)) httr2::resp_status(resp) else NA_integer_
@@ -742,10 +742,7 @@ oai_complete_df <- function(df,
       .error_msg = NA_character_
     ))
   } else {
-    .error_msg <- tryCatch(
-      httr2::resp_body_json(response)$error$message,
-      error = function(e) paste("HTTP", status)
-    )
+    .error_msg <- .extract_api_error(response)
 
     return(list(
       status = status,
diff --git a/R/openai_embed.R b/R/openai_embed.R
index 723daf8..8c2b611 100644
--- a/R/openai_embed.R
+++ b/R/openai_embed.R
@@ -692,7 +692,10 @@ oai_embed_chunks <- function(texts,
 
     if (n_failures > 0) {
       failures_ids <- purrr::map(failures, ~purrr::pluck(.x, "request", "headers", "endpointr_id")) |> unlist()
-      failures_msgs <- purrr::map_chr(failures, ~purrr::pluck(.x, "message", .default = "Unknown error"))
+      failures_msgs <- purrr::map_chr(failures, ~{
+        resp <- purrr::pluck(.x, "resp")
+        if (!is.null(resp)) .extract_api_error(resp) else .extract_api_error(.x, "Unknown error")
+      })
       failures_status <- purrr::map_int(failures, ~{
         resp <- purrr::pluck(.x, "resp")
         if (!is.null(resp)) httr2::resp_status(resp) else NA_integer_

From 0e32d87af14fd3336ca6d226731cb2f400d35d59 Mon Sep 17 00:00:00 2001
From: jpcompartir 
Date: Thu, 4 Dec 2025 12:44:13 +0000
Subject: [PATCH 55/56] Bring together some integration tests for when making
 big changes

---
 dev_docs/integrations.qmd | 172 ++++++++++++++++++++++++++++++++++++++
 todos.qmd                 |   3 +-
 2 files changed, 174 insertions(+), 1 deletion(-)
 create mode 100644 dev_docs/integrations.qmd

diff --git a/dev_docs/integrations.qmd b/dev_docs/integrations.qmd
new file mode 100644
index 0000000..d4bc839
--- /dev/null
+++ b/dev_docs/integrations.qmd
@@ -0,0 +1,172 @@
+---
+title: "integrations"
+format: html
+---
+
+```{r, setup}
+library(tidyverse)
+library(httr2)
+library(EndpointR)
+
+n_tests <- 5
+test_df <- tibble(
+    id = 1:n_tests,
+    text = paste("This is test sentence number", 1:n_tests)
+  )
+```
+
+Space for integration tests (useful for exercising the real APIs, rather than relying only on mocked unit tests)
+
+# oai embed
+
+```{r, invalid_model_oai_embed}
+oai_embed_invalid_model <- oai_embed_df(
+    test_df,
+    text_var = "text",
+    id_var = "id",
+    model = "text-embedding-FAKE-model",
+    output_dir = NULL,
+    concurrent_requests = n_tests
+  )
+
+oai_embed_invalid_model |> dplyr::select(id, .error, .error_msg, .status)
+# expect: .error = TRUE, .status = 404, .error_msg contains model info
+```
+
+invalid API key returns 401 authentication error
+
+```{r, oai_embed_401_auth}
+oai_embed_bad_auth <- oai_embed_df(
+  test_df,
+  text_var = "text",
+  id_var = "id",
+  model = "text-embedding-3-small",
+  key_name = "FAKE_API_KEY",
+  output_dir = NULL,
+  concurrent_requests = n_tests
+)
+
+oai_embed_bad_auth |> dplyr::select(id, .error, .error_msg, .status)
+# expect: .error = TRUE, .status = 401, .error_msg mentions authentication
+```
+
+now a call that should succeed:
+
+```{r oai_embed_success}
+oai_embed_success <- oai_embed_df(
+  test_df, 
+  text_var = text, 
+  id_var = id, 
+  model = "text-embedding-3-small",
+  output_dir = NULL,
+  concurrent_requests = n_tests)
+
+oai_embed_success
+```
+
+# oai completions
+
+invalid API key for completions
+
+```{r}
+oai_complete_bad_auth <- oai_complete_df(
+  test_df,
+  text_var = "text",
+  id_var = "id",
+  model = "gpt-4o-mini",
+  system_prompt = "Summarize in one word.",
+  key_name = "FAKE_API_KEY",
+  output_dir = NULL,
+  concurrent_requests = n_tests
+)
+
+oai_complete_bad_auth |> dplyr::select(id, .error, .error_msg, .status)
+
+# expect: .error = TRUE, .status = 401
+```
+
+```{r}
+oai_complete_good_auth <- oai_complete_df(
+  test_df,
+  text_var = "text",
+  id_var = "id",
+  model = "gpt-4o-mini",
+  system_prompt = "Summarize in one word.",
+  key_name = "OPENAI_API_KEY",
+  output_dir = NULL,
+  concurrent_requests = n_tests
+)
+
+oai_complete_good_auth
+```
+
+# hf embed
+
+non-existent HuggingFace model
+
+```{r}
+ hf_embed_invalid_model <- hf_embed_df(
+   test_df,
+   text_var = "text",
+   id_var = "id",
+   endpoint_url = "https://api-inference.huggingface.co/pipeline/feature-extraction/FAKE-model-404",
+   key_name = "HF_TEST_API_KEY",
+   output_dir = NULL,
+   concurrent_requests = n_tests
+  )
+
+hf_embed_invalid_model |> dplyr::select(id, .error, .error_msg, .status)
+
+# expect: .error = TRUE, .status = 410, .error_msg mentions https://api-inference.huggingface.co is no longer...
+```
+
+invalid HuggingFace token
+
+```{r, invalid_key_hf_embed}
+hf_embed_bad_auth <- hf_embed_df(
+  test_df, 
+  text_var = "text",
+  id_var = "id", 
+  endpoint_url = "https://router.huggingface.co/hf-inference/models/sentence-transformers/all-MiniLM-L6-v2/pipeline/sentence-similarity", 
+  key_name = "FAKE_API_KEY",
+  output_dir = NULL,
+  concurrent_requests = n_tests)
+
+hf_embed_bad_auth |> dplyr::select(id, .error, .error_msg, .status) 
+
+# expect: .error = TRUE, .status = 401
+```
+
+Non-existent classification model
+
+```{r, invalid_model_hf_classify}
+hf_classify_invalid_model <- hf_classify_df(
+  test_df, 
+  text_var = "text", 
+  id_var = "id", 
+  endpoint_url = "https://api-inference.huggingface.co/pipeline/feature-extraction/FAKE-model-404",
+  output_dir = NULL,
+  key_name = "HF_TEST_API_KEY",
+  concurrent_requests = n_tests
+  )
+
+hf_classify_invalid_model |> dplyr::select(id, .error, .error_msg, .status) 
+# expect: .error = TRUE, .status = 410
+```
+
+Using an embedding model for classification (wrong task type)
+
+```{r, bad_task_hf_classify}
+hf_classify_wrong_task <- hf_classify_df(
+  test_df, 
+  text_var = text, 
+  id_var = id, 
+  endpoint_url = "https://router.huggingface.co/hf-inference/models/sentence-transformers/all-MiniLM-L6-v2/pipeline/sentence-similarity",
+  key_name = "HF_TEST_API_KEY",
+  output_dir = NULL,
+  concurrent_requests = n_tests)
+
+hf_classify_wrong_task |> dplyr::select(id, .error, .error_msg, .status) 
+
+# expect: .error = TRUE, error message should indicate task mismatch, .status = 400
+```
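+
+To collect these expectations in one place, a hypothetical helper (not part of the package; it assumes each result tibble keeps the `.error` and `.status` columns used above):
+
+```{r expectation_helper}
+expect_failed_with <- function(result, status) {
+  all(result$.error) && all(result$.status == status, na.rm = TRUE)
+}
+
+expect_failed_with(oai_embed_bad_auth, 401)
+expect_failed_with(hf_classify_wrong_task, 400)
+```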
diff --git a/todos.qmd b/todos.qmd
index b3696ed..fe7293d 100644
--- a/todos.qmd
+++ b/todos.qmd
@@ -13,7 +13,8 @@
     -   [ ] Completions
     -   [ ] Structured Outputs
 -   [ ] LLM Providers Vignette Updated
--   [ ] Structured Outpits Vignette Updated
+-   [ ] Structured Outputs Vignette Updated
+-   [ ] Better error propagation throughout package (refactor, large)
 
 Error reporting is somewhat annoying by default with httr2::req_perform() if we don't:
 

From 21d9d83b14f973e8678ab76bdceab61649bda19d Mon Sep 17 00:00:00 2001
From: jpcompartir 
Date: Thu, 4 Dec 2025 12:44:52 +0000
Subject: [PATCH 56/56] switch how we identify successes/failures - stop using
 the httr2 funcs, which only check whether the httr2 request itself succeeded
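
In sketch form, the partition logic this patch adds to all four `*_chunks()`
functions (assuming `responses` is a list of httr2 responses mixed with error
conditions, i.e. from a parallel perform with errors captured rather than thrown):

```r
# keep real httr2 responses apart from error objects (network failures, etc.)
is_response <- purrr::map_lgl(responses, inherits, "httr2_response")
response_objects <- responses[is_response]
error_objects <- responses[!is_response]

# a response can still carry an HTTP error status, so split on status code
is_success <- purrr::map_lgl(response_objects, \(x) httr2::resp_status(x) < 400)
successes <- response_objects[is_success]

# HTTP failures plus network/other errors together form the failure set
failures <- c(response_objects[!is_success], error_objects)
```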

---
 R/hf_classify.R        | 31 +++++++++++++++++++++++++------
 R/hf_embed.R           | 31 +++++++++++++++++++++++++------
 R/openai_completions.R | 31 +++++++++++++++++++++++++------
 R/openai_embed.R       | 31 +++++++++++++++++++++++++------
 4 files changed, 100 insertions(+), 24 deletions(-)

diff --git a/R/hf_classify.R b/R/hf_classify.R
index c95a2fc..d9cd98c 100644
--- a/R/hf_classify.R
+++ b/R/hf_classify.R
@@ -527,8 +527,18 @@ hf_classify_chunks <- function(texts,
       progress = TRUE
     )
 
-    chunk_successes <- httr2::resps_successes(responses)
-    chunk_failures <- httr2::resps_failures(responses)
+    # separate actual responses from error objects (network failures, etc.)
+    is_response <- purrr::map_lgl(responses, inherits, "httr2_response")
+    response_objects <- responses[is_response]
+    error_objects <- responses[!is_response]
+
+    # split responses by HTTP status code (not just by type)
+    is_success <- purrr::map_lgl(response_objects, ~httr2::resp_status(.x) < 400)
+    chunk_successes <- response_objects[is_success]
+    http_failures <- response_objects[!is_success]
+
+    # combine HTTP failures with network/other errors
+    chunk_failures <- c(http_failures, error_objects)
 
     n_chunk_successes <- length(chunk_successes)
     n_chunk_failures <- length(chunk_failures)
@@ -563,12 +573,21 @@ hf_classify_chunks <- function(texts,
       failures_ids <- purrr::map(chunk_failures, \(x) purrr::pluck(x, "request", "headers", "endpointr_id")) |>  unlist()
       failures_texts <- purrr::map_chr(chunk_failures, \(x) purrr::pluck(x, "request", "body", "data", "inputs")) |> unlist()
       failures_msgs <- purrr::map_chr(chunk_failures, \(x) {
-        resp <- purrr::pluck(x, "resp")
-        if (!is.null(resp)) .extract_api_error(resp) else .extract_api_error(x, "Unknown error")
+        if (inherits(x, "httr2_response")) {
+          .extract_api_error(x)
+        } else {
+          # error object - try to get resp from it
+          resp <- purrr::pluck(x, "resp")
+          if (!is.null(resp)) .extract_api_error(resp) else .extract_api_error(x, "Unknown error")
+        }
       })
       failures_status <- purrr::map_int(chunk_failures, \(x) {
-        resp <- purrr::pluck(x, "resp")
-        if (!is.null(resp)) httr2::resp_status(resp) else NA_integer_
+        if (inherits(x, "httr2_response")) {
+          httr2::resp_status(x)
+        } else {
+          resp <- purrr::pluck(x, "resp")
+          if (!is.null(resp)) httr2::resp_status(resp) else NA_integer_
+        }
       })
 
       chunk_results$failures <- tibble::tibble(
diff --git a/R/hf_embed.R b/R/hf_embed.R
index bc109b5..e7001d3 100644
--- a/R/hf_embed.R
+++ b/R/hf_embed.R
@@ -359,8 +359,18 @@ hf_embed_chunks <- function(texts,
       progress = TRUE
     )
 
-    successes <- httr2::resps_successes(responses)
-    failures <- httr2::resps_failures(responses)
+    # separate actual responses from error objects (network failures, etc.)
+    is_response <- purrr::map_lgl(responses, inherits, "httr2_response")
+    response_objects <- responses[is_response]
+    error_objects <- responses[!is_response]
+
+    # split responses by HTTP status code (not just by type)
+    is_success <- purrr::map_lgl(response_objects, ~httr2::resp_status(.x) < 400)
+    successes <- response_objects[is_success]
+    http_failures <- response_objects[!is_success]
+
+    # combine HTTP failures with network/other errors
+    failures <- c(http_failures, error_objects)
 
     n_successes <- length(successes)
     n_failures <- length(failures)
@@ -388,12 +398,21 @@ hf_embed_chunks <- function(texts,
     if (n_failures > 0) {
       failures_ids <- purrr::map(failures, \(x) purrr::pluck(x, "request", "headers", "endpointr_id")) |>  unlist()
       failures_msgs <- purrr::map_chr(failures, \(x) {
-        resp <- purrr::pluck(x, "resp")
-        if (!is.null(resp)) .extract_api_error(resp) else .extract_api_error(x, "Unknown error")
+        if (inherits(x, "httr2_response")) {
+          .extract_api_error(x)
+        } else {
+          # error object - try to get resp from it
+          resp <- purrr::pluck(x, "resp")
+          if (!is.null(resp)) .extract_api_error(resp) else .extract_api_error(x, "Unknown error")
+        }
       })
       failures_status <- purrr::map_int(failures, \(x) {
-        resp <- purrr::pluck(x, "resp")
-        if (!is.null(resp)) httr2::resp_status(resp) else NA_integer_
+        if (inherits(x, "httr2_response")) {
+          httr2::resp_status(x)
+        } else {
+          resp <- purrr::pluck(x, "resp")
+          if (!is.null(resp)) httr2::resp_status(resp) else NA_integer_
+        }
       })
 
       chunk_results$failures <- tibble::tibble(
diff --git a/R/openai_completions.R b/R/openai_completions.R
index ec73c21..34da373 100644
--- a/R/openai_completions.R
+++ b/R/openai_completions.R
@@ -500,8 +500,18 @@ oai_complete_chunks <- function(texts,
       progress = TRUE
     )
 
-    successes <- httr2::resps_successes(responses)
-    failures <- httr2::resps_failures(responses)
+    # separate actual responses from error objects (network failures, etc.)
+    is_response <- purrr::map_lgl(responses, inherits, "httr2_response")
+    response_objects <- responses[is_response]
+    error_objects <- responses[!is_response]
+
+    # split responses by HTTP status code (not just by type)
+    is_success <- purrr::map_lgl(response_objects, ~httr2::resp_status(.x) < 400)
+    successes <- response_objects[is_success]
+    http_failures <- response_objects[!is_success]
+
+    # combine HTTP failures with network/other errors
+    failures <- c(http_failures, error_objects)
 
     n_successes <- length(successes)
     n_failures <- length(failures)
@@ -529,12 +539,21 @@ oai_complete_chunks <- function(texts,
     if (length(failures) > 0) {
       failures_ids <- purrr::map(failures, ~purrr::pluck(.x, "request", "headers", "endpointr_id")) |> unlist()
       failures_msgs <- purrr::map_chr(failures, ~{
-        resp <- purrr::pluck(.x, "resp")
-        if (!is.null(resp)) .extract_api_error(resp) else .extract_api_error(.x, "Unknown error")
+        if (inherits(.x, "httr2_response")) {
+          .extract_api_error(.x)
+        } else {
+          # Error object - try to get resp from it
+          resp <- purrr::pluck(.x, "resp")
+          if (!is.null(resp)) .extract_api_error(resp) else .extract_api_error(.x, "Unknown error")
+        }
       })
       failures_status <- purrr::map_int(failures, ~{
-        resp <- purrr::pluck(.x, "resp")
-        if (!is.null(resp)) httr2::resp_status(resp) else NA_integer_
+        if (inherits(.x, "httr2_response")) {
+          httr2::resp_status(.x)
+        } else {
+          resp <- purrr::pluck(.x, "resp")
+          if (!is.null(resp)) httr2::resp_status(resp) else NA_integer_
+        }
       })
 
       chunk_results$failures <- tibble::tibble(
diff --git a/R/openai_embed.R b/R/openai_embed.R
index 8c2b611..b88dcb9 100644
--- a/R/openai_embed.R
+++ b/R/openai_embed.R
@@ -663,8 +663,18 @@ oai_embed_chunks <- function(texts,
       progress = TRUE
     )
 
-    successes <- httr2::resps_successes(responses)
-    failures <- httr2::resps_failures(responses)
+    # separate actual responses from error objects (network failures, etc.)
+    is_response <- purrr::map_lgl(responses, inherits, "httr2_response")
+    response_objects <- responses[is_response]
+    error_objects <- responses[!is_response]
+
+    # split responses by HTTP status code (not just by type)
+    is_success <- purrr::map_lgl(response_objects, ~httr2::resp_status(.x) < 400)
+    successes <- response_objects[is_success]
+    http_failures <- response_objects[!is_success]
+
+    # combine HTTP failures with network/other errors
+    failures <- c(http_failures, error_objects)
 
     n_successes <- length(successes)
     n_failures <- length(failures)
@@ -693,12 +703,21 @@ oai_embed_chunks <- function(texts,
     if (n_failures > 0) {
       failures_ids <- purrr::map(failures, ~purrr::pluck(.x, "request", "headers", "endpointr_id")) |> unlist()
       failures_msgs <- purrr::map_chr(failures, ~{
-        resp <- purrr::pluck(.x, "resp")
-        if (!is.null(resp)) .extract_api_error(resp) else .extract_api_error(.x, "Unknown error")
+        if (inherits(.x, "httr2_response")) {
+          .extract_api_error(.x)
+        } else {
+          # Error object - try to get resp from it
+          resp <- purrr::pluck(.x, "resp")
+          if (!is.null(resp)) .extract_api_error(resp) else .extract_api_error(.x, "Unknown error")
+        }
       })
       failures_status <- purrr::map_int(failures, ~{
-        resp <- purrr::pluck(.x, "resp")
-        if (!is.null(resp)) httr2::resp_status(resp) else NA_integer_
+        if (inherits(.x, "httr2_response")) {
+          httr2::resp_status(.x)
+        } else {
+          resp <- purrr::pluck(.x, "resp")
+          if (!is.null(resp)) httr2::resp_status(resp) else NA_integer_
+        }
       })
 
       chunk_results$failures <- tibble::tibble(