diff --git a/_quarto.yml b/_quarto.yml
index d64441c..ee4bbe4 100644
--- a/_quarto.yml
+++ b/_quarto.yml
@@ -31,6 +31,7 @@ book:
     - chapters/06-your-first-agent.qmd
     - chapters/05-procedures-and-outputs.qmd
     - chapters/07-tools.qmd
+    - chapters/07a-models.qmd
     - chapters/08-state-management.qmd
     - chapters/09-the-agent-loop.qmd
     - chapters/10-human-in-the-loop.qmd
diff --git a/chapters/02-transparent-durability.qmd b/chapters/02-transparent-durability.qmd
index d0492cf..8c0c03f 100644
--- a/chapters/02-transparent-durability.qmd
+++ b/chapters/02-transparent-durability.qmd
@@ -45,7 +45,7 @@ Tactus automatically checkpoints these operations:
 | `Human.approve()` | The approval request and human's response |
 | `Human.input()` | The input request and human's response |
 | `Procedure.run()` | Sub-procedure inputs, outputs, and state |
-| `Model.predict()` | ML model inputs and outputs |
+| `Model("sentiment")({text = "..."})` | Model inputs and outputs |
 
 State changes (`state.*` assignment, `State.increment`) and log entries are also tracked, ensuring consistent replay.
 
diff --git a/chapters/07a-models.qmd b/chapters/07a-models.qmd
new file mode 100644
index 0000000..8390f15
--- /dev/null
+++ b/chapters/07a-models.qmd
@@ -0,0 +1,82 @@
+# Predictable Inference with Models
+
+Agents are great when you want a conversation: multi-turn reasoning, tool use, and flexible behavior.
+
+But sometimes you want something else:
+
+- A single, schema-defined input
+- A single, schema-defined output
+- No tool calls
+- A deterministic, testable control-flow story in *your* code
+
+That is what the **Model primitive** is for.
+
+## Model vs Agent (Quick Rule of Thumb)
+
+Use an **Agent** when you want a conversational loop (LLM turns, tools, planning).
+
+Use a **Model** when you want *inference* (classification, extraction, scoring) that plugs into your procedure like a function call.
+
+In practice, a great pattern is:
+
+1) Use a Model to make a fast, predictable decision (or score).
+2) Use an Agent only for the harder cases (low confidence, missing info, complex reasoning).
+
+## A Trainable (and Mockable) Model Example
+
+This example is deliberately simple: a sentiment classifier (IMDB) that drives a small piece of business logic:
+
+- confident positive => `decision = "yes"`
+- confident negative => `decision = "no"`
+- low confidence => `decision = "review"`
+
+The important part is the *testing story*:
+
+- In CI (and when learning), we run in **mock** mode for deterministic behavior.
+- When you want to see the real thing, you **train** the model and run against the **registry**.
+
+```lua {file="code/chapter-07a/10-model-naive-bayes-imdb.tac" show-path="true"}
+```
+
+## Run It (Mock Mode)
+
+Mock mode never calls external services. It uses the `Mocks { ... }` block in the same file.
+
+```bash
+tactus test code/chapter-07a/10-model-naive-bayes-imdb.tac --mock
+```
+
+You should see all three scenarios pass.
+
+## Train It (Real Model)
+
+Training requires extra dependencies (datasets + scikit-learn). Install:
+
+```bash
+pip install "tactus[ml]"
+```
+
+Then train the model declared in the file:
+
+```bash
+tactus train code/chapter-07a/10-model-naive-bayes-imdb.tac --model imdb_nb
+```
+
+Training writes a versioned artifact to the registry under the model's `name` (here: `imdb_nb`). Runtime reads from the registry when you call `Model("imdb_nb")`.
+
+After training, run the real (non-mocked) test:
+
+```bash
+tactus test code/chapter-07a/10-model-naive-bayes-imdb.tac
+```
+
+## Evaluate It (Optional)
+
+If you want metrics (accuracy/precision/recall/F1) on the test split declared in `Model.training.data`:
+
+```bash
+tactus models evaluate code/chapter-07a/10-model-naive-bayes-imdb.tac --model imdb_nb
+```
+
+This does not re-train; it evaluates a registry-backed model version against the declared test data.
+
diff --git a/code/chapter-07a/10-model-naive-bayes-imdb.tac b/code/chapter-07a/10-model-naive-bayes-imdb.tac
new file mode 100644
index 0000000..a0d274c
--- /dev/null
+++ b/code/chapter-07a/10-model-naive-bayes-imdb.tac
@@ -0,0 +1,122 @@
+-- Example: Train + run a registry-backed Naive Bayes sentiment classifier (IMDB).
+--
+-- Train a real model (requires tactus[ml]):
+--   tactus train code/chapter-07a/10-model-naive-bayes-imdb.tac --model imdb_nb
+--
+-- Test your procedure logic only (deterministic, CI-safe):
+--   tactus test code/chapter-07a/10-model-naive-bayes-imdb.tac --mock
+--
+-- After training, you can run the "real" test (loads from the registry):
+--   tactus test code/chapter-07a/10-model-naive-bayes-imdb.tac
+
+Model "imdb_nb" {
+    type = "registry",
+    name = "imdb_nb",
+    version = "latest",
+    input = { text = "string" },
+    output = { label = "string", confidence = "float" },
+
+    -- Training config lives alongside runtime config.
+    -- The runtime reads from the registry; training writes to the registry.
+    training = {
+        data = {
+            source = "hf",
+            name = "imdb",
+            train = "train",
+            test = "test",
+            shuffle = { train = true, test = true },
+            limit = { train = 5000, test = 5000 },
+            seed = 42,
+            text_field = "text",
+            label_field = "label"
+        },
+        candidates = {
+            {
+                name = "nb-tfidf",
+                trainer = "naive_bayes",
+                hyperparameters = {
+                    alpha = 1.0,
+                    max_features = 50000,
+                    ngram_min = 1,
+                    ngram_max = 2
+                }
+            }
+        }
+    }
+}
+
+Procedure {
+    input = {
+        text = field.string{required = true}
+    },
+    output = {
+        label = field.string{required = true},
+        confidence = field.number{required = false},
+        decision = field.string{required = true}
+    },
+    function(input)
+        -- In runtime, this fetches a trained artifact from the registry.
+        local classifier = Model("imdb_nb")
+        local result = classifier({text = input.text})
+        local output = result.output or result
+
+        -- Example "business logic" driven by the model's prediction.
+        -- High-confidence positives => yes
+        -- High-confidence negatives => no
+        -- Low-confidence anything => review
+        local decision = "review"
+        if output.confidence ~= nil and output.confidence >= 0.7 then
+            if output.label == "positive" then
+                decision = "yes"
+            else
+                decision = "no"
+            end
+        end
+
+        return {
+            label = output.label,
+            confidence = output.confidence,
+            decision = decision
+        }
+    end
+}
+
+-- Mocked model responses for deterministic specs.
+-- Run mocked: tactus test code/chapter-07a/10-model-naive-bayes-imdb.tac --mock
+Mocks {
+    imdb_nb = {
+        conditional = {
+            {when = {text = "A wonderful movie with great acting."}, returns = {label = "positive", confidence = 0.92}},
+            {when = {text = "This was a terrible movie with bad acting."}, returns = {label = "negative", confidence = 0.87}},
+            {when = {text = "A confusing movie with uneven pacing."}, returns = {label = "positive", confidence = 0.42}}
+        }
+    }
+}
+
+Specification([[
+Feature: Model primitive (mocked + trainable)
+  Scenario: Positive review routes to yes
+    Given the procedure has started
+    And the input text is "A wonderful movie with great acting."
+    When the procedure runs
+    Then the output decision should be "yes"
+    And the output label should be "positive"
+    And the procedure should complete successfully
+
+  Scenario: Negative review routes to no
+    Given the procedure has started
+    And the input text is "This was a terrible movie with bad acting."
+    When the procedure runs
+    Then the output decision should be "no"
+    And the output label should be "negative"
+    And the procedure should complete successfully
+
+  Scenario: Low confidence routes to review
+    Given the procedure has started
+    And the input text is "A confusing movie with uneven pacing."
+    When the procedure runs
+    Then the output decision should be "review"
+    And the output confidence should exist
+    And the procedure should complete successfully
+]])
+