From 40392f7b1d0441ef5440ecca181c86dfc7dc23c6 Mon Sep 17 00:00:00 2001 From: thomashebrard Date: Mon, 26 Jan 2026 16:06:02 +0100 Subject: [PATCH 1/5] feature/Chicago --- .blackboxrules | 329 +++++++++++------- .cursor/rules/run_pipelex.mdc | 21 +- .cursor/rules/write_pipelex.mdc | 228 ++++++++++-- .github/copilot-instructions.md | 329 +++++++++++------- .gitignore | 1 - .windsurfrules.md | 329 +++++++++++------- AGENTS.md | 329 +++++++++++------- CLAUDE.md | 329 +++++++++++------- api/main.py | 2 +- api/routes/__init__.py | 4 +- api/routes/{pipelex => }/health.py | 7 +- api/routes/pipelex/__init__.py | 8 +- api/routes/pipelex/build/__init__.py | 11 + api/routes/pipelex/build/inputs.py | 86 +++++ api/routes/pipelex/build/pipe.py | 57 +++ .../{pipe_builder.py => build/runner.py} | 62 +--- api/routes/pipelex/hello_world.plx | 13 - api/routes/pipelex/pipeline.py | 54 --- .../pipelex/{plx_validator.py => validate.py} | 12 +- api/routes/{main.py => version.py} | 2 +- pyproject.toml | 2 +- uv.lock | 75 +++- 22 files changed, 1502 insertions(+), 788 deletions(-) rename api/routes/{pipelex => }/health.py (51%) create mode 100644 api/routes/pipelex/build/__init__.py create mode 100644 api/routes/pipelex/build/inputs.py create mode 100644 api/routes/pipelex/build/pipe.py rename api/routes/pipelex/{pipe_builder.py => build/runner.py} (51%) delete mode 100644 api/routes/pipelex/hello_world.plx rename api/routes/pipelex/{plx_validator.py => validate.py} (87%) rename api/routes/{main.py => version.py} (89%) diff --git a/.blackboxrules b/.blackboxrules index 9f0a612..792cfa5 100644 --- a/.blackboxrules +++ b/.blackboxrules @@ -1,12 +1,12 @@ -# Pipelex Rules +# Pipelex Coding Rules ## Guide to write or edit pipelines using the Pipelex language in .plx files - Always first write your "plan" in natural language, then transcribe it in pipelex. - You should ALWAYS RUN validation when you are writing or editing a `.plx` file. It will ensure the pipe is runnable. If not, iterate. 
- For a specific file: `pipelex validate path_to_file.plx` - - For all pipelines: `pipelex validate all` + - For all pipelines: `pipelex validate --all` - **IMPORTANT**: Ensure the Python virtual environment is activated before running `pipelex` commands. For standard installations, the venv is named `.venv` - always check that first. The commands will not work without proper venv activation. - Please use POSIX standard for files. (empty lines, no trailing whitespaces, etc.) @@ -23,10 +23,10 @@ A pipeline file has three main sections: #### Domain Statement ```plx -domain = "domain_name" +domain = "domain_code" description = "Description of the domain" # Optional ``` -Note: The domain name usually matches the plx filename for single-file domains. For multi-file domains, use the subdirectory name. +Note: The domain code usually matches the plx filename for single-file domains. For multi-file domains, use the subdirectory name. #### Concept Definitions @@ -41,20 +41,28 @@ ConceptName = "Description of the concept" - Use PascalCase for concept names - Never use plurals (no "Stories", use "Story") - lists are handled implicitly by Pipelex - Avoid circumstantial adjectives (no "LargeText", use "Text") - focus on the essence of what the concept represents -- Don't redefine native concepts (Text, Image, PDF, TextAndImages, Number, Page) +- Don't redefine native concepts (Text, Image, PDF, TextAndImages, Number, Page, JSON) **Native Concepts:** -Pipelex provides built-in native concepts: `Text`, `Image`, `PDF`, `TextAndImages`, `Number`, `Page`. Use these directly or refine them when appropriate. +Pipelex provides built-in native concepts: `Text`, `Image`, `PDF`, `TextAndImages`, `Number`, `Page`, `JSON`. Use these directly or refine them when appropriate. 
-**Refining Native Concepts:** -To create a concept that specializes a native concept without adding fields: +**Refining Concepts:** +To create a concept that specializes another concept without adding fields, use `refines`: ```plx +## Refining a native concept [concept.Landscape] description = "A scenic outdoor photograph" refines = "Image" + +## Refining a custom concept (must be in domain.ConceptCode format) +[concept.PremiumCustomer] +description = "A premium customer with special benefits" +refines = "myapp.Customer" ``` +Note: When refining a custom (non-native) concept, you must use the fully qualified concept ref in `domain.ConceptCode` format. Pipelex automatically handles the dependency order to ensure referenced concepts are loaded first. + For details on how to structure concepts with fields, see the "Structuring Models" section below. #### Pipe Definitions @@ -62,7 +70,7 @@ For details on how to structure concepts with fields, see the "Structuring Model ### Pipe Base Definition ```plx -[pipe.your_pipe_name] +[pipe.your_pipe_code] type = "PipeLLM" description = "A description of what your pipe does" inputs = { input_1 = "ConceptName1", input_2 = "ConceptName2" } @@ -72,7 +80,7 @@ output = "ConceptName" The pipes will all have at least this base definition. - `inputs`: Dictionary whose keys are the variable names used in the prompts and whose values are the corresponding ConceptNames. It should ALSO LIST THE INPUTS OF THE INTERMEDIATE STEPS (if PipeSequence) or of the conditional pipes (if PipeCondition). So if you get this error: -`StaticValidationError: missing_input_variable • domain='expense_validator' • pipe='validate_expense' • +`PipeValidationError: missing_input_variable • domain='expense_validator' • pipe='validate_expense' • variable='['invoice']'` That means that the pipe validate_expense is missing the input `invoice` because one of its sub-pipes needs it. 
@@ -127,16 +135,16 @@ For concepts with structured fields, define them inline using TOML syntax: description = "A commercial document issued by a seller to a buyer" [concept.Invoice.structure] -invoice_number = "The unique invoice identifier" +invoice_number = "The unique invoice identifier" # This will be optional by default issue_date = { type = "date", description = "The date the invoice was issued", required = true } total_amount = { type = "number", description = "The total invoice amount", required = true } -vendor_name = "The name of the vendor" -line_items = { type = "list", item_type = "text", description = "List of items", required = false } +vendor_name = "The name of the vendor" # This will be optional by default +line_items = { type = "list", item_type = "text", description = "List of items" } ``` -**Supported inline field types:** `text`, `integer`, `boolean`, `number`, `date`, `list`, `dict` +**Supported inline field types:** `text`, `integer`, `boolean`, `number`, `date`, `list`, `dict`, `concept` -**Field properties:** `type`, `description`, `required` (default: true), `default_value`, `choices`, `item_type` (for lists), `key_type` and `value_type` (for dicts) +**Field properties:** `type`, `description`, `required` (default: false), `default_value`, `choices`, `item_type` (for lists), `key_type` and `value_type` (for dicts), `concept_ref` (for concept references), `item_concept_ref` (for lists of concepts) **Simple syntax** (creates an optional text field, since `required` defaults to false): ```plx @@ -145,9 +153,46 @@ field_name = "Field description" **Detailed syntax** (with explicit properties): ```plx -field_name = { type = "text", description = "Field description", required = false, default_value = "default" } +field_name = { type = "text", description = "Field description", default_value = "default" } +``` + +**Concept reference syntax** (referencing another concept): +```plx +## Single concept reference +customer = { type = "concept", concept_ref = "myapp.Customer", description 
= "The customer" } + +## List of concepts +line_items = { type = "list", item_type = "concept", item_concept_ref = "myapp.LineItem", description = "Line items" } ``` +Example with concept references: +```plx +[concept.Customer] +description = "A customer entity" + +[concept.Customer.structure] +name = { type = "text", description = "Customer name" } +email = { type = "text", description = "Customer email" } + +[concept.LineItem] +description = "A line item in an invoice" + +[concept.LineItem.structure] +product = { type = "text", description = "Product name" } +quantity = { type = "integer", description = "Quantity ordered" } +unit_price = { type = "number", description = "Price per unit" } + +[concept.Invoice] +description = "An invoice document" + +[concept.Invoice.structure] +customer = { type = "concept", concept_ref = "myapp.Customer", description = "The customer" } +items = { type = "list", item_type = "concept", item_concept_ref = "myapp.LineItem", description = "Line items" } +total = { type = "number", description = "Invoice total" } +``` + +Note: Pipelex automatically determines the correct loading order for concepts based on their dependencies (topological sort), so concepts can reference each other across domains as long as there are no circular dependencies. + **3. Python StructuredContent Class (For Advanced Features)** Create a Python class when you need: @@ -200,12 +245,14 @@ class Invoice(StructuredContent): #### Inline Structure Limitations Inline structures: -- ✅ Support all common field types (text, number, date, list, dict, etc.) +- ✅ Support all common field types (text, number, date, list, dict, concept, etc.) 
- ✅ Support required/optional fields, defaults, choices +- ✅ Support concept-to-concept references (type = "concept" with concept_ref) +- ✅ Support lists of concepts (type = "list" with item_type = "concept") +- ✅ Support refining both native and custom concepts - ✅ Generate full Pydantic models with validation - ❌ Cannot have custom validators or complex validation logic - ❌ Cannot have computed properties or custom methods -- ❌ Cannot refine custom (non-native) concepts - ❌ Limited IDE autocomplete compared to explicit Python classes @@ -471,7 +518,7 @@ The PipeExtract operator is used to extract text and images from an image or a P [pipe.extract_info] type = "PipeExtract" description = "extract the information" -inputs = { document = "PDF" } # or { image = "Image" } if it's an image. This is the only input. +inputs = { document = "Document" } # or { image = "Image" } if it's an image. This is the only input. output = "Page" ``` @@ -480,7 +527,7 @@ Using Extract Model Settings: [pipe.extract_with_model] type = "PipeExtract" description = "Extract with specific model" -inputs = { document = "PDF" } +inputs = { document = "Document" } output = "Page" model = "base_extract_mistral" # Use predefined extract preset or model alias ``` @@ -588,15 +635,16 @@ $sales_rep.phone | $sales_rep.email """ ``` -#### Key Parameters +#### Key Parameters (Template Mode) -- `template`: Inline template string (mutually exclusive with template_name) +- `template`: Inline template string (mutually exclusive with template_name and construct) - `template_name`: Name of a predefined template (mutually exclusive with template) - `template_category`: Template type ("llm_prompt", "html", "markdown", "mermaid", etc.) 
- `templating_style`: Styling options for template rendering - `extra_context`: Additional context variables for template For more control, you can use a nested `template` section instead of the `template` field: + - `template.template`: The template string - `template.category`: Template type - `template.templating_style`: Styling options @@ -604,9 +652,143 @@ For more control, you can use a nested `template` section instead of the `templa #### Template Variables Use the same variable insertion rules as PipeLLM: + - `@variable` for block insertion (multi-line content) - `$variable` for inline insertion (short text) +#### Construct Mode (for StructuredContent Output) + +PipeCompose can also generate `StructuredContent` objects using the `construct` section. This mode composes field values from fixed values, variable references, templates, or nested structures. + +**When to use construct mode:** + +- You need to output a structured object (not just Text) +- You want to deterministically compose fields from existing data +- No LLM is needed - just data composition and templating + +##### Basic Construct Usage + +```plx +[concept.SalesSummary] +description = "A structured sales summary" + +[concept.SalesSummary.structure] +report_title = { type = "text", description = "Title of the report" } +customer_name = { type = "text", description = "Customer name" } +deal_value = { type = "number", description = "Deal value" } +summary_text = { type = "text", description = "Generated summary text" } + +[pipe.compose_summary] +type = "PipeCompose" +description = "Compose a sales summary from deal data" +inputs = { deal = "Deal" } +output = "SalesSummary" + +[pipe.compose_summary.construct] +report_title = "Monthly Sales Report" +customer_name = { from = "deal.customer_name" } +deal_value = { from = "deal.amount" } +summary_text = { template = "Deal worth $deal.amount with $deal.customer_name" } +``` + +##### Field Composition Methods + +There are four ways to define field values 
in a construct: + +**1. Fixed Value (literal)** + +Use a literal value directly: + +```plx +[pipe.compose_report.construct] +report_title = "Annual Report" +report_year = 2024 +is_draft = false +``` + +**2. Variable Reference (`from`)** + +Get a value from working memory using a dotted path: + +```plx +[pipe.compose_report.construct] +customer_name = { from = "deal.customer_name" } +total_amount = { from = "order.total" } +street_address = { from = "customer.address.street" } +``` + +**3. Template (`template`)** + +Render a Jinja2 template with variable substitution: + +```plx +[pipe.compose_report.construct] +invoice_number = { template = "INV-$order.id" } +summary = { template = "Deal worth $deal.amount with $deal.customer_name on {{ current_date }}" } +``` + +**4. Nested Construct** + +For nested structures, use a TOML subsection: + +```plx +[pipe.compose_invoice.construct] +invoice_number = { template = "INV-$order.id" } +total = { from = "order.total_amount" } + +[pipe.compose_invoice.construct.billing_address] +street = { from = "customer.address.street" } +city = { from = "customer.address.city" } +country = "France" +``` + +##### Complete Construct Example + +```plx +domain = "invoicing" + +[concept.Address] +description = "A postal address" + +[concept.Address.structure] +street = { type = "text", description = "Street address" } +city = { type = "text", description = "City name" } +country = { type = "text", description = "Country name" } + +[concept.Invoice] +description = "An invoice document" + +[concept.Invoice.structure] +invoice_number = { type = "text", description = "Invoice number" } +total = { type = "number", description = "Total amount" } + +[pipe.compose_invoice] +type = "PipeCompose" +description = "Compose an invoice from order and customer data" +inputs = { order = "Order", customer = "Customer" } +output = "Invoice" + +[pipe.compose_invoice.construct] +invoice_number = { template = "INV-$order.id" } +total = { from = "order.total_amount" 
} + +[pipe.compose_invoice.construct.billing_address] +street = { from = "customer.address.street" } +city = { from = "customer.address.city" } +country = "France" +``` + +##### Key Parameters (Construct Mode) + +- `construct`: Dictionary mapping field names to their composition rules +- Each field can be: + - A literal value (string, number, boolean) + - A dict with `from` key for variable reference + - A dict with `template` key for template rendering + - A nested dict for nested structures + +**Note:** You must use either `template` or `construct`, not both. They are mutually exclusive. + ### PipeImgGen operator The PipeImgGen operator is used to generate images using AI image generation models. @@ -820,7 +1002,7 @@ Presets are meant to record the choice of an llm with its hyper parameters (temp Examples: ```toml -llm_for_complex_reasoning = { model = "base-claude", temperature = 1 } +llm_to_engineer = { model = "base-claude", temperature = 1 } llm_to_extract_invoice = { model = "claude-3-7-sonnet", temperature = 0.1, max_tokens = "auto" } ``` @@ -851,7 +1033,7 @@ You can override the predefined llm presets by setting them in `.pipelex/inferen ALWAYS RUN validation when you are finished writing pipelines: This checks for errors. If there are errors, iterate until it works. - For a specific bundle/file: `pipelex validate path_to_file.plx` -- For all pipelines: `pipelex validate all` +- For all pipelines: `pipelex validate --all` - Remember: Ensure your Python virtual environment is activated (typically `.venv` for standard installations) before running `pipelex` commands. Then, create an example file to run the pipeline in the `examples` folder. @@ -859,21 +1041,6 @@ But don't write documentation unless asked explicitly to. ## Guide to execute a pipeline and write example code -### Prerequisites: Virtual Environment - -**CRITICAL**: Before running any `pipelex` commands or `pytest`, you MUST activate the appropriate Python virtual environment. 
Without proper venv activation, these commands will not work. - -For standard installations, the virtual environment is named `.venv`. Always check this first: - -```bash -## Activate the virtual environment (standard installation) -source .venv/bin/activate # On macOS/Linux -## or -.venv\Scripts\activate # On Windows -``` - -If your installation uses a different venv name or location, activate that one instead. All subsequent `pipelex` and `pytest` commands assume the venv is active. - ### Example to execute a pipeline with text output ```python @@ -967,13 +1134,13 @@ So here are a few concrete examples of calls to execute_pipeline with various wa }, ) -## Here we have a single input and it's a PDF. -## Because PDFContent is a native concept, we can use it directly as a value, +## Here we have a single input and it's a document. +## Because DocumentContent is a native concept, we can use it directly as a value, ## the system knows what content it corresponds to: pipe_output = await execute_pipeline( pipe_code="power_extractor_dpe", inputs={ - "document": PDFContent(url=pdf_url), + "document": DocumentContent(url=pdf_url), }, ) @@ -1096,82 +1263,4 @@ result_list = pipe_output.main_stuff_as_items(item_type=GanttChart) ``` --- - -## Rules to choose LLM models used in PipeLLMs. - -### LLM Configuration System - -In order to use it in a pipe, an LLM is referenced by its llm_handle (alias) and possibly by an llm_preset. -LLM configurations are managed through the new inference backend system with files located in `.pipelex/inference/`: - -- **Model Deck**: `.pipelex/inference/deck/base_deck.toml` and `.pipelex/inference/deck/overrides.toml` -- **Backends**: `.pipelex/inference/backends.toml` and `.pipelex/inference/backends/*.toml` -- **Routing**: `.pipelex/inference/routing_profiles.toml` - -### LLM Handles - -An llm_handle can be either: -1. 
**A direct model name** (like "gpt-4o-mini", "claude-3-sonnet") - automatically available for all models loaded by the inference backend system -2. **An alias** - user-defined shortcuts that map to model names, defined in the `[aliases]` section: - -```toml -[aliases] -base-claude = "claude-4.5-sonnet" -base-gpt = "gpt-5" -base-gemini = "gemini-2.5-flash" -base-mistral = "mistral-medium" -``` - -The system first looks for direct model names, then checks aliases if no direct match is found. The system handles model routing through backends automatically. - -### Using an LLM Handle in a PipeLLM - -Here is an example of using an llm_handle to specify which LLM to use in a PipeLLM: - -```plx -[pipe.hello_world] -type = "PipeLLM" -description = "Write text about Hello World." -output = "Text" -model = { model = "gpt-5", temperature = 0.9 } -prompt = """ -Write a haiku about Hello World. -""" -``` - -As you can see, to use the LLM, you must also indicate the temperature (float between 0 and 1) and max_tokens (either an int or the string "auto"). - -### LLM Presets - -Presets are meant to record the choice of an llm with its hyper parameters (temperature and max_tokens) if it's good for a particular task. LLM Presets are skill-oriented. - -Examples: -```toml -llm_for_complex_reasoning = { model = "base-claude", temperature = 1 } -llm_to_extract_invoice = { model = "claude-3-7-sonnet", temperature = 0.1, max_tokens = "auto" } -``` - -The interest is that these presets can be used to set the LLM choice in a PipeLLM, like this: - -```plx -[pipe.extract_invoice] -type = "PipeLLM" -description = "Extract invoice information from an invoice text transcript" -inputs = { invoice_text = "InvoiceText" } -output = "Invoice" -model = "llm_to_extract_invoice" -prompt = """ -Extract invoice information from this invoice: - -The category of this invoice is: $invoice_details.category. 
- -@invoice_text -""" -``` - -The setting here `model = "llm_to_extract_invoice"` works because "llm_to_extract_invoice" has been declared as an llm_preset in the deck. -You must not use an LLM preset in a PipeLLM that does not exist in the deck. If needed, you can add llm presets. - - -You can override the predefined llm presets by setting them in `.pipelex/inference/deck/overrides.toml`. diff --git a/.cursor/rules/run_pipelex.mdc b/.cursor/rules/run_pipelex.mdc index 78aed74..7650051 100644 --- a/.cursor/rules/run_pipelex.mdc +++ b/.cursor/rules/run_pipelex.mdc @@ -6,21 +6,6 @@ globs: --- # Guide to execute a pipeline and write example code -## Prerequisites: Virtual Environment - -**CRITICAL**: Before running any `pipelex` commands or `pytest`, you MUST activate the appropriate Python virtual environment. Without proper venv activation, these commands will not work. - -For standard installations, the virtual environment is named `.venv`. Always check this first: - -```bash -# Activate the virtual environment (standard installation) -source .venv/bin/activate # On macOS/Linux -# or -.venv\Scripts\activate # On Windows -``` - -If your installation uses a different venv name or location, activate that one instead. All subsequent `pipelex` and `pytest` commands assume the venv is active. - ## Example to execute a pipeline with text output ```python @@ -114,13 +99,13 @@ So here are a few concrete examples of calls to execute_pipeline with various wa }, ) -# Here we have a single input and it's a PDF. -# Because PDFContent is a native concept, we can use it directly as a value, +# Here we have a single input and it's a document. 
+# Because DocumentContent is a native concept, we can use it directly as a value, # the system knows what content it corresponds to: pipe_output = await execute_pipeline( pipe_code="power_extractor_dpe", inputs={ - "document": PDFContent(url=pdf_url), + "document": DocumentContent(url=pdf_url), }, ) diff --git a/.cursor/rules/write_pipelex.mdc b/.cursor/rules/write_pipelex.mdc index 2d977d3..781789e 100644 --- a/.cursor/rules/write_pipelex.mdc +++ b/.cursor/rules/write_pipelex.mdc @@ -10,7 +10,7 @@ globs: - Always first write your "plan" in natural language, then transcribe it in pipelex. - You should ALWAYS RUN validation when you are writing or editing a `.plx` file. It will ensure the pipe is runnable. If not, iterate. - For a specific file: `pipelex validate path_to_file.plx` - - For all pipelines: `pipelex validate all` + - For all pipelines: `pipelex validate --all` - **IMPORTANT**: Ensure the Python virtual environment is activated before running `pipelex` commands. For standard installations, the venv is named `.venv` - always check that first. The commands will not work without proper venv activation. - Please use POSIX standard for files. (empty lines, no trailing whitespaces, etc.) @@ -27,10 +27,10 @@ A pipeline file has three main sections: ### Domain Statement ```plx -domain = "domain_name" +domain = "domain_code" description = "Description of the domain" # Optional ``` -Note: The domain name usually matches the plx filename for single-file domains. For multi-file domains, use the subdirectory name. +Note: The domain code usually matches the plx filename for single-file domains. For multi-file domains, use the subdirectory name. 
### Concept Definitions @@ -45,20 +45,28 @@ ConceptName = "Description of the concept" - Use PascalCase for concept names - Never use plurals (no "Stories", use "Story") - lists are handled implicitly by Pipelex - Avoid circumstantial adjectives (no "LargeText", use "Text") - focus on the essence of what the concept represents -- Don't redefine native concepts (Text, Image, PDF, TextAndImages, Number, Page) +- Don't redefine native concepts (Text, Image, PDF, TextAndImages, Number, Page, JSON) **Native Concepts:** -Pipelex provides built-in native concepts: `Text`, `Image`, `PDF`, `TextAndImages`, `Number`, `Page`. Use these directly or refine them when appropriate. +Pipelex provides built-in native concepts: `Text`, `Image`, `PDF`, `TextAndImages`, `Number`, `Page`, `JSON`. Use these directly or refine them when appropriate. -**Refining Native Concepts:** -To create a concept that specializes a native concept without adding fields: +**Refining Concepts:** +To create a concept that specializes another concept without adding fields, use `refines`: ```plx +# Refining a native concept [concept.Landscape] description = "A scenic outdoor photograph" refines = "Image" + +# Refining a custom concept (must be in domain.ConceptCode format) +[concept.PremiumCustomer] +description = "A premium customer with special benefits" +refines = "myapp.Customer" ``` +Note: When refining a custom (non-native) concept, you must use the fully qualified concept ref in `domain.ConceptCode` format. Pipelex automatically handles the dependency order to ensure referenced concepts are loaded first. + For details on how to structure concepts with fields, see the "Structuring Models" section below. 
### Pipe Definitions @@ -66,7 +74,7 @@ For details on how to structure concepts with fields, see the "Structuring Model ## Pipe Base Definition ```plx -[pipe.your_pipe_name] +[pipe.your_pipe_code] type = "PipeLLM" description = "A description of what your pipe does" inputs = { input_1 = "ConceptName1", input_2 = "ConceptName2" } @@ -76,7 +84,7 @@ output = "ConceptName" The pipes will all have at least this base definition. - `inputs`: Dictionary whose keys are the variable names used in the prompts and whose values are the corresponding ConceptNames. It should ALSO LIST THE INPUTS OF THE INTERMEDIATE STEPS (if PipeSequence) or of the conditional pipes (if PipeCondition). So if you get this error: -`StaticValidationError: missing_input_variable • domain='expense_validator' • pipe='validate_expense' • +`PipeValidationError: missing_input_variable • domain='expense_validator' • pipe='validate_expense' • variable='['invoice']'` That means that the pipe validate_expense is missing the input `invoice` because one of its sub-pipes needs it. 
@@ -131,16 +139,16 @@ For concepts with structured fields, define them inline using TOML syntax: description = "A commercial document issued by a seller to a buyer" [concept.Invoice.structure] -invoice_number = "The unique invoice identifier" +invoice_number = "The unique invoice identifier" # This will be optional by default issue_date = { type = "date", description = "The date the invoice was issued", required = true } total_amount = { type = "number", description = "The total invoice amount", required = true } -vendor_name = "The name of the vendor" -line_items = { type = "list", item_type = "text", description = "List of items", required = false } +vendor_name = "The name of the vendor" # This will be optional by default +line_items = { type = "list", item_type = "text", description = "List of items" } ``` -**Supported inline field types:** `text`, `integer`, `boolean`, `number`, `date`, `list`, `dict` +**Supported inline field types:** `text`, `integer`, `boolean`, `number`, `date`, `list`, `dict`, `concept` -**Field properties:** `type`, `description`, `required` (default: true), `default_value`, `choices`, `item_type` (for lists), `key_type` and `value_type` (for dicts) +**Field properties:** `type`, `description`, `required` (default: false), `default_value`, `choices`, `item_type` (for lists), `key_type` and `value_type` (for dicts), `concept_ref` (for concept references), `item_concept_ref` (for lists of concepts) **Simple syntax** (creates an optional text field): ```plx @@ -149,9 +157,46 @@ field_name = "Field description" **Detailed syntax** (with explicit properties): ```plx -field_name = { type = "text", description = "Field description", required = false, default_value = "default" } +field_name = { type = "text", description = "Field description", default_value = "default" } +``` + +**Concept reference syntax** (referencing another concept): +```plx +# Single concept reference +customer = { type = "concept", concept_ref = "myapp.Customer", description 
= "The customer" } + +# List of concepts +line_items = { type = "list", item_type = "concept", item_concept_ref = "myapp.LineItem", description = "Line items" } +``` + +Example with concept references: +```plx +[concept.Customer] +description = "A customer entity" + +[concept.Customer.structure] +name = { type = "text", description = "Customer name" } +email = { type = "text", description = "Customer email" } + +[concept.LineItem] +description = "A line item in an invoice" + +[concept.LineItem.structure] +product = { type = "text", description = "Product name" } +quantity = { type = "integer", description = "Quantity ordered" } +unit_price = { type = "number", description = "Price per unit" } + +[concept.Invoice] +description = "An invoice document" + +[concept.Invoice.structure] +customer = { type = "concept", concept_ref = "myapp.Customer", description = "The customer" } +items = { type = "list", item_type = "concept", item_concept_ref = "myapp.LineItem", description = "Line items" } +total = { type = "number", description = "Invoice total" } ``` +Note: Pipelex automatically determines the correct loading order for concepts based on their dependencies (topological sort), so concepts can reference each other across domains as long as there are no circular dependencies. + **3. Python StructuredContent Class (For Advanced Features)** Create a Python class when you need: @@ -204,12 +249,14 @@ class Invoice(StructuredContent): ### Inline Structure Limitations Inline structures: -- ✅ Support all common field types (text, number, date, list, dict, etc.) +- ✅ Support all common field types (text, number, date, list, dict, concept, etc.) 
- ✅ Support required/optional fields, defaults, choices +- ✅ Support concept-to-concept references (type = "concept" with concept_ref) +- ✅ Support lists of concepts (type = "list" with item_type = "concept") +- ✅ Support refining both native and custom concepts - ✅ Generate full Pydantic models with validation - ❌ Cannot have custom validators or complex validation logic - ❌ Cannot have computed properties or custom methods -- ❌ Cannot refine custom (non-native) concepts - ❌ Limited IDE autocomplete compared to explicit Python classes @@ -475,7 +522,7 @@ The PipeExtract operator is used to extract text and images from an image or a P [pipe.extract_info] type = "PipeExtract" description = "extract the information" -inputs = { document = "PDF" } # or { image = "Image" } if it's an image. This is the only input. +inputs = { document = "Document" } # or { image = "Image" } if it's an image. This is the only input. output = "Page" ``` @@ -484,7 +531,7 @@ Using Extract Model Settings: [pipe.extract_with_model] type = "PipeExtract" description = "Extract with specific model" -inputs = { document = "PDF" } +inputs = { document = "Document" } output = "Page" model = "base_extract_mistral" # Use predefined extract preset or model alias ``` @@ -592,15 +639,16 @@ $sales_rep.phone | $sales_rep.email """ ``` -### Key Parameters +### Key Parameters (Template Mode) -- `template`: Inline template string (mutually exclusive with template_name) +- `template`: Inline template string (mutually exclusive with template_name and construct) - `template_name`: Name of a predefined template (mutually exclusive with template) - `template_category`: Template type ("llm_prompt", "html", "markdown", "mermaid", etc.) 
- `templating_style`: Styling options for template rendering - `extra_context`: Additional context variables for template For more control, you can use a nested `template` section instead of the `template` field: + - `template.template`: The template string - `template.category`: Template type - `template.templating_style`: Styling options @@ -608,9 +656,143 @@ For more control, you can use a nested `template` section instead of the `templa ### Template Variables Use the same variable insertion rules as PipeLLM: + - `@variable` for block insertion (multi-line content) - `$variable` for inline insertion (short text) +### Construct Mode (for StructuredContent Output) + +PipeCompose can also generate `StructuredContent` objects using the `construct` section. This mode composes field values from fixed values, variable references, templates, or nested structures. + +**When to use construct mode:** + +- You need to output a structured object (not just Text) +- You want to deterministically compose fields from existing data +- No LLM is needed - just data composition and templating + +#### Basic Construct Usage + +```plx +[concept.SalesSummary] +description = "A structured sales summary" + +[concept.SalesSummary.structure] +report_title = { type = "text", description = "Title of the report" } +customer_name = { type = "text", description = "Customer name" } +deal_value = { type = "number", description = "Deal value" } +summary_text = { type = "text", description = "Generated summary text" } + +[pipe.compose_summary] +type = "PipeCompose" +description = "Compose a sales summary from deal data" +inputs = { deal = "Deal" } +output = "SalesSummary" + +[pipe.compose_summary.construct] +report_title = "Monthly Sales Report" +customer_name = { from = "deal.customer_name" } +deal_value = { from = "deal.amount" } +summary_text = { template = "Deal worth $deal.amount with $deal.customer_name" } +``` + +#### Field Composition Methods + +There are four ways to define field values in 
a construct: + +**1. Fixed Value (literal)** + +Use a literal value directly: + +```plx +[pipe.compose_report.construct] +report_title = "Annual Report" +report_year = 2024 +is_draft = false +``` + +**2. Variable Reference (`from`)** + +Get a value from working memory using a dotted path: + +```plx +[pipe.compose_report.construct] +customer_name = { from = "deal.customer_name" } +total_amount = { from = "order.total" } +street_address = { from = "customer.address.street" } +``` + +**3. Template (`template`)** + +Render a Jinja2 template with variable substitution: + +```plx +[pipe.compose_report.construct] +invoice_number = { template = "INV-$order.id" } +summary = { template = "Deal worth $deal.amount with $deal.customer_name on {{ current_date }}" } +``` + +**4. Nested Construct** + +For nested structures, use a TOML subsection: + +```plx +[pipe.compose_invoice.construct] +invoice_number = { template = "INV-$order.id" } +total = { from = "order.total_amount" } + +[pipe.compose_invoice.construct.billing_address] +street = { from = "customer.address.street" } +city = { from = "customer.address.city" } +country = "France" +``` + +#### Complete Construct Example + +```plx +domain = "invoicing" + +[concept.Address] +description = "A postal address" + +[concept.Address.structure] +street = { type = "text", description = "Street address" } +city = { type = "text", description = "City name" } +country = { type = "text", description = "Country name" } + +[concept.Invoice] +description = "An invoice document" + +[concept.Invoice.structure] +invoice_number = { type = "text", description = "Invoice number" } +total = { type = "number", description = "Total amount" } + +[pipe.compose_invoice] +type = "PipeCompose" +description = "Compose an invoice from order and customer data" +inputs = { order = "Order", customer = "Customer" } +output = "Invoice" + +[pipe.compose_invoice.construct] +invoice_number = { template = "INV-$order.id" } +total = { from = "order.total_amount" } + 
+[pipe.compose_invoice.construct.billing_address] +street = { from = "customer.address.street" } +city = { from = "customer.address.city" } +country = "France" +``` + +#### Key Parameters (Construct Mode) + +- `construct`: Dictionary mapping field names to their composition rules +- Each field can be: + - A literal value (string, number, boolean) + - A dict with `from` key for variable reference + - A dict with `template` key for template rendering + - A nested dict for nested structures + +**Note:** You must use either `template` or `construct`, not both. They are mutually exclusive. + ## PipeImgGen operator The PipeImgGen operator is used to generate images using AI image generation models. @@ -824,7 +1006,7 @@ Presets are meant to record the choice of an llm with its hyper parameters (temp Examples: ```toml -llm_for_complex_reasoning = { model = "base-claude", temperature = 1 } +llm_to_engineer = { model = "base-claude", temperature = 1 } llm_to_extract_invoice = { model = "claude-3-7-sonnet", temperature = 0.1, max_tokens = "auto" } ``` @@ -855,7 +1037,7 @@ You can override the predefined llm presets by setting them in `.pipelex/inferen ALWAYS RUN validation when you are finished writing pipelines: This checks for errors. If there are errors, iterate until it works. - For a specific bundle/file: `pipelex validate path_to_file.plx` -- For all pipelines: `pipelex validate all` +- For all pipelines: `pipelex validate --all` - Remember: Ensure your Python virtual environment is activated (typically `.venv` for standard installations) before running `pipelex` commands. Then, create an example file to run the pipeline in the `examples` folder. 
diff --git a/.github/copilot-instructions.md b/.github/copilot-instructions.md index 9f0a612..792cfa5 100644 --- a/.github/copilot-instructions.md +++ b/.github/copilot-instructions.md @@ -1,12 +1,12 @@ -# Pipelex Rules +# Pipelex Coding Rules ## Guide to write or edit pipelines using the Pipelex language in .plx files - Always first write your "plan" in natural language, then transcribe it in pipelex. - You should ALWAYS RUN validation when you are writing or editing a `.plx` file. It will ensure the pipe is runnable. If not, iterate. - For a specific file: `pipelex validate path_to_file.plx` - - For all pipelines: `pipelex validate all` + - For all pipelines: `pipelex validate --all` - **IMPORTANT**: Ensure the Python virtual environment is activated before running `pipelex` commands. For standard installations, the venv is named `.venv` - always check that first. The commands will not work without proper venv activation. - Please use POSIX standard for files. (empty lines, no trailing whitespaces, etc.) @@ -23,10 +23,10 @@ A pipeline file has three main sections: #### Domain Statement ```plx -domain = "domain_name" +domain = "domain_code" description = "Description of the domain" # Optional ``` -Note: The domain name usually matches the plx filename for single-file domains. For multi-file domains, use the subdirectory name. +Note: The domain code usually matches the plx filename for single-file domains. For multi-file domains, use the subdirectory name. 
#### Concept Definitions @@ -41,20 +41,28 @@ ConceptName = "Description of the concept" - Use PascalCase for concept names - Never use plurals (no "Stories", use "Story") - lists are handled implicitly by Pipelex - Avoid circumstantial adjectives (no "LargeText", use "Text") - focus on the essence of what the concept represents -- Don't redefine native concepts (Text, Image, PDF, TextAndImages, Number, Page) +- Don't redefine native concepts (Text, Image, PDF, TextAndImages, Number, Page, JSON) **Native Concepts:** -Pipelex provides built-in native concepts: `Text`, `Image`, `PDF`, `TextAndImages`, `Number`, `Page`. Use these directly or refine them when appropriate. +Pipelex provides built-in native concepts: `Text`, `Image`, `PDF`, `TextAndImages`, `Number`, `Page`, `JSON`. Use these directly or refine them when appropriate. -**Refining Native Concepts:** -To create a concept that specializes a native concept without adding fields: +**Refining Concepts:** +To create a concept that specializes another concept without adding fields, use `refines`: ```plx +## Refining a native concept [concept.Landscape] description = "A scenic outdoor photograph" refines = "Image" + +## Refining a custom concept (must be in domain.ConceptCode format) +[concept.PremiumCustomer] +description = "A premium customer with special benefits" +refines = "myapp.Customer" ``` +Note: When refining a custom (non-native) concept, you must use the fully qualified concept ref in `domain.ConceptCode` format. Pipelex automatically handles the dependency order to ensure referenced concepts are loaded first. + For details on how to structure concepts with fields, see the "Structuring Models" section below. 
#### Pipe Definitions @@ -62,7 +70,7 @@ For details on how to structure concepts with fields, see the "Structuring Model ### Pipe Base Definition ```plx -[pipe.your_pipe_name] +[pipe.your_pipe_code] type = "PipeLLM" description = "A description of what your pipe does" inputs = { input_1 = "ConceptName1", input_2 = "ConceptName2" } @@ -72,7 +80,7 @@ output = "ConceptName" The pipes will all have at least this base definition. - `inputs`: Dictionary whose keys are the variable names used in the prompts and whose values are the corresponding ConceptNames. It should ALSO LIST THE INPUTS OF THE INTERMEDIATE STEPS (if PipeSequence) or of the conditional pipes (if PipeCondition). So if you have this error: -`StaticValidationError: missing_input_variable • domain='expense_validator' • pipe='validate_expense' • +`PipeValidationError: missing_input_variable • domain='expense_validator' • pipe='validate_expense' • variable='['invoice']'` That means that the pipe validate_expense is missing the input `invoice` because one of its sub-pipes needs it. 
@@ -127,16 +135,16 @@ For concepts with structured fields, define them inline using TOML syntax: description = "A commercial document issued by a seller to a buyer" [concept.Invoice.structure] -invoice_number = "The unique invoice identifier" +invoice_number = "The unique invoice identifier" # This will be optional by default issue_date = { type = "date", description = "The date the invoice was issued", required = true } total_amount = { type = "number", description = "The total invoice amount", required = true } -vendor_name = "The name of the vendor" -line_items = { type = "list", item_type = "text", description = "List of items", required = false } +vendor_name = "The name of the vendor" # This will be optional by default +line_items = { type = "list", item_type = "text", description = "List of items" } ``` -**Supported inline field types:** `text`, `integer`, `boolean`, `number`, `date`, `list`, `dict` +**Supported inline field types:** `text`, `integer`, `boolean`, `number`, `date`, `list`, `dict`, `concept` -**Field properties:** `type`, `description`, `required` (default: true), `default_value`, `choices`, `item_type` (for lists), `key_type` and `value_type` (for dicts) +**Field properties:** `type`, `description`, `required` (default: false), `default_value`, `choices`, `item_type` (for lists), `key_type` and `value_type` (for dicts), `concept_ref` (for concept references), `item_concept_ref` (for lists of concepts) **Simple syntax** (creates an optional text field): ```plx @@ -145,9 +153,46 @@ field_name = "Field description" **Detailed syntax** (with explicit properties): ```plx -field_name = { type = "text", description = "Field description", required = false, default_value = "default" } +field_name = { type = "text", description = "Field description", default_value = "default" } +``` + +**Concept reference syntax** (referencing another concept): +```plx +## Single concept reference +customer = { type = "concept", concept_ref = "myapp.Customer", description 
= "The customer" } + +## List of concepts +line_items = { type = "list", item_type = "concept", item_concept_ref = "myapp.LineItem", description = "Line items" } ``` +Example with concept references: +```plx +[concept.Customer] +description = "A customer entity" + +[concept.Customer.structure] +name = { type = "text", description = "Customer name" } +email = { type = "text", description = "Customer email" } + +[concept.LineItem] +description = "A line item in an invoice" + +[concept.LineItem.structure] +product = { type = "text", description = "Product name" } +quantity = { type = "integer", description = "Quantity ordered" } +unit_price = { type = "number", description = "Price per unit" } + +[concept.Invoice] +description = "An invoice document" + +[concept.Invoice.structure] +customer = { type = "concept", concept_ref = "myapp.Customer", description = "The customer" } +items = { type = "list", item_type = "concept", item_concept_ref = "myapp.LineItem", description = "Line items" } +total = { type = "number", description = "Invoice total" } +``` + +Note: Pipelex automatically determines the correct loading order for concepts based on their dependencies (topological sort), so concepts can reference each other across domains as long as there are no circular dependencies. + **3. Python StructuredContent Class (For Advanced Features)** Create a Python class when you need: @@ -200,12 +245,14 @@ class Invoice(StructuredContent): #### Inline Structure Limitations Inline structures: -- ✅ Support all common field types (text, number, date, list, dict, etc.) +- ✅ Support all common field types (text, number, date, list, dict, concept, etc.) 
- ✅ Support required/optional fields, defaults, choices +- ✅ Support concept-to-concept references (type = "concept" with concept_ref) +- ✅ Support lists of concepts (type = "list" with item_type = "concept") +- ✅ Support refining both native and custom concepts - ✅ Generate full Pydantic models with validation - ❌ Cannot have custom validators or complex validation logic - ❌ Cannot have computed properties or custom methods -- ❌ Cannot refine custom (non-native) concepts - ❌ Limited IDE autocomplete compared to explicit Python classes @@ -471,7 +518,7 @@ The PipeExtract operator is used to extract text and images from an image or a P [pipe.extract_info] type = "PipeExtract" description = "extract the information" -inputs = { document = "PDF" } # or { image = "Image" } if it's an image. This is the only input. +inputs = { document = "Document" } # or { image = "Image" } if it's an image. This is the only input. output = "Page" ``` @@ -480,7 +527,7 @@ Using Extract Model Settings: [pipe.extract_with_model] type = "PipeExtract" description = "Extract with specific model" -inputs = { document = "PDF" } +inputs = { document = "Document" } output = "Page" model = "base_extract_mistral" # Use predefined extract preset or model alias ``` @@ -588,15 +635,16 @@ $sales_rep.phone | $sales_rep.email """ ``` -#### Key Parameters +#### Key Parameters (Template Mode) -- `template`: Inline template string (mutually exclusive with template_name) +- `template`: Inline template string (mutually exclusive with template_name and construct) - `template_name`: Name of a predefined template (mutually exclusive with template) - `template_category`: Template type ("llm_prompt", "html", "markdown", "mermaid", etc.) 
- `templating_style`: Styling options for template rendering - `extra_context`: Additional context variables for template For more control, you can use a nested `template` section instead of the `template` field: + - `template.template`: The template string - `template.category`: Template type - `template.templating_style`: Styling options @@ -604,9 +652,143 @@ For more control, you can use a nested `template` section instead of the `templa #### Template Variables Use the same variable insertion rules as PipeLLM: + - `@variable` for block insertion (multi-line content) - `$variable` for inline insertion (short text) +#### Construct Mode (for StructuredContent Output) + +PipeCompose can also generate `StructuredContent` objects using the `construct` section. This mode composes field values from fixed values, variable references, templates, or nested structures. + +**When to use construct mode:** + +- You need to output a structured object (not just Text) +- You want to deterministically compose fields from existing data +- No LLM is needed - just data composition and templating + +##### Basic Construct Usage + +```plx +[concept.SalesSummary] +description = "A structured sales summary" + +[concept.SalesSummary.structure] +report_title = { type = "text", description = "Title of the report" } +customer_name = { type = "text", description = "Customer name" } +deal_value = { type = "number", description = "Deal value" } +summary_text = { type = "text", description = "Generated summary text" } + +[pipe.compose_summary] +type = "PipeCompose" +description = "Compose a sales summary from deal data" +inputs = { deal = "Deal" } +output = "SalesSummary" + +[pipe.compose_summary.construct] +report_title = "Monthly Sales Report" +customer_name = { from = "deal.customer_name" } +deal_value = { from = "deal.amount" } +summary_text = { template = "Deal worth $deal.amount with $deal.customer_name" } +``` + +##### Field Composition Methods + +There are four ways to define field values 
in a construct: + +**1. Fixed Value (literal)** + +Use a literal value directly: + +```plx +[pipe.compose_report.construct] +report_title = "Annual Report" +report_year = 2024 +is_draft = false +``` + +**2. Variable Reference (`from`)** + +Get a value from working memory using a dotted path: + +```plx +[pipe.compose_report.construct] +customer_name = { from = "deal.customer_name" } +total_amount = { from = "order.total" } +street_address = { from = "customer.address.street" } +``` + +**3. Template (`template`)** + +Render a Jinja2 template with variable substitution: + +```plx +[pipe.compose_report.construct] +invoice_number = { template = "INV-$order.id" } +summary = { template = "Deal worth $deal.amount with $deal.customer_name on {{ current_date }}" } +``` + +**4. Nested Construct** + +For nested structures, use a TOML subsection: + +```plx +[pipe.compose_invoice.construct] +invoice_number = { template = "INV-$order.id" } +total = { from = "order.total_amount" } + +[pipe.compose_invoice.construct.billing_address] +street = { from = "customer.address.street" } +city = { from = "customer.address.city" } +country = "France" +``` + +##### Complete Construct Example + +```plx +domain = "invoicing" + +[concept.Address] +description = "A postal address" + +[concept.Address.structure] +street = { type = "text", description = "Street address" } +city = { type = "text", description = "City name" } +country = { type = "text", description = "Country name" } + +[concept.Invoice] +description = "An invoice document" + +[concept.Invoice.structure] +invoice_number = { type = "text", description = "Invoice number" } +total = { type = "number", description = "Total amount" } + +[pipe.compose_invoice] +type = "PipeCompose" +description = "Compose an invoice from order and customer data" +inputs = { order = "Order", customer = "Customer" } +output = "Invoice" + +[pipe.compose_invoice.construct] +invoice_number = { template = "INV-$order.id" } +total = { from = "order.total_amount" 
} + +[pipe.compose_invoice.construct.billing_address] +street = { from = "customer.address.street" } +city = { from = "customer.address.city" } +country = "France" +``` + +##### Key Parameters (Construct Mode) + +- `construct`: Dictionary mapping field names to their composition rules +- Each field can be: + - A literal value (string, number, boolean) + - A dict with `from` key for variable reference + - A dict with `template` key for template rendering + - A nested dict for nested structures + +**Note:** You must use either `template` or `construct`, not both. They are mutually exclusive. + ### PipeImgGen operator The PipeImgGen operator is used to generate images using AI image generation models. @@ -820,7 +1002,7 @@ Presets are meant to record the choice of an llm with its hyper parameters (temp Examples: ```toml -llm_for_complex_reasoning = { model = "base-claude", temperature = 1 } +llm_to_engineer = { model = "base-claude", temperature = 1 } llm_to_extract_invoice = { model = "claude-3-7-sonnet", temperature = 0.1, max_tokens = "auto" } ``` @@ -851,7 +1033,7 @@ You can override the predefined llm presets by setting them in `.pipelex/inferen ALWAYS RUN validation when you are finished writing pipelines: This checks for errors. If there are errors, iterate until it works. - For a specific bundle/file: `pipelex validate path_to_file.plx` -- For all pipelines: `pipelex validate all` +- For all pipelines: `pipelex validate --all` - Remember: Ensure your Python virtual environment is activated (typically `.venv` for standard installations) before running `pipelex` commands. Then, create an example file to run the pipeline in the `examples` folder. @@ -859,21 +1041,6 @@ But don't write documentation unless asked explicitly to. ## Guide to execute a pipeline and write example code -### Prerequisites: Virtual Environment - -**CRITICAL**: Before running any `pipelex` commands or `pytest`, you MUST activate the appropriate Python virtual environment. 
Without proper venv activation, these commands will not work. - -For standard installations, the virtual environment is named `.venv`. Always check this first: - -```bash -## Activate the virtual environment (standard installation) -source .venv/bin/activate # On macOS/Linux -## or -.venv\Scripts\activate # On Windows -``` - -If your installation uses a different venv name or location, activate that one instead. All subsequent `pipelex` and `pytest` commands assume the venv is active. - ### Example to execute a pipeline with text output ```python @@ -967,13 +1134,13 @@ So here are a few concrete examples of calls to execute_pipeline with various wa }, ) -## Here we have a single input and it's a PDF. -## Because PDFContent is a native concept, we can use it directly as a value, +## Here we have a single input and it's a document. +## Because DocumentContent is a native concept, we can use it directly as a value, ## the system knows what content it corresponds to: pipe_output = await execute_pipeline( pipe_code="power_extractor_dpe", inputs={ - "document": PDFContent(url=pdf_url), + "document": DocumentContent(url=pdf_url), }, ) @@ -1096,82 +1263,4 @@ result_list = pipe_output.main_stuff_as_items(item_type=GanttChart) ``` --- - -## Rules to choose LLM models used in PipeLLMs. - -### LLM Configuration System - -In order to use it in a pipe, an LLM is referenced by its llm_handle (alias) and possibly by an llm_preset. -LLM configurations are managed through the new inference backend system with files located in `.pipelex/inference/`: - -- **Model Deck**: `.pipelex/inference/deck/base_deck.toml` and `.pipelex/inference/deck/overrides.toml` -- **Backends**: `.pipelex/inference/backends.toml` and `.pipelex/inference/backends/*.toml` -- **Routing**: `.pipelex/inference/routing_profiles.toml` - -### LLM Handles - -An llm_handle can be either: -1. 
**A direct model name** (like "gpt-4o-mini", "claude-3-sonnet") - automatically available for all models loaded by the inference backend system -2. **An alias** - user-defined shortcuts that map to model names, defined in the `[aliases]` section: - -```toml -[aliases] -base-claude = "claude-4.5-sonnet" -base-gpt = "gpt-5" -base-gemini = "gemini-2.5-flash" -base-mistral = "mistral-medium" -``` - -The system first looks for direct model names, then checks aliases if no direct match is found. The system handles model routing through backends automatically. - -### Using an LLM Handle in a PipeLLM - -Here is an example of using an llm_handle to specify which LLM to use in a PipeLLM: - -```plx -[pipe.hello_world] -type = "PipeLLM" -description = "Write text about Hello World." -output = "Text" -model = { model = "gpt-5", temperature = 0.9 } -prompt = """ -Write a haiku about Hello World. -""" -``` - -As you can see, to use the LLM, you must also indicate the temperature (float between 0 and 1) and max_tokens (either an int or the string "auto"). - -### LLM Presets - -Presets are meant to record the choice of an llm with its hyper parameters (temperature and max_tokens) if it's good for a particular task. LLM Presets are skill-oriented. - -Examples: -```toml -llm_for_complex_reasoning = { model = "base-claude", temperature = 1 } -llm_to_extract_invoice = { model = "claude-3-7-sonnet", temperature = 0.1, max_tokens = "auto" } -``` - -The interest is that these presets can be used to set the LLM choice in a PipeLLM, like this: - -```plx -[pipe.extract_invoice] -type = "PipeLLM" -description = "Extract invoice information from an invoice text transcript" -inputs = { invoice_text = "InvoiceText" } -output = "Invoice" -model = "llm_to_extract_invoice" -prompt = """ -Extract invoice information from this invoice: - -The category of this invoice is: $invoice_details.category. 
- -@invoice_text -""" -``` - -The setting here `model = "llm_to_extract_invoice"` works because "llm_to_extract_invoice" has been declared as an llm_preset in the deck. -You must not use an LLM preset in a PipeLLM that does not exist in the deck. If needed, you can add llm presets. - - -You can override the predefined llm presets by setting them in `.pipelex/inference/deck/overrides.toml`. diff --git a/.gitignore b/.gitignore index 3dd0d7c..a4c50c3 100644 --- a/.gitignore +++ b/.gitignore @@ -7,7 +7,6 @@ __pycache__/ # Distribution / packaging dist/ -build/ *.egg-info/ # mkdocs diff --git a/.windsurfrules.md b/.windsurfrules.md index 9f0a612..792cfa5 100644 --- a/.windsurfrules.md +++ b/.windsurfrules.md @@ -1,12 +1,12 @@ -# Pipelex Rules +# Pipelex Coding Rules ## Guide to write or edit pipelines using the Pipelex language in .plx files - Always first write your "plan" in natural language, then transcribe it in pipelex. - You should ALWAYS RUN validation when you are writing or editing a `.plx` file. It will ensure the pipe is runnable. If not, iterate. - For a specific file: `pipelex validate path_to_file.plx` - - For all pipelines: `pipelex validate all` + - For all pipelines: `pipelex validate --all` - **IMPORTANT**: Ensure the Python virtual environment is activated before running `pipelex` commands. For standard installations, the venv is named `.venv` - always check that first. The commands will not work without proper venv activation. - Please use POSIX standard for files. (empty lines, no trailing whitespaces, etc.) @@ -23,10 +23,10 @@ A pipeline file has three main sections: #### Domain Statement ```plx -domain = "domain_name" +domain = "domain_code" description = "Description of the domain" # Optional ``` -Note: The domain name usually matches the plx filename for single-file domains. For multi-file domains, use the subdirectory name. +Note: The domain code usually matches the plx filename for single-file domains. 
For multi-file domains, use the subdirectory name. #### Concept Definitions @@ -41,20 +41,28 @@ ConceptName = "Description of the concept" - Use PascalCase for concept names - Never use plurals (no "Stories", use "Story") - lists are handled implicitly by Pipelex - Avoid circumstantial adjectives (no "LargeText", use "Text") - focus on the essence of what the concept represents -- Don't redefine native concepts (Text, Image, PDF, TextAndImages, Number, Page) +- Don't redefine native concepts (Text, Image, PDF, TextAndImages, Number, Page, JSON) **Native Concepts:** -Pipelex provides built-in native concepts: `Text`, `Image`, `PDF`, `TextAndImages`, `Number`, `Page`. Use these directly or refine them when appropriate. +Pipelex provides built-in native concepts: `Text`, `Image`, `PDF`, `TextAndImages`, `Number`, `Page`, `JSON`. Use these directly or refine them when appropriate. -**Refining Native Concepts:** -To create a concept that specializes a native concept without adding fields: +**Refining Concepts:** +To create a concept that specializes another concept without adding fields, use `refines`: ```plx +## Refining a native concept [concept.Landscape] description = "A scenic outdoor photograph" refines = "Image" + +## Refining a custom concept (must be in domain.ConceptCode format) +[concept.PremiumCustomer] +description = "A premium customer with special benefits" +refines = "myapp.Customer" ``` +Note: When refining a custom (non-native) concept, you must use the fully qualified concept ref in `domain.ConceptCode` format. Pipelex automatically handles the dependency order to ensure referenced concepts are loaded first. + For details on how to structure concepts with fields, see the "Structuring Models" section below. 
#### Pipe Definitions @@ -62,7 +70,7 @@ For details on how to structure concepts with fields, see the "Structuring Model ### Pipe Base Definition ```plx -[pipe.your_pipe_name] +[pipe.your_pipe_code] type = "PipeLLM" description = "A description of what your pipe does" inputs = { input_1 = "ConceptName1", input_2 = "ConceptName2" } @@ -72,7 +80,7 @@ output = "ConceptName" The pipes will all have at least this base definition. - `inputs`: Dictionary whose keys are the variable names used in the prompts and whose values are the corresponding ConceptNames. It should ALSO LIST THE INPUTS OF THE INTERMEDIATE STEPS (if PipeSequence) or of the conditional pipes (if PipeCondition). So if you have this error: -`StaticValidationError: missing_input_variable • domain='expense_validator' • pipe='validate_expense' • +`PipeValidationError: missing_input_variable • domain='expense_validator' • pipe='validate_expense' • variable='['invoice']'` That means that the pipe validate_expense is missing the input `invoice` because one of its sub-pipes needs it. 
@@ -127,16 +135,16 @@ For concepts with structured fields, define them inline using TOML syntax: description = "A commercial document issued by a seller to a buyer" [concept.Invoice.structure] -invoice_number = "The unique invoice identifier" +invoice_number = "The unique invoice identifier" # This will be optional by default issue_date = { type = "date", description = "The date the invoice was issued", required = true } total_amount = { type = "number", description = "The total invoice amount", required = true } -vendor_name = "The name of the vendor" -line_items = { type = "list", item_type = "text", description = "List of items", required = false } +vendor_name = "The name of the vendor" # This will be optional by default +line_items = { type = "list", item_type = "text", description = "List of items" } ``` -**Supported inline field types:** `text`, `integer`, `boolean`, `number`, `date`, `list`, `dict` +**Supported inline field types:** `text`, `integer`, `boolean`, `number`, `date`, `list`, `dict`, `concept` -**Field properties:** `type`, `description`, `required` (default: true), `default_value`, `choices`, `item_type` (for lists), `key_type` and `value_type` (for dicts) +**Field properties:** `type`, `description`, `required` (default: false), `default_value`, `choices`, `item_type` (for lists), `key_type` and `value_type` (for dicts), `concept_ref` (for concept references), `item_concept_ref` (for lists of concepts) **Simple syntax** (creates required text field): ```plx @@ -145,9 +153,46 @@ field_name = "Field description" **Detailed syntax** (with explicit properties): ```plx -field_name = { type = "text", description = "Field description", required = false, default_value = "default" } +field_name = { type = "text", description = "Field description", default_value = "default" } +``` + +**Concept reference syntax** (referencing another concept): +```plx +## Single concept reference +customer = { type = "concept", concept_ref = "myapp.Customer", description 
= "The customer" } + +## List of concepts +line_items = { type = "list", item_type = "concept", item_concept_ref = "myapp.LineItem", description = "Line items" } ``` +Example with concept references: +```plx +[concept.Customer] +description = "A customer entity" + +[concept.Customer.structure] +name = { type = "text", description = "Customer name" } +email = { type = "text", description = "Customer email" } + +[concept.LineItem] +description = "A line item in an invoice" + +[concept.LineItem.structure] +product = { type = "text", description = "Product name" } +quantity = { type = "integer", description = "Quantity ordered" } +unit_price = { type = "number", description = "Price per unit" } + +[concept.Invoice] +description = "An invoice document" + +[concept.Invoice.structure] +customer = { type = "concept", concept_ref = "myapp.Customer", description = "The customer" } +items = { type = "list", item_type = "concept", item_concept_ref = "myapp.LineItem", description = "Line items" } +total = { type = "number", description = "Invoice total" } +``` + +Note: Pipelex automatically determines the correct loading order for concepts based on their dependencies (topological sort), so concepts can reference each other across domains as long as there are no circular dependencies. + **3. Python StructuredContent Class (For Advanced Features)** Create a Python class when you need: @@ -200,12 +245,14 @@ class Invoice(StructuredContent): #### Inline Structure Limitations Inline structures: -- ✅ Support all common field types (text, number, date, list, dict, etc.) +- ✅ Support all common field types (text, number, date, list, dict, concept, etc.) 
- ✅ Support required/optional fields, defaults, choices +- ✅ Support concept-to-concept references (type = "concept" with concept_ref) +- ✅ Support lists of concepts (type = "list" with item_type = "concept") +- ✅ Support refining both native and custom concepts - ✅ Generate full Pydantic models with validation - ❌ Cannot have custom validators or complex validation logic - ❌ Cannot have computed properties or custom methods -- ❌ Cannot refine custom (non-native) concepts - ❌ Limited IDE autocomplete compared to explicit Python classes @@ -471,7 +518,7 @@ The PipeExtract operator is used to extract text and images from an image or a P [pipe.extract_info] type = "PipeExtract" description = "extract the information" -inputs = { document = "PDF" } # or { image = "Image" } if it's an image. This is the only input. +inputs = { document = "Document" } # or { image = "Image" } if it's an image. This is the only input. output = "Page" ``` @@ -480,7 +527,7 @@ Using Extract Model Settings: [pipe.extract_with_model] type = "PipeExtract" description = "Extract with specific model" -inputs = { document = "PDF" } +inputs = { document = "Document" } output = "Page" model = "base_extract_mistral" # Use predefined extract preset or model alias ``` @@ -588,15 +635,16 @@ $sales_rep.phone | $sales_rep.email """ ``` -#### Key Parameters +#### Key Parameters (Template Mode) -- `template`: Inline template string (mutually exclusive with template_name) +- `template`: Inline template string (mutually exclusive with template_name and construct) - `template_name`: Name of a predefined template (mutually exclusive with template) - `template_category`: Template type ("llm_prompt", "html", "markdown", "mermaid", etc.) 
- `templating_style`: Styling options for template rendering - `extra_context`: Additional context variables for template For more control, you can use a nested `template` section instead of the `template` field: + - `template.template`: The template string - `template.category`: Template type - `template.templating_style`: Styling options @@ -604,9 +652,144 @@ For more control, you can use a nested `template` section instead of the `templa #### Template Variables Use the same variable insertion rules as PipeLLM: + - `@variable` for block insertion (multi-line content) - `$variable` for inline insertion (short text) +#### Construct Mode (for StructuredContent Output) + +PipeCompose can also generate `StructuredContent` objects using the `construct` section. This mode composes field values from fixed values, variable references, templates, or nested structures. + +**When to use construct mode:** + +- You need to output a structured object (not just Text) +- You want to deterministically compose fields from existing data +- No LLM is needed - just data composition and templating + +##### Basic Construct Usage + +```plx +[concept.SalesSummary] +description = "A structured sales summary" + +[concept.SalesSummary.structure] +report_title = { type = "text", description = "Title of the report" } +customer_name = { type = "text", description = "Customer name" } +deal_value = { type = "number", description = "Deal value" } +summary_text = { type = "text", description = "Generated summary text" } + +[pipe.compose_summary] +type = "PipeCompose" +description = "Compose a sales summary from deal data" +inputs = { deal = "Deal" } +output = "SalesSummary" + +[pipe.compose_summary.construct] +report_title = "Monthly Sales Report" +customer_name = { from = "deal.customer_name" } +deal_value = { from = "deal.amount" } +summary_text = { template = "Deal worth $deal.amount with $deal.customer_name" } +``` + +##### Field Composition Methods + +There are four ways to define field values
in a construct: + +**1. Fixed Value (literal)** + +Use a literal value directly: + +```plx +[pipe.compose_report.construct] +report_title = "Annual Report" +report_year = 2024 +is_draft = false +``` + +**2. Variable Reference (`from`)** + +Get a value from working memory using a dotted path: + +```plx +[pipe.compose_report.construct] +customer_name = { from = "deal.customer_name" } +total_amount = { from = "order.total" } +street_address = { from = "customer.address.street" } +``` + +**3. Template (`template`)** + +Render a Jinja2 template with variable substitution: + +```plx +[pipe.compose_report.construct] +invoice_number = { template = "INV-$order.id" } +summary = { template = "Deal worth $deal.amount with $deal.customer_name on {{ current_date }}" } +``` + +**4. Nested Construct** + +For nested structures, use a TOML subsection: + +```plx +[pipe.compose_invoice.construct] +invoice_number = { template = "INV-$order.id" } +total = { from = "order.total_amount" } + +[pipe.compose_invoice.construct.billing_address] +street = { from = "customer.address.street" } +city = { from = "customer.address.city" } +country = "France" +``` + +##### Complete Construct Example + +```plx +domain = "invoicing" + +[concept.Address] +description = "A postal address" + +[concept.Address.structure] +street = { type = "text", description = "Street address" } +city = { type = "text", description = "City name" } +country = { type = "text", description = "Country name" } + +[concept.Invoice] +description = "An invoice document" + +[concept.Invoice.structure] +invoice_number = { type = "text", description = "Invoice number" } +total = { type = "number", description = "Total amount" } +billing_address = { type = "concept", concept_ref = "invoicing.Address", description = "Billing address" } + +[pipe.compose_invoice] +type = "PipeCompose" +description = "Compose an invoice from order and customer data" +inputs = { order = "Order", customer = "Customer" } +output = "Invoice" + +[pipe.compose_invoice.construct] +invoice_number = { template = "INV-$order.id" } +total = { from = "order.total_amount"
} + +[pipe.compose_invoice.construct.billing_address] +street = { from = "customer.address.street" } +city = { from = "customer.address.city" } +country = "France" +``` + +##### Key Parameters (Construct Mode) + +- `construct`: Dictionary mapping field names to their composition rules +- Each field can be: + - A literal value (string, number, boolean) + - A dict with `from` key for variable reference + - A dict with `template` key for template rendering + - A nested dict for nested structures + +**Note:** You must use either `template` or `construct`, not both. They are mutually exclusive. + ### PipeImgGen operator The PipeImgGen operator is used to generate images using AI image generation models. @@ -820,7 +1002,7 @@ Presets are meant to record the choice of an llm with its hyper parameters (temp Examples: ```toml -llm_for_complex_reasoning = { model = "base-claude", temperature = 1 } +llm_to_engineer = { model = "base-claude", temperature = 1 } llm_to_extract_invoice = { model = "claude-3-7-sonnet", temperature = 0.1, max_tokens = "auto" } ``` @@ -851,7 +1033,7 @@ You can override the predefined llm presets by setting them in `.pipelex/inferen ALWAYS RUN validation when you are finished writing pipelines: This checks for errors. If there are errors, iterate until it works. - For a specific bundle/file: `pipelex validate path_to_file.plx` -- For all pipelines: `pipelex validate all` +- For all pipelines: `pipelex validate --all` - Remember: Ensure your Python virtual environment is activated (typically `.venv` for standard installations) before running `pipelex` commands. Then, create an example file to run the pipeline in the `examples` folder. @@ -859,21 +1041,6 @@ But don't write documentation unless asked explicitly to. ## Guide to execute a pipeline and write example code -### Prerequisites: Virtual Environment - -**CRITICAL**: Before running any `pipelex` commands or `pytest`, you MUST activate the appropriate Python virtual environment. 
Without proper venv activation, these commands will not work. - -For standard installations, the virtual environment is named `.venv`. Always check this first: - -```bash -## Activate the virtual environment (standard installation) -source .venv/bin/activate # On macOS/Linux -## or -.venv\Scripts\activate # On Windows -``` - -If your installation uses a different venv name or location, activate that one instead. All subsequent `pipelex` and `pytest` commands assume the venv is active. - ### Example to execute a pipeline with text output ```python @@ -967,13 +1134,13 @@ So here are a few concrete examples of calls to execute_pipeline with various wa }, ) -## Here we have a single input and it's a PDF. -## Because PDFContent is a native concept, we can use it directly as a value, +## Here we have a single input and it's a document. +## Because DocumentContent is a native concept, we can use it directly as a value, ## the system knows what content it corresponds to: pipe_output = await execute_pipeline( pipe_code="power_extractor_dpe", inputs={ - "document": PDFContent(url=pdf_url), + "document": DocumentContent(url=pdf_url), }, ) @@ -1096,82 +1263,4 @@ result_list = pipe_output.main_stuff_as_items(item_type=GanttChart) ``` --- - -## Rules to choose LLM models used in PipeLLMs. - -### LLM Configuration System - -In order to use it in a pipe, an LLM is referenced by its llm_handle (alias) and possibly by an llm_preset. -LLM configurations are managed through the new inference backend system with files located in `.pipelex/inference/`: - -- **Model Deck**: `.pipelex/inference/deck/base_deck.toml` and `.pipelex/inference/deck/overrides.toml` -- **Backends**: `.pipelex/inference/backends.toml` and `.pipelex/inference/backends/*.toml` -- **Routing**: `.pipelex/inference/routing_profiles.toml` - -### LLM Handles - -An llm_handle can be either: -1. 
**A direct model name** (like "gpt-4o-mini", "claude-3-sonnet") - automatically available for all models loaded by the inference backend system -2. **An alias** - user-defined shortcuts that map to model names, defined in the `[aliases]` section: - -```toml -[aliases] -base-claude = "claude-4.5-sonnet" -base-gpt = "gpt-5" -base-gemini = "gemini-2.5-flash" -base-mistral = "mistral-medium" -``` - -The system first looks for direct model names, then checks aliases if no direct match is found. The system handles model routing through backends automatically. - -### Using an LLM Handle in a PipeLLM - -Here is an example of using an llm_handle to specify which LLM to use in a PipeLLM: - -```plx -[pipe.hello_world] -type = "PipeLLM" -description = "Write text about Hello World." -output = "Text" -model = { model = "gpt-5", temperature = 0.9 } -prompt = """ -Write a haiku about Hello World. -""" -``` - -As you can see, to use the LLM, you must also indicate the temperature (float between 0 and 1) and max_tokens (either an int or the string "auto"). - -### LLM Presets - -Presets are meant to record the choice of an llm with its hyper parameters (temperature and max_tokens) if it's good for a particular task. LLM Presets are skill-oriented. - -Examples: -```toml -llm_for_complex_reasoning = { model = "base-claude", temperature = 1 } -llm_to_extract_invoice = { model = "claude-3-7-sonnet", temperature = 0.1, max_tokens = "auto" } -``` - -The interest is that these presets can be used to set the LLM choice in a PipeLLM, like this: - -```plx -[pipe.extract_invoice] -type = "PipeLLM" -description = "Extract invoice information from an invoice text transcript" -inputs = { invoice_text = "InvoiceText" } -output = "Invoice" -model = "llm_to_extract_invoice" -prompt = """ -Extract invoice information from this invoice: - -The category of this invoice is: $invoice_details.category. 
- -@invoice_text -""" -``` - -The setting here `model = "llm_to_extract_invoice"` works because "llm_to_extract_invoice" has been declared as an llm_preset in the deck. -You must not use an LLM preset in a PipeLLM that does not exist in the deck. If needed, you can add llm presets. - - -You can override the predefined llm presets by setting them in `.pipelex/inference/deck/overrides.toml`. diff --git a/AGENTS.md b/AGENTS.md index 9f0a612..792cfa5 100644 --- a/AGENTS.md +++ b/AGENTS.md @@ -1,12 +1,12 @@ -# Pipelex Rules +# Pipelex Coding Rules ## Guide to write or edit pipelines using the Pipelex language in .plx files - Always first write your "plan" in natural language, then transcribe it in pipelex. - You should ALWAYS RUN validation when you are writing or editing a `.plx` file. It will ensure the pipe is runnable. If not, iterate. - For a specific file: `pipelex validate path_to_file.plx` - - For all pipelines: `pipelex validate all` + - For all pipelines: `pipelex validate --all` - **IMPORTANT**: Ensure the Python virtual environment is activated before running `pipelex` commands. For standard installations, the venv is named `.venv` - always check that first. The commands will not work without proper venv activation. - Please use POSIX standard for files. (empty lines, no trailing whitespaces, etc.) @@ -23,10 +23,10 @@ A pipeline file has three main sections: #### Domain Statement ```plx -domain = "domain_name" +domain = "domain_code" description = "Description of the domain" # Optional ``` -Note: The domain name usually matches the plx filename for single-file domains. For multi-file domains, use the subdirectory name. +Note: The domain code usually matches the plx filename for single-file domains. For multi-file domains, use the subdirectory name. 
#### Concept Definitions @@ -41,20 +41,28 @@ ConceptName = "Description of the concept" - Use PascalCase for concept names - Never use plurals (no "Stories", use "Story") - lists are handled implicitly by Pipelex - Avoid circumstantial adjectives (no "LargeText", use "Text") - focus on the essence of what the concept represents -- Don't redefine native concepts (Text, Image, PDF, TextAndImages, Number, Page) +- Don't redefine native concepts (Text, Image, PDF, TextAndImages, Number, Page, JSON) **Native Concepts:** -Pipelex provides built-in native concepts: `Text`, `Image`, `PDF`, `TextAndImages`, `Number`, `Page`. Use these directly or refine them when appropriate. +Pipelex provides built-in native concepts: `Text`, `Image`, `PDF`, `TextAndImages`, `Number`, `Page`, `JSON`. Use these directly or refine them when appropriate. -**Refining Native Concepts:** -To create a concept that specializes a native concept without adding fields: +**Refining Concepts:** +To create a concept that specializes another concept without adding fields, use `refines`: ```plx +## Refining a native concept [concept.Landscape] description = "A scenic outdoor photograph" refines = "Image" + +## Refining a custom concept (must be in domain.ConceptCode format) +[concept.PremiumCustomer] +description = "A premium customer with special benefits" +refines = "myapp.Customer" ``` +Note: When refining a custom (non-native) concept, you must use the fully qualified concept ref in `domain.ConceptCode` format. Pipelex automatically handles the dependency order to ensure referenced concepts are loaded first. + For details on how to structure concepts with fields, see the "Structuring Models" section below. 
#### Pipe Definitions @@ -62,7 +70,7 @@ For details on how to structure concepts with fields, see the "Structuring Model ### Pipe Base Definition ```plx -[pipe.your_pipe_name] +[pipe.your_pipe_code] type = "PipeLLM" description = "A description of what your pipe does" inputs = { input_1 = "ConceptName1", input_2 = "ConceptName2" } @@ -72,7 +80,7 @@ output = "ConceptName" The pipes will all have at least this base definition. - `inputs`: Dictionary of key being the variable used in the prompts, and the value being the ConceptName. It should ALSO LIST THE INPUTS OF THE INTERMEDIATE STEPS (if PipeSequence) or of the conditional pipes (if PipeCondition). So If you have this error: -`StaticValidationError: missing_input_variable • domain='expense_validator' • pipe='validate_expense' • +`PipeValidationError: missing_input_variable • domain='expense_validator' • pipe='validate_expense' • variable='['invoice']'`` That means that the pipe validate_expense is missing the input `invoice` because one of the subpipe is needing it. 
@@ -127,16 +135,16 @@ For concepts with structured fields, define them inline using TOML syntax: description = "A commercial document issued by a seller to a buyer" [concept.Invoice.structure] -invoice_number = "The unique invoice identifier" +invoice_number = "The unique invoice identifier" # This will be optional by default issue_date = { type = "date", description = "The date the invoice was issued", required = true } total_amount = { type = "number", description = "The total invoice amount", required = true } -vendor_name = "The name of the vendor" -line_items = { type = "list", item_type = "text", description = "List of items", required = false } +vendor_name = "The name of the vendor" # This will be optional by default +line_items = { type = "list", item_type = "text", description = "List of items" } ``` -**Supported inline field types:** `text`, `integer`, `boolean`, `number`, `date`, `list`, `dict` +**Supported inline field types:** `text`, `integer`, `boolean`, `number`, `date`, `list`, `dict`, `concept` -**Field properties:** `type`, `description`, `required` (default: true), `default_value`, `choices`, `item_type` (for lists), `key_type` and `value_type` (for dicts) +**Field properties:** `type`, `description`, `required` (default: false), `default_value`, `choices`, `item_type` (for lists), `key_type` and `value_type` (for dicts), `concept_ref` (for concept references), `item_concept_ref` (for lists of concepts) **Simple syntax** (creates required text field): ```plx @@ -145,9 +153,46 @@ field_name = "Field description" **Detailed syntax** (with explicit properties): ```plx -field_name = { type = "text", description = "Field description", required = false, default_value = "default" } +field_name = { type = "text", description = "Field description", default_value = "default" } +``` + +**Concept reference syntax** (referencing another concept): +```plx +## Single concept reference +customer = { type = "concept", concept_ref = "myapp.Customer", description 
= "The customer" } + +## List of concepts +line_items = { type = "list", item_type = "concept", item_concept_ref = "myapp.LineItem", description = "Line items" } ``` +Example with concept references: +```plx +[concept.Customer] +description = "A customer entity" + +[concept.Customer.structure] +name = { type = "text", description = "Customer name" } +email = { type = "text", description = "Customer email" } + +[concept.LineItem] +description = "A line item in an invoice" + +[concept.LineItem.structure] +product = { type = "text", description = "Product name" } +quantity = { type = "integer", description = "Quantity ordered" } +unit_price = { type = "number", description = "Price per unit" } + +[concept.Invoice] +description = "An invoice document" + +[concept.Invoice.structure] +customer = { type = "concept", concept_ref = "myapp.Customer", description = "The customer" } +items = { type = "list", item_type = "concept", item_concept_ref = "myapp.LineItem", description = "Line items" } +total = { type = "number", description = "Invoice total" } +``` + +Note: Pipelex automatically determines the correct loading order for concepts based on their dependencies (topological sort), so concepts can reference each other across domains as long as there are no circular dependencies. + **3. Python StructuredContent Class (For Advanced Features)** Create a Python class when you need: @@ -200,12 +245,14 @@ class Invoice(StructuredContent): #### Inline Structure Limitations Inline structures: -- ✅ Support all common field types (text, number, date, list, dict, etc.) +- ✅ Support all common field types (text, number, date, list, dict, concept, etc.) 
- ✅ Support required/optional fields, defaults, choices +- ✅ Support concept-to-concept references (type = "concept" with concept_ref) +- ✅ Support lists of concepts (type = "list" with item_type = "concept") +- ✅ Support refining both native and custom concepts - ✅ Generate full Pydantic models with validation - ❌ Cannot have custom validators or complex validation logic - ❌ Cannot have computed properties or custom methods -- ❌ Cannot refine custom (non-native) concepts - ❌ Limited IDE autocomplete compared to explicit Python classes @@ -471,7 +518,7 @@ The PipeExtract operator is used to extract text and images from an image or a P [pipe.extract_info] type = "PipeExtract" description = "extract the information" -inputs = { document = "PDF" } # or { image = "Image" } if it's an image. This is the only input. +inputs = { document = "Document" } # or { image = "Image" } if it's an image. This is the only input. output = "Page" ``` @@ -480,7 +527,7 @@ Using Extract Model Settings: [pipe.extract_with_model] type = "PipeExtract" description = "Extract with specific model" -inputs = { document = "PDF" } +inputs = { document = "Document" } output = "Page" model = "base_extract_mistral" # Use predefined extract preset or model alias ``` @@ -588,15 +635,16 @@ $sales_rep.phone | $sales_rep.email """ ``` -#### Key Parameters +#### Key Parameters (Template Mode) -- `template`: Inline template string (mutually exclusive with template_name) +- `template`: Inline template string (mutually exclusive with template_name and construct) - `template_name`: Name of a predefined template (mutually exclusive with template) - `template_category`: Template type ("llm_prompt", "html", "markdown", "mermaid", etc.) 
- `templating_style`: Styling options for template rendering - `extra_context`: Additional context variables for template For more control, you can use a nested `template` section instead of the `template` field: + - `template.template`: The template string - `template.category`: Template type - `template.templating_style`: Styling options @@ -604,9 +652,144 @@ For more control, you can use a nested `template` section instead of the `templa #### Template Variables Use the same variable insertion rules as PipeLLM: + - `@variable` for block insertion (multi-line content) - `$variable` for inline insertion (short text) +#### Construct Mode (for StructuredContent Output) + +PipeCompose can also generate `StructuredContent` objects using the `construct` section. This mode composes field values from fixed values, variable references, templates, or nested structures. + +**When to use construct mode:** + +- You need to output a structured object (not just Text) +- You want to deterministically compose fields from existing data +- No LLM is needed - just data composition and templating + +##### Basic Construct Usage + +```plx +[concept.SalesSummary] +description = "A structured sales summary" + +[concept.SalesSummary.structure] +report_title = { type = "text", description = "Title of the report" } +customer_name = { type = "text", description = "Customer name" } +deal_value = { type = "number", description = "Deal value" } +summary_text = { type = "text", description = "Generated summary text" } + +[pipe.compose_summary] +type = "PipeCompose" +description = "Compose a sales summary from deal data" +inputs = { deal = "Deal" } +output = "SalesSummary" + +[pipe.compose_summary.construct] +report_title = "Monthly Sales Report" +customer_name = { from = "deal.customer_name" } +deal_value = { from = "deal.amount" } +summary_text = { template = "Deal worth $deal.amount with $deal.customer_name" } +``` + +##### Field Composition Methods + +There are four ways to define field values
in a construct: + +**1. Fixed Value (literal)** + +Use a literal value directly: + +```plx +[pipe.compose_report.construct] +report_title = "Annual Report" +report_year = 2024 +is_draft = false +``` + +**2. Variable Reference (`from`)** + +Get a value from working memory using a dotted path: + +```plx +[pipe.compose_report.construct] +customer_name = { from = "deal.customer_name" } +total_amount = { from = "order.total" } +street_address = { from = "customer.address.street" } +``` + +**3. Template (`template`)** + +Render a Jinja2 template with variable substitution: + +```plx +[pipe.compose_report.construct] +invoice_number = { template = "INV-$order.id" } +summary = { template = "Deal worth $deal.amount with $deal.customer_name on {{ current_date }}" } +``` + +**4. Nested Construct** + +For nested structures, use a TOML subsection: + +```plx +[pipe.compose_invoice.construct] +invoice_number = { template = "INV-$order.id" } +total = { from = "order.total_amount" } + +[pipe.compose_invoice.construct.billing_address] +street = { from = "customer.address.street" } +city = { from = "customer.address.city" } +country = "France" +``` + +##### Complete Construct Example + +```plx +domain = "invoicing" + +[concept.Address] +description = "A postal address" + +[concept.Address.structure] +street = { type = "text", description = "Street address" } +city = { type = "text", description = "City name" } +country = { type = "text", description = "Country name" } + +[concept.Invoice] +description = "An invoice document" + +[concept.Invoice.structure] +invoice_number = { type = "text", description = "Invoice number" } +total = { type = "number", description = "Total amount" } +billing_address = { type = "concept", concept_ref = "invoicing.Address", description = "Billing address" } + +[pipe.compose_invoice] +type = "PipeCompose" +description = "Compose an invoice from order and customer data" +inputs = { order = "Order", customer = "Customer" } +output = "Invoice" + +[pipe.compose_invoice.construct] +invoice_number = { template = "INV-$order.id" } +total = { from = "order.total_amount"
} + +[pipe.compose_invoice.construct.billing_address] +street = { from = "customer.address.street" } +city = { from = "customer.address.city" } +country = "France" +``` + +##### Key Parameters (Construct Mode) + +- `construct`: Dictionary mapping field names to their composition rules +- Each field can be: + - A literal value (string, number, boolean) + - A dict with `from` key for variable reference + - A dict with `template` key for template rendering + - A nested dict for nested structures + +**Note:** You must use either `template` or `construct`, not both. They are mutually exclusive. + ### PipeImgGen operator The PipeImgGen operator is used to generate images using AI image generation models. @@ -820,7 +1002,7 @@ Presets are meant to record the choice of an llm with its hyper parameters (temp Examples: ```toml -llm_for_complex_reasoning = { model = "base-claude", temperature = 1 } +llm_to_engineer = { model = "base-claude", temperature = 1 } llm_to_extract_invoice = { model = "claude-3-7-sonnet", temperature = 0.1, max_tokens = "auto" } ``` @@ -851,7 +1033,7 @@ You can override the predefined llm presets by setting them in `.pipelex/inferen ALWAYS RUN validation when you are finished writing pipelines: This checks for errors. If there are errors, iterate until it works. - For a specific bundle/file: `pipelex validate path_to_file.plx` -- For all pipelines: `pipelex validate all` +- For all pipelines: `pipelex validate --all` - Remember: Ensure your Python virtual environment is activated (typically `.venv` for standard installations) before running `pipelex` commands. Then, create an example file to run the pipeline in the `examples` folder. @@ -859,21 +1041,6 @@ But don't write documentation unless asked explicitly to. ## Guide to execute a pipeline and write example code -### Prerequisites: Virtual Environment - -**CRITICAL**: Before running any `pipelex` commands or `pytest`, you MUST activate the appropriate Python virtual environment. 
Without proper venv activation, these commands will not work. - -For standard installations, the virtual environment is named `.venv`. Always check this first: - -```bash -## Activate the virtual environment (standard installation) -source .venv/bin/activate # On macOS/Linux -## or -.venv\Scripts\activate # On Windows -``` - -If your installation uses a different venv name or location, activate that one instead. All subsequent `pipelex` and `pytest` commands assume the venv is active. - ### Example to execute a pipeline with text output ```python @@ -967,13 +1134,13 @@ So here are a few concrete examples of calls to execute_pipeline with various wa }, ) -## Here we have a single input and it's a PDF. -## Because PDFContent is a native concept, we can use it directly as a value, +## Here we have a single input and it's a document. +## Because DocumentContent is a native concept, we can use it directly as a value, ## the system knows what content it corresponds to: pipe_output = await execute_pipeline( pipe_code="power_extractor_dpe", inputs={ - "document": PDFContent(url=pdf_url), + "document": DocumentContent(url=pdf_url), }, ) @@ -1096,82 +1263,4 @@ result_list = pipe_output.main_stuff_as_items(item_type=GanttChart) ``` --- - -## Rules to choose LLM models used in PipeLLMs. - -### LLM Configuration System - -In order to use it in a pipe, an LLM is referenced by its llm_handle (alias) and possibly by an llm_preset. -LLM configurations are managed through the new inference backend system with files located in `.pipelex/inference/`: - -- **Model Deck**: `.pipelex/inference/deck/base_deck.toml` and `.pipelex/inference/deck/overrides.toml` -- **Backends**: `.pipelex/inference/backends.toml` and `.pipelex/inference/backends/*.toml` -- **Routing**: `.pipelex/inference/routing_profiles.toml` - -### LLM Handles - -An llm_handle can be either: -1. 
**A direct model name** (like "gpt-4o-mini", "claude-3-sonnet") - automatically available for all models loaded by the inference backend system -2. **An alias** - user-defined shortcuts that map to model names, defined in the `[aliases]` section: - -```toml -[aliases] -base-claude = "claude-4.5-sonnet" -base-gpt = "gpt-5" -base-gemini = "gemini-2.5-flash" -base-mistral = "mistral-medium" -``` - -The system first looks for direct model names, then checks aliases if no direct match is found. The system handles model routing through backends automatically. - -### Using an LLM Handle in a PipeLLM - -Here is an example of using an llm_handle to specify which LLM to use in a PipeLLM: - -```plx -[pipe.hello_world] -type = "PipeLLM" -description = "Write text about Hello World." -output = "Text" -model = { model = "gpt-5", temperature = 0.9 } -prompt = """ -Write a haiku about Hello World. -""" -``` - -As you can see, to use the LLM, you must also indicate the temperature (float between 0 and 1) and max_tokens (either an int or the string "auto"). - -### LLM Presets - -Presets are meant to record the choice of an llm with its hyper parameters (temperature and max_tokens) if it's good for a particular task. LLM Presets are skill-oriented. - -Examples: -```toml -llm_for_complex_reasoning = { model = "base-claude", temperature = 1 } -llm_to_extract_invoice = { model = "claude-3-7-sonnet", temperature = 0.1, max_tokens = "auto" } -``` - -The interest is that these presets can be used to set the LLM choice in a PipeLLM, like this: - -```plx -[pipe.extract_invoice] -type = "PipeLLM" -description = "Extract invoice information from an invoice text transcript" -inputs = { invoice_text = "InvoiceText" } -output = "Invoice" -model = "llm_to_extract_invoice" -prompt = """ -Extract invoice information from this invoice: - -The category of this invoice is: $invoice_details.category. 
- -@invoice_text -""" -``` - -The setting here `model = "llm_to_extract_invoice"` works because "llm_to_extract_invoice" has been declared as an llm_preset in the deck. -You must not use an LLM preset in a PipeLLM that does not exist in the deck. If needed, you can add llm presets. - - -You can override the predefined llm presets by setting them in `.pipelex/inference/deck/overrides.toml`. diff --git a/CLAUDE.md b/CLAUDE.md index 9f0a612..792cfa5 100644 --- a/CLAUDE.md +++ b/CLAUDE.md @@ -1,12 +1,12 @@ -# Pipelex Rules +# Pipelex Coding Rules ## Guide to write or edit pipelines using the Pipelex language in .plx files - Always first write your "plan" in natural language, then transcribe it in pipelex. - You should ALWAYS RUN validation when you are writing or editing a `.plx` file. It will ensure the pipe is runnable. If not, iterate. - For a specific file: `pipelex validate path_to_file.plx` - - For all pipelines: `pipelex validate all` + - For all pipelines: `pipelex validate --all` - **IMPORTANT**: Ensure the Python virtual environment is activated before running `pipelex` commands. For standard installations, the venv is named `.venv` - always check that first. The commands will not work without proper venv activation. - Please use POSIX standard for files. (empty lines, no trailing whitespaces, etc.) @@ -23,10 +23,10 @@ A pipeline file has three main sections: #### Domain Statement ```plx -domain = "domain_name" +domain = "domain_code" description = "Description of the domain" # Optional ``` -Note: The domain name usually matches the plx filename for single-file domains. For multi-file domains, use the subdirectory name. +Note: The domain code usually matches the plx filename for single-file domains. For multi-file domains, use the subdirectory name. 
#### Concept Definitions @@ -41,20 +41,28 @@ ConceptName = "Description of the concept" - Use PascalCase for concept names - Never use plurals (no "Stories", use "Story") - lists are handled implicitly by Pipelex - Avoid circumstantial adjectives (no "LargeText", use "Text") - focus on the essence of what the concept represents -- Don't redefine native concepts (Text, Image, PDF, TextAndImages, Number, Page) +- Don't redefine native concepts (Text, Image, PDF, TextAndImages, Number, Page, JSON) **Native Concepts:** -Pipelex provides built-in native concepts: `Text`, `Image`, `PDF`, `TextAndImages`, `Number`, `Page`. Use these directly or refine them when appropriate. +Pipelex provides built-in native concepts: `Text`, `Image`, `PDF`, `TextAndImages`, `Number`, `Page`, `JSON`. Use these directly or refine them when appropriate. -**Refining Native Concepts:** -To create a concept that specializes a native concept without adding fields: +**Refining Concepts:** +To create a concept that specializes another concept without adding fields, use `refines`: ```plx +## Refining a native concept [concept.Landscape] description = "A scenic outdoor photograph" refines = "Image" + +## Refining a custom concept (must be in domain.ConceptCode format) +[concept.PremiumCustomer] +description = "A premium customer with special benefits" +refines = "myapp.Customer" ``` +Note: When refining a custom (non-native) concept, you must use the fully qualified concept ref in `domain.ConceptCode` format. Pipelex automatically handles the dependency order to ensure referenced concepts are loaded first. + For details on how to structure concepts with fields, see the "Structuring Models" section below. 
#### Pipe Definitions @@ -62,7 +70,7 @@ For details on how to structure concepts with fields, see the "Structuring Model ### Pipe Base Definition ```plx -[pipe.your_pipe_name] +[pipe.your_pipe_code] type = "PipeLLM" description = "A description of what your pipe does" inputs = { input_1 = "ConceptName1", input_2 = "ConceptName2" } @@ -72,7 +80,7 @@ output = "ConceptName" The pipes will all have at least this base definition. - `inputs`: Dictionary of key being the variable used in the prompts, and the value being the ConceptName. It should ALSO LIST THE INPUTS OF THE INTERMEDIATE STEPS (if PipeSequence) or of the conditional pipes (if PipeCondition). So If you have this error: -`StaticValidationError: missing_input_variable • domain='expense_validator' • pipe='validate_expense' • +`PipeValidationError: missing_input_variable • domain='expense_validator' • pipe='validate_expense' • variable='['invoice']'`` That means that the pipe validate_expense is missing the input `invoice` because one of the subpipe is needing it. 
@@ -127,16 +135,16 @@ For concepts with structured fields, define them inline using TOML syntax: description = "A commercial document issued by a seller to a buyer" [concept.Invoice.structure] -invoice_number = "The unique invoice identifier" +invoice_number = "The unique invoice identifier" # This will be optional by default issue_date = { type = "date", description = "The date the invoice was issued", required = true } total_amount = { type = "number", description = "The total invoice amount", required = true } -vendor_name = "The name of the vendor" -line_items = { type = "list", item_type = "text", description = "List of items", required = false } +vendor_name = "The name of the vendor" # This will be optional by default +line_items = { type = "list", item_type = "text", description = "List of items" } ``` -**Supported inline field types:** `text`, `integer`, `boolean`, `number`, `date`, `list`, `dict` +**Supported inline field types:** `text`, `integer`, `boolean`, `number`, `date`, `list`, `dict`, `concept` -**Field properties:** `type`, `description`, `required` (default: true), `default_value`, `choices`, `item_type` (for lists), `key_type` and `value_type` (for dicts) +**Field properties:** `type`, `description`, `required` (default: false), `default_value`, `choices`, `item_type` (for lists), `key_type` and `value_type` (for dicts), `concept_ref` (for concept references), `item_concept_ref` (for lists of concepts) **Simple syntax** (creates required text field): ```plx @@ -145,9 +153,46 @@ field_name = "Field description" **Detailed syntax** (with explicit properties): ```plx -field_name = { type = "text", description = "Field description", required = false, default_value = "default" } +field_name = { type = "text", description = "Field description", default_value = "default" } +``` + +**Concept reference syntax** (referencing another concept): +```plx +## Single concept reference +customer = { type = "concept", concept_ref = "myapp.Customer", description 
= "The customer" } + +## List of concepts +line_items = { type = "list", item_type = "concept", item_concept_ref = "myapp.LineItem", description = "Line items" } ``` +Example with concept references: +```plx +[concept.Customer] +description = "A customer entity" + +[concept.Customer.structure] +name = { type = "text", description = "Customer name" } +email = { type = "text", description = "Customer email" } + +[concept.LineItem] +description = "A line item in an invoice" + +[concept.LineItem.structure] +product = { type = "text", description = "Product name" } +quantity = { type = "integer", description = "Quantity ordered" } +unit_price = { type = "number", description = "Price per unit" } + +[concept.Invoice] +description = "An invoice document" + +[concept.Invoice.structure] +customer = { type = "concept", concept_ref = "myapp.Customer", description = "The customer" } +items = { type = "list", item_type = "concept", item_concept_ref = "myapp.LineItem", description = "Line items" } +total = { type = "number", description = "Invoice total" } +``` + +Note: Pipelex automatically determines the correct loading order for concepts based on their dependencies (topological sort), so concepts can reference each other across domains as long as there are no circular dependencies. + **3. Python StructuredContent Class (For Advanced Features)** Create a Python class when you need: @@ -200,12 +245,14 @@ class Invoice(StructuredContent): #### Inline Structure Limitations Inline structures: -- ✅ Support all common field types (text, number, date, list, dict, etc.) +- ✅ Support all common field types (text, number, date, list, dict, concept, etc.) 
- ✅ Support required/optional fields, defaults, choices +- ✅ Support concept-to-concept references (type = "concept" with concept_ref) +- ✅ Support lists of concepts (type = "list" with item_type = "concept") +- ✅ Support refining both native and custom concepts - ✅ Generate full Pydantic models with validation - ❌ Cannot have custom validators or complex validation logic - ❌ Cannot have computed properties or custom methods -- ❌ Cannot refine custom (non-native) concepts - ❌ Limited IDE autocomplete compared to explicit Python classes @@ -471,7 +518,7 @@ The PipeExtract operator is used to extract text and images from an image or a P [pipe.extract_info] type = "PipeExtract" description = "extract the information" -inputs = { document = "PDF" } # or { image = "Image" } if it's an image. This is the only input. +inputs = { document = "Document" } # or { image = "Image" } if it's an image. This is the only input. output = "Page" ``` @@ -480,7 +527,7 @@ Using Extract Model Settings: [pipe.extract_with_model] type = "PipeExtract" description = "Extract with specific model" -inputs = { document = "PDF" } +inputs = { document = "Document" } output = "Page" model = "base_extract_mistral" # Use predefined extract preset or model alias ``` @@ -588,15 +635,16 @@ $sales_rep.phone | $sales_rep.email """ ``` -#### Key Parameters +#### Key Parameters (Template Mode) -- `template`: Inline template string (mutually exclusive with template_name) +- `template`: Inline template string (mutually exclusive with template_name and construct) - `template_name`: Name of a predefined template (mutually exclusive with template) - `template_category`: Template type ("llm_prompt", "html", "markdown", "mermaid", etc.) 
- `templating_style`: Styling options for template rendering - `extra_context`: Additional context variables for template For more control, you can use a nested `template` section instead of the `template` field: + - `template.template`: The template string - `template.category`: Template type - `template.templating_style`: Styling options @@ -604,9 +652,143 @@ For more control, you can use a nested `template` section instead of the `templa #### Template Variables Use the same variable insertion rules as PipeLLM: + - `@variable` for block insertion (multi-line content) - `$variable` for inline insertion (short text) +#### Construct Mode (for StructuredContent Output) + +PipeCompose can also generate `StructuredContent` objects using the `construct` section. This mode composes field values from fixed values, variable references, templates, or nested structures. + +**When to use construct mode:** + +- You need to output a structured object (not just Text) +- You want to deterministically compose fields from existing data +- No LLM is needed - just data composition and templating + +##### Basic Construct Usage + +```plx +[concept.SalesSummary] +description = "A structured sales summary" + +[concept.SalesSummary.structure] +report_title = { type = "text", description = "Title of the report" } +customer_name = { type = "text", description = "Customer name" } +deal_value = { type = "number", description = "Deal value" } +summary_text = { type = "text", description = "Generated summary text" } + +[pipe.compose_summary] +type = "PipeCompose" +description = "Compose a sales summary from deal data" +inputs = { deal = "Deal" } +output = "SalesSummary" + +[pipe.compose_summary.construct] +report_title = "Monthly Sales Report" +customer_name = { from = "deal.customer_name" } +deal_value = { from = "deal.amount" } +summary_text = { template = "Deal worth $deal.amount with $deal.customer_name" } +``` + +##### Field Composition Methods + +There are four ways to define field values 
in a construct: + +**1. Fixed Value (literal)** + +Use a literal value directly: + +```plx +[pipe.compose_report.construct] +report_title = "Annual Report" +report_year = 2024 +is_draft = false +``` + +**2. Variable Reference (`from`)** + +Get a value from working memory using a dotted path: + +```plx +[pipe.compose_report.construct] +customer_name = { from = "deal.customer_name" } +total_amount = { from = "order.total" } +street_address = { from = "customer.address.street" } +``` + +**3. Template (`template`)** + +Render a Jinja2 template with variable substitution: + +```plx +[pipe.compose_report.construct] +invoice_number = { template = "INV-$order.id" } +summary = { template = "Deal worth $deal.amount with $deal.customer_name on {{ current_date }}" } +``` + +**4. Nested Construct** + +For nested structures, use a TOML subsection: + +```plx +[pipe.compose_invoice.construct] +invoice_number = { template = "INV-$order.id" } +total = { from = "order.total_amount" } + +[pipe.compose_invoice.construct.billing_address] +street = { from = "customer.address.street" } +city = { from = "customer.address.city" } +country = "France" +``` + +##### Complete Construct Example + +```plx +domain = "invoicing" + +[concept.Address] +description = "A postal address" + +[concept.Address.structure] +street = { type = "text", description = "Street address" } +city = { type = "text", description = "City name" } +country = { type = "text", description = "Country name" } + +[concept.Invoice] +description = "An invoice document" + +[concept.Invoice.structure] +invoice_number = { type = "text", description = "Invoice number" } +total = { type = "number", description = "Total amount" } + +[pipe.compose_invoice] +type = "PipeCompose" +description = "Compose an invoice from order and customer data" +inputs = { order = "Order", customer = "Customer" } +output = "Invoice" + +[pipe.compose_invoice.construct] +invoice_number = { template = "INV-$order.id" } +total = { from = "order.total_amount" 
} + +[pipe.compose_invoice.construct.billing_address] +street = { from = "customer.address.street" } +city = { from = "customer.address.city" } +country = "France" +``` + +##### Key Parameters (Construct Mode) + +- `construct`: Dictionary mapping field names to their composition rules +- Each field can be: + - A literal value (string, number, boolean) + - A dict with `from` key for variable reference + - A dict with `template` key for template rendering + - A nested dict for nested structures + +**Note:** You must use either `template` or `construct`, not both. They are mutually exclusive. + ### PipeImgGen operator The PipeImgGen operator is used to generate images using AI image generation models. @@ -820,7 +1002,7 @@ Presets are meant to record the choice of an llm with its hyper parameters (temp Examples: ```toml -llm_for_complex_reasoning = { model = "base-claude", temperature = 1 } +llm_to_engineer = { model = "base-claude", temperature = 1 } llm_to_extract_invoice = { model = "claude-3-7-sonnet", temperature = 0.1, max_tokens = "auto" } ``` @@ -851,7 +1033,7 @@ You can override the predefined llm presets by setting them in `.pipelex/inferen ALWAYS RUN validation when you are finished writing pipelines: This checks for errors. If there are errors, iterate until it works. - For a specific bundle/file: `pipelex validate path_to_file.plx` -- For all pipelines: `pipelex validate all` +- For all pipelines: `pipelex validate --all` - Remember: Ensure your Python virtual environment is activated (typically `.venv` for standard installations) before running `pipelex` commands. Then, create an example file to run the pipeline in the `examples` folder. @@ -859,21 +1041,6 @@ But don't write documentation unless asked explicitly to. ## Guide to execute a pipeline and write example code -### Prerequisites: Virtual Environment - -**CRITICAL**: Before running any `pipelex` commands or `pytest`, you MUST activate the appropriate Python virtual environment. 
Without proper venv activation, these commands will not work. - -For standard installations, the virtual environment is named `.venv`. Always check this first: - -```bash -## Activate the virtual environment (standard installation) -source .venv/bin/activate # On macOS/Linux -## or -.venv\Scripts\activate # On Windows -``` - -If your installation uses a different venv name or location, activate that one instead. All subsequent `pipelex` and `pytest` commands assume the venv is active. - ### Example to execute a pipeline with text output ```python @@ -967,13 +1134,13 @@ So here are a few concrete examples of calls to execute_pipeline with various wa }, ) -## Here we have a single input and it's a PDF. -## Because PDFContent is a native concept, we can use it directly as a value, +## Here we have a single input and it's a document. +## Because DocumentContent is a native concept, we can use it directly as a value, ## the system knows what content it corresponds to: pipe_output = await execute_pipeline( pipe_code="power_extractor_dpe", inputs={ - "document": PDFContent(url=pdf_url), + "document": DocumentContent(url=pdf_url), }, ) @@ -1096,82 +1263,4 @@ result_list = pipe_output.main_stuff_as_items(item_type=GanttChart) ``` --- - -## Rules to choose LLM models used in PipeLLMs. - -### LLM Configuration System - -In order to use it in a pipe, an LLM is referenced by its llm_handle (alias) and possibly by an llm_preset. -LLM configurations are managed through the new inference backend system with files located in `.pipelex/inference/`: - -- **Model Deck**: `.pipelex/inference/deck/base_deck.toml` and `.pipelex/inference/deck/overrides.toml` -- **Backends**: `.pipelex/inference/backends.toml` and `.pipelex/inference/backends/*.toml` -- **Routing**: `.pipelex/inference/routing_profiles.toml` - -### LLM Handles - -An llm_handle can be either: -1. 
**A direct model name** (like "gpt-4o-mini", "claude-3-sonnet") - automatically available for all models loaded by the inference backend system -2. **An alias** - user-defined shortcuts that map to model names, defined in the `[aliases]` section: - -```toml -[aliases] -base-claude = "claude-4.5-sonnet" -base-gpt = "gpt-5" -base-gemini = "gemini-2.5-flash" -base-mistral = "mistral-medium" -``` - -The system first looks for direct model names, then checks aliases if no direct match is found. The system handles model routing through backends automatically. - -### Using an LLM Handle in a PipeLLM - -Here is an example of using an llm_handle to specify which LLM to use in a PipeLLM: - -```plx -[pipe.hello_world] -type = "PipeLLM" -description = "Write text about Hello World." -output = "Text" -model = { model = "gpt-5", temperature = 0.9 } -prompt = """ -Write a haiku about Hello World. -""" -``` - -As you can see, to use the LLM, you must also indicate the temperature (float between 0 and 1) and max_tokens (either an int or the string "auto"). - -### LLM Presets - -Presets are meant to record the choice of an llm with its hyper parameters (temperature and max_tokens) if it's good for a particular task. LLM Presets are skill-oriented. - -Examples: -```toml -llm_for_complex_reasoning = { model = "base-claude", temperature = 1 } -llm_to_extract_invoice = { model = "claude-3-7-sonnet", temperature = 0.1, max_tokens = "auto" } -``` - -The interest is that these presets can be used to set the LLM choice in a PipeLLM, like this: - -```plx -[pipe.extract_invoice] -type = "PipeLLM" -description = "Extract invoice information from an invoice text transcript" -inputs = { invoice_text = "InvoiceText" } -output = "Invoice" -model = "llm_to_extract_invoice" -prompt = """ -Extract invoice information from this invoice: - -The category of this invoice is: $invoice_details.category. 
- -@invoice_text -""" -``` - -The setting here `model = "llm_to_extract_invoice"` works because "llm_to_extract_invoice" has been declared as an llm_preset in the deck. -You must not use an LLM preset in a PipeLLM that does not exist in the deck. If needed, you can add llm presets. - - -You can override the predefined llm presets by setting them in `.pipelex/inference/deck/overrides.toml`. diff --git a/api/main.py b/api/main.py index 5a3144b..bb4d8bd 100644 --- a/api/main.py +++ b/api/main.py @@ -4,7 +4,7 @@ from pipelex.system.runtime import IntegrationMode from api.routes import router as api_router -from api.routes.pipelex.health import router as health_router +from api.routes.health import router as health_router from api.security import get_auth_dependency Pipelex.make(IntegrationMode.FASTAPI) diff --git a/api/routes/__init__.py b/api/routes/__init__.py index 0dcfa49..1a05791 100644 --- a/api/routes/__init__.py +++ b/api/routes/__init__.py @@ -1,12 +1,12 @@ from fastapi import APIRouter -from api.routes.main import router as main_router +from api.routes.version import router as version_router from .pipelex import router as pipelex_router from .uploader import router as uploader_router router = APIRouter() -router.include_router(main_router) +router.include_router(version_router) router.include_router(pipelex_router) router.include_router(uploader_router) diff --git a/api/routes/pipelex/health.py b/api/routes/health.py similarity index 51% rename from api/routes/pipelex/health.py rename to api/routes/health.py index fa1414d..a608ac4 100644 --- a/api/routes/pipelex/health.py +++ b/api/routes/health.py @@ -1,17 +1,14 @@ from fastapi import APIRouter from fastapi.responses import JSONResponse -router = APIRouter(tags=["pipeline"]) +router = APIRouter(tags=["health"]) @router.get("/health", response_model=dict) async def get_health(): - """Health check endpoint to verify the API is running. - Returns a simple JSON response indicating the service is healthy. 
- """ return JSONResponse( content={ "status": "ok", - "message": "API is running", + "message": "Pipelex API is running", } ) diff --git a/api/routes/pipelex/__init__.py b/api/routes/pipelex/__init__.py index bad27d8..d682583 100644 --- a/api/routes/pipelex/__init__.py +++ b/api/routes/pipelex/__init__.py @@ -1,11 +1,11 @@ from fastapi import APIRouter -from .pipe_builder import router as pipe_builder_router +from .build import router as build_router from .pipeline import router as pipeline_router -from .plx_validator import router as plx_validator_router +from .validate import router as validate_router router = APIRouter() +router.include_router(build_router) router.include_router(pipeline_router) -router.include_router(pipe_builder_router) -router.include_router(plx_validator_router) +router.include_router(validate_router) diff --git a/api/routes/pipelex/build/__init__.py b/api/routes/pipelex/build/__init__.py new file mode 100644 index 0000000..335e222 --- /dev/null +++ b/api/routes/pipelex/build/__init__.py @@ -0,0 +1,11 @@ +from fastapi import APIRouter + +from .inputs import router as inputs_router +from .pipe import router as pipe_router +from .runner import router as runner_router + +router = APIRouter() + +router.include_router(inputs_router) +router.include_router(pipe_router) +router.include_router(runner_router) diff --git a/api/routes/pipelex/build/inputs.py b/api/routes/pipelex/build/inputs.py new file mode 100644 index 0000000..81438ae --- /dev/null +++ b/api/routes/pipelex/build/inputs.py @@ -0,0 +1,86 @@ +import json +from typing import Any + +from fastapi import APIRouter, HTTPException +from fastapi.responses import JSONResponse +from pipelex.hub import get_library_manager, get_required_pipe, set_current_library +from pipelex.pipeline.validate_bundle import validate_bundle +from pydantic import BaseModel, Field + +router = APIRouter(tags=["build"]) + + +class BuildInputsRequest(BaseModel): + plx_content: str | None = Field(default=None, 
description="PLX content to load pipes from") + pipe_code: str = Field(..., description="Pipe code to generate inputs JSON for") + + +class BuildInputsResponse(BaseModel): + inputs_json: dict[str, Any] = Field(..., description="Generated inputs JSON object") + pipe_code: str = Field(..., description="Pipe code that was used") + success: bool = Field(default=True, description="Whether the operation was successful") + message: str = Field(default="Inputs JSON generated successfully", description="Status message") + + +@router.post("/build/inputs", response_model=BuildInputsResponse) +async def build_inputs(request_data: BuildInputsRequest): + """Generate example input JSON for a pipe. + + This endpoint generates a JSON object with example values for all pipe inputs + based on their concept types. + + If plx_content is provided, it will: + 1. Parse and validate the PLX content + 2. Load pipes from the bundle + 3. Generate inputs JSON for the specified pipe + + If plx_content is not provided, it will generate inputs for an already-loaded pipe. 
+ """ + library_manager = get_library_manager() + + try: + if request_data.plx_content: + # Validate and load the PLX content + validate_bundle_result = await validate_bundle(plx_content=request_data.plx_content) + blueprint = validate_bundle_result.blueprints[0] + + library_id, _ = library_manager.open_library() + set_current_library(library_id) + # Load pipes temporarily + library_manager.load_from_blueprints(library_id=library_id, blueprints=[blueprint]) + + # Get the pipe + the_pipe = get_required_pipe(pipe_code=request_data.pipe_code) + + # Check if pipe has any inputs + if not the_pipe.inputs.root: + return JSONResponse( + content=BuildInputsResponse( + inputs_json={}, + pipe_code=request_data.pipe_code, + success=True, + message=f"No inputs required for pipe '{request_data.pipe_code}'", + ).model_dump(serialize_as_any=True) + ) + + # Generate the input JSON + inputs_json_str = the_pipe.inputs.generate_json_string(indent=2) + inputs_json = json.loads(inputs_json_str) + + response_data = BuildInputsResponse( + inputs_json=inputs_json, + pipe_code=request_data.pipe_code, + success=True, + message="Inputs JSON generated successfully", + ) + + return JSONResponse(content=response_data.model_dump(serialize_as_any=True)) + + except Exception as exc: + raise HTTPException( + status_code=500, + detail={ + "error_type": type(exc).__name__, + "message": str(exc), + }, + ) from exc diff --git a/api/routes/pipelex/build/pipe.py b/api/routes/pipelex/build/pipe.py new file mode 100644 index 0000000..9fb5f45 --- /dev/null +++ b/api/routes/pipelex/build/pipe.py @@ -0,0 +1,57 @@ +from typing import Any + +from fastapi import APIRouter +from fastapi.responses import JSONResponse +from pipelex.builder.builder_loop import BuilderLoop +from pipelex.hub import get_library_manager, set_current_library +from pipelex.language.plx_factory import PlxFactory +from pydantic import BaseModel, Field + +from api.routes.helpers import extract_pipe_structures + +router = 
APIRouter(tags=["build"]) + + +class BuildPipeRequest(BaseModel): + brief: str = Field(..., description="Brief description of the pipeline to build") + + +class BuildPipeResponse(BaseModel): + plx_content: str = Field(..., description="Generated PLX content as string") + pipelex_bundle_blueprint: dict[str, Any] = Field(..., description="Generated pipelex bundle blueprint") + pipe_structures: dict[str, dict[str, Any]] = Field( + default_factory=dict, description="Structure class information for each pipe's inputs and output" + ) + success: bool = Field(default=True, description="Whether the operation was successful") + message: str = Field(default="Pipeline generated successfully", description="Status message") + + +@router.post("/build/pipe", response_model=BuildPipeResponse) +async def build_pipe(request_data: BuildPipeRequest): + """Build a pipeline from a brief description. + + This endpoint takes a brief description and generates both PLX content + and the corresponding pipelex bundle blueprint, along with pipe structures. 
+ """ + # Execute the pipe_builder pipeline + library_manager = get_library_manager() + builder_loop = BuilderLoop() + pipelex_bundle_spec, _ = await builder_loop.build_and_fix(inputs={"brief": request_data.brief}, builder_pipe="pipe_builder") + blueprint = pipelex_bundle_spec.to_blueprint() + + library_id, _ = library_manager.open_library() + set_current_library(library_id) + # Load pipes temporarily to extract structures + pipes = library_manager.load_from_blueprints(library_id=library_id, blueprints=[blueprint]) + pipe_structures = extract_pipe_structures(pipes) + + plx_content = PlxFactory.make_plx_content(blueprint=blueprint) + response_data = BuildPipeResponse( + plx_content=plx_content, + pipelex_bundle_blueprint=blueprint.model_dump(serialize_as_any=True), + pipe_structures=pipe_structures, + success=True, + message="Pipeline generated successfully", + ) + + return JSONResponse(content=response_data.model_dump(serialize_as_any=True)) diff --git a/api/routes/pipelex/pipe_builder.py b/api/routes/pipelex/build/runner.py similarity index 51% rename from api/routes/pipelex/pipe_builder.py rename to api/routes/pipelex/build/runner.py index a9e41df..e643981 100644 --- a/api/routes/pipelex/pipe_builder.py +++ b/api/routes/pipelex/build/runner.py @@ -1,81 +1,31 @@ import traceback -from typing import Any from fastapi import APIRouter, HTTPException from fastapi.responses import JSONResponse from pipelex import log -from pipelex.builder.builder_loop import BuilderLoop from pipelex.builder.runner_code import generate_runner_code from pipelex.core.interpreter.interpreter import PipelexInterpreter from pipelex.hub import get_library_manager, get_required_pipe, set_current_library -from pipelex.language.plx_factory import PlxFactory from pipelex.pipe_run.dry_run import dry_run_pipes from pydantic import BaseModel, Field -from api.routes.helpers import extract_pipe_structures +router = APIRouter(tags=["build"]) -router = APIRouter(tags=["pipe-builder"]) - -class 
PipeBuilderRequest(BaseModel): - brief: str = Field(..., description="Brief description of the pipeline to build") - - -class PipeBuilderResponse(BaseModel): - plx_content: str = Field(..., description="Generated PLX content as string") - pipelex_bundle_blueprint: dict[str, Any] = Field(..., description="Generated pipelex bundle blueprint") - pipe_structures: dict[str, dict[str, Any]] = Field( - default_factory=dict, description="Structure class information for each pipe's inputs and output" - ) - success: bool = Field(default=True, description="Whether the operation was successful") - message: str = Field(default="Pipeline generated successfully", description="Status message") - - -@router.post("/pipe-builder/build", response_model=PipeBuilderResponse) -async def build_pipe(request_data: PipeBuilderRequest): - """Build a pipeline from a brief description. - - This endpoint takes a brief description and generates both PLX content - and the corresponding pipelex bundle blueprint, along with pipe structures. 
- """ - # Execute the pipe_builder pipeline - library_manager = get_library_manager() - builder_loop = BuilderLoop() - pipelex_bundle_spec, _ = await builder_loop.build_and_fix(inputs={"brief": request_data.brief}, builder_pipe="pipe_builder") - blueprint = pipelex_bundle_spec.to_blueprint() - - library_id, _ = library_manager.open_library() - set_current_library(library_id) - # Load pipes temporarily to extract structures - pipes = library_manager.load_from_blueprints(library_id=library_id, blueprints=[blueprint]) - pipe_structures = extract_pipe_structures(pipes) - - plx_content = PlxFactory.make_plx_content(blueprint=blueprint) - response_data = PipeBuilderResponse( - plx_content=plx_content, - pipelex_bundle_blueprint=blueprint.model_dump(serialize_as_any=True), - pipe_structures=pipe_structures, - success=True, - message="Pipeline generated successfully", - ) - - return JSONResponse(content=response_data.model_dump(serialize_as_any=True)) - - -class RunnerCodeRequest(BaseModel): +class BuildRunnerRequest(BaseModel): plx_content: str = Field(..., description="PLX content to load and generate runner code for") pipe_code: str = Field(..., description="Pipe code to generate runner code for") -class RunnerCodeResponse(BaseModel): +class BuildRunnerResponse(BaseModel): python_code: str = Field(..., description="Generated Python code for running the workflow") pipe_code: str = Field(..., description="Pipe code that was used") success: bool = Field(default=True, description="Whether the operation was successful") message: str = Field(default="Runner code generated successfully", description="Status message") -@router.post("/pipe-builder/generate-runner", response_model=RunnerCodeResponse) -async def generate_runner(request_data: RunnerCodeRequest): +@router.post("/build/runner", response_model=BuildRunnerResponse) +async def build_runner(request_data: BuildRunnerRequest): """Generate Python runner code for a pipe from PLX content. 
This endpoint: @@ -109,7 +59,7 @@ async def generate_runner(request_data: RunnerCodeRequest): python_code = generate_runner_code(pipe=pipe) # Create the response - response_data = RunnerCodeResponse( + response_data = BuildRunnerResponse( python_code=python_code, pipe_code=request_data.pipe_code, success=True, diff --git a/api/routes/pipelex/hello_world.plx b/api/routes/pipelex/hello_world.plx deleted file mode 100644 index db3f488..0000000 --- a/api/routes/pipelex/hello_world.plx +++ /dev/null @@ -1,13 +0,0 @@ -domain = "quick_start" -description = "Discovering Pipelex" - -[pipe] -[pipe.hello_world] -type = "PipeLLM" -description = "Write text about Hello World." -output = "Text" -model = { model = "gpt-4o-mini", temperature = 0.9, max_tokens = "auto" } -prompt = """ -Write a haiku about Hello World. -""" - diff --git a/api/routes/pipelex/pipeline.py b/api/routes/pipelex/pipeline.py index 299cc05..8fd62ad 100644 --- a/api/routes/pipelex/pipeline.py +++ b/api/routes/pipelex/pipeline.py @@ -2,14 +2,12 @@ from typing import Annotated from fastapi import APIRouter, Depends, HTTPException, Request -from fastapi.responses import JSONResponse from kajson import kajson from pipelex import log from pipelex.client.pipeline_request_factory import PipelineRequestFactory from pipelex.client.pipeline_response_factory import PipelineResponseFactory from pipelex.client.protocol import PipelineRequest, PipelineResponse, PipelineState from pipelex.pipeline.execute import execute_pipeline -from pipelex.pipeline.start import start_pipeline from api.routes.pipelex.utils import get_current_iso_timestamp @@ -67,55 +65,3 @@ async def execute( "message": str(exc), }, ) from exc - - -@router.post("/pipeline/start", response_model=PipelineResponse) -async def start( - pipeline_request: Annotated[PipelineRequest, Depends(request_deserialization)], -): - """Starts a pipe execution with the given memory but does not wait for completion. - - This endpoint can operate in two modes: - 1. 
If 'plx_content' is provided: validates, loads pipes from the PLX content, then starts execution - 2. If 'plx_content' is not provided: starts execution of an already-loaded pipe - - This is a non-blocking operation that returns immediately with a workflow ID. - - Note: If plx_content is provided, pipes remain loaded after this call returns. - """ - try: - if pipeline_request.plx_content: - raise HTTPException(status_code=400, detail="PLX content is not supported when using the route 'start'") - if not pipeline_request.pipe_code: - raise HTTPException(status_code=400, detail="Pipe code is required when using the route 'start'") - - created_at = get_current_iso_timestamp() - pipeline_run_id, _ = await start_pipeline( - pipe_code=pipeline_request.pipe_code, - inputs=pipeline_request.inputs, - output_name=pipeline_request.output_name, - output_multiplicity=pipeline_request.output_multiplicity, - dynamic_output_concept_code=pipeline_request.dynamic_output_concept_code, - ) - - response_data = PipelineResponse( - pipeline_run_id=pipeline_run_id, - pipeline_state=PipelineState.STARTED, - created_at=created_at, - pipe_output=None, - main_stuff_name=None, - status="success", - ) - return JSONResponse(content=response_data.model_dump(serialize_as_any=True)) - - except Exception as exc: - log.error("Pipeline start error details:") - traceback.print_exc() - - raise HTTPException( - status_code=500, - detail={ - "error_type": type(exc).__name__, - "message": str(exc), - }, - ) from exc diff --git a/api/routes/pipelex/plx_validator.py b/api/routes/pipelex/validate.py similarity index 87% rename from api/routes/pipelex/plx_validator.py rename to api/routes/pipelex/validate.py index 51e6a10..3e6a5b3 100644 --- a/api/routes/pipelex/plx_validator.py +++ b/api/routes/pipelex/validate.py @@ -8,14 +8,14 @@ from api.routes.helpers import extract_pipe_structures -router = APIRouter() +router = APIRouter(tags=["validate"]) -class PlxValidatorRequest(BaseModel): +class 
ValidateRequest(BaseModel): plx_content: str = Field(..., description="PLX content to validate") -class PlxValidatorResponse(BaseModel): +class ValidateResponse(BaseModel): plx_content: str = Field(..., description="The PLX content that was validated") pipelex_bundle_blueprint: PipelexBundleBlueprint = Field(..., description="Generated pipelex bundle blueprint") pipe_structures: dict[str, dict[str, Any]] = Field( @@ -25,8 +25,8 @@ class PlxValidatorResponse(BaseModel): message: str = Field(default="PLX content validated successfully", description="Status message") -@router.post("/validate", response_model=PlxValidatorResponse) -async def validate_pipes(request_data: PlxValidatorRequest): +@router.post("/validate", response_model=ValidateResponse) +async def validate_plx(request_data: ValidateRequest): """Validate PLX content by parsing, loading, and dry-running pipes. This endpoint takes PLX content and validates it by: @@ -41,7 +41,7 @@ async def validate_pipes(request_data: PlxValidatorRequest): pipes = validate_bundle_result.pipes pipe_structures = extract_pipe_structures(pipes) - response_data = PlxValidatorResponse( + response_data = ValidateResponse( plx_content=request_data.plx_content, pipelex_bundle_blueprint=blueprint[0], pipe_structures=pipe_structures, diff --git a/api/routes/main.py b/api/routes/version.py similarity index 89% rename from api/routes/main.py rename to api/routes/version.py index ec4708b..5c122ec 100644 --- a/api/routes/main.py +++ b/api/routes/version.py @@ -2,7 +2,7 @@ from fastapi import APIRouter -router = APIRouter() +router = APIRouter(tags=["version"]) @router.get("/pipelex_version") diff --git a/pyproject.toml b/pyproject.toml index 87db9b9..b9352e8 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -23,7 +23,7 @@ dependencies = [ ] [tool.uv.sources] -pipelex = { git = "https://github.com/Pipelex/pipelex.git", branch = "feature/Chicago" } +pipelex = { path = "../pipelex", editable = true} [build-system] diff --git a/uv.lock 
b/uv.lock index bb60a6b..15bc1cb 100644 --- a/uv.lock +++ b/uv.lock @@ -1856,8 +1856,8 @@ wheels = [ [[package]] name = "pipelex" -version = "0.17.3" -source = { git = "https://github.com/Pipelex/pipelex.git?branch=feature%2FChicago#6c6301aeec180ffcf9ae2cc3cfa1af354004122d" } +version = "0.18.0b2" +source = { editable = "../pipelex" } dependencies = [ { name = "aiofiles" }, { name = "filetype" }, @@ -1911,6 +1911,75 @@ mistralai = [ { name = "mistralai" }, ] +[package.metadata] +requires-dist = [ + { name = "aioboto3", marker = "extra == 'bedrock'", specifier = ">=13.4.0" }, + { name = "aioboto3", marker = "extra == 's3'", specifier = ">=13.4.0" }, + { name = "aiofiles", specifier = ">=23.2.1" }, + { name = "anthropic", marker = "extra == 'anthropic'", specifier = ">=0.60.0" }, + { name = "backports-strenum", marker = "python_full_version < '3.11'", specifier = ">=1.3.0" }, + { name = "boto3", marker = "extra == 'bedrock'", specifier = ">=1.34.131" }, + { name = "boto3", marker = "extra == 's3'", specifier = ">=1.34.131" }, + { name = "boto3-stubs", marker = "extra == 'dev'", specifier = ">=1.35.24" }, + { name = "docling", marker = "extra == 'docling'", specifier = ">=2.64.0" }, + { name = "fal-client", marker = "extra == 'fal'", specifier = ">=0.4.1" }, + { name = "filetype", specifier = ">=1.2.0" }, + { name = "google-auth-oauthlib", marker = "extra == 'google'", specifier = ">=1.2.1" }, + { name = "google-cloud-storage", marker = "extra == 'gcp-storage'", specifier = ">=2.10.0" }, + { name = "google-genai", marker = "extra == 'google-genai'" }, + { name = "httpx", specifier = ">=0.23.0,<1.0.0" }, + { name = "huggingface-hub", marker = "extra == 'huggingface'", specifier = ">=0.23,<1.0.0" }, + { name = "instructor", specifier = ">=1.8.3,!=1.11.*,!=1.12.*" }, + { name = "instructor", extras = ["google-genai"], marker = "extra == 'google-genai'" }, + { name = "jinja2", specifier = ">=3.1.4" }, + { name = "json2html", specifier = ">=1.3.0" }, + { name = "kajson", 
specifier = "==0.3.1" }, + { name = "markdown", specifier = ">=3.6" }, + { name = "mike", marker = "extra == 'docs'", specifier = ">=2.1.3" }, + { name = "mistralai", marker = "extra == 'mistralai'", specifier = "==1.5.2" }, + { name = "mkdocs", marker = "extra == 'docs'", specifier = ">=1.6.1" }, + { name = "mkdocs-glightbox", marker = "extra == 'docs'", specifier = ">=0.4.0" }, + { name = "mkdocs-material", marker = "extra == 'docs'", specifier = ">=9.6.14" }, + { name = "mkdocs-meta-manager", marker = "extra == 'docs'", specifier = ">=1.1.0" }, + { name = "moto", extras = ["s3"], marker = "extra == 'dev'", specifier = ">=5.0.0" }, + { name = "mypy", marker = "extra == 'dev'", specifier = "==1.19.1" }, + { name = "networkx", specifier = ">=3.4.2" }, + { name = "openai", specifier = ">=1.108.1" }, + { name = "opentelemetry-api" }, + { name = "opentelemetry-exporter-otlp-proto-http" }, + { name = "opentelemetry-sdk" }, + { name = "opentelemetry-semantic-conventions" }, + { name = "pillow", specifier = ">=11.2.1" }, + { name = "polyfactory", specifier = ">=2.21.0" }, + { name = "portkey-ai", specifier = ">=2.1.0" }, + { name = "posthog", specifier = ">=6.7.0" }, + { name = "pydantic", specifier = ">=2.10.6,<3.0.0" }, + { name = "pylint", marker = "extra == 'dev'", specifier = "==4.0.4" }, + { name = "pypdfium2", specifier = ">=4.30.0,!=4.30.1" }, + { name = "pyright", marker = "extra == 'dev'", specifier = "==1.1.408" }, + { name = "pytest", marker = "extra == 'dev'", specifier = ">=9.0.2" }, + { name = "pytest-asyncio", marker = "extra == 'dev'", specifier = ">=0.24.0" }, + { name = "pytest-cov", marker = "extra == 'dev'", specifier = ">=6.1.1" }, + { name = "pytest-mock", marker = "extra == 'dev'", specifier = ">=3.14.0" }, + { name = "pytest-sugar", marker = "extra == 'dev'", specifier = ">=1.0.0" }, + { name = "pytest-xdist", marker = "extra == 'dev'", specifier = ">=3.6.1" }, + { name = "python-dotenv", specifier = ">=1.0.1" }, + { name = "pyyaml", specifier = 
">=6.0.2" }, + { name = "rich", specifier = ">=13.8.1" }, + { name = "ruff", marker = "extra == 'dev'", specifier = "==0.14.13" }, + { name = "shortuuid", specifier = ">=1.0.13" }, + { name = "tomli", specifier = ">=2.3.0" }, + { name = "tomlkit", specifier = ">=0.13.2" }, + { name = "typer", specifier = ">=0.16.0" }, + { name = "types-aioboto3", extras = ["bedrock", "bedrock-runtime"], marker = "extra == 'dev'", specifier = ">=13.4.0" }, + { name = "types-aiofiles", marker = "extra == 'dev'", specifier = ">=24.1.0.20240626" }, + { name = "types-markdown", marker = "extra == 'dev'", specifier = ">=3.6.0.20240316" }, + { name = "types-networkx", marker = "extra == 'dev'", specifier = ">=3.3.0.20241020" }, + { name = "types-pyyaml", marker = "extra == 'dev'", specifier = ">=6.0.12.20250326" }, + { name = "typing-extensions", specifier = ">=4.13.2" }, +] +provides-extras = ["anthropic", "bedrock", "docling", "fal", "gcp-storage", "google", "google-genai", "huggingface", "mistralai", "s3", "docs", "dev"] + [[package]] name = "pipelex-api" version = "0.0.12" @@ -1963,7 +2032,7 @@ requires-dist = [ { name = "mkdocs-meta-manager", marker = "extra == 'docs'", specifier = "==1.1.0" }, { name = "mypy", marker = "extra == 'dev'", specifier = ">=1.11.2" }, { name = "pandas-stubs", marker = "extra == 'dev'", specifier = ">=2.2.3.241126" }, - { name = "pipelex", extras = ["mistralai", "anthropic", "google", "google-genai", "bedrock", "fal"], git = "https://github.com/Pipelex/pipelex.git?branch=feature%2FChicago" }, + { name = "pipelex", extras = ["mistralai", "anthropic", "google", "google-genai", "bedrock", "fal"], editable = "../pipelex" }, { name = "pyjwt", specifier = ">=2.10.1" }, { name = "pylint", marker = "extra == 'dev'", specifier = ">=3.3.8" }, { name = "pyright", marker = "extra == 'dev'", specifier = ">=1.1.405" }, From a3fcbac38363aa3f802d4297a57c5cf870eb45ef Mon Sep 17 00:00:00 2001 From: thomashebrard Date: Mon, 26 Jan 2026 16:32:11 +0100 Subject: [PATCH 2/5] 
remove useless code --- api/routes/helpers.py | 81 ------------------------------ api/routes/pipelex/build/inputs.py | 32 ++++-------- api/routes/pipelex/build/pipe.py | 17 +------ api/routes/pipelex/validate.py | 12 +---- api/security.py | 1 - 5 files changed, 11 insertions(+), 132 deletions(-) delete mode 100644 api/routes/helpers.py diff --git a/api/routes/helpers.py b/api/routes/helpers.py deleted file mode 100644 index 73bdff1..0000000 --- a/api/routes/helpers.py +++ /dev/null @@ -1,81 +0,0 @@ -from typing import Any - -from pipelex.core.concepts.concept import Concept -from pipelex.core.pipes.pipe_abstract import PipeAbstract -from pipelex.hub import get_class_registry - - -def get_concept_structure(concept: Concept) -> dict[str, Any]: - """Extract structure information for a concept. - - Args: - concept: Concept object to extract structure from - - Returns: - Dictionary containing: - { - "concept_code": str, - "structure_class_name": str, - "class_structure": {schema from Pydantic model_json_schema} - } - """ - class_registry = get_class_registry() - - try: - structure_class = class_registry.get_required_class(concept.structure_class_name) - class_structure = structure_class.model_json_schema() - - return { - "concept_code": concept.code, - "structure_class_name": concept.structure_class_name, - "class_structure": class_structure, - } - except Exception as e: - return { - "concept_code": concept.code, - "structure_class_name": concept.structure_class_name, - "error": str(e), - } - - -def extract_pipe_structures(pipes: list[PipeAbstract]) -> dict[str, dict[str, Any]]: - """Extract structure information for a list of pipes. - - For each pipe, extracts input and output concept structures. - - Args: - pipes: List of PipeAbstract objects to extract structures from - - Returns: - Dictionary mapping pipe_code to structure information: - { - "pipe_code": { - "inputs": { - "input_name": {concept_structure}, - ... - }, - "output": {concept_structure} - }, - ... 
- } - """ - pipe_structures: dict[str, dict[str, Any]] = {} - - for pipe in pipes: - inputs_specs: dict[str, Any] = {} - - # Process inputs - extract concept structures - for input_name, input_spec in pipe.inputs.root.items(): - concept = input_spec.concept - inputs_specs[input_name] = get_concept_structure(concept) - - # Process output - extract concept structure - output_spec = get_concept_structure(pipe.output.concept) - - # Store structure info for this pipe - pipe_structures[pipe.code] = { - "inputs": inputs_specs, - "output": output_spec, - } - - return pipe_structures diff --git a/api/routes/pipelex/build/inputs.py b/api/routes/pipelex/build/inputs.py index 81438ae..d4c7bca 100644 --- a/api/routes/pipelex/build/inputs.py +++ b/api/routes/pipelex/build/inputs.py @@ -11,7 +11,7 @@ class BuildInputsRequest(BaseModel): - plx_content: str | None = Field(default=None, description="PLX content to load pipes from") + plx_content: str = Field(..., description="PLX content to load pipes from") pipe_code: str = Field(..., description="Pipe code to generate inputs JSON for") @@ -29,40 +29,26 @@ async def build_inputs(request_data: BuildInputsRequest): This endpoint generates a JSON object with example values for all pipe inputs based on their concept types. - If plx_content is provided, it will: + It will: 1. Parse and validate the PLX content 2. Load pipes from the bundle 3. Generate inputs JSON for the specified pipe - - If plx_content is not provided, it will generate inputs for an already-loaded pipe. 
""" library_manager = get_library_manager() try: - if request_data.plx_content: - # Validate and load the PLX content - validate_bundle_result = await validate_bundle(plx_content=request_data.plx_content) - blueprint = validate_bundle_result.blueprints[0] + # Validate and load the PLX content + validate_bundle_result = await validate_bundle(plx_content=request_data.plx_content) + blueprint = validate_bundle_result.blueprints[0] - library_id, _ = library_manager.open_library() - set_current_library(library_id) - # Load pipes temporarily - library_manager.load_from_blueprints(library_id=library_id, blueprints=[blueprint]) + library_id, _ = library_manager.open_library() + set_current_library(library_id) + # Load pipes temporarily + library_manager.load_from_blueprints(library_id=library_id, blueprints=[blueprint]) # Get the pipe the_pipe = get_required_pipe(pipe_code=request_data.pipe_code) - # Check if pipe has any inputs - if not the_pipe.inputs.root: - return JSONResponse( - content=BuildInputsResponse( - inputs_json={}, - pipe_code=request_data.pipe_code, - success=True, - message=f"No inputs required for pipe '{request_data.pipe_code}'", - ).model_dump(serialize_as_any=True) - ) - # Generate the input JSON inputs_json_str = the_pipe.inputs.generate_json_string(indent=2) inputs_json = json.loads(inputs_json_str) diff --git a/api/routes/pipelex/build/pipe.py b/api/routes/pipelex/build/pipe.py index 9fb5f45..185bc16 100644 --- a/api/routes/pipelex/build/pipe.py +++ b/api/routes/pipelex/build/pipe.py @@ -3,12 +3,9 @@ from fastapi import APIRouter from fastapi.responses import JSONResponse from pipelex.builder.builder_loop import BuilderLoop -from pipelex.hub import get_library_manager, set_current_library from pipelex.language.plx_factory import PlxFactory from pydantic import BaseModel, Field -from api.routes.helpers import extract_pipe_structures - router = APIRouter(tags=["build"]) @@ -19,9 +16,6 @@ class BuildPipeRequest(BaseModel): class 
BuildPipeResponse(BaseModel): plx_content: str = Field(..., description="Generated PLX content as string") pipelex_bundle_blueprint: dict[str, Any] = Field(..., description="Generated pipelex bundle blueprint") - pipe_structures: dict[str, dict[str, Any]] = Field( - default_factory=dict, description="Structure class information for each pipe's inputs and output" - ) success: bool = Field(default=True, description="Whether the operation was successful") message: str = Field(default="Pipeline generated successfully", description="Status message") @@ -31,25 +25,16 @@ async def build_pipe(request_data: BuildPipeRequest): """Build a pipeline from a brief description. This endpoint takes a brief description and generates both PLX content - and the corresponding pipelex bundle blueprint, along with pipe structures. + and the corresponding pipelex bundle blueprint. """ - # Execute the pipe_builder pipeline - library_manager = get_library_manager() builder_loop = BuilderLoop() pipelex_bundle_spec, _ = await builder_loop.build_and_fix(inputs={"brief": request_data.brief}, builder_pipe="pipe_builder") blueprint = pipelex_bundle_spec.to_blueprint() - library_id, _ = library_manager.open_library() - set_current_library(library_id) - # Load pipes temporarily to extract structures - pipes = library_manager.load_from_blueprints(library_id=library_id, blueprints=[blueprint]) - pipe_structures = extract_pipe_structures(pipes) - plx_content = PlxFactory.make_plx_content(blueprint=blueprint) response_data = BuildPipeResponse( plx_content=plx_content, pipelex_bundle_blueprint=blueprint.model_dump(serialize_as_any=True), - pipe_structures=pipe_structures, success=True, message="Pipeline generated successfully", ) diff --git a/api/routes/pipelex/validate.py b/api/routes/pipelex/validate.py index 3e6a5b3..8e5a91b 100644 --- a/api/routes/pipelex/validate.py +++ b/api/routes/pipelex/validate.py @@ -1,13 +1,9 @@ -from typing import Any - from fastapi import APIRouter from fastapi.responses 
import JSONResponse from pipelex.core.bundles.pipelex_bundle_blueprint import PipelexBundleBlueprint from pipelex.pipeline.validate_bundle import validate_bundle from pydantic import BaseModel, Field -from api.routes.helpers import extract_pipe_structures - router = APIRouter(tags=["validate"]) @@ -18,9 +14,6 @@ class ValidateRequest(BaseModel): class ValidateResponse(BaseModel): plx_content: str = Field(..., description="The PLX content that was validated") pipelex_bundle_blueprint: PipelexBundleBlueprint = Field(..., description="Generated pipelex bundle blueprint") - pipe_structures: dict[str, dict[str, Any]] = Field( - default_factory=dict, description="Structure class information for each pipe's inputs and output" - ) success: bool = Field(default=True, description="Whether the validation was successful") message: str = Field(default="PLX content validated successfully", description="Status message") @@ -34,17 +27,14 @@ async def validate_plx(request_data: ValidateRequest): 2. Loading pipes into the library 3. Running static validation and dry runs - Returns validation results with blueprint and pipe structures. + Returns validation results with blueprint. 
""" validate_bundle_result = await validate_bundle(plx_content=request_data.plx_content) blueprint = validate_bundle_result.blueprints - pipes = validate_bundle_result.pipes - pipe_structures = extract_pipe_structures(pipes) response_data = ValidateResponse( plx_content=request_data.plx_content, pipelex_bundle_blueprint=blueprint[0], - pipe_structures=pipe_structures, success=True, message="PLX content validated successfully", ) diff --git a/api/security.py b/api/security.py index 1159373..5fb3199 100644 --- a/api/security.py +++ b/api/security.py @@ -124,7 +124,6 @@ async def verify_api_key(credentials: Annotated[HTTPAuthorizationCredentials, De headers={"WWW-Authenticate": "Bearer"}, ) - log.info("✅ API key validated") return credentials.credentials except HTTPException: From ac402e0b251037b31fbab081c1f2aadf883feb07 Mon Sep 17 00:00:00 2001 From: thomashebrard Date: Mon, 26 Jan 2026 17:47:10 +0100 Subject: [PATCH 3/5] fix inputs --- api/routes/pipelex/build/inputs.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/api/routes/pipelex/build/inputs.py b/api/routes/pipelex/build/inputs.py index d4c7bca..2e24207 100644 --- a/api/routes/pipelex/build/inputs.py +++ b/api/routes/pipelex/build/inputs.py @@ -23,7 +23,7 @@ class BuildInputsResponse(BaseModel): @router.post("/build/inputs", response_model=BuildInputsResponse) -async def build_inputs(request_data: BuildInputsRequest): +async def generate_inputs_json(request_data: BuildInputsRequest): """Generate example input JSON for a pipe. 
This endpoint generates a JSON object with example values for all pipe inputs From f5286c169ebb2435fc9535e7c09a606d8534c870 Mon Sep 17 00:00:00 2001 From: thomashebrard Date: Tue, 27 Jan 2026 10:50:12 +0100 Subject: [PATCH 4/5] pipelex init --- .pipelex/inference/deck/base_deck.toml | 101 ++++++----- .pipelex/inference/routing_profiles.toml | 4 + .pipelex/pipelex.toml | 222 +++++++++++++---------- 3 files changed, 181 insertions(+), 146 deletions(-) diff --git a/.pipelex/inference/deck/base_deck.toml b/.pipelex/inference/deck/base_deck.toml index 63ab39e..5481065 100644 --- a/.pipelex/inference/deck/base_deck.toml +++ b/.pipelex/inference/deck/base_deck.toml @@ -13,38 +13,41 @@ #################################################################################################### #################################################################################################### -# Aliases +# LLM Choices #################################################################################################### -[aliases] -base-claude = "claude-4.5-sonnet" -base-gpt = "gpt-4o" -base-gemini = "gemini-2.5-flash" -base-mistral = "mistral-medium" -base-groq = "llama-3.3-70b-versatile" -base-grok = "grok-4-fast-non-reasoning" +# Default llm choices for a PipeLLM generation. -best-gpt = "gpt-5.1" -best-claude = "claude-4.5-opus" -best-gemini = "gemini-3.0-pro" -best-mistral = "mistral-medium" - -# Groq-specific aliases -fast-groq = "llama-3.1-8b-instant" -vision-groq = "llama-4-scout-17b-16e-instruct" +[llm.choice_defaults] +for_text = "cheap_llm" +for_object = "cheap_llm_for_structured" -# Image generation aliases -base-img-gen = "flux-pro/v1.1" -best-img-gen = "flux-2" -fast-img-gen = "fast-lightning-sdxl" #################################################################################################### -# Waterfalls +# Waterfalls : list of models that will be tried in order until one of them succeeds. 
#################################################################################################### [waterfalls] - # --- Waterfalls for LLMs --------------------------------------------------------------------- +cheap_llm = [ + "gpt-4o-mini", + "gemini-2.5-flash-lite", + "mistral-small", + "claude-3-haiku", + "grok-3-mini", +] +cheap_llm_for_structured = ["gpt-4o-mini", "mistral-small", "claude-3-haiku"] +cheap_llm_for_vision = [ + "gemini-2.5-flash-lite", + "gpt-4o-mini", + "claude-3-haiku", +] +cheap_llm_for_creativity = [ + "gemini-2.5-flash", + "grok-3-mini", + "gpt-4o-mini", + "claude-4.5-haiku", +] smart_llm = [ "claude-4.5-opus", "claude-4.5-sonnet", @@ -87,25 +90,6 @@ llm_for_large_codebase = [ "gemini-2.5-flash", "grok-4-fast-non-reasoning", ] -cheap_llm = [ - "gpt-4o-mini", - "gemini-2.5-flash-lite", - "mistral-small", - "claude-3-haiku", - "grok-3-mini", -] -cheap_llm_for_vision = [ - "gemini-2.5-flash-lite", - "gpt-4o-mini", - "claude-3-haiku", -] -cheap_llm_for_structured = ["gpt-4o-mini", "mistral-small", "claude-3-haiku"] -cheap_llm_for_creativity = [ - "gemini-2.5-flash", - "grok-3-mini", - "gpt-4o-mini", - "claude-4.5-haiku", -] # --- Waterfalls for Extracts --------------------------------------------------------------------- pdf_text_extractor = [ @@ -115,6 +99,32 @@ pdf_text_extractor = [ ] image_text_extractor = ["mistral-ocr"] +#################################################################################################### +# Aliases +#################################################################################################### + +[aliases] +base-claude = "claude-4.5-sonnet" +base-gpt = "gpt-4o" +base-gemini = "gemini-2.5-flash" +base-mistral = "mistral-medium" +base-groq = "llama-3.3-70b-versatile" +base-grok = "grok-4-fast-non-reasoning" + +best-gpt = "gpt-5.1" +best-claude = "claude-4.5-opus" +best-gemini = "gemini-3.0-pro" +best-mistral = "mistral-medium" + +# Groq-specific aliases +fast-groq = "llama-3.1-8b-instant" 
+vision-groq = "llama-4-scout-17b-16e-instruct" + +# Image generation aliases +base-img-gen = "flux-2-pro" +best-img-gen = "nano-banana-pro" +fast-img-gen = "gpt-image-1-mini" + #################################################################################################### # LLM Presets #################################################################################################### @@ -160,13 +170,6 @@ llm_for_testing_gen_object = { model = "cheap_llm_for_structured", temperature = llm_for_testing_vision = { model = "cheap_llm_for_vision", temperature = 0.5 } llm_for_testing_vision_structured = { model = "cheap_llm_for_vision", temperature = 0.5 } -#################################################################################################### -# LLM Choices -#################################################################################################### - -[llm.choice_defaults] -for_text = "cheap_llm" -for_object = "cheap_llm_for_structured" #################################################################################################### # Extract Presets @@ -199,4 +202,4 @@ img_gen_for_art = { model = "best-img-gen", quality = "high", guidance_scale = 9 img_gen_for_diagram = { model = "base-img-gen", quality = "medium", guidance_scale = 7.0, is_moderated = true, safety_tolerance = 2 } img_gen_for_mockup = { model = "base-img-gen", quality = "medium", guidance_scale = 6.5, is_moderated = true, safety_tolerance = 3 } img_gen_for_product = { model = "best-img-gen", quality = "high", guidance_scale = 8.5, is_moderated = true, safety_tolerance = 2 } -img_gen_for_testing = { model = "fast-img-gen", nb_steps = 4, guidance_scale = 4.0, is_moderated = true, safety_tolerance = 4 } +img_gen_for_testing = { model = "gpt-image-1-mini", quality = "low" } diff --git a/.pipelex/inference/routing_profiles.toml b/.pipelex/inference/routing_profiles.toml index bf40281..e05f8db 100644 --- a/.pipelex/inference/routing_profiles.toml +++ 
b/.pipelex/inference/routing_profiles.toml @@ -108,6 +108,10 @@ default = "portkey" description = "Use Scaleway backend for all its supported models" default = "scaleway" +[profiles.all_vertexai] +description = "Use Vertex AI backend for all its supported models" +default = "vertexai" + [profiles.all_xai] description = "Use xAI backend for all its supported models" default = "xai" diff --git a/.pipelex/pipelex.toml b/.pipelex/pipelex.toml index 7a7ddaf..c99fd02 100644 --- a/.pipelex/pipelex.toml +++ b/.pipelex/pipelex.toml @@ -6,15 +6,15 @@ # `pipelex init config` # # Purpose: -# - This file allows to override Pipelex's default settings for specific projects -# - Feel free to modify any settings below to suit your needs -# - You can add any configuration sections that exist in the main pipelex.toml +# - This file allows you to override Pipelex's default settings for specific projects +# - All values below are set to their defaults - modify them as needed +# - The values here will override the defaults from the Pipelex package # # Finding Available Settings: # - See the full default configuration in: pipelex/pipelex.toml (in the Pipelex package) # - See the configuration structure classes in: pipelex/config.py and pipelex/cogt/config_cogt.py # -# Common customizations are proposed below, such as: +# Common customizations include: # - Logging levels and behavior # - Excluded directories for scanning # - LLM prompt dumping for debugging @@ -26,136 +26,164 @@ # #################################################################################################### +#################################################################################################### +# Pipeline Execution Config +#################################################################################################### + [pipelex.pipeline_execution_config] -# Uncomment to disable conversion of data URLs to pipelex-storage:// URIs -# is_normalize_data_urls_to_storage = false +# Set to false to 
disable conversion of incoming data URLs to pipelex-storage:// URIs +is_normalize_data_urls_to_storage = false +# Set to false to disable generation of execution graphs +is_generate_graph = true [pipelex.pipeline_execution_config.graph_config.data_inclusion] -# Uncomment to include stuff data in graph outputs: +# Control what data is included in graph outputs stuff_json_content = true stuff_text_content = true stuff_html_content = true error_stack_traces = true [pipelex.pipeline_execution_config.graph_config.graphs_inclusion] -# Uncomment to customize which graph outputs are generated (all enabled by default): -# graphspec_json = false -# mermaidflow_mmd = false -# mermaidflow_html = false -# reactflow_viewspec = false -# reactflow_html = false +# Control which graph outputs are generated +graphspec_json = true +mermaidflow_mmd = true +mermaidflow_html = true +reactflow_viewspec = true +reactflow_html = true [pipelex.pipeline_execution_config.graph_config.reactflow_config] -# Uncomment to customize ReactFlow graph rendering: -# edge_type = "bezier" # Options: "bezier", "smoothstep", "step", "straight" -# nodesep = 50 # Horizontal spacing between nodes -# ranksep = 80 # Vertical spacing between ranks/levels -# initial_zoom = 1.0 # Initial zoom level (1.0 = 100%) -# pan_to_top = true # Pan to show top of graph on load +# Customize ReactFlow graph rendering +edge_type = "bezier" # Options: "bezier", "smoothstep", "step", "straight" +nodesep = 50 # Horizontal spacing between nodes +ranksep = 30 # Vertical spacing between ranks/levels +initial_zoom = 1.0 # Initial zoom level (1.0 = 100%) +pan_to_top = true # Pan to show top of graph on load -[pipelex.storage_config] -# Storage method: "local" (default), "in_memory", "s3", or "gcp" -# method = "local" +#################################################################################################### +# Storage Config +#################################################################################################### 
+[pipelex.storage_config] +# Storage method: "local", "in_memory" (default), "s3", or "gcp" +method = "in_memory" # Whether to fetch remote HTTP URLs and store them locally -# is_fetch_remote_content_enabled = true +is_fetch_remote_content_enabled = true -# --- Local Storage Configuration --- -# Uncomment to customize local storage settings: [pipelex.storage_config.local] -# uri_format = "{primary_id}/{secondary_id}/{hash}.{extension}" -# local_storage_path = ".pipelex/storage" +# Local storage settings +uri_format = "{primary_id}/{secondary_id}/{hash}.{extension}" +local_storage_path = ".pipelex/storage" + +[pipelex.storage_config.in_memory] +# In-memory storage settings +uri_format = "{primary_id}/{secondary_id}/{hash}.{extension}" -# --- AWS S3 Storage Configuration --- -# Uncomment to use S3 storage (requires boto3: `pip install pipelex[s3]`): [pipelex.storage_config.s3] -# uri_format = "{primary_id}/{secondary_id}/{hash}.{extension}" -# bucket_name = "your-bucket-name" -# region = "us-east-1" -# signed_urls_lifespan_seconds = 3600 # Set to "disabled" for public URLs +# AWS S3 storage settings (requires boto3: `pip install pipelex[s3]`) +uri_format = "{primary_id}/{secondary_id}/{hash}.{extension}" +bucket_name = "" +region = "" +signed_urls_lifespan_seconds = 3600 # Set to "disabled" for public URLs -# --- Google Cloud Storage Configuration --- -# Uncomment to use GCP storage (requires google-cloud-storage: `pip install pipelex[gcp-storage]`): [pipelex.storage_config.gcp] -# uri_format = "{primary_id}/{secondary_id}/{hash}.{extension}" -# bucket_name = "your-bucket-name" -# project_id = "your-project-id" -# signed_urls_lifespan_seconds = 3600 # Set to "disabled" for public URLs +# Google Cloud Storage settings (requires google-cloud-storage: `pip install pipelex[gcp-storage]`) +uri_format = "{primary_id}/{secondary_id}/{hash}.{extension}" +bucket_name = "" +project_id = "" +signed_urls_lifespan_seconds = 3600 # Set to "disabled" for public URLs + 
+#################################################################################################### +# Scan Config +#################################################################################################### [pipelex.scan_config] -# Uncomment to customize the excluded directories for scanning -# excluded_dirs = [ -# ".venv", -# "venv", -# "env", -# ".env", -# "virtualenv", -# ".virtualenv", -# ".git", -# "__pycache__", -# ".pytest_cache", -# ".mypy_cache", -# ".ruff_cache", -# "node_modules", -# "results", -# ] +# Directories to exclude when scanning for pipeline files +excluded_dirs = [ + ".venv", + "venv", + "env", + ".env", + "virtualenv", + ".virtualenv", + ".git", + "__pycache__", + ".pytest_cache", + ".mypy_cache", + ".ruff_cache", + "node_modules", + "results", +] + +#################################################################################################### +# Builder Config +#################################################################################################### [pipelex.builder_config] -# Uncomment to change where the generated pipelines are saved: -# default_output_dir = "." -# default_bundle_file_name = "bundle" -# default_directory_base_name = "pipeline" +# Settings for generated pipelines +default_output_dir = "." 
+default_bundle_file_name = "bundle" +default_directory_base_name = "pipeline" -[pipelex.log_config] -# Uncomment to change the default log level: -# default_log_level = "INFO" +#################################################################################################### +# Log Config +#################################################################################################### -# Uncomment to log to stderr instead of stdout -# console_log_target = "stderr" -# console_print_target = "stderr" +[pipelex.log_config] +# Default logging level: "DEBUG", "INFO", "WARNING", "ERROR" +default_log_level = "INFO" +# Log output target: "stdout" or "stderr" +console_log_target = "stdout" +console_print_target = "stdout" [pipelex.log_config.package_log_levels] -# Uncomment to change the log level for specific packages: -# pipelex = "INFO" +# Log levels for specific packages (use "-" instead of "." in package names) +pipelex = "INFO" -[pipelex.observer_config] -# Uncomment to change the directory where the observer will save its results: -# observer_dir = "results/observer" +#################################################################################################### +# Feature Config +#################################################################################################### [pipelex.feature_config] -# WIP/Experimental feature flags: -# is_reporting_enabled = true +# WIP/Experimental feature flags +is_reporting_enabled = true + +#################################################################################################### +# Reporting Config +#################################################################################################### [pipelex.reporting_config] -# Uncomment to customize the reporting configuration: -# is_log_costs_to_console = false -# is_generate_cost_report_file_enabled = false -# cost_report_dir_path = "reports" -# cost_report_base_name = "cost_report" -# cost_report_extension = "csv" -# cost_report_unit_scale = 1.0 - 
-[cogt] +# Cost reporting settings +is_log_costs_to_console = false +is_generate_cost_report_file_enabled = false +cost_report_dir_path = "reports" +cost_report_base_name = "cost_report" +cost_report_extension = "csv" +cost_report_unit_scale = 1.0 + +#################################################################################################### +# Cogt (Cognitive Tools) Config +#################################################################################################### + [cogt.model_deck_config] -# Uncomment to disable model fallback: it will raise errors instead of using secondary model options: -# is_model_fallback_enabled = false -# Uncomment to change the reaction to missing presets: "raise" (default), "log" or "none" -# missing_presets_reaction = "raise" +# Model fallback behavior: if true, uses secondary model options when primary fails +is_model_fallback_enabled = true +# Reaction to missing presets: "raise", "log", or "none" +missing_presets_reaction = "log" [cogt.tenacity_config] -# Uncomment to change those values as needed: -# max_retries = 50 # Maximum number of retry attempts before giving up -# wait_multiplier = 0.2 # Multiplier applied to the wait time between retries (in seconds) -# wait_max = 20 # Maximum wait time between retries (in seconds) -# wait_exp_base = 1.3 # Base for exponential backoff calculation +# Retry behavior for API calls +max_retries = 50 # Maximum number of retry attempts before giving up +wait_multiplier = 0.2 # Multiplier applied to the wait time between retries (in seconds) +wait_max = 20 # Maximum wait time between retries (in seconds) +wait_exp_base = 1.3 # Base for exponential backoff calculation [cogt.llm_config] -# Uncomment any of these to enable dumping the inputs or outputs of text-generation with an LLM: -# is_dump_text_prompts_enabled = true -# is_dump_response_text_enabled = true +# Enable dumping of LLM inputs/outputs for debugging +is_dump_text_prompts_enabled = false +is_dump_response_text_enabled = 
false [cogt.llm_config.instructor_config] -# Uncomment any of these to enable dumping the kwargs, response or errors when generating structured content: -# is_dump_kwargs_enabled = true -# is_dump_response_enabled = true -# is_dump_error_enabled = true +# Enable dumping of structured content generation details for debugging +is_dump_kwargs_enabled = false +is_dump_response_enabled = false +is_dump_error_enabled = false From abb1411309910d04b2ae0d9ca7291a107f9a4185 Mon Sep 17 00:00:00 2001 From: thomashebrard Date: Tue, 27 Jan 2026 20:38:24 +0100 Subject: [PATCH 5/5] pipelex init --- .pipelex/inference/deck/1_llm_deck.toml | 79 +++++++ .pipelex/inference/deck/2_img_gen_deck.toml | 49 +++++ .pipelex/inference/deck/3_extract_deck.toml | 42 ++++ .pipelex/inference/deck/base_deck.toml | 205 ------------------ .pipelex/inference/deck/overrides.toml | 19 -- .../inference/deck/x_custom_extract_deck.toml | 39 ++++ .../inference/deck/x_custom_llm_deck.toml | 71 ++++++ .pipelex/inference/routing_profiles.toml | 41 +--- .pipelex/pipelex.toml | 2 +- api/routes/pipelex/build/__init__.py | 2 + api/routes/pipelex/build/inputs.py | 2 +- api/routes/pipelex/build/output.py | 72 ++++++ 12 files changed, 361 insertions(+), 262 deletions(-) create mode 100644 .pipelex/inference/deck/1_llm_deck.toml create mode 100644 .pipelex/inference/deck/2_img_gen_deck.toml create mode 100644 .pipelex/inference/deck/3_extract_deck.toml delete mode 100644 .pipelex/inference/deck/base_deck.toml delete mode 100644 .pipelex/inference/deck/overrides.toml create mode 100644 .pipelex/inference/deck/x_custom_extract_deck.toml create mode 100644 .pipelex/inference/deck/x_custom_llm_deck.toml create mode 100644 api/routes/pipelex/build/output.py diff --git a/.pipelex/inference/deck/1_llm_deck.toml b/.pipelex/inference/deck/1_llm_deck.toml new file mode 100644 index 0000000..6b9d8c3 --- /dev/null +++ b/.pipelex/inference/deck/1_llm_deck.toml @@ -0,0 +1,79 @@ 
+#################################################################################################### +# Pipelex Model Deck - LLM Configuration +#################################################################################################### +# +# This file defines model defaults, aliases, and presets for LLMs +# +# Model Reference Syntax: +# - Preset: $preset_name or preset:preset_name +# - Alias: @alias_name or alias:alias_name +# +# Documentation: https://docs.pipelex.com +# Support: https://go.pipelex.com/discord +# +#################################################################################################### + +#################################################################################################### +# LLM Default Choices +#################################################################################################### + +[llm.choice_defaults] +default_temperature = 0.5 +for_text = "@default-general" +for_object = "@default-general" + +#################################################################################################### +# LLM Aliases +#################################################################################################### + +[llm.aliases] +best-gpt = "gpt-5.2" +best-claude = "claude-4.5-opus" +best-gemini = "gemini-3.0-pro" +best-mistral = "mistral-large-3" + +# Default aliases (first choice from waterfalls) +default-general = "claude-4.5-sonnet" +default-premium = "claude-4.5-opus" +default-premium-vision = "claude-4.5-opus" +default-premium-structured = "claude-4.5-opus" +default-large-context-code = "gemini-3.0-pro" +default-large-context-text = "gemini-2.5-flash" +default-small = "gemini-2.5-flash-lite" +default-small-structured = "gemini-2.5-flash-lite" +default-small-vision = "gemini-2.5-flash-lite" +default-small-creative = "gemini-2.5-flash-lite" + +#################################################################################################### +# LLM Presets 
+#################################################################################################### + +[llm.presets] + +# Writing +writing-factual = { model = "@default-premium", temperature = 0.1 } +writing-creative = { model = "@default-premium", temperature = 0.9 } + +# Retrieval +retrieval = { model = "@default-large-context-text", temperature = 0.1 } + +# Engineering +engineering-structured = { model = "@default-premium-structured", temperature = 0.2 } +engineering-code = { model = "@default-premium", temperature = 0.1 } +engineering-codebase-analysis = { model = "@best-gemini", temperature = 0.1 } + +# Vision +vision = { model = "@default-premium-vision", temperature = 0.5 } +vision-cheap = { model = "@default-small-vision", temperature = 0.5 } +vision-diagram = { model = "@default-premium-vision", temperature = 0.3 } +vision-table = { model = "@default-premium-vision", temperature = 0.3 } + +# Image generation prompting +img-gen-prompting = { model = "@default-premium", temperature = 0.5 } +img-gen-prompting-cheap = { model = "@default-small", temperature = 0.5 } + +# Testing +testing-text = { model = "@default-small", temperature = 0.5 } +testing-structured = { model = "@default-small-structured", temperature = 0.1 } +testing-vision = { model = "@default-small-vision", temperature = 0.5 } +testing-vision-structured = { model = "@default-small-vision", temperature = 0.5 } diff --git a/.pipelex/inference/deck/2_img_gen_deck.toml b/.pipelex/inference/deck/2_img_gen_deck.toml new file mode 100644 index 0000000..8b1725c --- /dev/null +++ b/.pipelex/inference/deck/2_img_gen_deck.toml @@ -0,0 +1,49 @@ +#################################################################################################### +# Pipelex Model Deck - Image Generation Configuration +#################################################################################################### +# +# This file defines model aliases and presets for image generation models +# +# Model Reference Syntax: 
+# - Preset: $preset_name or preset:preset_name +# - Alias: @alias_name or alias:alias_name +# +# Documentation: https://docs.pipelex.com +# Support: https://go.pipelex.com/discord +# +#################################################################################################### + +#################################################################################################### +# Image Generation Default Choices +#################################################################################################### + +[img_gen] +default_quality = "medium" +choice_default = "$gen-image" + +#################################################################################################### +# Image Generation Aliases +#################################################################################################### + +[img_gen.aliases] +best-gpt = "gpt-image-1.5" +best-gemini = "nano-banana-pro" +best-blackforestlabs = "flux-2-pro" + +default-general = "flux-2-pro" +default-premium = "nano-banana-pro" +default-small = "gpt-image-1-mini" + +#################################################################################################### +# Image Generation Presets +#################################################################################################### + +[img_gen.presets] + +# General purpose +gen-image = { model = "@default-general", quality = "medium" } +gen-image-fast = { model = "@default-small", quality = "low" } +gen-image-high-quality = { model = "@default-premium", quality = "high" } + +# Testing +gen-image-testing = { model = "@default-small", quality = "low" } diff --git a/.pipelex/inference/deck/3_extract_deck.toml b/.pipelex/inference/deck/3_extract_deck.toml new file mode 100644 index 0000000..1092489 --- /dev/null +++ b/.pipelex/inference/deck/3_extract_deck.toml @@ -0,0 +1,42 @@ +#################################################################################################### +# Pipelex Model Deck - Document Extraction Configuration
+#################################################################################################### +# +# This file defines model aliases and presets for document extraction models, including +# extraction of text and images from documents, as well as OCR and text extraction from images. +# +# Model Reference Syntax: +# - Preset: $preset_name or preset:preset_name +# - Alias: @alias_name or alias:alias_name +# +# Documentation: https://docs.pipelex.com +# Support: https://go.pipelex.com/discord +# +#################################################################################################### + +#################################################################################################### +# Document Extraction Default Choices +#################################################################################################### + +[extract] +choice_default = "$extract-all-from-document" + +#################################################################################################### +# Aliases +#################################################################################################### + +[extract.aliases] +default-extract-document = "azure-document-intelligence" +default-extract-text-from-pdf = "pypdfium2-extract-pdf" +default-software-extract-no-inference = "pypdfium2-extract-pdf" + +#################################################################################################### +# Extract Presets +#################################################################################################### + +[extract.presets] +extract-all-from-document = { model = "@default-extract-document", max_nb_images = 100, image_min_size = 50 } +extract-text-from-pdf = { model = "@default-extract-text-from-pdf", max_nb_images = 100, image_min_size = 50 } + +# Testing +extract-testing = { model = "@default-software-extract-no-inference", max_nb_images = 100, image_min_size = 50 } diff --git a/.pipelex/inference/deck/base_deck.toml
b/.pipelex/inference/deck/base_deck.toml deleted file mode 100644 index 5481065..0000000 --- a/.pipelex/inference/deck/base_deck.toml +++ /dev/null @@ -1,205 +0,0 @@ -#################################################################################################### -# Pipelex Model Deck - Base Configuration -#################################################################################################### -# -# This file defines model aliases and presets for: -# - LLMs (language models for text generation and structured output) -# - Image generation models (for creating images from text prompts) -# - Document extraction models (OCR and text extraction from PDFs/images) -# -# Documentation: https://docs.pipelex.com -# Support: https://go.pipelex.com/discord -# -#################################################################################################### - -#################################################################################################### -# LLM Choices -#################################################################################################### - -# Default llm choices for a PipeLLM generation. - -[llm.choice_defaults] -for_text = "cheap_llm" -for_object = "cheap_llm_for_structured" - - -#################################################################################################### -# Waterfalls : list of models that will be tried in order until one of them succeeds. 
-#################################################################################################### - -[waterfalls] -# --- Waterfalls for LLMs --------------------------------------------------------------------- -cheap_llm = [ - "gpt-4o-mini", - "gemini-2.5-flash-lite", - "mistral-small", - "claude-3-haiku", - "grok-3-mini", -] -cheap_llm_for_structured = ["gpt-4o-mini", "mistral-small", "claude-3-haiku"] -cheap_llm_for_vision = [ - "gemini-2.5-flash-lite", - "gpt-4o-mini", - "claude-3-haiku", -] -cheap_llm_for_creativity = [ - "gemini-2.5-flash", - "grok-3-mini", - "gpt-4o-mini", - "claude-4.5-haiku", -] -smart_llm = [ - "claude-4.5-opus", - "claude-4.5-sonnet", - "gemini-3.0-pro", - "gpt-5.1", - "claude-4.1-opus", - "gemini-2.5-pro", - "claude-4-sonnet", - "grok-4-fast-non-reasoning", -] -smart_llm_with_vision = [ - "claude-4.5-opus", - "claude-4.5-sonnet", - "gemini-3.0-pro", - "gpt-5.1", - "claude-4.1-opus", - "gemini-2.5-pro", - "claude-4-sonnet", - "grok-4-fast-non-reasoning", -] -smart_llm_for_structured = [ - "claude-4.5-opus", - "claude-4.5-sonnet", - "gemini-3.0-pro", - "gpt-5.1", - "claude-4.1-opus", - "claude-4-sonnet", -] -llm_for_creativity = [ - "claude-4.5-opus", - "claude-4.1-opus", - "gemini-2.5-pro", - "gpt-5.1", -] -llm_for_large_codebase = [ - "gemini-2.5-pro", - "claude-4.5-sonnet", - "gemini-3.0-pro", - "gpt-5.1", - "gemini-2.5-flash", - "grok-4-fast-non-reasoning", -] - -# --- Waterfalls for Extracts --------------------------------------------------------------------- -pdf_text_extractor = [ - "azure-document-intelligence", - "mistral-ocr", - "pypdfium2-extract-pdf", -] -image_text_extractor = ["mistral-ocr"] - -#################################################################################################### -# Aliases -#################################################################################################### - -[aliases] -base-claude = "claude-4.5-sonnet" -base-gpt = "gpt-4o" -base-gemini = "gemini-2.5-flash" -base-mistral 
= "mistral-medium" -base-groq = "llama-3.3-70b-versatile" -base-grok = "grok-4-fast-non-reasoning" - -best-gpt = "gpt-5.1" -best-claude = "claude-4.5-opus" -best-gemini = "gemini-3.0-pro" -best-mistral = "mistral-medium" - -# Groq-specific aliases -fast-groq = "llama-3.1-8b-instant" -vision-groq = "llama-4-scout-17b-16e-instruct" - -# Image generation aliases -base-img-gen = "flux-2-pro" -best-img-gen = "nano-banana-pro" -fast-img-gen = "gpt-image-1-mini" - -#################################################################################################### -# LLM Presets -#################################################################################################### - -[llm.presets] - -# LLM Presets — Specific skills ------------------------------------------------------------- - -# Generation skills -llm_for_factual_writing = { model = "base-gpt", temperature = 0.1 } -llm_for_creative_writing = { model = "base-gpt", temperature = 0.9 } -llm_for_writing_cheap = { model = "gpt-4o-mini", temperature = 0.3 } - -# Retrieve and answer questions skills -llm_to_answer_questions_cheap = { model = "gpt-4o-mini", temperature = 0.3 } -llm_to_answer_questions = { model = "base-claude", temperature = 0.3 } -llm_to_retrieve = { model = "base-claude", temperature = 0.1 } - -# Engineering skills -llm_to_engineer = { model = "smart_llm_for_structured", temperature = 0.2 } -llm_to_code = { model = "base-claude", temperature = 0.1 } -llm_to_analyze_large_codebase = { model = "base-claude", temperature = 0.1 } - -# Vision skills -llm_for_img_to_text_cheap = { model = "gpt-4o-mini", temperature = 0.1 } -llm_for_img_to_text = { model = "base-claude", temperature = 0.1 } -llm_for_diagram_to_text = { model = "best-claude", temperature = 0.3 } -llm_for_table_to_text = { model = "base-claude", temperature = 0.3 } - -# Image generation prompting skills -llm_to_prompt_img_gen = { model = "base-claude", temperature = 0.2 } -llm_to_prompt_img_gen_cheap = { model = "gpt-4o-mini", 
temperature = 0.5 } - -# Groq-specific presets (fast inference, low cost) -llm_groq_fast_text = { model = "fast-groq", temperature = 0.7 } -llm_groq_balanced = { model = "base-groq", temperature = 0.5 } -llm_groq_vision = { model = "vision-groq", temperature = 0.3 } - -# LLM Presets — For Testing --------------------------------------------------------------------- - -llm_for_testing_gen_text = { model = "cheap_llm", temperature = 0.5 } -llm_for_testing_gen_object = { model = "cheap_llm_for_structured", temperature = 0.1 } -llm_for_testing_vision = { model = "cheap_llm_for_vision", temperature = 0.5 } -llm_for_testing_vision_structured = { model = "cheap_llm_for_vision", temperature = 0.5 } - - -#################################################################################################### -# Extract Presets -#################################################################################################### - -[extract] -choice_default = "extract_ocr_from_document" - -[extract.presets] -extract_ocr_from_document = { model = "azure-document-intelligence", max_nb_images = 100, image_min_size = 50 } -extract_basic_from_pdf = { model = "pypdfium2-extract-pdf", max_nb_images = 100, image_min_size = 50 } - -#################################################################################################### -# Image Generation Presets -#################################################################################################### - -[img_gen] -choice_default = "gen_image_basic" - -[img_gen.presets] - -# General purpose -gen_image_basic = { model = "base-img-gen", quality = "medium", guidance_scale = 7.5, is_moderated = true, safety_tolerance = 3 } -gen_image_fast = { model = "fast-img-gen", nb_steps = 4, guidance_scale = 5.0, is_moderated = true, safety_tolerance = 3 } -gen_image_high_quality = { model = "best-img-gen", quality = "high", guidance_scale = 8.0, is_moderated = true, safety_tolerance = 3 } -gen_image_openai_low_quality = { model = "gpt-image-1", 
quality = "low" } - -# Specific skills -img_gen_for_art = { model = "best-img-gen", quality = "high", guidance_scale = 9.0, is_moderated = false, safety_tolerance = 5 } -img_gen_for_diagram = { model = "base-img-gen", quality = "medium", guidance_scale = 7.0, is_moderated = true, safety_tolerance = 2 } -img_gen_for_mockup = { model = "base-img-gen", quality = "medium", guidance_scale = 6.5, is_moderated = true, safety_tolerance = 3 } -img_gen_for_product = { model = "best-img-gen", quality = "high", guidance_scale = 8.5, is_moderated = true, safety_tolerance = 2 } -img_gen_for_testing = { model = "gpt-image-1-mini", quality = "low" } diff --git a/.pipelex/inference/deck/overrides.toml b/.pipelex/inference/deck/overrides.toml deleted file mode 100644 index 08814db..0000000 --- a/.pipelex/inference/deck/overrides.toml +++ /dev/null @@ -1,19 +0,0 @@ -#################################################################################################### -# Pipelex Model Deck - Overrides -#################################################################################################### -# -# This file allows you to override the default model choices defined in base_deck.toml. -# You can customize presets for LLMs, image generation, and document extraction models. 
-# -# Documentation: https://docs.pipelex.com -# Support: https://go.pipelex.com/discord -# -#################################################################################################### - -#################################################################################################### -# LLM Deck overrides -#################################################################################################### - -[llm.choice_overrides] -for_text = "disabled" -for_object = "disabled" diff --git a/.pipelex/inference/deck/x_custom_extract_deck.toml b/.pipelex/inference/deck/x_custom_extract_deck.toml new file mode 100644 index 0000000..c3c3b9f --- /dev/null +++ b/.pipelex/inference/deck/x_custom_extract_deck.toml @@ -0,0 +1,39 @@ +#################################################################################################### +# Pipelex Model Deck - Custom Configurations for Document Extraction Models +#################################################################################################### +# +# This file allows you to override or complete the base model decks. +# +# ADVANCED USERS ONLY: This file is for users who bring their own API keys and connect directly +# to AI providers (Azure, Mistral, etc.) without using the Pipelex Gateway. +# +# If you're using the standard Pipelex Gateway setup, you don't need to modify this file. +# The Gateway handles model routing automatically and supports all available models. +# +# Waterfalls are useful when using multiple backends directly - they define ordered lists +# of models that are resolved at configuration time based on which backends are available. +# This enables defining pipelines that work across different environments with varying +# backend configurations. 
+# +# Documentation: https://docs.pipelex.com +# Support: https://go.pipelex.com/discord +# +#################################################################################################### + + +#################################################################################################### +# Waterfalls — ordered lists of models resolved at configuration time by backend availability +# +# Example (uncomment to use): +# [extract.waterfalls] +# document_extractor = ["azure-document-intelligence", "mistral-document-ai-2505"] +# pdf_text_extractor = [ +# "azure-document-intelligence", +# "mistral-document-ai-2505", +# "pypdfium2-extract-pdf", +# ] +# image_text_extractor = [ +# "azure-document-intelligence", +# "mistral-document-ai-2505", +# ] +#################################################################################################### diff --git a/.pipelex/inference/deck/x_custom_llm_deck.toml b/.pipelex/inference/deck/x_custom_llm_deck.toml new file mode 100644 index 0000000..eb47480 --- /dev/null +++ b/.pipelex/inference/deck/x_custom_llm_deck.toml @@ -0,0 +1,71 @@ +#################################################################################################### +# Pipelex Model Deck - Custom Configurations for LLMs +#################################################################################################### +# +# This file allows you to override or complete the base model decks. +# +# ADVANCED USERS ONLY: This file is for users who bring their own API keys and connect directly +# to AI providers (OpenAI, Anthropic, Google, etc.) without using the Pipelex Gateway. +# +# If you're using the standard Pipelex Gateway setup, you don't need to modify this file. +# The Gateway handles model routing automatically and supports all available models. +# +# Waterfalls are useful when using multiple backends directly - they define ordered lists +# of models that are resolved at configuration time based on which backends are available. 
+# This enables defining pipelines that work across different environments with varying +# backend configurations. +# +# Documentation: https://docs.pipelex.com +# Support: https://go.pipelex.com/discord +# +#################################################################################################### + +#################################################################################################### +# LLM Deck overrides +#################################################################################################### + +[llm.choice_overrides] +for_text = "disabled" +for_object = "disabled" + + +#################################################################################################### +# Waterfalls — ordered lists of models resolved at configuration time by backend availability +# +# Example (uncomment to use): +# [llm.waterfalls] +# premium-llm = ["claude-4.5-opus", "gemini-3.0-pro", "gpt-5.2", "grok-4"] +# premium-llm-vision = [ +# "claude-4.5-opus", +# "gemini-3.0-pro", +# "gpt-5.2", +# "grok-4-fast-reasoning", +# ] +# premium-llm-structured = [ +# "claude-4.5-opus", +# "gemini-3.0-pro", +# "gpt-5.2", +# "grok-4", +# ] +# large-context-llm-code = [ +# "gemini-3.0-pro", +# "claude-4.5-opus", +# "gpt-5.2", +# "grok-4-fast-reasoning", +# ] +# large-context-llm-text = ["gemini-2.5-flash", "claude-4.5-sonnet"] +# small-llm = [ +# "gemini-2.5-flash-lite", +# "gpt-4o-mini", +# "claude-3-haiku", +# "phi-4", +# "grok-3-mini", +# ] +# small-llm-structured = [ +# "gemini-2.5-flash-lite", +# "gpt-4o-mini", +# "claude-3-haiku", +# ] +# small-llm-vision = ["gemini-2.5-flash-lite", "gpt-4o-mini", "claude-3-haiku"] +# small-llm-creative = ["gemini-2.5-flash-lite", "gpt-4o-mini", "claude-3-haiku"] +#################################################################################################### diff --git a/.pipelex/inference/routing_profiles.toml b/.pipelex/inference/routing_profiles.toml index e05f8db..59a77db 100644 --- 
a/.pipelex/inference/routing_profiles.toml +++ b/.pipelex/inference/routing_profiles.toml @@ -9,49 +9,18 @@ # ========================================================================================= # Which profile to use (change this to switch routing) -active = "pipelex_gateway_first" +active = "all_pipelex_gateway" -# We recommend using the "pipelex_gateway_first" profile to get a head start with all models. +# We recommend using the "all_pipelex_gateway" profile to get a head start with all models. # To use the Pipelex Gateway backend: -# 1. Join our Discord community to get your free API key (no credit card required): -# Visit https://go.pipelex.com/discord and request your key in the appropriate channel -# 2. Set the environment variable (or add it to your .env file): -# - Linux/macOS: export PIPELEX_GATEWAY_API_KEY="your-api-key" -# - Windows CMD: set PIPELEX_GATEWAY_API_KEY=your-api-key -# - Windows PowerShell: $env:PIPELEX_GATEWAY_API_KEY="your-api-key" -# 3. The .pipelex/inference/backends.toml is already configured with api_key = "${PIPELEX_GATEWAY_API_KEY}" -# which will get the key from the environment variable. +# 1. Get your API key at https://app.pipelex.com (free credits included) +# 2. Add it to your .env file: PIPELEX_GATEWAY_API_KEY=your-key-here +# 3. 
Run `pipelex init` and accept the Gateway terms of service # ========================================================================================= # Routing Profiles # ========================================================================================= -[profiles.pipelex_gateway_first] -description = "Use Pipelex Gateway backend for all its supported models" -default = "pipelex_gateway" -fallback_order = [ - "pipelex_gateway", - "azure_openai", - "bedrock", - "google", - "blackboxai", - "mistral", - "fal", -] - -[profiles.pipelex_gateway_first.routes] -# Pattern matching: "model-pattern" = "backend-name" - -[profiles.pipelex_gateway_first.optional_routes] # Each optional route is considered only if its backend is available -"gpt-*" = "pipelex_gateway" -"gpt-image-1" = "openai" -"claude-*" = "pipelex_gateway" -"grok-*" = "pipelex_gateway" -"gemini-*" = "pipelex_gateway" -"*-sdxl" = "fal" -"flux-*" = "fal" -"mistral-ocr" = "mistral" - [profiles.all_pipelex_gateway] description = "Use Pipelex Gateway for all its supported models" default = "pipelex_gateway" diff --git a/.pipelex/pipelex.toml b/.pipelex/pipelex.toml index c99fd02..5787db6 100644 --- a/.pipelex/pipelex.toml +++ b/.pipelex/pipelex.toml @@ -32,7 +32,7 @@ [pipelex.pipeline_execution_config] # Set to false to disable conversion of incoming data URLs to pipelex-storage:// URIs -is_normalize_data_urls_to_storage = false +is_normalize_data_urls_to_storage = true # Set to false to disable generation of execution graphs is_generate_graph = true diff --git a/api/routes/pipelex/build/__init__.py b/api/routes/pipelex/build/__init__.py index 335e222..3e7e8d3 100644 --- a/api/routes/pipelex/build/__init__.py +++ b/api/routes/pipelex/build/__init__.py @@ -1,11 +1,13 @@ from fastapi import APIRouter from .inputs import router as inputs_router +from .output import router as output_router from .pipe import router as pipe_router from .runner import router as runner_router router = APIRouter() 
from typing import Any

from fastapi import APIRouter, HTTPException
from fastapi.responses import JSONResponse
from pipelex.core.concepts.concept_representation_generator import ConceptRepresentationFormat
from pipelex.hub import get_library_manager, get_required_pipe, set_current_library
from pipelex.pipeline.validate_bundle import validate_bundle
from pydantic import BaseModel, Field

router = APIRouter(tags=["build"])


class BuildOutputRequest(BaseModel):
    """Request body for ``POST /build/output``."""

    # Raw PLX bundle text; it is validated and loaded before the pipe lookup.
    plx_content: str = Field(..., description="PLX content to load pipes from")
    # Code of the pipe whose output structure should be rendered.
    pipe_code: str = Field(..., description="Pipe code to generate output JSON for")


class BuildOutputResponse(BaseModel):
    """Response body for ``POST /build/output``."""

    # Rendered example of the pipe's output.
    output_json: dict[str, Any] = Field(..., description="Generated output JSON object")
    # Echo of the pipe code received in the request.
    pipe_code: str = Field(..., description="Pipe code that was used")
    success: bool = Field(default=True, description="Whether the operation was successful")
    message: str = Field(default="Output JSON generated successfully", description="Status message")


@router.post("/build/output", response_model=BuildOutputResponse)
async def build_output(request_data: BuildOutputRequest):
    """Render an example output JSON object for one pipe of a PLX bundle.

    Steps:
        1. Validate ``request_data.plx_content`` and keep the first blueprint.
        2. Open a library, make it current and load that blueprint into it.
        3. Look up ``request_data.pipe_code`` and render its output spec as JSON.

    Returns:
        A ``JSONResponse`` carrying the dumped ``BuildOutputResponse``.

    Raises:
        HTTPException: status 500 for any exception raised during the steps
            above; ``detail`` holds the exception class name and its message.
    """
    # Fetched outside the ``try``: a failure here propagates as-is instead of
    # being converted into the structured 500 below.
    manager = get_library_manager()

    try:
        validation = await validate_bundle(plx_content=request_data.plx_content)
        # Only the first blueprint of the validated bundle is loaded.
        first_blueprint = validation.blueprints[0]

        # NOTE(review): the library opened here is not closed in this function;
        # confirm whether the library manager reclaims it elsewhere.
        opened = manager.open_library()
        lib_id, _ = opened
        set_current_library(lib_id)
        manager.load_from_blueprints(library_id=lib_id, blueprints=[first_blueprint])

        pipe = get_required_pipe(pipe_code=request_data.pipe_code)

        # Prefer the "content" entry of the rendered spec; when that key is
        # absent the whole rendered dict is returned instead.
        rendered = pipe.output.render_stuff_spec(ConceptRepresentationFormat.JSON)
        example_output = rendered.get("content", rendered)

        payload = BuildOutputResponse(
            output_json=example_output,
            pipe_code=request_data.pipe_code,
            success=True,
            message="Output JSON generated successfully",
        ).model_dump(serialize_as_any=True)

        return JSONResponse(content=payload)

    except Exception as exc:
        # Every failure is reported as a 500 with a structured detail payload.
        error_detail = {
            "error_type": type(exc).__name__,
            "message": str(exc),
        }
        raise HTTPException(status_code=500, detail=error_detail) from exc