From 16dbc2da7e51160f78187b1c99553388e59cd1a1 Mon Sep 17 00:00:00 2001 From: "johnwayne.jiang" Date: Fri, 23 May 2025 11:34:22 -0700 Subject: [PATCH 1/8] improve data_tempalte --- .github/workflows/publish_testpypi.yaml | 4 +- internal | 2 +- .../function_calling/sample_run.ipynb | 206 +++++- prebuilt_template/generate_by_topic/README.md | 102 +++ .../generate_by_topic/sample_run.ipynb | 438 +++++++++++ src/starfish/data_gen_template/core.py | 36 +- .../starfish/function_calling/generator.py | 16 + .../starfish/generate_by_topic/generator.py | 21 + tests/data_factory/factory/data_factory.ipynb | 681 ++++++++++++++++++ 9 files changed, 1495 insertions(+), 11 deletions(-) create mode 100644 prebuilt_template/generate_by_topic/README.md create mode 100644 prebuilt_template/generate_by_topic/sample_run.ipynb create mode 100644 tests/data_factory/factory/data_factory.ipynb diff --git a/.github/workflows/publish_testpypi.yaml b/.github/workflows/publish_testpypi.yaml index b3da109..b22dbf2 100644 --- a/.github/workflows/publish_testpypi.yaml +++ b/.github/workflows/publish_testpypi.yaml @@ -88,13 +88,13 @@ jobs: --ExecutePreprocessor.timeout=120 \ --no-prompt --no-input \ --stdout \ - tests/data_factory/factory/test_resume_index_1.ipynb; then + tests/data_factory/factory/data_factory.ipynb; then echo "::error::Notebook execution failed" fi echo "Notebook executed successfully. 
Summary:" && \ jupyter nbconvert --to markdown --stdout \ - tests/data_factory/factory/test_resume_index_1.ipynb | \ + tests/data_factory/factory/data_factory.ipynb | \ grep -E '^#|^##' || true # Add tag deletion step diff --git a/internal b/internal index 0cb9662..9a7ccce 160000 --- a/internal +++ b/internal @@ -1 +1 @@ -Subproject commit 0cb9662b18a4da964ebde579e6b95c53ebfc700c +Subproject commit 9a7ccce145ab67429334c8f2fa24e444df149cd5 diff --git a/prebuilt_template/function_calling/sample_run.ipynb b/prebuilt_template/function_calling/sample_run.ipynb index c4934d2..1195856 100644 --- a/prebuilt_template/function_calling/sample_run.ipynb +++ b/prebuilt_template/function_calling/sample_run.ipynb @@ -2,7 +2,7 @@ "cells": [ { "cell_type": "code", - "execution_count": 2, + "execution_count": 1, "metadata": {}, "outputs": [], "source": [ @@ -38,6 +38,206 @@ "loaded = data_gen_template.get(\"starfish/generate_func_call_dataset\")\n" ] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "get the template input_data schema and example" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\u001b[32m2025-05-23 11:08:41\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[1mPlease run the template with this input schema\u001b[0m\n", + "\u001b[32m2025-05-23 11:08:41\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[1m{\n", + " \"$defs\": {\n", + " \"APIContract\": {\n", + " \"description\": \"Pydantic model representing an API contract structure.\",\n", + " \"properties\": {\n", + " \"name\": {\n", + " \"title\": \"Name\",\n", + " \"type\": \"string\"\n", + " },\n", + " \"description\": {\n", + " \"title\": \"Description\",\n", + " \"type\": \"string\"\n", + " },\n", + " \"parameters\": {\n", + " \"additionalProperties\": {\n", + " \"$ref\": \"#/$defs/ParameterDefinition\"\n", + " },\n", + " \"title\": \"Parameters\",\n", + " \"type\": \"object\"\n", + " }\n", + " 
},\n", + " \"required\": [\n", + " \"name\",\n", + " \"description\",\n", + " \"parameters\"\n", + " ],\n", + " \"title\": \"APIContract\",\n", + " \"type\": \"object\"\n", + " },\n", + " \"ParameterDefinition\": {\n", + " \"description\": \"Pydantic model representing parameter definition in an API contract.\",\n", + " \"properties\": {\n", + " \"type\": {\n", + " \"title\": \"Type\",\n", + " \"type\": \"string\"\n", + " },\n", + " \"description\": {\n", + " \"title\": \"Description\",\n", + " \"type\": \"string\"\n", + " },\n", + " \"required\": {\n", + " \"default\": true,\n", + " \"title\": \"Required\",\n", + " \"type\": \"boolean\"\n", + " }\n", + " },\n", + " \"required\": [\n", + " \"type\",\n", + " \"description\"\n", + " ],\n", + " \"title\": \"ParameterDefinition\",\n", + " \"type\": \"object\"\n", + " }\n", + " },\n", + " \"description\": \"Input schema for the generate_by_topic template.\\n\\nIMPORTANT: This Pydantic model is the single source of truth for default values.\\nThe validation and default values are controlled by this model, not the function signature.\",\n", + " \"properties\": {\n", + " \"num_records\": {\n", + " \"anyOf\": [\n", + " {\n", + " \"type\": \"integer\"\n", + " },\n", + " {\n", + " \"type\": \"null\"\n", + " }\n", + " ],\n", + " \"default\": 10,\n", + " \"title\": \"Num Records\"\n", + " },\n", + " \"api_contract\": {\n", + " \"$ref\": \"#/$defs/APIContract\"\n", + " },\n", + " \"topic_model_name\": {\n", + " \"default\": \"openai/gpt-4o-mini\",\n", + " \"title\": \"Topic Model Name\",\n", + " \"type\": \"string\"\n", + " },\n", + " \"topic_model_kwargs\": {\n", + " \"anyOf\": [\n", + " {\n", + " \"additionalProperties\": true,\n", + " \"type\": \"object\"\n", + " },\n", + " {\n", + " \"type\": \"null\"\n", + " }\n", + " ],\n", + " \"default\": null,\n", + " \"title\": \"Topic Model Kwargs\"\n", + " },\n", + " \"generation_model_name\": {\n", + " \"default\": \"openai/gpt-4o-mini\",\n", + " \"title\": \"Generation Model 
Name\",\n", + " \"type\": \"string\"\n", + " },\n", + " \"generation_model_kwargs\": {\n", + " \"anyOf\": [\n", + " {\n", + " \"additionalProperties\": true,\n", + " \"type\": \"object\"\n", + " },\n", + " {\n", + " \"type\": \"null\"\n", + " }\n", + " ],\n", + " \"default\": null,\n", + " \"title\": \"Generation Model Kwargs\"\n", + " },\n", + " \"data_factory_config\": {\n", + " \"anyOf\": [\n", + " {\n", + " \"additionalProperties\": true,\n", + " \"type\": \"object\"\n", + " },\n", + " {\n", + " \"type\": \"null\"\n", + " }\n", + " ],\n", + " \"default\": {},\n", + " \"title\": \"Data Factory Config\"\n", + " }\n", + " },\n", + " \"required\": [\n", + " \"api_contract\"\n", + " ],\n", + " \"title\": \"GenerateFuncCallDataSet\",\n", + " \"type\": \"object\"\n", + "}\u001b[0m\n" + ] + } + ], + "source": [ + "loaded.print_schema()" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\u001b[32m2025-05-23 11:09:02\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[1mHere is an example with api_contract.name as weather_api.get_current_weather\u001b[0m\n", + "\u001b[32m2025-05-23 11:09:02\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[1m{\n", + " \"num_records\": 4,\n", + " \"api_contract\": {\n", + " \"name\": \"weather_api.get_current_weather\",\n", + " \"description\": \"Retrieves the current weather conditions for a specified location .\",\n", + " \"parameters\": {\n", + " \"location\": {\n", + " \"type\": \"string\",\n", + " \"description\": \"The name of the city or geographic location .\",\n", + " \"required\": true\n", + " },\n", + " \"units\": {\n", + " \"type\": \"string\",\n", + " \"description\": \"The units for temperature measurement( e.g., 'Celsius', 'Fahrenheit') .\",\n", + " \"required\": false\n", + " }\n", + " }\n", + " },\n", + " \"topic_model_name\": \"openai/gpt-4\",\n", + " \"topic_model_kwargs\": {\n", + " \"temperature\": 0.7\n", + " },\n", + " 
\"generation_model_name\": \"openai/gpt-4o-mini\",\n", + " \"generation_model_kwargs\": {\n", + " \"temperature\": 0.8,\n", + " \"max_tokens\": 200\n", + " },\n", + " \"data_factory_config\": {\n", + " \"max_concurrency\": 24,\n", + " \"task_runner_timeout\": 120\n", + " }\n", + "}\u001b[0m\n" + ] + } + ], + "source": [ + "loaded.print_example()" + ] + }, { "cell_type": "code", "execution_count": 5, @@ -203,7 +403,7 @@ ], "metadata": { "kernelspec": { - "display_name": "starfish-core-T7IInzTH-py3.11", + "display_name": ".venv", "language": "python", "name": "python3" }, @@ -217,7 +417,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.11.7" + "version": "3.11.4" } }, "nbformat": 4, diff --git a/prebuilt_template/generate_by_topic/README.md b/prebuilt_template/generate_by_topic/README.md new file mode 100644 index 0000000..9d5dc10 --- /dev/null +++ b/prebuilt_template/generate_by_topic/README.md @@ -0,0 +1,102 @@ + +## Overview +The `generate_by_topic` template is designed to create diverse synthetic data across multiple topics based on user instructions. It can automatically generate relevant topics if not provided and handles deduplication across generated content. 
+ +## Key Features +- Automatic topic generation based on user instructions +- Customizable number of records and records per topic +- Built-in deduplication mechanism +- Flexible output schema configuration +- Parallel data generation with configurable concurrency + +## Input Schema +```python +class GenerateByTopicInput(BaseModel): + user_instruction: Optional[str] = None + num_records: Optional[int] = 10 + records_per_topic: int = 10 + topics: Optional[List[Union[str, Dict[str, int]]]] = None + topic_model_name: str = "openai/gpt-4o-mini" + topic_model_kwargs: Optional[Dict[str, Any]] = None + generation_model_name: str = "openai/gpt-4o-mini" + generation_model_kwargs: Optional[Dict[str, Any]] = None + output_schema: Optional[Union[List[Dict[str, Any]], Dict[str, Any], type]] = [ + {"name": "question", "type": "str"}, + {"name": "answer", "type": "str"} + ] + data_factory_config: Optional[Dict[str, Any]] = {} +``` + +## Parameters +| Parameter | Type | Description | Default | +|-----------|------|-------------|---------| +| `user_instruction` | str | Instruction for data generation | None | +| `num_records` | int | Total number of records to generate | 10 | +| `records_per_topic` | int | Number of records per topic | 10 | +| `topics` | List[Union[str, Dict[str, int]]] | List of topics; an entry may be a `{topic: count}` dict to set the number of records for that topic | None | +| `topic_model_name` | str | Model name for topic generation | "openai/gpt-4o-mini" | +| `topic_model_kwargs` | Dict[str, Any] | Additional parameters for topic model | None | +| `generation_model_name` | str | Model name for data generation | "openai/gpt-4o-mini" | +| `generation_model_kwargs` | Dict[str, Any] | Additional parameters for generation model | None | +| `output_schema` | Union[List[Dict[str, Any]], Dict[str, Any], type] | Schema for generated data | [{"name": "question", "type": "str"}, {"name": "answer", "type": "str"}] | +| `data_factory_config` | Dict[str, Any] | Configuration for data generation process | {} | + +## 
Example Usage +```python +{ + "user_instruction": "Generate Q&A pairs about machine learning concepts", + "num_records": 100, + "records_per_topic": 5, + "topics": [ + "supervised learning", + "unsupervised learning", + {"reinforcement learning": 3}, + "neural networks", + ], + "topic_model_name": "openai/gpt-4", + "topic_model_kwargs": {"temperature": 0.7}, + "generation_model_name": "openai/gpt-4", + "generation_model_kwargs": {"temperature": 0.8, "max_tokens": 200}, + "output_schema": [ + {"name": "question", "type": "str"}, + {"name": "answer", "type": "str"}, + {"name": "difficulty", "type": "str"}, + ], + "data_factory_config": {"max_concurrency": 4, "task_runner_timeout": 60 * 2}, +} +``` + +## Workflow +1. Topic Preparation: + - If topics are not provided, generates relevant topics based on user instruction + - Shuffles topics for better distribution and deduplication + +2. Data Generation: + - Generates data for each topic using the specified model + - Implements deduplication by tracking previously generated examples + - Adds topic information to each generated record + +## Output +The generated data will include: +- Fields specified in the output schema +- An additional `topic` field indicating the topic of each record + +## Dependencies +- `starfish` framework +- `pydantic` for input validation + + +## Sample Run + +Check out [`sample_run.ipynb`](./sample_run.ipynb) for a complete example you can run right away. + +## Source Implementation + +The actual template code is located at: +``` +src/starfish/data_gen_template/templates/starfish/generate_by_topic/ +``` + +--- + +**Try it out!** If you have any questions, let us know - we'd be happy to help. If you like this template, consider starring the repo and building your own! We welcome community contributions and are always happy to chat about new ideas. 
⭐ \ No newline at end of file diff --git a/prebuilt_template/generate_by_topic/sample_run.ipynb b/prebuilt_template/generate_by_topic/sample_run.ipynb new file mode 100644 index 0000000..a55a46e --- /dev/null +++ b/prebuilt_template/generate_by_topic/sample_run.ipynb @@ -0,0 +1,438 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "from starfish import data_gen_template" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "['starfish/generate_func_call_dataset', 'starfish/generate_by_topic']" + ] + }, + "execution_count": 2, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "data_gen_template.list()" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [], + "source": [ + "loaded = data_gen_template.get(\"starfish/generate_by_topic\")\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "get the template input_data schema and example" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\u001b[32m2025-05-23 11:23:57\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[1mPlease run the template with this input schema\u001b[0m\n", + "\u001b[32m2025-05-23 11:23:57\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[1m{\n", + " \"description\": \"Input schema for the generate_by_topic template.\\n\\nIMPORTANT: This Pydantic model is the single source of truth for default values.\\nThe validation and default values are controlled by this model, not the function signature.\",\n", + " \"properties\": {\n", + " \"user_instruction\": {\n", + " \"anyOf\": [\n", + " {\n", + " \"type\": \"string\"\n", + " },\n", + " {\n", + " \"type\": \"null\"\n", + " }\n", + " ],\n", + " \"default\": null,\n", + " \"title\": \"User Instruction\"\n", + " },\n", + " 
\"num_records\": {\n", + " \"anyOf\": [\n", + " {\n", + " \"type\": \"integer\"\n", + " },\n", + " {\n", + " \"type\": \"null\"\n", + " }\n", + " ],\n", + " \"default\": 10,\n", + " \"title\": \"Num Records\"\n", + " },\n", + " \"records_per_topic\": {\n", + " \"default\": 10,\n", + " \"title\": \"Records Per Topic\",\n", + " \"type\": \"integer\"\n", + " },\n", + " \"topics\": {\n", + " \"anyOf\": [\n", + " {\n", + " \"items\": {\n", + " \"anyOf\": [\n", + " {\n", + " \"type\": \"string\"\n", + " },\n", + " {\n", + " \"additionalProperties\": {\n", + " \"type\": \"integer\"\n", + " },\n", + " \"type\": \"object\"\n", + " }\n", + " ]\n", + " },\n", + " \"type\": \"array\"\n", + " },\n", + " {\n", + " \"type\": \"null\"\n", + " }\n", + " ],\n", + " \"default\": null,\n", + " \"title\": \"Topics\"\n", + " },\n", + " \"topic_model_name\": {\n", + " \"default\": \"openai/gpt-4o-mini\",\n", + " \"title\": \"Topic Model Name\",\n", + " \"type\": \"string\"\n", + " },\n", + " \"topic_model_kwargs\": {\n", + " \"anyOf\": [\n", + " {\n", + " \"additionalProperties\": true,\n", + " \"type\": \"object\"\n", + " },\n", + " {\n", + " \"type\": \"null\"\n", + " }\n", + " ],\n", + " \"default\": null,\n", + " \"title\": \"Topic Model Kwargs\"\n", + " },\n", + " \"generation_model_name\": {\n", + " \"default\": \"openai/gpt-4o-mini\",\n", + " \"title\": \"Generation Model Name\",\n", + " \"type\": \"string\"\n", + " },\n", + " \"generation_model_kwargs\": {\n", + " \"anyOf\": [\n", + " {\n", + " \"additionalProperties\": true,\n", + " \"type\": \"object\"\n", + " },\n", + " {\n", + " \"type\": \"null\"\n", + " }\n", + " ],\n", + " \"default\": null,\n", + " \"title\": \"Generation Model Kwargs\"\n", + " },\n", + " \"output_schema\": {\n", + " \"anyOf\": [\n", + " {\n", + " \"items\": {\n", + " \"additionalProperties\": true,\n", + " \"type\": \"object\"\n", + " },\n", + " \"type\": \"array\"\n", + " },\n", + " {\n", + " \"additionalProperties\": true,\n", + " \"type\": 
\"object\"\n", + " },\n", + " {\n", + " \"type\": \"null\"\n", + " }\n", + " ],\n", + " \"default\": [\n", + " {\n", + " \"name\": \"question\",\n", + " \"type\": \"str\"\n", + " },\n", + " {\n", + " \"name\": \"answer\",\n", + " \"type\": \"str\"\n", + " }\n", + " ],\n", + " \"title\": \"Output Schema\"\n", + " },\n", + " \"data_factory_config\": {\n", + " \"anyOf\": [\n", + " {\n", + " \"additionalProperties\": true,\n", + " \"type\": \"object\"\n", + " },\n", + " {\n", + " \"type\": \"null\"\n", + " }\n", + " ],\n", + " \"default\": {},\n", + " \"title\": \"Data Factory Config\"\n", + " }\n", + " },\n", + " \"title\": \"GenerateByTopicInput\",\n", + " \"type\": \"object\"\n", + "}\u001b[0m\n" + ] + } + ], + "source": [ + "loaded.print_schema()" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\u001b[32m2025-05-23 11:24:01\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[1mHere is an example with api_contract.name as weather_api.get_current_weather\u001b[0m\n", + "\u001b[32m2025-05-23 11:24:01\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[1m{\n", + " \"user_instruction\": \"Generate Q&A pairs about machine learning concepts\",\n", + " \"num_records\": 100,\n", + " \"records_per_topic\": 5,\n", + " \"topics\": [\n", + " \"supervised learning\",\n", + " \"unsupervised learning\",\n", + " {\"reinforcement learning\": 3}, # This means generate 3 records for this topic\n", + " \"neural networks\",\n", + " ],\n", + " \"topic_model_name\": \"openai/gpt-4\",\n", + " \"topic_model_kwargs\": {\"temperature\": 0.7},\n", + " \"generation_model_name\": \"openai/gpt-4\",\n", + " \"generation_model_kwargs\": {\"temperature\": 0.8, \"max_tokens\": 200},\n", + " \"output_schema\": [\n", + " {\"name\": \"question\", \"type\": \"str\"},\n", + " {\"name\": \"answer\", \"type\": \"str\"},\n", + " {\"name\": \"difficulty\", \"type\": \"str\"}, # Added an additional field\n", + " 
],\n", + " \"data_factory_config\": {\"max_concurrency\": 4, \"task_runner_timeout\": 60 * 2},\n", + " }\u001b[0m\n" + ] + } + ], + "source": [ + "loaded.print_example()" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "🌟 Function Calling Dataset Generation Pipeline\n", + "============================================================\n", + "📋 Process Overview:\n", + " 1. Calculate optimal data distribution\n", + " 2. Generate diverse topics\n", + " 3. Create subtopics for each topic\n", + " 4. Generate query-answer pairs\n", + " 5. Verify and validate generated data\n", + " 6. Regenerate failed cases\n", + "============================================================\n", + "📊 Data Distribution Plan:\n", + " • Requested: 10 records\n", + " • Distribution: 1 topics × 1 subtopics × 10 records\n", + " • Total generation: 10 records\n", + " • API calls needed: 3\n", + "\n", + "🎯 Step 1: Generating diverse topics...\n", + " ✅ Generated 1 topics\n", + "\n", + "🌿 Step 2: Creating subtopics for each topic...\n", + "\u001b[32m2025-05-23 00:27:04\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[1m\u001b[1m[JOB START]\u001b[0m \u001b[36mMaster Job ID: e6763e50-6438-4df5-81a9-5a68ce3f8468\u001b[0m | \u001b[33mLogging progress every 3 seconds\u001b[0m\u001b[0m\n", + "\u001b[32m2025-05-23 00:27:04\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[1m[JOB PROGRESS] \u001b[32mCompleted: 0/1\u001b[0m | \u001b[33mRunning: 1\u001b[0m | \u001b[36mAttempted: 0\u001b[0m (\u001b[32mCompleted: 0\u001b[0m, \u001b[31mFailed: 0\u001b[0m, \u001b[35mFiltered: 0\u001b[0m, \u001b[34mDuplicate: 0\u001b[0m, \u001b[1;31mInDeadQueue: 0\u001b[0m)\u001b[0m\n", + "\u001b[32m2025-05-23 00:27:06\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[1m[JOB FINISHED] \u001b[1mFinal Status:\u001b[0m \u001b[32mCompleted: 1/1\u001b[0m | \u001b[33mAttempted: 1\u001b[0m (Failed: 0, Filtered: 0, Duplicate: 0, 
InDeadQueue: 0)\u001b[0m\n", + " ✅ Generated 1 subtopics total\n", + "\n", + "💬 Step 3: Generating query-answer pairs...\n", + "\u001b[32m2025-05-23 00:27:06\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[1m\u001b[1m[JOB START]\u001b[0m \u001b[36mMaster Job ID: 1931c5c8-c1f3-4268-98b7-1a5295b8abf2\u001b[0m | \u001b[33mLogging progress every 3 seconds\u001b[0m\u001b[0m\n", + "\u001b[32m2025-05-23 00:27:06\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[1m[JOB PROGRESS] \u001b[32mCompleted: 0/1\u001b[0m | \u001b[33mRunning: 1\u001b[0m | \u001b[36mAttempted: 0\u001b[0m (\u001b[32mCompleted: 0\u001b[0m, \u001b[31mFailed: 0\u001b[0m, \u001b[35mFiltered: 0\u001b[0m, \u001b[34mDuplicate: 0\u001b[0m, \u001b[1;31mInDeadQueue: 0\u001b[0m)\u001b[0m\n", + "\u001b[32m2025-05-23 00:27:09\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[1m[JOB PROGRESS] \u001b[32mCompleted: 0/1\u001b[0m | \u001b[33mRunning: 1\u001b[0m | \u001b[36mAttempted: 0\u001b[0m (\u001b[32mCompleted: 0\u001b[0m, \u001b[31mFailed: 0\u001b[0m, \u001b[35mFiltered: 0\u001b[0m, \u001b[34mDuplicate: 0\u001b[0m, \u001b[1;31mInDeadQueue: 0\u001b[0m)\u001b[0m\n", + "\u001b[32m2025-05-23 00:27:12\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[1m[JOB PROGRESS] \u001b[32mCompleted: 0/1\u001b[0m | \u001b[33mRunning: 1\u001b[0m | \u001b[36mAttempted: 0\u001b[0m (\u001b[32mCompleted: 0\u001b[0m, \u001b[31mFailed: 0\u001b[0m, \u001b[35mFiltered: 0\u001b[0m, \u001b[34mDuplicate: 0\u001b[0m, \u001b[1;31mInDeadQueue: 0\u001b[0m)\u001b[0m\n", + "\u001b[32m2025-05-23 00:27:15\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[1m[JOB PROGRESS] \u001b[32mCompleted: 0/1\u001b[0m | \u001b[33mRunning: 1\u001b[0m | \u001b[36mAttempted: 0\u001b[0m (\u001b[32mCompleted: 0\u001b[0m, \u001b[31mFailed: 0\u001b[0m, \u001b[35mFiltered: 0\u001b[0m, \u001b[34mDuplicate: 0\u001b[0m, \u001b[1;31mInDeadQueue: 0\u001b[0m)\u001b[0m\n", + "\u001b[32m2025-05-23 00:27:18\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[1m[JOB PROGRESS] \u001b[32mCompleted: 0/1\u001b[0m | 
\u001b[33mRunning: 1\u001b[0m | \u001b[36mAttempted: 0\u001b[0m (\u001b[32mCompleted: 0\u001b[0m, \u001b[31mFailed: 0\u001b[0m, \u001b[35mFiltered: 0\u001b[0m, \u001b[34mDuplicate: 0\u001b[0m, \u001b[1;31mInDeadQueue: 0\u001b[0m)\u001b[0m\n", + "\u001b[32m2025-05-23 00:27:21\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[1m[JOB PROGRESS] \u001b[32mCompleted: 0/1\u001b[0m | \u001b[33mRunning: 1\u001b[0m | \u001b[36mAttempted: 0\u001b[0m (\u001b[32mCompleted: 0\u001b[0m, \u001b[31mFailed: 0\u001b[0m, \u001b[35mFiltered: 0\u001b[0m, \u001b[34mDuplicate: 0\u001b[0m, \u001b[1;31mInDeadQueue: 0\u001b[0m)\u001b[0m\n", + "\u001b[32m2025-05-23 00:27:24\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[1m[JOB PROGRESS] \u001b[32mCompleted: 0/1\u001b[0m | \u001b[33mRunning: 1\u001b[0m | \u001b[36mAttempted: 0\u001b[0m (\u001b[32mCompleted: 0\u001b[0m, \u001b[31mFailed: 0\u001b[0m, \u001b[35mFiltered: 0\u001b[0m, \u001b[34mDuplicate: 0\u001b[0m, \u001b[1;31mInDeadQueue: 0\u001b[0m)\u001b[0m\n", + "\u001b[32m2025-05-23 00:27:27\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[1m[JOB PROGRESS] \u001b[32mCompleted: 0/1\u001b[0m | \u001b[33mRunning: 1\u001b[0m | \u001b[36mAttempted: 0\u001b[0m (\u001b[32mCompleted: 0\u001b[0m, \u001b[31mFailed: 0\u001b[0m, \u001b[35mFiltered: 0\u001b[0m, \u001b[34mDuplicate: 0\u001b[0m, \u001b[1;31mInDeadQueue: 0\u001b[0m)\u001b[0m\n", + "\u001b[32m2025-05-23 00:27:28\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[1m[JOB FINISHED] \u001b[1mFinal Status:\u001b[0m \u001b[32mCompleted: 1/1\u001b[0m | \u001b[33mAttempted: 1\u001b[0m (Failed: 0, Filtered: 0, Duplicate: 0, InDeadQueue: 0)\u001b[0m\n", + " ✅ Generated 10 initial query-answer pairs\n", + "\n", + "🔍 Step 4: Verifying data quality...\n", + "\u001b[32m2025-05-23 00:27:28\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[1m\u001b[1m[JOB START]\u001b[0m \u001b[36mMaster Job ID: f036c07c-1cd2-4690-be92-bac359e45544\u001b[0m | \u001b[33mLogging progress every 3 seconds\u001b[0m\u001b[0m\n", + 
"\u001b[32m2025-05-23 00:27:28\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[1m[JOB PROGRESS] \u001b[32mCompleted: 0/10\u001b[0m | \u001b[33mRunning: 10\u001b[0m | \u001b[36mAttempted: 0\u001b[0m (\u001b[32mCompleted: 0\u001b[0m, \u001b[31mFailed: 0\u001b[0m, \u001b[35mFiltered: 0\u001b[0m, \u001b[34mDuplicate: 0\u001b[0m, \u001b[1;31mInDeadQueue: 0\u001b[0m)\u001b[0m\n", + "\u001b[32m2025-05-23 00:27:31\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[1m[JOB PROGRESS] \u001b[32mCompleted: 0/10\u001b[0m | \u001b[33mRunning: 10\u001b[0m | \u001b[36mAttempted: 0\u001b[0m (\u001b[32mCompleted: 0\u001b[0m, \u001b[31mFailed: 0\u001b[0m, \u001b[35mFiltered: 0\u001b[0m, \u001b[34mDuplicate: 0\u001b[0m, \u001b[1;31mInDeadQueue: 0\u001b[0m)\u001b[0m\n", + "\u001b[32m2025-05-23 00:27:34\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[1m[JOB PROGRESS] \u001b[32mCompleted: 9/10\u001b[0m | \u001b[33mRunning: 1\u001b[0m | \u001b[36mAttempted: 9\u001b[0m (\u001b[32mCompleted: 9\u001b[0m, \u001b[31mFailed: 0\u001b[0m, \u001b[35mFiltered: 0\u001b[0m, \u001b[34mDuplicate: 0\u001b[0m, \u001b[1;31mInDeadQueue: 0\u001b[0m)\u001b[0m\n", + "\u001b[32m2025-05-23 00:27:35\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[1m[JOB FINISHED] \u001b[1mFinal Status:\u001b[0m \u001b[32mCompleted: 10/10\u001b[0m | \u001b[33mAttempted: 10\u001b[0m (Failed: 0, Filtered: 0, Duplicate: 0, InDeadQueue: 0)\u001b[0m\n", + "\u001b[32m2025-05-23 00:27:35\u001b[0m | \u001b[33m\u001b[1mWARNING \u001b[0m | \u001b[33m\u001b[1mCannot serialize function for resume due to unsupported type: cannot pickle '_hashlib.HMAC' object\u001b[0m\n", + " ✅ Quality check complete: 9 passed, 1 failed\n", + "\n", + "🔄 Step 5: Regenerating failed cases...\n", + "\u001b[32m2025-05-23 00:27:35\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[1m\u001b[1m[JOB START]\u001b[0m \u001b[36mMaster Job ID: 3d6183a2-e465-4807-9e18-cbb84dc0d28f\u001b[0m | \u001b[33mLogging progress every 3 seconds\u001b[0m\u001b[0m\n", + "\u001b[32m2025-05-23 
00:27:35\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[1m[JOB PROGRESS] \u001b[32mCompleted: 0/1\u001b[0m | \u001b[33mRunning: 1\u001b[0m | \u001b[36mAttempted: 0\u001b[0m (\u001b[32mCompleted: 0\u001b[0m, \u001b[31mFailed: 0\u001b[0m, \u001b[35mFiltered: 0\u001b[0m, \u001b[34mDuplicate: 0\u001b[0m, \u001b[1;31mInDeadQueue: 0\u001b[0m)\u001b[0m\n", + "\u001b[32m2025-05-23 00:27:37\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[1m[JOB FINISHED] \u001b[1mFinal Status:\u001b[0m \u001b[32mCompleted: 1/1\u001b[0m | \u001b[33mAttempted: 1\u001b[0m (Failed: 0, Filtered: 0, Duplicate: 0, InDeadQueue: 0)\u001b[0m\n", + "\u001b[32m2025-05-23 00:27:37\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[1m\u001b[1m[JOB START]\u001b[0m \u001b[36mMaster Job ID: 8754bec6-25e3-40bd-9743-f2763fc1091f\u001b[0m | \u001b[33mLogging progress every 3 seconds\u001b[0m\u001b[0m\n", + "\u001b[32m2025-05-23 00:27:37\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[1m[JOB PROGRESS] \u001b[32mCompleted: 0/1\u001b[0m | \u001b[33mRunning: 1\u001b[0m | \u001b[36mAttempted: 0\u001b[0m (\u001b[32mCompleted: 0\u001b[0m, \u001b[31mFailed: 0\u001b[0m, \u001b[35mFiltered: 0\u001b[0m, \u001b[34mDuplicate: 0\u001b[0m, \u001b[1;31mInDeadQueue: 0\u001b[0m)\u001b[0m\n", + "\u001b[32m2025-05-23 00:27:40\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[1m[JOB PROGRESS] \u001b[32mCompleted: 0/1\u001b[0m | \u001b[33mRunning: 1\u001b[0m | \u001b[36mAttempted: 0\u001b[0m (\u001b[32mCompleted: 0\u001b[0m, \u001b[31mFailed: 0\u001b[0m, \u001b[35mFiltered: 0\u001b[0m, \u001b[34mDuplicate: 0\u001b[0m, \u001b[1;31mInDeadQueue: 0\u001b[0m)\u001b[0m\n", + "\u001b[32m2025-05-23 00:27:41\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[1m[JOB FINISHED] \u001b[1mFinal Status:\u001b[0m \u001b[32mCompleted: 1/1\u001b[0m | \u001b[33mAttempted: 1\u001b[0m (Failed: 0, Filtered: 0, Duplicate: 0, InDeadQueue: 0)\u001b[0m\n", + "\u001b[32m2025-05-23 00:27:41\u001b[0m | \u001b[33m\u001b[1mWARNING \u001b[0m | \u001b[33m\u001b[1mCannot serialize function for 
resume due to unsupported type: cannot pickle '_hashlib.HMAC' object\u001b[0m\n", + " ✅ Regenerated 1 pairs, 1 still failing\n", + "\u001b[32m2025-05-23 00:27:41\u001b[0m | \u001b[33m\u001b[1mWARNING \u001b[0m | \u001b[33m\u001b[1mSome data still failing after regeneration - prompts may need improvement\u001b[0m\n", + "🎯 Perfect! Generated exactly 10 records as requested\n", + "\n", + "🎉 Generation Complete!\n", + "============================================================\n", + "📈 Final Results:\n", + " • Records generated: 10\n", + " • Success rate: 10/10 (100.0%)\n", + " • Distribution used: 1T × 1S × 10R\n", + "\n", + "⭐ If you found this helpful, please consider starring our repo!\n", + " Your support means the world to us! 🌟\n", + "============================================================\n" + ] + } + ], + "source": [ + "input_data = {\n", + " \"user_instruction\": \"Generate Q&A pairs about machine learning concepts\",\n", + " \"num_records\": 100,\n", + " \"records_per_topic\": 5,\n", + " \"topics\": [\n", + " \"supervised learning\",\n", + " \"unsupervised learning\",\n", + " {\"reinforcement learning\": 3}, # This means generate 3 records for this topic\n", + " \"neural networks\",\n", + " ],\n", + " \"topic_model_name\": \"openai/gpt-4\",\n", + " \"topic_model_kwargs\": {\"temperature\": 0.7},\n", + " \"generation_model_name\": \"openai/gpt-4\",\n", + " \"generation_model_kwargs\": {\"temperature\": 0.8, \"max_tokens\": 200},\n", + " \"output_schema\": [\n", + " {\"name\": \"question\", \"type\": \"str\"},\n", + " {\"name\": \"answer\", \"type\": \"str\"},\n", + " {\"name\": \"difficulty\", \"type\": \"str\"}, # Added an additional field\n", + " ],\n", + " \"data_factory_config\": {\"max_concurrency\": 4, \"task_runner_timeout\": 60 * 2},\n", + " }\n", + "data = await loaded.run(input_data=input_data)" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + 
"[{'query': 'Can you check the current weather in Toronto and Rome? Use Fahrenheit for both locations.',\n", + " 'answer': [{'name': 'weather_api.get_current_weather',\n", + " 'arguments': {'location': 'Toronto', 'units': 'Fahrenheit'}},\n", + " {'name': 'weather_api.get_current_weather',\n", + " 'arguments': {'location': 'Rome', 'units': 'Fahrenheit'}}]},\n", + " {'query': 'Get me the current weather in Mumbai and also in Johannesburg, please use Fahrenheit for both.',\n", + " 'answer': [{'name': 'weather_api.get_current_weather',\n", + " 'arguments': {'location': 'Mumbai', 'units': 'Fahrenheit'}},\n", + " {'name': 'weather_api.get_current_weather',\n", + " 'arguments': {'location': 'Johannesburg', 'units': 'Fahrenheit'}}]},\n", + " {'query': 'I need the current weather for Sydney and London. What are the temperatures in Celsius?',\n", + " 'answer': [{'name': 'weather_api.get_current_weather',\n", + " 'arguments': {'location': 'Sydney', 'units': 'Celsius'}},\n", + " {'name': 'weather_api.get_current_weather',\n", + " 'arguments': {'location': 'London', 'units': 'Celsius'}}]},\n", + " {'query': 'Please find the current weather in Buenos Aires and Cape Town, using Celsius for Buenos Aires.',\n", + " 'answer': [{'name': 'weather_api.get_current_weather',\n", + " 'arguments': {'location': 'Buenos Aires', 'units': 'Celsius'}},\n", + " {'name': 'weather_api.get_current_weather',\n", + " 'arguments': {'location': 'Cape Town'}}]},\n", + " {'query': 'What’s the weather like in Moscow? Also, can you get the current conditions in Beijing?',\n", + " 'answer': [{'name': 'weather_api.get_current_weather',\n", + " 'arguments': {'location': 'Moscow'}},\n", + " {'name': 'weather_api.get_current_weather',\n", + " 'arguments': {'location': 'Beijing'}}]},\n", + " {'query': 'Can you tell me the current weather in Tokyo and in Los Angeles? 
Please provide both in Fahrenheit.',\n", + " 'answer': [{'name': 'weather_api.get_current_weather',\n", + " 'arguments': {'location': 'Tokyo', 'units': 'Fahrenheit'}},\n", + " {'name': 'weather_api.get_current_weather',\n", + " 'arguments': {'location': 'Los Angeles', 'units': 'Fahrenheit'}}]},\n", + " {'query': 'Please provide the current weather for Berlin and Cairo, using Celsius for Berlin and no specific unit for Cairo.',\n", + " 'answer': [{'name': 'weather_api.get_current_weather',\n", + " 'arguments': {'location': 'Berlin', 'units': 'Celsius'}},\n", + " {'name': 'weather_api.get_current_weather',\n", + " 'arguments': {'location': 'Cairo'}}]},\n", + " {'query': 'I need the current weather in Seattle and in Santiago. Use Fahrenheit for Seattle and Celsius for Santiago.',\n", + " 'answer': [{'name': 'weather_api.get_current_weather',\n", + " 'arguments': {'location': 'Seattle', 'units': 'Fahrenheit'}},\n", + " {'name': 'weather_api.get_current_weather',\n", + " 'arguments': {'location': 'Santiago', 'units': 'Celsius'}}]},\n", + " {'query': \"What's the current temperature in San Francisco? Can you also check the weather in Paris?\",\n", + " 'answer': [{'name': 'weather_api.get_current_weather',\n", + " 'arguments': {'location': 'San Francisco'}},\n", + " {'name': 'weather_api.get_current_weather',\n", + " 'arguments': {'location': 'Paris'}}]},\n", + " {'query': 'What is the current weather in New York City? 
And can you also provide the temperature in Celsius?',\n", + " 'answer': [{'name': 'weather_api.get_current_weather',\n", + " 'arguments': {'location': 'New York City', 'units': 'Celsius'}}]}]" + ] + }, + "execution_count": 6, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "data" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": ".venv", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.4" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/src/starfish/data_gen_template/core.py b/src/starfish/data_gen_template/core.py index 5b1417c..deddde1 100644 --- a/src/starfish/data_gen_template/core.py +++ b/src/starfish/data_gen_template/core.py @@ -4,10 +4,13 @@ import pydantic import importlib.util import ast -from typing import Any, Union, List, Dict +from typing import Any, Union, List, Dict, get_type_hints import inspect - +import json from starfish.data_gen_template.utils.errors import DataTemplateValueError, ImportModuleError, ImportPackageError +from starfish.common.logger import get_logger + +logger = get_logger(__name__) def _check_dependencies(dependencies: list[str]) -> None: @@ -40,11 +43,21 @@ class Template: """Class representing a single template instance.""" def __init__( - self, name: str, func: callable, input_schema: type, output_schema: type, description: str, author: str, starfish_version: str, dependencies: list[str] + self, + name: str, + func: callable, + input_schema: type, + input_example: str, + output_schema: type, + description: str, + author: str, + starfish_version: str, + dependencies: list[str], ): self.name = name self.func = func self.input_schema = input_schema + self.input_example = input_example self.output_schema = output_schema 
self.description = description self.author = author @@ -113,6 +126,17 @@ async def run(self, *args, **kwargs) -> Any: return result + def print_schema(self): + type_hints = get_type_hints(self.func) + input_schema = type_hints.get("input_data").schema() + # Pretty print the schema + logger.info("Please run the template with this input schema") + logger.info(json.dumps(input_schema, indent=4)) + + def print_example(self): + logger.info("Here is an example with api_contract.name as weather_api.get_current_weather") + logger.info(self.input_example) # Pretty print with 4-space indentation + def _get_validated_model(self, args, kwargs): """Convert input arguments into a validated Pydantic model instance.""" # Case 1: User passed a model instance directly @@ -221,14 +245,16 @@ def get(template_name: str) -> Template: return data_gen_template._template_instance_registry[template_name] @staticmethod - def register(name: str, input_schema: type, output_schema: type, description: str, author: str, starfish_version: str, dependencies: list): + def register( + name: str, input_schema: type, input_example: str, output_schema: type, description: str, author: str, starfish_version: str, dependencies: list + ): """Decorator factory for registering data templates.""" def decorator(func: callable): # Check if this is an import call (function already has _is_template flag) if name not in data_gen_template._template_instance_registry: data_gen_template._template_instance_registry[name] = Template( - name, func, input_schema, output_schema, description, author, starfish_version, dependencies + name, func, input_schema, input_example, output_schema, description, author, starfish_version, dependencies ) return func diff --git a/src/starfish/data_gen_template/templates/starfish/function_calling/generator.py b/src/starfish/data_gen_template/templates/starfish/function_calling/generator.py index 837dd86..325169d 100644 --- 
a/src/starfish/data_gen_template/templates/starfish/function_calling/generator.py +++ b/src/starfish/data_gen_template/templates/starfish/function_calling/generator.py @@ -67,6 +67,22 @@ class GenerateFuncCallDataSet(BaseModel): author="Wendao Liu", starfish_version="0.1.3", dependencies=[], + input_example="""{ + "num_records": 4, + "api_contract": { + "name": "weather_api.get_current_weather", + "description": "Retrieves the current weather conditions for a specified location .", + "parameters": { + "location": {"type": "string", "description": "The name of the city or geographic location .", "required": True}, + "units": {"type": "string", "description": "The units for temperature measurement( e.g., 'Celsius', 'Fahrenheit') .", "required": False}, + }, + }, + "topic_model_name": "openai/gpt-4", + "topic_model_kwargs": {"temperature": 0.7}, + "generation_model_name": "openai/gpt-4o-mini", + "generation_model_kwargs": {"temperature": 0.8, "max_tokens": 200}, + "data_factory_config": {"max_concurrency": 24, "task_runner_timeout": 60 * 2}, + }""", ) async def api_contract_workflow(input_data: GenerateFuncCallDataSet): api_contract = input_data.api_contract.model_dump() diff --git a/src/starfish/data_gen_template/templates/starfish/generate_by_topic/generator.py b/src/starfish/data_gen_template/templates/starfish/generate_by_topic/generator.py index 4f97c51..a13d059 100644 --- a/src/starfish/data_gen_template/templates/starfish/generate_by_topic/generator.py +++ b/src/starfish/data_gen_template/templates/starfish/generate_by_topic/generator.py @@ -41,6 +41,27 @@ class GenerateByTopicInput(BaseModel): author="Wendao Liu", starfish_version="0.1.3", dependencies=[], + input_example="""{ + "user_instruction": "Generate Q&A pairs about machine learning concepts", + "num_records": 100, + "records_per_topic": 5, + "topics": [ + "supervised learning", + "unsupervised learning", + {"reinforcement learning": 3}, # This means generate 3 records for this topic + "neural 
networks", + ], + "topic_model_name": "openai/gpt-4", + "topic_model_kwargs": {"temperature": 0.7}, + "generation_model_name": "openai/gpt-4", + "generation_model_kwargs": {"temperature": 0.8, "max_tokens": 200}, + "output_schema": [ + {"name": "question", "type": "str"}, + {"name": "answer", "type": "str"}, + {"name": "difficulty", "type": "str"}, # Added an additional field + ], + "data_factory_config": {"max_concurrency": 4, "task_runner_timeout": 60 * 2}, + }""", ) async def generate_by_topic(input_data: GenerateByTopicInput): """ diff --git a/tests/data_factory/factory/data_factory.ipynb b/tests/data_factory/factory/data_factory.ipynb new file mode 100644 index 0000000..bac6c16 --- /dev/null +++ b/tests/data_factory/factory/data_factory.ipynb @@ -0,0 +1,681 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Google Colab Version: [Open this notebook in Google Colab](https://colab.research.google.com/github/starfishdata/starfish/blob/main/examples/data_factory.ipynb)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### Dependencies " + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Requirement already satisfied: starfish-core in /Users/zhengisamazing/Library/Caches/pypoetry/virtualenvs/starfish-T7IInzTH-py3.11/lib/python3.11/site-packages (0.1.0)\n", + "Requirement already satisfied: aiofiles<25.0.0,>=24.1.0 in /Users/zhengisamazing/Library/Caches/pypoetry/virtualenvs/starfish-T7IInzTH-py3.11/lib/python3.11/site-packages (from starfish-core) (24.1.0)\n", + "Requirement already satisfied: aiosqlite<0.22.0,>=0.21.0 in /Users/zhengisamazing/Library/Caches/pypoetry/virtualenvs/starfish-T7IInzTH-py3.11/lib/python3.11/site-packages (from starfish-core) (0.21.0)\n", + "Requirement already satisfied: cachetools<6.0.0,>=5.5.2 in 
/Users/zhengisamazing/Library/Caches/pypoetry/virtualenvs/starfish-T7IInzTH-py3.11/lib/python3.11/site-packages (from starfish-core) (5.5.2)\n", + "Requirement already satisfied: litellm<2.0.0,>=1.65.1 in /Users/zhengisamazing/Library/Caches/pypoetry/virtualenvs/starfish-T7IInzTH-py3.11/lib/python3.11/site-packages (from starfish-core) (1.65.1)\n", + "Requirement already satisfied: loguru<0.8.0,>=0.7.3 in /Users/zhengisamazing/Library/Caches/pypoetry/virtualenvs/starfish-T7IInzTH-py3.11/lib/python3.11/site-packages (from starfish-core) (0.7.3)\n", + "Requirement already satisfied: ollama<0.5.0,>=0.4.7 in /Users/zhengisamazing/Library/Caches/pypoetry/virtualenvs/starfish-T7IInzTH-py3.11/lib/python3.11/site-packages (from starfish-core) (0.4.7)\n", + "Requirement already satisfied: platformdirs<5.0.0,>=4.1.0 in /Users/zhengisamazing/Library/Caches/pypoetry/virtualenvs/starfish-T7IInzTH-py3.11/lib/python3.11/site-packages (from starfish-core) (4.3.7)\n", + "Requirement already satisfied: psutil<8.0.0,>=7.0.0 in /Users/zhengisamazing/Library/Caches/pypoetry/virtualenvs/starfish-T7IInzTH-py3.11/lib/python3.11/site-packages (from starfish-core) (7.0.0)\n", + "Requirement already satisfied: python-dotenv<2.0.0,>=1.1.0 in /Users/zhengisamazing/Library/Caches/pypoetry/virtualenvs/starfish-T7IInzTH-py3.11/lib/python3.11/site-packages (from starfish-core) (1.1.0)\n", + "Requirement already satisfied: typing-extensions<5.0.0,>=4.0.0 in /Users/zhengisamazing/Library/Caches/pypoetry/virtualenvs/starfish-T7IInzTH-py3.11/lib/python3.11/site-packages (from starfish-core) (4.13.0)\n", + "Requirement already satisfied: aiohttp in /Users/zhengisamazing/Library/Caches/pypoetry/virtualenvs/starfish-T7IInzTH-py3.11/lib/python3.11/site-packages (from litellm<2.0.0,>=1.65.1->starfish-core) (3.11.16)\n", + "Requirement already satisfied: click in /Users/zhengisamazing/Library/Caches/pypoetry/virtualenvs/starfish-T7IInzTH-py3.11/lib/python3.11/site-packages (from 
litellm<2.0.0,>=1.65.1->starfish-core) (8.1.8)\n", + "Requirement already satisfied: httpx>=0.23.0 in /Users/zhengisamazing/Library/Caches/pypoetry/virtualenvs/starfish-T7IInzTH-py3.11/lib/python3.11/site-packages (from litellm<2.0.0,>=1.65.1->starfish-core) (0.28.1)\n", + "Requirement already satisfied: importlib-metadata>=6.8.0 in /Users/zhengisamazing/Library/Caches/pypoetry/virtualenvs/starfish-T7IInzTH-py3.11/lib/python3.11/site-packages (from litellm<2.0.0,>=1.65.1->starfish-core) (8.6.1)\n", + "Requirement already satisfied: jinja2<4.0.0,>=3.1.2 in /Users/zhengisamazing/Library/Caches/pypoetry/virtualenvs/starfish-T7IInzTH-py3.11/lib/python3.11/site-packages (from litellm<2.0.0,>=1.65.1->starfish-core) (3.1.6)\n", + "Requirement already satisfied: jsonschema<5.0.0,>=4.22.0 in /Users/zhengisamazing/Library/Caches/pypoetry/virtualenvs/starfish-T7IInzTH-py3.11/lib/python3.11/site-packages (from litellm<2.0.0,>=1.65.1->starfish-core) (4.23.0)\n", + "Requirement already satisfied: openai>=1.68.2 in /Users/zhengisamazing/Library/Caches/pypoetry/virtualenvs/starfish-T7IInzTH-py3.11/lib/python3.11/site-packages (from litellm<2.0.0,>=1.65.1->starfish-core) (1.70.0)\n", + "Requirement already satisfied: pydantic<3.0.0,>=2.0.0 in /Users/zhengisamazing/Library/Caches/pypoetry/virtualenvs/starfish-T7IInzTH-py3.11/lib/python3.11/site-packages (from litellm<2.0.0,>=1.65.1->starfish-core) (2.11.1)\n", + "Requirement already satisfied: tiktoken>=0.7.0 in /Users/zhengisamazing/Library/Caches/pypoetry/virtualenvs/starfish-T7IInzTH-py3.11/lib/python3.11/site-packages (from litellm<2.0.0,>=1.65.1->starfish-core) (0.9.0)\n", + "Requirement already satisfied: tokenizers in /Users/zhengisamazing/Library/Caches/pypoetry/virtualenvs/starfish-T7IInzTH-py3.11/lib/python3.11/site-packages (from litellm<2.0.0,>=1.65.1->starfish-core) (0.21.1)\n", + "Requirement already satisfied: anyio in 
/Users/zhengisamazing/Library/Caches/pypoetry/virtualenvs/starfish-T7IInzTH-py3.11/lib/python3.11/site-packages (from httpx>=0.23.0->litellm<2.0.0,>=1.65.1->starfish-core) (4.9.0)\n", + "Requirement already satisfied: certifi in /Users/zhengisamazing/Library/Caches/pypoetry/virtualenvs/starfish-T7IInzTH-py3.11/lib/python3.11/site-packages (from httpx>=0.23.0->litellm<2.0.0,>=1.65.1->starfish-core) (2025.1.31)\n", + "Requirement already satisfied: httpcore==1.* in /Users/zhengisamazing/Library/Caches/pypoetry/virtualenvs/starfish-T7IInzTH-py3.11/lib/python3.11/site-packages (from httpx>=0.23.0->litellm<2.0.0,>=1.65.1->starfish-core) (1.0.7)\n", + "Requirement already satisfied: idna in /Users/zhengisamazing/Library/Caches/pypoetry/virtualenvs/starfish-T7IInzTH-py3.11/lib/python3.11/site-packages (from httpx>=0.23.0->litellm<2.0.0,>=1.65.1->starfish-core) (3.10)\n", + "Requirement already satisfied: h11<0.15,>=0.13 in /Users/zhengisamazing/Library/Caches/pypoetry/virtualenvs/starfish-T7IInzTH-py3.11/lib/python3.11/site-packages (from httpcore==1.*->httpx>=0.23.0->litellm<2.0.0,>=1.65.1->starfish-core) (0.14.0)\n", + "Requirement already satisfied: zipp>=3.20 in /Users/zhengisamazing/Library/Caches/pypoetry/virtualenvs/starfish-T7IInzTH-py3.11/lib/python3.11/site-packages (from importlib-metadata>=6.8.0->litellm<2.0.0,>=1.65.1->starfish-core) (3.21.0)\n", + "Requirement already satisfied: MarkupSafe>=2.0 in /Users/zhengisamazing/Library/Caches/pypoetry/virtualenvs/starfish-T7IInzTH-py3.11/lib/python3.11/site-packages (from jinja2<4.0.0,>=3.1.2->litellm<2.0.0,>=1.65.1->starfish-core) (3.0.2)\n", + "Requirement already satisfied: attrs>=22.2.0 in /Users/zhengisamazing/Library/Caches/pypoetry/virtualenvs/starfish-T7IInzTH-py3.11/lib/python3.11/site-packages (from jsonschema<5.0.0,>=4.22.0->litellm<2.0.0,>=1.65.1->starfish-core) (25.3.0)\n", + "Requirement already satisfied: jsonschema-specifications>=2023.03.6 in 
/Users/zhengisamazing/Library/Caches/pypoetry/virtualenvs/starfish-T7IInzTH-py3.11/lib/python3.11/site-packages (from jsonschema<5.0.0,>=4.22.0->litellm<2.0.0,>=1.65.1->starfish-core) (2024.10.1)\n", + "Requirement already satisfied: referencing>=0.28.4 in /Users/zhengisamazing/Library/Caches/pypoetry/virtualenvs/starfish-T7IInzTH-py3.11/lib/python3.11/site-packages (from jsonschema<5.0.0,>=4.22.0->litellm<2.0.0,>=1.65.1->starfish-core) (0.36.2)\n", + "Requirement already satisfied: rpds-py>=0.7.1 in /Users/zhengisamazing/Library/Caches/pypoetry/virtualenvs/starfish-T7IInzTH-py3.11/lib/python3.11/site-packages (from jsonschema<5.0.0,>=4.22.0->litellm<2.0.0,>=1.65.1->starfish-core) (0.24.0)\n", + "Requirement already satisfied: distro<2,>=1.7.0 in /Users/zhengisamazing/Library/Caches/pypoetry/virtualenvs/starfish-T7IInzTH-py3.11/lib/python3.11/site-packages (from openai>=1.68.2->litellm<2.0.0,>=1.65.1->starfish-core) (1.9.0)\n", + "Requirement already satisfied: jiter<1,>=0.4.0 in /Users/zhengisamazing/Library/Caches/pypoetry/virtualenvs/starfish-T7IInzTH-py3.11/lib/python3.11/site-packages (from openai>=1.68.2->litellm<2.0.0,>=1.65.1->starfish-core) (0.9.0)\n", + "Requirement already satisfied: sniffio in /Users/zhengisamazing/Library/Caches/pypoetry/virtualenvs/starfish-T7IInzTH-py3.11/lib/python3.11/site-packages (from openai>=1.68.2->litellm<2.0.0,>=1.65.1->starfish-core) (1.3.1)\n", + "Requirement already satisfied: tqdm>4 in /Users/zhengisamazing/Library/Caches/pypoetry/virtualenvs/starfish-T7IInzTH-py3.11/lib/python3.11/site-packages (from openai>=1.68.2->litellm<2.0.0,>=1.65.1->starfish-core) (4.67.1)\n", + "Requirement already satisfied: annotated-types>=0.6.0 in /Users/zhengisamazing/Library/Caches/pypoetry/virtualenvs/starfish-T7IInzTH-py3.11/lib/python3.11/site-packages (from pydantic<3.0.0,>=2.0.0->litellm<2.0.0,>=1.65.1->starfish-core) (0.7.0)\n", + "Requirement already satisfied: pydantic-core==2.33.0 in 
/Users/zhengisamazing/Library/Caches/pypoetry/virtualenvs/starfish-T7IInzTH-py3.11/lib/python3.11/site-packages (from pydantic<3.0.0,>=2.0.0->litellm<2.0.0,>=1.65.1->starfish-core) (2.33.0)\n", + "Requirement already satisfied: typing-inspection>=0.4.0 in /Users/zhengisamazing/Library/Caches/pypoetry/virtualenvs/starfish-T7IInzTH-py3.11/lib/python3.11/site-packages (from pydantic<3.0.0,>=2.0.0->litellm<2.0.0,>=1.65.1->starfish-core) (0.4.0)\n", + "Requirement already satisfied: regex>=2022.1.18 in /Users/zhengisamazing/Library/Caches/pypoetry/virtualenvs/starfish-T7IInzTH-py3.11/lib/python3.11/site-packages (from tiktoken>=0.7.0->litellm<2.0.0,>=1.65.1->starfish-core) (2024.11.6)\n", + "Requirement already satisfied: requests>=2.26.0 in /Users/zhengisamazing/Library/Caches/pypoetry/virtualenvs/starfish-T7IInzTH-py3.11/lib/python3.11/site-packages (from tiktoken>=0.7.0->litellm<2.0.0,>=1.65.1->starfish-core) (2.32.3)\n", + "Requirement already satisfied: aiohappyeyeballs>=2.3.0 in /Users/zhengisamazing/Library/Caches/pypoetry/virtualenvs/starfish-T7IInzTH-py3.11/lib/python3.11/site-packages (from aiohttp->litellm<2.0.0,>=1.65.1->starfish-core) (2.6.1)\n", + "Requirement already satisfied: aiosignal>=1.1.2 in /Users/zhengisamazing/Library/Caches/pypoetry/virtualenvs/starfish-T7IInzTH-py3.11/lib/python3.11/site-packages (from aiohttp->litellm<2.0.0,>=1.65.1->starfish-core) (1.3.2)\n", + "Requirement already satisfied: frozenlist>=1.1.1 in /Users/zhengisamazing/Library/Caches/pypoetry/virtualenvs/starfish-T7IInzTH-py3.11/lib/python3.11/site-packages (from aiohttp->litellm<2.0.0,>=1.65.1->starfish-core) (1.5.0)\n", + "Requirement already satisfied: multidict<7.0,>=4.5 in /Users/zhengisamazing/Library/Caches/pypoetry/virtualenvs/starfish-T7IInzTH-py3.11/lib/python3.11/site-packages (from aiohttp->litellm<2.0.0,>=1.65.1->starfish-core) (6.3.1)\n", + "Requirement already satisfied: propcache>=0.2.0 in 
/Users/zhengisamazing/Library/Caches/pypoetry/virtualenvs/starfish-T7IInzTH-py3.11/lib/python3.11/site-packages (from aiohttp->litellm<2.0.0,>=1.65.1->starfish-core) (0.3.1)\n", + "Requirement already satisfied: yarl<2.0,>=1.17.0 in /Users/zhengisamazing/Library/Caches/pypoetry/virtualenvs/starfish-T7IInzTH-py3.11/lib/python3.11/site-packages (from aiohttp->litellm<2.0.0,>=1.65.1->starfish-core) (1.18.3)\n", + "Requirement already satisfied: huggingface-hub<1.0,>=0.16.4 in /Users/zhengisamazing/Library/Caches/pypoetry/virtualenvs/starfish-T7IInzTH-py3.11/lib/python3.11/site-packages (from tokenizers->litellm<2.0.0,>=1.65.1->starfish-core) (0.30.1)\n", + "Requirement already satisfied: filelock in /Users/zhengisamazing/Library/Caches/pypoetry/virtualenvs/starfish-T7IInzTH-py3.11/lib/python3.11/site-packages (from huggingface-hub<1.0,>=0.16.4->tokenizers->litellm<2.0.0,>=1.65.1->starfish-core) (3.18.0)\n", + "Requirement already satisfied: fsspec>=2023.5.0 in /Users/zhengisamazing/Library/Caches/pypoetry/virtualenvs/starfish-T7IInzTH-py3.11/lib/python3.11/site-packages (from huggingface-hub<1.0,>=0.16.4->tokenizers->litellm<2.0.0,>=1.65.1->starfish-core) (2025.3.2)\n", + "Requirement already satisfied: packaging>=20.9 in /Users/zhengisamazing/Library/Caches/pypoetry/virtualenvs/starfish-T7IInzTH-py3.11/lib/python3.11/site-packages (from huggingface-hub<1.0,>=0.16.4->tokenizers->litellm<2.0.0,>=1.65.1->starfish-core) (24.2)\n", + "Requirement already satisfied: pyyaml>=5.1 in /Users/zhengisamazing/Library/Caches/pypoetry/virtualenvs/starfish-T7IInzTH-py3.11/lib/python3.11/site-packages (from huggingface-hub<1.0,>=0.16.4->tokenizers->litellm<2.0.0,>=1.65.1->starfish-core) (6.0.2)\n", + "Requirement already satisfied: charset-normalizer<4,>=2 in /Users/zhengisamazing/Library/Caches/pypoetry/virtualenvs/starfish-T7IInzTH-py3.11/lib/python3.11/site-packages (from requests>=2.26.0->tiktoken>=0.7.0->litellm<2.0.0,>=1.65.1->starfish-core) (3.4.1)\n", + "Requirement already 
satisfied: urllib3<3,>=1.21.1 in /Users/zhengisamazing/Library/Caches/pypoetry/virtualenvs/starfish-T7IInzTH-py3.11/lib/python3.11/site-packages (from requests>=2.26.0->tiktoken>=0.7.0->litellm<2.0.0,>=1.65.1->starfish-core) (2.3.0)\n", + "Note: you may need to restart the kernel to use updated packages.\n" + ] + } + ], + "source": [ + "%pip install starfish-core" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "## Fix for Jupyter Notebook only — do NOT use in production\n", + "## Enables async code execution in notebooks, but may cause sync/async issues\n", + "## For production, please run in standard .py files without this workaround\n", + "## See: https://github.com/erdewit/nest_asyncio for more details\n", + "import nest_asyncio\n", + "nest_asyncio.apply()\n", + "\n", + "from starfish import StructuredLLM, data_factory\n", + "from starfish.llm.utils import merge_structured_outputs\n", + "\n", + "from starfish.common.env_loader import load_env_file ## Load environment variables from .env file\n", + "load_env_file()" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "# setup your openai api key if not already set\n", + "# import os\n", + "# os.environ[\"OPENAI_API_KEY\"] = \"your_key_here\"\n", + "\n", + "# If you don't have any API key, please navigate to the local model section" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [], + "source": [ + "## Helper function mock llm call\n", + "# When developing data pipelines with LLMs, making thousands of real API calls\n", + "# can be expensive. Using mock LLM calls lets you test your pipeline's reliability,\n", + "# failure handling, and recovery without spending money on API calls.\n", + "from starfish.data_factory.utils.mock import mock_llm_call" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### 1.
Your First Data Factory: Simple Scaling\n", + "\n", + "The @data_factory decorator transforms any async function into a scalable data processing pipeline.\n", + "It handles:\n", + "- Parallel execution \n", + "- Automatic batching\n", + "- Error handling & retries\n", + "- Progress tracking\n", + "\n", + "Let's start with a single LLM call and then show how easy it is to scale it.\n" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "[{'fact': 'New Yorkers consume around 1,000,000 slices of pizza every day, which means if you laid them all in a line, they would stretch from the Statue of Liberty to the Eiffel Tower... and back!'}]" + ] + }, + "execution_count": 4, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# First, create a StructuredLLM instance for generating facts about cities\n", + "json_llm = StructuredLLM(\n", + " model_name = \"openai/gpt-4o-mini\",\n", + " prompt = \"Funny facts about city {{city_name}}.\",\n", + " output_schema = [{'name': 'fact', 'type': 'str'}],\n", + " model_kwargs = {\"temperature\": 0.7},\n", + ")\n", + "\n", + "json_llm_response = await json_llm.run(city_name='New York')\n", + "json_llm_response.data" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\u001b[32m2025-04-25 10:16:32\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[1m\u001b[1m[JOB START]\u001b[0m \u001b[36mMaster Job ID: 8c926411-63e7-4dc6-98c9-861c3489fb8b\u001b[0m | \u001b[33mLogging progress every 3 seconds\u001b[0m\u001b[0m\n", + "\u001b[32m2025-04-25 10:16:32\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[1m[JOB PROGRESS] \u001b[32mCompleted: 0/5\u001b[0m | \u001b[33mRunning: 5\u001b[0m | \u001b[36mAttempted: 0\u001b[0m (\u001b[32mCompleted: 0\u001b[0m, \u001b[31mFailed: 0\u001b[0m, \u001b[35mFiltered: 0\u001b[0m, \u001b[34mDuplicate: 
0\u001b[0m)\u001b[0m\n", + "Processing New York at 2025-04-25 10:16:32.524033\n", + "Processing London at 2025-04-25 10:16:32.524286\n", + "Processing Tokyo at 2025-04-25 10:16:32.524979\n", + "Processing Paris at 2025-04-25 10:16:32.525535\n", + "Processing Sydney at 2025-04-25 10:16:32.526729\n", + "\u001b[32m2025-04-25 10:16:34\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[1mSending telemetry event, TELEMETRY_ENABLED=true\u001b[0m\n", + "\u001b[32m2025-04-25 10:16:34\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[1m[JOB FINISHED] \u001b[1mFinal Status:\u001b[0m \u001b[32mCompleted: 5/5\u001b[0m | \u001b[33mAttempted: 5\u001b[0m (Failed: 0, Filtered: 0, Duplicate: 0)\u001b[0m\n" + ] + }, + { + "data": { + "text/plain": [ + "[{'fact': \"In Tokyo, there's a train station called 'Shinjuku' that handles more passengers each day than the entire population of the United States!\"},\n", + " {'fact': \"London has a 'secret' underground city known as the 'London Stone', which is said to have magical powers, making it one of the city's most famous and quirky legends!\"},\n", + " {'fact': 'In Paris, you can legally marry a dead person! This quirky law allows for posthumous marriages, as long as you can prove that the deceased had intended to marry you before their untimely demise.'},\n", + " {'fact': 'In New York City, there are more than 25,000 licensed taxis, but only about 1,200 of them are actually yellow. The rest are a rainbow of colors, including pink, blue, and even animal print!'},\n", + " {'fact': 'Sydney has a beach where you can surf, swim, and even watch a film – all in one day!
Just don’t forget your sunscreen and popcorn!'}]" + ] + }, + "execution_count": 5, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Now, scale to multiple cities using data_factory\n", + "# Just add the @data_factory decorator to process many cities in parallel\n", + "\n", + "from datetime import datetime\n", + "@data_factory(max_concurrency=10)\n", + "async def process_json_llm(city_name: str):\n", + " ## Adding a print statement to indicate the start of the processing\n", + " print(f\"Processing {city_name} at {datetime.now()}\")\n", + " json_llm_response = await json_llm.run(city_name=city_name)\n", + " return json_llm_response.data\n", + "\n", + "# This is all it takes to scale from one city to many cities!\n", + "process_json_llm.run(city_name=[\"New York\", \"London\", \"Tokyo\", \"Paris\", \"Sydney\"])" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### 2. Works with any async function\n", + "\n", + "Data Factory works with any async function, not just LLM calls; you can build complex pipelines involving multiple LLMs, data processing, etc.\n", + "\n", + "Here is an example of two chained structured LLMs" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\u001b[32m2025-04-25 10:16:34\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[1m\u001b[1m[JOB START]\u001b[0m \u001b[36mMaster Job ID: 466fca03-85a2-46de-b135-629cd76738f7\u001b[0m | \u001b[33mLogging progress every 3 seconds\u001b[0m\u001b[0m\n", + "\u001b[32m2025-04-25 10:16:34\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[1m[JOB PROGRESS] \u001b[32mCompleted: 0/3\u001b[0m | \u001b[33mRunning: 3\u001b[0m | \u001b[36mAttempted: 0\u001b[0m (\u001b[32mCompleted: 0\u001b[0m, \u001b[31mFailed: 0\u001b[0m, \u001b[35mFiltered: 0\u001b[0m, \u001b[34mDuplicate: 0\u001b[0m)\u001b[0m\n", + "\u001b[32m2025-04-25 10:16:37\u001b[0m | \u001b[1mINFO \u001b[0m |
\u001b[1m[JOB PROGRESS] \u001b[32mCompleted: 0/3\u001b[0m | \u001b[33mRunning: 3\u001b[0m | \u001b[36mAttempted: 0\u001b[0m (\u001b[32mCompleted: 0\u001b[0m, \u001b[31mFailed: 0\u001b[0m, \u001b[35mFiltered: 0\u001b[0m, \u001b[34mDuplicate: 0\u001b[0m)\u001b[0m\n", + "\u001b[32m2025-04-25 10:16:40\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[1m[JOB PROGRESS] \u001b[32mCompleted: 0/3\u001b[0m | \u001b[33mRunning: 3\u001b[0m | \u001b[36mAttempted: 0\u001b[0m (\u001b[32mCompleted: 0\u001b[0m, \u001b[31mFailed: 0\u001b[0m, \u001b[35mFiltered: 0\u001b[0m, \u001b[34mDuplicate: 0\u001b[0m)\u001b[0m\n", + "\u001b[32m2025-04-25 10:16:43\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[1m[JOB PROGRESS] \u001b[32mCompleted: 2/3\u001b[0m | \u001b[33mRunning: 1\u001b[0m | \u001b[36mAttempted: 2\u001b[0m (\u001b[32mCompleted: 2\u001b[0m, \u001b[31mFailed: 0\u001b[0m, \u001b[35mFiltered: 0\u001b[0m, \u001b[34mDuplicate: 0\u001b[0m)\u001b[0m\n", + "\u001b[32m2025-04-25 10:16:44\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[1mSending telemetry event, TELEMETRY_ENABLED=true\u001b[0m\n", + "\u001b[32m2025-04-25 10:16:44\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[1m[JOB FINISHED] \u001b[1mFinal Status:\u001b[0m \u001b[32mCompleted: 3/3\u001b[0m | \u001b[33mAttempted: 3\u001b[0m (Failed: 0, Filtered: 0, Duplicate: 0)\u001b[0m\n" + ] + } + ], + "source": [ + "# Example of a more complex function that chains multiple LLM calls\n", + "# This was grabbed from structured llm examples \n", + "\n", + "@data_factory(max_concurrency=10)\n", + "async def complex_process_cities(topic: str):\n", + " ## topic → generator_llm → rating_llm → merged results\n", + " # First LLM to generate question/answer pairs\n", + " generator_llm = StructuredLLM(\n", + " model_name=\"openai/gpt-4o-mini\",\n", + " prompt=\"Generate question/answer pairs about {{topic}}.\",\n", + " output_schema=[\n", + " {\"name\": \"question\", \"type\": \"str\"},\n", + " {\"name\": \"answer\", \"type\": \"str\"}\n", + " ],\n", + "
)\n", + "\n", + " # Second LLM to rate the generated pairs\n", + " rater_llm = StructuredLLM(\n", + " model_name=\"openai/gpt-4o-mini\",\n", + " prompt='''Rate the following Q&A pairs based on accuracy and clarity (1-10).\n", + " Pairs: {{generated_pairs}}''',\n", + " output_schema=[\n", + " {\"name\": \"accuracy_rating\", \"type\": \"int\"},\n", + " {\"name\": \"clarity_rating\", \"type\": \"int\"}\n", + " ],\n", + " model_kwargs={\"temperature\": 0.5}\n", + ")\n", + "\n", + " generation_response = await generator_llm.run(topic=topic, num_records=5)\n", + " rating_response = await rater_llm.run(generated_pairs=generation_response.data)\n", + " \n", + " # Merge the results\n", + " return merge_structured_outputs(generation_response.data, rating_response.data)\n", + "\n", + "\n", + "### To save on token here we only use 3 topics as example\n", + "complex_process_cities_data = complex_process_cities.run(topic=['Science', 'History', 'Technology'])" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "15\n", + "[{'question': 'What is the primary function of a CPU in a computer?', 'answer': 'The CPU, or Central Processing Unit, is responsible for executing instructions and processing data in a computer system.', 'accuracy_rating': 10, 'clarity_rating': 10}, {'question': 'What does IoT stand for and what is its significance?', 'answer': 'IoT stands for Internet of Things, which refers to the interconnection of everyday devices to the internet, allowing them to send and receive data, thereby enhancing efficiency and convenience.', 'accuracy_rating': 10, 'clarity_rating': 10}, {'question': 'What is the difference between RAM and ROM?', 'answer': 'RAM (Random Access Memory) is volatile memory that temporarily stores data and applications currently in use, while ROM (Read-Only Memory) is non-volatile memory that permanently stores firmware and system software.', 
'accuracy_rating': 10, 'clarity_rating': 10}, {'question': 'What is cloud computing?', 'answer': 'Cloud computing is the delivery of computing services over the internet, enabling users to access and store data and applications on remote servers rather than on local computers.', 'accuracy_rating': 10, 'clarity_rating': 10}, {'question': 'What are the benefits of using artificial intelligence in business?', 'answer': 'Artificial intelligence can enhance efficiency, improve decision-making, personalize customer experiences, automate repetitive tasks, and generate insights from data analytics in business operations.', 'accuracy_rating': 10, 'clarity_rating': 10}, {'question': 'What is the chemical formula for water?', 'answer': 'The chemical formula for water is H2O.', 'accuracy_rating': 10, 'clarity_rating': 10}, {'question': 'What is the process by which plants make their own food?', 'answer': 'The process by which plants make their own food is called photosynthesis.', 'accuracy_rating': 10, 'clarity_rating': 10}, {'question': 'What is the speed of light in a vacuum?', 'answer': 'The speed of light in a vacuum is approximately 299,792 kilometers per second (or about 186,282 miles per second).', 'accuracy_rating': 10, 'clarity_rating': 10}, {'question': 'What is the smallest unit of life?', 'answer': 'The smallest unit of life is the cell.', 'accuracy_rating': 10, 'clarity_rating': 10}, {'question': 'What gas do living organisms need for respiration?', 'answer': 'Living organisms need oxygen for respiration.', 'accuracy_rating': 10, 'clarity_rating': 10}, {'question': 'What was the primary cause of World War I?', 'answer': 'The primary cause of World War I was the complex system of alliances, militarism, imperialism, and nationalism, which escalated tensions following the assassination of Archduke Franz Ferdinand of Austria in 1914.', 'accuracy_rating': 10, 'clarity_rating': 10}, {'question': 'Who was the first President of the United States?', 'answer': 'George 
Washington was the first President of the United States, serving from April 30, 1789, to March 4, 1797.', 'accuracy_rating': 10, 'clarity_rating': 10}, {'question': 'What year did the Berlin Wall fall?', 'answer': 'The Berlin Wall fell on November 9, 1989, symbolizing the end of the Cold War and the division between East and West Germany.', 'accuracy_rating': 10, 'clarity_rating': 10}, {'question': 'Which ancient civilization is known for creating the first known writing system?', 'answer': 'The Sumerians, who inhabited ancient Mesopotamia around 3500 BCE, are known for creating the first known writing system called cuneiform.', 'accuracy_rating': 10, 'clarity_rating': 10}, {'question': 'What was the significance of the Magna Carta?', 'answer': 'The Magna Carta, signed in 1215, was significant because it limited the power of the monarchy and established the principle that everyone, including the king, was subject to the law.', 'accuracy_rating': 10, 'clarity_rating': 10}]\n" + ] + } + ], + "source": [ + "### Each topic has 5 question/answer pairs so 3 topics has 15 pairs!\n", + "print(len(complex_process_cities_data))\n", + "print(complex_process_cities_data)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### 3. Working with Different Input Formats\n", + "\n", + "\n", + "Data Factory is flexible with how you provide inputs. 
Let's demonstrate different ways to pass parameters to data_factory functions.\n", + "\n", + "'data' is a reserved keyword expecting list(dict) or tuple(dict) - this design makes it easy to pass large datasets and to work with HuggingFace datasets and Pandas DataFrames" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "[{'answer': 'New York_5'}, {'answer': 'New York_2'}, {'answer': 'New York_3'}]" + ] + }, + "execution_count": 8, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "## We will be using mock llm call for rest of example to save on token\n", + "## Mock LLM call is a function that simulates an LLM API call with random delays (controlled by sleep_time) and occasional failures (controlled by fail_rate)\n", + "await mock_llm_call(city_name=\"New York\", num_records_per_city=3)" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [], + "source": [ + "@data_factory(max_concurrency=100)\n", + "async def input_format_mock_llm(city_name: str, num_records_per_city: int):\n", + " return await mock_llm_call(city_name=city_name, num_records_per_city=num_records_per_city, fail_rate=0.01)" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\u001b[32m2025-04-25 10:16:49\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[1m\u001b[1m[JOB START]\u001b[0m \u001b[36mMaster Job ID: 05c84608-fec3-4010-8876-e59eed12bb6a\u001b[0m | \u001b[33mLogging progress every 3 seconds\u001b[0m\u001b[0m\n", + "\u001b[32m2025-04-25 10:16:49\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[1m[JOB PROGRESS] \u001b[32mCompleted: 0/5\u001b[0m | \u001b[33mRunning: 5\u001b[0m | \u001b[36mAttempted: 0\u001b[0m (\u001b[32mCompleted: 0\u001b[0m, \u001b[31mFailed: 0\u001b[0m, \u001b[35mFiltered: 0\u001b[0m, \u001b[34mDuplicate: 0\u001b[0m)\u001b[0m\n", +
"\u001b[32m2025-04-25 10:16:50\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[1mSending telemetry event, TELEMETRY_ENABLED=true\u001b[0m\n", + "\u001b[32m2025-04-25 10:16:50\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[1m[JOB FINISHED] \u001b[1mFinal Status:\u001b[0m \u001b[32mCompleted: 5/5\u001b[0m | \u001b[33mAttempted: 5\u001b[0m (Failed: 0, Filtered: 0, Duplicate: 0)\u001b[0m\n" + ] + } + ], + "source": [ + "# Format 1: Multiple lists that get zipped together\n", + "input_format_data1 = input_format_mock_llm.run(city_name=[\"New York\", \"London\", \"Tokyo\", \"Paris\", \"Sydney\"], num_records_per_city=[2, 1, 1, 1, 1])" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\u001b[32m2025-04-25 10:16:50\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[1m\u001b[1m[JOB START]\u001b[0m \u001b[36mMaster Job ID: fedb98e5-c408-4bc8-9479-6087f4a298b7\u001b[0m | \u001b[33mLogging progress every 3 seconds\u001b[0m\u001b[0m\n", + "\u001b[32m2025-04-25 10:16:50\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[1m[JOB PROGRESS] \u001b[32mCompleted: 0/5\u001b[0m | \u001b[33mRunning: 5\u001b[0m | \u001b[36mAttempted: 0\u001b[0m (\u001b[32mCompleted: 0\u001b[0m, \u001b[31mFailed: 0\u001b[0m, \u001b[35mFiltered: 0\u001b[0m, \u001b[34mDuplicate: 0\u001b[0m)\u001b[0m\n", + "\u001b[32m2025-04-25 10:16:51\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[1mSending telemetry event, TELEMETRY_ENABLED=true\u001b[0m\n", + "\u001b[32m2025-04-25 10:16:51\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[1m[JOB FINISHED] \u001b[1mFinal Status:\u001b[0m \u001b[32mCompleted: 5/5\u001b[0m | \u001b[33mAttempted: 5\u001b[0m (Failed: 0, Filtered: 0, Duplicate: 0)\u001b[0m\n" + ] + } + ], + "source": [ + "# Format 2: List + single value (single value gets broadcasted)\n", + "input_format_data2 = input_format_mock_llm.run(city_name=[\"New York\", \"London\", \"Tokyo\", \"Paris\", \"Sydney\"], num_records_per_city=1)" 
+ ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\u001b[32m2025-04-25 10:16:51\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[1m\u001b[1m[JOB START]\u001b[0m \u001b[36mMaster Job ID: 2f5cb7cc-83c9-4b7e-9ebb-386cd66bdd42\u001b[0m | \u001b[33mLogging progress every 3 seconds\u001b[0m\u001b[0m\n", + "\u001b[32m2025-04-25 10:16:51\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[1m[JOB PROGRESS] \u001b[32mCompleted: 0/5\u001b[0m | \u001b[33mRunning: 5\u001b[0m | \u001b[36mAttempted: 0\u001b[0m (\u001b[32mCompleted: 0\u001b[0m, \u001b[31mFailed: 0\u001b[0m, \u001b[35mFiltered: 0\u001b[0m, \u001b[34mDuplicate: 0\u001b[0m)\u001b[0m\n", + "\u001b[32m2025-04-25 10:16:52\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[1mSending telemetry event, TELEMETRY_ENABLED=true\u001b[0m\n", + "\u001b[32m2025-04-25 10:16:52\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[1m[JOB FINISHED] \u001b[1mFinal Status:\u001b[0m \u001b[32mCompleted: 5/5\u001b[0m | \u001b[33mAttempted: 5\u001b[0m (Failed: 0, Filtered: 0, Duplicate: 0)\u001b[0m\n" + ] + } + ], + "source": [ + "# Format 3: Special 'data' parameter\n", + "# 'data' is a reserved keyword expecting list(dict) or tuple(dict)\n", + "# Makes integration with various data sources easier\n", + "input_format_data3 = input_format_mock_llm.run(data=[{\"city_name\": \"New York\", \"num_records_per_city\": 2}, {\"city_name\": \"London\", \"num_records_per_city\": 1}, {\"city_name\": \"Tokyo\", \"num_records_per_city\": 1}, {\"city_name\": \"Paris\", \"num_records_per_city\": 1}, {\"city_name\": \"Sydney\", \"num_records_per_city\": 1}])" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### 4. Resilient error retry\n", + "Data Factory automatically handles errors and retries, making your pipelines robust.\n", + "\n", + "Let's demonstrate with a high failure rate example." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 13, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\u001b[32m2025-04-25 10:16:56\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[1m\u001b[1m[JOB START]\u001b[0m \u001b[36mMaster Job ID: 38c50ab6-f24b-4cba-a2c5-070130ab420e\u001b[0m | \u001b[33mLogging progress every 3 seconds\u001b[0m\u001b[0m\n", + "\u001b[32m2025-04-25 10:16:56\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[1m[JOB PROGRESS] \u001b[32mCompleted: 0/25\u001b[0m | \u001b[33mRunning: 25\u001b[0m | \u001b[36mAttempted: 0\u001b[0m (\u001b[32mCompleted: 0\u001b[0m, \u001b[31mFailed: 0\u001b[0m, \u001b[35mFiltered: 0\u001b[0m, \u001b[34mDuplicate: 0\u001b[0m)\u001b[0m\n", + "\u001b[32m2025-04-25 10:16:59\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[1m[JOB PROGRESS] \u001b[32mCompleted: 16/25\u001b[0m | \u001b[33mRunning: 9\u001b[0m | \u001b[36mAttempted: 16\u001b[0m (\u001b[32mCompleted: 16\u001b[0m, \u001b[31mFailed: 0\u001b[0m, \u001b[35mFiltered: 0\u001b[0m, \u001b[34mDuplicate: 0\u001b[0m)\u001b[0m\n", + "\u001b[32m2025-04-25 10:16:59\u001b[0m | \u001b[31m\u001b[1mERROR \u001b[0m | \u001b[31m\u001b[1mError running task: Mock LLM failed to process city: Tokyo\u001b[0m\n", + "\u001b[32m2025-04-25 10:16:59\u001b[0m | \u001b[31m\u001b[1mERROR \u001b[0m | \u001b[31m\u001b[1mError running task: Mock LLM failed to process city: London\u001b[0m\n", + "\u001b[32m2025-04-25 10:16:59\u001b[0m | \u001b[31m\u001b[1mERROR \u001b[0m | \u001b[31m\u001b[1mError running task: Mock LLM failed to process city: New York\u001b[0m\n", + "\u001b[32m2025-04-25 10:17:02\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[1m[JOB PROGRESS] \u001b[32mCompleted: 23/25\u001b[0m | \u001b[33mRunning: 2\u001b[0m | \u001b[36mAttempted: 26\u001b[0m (\u001b[32mCompleted: 23\u001b[0m, \u001b[31mFailed: 3\u001b[0m, \u001b[35mFiltered: 0\u001b[0m, \u001b[34mDuplicate: 0\u001b[0m)\u001b[0m\n", + "\u001b[32m2025-04-25 10:17:03\u001b[0m | 
\u001b[31m\u001b[1mERROR \u001b[0m | \u001b[31m\u001b[1mError running task: Mock LLM failed to process city: New York\u001b[0m\n", + "\u001b[32m2025-04-25 10:17:05\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[1mSending telemetry event, TELEMETRY_ENABLED=true\u001b[0m\n", + "\u001b[32m2025-04-25 10:17:05\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[1m[JOB FINISHED] \u001b[1mFinal Status:\u001b[0m \u001b[32mCompleted: 25/25\u001b[0m | \u001b[33mAttempted: 29\u001b[0m (Failed: 4, Filtered: 0, Duplicate: 0)\u001b[0m\n", + "\n", + "Successfully completed 25 out of 25 tasks\n", + "Data Factory automatically handled the failures and continued processing\n", + "The results only include successful tasks\n" + ] + } + ], + "source": [ + "@data_factory(max_concurrency=100)\n", + "async def high_error_rate_mock_llm(city_name: str, num_records_per_city: int):\n", + " return await mock_llm_call(city_name=city_name, num_records_per_city=num_records_per_city, fail_rate=0.3) # Hardcode to 30% chance of failure\n", + "\n", + "# Process all cities - some will fail, but data_factory keeps going\n", + "cities = [\"New York\", \"London\", \"Tokyo\", \"Paris\", \"Sydney\"] * 5 # 25 cities\n", + "high_error_rate_mock_lllm_data = high_error_rate_mock_llm.run(city_name=cities, num_records_per_city=1)\n", + "\n", + "print(f\"\\nSuccessfully completed {len(high_error_rate_mock_lllm_data)} out of {len(cities)} tasks\")\n", + "print(\"Data Factory automatically handled the failures and continued processing\")\n", + "print(\"The results only include successful tasks\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### 5. Resume\n", + "\n", + "This is essential for long-running jobs with thousands of tasks.\n", + "\n", + "If a job is interrupted, you can pick up where you left off using one of two resume methods:\n", + "\n", + "\n", + "1. 
**Same Session Resume**: If you're still in the same session where the job was interrupted, simply call `.resume()` on the same DataFactory instance\n", + "\n", + "2. **Cross-Session Resume**: If you've closed your notebook or lost your session, you can resume using the job ID:\n", + " ```python\n", + " from starfish import DataFactory\n", + " # Resume using the master job ID from a previous run\n", + " data_factory = DataFactory.resume_from_checkpoint(job_id=\"your_job_id\")\n", + " ```\n", + "\n", + "The key difference:\n", + "- `resume()` uses the same DataFactory instance you defined\n", + "- `resume_from_checkpoint()` reconstructs your DataFactory from persistent storage where tasks and progress are saved\n", + "\n", + "> **Note**: Google Colab users may experience issues with `resume_from_checkpoint()` due to how Colab works\n", + "\n", + "We're simulating an interruption here. In a real scenario, this might happen if your notebook errors out, is manually interrupted with a keyboard command, encounters API rate limits, or experiences any other issues that halt execution."
+ ] + }, + { + "cell_type": "code", + "execution_count": 14, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\u001b[32m2025-04-25 10:17:12\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[1m\u001b[1m[JOB START]\u001b[0m \u001b[36mMaster Job ID: b2a400b3-32e7-45ee-b8e8-c2bc7afe9f11\u001b[0m | \u001b[33mLogging progress every 3 seconds\u001b[0m\u001b[0m\n", + "\u001b[32m2025-04-25 10:17:12\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[1m[JOB PROGRESS] \u001b[32mCompleted: 0/100\u001b[0m | \u001b[33mRunning: 10\u001b[0m | \u001b[36mAttempted: 0\u001b[0m (\u001b[32mCompleted: 0\u001b[0m, \u001b[31mFailed: 0\u001b[0m, \u001b[35mFiltered: 0\u001b[0m, \u001b[34mDuplicate: 0\u001b[0m)\u001b[0m\n", + "\u001b[32m2025-04-25 10:17:15\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[1m[JOB PROGRESS] \u001b[32mCompleted: 17/100\u001b[0m | \u001b[33mRunning: 10\u001b[0m | \u001b[36mAttempted: 17\u001b[0m (\u001b[32mCompleted: 17\u001b[0m, \u001b[31mFailed: 0\u001b[0m, \u001b[35mFiltered: 0\u001b[0m, \u001b[34mDuplicate: 0\u001b[0m)\u001b[0m\n", + "\u001b[32m2025-04-25 10:17:15\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[1mSending telemetry event, TELEMETRY_ENABLED=true\u001b[0m\n", + "\u001b[32m2025-04-25 10:17:15\u001b[0m | \u001b[31m\u001b[1mERROR \u001b[0m | \u001b[31m\u001b[1mError occurred: KeyboardInterrupt\u001b[0m\n", + "\u001b[32m2025-04-25 10:17:15\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[1m[RESUME INFO] 🚨 Job stopped unexpectedly. 
You can resume the job by calling .resume()\u001b[0m\n", + "\u001b[32m2025-04-25 10:17:15\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[1m[JOB FINISHED] \u001b[1mFinal Status:\u001b[0m \u001b[32mCompleted: 20/100\u001b[0m | \u001b[33mAttempted: 20\u001b[0m (Failed: 0, Filtered: 0, Duplicate: 0)\u001b[0m\n" + ] + } + ], + "source": [ + "@data_factory(max_concurrency=10)\n", + "async def re_run_mock_llm(city_name: str, num_records_per_city: int):\n", + " return await mock_llm_call(city_name=city_name, num_records_per_city=num_records_per_city, fail_rate=0.3)\n", + "\n", + "cities = [\"New York\", \"London\", \"Tokyo\", \"Paris\", \"Sydney\"] * 20 # 100 cities\n", + "re_run_mock_llm_data_1 = re_run_mock_llm.run(city_name=cities, num_records_per_city=1)" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "When a job is interrupted, you'll see a message like:\n", + "[RESUME INFO] 🚨 Job stopped unexpectedly. You can resume the job by calling .resume()\n", + "\n", + "To resume an interrupted job, simply call:\n", + "interrupted_job_mock_llm.resume()\n", + "\n", + "For this example we have 20/100 data generated and not finished yet!\n" + ] + } + ], + "source": [ + "print(\"When a job is interrupted, you'll see a message like:\")\n", + "print(\"[RESUME INFO] 🚨 Job stopped unexpectedly. 
You can resume the job by calling .resume()\")\n", + "\n", + "print(\"\\nTo resume an interrupted job, simply call:\")\n", + "print(\"interrupted_job_mock_llm.resume()\")\n", + "print('')\n", + "print(f\"For this example we have {len(re_run_mock_llm_data_1)}/{len(cities)} data generated and not finished yet!\")" + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\u001b[32m2025-04-25 10:18:00\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[1m\u001b[1m[JOB RESUME START]\u001b[0m \u001b[33mPICKING UP FROM WHERE THE JOB WAS LEFT OFF...\u001b[0m\n", + "\u001b[0m\n", + "\u001b[32m2025-04-25 10:18:00\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[1m\u001b[1m[RESUME PROGRESS] STATUS AT THE TIME OF RESUME:\u001b[0m \u001b[32mCompleted: 20 / 100\u001b[0m | \u001b[31mFailed: 0\u001b[0m | \u001b[31mDuplicate: 0\u001b[0m | \u001b[33mFiltered: 0\u001b[0m\u001b[0m\n", + "\u001b[32m2025-04-25 10:18:00\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[1m[JOB PROGRESS] \u001b[32mCompleted: 20/100\u001b[0m | \u001b[33mRunning: 10\u001b[0m | \u001b[36mAttempted: 20\u001b[0m (\u001b[32mCompleted: 20\u001b[0m, \u001b[31mFailed: 0\u001b[0m, \u001b[35mFiltered: 0\u001b[0m, \u001b[34mDuplicate: 0\u001b[0m)\u001b[0m\n", + "\u001b[32m2025-04-25 10:18:03\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[1m[JOB PROGRESS] \u001b[32mCompleted: 32/100\u001b[0m | \u001b[33mRunning: 10\u001b[0m | \u001b[36mAttempted: 32\u001b[0m (\u001b[32mCompleted: 32\u001b[0m, \u001b[31mFailed: 0\u001b[0m, \u001b[35mFiltered: 0\u001b[0m, \u001b[34mDuplicate: 0\u001b[0m)\u001b[0m\n", + "\u001b[32m2025-04-25 10:18:03\u001b[0m | \u001b[31m\u001b[1mERROR \u001b[0m | \u001b[31m\u001b[1mError running task: Mock LLM failed to process city: Paris\u001b[0m\n", + "\u001b[32m2025-04-25 10:18:03\u001b[0m | \u001b[31m\u001b[1mERROR \u001b[0m | \u001b[31m\u001b[1mError running task: Mock LLM failed to process city: Sydney\u001b[0m\n", + 
"\u001b[32m2025-04-25 10:18:03\u001b[0m | \u001b[31m\u001b[1mERROR \u001b[0m | \u001b[31m\u001b[1mError running task: Mock LLM failed to process city: Sydney\u001b[0m\n", + "\u001b[32m2025-04-25 10:18:06\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[1m[JOB PROGRESS] \u001b[32mCompleted: 56/100\u001b[0m | \u001b[33mRunning: 10\u001b[0m | \u001b[36mAttempted: 59\u001b[0m (\u001b[32mCompleted: 56\u001b[0m, \u001b[31mFailed: 3\u001b[0m, \u001b[35mFiltered: 0\u001b[0m, \u001b[34mDuplicate: 0\u001b[0m)\u001b[0m\n", + "\u001b[32m2025-04-25 10:18:08\u001b[0m | \u001b[31m\u001b[1mERROR \u001b[0m | \u001b[31m\u001b[1mError running task: Mock LLM failed to process city: Sydney\u001b[0m\n", + "\u001b[32m2025-04-25 10:18:08\u001b[0m | \u001b[31m\u001b[1mERROR \u001b[0m | \u001b[31m\u001b[1mError running task: Mock LLM failed to process city: London\u001b[0m\n", + "\u001b[32m2025-04-25 10:18:09\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[1m[JOB PROGRESS] \u001b[32mCompleted: 69/100\u001b[0m | \u001b[33mRunning: 10\u001b[0m | \u001b[36mAttempted: 74\u001b[0m (\u001b[32mCompleted: 69\u001b[0m, \u001b[31mFailed: 5\u001b[0m, \u001b[35mFiltered: 0\u001b[0m, \u001b[34mDuplicate: 0\u001b[0m)\u001b[0m\n", + "\u001b[32m2025-04-25 10:18:09\u001b[0m | \u001b[31m\u001b[1mERROR \u001b[0m | \u001b[31m\u001b[1mError running task: Mock LLM failed to process city: New York\u001b[0m\n", + "\u001b[32m2025-04-25 10:18:12\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[1m[JOB PROGRESS] \u001b[32mCompleted: 89/100\u001b[0m | \u001b[33mRunning: 10\u001b[0m | \u001b[36mAttempted: 95\u001b[0m (\u001b[32mCompleted: 89\u001b[0m, \u001b[31mFailed: 6\u001b[0m, \u001b[35mFiltered: 0\u001b[0m, \u001b[34mDuplicate: 0\u001b[0m)\u001b[0m\n", + "\u001b[32m2025-04-25 10:18:12\u001b[0m | \u001b[31m\u001b[1mERROR \u001b[0m | \u001b[31m\u001b[1mError running task: Mock LLM failed to process city: Paris\u001b[0m\n", + "\u001b[32m2025-04-25 10:18:13\u001b[0m | \u001b[31m\u001b[1mERROR \u001b[0m | \u001b[31m\u001b[1mError 
running task: Mock LLM failed to process city: New York\u001b[0m\n", + "\u001b[32m2025-04-25 10:18:14\u001b[0m | \u001b[31m\u001b[1mERROR \u001b[0m | \u001b[31m\u001b[1mError running task: Mock LLM failed to process city: New York\u001b[0m\n", + "\u001b[32m2025-04-25 10:18:14\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[1mSending telemetry event, TELEMETRY_ENABLED=true\u001b[0m\n", + "\u001b[32m2025-04-25 10:18:14\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[1m[JOB FINISHED] \u001b[1mFinal Status:\u001b[0m \u001b[32mCompleted: 100/100\u001b[0m | \u001b[33mAttempted: 109\u001b[0m (Failed: 9, Filtered: 0, Duplicate: 0)\u001b[0m\n" + ] + } + ], + "source": [ + "## Let's continue the rest of the run by calling resume() on the same instance\n", + "re_run_mock_llm_data_2 = re_run_mock_llm.resume()" + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Now we still able to finished with what is left!! 100 data generated!\n" + ] + } + ], + "source": [ + "print(f\"Now we still able to finished with what is left!! {len(re_run_mock_llm_data_2)} data generated!\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### 6. Dry run\n", + "Before running a large job, you can do a \"dry run\" to test your pipeline. This only processes a single item and doesn't save state to the database."
+ ] + }, + { + "cell_type": "code", + "execution_count": 19, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\u001b[32m2025-04-25 10:18:14\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[1m\u001b[1m[JOB START]\u001b[0m \u001b[36mMaster Job ID: None\u001b[0m | \u001b[33mLogging progress every 3 seconds\u001b[0m\u001b[0m\n", + "\u001b[32m2025-04-25 10:18:14\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[1m[JOB PROGRESS] \u001b[32mCompleted: 0/1\u001b[0m | \u001b[33mRunning: 1\u001b[0m | \u001b[36mAttempted: 0\u001b[0m (\u001b[32mCompleted: 0\u001b[0m, \u001b[31mFailed: 0\u001b[0m, \u001b[35mFiltered: 0\u001b[0m, \u001b[34mDuplicate: 0\u001b[0m)\u001b[0m\n", + "\u001b[32m2025-04-25 10:18:15\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[1mSending telemetry event, TELEMETRY_ENABLED=true\u001b[0m\n", + "\u001b[32m2025-04-25 10:18:15\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[1m[JOB FINISHED] \u001b[1mFinal Status:\u001b[0m \u001b[32mCompleted: 1/0\u001b[0m | \u001b[33mAttempted: 1\u001b[0m (Failed: 0, Filtered: 0, Duplicate: 0)\u001b[0m\n" + ] + } + ], + "source": [ + "@data_factory(max_concurrency=10)\n", + "async def dry_run_mock_llm(city_name: str, num_records_per_city: int):\n", + " return await mock_llm_call(city_name=city_name, num_records_per_city=num_records_per_city, fail_rate=0.3)\n", + "\n", + "dry_run_mock_llm_data = dry_run_mock_llm.dry_run(city_name=[\"New York\", \"London\", \"Tokyo\", \"Paris\", \"Sydney\"]*20, num_records_per_city=1)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### 8. Advanced Usage\n", + "Data Factory offers more advanced capabilities for complete pipeline customization, including hooks that execute at key stages and shareable state to coordinate between tasks. These powerful features enable complex workflows and fine-grained control. Our dedicated examples for advanced data_factory usage will be coming soon!" 
+ ] + } + ], + "metadata": { + "kernelspec": { + "display_name": ".venv", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.4" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} From d932eaff31436cbd4b3243c0e263d4d50a5be45e Mon Sep 17 00:00:00 2001 From: "johnwayne.jiang" Date: Fri, 23 May 2025 11:41:44 -0700 Subject: [PATCH 2/8] update workflow --- .github/workflows/lint-and-test.yaml | 1 + 1 file changed, 1 insertion(+) diff --git a/.github/workflows/lint-and-test.yaml b/.github/workflows/lint-and-test.yaml index 8aadbc0..8365bda 100644 --- a/.github/workflows/lint-and-test.yaml +++ b/.github/workflows/lint-and-test.yaml @@ -9,6 +9,7 @@ on: branches: - main - dev + - '!f/pypi_release' jobs: test-integration: From f8fb713a0007c496ad2b0c3ba455879b5e482d83 Mon Sep 17 00:00:00 2001 From: "johnwayne.jiang" Date: Fri, 23 May 2025 12:34:19 -0700 Subject: [PATCH 3/8] add cli --- .gitignore | 2 + prebuilt_template/README.md | 28 ++++++ pyproject.toml | 1 + src/starfish/data_gen_template/cli.py | 117 ++++++++++++++++++++++ tests/data_template/test_data_template.py | 2 +- 5 files changed, 149 insertions(+), 1 deletion(-) create mode 100644 src/starfish/data_gen_template/cli.py diff --git a/.gitignore b/.gitignore index a28f571..c0477d4 100644 --- a/.gitignore +++ b/.gitignore @@ -1,4 +1,6 @@ # Adhoc stuff +input.json +output.json .serena/ docs/ /vibe_coding/response.md diff --git a/prebuilt_template/README.md b/prebuilt_template/README.md index 0c46a1e..97b8694 100644 --- a/prebuilt_template/README.md +++ b/prebuilt_template/README.md @@ -14,6 +14,32 @@ Data generation templates are **prebuilt** that encapsulate sophisticated data g 4. **Generate Data**: Run the template to produce high-quality synthetic data 5. 
**Export & Use**: Data comes ready for training, testing, or evaluation +## Use the data-template CLI like this: +``` +# List all templates +data-template list-templates + +# List with details +data-template list-templates --detail + +# Get template details +data-template get-template my_template + +# Print schema +data-template print-schema my_template + +# Print example +data-template print-example my_template + +# Run template with interactive input +data-template run-template my_template + +# Run template with input file +data-template run-template my_template --input-file input.json + +# Run template and save output +data-template run-template my_template --input-file input.json --output-file output.json +``` ## Source Code Location The actual implementation of these templates can be found in: @@ -21,6 +47,8 @@ The actual implementation of these templates can be found in: src/starfish/data_gen_template/templates/ ``` + + ## Community & Contributions 🀝 Like what you see? We'd love your help in expanding our template collection! 
Here's how you can get involved: diff --git a/pyproject.toml b/pyproject.toml index 908b389..da0c709 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -74,6 +74,7 @@ nbval = "^0.11.0" [tool.poetry.scripts] starfish = "starfish.api.cli:main" +data-template = "src.starfish.data_gen_template.cli:main" [tool.ruff] diff --git a/src/starfish/data_gen_template/cli.py b/src/starfish/data_gen_template/cli.py new file mode 100644 index 0000000..25a8015 --- /dev/null +++ b/src/starfish/data_gen_template/cli.py @@ -0,0 +1,117 @@ +import typer +from pathlib import Path +from typing import Optional +from starfish.data_gen_template.core import data_gen_template + +app = typer.Typer(help="Data Template CLI") + + +@app.command() +def list_templates(detail: bool = False): + """List all available templates""" + templates = data_gen_template.list(is_detail=detail) + if detail: + for template in templates: + typer.echo(f"Template: {template['name']}") + typer.echo(f" Description: {template['description']}") + typer.echo(f" Author: {template['author']}") + typer.echo(f" Version: {template['starfish_version']}") + typer.echo(f" Dependencies: {', '.join(template.get('dependencies', []))}") + typer.echo() + else: + for template in templates: + typer.echo(template) + + +@app.command() +def get_template(name: str): + """Get details about a specific template""" + try: + data_gen_template.list() + template = data_gen_template.get(name) + typer.echo(f"Template: {template.name}") + typer.echo(f"Description: {template.description}") + typer.echo(f"Author: {template.author}") + typer.echo(f"Version: {template.starfish_version}") + typer.echo(f"Dependencies: {', '.join(template.dependencies)}") + except Exception as e: + typer.echo(f"Error: {str(e)}", err=True) + + +# @app.command() +# def export_template(name: str, output_path: str): +# """Export a template to a specific path""" +# try: +# template = data_gen_template.get(name) +# exported_path = template.export(output_path) +# 
typer.echo(f"Template exported to: {exported_path}") +# except Exception as e: +# typer.echo(f"Error: {str(e)}", err=True) + + +@app.command() +def run_template( + name: str, + input_file: Optional[Path] = typer.Option(None, help="Path to JSON file with input data"), + output_file: Optional[Path] = typer.Option(None, help="Path to save output to"), +): + """Run a template with the provided input data""" + try: + data_gen_template.list() + template = data_gen_template.get(name) + + # Load input data + if input_file: + import json + + with open(input_file) as f: + input_data = json.load(f) + else: + typer.echo("Please enter the input data (JSON format):") + input_data = json.loads(typer.prompt("Input data")) + + # Run the template + import asyncio + + result = asyncio.run(template.run(input_data=input_data)) + + # Handle output + if output_file: + with open(output_file, "w") as f: + json.dump(result, f, indent=2) + typer.echo(f"Output saved to {output_file}") + else: + typer.echo(json.dumps(result, indent=2)) + + except Exception as e: + typer.echo(f"Error: {str(e)}", err=True) + + +@app.command() +def print_schema(name: str): + """Print the input schema for a template""" + try: + data_gen_template.list() + template = data_gen_template.get(name) + template.print_schema() + except Exception as e: + typer.echo(f"Error: {str(e)}", err=True) + + +@app.command() +def print_example(name: str): + """Print an example input for a template""" + try: + data_gen_template.list() + template = data_gen_template.get(name) + template.print_example() + except Exception as e: + typer.echo(f"Error: {str(e)}", err=True) + + +def main(): + app() + + +if __name__ == "__main__": + main() diff --git a/tests/data_template/test_data_template.py b/tests/data_template/test_data_template.py index 855f35a..fd94f29 100644 --- a/tests/data_template/test_data_template.py +++ b/tests/data_template/test_data_template.py @@ -2,7 +2,7 @@ import pytest import os from starfish.common.env_loader import 
load_env_file -from starfish.data_template.template_gen import data_gen_template +from starfish import data_gen_template nest_asyncio.apply() load_env_file() From 046e953c111ebda9711998a1516b95203a428305 Mon Sep 17 00:00:00 2001 From: "johnwayne.jiang" Date: Fri, 23 May 2025 17:16:22 -0700 Subject: [PATCH 4/8] update test notbook --- .../factory/test_resume_index_1.ipynb | 570 ------------------ tests/test_notebooks.py | 2 +- 2 files changed, 1 insertion(+), 571 deletions(-) delete mode 100644 tests/data_factory/factory/test_resume_index_1.ipynb diff --git a/tests/data_factory/factory/test_resume_index_1.ipynb b/tests/data_factory/factory/test_resume_index_1.ipynb deleted file mode 100644 index 5691362..0000000 --- a/tests/data_factory/factory/test_resume_index_1.ipynb +++ /dev/null @@ -1,570 +0,0 @@ -{ - "cells": [ - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "%pip install --index-url https://test.pypi.org/simple/ \\\n", - " --extra-index-url https://pypi.org/simple \\\n", - " starfish-core==0.1.3" - ] - }, - { - "cell_type": "code", - "execution_count": 1, - "metadata": {}, - "outputs": [], - "source": [ - "import asyncio\n", - "import time\n", - "import signal\n", - "import threading\n", - "from typing import List, Dict, Any, Optional, Set, Tuple\n", - "\n", - "from starfish import data_factory\n", - "from starfish.common.env_loader import load_env_file\n", - "from starfish.data_factory.utils.mock import mock_llm_call\n", - "from starfish.common.logger import get_logger\n", - "logger = get_logger(__name__)\n", - "# Load environment variables\n", - "load_env_file()\n" - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "metadata": {}, - "outputs": [], - "source": [ - "\n", - "# Apply nest_asyncio for use in Jupyter notebooks\n", - "try:\n", - " import nest_asyncio\n", - " nest_asyncio.apply()\n", - "except ImportError:\n", - " print(\"nest_asyncio not found, skipping. 
This may cause issues if run in a notebook.\")\n" - ] - }, - { - "cell_type": "code", - "execution_count": 3, - "metadata": {}, - "outputs": [], - "source": [ - "\n", - "# Define the data factory function at module level\n", - "@data_factory(max_concurrency=10)\n", - "async def mock_llm_processor(city_name: str, num_records_per_city: int):\n", - " \"\"\"Mock LLM processor that simulates processing with a delay\"\"\"\n", - " # Added sleep to make the process take longer for demonstration\n", - " await asyncio.sleep(0.5)\n", - " return await mock_llm_call(city_name=city_name, num_records_per_city=num_records_per_city, fail_rate=0)\n", - "\n", - "class TestRunner:\n", - " def __init__(self, total_time_limit=15, checkpoint_interval=3, max_checkpoints=10):\n", - " \"\"\"\n", - " Initialize the test runner\n", - " \n", - " Args:\n", - " total_time_limit: Maximum time allowed for the whole test in seconds\n", - " checkpoint_interval: Time between checkpoints (stop/resume) in seconds\n", - " max_checkpoints: Maximum number of checkpoints before forced termination\n", - " \"\"\"\n", - " self.total_time_limit = total_time_limit\n", - " self.checkpoint_interval = checkpoint_interval\n", - " self.max_checkpoints = max_checkpoints\n", - " self.errors = [] # Each item will be a tuple of (step, error_message)\n", - " self.results = None\n", - " self.stop_events = []\n", - " self.job = None\n", - " self.timeout_triggered = False\n", - " self.all_checkpoint_errors = {} # Dictionary to track errors per checkpoint\n", - " \n", - " def add_error(self, step: str, error_message: str):\n", - " \"\"\"Add an error with the associated step information\"\"\"\n", - " self.errors.append((step, error_message))\n", - " \n", - " # Also track errors by checkpoint\n", - " if step not in self.all_checkpoint_errors:\n", - " self.all_checkpoint_errors[step] = []\n", - " self.all_checkpoint_errors[step].append(error_message)\n", - " \n", - " def validate_completion_indices(self, indices, 
step_name=\"Validation\", is_final=False):\n", - " \"\"\"\n", - " Validate that the completion indices are correct\n", - " \n", - " Args:\n", - " indices: The indices to validate\n", - " step_name: The name of the step for error reporting\n", - " is_final: Whether this is the final validation (expecting all indices to be complete)\n", - " \n", - " Returns:\n", - " List of errors found\n", - " \"\"\"\n", - " errors = []\n", - " \n", - " # Safety check for None\n", - " if indices is None:\n", - " error = \"Indices are None\"\n", - " self.add_error(step_name, error)\n", - " return [error]\n", - " \n", - " # Get the completed count\n", - " completed_values = [idx for idx in indices if idx is not None]\n", - " completed_count = len(completed_values)\n", - " \n", - " # For final validation, check if all indices are completed\n", - " if is_final:\n", - " # Check length\n", - " if len(indices) != 100:\n", - " error = f\"Expected 100 indices total, but found {len(indices)}\"\n", - " self.add_error(step_name, error)\n", - " errors.append(error)\n", - " \n", - " # Check that all are completed (no None values)\n", - " if completed_count != 100:\n", - " error = f\"Expected 100 completed indices, but found {completed_count}\"\n", - " self.add_error(step_name, error)\n", - " errors.append(error)\n", - " \n", - " # Check for uniqueness among completed indices (always important)\n", - " unique_indices = set(completed_values)\n", - " if len(unique_indices) != len(completed_values):\n", - " duplicates = [idx for idx in unique_indices if indices.count(idx) > 1]\n", - " error = f\"Found duplicate values: {duplicates}\"\n", - " self.add_error(step_name, error)\n", - " errors.append(error)\n", - " \n", - " # Check range of indices (0-99)\n", - " expected_range = set(range(100))\n", - " extra = unique_indices - expected_range\n", - " \n", - " if extra:\n", - " error = f\"Unexpected indices: {sorted(extra)}\"\n", - " self.add_error(step_name, error)\n", - " errors.append(error)\n", - " 
\n", - " # For final validation, check if any indices are missing\n", - " if is_final:\n", - " missing = expected_range - unique_indices\n", - " if missing:\n", - " error = f\"Missing indices: {sorted(missing)}\"\n", - " self.add_error(step_name, error)\n", - " errors.append(error)\n", - " \n", - " return errors\n", - "\n", - " def interrupt_execution(self):\n", - " \"\"\"Schedule an interruption after the checkpoint interval\"\"\"\n", - " print(f\"⏱️ Scheduling interruption in {self.checkpoint_interval} seconds\")\n", - " timer = threading.Timer(self.checkpoint_interval, self.raise_interrupt)\n", - " self.stop_events.append(timer)\n", - " timer.start()\n", - "\n", - " def raise_interrupt(self):\n", - " \"\"\"Raise a KeyboardInterrupt to stop the execution\"\"\"\n", - " print(\"πŸ›‘ Raising interruption signal\")\n", - " signal.raise_signal(signal.SIGINT)\n", - "\n", - " def setup_timeout(self):\n", - " \"\"\"Set up the overall timeout for the test\"\"\"\n", - " print(f\"⏱️ Setting up timeout limit of {self.total_time_limit} seconds\")\n", - " timeout_timer = threading.Timer(self.total_time_limit, self.handle_timeout)\n", - " self.stop_events.append(timeout_timer)\n", - " timeout_timer.start()\n", - "\n", - " def handle_timeout(self):\n", - " \"\"\"Handle the timeout by setting a flag instead of forcefully exiting\"\"\"\n", - " print(\"⏰ Timeout reached! 
Stopping the job gracefully.\")\n", - " self.add_error(\"Timeout\", f\"Test exceeded maximum time limit of {self.total_time_limit} seconds\")\n", - " # Set a flag instead of hard exiting - this is more Jupyter-friendly\n", - " self.timeout_triggered = True\n", - " # Signal the main thread to stop\n", - " signal.raise_signal(signal.SIGINT)\n", - "\n", - " def cleanup_timers(self):\n", - " \"\"\"Clean up all running timers\"\"\"\n", - " for timer in self.stop_events:\n", - " if timer.is_alive():\n", - " timer.cancel()\n", - " self.stop_events = []\n", - " \n", - " def check_progress_and_validate(self, checkpoint_name):\n", - " \"\"\"\n", - " Check the current progress and validate indices for the current checkpoint\n", - " \n", - " Returns:\n", - " Tuple of (progress_info, completed)\n", - " \"\"\"\n", - " progress_info = \"Unknown\"\n", - " completed = False\n", - " \n", - " try:\n", - " # Safely get job status - avoid calling methods directly on potentially None objects\n", - " if hasattr(self.job, 'get_index_completed') and callable(getattr(self.job, 'get_index_completed')):\n", - " indices = self.job.get_index_completed()\n", - " \n", - " # Safety check\n", - " if indices is not None:\n", - " # Determine if this is final validation based on completion status\n", - " completed_count = len([i for i in indices if i is not None])\n", - " is_final = completed_count == 100\n", - " \n", - " # Perform validation for this checkpoint\n", - " validation_errors = self.validate_completion_indices(\n", - " indices, \n", - " checkpoint_name + \" Validation\",\n", - " is_final=is_final\n", - " )\n", - " if validation_errors:\n", - " print(f\"❌ {checkpoint_name} validation failed:\")\n", - " for err in validation_errors:\n", - " print(f\" - {err}\")\n", - " elif is_final:\n", - " print(f\"βœ… {checkpoint_name} validation passed: All indices are correct\")\n", - " else:\n", - " print(f\"βœ… {checkpoint_name} partial validation passed: {completed_count} indices processed\")\n", - " 
\n", - " progress_info = f\"{completed_count}/100\"\n", - " \n", - " # Check if all tasks are completed\n", - " if completed_count == 100:\n", - " completed = True\n", - " else:\n", - " self.add_error(checkpoint_name, \"Failed to get indices: indices is None\")\n", - " print(f\"⚠️ {checkpoint_name}: Failed to get indices: indices is None\")\n", - " else:\n", - " self.add_error(checkpoint_name, \"Job does not have get_index_completed method\")\n", - " print(f\"⚠️ {checkpoint_name}: Job does not have get_index_completed method\")\n", - " \n", - " except Exception as e:\n", - " self.add_error(checkpoint_name, f\"Error getting indices: {str(e)}\")\n", - " print(f\"❌ {checkpoint_name}: Error getting indices: {str(e)}\")\n", - " \n", - " return progress_info, completed\n", - "\n", - " def _finish_test(self, start_time):\n", - " \"\"\"Finish the test by cleaning up and returning results\"\"\"\n", - " # Clean up timers\n", - " self.cleanup_timers()\n", - " \n", - " # Final validation if we have a job\n", - " if self.job and hasattr(self.job, 'get_index_completed'):\n", - " try:\n", - " final_indices = self.job.get_index_completed()\n", - " # Always perform full validation in the final step\n", - " validation_errors = self.validate_completion_indices(final_indices, \"Final Validation\", is_final=True)\n", - " if validation_errors:\n", - " print(\"❌ Final validation failed:\")\n", - " for err in validation_errors:\n", - " print(f\" - {err}\")\n", - " else:\n", - " print(\"βœ… Final validation passed: All indices are correct\")\n", - " except Exception as e:\n", - " self.add_error(\"Final Validation\", f\"Error getting final indices: {str(e)}\")\n", - " print(f\"❌ Error in final validation: {str(e)}\")\n", - "\n", - " def run_test(self):\n", - " \"\"\"Run the complete test with interruptions and resumptions\"\"\"\n", - " # Create input data\n", - " cities = [\"New York\", \"London\", \"Tokyo\", \"Paris\", \"Sydney\"] * 20 # 100 cities\n", - " \n", - " print(\"=== Starting 
Initial Run ===\")\n", - " start_time = time.time()\n", - " \n", - " try:\n", - " # Setup timers\n", - " self.setup_timeout()\n", - " self.interrupt_execution()\n", - " logger.info(\"start to setup the job\")\n", - " # Start initial run - use the module level decorated function\n", - " self.job = mock_llm_processor # Use the module-level function\n", - " logger.info(\"finish to setup the job\")\n", - " logger.info(self.job.get_index_completed)\n", - " try:\n", - " self.results = self.job.run(city_name=cities, num_records_per_city=1)\n", - " print(\"βœ… Initial run completed without interruption\")\n", - " \n", - " # Check progress and validate after initial run\n", - " progress_info, completed = self.check_progress_and_validate(\"Initial Run\")\n", - " if completed:\n", - " print(\"βœ… All tasks completed in initial run\")\n", - " return self._finish_test(start_time)\n", - " \n", - " except Exception as e:\n", - " self.add_error(\"Initial Run\", f\"Error: {str(e)}\")\n", - " print(f\"❌ Error in Initial Run: {str(e)}\")\n", - " # Don't return here, continue with checkpoint attempts\n", - " except KeyboardInterrupt:\n", - " print(\"⚠️ Initial run interrupted\")\n", - " \n", - " # Check progress and validate after interruption\n", - " progress_info, completed = self.check_progress_and_validate(\"Initial Run (Interrupted)\")\n", - " if completed:\n", - " print(\"βœ… All tasks completed after initial interruption\")\n", - " return self._finish_test(start_time)\n", - " \n", - " except Exception as e:\n", - " self.add_error(\"Initial Run Setup\", f\"Error: {str(e)}\")\n", - " print(f\"❌ Error in Initial Run setup: {str(e)}\")\n", - " # Don't return here, continue with checkpoint attempts\n", - " \n", - " # Resume until complete\n", - " checkpoint_count = 1\n", - " \n", - " # Add a safety counter to prevent infinite loops\n", - " while checkpoint_count <= self.max_checkpoints:\n", - " checkpoint_name = f\"Checkpoint {checkpoint_count}\"\n", - " \n", - " # Check if timeout 
was triggered\n", - " if self.timeout_triggered:\n", - " print(\"⏰ Test timed out - stopping testing loop\")\n", - " break\n", - " \n", - " # Check if we have reached the total time limit\n", - " if time.time() - start_time >= self.total_time_limit:\n", - " self.add_error(checkpoint_name, f\"Test exceeded maximum time limit of {self.total_time_limit} seconds\")\n", - " print(f\"⏰ Test timed out after {self.total_time_limit} seconds\")\n", - " break\n", - " \n", - " # Check if we've hit the max checkpoint count\n", - " if checkpoint_count == self.max_checkpoints:\n", - " self.add_error(checkpoint_name, f\"Test reached maximum checkpoint count of {self.max_checkpoints}\")\n", - " print(f\"⚠️ Test reached maximum checkpoint count of {self.max_checkpoints}\")\n", - " break\n", - " \n", - " # Check if we have a job to resume\n", - " if self.job is None:\n", - " self.add_error(checkpoint_name, \"Cannot continue: job is None\")\n", - " print(\"❌ Cannot continue: job is None\")\n", - " break\n", - " \n", - " # Check progress before resuming\n", - " progress_info, completed = self.check_progress_and_validate(f\"Before {checkpoint_name}\")\n", - " if completed:\n", - " print(f\"βœ… All tasks completed before {checkpoint_name}\")\n", - " break\n", - " \n", - " print(f\"=== Starting {checkpoint_name} ({progress_info}) ===\")\n", - " \n", - " # Resume the job\n", - " try:\n", - " # Setup interruption for the next checkpoint\n", - " self.interrupt_execution()\n", - " \n", - " # Try to resume if the method exists\n", - " if hasattr(self.job, 'resume') and callable(getattr(self.job, 'resume')):\n", - " try:\n", - " self.results = self.job.resume()\n", - " print(f\"βœ… {checkpoint_name} completed without interruption\")\n", - " \n", - " # Check progress after resumption\n", - " progress_info, completed = self.check_progress_and_validate(f\"After {checkpoint_name}\")\n", - " if completed:\n", - " print(f\"βœ… All tasks completed after {checkpoint_name}\")\n", - " break\n", - " \n", 
- " except Exception as e:\n", - " self.add_error(checkpoint_name, f\"Error: {str(e)}\")\n", - " print(f\"❌ Error in {checkpoint_name}: {str(e)}\")\n", - " # Continue to the next checkpoint\n", - " else:\n", - " self.add_error(checkpoint_name, \"Job does not have resume method\")\n", - " print(\"⚠️ Job does not have resume method\")\n", - " break # Can't continue without resume method\n", - " \n", - " except KeyboardInterrupt:\n", - " print(f\"⚠️ {checkpoint_name} interrupted\")\n", - " \n", - " # Check progress after interruption\n", - " progress_info, completed = self.check_progress_and_validate(f\"After {checkpoint_name} (Interrupted)\")\n", - " if completed:\n", - " print(f\"βœ… All tasks completed after {checkpoint_name} interruption\")\n", - " break\n", - " \n", - " checkpoint_count += 1\n", - "\n", - " # Finish the test\n", - " return self._finish_test(start_time)\n", - " \n", - " def _finish_test(self, start_time):\n", - " \"\"\"Finish the test by cleaning up and returning results\"\"\"\n", - " # Clean up timers\n", - " self.cleanup_timers()\n", - " \n", - " # Final validation if we have a job\n", - " if self.job and hasattr(self.job, 'get_index_completed'):\n", - " try:\n", - " final_indices = self.job.get_index_completed()\n", - " validation_errors = self.validate_completion_indices(final_indices, \"Final Validation\")\n", - " if validation_errors:\n", - " print(\"❌ Final validation failed:\")\n", - " for err in validation_errors:\n", - " print(f\" - {err}\")\n", - " else:\n", - " print(\"βœ… Final validation passed: All indices are correct\")\n", - " except Exception as e:\n", - " self.add_error(\"Final Validation\", f\"Error getting final indices: {str(e)}\")\n", - " print(f\"❌ Error in final validation: {str(e)}\")\n", - " \n", - " # Report final status\n", - " total_time = time.time() - start_time\n", - " print(f\"\\n=== Test Summary ===\")\n", - " print(f\"Total execution time: {total_time:.2f} seconds\")\n", - " \n", - " # Group errors by phase type 
for summary\n", - " validation_phases = [p for p in self.all_checkpoint_errors.keys() if \"Validation\" in p]\n", - " checkpoint_phases = [p for p in self.all_checkpoint_errors.keys() if \"Checkpoint\" in p and \"Validation\" not in p]\n", - " timeout_phases = [p for p in self.all_checkpoint_errors.keys() if \"Timeout\" in p]\n", - " other_phases = [p for p in self.all_checkpoint_errors.keys() \n", - " if p not in validation_phases and p not in checkpoint_phases and p not in timeout_phases]\n", - " \n", - " # Report errors by category\n", - " print(\"\\n=== Errors by Phase ===\")\n", - " \n", - " # Show timeout errors first\n", - " if timeout_phases:\n", - " print(\"\\nTimeout Errors:\")\n", - " for phase in timeout_phases:\n", - " for err in self.all_checkpoint_errors[phase]:\n", - " print(f\" - {err}\")\n", - " \n", - " # Show checkpoint execution errors\n", - " if checkpoint_phases:\n", - " print(\"\\nCheckpoint Execution Errors:\")\n", - " for phase in sorted(checkpoint_phases):\n", - " if phase in self.all_checkpoint_errors and self.all_checkpoint_errors[phase]:\n", - " print(f\" {phase}:\")\n", - " for err in self.all_checkpoint_errors[phase]:\n", - " print(f\" - {err}\")\n", - " \n", - " # Show validation errors for each checkpoint\n", - " if validation_phases:\n", - " print(\"\\nValidation Errors:\")\n", - " for phase in sorted(validation_phases):\n", - " if phase in self.all_checkpoint_errors and self.all_checkpoint_errors[phase]:\n", - " print(f\" {phase}:\")\n", - " for err in self.all_checkpoint_errors[phase]:\n", - " print(f\" - {err}\")\n", - " \n", - " # Show other errors\n", - " if other_phases:\n", - " print(\"\\nOther Errors:\")\n", - " for phase in sorted(other_phases):\n", - " if phase in self.all_checkpoint_errors and self.all_checkpoint_errors[phase]:\n", - " print(f\" {phase}:\")\n", - " for err in self.all_checkpoint_errors[phase]:\n", - " print(f\" - {err}\")\n", - " \n", - " if not self.errors:\n", - " print(\"\\nβœ… Test completed 
successfully with no errors\")\n", - " else:\n", - " validation_error_count = sum(len(self.all_checkpoint_errors[p]) for p in validation_phases if p in self.all_checkpoint_errors)\n", - " checkpoint_error_count = sum(len(self.all_checkpoint_errors[p]) for p in checkpoint_phases if p in self.all_checkpoint_errors)\n", - " timeout_error_count = sum(len(self.all_checkpoint_errors[p]) for p in timeout_phases if p in self.all_checkpoint_errors)\n", - " other_error_count = sum(len(self.all_checkpoint_errors[p]) for p in other_phases if p in self.all_checkpoint_errors)\n", - " \n", - " print(f\"\\n❌ Test completed with {len(self.errors)} errors:\")\n", - " print(f\" - {timeout_error_count} timeout errors\")\n", - " print(f\" - {checkpoint_error_count} checkpoint execution errors\")\n", - " print(f\" - {validation_error_count} validation errors\")\n", - " print(f\" - {other_error_count} other errors\")\n", - " \n", - " return {\n", - " \"success\": len(self.errors) == 0,\n", - " \"errors\": self.errors,\n", - " \"errors_by_checkpoint\": self.all_checkpoint_errors,\n", - " \"total_time\": total_time,\n", - " \"results\": self.results\n", - " }" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "\n", - "# Run the test\n", - "runner = TestRunner(total_time_limit=20, checkpoint_interval=3, max_checkpoints=10)\n", - "result = runner.run_test()\n", - "if not result[\"success\"]:\n", - " # Format error message to include all errors organized by category\n", - " error_parts = []\n", - " \n", - " # Categorize phases\n", - " validation_phases = [p for p in result[\"errors_by_checkpoint\"].keys() if \"Validation\" in p]\n", - " checkpoint_phases = [p for p in result[\"errors_by_checkpoint\"].keys() if \"Checkpoint\" in p and \"Validation\" not in p]\n", - " timeout_phases = [p for p in result[\"errors_by_checkpoint\"].keys() if \"Timeout\" in p]\n", - " other_phases = [p for p in result[\"errors_by_checkpoint\"].keys() 
\n", - " if p not in validation_phases and p not in checkpoint_phases and p not in timeout_phases]\n", - " \n", - " # Add timeout errors first\n", - " if timeout_phases:\n", - " error_parts.append(\"\\n=== TIMEOUT ERRORS ===\")\n", - " for phase in timeout_phases:\n", - " for err in result[\"errors_by_checkpoint\"][phase]:\n", - " error_parts.append(f\"- {err}\")\n", - " \n", - " # Add checkpoint execution errors\n", - " if checkpoint_phases:\n", - " error_parts.append(\"\\n=== CHECKPOINT EXECUTION ERRORS ===\")\n", - " for phase in sorted(checkpoint_phases):\n", - " if phase in result[\"errors_by_checkpoint\"] and result[\"errors_by_checkpoint\"][phase]:\n", - " error_parts.append(f\"\\n-- {phase} --\")\n", - " for err in result[\"errors_by_checkpoint\"][phase]:\n", - " error_parts.append(f\"- {err}\")\n", - " \n", - " # Add validation errors for each checkpoint\n", - " if validation_phases:\n", - " error_parts.append(\"\\n=== VALIDATION ERRORS ===\")\n", - " for phase in sorted(validation_phases):\n", - " if phase in result[\"errors_by_checkpoint\"] and result[\"errors_by_checkpoint\"][phase]:\n", - " error_parts.append(f\"\\n-- {phase} --\")\n", - " for err in result[\"errors_by_checkpoint\"][phase]:\n", - " error_parts.append(f\"- {err}\")\n", - " \n", - " # Add other errors\n", - " if other_phases:\n", - " error_parts.append(\"\\n=== OTHER ERRORS ===\")\n", - " for phase in sorted(other_phases):\n", - " if phase in result[\"errors_by_checkpoint\"] and result[\"errors_by_checkpoint\"][phase]:\n", - " error_parts.append(f\"\\n-- {phase} --\")\n", - " for err in result[\"errors_by_checkpoint\"][phase]:\n", - " error_parts.append(f\"- {err}\")\n", - " \n", - " error_message = \"\\n\".join(error_parts)\n", - " validation_error_count = sum(len(result[\"errors_by_checkpoint\"][p]) for p in validation_phases if p in result[\"errors_by_checkpoint\"])\n", - " checkpoint_error_count = sum(len(result[\"errors_by_checkpoint\"][p]) for p in checkpoint_phases if p in 
result[\"errors_by_checkpoint\"])\n", - " timeout_error_count = sum(len(result[\"errors_by_checkpoint\"][p]) for p in timeout_phases if p in result[\"errors_by_checkpoint\"])\n", - " other_error_count = sum(len(result[\"errors_by_checkpoint\"][p]) for p in other_phases if p in result[\"errors_by_checkpoint\"])\n", - " \n", - " raise RuntimeError(f\"Test failed with {len(result['errors'])} total errors ({timeout_error_count} timeout, {checkpoint_error_count} execution, {validation_error_count} validation, {other_error_count} other):{error_message}\")" - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "starfish-core-T7IInzTH-py3.11", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.11.7" - } - }, - "nbformat": 4, - "nbformat_minor": 2 -} diff --git a/tests/test_notebooks.py b/tests/test_notebooks.py index f3d9ce3..cddd5ff 100644 --- a/tests/test_notebooks.py +++ b/tests/test_notebooks.py @@ -13,7 +13,7 @@ def get_notebooks(base_dir=None): """Find all test notebooks in the project directory.""" if base_dir is None: - base_dir = Path(__file__).parent.parent + base_dir = Path(__file__).parent else: base_dir = Path(base_dir) From e26d0387e5babbc77507c2ccba3f51274424d5d2 Mon Sep 17 00:00:00 2001 From: "johnwayne.jiang" Date: Fri, 23 May 2025 17:20:40 -0700 Subject: [PATCH 5/8] update test notbook --- tests/test_notebooks.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/tests/test_notebooks.py b/tests/test_notebooks.py index cddd5ff..ae8400b 100644 --- a/tests/test_notebooks.py +++ b/tests/test_notebooks.py @@ -22,6 +22,9 @@ def get_notebooks(base_dir=None): # Skip checkpoints if ".ipynb_checkpoints" in str(nb_path): continue + # Skip specific notebook + if "data_factory.ipynb" in str(nb_path): + continue # 
Only include notebooks that follow test naming convention if nb_path.name.startswith("test_"): notebooks.append(str(nb_path)) From 185ebd97a95aae4ee91b161297c83bb2ec1eed34 Mon Sep 17 00:00:00 2001 From: "johnwayne.jiang" Date: Fri, 23 May 2025 17:37:22 -0700 Subject: [PATCH 6/8] update test notbook --- tests/test_notebooks.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/tests/test_notebooks.py b/tests/test_notebooks.py index ae8400b..f34830a 100644 --- a/tests/test_notebooks.py +++ b/tests/test_notebooks.py @@ -38,6 +38,9 @@ def test_notebook_execution(notebook_file): """Run the notebook through pytest to verify it executes without errors.""" pytest.importorskip("nbval") + if "data_factory.ipynb" in notebook_file: + pytest.skip("Skipping data_factory.ipynb as it is excluded from testing") + # This test will be collected by pytest # We just need to ensure the file exists assert os.path.exists(notebook_file), f"Notebook file not found: {notebook_file}" From 410846649d53df606c1beea369f65eec73803bc0 Mon Sep 17 00:00:00 2001 From: "johnwayne.jiang" Date: Fri, 23 May 2025 17:43:35 -0700 Subject: [PATCH 7/8] update test notbook --- pytest.ini | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/pytest.ini b/pytest.ini index bf3fe09..09da31e 100644 --- a/pytest.ini +++ b/pytest.ini @@ -1,4 +1,7 @@ [pytest] asyncio_mode = auto timeout = 300 -timeout_method = thread \ No newline at end of file +timeout_method = thread +norecursedirs = .ipynb_checkpoints +python_files = test_*.py +ignore = tests/data_factory/factory/data_factory.ipynb \ No newline at end of file From cedafcb647595b0c27fe279c53fdcfa738fac8b2 Mon Sep 17 00:00:00 2001 From: "johnwayne.jiang" Date: Fri, 23 May 2025 17:50:32 -0700 Subject: [PATCH 8/8] fix pipeline error --- .github/workflows/publish_testpypi.yaml | 5 +- tests/data_factory/factory/data_factory.ipynb | 681 ------------------ 2 files changed, 3 insertions(+), 683 deletions(-) delete mode 100644 
tests/data_factory/factory/data_factory.ipynb diff --git a/.github/workflows/publish_testpypi.yaml b/.github/workflows/publish_testpypi.yaml index b22dbf2..a090efa 100644 --- a/.github/workflows/publish_testpypi.yaml +++ b/.github/workflows/publish_testpypi.yaml @@ -57,6 +57,7 @@ jobs: with: sparse-checkout: | tests/* + examples/data_factory.ipynb sparse-checkout-cone-mode: false - name: Update system packages run: | @@ -88,13 +89,13 @@ jobs: --ExecutePreprocessor.timeout=120 \ --no-prompt --no-input \ --stdout \ - tests/data_factory/factory/data_factory.ipynb; then + examples/data_factory.ipynb; then echo "::error::Notebook execution failed" fi echo "Notebook executed successfully. Summary:" && \ jupyter nbconvert --to markdown --stdout \ - tests/data_factory/factory/data_factory.ipynb | \ + examples/data_factory.ipynb | \ grep -E '^#|^##' || true # Add tag deletion step diff --git a/tests/data_factory/factory/data_factory.ipynb b/tests/data_factory/factory/data_factory.ipynb deleted file mode 100644 index bac6c16..0000000 --- a/tests/data_factory/factory/data_factory.ipynb +++ /dev/null @@ -1,681 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Google Colab Version: [Open this notebook in Google Colab](https://colab.research.google.com/github/starfishdata/starfish/blob/main/examples/data_factory.ipynb)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "#### Dependencies " - ] - }, - { - "cell_type": "code", - "execution_count": 11, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Requirement already satisfied: starfish-core in /Users/zhengisamazing/Library/Caches/pypoetry/virtualenvs/starfish-T7IInzTH-py3.11/lib/python3.11/site-packages (0.1.0)\n", - "Requirement already satisfied: aiofiles<25.0.0,>=24.1.0 in /Users/zhengisamazing/Library/Caches/pypoetry/virtualenvs/starfish-T7IInzTH-py3.11/lib/python3.11/site-packages (from starfish-core) 
(24.1.0)\n", - "Requirement already satisfied: aiosqlite<0.22.0,>=0.21.0 in /Users/zhengisamazing/Library/Caches/pypoetry/virtualenvs/starfish-T7IInzTH-py3.11/lib/python3.11/site-packages (from starfish-core) (0.21.0)\n", - "Requirement already satisfied: cachetools<6.0.0,>=5.5.2 in /Users/zhengisamazing/Library/Caches/pypoetry/virtualenvs/starfish-T7IInzTH-py3.11/lib/python3.11/site-packages (from starfish-core) (5.5.2)\n", - "Requirement already satisfied: litellm<2.0.0,>=1.65.1 in /Users/zhengisamazing/Library/Caches/pypoetry/virtualenvs/starfish-T7IInzTH-py3.11/lib/python3.11/site-packages (from starfish-core) (1.65.1)\n", - "Requirement already satisfied: loguru<0.8.0,>=0.7.3 in /Users/zhengisamazing/Library/Caches/pypoetry/virtualenvs/starfish-T7IInzTH-py3.11/lib/python3.11/site-packages (from starfish-core) (0.7.3)\n", - "Requirement already satisfied: ollama<0.5.0,>=0.4.7 in /Users/zhengisamazing/Library/Caches/pypoetry/virtualenvs/starfish-T7IInzTH-py3.11/lib/python3.11/site-packages (from starfish-core) (0.4.7)\n", - "Requirement already satisfied: platformdirs<5.0.0,>=4.1.0 in /Users/zhengisamazing/Library/Caches/pypoetry/virtualenvs/starfish-T7IInzTH-py3.11/lib/python3.11/site-packages (from starfish-core) (4.3.7)\n", - "Requirement already satisfied: psutil<8.0.0,>=7.0.0 in /Users/zhengisamazing/Library/Caches/pypoetry/virtualenvs/starfish-T7IInzTH-py3.11/lib/python3.11/site-packages (from starfish-core) (7.0.0)\n", - "Requirement already satisfied: python-dotenv<2.0.0,>=1.1.0 in /Users/zhengisamazing/Library/Caches/pypoetry/virtualenvs/starfish-T7IInzTH-py3.11/lib/python3.11/site-packages (from starfish-core) (1.1.0)\n", - "Requirement already satisfied: typing-extensions<5.0.0,>=4.0.0 in /Users/zhengisamazing/Library/Caches/pypoetry/virtualenvs/starfish-T7IInzTH-py3.11/lib/python3.11/site-packages (from starfish-core) (4.13.0)\n", - "Requirement already satisfied: aiohttp in 
/Users/zhengisamazing/Library/Caches/pypoetry/virtualenvs/starfish-T7IInzTH-py3.11/lib/python3.11/site-packages (from litellm<2.0.0,>=1.65.1->starfish-core) (3.11.16)\n", - "Requirement already satisfied: click in /Users/zhengisamazing/Library/Caches/pypoetry/virtualenvs/starfish-T7IInzTH-py3.11/lib/python3.11/site-packages (from litellm<2.0.0,>=1.65.1->starfish-core) (8.1.8)\n", - "Requirement already satisfied: httpx>=0.23.0 in /Users/zhengisamazing/Library/Caches/pypoetry/virtualenvs/starfish-T7IInzTH-py3.11/lib/python3.11/site-packages (from litellm<2.0.0,>=1.65.1->starfish-core) (0.28.1)\n", - "Requirement already satisfied: importlib-metadata>=6.8.0 in /Users/zhengisamazing/Library/Caches/pypoetry/virtualenvs/starfish-T7IInzTH-py3.11/lib/python3.11/site-packages (from litellm<2.0.0,>=1.65.1->starfish-core) (8.6.1)\n", - "Requirement already satisfied: jinja2<4.0.0,>=3.1.2 in /Users/zhengisamazing/Library/Caches/pypoetry/virtualenvs/starfish-T7IInzTH-py3.11/lib/python3.11/site-packages (from litellm<2.0.0,>=1.65.1->starfish-core) (3.1.6)\n", - "Requirement already satisfied: jsonschema<5.0.0,>=4.22.0 in /Users/zhengisamazing/Library/Caches/pypoetry/virtualenvs/starfish-T7IInzTH-py3.11/lib/python3.11/site-packages (from litellm<2.0.0,>=1.65.1->starfish-core) (4.23.0)\n", - "Requirement already satisfied: openai>=1.68.2 in /Users/zhengisamazing/Library/Caches/pypoetry/virtualenvs/starfish-T7IInzTH-py3.11/lib/python3.11/site-packages (from litellm<2.0.0,>=1.65.1->starfish-core) (1.70.0)\n", - "Requirement already satisfied: pydantic<3.0.0,>=2.0.0 in /Users/zhengisamazing/Library/Caches/pypoetry/virtualenvs/starfish-T7IInzTH-py3.11/lib/python3.11/site-packages (from litellm<2.0.0,>=1.65.1->starfish-core) (2.11.1)\n", - "Requirement already satisfied: tiktoken>=0.7.0 in /Users/zhengisamazing/Library/Caches/pypoetry/virtualenvs/starfish-T7IInzTH-py3.11/lib/python3.11/site-packages (from litellm<2.0.0,>=1.65.1->starfish-core) (0.9.0)\n", - "Requirement already 
satisfied: tokenizers in /Users/zhengisamazing/Library/Caches/pypoetry/virtualenvs/starfish-T7IInzTH-py3.11/lib/python3.11/site-packages (from litellm<2.0.0,>=1.65.1->starfish-core) (0.21.1)\n", - "Requirement already satisfied: anyio in /Users/zhengisamazing/Library/Caches/pypoetry/virtualenvs/starfish-T7IInzTH-py3.11/lib/python3.11/site-packages (from httpx>=0.23.0->litellm<2.0.0,>=1.65.1->starfish-core) (4.9.0)\n", - "Requirement already satisfied: certifi in /Users/zhengisamazing/Library/Caches/pypoetry/virtualenvs/starfish-T7IInzTH-py3.11/lib/python3.11/site-packages (from httpx>=0.23.0->litellm<2.0.0,>=1.65.1->starfish-core) (2025.1.31)\n", - "Requirement already satisfied: httpcore==1.* in /Users/zhengisamazing/Library/Caches/pypoetry/virtualenvs/starfish-T7IInzTH-py3.11/lib/python3.11/site-packages (from httpx>=0.23.0->litellm<2.0.0,>=1.65.1->starfish-core) (1.0.7)\n", - "Requirement already satisfied: idna in /Users/zhengisamazing/Library/Caches/pypoetry/virtualenvs/starfish-T7IInzTH-py3.11/lib/python3.11/site-packages (from httpx>=0.23.0->litellm<2.0.0,>=1.65.1->starfish-core) (3.10)\n", - "Requirement already satisfied: h11<0.15,>=0.13 in /Users/zhengisamazing/Library/Caches/pypoetry/virtualenvs/starfish-T7IInzTH-py3.11/lib/python3.11/site-packages (from httpcore==1.*->httpx>=0.23.0->litellm<2.0.0,>=1.65.1->starfish-core) (0.14.0)\n", - "Requirement already satisfied: zipp>=3.20 in /Users/zhengisamazing/Library/Caches/pypoetry/virtualenvs/starfish-T7IInzTH-py3.11/lib/python3.11/site-packages (from importlib-metadata>=6.8.0->litellm<2.0.0,>=1.65.1->starfish-core) (3.21.0)\n", - "Requirement already satisfied: MarkupSafe>=2.0 in /Users/zhengisamazing/Library/Caches/pypoetry/virtualenvs/starfish-T7IInzTH-py3.11/lib/python3.11/site-packages (from jinja2<4.0.0,>=3.1.2->litellm<2.0.0,>=1.65.1->starfish-core) (3.0.2)\n", - "Requirement already satisfied: attrs>=22.2.0 in 
/Users/zhengisamazing/Library/Caches/pypoetry/virtualenvs/starfish-T7IInzTH-py3.11/lib/python3.11/site-packages (from jsonschema<5.0.0,>=4.22.0->litellm<2.0.0,>=1.65.1->starfish-core) (25.3.0)\n", - "Requirement already satisfied: jsonschema-specifications>=2023.03.6 in /Users/zhengisamazing/Library/Caches/pypoetry/virtualenvs/starfish-T7IInzTH-py3.11/lib/python3.11/site-packages (from jsonschema<5.0.0,>=4.22.0->litellm<2.0.0,>=1.65.1->starfish-core) (2024.10.1)\n", - "Requirement already satisfied: referencing>=0.28.4 in /Users/zhengisamazing/Library/Caches/pypoetry/virtualenvs/starfish-T7IInzTH-py3.11/lib/python3.11/site-packages (from jsonschema<5.0.0,>=4.22.0->litellm<2.0.0,>=1.65.1->starfish-core) (0.36.2)\n", - "Requirement already satisfied: rpds-py>=0.7.1 in /Users/zhengisamazing/Library/Caches/pypoetry/virtualenvs/starfish-T7IInzTH-py3.11/lib/python3.11/site-packages (from jsonschema<5.0.0,>=4.22.0->litellm<2.0.0,>=1.65.1->starfish-core) (0.24.0)\n", - "Requirement already satisfied: distro<2,>=1.7.0 in /Users/zhengisamazing/Library/Caches/pypoetry/virtualenvs/starfish-T7IInzTH-py3.11/lib/python3.11/site-packages (from openai>=1.68.2->litellm<2.0.0,>=1.65.1->starfish-core) (1.9.0)\n", - "Requirement already satisfied: jiter<1,>=0.4.0 in /Users/zhengisamazing/Library/Caches/pypoetry/virtualenvs/starfish-T7IInzTH-py3.11/lib/python3.11/site-packages (from openai>=1.68.2->litellm<2.0.0,>=1.65.1->starfish-core) (0.9.0)\n", - "Requirement already satisfied: sniffio in /Users/zhengisamazing/Library/Caches/pypoetry/virtualenvs/starfish-T7IInzTH-py3.11/lib/python3.11/site-packages (from openai>=1.68.2->litellm<2.0.0,>=1.65.1->starfish-core) (1.3.1)\n", - "Requirement already satisfied: tqdm>4 in /Users/zhengisamazing/Library/Caches/pypoetry/virtualenvs/starfish-T7IInzTH-py3.11/lib/python3.11/site-packages (from openai>=1.68.2->litellm<2.0.0,>=1.65.1->starfish-core) (4.67.1)\n", - "Requirement already satisfied: annotated-types>=0.6.0 in 
/Users/zhengisamazing/Library/Caches/pypoetry/virtualenvs/starfish-T7IInzTH-py3.11/lib/python3.11/site-packages (from pydantic<3.0.0,>=2.0.0->litellm<2.0.0,>=1.65.1->starfish-core) (0.7.0)\n", - "Requirement already satisfied: pydantic-core==2.33.0 in /Users/zhengisamazing/Library/Caches/pypoetry/virtualenvs/starfish-T7IInzTH-py3.11/lib/python3.11/site-packages (from pydantic<3.0.0,>=2.0.0->litellm<2.0.0,>=1.65.1->starfish-core) (2.33.0)\n", - "Requirement already satisfied: typing-inspection>=0.4.0 in /Users/zhengisamazing/Library/Caches/pypoetry/virtualenvs/starfish-T7IInzTH-py3.11/lib/python3.11/site-packages (from pydantic<3.0.0,>=2.0.0->litellm<2.0.0,>=1.65.1->starfish-core) (0.4.0)\n", - "Requirement already satisfied: regex>=2022.1.18 in /Users/zhengisamazing/Library/Caches/pypoetry/virtualenvs/starfish-T7IInzTH-py3.11/lib/python3.11/site-packages (from tiktoken>=0.7.0->litellm<2.0.0,>=1.65.1->starfish-core) (2024.11.6)\n", - "Requirement already satisfied: requests>=2.26.0 in /Users/zhengisamazing/Library/Caches/pypoetry/virtualenvs/starfish-T7IInzTH-py3.11/lib/python3.11/site-packages (from tiktoken>=0.7.0->litellm<2.0.0,>=1.65.1->starfish-core) (2.32.3)\n", - "Requirement already satisfied: aiohappyeyeballs>=2.3.0 in /Users/zhengisamazing/Library/Caches/pypoetry/virtualenvs/starfish-T7IInzTH-py3.11/lib/python3.11/site-packages (from aiohttp->litellm<2.0.0,>=1.65.1->starfish-core) (2.6.1)\n", - "Requirement already satisfied: aiosignal>=1.1.2 in /Users/zhengisamazing/Library/Caches/pypoetry/virtualenvs/starfish-T7IInzTH-py3.11/lib/python3.11/site-packages (from aiohttp->litellm<2.0.0,>=1.65.1->starfish-core) (1.3.2)\n", - "Requirement already satisfied: frozenlist>=1.1.1 in /Users/zhengisamazing/Library/Caches/pypoetry/virtualenvs/starfish-T7IInzTH-py3.11/lib/python3.11/site-packages (from aiohttp->litellm<2.0.0,>=1.65.1->starfish-core) (1.5.0)\n", - "Requirement already satisfied: multidict<7.0,>=4.5 in 
/Users/zhengisamazing/Library/Caches/pypoetry/virtualenvs/starfish-T7IInzTH-py3.11/lib/python3.11/site-packages (from aiohttp->litellm<2.0.0,>=1.65.1->starfish-core) (6.3.1)\n", - "Requirement already satisfied: propcache>=0.2.0 in /Users/zhengisamazing/Library/Caches/pypoetry/virtualenvs/starfish-T7IInzTH-py3.11/lib/python3.11/site-packages (from aiohttp->litellm<2.0.0,>=1.65.1->starfish-core) (0.3.1)\n", - "Requirement already satisfied: yarl<2.0,>=1.17.0 in /Users/zhengisamazing/Library/Caches/pypoetry/virtualenvs/starfish-T7IInzTH-py3.11/lib/python3.11/site-packages (from aiohttp->litellm<2.0.0,>=1.65.1->starfish-core) (1.18.3)\n", - "Requirement already satisfied: huggingface-hub<1.0,>=0.16.4 in /Users/zhengisamazing/Library/Caches/pypoetry/virtualenvs/starfish-T7IInzTH-py3.11/lib/python3.11/site-packages (from tokenizers->litellm<2.0.0,>=1.65.1->starfish-core) (0.30.1)\n", - "Requirement already satisfied: filelock in /Users/zhengisamazing/Library/Caches/pypoetry/virtualenvs/starfish-T7IInzTH-py3.11/lib/python3.11/site-packages (from huggingface-hub<1.0,>=0.16.4->tokenizers->litellm<2.0.0,>=1.65.1->starfish-core) (3.18.0)\n", - "Requirement already satisfied: fsspec>=2023.5.0 in /Users/zhengisamazing/Library/Caches/pypoetry/virtualenvs/starfish-T7IInzTH-py3.11/lib/python3.11/site-packages (from huggingface-hub<1.0,>=0.16.4->tokenizers->litellm<2.0.0,>=1.65.1->starfish-core) (2025.3.2)\n", - "Requirement already satisfied: packaging>=20.9 in /Users/zhengisamazing/Library/Caches/pypoetry/virtualenvs/starfish-T7IInzTH-py3.11/lib/python3.11/site-packages (from huggingface-hub<1.0,>=0.16.4->tokenizers->litellm<2.0.0,>=1.65.1->starfish-core) (24.2)\n", - "Requirement already satisfied: pyyaml>=5.1 in /Users/zhengisamazing/Library/Caches/pypoetry/virtualenvs/starfish-T7IInzTH-py3.11/lib/python3.11/site-packages (from huggingface-hub<1.0,>=0.16.4->tokenizers->litellm<2.0.0,>=1.65.1->starfish-core) (6.0.2)\n", - "Requirement already satisfied: charset-normalizer<4,>=2 
in /Users/zhengisamazing/Library/Caches/pypoetry/virtualenvs/starfish-T7IInzTH-py3.11/lib/python3.11/site-packages (from requests>=2.26.0->tiktoken>=0.7.0->litellm<2.0.0,>=1.65.1->starfish-core) (3.4.1)\n", - "Requirement already satisfied: urllib3<3,>=1.21.1 in /Users/zhengisamazing/Library/Caches/pypoetry/virtualenvs/starfish-T7IInzTH-py3.11/lib/python3.11/site-packages (from requests>=2.26.0->tiktoken>=0.7.0->litellm<2.0.0,>=1.65.1->starfish-core) (2.3.0)\n", - "Note: you may need to restart the kernel to use updated packages.\n" - ] - } - ], - "source": [ - "%pip install starfish-core" - ] - }, - { - "cell_type": "code", - "execution_count": 1, - "metadata": {}, - "outputs": [], - "source": [ - "## Fix for Jupyter Notebook only β€” do NOT use in production\n", - "## Enables async code execution in notebooks, but may cause issues with sync/async issues\n", - "## For production, please run in standard .py files without this workaround\n", - "## See: https://github.com/erdewit/nest_asyncio for more details\n", - "import nest_asyncio\n", - "nest_asyncio.apply()\n", - "\n", - "from starfish import StructuredLLM, data_factory\n", - "from starfish.llm.utils import merge_structured_outputs\n", - "\n", - "from starfish.common.env_loader import load_env_file ## Load environment variables from .env file\n", - "load_env_file()" - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "metadata": {}, - "outputs": [], - "source": [ - "# setup your openai api key if not already set\n", - "# import os\n", - "# os.environ[\"OPENAI_API_KEY\"] = \"your_key_here\"\n", - "\n", - "# If you dont have any API key, please navigate to local model section" - ] - }, - { - "cell_type": "code", - "execution_count": 3, - "metadata": {}, - "outputs": [], - "source": [ - "## Helper function mock llm call\n", - "# When developing data pipelines with LLMs, making thousands of real API calls\n", - "# can be expensive. 
Using mock LLM calls lets you test your pipeline's reliability,\n", - "# failure handling, and recovery without spending money on API calls.\n", - "from starfish.data_factory.utils.mock import mock_llm_call" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "#### 1. Your First Data Factory: Simple Scaling\n", - "\n", - "The @data_factory decorator transforms any async function into a scalable data processing pipeline.\n", - "It handles:\n", - "- Parallel execution \n", - "- Automatic batching\n", - "- Error handling & retries\n", - "- Progress tracking\n", - "\n", - "Let's start with a single LLM call and then show how easy it is to scale it.\n" - ] - }, - { - "cell_type": "code", - "execution_count": 4, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "[{'fact': 'New Yorkers consume around 1,000,000 slices of pizza every day, which means if you laid them all in a line, they would stretch from the Statue of Liberty to the Eiffel Tower... and back!'}]" - ] - }, - "execution_count": 4, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "# First, create a StructuredLLM instance for generating facts about cities\n", - "json_llm = StructuredLLM(\n", - " model_name = \"openai/gpt-4o-mini\",\n", - " prompt = \"Funny facts about city {{city_name}}.\",\n", - " output_schema = [{'name': 'fact', 'type': 'str'}],\n", - " model_kwargs = {\"temperature\": 0.7},\n", - ")\n", - "\n", - "json_llm_response = await json_llm.run(city_name='New York')\n", - "json_llm_response.data" - ] - }, - { - "cell_type": "code", - "execution_count": 5, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "\u001b[32m2025-04-25 10:16:32\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[1m\u001b[1m[JOB START]\u001b[0m \u001b[36mMaster Job ID: 8c926411-63e7-4dc6-98c9-861c3489fb8b\u001b[0m | \u001b[33mLogging progress every 3 seconds\u001b[0m\u001b[0m\n", - "\u001b[32m2025-04-25 
10:16:32\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[1m[JOB PROGRESS] \u001b[32mCompleted: 0/5\u001b[0m | \u001b[33mRunning: 5\u001b[0m | \u001b[36mAttempted: 0\u001b[0m (\u001b[32mCompleted: 0\u001b[0m, \u001b[31mFailed: 0\u001b[0m, \u001b[35mFiltered: 0\u001b[0m, \u001b[34mDuplicate: 0\u001b[0m)\u001b[0m\n", - "Processing New York at 2025-04-25 10:16:32.524033\n", - "Processing London at 2025-04-25 10:16:32.524286\n", - "Processing Tokyo at 2025-04-25 10:16:32.524979\n", - "Processing Paris at 2025-04-25 10:16:32.525535\n", - "Processing Sydney at 2025-04-25 10:16:32.526729\n", - "\u001b[32m2025-04-25 10:16:34\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[1mSending telemetry event, TELEMETRY_ENABLED=true\u001b[0m\n", - "\u001b[32m2025-04-25 10:16:34\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[1m[JOB FINISHED] \u001b[1mFinal Status:\u001b[0m \u001b[32mCompleted: 5/5\u001b[0m | \u001b[33mAttempted: 5\u001b[0m (Failed: 0, Filtered: 0, Duplicate: 0)\u001b[0m\n" - ] - }, - { - "data": { - "text/plain": [ - "[{'fact': \"In Tokyo, there's a train station called 'Shinjuku' that handles more passengers each day than the entire population of the United States!\"},\n", - " {'fact': \"London has a 'secret' underground city known as the 'London Stone', which is said to have magical powers, making it one of the city's most famous and quirky legends!\"},\n", - " {'fact': 'In Paris, you can legally marry a dead person! This quirky law allows for posthumous marriages, as long as you can prove that the deceased had intended to marry you before their untimely demise.'},\n", - " {'fact': 'In New York City, there are more than 25,000 licensed taxis, but only about 1,200 of them are actually yellow. The rest are a rainbow of colors, including pink, blue, and even animal print!'},\n", - " {'fact': 'Sydney has a beach where you can surf, swim, and even watch a film – all in one day! 
Just don’t forget your sunscreen and popcorn!'}]" - ] - }, - "execution_count": 5, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "# Now, scale to multiple cities using data_factory\n", - "# Just add the @data_factory decorator to process many cities in parallel\n", - "\n", - "from datetime import datetime\n", - "@data_factory(max_concurrency=10)\n", - "async def process_json_llm(city_name: str):\n", - " ## Adding a print statement to indicate the start of the processing\n", - " print(f\"Processing {city_name} at {datetime.now()}\")\n", - " json_llm_response = await json_llm.run(city_name=city_name)\n", - " return json_llm_response.data\n", - "\n", - "# This is all it takes to scale from one city to many cities!\n", - "process_json_llm.run(city_name=[\"New York\", \"London\", \"Tokyo\", \"Paris\", \"Sydney\"])" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "#### 2. Works with any aysnc function\n", - "\n", - "Data Factory works with any async function, not just LLM calls, you can build complex pipelines involving multiple LLMs, data processing, etc.\n", - "\n", - "Here is example of two chained structured llm" - ] - }, - { - "cell_type": "code", - "execution_count": 6, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "\u001b[32m2025-04-25 10:16:34\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[1m\u001b[1m[JOB START]\u001b[0m \u001b[36mMaster Job ID: 466fca03-85a2-46de-b135-629cd76738f7\u001b[0m | \u001b[33mLogging progress every 3 seconds\u001b[0m\u001b[0m\n", - "\u001b[32m2025-04-25 10:16:34\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[1m[JOB PROGRESS] \u001b[32mCompleted: 0/3\u001b[0m | \u001b[33mRunning: 3\u001b[0m | \u001b[36mAttempted: 0\u001b[0m (\u001b[32mCompleted: 0\u001b[0m, \u001b[31mFailed: 0\u001b[0m, \u001b[35mFiltered: 0\u001b[0m, \u001b[34mDuplicate: 0\u001b[0m)\u001b[0m\n", - "\u001b[32m2025-04-25 10:16:37\u001b[0m | \u001b[1mINFO \u001b[0m | 
\u001b[1m[JOB PROGRESS] \u001b[32mCompleted: 0/3\u001b[0m | \u001b[33mRunning: 3\u001b[0m | \u001b[36mAttempted: 0\u001b[0m (\u001b[32mCompleted: 0\u001b[0m, \u001b[31mFailed: 0\u001b[0m, \u001b[35mFiltered: 0\u001b[0m, \u001b[34mDuplicate: 0\u001b[0m)\u001b[0m\n", - "\u001b[32m2025-04-25 10:16:40\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[1m[JOB PROGRESS] \u001b[32mCompleted: 0/3\u001b[0m | \u001b[33mRunning: 3\u001b[0m | \u001b[36mAttempted: 0\u001b[0m (\u001b[32mCompleted: 0\u001b[0m, \u001b[31mFailed: 0\u001b[0m, \u001b[35mFiltered: 0\u001b[0m, \u001b[34mDuplicate: 0\u001b[0m)\u001b[0m\n", - "\u001b[32m2025-04-25 10:16:43\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[1m[JOB PROGRESS] \u001b[32mCompleted: 2/3\u001b[0m | \u001b[33mRunning: 1\u001b[0m | \u001b[36mAttempted: 2\u001b[0m (\u001b[32mCompleted: 2\u001b[0m, \u001b[31mFailed: 0\u001b[0m, \u001b[35mFiltered: 0\u001b[0m, \u001b[34mDuplicate: 0\u001b[0m)\u001b[0m\n", - "\u001b[32m2025-04-25 10:16:44\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[1mSending telemetry event, TELEMETRY_ENABLED=true\u001b[0m\n", - "\u001b[32m2025-04-25 10:16:44\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[1m[JOB FINISHED] \u001b[1mFinal Status:\u001b[0m \u001b[32mCompleted: 3/3\u001b[0m | \u001b[33mAttempted: 3\u001b[0m (Failed: 0, Filtered: 0, Duplicate: 0)\u001b[0m\n" - ] - } - ], - "source": [ - "# Example of a more complex function that chains multiple LLM calls\n", - "# This was grabbed from structured llm examples \n", - "\n", - "@data_factory(max_concurrency=10)\n", - "async def complex_process_cities(topic: str):\n", - " ## topic β†’ generator_llm β†’ rating_llm β†’ merged results\n", - " # First LLM to generate question/answer pairs\n", - " generator_llm = StructuredLLM(\n", - " model_name=\"openai/gpt-4o-mini\",\n", - " prompt=\"Generate question/answer pairs about {{topic}}.\",\n", - " output_schema=[\n", - " {\"name\": \"question\", \"type\": \"str\"},\n", - " {\"name\": \"answer\", \"type\": \"str\"}\n", - " ],\n", - " 
)\n", - "\n", - " # Second LLM to rate the generated pairs\n", - " rater_llm = StructuredLLM(\n", - " model_name=\"openai/gpt-4o-mini\",\n", - " prompt='''Rate the following Q&A pairs based on accuracy and clarity (1-10).\n", - " Pairs: {{generated_pairs}}''',\n", - " output_schema=[\n", - " {\"name\": \"accuracy_rating\", \"type\": \"int\"},\n", - " {\"name\": \"clarity_rating\", \"type\": \"int\"}\n", - " ],\n", - " model_kwargs={\"temperature\": 0.5}\n", - ")\n", - "\n", - " generation_response = await generator_llm.run(topic=topic, num_records=5)\n", - " rating_response = await rater_llm.run(generated_pairs=generation_response.data)\n", - " \n", - " # Merge the results\n", - " return merge_structured_outputs(generation_response.data, rating_response.data)\n", - "\n", - "\n", - "### To save on token here we only use 3 topics as example\n", - "complex_process_cities_data = complex_process_cities.run(topic=['Science', 'History', 'Technology'])" - ] - }, - { - "cell_type": "code", - "execution_count": 7, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "15\n", - "[{'question': 'What is the primary function of a CPU in a computer?', 'answer': 'The CPU, or Central Processing Unit, is responsible for executing instructions and processing data in a computer system.', 'accuracy_rating': 10, 'clarity_rating': 10}, {'question': 'What does IoT stand for and what is its significance?', 'answer': 'IoT stands for Internet of Things, which refers to the interconnection of everyday devices to the internet, allowing them to send and receive data, thereby enhancing efficiency and convenience.', 'accuracy_rating': 10, 'clarity_rating': 10}, {'question': 'What is the difference between RAM and ROM?', 'answer': 'RAM (Random Access Memory) is volatile memory that temporarily stores data and applications currently in use, while ROM (Read-Only Memory) is non-volatile memory that permanently stores firmware and system software.', 
'accuracy_rating': 10, 'clarity_rating': 10}, {'question': 'What is cloud computing?', 'answer': 'Cloud computing is the delivery of computing services over the internet, enabling users to access and store data and applications on remote servers rather than on local computers.', 'accuracy_rating': 10, 'clarity_rating': 10}, {'question': 'What are the benefits of using artificial intelligence in business?', 'answer': 'Artificial intelligence can enhance efficiency, improve decision-making, personalize customer experiences, automate repetitive tasks, and generate insights from data analytics in business operations.', 'accuracy_rating': 10, 'clarity_rating': 10}, {'question': 'What is the chemical formula for water?', 'answer': 'The chemical formula for water is H2O.', 'accuracy_rating': 10, 'clarity_rating': 10}, {'question': 'What is the process by which plants make their own food?', 'answer': 'The process by which plants make their own food is called photosynthesis.', 'accuracy_rating': 10, 'clarity_rating': 10}, {'question': 'What is the speed of light in a vacuum?', 'answer': 'The speed of light in a vacuum is approximately 299,792 kilometers per second (or about 186,282 miles per second).', 'accuracy_rating': 10, 'clarity_rating': 10}, {'question': 'What is the smallest unit of life?', 'answer': 'The smallest unit of life is the cell.', 'accuracy_rating': 10, 'clarity_rating': 10}, {'question': 'What gas do living organisms need for respiration?', 'answer': 'Living organisms need oxygen for respiration.', 'accuracy_rating': 10, 'clarity_rating': 10}, {'question': 'What was the primary cause of World War I?', 'answer': 'The primary cause of World War I was the complex system of alliances, militarism, imperialism, and nationalism, which escalated tensions following the assassination of Archduke Franz Ferdinand of Austria in 1914.', 'accuracy_rating': 10, 'clarity_rating': 10}, {'question': 'Who was the first President of the United States?', 'answer': 'George 
Washington was the first President of the United States, serving from April 30, 1789, to March 4, 1797.', 'accuracy_rating': 10, 'clarity_rating': 10}, {'question': 'What year did the Berlin Wall fall?', 'answer': 'The Berlin Wall fell on November 9, 1989, symbolizing the end of the Cold War and the division between East and West Germany.', 'accuracy_rating': 10, 'clarity_rating': 10}, {'question': 'Which ancient civilization is known for creating the first known writing system?', 'answer': 'The Sumerians, who inhabited ancient Mesopotamia around 3500 BCE, are known for creating the first known writing system called cuneiform.', 'accuracy_rating': 10, 'clarity_rating': 10}, {'question': 'What was the significance of the Magna Carta?', 'answer': 'The Magna Carta, signed in 1215, was significant because it limited the power of the monarchy and established the principle that everyone, including the king, was subject to the law.', 'accuracy_rating': 10, 'clarity_rating': 10}]\n" - ] - } - ], - "source": [ - "### Each topic has 5 question/answer pairs so 3 topics has 15 pairs!\n", - "print(len(complex_process_cities_data))\n", - "print(complex_process_cities_data)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "#### 3. Working with Different Input Formats\n", - "\n", - "\n", - "Data Factory is flexible with how you provide inputs. 
Let's demonstrate different ways to pass parameters to data_factory functions.\n", - "\n", - "'data' is a reserved keyword expecting list(dict) or tuple(dict) - this design make it super easy to pass large data and support HuggingFace and Pandas dataframe very easily" - ] - }, - { - "cell_type": "code", - "execution_count": 8, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "[{'answer': 'New York_5'}, {'answer': 'New York_2'}, {'answer': 'New York_3'}]" - ] - }, - "execution_count": 8, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "## We will be using mock llm call for rest of example to save on token\n", - "## Mock LLM call is a function that simulates an LLM API call with random delays (controlled by sleep_time) and occasional failures (controlled by fail_rate)\n", - "await mock_llm_call(city_name=\"New York\", num_records_per_city=3)" - ] - }, - { - "cell_type": "code", - "execution_count": 9, - "metadata": {}, - "outputs": [], - "source": [ - "@data_factory(max_concurrency=100)\n", - "async def input_format_mock_llm(city_name: str, num_records_per_city: int):\n", - " return await mock_llm_call(city_name=city_name, num_records_per_city=num_records_per_city, fail_rate=0.01)" - ] - }, - { - "cell_type": "code", - "execution_count": 10, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "\u001b[32m2025-04-25 10:16:49\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[1m\u001b[1m[JOB START]\u001b[0m \u001b[36mMaster Job ID: 05c84608-fec3-4010-8876-e59eed12bb6a\u001b[0m | \u001b[33mLogging progress every 3 seconds\u001b[0m\u001b[0m\n", - "\u001b[32m2025-04-25 10:16:49\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[1m[JOB PROGRESS] \u001b[32mCompleted: 0/5\u001b[0m | \u001b[33mRunning: 5\u001b[0m | \u001b[36mAttempted: 0\u001b[0m (\u001b[32mCompleted: 0\u001b[0m, \u001b[31mFailed: 0\u001b[0m, \u001b[35mFiltered: 0\u001b[0m, \u001b[34mDuplicate: 0\u001b[0m)\u001b[0m\n", - 
"\u001b[32m2025-04-25 10:16:50\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[1mSending telemetry event, TELEMETRY_ENABLED=true\u001b[0m\n", - "\u001b[32m2025-04-25 10:16:50\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[1m[JOB FINISHED] \u001b[1mFinal Status:\u001b[0m \u001b[32mCompleted: 5/5\u001b[0m | \u001b[33mAttempted: 5\u001b[0m (Failed: 0, Filtered: 0, Duplicate: 0)\u001b[0m\n" - ] - } - ], - "source": [ - "# Format 1: Multiple lists that get zipped together\n", - "input_format_data1 = input_format_mock_llm.run(city_name=[\"New York\", \"London\", \"Tokyo\", \"Paris\", \"Sydney\"], num_records_per_city=[2, 1, 1, 1, 1])" - ] - }, - { - "cell_type": "code", - "execution_count": 11, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "\u001b[32m2025-04-25 10:16:50\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[1m\u001b[1m[JOB START]\u001b[0m \u001b[36mMaster Job ID: fedb98e5-c408-4bc8-9479-6087f4a298b7\u001b[0m | \u001b[33mLogging progress every 3 seconds\u001b[0m\u001b[0m\n", - "\u001b[32m2025-04-25 10:16:50\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[1m[JOB PROGRESS] \u001b[32mCompleted: 0/5\u001b[0m | \u001b[33mRunning: 5\u001b[0m | \u001b[36mAttempted: 0\u001b[0m (\u001b[32mCompleted: 0\u001b[0m, \u001b[31mFailed: 0\u001b[0m, \u001b[35mFiltered: 0\u001b[0m, \u001b[34mDuplicate: 0\u001b[0m)\u001b[0m\n", - "\u001b[32m2025-04-25 10:16:51\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[1mSending telemetry event, TELEMETRY_ENABLED=true\u001b[0m\n", - "\u001b[32m2025-04-25 10:16:51\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[1m[JOB FINISHED] \u001b[1mFinal Status:\u001b[0m \u001b[32mCompleted: 5/5\u001b[0m | \u001b[33mAttempted: 5\u001b[0m (Failed: 0, Filtered: 0, Duplicate: 0)\u001b[0m\n" - ] - } - ], - "source": [ - "# Format 2: List + single value (single value gets broadcasted)\n", - "input_format_data2 = input_format_mock_llm.run(city_name=[\"New York\", \"London\", \"Tokyo\", \"Paris\", \"Sydney\"], num_records_per_city=1)" 
- ] - }, - { - "cell_type": "code", - "execution_count": 12, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "\u001b[32m2025-04-25 10:16:51\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[1m\u001b[1m[JOB START]\u001b[0m \u001b[36mMaster Job ID: 2f5cb7cc-83c9-4b7e-9ebb-386cd66bdd42\u001b[0m | \u001b[33mLogging progress every 3 seconds\u001b[0m\u001b[0m\n", - "\u001b[32m2025-04-25 10:16:51\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[1m[JOB PROGRESS] \u001b[32mCompleted: 0/5\u001b[0m | \u001b[33mRunning: 5\u001b[0m | \u001b[36mAttempted: 0\u001b[0m (\u001b[32mCompleted: 0\u001b[0m, \u001b[31mFailed: 0\u001b[0m, \u001b[35mFiltered: 0\u001b[0m, \u001b[34mDuplicate: 0\u001b[0m)\u001b[0m\n", - "\u001b[32m2025-04-25 10:16:52\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[1mSending telemetry event, TELEMETRY_ENABLED=true\u001b[0m\n", - "\u001b[32m2025-04-25 10:16:52\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[1m[JOB FINISHED] \u001b[1mFinal Status:\u001b[0m \u001b[32mCompleted: 5/5\u001b[0m | \u001b[33mAttempted: 5\u001b[0m (Failed: 0, Filtered: 0, Duplicate: 0)\u001b[0m\n" - ] - } - ], - "source": [ - "# Format 3: Special 'data' parameter\n", - "# 'data' is a reserved keyword expecting list(dict) or tuple(dict)\n", - "# Makes integration with various data sources easier\n", - "input_format_data3 = input_format_mock_llm.run(data=[{\"city_name\": \"New York\", \"num_records_per_city\": 2}, {\"city_name\": \"London\", \"num_records_per_city\": 1}, {\"city_name\": \"Tokyo\", \"num_records_per_city\": 1}, {\"city_name\": \"Paris\", \"num_records_per_city\": 1}, {\"city_name\": \"Sydney\", \"num_records_per_city\": 1}])" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "#### 4. Resilient error retry\n", - "Data Factory automatically handles errors and retries, making your pipelines robust.\n", - "\n", - "Let's demonstrate with a high failure rate example." 
- ] - }, - { - "cell_type": "code", - "execution_count": 13, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "\u001b[32m2025-04-25 10:16:56\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[1m\u001b[1m[JOB START]\u001b[0m \u001b[36mMaster Job ID: 38c50ab6-f24b-4cba-a2c5-070130ab420e\u001b[0m | \u001b[33mLogging progress every 3 seconds\u001b[0m\u001b[0m\n", - "\u001b[32m2025-04-25 10:16:56\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[1m[JOB PROGRESS] \u001b[32mCompleted: 0/25\u001b[0m | \u001b[33mRunning: 25\u001b[0m | \u001b[36mAttempted: 0\u001b[0m (\u001b[32mCompleted: 0\u001b[0m, \u001b[31mFailed: 0\u001b[0m, \u001b[35mFiltered: 0\u001b[0m, \u001b[34mDuplicate: 0\u001b[0m)\u001b[0m\n", - "\u001b[32m2025-04-25 10:16:59\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[1m[JOB PROGRESS] \u001b[32mCompleted: 16/25\u001b[0m | \u001b[33mRunning: 9\u001b[0m | \u001b[36mAttempted: 16\u001b[0m (\u001b[32mCompleted: 16\u001b[0m, \u001b[31mFailed: 0\u001b[0m, \u001b[35mFiltered: 0\u001b[0m, \u001b[34mDuplicate: 0\u001b[0m)\u001b[0m\n", - "\u001b[32m2025-04-25 10:16:59\u001b[0m | \u001b[31m\u001b[1mERROR \u001b[0m | \u001b[31m\u001b[1mError running task: Mock LLM failed to process city: Tokyo\u001b[0m\n", - "\u001b[32m2025-04-25 10:16:59\u001b[0m | \u001b[31m\u001b[1mERROR \u001b[0m | \u001b[31m\u001b[1mError running task: Mock LLM failed to process city: London\u001b[0m\n", - "\u001b[32m2025-04-25 10:16:59\u001b[0m | \u001b[31m\u001b[1mERROR \u001b[0m | \u001b[31m\u001b[1mError running task: Mock LLM failed to process city: New York\u001b[0m\n", - "\u001b[32m2025-04-25 10:17:02\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[1m[JOB PROGRESS] \u001b[32mCompleted: 23/25\u001b[0m | \u001b[33mRunning: 2\u001b[0m | \u001b[36mAttempted: 26\u001b[0m (\u001b[32mCompleted: 23\u001b[0m, \u001b[31mFailed: 3\u001b[0m, \u001b[35mFiltered: 0\u001b[0m, \u001b[34mDuplicate: 0\u001b[0m)\u001b[0m\n", - "\u001b[32m2025-04-25 10:17:03\u001b[0m | 
\u001b[31m\u001b[1mERROR \u001b[0m | \u001b[31m\u001b[1mError running task: Mock LLM failed to process city: New York\u001b[0m\n", - "\u001b[32m2025-04-25 10:17:05\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[1mSending telemetry event, TELEMETRY_ENABLED=true\u001b[0m\n", - "\u001b[32m2025-04-25 10:17:05\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[1m[JOB FINISHED] \u001b[1mFinal Status:\u001b[0m \u001b[32mCompleted: 25/25\u001b[0m | \u001b[33mAttempted: 29\u001b[0m (Failed: 4, Filtered: 0, Duplicate: 0)\u001b[0m\n", - "\n", - "Successfully completed 25 out of 25 tasks\n", - "Data Factory automatically handled the failures and continued processing\n", - "The results only include successful tasks\n" - ] - } - ], - "source": [ - "@data_factory(max_concurrency=100)\n", - "async def high_error_rate_mock_llm(city_name: str, num_records_per_city: int):\n", - " return await mock_llm_call(city_name=city_name, num_records_per_city=num_records_per_city, fail_rate=0.3) # Hardcode to 30% chance of failure\n", - "\n", - "# Process all cities - some will fail, but data_factory keeps going\n", - "cities = [\"New York\", \"London\", \"Tokyo\", \"Paris\", \"Sydney\"] * 5 # 25 cities\n", - "high_error_rate_mock_lllm_data = high_error_rate_mock_llm.run(city_name=cities, num_records_per_city=1)\n", - "\n", - "print(f\"\\nSuccessfully completed {len(high_error_rate_mock_lllm_data)} out of {len(cities)} tasks\")\n", - "print(\"Data Factory automatically handled the failures and continued processing\")\n", - "print(\"The results only include successful tasks\")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "#### 5. Resume\n", - "\n", - "This is essential for long-running jobs with thousands of tasks.\n", - "\n", - "If a job is interrupted, you can pick up where you left off using one of two resume methods:\n", - "\n", - "\n", - "1. 
**Same Session Resume**: If you're still in the same session where the job was interrupted, simply call - Same instance with .resume()\n", - "\n", - "2. **Cross-Session Resume**: If you've closed your notebook or lost your session, you can resume using the job ID:\n", - " ```python\n", - " from starfish import DataFactory\n", - " # Resume using the master job ID from a previous run\n", - " data_factory = DataFactory.resume_from_checkpoint(job_id=\"your_job_id\")\n", - " ```\n", - "\n", - "The key difference:\n", - "- `resume()` uses the same DataFactory instance you defined\n", - "- `resume_from_checkpoint()` reconstructs your DataFactory from persistent storage where tasks and progress are saved\n", - "\n", - "> **Note**: Google Colab users may experience issues with `resume_from_checkpoint()` due to how Colab works\n", - "\n", - "We're simulating an interruption here. In a real scenario, this might happen if your notebook errors out, is manually interrupted with a keyboard command, encounters API rate limits, or experiences any other issues that halt execution." 
- ] - }, - { - "cell_type": "code", - "execution_count": 14, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "\u001b[32m2025-04-25 10:17:12\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[1m\u001b[1m[JOB START]\u001b[0m \u001b[36mMaster Job ID: b2a400b3-32e7-45ee-b8e8-c2bc7afe9f11\u001b[0m | \u001b[33mLogging progress every 3 seconds\u001b[0m\u001b[0m\n", - "\u001b[32m2025-04-25 10:17:12\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[1m[JOB PROGRESS] \u001b[32mCompleted: 0/100\u001b[0m | \u001b[33mRunning: 10\u001b[0m | \u001b[36mAttempted: 0\u001b[0m (\u001b[32mCompleted: 0\u001b[0m, \u001b[31mFailed: 0\u001b[0m, \u001b[35mFiltered: 0\u001b[0m, \u001b[34mDuplicate: 0\u001b[0m)\u001b[0m\n", - "\u001b[32m2025-04-25 10:17:15\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[1m[JOB PROGRESS] \u001b[32mCompleted: 17/100\u001b[0m | \u001b[33mRunning: 10\u001b[0m | \u001b[36mAttempted: 17\u001b[0m (\u001b[32mCompleted: 17\u001b[0m, \u001b[31mFailed: 0\u001b[0m, \u001b[35mFiltered: 0\u001b[0m, \u001b[34mDuplicate: 0\u001b[0m)\u001b[0m\n", - "\u001b[32m2025-04-25 10:17:15\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[1mSending telemetry event, TELEMETRY_ENABLED=true\u001b[0m\n", - "\u001b[32m2025-04-25 10:17:15\u001b[0m | \u001b[31m\u001b[1mERROR \u001b[0m | \u001b[31m\u001b[1mError occurred: KeyboardInterrupt\u001b[0m\n", - "\u001b[32m2025-04-25 10:17:15\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[1m[RESUME INFO] 🚨 Job stopped unexpectedly. 
You can resume the job by calling .resume()\u001b[0m\n", - "\u001b[32m2025-04-25 10:17:15\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[1m[JOB FINISHED] \u001b[1mFinal Status:\u001b[0m \u001b[32mCompleted: 20/100\u001b[0m | \u001b[33mAttempted: 20\u001b[0m (Failed: 0, Filtered: 0, Duplicate: 0)\u001b[0m\n" - ] - } - ], - "source": [ - "@data_factory(max_concurrency=10)\n", - "async def re_run_mock_llm(city_name: str, num_records_per_city: int):\n", - " return await mock_llm_call(city_name=city_name, num_records_per_city=num_records_per_city, fail_rate=0.3)\n", - "\n", - "cities = [\"New York\", \"London\", \"Tokyo\", \"Paris\", \"Sydney\"] * 20 # 100 cities\n", - "re_run_mock_llm_data_1 = re_run_mock_llm.run(city_name=cities, num_records_per_city=1)" - ] - }, - { - "cell_type": "code", - "execution_count": 15, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "When a job is interrupted, you'll see a message like:\n", - "[RESUME INFO] 🚨 Job stopped unexpectedly. You can resume the job by calling .resume()\n", - "\n", - "To resume an interrupted job, simply call:\n", - "interrupted_job_mock_llm.resume()\n", - "\n", - "For this example we have 20/100 data generated and not finished yet!\n" - ] - } - ], - "source": [ - "print(\"When a job is interrupted, you'll see a message like:\")\n", - "print(\"[RESUME INFO] 🚨 Job stopped unexpectedly. 
You can resume the job by calling .resume()\")\n", - "\n", - "print(\"\\nTo resume an interrupted job, simply call:\")\n", - "print(\"interrupted_job_mock_llm.resume()\")\n", - "print('')\n", - "print(f\"For this example we have {len(re_run_mock_llm_data_1)}/{len(cities)} data generated and not finished yet!\")" - ] - }, - { - "cell_type": "code", - "execution_count": 17, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "\u001b[32m2025-04-25 10:18:00\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[1m\u001b[1m[JOB RESUME START]\u001b[0m \u001b[33mPICKING UP FROM WHERE THE JOB WAS LEFT OFF...\u001b[0m\n", - "\u001b[0m\n", - "\u001b[32m2025-04-25 10:18:00\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[1m\u001b[1m[RESUME PROGRESS] STATUS AT THE TIME OF RESUME:\u001b[0m \u001b[32mCompleted: 20 / 100\u001b[0m | \u001b[31mFailed: 0\u001b[0m | \u001b[31mDuplicate: 0\u001b[0m | \u001b[33mFiltered: 0\u001b[0m\u001b[0m\n", - "\u001b[32m2025-04-25 10:18:00\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[1m[JOB PROGRESS] \u001b[32mCompleted: 20/100\u001b[0m | \u001b[33mRunning: 10\u001b[0m | \u001b[36mAttempted: 20\u001b[0m (\u001b[32mCompleted: 20\u001b[0m, \u001b[31mFailed: 0\u001b[0m, \u001b[35mFiltered: 0\u001b[0m, \u001b[34mDuplicate: 0\u001b[0m)\u001b[0m\n", - "\u001b[32m2025-04-25 10:18:03\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[1m[JOB PROGRESS] \u001b[32mCompleted: 32/100\u001b[0m | \u001b[33mRunning: 10\u001b[0m | \u001b[36mAttempted: 32\u001b[0m (\u001b[32mCompleted: 32\u001b[0m, \u001b[31mFailed: 0\u001b[0m, \u001b[35mFiltered: 0\u001b[0m, \u001b[34mDuplicate: 0\u001b[0m)\u001b[0m\n", - "\u001b[32m2025-04-25 10:18:03\u001b[0m | \u001b[31m\u001b[1mERROR \u001b[0m | \u001b[31m\u001b[1mError running task: Mock LLM failed to process city: Paris\u001b[0m\n", - "\u001b[32m2025-04-25 10:18:03\u001b[0m | \u001b[31m\u001b[1mERROR \u001b[0m | \u001b[31m\u001b[1mError running task: Mock LLM failed to process city: Sydney\u001b[0m\n", - 
"\u001b[32m2025-04-25 10:18:03\u001b[0m | \u001b[31m\u001b[1mERROR \u001b[0m | \u001b[31m\u001b[1mError running task: Mock LLM failed to process city: Sydney\u001b[0m\n", - "\u001b[32m2025-04-25 10:18:06\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[1m[JOB PROGRESS] \u001b[32mCompleted: 56/100\u001b[0m | \u001b[33mRunning: 10\u001b[0m | \u001b[36mAttempted: 59\u001b[0m (\u001b[32mCompleted: 56\u001b[0m, \u001b[31mFailed: 3\u001b[0m, \u001b[35mFiltered: 0\u001b[0m, \u001b[34mDuplicate: 0\u001b[0m)\u001b[0m\n", - "\u001b[32m2025-04-25 10:18:08\u001b[0m | \u001b[31m\u001b[1mERROR \u001b[0m | \u001b[31m\u001b[1mError running task: Mock LLM failed to process city: Sydney\u001b[0m\n", - "\u001b[32m2025-04-25 10:18:08\u001b[0m | \u001b[31m\u001b[1mERROR \u001b[0m | \u001b[31m\u001b[1mError running task: Mock LLM failed to process city: London\u001b[0m\n", - "\u001b[32m2025-04-25 10:18:09\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[1m[JOB PROGRESS] \u001b[32mCompleted: 69/100\u001b[0m | \u001b[33mRunning: 10\u001b[0m | \u001b[36mAttempted: 74\u001b[0m (\u001b[32mCompleted: 69\u001b[0m, \u001b[31mFailed: 5\u001b[0m, \u001b[35mFiltered: 0\u001b[0m, \u001b[34mDuplicate: 0\u001b[0m)\u001b[0m\n", - "\u001b[32m2025-04-25 10:18:09\u001b[0m | \u001b[31m\u001b[1mERROR \u001b[0m | \u001b[31m\u001b[1mError running task: Mock LLM failed to process city: New York\u001b[0m\n", - "\u001b[32m2025-04-25 10:18:12\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[1m[JOB PROGRESS] \u001b[32mCompleted: 89/100\u001b[0m | \u001b[33mRunning: 10\u001b[0m | \u001b[36mAttempted: 95\u001b[0m (\u001b[32mCompleted: 89\u001b[0m, \u001b[31mFailed: 6\u001b[0m, \u001b[35mFiltered: 0\u001b[0m, \u001b[34mDuplicate: 0\u001b[0m)\u001b[0m\n", - "\u001b[32m2025-04-25 10:18:12\u001b[0m | \u001b[31m\u001b[1mERROR \u001b[0m | \u001b[31m\u001b[1mError running task: Mock LLM failed to process city: Paris\u001b[0m\n", - "\u001b[32m2025-04-25 10:18:13\u001b[0m | \u001b[31m\u001b[1mERROR \u001b[0m | \u001b[31m\u001b[1mError 
running task: Mock LLM failed to process city: New York\u001b[0m\n", - "\u001b[32m2025-04-25 10:18:14\u001b[0m | \u001b[31m\u001b[1mERROR \u001b[0m | \u001b[31m\u001b[1mError running task: Mock LLM failed to process city: New York\u001b[0m\n", - "\u001b[32m2025-04-25 10:18:14\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[1mSending telemetry event, TELEMETRY_ENABLED=true\u001b[0m\n", - "\u001b[32m2025-04-25 10:18:14\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[1m[JOB FINISHED] \u001b[1mFinal Status:\u001b[0m \u001b[32mCompleted: 100/100\u001b[0m | \u001b[33mAttempted: 109\u001b[0m (Failed: 9, Filtered: 0, Duplicate: 0)\u001b[0m\n" - ] - } - ], - "source": [ - "## Lets keep continue the rest of run by resume_from_checkpoint \n", - "re_run_mock_llm_data_2 = re_run_mock_llm.resume()" - ] - }, - { - "cell_type": "code", - "execution_count": 18, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Now we still able to finished with what is left!! 100 data generated!\n" - ] - } - ], - "source": [ - "print(f\"Now we still able to finished with what is left!! {len(re_run_mock_llm_data_2)} data generated!\")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "#### 6. Dry run\n", - "Before running a large job, you can do a \"dry run\" to test your pipeline. This only processes a single item and doesn't save state to the database." 
- ] - }, - { - "cell_type": "code", - "execution_count": 19, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "\u001b[32m2025-04-25 10:18:14\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[1m\u001b[1m[JOB START]\u001b[0m \u001b[36mMaster Job ID: None\u001b[0m | \u001b[33mLogging progress every 3 seconds\u001b[0m\u001b[0m\n", - "\u001b[32m2025-04-25 10:18:14\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[1m[JOB PROGRESS] \u001b[32mCompleted: 0/1\u001b[0m | \u001b[33mRunning: 1\u001b[0m | \u001b[36mAttempted: 0\u001b[0m (\u001b[32mCompleted: 0\u001b[0m, \u001b[31mFailed: 0\u001b[0m, \u001b[35mFiltered: 0\u001b[0m, \u001b[34mDuplicate: 0\u001b[0m)\u001b[0m\n", - "\u001b[32m2025-04-25 10:18:15\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[1mSending telemetry event, TELEMETRY_ENABLED=true\u001b[0m\n", - "\u001b[32m2025-04-25 10:18:15\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[1m[JOB FINISHED] \u001b[1mFinal Status:\u001b[0m \u001b[32mCompleted: 1/0\u001b[0m | \u001b[33mAttempted: 1\u001b[0m (Failed: 0, Filtered: 0, Duplicate: 0)\u001b[0m\n" - ] - } - ], - "source": [ - "@data_factory(max_concurrency=10)\n", - "async def dry_run_mock_llm(city_name: str, num_records_per_city: int):\n", - " return await mock_llm_call(city_name=city_name, num_records_per_city=num_records_per_city, fail_rate=0.3)\n", - "\n", - "dry_run_mock_llm_data = dry_run_mock_llm.dry_run(city_name=[\"New York\", \"London\", \"Tokyo\", \"Paris\", \"Sydney\"]*20, num_records_per_city=1)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "#### 8. Advanced Usage\n", - "Data Factory offers more advanced capabilities for complete pipeline customization, including hooks that execute at key stages and shareable state to coordinate between tasks. These powerful features enable complex workflows and fine-grained control. Our dedicated examples for advanced data_factory usage will be coming soon!" 
- ] - } - ], - "metadata": { - "kernelspec": { - "display_name": ".venv", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.11.4" - } - }, - "nbformat": 4, - "nbformat_minor": 2 -}