From 6c9bd4dac781812bb6406dead191459e34aebc71 Mon Sep 17 00:00:00 2001 From: Celestine Akpanoko Date: Tue, 20 May 2025 10:48:50 -0500 Subject: [PATCH 1/6] llm integration initial commit --- requirements.txt | 7 + run.py | 4 + src/core/api/models/llm/request.py | 13 + src/core/api/models/llm/response.py | 35 +++ src/core/api/routes/llm.py | 88 ++++++ src/core/contexts/engineering.json | 11 + .../contexts/manufacturing/materials.json | 21 ++ src/core/main.py | 2 + src/core/models/supply_trees.py | 2 +- src/core/prompts/default.txt | 5 + src/core/prompts/engineering.txt | 10 + .../prompts/manufacturing/substitution.txt | 13 + src/core/services/llm_provider.py | 257 ++++++++++++++++++ 13 files changed, 467 insertions(+), 1 deletion(-) create mode 100644 src/core/api/models/llm/request.py create mode 100644 src/core/api/models/llm/response.py create mode 100644 src/core/api/routes/llm.py create mode 100644 src/core/contexts/engineering.json create mode 100644 src/core/contexts/manufacturing/materials.json create mode 100644 src/core/prompts/default.txt create mode 100644 src/core/prompts/engineering.txt create mode 100644 src/core/prompts/manufacturing/substitution.txt create mode 100644 src/core/services/llm_provider.py diff --git a/requirements.txt b/requirements.txt index 64eb8ff..7d401ab 100644 --- a/requirements.txt +++ b/requirements.txt @@ -4,3 +4,10 @@ mkdocs-mermaid2-plugin networkx fastapi uvicorn +torch +transformers +accelerate>=0.21.0 +bitsandbytes>=0.41.0 +safetensors +sentencepiece +dotenv \ No newline at end of file diff --git a/run.py b/run.py index efc6268..a6dbaa4 100644 --- a/run.py +++ b/run.py @@ -1,4 +1,8 @@ import uvicorn +from dotenv import load_dotenv + +# Load environment variables from .env file +load_dotenv() if __name__ == "__main__": uvicorn.run("src.core.main:app", diff --git a/src/core/api/models/llm/request.py b/src/core/api/models/llm/request.py new file mode 100644 index 0000000..87c3316 --- /dev/null +++ b/src/core/api/models/llm/request.py @@ -0,0 +1,13 @@ +from pydantic import BaseModel, Field +from typing import Dict, Any, Optional, List + +class LLMRequest(BaseModel): + """Request model for the LLM endpoint""" + prompt: str = Field(..., description="The prompt to send to the LLM") + prompt_template: Optional[str] = Field(None, description="Name of a prompt template to use") + context_name: Optional[str] = Field(None, description="Name of the context to apply") + template_variables: Dict[str, Any] = Field(default_factory=dict, description="Variables to fill in prompt template") + system_message: Optional[str] = Field(None, description="Optional system message to guide the LLM") + temperature: float = Field(0.7, ge=0.0, le=1.0, description="Temperature parameter (0-1)") + max_tokens: int = Field(1000, gt=0, description="Maximum tokens to generate") + stop_sequences: Optional[List[str]] = Field(None, description="Sequences that will stop generation") \ No newline at end of file diff --git a/src/core/api/models/llm/response.py b/src/core/api/models/llm/response.py new file mode 100644 index 0000000..dfe97b8 --- /dev/null +++ b/src/core/api/models/llm/response.py @@ -0,0 +1,35 @@ +from pydantic import BaseModel, Field +from typing import Dict, Any, Optional, List + +class TokenUsage(BaseModel): + """Token usage information""" + prompt_tokens: int + completion_tokens: int + total_tokens: int + +class LLMResponse(BaseModel): + """Response model for the LLM endpoint""" + response: str = Field(..., description="The generated text from the LLM") + usage: 
TokenUsage = Field(..., description="Token usage statistics") + model: str = Field(..., description="Model used for generation") + finish_reason: Optional[str] = Field(None, description="Reason for completion") + processing_time_ms: int = Field(..., description="Processing time in milliseconds") + +class PromptInfo(BaseModel): + """Information about an available prompt template""" + name: str = Field(..., description="Name of the prompt template") + description: Optional[str] = Field(None, description="Brief description of the prompt") + required_variables: List[str] = Field(default_factory=list, description="Required template variables") + +class PromptListResponse(BaseModel): + """Response with available prompt templates""" + prompts: List[PromptInfo] = Field(..., description="Available prompt templates") + +class ContextInfo(BaseModel): + """Information about an available context""" + name: str = Field(..., description="Name of the context") + description: Optional[str] = Field(None, description="Brief description of the context") + +class ContextListResponse(BaseModel): + """Response with available contexts""" + contexts: List[ContextInfo] = Field(..., description="Available contexts") \ No newline at end of file diff --git a/src/core/api/routes/llm.py b/src/core/api/routes/llm.py new file mode 100644 index 0000000..ae1cc19 --- /dev/null +++ b/src/core/api/routes/llm.py @@ -0,0 +1,88 @@ +from fastapi import APIRouter, HTTPException, status, Body +from typing import Dict, Any, List, Optional + +from ..models.llm.request import LLMRequest +from ..models.llm.response import LLMResponse, PromptListResponse, ContextListResponse +from ..models.llm.response import PromptInfo, ContextInfo, TokenUsage + +from src.core.services.llm_provider import LLMProvider +from src.core.services.directory_manager import DirectoryManager + +# Create router with prefix and tags +router = APIRouter(prefix="/v1", tags=["llm"]) + +# Initialize services +llm_provider = LLMProvider() +directory_manager = DirectoryManager() + +@router.get("/llm/prompts", response_model=PromptListResponse) +async def list_prompts(): + """List all available prompt templates""" + prompt_list = directory_manager.list_prompts() + prompts = [ + PromptInfo( + name=p["name"], + description=p["description"], + required_variables=p["required_variables"] + ) for p in prompt_list + ] + return PromptListResponse(prompts=prompts) + +@router.get("/llm/contexts", response_model=ContextListResponse) +async def list_contexts(): + """List all available contexts""" + context_list = directory_manager.list_contexts() + contexts = [ + ContextInfo( + name=c["name"], + description=c["description"] + ) for c in context_list + ] + return ContextListResponse(contexts=contexts) + +@router.get("/llm/models", response_model=List[Dict[str, str]]) +async def list_available_models(): + """List all available LLM models that can be used""" + return llm_provider.get_available_models() + +@router.post("/llm", response_model=LLMResponse) +async def process_llm_request(request: LLMRequest): + """Process a request with the LLM""" + try: + result = await llm_provider.generate( + prompt=request.prompt, + prompt_template=request.prompt_template, + context_name=request.context_name, + template_variables=request.template_variables, + system_message=request.system_message, + temperature=request.temperature, + max_tokens=request.max_tokens, + stop_sequences=request.stop_sequences + ) + + return LLMResponse( + response=result["response"], + usage=TokenUsage( + 
prompt_tokens=result["usage"]["prompt_tokens"], + completion_tokens=result["usage"]["completion_tokens"], + total_tokens=result["usage"]["total_tokens"] + ), + model=result["model"], + finish_reason=result["finish_reason"], + processing_time_ms=result["processing_time_ms"] + ) + except FileNotFoundError as e: + raise HTTPException( + status_code=status.HTTP_404_NOT_FOUND, + detail=str(e) + ) + except ValueError as e: + raise HTTPException( + status_code=status.HTTP_400_BAD_REQUEST, + detail=str(e) + ) + except Exception as e: + raise HTTPException( + status_code=status.HTTP_500_INTERNAL_SERVER_ERROR, + detail=f"LLM processing error: {str(e)}" + ) \ No newline at end of file diff --git a/src/core/contexts/engineering.json b/src/core/contexts/engineering.json new file mode 100644 index 0000000..bc8d54e --- /dev/null +++ b/src/core/contexts/engineering.json @@ -0,0 +1,11 @@ +{ + "description": "Engineering domain knowledge", + "data": { + "common_materials": ["steel", "aluminum", "titanium", "composites"], + "analysis_methods": ["FEA", "CFD", "experimental testing"], + "standards": { + "mechanical": ["ASME", "ISO", "ASTM"], + "electrical": ["IEC", "IEEE", "UL"] + } + } + } \ No newline at end of file diff --git a/src/core/contexts/manufacturing/materials.json b/src/core/contexts/manufacturing/materials.json new file mode 100644 index 0000000..def329a --- /dev/null +++ b/src/core/contexts/manufacturing/materials.json @@ -0,0 +1,21 @@ +{ + "metals": { + "aluminum": { + "properties": ["lightweight", "corrosion-resistant", "conductive"], + "substitutes": ["titanium", "magnesium", "certain composites"], + "common_processes": ["extrusion", "casting", "machining"] + }, + "steel": { + "properties": ["strong", "durable", "magnetic"], + "substitutes": ["aluminum alloys", "titanium", "reinforced polymers"], + "common_processes": ["forging", "machining", "welding"] + } + }, + "polymers": { + "abs": { + "properties": ["impact-resistant", "thermoformable", "lightweight"], + "substitutes": ["pla", "petg", "polycarbonate"], + "common_processes": ["3d printing", "injection molding"] + } + } +} \ No newline at end of file diff --git a/src/core/main.py b/src/core/main.py index d6ae5f7..873d5af 100644 --- a/src/core/main.py +++ b/src/core/main.py @@ -4,6 +4,7 @@ from src.core.domains.cooking.matchers import CookingMatcher from src.core.domains.cooking.validators import CookingValidator from src.core.registry.domain_registry import DomainRegistry +from src.core.api.routes.llm import router as llm_router # Create FastAPI app app = FastAPI(title="Open Matching Engine API") @@ -15,6 +16,7 @@ async def health_check(): # Register routes app.include_router(match_router, tags=["matching"]) +app.include_router(llm_router, tags=["llm"]) # Register domain components DomainRegistry.register_extractor("cooking", CookingExtractor()) diff --git a/src/core/models/supply_trees.py b/src/core/models/supply_trees.py index 5cc2b39..52fc86a 100644 --- a/src/core/models/supply_trees.py +++ b/src/core/models/supply_trees.py @@ -1,6 +1,6 @@ from dataclasses import dataclass, field from datetime import datetime, timedelta -from typing import Dict, List, Optional, Set, Union, Tuple +from typing import Any, Dict, List, Optional, Set, Union, Tuple from enum import Enum from uuid import UUID, uuid4 import networkx as nx diff --git a/src/core/prompts/default.txt b/src/core/prompts/default.txt new file mode 100644 index 0000000..07663a0 --- /dev/null +++ b/src/core/prompts/default.txt @@ -0,0 +1,5 @@ +You are an AI assistant helping with 
{{domain}} tasks. + +{{user_query}} + +Please provide a helpful response. \ No newline at end of file diff --git a/src/core/prompts/engineering.txt b/src/core/prompts/engineering.txt new file mode 100644 index 0000000..5b1ea86 --- /dev/null +++ b/src/core/prompts/engineering.txt @@ -0,0 +1,10 @@ +# Engineering analysis prompt +You are an engineering expert. Please analyze the following problem: + +{{prompt}} + +Additional context: +- Application: {{application}} +- Requirements: {{requirements}} + +Provide a detailed technical analysis with practical recommendations. \ No newline at end of file diff --git a/src/core/prompts/manufacturing/substitution.txt b/src/core/prompts/manufacturing/substitution.txt new file mode 100644 index 0000000..6ace03a --- /dev/null +++ b/src/core/prompts/manufacturing/substitution.txt @@ -0,0 +1,13 @@ +You are a manufacturing expert. Given the following material or process requirement, +suggest possible substitutions that could work as alternatives: + +Requirement: {{requirement}} +Context: {{context}} + +For each alternative, provide: +1. Name of the alternative +2. Confidence score (0-1) indicating how well it substitutes +3. Brief explanation of why this could work +4. Any limitations or considerations + +Format your response as a list of alternatives with these details. \ No newline at end of file diff --git a/src/core/services/llm_provider.py b/src/core/services/llm_provider.py new file mode 100644 index 0000000..c850deb --- /dev/null +++ b/src/core/services/llm_provider.py @@ -0,0 +1,257 @@ +import time +import re +from typing import Dict, Any, List, Optional, Tuple +import os +import torch +from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline + +from .directory_manager import DirectoryManager + +class LLMProvider: + """Provider for LLM services using actual model calls""" + + def __init__(self, model_name: str = None): + """Initialize the LLM provider with model and directories""" + # Use environment variable or default to an open-source model that doesn't require login + # TinyLlama is a smaller model that's openly available + self.model_name = model_name or os.environ.get("LLM_MODEL", "TinyLlama/TinyLlama-1.1B-Chat-v1.0") + self.directory_manager = DirectoryManager() + self._initialize_model() + + def _initialize_model(self): + """Initialize the model and tokenizer with additional error handling""" + print(f"Initializing model: {self.model_name}") + + try: + # Set up device configuration + device = "cuda" if torch.cuda.is_available() else "cpu" + print(f"Using device: {device}") + + # Load tokenizer with fallbacks + try: + self.tokenizer = AutoTokenizer.from_pretrained(self.model_name) + except Exception as e: + print(f"Error loading tokenizer: {str(e)}") + print("Falling back to default tokenizer") + self.tokenizer = AutoTokenizer.from_pretrained("facebook/opt-125m") # Very small fallback + + # Load model with appropriate configuration + try: + if device == "cuda": + self.model = AutoModelForCausalLM.from_pretrained( + self.model_name, + device_map="auto", + torch_dtype=torch.float16, + low_cpu_mem_usage=True + ) + else: + # For CPU + self.model = AutoModelForCausalLM.from_pretrained( + self.model_name, + device_map={"": device}, + low_cpu_mem_usage=True + ) + + # Create generation pipeline + self.generator = pipeline( + "text-generation", + model=self.model, + tokenizer=self.tokenizer, + device=0 if device == "cuda" else -1 + ) + print(f"Successfully loaded model: {self.model_name}") + except Exception as e: + print(f"Error loading 
model: {str(e)}") + print("Falling back to text-only response mode") + self.model = None + self.generator = None + except Exception as e: + print(f"Critical error initializing LLM provider: {str(e)}") + print("API will continue to run, but LLM features will be unavailable") + self.model = None + self.tokenizer = None + self.generator = None + + # Flag to check if model is available + self.model_available = self.generator is not None + + def _format_prompt_for_model(self, prompt: str, system_message: Optional[str] = None) -> str: + """Format prompt based on the model architecture""" + # Check if model is a Llama variant + if "llama" in self.model_name.lower(): + if system_message: + # TinyLlama uses the same format as Llama-2 + return f"[INST] <>\n{system_message}\n<>\n\n{prompt} [/INST]" + else: + return f"[INST] {prompt} [/INST]" + # Add support for different model formats as needed + else: + # Generic format for other models + if system_message: + return f"{system_message}\n\n{prompt}" + else: + return prompt + + def _fill_template(self, template: str, variables: Dict[str, Any]) -> str: + """Fill a template with variables""" + result = template + for key, value in variables.items(): + pattern = r"{{" + re.escape(key) + r"}}" + result = re.sub(pattern, str(value), result) + return result + + def _prepare_context(self, context_name: Optional[str]) -> Dict[str, Any]: + """Prepare context data from file if specified""" + if not context_name: + return {} + + try: + return self.directory_manager.get_context(context_name) + except FileNotFoundError as e: + print(f"Warning: {str(e)}") + return {} + + def _prepare_prompt(self, + raw_prompt: str, + prompt_template: Optional[str], + template_variables: Dict[str, Any]) -> str: + """Prepare final prompt text using template if specified""" + if not prompt_template: + return raw_prompt + + try: + template, required_vars = self.directory_manager.get_prompt(prompt_template) + + # Check if all required variables are provided + missing_vars = [var for var in required_vars if var not in template_variables] + if missing_vars: + raise ValueError(f"Missing required template variables: {', '.join(missing_vars)}") + + # Add raw prompt to template variables if not overridden + if 'prompt' not in template_variables: + template_variables['prompt'] = raw_prompt + + return self._fill_template(template, template_variables) + except FileNotFoundError: + # Fall back to raw prompt if template not found + return raw_prompt + + async def generate(self, + prompt: str, + prompt_template: Optional[str] = None, + context_name: Optional[str] = None, + template_variables: Dict[str, Any] = None, + system_message: Optional[str] = None, + temperature: float = 0.7, + max_tokens: int = 1000, + stop_sequences: Optional[List[str]] = None) -> Dict[str, Any]: + """Generate a response using the LLM""" + start_time = time.time() + + # If model isn't available, return a fallback response + if not hasattr(self, "model_available") or not self.model_available: + return { + "response": f"LLM model is not available. 
I received your prompt: '{prompt}'", + "usage": { + "prompt_tokens": len(prompt.split()), + "completion_tokens": 0, + "total_tokens": len(prompt.split()) + }, + "model": "fallback_text_only", + "finish_reason": "model_unavailable", + "processing_time_ms": 0 + } + + if template_variables is None: + template_variables = {} + + # Get context data if specified + context_data = self._prepare_context(context_name) + + # Prepare the final prompt + prepared_prompt = self._prepare_prompt( + raw_prompt=prompt, + prompt_template=prompt_template, + template_variables=template_variables + ) + + # Format for the specific model + formatted_prompt = self._format_prompt_for_model( + prompt=prepared_prompt, + system_message=system_message + ) + + # Configure generation parameters + generation_config = { + "max_new_tokens": max_tokens, + "temperature": temperature, + "do_sample": temperature > 0, + "top_p": 0.95, + "top_k": 50, + "return_full_text": False, + } + + if stop_sequences: + generation_config["stop_sequences"] = stop_sequences + + # Perform actual model inference + try: + result = self.generator( + formatted_prompt, + **generation_config + ) + + # Extract generated text + generated_text = result[0]["generated_text"].strip() + + # Calculate token usage + input_tokens = len(self.tokenizer.encode(formatted_prompt)) + output_tokens = len(self.tokenizer.encode(generated_text)) + total_tokens = input_tokens + output_tokens + + end_time = time.time() + processing_time_ms = int((end_time - start_time) * 1000) + + return { + "response": generated_text, + "usage": { + "prompt_tokens": input_tokens, + "completion_tokens": output_tokens, + "total_tokens": total_tokens + }, + "model": self.model_name, + "finish_reason": result[0].get("finish_reason", "length"), + "processing_time_ms": processing_time_ms + } + + except Exception as e: + raise RuntimeError(f"Error generating LLM response: {str(e)}") + + def get_available_models(self) -> List[Dict[str, str]]: + """Return a list of recommended open-source models that can be used""" + return [ + { + "id": "TinyLlama/TinyLlama-1.1B-Chat-v1.0", + "name": "TinyLlama 1.1B Chat", + "description": "Small and fast Llama model for chat (1.1B parameters)", + "memory_required": "~2GB" + }, + { + "id": "microsoft/phi-2", + "name": "Microsoft Phi-2", + "description": "Small but capable 2.7B parameter model", + "memory_required": "~5GB" + }, + { + "id": "facebook/opt-1.3b", + "name": "OPT 1.3B", + "description": "Open Pre-trained Transformer 1.3B parameter model", + "memory_required": "~3GB" + }, + { + "id": "stabilityai/stablelm-base-alpha-3b", + "name": "StableLM 3B", + "description": "Base 3B parameter model from Stability AI", + "memory_required": "~6GB" + } + ] \ No newline at end of file From f1e60911106739300d033023dba86117a63b70c6 Mon Sep 17 00:00:00 2001 From: Celestine Akpanoko Date: Tue, 20 May 2025 11:29:35 -0500 Subject: [PATCH 2/6] remove previous implementation to make it simple and work --- src/core/api/routes/llm.py | 88 ------ src/core/contexts/engineering.json | 11 - .../contexts/manufacturing/materials.json | 21 -- src/core/prompts/default.txt | 5 - src/core/prompts/engineering.txt | 10 - .../prompts/manufacturing/substitution.txt | 13 - src/core/services/llm_provider.py | 257 ------------------ 7 files changed, 405 deletions(-) delete mode 100644 src/core/api/routes/llm.py delete mode 100644 src/core/contexts/engineering.json delete mode 100644 src/core/contexts/manufacturing/materials.json delete mode 100644 src/core/prompts/default.txt delete mode 100644 
src/core/prompts/engineering.txt delete mode 100644 src/core/prompts/manufacturing/substitution.txt delete mode 100644 src/core/services/llm_provider.py diff --git a/src/core/api/routes/llm.py b/src/core/api/routes/llm.py deleted file mode 100644 index ae1cc19..0000000 --- a/src/core/api/routes/llm.py +++ /dev/null @@ -1,88 +0,0 @@ -from fastapi import APIRouter, HTTPException, status, Body -from typing import Dict, Any, List, Optional - -from ..models.llm.request import LLMRequest -from ..models.llm.response import LLMResponse, PromptListResponse, ContextListResponse -from ..models.llm.response import PromptInfo, ContextInfo, TokenUsage - -from src.core.services.llm_provider import LLMProvider -from src.core.services.directory_manager import DirectoryManager - -# Create router with prefix and tags -router = APIRouter(prefix="/v1", tags=["llm"]) - -# Initialize services -llm_provider = LLMProvider() -directory_manager = DirectoryManager() - -@router.get("/llm/prompts", response_model=PromptListResponse) -async def list_prompts(): - """List all available prompt templates""" - prompt_list = directory_manager.list_prompts() - prompts = [ - PromptInfo( - name=p["name"], - description=p["description"], - required_variables=p["required_variables"] - ) for p in prompt_list - ] - return PromptListResponse(prompts=prompts) - -@router.get("/llm/contexts", response_model=ContextListResponse) -async def list_contexts(): - """List all available contexts""" - context_list = directory_manager.list_contexts() - contexts = [ - ContextInfo( - name=c["name"], - description=c["description"] - ) for c in context_list - ] - return ContextListResponse(contexts=contexts) - -@router.get("/llm/models", response_model=List[Dict[str, str]]) -async def list_available_models(): - """List all available LLM models that can be used""" - return llm_provider.get_available_models() - -@router.post("/llm", response_model=LLMResponse) -async def process_llm_request(request: LLMRequest): - """Process a request with the LLM""" - try: - result = await llm_provider.generate( - prompt=request.prompt, - prompt_template=request.prompt_template, - context_name=request.context_name, - template_variables=request.template_variables, - system_message=request.system_message, - temperature=request.temperature, - max_tokens=request.max_tokens, - stop_sequences=request.stop_sequences - ) - - return LLMResponse( - response=result["response"], - usage=TokenUsage( - prompt_tokens=result["usage"]["prompt_tokens"], - completion_tokens=result["usage"]["completion_tokens"], - total_tokens=result["usage"]["total_tokens"] - ), - model=result["model"], - finish_reason=result["finish_reason"], - processing_time_ms=result["processing_time_ms"] - ) - except FileNotFoundError as e: - raise HTTPException( - status_code=status.HTTP_404_NOT_FOUND, - detail=str(e) - ) - except ValueError as e: - raise HTTPException( - status_code=status.HTTP_400_BAD_REQUEST, - detail=str(e) - ) - except Exception as e: - raise HTTPException( - status_code=status.HTTP_500_INTERNAL_SERVER_ERROR, - detail=f"LLM processing error: {str(e)}" - ) \ No newline at end of file diff --git a/src/core/contexts/engineering.json b/src/core/contexts/engineering.json deleted file mode 100644 index bc8d54e..0000000 --- a/src/core/contexts/engineering.json +++ /dev/null @@ -1,11 +0,0 @@ -{ - "description": "Engineering domain knowledge", - "data": { - "common_materials": ["steel", "aluminum", "titanium", "composites"], - "analysis_methods": ["FEA", "CFD", "experimental testing"], - "standards": 
{ - "mechanical": ["ASME", "ISO", "ASTM"], - "electrical": ["IEC", "IEEE", "UL"] - } - } - } \ No newline at end of file diff --git a/src/core/contexts/manufacturing/materials.json b/src/core/contexts/manufacturing/materials.json deleted file mode 100644 index def329a..0000000 --- a/src/core/contexts/manufacturing/materials.json +++ /dev/null @@ -1,21 +0,0 @@ -{ - "metals": { - "aluminum": { - "properties": ["lightweight", "corrosion-resistant", "conductive"], - "substitutes": ["titanium", "magnesium", "certain composites"], - "common_processes": ["extrusion", "casting", "machining"] - }, - "steel": { - "properties": ["strong", "durable", "magnetic"], - "substitutes": ["aluminum alloys", "titanium", "reinforced polymers"], - "common_processes": ["forging", "machining", "welding"] - } - }, - "polymers": { - "abs": { - "properties": ["impact-resistant", "thermoformable", "lightweight"], - "substitutes": ["pla", "petg", "polycarbonate"], - "common_processes": ["3d printing", "injection molding"] - } - } -} \ No newline at end of file diff --git a/src/core/prompts/default.txt b/src/core/prompts/default.txt deleted file mode 100644 index 07663a0..0000000 --- a/src/core/prompts/default.txt +++ /dev/null @@ -1,5 +0,0 @@ -You are an AI assistant helping with {{domain}} tasks. - -{{user_query}} - -Please provide a helpful response. \ No newline at end of file diff --git a/src/core/prompts/engineering.txt b/src/core/prompts/engineering.txt deleted file mode 100644 index 5b1ea86..0000000 --- a/src/core/prompts/engineering.txt +++ /dev/null @@ -1,10 +0,0 @@ -# Engineering analysis prompt -You are an engineering expert. Please analyze the following problem: - -{{prompt}} - -Additional context: -- Application: {{application}} -- Requirements: {{requirements}} - -Provide a detailed technical analysis with practical recommendations. \ No newline at end of file diff --git a/src/core/prompts/manufacturing/substitution.txt b/src/core/prompts/manufacturing/substitution.txt deleted file mode 100644 index 6ace03a..0000000 --- a/src/core/prompts/manufacturing/substitution.txt +++ /dev/null @@ -1,13 +0,0 @@ -You are a manufacturing expert. Given the following material or process requirement, -suggest possible substitutions that could work as alternatives: - -Requirement: {{requirement}} -Context: {{context}} - -For each alternative, provide: -1. Name of the alternative -2. Confidence score (0-1) indicating how well it substitutes -3. Brief explanation of why this could work -4. Any limitations or considerations - -Format your response as a list of alternatives with these details. 
\ No newline at end of file diff --git a/src/core/services/llm_provider.py b/src/core/services/llm_provider.py deleted file mode 100644 index c850deb..0000000 --- a/src/core/services/llm_provider.py +++ /dev/null @@ -1,257 +0,0 @@ -import time -import re -from typing import Dict, Any, List, Optional, Tuple -import os -import torch -from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline - -from .directory_manager import DirectoryManager - -class LLMProvider: - """Provider for LLM services using actual model calls""" - - def __init__(self, model_name: str = None): - """Initialize the LLM provider with model and directories""" - # Use environment variable or default to an open-source model that doesn't require login - # TinyLlama is a smaller model that's openly available - self.model_name = model_name or os.environ.get("LLM_MODEL", "TinyLlama/TinyLlama-1.1B-Chat-v1.0") - self.directory_manager = DirectoryManager() - self._initialize_model() - - def _initialize_model(self): - """Initialize the model and tokenizer with additional error handling""" - print(f"Initializing model: {self.model_name}") - - try: - # Set up device configuration - device = "cuda" if torch.cuda.is_available() else "cpu" - print(f"Using device: {device}") - - # Load tokenizer with fallbacks - try: - self.tokenizer = AutoTokenizer.from_pretrained(self.model_name) - except Exception as e: - print(f"Error loading tokenizer: {str(e)}") - print("Falling back to default tokenizer") - self.tokenizer = AutoTokenizer.from_pretrained("facebook/opt-125m") # Very small fallback - - # Load model with appropriate configuration - try: - if device == "cuda": - self.model = AutoModelForCausalLM.from_pretrained( - self.model_name, - device_map="auto", - torch_dtype=torch.float16, - low_cpu_mem_usage=True - ) - else: - # For CPU - self.model = AutoModelForCausalLM.from_pretrained( - self.model_name, - device_map={"": device}, - low_cpu_mem_usage=True - ) - - # Create generation pipeline - self.generator = pipeline( - "text-generation", - model=self.model, - tokenizer=self.tokenizer, - device=0 if device == "cuda" else -1 - ) - print(f"Successfully loaded model: {self.model_name}") - except Exception as e: - print(f"Error loading model: {str(e)}") - print("Falling back to text-only response mode") - self.model = None - self.generator = None - except Exception as e: - print(f"Critical error initializing LLM provider: {str(e)}") - print("API will continue to run, but LLM features will be unavailable") - self.model = None - self.tokenizer = None - self.generator = None - - # Flag to check if model is available - self.model_available = self.generator is not None - - def _format_prompt_for_model(self, prompt: str, system_message: Optional[str] = None) -> str: - """Format prompt based on the model architecture""" - # Check if model is a Llama variant - if "llama" in self.model_name.lower(): - if system_message: - # TinyLlama uses the same format as Llama-2 - return f"[INST] <>\n{system_message}\n<>\n\n{prompt} [/INST]" - else: - return f"[INST] {prompt} [/INST]" - # Add support for different model formats as needed - else: - # Generic format for other models - if system_message: - return f"{system_message}\n\n{prompt}" - else: - return prompt - - def _fill_template(self, template: str, variables: Dict[str, Any]) -> str: - """Fill a template with variables""" - result = template - for key, value in variables.items(): - pattern = r"{{" + re.escape(key) + r"}}" - result = re.sub(pattern, str(value), result) - return result - - def 
_prepare_context(self, context_name: Optional[str]) -> Dict[str, Any]: - """Prepare context data from file if specified""" - if not context_name: - return {} - - try: - return self.directory_manager.get_context(context_name) - except FileNotFoundError as e: - print(f"Warning: {str(e)}") - return {} - - def _prepare_prompt(self, - raw_prompt: str, - prompt_template: Optional[str], - template_variables: Dict[str, Any]) -> str: - """Prepare final prompt text using template if specified""" - if not prompt_template: - return raw_prompt - - try: - template, required_vars = self.directory_manager.get_prompt(prompt_template) - - # Check if all required variables are provided - missing_vars = [var for var in required_vars if var not in template_variables] - if missing_vars: - raise ValueError(f"Missing required template variables: {', '.join(missing_vars)}") - - # Add raw prompt to template variables if not overridden - if 'prompt' not in template_variables: - template_variables['prompt'] = raw_prompt - - return self._fill_template(template, template_variables) - except FileNotFoundError: - # Fall back to raw prompt if template not found - return raw_prompt - - async def generate(self, - prompt: str, - prompt_template: Optional[str] = None, - context_name: Optional[str] = None, - template_variables: Dict[str, Any] = None, - system_message: Optional[str] = None, - temperature: float = 0.7, - max_tokens: int = 1000, - stop_sequences: Optional[List[str]] = None) -> Dict[str, Any]: - """Generate a response using the LLM""" - start_time = time.time() - - # If model isn't available, return a fallback response - if not hasattr(self, "model_available") or not self.model_available: - return { - "response": f"LLM model is not available. I received your prompt: '{prompt}'", - "usage": { - "prompt_tokens": len(prompt.split()), - "completion_tokens": 0, - "total_tokens": len(prompt.split()) - }, - "model": "fallback_text_only", - "finish_reason": "model_unavailable", - "processing_time_ms": 0 - } - - if template_variables is None: - template_variables = {} - - # Get context data if specified - context_data = self._prepare_context(context_name) - - # Prepare the final prompt - prepared_prompt = self._prepare_prompt( - raw_prompt=prompt, - prompt_template=prompt_template, - template_variables=template_variables - ) - - # Format for the specific model - formatted_prompt = self._format_prompt_for_model( - prompt=prepared_prompt, - system_message=system_message - ) - - # Configure generation parameters - generation_config = { - "max_new_tokens": max_tokens, - "temperature": temperature, - "do_sample": temperature > 0, - "top_p": 0.95, - "top_k": 50, - "return_full_text": False, - } - - if stop_sequences: - generation_config["stop_sequences"] = stop_sequences - - # Perform actual model inference - try: - result = self.generator( - formatted_prompt, - **generation_config - ) - - # Extract generated text - generated_text = result[0]["generated_text"].strip() - - # Calculate token usage - input_tokens = len(self.tokenizer.encode(formatted_prompt)) - output_tokens = len(self.tokenizer.encode(generated_text)) - total_tokens = input_tokens + output_tokens - - end_time = time.time() - processing_time_ms = int((end_time - start_time) * 1000) - - return { - "response": generated_text, - "usage": { - "prompt_tokens": input_tokens, - "completion_tokens": output_tokens, - "total_tokens": total_tokens - }, - "model": self.model_name, - "finish_reason": result[0].get("finish_reason", "length"), - "processing_time_ms": 
processing_time_ms - } - - except Exception as e: - raise RuntimeError(f"Error generating LLM response: {str(e)}") - - def get_available_models(self) -> List[Dict[str, str]]: - """Return a list of recommended open-source models that can be used""" - return [ - { - "id": "TinyLlama/TinyLlama-1.1B-Chat-v1.0", - "name": "TinyLlama 1.1B Chat", - "description": "Small and fast Llama model for chat (1.1B parameters)", - "memory_required": "~2GB" - }, - { - "id": "microsoft/phi-2", - "name": "Microsoft Phi-2", - "description": "Small but capable 2.7B parameter model", - "memory_required": "~5GB" - }, - { - "id": "facebook/opt-1.3b", - "name": "OPT 1.3B", - "description": "Open Pre-trained Transformer 1.3B parameter model", - "memory_required": "~3GB" - }, - { - "id": "stabilityai/stablelm-base-alpha-3b", - "name": "StableLM 3B", - "description": "Base 3B parameter model from Stability AI", - "memory_required": "~6GB" - } - ] \ No newline at end of file From 72c5381d5654e78a7d15506ec674090ea203f23d Mon Sep 17 00:00:00 2001 From: Celestine Akpanoko Date: Tue, 20 May 2025 12:11:03 -0500 Subject: [PATCH 3/6] Implement llm endpoint to serve llm to client via in-house api --- requirements.txt | 9 +--- src/core/api/models/llm/request.py | 22 ++++---- src/core/api/models/llm/response.py | 38 +++---------- src/core/api/routes/llm.py | 83 +++++++++++++++++++++++++++++ src/core/main.py | 5 ++ 5 files changed, 107 insertions(+), 50 deletions(-) create mode 100644 src/core/api/routes/llm.py diff --git a/requirements.txt b/requirements.txt index 7d401ab..0ffda19 100644 --- a/requirements.txt +++ b/requirements.txt @@ -4,10 +4,5 @@ mkdocs-mermaid2-plugin networkx fastapi uvicorn -torch -transformers -accelerate>=0.21.0 -bitsandbytes>=0.41.0 -safetensors -sentencepiece -dotenv \ No newline at end of file +dotenv +groq \ No newline at end of file diff --git a/src/core/api/models/llm/request.py b/src/core/api/models/llm/request.py index 87c3316..c914c28 100644 --- a/src/core/api/models/llm/request.py +++ b/src/core/api/models/llm/request.py @@ -1,13 +1,13 @@ -from pydantic import BaseModel, Field -from typing import Dict, Any, Optional, List +from pydantic import BaseModel +from typing import Optional class LLMRequest(BaseModel): - """Request model for the LLM endpoint""" - prompt: str = Field(..., description="The prompt to send to the LLM") - prompt_template: Optional[str] = Field(None, description="Name of a prompt template to use") - context_name: Optional[str] = Field(None, description="Name of the context to apply") - template_variables: Dict[str, Any] = Field(default_factory=dict, description="Variables to fill in prompt template") - system_message: Optional[str] = Field(None, description="Optional system message to guide the LLM") - temperature: float = Field(0.7, ge=0.0, le=1.0, description="Temperature parameter (0-1)") - max_tokens: int = Field(1000, gt=0, description="Maximum tokens to generate") - stop_sequences: Optional[List[str]] = Field(None, description="Sequences that will stop generation") \ No newline at end of file + """ Request model for LLM API. 
""" + prompt: str + context: Optional[str] = None + model: Optional[str] = 'llama-3.1-8b-instant' + max_tokens: Optional[int] = 512 + temperature: Optional[float] = 0.7 + top_p: Optional[float] = 0.9 + stream: Optional[bool] = False + \ No newline at end of file diff --git a/src/core/api/models/llm/response.py b/src/core/api/models/llm/response.py index dfe97b8..ac9a1d0 100644 --- a/src/core/api/models/llm/response.py +++ b/src/core/api/models/llm/response.py @@ -1,35 +1,9 @@ -from pydantic import BaseModel, Field -from typing import Dict, Any, Optional, List - -class TokenUsage(BaseModel): - """Token usage information""" - prompt_tokens: int - completion_tokens: int - total_tokens: int +from pydantic import BaseModel +from typing import Optional class LLMResponse(BaseModel): - """Response model for the LLM endpoint""" - response: str = Field(..., description="The generated text from the LLM") - usage: TokenUsage = Field(..., description="Token usage statistics") - model: str = Field(..., description="Model used for generation") - finish_reason: Optional[str] = Field(None, description="Reason for completion") - processing_time_ms: int = Field(..., description="Processing time in milliseconds") - -class PromptInfo(BaseModel): - """Information about an available prompt template""" - name: str = Field(..., description="Name of the prompt template") - description: Optional[str] = Field(None, description="Brief description of the prompt") - required_variables: List[str] = Field(default_factory=list, description="Required template variables") - -class PromptListResponse(BaseModel): - """Response with available prompt templates""" - prompts: List[PromptInfo] = Field(..., description="Available prompt templates") - -class ContextInfo(BaseModel): - """Information about an available context""" - name: str = Field(..., description="Name of the context") - description: Optional[str] = Field(None, description="Brief description of the context") + """ Response model for LLM API. """ -class ContextListResponse(BaseModel): - """Response with available contexts""" - contexts: List[ContextInfo] = Field(..., description="Available contexts") \ No newline at end of file + response: str + model: str + usage: Optional[dict] = None \ No newline at end of file diff --git a/src/core/api/routes/llm.py b/src/core/api/routes/llm.py new file mode 100644 index 0000000..7ad836b --- /dev/null +++ b/src/core/api/routes/llm.py @@ -0,0 +1,83 @@ +from fastapi import APIRouter, HTTPException +from fastapi.responses import StreamingResponse +from groq import Groq +import os +from dotenv import load_dotenv +from src.core.api.models.llm.request import LLMRequest +from src.core.api.models.llm.response import LLMResponse + +load_dotenv() + +router = APIRouter(prefix="/v1/llm", tags=["llm"]) + +client = Groq(api_key=os.getenv("GROQ_API_KEY")) +if not client.api_key: + raise HTTPException(status_code=500, detail="Groq API key is not set.") + +@router.post("/", response_model=LLMResponse) +async def generate_response(request: LLMRequest): + """ Generate response the LLM provider based on prompt and optional context. + Supports streaming and non-streaming responses. 
""" + + try: + # combine prompt and context (if provided) in the request + messages = [] + if request.context: + messages.append({"role": "system", "content": request.context}) + messages.append({"role": "user", "content": request.prompt}) + + # Call the Groq API + response = client.chat.completions.create( + messages=messages, + model=request.model, + max_tokens=request.max_tokens, + temperature=request.temperature, + top_p=request.top_p, + stream=request.stream + ) + + # If streaming is enabled, return a streaming response + if request.stream: + def stream_response(): + for chunk in response: + content = chunk.choices[0].delta.content + if content: + yield content + yield "[DONE]" + return StreamingResponse(stream_response(), media_type="text/event-stream") + else: + # if not streaming, return the full response + full_response = response.choices[0].message.content + usage = ( + response.usage.dict() + if hasattr(response, "usage") and response.usage + else None + ) + return LLMResponse( + response=full_response, + model=response.model, + usage=usage + ) + + except Exception as e: + raise HTTPException(status_code=500, detail=f"Error communicating with LLM provider: {str(e)}") + + + +@router.get("/health") +async def health_check(): + """ Health check for the LLM Endpoint """ + + try: + # test a simple API call to verify Groq connection + response = client.chat.completions.create( + messages=[{"role": "user", "content": "Hello!"}], + model="llama-3.1-8b-instant", + max_tokens=10, + ) + return { + "status": "healthy" + } + + except Exception as e: + raise HTTPException(status_code=503, detail=f"LLM provider is not healthy: {str(e)}") \ No newline at end of file diff --git a/src/core/main.py b/src/core/main.py index 873d5af..c2a4e3c 100644 --- a/src/core/main.py +++ b/src/core/main.py @@ -14,6 +14,11 @@ async def health_check(): """Simple health check endpoint.""" return {"status": "ok", "domains": list(DomainRegistry._extractors.keys())} +# @app.get("/llm") +# async def llm_check(): +# """Simple LLM check endpoint.""" +# return {"status": "ok", "llm": "gpt-3.5-turbo"} + # Register routes app.include_router(match_router, tags=["matching"]) app.include_router(llm_router, tags=["llm"]) From b7031db17f1ac354af84ccf75e1d695a277d458b Mon Sep 17 00:00:00 2001 From: Celestine Akpanoko Date: Tue, 20 May 2025 14:31:27 -0500 Subject: [PATCH 4/6] Create test cases for unit, integration and end to end testing --- .gitignore | 2 + pytest.ini | 18 +++++++ tests/llm/conftest.py | 24 ++++++++++ tests/llm/e2e/__init__.py | 0 tests/llm/e2e/test_llm_workflow.py | 27 +++++++++++ tests/llm/integration/__init__.py | 0 tests/llm/integration/test_llm_routes.py | 25 ++++++++++ tests/llm/unit/__init__.py | 0 tests/llm/unit/test_llm_models.py | 60 ++++++++++++++++++++++++ 9 files changed, 156 insertions(+) create mode 100644 pytest.ini create mode 100644 tests/llm/conftest.py create mode 100644 tests/llm/e2e/__init__.py create mode 100644 tests/llm/e2e/test_llm_workflow.py create mode 100644 tests/llm/integration/__init__.py create mode 100644 tests/llm/integration/test_llm_routes.py create mode 100644 tests/llm/unit/__init__.py create mode 100644 tests/llm/unit/test_llm_models.py diff --git a/.gitignore b/.gitignore index cf6bd05..fa512c5 100644 --- a/.gitignore +++ b/.gitignore @@ -172,3 +172,5 @@ cython_debug/ # PyPI configuration file .pypirc + +*.txt \ No newline at end of file diff --git a/pytest.ini b/pytest.ini new file mode 100644 index 0000000..63a80a6 --- /dev/null +++ b/pytest.ini @@ -0,0 +1,18 @@ 
+[pytest] +# Configure asyncio for async tests +asyncio_mode = strict +asyncio_default_fixture_loop_scope = function + +# Test discovery patterns with wildcards +python_files = test_*.py +python_classes = Test* +python_functions = test_* + +# Make sure Python can find your modules +pythonpath = . + +# Recursive test discovery in all subdirectories +norecursedirs = .* venv build dist + +markers = + e2e: Mark end-to-end test \ No newline at end of file diff --git a/tests/llm/conftest.py b/tests/llm/conftest.py new file mode 100644 index 0000000..40c6f94 --- /dev/null +++ b/tests/llm/conftest.py @@ -0,0 +1,24 @@ +import pytest +from fastapi.testclient import TestClient +from src.core.main import app +from unittest.mock import MagicMock, patch + +@pytest.fixture +def client(): + """Fixture for FastAPI test client.""" + return TestClient(app) + +@pytest.fixture +def mock_groq(): + """Fixture to mock Groq client.""" + # Create a mock with the properly configured return value + mock_client = MagicMock() + + # Configure the mock to work with async code + mock_create = MagicMock() + # Make the mock.return_value property directly accessible (not awaitable) + mock_client.chat.completions.create.return_value = mock_create + + # Patch the client in the LLM route module + with patch("src.core.api.routes.llm.client", mock_client): + yield mock_client \ No newline at end of file diff --git a/tests/llm/e2e/__init__.py b/tests/llm/e2e/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/tests/llm/e2e/test_llm_workflow.py b/tests/llm/e2e/test_llm_workflow.py new file mode 100644 index 0000000..a6426c3 --- /dev/null +++ b/tests/llm/e2e/test_llm_workflow.py @@ -0,0 +1,27 @@ +import pytest +from fastapi.testclient import TestClient +from unittest.mock import MagicMock + +# Register the e2e marker to avoid the warning +pytest.mark.e2e = pytest.mark.skipif(False, reason="E2E test marker") + +@pytest.mark.e2e +def test_llm_workflow(client, mock_groq): + """End-to-end test for LLM workflow""" + # Setup mock response at the correct path in the mock structure + mock_response = mock_groq.chat.completions.create.return_value + mock_response.choices = [MagicMock()] + mock_response.choices[0].message = MagicMock() + mock_response.choices[0].message.content = '{"result": "processed data"}' + mock_response.model = "llama-3.1-8b-instant" + mock_response.usage = MagicMock() + mock_response.usage.dict.return_value = {"total_tokens": 20} + + # Call endpoint + response = client.post( + "/v1/llm/", + json={"prompt": "Process this text", "context": "Return JSON"} + ) + + assert response.status_code == 200 + assert "result" in response.json()["response"] diff --git a/tests/llm/integration/__init__.py b/tests/llm/integration/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/tests/llm/integration/test_llm_routes.py b/tests/llm/integration/test_llm_routes.py new file mode 100644 index 0000000..c871499 --- /dev/null +++ b/tests/llm/integration/test_llm_routes.py @@ -0,0 +1,25 @@ +import pytest +from fastapi.testclient import TestClient +from unittest.mock import MagicMock + +def test_llm_endpoint_non_streaming(client, mock_groq): + """Test non-streaming LLM endpoint""" + # Setup mock response at the correct place in the mock structure + mock_response = mock_groq.chat.completions.create.return_value + mock_response.choices = [MagicMock()] + mock_response.choices[0].message = MagicMock() + mock_response.choices[0].message.content = "Test response" + mock_response.model = "llama-3.1-8b-instant" + 
mock_response.usage = MagicMock() + mock_response.usage.dict.return_value = {"total_tokens": 10} + + # Make request to endpoint + response = client.post( + "/v1/llm/", + json={"prompt": "Test prompt"} + ) + + assert response.status_code == 200 + assert response.json()["response"] == "Test response" + +# Similar updates for other failing tests... \ No newline at end of file diff --git a/tests/llm/unit/__init__.py b/tests/llm/unit/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/tests/llm/unit/test_llm_models.py b/tests/llm/unit/test_llm_models.py new file mode 100644 index 0000000..6930ee1 --- /dev/null +++ b/tests/llm/unit/test_llm_models.py @@ -0,0 +1,60 @@ +import pytest +from src.core.api.models.llm.request import LLMRequest +from src.core.api.models.llm.response import LLMResponse + +def test_llm_request_valid(): + """Test valid LLMRequest model with all fields.""" + data = { + "prompt": "What is the capital of France?", + "context": "Provide a concise answer.", + "model": "llama-3.1-8b-instant", + "max_tokens": 100, + "temperature": 0.7, + "top_p": 0.95, + "stream": False + } + request = LLMRequest(**data) + assert request.prompt == "What is the capital of France?" + assert request.context == "Provide a concise answer." + assert request.model == "llama-3.1-8b-instant" + assert request.max_tokens == 100 + assert request.temperature == 0.7 + assert request.top_p == 0.95 + assert request.stream is False + +def test_llm_request_no_context(): + """Test LLMRequest with no context, using defaults.""" + data = {"prompt": "Test prompt"} + request = LLMRequest(**data) + assert request.prompt == "Test prompt" + assert request.context is None + assert request.model == "llama-3.1-8b-instant" + assert request.max_tokens == 512 + assert request.temperature == 0.7 + assert request.top_p == 0.9 + assert request.stream is False + +def test_llm_request_invalid_prompt(): + """Test LLMRequest with empty prompt raises error.""" + with pytest.raises(ValueError, match="prompt"): + LLMRequest(prompt="") + +def test_llm_response_valid(): + """Test valid LLMResponse model.""" + data = { + "response": "The capital is Paris.", + "model": "llama-3.1-8b-instant", + "usage": {"prompt_tokens": 10, "completion_tokens": 5, "total_tokens": 15} + } + response = LLMResponse(**data) + assert response.response == "The capital is Paris." 
+ assert response.model == "llama-3.1-8b-instant" + assert response.usage == {"prompt_tokens": 10, "completion_tokens": 5, "total_tokens": 15} + +def test_llm_response_no_usage(): + """Test LLMResponse with no usage data.""" + data = {"response": "Test response", "model": "llama-3.1-8b-instant"} + response = LLMResponse(**data) + assert response.response == "Test response" + assert response.model == "llama-3.1-8b-instant" + assert response.usage is None \ No newline at end of file From 91c0f60d77f024edac457e501e447cc229535975 Mon Sep 17 00:00:00 2001 From: Celestine Akpanoko Date: Tue, 20 May 2025 14:31:56 -0500 Subject: [PATCH 5/6] Fix failed test cases issues --- src/core/api/models/llm/request.py | 10 ++++++++-- src/core/api/routes/llm.py | 2 +- 2 files changed, 9 insertions(+), 3 deletions(-) diff --git a/src/core/api/models/llm/request.py b/src/core/api/models/llm/request.py index c914c28..5208fc6 100644 --- a/src/core/api/models/llm/request.py +++ b/src/core/api/models/llm/request.py @@ -1,4 +1,4 @@ -from pydantic import BaseModel +from pydantic import BaseModel, Field, field_validator from typing import Optional class LLMRequest(BaseModel): @@ -10,4 +10,10 @@ class LLMRequest(BaseModel): temperature: Optional[float] = 0.7 top_p: Optional[float] = 0.9 stream: Optional[bool] = False - \ No newline at end of file + + @field_validator('prompt') + @classmethod # This is now required in V2 + def prompt_must_not_be_empty(cls, v): + if not v or v.strip() == '': + raise ValueError('prompt cannot be empty') + return v diff --git a/src/core/api/routes/llm.py b/src/core/api/routes/llm.py index 7ad836b..1cdfaaa 100644 --- a/src/core/api/routes/llm.py +++ b/src/core/api/routes/llm.py @@ -43,7 +43,7 @@ def stream_response(): content = chunk.choices[0].delta.content if content: yield content - yield "[DONE]" + yield "[DONE]" return StreamingResponse(stream_response(), media_type="text/event-stream") else: # if not streaming, return the full response From 02e88c71ae2023ad3295ef3e27fad2fa5cd64016 Mon Sep 17 00:00:00 2001 From: Celestine Akpanoko Date: Tue, 20 May 2025 14:32:30 -0500 Subject: [PATCH 6/6] Document for PR and package requirement --- docs/api/llm.md | 101 +++++++++++++++++++++++++++++++++++++++++++++++ requirements.txt | 6 ++- 2 files changed, 106 insertions(+), 1 deletion(-) create mode 100644 docs/api/llm.md diff --git a/docs/api/llm.md b/docs/api/llm.md new file mode 100644 index 0000000..9761538 --- /dev/null +++ b/docs/api/llm.md @@ -0,0 +1,101 @@ +# LLM API +A FastAPI application for serving LLM inferences. + +## Features + +New: `/llm` POST endpoint for generic LLM inference with prompt and context, powered by Groq Cloud (LLaMA 3.1). +Models defined in `src/core/api/models/llm/request.py` and `src/core/api/models/llm/response.py`. + + +## API Documentation +### LLM Endpoint +`POST /llm` +Generate text using an LLM provider with a prompt and optional context. 
+Request Body +``` +{ + "prompt": "string", // Required: The main task or query for the LLM + "context": "string", // Optional: Supplementary information to guide the response + "model": "string", // Optional: LLM model (default: "llama-3.1-8b-instant") + "max_tokens": integer, // Optional: Maximum tokens in response (default: 512) + "temperature": float, // Optional: Sampling temperature (default: 0.8) + "top_p": float, // Optional: Top-p sampling (default: 0.95) + "stream": boolean // Optional: Enable streaming response (default: false) +} +``` +Responses + +200 OK (Non-Streaming): +``` +{ + "response": "string", // LLM-generated text + "model": "string", // Model used + "usage": { // Optional: Token usage + "prompt_tokens": integer, + "completion_tokens": integer, + "total_tokens": integer + } +} +``` + +200 OK (Streaming): Text/event-stream with chunks of text, ending with [DONE]. +500 Internal Server Error: If the LLM provider fails. + +Example +``` +curl -X POST http://localhost:8000/llm \ + -H "Content-Type: application/json" \ + -d '{ + "prompt": "What is the capital of Brazil?", + "context": "Answer concisely with no explanation.", + "model": "llama-3.1-8b-instant", + "max_tokens": 50, + "temperature": 0.7, + "stream": false + }' +``` + + +## Usage Guide +### Using the LLM Endpoint +The `/llm` endpoint allows clients to send a prompt and optional context to generate text using an LLM (e.g., LLaMA 3.1 via Groq Cloud). The request and response are defined in `src/core/api/models/llm/request.py` and `src/core/api/models/llm/response.py`, respectively. +### Example: Non-Streaming Request +Send a prompt to get a concise response: +``` +curl -X POST http://localhost:8000/llm \ + -H "Content-Type: application/json" \ + -d '{ + "prompt": "What is the capital of Brazil?", + "context": "Answer concisely with no explanation.", + "model": "llama-3.1-8b-instant", + "max_tokens": 50 + }' +``` +Response: +``` +{ + "response": "The capital is Brasília.", + "model": "llama-3.1-8b-instant", + "usage": { + "prompt_tokens": 12, + "completion_tokens": 6, + "total_tokens": 18 + } +} +``` + +### Example: Streaming Request +Stream a response for a creative task: +``` +curl -X POST http://localhost:8000/llm \ + -H "Content-Type: application/json" \ + -d '{ + "prompt": "Tell me a short story.", + "context": "The story should be about a dragon in a fantasy setting.", + "model": "llama-3.1-8b-instant", + "max_tokens": 100, + "stream": true + }' +``` +Response: Streams tokens like "Once ", "upon ", "a time...", ending with `[DONE]`. + diff --git a/requirements.txt b/requirements.txt index 0ffda19..b75aee0 100644 --- a/requirements.txt +++ b/requirements.txt @@ -5,4 +5,8 @@ networkx fastapi uvicorn dotenv -groq \ No newline at end of file +groq +pytest +pytest-asyncio +pytest-mock +httpx \ No newline at end of file
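
---

Beyond the curl examples added in docs/api/llm.md, a minimal Python client sketch may help when integrating against this PR. This is an illustration, not part of the patch: it assumes the server is running locally on port 8000 (as in the docs' examples) and that GROQ_API_KEY is set on the server. It targets `/v1/llm/`, the path produced by the router's `/v1/llm` prefix and the one exercised by the integration tests; httpx is already listed in requirements.txt for testing.

```
# Hypothetical client sketch for the non-streaming case.
import httpx

payload = {
    "prompt": "What is the capital of Brazil?",
    "context": "Answer concisely with no explanation.",
    "model": "llama-3.1-8b-instant",
    "max_tokens": 50,
    "temperature": 0.7,
    "stream": False,
}

with httpx.Client(base_url="http://localhost:8000") as client:
    resp = client.post("/v1/llm/", json=payload, timeout=30.0)
    resp.raise_for_status()
    data = resp.json()
    print(data["response"])  # generated text
    print(data["usage"])     # token usage dict, or None if the provider omits it
```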