From 6c9bd4dac781812bb6406dead191459e34aebc71 Mon Sep 17 00:00:00 2001 From: Celestine Akpanoko Date: Tue, 20 May 2025 10:48:50 -0500 Subject: [PATCH 1/6] llm integration initial commit --- requirements.txt | 7 + run.py | 4 + src/core/api/models/llm/request.py | 13 + src/core/api/models/llm/response.py | 35 +++ src/core/api/routes/llm.py | 88 ++++++ src/core/contexts/engineering.json | 11 + .../contexts/manufacturing/materials.json | 21 ++ src/core/main.py | 2 + src/core/models/supply_trees.py | 2 +- src/core/prompts/default.txt | 5 + src/core/prompts/engineering.txt | 10 + .../prompts/manufacturing/substitution.txt | 13 + src/core/services/llm_provider.py | 257 ++++++++++++++++++ 13 files changed, 467 insertions(+), 1 deletion(-) create mode 100644 src/core/api/models/llm/request.py create mode 100644 src/core/api/models/llm/response.py create mode 100644 src/core/api/routes/llm.py create mode 100644 src/core/contexts/engineering.json create mode 100644 src/core/contexts/manufacturing/materials.json create mode 100644 src/core/prompts/default.txt create mode 100644 src/core/prompts/engineering.txt create mode 100644 src/core/prompts/manufacturing/substitution.txt create mode 100644 src/core/services/llm_provider.py diff --git a/requirements.txt b/requirements.txt index 64eb8ff..7d401ab 100644 --- a/requirements.txt +++ b/requirements.txt @@ -4,3 +4,10 @@ mkdocs-mermaid2-plugin networkx fastapi uvicorn +torch +transformers +accelerate>=0.21.0 +bitsandbytes>=0.41.0 +safetensors +sentencepiece +dotenv \ No newline at end of file diff --git a/run.py b/run.py index efc6268..a6dbaa4 100644 --- a/run.py +++ b/run.py @@ -1,4 +1,8 @@ import uvicorn +from dotenv import load_dotenv + +# Load environment variables from .env file +load_dotenv() if __name__ == "__main__": uvicorn.run("src.core.main:app", diff --git a/src/core/api/models/llm/request.py b/src/core/api/models/llm/request.py new file mode 100644 index 0000000..87c3316 --- /dev/null +++ b/src/core/api/models/llm/request.py @@ -0,0 +1,13 @@ +from pydantic import BaseModel, Field +from typing import Dict, Any, Optional, List + +class LLMRequest(BaseModel): + """Request model for the LLM endpoint""" + prompt: str = Field(..., description="The prompt to send to the LLM") + prompt_template: Optional[str] = Field(None, description="Name of a prompt template to use") + context_name: Optional[str] = Field(None, description="Name of the context to apply") + template_variables: Dict[str, Any] = Field(default_factory=dict, description="Variables to fill in prompt template") + system_message: Optional[str] = Field(None, description="Optional system message to guide the LLM") + temperature: float = Field(0.7, ge=0.0, le=1.0, description="Temperature parameter (0-1)") + max_tokens: int = Field(1000, gt=0, description="Maximum tokens to generate") + stop_sequences: Optional[List[str]] = Field(None, description="Sequences that will stop generation") \ No newline at end of file diff --git a/src/core/api/models/llm/response.py b/src/core/api/models/llm/response.py new file mode 100644 index 0000000..dfe97b8 --- /dev/null +++ b/src/core/api/models/llm/response.py @@ -0,0 +1,35 @@ +from pydantic import BaseModel, Field +from typing import Dict, Any, Optional, List + +class TokenUsage(BaseModel): + """Token usage information""" + prompt_tokens: int + completion_tokens: int + total_tokens: int + +class LLMResponse(BaseModel): + """Response model for the LLM endpoint""" + response: str = Field(..., description="The generated text from the LLM") + usage: 
TokenUsage = Field(..., description="Token usage statistics") + model: str = Field(..., description="Model used for generation") + finish_reason: Optional[str] = Field(None, description="Reason for completion") + processing_time_ms: int = Field(..., description="Processing time in milliseconds") + +class PromptInfo(BaseModel): + """Information about an available prompt template""" + name: str = Field(..., description="Name of the prompt template") + description: Optional[str] = Field(None, description="Brief description of the prompt") + required_variables: List[str] = Field(default_factory=list, description="Required template variables") + +class PromptListResponse(BaseModel): + """Response with available prompt templates""" + prompts: List[PromptInfo] = Field(..., description="Available prompt templates") + +class ContextInfo(BaseModel): + """Information about an available context""" + name: str = Field(..., description="Name of the context") + description: Optional[str] = Field(None, description="Brief description of the context") + +class ContextListResponse(BaseModel): + """Response with available contexts""" + contexts: List[ContextInfo] = Field(..., description="Available contexts") \ No newline at end of file diff --git a/src/core/api/routes/llm.py b/src/core/api/routes/llm.py new file mode 100644 index 0000000..ae1cc19 --- /dev/null +++ b/src/core/api/routes/llm.py @@ -0,0 +1,88 @@ +from fastapi import APIRouter, HTTPException, status, Body +from typing import Dict, Any, List, Optional + +from ..models.llm.request import LLMRequest +from ..models.llm.response import LLMResponse, PromptListResponse, ContextListResponse +from ..models.llm.response import PromptInfo, ContextInfo, TokenUsage + +from src.core.services.llm_provider import LLMProvider +from src.core.services.directory_manager import DirectoryManager + +# Create router with prefix and tags +router = APIRouter(prefix="/v1", tags=["llm"]) + +# Initialize services +llm_provider = LLMProvider() +directory_manager = DirectoryManager() + +@router.get("/llm/prompts", response_model=PromptListResponse) +async def list_prompts(): + """List all available prompt templates""" + prompt_list = directory_manager.list_prompts() + prompts = [ + PromptInfo( + name=p["name"], + description=p["description"], + required_variables=p["required_variables"] + ) for p in prompt_list + ] + return PromptListResponse(prompts=prompts) + +@router.get("/llm/contexts", response_model=ContextListResponse) +async def list_contexts(): + """List all available contexts""" + context_list = directory_manager.list_contexts() + contexts = [ + ContextInfo( + name=c["name"], + description=c["description"] + ) for c in context_list + ] + return ContextListResponse(contexts=contexts) + +@router.get("/llm/models", response_model=List[Dict[str, str]]) +async def list_available_models(): + """List all available LLM models that can be used""" + return llm_provider.get_available_models() + +@router.post("/llm", response_model=LLMResponse) +async def process_llm_request(request: LLMRequest): + """Process a request with the LLM""" + try: + result = await llm_provider.generate( + prompt=request.prompt, + prompt_template=request.prompt_template, + context_name=request.context_name, + template_variables=request.template_variables, + system_message=request.system_message, + temperature=request.temperature, + max_tokens=request.max_tokens, + stop_sequences=request.stop_sequences + ) + + return LLMResponse( + response=result["response"], + usage=TokenUsage( + 
prompt_tokens=result["usage"]["prompt_tokens"], + completion_tokens=result["usage"]["completion_tokens"], + total_tokens=result["usage"]["total_tokens"] + ), + model=result["model"], + finish_reason=result["finish_reason"], + processing_time_ms=result["processing_time_ms"] + ) + except FileNotFoundError as e: + raise HTTPException( + status_code=status.HTTP_404_NOT_FOUND, + detail=str(e) + ) + except ValueError as e: + raise HTTPException( + status_code=status.HTTP_400_BAD_REQUEST, + detail=str(e) + ) + except Exception as e: + raise HTTPException( + status_code=status.HTTP_500_INTERNAL_SERVER_ERROR, + detail=f"LLM processing error: {str(e)}" + ) \ No newline at end of file diff --git a/src/core/contexts/engineering.json b/src/core/contexts/engineering.json new file mode 100644 index 0000000..bc8d54e --- /dev/null +++ b/src/core/contexts/engineering.json @@ -0,0 +1,11 @@ +{ + "description": "Engineering domain knowledge", + "data": { + "common_materials": ["steel", "aluminum", "titanium", "composites"], + "analysis_methods": ["FEA", "CFD", "experimental testing"], + "standards": { + "mechanical": ["ASME", "ISO", "ASTM"], + "electrical": ["IEC", "IEEE", "UL"] + } + } + } \ No newline at end of file diff --git a/src/core/contexts/manufacturing/materials.json b/src/core/contexts/manufacturing/materials.json new file mode 100644 index 0000000..def329a --- /dev/null +++ b/src/core/contexts/manufacturing/materials.json @@ -0,0 +1,21 @@ +{ + "metals": { + "aluminum": { + "properties": ["lightweight", "corrosion-resistant", "conductive"], + "substitutes": ["titanium", "magnesium", "certain composites"], + "common_processes": ["extrusion", "casting", "machining"] + }, + "steel": { + "properties": ["strong", "durable", "magnetic"], + "substitutes": ["aluminum alloys", "titanium", "reinforced polymers"], + "common_processes": ["forging", "machining", "welding"] + } + }, + "polymers": { + "abs": { + "properties": ["impact-resistant", "thermoformable", "lightweight"], + "substitutes": ["pla", "petg", "polycarbonate"], + "common_processes": ["3d printing", "injection molding"] + } + } +} \ No newline at end of file diff --git a/src/core/main.py b/src/core/main.py index d6ae5f7..873d5af 100644 --- a/src/core/main.py +++ b/src/core/main.py @@ -4,6 +4,7 @@ from src.core.domains.cooking.matchers import CookingMatcher from src.core.domains.cooking.validators import CookingValidator from src.core.registry.domain_registry import DomainRegistry +from src.core.api.routes.llm import router as llm_router # Create FastAPI app app = FastAPI(title="Open Matching Engine API") @@ -15,6 +16,7 @@ async def health_check(): # Register routes app.include_router(match_router, tags=["matching"]) +app.include_router(llm_router, tags=["llm"]) # Register domain components DomainRegistry.register_extractor("cooking", CookingExtractor()) diff --git a/src/core/models/supply_trees.py b/src/core/models/supply_trees.py index 5cc2b39..52fc86a 100644 --- a/src/core/models/supply_trees.py +++ b/src/core/models/supply_trees.py @@ -1,6 +1,6 @@ from dataclasses import dataclass, field from datetime import datetime, timedelta -from typing import Dict, List, Optional, Set, Union, Tuple +from typing import Any, Dict, List, Optional, Set, Union, Tuple from enum import Enum from uuid import UUID, uuid4 import networkx as nx diff --git a/src/core/prompts/default.txt b/src/core/prompts/default.txt new file mode 100644 index 0000000..07663a0 --- /dev/null +++ b/src/core/prompts/default.txt @@ -0,0 +1,5 @@ +You are an AI assistant helping with 
{{domain}} tasks. + +{{user_query}} + +Please provide a helpful response. \ No newline at end of file diff --git a/src/core/prompts/engineering.txt b/src/core/prompts/engineering.txt new file mode 100644 index 0000000..5b1ea86 --- /dev/null +++ b/src/core/prompts/engineering.txt @@ -0,0 +1,10 @@ +# Engineering analysis prompt +You are an engineering expert. Please analyze the following problem: + +{{prompt}} + +Additional context: +- Application: {{application}} +- Requirements: {{requirements}} + +Provide a detailed technical analysis with practical recommendations. \ No newline at end of file diff --git a/src/core/prompts/manufacturing/substitution.txt b/src/core/prompts/manufacturing/substitution.txt new file mode 100644 index 0000000..6ace03a --- /dev/null +++ b/src/core/prompts/manufacturing/substitution.txt @@ -0,0 +1,13 @@ +You are a manufacturing expert. Given the following material or process requirement, +suggest possible substitutions that could work as alternatives: + +Requirement: {{requirement}} +Context: {{context}} + +For each alternative, provide: +1. Name of the alternative +2. Confidence score (0-1) indicating how well it substitutes +3. Brief explanation of why this could work +4. Any limitations or considerations + +Format your response as a list of alternatives with these details. \ No newline at end of file diff --git a/src/core/services/llm_provider.py b/src/core/services/llm_provider.py new file mode 100644 index 0000000..c850deb --- /dev/null +++ b/src/core/services/llm_provider.py @@ -0,0 +1,257 @@ +import time +import re +from typing import Dict, Any, List, Optional, Tuple +import os +import torch +from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline + +from .directory_manager import DirectoryManager + +class LLMProvider: + """Provider for LLM services using actual model calls""" + + def __init__(self, model_name: str = None): + """Initialize the LLM provider with model and directories""" + # Use environment variable or default to an open-source model that doesn't require login + # TinyLlama is a smaller model that's openly available + self.model_name = model_name or os.environ.get("LLM_MODEL", "TinyLlama/TinyLlama-1.1B-Chat-v1.0") + self.directory_manager = DirectoryManager() + self._initialize_model() + + def _initialize_model(self): + """Initialize the model and tokenizer with additional error handling""" + print(f"Initializing model: {self.model_name}") + + try: + # Set up device configuration + device = "cuda" if torch.cuda.is_available() else "cpu" + print(f"Using device: {device}") + + # Load tokenizer with fallbacks + try: + self.tokenizer = AutoTokenizer.from_pretrained(self.model_name) + except Exception as e: + print(f"Error loading tokenizer: {str(e)}") + print("Falling back to default tokenizer") + self.tokenizer = AutoTokenizer.from_pretrained("facebook/opt-125m") # Very small fallback + + # Load model with appropriate configuration + try: + if device == "cuda": + self.model = AutoModelForCausalLM.from_pretrained( + self.model_name, + device_map="auto", + torch_dtype=torch.float16, + low_cpu_mem_usage=True + ) + else: + # For CPU + self.model = AutoModelForCausalLM.from_pretrained( + self.model_name, + device_map={"": device}, + low_cpu_mem_usage=True + ) + + # Create generation pipeline + self.generator = pipeline( + "text-generation", + model=self.model, + tokenizer=self.tokenizer, + device=0 if device == "cuda" else -1 + ) + print(f"Successfully loaded model: {self.model_name}") + except Exception as e: + print(f"Error loading 
model: {str(e)}") + print("Falling back to text-only response mode") + self.model = None + self.generator = None + except Exception as e: + print(f"Critical error initializing LLM provider: {str(e)}") + print("API will continue to run, but LLM features will be unavailable") + self.model = None + self.tokenizer = None + self.generator = None + + # Flag to check if model is available + self.model_available = self.generator is not None + + def _format_prompt_for_model(self, prompt: str, system_message: Optional[str] = None) -> str: + """Format prompt based on the model architecture""" + # Check if model is a Llama variant + if "llama" in self.model_name.lower(): + if system_message: + # TinyLlama uses the same format as Llama-2 + return f"[INST] <>\n{system_message}\n<>\n\n{prompt} [/INST]" + else: + return f"[INST] {prompt} [/INST]" + # Add support for different model formats as needed + else: + # Generic format for other models + if system_message: + return f"{system_message}\n\n{prompt}" + else: + return prompt + + def _fill_template(self, template: str, variables: Dict[str, Any]) -> str: + """Fill a template with variables""" + result = template + for key, value in variables.items(): + pattern = r"{{" + re.escape(key) + r"}}" + result = re.sub(pattern, str(value), result) + return result + + def _prepare_context(self, context_name: Optional[str]) -> Dict[str, Any]: + """Prepare context data from file if specified""" + if not context_name: + return {} + + try: + return self.directory_manager.get_context(context_name) + except FileNotFoundError as e: + print(f"Warning: {str(e)}") + return {} + + def _prepare_prompt(self, + raw_prompt: str, + prompt_template: Optional[str], + template_variables: Dict[str, Any]) -> str: + """Prepare final prompt text using template if specified""" + if not prompt_template: + return raw_prompt + + try: + template, required_vars = self.directory_manager.get_prompt(prompt_template) + + # Check if all required variables are provided + missing_vars = [var for var in required_vars if var not in template_variables] + if missing_vars: + raise ValueError(f"Missing required template variables: {', '.join(missing_vars)}") + + # Add raw prompt to template variables if not overridden + if 'prompt' not in template_variables: + template_variables['prompt'] = raw_prompt + + return self._fill_template(template, template_variables) + except FileNotFoundError: + # Fall back to raw prompt if template not found + return raw_prompt + + async def generate(self, + prompt: str, + prompt_template: Optional[str] = None, + context_name: Optional[str] = None, + template_variables: Dict[str, Any] = None, + system_message: Optional[str] = None, + temperature: float = 0.7, + max_tokens: int = 1000, + stop_sequences: Optional[List[str]] = None) -> Dict[str, Any]: + """Generate a response using the LLM""" + start_time = time.time() + + # If model isn't available, return a fallback response + if not hasattr(self, "model_available") or not self.model_available: + return { + "response": f"LLM model is not available. 
I received your prompt: '{prompt}'", + "usage": { + "prompt_tokens": len(prompt.split()), + "completion_tokens": 0, + "total_tokens": len(prompt.split()) + }, + "model": "fallback_text_only", + "finish_reason": "model_unavailable", + "processing_time_ms": 0 + } + + if template_variables is None: + template_variables = {} + + # Get context data if specified + context_data = self._prepare_context(context_name) + + # Prepare the final prompt + prepared_prompt = self._prepare_prompt( + raw_prompt=prompt, + prompt_template=prompt_template, + template_variables=template_variables + ) + + # Format for the specific model + formatted_prompt = self._format_prompt_for_model( + prompt=prepared_prompt, + system_message=system_message + ) + + # Configure generation parameters + generation_config = { + "max_new_tokens": max_tokens, + "temperature": temperature, + "do_sample": temperature > 0, + "top_p": 0.95, + "top_k": 50, + "return_full_text": False, + } + + if stop_sequences: + generation_config["stop_sequences"] = stop_sequences + + # Perform actual model inference + try: + result = self.generator( + formatted_prompt, + **generation_config + ) + + # Extract generated text + generated_text = result[0]["generated_text"].strip() + + # Calculate token usage + input_tokens = len(self.tokenizer.encode(formatted_prompt)) + output_tokens = len(self.tokenizer.encode(generated_text)) + total_tokens = input_tokens + output_tokens + + end_time = time.time() + processing_time_ms = int((end_time - start_time) * 1000) + + return { + "response": generated_text, + "usage": { + "prompt_tokens": input_tokens, + "completion_tokens": output_tokens, + "total_tokens": total_tokens + }, + "model": self.model_name, + "finish_reason": result[0].get("finish_reason", "length"), + "processing_time_ms": processing_time_ms + } + + except Exception as e: + raise RuntimeError(f"Error generating LLM response: {str(e)}") + + def get_available_models(self) -> List[Dict[str, str]]: + """Return a list of recommended open-source models that can be used""" + return [ + { + "id": "TinyLlama/TinyLlama-1.1B-Chat-v1.0", + "name": "TinyLlama 1.1B Chat", + "description": "Small and fast Llama model for chat (1.1B parameters)", + "memory_required": "~2GB" + }, + { + "id": "microsoft/phi-2", + "name": "Microsoft Phi-2", + "description": "Small but capable 2.7B parameter model", + "memory_required": "~5GB" + }, + { + "id": "facebook/opt-1.3b", + "name": "OPT 1.3B", + "description": "Open Pre-trained Transformer 1.3B parameter model", + "memory_required": "~3GB" + }, + { + "id": "stabilityai/stablelm-base-alpha-3b", + "name": "StableLM 3B", + "description": "Base 3B parameter model from Stability AI", + "memory_required": "~6GB" + } + ] \ No newline at end of file From f1e60911106739300d033023dba86117a63b70c6 Mon Sep 17 00:00:00 2001 From: Celestine Akpanoko Date: Tue, 20 May 2025 11:29:35 -0500 Subject: [PATCH 2/6] remove previous implementation to make it simple and work --- src/core/api/routes/llm.py | 88 ------ src/core/contexts/engineering.json | 11 - .../contexts/manufacturing/materials.json | 21 -- src/core/prompts/default.txt | 5 - src/core/prompts/engineering.txt | 10 - .../prompts/manufacturing/substitution.txt | 13 - src/core/services/llm_provider.py | 257 ------------------ 7 files changed, 405 deletions(-) delete mode 100644 src/core/api/routes/llm.py delete mode 100644 src/core/contexts/engineering.json delete mode 100644 src/core/contexts/manufacturing/materials.json delete mode 100644 src/core/prompts/default.txt delete mode 100644 
src/core/prompts/engineering.txt delete mode 100644 src/core/prompts/manufacturing/substitution.txt delete mode 100644 src/core/services/llm_provider.py diff --git a/src/core/api/routes/llm.py b/src/core/api/routes/llm.py deleted file mode 100644 index ae1cc19..0000000 --- a/src/core/api/routes/llm.py +++ /dev/null @@ -1,88 +0,0 @@ -from fastapi import APIRouter, HTTPException, status, Body -from typing import Dict, Any, List, Optional - -from ..models.llm.request import LLMRequest -from ..models.llm.response import LLMResponse, PromptListResponse, ContextListResponse -from ..models.llm.response import PromptInfo, ContextInfo, TokenUsage - -from src.core.services.llm_provider import LLMProvider -from src.core.services.directory_manager import DirectoryManager - -# Create router with prefix and tags -router = APIRouter(prefix="/v1", tags=["llm"]) - -# Initialize services -llm_provider = LLMProvider() -directory_manager = DirectoryManager() - -@router.get("/llm/prompts", response_model=PromptListResponse) -async def list_prompts(): - """List all available prompt templates""" - prompt_list = directory_manager.list_prompts() - prompts = [ - PromptInfo( - name=p["name"], - description=p["description"], - required_variables=p["required_variables"] - ) for p in prompt_list - ] - return PromptListResponse(prompts=prompts) - -@router.get("/llm/contexts", response_model=ContextListResponse) -async def list_contexts(): - """List all available contexts""" - context_list = directory_manager.list_contexts() - contexts = [ - ContextInfo( - name=c["name"], - description=c["description"] - ) for c in context_list - ] - return ContextListResponse(contexts=contexts) - -@router.get("/llm/models", response_model=List[Dict[str, str]]) -async def list_available_models(): - """List all available LLM models that can be used""" - return llm_provider.get_available_models() - -@router.post("/llm", response_model=LLMResponse) -async def process_llm_request(request: LLMRequest): - """Process a request with the LLM""" - try: - result = await llm_provider.generate( - prompt=request.prompt, - prompt_template=request.prompt_template, - context_name=request.context_name, - template_variables=request.template_variables, - system_message=request.system_message, - temperature=request.temperature, - max_tokens=request.max_tokens, - stop_sequences=request.stop_sequences - ) - - return LLMResponse( - response=result["response"], - usage=TokenUsage( - prompt_tokens=result["usage"]["prompt_tokens"], - completion_tokens=result["usage"]["completion_tokens"], - total_tokens=result["usage"]["total_tokens"] - ), - model=result["model"], - finish_reason=result["finish_reason"], - processing_time_ms=result["processing_time_ms"] - ) - except FileNotFoundError as e: - raise HTTPException( - status_code=status.HTTP_404_NOT_FOUND, - detail=str(e) - ) - except ValueError as e: - raise HTTPException( - status_code=status.HTTP_400_BAD_REQUEST, - detail=str(e) - ) - except Exception as e: - raise HTTPException( - status_code=status.HTTP_500_INTERNAL_SERVER_ERROR, - detail=f"LLM processing error: {str(e)}" - ) \ No newline at end of file diff --git a/src/core/contexts/engineering.json b/src/core/contexts/engineering.json deleted file mode 100644 index bc8d54e..0000000 --- a/src/core/contexts/engineering.json +++ /dev/null @@ -1,11 +0,0 @@ -{ - "description": "Engineering domain knowledge", - "data": { - "common_materials": ["steel", "aluminum", "titanium", "composites"], - "analysis_methods": ["FEA", "CFD", "experimental testing"], - "standards": 
{ - "mechanical": ["ASME", "ISO", "ASTM"], - "electrical": ["IEC", "IEEE", "UL"] - } - } - } \ No newline at end of file diff --git a/src/core/contexts/manufacturing/materials.json b/src/core/contexts/manufacturing/materials.json deleted file mode 100644 index def329a..0000000 --- a/src/core/contexts/manufacturing/materials.json +++ /dev/null @@ -1,21 +0,0 @@ -{ - "metals": { - "aluminum": { - "properties": ["lightweight", "corrosion-resistant", "conductive"], - "substitutes": ["titanium", "magnesium", "certain composites"], - "common_processes": ["extrusion", "casting", "machining"] - }, - "steel": { - "properties": ["strong", "durable", "magnetic"], - "substitutes": ["aluminum alloys", "titanium", "reinforced polymers"], - "common_processes": ["forging", "machining", "welding"] - } - }, - "polymers": { - "abs": { - "properties": ["impact-resistant", "thermoformable", "lightweight"], - "substitutes": ["pla", "petg", "polycarbonate"], - "common_processes": ["3d printing", "injection molding"] - } - } -} \ No newline at end of file diff --git a/src/core/prompts/default.txt b/src/core/prompts/default.txt deleted file mode 100644 index 07663a0..0000000 --- a/src/core/prompts/default.txt +++ /dev/null @@ -1,5 +0,0 @@ -You are an AI assistant helping with {{domain}} tasks. - -{{user_query}} - -Please provide a helpful response. \ No newline at end of file diff --git a/src/core/prompts/engineering.txt b/src/core/prompts/engineering.txt deleted file mode 100644 index 5b1ea86..0000000 --- a/src/core/prompts/engineering.txt +++ /dev/null @@ -1,10 +0,0 @@ -# Engineering analysis prompt -You are an engineering expert. Please analyze the following problem: - -{{prompt}} - -Additional context: -- Application: {{application}} -- Requirements: {{requirements}} - -Provide a detailed technical analysis with practical recommendations. \ No newline at end of file diff --git a/src/core/prompts/manufacturing/substitution.txt b/src/core/prompts/manufacturing/substitution.txt deleted file mode 100644 index 6ace03a..0000000 --- a/src/core/prompts/manufacturing/substitution.txt +++ /dev/null @@ -1,13 +0,0 @@ -You are a manufacturing expert. Given the following material or process requirement, -suggest possible substitutions that could work as alternatives: - -Requirement: {{requirement}} -Context: {{context}} - -For each alternative, provide: -1. Name of the alternative -2. Confidence score (0-1) indicating how well it substitutes -3. Brief explanation of why this could work -4. Any limitations or considerations - -Format your response as a list of alternatives with these details. 
\ No newline at end of file diff --git a/src/core/services/llm_provider.py b/src/core/services/llm_provider.py deleted file mode 100644 index c850deb..0000000 --- a/src/core/services/llm_provider.py +++ /dev/null @@ -1,257 +0,0 @@ -import time -import re -from typing import Dict, Any, List, Optional, Tuple -import os -import torch -from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline - -from .directory_manager import DirectoryManager - -class LLMProvider: - """Provider for LLM services using actual model calls""" - - def __init__(self, model_name: str = None): - """Initialize the LLM provider with model and directories""" - # Use environment variable or default to an open-source model that doesn't require login - # TinyLlama is a smaller model that's openly available - self.model_name = model_name or os.environ.get("LLM_MODEL", "TinyLlama/TinyLlama-1.1B-Chat-v1.0") - self.directory_manager = DirectoryManager() - self._initialize_model() - - def _initialize_model(self): - """Initialize the model and tokenizer with additional error handling""" - print(f"Initializing model: {self.model_name}") - - try: - # Set up device configuration - device = "cuda" if torch.cuda.is_available() else "cpu" - print(f"Using device: {device}") - - # Load tokenizer with fallbacks - try: - self.tokenizer = AutoTokenizer.from_pretrained(self.model_name) - except Exception as e: - print(f"Error loading tokenizer: {str(e)}") - print("Falling back to default tokenizer") - self.tokenizer = AutoTokenizer.from_pretrained("facebook/opt-125m") # Very small fallback - - # Load model with appropriate configuration - try: - if device == "cuda": - self.model = AutoModelForCausalLM.from_pretrained( - self.model_name, - device_map="auto", - torch_dtype=torch.float16, - low_cpu_mem_usage=True - ) - else: - # For CPU - self.model = AutoModelForCausalLM.from_pretrained( - self.model_name, - device_map={"": device}, - low_cpu_mem_usage=True - ) - - # Create generation pipeline - self.generator = pipeline( - "text-generation", - model=self.model, - tokenizer=self.tokenizer, - device=0 if device == "cuda" else -1 - ) - print(f"Successfully loaded model: {self.model_name}") - except Exception as e: - print(f"Error loading model: {str(e)}") - print("Falling back to text-only response mode") - self.model = None - self.generator = None - except Exception as e: - print(f"Critical error initializing LLM provider: {str(e)}") - print("API will continue to run, but LLM features will be unavailable") - self.model = None - self.tokenizer = None - self.generator = None - - # Flag to check if model is available - self.model_available = self.generator is not None - - def _format_prompt_for_model(self, prompt: str, system_message: Optional[str] = None) -> str: - """Format prompt based on the model architecture""" - # Check if model is a Llama variant - if "llama" in self.model_name.lower(): - if system_message: - # TinyLlama uses the same format as Llama-2 - return f"[INST] <>\n{system_message}\n<>\n\n{prompt} [/INST]" - else: - return f"[INST] {prompt} [/INST]" - # Add support for different model formats as needed - else: - # Generic format for other models - if system_message: - return f"{system_message}\n\n{prompt}" - else: - return prompt - - def _fill_template(self, template: str, variables: Dict[str, Any]) -> str: - """Fill a template with variables""" - result = template - for key, value in variables.items(): - pattern = r"{{" + re.escape(key) + r"}}" - result = re.sub(pattern, str(value), result) - return result - - def 
_prepare_context(self, context_name: Optional[str]) -> Dict[str, Any]: - """Prepare context data from file if specified""" - if not context_name: - return {} - - try: - return self.directory_manager.get_context(context_name) - except FileNotFoundError as e: - print(f"Warning: {str(e)}") - return {} - - def _prepare_prompt(self, - raw_prompt: str, - prompt_template: Optional[str], - template_variables: Dict[str, Any]) -> str: - """Prepare final prompt text using template if specified""" - if not prompt_template: - return raw_prompt - - try: - template, required_vars = self.directory_manager.get_prompt(prompt_template) - - # Check if all required variables are provided - missing_vars = [var for var in required_vars if var not in template_variables] - if missing_vars: - raise ValueError(f"Missing required template variables: {', '.join(missing_vars)}") - - # Add raw prompt to template variables if not overridden - if 'prompt' not in template_variables: - template_variables['prompt'] = raw_prompt - - return self._fill_template(template, template_variables) - except FileNotFoundError: - # Fall back to raw prompt if template not found - return raw_prompt - - async def generate(self, - prompt: str, - prompt_template: Optional[str] = None, - context_name: Optional[str] = None, - template_variables: Dict[str, Any] = None, - system_message: Optional[str] = None, - temperature: float = 0.7, - max_tokens: int = 1000, - stop_sequences: Optional[List[str]] = None) -> Dict[str, Any]: - """Generate a response using the LLM""" - start_time = time.time() - - # If model isn't available, return a fallback response - if not hasattr(self, "model_available") or not self.model_available: - return { - "response": f"LLM model is not available. I received your prompt: '{prompt}'", - "usage": { - "prompt_tokens": len(prompt.split()), - "completion_tokens": 0, - "total_tokens": len(prompt.split()) - }, - "model": "fallback_text_only", - "finish_reason": "model_unavailable", - "processing_time_ms": 0 - } - - if template_variables is None: - template_variables = {} - - # Get context data if specified - context_data = self._prepare_context(context_name) - - # Prepare the final prompt - prepared_prompt = self._prepare_prompt( - raw_prompt=prompt, - prompt_template=prompt_template, - template_variables=template_variables - ) - - # Format for the specific model - formatted_prompt = self._format_prompt_for_model( - prompt=prepared_prompt, - system_message=system_message - ) - - # Configure generation parameters - generation_config = { - "max_new_tokens": max_tokens, - "temperature": temperature, - "do_sample": temperature > 0, - "top_p": 0.95, - "top_k": 50, - "return_full_text": False, - } - - if stop_sequences: - generation_config["stop_sequences"] = stop_sequences - - # Perform actual model inference - try: - result = self.generator( - formatted_prompt, - **generation_config - ) - - # Extract generated text - generated_text = result[0]["generated_text"].strip() - - # Calculate token usage - input_tokens = len(self.tokenizer.encode(formatted_prompt)) - output_tokens = len(self.tokenizer.encode(generated_text)) - total_tokens = input_tokens + output_tokens - - end_time = time.time() - processing_time_ms = int((end_time - start_time) * 1000) - - return { - "response": generated_text, - "usage": { - "prompt_tokens": input_tokens, - "completion_tokens": output_tokens, - "total_tokens": total_tokens - }, - "model": self.model_name, - "finish_reason": result[0].get("finish_reason", "length"), - "processing_time_ms": 
processing_time_ms - } - - except Exception as e: - raise RuntimeError(f"Error generating LLM response: {str(e)}") - - def get_available_models(self) -> List[Dict[str, str]]: - """Return a list of recommended open-source models that can be used""" - return [ - { - "id": "TinyLlama/TinyLlama-1.1B-Chat-v1.0", - "name": "TinyLlama 1.1B Chat", - "description": "Small and fast Llama model for chat (1.1B parameters)", - "memory_required": "~2GB" - }, - { - "id": "microsoft/phi-2", - "name": "Microsoft Phi-2", - "description": "Small but capable 2.7B parameter model", - "memory_required": "~5GB" - }, - { - "id": "facebook/opt-1.3b", - "name": "OPT 1.3B", - "description": "Open Pre-trained Transformer 1.3B parameter model", - "memory_required": "~3GB" - }, - { - "id": "stabilityai/stablelm-base-alpha-3b", - "name": "StableLM 3B", - "description": "Base 3B parameter model from Stability AI", - "memory_required": "~6GB" - } - ] \ No newline at end of file From 72c5381d5654e78a7d15506ec674090ea203f23d Mon Sep 17 00:00:00 2001 From: Celestine Akpanoko Date: Tue, 20 May 2025 12:11:03 -0500 Subject: [PATCH 3/6] Implement llm endpoint to serve llm to client via in-house api --- requirements.txt | 9 +--- src/core/api/models/llm/request.py | 22 ++++---- src/core/api/models/llm/response.py | 38 +++---------- src/core/api/routes/llm.py | 83 +++++++++++++++++++++++++++++ src/core/main.py | 5 ++ 5 files changed, 107 insertions(+), 50 deletions(-) create mode 100644 src/core/api/routes/llm.py diff --git a/requirements.txt b/requirements.txt index 7d401ab..0ffda19 100644 --- a/requirements.txt +++ b/requirements.txt @@ -4,10 +4,5 @@ mkdocs-mermaid2-plugin networkx fastapi uvicorn -torch -transformers -accelerate>=0.21.0 -bitsandbytes>=0.41.0 -safetensors -sentencepiece -dotenv \ No newline at end of file +dotenv +groq \ No newline at end of file diff --git a/src/core/api/models/llm/request.py b/src/core/api/models/llm/request.py index 87c3316..c914c28 100644 --- a/src/core/api/models/llm/request.py +++ b/src/core/api/models/llm/request.py @@ -1,13 +1,13 @@ -from pydantic import BaseModel, Field -from typing import Dict, Any, Optional, List +from pydantic import BaseModel +from typing import Optional class LLMRequest(BaseModel): - """Request model for the LLM endpoint""" - prompt: str = Field(..., description="The prompt to send to the LLM") - prompt_template: Optional[str] = Field(None, description="Name of a prompt template to use") - context_name: Optional[str] = Field(None, description="Name of the context to apply") - template_variables: Dict[str, Any] = Field(default_factory=dict, description="Variables to fill in prompt template") - system_message: Optional[str] = Field(None, description="Optional system message to guide the LLM") - temperature: float = Field(0.7, ge=0.0, le=1.0, description="Temperature parameter (0-1)") - max_tokens: int = Field(1000, gt=0, description="Maximum tokens to generate") - stop_sequences: Optional[List[str]] = Field(None, description="Sequences that will stop generation") \ No newline at end of file + """ Request model for LLM API. 
""" + prompt: str + context: Optional[str] = None + model: Optional[str] = 'llama-3.1-8b-instant' + max_tokens: Optional[int] = 512 + temperature: Optional[float] = 0.7 + top_p: Optional[float] = 0.9 + stream: Optional[bool] = False + \ No newline at end of file diff --git a/src/core/api/models/llm/response.py b/src/core/api/models/llm/response.py index dfe97b8..ac9a1d0 100644 --- a/src/core/api/models/llm/response.py +++ b/src/core/api/models/llm/response.py @@ -1,35 +1,9 @@ -from pydantic import BaseModel, Field -from typing import Dict, Any, Optional, List - -class TokenUsage(BaseModel): - """Token usage information""" - prompt_tokens: int - completion_tokens: int - total_tokens: int +from pydantic import BaseModel +from typing import Optional class LLMResponse(BaseModel): - """Response model for the LLM endpoint""" - response: str = Field(..., description="The generated text from the LLM") - usage: TokenUsage = Field(..., description="Token usage statistics") - model: str = Field(..., description="Model used for generation") - finish_reason: Optional[str] = Field(None, description="Reason for completion") - processing_time_ms: int = Field(..., description="Processing time in milliseconds") - -class PromptInfo(BaseModel): - """Information about an available prompt template""" - name: str = Field(..., description="Name of the prompt template") - description: Optional[str] = Field(None, description="Brief description of the prompt") - required_variables: List[str] = Field(default_factory=list, description="Required template variables") - -class PromptListResponse(BaseModel): - """Response with available prompt templates""" - prompts: List[PromptInfo] = Field(..., description="Available prompt templates") - -class ContextInfo(BaseModel): - """Information about an available context""" - name: str = Field(..., description="Name of the context") - description: Optional[str] = Field(None, description="Brief description of the context") + """ Response model for LLM API. """ -class ContextListResponse(BaseModel): - """Response with available contexts""" - contexts: List[ContextInfo] = Field(..., description="Available contexts") \ No newline at end of file + response: str + model: str + usage: Optional[dict] = None \ No newline at end of file diff --git a/src/core/api/routes/llm.py b/src/core/api/routes/llm.py new file mode 100644 index 0000000..7ad836b --- /dev/null +++ b/src/core/api/routes/llm.py @@ -0,0 +1,83 @@ +from fastapi import APIRouter, HTTPException +from fastapi.responses import StreamingResponse +from groq import Groq +import os +from dotenv import load_dotenv +from src.core.api.models.llm.request import LLMRequest +from src.core.api.models.llm.response import LLMResponse + +load_dotenv() + +router = APIRouter(prefix="/v1/llm", tags=["llm"]) + +client = Groq(api_key=os.getenv("GROQ_API_KEY")) +if not client.api_key: + raise HTTPException(status_code=500, detail="Groq API key is not set.") + +@router.post("/", response_model=LLMResponse) +async def generate_response(request: LLMRequest): + """ Generate response the LLM provider based on prompt and optional context. + Supports streaming and non-streaming responses. 
""" + + try: + # combine prompt and context (if provided) in the request + messages = [] + if request.context: + messages.append({"role": "system", "content": request.context}) + messages.append({"role": "user", "content": request.prompt}) + + # Call the Groq API + response = client.chat.completions.create( + messages=messages, + model=request.model, + max_tokens=request.max_tokens, + temperature=request.temperature, + top_p=request.top_p, + stream=request.stream + ) + + # If streaming is enabled, return a streaming response + if request.stream: + def stream_response(): + for chunk in response: + content = chunk.choices[0].delta.content + if content: + yield content + yield "[DONE]" + return StreamingResponse(stream_response(), media_type="text/event-stream") + else: + # if not streaming, return the full response + full_response = response.choices[0].message.content + usage = ( + response.usage.dict() + if hasattr(response, "usage") and response.usage + else None + ) + return LLMResponse( + response=full_response, + model=response.model, + usage=usage + ) + + except Exception as e: + raise HTTPException(status_code=500, detail=f"Error communicating with LLM provider: {str(e)}") + + + +@router.get("/health") +async def health_check(): + """ Health check for the LLM Endpoint """ + + try: + # test a simple API call to verify Groq connection + response = client.chat.completions.create( + messages=[{"role": "user", "content": "Hello!"}], + model="llama-3.1-8b-instant", + max_tokens=10, + ) + return { + "status": "healthy" + } + + except Exception as e: + raise HTTPException(status_code=503, detail=f"LLM provider is not healthy: {str(e)}") \ No newline at end of file diff --git a/src/core/main.py b/src/core/main.py index 873d5af..c2a4e3c 100644 --- a/src/core/main.py +++ b/src/core/main.py @@ -14,6 +14,11 @@ async def health_check(): """Simple health check endpoint.""" return {"status": "ok", "domains": list(DomainRegistry._extractors.keys())} +# @app.get("/llm") +# async def llm_check(): +# """Simple LLM check endpoint.""" +# return {"status": "ok", "llm": "gpt-3.5-turbo"} + # Register routes app.include_router(match_router, tags=["matching"]) app.include_router(llm_router, tags=["llm"]) From b7031db17f1ac354af84ccf75e1d695a277d458b Mon Sep 17 00:00:00 2001 From: Celestine Akpanoko Date: Tue, 20 May 2025 14:31:27 -0500 Subject: [PATCH 4/6] Create test cases for unit, integration and end to end testing --- .gitignore | 2 + pytest.ini | 18 +++++++ tests/llm/conftest.py | 24 ++++++++++ tests/llm/e2e/__init__.py | 0 tests/llm/e2e/test_llm_workflow.py | 27 +++++++++++ tests/llm/integration/__init__.py | 0 tests/llm/integration/test_llm_routes.py | 25 ++++++++++ tests/llm/unit/__init__.py | 0 tests/llm/unit/test_llm_models.py | 60 ++++++++++++++++++++++++ 9 files changed, 156 insertions(+) create mode 100644 pytest.ini create mode 100644 tests/llm/conftest.py create mode 100644 tests/llm/e2e/__init__.py create mode 100644 tests/llm/e2e/test_llm_workflow.py create mode 100644 tests/llm/integration/__init__.py create mode 100644 tests/llm/integration/test_llm_routes.py create mode 100644 tests/llm/unit/__init__.py create mode 100644 tests/llm/unit/test_llm_models.py diff --git a/.gitignore b/.gitignore index cf6bd05..fa512c5 100644 --- a/.gitignore +++ b/.gitignore @@ -172,3 +172,5 @@ cython_debug/ # PyPI configuration file .pypirc + +*.txt \ No newline at end of file diff --git a/pytest.ini b/pytest.ini new file mode 100644 index 0000000..63a80a6 --- /dev/null +++ b/pytest.ini @@ -0,0 +1,18 @@ 
+[pytest] +# Configure asyncio for async tests +asyncio_mode = strict +asyncio_default_fixture_loop_scope = function + +# Test discovery patterns with wildcards +python_files = test_*.py +python_classes = Test* +python_functions = test_* + +# Make sure Python can find your modules +pythonpath = . + +# Recursive test discovery in all subdirectories +norecursedirs = .* venv build dist + +markers = + e2e: Mark end-to-end test \ No newline at end of file diff --git a/tests/llm/conftest.py b/tests/llm/conftest.py new file mode 100644 index 0000000..40c6f94 --- /dev/null +++ b/tests/llm/conftest.py @@ -0,0 +1,24 @@ +import pytest +from fastapi.testclient import TestClient +from src.core.main import app +from unittest.mock import MagicMock, patch + +@pytest.fixture +def client(): + """Fixture for FastAPI test client.""" + return TestClient(app) + +@pytest.fixture +def mock_groq(): + """Fixture to mock Groq client.""" + # Create a mock with the properly configured return value + mock_client = MagicMock() + + # Configure the mock to work with async code + mock_create = MagicMock() + # Make the mock.return_value property directly accessible (not awaitable) + mock_client.chat.completions.create.return_value = mock_create + + # Patch the client in the LLM route module + with patch("src.core.api.routes.llm.client", mock_client): + yield mock_client \ No newline at end of file diff --git a/tests/llm/e2e/__init__.py b/tests/llm/e2e/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/tests/llm/e2e/test_llm_workflow.py b/tests/llm/e2e/test_llm_workflow.py new file mode 100644 index 0000000..a6426c3 --- /dev/null +++ b/tests/llm/e2e/test_llm_workflow.py @@ -0,0 +1,27 @@ +import pytest +from fastapi.testclient import TestClient +from unittest.mock import MagicMock + +# Register the e2e marker to avoid the warning +pytest.mark.e2e = pytest.mark.skipif(False, reason="E2E test marker") + +@pytest.mark.e2e +def test_llm_workflow(client, mock_groq): + """End-to-end test for LLM workflow""" + # Setup mock response at the correct path in the mock structure + mock_response = mock_groq.chat.completions.create.return_value + mock_response.choices = [MagicMock()] + mock_response.choices[0].message = MagicMock() + mock_response.choices[0].message.content = '{"result": "processed data"}' + mock_response.model = "llama-3.1-8b-instant" + mock_response.usage = MagicMock() + mock_response.usage.dict.return_value = {"total_tokens": 20} + + # Call endpoint + response = client.post( + "/v1/llm/", + json={"prompt": "Process this text", "context": "Return JSON"} + ) + + assert response.status_code == 200 + assert "result" in response.json()["response"] diff --git a/tests/llm/integration/__init__.py b/tests/llm/integration/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/tests/llm/integration/test_llm_routes.py b/tests/llm/integration/test_llm_routes.py new file mode 100644 index 0000000..c871499 --- /dev/null +++ b/tests/llm/integration/test_llm_routes.py @@ -0,0 +1,25 @@ +import pytest +from fastapi.testclient import TestClient +from unittest.mock import MagicMock + +def test_llm_endpoint_non_streaming(client, mock_groq): + """Test non-streaming LLM endpoint""" + # Setup mock response at the correct place in the mock structure + mock_response = mock_groq.chat.completions.create.return_value + mock_response.choices = [MagicMock()] + mock_response.choices[0].message = MagicMock() + mock_response.choices[0].message.content = "Test response" + mock_response.model = "llama-3.1-8b-instant" + 
mock_response.usage = MagicMock() + mock_response.usage.dict.return_value = {"total_tokens": 10} + + # Make request to endpoint + response = client.post( + "/v1/llm/", + json={"prompt": "Test prompt"} + ) + + assert response.status_code == 200 + assert response.json()["response"] == "Test response" + +# Similar updates for other failing tests... \ No newline at end of file diff --git a/tests/llm/unit/__init__.py b/tests/llm/unit/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/tests/llm/unit/test_llm_models.py b/tests/llm/unit/test_llm_models.py new file mode 100644 index 0000000..6930ee1 --- /dev/null +++ b/tests/llm/unit/test_llm_models.py @@ -0,0 +1,60 @@ +import pytest +from src.core.api.models.llm.request import LLMRequest +from src.core.api.models.llm.response import LLMResponse + +def test_llm_request_valid(): + """Test valid LLMRequest model with all fields.""" + data = { + "prompt": "What is the capital of France?", + "context": "Provide a concise answer.", + "model": "llama-3.1-8b-instant", + "max_tokens": 100, + "temperature": 0.7, + "top_p": 0.95, + "stream": False + } + request = LLMRequest(**data) + assert request.prompt == "What is the capital of France?" + assert request.context == "Provide a concise answer." + assert request.model == "llama-3.1-8b-instant" + assert request.max_tokens == 100 + assert request.temperature == 0.7 + assert request.top_p == 0.95 + assert request.stream is False + +def test_llm_request_no_context(): + """Test LLMRequest with no context, using defaults.""" + data = {"prompt": "Test prompt"} + request = LLMRequest(**data) + assert request.prompt == "Test prompt" + assert request.context is None + assert request.model == "llama-3.1-8b-instant" + assert request.max_tokens == 512 + assert request.temperature == 0.7 + assert request.top_p == 0.9 + assert request.stream is False + +def test_llm_request_invalid_prompt(): + """Test LLMRequest with empty prompt raises error.""" + with pytest.raises(ValueError, match="prompt"): + LLMRequest(prompt="") + +def test_llm_response_valid(): + """Test valid LLMResponse model.""" + data = { + "response": "The capital is Paris.", + "model": "llama-3.1-8b-instant", + "usage": {"prompt_tokens": 10, "completion_tokens": 5, "total_tokens": 15} + } + response = LLMResponse(**data) + assert response.response == "The capital is Paris." 
+ assert response.model == "llama-3.1-8b-instant" + assert response.usage == {"prompt_tokens": 10, "completion_tokens": 5, "total_tokens": 15} + +def test_llm_response_no_usage(): + """Test LLMResponse with no usage data.""" + data = {"response": "Test response", "model": "llama-3.1-8b-instant"} + response = LLMResponse(**data) + assert response.response == "Test response" + assert response.model == "llama-3.1-8b-instant" + assert response.usage is None \ No newline at end of file From 91c0f60d77f024edac457e501e447cc229535975 Mon Sep 17 00:00:00 2001 From: Celestine Akpanoko Date: Tue, 20 May 2025 14:31:56 -0500 Subject: [PATCH 5/6] Fix failed test cases issues --- src/core/api/models/llm/request.py | 10 ++++++++-- src/core/api/routes/llm.py | 2 +- 2 files changed, 9 insertions(+), 3 deletions(-) diff --git a/src/core/api/models/llm/request.py b/src/core/api/models/llm/request.py index c914c28..5208fc6 100644 --- a/src/core/api/models/llm/request.py +++ b/src/core/api/models/llm/request.py @@ -1,4 +1,4 @@ -from pydantic import BaseModel +from pydantic import BaseModel, Field, field_validator from typing import Optional class LLMRequest(BaseModel): @@ -10,4 +10,10 @@ class LLMRequest(BaseModel): temperature: Optional[float] = 0.7 top_p: Optional[float] = 0.9 stream: Optional[bool] = False - \ No newline at end of file + + @field_validator('prompt') + @classmethod # This is now required in V2 + def prompt_must_not_be_empty(cls, v): + if not v or v.strip() == '': + raise ValueError('prompt cannot be empty') + return v diff --git a/src/core/api/routes/llm.py b/src/core/api/routes/llm.py index 7ad836b..1cdfaaa 100644 --- a/src/core/api/routes/llm.py +++ b/src/core/api/routes/llm.py @@ -43,7 +43,7 @@ def stream_response(): content = chunk.choices[0].delta.content if content: yield content - yield "[DONE]" + yield "[DONE]" return StreamingResponse(stream_response(), media_type="text/event-stream") else: # if not streaming, return the full response From 02e88c71ae2023ad3295ef3e27fad2fa5cd64016 Mon Sep 17 00:00:00 2001 From: Celestine Akpanoko Date: Tue, 20 May 2025 14:32:30 -0500 Subject: [PATCH 6/6] Document for PR and package requirement --- docs/api/llm.md | 101 +++++++++++++++++++++++++++++++++++++++++++++++ requirements.txt | 6 ++- 2 files changed, 106 insertions(+), 1 deletion(-) create mode 100644 docs/api/llm.md diff --git a/docs/api/llm.md b/docs/api/llm.md new file mode 100644 index 0000000..9761538 --- /dev/null +++ b/docs/api/llm.md @@ -0,0 +1,101 @@ +# LLM API +A FastAPI application for serving LLM inferences. + +## Features + +New: `/llm` POST endpoint for generic LLM inference with prompt and context, powered by Groq Cloud (LLaMA 3.1). +Models defined in `src/core/api/models/llm/request.py` and `src/core/api/models/llm/response.py`. + + +## API Documentation +### LLM Endpoint +`POST /llm` +Generate text using an LLM provider with a prompt and optional context. 
+Request Body +``` +{ + "prompt": "string", // Required: The main task or query for the LLM + "context": "string", // Optional: Supplementary information to guide the response + "model": "string", // Optional: LLM model (default: "llama-3.1-8b-instant") + "max_tokens": integer, // Optional: Maximum tokens in response (default: 512) + "temperature": float, // Optional: Sampling temperature (default: 0.8) + "top_p": float, // Optional: Top-p sampling (default: 0.95) + "stream": boolean // Optional: Enable streaming response (default: false) +} +``` +Responses + +200 OK (Non-Streaming): +``` +{ + "response": "string", // LLM-generated text + "model": "string", // Model used + "usage": { // Optional: Token usage + "prompt_tokens": integer, + "completion_tokens": integer, + "total_tokens": integer + } +} +``` + +200 OK (Streaming): Text/event-stream with chunks of text, ending with [DONE]. +500 Internal Server Error: If the LLM provider fails. + +Example +``` +curl -X POST http://localhost:8000/llm \ + -H "Content-Type: application/json" \ + -d '{ + "prompt": "What is the capital of Brazil?", + "context": "Answer concisely with no explanation.", + "model": "llama-3.1-8b-instant", + "max_tokens": 50, + "temperature": 0.7, + "stream": false + }' +``` + + +## Usage Guide +### Using the LLM Endpoint +The `/llm` endpoint allows clients to send a prompt and optional context to generate text using an LLM (e.g., LLaMA 3.1 via Groq Cloud). The request and response are defined in `src/core/api/models/llm/request.py` and `src/core/api/models/llm/response.py`, respectively. +### Example: Non-Streaming Request +Send a prompt to get a concise response: +``` +curl -X POST http://localhost:8000/llm \ + -H "Content-Type: application/json" \ + -d '{ + "prompt": "What is the capital of Brazil?", + "context": "Answer concisely with no explanation.", + "model": "llama-3.1-8b-instant", + "max_tokens": 50 + }' +``` +Response: +``` +{ + "response": "The capital is Brasília.", + "model": "llama-3.1-8b-instant", + "usage": { + "prompt_tokens": 12, + "completion_tokens": 6, + "total_tokens": 18 + } +} +``` + +### Example: Streaming Request +Stream a response for a creative task: +``` +curl -X POST http://localhost:8000/llm \ + -H "Content-Type: application/json" \ + -d '{ + "prompt": "Tell me a short story.", + "context": "The story should be about a dragon in a fantasy setting.", + "model": "llama-3.1-8b-instant", + "max_tokens": 100, + "stream": true + }' +``` +Response: Streams tokens like "Once ", "upon ", "a time...", ending with `[DONE]`. + diff --git a/requirements.txt b/requirements.txt index 0ffda19..b75aee0 100644 --- a/requirements.txt +++ b/requirements.txt @@ -5,4 +5,8 @@ networkx fastapi uvicorn dotenv -groq \ No newline at end of file +groq +pytest +pytest-asyncio +pytest-mock +httpx \ No newline at end of file
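
---

Beyond the curl examples added in docs/api/llm.md, a minimal Python client sketch may help when integrating against this PR. This is an illustration, not part of the patch: it assumes the server is running locally on port 8000 (as in the docs' examples) and that GROQ_API_KEY is set on the server. It targets `/v1/llm/`, the path produced by the router's `/v1/llm` prefix and the one exercised by the integration tests; httpx is already listed in requirements.txt for testing.

```
# Hypothetical client sketch for the non-streaming case.
import httpx

payload = {
    "prompt": "What is the capital of Brazil?",
    "context": "Answer concisely with no explanation.",
    "model": "llama-3.1-8b-instant",
    "max_tokens": 50,
    "temperature": 0.7,
    "stream": False,
}

with httpx.Client(base_url="http://localhost:8000") as client:
    resp = client.post("/v1/llm/", json=payload, timeout=30.0)
    resp.raise_for_status()
    data = resp.json()
    print(data["response"])  # generated text
    print(data["usage"])     # token usage dict, or None if the provider omits it
```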