diff --git a/app/infrastructure/redis_client.py b/app/infrastructure/redis_client.py index 3e2e22f..57e1ad6 100644 --- a/app/infrastructure/redis_client.py +++ b/app/infrastructure/redis_client.py @@ -5,7 +5,7 @@ @lru_cache -def get_redis() -> Redis[str]: +def get_redis() -> Redis: if settings.redis_url: return Redis.from_url( settings.redis_url, diff --git a/app/intent_classification/README.md b/app/intent_classification/README.md new file mode 100644 index 0000000..33a54af --- /dev/null +++ b/app/intent_classification/README.md @@ -0,0 +1,572 @@ +# Intent Classification Module + +Standalone module for classifying user intents in international student service conversations with comprehensive testing and fine-tuning capabilities. + +## Overview + +This module provides a complete intent classification system that categorizes caller intents into three types: + +- **SCAM**: Fraud attempts, malicious callers → Terminate call immediately +- **OPPORTUNITY**: Legitimate chances for students (interviews, jobs, research) → Collect student info to help them seize opportunities +- **OTHER**: Complex issues, messages, unclear intents → Human review + +### Key Features + +✅ **Automated Testing** with comprehensive metrics +✅ **Fine-Tuning Pipeline** for continuous improvement +✅ **Complete Documentation** with examples +✅ **FastAPI Integration** ready to use + +## Quick Start + +### Basic Usage + +```python +from intent_classification import IntentClassifier + +classifier = IntentClassifier() +result = await classifier.classify_intent("We'd like to invite you for a job interview. 
When are you available?") + +# Result: +# { +# "intent": "opportunity", +# "confidence": 0.95, +# "reasoning": "Legitimate job interview invitation requesting student availability", +# "metadata": { +# "matched_keywords": ["job interview", "available", "invite"], +# "matched_characteristics": ["Mentions of job interviews or interview invitations"] +# } +# } +``` + +### Run Tests + +```bash +cd /Users/markwang/Documents/Dispatch\ AI/backend/ai +./app/intent_classification/tests/run_tests.sh +``` + +This will: +1. Run all test cases +2. Generate performance metrics (Accuracy, Precision, Recall, F1) +3. Create confusion matrix +4. Identify misclassifications +5. Generate fine-tuning datasets + +## Module Structure + +``` +intent_classification/ +├── README.md # This file +├── __init__.py # Module exports (v2.0.0) +│ +├── definitions/ # Intent definitions +│ ├── __init__.py +│ ├── scam.md # SCAM intent documentation +│ ├── opportunity.md # OPPORTUNITY intent documentation +│ ├── other.md # OTHER intent documentation +│ └── intent_definitions.py # Python definition data structures +│ +├── models/ # Data models +│ ├── __init__.py +│ ├── intent_types.py # IntentType enum (SCAM, OPPORTUNITY, OTHER) +│ ├── requests.py # Request models +│ └── responses.py # Response models +│ +├── services/ # Classification logic +│ ├── __init__.py +│ ├── classifier.py # Main classifier with OpenAI integration +│ └── prompts.py # LLM system prompts +│ +├── tests/ # Testing & Fine-Tuning Suite ⭐ NEW +│ ├── __init__.py +│ ├── test_runner.py # Automated test runner with metrics +│ ├── fine_tuning.py # Fine-tuning data generator +│ ├── demo.py # Quick demonstration script +│ ├── run_tests.sh # Convenience script +│ ├── README.md # Complete testing documentation +│ ├── QUICK_START.md # Quick reference guide +│ ├── TESTING_SUMMARY.md # System overview +│ │ +│ ├── test_data/ # Test cases organized by intent +│ │ ├── __init__.py +│ │ ├── scam_cases.py # 15 SCAM test cases +│ │ ├── 
opportunity_cases.py # 15 OPPORTUNITY test cases +│ │ ├── other_cases.py # 15 OTHER test cases +│ │ └── edge_cases.py # 5 edge cases +│ │ +│ ├── results/ # Test results (auto-generated) +│ │ ├── metrics_*.json # Performance metrics +│ │ ├── report_*.txt # Human-readable reports +│ │ └── misclassified_*.json # Failed cases for analysis +│ │ +│ └── fine_tuning_data/ # Fine-tuning datasets (auto-generated) +│ ├── train_*.jsonl # Training data (OpenAI format) +│ └── validation_*.jsonl # Validation data +│ +└── api/ # API endpoints + ├── __init__.py + └── routes.py # FastAPI routes +``` + +## Intent Types + +### 1. SCAM + +**Purpose**: Detect and terminate fraud attempts + +**Characteristics**: +- Requests for money transfers or immediate payments +- Impersonating government agencies, banks, or authorities +- Threatening or intimidating language +- Requesting sensitive information (bank details, passwords) +- High-pressure tactics + +**Examples**: +- "This is the Tax Office. You have unpaid taxes and must pay immediately..." +- "Your bank account has been frozen. Provide your password..." + +**Action**: Terminate call immediately, log incident + +[Full documentation](./definitions/scam.md) + +### 2. OPPORTUNITY + +**Purpose**: Capture legitimate chances for students (interviews, jobs, research, internships) + +**Characteristics**: +- Job interviews or interview invitations +- Employment opportunities or job offers +- Research opportunities or academic collaborations +- Internship positions or traineeships +- Networking events or professional development +- Scholarship or fellowship opportunities +- Requests for student availability or contact information + +**Examples**: +- "We'd like to invite you for a job interview. When are you available?" +- "Our company has an internship position available." +- "I'm a professor looking for research assistants." +- "There's a career fair on campus. Can we schedule a meeting?" 
+ +**Action**: Collect student availability, email, contact details to help them seize opportunities + +**Important**: OPPORTUNITY distinguishes from SCAM by never requesting money/payments upfront + +[Full documentation](./definitions/opportunity.md) + +### 3. OTHER + +**Purpose**: Catch-all for complex cases requiring human handling + +**Characteristics**: +- Message leaving or callback requests +- Complex or unique situations +- Complaints or disputes +- Unclear or ambiguous intents +- Special circumstances + +**Examples**: +- "I'd like to leave a message" +- "Can someone call me back later?" +- "I have a complex visa situation..." + +**Action**: Preserve call summary, queue for human review + +[Full documentation](./definitions/other.md) + + +## API Endpoints + +### POST /api/ai/intent/classify + +Classify user intent from a message. + +**Request**: +```json +{ + "currentMessage": "We have an internship position available for you.", + "callSid": "CA123...", // Optional + "messages": [...] // Optional conversation history +} +``` + +**Response**: +```json +{ + "intent": "opportunity", + "confidence": 0.95, + "reasoning": "Legitimate internship opportunity being offered to student", + "metadata": { + "matched_keywords": ["internship", "position", "available"], + "matched_characteristics": ["Internship positions or traineeships"] + } +} +``` + +### GET /api/ai/intent/definitions + +Get all intent definitions with characteristics, examples, and keywords. + +**Response**: +```json +{ + "scam": { "name": "scam", "description": "...", ... }, + "opportunity": { "name": "opportunity", "description": "...", ... }, + "other": { "name": "other", "description": "...", ... } +} +``` + +### POST /api/ai/intent/test + +Run the classifier against all test cases and get accuracy metrics. 
+ +**Response**: +```json +{ + "total_tests": 45, + "passed": 42, + "failed": 3, + "accuracy": 0.933, + "by_intent": { + "scam": {"tests": 15, "passed": 14, "accuracy": 0.93}, + "opportunity": {"tests": 15, "passed": 15, "accuracy": 1.0}, + "other": {"tests": 15, "passed": 13, "accuracy": 0.87} + } +} +``` + +### GET /api/ai/intent/health + +Health check endpoint for the service. + +## Usage Examples + +### Basic Classification + +```python +from intent_classification import IntentClassifier, IntentType + +classifier = IntentClassifier() + +# Classify a message +result = await classifier.classify_intent("We have a job interview for you next week") + +if result["intent"] == IntentType.OPPORTUNITY.value: + # Collect student info (availability, email, etc.) + collect_student_info(result) +elif result["intent"] == IntentType.SCAM.value: + # Terminate call + terminate_call() +elif result["intent"] == IntentType.OTHER.value: + # Queue for human review + queue_for_review(result) +``` + +### With Conversation History + +```python +# Provide conversation context for better accuracy +result = await classifier.classify_intent( + current_message="Can you help me?", + message_history=[ + {"role": "user", "content": "Hi"}, + {"role": "assistant", "content": "Hello! 
How can I help you?"} + ] +) +``` + +### Access Intent Definitions + +```python +from intent_classification import ( + get_scam_definition, + get_opportunity_definition, + get_other_definition +) + +opportunity_def = get_opportunity_definition() +print(opportunity_def["name"]) # "opportunity" +print(opportunity_def["characteristics"]) # List of characteristics +print(opportunity_def["positive_examples"]) # Example messages +print(opportunity_def["keywords"]) # List of keywords +``` + +### Programmatic Testing + +```python +from intent_classification.tests.test_runner import IntentTestRunner + +runner = IntentTestRunner() +metrics = await runner.run_all_tests() + +print(f"Accuracy: {metrics['summary']['accuracy']:.1%}") +print(f"F1 Score: {metrics['summary']['macro_f1']:.1%}") + +# Access per-intent metrics +scam_metrics = metrics['per_intent']['scam'] +print(f"SCAM Recall: {scam_metrics['recall']:.1%}") +``` + +## Classification Approach + +The module uses a **conservative classification approach**: + +1. **SCAM detection** is high priority - clear fraud indicators lead to immediate termination +2. **OPPORTUNITY** captures legitimate chances for students (interviews, jobs, research) +3. **OTHER** is the default fallback - when in doubt, route to human review + +**Decision Tree**: +``` +Is it clearly a fraud attempt? +├─ Yes → SCAM +└─ No → Continue + +Is it a legitimate opportunity (interview, job, research)? +├─ Yes → OPPORTUNITY +└─ No → Continue + +Is it complex, personalized, or unclear? 
+├─ Yes → OTHER +└─ Unsure → OTHER (default) +``` + +## Configuration + +### Environment Variables + +```bash +# Required +OPENAI_API_KEY=your_api_key + +# Optional (defaults shown) +OPENAI_MODEL=gpt-4o-mini +OPENAI_MAX_TOKENS=2500 +OPENAI_TEMPERATURE=0.0 +``` + +### Model Configuration + +The classifier uses OpenAI GPT models with: +- **Temperature**: 0.3 (for consistent classification) +- **Max Tokens**: 500 +- **Response Format**: JSON mode +- **Model**: gpt-4o-mini (default) or fine-tuned variant + +## Development + +### Adding New Test Cases + +Add test cases to the appropriate file in `tests/test_data/`: + +```python +# tests/test_data/opportunity_cases.py +OPPORTUNITY_TEST_CASES.append({ + "id": "opportunity_016", + "message": "We're offering a research fellowship. Can we discuss?", + "expected_intent": "opportunity", + "min_confidence": 0.85, + "description": "Research fellowship offer" +}) +``` + +After adding cases, run tests to validate: + +```bash +./app/intent_classification/tests/run_tests.sh +``` + +### Modifying Intent Definitions + +To update intent definitions: + +1. Update markdown file in `definitions/` (for documentation) +2. Update corresponding function in `definitions/intent_definitions.py` +3. Update system prompts in `services/prompts.py` if needed +4. Run tests to validate changes + +```bash +# After changes +./app/intent_classification/tests/run_tests.sh + +# Check accuracy hasn't degraded +cat app/intent_classification/tests/results/report_*.txt +``` + +### Running Tests Programmatically + +```python +# Test specific intent +from intent_classification.tests.test_data import OPPORTUNITY_TEST_CASES +from intent_classification import intent_classifier + +for test_case in OPPORTUNITY_TEST_CASES: + result = await intent_classifier.classify_intent(test_case["message"]) + is_correct = result["intent"] == test_case["expected_intent"] + print(f"{test_case['id']}: {'✅' if is_correct else '❌'} {result['intent']}") +``` + +## Best Practices + +1. 
**Always provide context**: Include conversation history when available for better accuracy +2. **Monitor confidence scores**: Low confidence (<0.7) may indicate ambiguous cases +3. **Review OTHER cases**: Regularly review messages classified as OTHER to identify patterns +4. **Update definitions**: Refine intent definitions based on real-world usage +5. **Test regularly**: Run the test suite after any changes to definitions or prompts +6. **Track metrics**: Monitor accuracy, precision, and recall in production +7. **Fine-tune when needed**: If accuracy drops below 90%, consider fine-tuning + +## Integration + +### With FastAPI Application + +```python +# In main.py +from intent_classification.api import router as intent_router + +app.include_router(intent_router, prefix="/api/ai") +``` + +### Standalone Usage + +```python +from intent_classification import ( + IntentClassifier, + IntentType, + IntentClassificationRequest, + IntentClassificationResponse +) + +# Initialize classifier +classifier = IntentClassifier() + +# Classify message +result = await classifier.classify_intent("Your message here") + +# Access result +intent = result["intent"] +confidence = result["confidence"] +``` + +## Performance Monitoring + +### Key Metrics to Track + +| Metric | Excellent | Good | Needs Attention | +|--------|-----------|------|-----------------| +| **Overall Accuracy** | > 95% | 90-95% | < 90% | +| **SCAM Recall** | 100% | 95-99% | < 95% ⚠️ | +| **OPPORTUNITY Precision** | > 90% | 85-90% | < 85% | +| **F1 Score (all intents)** | > 0.90 | 0.85-0.90 | < 0.85 | + +**Critical**: SCAM recall below 95% is dangerous: it means fraud attempts are being missed! 
+ +### When to Fine-Tune + +Consider fine-tuning if: +- Overall accuracy drops below 90% +- SCAM recall falls below 95% +- OPPORTUNITY precision falls below 85% +- High number of misclassifications in production +- New types of messages appear frequently + +## Troubleshooting + +### Low Accuracy + +**Symptoms**: Test accuracy < 90% + +**Solutions**: +1. Review failed test cases in `tests/results/misclassified_*.json` +2. Check intent definitions for clarity +3. Update system prompts with better examples +4. Generate fine-tuning data and train model +5. Add more test cases for weak areas + +### High False Positive Rate (SCAM) + +**Symptoms**: Non-scam messages classified as SCAM + +**Solutions**: +1. Review SCAM definition for over-broad keywords +2. Add negative examples to SCAM definition +3. Increase confidence threshold for SCAM classification +4. Fine-tune with correction examples + +### Missing Opportunities + +**Symptoms**: OPPORTUNITY messages classified as OTHER + +**Solutions**: +1. Expand OPPORTUNITY keywords list +2. Add more positive examples +3. Review OTHER vs OPPORTUNITY boundary cases +4. Fine-tune with OPPORTUNITY emphasis + +### Import Errors + +**Symptoms**: `ModuleNotFoundError` or import failures + +**Solutions**: +```bash +# Ensure in correct directory +cd /Users/markwang/Documents/Dispatch\ AI/backend/ai + +# Check Python path +export PYTHONPATH="${PYTHONPATH}:$(pwd)/app" + +# Verify dependencies +uv sync +``` + +### API Errors + +**Symptoms**: OpenAI API failures + +**Solutions**: +1. Check API key: `echo $OPENAI_API_KEY` +2. Verify model availability +3. Check rate limits +4. 
Review error logs in test results + +## Documentation + +### Main Documentation +- **This file**: Module overview and API reference +- `tests/README.md`: Complete testing guide +- `tests/QUICK_START.md`: Quick reference +- `tests/TESTING_SUMMARY.md`: System overview + +### Intent Definitions +- `definitions/scam.md`: SCAM intent documentation +- `definitions/opportunity.md`: OPPORTUNITY intent documentation +- `definitions/other.md`: OTHER intent documentation + +### Test Results +- `tests/results/report_*.txt`: Human-readable test reports +- `tests/results/metrics_*.json`: Detailed metrics in JSON +- `tests/results/misclassified_*.json`: Failed cases for analysis + +## License + +Internal use only. Part of Dispatch AI backend system. + +## Support + +### For Testing Issues +See `tests/README.md` for detailed testing documentation. + +### For Integration Issues +Contact the development team. + +### For Performance Issues +1. Run tests: `./app/intent_classification/tests/run_tests.sh` +2. Review metrics in `tests/results/` +3. Check logs for errors +4. Contact development team with test results + +--- + +**Status**: 🔧 In Development +**Version**: 2.0.0 +**Last Updated**: 2025-10-22 diff --git a/app/intent_classification/__init__.py b/app/intent_classification/__init__.py new file mode 100644 index 0000000..ee48147 --- /dev/null +++ b/app/intent_classification/__init__.py @@ -0,0 +1,38 @@ +""" +Intent Classification Module + +Standalone module for classifying user intents in student service conversations. 
+ +Intents: +- SCAM: Fraud attempts, malicious callers +- OPPORTUNITY: Legitimate job/research/academic opportunities for students +- OTHER: Complex issues, messages, unclear intents requiring human handling + +Usage: + from intent_classification import IntentClassifier, IntentType + + classifier = IntentClassifier() + result = await classifier.classify_intent("We have a job interview for you") +""" + +from .models.intent_types import IntentType +from .models.requests import IntentClassificationRequest +from .models.responses import IntentClassificationResponse +from .services.classifier import IntentClassifier +from .definitions.intent_definitions import ( + get_scam_definition, + get_opportunity_definition, + get_other_definition +) + +__version__ = "2.0.0" + +__all__ = [ + "IntentType", + "IntentClassificationRequest", + "IntentClassificationResponse", + "IntentClassifier", + "get_scam_definition", + "get_opportunity_definition", + "get_other_definition", +] diff --git a/app/intent_classification/api/__init__.py b/app/intent_classification/api/__init__.py new file mode 100644 index 0000000..9d211c8 --- /dev/null +++ b/app/intent_classification/api/__init__.py @@ -0,0 +1,9 @@ +""" +Intent Classification API + +FastAPI routes for intent classification endpoints. +""" + +from .routes import router + +__all__ = ["router"] diff --git a/app/intent_classification/api/routes.py b/app/intent_classification/api/routes.py new file mode 100644 index 0000000..e392d44 --- /dev/null +++ b/app/intent_classification/api/routes.py @@ -0,0 +1,249 @@ +""" +Intent Classification API Routes + +FastAPI endpoints for intent classification. +Provides classification, testing, and definition retrieval endpoints. 
+""" + +from fastapi import APIRouter, HTTPException +from typing import Dict, Any + +# Import from intent_classification module +from ..models.requests import IntentClassificationRequest +from ..models.responses import IntentClassificationResponse, IntentDefinition +from ..services.classifier import intent_classifier +from ..definitions.intent_definitions import ( + get_scam_definition, + get_opportunity_definition, + get_other_definition +) +from ..tests.test_data import ALL_TEST_CASES + + +router = APIRouter( + prefix="/intent", + tags=["Intent Classification"], + responses={404: {"description": "Not found"}}, +) + + +@router.post("/classify", response_model=IntentClassificationResponse) +async def classify_intent(data: IntentClassificationRequest): + """Classify user intent from conversation + + Standalone endpoint that classifies user intent without affecting existing workflows. + + Args: + data: Request containing current message and optional conversation history + + Returns: + IntentClassificationResponse with intent, confidence, reasoning, and metadata + + Raises: + HTTPException: 500 if classification or response construction fails + + Example: + ``` + POST /api/ai/intent/classify + { + "currentMessage": "We'd like to invite you for a job interview next week." 
+ } + + Response: + { + "intent": "opportunity", + "confidence": 0.92, + "reasoning": "Legitimate job interview invitation", + "metadata": { + "matched_keywords": ["interview", "job"], + "matched_characteristics": ["Mentions of job interviews or interview invitations"] + } + } + ``` + """ + try: + # Classify intent + result = await intent_classifier.classify_intent( + current_message=data.currentMessage, + message_history=data.messages, + call_sid=data.callSid + ) + + return IntentClassificationResponse(**result) + + except Exception as e: + # Chain the original error so the traceback keeps the root cause + raise HTTPException( + status_code=500, + detail=f"Intent classification failed: {str(e)}" + ) from e + + +@router.get("/definitions") +async def get_intent_definitions() -> Dict[str, IntentDefinition]: + """Get all intent definitions + + Returns the definition templates for all supported intent types. + Useful for documentation and understanding classification criteria. + + Returns: + Dictionary of intent definitions with characteristics, examples, and keywords + + Example: + ``` + GET /api/ai/intent/definitions + + Response: + { + "scam": { + "name": "scam", + "description": "Scam calls or malicious callers attempting fraud", + "characteristics": [...], + "positive_examples": [...], + "negative_examples": [...], + "keywords": [...] + }, + "opportunity": { + "name": "opportunity", + "description": "Legitimate opportunities for students (interviews, jobs, research)", + "characteristics": [...], + "positive_examples": [...], + "negative_examples": [...], + "keywords": [...] + }, + "other": { + "name": "other", + "description": "Unrecognized intents requiring human handling", + "characteristics": [...], + "positive_examples": [...], + "negative_examples": [...], + "keywords": [...] 
+ } + } + ``` + """ + return { + "scam": IntentDefinition(**get_scam_definition()), + "opportunity": IntentDefinition(**get_opportunity_definition()), + "other": IntentDefinition(**get_other_definition()) + } + + +@router.post("/test") +async def test_intent_classifier() -> Dict[str, Any]: + """Test intent classifier with predefined test cases + + Runs the classifier against all test cases and returns accuracy metrics. + Useful for validating classifier performance and identifying issues. + + A case passes only when the predicted intent equals the expected intent and + the confidence reaches the case's min_confidence. The "edge_cases" group is + skipped, and a case whose classification raises is counted as failed. + + Returns: + Test results with overall accuracy and per-intent breakdown + + Example: + ``` + POST /api/ai/intent/test + + Response: + { + "total_tests": 45, + "passed": 42, + "failed": 3, + "accuracy": 0.933, + "by_intent": { + "scam": {"tests": 15, "passed": 14, "accuracy": 0.93}, + "opportunity": {"tests": 15, "passed": 15, "accuracy": 1.0}, + "other": {"tests": 15, "passed": 13, "accuracy": 0.87} + }, + "failed_cases": [...] 
+ } + ``` + """ + # NOTE(review): this route issues one classifier call per test case and has no + # visible auth guard - confirm it is meant to be exposed outside development + results = { + "total_tests": 0, + "passed": 0, + "failed": 0, + "accuracy": 0.0, + "by_intent": {}, + "failed_cases": [] + } + + # Test each intent category + for intent_type, test_cases in ALL_TEST_CASES.items(): + if intent_type == "edge_cases": + # Edge cases are not scored by this endpoint + continue + + intent_results = { + "tests": len(test_cases), + "passed": 0, + "failed": 0, + "accuracy": 0.0, + "failed_cases": [] + } + + for test_case in test_cases: + results["total_tests"] += 1 + + try: + # Run classification + result = await intent_classifier.classify_intent( + current_message=test_case["message"] + ) + + # Check if classification matches expected + is_correct = ( + result["intent"] == test_case["expected_intent"] and + result["confidence"] >= test_case["min_confidence"] + ) + + if is_correct: + results["passed"] += 1 + intent_results["passed"] += 1 + else: + results["failed"] += 1 + intent_results["failed"] += 1 + + # Record failure + failure_info = { + "test_id": test_case["id"], + "description": test_case["description"], + "message": 
test_case["message"], + "expected_intent": test_case["expected_intent"], + "actual_intent": result["intent"], + "expected_min_confidence": test_case["min_confidence"], + "actual_confidence": result["confidence"], + "reasoning": result["reasoning"] + } + results["failed_cases"].append(failure_info) + intent_results["failed_cases"].append(failure_info) + + except Exception as e: + results["failed"] += 1 + intent_results["failed"] += 1 + + # Record the error in both the overall and the per-intent list, + # the same way misclassifications are recorded above + error_info = { + "test_id": test_case["id"], + "description": test_case["description"], + "error": str(e) + } + results["failed_cases"].append(error_info) + intent_results["failed_cases"].append(error_info) + + # Calculate intent-specific accuracy + if intent_results["tests"] > 0: + intent_results["accuracy"] = intent_results["passed"] / intent_results["tests"] + + results["by_intent"][intent_type] = intent_results + + # Calculate overall accuracy + if results["total_tests"] > 0: + results["accuracy"] = results["passed"] / results["total_tests"] + + return results + + +@router.get("/health") +async def health_check(): + """Health check endpoint for intent classification service + + Returns: + Dict with "status", "service" and "version" keys + """ + return { + "status": "healthy", + "service": "intent_classification", + # Matches the package __version__ ("2.0.0") + "version": "2.0.0" + } diff --git a/app/intent_classification/definitions/__init__.py b/app/intent_classification/definitions/__init__.py new file mode 100644 index 0000000..2183e64 --- /dev/null +++ b/app/intent_classification/definitions/__init__.py @@ -0,0 +1,18 @@ +""" +Intent Definitions Module + +Contains intent definitions in both markdown format (for documentation) +and Python format (for programmatic access). 
+""" + +from .intent_definitions import ( + get_scam_definition, + get_opportunity_definition, + get_other_definition +) + +__all__ = [ + "get_scam_definition", + "get_opportunity_definition", + "get_other_definition", +] diff --git a/app/intent_classification/definitions/intent_definitions.py b/app/intent_classification/definitions/intent_definitions.py new file mode 100644 index 0000000..0b6965b --- /dev/null +++ b/app/intent_classification/definitions/intent_definitions.py @@ -0,0 +1,164 @@ +""" +Intent Definition Data Structures + +Provides structured access to intent definitions. +Definitions are maintained in markdown files for documentation, +and this module provides programmatic access to the data. +""" + +from typing import Dict, Any + + +def get_scam_definition() -> Dict[str, Any]: + """Get SCAM intent definition + + Builds and returns a new dict on every call. + + Returns: + dict: Intent definition with the keys "name", "description", + "characteristics", "positive_examples", "negative_examples" and "keywords" + """ + return { + "name": "scam", + "description": "Scam calls or malicious callers attempting fraud", + "characteristics": [ + "Requests for money transfers or payments (non-service related)", + "Impersonating government agencies, banks, or tax offices", + "Using threatening or intimidating language", + "Requesting sensitive information (bank account, password, credit card)", + "Claims of urgent situations requiring immediate action", + "Mentions of legal action, arrest, fines, or penalties", + "Claims that accounts are frozen or compromised", + "Requests to purchase gift cards or prepaid cards", + "High-pressure sales tactics or threats" + ], + "positive_examples": [ + "This is the Australian Tax Office. You have unpaid taxes and must pay immediately or face prosecution.", + "Your bank account has been frozen. Please provide your account number and password to unlock it.", + "This is the police department. 
You are suspected of money laundering and must transfer bail money immediately.", + "Congratulations, you've won the lottery! 
But you need to pay processing fees first to claim your prize.", + "Your parcel has been seized by customs. You must pay a fine immediately or it will be destroyed.", + "This is Medicare. Your account has suspicious activity. Please provide your bank details immediately.", + "Your internet will be disconnected in 24 hours. Call this number to make immediate payment.", + "I'm your grandson. I'm in trouble and need money urgently. Please transfer immediately." + ], + # Messages that must NOT be classified as scam + "negative_examples": [ + "What are your office hours?", + "I'd like to leave a message", + "I have a question about my enrollment", + "Can someone call me back?", + "I need help with my application" + ], + "keywords": [ + "tax office", "ATO", "police", "arrest", "fine", "transfer money", + "bank account", "password", "urgent", "immediate payment", "account frozen", + "lottery", "won prize", "customs", "Medicare", "Centrelink", + "gift card", "iTunes card", "threat", "lawsuit", "court", + "fraud", "scam", "suspicious", "verify identity" + ] + } + + +def get_opportunity_definition() -> Dict[str, Any]: + """Get OPPORTUNITY intent definition + + Builds and returns a new dict on every call. 
+ + Returns: + dict: Intent definition with the keys "name", "description", + "characteristics", "positive_examples", "negative_examples" and "keywords" + """ + return { + "name": "opportunity", + "description": "Legitimate opportunities for students including interviews, jobs, research positions, and academic engagements", + # NOTE(review): definitions/opportunity.md also lists academic presentation / + # conference invitations and workshop invitations as characteristics - + # confirm whether they should be mirrored in this list + "characteristics": [ + "Mentions of job interviews or interview invitations", + "Employment opportunities or job offers", + "Research opportunities or academic collaborations", + "Internship positions or traineeships", + "Networking events or professional meetups", + "Career fairs or recruitment events", + "Scholarship or fellowship opportunities", + "Requests for availability to schedule important meetings", + "Legitimate requests for contact information to follow up on opportunities", + "Questions about student's skills, qualifications, or experience", + "Offers of mentorship or professional guidance" + ], + 
"positive_examples": [ + "We'd like to invite you for a job interview next week. When are you available?", + "Our company has an internship position available. Are you interested?", + "I'm a professor looking for research assistants. Would you like to discuss this opportunity?", + "There's a career fair on campus next Friday. Can we schedule a time to meet?", + "We received your application and would like to schedule an interview.", + "Our startup is hiring international students. Can we send you more information?", + "I'm organizing a networking event for tech professionals. Would you like to attend?", + "We have a scholarship opportunity for international students.", + "Our lab is looking for graduate research assistants. Can we set up a meeting?", + "There's an internship opening in our marketing department.", + "We're hosting a workshop on career development. Would you like to join?", + "I'd like to discuss a potential research collaboration. What's your email address?" + ], + # Messages that must NOT be classified as opportunity (scam-like offers and plain requests) + "negative_examples": [ + "Transfer money immediately for a job offer", + "You won a lottery prize, pay processing fee", + "What are your office hours?", + "I'd like to leave a message", + "Can someone call me back?", + "Send your bank details for salary advance" + ], + "keywords": [ + "interview", "job interview", "employment", "job opportunity", "job offer", + "hiring", "recruitment", "recruiting", "career", "internship", "intern position", + "research opportunity", "research assistant", "RA position", "PhD opportunity", + "scholarship", "fellowship", "grant", "mentorship", "mentor", + "networking event", "career fair", "job fair", "recruitment event", + "workshop", "seminar", "conference", "presentation", "collaboration", + "research collaboration", "academic opportunity", "professional development", + "available", "availability", "when are you free", "schedule meeting", + "send information", "contact 
information", "email address", + "skills", "qualifications", "experience", "CV", 
"resume", + "interested", "apply", "position available", "opening", "vacancy" + ] + } + + +def get_other_definition() -> Dict[str, Any]: + """Get OTHER intent definition + + Builds and returns a new dict on every call. + + Returns: + dict: Intent definition with the keys "name", "description", + "characteristics", "positive_examples", "negative_examples" and "keywords" + """ + return { + "name": "other", + "description": "Unrecognized intents requiring human handling (complex issues, messages, unclear requests)", + "characteristics": [ + "Complex or unique situations requiring individual attention", + "Requests that don't fit standard questions or opportunities", + "Wanting to leave a message or callback request", + "Unclear or ambiguous intent", + "Personal circumstances requiring case-by-case handling", + "Complaints or disputes", + "Special requests or exceptions", + "Currently unavailable and needs human follow-up" + ], + "positive_examples": [ + "I'd like to leave a message", + "Can someone call me back later?", + "I have a special situation regarding my visa that needs discussion", + "I can't talk now, but I need to leave some information", + "I need to speak with someone about a complex enrollment issue", + "This is regarding a personal matter that requires individual attention", + "I'm not satisfied with how my case was handled", + "Can I schedule a one-on-one consultation?", + "I have multiple questions that need detailed discussion", + "My situation is complicated and I need personalized help" + ], + # Messages that belong to OPPORTUNITY or SCAM rather than OTHER + "negative_examples": [ + "We'd like to invite you for a job interview", + "Transfer money immediately", + "Our company has an internship position available", + "I'm looking for research assistants", + "You have unpaid taxes, pay immediately" + ], + "keywords": [ + "leave message", "callback", "call back", "special case", + "complex", "individual attention", "personal", "discuss", + "consultation", "unique 
situation", "exception", "complaint", + "not standard", "busy", "later", "follow up", + "speak with someone", "detailed", "complicated" + ] + } diff --git 
a/app/intent_classification/definitions/opportunity.md b/app/intent_classification/definitions/opportunity.md new file mode 100644 index 0000000..1d4b7d9 --- /dev/null +++ b/app/intent_classification/definitions/opportunity.md @@ -0,0 +1,221 @@ +# OPPORTUNITY Intent Definition + +## Overview +Legitimate opportunities for international students, including job interviews, employment offers, research positions, internships, and other career/academic opportunities that require timely response and coordination. + +## Description +This intent identifies genuine opportunities (non-scam) that international students should consider and respond to. These include job interviews, employment opportunities, research collaborations, internships, networking events, and other legitimate professional/academic engagements. The system will help students capture these opportunities by collecting their availability, contact information, or scheduling follow-up actions. + +## Characteristics + +- Mentions of **job interviews** or interview invitations +- **Employment opportunities** or job offers +- **Research opportunities** or academic collaborations +- **Internship** positions or traineeships +- **Networking events** or professional meetups +- **Career fairs** or recruitment events +- **Scholarship** or fellowship opportunities +- **Academic presentations** or conference invitations +- Requests for **availability** to schedule important meetings +- Legitimate requests for **contact information** to follow up on opportunities +- Questions about student's **skills, qualifications, or experience** +- Offers of **mentorship** or professional guidance +- Invitations to **workshops** or professional development events + +## Positive Examples + +These messages **SHOULD** be classified as OPPORTUNITY: + +1. "We'd like to invite you for a job interview next week. When are you available?" +2. "Our company has an internship position available. Are you interested in applying?" +3. 
"I'm a professor looking for research assistants. Would you like to discuss this opportunity?" +4. "There's a career fair on campus next Friday. Can we schedule a time to meet?" +5. "We received your application and would like to schedule an interview. What times work for you?" +6. "Our startup is hiring international students. Can we send you more information?" +7. "I'm organizing a networking event for tech professionals. Would you like to attend?" +8. "We have a scholarship opportunity for international students. Are you interested?" +9. "Our lab is looking for graduate research assistants. Can we set up a meeting?" +10. "There's an internship opening in our marketing department. When can we talk?" +11. "We're hosting a workshop on career development. Would you like to join?" +12. "I'd like to discuss a potential research collaboration. What's your email address?" +13. "Our company is recruiting for entry-level positions. When are you free to chat?" +14. "We have a mentorship program for international students. Are you interested?" +15. "There's a conference next month where we're presenting. Would you like to participate?" + +## Negative Examples + +These messages should **NOT** be classified as OPPORTUNITY: + +1. "Transfer money immediately for a job offer" → SCAM (requires payment) +2. "You won a lottery prize, pay processing fee to claim" → SCAM (advance fee fraud) +3. "What are your office hours?" → FAQ (simple information request) +4. "I'd like to leave a message" → OTHER (message leaving request) +5. "Can someone call me back?" → OTHER (callback request) +6. "I have a complex visa situation" → OTHER (complex personal matter) +7. "Send your bank details for salary advance" → SCAM (requests sensitive info) +8. 
"Pay $500 to secure this job position" → SCAM (advance fee) + +## Keywords + +interview, job interview, employment, job opportunity, job offer, hiring, recruitment, recruiting, career, internship, intern position, research opportunity, research assistant, RA position, PhD opportunity, scholarship, fellowship, grant, mentorship, mentor, networking event, career fair, job fair, recruitment event, workshop, seminar, conference, presentation, collaboration, research collaboration, academic opportunity, professional development, trainee, traineeship, available, availability, when are you free, schedule, meeting, discuss opportunity, send information, contact information, email address, phone number, skills, qualifications, experience, CV, resume, application, interested, apply, position available, opening, vacancy + +## Classification Rules + +### High Confidence (>0.85) +- Clear mention of interviews, job offers, or research positions +- Explicit opportunity language combined with request for availability/contact info +- Legitimate professional/academic context +- No requests for money or sensitive financial information + +### Medium Confidence (0.70-0.85) +- Mentions opportunities but context is somewhat vague +- Could be legitimate opportunity or general inquiry +- Needs clarification but likely genuine + +### Low Confidence (<0.70) +- Ambiguous message that could be opportunity or something else +- If unclear → classify as OTHER for human review + +## Important Distinctions + +### OPPORTUNITY vs SCAM + +**OPPORTUNITY** characteristics: +- ✅ Legitimate professional/academic context +- ✅ Focuses on student's skills, availability, or interests +- ✅ May ask for CV, contact info, or availability +- ✅ Does NOT request money upfront +- ✅ Does NOT request sensitive financial information (bank details, passwords) +- ✅ Professional language and context +- ✅ Reasonable timeline for response + +**SCAM** characteristics: +- ❌ Requests money or payment +- ❌ Requests bank details, 
passwords, or credit card info +- ❌ Threatens negative consequences if you don't comply +- ❌ Too good to be true (guaranteed high salary, lottery winnings) +- ❌ Requires gift cards or wire transfers +- ❌ High-pressure tactics or urgency without legitimate context + +### OPPORTUNITY vs FAQ + +**OPPORTUNITY** involves: +- 🎯 Specific opportunities requiring action +- 🎯 Requests for availability or contact information +- 🎯 Follow-up needed to pursue the opportunity +- 🎯 Time-sensitive engagement + +**FAQ** involves: +- 📋 General information requests +- 📋 Standard questions with factual answers +- 📋 No specific opportunity or action required + +### OPPORTUNITY vs OTHER + +**OPPORTUNITY** is for: +- 🎯 Legitimate job, research, or academic opportunities +- 🎯 Professional development engagements +- 🎯 Career advancement possibilities + +**OTHER** is for: +- 💬 General message leaving +- 💬 Callback requests without opportunity context +- 💬 Complex personal situations unrelated to opportunities +- 💬 Administrative or general inquiries + +### Examples of the Distinction + +| Message | Intent | Reason | +|---------|--------|--------| +| "We'd like to interview you for a position" | OPPORTUNITY | Legitimate interview invitation | +| "Pay $500 to secure your job interview" | SCAM | Requires payment | +| "What are your office hours?" | FAQ | General information | +| "Can someone call me back?" | OTHER | General callback request | +| "Our lab is hiring research assistants, interested?" | OPPORTUNITY | Legitimate research position | +| "You won a scholarship, send bank details to claim" | SCAM | Requests sensitive info | +| "I'd like to leave a message" | OTHER | Message leaving | +| "We have an internship opening, when are you available?" | OPPORTUNITY | Legitimate internship + availability request | + +## Action on Detection + +When OPPORTUNITY intent is detected: + +1. **Acknowledge the opportunity** + - Thank the caller for reaching out + - Confirm interest in the opportunity + +2. 
**Collect necessary information** + - Student's availability (times/dates they're free) + - Preferred contact method (email, phone) + - Email address for sending details + - Phone number if needed + - Any specific preferences or requirements + +3. **Provide confirmation** + - Summarize what information was collected + - Confirm next steps + - Set expectations for follow-up + +4. **Store and route appropriately** + - Save opportunity details and student response + - Flag as high priority for student review + - Send notification to student about the opportunity + - Enable student to confirm/decline/reschedule + +5. **Safety check** + - Verify legitimacy (no red flags indicating scam) + - If any suspicious elements detected → escalate to human review + - Better to be cautious than miss a scam disguised as opportunity + +## Follow-up Workflow + +### For Interview Invitations: +1. Collect: Available dates/times, preferred interview format (phone/video/in-person), contact email +2. Confirm: Interview details received, student will be contacted +3. Action: Send confirmation email with details to student + +### For Job/Research Opportunities: +1. Collect: Interest level, availability to discuss, CV/resume if needed, email for details +2. Confirm: Information received, details will be sent +3. Action: Connect student with opportunity provider + +### For Events (Career Fairs, Workshops): +1. Collect: Interest in attending, availability, contact for event details +2. Confirm: Registration noted or details will be sent +3. 
Action: Add to student's calendar, send event information + +## Edge Cases + +### Borderline Cases (Opportunity vs Scam): +- "We have a job for you, just pay training fee" → **SCAM** (requires payment) +- "Interview opportunity, deposit required to secure slot" → **SCAM** (requires deposit) +- "Internship available, but you need to buy equipment" → **SCAM** (requires purchase) +- When in doubt about legitimacy → classify as **OTHER** for human verification + +### Borderline Cases (Opportunity vs Other): +- "Can we schedule a meeting?" (no context) → **OTHER** (unclear purpose) +- "Can we schedule a meeting to discuss your career?" → **OPPORTUNITY** (career-focused) +- General networking without specific opportunity → **OTHER** or **OPPORTUNITY** based on context + +## Notes + +- **Priority**: OPPORTUNITY is high-priority intent - students don't want to miss legitimate chances +- **Timeliness**: These often require quick response - system should act promptly +- **Verification**: Always maintain scam detection vigilance - legitimate opportunities never ask for money upfront +- **Student Control**: Student should have final say on pursuing opportunities +- **Privacy**: Only collect information necessary for the specific opportunity +- **Legitimate vs Scam**: The key differentiator is whether money/sensitive financial info is requested + +## Examples of Information Collection + +### Good Collection (OPPORTUNITY): +- "When are you available this week for an interview?" +- "What's your email so we can send internship details?" +- "Do you have a CV we can review?" +- "What's the best phone number to reach you?" + +### Red Flags (Likely SCAM): +- "What's your bank account number for salary deposit?" 
+- "Send your password to verify identity" +- "Pay the registration fee to confirm interview" +- "Provide credit card details for background check" diff --git a/app/intent_classification/definitions/other.md b/app/intent_classification/definitions/other.md new file mode 100644 index 0000000..7767a75 --- /dev/null +++ b/app/intent_classification/definitions/other.md @@ -0,0 +1,175 @@ +# OTHER Intent Definition + +## Overview +Catch-all category for complex issues, message leaving requests, callbacks, unclear intents, or anything requiring human handling and personalized attention. + +## Description +This intent covers everything that doesn't fit SCAM or OPPORTUNITY. It includes requests to leave messages, callback requests, complex or unique situations, complaints, special circumstances, and any ambiguous cases. This is the **conservative default** when intent is unclear. + +## Characteristics + +- Requests to leave a message or voicemail +- Callback requests ("call me back later") +- Complex or unique situations requiring individual attention +- Requests that don't fit standard questions or opportunities +- Unclear or ambiguous intent +- Personal circumstances requiring case-by-case handling +- Complaints, disputes, or dissatisfaction +- Special requests or exceptions to standard procedures +- Currently unavailable and needs human follow-up +- Requests for one-on-one consultation or meeting +- Multiple complex questions needing detailed discussion +- Special cases or unique circumstances +- Situations requiring personalized assessment + +## Positive Examples + +These messages **SHOULD** be classified as OTHER: + +1. "I can't talk right now. Can I leave a message?" +2. "Can someone call me back later?" +3. "I have a special situation regarding my visa that needs discussion" +4. "I can't talk now, but I need to leave some information" +5. "I need to speak with someone about a complex enrollment issue" +6. 
"This is regarding a personal matter that requires individual attention" +7. "I'm not satisfied with how my case was handled" +8. "Can I schedule a one-on-one consultation?" +9. "I have multiple questions that need detailed discussion" +10. "My situation is complicated and I need personalized help" +11. "I'm busy right now. Please have someone contact me when they're available." +12. "I'd like to leave a message for the admissions office." +13. "I need help with a unique immigration case" +14. "Can someone review my special circumstances?" +15. "I'm driving right now, can you have someone call me back?" + +## Negative Examples + +These messages should **NOT** be classified as OTHER: + +1. "We'd like to invite you for a job interview" → OPPORTUNITY +2. "Transfer money immediately" → SCAM +3. "Our company has an internship position available" → OPPORTUNITY +4. "I'm looking for research assistants" → OPPORTUNITY +5. "You have unpaid taxes, pay immediately" → SCAM +6. "You must pay immediately or face arrest" → SCAM + +## Keywords + +leave message, message, voicemail, callback, call back, call me back, contact me later, special case, special situation, unique situation, complex, complicated, individual attention, personal, personal matter, discuss, discussion, detailed discussion, consultation, one-on-one, meeting, schedule appointment, exception, special circumstances, complaint, not satisfied, unhappy, dispute, issue with, problem with, not standard, unique, unusual, busy, not available, can't talk, driving, later, follow up, follow-up, speak with someone, talk to someone, human, representative, detailed, complicated situation, case-by-case, personalized, personalized help, individual help + +## Classification Rules + +### High Confidence (>0.80) +- Explicit message leaving or callback request +- Clear statement of complexity or special circumstances +- Complaints or dissatisfaction +- Unavailability requiring follow-up + +### Medium Confidence (0.65-0.80) +- 
Somewhat complex situation +- Might need personalization +- Ambiguous between FAQ and OTHER + +### Low Confidence (<0.65) +- Very unclear intent +- Could be anything +- **Default to OTHER for safety** + +## Conservative Classification Approach + +**IMPORTANT**: When classification is uncertain, always default to OTHER. + +- ✅ Better to have human review than incorrect automation +- ✅ False positive (human reviews FAQ) = minor inefficiency +- ❌ False negative (FAQ system handles complex case) = poor service + +### Decision Tree + +``` +Is it clearly a fraud attempt? +├─ Yes → SCAM +└─ No → Continue + +Is it a legitimate opportunity (interview, job, research)? +├─ Yes → OPPORTUNITY +└─ No → Continue + +Is it complex, personalized, or unclear? +├─ Yes → OTHER +└─ Unsure → OTHER (default) +``` + +## Action on Detection + +When OTHER intent is detected: + +1. **Preserve full call summary** +2. **Queue for human manual processing** +3. **Do not attempt automated response** +4. **Capture all context and details** +5. **Flag urgency if mentioned** +6. 
**Assign to appropriate department if specified** + +For message leaving: +- Record message content +- Capture contact information +- Note preferred callback time +- Route to appropriate person/department + +For callbacks: +- Record contact number +- Note availability/preferred time +- Capture reason for call +- Queue for staff follow-up + +## Important Distinctions + +### OTHER vs OPPORTUNITY + +**OTHER** is for: +- ❌ Complex situations +- ❌ Personalized needs +- ❌ Actions required (leave message, callback) +- ❌ Special circumstances +- ❌ Unclear intents + +**OPPORTUNITY** is for: +- ✅ Job interviews or employment offers +- ✅ Research or academic positions +- ✅ Internships or fellowships +- ✅ Networking events +- ✅ Legitimate chances for students + +### OTHER vs SCAM + +**OTHER** includes: +- ✅ Legitimate but complex inquiries +- ✅ Students needing help +- ✅ Unclear but non-threatening + +**SCAM** includes: +- ❌ Fraud attempts +- ❌ Threats and intimidation +- ❌ Requests for sensitive info/money +- ❌ Impersonation of authorities + +## Examples of Ambiguous Cases + +| Message | Intent | Reasoning | +|---------|--------|-----------| +| "I need this information urgently" | OTHER | Unclear what they need, requires clarification | +| "We have a position but need to discuss terms" | OPPORTUNITY | Employment opportunity with discussion needed | +| "Can I get an extension on the deadline?" | OTHER | Requires individual assessment | +| "I'm having trouble understanding this" | OTHER | Unclear, needs clarification | +| "Can you help me?" 
| OTHER | Too vague, needs clarification | +| "I need to discuss something" | OTHER | Unclear, requires human | + +## Notes + +- **Default to OTHER when uncertain** - this is the safest approach +- OTHER ensures human review for complex or unclear cases +- Students with genuine needs will get proper attention +- Prevents automated system from handling inappropriate cases +- Maintains service quality by routing complex cases to humans +- Message leaving is a common use case for international students +- Callback requests respect student schedules and time zones diff --git a/app/intent_classification/definitions/scam.md b/app/intent_classification/definitions/scam.md new file mode 100644 index 0000000..e15434b --- /dev/null +++ b/app/intent_classification/definitions/scam.md @@ -0,0 +1,90 @@ +# SCAM Intent Definition + +## Overview +Detection of scam calls or malicious callers attempting fraud targeting international students. + +## Description +Scam calls that impersonate authorities, use high-pressure tactics, or request sensitive information and payments. Common targets are international students who may be unfamiliar with local systems. + +## Characteristics + +- Requests for money transfers or immediate payments (non-service related) +- Impersonating government agencies (tax office, police, immigration, Medicare, Centrelink) +- Impersonating banks or financial institutions +- Using threatening or intimidating language +- Requesting sensitive information (bank account, password, credit card, TFN) +- Claims of urgent situations requiring immediate action +- Mentions of legal action, arrest, fines, or penalties +- Claims that accounts are frozen or compromised +- Requests to purchase gift cards or prepaid cards (iTunes, Steam, etc.) 
+- High-pressure sales tactics or threats +- Claims of winning lottery/prizes but requiring upfront fees +- Tech support scams claiming virus/security issues +- Deportation or visa threats + +## Positive Examples + +These messages **SHOULD** be classified as SCAM: + +1. "This is the Australian Tax Office. You have unpaid taxes of $5000 and must transfer immediately or face prosecution." +2. "Your bank account has been frozen. Please provide your account number and password to unlock it." +3. "This is the police. You are suspected of money laundering and must transfer bail money immediately." +4. "Congratulations! You've won a million dollars, but you need to pay $5000 processing fee first." +5. "Your package has been seized by customs. Pay the $3000 fine immediately or it will be destroyed." +6. "This is Medicare calling. Your account has suspicious activity. Please provide your bank details immediately." +7. "Your internet will be disconnected in 24 hours. Press 1 to make immediate payment." +8. "Your Centrelink benefits have been frozen. Provide your bank account and Tax File Number immediately." +9. "You need to purchase iTunes gift cards worth $2000 to pay your tax debt immediately." +10. "Your computer has viruses. We are Microsoft technical support and need remote access to fix it." +11. "Urgent: Your visa is expiring. Pay the renewal fee immediately or you will be deported." +12. "Hi, this is your grandson. I'm in an emergency and need money urgently. Please transfer immediately." + +## Negative Examples + +These messages should **NOT** be classified as SCAM: + +1. "What are your office hours?" → FAQ +2. "I'd like to leave a message" → OTHER +3. "I have a question about my enrollment" → FAQ or OTHER +4. "Can someone call me back?" → OTHER +5. "I need help with my application" → OTHER +6. "When is the application deadline?" → FAQ +7. "Do I need to pay the enrollment fee upfront?" 
→ FAQ (legitimate payment question) + +## Keywords + +tax office, ATO, Australian Taxation Office, police, arrest, warrant, fine, penalty, transfer money, bank account, password, PIN, credit card, CVV, urgent, immediate payment, immediate action, account frozen, account locked, account suspended, lottery, won prize, prize winner, customs, parcel seized, Medicare, Centrelink, benefits frozen, gift card, iTunes card, Steam card, Google Play card, threat, threatening, lawsuit, court notice, legal action, fraud investigation, money laundering, suspicious activity, verify identity, remote access, virus, malware, Microsoft support, tech support, visa expiring, deportation, immigration officer, grandson scam, family emergency + +## Classification Rules + +### High Confidence (>0.85) +- Multiple scam characteristics match (3+) +- Contains explicit payment requests with threats +- Requests gift card payments +- Impersonates government agencies with threats + +### Medium Confidence (0.70-0.85) +- Some scam characteristics match (1-2) +- Urgent language without explicit threats +- Suspicious but not obvious fraud + +### Low Confidence (<0.70) +- Ambiguous cases → Classify as OTHER for safety +- Legitimate inquiries about payments → FAQ + +## Action on Detection + +When SCAM intent is detected with high confidence: + +1. **Terminate call immediately** +2. **Log incident for security review** +3. **Do not engage with caller** +4. **Do not provide any information** +5. 
class IntentType(str, Enum):
    """Closed set of intents the classifier can assign to a conversation.

    Members are also ``str`` instances, so each one compares equal to its
    lowercase label (for example ``IntentType.SCAM == "scam"``).
    """

    # Fraud attempts and malicious callers.
    SCAM = "scam"

    # Legitimate job/research/academic opportunities for students.
    OPPORTUNITY = "opportunity"

    # Complex issues, messages, and unclear intents requiring human handling.
    OTHER = "other"
class IntentClassificationRequest(BaseModel):
    """Payload for an intent classification request.

    Only ``currentMessage`` is required; ``callSid`` and ``messages`` are
    optional and default to ``None``.

    Attributes:
        callSid: Optional Twilio CallSid used to fetch conversation history
            from Redis.
        messages: Optional conversation history as a list of messages.
        currentMessage: The current user message to classify (required).
    """

    callSid: Optional[str] = Field(
        default=None,
        description="Optional CallSid to fetch history from Redis",
    )
    messages: Optional[List[Message]] = Field(
        default=None,
        description="Optional conversation history",
    )
    currentMessage: str = Field(
        ...,
        description="Current user message to classify",
    )

    class Config:
        # Example payload attached to the generated JSON schema.
        json_schema_extra = {
            "example": {
                "currentMessage": "What are your office hours?",
                "callSid": "CA1234567890abcdef1234567890abcdef",
            },
        }
class IntentClassificationResponse(BaseModel):
    """Response model for intent classification.

    Attributes:
        intent: Classified intent type (scam/opportunity/other).
        confidence: Confidence score between 0.0 and 1.0 (inclusive).
        reasoning: Brief explanation of the classification decision.
        metadata: Additional classification metadata (keywords, characteristics).
    """

    intent: IntentType = Field(
        ...,
        description="Classified intent type"
    )
    confidence: float = Field(
        ...,
        description="Confidence score between 0.0 and 1.0",
        ge=0.0,
        le=1.0
    )
    reasoning: str = Field(
        ...,
        description="Brief explanation of classification decision"
    )
    metadata: Dict[str, Any] = Field(
        default_factory=dict,
        description="Additional classification metadata"
    )

    class Config:
        # The example must use a real IntentType value; the former "faq"
        # example was not a member of the enum and so was not a valid payload.
        json_schema_extra = {
            "example": {
                "intent": "opportunity",
                "confidence": 0.95,
                "reasoning": "Legitimate job interview invitation requesting student availability",
                "metadata": {
                    "matched_keywords": ["job interview", "available", "invite"],
                    "matched_characteristics": ["Mentions of job interviews or interview invitations"]
                }
            }
        }


class IntentDefinition(BaseModel):
    """Intent definition model for documentation and testing.

    Attributes:
        name: Intent name (scam/opportunity/other).
        description: Intent description.
        characteristics: List of characteristics that define this intent.
        positive_examples: Examples that should be classified as this intent.
        negative_examples: Examples that should NOT be classified as this intent.
        keywords: Key indicator words for this intent.
    """

    name: str = Field(..., description="Intent name")
    description: str = Field(..., description="Intent description")
    characteristics: List[str] = Field(..., description="Intent characteristics")
    positive_examples: List[str] = Field(..., description="Positive examples")
    negative_examples: List[str] = Field(..., description="Negative examples")
    keywords: List[str] = Field(..., description="Key indicator words")

    class Config:
        # Example uses one of the three supported intents instead of the
        # retired "faq" intent.
        json_schema_extra = {
            "example": {
                "name": "opportunity",
                "description": "Legitimate opportunities for international students (interviews, jobs, research, internships)",
                "characteristics": [
                    "Mentions of job interviews or interview invitations",
                    "Employment opportunities or job offers"
                ],
                "positive_examples": [
                    "We'd like to invite you for a job interview next week. When are you available?",
                    "Our company has an internship position available. Are you interested?"
                ],
                "negative_examples": [
                    "I'd like to leave a message",
                    "Transfer money immediately for a job offer"
                ],
                "keywords": [
                    "interview",
                    "internship",
                    "research assistant"
                ]
            }
        }
class IntentClassifier:
    """Intent classification service.

    Classifies user intent in international student service conversations into:
    - SCAM: Scam or malicious calls (fraud attempts)
    - OPPORTUNITY: Legitimate job/research/academic opportunities for students
    - OTHER: Complex issues, messages, or unclear intents requiring human handling
    """

    # Labels the classifier may return; anything else is coerced to "other".
    _VALID_INTENTS = ("scam", "opportunity", "other")

    def __init__(self, api_key: Optional[str] = None):
        """Initialize intent classifier.

        Args:
            api_key: Optional OpenAI API key (defaults to settings)
        """
        self.api_key = api_key
        self._client = None  # created on first access of ``client``
        self.model = settings.openai_model

    @property
    def client(self):
        """Lazily create and cache the AsyncOpenAI client."""
        if self._client is None:
            self._client = AsyncOpenAI(api_key=self.api_key or settings.openai_api_key)
        return self._client

    async def classify_intent(
        self,
        current_message: str,
        message_history: Optional[List[Dict]] = None,
        call_sid: Optional[str] = None
    ) -> Dict[str, Any]:
        """Classify user intent from conversation.

        Any error raised while calling the model or parsing/validating its
        reply is caught and converted into a safe "other" result with
        confidence 0.0, so the conversation is routed to human review.

        Args:
            current_message: Current user message to classify
            message_history: Optional conversation history
            call_sid: Optional CallSid to fetch history from Redis; only used
                when ``message_history`` is empty or missing

        Returns:
            Dict containing:
                - intent: Classification result (scam/opportunity/other)
                - confidence: Confidence score (0.0-1.0)
                - reasoning: Explanation of classification
                - metadata: Additional info (matched keywords, characteristics)
        """
        # Get message history from Redis if call_sid provided.
        # NOTE(review): this call is not awaited and sits outside the try
        # block, so a failure here propagates to the caller - confirm intended.
        if call_sid and not message_history:
            message_history = get_message_history(call_sid)

        # Build conversation context
        conversation_context = self._build_conversation_context(
            current_message,
            message_history
        )

        # Get system prompt
        system_prompt = get_intent_classification_system_prompt()

        # Call OpenAI API
        try:
            response = await self.client.chat.completions.create(
                model=self.model,
                messages=[
                    {"role": "system", "content": system_prompt},
                    {"role": "user", "content": conversation_context}
                ],
                temperature=0.3,  # Lower temperature for consistent classification
                max_tokens=500,
                response_format={"type": "json_object"}  # Ensure JSON output
            )

            # Parse response
            result_text = response.choices[0].message.content
            result = json.loads(result_text)

            # Validate and return result
            return self._validate_result(result)

        except Exception as e:
            print(f"❌ [INTENT_CLASSIFIER] Error during classification: {str(e)}")
            # Return safe fallback
            return {
                "intent": "other",  # Default to other for safety (human review)
                "confidence": 0.0,
                "reasoning": f"Classification failed due to error: {str(e)}",
                "metadata": {
                    "error": str(e),
                    "matched_keywords": [],
                    "matched_characteristics": []
                }
            }

    def _build_conversation_context(
        self,
        current_message: str,
        message_history: Optional[List[Dict]] = None
    ) -> str:
        """Build conversation context for classification.

        Args:
            current_message: Current user message
            message_history: Optional conversation history; each entry is read
                with ``.get("role")`` / ``.get("content")``

        Returns:
            Formatted conversation context string
        """
        parts = ["CONVERSATION CONTEXT:\n\n"]

        # Add message history if available
        if message_history:
            parts.append("Previous Messages:\n")
            for msg in message_history[-5:]:  # Last 5 messages for context
                role = msg.get("role", "unknown")
                content = msg.get("content", "")
                # Every role other than "assistant" is labelled as the student.
                speaker = "AI" if role == "assistant" else "Student"
                parts.append(f"{speaker}: {content}\n")
            parts.append("\n")

        # Add current message
        parts.append(f"Current Student Message: {current_message}\n\n")
        parts.append("Please classify the student's intent based on the conversation above.")

        return "".join(parts)

    def _validate_result(self, result: Dict[str, Any]) -> Dict[str, Any]:
        """Validate and sanitize classification result.

        Malformed fields are repaired individually instead of raising, so a
        usable intent is not thrown away because of a bad confidence or
        metadata value:

        - intent: trimmed and lower-cased; unknown values become "other"
          with confidence 0.0
        - confidence: non-numeric or NaN becomes 0.0; otherwise clamped to
          the range 0.0-1.0
        - metadata: non-dict values are replaced by an empty dict, and the
          ``matched_keywords`` / ``matched_characteristics`` keys are
          guaranteed to exist

        Args:
            result: Raw classification result (parsed model output)

        Returns:
            Validated and sanitized result
        """
        import math  # local import: only needed for the NaN check below

        if not isinstance(result, dict):
            result = {}

        # Accept labels that differ only by case or surrounding whitespace.
        intent = result.get("intent", "other")
        if isinstance(intent, str):
            intent = intent.strip().lower()

        try:
            confidence = float(result.get("confidence", 0.0))
        except (TypeError, ValueError, OverflowError):
            print("⚠️ [INTENT_CLASSIFIER] Invalid confidence value, defaulting to 0.0")
            confidence = 0.0
        # NaN must be handled explicitly: max(0.0, min(1.0, nan)) evaluates
        # to 1.0, which would report full confidence for a garbage value.
        if math.isnan(confidence):
            confidence = 0.0

        # Validate intent value
        if intent not in self._VALID_INTENTS:
            print(f"⚠️ [INTENT_CLASSIFIER] Invalid intent '{intent}', defaulting to 'other'")
            intent = "other"
            confidence = 0.0

        # Validate confidence range
        confidence = max(0.0, min(1.0, confidence))

        # Ensure metadata is a dict with the required fields; copy it so the
        # caller's object is not mutated.
        raw_metadata = result.get("metadata", {})
        metadata = dict(raw_metadata) if isinstance(raw_metadata, dict) else {}
        metadata.setdefault("matched_keywords", [])
        metadata.setdefault("matched_characteristics", [])

        return {
            "intent": intent,
            "confidence": confidence,
            "reasoning": result.get("reasoning", "No reasoning provided"),
            "metadata": metadata
        }
+ + Returns: + str: Complete system prompt for intent classification + """ + scam_def = get_scam_definition() + opportunity_def = get_opportunity_definition() + other_def = get_other_definition() + + return f"""You are an intent classification system for an international student services AI assistant. +Analyze the conversation and classify the caller's intent into one of three categories. + +CONTEXT: The callers are international students with questions about services, enrollment, or other needs. + +INTENT DEFINITIONS: + +1. SCAM - {scam_def['description']} +Characteristics: +{chr(10).join(f'- {c}' for c in scam_def['characteristics'])} + +Positive Examples (SHOULD be classified as SCAM): +{chr(10).join(f'- "{e}"' for e in scam_def['positive_examples'])} + +Negative Examples (should NOT be classified as SCAM): +{chr(10).join(f'- "{e}"' for e in scam_def['negative_examples'])} + +Key Indicators: {', '.join(scam_def['keywords'][:15])} + + +2. OPPORTUNITY - {opportunity_def['description']} +NOTE: OPPORTUNITY captures legitimate chances for students (interviews, jobs, research positions, internships, networking). +The system will collect student availability, contact info, and other details to help them seize these opportunities. + +Characteristics: +{chr(10).join(f'- {c}' for c in opportunity_def['characteristics'])} + +Positive Examples (SHOULD be classified as OPPORTUNITY): +{chr(10).join(f'- "{e}"' for e in opportunity_def['positive_examples'])} + +Negative Examples (should NOT be classified as OPPORTUNITY): +{chr(10).join(f'- "{e}"' for e in opportunity_def['negative_examples'])} + +Key Indicators: {', '.join(opportunity_def['keywords'][:20])} + + +3. OTHER - {other_def['description']} +IMPORTANT: Use this for complex cases, messages, callbacks, or anything requiring human review. 
+ +Characteristics: +{chr(10).join(f'- {c}' for c in other_def['characteristics'])} + +Positive Examples (SHOULD be classified as OTHER): +{chr(10).join(f'- "{e}"' for e in other_def['positive_examples'])} + +Negative Examples (should NOT be classified as OTHER): +{chr(10).join(f'- "{e}"' for e in other_def['negative_examples'])} + +Key Indicators: {', '.join(other_def['keywords'][:15])} + + +CLASSIFICATION RULES: +1. Analyze the user's message and conversation context carefully +2. Match against characteristics and keywords for each intent type +3. Assign confidence score based on match strength (0.0 - 1.0) +4. Provide clear reasoning for the classification decision +5. Include matched keywords and characteristics in metadata +6. IMPORTANT: When in doubt or unclear → classify as OTHER (conservative approach) +7. OPPORTUNITY is for legitimate chances (interviews, jobs, research) that benefit students +8. Distinguish OPPORTUNITY from SCAM: OPPORTUNITY never requests money/payment/fees upfront +9. 
OTHER includes: leave message, complex cases, special situations, unclear intents + +RESPONSE FORMAT: +Respond ONLY with JSON in this exact format (no markdown, no code blocks): +{{ + "intent": "scam" | "opportunity" | "other", + "confidence": 0.0-1.0, + "reasoning": "Brief explanation of classification decision (1-2 sentences)", + "metadata": {{ + "matched_keywords": ["list", "of", "matched", "keywords"], + "matched_characteristics": ["list", "of", "matched", "characteristics"] + }} +}} + +IMPORTANT: Output ONLY the JSON object, nothing else.""" diff --git a/app/intent_classification/tests/QUICK_START.md b/app/intent_classification/tests/QUICK_START.md new file mode 100644 index 0000000..bb8bfa3 --- /dev/null +++ b/app/intent_classification/tests/QUICK_START.md @@ -0,0 +1,190 @@ +# Quick Start Guide - Intent Classification Testing + +## Run Tests (Simple) + +```bash +cd /Users/markwang/Documents/Dispatch\ AI/backend/ai +./app/intent_classification/tests/run_tests.sh +``` + +This single command will: +1. ✅ Run all 50 test cases +2. ✅ Calculate accuracy, precision, recall, F1 scores +3. ✅ Generate confusion matrix +4. ✅ Create fine-tuning datasets +5. ✅ Save all results + +## View Results + +### Test Report (Human-Readable) +```bash +cat app/intent_classification/tests/results/report_*.txt +``` + +### Metrics (JSON) +```bash +cat app/intent_classification/tests/results/metrics_*.json | jq +``` + +### Misclassified Cases +```bash +cat app/intent_classification/tests/results/misclassified_*.json | jq +``` + +## Key Metrics to Check + +| Metric | Good | Warning | Action Needed | +|--------|------|---------|---------------| +| **Accuracy** | > 90% | 85-90% | < 85% | +| **SCAM Recall** | > 95% | 90-95% | < 90% ⚠️ CRITICAL | +| **OPPORTUNITY Precision** | > 85% | 80-85% | < 80% | +| **F1 Score (all)** | > 0.85 | 0.75-0.85 | < 0.75 | + +## Common Scenarios + +### First Time Setup +```bash +# 1. 
Run initial baseline tests +./app/intent_classification/tests/run_tests.sh + +# 2. Review results +cat app/intent_classification/tests/results/report_*.txt +``` + +### After Making Changes +```bash +# 1. Run tests +./app/intent_classification/tests/run_tests.sh + +# 2. Compare to previous results +diff app/intent_classification/tests/results/report_PREVIOUS.txt \ + app/intent_classification/tests/results/report_LATEST.txt +``` + +### Generate Fine-Tuning Data Only +```bash +cd /Users/markwang/Documents/Dispatch\ AI/backend/ai +.venv/bin/python app/intent_classification/tests/fine_tuning.py +``` + +### Use Fine-Tuned Model + +After OpenAI fine-tuning completes: + +```python +# Update in config or .env +OPENAI_MODEL=ft:gpt-3.5-turbo:your-org:intent-classification:abc123 + +# Run tests to validate +./app/intent_classification/tests/run_tests.sh +``` + +## Interpreting Output + +### Good Result Example +``` +OVERALL PERFORMANCE +Total Tests: 50 +Correct: 47 (94.0%) ← Good! +Accuracy: 94.0% ← Good! +Macro F1: 93.8% ← Good! + +SCAM: + Recall: 100.0% ← Perfect! No missed scams + +OPPORTUNITY: + Precision: 93.3% ← Good! Low false positives +``` + +### Needs Improvement Example +``` +OVERALL PERFORMANCE +Total Tests: 50 +Correct: 40 (80.0%) ← Low accuracy +Accuracy: 80.0% ← Needs work + +SCAM: + Recall: 86.7% ← ⚠️ Missing scams! Critical! + +OPPORTUNITY: + Precision: 73.3% ← Too many false positives +``` + +## Troubleshooting + +### "Module not found" error +```bash +cd /Users/markwang/Documents/Dispatch\ AI/backend/ai +source .venv/bin/activate +``` + +### "No test results found" +- First run creates baseline +- Results saved in `tests/results/` +- Look for `metrics_*.json` files + +### Tests running slow +- Normal: ~2-3 seconds per test (50 tests = ~2 minutes) +- Slow: Check OpenAI API connection +- Very slow: Check timeout settings + +## Next Steps + +1. **Baseline**: Run tests to establish baseline +2. **Analyze**: Review misclassified cases +3. 
**Improve**: Add test cases or refine definitions +4. **Fine-tune**: Generate data and train model +5. **Validate**: Re-run tests with fine-tuned model +6. **Deploy**: Update production model +7. **Monitor**: Continue testing regularly + +## Quick Commands Reference + +```bash +# Run everything +./app/intent_classification/tests/run_tests.sh + +# Tests only +.venv/bin/python app/intent_classification/tests/test_runner.py + +# Fine-tuning only +.venv/bin/python app/intent_classification/tests/fine_tuning.py + +# With specific results file +.venv/bin/python app/intent_classification/tests/fine_tuning.py \ + app/intent_classification/tests/results/metrics_20251022_143000.json + +# View latest report +ls -t app/intent_classification/tests/results/report_*.txt | head -1 | xargs cat + +# View latest metrics +ls -t app/intent_classification/tests/results/metrics_*.json | head -1 | xargs cat | jq + +# Count test files +echo "SCAM: $(grep -c '"id"' app/intent_classification/tests/test_data/scam_cases.py)" +echo "OPPORTUNITY: $(grep -c '"id"' app/intent_classification/tests/test_data/opportunity_cases.py)" +echo "OTHER: $(grep -c '"id"' app/intent_classification/tests/test_data/other_cases.py)" +``` + +## Files Generated + +``` +tests/ +├── results/ +│ ├── metrics_YYYYMMDD_HHMMSS.json # Full metrics +│ ├── report_YYYYMMDD_HHMMSS.txt # Human report +│ └── misclassified_YYYYMMDD_HHMMSS.json # Failed cases +└── fine_tuning_data/ + ├── train_YYYYMMDD_HHMMSS.jsonl # Training set + └── validation_YYYYMMDD_HHMMSS.jsonl # Validation set +``` + +## Getting Help + +See full documentation: `tests/README.md` + +For issues: +1. Check test output for errors +2. Review results files +3. Check logs +4. 
Contact development team diff --git a/app/intent_classification/tests/README.md b/app/intent_classification/tests/README.md new file mode 100644 index 0000000..01f5588 --- /dev/null +++ b/app/intent_classification/tests/README.md @@ -0,0 +1,441 @@ +# Intent Classification Testing & Fine-Tuning + +Comprehensive testing suite for the intent classification system with performance metrics and fine-tuning capabilities. + +## Overview + +This testing framework provides: + +1. **Automated Testing** - Run all test cases and get instant feedback +2. **Performance Metrics** - Accuracy, Precision, Recall, F1 Score, Confusion Matrix +3. **Misclassification Analysis** - Identify patterns and problem areas +4. **Fine-Tuning Dataset Generation** - Automatically generate OpenAI fine-tuning data +5. **Continuous Improvement** - Iterative model refinement workflow + +## Quick Start + +### Run Tests + +```bash +cd /Users/markwang/Documents/Dispatch\ AI/backend/ai +.venv/bin/python app/intent_classification/tests/test_runner.py +``` + +This will: +- Run all 50 test cases (15 SCAM + 15 OPPORTUNITY + 15 OTHER + 5 edge cases) +- Calculate performance metrics +- Generate detailed report +- Save results to `tests/results/` + +### Generate Fine-Tuning Data + +```bash +# Generate from base test cases only +.venv/bin/python app/intent_classification/tests/fine_tuning.py + +# Generate from test results (includes misclassifications) +.venv/bin/python app/intent_classification/tests/fine_tuning.py tests/results/metrics_YYYYMMDD_HHMMSS.json +``` + +This will: +- Create training and validation datasets +- Save in JSONL format for OpenAI fine-tuning +- Generate improvement analysis report + +## Performance Metrics + +### Metrics Calculated + +#### Overall Metrics +- **Accuracy**: (TP + TN) / Total - Overall correctness +- **Macro Precision**: Average precision across all intents +- **Macro Recall**: Average recall across all intents +- **Macro F1**: Average F1 score across all intents + +#### 
Per-Intent Metrics +For each intent (SCAM, OPPORTUNITY, OTHER): +- **Precision**: TP / (TP + FP) - How many predicted positives are correct +- **Recall**: TP / (TP + FN) - How many actual positives are found +- **F1 Score**: 2 × (Precision × Recall) / (Precision + Recall) +- **Support**: Total number of actual cases for this intent +- **Average Confidence**: Mean confidence score for predictions +- **Low Confidence Count**: Cases below minimum threshold + +#### Confusion Matrix +Shows how often each intent is classified as each other intent: + +``` +Actual/Predicted scam opportunity other +scam 13 1 1 +opportunity 0 14 1 +other 1 0 14 +``` + +### Understanding Results + +**Good Performance Indicators:** +- ✅ Accuracy > 90% +- ✅ F1 Score > 0.85 for all intents +- ✅ Low false positive rate for SCAM (critical!) +- ✅ High recall for OPPORTUNITY (don't miss chances!) + +**Areas for Improvement:** +- ⚠️ Accuracy < 85% +- ⚠️ F1 Score < 0.75 for any intent +- ⚠️ High false positives for SCAM +- ⚠️ Low confidence scores (< 0.7) + +## Test Data Structure + +### Test Cases Location +``` +tests/test_data/ +├── scam_cases.py # 15 fraud scenarios +├── opportunity_cases.py # 15 job/research opportunities +├── other_cases.py # 15 complex/message cases +└── edge_cases.py # 5 ambiguous boundary cases +``` + +### Test Case Format + +```python +{ + "id": "opportunity_001", + "message": "We'd like to invite you for a job interview next week.", + "expected_intent": "opportunity", + "min_confidence": 0.85, + "description": "Job interview invitation" +} +``` + +### Adding New Test Cases + +1. Add to appropriate file in `test_data/` +2. Follow the format above +3. Set realistic `min_confidence` threshold +4. Run tests to validate + +## Fine-Tuning Workflow + +### 1. Initial Baseline + +```bash +# Run tests to establish baseline +.venv/bin/python app/intent_classification/tests/test_runner.py +``` + +Review metrics and identify weak areas. + +### 2. 
Generate Fine-Tuning Data + +```bash +# Generate datasets from latest test results +.venv/bin/python app/intent_classification/tests/fine_tuning.py \ + tests/results/metrics_YYYYMMDD_HHMMSS.json +``` + +This creates: +- `fine_tuning_data/train_YYYYMMDD_HHMMSS.jsonl` +- `fine_tuning_data/validation_YYYYMMDD_HHMMSS.jsonl` + +### 3. Upload to OpenAI + +```bash +# Upload training data +openai api files.create \ + -f tests/fine_tuning_data/train_YYYYMMDD_HHMMSS.jsonl \ + -p fine-tune + +# Upload validation data +openai api files.create \ + -f tests/fine_tuning_data/validation_YYYYMMDD_HHMMSS.jsonl \ + -p fine-tune +``` + +### 4. Create Fine-Tuning Job + +```bash +openai api fine_tunes.create \ + -t file-TRAIN_FILE_ID \ + -v file-VAL_FILE_ID \ + -m gpt-3.5-turbo \ + --suffix "intent-classification" +``` + +> **Note:** The `fine_tunes.*` commands shown in steps 4 and 5 belong to the legacy fine-tuning CLI, which does not support chat models such as `gpt-3.5-turbo`. With a current OpenAI CLI, use the equivalent `fine_tuning.jobs.*` subcommands instead; run `openai api --help` to confirm the exact subcommand and flag names for your installed version. + +### 5. Monitor Progress + +```bash +# Check status +openai api fine_tunes.follow -i ft-JOB_ID + +# List all jobs +openai api fine_tunes.list +``` + +### 6. Update Model + +Once fine-tuning completes: + +```python +# In config/settings.py or .env +OPENAI_MODEL=ft:gpt-3.5-turbo:your-org:intent-classification:JOB_ID +``` + +### 7. Validate Improvement + +```bash +# Run tests again with fine-tuned model +.venv/bin/python app/intent_classification/tests/test_runner.py +``` + +Compare metrics to baseline. 
Expected improvements: +- ✅ Higher accuracy +- ✅ Better F1 scores +- ✅ Fewer misclassifications +- ✅ Higher confidence scores + +## Continuous Improvement Cycle + +``` +┌─────────────────┐ +│ Run Tests │ +│ Get Baseline │ +└────────┬────────┘ + │ + ▼ +┌─────────────────┐ +│ Analyze Results │ +│ Find Patterns │ +└────────┬────────┘ + │ + ▼ +┌─────────────────┐ +│ Generate │ +│ Fine-Tune Data │ +└────────┬────────┘ + │ + ▼ +┌─────────────────┐ +│ Train Model │ +│ on OpenAI │ +└────────┬────────┘ + │ + ▼ +┌─────────────────┐ +│ Deploy New │ +│ Model │ +└────────┬────────┘ + │ + ▼ +┌─────────────────┐ +│ Validate │ +│ Improvements │ +└────────┬────────┘ + │ + └─────────► Repeat +``` + +## Advanced Usage + +### Custom Test Suites + +```python +from intent_classification.tests.test_runner import IntentTestRunner + +runner = IntentTestRunner() + +# Run specific test suite +custom_tests = { + "scam": SCAM_TEST_CASES, + "opportunity": OPPORTUNITY_TEST_CASES[:5] # First 5 only +} + +metrics = await runner.run_all_tests(custom_tests) +``` + +### Programmatic Metrics Access + +```python +from intent_classification.tests.test_runner import IntentTestRunner + +runner = IntentTestRunner() +metrics = await runner.run_all_tests() + +# Access specific metrics +accuracy = metrics["summary"]["accuracy"] +scam_f1 = metrics["per_intent"]["scam"]["f1_score"] +confusion = metrics["confusion_matrix"] + +# Get misclassified cases +for case in metrics["misclassified_cases"]: + print(f"Expected: {case['expected_intent']}") + print(f"Got: {case['predicted_intent']}") + print(f"Message: {case['message']}") +``` + +### Custom Fine-Tuning Data + +```python +from intent_classification.tests.fine_tuning import FineTuningDataGenerator + +generator = FineTuningDataGenerator() + +# Create custom example +example = generator.create_training_example( + message="We have a position available in our lab", + intent="opportunity", + confidence=0.95, + reasoning="Research lab position offer" +) + +# Generate 
from misclassifications +corrections = generator.generate_from_misclassifications( + misclassified_cases +) + +# Save +generator.save_fine_tuning_dataset(corrections, name="corrections") +``` + +## Output Files + +### Test Results Directory + +``` +tests/results/ +├── metrics_20251022_143000.json # Full metrics in JSON +├── report_20251022_143000.txt # Human-readable report +└── misclassified_20251022_143000.json # Misclassified cases +``` + +### Fine-Tuning Data Directory + +``` +tests/fine_tuning_data/ +├── train_20251022_143500.jsonl # Training dataset +└── validation_20251022_143500.jsonl # Validation dataset +``` + +## Interpreting Results + +### Example Report + +``` +============================================================ +INTENT CLASSIFICATION TEST REPORT +============================================================ + +OVERALL PERFORMANCE +------------------------------------------------------------ +Total Tests: 50 +Correct: 47 (94.0%) +Incorrect: 3 +Accuracy: 94.0% +Macro Precision: 93.5% +Macro Recall: 94.2% +Macro F1: 93.8% + +PER-INTENT PERFORMANCE +------------------------------------------------------------ + +SCAM: + Precision: 100.0% ← No false positives! 
+ Recall: 93.3% ← Found 14/15 scams + F1 Score: 96.5% + Support: 15 cases + Avg Confidence: 0.94 + TP/FP/FN/TN: 14/0/1/35 + +OPPORTUNITY: + Precision: 87.5% ← Some false positives + Recall: 93.3% ← Found 14/15 opportunities + F1 Score: 90.3% + Support: 15 cases + Avg Confidence: 0.89 + TP/FP/FN/TN: 14/2/1/33 + +OTHER: + Precision: 93.3% + Recall: 93.3% + F1 Score: 93.3% + Support: 15 cases + Avg Confidence: 0.88 + TP/FP/FN/TN: 14/1/1/34 +``` + +### What to Focus On + +**If SCAM recall < 95%:** +- ❗ CRITICAL: Missing fraud attempts is dangerous +- Add more SCAM examples to fine-tuning data +- Review missed SCAM cases carefully +- Consider lowering SCAM detection threshold + +**If OPPORTUNITY precision < 85%:** +- False positives waste student time +- Review cases misclassified as OPPORTUNITY +- Refine OPPORTUNITY definition +- Add negative examples + +**If OTHER is catching too much:** +- Good for safety, but reduces automation +- Review OTHER cases to find patterns +- Consider creating new intent categories +- Improve SCAM/OPPORTUNITY definitions + +## Troubleshooting + +### Low Overall Accuracy + +1. Check if test data quality is good +2. Review intent definitions for clarity +3. Ensure system prompt is well-structured +4. Generate fine-tuning data and train model + +### High Misclassification Rate + +1. Analyze confusion matrix for patterns +2. Check if intents are well-separated +3. Review ambiguous edge cases +4. Add more training examples for weak areas + +### Low Confidence Scores + +1. Check if examples are truly clear +2. Review edge cases - may need different thresholds +3. Consider if intents need better definition +4. Fine-tune model with high-confidence examples + +## Best Practices + +1. **Run tests after ANY changes** to definitions, prompts, or model +2. **Set realistic confidence thresholds** in test cases (0.7-0.9) +3. **Prioritize SCAM detection** - false negatives are dangerous +4. **Monitor trends over time** - save all test results +5. 
**Fine-tune incrementally** - don't change too much at once +6. **Validate improvements** - always test after fine-tuning +7. **Document changes** - note what was changed and why + +## Contributing + +When adding test cases: + +1. Add to appropriate test data file +2. Follow existing format exactly +3. Set realistic min_confidence +4. Add descriptive description +5. Run tests to validate +6. Update this README if needed + +## Support + +For questions or issues: +1. Check test results for detailed error messages +2. Review metrics to identify specific problems +3. Consult the fine-tuning improvement report +4. Contact the development team + +--- + +**Last Updated**: 2025-10-22 +**Version**: 2.0.0 diff --git a/app/intent_classification/tests/TESTING_SUMMARY.md b/app/intent_classification/tests/TESTING_SUMMARY.md new file mode 100644 index 0000000..2096b40 --- /dev/null +++ b/app/intent_classification/tests/TESTING_SUMMARY.md @@ -0,0 +1,327 @@ +# Intent Classification Testing & Fine-Tuning System + +## 📊 System Overview + +A comprehensive testing and fine-tuning framework for the intent classification system with: + +✅ **Automated Testing** - Run 50 test cases with a single command +✅ **Performance Metrics** - Accuracy, Precision, Recall, F1, Confusion Matrix +✅ **Misclassification Analysis** - Identify patterns and problem areas +✅ **Fine-Tuning Pipeline** - Generate OpenAI fine-tuning datasets automatically +✅ **Continuous Improvement** - Iterative model refinement workflow + +## 🚀 Quick Start + +### Run Tests +```bash +cd /Users/markwang/Documents/Dispatch\ AI/backend/ai +./app/intent_classification/tests/run_tests.sh +``` + +### Run Demo (3 test cases) +```bash +.venv/bin/python app/intent_classification/tests/demo.py +``` + +## 📁 Files Created + +### Core Files +| File | Purpose | Lines | +|------|---------|-------| +| `test_runner.py` | Main test runner with metrics calculation | ~380 | +| `fine_tuning.py` | Fine-tuning data generator | ~320 | +| `demo.py` | 
Quick demo script | ~80 | + +### Documentation +| File | Purpose | +|------|---------| +| `README.md` | Complete testing documentation | +| `QUICK_START.md` | Quick reference guide | +| `TESTING_SUMMARY.md` | This file | + +### Scripts +| File | Purpose | +|------|---------| +| `run_tests.sh` | Single command to run everything | + +### Directories +| Directory | Purpose | +|-----------|---------| +| `results/` | Test metrics, reports, misclassifications | +| `fine_tuning_data/` | Generated fine-tuning datasets (JSONL) | + +## 📈 Metrics Calculated + +### Overall Metrics +- **Accuracy**: Percentage of correct classifications +- **Macro Precision**: Average precision across all intents +- **Macro Recall**: Average recall across all intents +- **Macro F1**: Average F1 score across all intents + +### Per-Intent Metrics (SCAM, OPPORTUNITY, OTHER) +- **Precision**: TP / (TP + FP) +- **Recall**: TP / (TP + FN) +- **F1 Score**: Harmonic mean of precision and recall +- **Support**: Total number of test cases +- **Average Confidence**: Mean confidence score +- **Confusion Matrix**: Intent x Intent classification matrix + +### Analysis Features +- Misclassification patterns (e.g., "scam → opportunity: 3 cases") +- Low confidence cases identification +- Detailed case-by-case breakdown +- Keyword and characteristic matching analysis + +## 🔧 Fine-Tuning Workflow + +### 1. Run Tests +```bash +./app/intent_classification/tests/run_tests.sh +``` +Output: `results/metrics_YYYYMMDD_HHMMSS.json` + +### 2. Generate Fine-Tuning Data +Automatically generated by `run_tests.sh` or manually: +```bash +.venv/bin/python app/intent_classification/tests/fine_tuning.py \ + app/intent_classification/tests/results/metrics_YYYYMMDD_HHMMSS.json +``` +Output: +- `fine_tuning_data/train_YYYYMMDD_HHMMSS.jsonl` +- `fine_tuning_data/validation_YYYYMMDD_HHMMSS.jsonl` + +### 3. 
Upload to OpenAI +```bash +openai api files.create -f fine_tuning_data/train_*.jsonl -p fine-tune +openai api files.create -f fine_tuning_data/validation_*.jsonl -p fine-tune +``` + +### 4. Create Fine-Tuning Job +```bash +openai api fine_tunes.create \ + -t file-TRAIN_ID \ + -v file-VAL_ID \ + -m gpt-3.5-turbo \ + --suffix "intent-classification" +``` + +> **Note:** The `fine_tunes.*` commands shown in steps 4 and 5 belong to the legacy fine-tuning CLI, which does not support chat models such as `gpt-3.5-turbo`. With a current OpenAI CLI, use the equivalent `fine_tuning.jobs.*` subcommands instead; run `openai api --help` to confirm the exact subcommand and flag names for your installed version. + +### 5. Monitor Progress +```bash +openai api fine_tunes.follow -i ft-JOB_ID +``` + +### 6. Deploy Fine-Tuned Model +```python +# Update settings +OPENAI_MODEL=ft:gpt-3.5-turbo:org:intent-classification:JOB_ID +``` + +### 7. Validate Improvements +```bash +./app/intent_classification/tests/run_tests.sh +``` +Compare new metrics to baseline. + +## 📊 Output Examples + +### Test Report Sample +``` +============================================================ +INTENT CLASSIFICATION TEST REPORT +============================================================ + +OVERALL PERFORMANCE +Total Tests: 50 +Correct: 47 (94.0%) +Accuracy: 94.0% +Macro Precision: 93.5% +Macro Recall: 94.2% +Macro F1: 93.8% + +PER-INTENT PERFORMANCE + +SCAM: + Precision: 100.0% + Recall: 93.3% + F1 Score: 96.5% + Support: 15 cases + Avg Confidence: 0.94 + +CONFUSION MATRIX +Actual/Predicted scam opportunity other +scam 14 1 0 +opportunity 0 14 1 +other 0 1 14 +``` + +### Fine-Tuning Data Sample (JSONL) +```json +{ + "messages": [ + { + "role": "system", + "content": "You are an intent classification system..." + }, + { + "role": "user", + "content": "CONVERSATION CONTEXT:\n\nCurrent Student Message: We'd like to invite you for a job interview..." 
+ }, + { + "role": "assistant", + "content": "{\"intent\": \"opportunity\", \"confidence\": 0.95, \"reasoning\": \"Legitimate job interview invitation\", \"metadata\": {...}}" + } + ] +} +``` + +## 🎯 Success Criteria + +| Metric | Excellent | Good | Needs Work | +|--------|-----------|------|------------| +| **Overall Accuracy** | > 95% | 90-95% | < 90% | +| **SCAM Recall** | 100% | 95-99% | < 95% ⚠️ | +| **OPPORTUNITY Precision** | > 90% | 85-90% | < 85% | +| **F1 Score (all intents)** | > 0.90 | 0.85-0.90 | < 0.85 | + +**Critical**: SCAM recall < 95% is dangerous - means missing fraud attempts! + +## 🔄 Continuous Improvement Cycle + +``` +1. Baseline Testing + ↓ +2. Identify Weak Areas (Confusion Matrix, Misclassifications) + ↓ +3. Generate Fine-Tuning Data (Emphasize weak areas) + ↓ +4. Train Model on OpenAI + ↓ +5. Deploy Fine-Tuned Model + ↓ +6. Validate Improvements + ↓ +7. Repeat → Continuous Improvement +``` + +## 🧪 Test Coverage + +### Test Data Statistics +- **SCAM**: 15 test cases (fraud scenarios) +- **OPPORTUNITY**: 15 test cases (jobs, research, internships) +- **OTHER**: 15 test cases (messages, callbacks, complex) +- **Edge Cases**: 5 test cases (ambiguous boundaries) +- **Total**: 50 test cases + +### Fine-Tuning Data Generation +- **Base Examples**: 50 (from test cases) +- **Correction Examples**: Variable (from misclassifications) +- **Augmentation**: 3x for misclassified cases +- **Train/Val Split**: 80/20 + +## 🛠️ Technical Details + +### Dependencies +- Python 3.8+ +- OpenAI Python SDK +- asyncio (async classification) +- JSON for data export +- Collections for metrics + +### Integration Points +- Works with existing `IntentClassifier` +- Uses existing test data structure +- Compatible with OpenAI fine-tuning API +- Outputs standard JSONL format + +### Performance +- Test Speed: ~2-3 seconds per case (API latency) +- Total Time: ~2-3 minutes for 50 cases +- Metrics Calculation: <1 second +- Fine-Tuning Data Generation: <5 seconds + +## 📝 Key 
Features + +### Test Runner (`test_runner.py`) +✅ Async test execution +✅ Real-time progress tracking +✅ Comprehensive metrics calculation +✅ Confusion matrix generation +✅ Misclassification pattern analysis +✅ JSON + Text report output +✅ Automatic results saving + +### Fine-Tuning Generator (`fine_tuning.py`) +✅ OpenAI chat format compatibility +✅ Base test case conversion +✅ Misclassification correction examples +✅ Data augmentation (3x for corrections) +✅ Train/validation split (80/20) +✅ JSONL export for OpenAI +✅ Improvement analysis report + +### Documentation +✅ Comprehensive README (300+ lines) +✅ Quick start guide +✅ Code examples +✅ Troubleshooting tips +✅ Best practices +✅ Step-by-step workflows + +## 🎓 Usage Examples + +### Programmatic Access +```python +from intent_classification.tests.test_runner import IntentTestRunner + +runner = IntentTestRunner() +metrics = await runner.run_all_tests() + +print(f"Accuracy: {metrics['summary']['accuracy']:.1%}") +print(f"F1 Score: {metrics['summary']['macro_f1']:.1%}") + +for case in metrics['misclassified_cases']: + print(f"Failed: {case['test_id']}") +``` + +### Custom Test Suite +```python +custom_tests = { + "scam": SCAM_TEST_CASES[:5], # First 5 only + "opportunity": OPPORTUNITY_TEST_CASES +} + +metrics = await runner.run_all_tests(custom_tests) +``` + +### Fine-Tuning Data Generation +```python +from intent_classification.tests.fine_tuning import FineTuningDataGenerator + +generator = FineTuningDataGenerator() +examples = generator.generate_from_test_cases() +train, val = generator.create_validation_split(examples) +generator.save_fine_tuning_dataset(train, name="custom_train") +``` + +## 📞 Support + +For questions or issues: +1. Check `README.md` for detailed documentation +2. Check `QUICK_START.md` for common tasks +3. Review test results for error messages +4. Check logs in `results/` directory +5. 
Contact development team + +## 🔗 Related Files + +- Main classifier: `app/intent_classification/services/classifier.py` +- Test data: `app/intent_classification/tests/test_data/*.py` +- Intent definitions: `app/intent_classification/definitions/*.py` +- API routes: `app/intent_classification/api/routes.py` + +--- + +**Version**: 2.0.0 +**Created**: 2025-10-22 +**Last Updated**: 2025-10-22 +**Author**: Dispatch AI Development Team diff --git a/app/intent_classification/tests/__init__.py b/app/intent_classification/tests/__init__.py new file mode 100644 index 0000000..a9cfc68 --- /dev/null +++ b/app/intent_classification/tests/__init__.py @@ -0,0 +1,5 @@ +""" +Intent Classification Tests + +Test suite for intent classification module. +""" diff --git a/app/intent_classification/tests/demo.py b/app/intent_classification/tests/demo.py new file mode 100644 index 0000000..f101bd6 --- /dev/null +++ b/app/intent_classification/tests/demo.py @@ -0,0 +1,77 @@ +""" +Intent Classification Testing Demo + +Quick demonstration of the testing and fine-tuning system. +Runs a small subset of tests to show capabilities. 
def _preview(text: str, limit: int) -> str:
    """Return *text* cut to *limit* characters, adding "..." only when cut."""
    return text if len(text) <= limit else f"{text[:limit]}..."


async def demo():
    """Run a quick demo of the classification system.

    Classifies the first test case of each intent suite (SCAM, OPPORTUNITY,
    OTHER), prints the expected and predicted intent for each, and finishes
    with a small accuracy summary and pointers to the full test suite.
    """
    print("=" * 70)
    print("INTENT CLASSIFICATION TESTING DEMO")
    print("=" * 70)
    print()

    classifier = IntentClassifier()

    # One representative case per intent.
    demo_cases = [
        SCAM_TEST_CASES[0],         # SCAM example
        OPPORTUNITY_TEST_CASES[0],  # OPPORTUNITY example
        OTHER_TEST_CASES[0],        # OTHER example
    ]

    print("Testing 3 sample cases (1 from each intent)...")
    print()

    correct = 0
    total = len(demo_cases)

    for i, test_case in enumerate(demo_cases, 1):
        # Only mark the text as truncated when it actually was; the previous
        # version appended "..." to short messages too.
        message_preview = _preview(test_case['message'], 80)
        print(f"Test {i}/{total}: {test_case['id']}")
        print(f'Message: "{message_preview}"')
        print(f"Expected: {test_case['expected_intent']}")

        result = await classifier.classify_intent(test_case['message'])

        is_correct = result['intent'] == test_case['expected_intent']
        if is_correct:
            correct += 1

        status = "✅ CORRECT" if is_correct else "❌ WRONG"
        print(f"Predicted: {result['intent']} (confidence: {result['confidence']:.2f}) {status}")
        print(f"Reasoning: {_preview(result['reasoning'], 100)}")
        print()

    accuracy = correct / total
    print("=" * 70)
    print(f"DEMO RESULTS: {correct}/{total} correct ({accuracy:.1%} accuracy)")
    print("=" * 70)
    print()

    print("For full test suite with 50 cases, run:")
    print("  ./app/intent_classification/tests/run_tests.sh")
    print()
    print("Or see QUICK_START.md for more options.")
    print()
class FineTuningDataGenerator:
    """Build OpenAI chat-format fine-tuning datasets for the intent classifier.

    Examples are produced from labelled test cases and, optionally, from the
    misclassified cases stored in a test-results file.
    """

    def __init__(self):
        self.scam_def = get_scam_definition()
        self.opportunity_def = get_opportunity_definition()
        self.other_def = get_other_definition()

    def create_training_example(
        self,
        message: str,
        intent: str,
        confidence: float = None,
        reasoning: str = None
    ) -> Dict[str, Any]:
        """Create a single training example in OpenAI chat format.

        Args:
            message: The caller message to classify.
            intent: Target label; anything other than ``"scam"`` or
                ``"opportunity"`` uses the OTHER definition.
            confidence: Target confidence; a missing or zero value falls back
                to 0.95.
            reasoning: Target explanation; when omitted one is derived from
                the intent definition's ``description``.

        Returns:
            A dict with a ``messages`` list (system, user, assistant) where
            the assistant content is the JSON-encoded ideal response.
        """
        definitions = {"scam": self.scam_def, "opportunity": self.opportunity_def}
        intent_def = definitions.get(intent, self.other_def)

        ideal_response = {
            "intent": intent,
            "confidence": confidence or 0.95,
            "reasoning": reasoning or f"This message clearly indicates {intent_def['description']}",
            "metadata": {
                "matched_keywords": [],
                "matched_characteristics": []
            }
        }

        # OpenAI fine-tuning format (Chat format)
        return {
            "messages": [
                {
                    "role": "system",
                    "content": self._get_simplified_system_prompt()
                },
                {
                    "role": "user",
                    "content": f"CONVERSATION CONTEXT:\n\nCurrent Student Message: {message}\n\nPlease classify the student's intent based on the conversation above."
                },
                {
                    "role": "assistant",
                    "content": json.dumps(ideal_response, ensure_ascii=False)
                }
            ]
        }

    def _get_simplified_system_prompt(self) -> str:
        """Return the condensed system prompt used in every training example."""
        return """You are an intent classification system for international student services.
Classify caller intent into: scam, opportunity, or other.

SCAM: Fraud attempts, malicious callers
OPPORTUNITY: Legitimate interviews, jobs, research positions, internships
OTHER: Complex issues, messages, unclear intents

Respond with JSON: {"intent": "scam|opportunity|other", "confidence": 0.0-1.0, "reasoning": "explanation", "metadata": {...}}"""

    def generate_from_test_cases(
        self,
        test_suites: Dict[str, List[Dict]] = None,
        include_all: bool = True
    ) -> List[Dict[str, Any]]:
        """Generate fine-tuning examples from labelled test cases.

        Args:
            test_suites: Mapping of suite name to test cases; defaults to
                ``ALL_TEST_CASES``.
            include_all: When False the ``"edge_cases"`` suite is skipped.

        Returns:
            One training example per included test case.
        """
        if test_suites is None:
            test_suites = ALL_TEST_CASES

        training_examples = []
        for suite_name, test_cases in test_suites.items():
            # The skip depends only on the suite, so decide once per suite.
            if not include_all and suite_name == "edge_cases":
                continue
            for test_case in test_cases:
                training_examples.append(
                    self.create_training_example(
                        message=test_case["message"],
                        intent=test_case["expected_intent"],
                        confidence=test_case.get("min_confidence", 0.9)
                    )
                )

        return training_examples

    def generate_from_misclassifications(
        self,
        misclassified_cases: List[Dict[str, Any]]
    ) -> List[Dict[str, Any]]:
        """Generate corrective fine-tuning examples from misclassified cases.

        Each example pairs the original message with the EXPECTED intent.
        The assistant target uses the definition-based reasoning for that
        intent: copying the model's own reasoning (which argued for the wrong
        label) or a "Correction: ..." preamble into the target would teach
        the model to emit contradictory explanations.

        Args:
            misclassified_cases: Dicts with at least ``message`` and
                ``expected_intent``; ``min_confidence`` is optional.

        Returns:
            One corrected training example per case.
        """
        return [
            self.create_training_example(
                message=case["message"],
                intent=case["expected_intent"],  # Use expected, not predicted
                confidence=case.get("min_confidence")
            )
            for case in misclassified_cases
        ]

    def generate_augmented_examples(
        self,
        base_examples: List[Dict[str, Any]],
        augmentation_factor: int = 2
    ) -> List[Dict[str, Any]]:
        """Return ``base_examples`` repeated ``augmentation_factor`` times.

        This is plain duplication (no paraphrasing); a factor below 1 is
        treated as 1 so the input is always returned at least once. The
        input list itself is never modified.
        """
        return list(base_examples) * max(1, augmentation_factor)

    def save_fine_tuning_dataset(
        self,
        examples: List[Dict[str, Any]],
        output_path: Path = None,
        name: str = "fine_tune_dataset"
    ) -> Path:
        """Write ``examples`` as a timestamped JSONL file and return its path.

        Args:
            examples: Training examples, one JSON object per output line.
            output_path: Target directory; defaults to ``fine_tuning_data``
                next to this module. Missing parent directories are created.
            name: File name prefix placed before the timestamp.
        """
        if output_path is None:
            output_path = Path(__file__).parent / "fine_tuning_data"

        # parents=True so a nested, not-yet-existing directory does not fail.
        output_path.mkdir(parents=True, exist_ok=True)

        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
        file_path = output_path / f"{name}_{timestamp}.jsonl"

        with open(file_path, "w", encoding="utf-8") as f:
            for example in examples:
                f.write(json.dumps(example, ensure_ascii=False) + "\n")

        print(f"Fine-tuning dataset saved: {file_path}")
        print(f"Total examples: {len(examples)}")

        return file_path

    def create_validation_split(
        self,
        examples: List[Dict[str, Any]],
        validation_ratio: float = 0.2,
        seed: int = None
    ) -> tuple[List[Dict[str, Any]], List[Dict[str, Any]]]:
        """Shuffle ``examples`` and split them into (train, validation).

        Args:
            examples: Examples to split; the input list is not modified.
            validation_ratio: Fraction reserved for validation, in [0, 1].
            seed: Optional seed for a reproducible shuffle. When omitted the
                module-level ``random`` state is used.

        Raises:
            ValueError: If ``validation_ratio`` is outside [0, 1].
        """
        import random

        if not 0.0 <= validation_ratio <= 1.0:
            raise ValueError("validation_ratio must be between 0 and 1")

        shuffled = list(examples)
        if seed is None:
            random.shuffle(shuffled)
        else:
            random.Random(seed).shuffle(shuffled)

        # The epsilon absorbs float noise such as 1 - 0.9 == 0.0999...98,
        # which would otherwise truncate one example out of the training set.
        split_idx = int(len(shuffled) * (1 - validation_ratio) + 1e-9)
        if shuffled and validation_ratio < 1.0:
            # Never hand back an empty training set for tiny inputs.
            split_idx = max(1, split_idx)

        return shuffled[:split_idx], shuffled[split_idx:]

    def generate_improvement_report(
        self,
        misclassified_cases: List[Dict[str, Any]]
    ) -> str:
        """Return a text report summarising misclassification patterns.

        Cases are grouped by "expected → predicted", the largest groups are
        listed first with sample messages, cases whose ``confidence_ok`` flag
        is false are listed separately, and generic recommendations follow.
        """
        from collections import defaultdict

        rule = "=" * 80
        dash = "-" * 80
        report = [rule, "FINE-TUNING IMPROVEMENT ANALYSIS", rule, ""]

        # Analyze misclassification patterns
        patterns = defaultdict(list)
        for case in misclassified_cases:
            patterns[f"{case['expected_intent']} → {case['predicted_intent']}"].append(case)

        report.append("MISCLASSIFICATION PATTERNS:")
        report.append(dash)
        for pattern, cases in sorted(patterns.items(), key=lambda x: len(x[1]), reverse=True):
            report.append(f"\n{pattern} ({len(cases)} cases)")
            report.append("Sample messages:")
            for case in cases[:3]:
                report.append(f" - {case['message'][:80]}...")

        # Low confidence cases
        low_conf = [c for c in misclassified_cases if not c.get("confidence_ok", True)]
        if low_conf:
            report.append("\n\nLOW CONFIDENCE MISCLASSIFICATIONS:")
            report.append(dash)
            report.append(f"Total: {len(low_conf)} cases")
            for case in low_conf[:5]:
                report.append(f"\n Message: {case['message'][:70]}...")
                report.append(f" Expected: {case['expected_intent']}")
                report.append(f" Predicted: {case['predicted_intent']} (conf: {case['confidence']:.2f})")

        # Recommendations
        report.append("\n\nRECOMMENDATIONS:")
        report.append(dash)

        if patterns:
            top_pattern = max(patterns.items(), key=lambda x: len(x[1]))
            report.append(f"1. Focus on pattern: {top_pattern[0]} ({len(top_pattern[1])} cases)")

        if low_conf:
            report.append(f"2. Review {len(low_conf)} low-confidence cases")

        report.append("3. Consider adding more training examples for weak areas")
        report.append("4. Review and refine intent definitions")
        report.append("5. Add more keywords to intent definitions")

        report.append("")
        report.append(rule)

        return "\n".join(report)


def generate_complete_fine_tuning_dataset(
    test_results_path: Path = None,
    include_base_cases: bool = True,
    augment_misclassified: bool = True
) -> Dict[str, Path]:
    """Generate train/validation JSONL files for fine-tuning.

    Args:
        test_results_path: Optional metrics JSON containing a
            ``misclassified_cases`` list; ignored when missing on disk.
        include_base_cases: Include examples built from the test suites.
        augment_misclassified: Repeat corrective examples (x3) for emphasis.
            Repetition is applied to the training split only, after the
            split, so duplicated rows can never appear in both training and
            validation data.

    Returns:
        ``{"train": <path>, "validation": <path>}``.
    """
    generator = FineTuningDataGenerator()
    all_examples = []
    correction_examples = []

    # 1. Include base test cases
    if include_base_cases:
        print("Generating examples from test cases...")
        base_examples = generator.generate_from_test_cases()
        all_examples.extend(base_examples)
        print(f" Added {len(base_examples)} base examples")

    # 2. Include misclassified cases (if available)
    if test_results_path and test_results_path.exists():
        print(f"Loading test results from {test_results_path}...")
        with open(test_results_path, "r", encoding="utf-8") as f:
            results = json.load(f)

        misclassified = results.get("misclassified_cases", [])
        if misclassified:
            print(f"Generating corrective examples from {len(misclassified)} misclassifications...")
            correction_examples = generator.generate_from_misclassifications(misclassified)
            all_examples.extend(correction_examples)
            print(f" Added {len(correction_examples)} correction examples")

            # Generate improvement report
            report = generator.generate_improvement_report(misclassified)
            print("\n")
            print(report)

    # 3. Split into train/val
    print("\nSplitting into train/validation sets...")
    train_examples, val_examples = generator.create_validation_split(all_examples, 0.2)

    if augment_misclassified and correction_examples:
        # The split keeps the same example objects, so identity tells us
        # which corrections landed in the training half.
        correction_ids = {id(example) for example in correction_examples}
        train_corrections = [e for e in train_examples if id(e) in correction_ids]
        emphasized = generator.generate_augmented_examples(
            train_corrections,
            augmentation_factor=3
        )
        # The first copy is already in train_examples; add only the extras.
        extra_copies = emphasized[len(train_corrections):]
        train_examples.extend(extra_copies)
        print(f" Repeated {len(train_corrections)} training corrections (+{len(extra_copies)} copies)")

    print(f" Training: {len(train_examples)} examples")
    print(f" Validation: {len(val_examples)} examples")

    # 4. Save datasets
    print("\nSaving fine-tuning datasets...")
    train_path = generator.save_fine_tuning_dataset(train_examples, name="train")
    val_path = generator.save_fine_tuning_dataset(val_examples, name="validation")

    print("\n✅ Fine-tuning datasets generated successfully!")
    print("\nNext steps:")
    print("1. Upload datasets to OpenAI for fine-tuning:")
    # The legacy `fine_tunes.create` endpoint has been retired; files are
    # uploaded first and a fine-tuning job is created from their IDs.
    print(f" openai api files.create --file {train_path} --purpose fine-tune")
    print(f" openai api files.create --file {val_path} --purpose fine-tune")
    print(" openai api fine_tuning.jobs.create --model gpt-3.5-turbo --file <train_file_id> --validation-file <validation_file_id>")
    print("2. Monitor fine-tuning progress")
    print("3. Update settings.openai_model with fine-tuned model ID")
    print("4. Re-run tests to validate improvements")

    return {
        "train": train_path,
        "validation": val_path
    }


if __name__ == "__main__":
    # Example: Generate fine-tuning dataset
    import sys

    results_file = None
    if len(sys.argv) > 1:
        results_file = Path(sys.argv[1])

    generate_complete_fine_tuning_dataset(
        test_results_path=results_file,
        include_base_cases=True,
        augment_misclassified=True
    )
{ + "true_positives": 20, + "false_positives": 0, + "false_negatives": 0, + "true_negatives": 30, + "precision": 1.0, + "recall": 1.0, + "f1_score": 1.0, + "support": 20, + "avg_confidence": 0.9225, + "low_confidence_count": 2 + } + }, + "confusion_matrix": { + "scam": { + "scam": 15 + }, + "opportunity": { + "opportunity": 15 + }, + "other": { + "other": 20 + } + }, + "misclassification_patterns": {}, + "misclassified_cases": [] +} \ No newline at end of file diff --git a/app/intent_classification/tests/results/report_20251022_220649.txt b/app/intent_classification/tests/results/report_20251022_220649.txt new file mode 100644 index 0000000..7f82d4e --- /dev/null +++ b/app/intent_classification/tests/results/report_20251022_220649.txt @@ -0,0 +1,51 @@ +================================================================================ +INTENT CLASSIFICATION TEST REPORT +================================================================================ + +OVERALL PERFORMANCE +-------------------------------------------------------------------------------- +Total Tests: 50 +Correct: 50 (100.0%) +Incorrect: 0 +Accuracy: 100.0% +Macro Precision: 100.0% +Macro Recall: 100.0% +Macro F1: 100.0% + +PER-INTENT PERFORMANCE +-------------------------------------------------------------------------------- + +SCAM: + Precision: 100.0% + Recall: 100.0% + F1 Score: 100.0% + Support: 15 cases + Avg Confidence: 1.00 + TP/FP/FN/TN: 15/0/0/35 + +OPPORTUNITY: + Precision: 100.0% + Recall: 100.0% + F1 Score: 100.0% + Support: 15 cases + Avg Confidence: 1.00 + TP/FP/FN/TN: 15/0/0/35 + +OTHER: + Precision: 100.0% + Recall: 100.0% + F1 Score: 100.0% + Support: 20 cases + Avg Confidence: 0.92 + TP/FP/FN/TN: 20/0/0/30 + +CONFUSION MATRIX +-------------------------------------------------------------------------------- +Actual/Predicted scam opportunity other +-------------------------------------------------------------------------------- +scam 15 0 0 +opportunity 0 15 0 +other 0 0 20 + + 
+================================================================================ \ No newline at end of file diff --git a/app/intent_classification/tests/run_tests.sh b/app/intent_classification/tests/run_tests.sh new file mode 100755 index 0000000..1729b5f --- /dev/null +++ b/app/intent_classification/tests/run_tests.sh @@ -0,0 +1,56 @@ +#!/bin/bash + +# Intent Classification Test Runner Script +# Runs tests and generates fine-tuning data + +set -e + +cd "$(dirname "$0")/../../.." + +echo "==========================================" +echo "Intent Classification Testing Suite" +echo "==========================================" +echo "" + +# Activate virtual environment +if [ -d ".venv" ]; then + source .venv/bin/activate +else + echo "⚠️ Virtual environment not found. Using system Python." +fi + +# Run tests +echo "1. Running classification tests..." +echo "" +python app/intent_classification/tests/test_runner.py + +echo "" +echo "==========================================" +echo "" + +# Find latest metrics file +LATEST_METRICS=$(ls -t app/intent_classification/tests/results/metrics_*.json 2>/dev/null | head -1) + +if [ -z "$LATEST_METRICS" ]; then + echo "⚠️ No test results found. Generating fine-tuning data from base cases only." + echo "" + python app/intent_classification/tests/fine_tuning.py +else + echo "2. Generating fine-tuning data from results..." + echo " Using: $LATEST_METRICS" + echo "" + python app/intent_classification/tests/fine_tuning.py "$LATEST_METRICS" +fi + +echo "" +echo "==========================================" +echo "✅ Testing and fine-tuning data generation complete!" +echo "==========================================" +echo "" +echo "Next steps:" +echo " 1. Review test report in: app/intent_classification/tests/results/" +echo " 2. Upload fine-tuning data to OpenAI (see README.md)" +echo " 3. Monitor fine-tuning progress" +echo " 4. Update model configuration with fine-tuned model ID" +echo " 5. 
Re-run tests to validate improvements" +echo "" diff --git a/app/intent_classification/tests/test_data/__init__.py b/app/intent_classification/tests/test_data/__init__.py new file mode 100644 index 0000000..3b83fe1 --- /dev/null +++ b/app/intent_classification/tests/test_data/__init__.py @@ -0,0 +1,25 @@ +""" +Intent Classification Test Data + +Test cases for validating intent classification accuracy. +""" + +from .scam_cases import SCAM_TEST_CASES +from .opportunity_cases import OPPORTUNITY_TEST_CASES +from .other_cases import OTHER_TEST_CASES +from .edge_cases import EDGE_CASE_TEST_CASES + +ALL_TEST_CASES = { + "scam": SCAM_TEST_CASES, + "opportunity": OPPORTUNITY_TEST_CASES, + "other": OTHER_TEST_CASES, + "edge_cases": EDGE_CASE_TEST_CASES +} + +__all__ = [ + "SCAM_TEST_CASES", + "OPPORTUNITY_TEST_CASES", + "OTHER_TEST_CASES", + "EDGE_CASE_TEST_CASES", + "ALL_TEST_CASES", +] diff --git a/app/intent_classification/tests/test_data/edge_cases.py b/app/intent_classification/tests/test_data/edge_cases.py new file mode 100644 index 0000000..9617ec8 --- /dev/null +++ b/app/intent_classification/tests/test_data/edge_cases.py @@ -0,0 +1,44 @@ +""" +Edge Case Test Cases + +Test cases for potentially ambiguous situations that test the boundaries +between different intent classifications. +""" + +EDGE_CASE_TEST_CASES = [ + { + "id": "edge_001", + "message": "Do I need to pay the enrollment fee upfront?", + "expected_intent": "other", + "min_confidence": 0.70, + "description": "Edge: Payment inquiry (not opportunity, requires clarification)" + }, + { + "id": "edge_002", + "message": "I'm having trouble understanding this. 
Can someone help me?", + "expected_intent": "other", + "min_confidence": 0.65, + "description": "Edge: Unclear help request (should be OTHER)" + }, + { + "id": "edge_003", + "message": "I need this information urgently for tomorrow.", + "expected_intent": "other", + "min_confidence": 0.65, + "description": "Edge: Urgent but vague request (should be OTHER)" + }, + { + "id": "edge_004", + "message": "Do you accept cash payments?", + "expected_intent": "other", + "min_confidence": 0.75, + "description": "Edge: Payment method inquiry (not opportunity)" + }, + { + "id": "edge_005", + "message": "I need to discuss a complex issue about my enrollment.", + "expected_intent": "other", + "min_confidence": 0.70, + "description": "Edge: Complex issue (should be OTHER)" + } +] diff --git a/app/intent_classification/tests/test_data/opportunity_cases.py b/app/intent_classification/tests/test_data/opportunity_cases.py new file mode 100644 index 0000000..992641d --- /dev/null +++ b/app/intent_classification/tests/test_data/opportunity_cases.py @@ -0,0 +1,114 @@ +""" +OPPORTUNITY Intent Test Cases + +Test cases for validating OPPORTUNITY (legitimate interviews, jobs, research, internships) intent classification. +Each test case includes expected intent and minimum confidence threshold. +""" + +OPPORTUNITY_TEST_CASES = [ + { + "id": "opportunity_001", + "message": "We'd like to invite you for a job interview next week. When are you available?", + "expected_intent": "opportunity", + "min_confidence": 0.85, + "description": "Job interview invitation with availability request" + }, + { + "id": "opportunity_002", + "message": "Our company has an internship position available. Are you interested?", + "expected_intent": "opportunity", + "min_confidence": 0.85, + "description": "Internship opportunity offer" + }, + { + "id": "opportunity_003", + "message": "I'm a professor looking for research assistants. 
Would you like to discuss this opportunity?", + "expected_intent": "opportunity", + "min_confidence": 0.85, + "description": "Research assistant opportunity from professor" + }, + { + "id": "opportunity_004", + "message": "There's a career fair on campus next Friday. Can we schedule a time to meet?", + "expected_intent": "opportunity", + "min_confidence": 0.80, + "description": "Career fair networking opportunity" + }, + { + "id": "opportunity_005", + "message": "We received your application and would like to schedule an interview.", + "expected_intent": "opportunity", + "min_confidence": 0.90, + "description": "Interview scheduling for application" + }, + { + "id": "opportunity_006", + "message": "Our startup is hiring international students. Can we send you more information?", + "expected_intent": "opportunity", + "min_confidence": 0.85, + "description": "Startup hiring with info request" + }, + { + "id": "opportunity_007", + "message": "I'm organizing a networking event for tech professionals. Would you like to attend?", + "expected_intent": "opportunity", + "min_confidence": 0.80, + "description": "Professional networking event invitation" + }, + { + "id": "opportunity_008", + "message": "We have a scholarship opportunity for international students.", + "expected_intent": "opportunity", + "min_confidence": 0.85, + "description": "Scholarship opportunity notification" + }, + { + "id": "opportunity_009", + "message": "Our lab is looking for graduate research assistants. Can we set up a meeting?", + "expected_intent": "opportunity", + "min_confidence": 0.85, + "description": "Graduate research position with meeting request" + }, + { + "id": "opportunity_010", + "message": "There's an internship opening in our marketing department.", + "expected_intent": "opportunity", + "min_confidence": 0.85, + "description": "Marketing internship opening" + }, + { + "id": "opportunity_011", + "message": "We're hosting a workshop on career development. 
Would you like to join?", + "expected_intent": "opportunity", + "min_confidence": 0.75, + "description": "Career development workshop invitation" + }, + { + "id": "opportunity_012", + "message": "I'd like to discuss a potential research collaboration. What's your email address?", + "expected_intent": "opportunity", + "min_confidence": 0.85, + "description": "Research collaboration with contact request" + }, + { + "id": "opportunity_013", + "message": "Our company is recruiting for entry-level positions. When can we talk?", + "expected_intent": "opportunity", + "min_confidence": 0.85, + "description": "Entry-level recruitment with availability request" + }, + { + "id": "opportunity_014", + "message": "There's a PhD opening in our department. Would you be interested in applying?", + "expected_intent": "opportunity", + "min_confidence": 0.85, + "description": "PhD position opportunity" + }, + { + "id": "opportunity_015", + "message": "We're looking for interns for summer. Can I get your resume?", + "expected_intent": "opportunity", + "min_confidence": 0.85, + "description": "Summer internship with resume request" + } +] diff --git a/app/intent_classification/tests/test_data/other_cases.py b/app/intent_classification/tests/test_data/other_cases.py new file mode 100644 index 0000000..9aa151e --- /dev/null +++ b/app/intent_classification/tests/test_data/other_cases.py @@ -0,0 +1,114 @@ +""" +OTHER Intent Test Cases + +Test cases for validating OTHER (complex issues, messages, unclear intents) intent classification. +Each test case includes expected intent and minimum confidence threshold. +""" + +OTHER_TEST_CASES = [ + { + "id": "other_001", + "message": "I can't talk right now. 
Can I leave a message?", + "expected_intent": "other", + "min_confidence": 0.85, + "description": "Direct message leaving request" + }, + { + "id": "other_002", + "message": "Can someone call me back later?", + "expected_intent": "other", + "min_confidence": 0.90, + "description": "Callback request" + }, + { + "id": "other_003", + "message": "I have a special situation regarding my visa that needs discussion", + "expected_intent": "other", + "min_confidence": 0.80, + "description": "Complex visa situation" + }, + { + "id": "other_004", + "message": "I can't talk now, but I need to leave some information", + "expected_intent": "other", + "min_confidence": 0.85, + "description": "Unavailable, needs to leave info" + }, + { + "id": "other_005", + "message": "I need to speak with someone about a complex enrollment issue", + "expected_intent": "other", + "min_confidence": 0.80, + "description": "Complex enrollment discussion needed" + }, + { + "id": "other_006", + "message": "This is regarding a personal matter that requires individual attention", + "expected_intent": "other", + "min_confidence": 0.85, + "description": "Personal matter requiring individual attention" + }, + { + "id": "other_007", + "message": "I'm not satisfied with how my case was handled", + "expected_intent": "other", + "min_confidence": 0.85, + "description": "Complaint about case handling" + }, + { + "id": "other_008", + "message": "Can I schedule a one-on-one consultation?", + "expected_intent": "other", + "min_confidence": 0.80, + "description": "Individual consultation request" + }, + { + "id": "other_009", + "message": "I have multiple questions that need detailed discussion", + "expected_intent": "other", + "min_confidence": 0.80, + "description": "Multiple complex questions" + }, + { + "id": "other_010", + "message": "My situation is complicated and I need personalized help", + "expected_intent": "other", + "min_confidence": 0.85, + "description": "Complicated situation needing personalized 
help" + }, + { + "id": "other_011", + "message": "I'm busy right now. Please have someone contact me when they're available.", + "expected_intent": "other", + "min_confidence": 0.85, + "description": "Busy, requesting later contact" + }, + { + "id": "other_012", + "message": "I'd like to leave a message for the admissions office.", + "expected_intent": "other", + "min_confidence": 0.90, + "description": "Message for specific department" + }, + { + "id": "other_013", + "message": "I need help with a unique immigration case", + "expected_intent": "other", + "min_confidence": 0.80, + "description": "Unique immigration situation" + }, + { + "id": "other_014", + "message": "Can someone review my special circumstances?", + "expected_intent": "other", + "min_confidence": 0.75, + "description": "Special circumstances review" + }, + { + "id": "other_015", + "message": "I'm driving right now, can you have someone call me back?", + "expected_intent": "other", + "min_confidence": 0.85, + "description": "Driving, needs callback" + } +] diff --git a/app/intent_classification/tests/test_data/scam_cases.py b/app/intent_classification/tests/test_data/scam_cases.py new file mode 100644 index 0000000..737c375 --- /dev/null +++ b/app/intent_classification/tests/test_data/scam_cases.py @@ -0,0 +1,114 @@ +""" +SCAM Intent Test Cases + +Test cases for validating SCAM (fraud/malicious caller) intent classification. +Each test case includes expected intent and minimum confidence threshold. +""" + +SCAM_TEST_CASES = [ + { + "id": "scam_001", + "message": "This is the Australian Tax Office. You have unpaid taxes of $5000 and must transfer immediately or face prosecution.", + "expected_intent": "scam", + "min_confidence": 0.85, + "description": "Tax office impersonation scam" + }, + { + "id": "scam_002", + "message": "Your bank account has been frozen. 
Please provide your account number and password to unlock it.", + "expected_intent": "scam", + "min_confidence": 0.90, + "description": "Bank account scam requesting credentials" + }, + { + "id": "scam_003", + "message": "This is the police. You are suspected of money laundering and must transfer bail money to a secure account immediately.", + "expected_intent": "scam", + "min_confidence": 0.85, + "description": "Police impersonation scam" + }, + { + "id": "scam_004", + "message": "Congratulations! You've won a million dollars, but you need to pay $5000 processing fee first to claim your prize.", + "expected_intent": "scam", + "min_confidence": 0.80, + "description": "Lottery scam with upfront fee" + }, + { + "id": "scam_005", + "message": "Your package has been seized by customs. Pay the $3000 fine immediately or it will be destroyed.", + "expected_intent": "scam", + "min_confidence": 0.80, + "description": "Customs scam with threats" + }, + { + "id": "scam_006", + "message": "This is Medicare calling. Your account has suspicious activity. Please provide your bank details immediately.", + "expected_intent": "scam", + "min_confidence": 0.85, + "description": "Medicare impersonation" + }, + { + "id": "scam_007", + "message": "Your internet will be disconnected in 24 hours. Press 1 to make immediate payment.", + "expected_intent": "scam", + "min_confidence": 0.75, + "description": "Internet disconnection threat" + }, + { + "id": "scam_008", + "message": "Your Centrelink benefits have been frozen. Please provide your bank account and Tax File Number immediately.", + "expected_intent": "scam", + "min_confidence": 0.85, + "description": "Centrelink scam requesting TFN" + }, + { + "id": "scam_009", + "message": "This is a court notice. You have unprocessed legal documents. 
An arrest warrant will be issued if you don't act immediately.", + "expected_intent": "scam", + "min_confidence": 0.80, + "description": "Court/legal threat scam" + }, + { + "id": "scam_010", + "message": "Your credit card has suspicious transactions. Please provide your card number and CVV to verify your identity.", + "expected_intent": "scam", + "min_confidence": 0.85, + "description": "Credit card verification scam" + }, + { + "id": "scam_011", + "message": "You need to purchase iTunes gift cards worth $2000 to pay your tax debt immediately.", + "expected_intent": "scam", + "min_confidence": 0.90, + "description": "Gift card payment scam (obvious fraud)" + }, + { + "id": "scam_012", + "message": "Your computer has viruses. We are Microsoft technical support and need remote access to fix it.", + "expected_intent": "scam", + "min_confidence": 0.80, + "description": "Tech support scam" + }, + { + "id": "scam_013", + "message": "Urgent: Your visa is expiring. Pay the renewal fee immediately or you will be deported.", + "expected_intent": "scam", + "min_confidence": 0.75, + "description": "Visa/deportation threat" + }, + { + "id": "scam_014", + "message": "Investment opportunity: Guaranteed 20% monthly returns. You must decide and transfer at least $10,000 now.", + "expected_intent": "scam", + "min_confidence": 0.70, + "description": "Investment scam with unrealistic returns" + }, + { + "id": "scam_015", + "message": "Hi, this is your grandson. I'm in an emergency and need money urgently. 
Please transfer immediately.", + "expected_intent": "scam", + "min_confidence": 0.75, + "description": "Family member impersonation scam" + } +] diff --git a/app/intent_classification/tests/test_runner.py b/app/intent_classification/tests/test_runner.py new file mode 100644 index 0000000..0dee683 --- /dev/null +++ b/app/intent_classification/tests/test_runner.py @@ -0,0 +1,331 @@ +""" +Intent Classification Test Runner + +Runs all test cases and calculates performance metrics including: +- Accuracy, Precision, Recall, F1 Score +- Confusion Matrix +- Per-intent performance metrics +- Misclassification analysis +""" + +import asyncio +import sys +from pathlib import Path +from typing import Dict, List, Any +from collections import defaultdict +import json +from datetime import datetime + +# Add parent directory to path for imports +sys.path.insert(0, str(Path(__file__).parent.parent.parent)) + +from intent_classification import IntentClassifier +from intent_classification.tests.test_data import ( + ALL_TEST_CASES +) + + +class IntentClassificationMetrics: + """Calculate and track classification performance metrics""" + + def __init__(self): + self.results: List[Dict[str, Any]] = [] + self.confusion_matrix: Dict[str, Dict[str, int]] = defaultdict(lambda: defaultdict(int)) + + def add_result( + self, + test_case: Dict[str, Any], + predicted_intent: str, + confidence: float, + reasoning: str, + metadata: Dict[str, Any] + ): + """Add a test result to metrics""" + expected = test_case["expected_intent"] + is_correct = predicted_intent == expected + + result = { + "test_id": test_case["id"], + "message": test_case["message"], + "expected_intent": expected, + "predicted_intent": predicted_intent, + "confidence": confidence, + "reasoning": reasoning, + "metadata": metadata, + "is_correct": is_correct, + "min_confidence": test_case.get("min_confidence", 0.0), + "confidence_ok": confidence >= test_case.get("min_confidence", 0.0) + } + + self.results.append(result) + 
def calculate_metrics(self) -> Dict[str, Any]:
    """Calculate comprehensive performance metrics from recorded results.

    Returns:
        An empty dict when no results were recorded; otherwise a dict with
        ``summary`` (totals, accuracy, macro precision/recall/F1),
        ``per_intent`` (TP/FP/FN/TN, precision, recall, F1, support and
        confidence statistics for scam/opportunity/other),
        ``confusion_matrix`` (plain nested dicts, detached from the
        tracker's own state), ``misclassification_patterns`` (sorted by
        frequency, descending) and ``misclassified_cases``.
    """
    if not self.results:
        return {}

    total = len(self.results)
    correct = sum(1 for r in self.results if r["is_correct"])
    accuracy = correct / total

    # One pass over the results yields every count the per-intent metrics
    # need, instead of four full scans per intent.
    expected_totals = defaultdict(int)
    predicted_totals = defaultdict(int)
    exact_matches = defaultdict(int)
    for r in self.results:
        expected_totals[r["expected_intent"]] += 1
        predicted_totals[r["predicted_intent"]] += 1
        if r["expected_intent"] == r["predicted_intent"]:
            exact_matches[r["expected_intent"]] += 1

    intent_metrics = {}
    all_intents = ["scam", "opportunity", "other"]

    for intent in all_intents:
        tp = exact_matches.get(intent, 0)           # predicted and expected
        fp = predicted_totals.get(intent, 0) - tp   # predicted, not expected
        fn = expected_totals.get(intent, 0) - tp    # expected, not predicted
        tn = total - tp - fp - fn                   # neither

        precision = tp / (tp + fp) if (tp + fp) > 0 else 0.0
        recall = tp / (tp + fn) if (tp + fn) > 0 else 0.0
        f1_score = 2 * (precision * recall) / (precision + recall) if (precision + recall) > 0 else 0.0

        # Confidence statistics over the cases that SHOULD be this intent
        intent_results = [r for r in self.results if r["expected_intent"] == intent]
        avg_confidence = sum(r["confidence"] for r in intent_results) / len(intent_results) if intent_results else 0.0
        low_confidence_count = sum(1 for r in intent_results if not r["confidence_ok"])

        intent_metrics[intent] = {
            "true_positives": tp,
            "false_positives": fp,
            "false_negatives": fn,
            "true_negatives": tn,
            "precision": precision,
            "recall": recall,
            "f1_score": f1_score,
            "support": tp + fn,  # Total actual instances
            "avg_confidence": avg_confidence,
            "low_confidence_count": low_confidence_count
        }

    # Macro-averaged metrics (unweighted mean across intents)
    macro_precision = sum(m["precision"] for m in intent_metrics.values()) / len(all_intents)
    macro_recall = sum(m["recall"] for m in intent_metrics.values()) / len(all_intents)
    macro_f1 = sum(m["f1_score"] for m in intent_metrics.values()) / len(all_intents)

    # Misclassification analysis
    misclassified = [r for r in self.results if not r["is_correct"]]
    misclassification_patterns = defaultdict(int)
    for r in misclassified:
        misclassification_patterns[f"{r['expected_intent']} → {r['predicted_intent']}"] += 1

    # Copy every row into a plain dict: a shallow dict() of the outer
    # mapping would hand out the tracker's live nested defaultdicts, and
    # merely reading a missing cell on the result would then insert keys
    # into the tracker's own matrix.
    confusion = {
        expected: dict(row) for expected, row in self.confusion_matrix.items()
    }

    return {
        "summary": {
            "total_tests": total,
            "correct": correct,
            "incorrect": len(misclassified),
            "accuracy": accuracy,
            "macro_precision": macro_precision,
            "macro_recall": macro_recall,
            "macro_f1": macro_f1
        },
        "per_intent": intent_metrics,
        "confusion_matrix": confusion,
        "misclassification_patterns": dict(sorted(
            misclassification_patterns.items(),
            key=lambda x: x[1],
            reverse=True
        )),
        "misclassified_cases": misclassified
    }
report.append("-" * 80) + for intent, m in metrics["per_intent"].items(): + report.append(f"\n{intent.upper()}:") + report.append(f" Precision: {m['precision']:.1%}") + report.append(f" Recall: {m['recall']:.1%}") + report.append(f" F1 Score: {m['f1_score']:.1%}") + report.append(f" Support: {m['support']} cases") + report.append(f" Avg Confidence: {m['avg_confidence']:.2f}") + report.append(f" TP/FP/FN/TN: {m['true_positives']}/{m['false_positives']}/{m['false_negatives']}/{m['true_negatives']}") + report.append("") + + # Confusion Matrix + report.append("CONFUSION MATRIX") + report.append("-" * 80) + intents = ["scam", "opportunity", "other"] + header = "Actual/Predicted".ljust(20) + "".join(i.ljust(15) for i in intents) + report.append(header) + report.append("-" * 80) + + for actual in intents: + row = actual.ljust(20) + for predicted in intents: + count = metrics["confusion_matrix"].get(actual, {}).get(predicted, 0) + row += str(count).ljust(15) + report.append(row) + report.append("") + + # Misclassification patterns + if metrics["misclassification_patterns"]: + report.append("MISCLASSIFICATION PATTERNS") + report.append("-" * 80) + for pattern, count in list(metrics["misclassification_patterns"].items())[:10]: + report.append(f"{pattern.ljust(30)} {count} cases") + report.append("") + + # Top misclassified cases + if metrics["misclassified_cases"]: + report.append("TOP MISCLASSIFIED CASES (First 10)") + report.append("-" * 80) + for i, case in enumerate(metrics["misclassified_cases"][:10], 1): + report.append(f"\n{i}. 
{case['test_id']}") + report.append(f" Message: {case['message'][:70]}...") + report.append(f" Expected: {case['expected_intent']}") + report.append(f" Predicted: {case['predicted_intent']} (confidence: {case['confidence']:.2f})") + report.append(f" Reasoning: {case['reasoning'][:100]}...") + + report.append("") + report.append("=" * 80) + + return "\n".join(report) + + +class IntentTestRunner: + """Run intent classification tests and generate metrics""" + + def __init__(self): + self.classifier = IntentClassifier() + self.metrics = IntentClassificationMetrics() + + async def run_test_case(self, test_case: Dict[str, Any]) -> Dict[str, Any]: + """Run a single test case""" + result = await self.classifier.classify_intent(test_case["message"]) + + self.metrics.add_result( + test_case=test_case, + predicted_intent=result["intent"], + confidence=result["confidence"], + reasoning=result["reasoning"], + metadata=result["metadata"] + ) + + return result + + async def run_all_tests(self, test_suites: Dict[str, List[Dict]] = None) -> Dict[str, Any]: + """Run all test cases""" + if test_suites is None: + test_suites = ALL_TEST_CASES + + print("Running intent classification tests...") + print() + + total_tests = sum(len(cases) for cases in test_suites.values()) + current = 0 + + for suite_name, test_cases in test_suites.items(): + print(f"Testing {suite_name.upper()} ({len(test_cases)} cases)...") + + for test_case in test_cases: + current += 1 + await self.run_test_case(test_case) + + # Progress indicator + if current % 5 == 0: + print(f" Progress: {current}/{total_tests}") + + print() + print("All tests completed!") + print() + + # Calculate metrics + metrics = self.metrics.calculate_metrics() + + return metrics + + def save_results(self, metrics: Dict[str, Any], output_dir: str = None): + """Save test results to files""" + if output_dir is None: + output_dir = Path(__file__).parent / "results" + else: + output_dir = Path(output_dir) + + output_dir.mkdir(exist_ok=True) + + 
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S") + + # Save JSON metrics + json_path = output_dir / f"metrics_{timestamp}.json" + with open(json_path, "w") as f: + json.dump(metrics, f, indent=2) + + # Save text report + report = self.metrics.generate_report(metrics) + report_path = output_dir / f"report_{timestamp}.txt" + with open(report_path, "w") as f: + f.write(report) + + # Save misclassified cases for fine-tuning + if metrics["misclassified_cases"]: + finetune_path = output_dir / f"misclassified_{timestamp}.json" + with open(finetune_path, "w") as f: + json.dump(metrics["misclassified_cases"], f, indent=2) + + print(f"Results saved to {output_dir}/") + print(f" - Metrics: {json_path.name}") + print(f" - Report: {report_path.name}") + if metrics["misclassified_cases"]: + print(f" - Misclassified: {finetune_path.name}") + + return output_dir + + +async def main(): + """Main test runner""" + runner = IntentTestRunner() + + # Run all tests + metrics = await runner.run_all_tests() + + # Print report + report = runner.metrics.generate_report(metrics) + print(report) + + # Save results + runner.save_results(metrics) + + return metrics + + +if __name__ == "__main__": + asyncio.run(main()) diff --git a/app/main.py b/app/main.py index 51705de..091ce6b 100644 --- a/app/main.py +++ b/app/main.py @@ -1,15 +1,17 @@ import sys from pathlib import Path + +# Add the app directory to Python path for absolute imports +app_dir = Path(__file__).parent +sys.path.insert(0, str(app_dir)) + from config import get_settings from api import health, chat, call, summary, email, dispatch +from intent_classification.api import router as intent_router from fastapi import FastAPI from fastapi.middleware.cors import CORSMiddleware from fastapi_mcp.server import FastApiMCP -# Add the app directory to Python path for absolute imports -app_dir = Path(__file__).parent -sys.path.insert(0, str(app_dir)) - settings = get_settings() @@ -35,6 +37,7 @@ app.include_router(summary.router, 
prefix=settings.api_prefix) app.include_router(email.router, prefix=settings.api_prefix) app.include_router(dispatch.router, prefix=settings.api_prefix) +app.include_router(intent_router, prefix=settings.api_prefix) @app.get("/")