From 6d55182831134cd8bbee8e7d9de12531f78df2bf Mon Sep 17 00:00:00 2001 From: Kannav02 Date: Wed, 9 Jul 2025 13:17:44 -0400 Subject: [PATCH 1/5] feat: added the file to ingest the document to vector db Signed-off-by: Kannav02 --- backend/src/dataset_gen_eval/ingest_doc.py | 108 +++++++++++++++++++++ 1 file changed, 108 insertions(+) create mode 100644 backend/src/dataset_gen_eval/ingest_doc.py diff --git a/backend/src/dataset_gen_eval/ingest_doc.py b/backend/src/dataset_gen_eval/ingest_doc.py new file mode 100644 index 00000000..49b95be1 --- /dev/null +++ b/backend/src/dataset_gen_eval/ingest_doc.py @@ -0,0 +1,108 @@ +from pathlib import Path +import json, uuid +import sys + +src_path = Path(__file__).parent.parent +sys.path.insert(0, str(src_path)) + +from langchain.text_splitter import RecursiveCharacterTextSplitter +from src.vectorstores.faiss import FAISSVectorDatabase +from tools.process_pdf import process_pdf_docs +from tools.process_md import process_md + +DOMAINS = ["general_openroad", + #"command_reference" + "installation_guides", "error_messages", + "opensta_yosys_klayout" + ] + +# discover files in the raw data dir +def discover_files(domain:str)->list: + root = Path(f"../backend/data/raw/{domain}") + # getting all the files + return list(root.rglob("*.*")) + +# document loader function +def load_as_documents(path): + + + suffix = path.suffix.lower() + if suffix == ".pdf": + return process_pdf_docs(str(path)) + elif suffix in {".md", ".markdown"}: + return process_md(str(path.parent), split_text=False) + elif suffix == ".html": + print("HTML Files skipped for now") + return [] + else: + return [] + +# chunking function +def chunk(docs, size=700, overlap=70): + splitter = RecursiveCharacterTextSplitter( + chunk_size=size, chunk_overlap=overlap, add_start_index=True) + out = [] + for d in docs: + out.extend(splitter.split_documents([d])) + return out + +# adding metadata to each chunk +def enrich_metadata(chunks, domain, source_path): + for idx, doc in enumerate(chunks): + doc.metadata.update({ + "domain": domain, + "doc_path": str(source_path.relative_to('../backend/data/raw')), + "chunk_id": f"{uuid.uuid4()}", + }) + return chunks + + +def build_domain_index(domain): + vdb = FAISSVectorDatabase(embeddings_type="HF", + embeddings_model_name="sentence-transformers/all-MiniLM-L6-v2") + manifest = [] + + files = discover_files(domain) + total_files = len(files) + print(f"Processing {total_files} files for domain: {domain}") + + for idx, fp in enumerate(files): + print(f"[{idx+1}/{total_files}] Processing: {fp}") + + if fp.name.startswith('.'): + print(f"Skipping system file: {fp}") + continue + + + docs = load_as_documents(fp) + if not docs: + print(f"No documents loaded from: {fp}") + continue + + chunks = chunk(docs) + if not chunks: + print(f"No chunks created from: {fp}") + continue + + chunks = enrich_metadata(chunks, domain, fp) + manifest += [c.metadata for c in chunks] + + # Process chunks in smaller batches for better memory management + batch_size = 50 + for i in range(0, len(chunks), batch_size): + batch = chunks[i:i+batch_size] + vdb._add_to_db(batch) + print(f" Added batch {i//batch_size + 1} ({len(batch)} chunks)") + + vdb.save_db(name=domain) + Path("../backend/data/manifests").mkdir(exist_ok=True, parents=True) + with open(f"../backend/data/manifests/{domain}.jsonl", "w") as f: + for row in manifest: + f.write(json.dumps(row) + "\n") + + +if __name__ == "__main__": + for dom in DOMAINS: + build_domain_index(dom) + print("All domain indexes built & manifests 
written.") + From 5f839c8a010eba28bbd37ad00de5576846674950 Mon Sep 17 00:00:00 2001 From: Kannav02 Date: Wed, 9 Jul 2025 13:45:50 -0400 Subject: [PATCH 2/5] feat: added the file to generate the qa pairs Signed-off-by: Kannav02 --- .../src/dataset_gen_eval/generate_qa_pairs.py | 246 ++++++++++++++++++ 1 file changed, 246 insertions(+) create mode 100644 backend/src/dataset_gen_eval/generate_qa_pairs.py diff --git a/backend/src/dataset_gen_eval/generate_qa_pairs.py b/backend/src/dataset_gen_eval/generate_qa_pairs.py new file mode 100644 index 00000000..028d4f2d --- /dev/null +++ b/backend/src/dataset_gen_eval/generate_qa_pairs.py @@ -0,0 +1,246 @@ +import json +import sys +from pathlib import Path +from typing import List, Dict +import random + +# Add src to path for imports +src_path = Path(__file__).parent.parent +sys.path.insert(0, str(src_path)) + +from src.vectorstores.faiss import FAISSVectorDatabase +from langchain_google_genai import ChatGoogleGenerativeAI +from langchain.docstore.document import Document +from dotenv import load_dotenv + +load_dotenv() + + +# didn't include command_reference cause didn't index it +DOMAINS = [ + "installation_guides", + "error_messages", + "opensta_yosys_klayout", + "general_openroad" +] + +QA_PAIRS_PER_DOMAIN = 10 + +QA_GENERATION_PROMPT = """ +Your task is to write a factoid question and an answer given a context. +Your factoid question should be answerable with a specific, concise piece of factual information from the context. +Your factoid question should be formulated in the same style as questions users could ask in a search engine. +This means that your factoid question MUST NOT mention something like "according to the passage" or "context". + +Provide your answer as follows: + +Output::: +Factoid question: (your factoid question) +Answer: (your answer to the factoid question) + +Now here is the context. 
+ +Context: {context}\n +Output:::""" + + +def load_domain_database(domain: str) -> FAISSVectorDatabase: + """Load the FAISS vector database for a specific domain.""" + print(f"Loading vector database for domain: {domain}") + + vdb = FAISSVectorDatabase( + embeddings_type="HF", + embeddings_model_name="sentence-transformers/all-MiniLM-L6-v2" + ) + + try: + vdb.load_db(name=domain) + print(f"Successfully loaded {domain} database") + return vdb + except Exception as e: + print(f"Error loading database for {domain}: {e}") + return None + + +def sample_documents_from_db(vdb: FAISSVectorDatabase, num_samples: int = 5) -> List[Document]: + """Sample random documents from the vector database to use for QA generation.""" + try: + all_docs = list(vdb.get_documents()) + print(f"Total documents in database: {len(all_docs)}") + + # Sample random documents + sample_size = min(num_samples, len(all_docs)) + sampled_docs = random.sample(all_docs, sample_size) + + print(f"Sampled {len(sampled_docs)} documents") + return sampled_docs + + except Exception as e: + print(f"Error sampling documents: {e}") + return [] + + +def generate_qa_pairs_for_content(all_docs: List[Document], domain: str, num_qa: int = 5) -> List[Dict[str, str]]: + """Use Gemini to generate QA pairs from the given documents.""" + try: + # Initialize Gemini model + llm = ChatGoogleGenerativeAI( + model="gemini-2.5-pro", + temperature=0.3, + ) + + print(f"Generating {num_qa} QA pairs for {domain} domain...") + + all_qa_pairs = [] + + + for i in range(num_qa): + try: + # Sample different documents for each QA pair to get variety + sample_size = min(5, len(all_docs)) + sampled_docs = random.sample(all_docs, sample_size) + + # Combine content from this sample + content = "\n\n---DOCUMENT SEPARATOR---\n\n".join([ + doc.page_content for doc in sampled_docs + ]) + + prompt = QA_GENERATION_PROMPT.format( + context=content[:15000] + ) + + print(f" Generating QA pair {i+1}/{num_qa}...") + + # gemini cost analysis here, langsmith + response = llm.invoke(prompt) + + response_text = response.content.strip() + + + if "Output:::" in response_text: + output_section = response_text.split("Output:::")[-1].strip() + + + lines = output_section.split('\n') + question = "" + answer = "" + + for line in lines: + line = line.strip() + if line.startswith("Factoid question:"): + question = line.replace("Factoid question:", "").strip() + elif line.startswith("Answer:"): + answer = line.replace("Answer:", "").strip() + + + if question and answer: + qa_pair = { + "question": question, + "answer": answer, + "domain": domain, + "source": "generated_from_docs", # context source add here + "context": content[:15000] # Add the context used for generation + } + all_qa_pairs.append(qa_pair) + print(f"Generated: {question[:50]}...") + else: + print(f"Failed to parse QA pair from response") + print(f"Raw response: {response_text[:200]}...") + else: + print(f"No 'Output:::' section found in response") + print(f"Raw response: {response_text[:200]}...") + + except Exception as e: + print(f"Error generating QA pair {i+1}: {e}") + continue + + print(f"Successfully generated {len(all_qa_pairs)} QA pairs out of {num_qa} attempts") + return all_qa_pairs + + except Exception as e: + print(f"Error in QA generation process: {e}") + return [] + + +def process_domain(domain: str, qa_per_domain: int = 10) -> List[Dict[str, str]]: + """Process a single domain to generate QA pairs.""" + print(f"\n{'='*50}") + print(f"Processing domain: {domain}") + print(f"{'='*50}") + + # Load the vector 
database + vdb = load_domain_database(domain) + if not vdb: + return [] + + # Sample documents from the database + sampled_docs = sample_documents_from_db(vdb, num_samples=100) + if not sampled_docs: + return [] + + print(f"Will generate QA pairs from pool of {len(sampled_docs)} documents") + + # Generate QA pairs (each QA pair will sample different docs) + qa_pairs = generate_qa_pairs_for_content( + all_docs=sampled_docs, + domain=domain, + num_qa=qa_per_domain + ) + + return qa_pairs + + +def save_qa_pairs(all_qa_pairs: List[Dict[str, str]], output_file: str): + """Save the generated QA pairs to a JSON file.""" + output_path = Path(output_file) + output_path.parent.mkdir(parents=True, exist_ok=True) + + with open(output_path, 'w', encoding='utf-8') as f: + json.dump(all_qa_pairs, f, indent=2, ensure_ascii=False) + + print(f"\nSaved {len(all_qa_pairs)} QA pairs to: {output_path}") + + +def main(): + """Main function to generate QA pairs for all domains.""" + print("Starting QA pair generation...") + print(f"Target domains: {DOMAINS}") + print(f"QA pairs per domain: {QA_PAIRS_PER_DOMAIN}") + + all_qa_pairs = [] + + for domain in DOMAINS: + try: + qa_pairs = process_domain(domain, QA_PAIRS_PER_DOMAIN) + all_qa_pairs.extend(qa_pairs) + print(f"Generated {len(qa_pairs)} QA pairs for {domain}") + + except Exception as e: + print(f"Error processing domain {domain}: {e}") + continue + + # Save all QA pairs + if all_qa_pairs: + output_file = "../backend/data/generated_qa_pairs_gemini_pro_new.json" + save_qa_pairs(all_qa_pairs, output_file) + + + + print(f"{'='*50}") + print(f"Total QA pairs generated: {len(all_qa_pairs)}") + + + domain_counts = {} + for qa in all_qa_pairs: + domain = qa.get('domain', 'unknown') + domain_counts[domain] = domain_counts.get(domain, 0) + 1 + + for domain, count in domain_counts.items(): + print(f" {domain}: {count} pairs") + + else: + print("No QA pairs were generated!") + + +if __name__ == "__main__": + main() \ No newline at end of file From cca788d89ecfebf2a69c3afeed262cdb4017af49 Mon Sep 17 00:00:00 2001 From: Kannav02 Date: Wed, 9 Jul 2025 14:01:08 -0400 Subject: [PATCH 3/5] feat: added the quality evaluation agents for synthetic dataset generation Signed-off-by: Kannav02 --- .../src/dataset_gen_eval/quality_agents.py | 286 ++++++++++++++++++ 1 file changed, 286 insertions(+) create mode 100644 backend/src/dataset_gen_eval/quality_agents.py diff --git a/backend/src/dataset_gen_eval/quality_agents.py b/backend/src/dataset_gen_eval/quality_agents.py new file mode 100644 index 00000000..94cbb2e5 --- /dev/null +++ b/backend/src/dataset_gen_eval/quality_agents.py @@ -0,0 +1,286 @@ +import sys +from pathlib import Path +from dotenv import load_dotenv + + +src_path = Path(__file__).parent.parent +sys.path.insert(0, str(src_path)) + +from langchain_google_genai import ChatGoogleGenerativeAI +from deepeval.metrics import BaseMetric +from deepeval.test_case import LLMTestCase + +load_dotenv() + +# Groundedness critique prompt template +question_groundedness_critique_prompt = """ +You are an expert evaluator tasked with assessing the groundedness of a question-answer pair. + +Your task is to evaluate whether the given question is well-grounded in the provided context. +A well-grounded question should: +1. Be answerable using information from the context +2. Not require external knowledge beyond the context +3. Be specific and factual rather than speculative +4. 
Have clear supporting evidence in the context + +Please analyze the question and context, then provide a rating from 1 to 5: +- 1: Completely ungrounded - question cannot be answered from context +- 2: Poorly grounded - question requires significant external knowledge +- 3: Moderately grounded - question is partially answerable from context +- 4: Well grounded - question is mostly answerable from context +- 5: Perfectly grounded - question is completely answerable from context + +Question: {question} + +Context: {context} + +Please provide your evaluation in the following format: +Analysis: [Your detailed analysis of why the question is or isn't grounded in the context] +Total rating: [Your rating from 1 to 5] +""" + +# Question relevance critique prompt template +question_relevance_critique_prompt = """ +You will be given a question. +Your task is to provide a 'total rating' representing how useful this question can be to machine learning developers building NLP applications with the Hugging Face ecosystem. +Give your answer on a scale of 1 to 5, where 1 means that the question is not useful at all, and 5 means that the question is extremely useful. + +Provide your answer as follows: + +Answer::: +Evaluation: (your rationale for the rating, as a text) +Total rating: (your rating, as a number between 1 and 5) + +You MUST provide values for 'Evaluation:' and 'Total rating:' in your answer. + +Now here is the question. + +Question: {question} +Answer::: """ + +# Question standalone critique prompt template +question_standalone_critique_prompt = """ +You will be given a question. +Your task is to provide a 'total rating' representing how context-independent this question is. +Give your answer on a scale of 1 to 5, where 1 means that the question depends on additional information to be understood, and 5 means that the question makes sense by itself. +For instance, if the question refers to a particular setting, like 'in the context' or 'in the document', the rating must be 1. +The questions can contain obscure technical nouns or acronyms like Gradio, Hub, Hugging Face or Space and still be a 5: it must simply be clear to an operator with access to documentation what the question is about. + +For instance, "What is the name of the checkpoint from which the ViT model is imported?" should receive a 1, since there is an implicit mention of a context, thus the question is not independent from the context. + +Provide your answer as follows: + +Answer::: +Evaluation: (your rationale for the rating, as a text) +Total rating: (your rating, as a number between 1 and 5) + +You MUST provide values for 'Evaluation:' and 'Total rating:' in your answer. + +Now here is the question. 
+ +Question: {question} +Answer::: """ + + +class GroundednessMetric(BaseMetric): + def __init__(self, threshold: float = 0.6): + self.threshold = threshold + self.evaluation_model = "gemini-2.5-pro" + self.include_reason = True + + + self.llm = ChatGoogleGenerativeAI( + model=self.evaluation_model, + temperature=0.3, + ) + + def measure(self, tc: LLMTestCase) -> float: + """Synchronous version of the metric evaluation.""" + prompt = question_groundedness_critique_prompt.format( + question=tc.input, + context="\n".join(tc.context or []) + ) + + try: + + response = self.llm.invoke(prompt) + response_text = response.content.strip() + + # + if "Total rating:" in response_text: + rating_line = response_text.split("Total rating:")[-1].strip() + + score_1_5 = int(rating_line.split()[0]) + else: + + lines = response_text.split('\n') + score_1_5 = 3 + for line in lines: + if any(keyword in line.lower() for keyword in ['rating:', 'score:']): + try: + score_1_5 = int([char for char in line if char.isdigit()][0]) + break + except (IndexError, ValueError): + continue + + # Convert to 0-1 scale as expected by DeepEval + self.score = score_1_5 / 5.0 + self.reason = response_text + self.success = self.score >= self.threshold + + return self.score + + except Exception as e: + self.score = 0.0 + self.reason = f"Error during evaluation: {str(e)}" + self.success = False + self.error = str(e) + return self.score + + async def a_measure(self, tc: LLMTestCase): + """Async version - fallback to synchronous since we don't have async client setup.""" + return self.measure(tc) + + def is_successful(self) -> bool: + """Check if the metric evaluation was successful.""" + return False if getattr(self, "error", None) else self.success + + @property + def __name__(self): + return "Question Groundedness" + + +class QuestionRelevanceMetric(BaseMetric): + """DeepEval metric for evaluating question relevance to ML developers building NLP applications.""" + + def __init__(self, threshold: float = 0.6): + self.threshold = threshold + self.evaluation_model = "gemini-2.5-pro" + self.include_reason = True + + + self.llm = ChatGoogleGenerativeAI( + model=self.evaluation_model, + temperature=0.3, + ) + + def measure(self, tc: LLMTestCase) -> float: + """Synchronous version of the metric evaluation.""" + prompt = question_relevance_critique_prompt.format( + question=tc.input + ) + + try: + + response = self.llm.invoke(prompt) + response_text = response.content.strip() + + + if "Total rating:" in response_text: + rating_line = response_text.split("Total rating:")[-1].strip() + score_1_5 = int(rating_line.split()[0]) + else: + lines = response_text.split('\n') + score_1_5 = 3 # Default score + for line in lines: + if any(keyword in line.lower() for keyword in ['rating:', 'score:']): + try: + score_1_5 = int([char for char in line if char.isdigit()][0]) + break + except (IndexError, ValueError): + continue + + # Convert to 0-1 scale as expected by DeepEval + self.score = score_1_5 / 5.0 + self.reason = response_text + self.success = self.score >= self.threshold + + return self.score + + except Exception as e: + self.score = 0.0 + self.reason = f"Error during evaluation: {str(e)}" + self.success = False + self.error = str(e) + return self.score + + async def a_measure(self, tc: LLMTestCase): + """Async version - fallback to synchronous since we don't have async client setup.""" + return self.measure(tc) + + def is_successful(self) -> bool: + """Check if the metric evaluation was successful.""" + return False if getattr(self, "error", 
None) else self.success + + @property + def __name__(self): + return "Question Relevance" + + +class QuestionStandaloneMetric(BaseMetric): + """DeepEval metric for evaluating question context-independence.""" + + def __init__(self, threshold: float = 0.6): + self.threshold = threshold + self.evaluation_model = "gemini-2.5-pro" + self.include_reason = True + + + self.llm = ChatGoogleGenerativeAI( + model=self.evaluation_model, + temperature=0.3, + ) + + def measure(self, tc: LLMTestCase) -> float: + """Synchronous version of the metric evaluation.""" + prompt = question_standalone_critique_prompt.format( + question=tc.input + ) + + try: + + response = self.llm.invoke(prompt) + response_text = response.content.strip() + + + if "Total rating:" in response_text: + rating_line = response_text.split("Total rating:")[-1].strip() + + score_1_5 = int(rating_line.split()[0]) + else: + + lines = response_text.split('\n') + score_1_5 = 3 # Default score + for line in lines: + if any(keyword in line.lower() for keyword in ['rating:', 'score:']): + try: + score_1_5 = int([char for char in line if char.isdigit()][0]) + break + except (IndexError, ValueError): + continue + + + self.score = score_1_5 / 5.0 + self.reason = response_text + self.success = self.score >= self.threshold + + return self.score + + except Exception as e: + self.score = 0.0 + self.reason = f"Error during evaluation: {str(e)}" + self.success = False + self.error = str(e) + return self.score + + async def a_measure(self, tc: LLMTestCase): + """Async version - fallback to synchronous since we don't have async client setup.""" + return self.measure(tc) + + def is_successful(self) -> bool: + """Check if the metric evaluation was successful.""" + return False if getattr(self, "error", None) else self.success + + @property + def __name__(self): + return "Question Standalone" \ No newline at end of file From 46dab55d53f0540ee474559cc129f351120a3d6c Mon Sep 17 00:00:00 2001 From: Kannav02 Date: Wed, 9 Jul 2025 14:07:45 -0400 Subject: [PATCH 4/5] feat: added the script to run the evaluation agent for each QA pair Signed-off-by: Kannav02 --- backend/src/dataset_gen_eval/eval_dataset.py | 38 ++++++++++++++++++++ 1 file changed, 38 insertions(+) create mode 100644 backend/src/dataset_gen_eval/eval_dataset.py diff --git a/backend/src/dataset_gen_eval/eval_dataset.py b/backend/src/dataset_gen_eval/eval_dataset.py new file mode 100644 index 00000000..99f4ea44 --- /dev/null +++ b/backend/src/dataset_gen_eval/eval_dataset.py @@ -0,0 +1,38 @@ +import json +from deepeval.test_case import LLMTestCase +from .quality_agents import ( + GroundednessMetric, + QuestionRelevanceMetric, + QuestionStandaloneMetric +) + + +json_path = "data/generated_qa_pairs_gemini_pro_new.json" + +# Loading questions +with open(json_path, "r") as f: + qa_pairs = json.load(f) + +# Initializing metrics +groundedness_metric = GroundednessMetric() +relevance_metric = QuestionRelevanceMetric() +standalone_metric = QuestionStandaloneMetric() + + +for entry in qa_pairs: + question = entry["question"] + answer = entry["answer"] + context = entry["context"] + + test_case_question = LLMTestCase(input=question,actual_output="", context=[context]) + + groundedness_score = groundedness_metric.measure(test_case_question) + relevance_score = relevance_metric.measure(test_case_question) + standalone_score = standalone_metric.measure(test_case_question) + + print(f"Question: {question}") + print(f" Groundedness: {groundedness_score:.2f} ({groundedness_metric.reason})") + print(f" Relevance: 
{relevance_score:.2f} ({relevance_metric.reason})") + print(f" Standalone: {standalone_score:.2f} ({standalone_metric.reason})") + print("-" * 60) + break From fbb472b62bc80ca00ee0c156829de1332efa78e4 Mon Sep 17 00:00:00 2001 From: Jack Luar Date: Mon, 14 Jul 2025 15:20:42 +0000 Subject: [PATCH 5/5] remove path hacks, fix checks Signed-off-by: Jack Luar --- backend/pyproject.toml | 6 +- backend/requirements-test.txt | 1 + backend/src/dataset_gen_eval/eval_dataset.py | 14 +- .../src/dataset_gen_eval/generate_qa_pairs.py | 153 +++++++++--------- backend/src/dataset_gen_eval/ingest_doc.py | 95 +++++------ .../src/dataset_gen_eval/quality_agents.py | 153 +++++++++--------- 6 files changed, 209 insertions(+), 213 deletions(-) diff --git a/backend/pyproject.toml b/backend/pyproject.toml index 9e26a877..c08b6839 100644 --- a/backend/pyproject.toml +++ b/backend/pyproject.toml @@ -40,6 +40,10 @@ ignore_missing_imports = true module = "transformers.*" ignore_missing_imports = true +[[tool.mypy.overrides]] +module = "deepeval.*" +ignore_missing_imports = true + [tool.ruff] exclude = [ ".bzr", @@ -75,7 +79,7 @@ target-version = "py310" [tool.ruff.lint] select = ["E4", "E7", "E9","E301","E304","E305","E401","E223","E224","E242", "E", "F" ,"N", "W", "C90"] -extend-select = ["D203", "D204"] +extend-select = ["D204"] ignore = ["E501"] preview = true diff --git a/backend/requirements-test.txt b/backend/requirements-test.txt index 5155631f..3928562f 100644 --- a/backend/requirements-test.txt +++ b/backend/requirements-test.txt @@ -6,3 +6,4 @@ types-tqdm==4.66.0.20240417 types-beautifulsoup4==4.12.0.20240511 ruff==0.5.1 pre-commit==3.7.1 +deepeval==3.2.0 diff --git a/backend/src/dataset_gen_eval/eval_dataset.py b/backend/src/dataset_gen_eval/eval_dataset.py index 99f4ea44..0d335929 100644 --- a/backend/src/dataset_gen_eval/eval_dataset.py +++ b/backend/src/dataset_gen_eval/eval_dataset.py @@ -1,9 +1,9 @@ import json -from deepeval.test_case import LLMTestCase +from deepeval.test_case.llm_test_case import LLMTestCase from .quality_agents import ( GroundednessMetric, QuestionRelevanceMetric, - QuestionStandaloneMetric + QuestionStandaloneMetric, ) @@ -23,13 +23,15 @@ question = entry["question"] answer = entry["answer"] context = entry["context"] - - test_case_question = LLMTestCase(input=question,actual_output="", context=[context]) - + + test_case_question = LLMTestCase( + input=question, actual_output="", context=[context] + ) + groundedness_score = groundedness_metric.measure(test_case_question) relevance_score = relevance_metric.measure(test_case_question) standalone_score = standalone_metric.measure(test_case_question) - + print(f"Question: {question}") print(f" Groundedness: {groundedness_score:.2f} ({groundedness_metric.reason})") print(f" Relevance: {relevance_score:.2f} ({relevance_metric.reason})") diff --git a/backend/src/dataset_gen_eval/generate_qa_pairs.py b/backend/src/dataset_gen_eval/generate_qa_pairs.py index 028d4f2d..bd7c5602 100644 --- a/backend/src/dataset_gen_eval/generate_qa_pairs.py +++ b/backend/src/dataset_gen_eval/generate_qa_pairs.py @@ -1,14 +1,9 @@ import json -import sys from pathlib import Path -from typing import List, Dict +from typing import List, Dict, Optional import random -# Add src to path for imports -src_path = Path(__file__).parent.parent -sys.path.insert(0, str(src_path)) - -from src.vectorstores.faiss import FAISSVectorDatabase +from ..vectorstores.faiss import FAISSVectorDatabase from langchain_google_genai import ChatGoogleGenerativeAI from 
langchain.docstore.document import Document from dotenv import load_dotenv @@ -18,10 +13,10 @@ # didn't include command_reference cause didn't index it DOMAINS = [ - "installation_guides", + "installation_guides", "error_messages", "opensta_yosys_klayout", - "general_openroad" + "general_openroad", ] QA_PAIRS_PER_DOMAIN = 10 @@ -44,15 +39,15 @@ Output:::""" -def load_domain_database(domain: str) -> FAISSVectorDatabase: +def load_domain_database(domain: str) -> Optional[FAISSVectorDatabase]: """Load the FAISS vector database for a specific domain.""" print(f"Loading vector database for domain: {domain}") - + vdb = FAISSVectorDatabase( embeddings_type="HF", - embeddings_model_name="sentence-transformers/all-MiniLM-L6-v2" + embeddings_model_name="sentence-transformers/all-MiniLM-L6-v2", ) - + try: vdb.load_db(name=domain) print(f"Successfully loaded {domain} database") @@ -62,101 +57,104 @@ def load_domain_database(domain: str) -> FAISSVectorDatabase: return None -def sample_documents_from_db(vdb: FAISSVectorDatabase, num_samples: int = 5) -> List[Document]: +def sample_documents_from_db( + vdb: FAISSVectorDatabase, num_samples: int = 5 +) -> List[Document]: """Sample random documents from the vector database to use for QA generation.""" try: all_docs = list(vdb.get_documents()) print(f"Total documents in database: {len(all_docs)}") - + # Sample random documents sample_size = min(num_samples, len(all_docs)) sampled_docs = random.sample(all_docs, sample_size) - + print(f"Sampled {len(sampled_docs)} documents") return sampled_docs - + except Exception as e: print(f"Error sampling documents: {e}") return [] -def generate_qa_pairs_for_content(all_docs: List[Document], domain: str, num_qa: int = 5) -> List[Dict[str, str]]: +def generate_qa_pairs_for_content( + all_docs: List[Document], domain: str, num_qa: int = 5 +) -> List[Dict[str, str]]: """Use Gemini to generate QA pairs from the given documents.""" try: # Initialize Gemini model llm = ChatGoogleGenerativeAI( model="gemini-2.5-pro", - temperature=0.3, + temperature=0.3, ) - + print(f"Generating {num_qa} QA pairs for {domain} domain...") - + all_qa_pairs = [] - - + for i in range(num_qa): try: # Sample different documents for each QA pair to get variety - sample_size = min(5, len(all_docs)) + sample_size = min(5, len(all_docs)) sampled_docs = random.sample(all_docs, sample_size) - + # Combine content from this sample - content = "\n\n---DOCUMENT SEPARATOR---\n\n".join([ - doc.page_content for doc in sampled_docs - ]) - - prompt = QA_GENERATION_PROMPT.format( - context=content[:15000] + content = "\n\n---DOCUMENT SEPARATOR---\n\n".join( + [doc.page_content for doc in sampled_docs] ) - - print(f" Generating QA pair {i+1}/{num_qa}...") - + + prompt = QA_GENERATION_PROMPT.format(context=content[:15000]) + + print(f" Generating QA pair {i + 1}/{num_qa}...") + # gemini cost analysis here, langsmith response = llm.invoke(prompt) - - response_text = response.content.strip() - - + + response_content = response.content + response_text = response_content.strip() if isinstance(response_content, str) else str(response_content).strip() + if "Output:::" in response_text: output_section = response_text.split("Output:::")[-1].strip() - - - lines = output_section.split('\n') + + lines = output_section.split("\n") question = "" answer = "" - + for line in lines: line = line.strip() if line.startswith("Factoid question:"): question = line.replace("Factoid question:", "").strip() elif line.startswith("Answer:"): answer = line.replace("Answer:", "").strip() - if 
question and answer: qa_pair = { "question": question, "answer": answer, "domain": domain, - "source": "generated_from_docs", # context source add here - "context": content[:15000] # Add the context used for generation + "source": "generated_from_docs", # context source add here + "context": content[ + :15000 + ], # Add the context used for generation } all_qa_pairs.append(qa_pair) print(f"Generated: {question[:50]}...") else: - print(f"Failed to parse QA pair from response") + print("Failed to parse QA pair from response") print(f"Raw response: {response_text[:200]}...") else: - print(f"No 'Output:::' section found in response") + print("No 'Output:::' section found in response") print(f"Raw response: {response_text[:200]}...") - + except Exception as e: - print(f"Error generating QA pair {i+1}: {e}") + print(f"Error generating QA pair {i + 1}: {e}") continue - - print(f"Successfully generated {len(all_qa_pairs)} QA pairs out of {num_qa} attempts") + + print( + f"Successfully generated {len(all_qa_pairs)} QA pairs out of {num_qa} attempts" + ) return all_qa_pairs - + except Exception as e: print(f"Error in QA generation process: {e}") return [] @@ -164,29 +162,27 @@ def generate_qa_pairs_for_content(all_docs: List[Document], domain: str, num_qa: def process_domain(domain: str, qa_per_domain: int = 10) -> List[Dict[str, str]]: """Process a single domain to generate QA pairs.""" - print(f"\n{'='*50}") + print(f"\n{'=' * 50}") print(f"Processing domain: {domain}") - print(f"{'='*50}") - + print(f"{'=' * 50}") + # Load the vector database vdb = load_domain_database(domain) if not vdb: return [] - + # Sample documents from the database - sampled_docs = sample_documents_from_db(vdb, num_samples=100) + sampled_docs = sample_documents_from_db(vdb, num_samples=100) if not sampled_docs: return [] - + print(f"Will generate QA pairs from pool of {len(sampled_docs)} documents") - + # Generate QA pairs (each QA pair will sample different docs) qa_pairs = generate_qa_pairs_for_content( - all_docs=sampled_docs, - domain=domain, - num_qa=qa_per_domain + all_docs=sampled_docs, domain=domain, num_qa=qa_per_domain ) - + return qa_pairs @@ -194,10 +190,10 @@ def save_qa_pairs(all_qa_pairs: List[Dict[str, str]], output_file: str): """Save the generated QA pairs to a JSON file.""" output_path = Path(output_file) output_path.parent.mkdir(parents=True, exist_ok=True) - - with open(output_path, 'w', encoding='utf-8') as f: + + with open(output_path, "w", encoding="utf-8") as f: json.dump(all_qa_pairs, f, indent=2, ensure_ascii=False) - + print(f"\nSaved {len(all_qa_pairs)} QA pairs to: {output_path}") @@ -206,41 +202,38 @@ def main(): print("Starting QA pair generation...") print(f"Target domains: {DOMAINS}") print(f"QA pairs per domain: {QA_PAIRS_PER_DOMAIN}") - + all_qa_pairs = [] - + for domain in DOMAINS: try: qa_pairs = process_domain(domain, QA_PAIRS_PER_DOMAIN) all_qa_pairs.extend(qa_pairs) print(f"Generated {len(qa_pairs)} QA pairs for {domain}") - + except Exception as e: print(f"Error processing domain {domain}: {e}") continue - + # Save all QA pairs if all_qa_pairs: - output_file = "../backend/data/generated_qa_pairs_gemini_pro_new.json" + output_file = "data/generated_qa_pairs_gemini_pro_new.json" save_qa_pairs(all_qa_pairs, output_file) - - - print(f"{'='*50}") + print(f"{'=' * 50}") print(f"Total QA pairs generated: {len(all_qa_pairs)}") - - + domain_counts = {} for qa in all_qa_pairs: - domain = qa.get('domain', 'unknown') + domain = qa.get("domain", "unknown") domain_counts[domain] = 
domain_counts.get(domain, 0) + 1 - + for domain, count in domain_counts.items(): print(f" {domain}: {count} pairs") - + else: print("No QA pairs were generated!") if __name__ == "__main__": - main() \ No newline at end of file + main() diff --git a/backend/src/dataset_gen_eval/ingest_doc.py b/backend/src/dataset_gen_eval/ingest_doc.py index 49b95be1..ceaffd75 100644 --- a/backend/src/dataset_gen_eval/ingest_doc.py +++ b/backend/src/dataset_gen_eval/ingest_doc.py @@ -1,31 +1,31 @@ from pathlib import Path -import json, uuid -import sys - -src_path = Path(__file__).parent.parent -sys.path.insert(0, str(src_path)) +import json +import uuid +from typing import List from langchain.text_splitter import RecursiveCharacterTextSplitter -from src.vectorstores.faiss import FAISSVectorDatabase -from tools.process_pdf import process_pdf_docs -from tools.process_md import process_md +from ..vectorstores.faiss import FAISSVectorDatabase +from ..tools.process_pdf import process_pdf_docs +from ..tools.process_md import process_md + +DOMAINS = [ + "general_openroad", + # "command_reference" + "installation_guides", + "error_messages", + "opensta_yosys_klayout", +] -DOMAINS = ["general_openroad", - #"command_reference" - "installation_guides", "error_messages", - "opensta_yosys_klayout" - ] # discover files in the raw data dir -def discover_files(domain:str)->list: - root = Path(f"../backend/data/raw/{domain}") +def discover_files(domain: str) -> List[Path]: + root = Path(f"data/raw/{domain}") # getting all the files return list(root.rglob("*.*")) + # document loader function def load_as_documents(path): - - suffix = path.suffix.lower() if suffix == ".pdf": return process_pdf_docs(str(path)) @@ -35,68 +35,74 @@ def load_as_documents(path): print("HTML Files skipped for now") return [] else: - return [] + return [] + # chunking function def chunk(docs, size=700, overlap=70): splitter = RecursiveCharacterTextSplitter( - chunk_size=size, chunk_overlap=overlap, add_start_index=True) + chunk_size=size, chunk_overlap=overlap, add_start_index=True + ) out = [] for d in docs: out.extend(splitter.split_documents([d])) return out + # adding metadata to each chunk def enrich_metadata(chunks, domain, source_path): - for idx, doc in enumerate(chunks): - doc.metadata.update({ - "domain": domain, - "doc_path": str(source_path.relative_to('../backend/data/raw')), - "chunk_id": f"{uuid.uuid4()}", - }) + for doc in chunks: + doc.metadata.update( + { + "domain": domain, + "doc_path": str(source_path.relative_to("data/raw")), + "chunk_id": f"{uuid.uuid4()}", + } + ) return chunks def build_domain_index(domain): - vdb = FAISSVectorDatabase(embeddings_type="HF", - embeddings_model_name="sentence-transformers/all-MiniLM-L6-v2") + vdb = FAISSVectorDatabase( + embeddings_type="HF", + embeddings_model_name="sentence-transformers/all-MiniLM-L6-v2", + ) manifest = [] - + files = discover_files(domain) total_files = len(files) print(f"Processing {total_files} files for domain: {domain}") - + for idx, fp in enumerate(files): - print(f"[{idx+1}/{total_files}] Processing: {fp}") + print(f"[{idx + 1}/{total_files}] Processing: {fp}") - if fp.name.startswith('.'): + if fp.name.startswith("."): print(f"Skipping system file: {fp}") continue - - + docs = load_as_documents(fp) - if not docs: + if not docs: print(f"No documents loaded from: {fp}") continue - + chunks = chunk(docs) - if not chunks: + if not chunks: print(f"No chunks created from: {fp}") continue - + chunks = enrich_metadata(chunks, domain, fp) manifest += [c.metadata for c in 
chunks] - + # Process chunks in smaller batches for better memory management batch_size = 50 for i in range(0, len(chunks), batch_size): - batch = chunks[i:i+batch_size] + batch = chunks[i : i + batch_size] vdb._add_to_db(batch) - print(f" Added batch {i//batch_size + 1} ({len(batch)} chunks)") - - vdb.save_db(name=domain) - Path("../backend/data/manifests").mkdir(exist_ok=True, parents=True) - with open(f"../backend/data/manifests/{domain}.jsonl", "w") as f: + print(f" Added batch {i // batch_size + 1} ({len(batch)} chunks)") + + vdb.save_db(name=domain) + Path("data/manifests").mkdir(exist_ok=True, parents=True) + with open(f"data/manifests/{domain}.jsonl", "w") as f: for row in manifest: f.write(json.dumps(row) + "\n") @@ -105,4 +111,3 @@ def build_domain_index(domain): for dom in DOMAINS: build_domain_index(dom) print("All domain indexes built & manifests written.") - diff --git a/backend/src/dataset_gen_eval/quality_agents.py b/backend/src/dataset_gen_eval/quality_agents.py index 94cbb2e5..7201b66a 100644 --- a/backend/src/dataset_gen_eval/quality_agents.py +++ b/backend/src/dataset_gen_eval/quality_agents.py @@ -1,14 +1,9 @@ -import sys -from pathlib import Path +from typing import Any from dotenv import load_dotenv - -src_path = Path(__file__).parent.parent -sys.path.insert(0, str(src_path)) - from langchain_google_genai import ChatGoogleGenerativeAI -from deepeval.metrics import BaseMetric -from deepeval.test_case import LLMTestCase +from deepeval.metrics.base_metric import BaseMetric +from deepeval.test_case.llm_test_case import LLMTestCase load_dotenv() @@ -87,49 +82,50 @@ def __init__(self, threshold: float = 0.6): self.threshold = threshold self.evaluation_model = "gemini-2.5-pro" self.include_reason = True - - + self.llm = ChatGoogleGenerativeAI( model=self.evaluation_model, temperature=0.3, ) - def measure(self, tc: LLMTestCase) -> float: + def measure(self, test_case: LLMTestCase, *args: Any, **kwargs: Any) -> float: """Synchronous version of the metric evaluation.""" prompt = question_groundedness_critique_prompt.format( - question=tc.input, - context="\n".join(tc.context or []) + question=test_case.input, context="\n".join(test_case.context or []) ) - + try: - response = self.llm.invoke(prompt) - response_text = response.content.strip() - + response_content = response.content + response_text = response_content.strip() if isinstance(response_content, str) else str(response_content).strip() + # if "Total rating:" in response_text: rating_line = response_text.split("Total rating:")[-1].strip() - + score_1_5 = int(rating_line.split()[0]) else: - - lines = response_text.split('\n') - score_1_5 = 3 + lines = response_text.split("\n") + score_1_5 = 3 for line in lines: - if any(keyword in line.lower() for keyword in ['rating:', 'score:']): + if any( + keyword in line.lower() for keyword in ["rating:", "score:"] + ): try: - score_1_5 = int([char for char in line if char.isdigit()][0]) + score_1_5 = int( + [char for char in line if char.isdigit()][0] + ) break except (IndexError, ValueError): continue - + # Convert to 0-1 scale as expected by DeepEval self.score = score_1_5 / 5.0 self.reason = response_text self.success = self.score >= self.threshold - + return self.score - + except Exception as e: self.score = 0.0 self.reason = f"Error during evaluation: {str(e)}" @@ -137,66 +133,65 @@ def measure(self, tc: LLMTestCase) -> float: self.error = str(e) return self.score - async def a_measure(self, tc: LLMTestCase): + async def a_measure(self, test_case: LLMTestCase, *args: Any, 
**kwargs: Any) -> float: """Async version - fallback to synchronous since we don't have async client setup.""" - return self.measure(tc) + return self.measure(test_case, *args, **kwargs) def is_successful(self) -> bool: """Check if the metric evaluation was successful.""" - return False if getattr(self, "error", None) else self.success + return False if getattr(self, "error", None) else bool(getattr(self, "success", False)) - @property - def __name__(self): + def get_metric_name(self) -> str: return "Question Groundedness" class QuestionRelevanceMetric(BaseMetric): """DeepEval metric for evaluating question relevance to ML developers building NLP applications.""" - + def __init__(self, threshold: float = 0.6): self.threshold = threshold self.evaluation_model = "gemini-2.5-pro" self.include_reason = True - - + self.llm = ChatGoogleGenerativeAI( model=self.evaluation_model, temperature=0.3, ) - def measure(self, tc: LLMTestCase) -> float: + def measure(self, test_case: LLMTestCase, *args: Any, **kwargs: Any) -> float: """Synchronous version of the metric evaluation.""" - prompt = question_relevance_critique_prompt.format( - question=tc.input - ) - + prompt = question_relevance_critique_prompt.format(question=test_case.input) + try: - response = self.llm.invoke(prompt) - response_text = response.content.strip() - - + response_content = response.content + response_text = response_content.strip() if isinstance(response_content, str) else str(response_content).strip() + if "Total rating:" in response_text: rating_line = response_text.split("Total rating:")[-1].strip() score_1_5 = int(rating_line.split()[0]) else: - lines = response_text.split('\n') + lines = response_text.split("\n") score_1_5 = 3 # Default score for line in lines: - if any(keyword in line.lower() for keyword in ['rating:', 'score:']): + if any( + keyword in line.lower() for keyword in ["rating:", "score:"] + ): try: - score_1_5 = int([char for char in line if char.isdigit()][0]) + score_1_5 = int( + [char for char in line if char.isdigit()][0] + ) break except (IndexError, ValueError): continue - + # Convert to 0-1 scale as expected by DeepEval self.score = score_1_5 / 5.0 self.reason = response_text self.success = self.score >= self.threshold - + return self.score - + except Exception as e: self.score = 0.0 self.reason = f"Error during evaluation: {str(e)}" @@ -204,68 +199,65 @@ def measure(self, tc: LLMTestCase) -> float: self.error = str(e) return self.score - async def a_measure(self, tc: LLMTestCase): + async def a_measure(self, test_case: LLMTestCase, *args: Any, **kwargs: Any) -> float: """Async version - fallback to synchronous since we don't have async client setup.""" - return self.measure(tc) + return self.measure(test_case, *args, **kwargs) def is_successful(self) -> bool: """Check if the metric evaluation was successful.""" - return False if getattr(self, "error", None) else self.success + return False if getattr(self, "error", None) else bool(getattr(self, "success", False)) - @property - def __name__(self): + def get_metric_name(self) -> str: return "Question Relevance" class QuestionStandaloneMetric(BaseMetric): """DeepEval metric for evaluating question context-independence.""" - + def __init__(self, threshold: float = 0.6): self.threshold = threshold self.evaluation_model = "gemini-2.5-pro" self.include_reason = True - - + self.llm = ChatGoogleGenerativeAI( model=self.evaluation_model, temperature=0.3, ) - def measure(self, tc: LLMTestCase) -> float: + def measure(self, test_case: LLMTestCase, *args: Any, 
**kwargs: Any) -> float: """Synchronous version of the metric evaluation.""" - prompt = question_standalone_critique_prompt.format( - question=tc.input - ) - + prompt = question_standalone_critique_prompt.format(question=test_case.input) + try: - response = self.llm.invoke(prompt) - response_text = response.content.strip() - - + response_content = response.content + response_text = response_content.strip() if isinstance(response_content, str) else str(response_content).strip() + if "Total rating:" in response_text: rating_line = response_text.split("Total rating:")[-1].strip() - + score_1_5 = int(rating_line.split()[0]) else: - - lines = response_text.split('\n') + lines = response_text.split("\n") score_1_5 = 3 # Default score for line in lines: - if any(keyword in line.lower() for keyword in ['rating:', 'score:']): + if any( + keyword in line.lower() for keyword in ["rating:", "score:"] + ): try: - score_1_5 = int([char for char in line if char.isdigit()][0]) + score_1_5 = int( + [char for char in line if char.isdigit()][0] + ) break except (IndexError, ValueError): continue - - + self.score = score_1_5 / 5.0 self.reason = response_text self.success = self.score >= self.threshold - + return self.score - + except Exception as e: self.score = 0.0 self.reason = f"Error during evaluation: {str(e)}" @@ -273,14 +265,13 @@ def measure(self, tc: LLMTestCase) -> float: self.error = str(e) return self.score - async def a_measure(self, tc: LLMTestCase): + async def a_measure(self, test_case: LLMTestCase, *args: Any, **kwargs: Any) -> float: """Async version - fallback to synchronous since we don't have async client setup.""" - return self.measure(tc) + return self.measure(test_case, *args, **kwargs) def is_successful(self) -> bool: """Check if the metric evaluation was successful.""" - return False if getattr(self, "error", None) else self.success + return False if getattr(self, "error", None) else bool(getattr(self, "success", False)) - @property - def __name__(self): - return "Question Standalone" \ No newline at end of file + def get_metric_name(self) -> str: + return "Question Standalone"
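

Note on usage (not part of the patch series): the evaluation loop in eval_dataset.py above scores only the first QA pair before hitting `break`, which works as a smoke test. A minimal sketch of how the same three metrics could be applied across the whole generated dataset and used to filter low-quality questions is given below. It assumes the file sits next to eval_dataset.py (so the relative import resolves) and is run as a module from the backend package root, the same way eval_dataset.py would be once patch 5 removes the sys.path hacks; the filter_qa_pairs helper, the min_score parameter, and the output filename are illustrative assumptions, not code from this PR.

import json
from typing import Dict, List

from deepeval.test_case.llm_test_case import LLMTestCase

from .quality_agents import (
    GroundednessMetric,
    QuestionRelevanceMetric,
    QuestionStandaloneMetric,
)


def filter_qa_pairs(
    qa_pairs: List[Dict[str, str]], min_score: float = 0.6
) -> List[Dict[str, str]]:
    """Keep only QA pairs whose question clears min_score on all three metrics."""
    metrics = [
        GroundednessMetric(),
        QuestionRelevanceMetric(),
        QuestionStandaloneMetric(),
    ]
    kept = []
    for entry in qa_pairs:
        test_case = LLMTestCase(
            input=entry["question"],
            actual_output="",
            context=[entry["context"]],
        )
        # measure() maps the model's 1-5 rating onto a 0-1 scale.
        scores = {m.get_metric_name(): m.measure(test_case) for m in metrics}
        if all(score >= min_score for score in scores.values()):
            kept.append({**entry, "quality_scores": scores})
    return kept


if __name__ == "__main__":
    # Same input path as eval_dataset.py; the filtered output name is hypothetical.
    with open("data/generated_qa_pairs_gemini_pro_new.json") as f:
        pairs = json.load(f)
    filtered = filter_qa_pairs(pairs)
    with open("data/filtered_qa_pairs.json", "w") as f:
        json.dump(filtered, f, indent=2)
    print(f"Kept {len(filtered)} of {len(pairs)} QA pairs")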