diff --git a/backend/pyproject.toml b/backend/pyproject.toml
index 9e26a877..c08b6839 100644
--- a/backend/pyproject.toml
+++ b/backend/pyproject.toml
@@ -40,6 +40,10 @@ ignore_missing_imports = true
 module = "transformers.*"
 ignore_missing_imports = true
 
+[[tool.mypy.overrides]]
+module = "deepeval.*"
+ignore_missing_imports = true
+
 [tool.ruff]
 exclude = [
     ".bzr",
@@ -75,7 +79,7 @@ target-version = "py310"
 
 [tool.ruff.lint]
 select = ["E4", "E7", "E9","E301","E304","E305","E401","E223","E224","E242", "E", "F" ,"N", "W", "C90"]
-extend-select = ["D203", "D204"]
+extend-select = ["D204"]
 ignore = ["E501"]
 
 preview = true
diff --git a/backend/requirements-test.txt b/backend/requirements-test.txt
index 5155631f..3928562f 100644
--- a/backend/requirements-test.txt
+++ b/backend/requirements-test.txt
@@ -6,3 +6,4 @@ types-tqdm==4.66.0.20240417
 types-beautifulsoup4==4.12.0.20240511
 ruff==0.5.1
 pre-commit==3.7.1
+deepeval==3.2.0
diff --git a/backend/src/dataset_gen_eval/eval_dataset.py b/backend/src/dataset_gen_eval/eval_dataset.py
new file mode 100644
index 00000000..0d335929
--- /dev/null
+++ b/backend/src/dataset_gen_eval/eval_dataset.py
@@ -0,0 +1,40 @@
+import json
+from deepeval.test_case.llm_test_case import LLMTestCase
+from .quality_agents import (
+    GroundednessMetric,
+    QuestionRelevanceMetric,
+    QuestionStandaloneMetric,
+)
+
+
+json_path = "data/generated_qa_pairs_gemini_pro_new.json"
+
+# Loading questions
+with open(json_path, "r") as f:
+    qa_pairs = json.load(f)
+
+# Initializing metrics
+groundedness_metric = GroundednessMetric()
+relevance_metric = QuestionRelevanceMetric()
+standalone_metric = QuestionStandaloneMetric()
+
+
+for entry in qa_pairs:
+    question = entry["question"]
+    answer = entry["answer"]
+    context = entry["context"]
+
+    test_case_question = LLMTestCase(
+        input=question, actual_output="", context=[context]
+    )
+
+    groundedness_score = groundedness_metric.measure(test_case_question)
+    relevance_score = relevance_metric.measure(test_case_question)
+    standalone_score = standalone_metric.measure(test_case_question)
+
+    print(f"Question: {question}")
+    print(f"  Groundedness: {groundedness_score:.2f} ({groundedness_metric.reason})")
+    print(f"  Relevance: {relevance_score:.2f} ({relevance_metric.reason})")
+    print(f"  Standalone: {standalone_score:.2f} ({standalone_metric.reason})")
+    print("-" * 60)
+    break
diff --git a/backend/src/dataset_gen_eval/generate_qa_pairs.py b/backend/src/dataset_gen_eval/generate_qa_pairs.py
new file mode 100644
index 00000000..bd7c5602
--- /dev/null
+++ b/backend/src/dataset_gen_eval/generate_qa_pairs.py
@@ -0,0 +1,239 @@
+import json
+from pathlib import Path
+from typing import List, Dict, Optional
+import random
+
+from ..vectorstores.faiss import FAISSVectorDatabase
+from langchain_google_genai import ChatGoogleGenerativeAI
+from langchain.docstore.document import Document
+from dotenv import load_dotenv
+
+load_dotenv()
+
+
+# command_reference is not included because it was not indexed
+DOMAINS = [
+    "installation_guides",
+    "error_messages",
+    "opensta_yosys_klayout",
+    "general_openroad",
+]
+
+QA_PAIRS_PER_DOMAIN = 10
+
+QA_GENERATION_PROMPT = """
+Your task is to write a factoid question and an answer given a context.
+Your factoid question should be answerable with a specific, concise piece of factual information from the context.
+Your factoid question should be formulated in the same style as questions users could ask in a search engine.
+This means that your factoid question MUST NOT mention something like "according to the passage" or "context".
+
+Provide your answer as follows:
+
+Output:::
+Factoid question: (your factoid question)
+Answer: (your answer to the factoid question)
+
+Now here is the context.
+
+Context: {context}\n
+Output:::"""
+
+
+def load_domain_database(domain: str) -> Optional[FAISSVectorDatabase]:
+    """Load the FAISS vector database for a specific domain."""
+    print(f"Loading vector database for domain: {domain}")
+
+    vdb = FAISSVectorDatabase(
+        embeddings_type="HF",
+        embeddings_model_name="sentence-transformers/all-MiniLM-L6-v2",
+    )
+
+    try:
+        vdb.load_db(name=domain)
+        print(f"Successfully loaded {domain} database")
+        return vdb
+    except Exception as e:
+        print(f"Error loading database for {domain}: {e}")
+        return None
+
+
+def sample_documents_from_db(
+    vdb: FAISSVectorDatabase, num_samples: int = 5
+) -> List[Document]:
+    """Sample random documents from the vector database to use for QA generation."""
+    try:
+        all_docs = list(vdb.get_documents())
+        print(f"Total documents in database: {len(all_docs)}")
+
+        # Sample random documents
+        sample_size = min(num_samples, len(all_docs))
+        sampled_docs = random.sample(all_docs, sample_size)
+
+        print(f"Sampled {len(sampled_docs)} documents")
+        return sampled_docs
+
+    except Exception as e:
+        print(f"Error sampling documents: {e}")
+        return []
+
+
+def generate_qa_pairs_for_content(
+    all_docs: List[Document], domain: str, num_qa: int = 5
+) -> List[Dict[str, str]]:
+    """Use Gemini to generate QA pairs from the given documents."""
+    try:
+        # Initialize Gemini model
+        llm = ChatGoogleGenerativeAI(
+            model="gemini-2.5-pro",
+            temperature=0.3,
+        )
+
+        print(f"Generating {num_qa} QA pairs for {domain} domain...")
+
+        all_qa_pairs = []
+
+        for i in range(num_qa):
+            try:
+                # Sample different documents for each QA pair to get variety
+                sample_size = min(5, len(all_docs))
+                sampled_docs = random.sample(all_docs, sample_size)
+
+                # Combine content from this sample
+                content = "\n\n---DOCUMENT SEPARATOR---\n\n".join(
+                    [doc.page_content for doc in sampled_docs]
+                )
+
+                prompt = QA_GENERATION_PROMPT.format(context=content[:15000])
+
+                print(f"  Generating QA pair {i + 1}/{num_qa}...")
+
+                # TODO: add Gemini cost analysis here (e.g. via LangSmith)
+                response = llm.invoke(prompt)
+
+                response_content = response.content
+                response_text = response_content.strip() if isinstance(response_content, str) else str(response_content).strip()
+
+                if "Output:::" in response_text:
+                    output_section = response_text.split("Output:::")[-1].strip()
+
+                    lines = output_section.split("\n")
+                    question = ""
+                    answer = ""
+
+                    for line in lines:
+                        line = line.strip()
+                        if line.startswith("Factoid question:"):
+                            question = line.replace("Factoid question:", "").strip()
+                        elif line.startswith("Answer:"):
+                            answer = line.replace("Answer:", "").strip()
+
+                    if question and answer:
+                        qa_pair = {
+                            "question": question,
+                            "answer": answer,
+                            "domain": domain,
+                            "source": "generated_from_docs",  # TODO: record the source document here
+                            "context": content[
+                                :15000
+                            ],  # Add the context used for generation
+                        }
+                        all_qa_pairs.append(qa_pair)
+                        print(f"Generated: {question[:50]}...")
+                    else:
+                        print("Failed to parse QA pair from response")
+                        print(f"Raw response: {response_text[:200]}...")
+                else:
+                    print("No 'Output:::' section found in response")
+                    print(f"Raw response: {response_text[:200]}...")
+
+            except Exception as e:
+                print(f"Error generating QA pair {i + 1}: {e}")
+                continue
+
+        print(
+            f"Successfully generated {len(all_qa_pairs)} QA pairs out of {num_qa} attempts"
+        )
+        return all_qa_pairs
+
+    except Exception as e:
+        print(f"Error in QA generation process: {e}")
+        return []
+
+
+def process_domain(domain: str, qa_per_domain: int = 10) -> List[Dict[str, str]]:
+    """Process a single domain to generate QA pairs."""
+    print(f"\n{'=' * 50}")
+    print(f"Processing domain: {domain}")
+    print(f"{'=' * 50}")
+
+    # Load the vector database
+    vdb = load_domain_database(domain)
+    if not vdb:
+        return []
+
+    # Sample documents from the database
+    sampled_docs = sample_documents_from_db(vdb, num_samples=100)
+    if not sampled_docs:
+        return []
+
+    print(f"Will generate QA pairs from pool of {len(sampled_docs)} documents")
+
+    # Generate QA pairs (each QA pair will sample different docs)
+    qa_pairs = generate_qa_pairs_for_content(
+        all_docs=sampled_docs, domain=domain, num_qa=qa_per_domain
+    )
+
+    return qa_pairs
+
+
+def save_qa_pairs(all_qa_pairs: List[Dict[str, str]], output_file: str):
+    """Save the generated QA pairs to a JSON file."""
+    output_path = Path(output_file)
+    output_path.parent.mkdir(parents=True, exist_ok=True)
+
+    with open(output_path, "w", encoding="utf-8") as f:
+        json.dump(all_qa_pairs, f, indent=2, ensure_ascii=False)
+
+    print(f"\nSaved {len(all_qa_pairs)} QA pairs to: {output_path}")
+
+
+def main():
+    """Main function to generate QA pairs for all domains."""
+    print("Starting QA pair generation...")
+    print(f"Target domains: {DOMAINS}")
+    print(f"QA pairs per domain: {QA_PAIRS_PER_DOMAIN}")
+
+    all_qa_pairs = []
+
+    for domain in DOMAINS:
+        try:
+            qa_pairs = process_domain(domain, QA_PAIRS_PER_DOMAIN)
+            all_qa_pairs.extend(qa_pairs)
+            print(f"Generated {len(qa_pairs)} QA pairs for {domain}")
+
+        except Exception as e:
+            print(f"Error processing domain {domain}: {e}")
+            continue
+
+    # Save all QA pairs
+    if all_qa_pairs:
+        output_file = "data/generated_qa_pairs_gemini_pro_new.json"
+        save_qa_pairs(all_qa_pairs, output_file)
+
+        print(f"{'=' * 50}")
+        print(f"Total QA pairs generated: {len(all_qa_pairs)}")
+
+        domain_counts = {}
+        for qa in all_qa_pairs:
+            domain = qa.get("domain", "unknown")
+            domain_counts[domain] = domain_counts.get(domain, 0) + 1
+
+        for domain, count in domain_counts.items():
+            print(f"  {domain}: {count} pairs")
+
+    else:
+        print("No QA pairs were generated!")
+
+
+if __name__ == "__main__":
+    main()
diff --git a/backend/src/dataset_gen_eval/ingest_doc.py b/backend/src/dataset_gen_eval/ingest_doc.py
new file mode 100644
index 00000000..ceaffd75
--- /dev/null
+++ b/backend/src/dataset_gen_eval/ingest_doc.py
@@ -0,0 +1,113 @@
+from pathlib import Path
+import json
+import uuid
+from typing import List
+
+from langchain.text_splitter import RecursiveCharacterTextSplitter
+from ..vectorstores.faiss import FAISSVectorDatabase
+from ..tools.process_pdf import process_pdf_docs
+from ..tools.process_md import process_md
+
+DOMAINS = [
+    "general_openroad",
+    # "command_reference"
+    "installation_guides",
+    "error_messages",
+    "opensta_yosys_klayout",
+]
+
+
+# discover files in the raw data dir
+def discover_files(domain: str) -> List[Path]:
+    root = Path(f"data/raw/{domain}")
+    # getting all the files
+    return list(root.rglob("*.*"))
+
+
+# document loader function
+def load_as_documents(path):
+    suffix = path.suffix.lower()
+    if suffix == ".pdf":
+        return process_pdf_docs(str(path))
+    elif suffix in {".md", ".markdown"}:
+        return process_md(str(path.parent), split_text=False)
+    elif suffix == ".html":
+        print("HTML files skipped for now")
+        return []
+    else:
+        return []
+
+
+# chunking function
+def chunk(docs, size=700, overlap=70):
+    splitter = RecursiveCharacterTextSplitter(
+        chunk_size=size, chunk_overlap=overlap, add_start_index=True
+    )
+    out = []
+    for d in docs:
+        out.extend(splitter.split_documents([d]))
+    return out
+
+
+# adding metadata to each chunk
+def enrich_metadata(chunks, domain, source_path):
+    for doc in chunks:
+        doc.metadata.update(
+            {
+                "domain": domain,
+                "doc_path": str(source_path.relative_to("data/raw")),
+                "chunk_id": f"{uuid.uuid4()}",
+            }
+        )
+    return chunks
+
+
+def build_domain_index(domain):
+    vdb = FAISSVectorDatabase(
+        embeddings_type="HF",
+        embeddings_model_name="sentence-transformers/all-MiniLM-L6-v2",
+    )
+    manifest = []
+
+    files = discover_files(domain)
+    total_files = len(files)
+    print(f"Processing {total_files} files for domain: {domain}")
+
+    for idx, fp in enumerate(files):
+        print(f"[{idx + 1}/{total_files}] Processing: {fp}")
+
+        if fp.name.startswith("."):
+            print(f"Skipping system file: {fp}")
+            continue
+
+        docs = load_as_documents(fp)
+        if not docs:
+            print(f"No documents loaded from: {fp}")
+            continue
+
+        chunks = chunk(docs)
+        if not chunks:
+            print(f"No chunks created from: {fp}")
+            continue
+
+        chunks = enrich_metadata(chunks, domain, fp)
+        manifest += [c.metadata for c in chunks]
+
+        # Process chunks in smaller batches for better memory management
+        batch_size = 50
+        for i in range(0, len(chunks), batch_size):
+            batch = chunks[i : i + batch_size]
+            vdb._add_to_db(batch)
+            print(f"  Added batch {i // batch_size + 1} ({len(batch)} chunks)")
+
+    vdb.save_db(name=domain)
+    Path("data/manifests").mkdir(exist_ok=True, parents=True)
+    with open(f"data/manifests/{domain}.jsonl", "w") as f:
+        for row in manifest:
+            f.write(json.dumps(row) + "\n")
+
+
+if __name__ == "__main__":
+    for dom in DOMAINS:
+        build_domain_index(dom)
+    print("All domain indexes built & manifests written.")
diff --git a/backend/src/dataset_gen_eval/quality_agents.py b/backend/src/dataset_gen_eval/quality_agents.py
new file mode 100644
index 00000000..7201b66a
--- /dev/null
+++ b/backend/src/dataset_gen_eval/quality_agents.py
@@ -0,0 +1,277 @@
+from typing import Any
+from dotenv import load_dotenv
+
+from langchain_google_genai import ChatGoogleGenerativeAI
+from deepeval.metrics.base_metric import BaseMetric
+from deepeval.test_case.llm_test_case import LLMTestCase
+
+load_dotenv()
+
+# Groundedness critique prompt template
+question_groundedness_critique_prompt = """
+You are an expert evaluator tasked with assessing the groundedness of a question-answer pair.
+
+Your task is to evaluate whether the given question is well-grounded in the provided context.
+A well-grounded question should:
+1. Be answerable using information from the context
+2. Not require external knowledge beyond the context
+3. Be specific and factual rather than speculative
+4. Have clear supporting evidence in the context
+
+Please analyze the question and context, then provide a rating from 1 to 5:
+- 1: Completely ungrounded - question cannot be answered from context
+- 2: Poorly grounded - question requires significant external knowledge
+- 3: Moderately grounded - question is partially answerable from context
+- 4: Well grounded - question is mostly answerable from context
+- 5: Perfectly grounded - question is completely answerable from context
+
+Question: {question}
+
+Context: {context}
+
+Please provide your evaluation in the following format:
+Analysis: [Your detailed analysis of why the question is or isn't grounded in the context]
+Total rating: [Your rating from 1 to 5]
+"""
+
+# Question relevance critique prompt template
+question_relevance_critique_prompt = """
+You will be given a question.
+Your task is to provide a 'total rating' representing how useful this question can be to machine learning developers building NLP applications with the Hugging Face ecosystem.
+Give your answer on a scale of 1 to 5, where 1 means that the question is not useful at all, and 5 means that the question is extremely useful.
+
+Provide your answer as follows:
+
+Answer:::
+Evaluation: (your rationale for the rating, as a text)
+Total rating: (your rating, as a number between 1 and 5)
+
+You MUST provide values for 'Evaluation:' and 'Total rating:' in your answer.
+
+Now here is the question.
+
+Question: {question}
+Answer::: """
+
+# Question standalone critique prompt template
+question_standalone_critique_prompt = """
+You will be given a question.
+Your task is to provide a 'total rating' representing how context-independent this question is.
+Give your answer on a scale of 1 to 5, where 1 means that the question depends on additional information to be understood, and 5 means that the question makes sense by itself.
+For instance, if the question refers to a particular setting, like 'in the context' or 'in the document', the rating must be 1.
+The questions can contain obscure technical nouns or acronyms like Gradio, Hub, Hugging Face or Space and still be a 5: it must simply be clear to an operator with access to documentation what the question is about.
+
+For instance, "What is the name of the checkpoint from which the ViT model is imported?" should receive a 1, since there is an implicit mention of a context, thus the question is not independent from the context.
+
+Provide your answer as follows:
+
+Answer:::
+Evaluation: (your rationale for the rating, as a text)
+Total rating: (your rating, as a number between 1 and 5)
+
+You MUST provide values for 'Evaluation:' and 'Total rating:' in your answer.
+
+Now here is the question.
+
+Question: {question}
+Answer::: """
+
+
+class GroundednessMetric(BaseMetric):
+    def __init__(self, threshold: float = 0.6):
+        self.threshold = threshold
+        self.evaluation_model = "gemini-2.5-pro"
+        self.include_reason = True
+
+        self.llm = ChatGoogleGenerativeAI(
+            model=self.evaluation_model,
+            temperature=0.3,
+        )
+
+    def measure(self, test_case: LLMTestCase, *args: Any, **kwargs: Any) -> float:
+        """Synchronous version of the metric evaluation."""
+        prompt = question_groundedness_critique_prompt.format(
+            question=test_case.input, context="\n".join(test_case.context or [])
+        )
+
+        try:
+            response = self.llm.invoke(prompt)
+            response_content = response.content
+            response_text = response_content.strip() if isinstance(response_content, str) else str(response_content).strip()
+
+            # Parse the 1-5 rating from the model response
+            if "Total rating:" in response_text:
+                rating_line = response_text.split("Total rating:")[-1].strip()
+
+                score_1_5 = int(rating_line.split()[0])
+            else:
+                lines = response_text.split("\n")
+                score_1_5 = 3
+                for line in lines:
+                    if any(
+                        keyword in line.lower() for keyword in ["rating:", "score:"]
+                    ):
+                        try:
+                            score_1_5 = int(
+                                [char for char in line if char.isdigit()][0]
+                            )
+                            break
+                        except (IndexError, ValueError):
+                            continue
+
+            # Convert to 0-1 scale as expected by DeepEval
+            self.score = score_1_5 / 5.0
+            self.reason = response_text
+            self.success = self.score >= self.threshold
+
+            return self.score
+
+        except Exception as e:
+            self.score = 0.0
+            self.reason = f"Error during evaluation: {str(e)}"
+            self.success = False
+            self.error = str(e)
+            return self.score
+
+    async def a_measure(self, test_case: LLMTestCase, *args: Any, **kwargs: Any) -> float:
+        """Async version - fallback to synchronous since we don't have async client setup."""
+        return self.measure(test_case, *args, **kwargs)
+
+    def is_successful(self) -> bool:
+        """Check if the metric evaluation was successful."""
+        return False if getattr(self, "error", None) else bool(getattr(self, "success", False))
+
+    def get_metric_name(self) -> str:
+        return "Question Groundedness"
+
+
+class QuestionRelevanceMetric(BaseMetric):
+    """DeepEval metric for evaluating question relevance to ML developers building NLP applications."""
+
+    def __init__(self, threshold: float = 0.6):
+        self.threshold = threshold
+        self.evaluation_model = "gemini-2.5-pro"
+        self.include_reason = True
+
+        self.llm = ChatGoogleGenerativeAI(
+            model=self.evaluation_model,
+            temperature=0.3,
+        )
+
+    def measure(self, test_case: LLMTestCase, *args: Any, **kwargs: Any) -> float:
+        """Synchronous version of the metric evaluation."""
+        prompt = question_relevance_critique_prompt.format(question=test_case.input)
+
+        try:
+            response = self.llm.invoke(prompt)
+            response_content = response.content
+            response_text = response_content.strip() if isinstance(response_content, str) else str(response_content).strip()
+
+            if "Total rating:" in response_text:
+                rating_line = response_text.split("Total rating:")[-1].strip()
+                score_1_5 = int(rating_line.split()[0])
+            else:
+                lines = response_text.split("\n")
+                score_1_5 = 3  # Default score
+                for line in lines:
+                    if any(
+                        keyword in line.lower() for keyword in ["rating:", "score:"]
+                    ):
+                        try:
+                            score_1_5 = int(
+                                [char for char in line if char.isdigit()][0]
+                            )
+                            break
+                        except (IndexError, ValueError):
+                            continue
+
+            # Convert to 0-1 scale as expected by DeepEval
+            self.score = score_1_5 / 5.0
+            self.reason = response_text
+            self.success = self.score >= self.threshold
+
+            return self.score
+
+        except Exception as e:
+            self.score = 0.0
+            self.reason = f"Error during evaluation: {str(e)}"
+            self.success = False
+            self.error = str(e)
+            return self.score
+
+    async def a_measure(self, test_case: LLMTestCase, *args: Any, **kwargs: Any) -> float:
+        """Async version - fallback to synchronous since we don't have async client setup."""
+        return self.measure(test_case, *args, **kwargs)
+
+    def is_successful(self) -> bool:
+        """Check if the metric evaluation was successful."""
+        return False if getattr(self, "error", None) else bool(getattr(self, "success", False))
+
+    def get_metric_name(self) -> str:
+        return "Question Relevance"
+
+
+class QuestionStandaloneMetric(BaseMetric):
+    """DeepEval metric for evaluating question context-independence."""
+
+    def __init__(self, threshold: float = 0.6):
+        self.threshold = threshold
+        self.evaluation_model = "gemini-2.5-pro"
+        self.include_reason = True
+
+        self.llm = ChatGoogleGenerativeAI(
+            model=self.evaluation_model,
+            temperature=0.3,
+        )
+
+    def measure(self, test_case: LLMTestCase, *args: Any, **kwargs: Any) -> float:
+        """Synchronous version of the metric evaluation."""
+        prompt = question_standalone_critique_prompt.format(question=test_case.input)
+
+        try:
+            response = self.llm.invoke(prompt)
+            response_content = response.content
+            response_text = response_content.strip() if isinstance(response_content, str) else str(response_content).strip()
+
+            if "Total rating:" in response_text:
+                rating_line = response_text.split("Total rating:")[-1].strip()
+
+                score_1_5 = int(rating_line.split()[0])
+            else:
+                lines = response_text.split("\n")
+                score_1_5 = 3  # Default score
+                for line in lines:
+                    if any(
+                        keyword in line.lower() for keyword in ["rating:", "score:"]
+                    ):
+                        try:
+                            score_1_5 = int(
+                                [char for char in line if char.isdigit()][0]
+                            )
+                            break
+                        except (IndexError, ValueError):
+                            continue
+
+            self.score = score_1_5 / 5.0
+            self.reason = response_text
+            self.success = self.score >= self.threshold
+
+            return self.score
+
+        except Exception as e:
+            self.score = 0.0
+            self.reason = f"Error during evaluation: {str(e)}"
+            self.success = False
+            self.error = str(e)
+            return self.score
+
+    async def a_measure(self, test_case: LLMTestCase, *args: Any, **kwargs: Any) -> float:
+        """Async version - fallback to synchronous since we don't have async client setup."""
+        return self.measure(test_case, *args, **kwargs)
+
+    def is_successful(self) -> bool:
+        """Check if the metric evaluation was successful."""
+        return False if getattr(self, "error", None) else bool(getattr(self, "success", False))
+
+    def get_metric_name(self) -> str:
+        return "Question Standalone"
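Note: eval_dataset.py above scores only the first QA pair (it breaks out of the loop after one entry). Below is a minimal follow-up sketch, not part of the diff, showing how the same metrics could score the full dataset and keep only the pairs that clear every threshold. The output filename is a placeholder, and the 0.6 cut-off simply reuses the default threshold from quality_agents.py; the script would live next to eval_dataset.py and be run the same way so the relative import resolves.

import json
from deepeval.test_case.llm_test_case import LLMTestCase
from .quality_agents import (
    GroundednessMetric,
    QuestionRelevanceMetric,
    QuestionStandaloneMetric,
)

INPUT_PATH = "data/generated_qa_pairs_gemini_pro_new.json"
OUTPUT_PATH = "data/filtered_qa_pairs.json"  # placeholder name, not in the diff

with open(INPUT_PATH, "r") as f:
    qa_pairs = json.load(f)

metrics = [GroundednessMetric(), QuestionRelevanceMetric(), QuestionStandaloneMetric()]

filtered = []
for entry in qa_pairs:
    test_case = LLMTestCase(
        input=entry["question"], actual_output="", context=[entry["context"]]
    )
    # Keep the pair only if every metric meets its own threshold (0.6 by default).
    if all(metric.measure(test_case) >= metric.threshold for metric in metrics):
        filtered.append(entry)

with open(OUTPUT_PATH, "w", encoding="utf-8") as f:
    json.dump(filtered, f, indent=2, ensure_ascii=False)

print(f"Kept {len(filtered)} of {len(qa_pairs)} QA pairs")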