From 6d55182831134cd8bbee8e7d9de12531f78df2bf Mon Sep 17 00:00:00 2001 From: Kannav02 Date: Wed, 9 Jul 2025 13:17:44 -0400 Subject: [PATCH 1/5] feat: added the file to ingest the document to vector db Signed-off-by: Kannav02 --- backend/src/dataset_gen_eval/ingest_doc.py | 108 +++++++++++++++++++++ 1 file changed, 108 insertions(+) create mode 100644 backend/src/dataset_gen_eval/ingest_doc.py diff --git a/backend/src/dataset_gen_eval/ingest_doc.py b/backend/src/dataset_gen_eval/ingest_doc.py new file mode 100644 index 00000000..49b95be1 --- /dev/null +++ b/backend/src/dataset_gen_eval/ingest_doc.py @@ -0,0 +1,108 @@ +from pathlib import Path +import json, uuid +import sys + +src_path = Path(__file__).parent.parent +sys.path.insert(0, str(src_path)) + +from langchain.text_splitter import RecursiveCharacterTextSplitter +from src.vectorstores.faiss import FAISSVectorDatabase +from tools.process_pdf import process_pdf_docs +from tools.process_md import process_md + +DOMAINS = ["general_openroad", + #"command_reference" + "installation_guides", "error_messages", + "opensta_yosys_klayout" + ] + +# discover files in the raw data dir +def discover_files(domain:str)->list: + root = Path(f"../backend/data/raw/{domain}") + # getting all the files + return list(root.rglob("*.*")) + +# document loader function +def load_as_documents(path): + + + suffix = path.suffix.lower() + if suffix == ".pdf": + return process_pdf_docs(str(path)) + elif suffix in {".md", ".markdown"}: + return process_md(str(path.parent), split_text=False) + elif suffix == ".html": + print("HTML Files skipped for now") + return [] + else: + return [] + +# chunking function +def chunk(docs, size=700, overlap=70): + splitter = RecursiveCharacterTextSplitter( + chunk_size=size, chunk_overlap=overlap, add_start_index=True) + out = [] + for d in docs: + out.extend(splitter.split_documents([d])) + return out + +# adding metadata to each chunk +def enrich_metadata(chunks, domain, source_path): + for idx, doc in enumerate(chunks): + doc.metadata.update({ + "domain": domain, + "doc_path": str(source_path.relative_to('../backend/data/raw')), + "chunk_id": f"{uuid.uuid4()}", + }) + return chunks + + +def build_domain_index(domain): + vdb = FAISSVectorDatabase(embeddings_type="HF", + embeddings_model_name="sentence-transformers/all-MiniLM-L6-v2") + manifest = [] + + files = discover_files(domain) + total_files = len(files) + print(f"Processing {total_files} files for domain: {domain}") + + for idx, fp in enumerate(files): + print(f"[{idx+1}/{total_files}] Processing: {fp}") + + if fp.name.startswith('.'): + print(f"Skipping system file: {fp}") + continue + + + docs = load_as_documents(fp) + if not docs: + print(f"No documents loaded from: {fp}") + continue + + chunks = chunk(docs) + if not chunks: + print(f"No chunks created from: {fp}") + continue + + chunks = enrich_metadata(chunks, domain, fp) + manifest += [c.metadata for c in chunks] + + # Process chunks in smaller batches for better memory management + batch_size = 50 + for i in range(0, len(chunks), batch_size): + batch = chunks[i:i+batch_size] + vdb._add_to_db(batch) + print(f" Added batch {i//batch_size + 1} ({len(batch)} chunks)") + + vdb.save_db(name=domain) + Path("../backend/data/manifests").mkdir(exist_ok=True, parents=True) + with open(f"../backend/data/manifests/{domain}.jsonl", "w") as f: + for row in manifest: + f.write(json.dumps(row) + "\n") + + +if __name__ == "__main__": + for dom in DOMAINS: + build_domain_index(dom) + print("All domain indexes built & manifests 
written.") + From 5f839c8a010eba28bbd37ad00de5576846674950 Mon Sep 17 00:00:00 2001 From: Kannav02 Date: Wed, 9 Jul 2025 13:45:50 -0400 Subject: [PATCH 2/5] feat: added the file to generate the qa pairs Signed-off-by: Kannav02 --- .../src/dataset_gen_eval/generate_qa_pairs.py | 246 ++++++++++++++++++ 1 file changed, 246 insertions(+) create mode 100644 backend/src/dataset_gen_eval/generate_qa_pairs.py diff --git a/backend/src/dataset_gen_eval/generate_qa_pairs.py b/backend/src/dataset_gen_eval/generate_qa_pairs.py new file mode 100644 index 00000000..028d4f2d --- /dev/null +++ b/backend/src/dataset_gen_eval/generate_qa_pairs.py @@ -0,0 +1,246 @@ +import json +import sys +from pathlib import Path +from typing import List, Dict +import random + +# Add src to path for imports +src_path = Path(__file__).parent.parent +sys.path.insert(0, str(src_path)) + +from src.vectorstores.faiss import FAISSVectorDatabase +from langchain_google_genai import ChatGoogleGenerativeAI +from langchain.docstore.document import Document +from dotenv import load_dotenv + +load_dotenv() + + +# didn't include command_reference cause didn't index it +DOMAINS = [ + "installation_guides", + "error_messages", + "opensta_yosys_klayout", + "general_openroad" +] + +QA_PAIRS_PER_DOMAIN = 10 + +QA_GENERATION_PROMPT = """ +Your task is to write a factoid question and an answer given a context. +Your factoid question should be answerable with a specific, concise piece of factual information from the context. +Your factoid question should be formulated in the same style as questions users could ask in a search engine. +This means that your factoid question MUST NOT mention something like "according to the passage" or "context". + +Provide your answer as follows: + +Output::: +Factoid question: (your factoid question) +Answer: (your answer to the factoid question) + +Now here is the context. 
+ +Context: {context}\n +Output:::""" + + +def load_domain_database(domain: str) -> FAISSVectorDatabase: + """Load the FAISS vector database for a specific domain.""" + print(f"Loading vector database for domain: {domain}") + + vdb = FAISSVectorDatabase( + embeddings_type="HF", + embeddings_model_name="sentence-transformers/all-MiniLM-L6-v2" + ) + + try: + vdb.load_db(name=domain) + print(f"Successfully loaded {domain} database") + return vdb + except Exception as e: + print(f"Error loading database for {domain}: {e}") + return None + + +def sample_documents_from_db(vdb: FAISSVectorDatabase, num_samples: int = 5) -> List[Document]: + """Sample random documents from the vector database to use for QA generation.""" + try: + all_docs = list(vdb.get_documents()) + print(f"Total documents in database: {len(all_docs)}") + + # Sample random documents + sample_size = min(num_samples, len(all_docs)) + sampled_docs = random.sample(all_docs, sample_size) + + print(f"Sampled {len(sampled_docs)} documents") + return sampled_docs + + except Exception as e: + print(f"Error sampling documents: {e}") + return [] + + +def generate_qa_pairs_for_content(all_docs: List[Document], domain: str, num_qa: int = 5) -> List[Dict[str, str]]: + """Use Gemini to generate QA pairs from the given documents.""" + try: + # Initialize Gemini model + llm = ChatGoogleGenerativeAI( + model="gemini-2.5-pro", + temperature=0.3, + ) + + print(f"Generating {num_qa} QA pairs for {domain} domain...") + + all_qa_pairs = [] + + + for i in range(num_qa): + try: + # Sample different documents for each QA pair to get variety + sample_size = min(5, len(all_docs)) + sampled_docs = random.sample(all_docs, sample_size) + + # Combine content from this sample + content = "\n\n---DOCUMENT SEPARATOR---\n\n".join([ + doc.page_content for doc in sampled_docs + ]) + + prompt = QA_GENERATION_PROMPT.format( + context=content[:15000] + ) + + print(f" Generating QA pair {i+1}/{num_qa}...") + + # gemini cost analysis here, langsmith + response = llm.invoke(prompt) + + response_text = response.content.strip() + + + if "Output:::" in response_text: + output_section = response_text.split("Output:::")[-1].strip() + + + lines = output_section.split('\n') + question = "" + answer = "" + + for line in lines: + line = line.strip() + if line.startswith("Factoid question:"): + question = line.replace("Factoid question:", "").strip() + elif line.startswith("Answer:"): + answer = line.replace("Answer:", "").strip() + + + if question and answer: + qa_pair = { + "question": question, + "answer": answer, + "domain": domain, + "source": "generated_from_docs", # context source add here + "context": content[:15000] # Add the context used for generation + } + all_qa_pairs.append(qa_pair) + print(f"Generated: {question[:50]}...") + else: + print(f"Failed to parse QA pair from response") + print(f"Raw response: {response_text[:200]}...") + else: + print(f"No 'Output:::' section found in response") + print(f"Raw response: {response_text[:200]}...") + + except Exception as e: + print(f"Error generating QA pair {i+1}: {e}") + continue + + print(f"Successfully generated {len(all_qa_pairs)} QA pairs out of {num_qa} attempts") + return all_qa_pairs + + except Exception as e: + print(f"Error in QA generation process: {e}") + return [] + + +def process_domain(domain: str, qa_per_domain: int = 10) -> List[Dict[str, str]]: + """Process a single domain to generate QA pairs.""" + print(f"\n{'='*50}") + print(f"Processing domain: {domain}") + print(f"{'='*50}") + + # Load the vector 
database + vdb = load_domain_database(domain) + if not vdb: + return [] + + # Sample documents from the database + sampled_docs = sample_documents_from_db(vdb, num_samples=100) + if not sampled_docs: + return [] + + print(f"Will generate QA pairs from pool of {len(sampled_docs)} documents") + + # Generate QA pairs (each QA pair will sample different docs) + qa_pairs = generate_qa_pairs_for_content( + all_docs=sampled_docs, + domain=domain, + num_qa=qa_per_domain + ) + + return qa_pairs + + +def save_qa_pairs(all_qa_pairs: List[Dict[str, str]], output_file: str): + """Save the generated QA pairs to a JSON file.""" + output_path = Path(output_file) + output_path.parent.mkdir(parents=True, exist_ok=True) + + with open(output_path, 'w', encoding='utf-8') as f: + json.dump(all_qa_pairs, f, indent=2, ensure_ascii=False) + + print(f"\nSaved {len(all_qa_pairs)} QA pairs to: {output_path}") + + +def main(): + """Main function to generate QA pairs for all domains.""" + print("Starting QA pair generation...") + print(f"Target domains: {DOMAINS}") + print(f"QA pairs per domain: {QA_PAIRS_PER_DOMAIN}") + + all_qa_pairs = [] + + for domain in DOMAINS: + try: + qa_pairs = process_domain(domain, QA_PAIRS_PER_DOMAIN) + all_qa_pairs.extend(qa_pairs) + print(f"Generated {len(qa_pairs)} QA pairs for {domain}") + + except Exception as e: + print(f"Error processing domain {domain}: {e}") + continue + + # Save all QA pairs + if all_qa_pairs: + output_file = "../backend/data/generated_qa_pairs_gemini_pro_new.json" + save_qa_pairs(all_qa_pairs, output_file) + + + + print(f"{'='*50}") + print(f"Total QA pairs generated: {len(all_qa_pairs)}") + + + domain_counts = {} + for qa in all_qa_pairs: + domain = qa.get('domain', 'unknown') + domain_counts[domain] = domain_counts.get(domain, 0) + 1 + + for domain, count in domain_counts.items(): + print(f" {domain}: {count} pairs") + + else: + print("No QA pairs were generated!") + + +if __name__ == "__main__": + main() \ No newline at end of file From cca788d89ecfebf2a69c3afeed262cdb4017af49 Mon Sep 17 00:00:00 2001 From: Kannav02 Date: Wed, 9 Jul 2025 14:01:08 -0400 Subject: [PATCH 3/5] feat: added the quality evaluation agents for synthetic dataset generation Signed-off-by: Kannav02 --- .../src/dataset_gen_eval/quality_agents.py | 286 ++++++++++++++++++ 1 file changed, 286 insertions(+) create mode 100644 backend/src/dataset_gen_eval/quality_agents.py diff --git a/backend/src/dataset_gen_eval/quality_agents.py b/backend/src/dataset_gen_eval/quality_agents.py new file mode 100644 index 00000000..94cbb2e5 --- /dev/null +++ b/backend/src/dataset_gen_eval/quality_agents.py @@ -0,0 +1,286 @@ +import sys +from pathlib import Path +from dotenv import load_dotenv + + +src_path = Path(__file__).parent.parent +sys.path.insert(0, str(src_path)) + +from langchain_google_genai import ChatGoogleGenerativeAI +from deepeval.metrics import BaseMetric +from deepeval.test_case import LLMTestCase + +load_dotenv() + +# Groundedness critique prompt template +question_groundedness_critique_prompt = """ +You are an expert evaluator tasked with assessing the groundedness of a question-answer pair. + +Your task is to evaluate whether the given question is well-grounded in the provided context. +A well-grounded question should: +1. Be answerable using information from the context +2. Not require external knowledge beyond the context +3. Be specific and factual rather than speculative +4. 
Have clear supporting evidence in the context + +Please analyze the question and context, then provide a rating from 1 to 5: +- 1: Completely ungrounded - question cannot be answered from context +- 2: Poorly grounded - question requires significant external knowledge +- 3: Moderately grounded - question is partially answerable from context +- 4: Well grounded - question is mostly answerable from context +- 5: Perfectly grounded - question is completely answerable from context + +Question: {question} + +Context: {context} + +Please provide your evaluation in the following format: +Analysis: [Your detailed analysis of why the question is or isn't grounded in the context] +Total rating: [Your rating from 1 to 5] +""" + +# Question relevance critique prompt template +question_relevance_critique_prompt = """ +You will be given a question. +Your task is to provide a 'total rating' representing how useful this question can be to machine learning developers building NLP applications with the Hugging Face ecosystem. +Give your answer on a scale of 1 to 5, where 1 means that the question is not useful at all, and 5 means that the question is extremely useful. + +Provide your answer as follows: + +Answer::: +Evaluation: (your rationale for the rating, as a text) +Total rating: (your rating, as a number between 1 and 5) + +You MUST provide values for 'Evaluation:' and 'Total rating:' in your answer. + +Now here is the question. + +Question: {question} +Answer::: """ + +# Question standalone critique prompt template +question_standalone_critique_prompt = """ +You will be given a question. +Your task is to provide a 'total rating' representing how context-independent this question is. +Give your answer on a scale of 1 to 5, where 1 means that the question depends on additional information to be understood, and 5 means that the question makes sense by itself. +For instance, if the question refers to a particular setting, like 'in the context' or 'in the document', the rating must be 1. +The questions can contain obscure technical nouns or acronyms like Gradio, Hub, Hugging Face or Space and still be a 5: it must simply be clear to an operator with access to documentation what the question is about. + +For instance, "What is the name of the checkpoint from which the ViT model is imported?" should receive a 1, since there is an implicit mention of a context, thus the question is not independent from the context. + +Provide your answer as follows: + +Answer::: +Evaluation: (your rationale for the rating, as a text) +Total rating: (your rating, as a number between 1 and 5) + +You MUST provide values for 'Evaluation:' and 'Total rating:' in your answer. + +Now here is the question. 
+ +Question: {question} +Answer::: """ + + +class GroundednessMetric(BaseMetric): + def __init__(self, threshold: float = 0.6): + self.threshold = threshold + self.evaluation_model = "gemini-2.5-pro" + self.include_reason = True + + + self.llm = ChatGoogleGenerativeAI( + model=self.evaluation_model, + temperature=0.3, + ) + + def measure(self, tc: LLMTestCase) -> float: + """Synchronous version of the metric evaluation.""" + prompt = question_groundedness_critique_prompt.format( + question=tc.input, + context="\n".join(tc.context or []) + ) + + try: + + response = self.llm.invoke(prompt) + response_text = response.content.strip() + + # + if "Total rating:" in response_text: + rating_line = response_text.split("Total rating:")[-1].strip() + + score_1_5 = int(rating_line.split()[0]) + else: + + lines = response_text.split('\n') + score_1_5 = 3 + for line in lines: + if any(keyword in line.lower() for keyword in ['rating:', 'score:']): + try: + score_1_5 = int([char for char in line if char.isdigit()][0]) + break + except (IndexError, ValueError): + continue + + # Convert to 0-1 scale as expected by DeepEval + self.score = score_1_5 / 5.0 + self.reason = response_text + self.success = self.score >= self.threshold + + return self.score + + except Exception as e: + self.score = 0.0 + self.reason = f"Error during evaluation: {str(e)}" + self.success = False + self.error = str(e) + return self.score + + async def a_measure(self, tc: LLMTestCase): + """Async version - fallback to synchronous since we don't have async client setup.""" + return self.measure(tc) + + def is_successful(self) -> bool: + """Check if the metric evaluation was successful.""" + return False if getattr(self, "error", None) else self.success + + @property + def __name__(self): + return "Question Groundedness" + + +class QuestionRelevanceMetric(BaseMetric): + """DeepEval metric for evaluating question relevance to ML developers building NLP applications.""" + + def __init__(self, threshold: float = 0.6): + self.threshold = threshold + self.evaluation_model = "gemini-2.5-pro" + self.include_reason = True + + + self.llm = ChatGoogleGenerativeAI( + model=self.evaluation_model, + temperature=0.3, + ) + + def measure(self, tc: LLMTestCase) -> float: + """Synchronous version of the metric evaluation.""" + prompt = question_relevance_critique_prompt.format( + question=tc.input + ) + + try: + + response = self.llm.invoke(prompt) + response_text = response.content.strip() + + + if "Total rating:" in response_text: + rating_line = response_text.split("Total rating:")[-1].strip() + score_1_5 = int(rating_line.split()[0]) + else: + lines = response_text.split('\n') + score_1_5 = 3 # Default score + for line in lines: + if any(keyword in line.lower() for keyword in ['rating:', 'score:']): + try: + score_1_5 = int([char for char in line if char.isdigit()][0]) + break + except (IndexError, ValueError): + continue + + # Convert to 0-1 scale as expected by DeepEval + self.score = score_1_5 / 5.0 + self.reason = response_text + self.success = self.score >= self.threshold + + return self.score + + except Exception as e: + self.score = 0.0 + self.reason = f"Error during evaluation: {str(e)}" + self.success = False + self.error = str(e) + return self.score + + async def a_measure(self, tc: LLMTestCase): + """Async version - fallback to synchronous since we don't have async client setup.""" + return self.measure(tc) + + def is_successful(self) -> bool: + """Check if the metric evaluation was successful.""" + return False if getattr(self, "error", 
None) else self.success + + @property + def __name__(self): + return "Question Relevance" + + +class QuestionStandaloneMetric(BaseMetric): + """DeepEval metric for evaluating question context-independence.""" + + def __init__(self, threshold: float = 0.6): + self.threshold = threshold + self.evaluation_model = "gemini-2.5-pro" + self.include_reason = True + + + self.llm = ChatGoogleGenerativeAI( + model=self.evaluation_model, + temperature=0.3, + ) + + def measure(self, tc: LLMTestCase) -> float: + """Synchronous version of the metric evaluation.""" + prompt = question_standalone_critique_prompt.format( + question=tc.input + ) + + try: + + response = self.llm.invoke(prompt) + response_text = response.content.strip() + + + if "Total rating:" in response_text: + rating_line = response_text.split("Total rating:")[-1].strip() + + score_1_5 = int(rating_line.split()[0]) + else: + + lines = response_text.split('\n') + score_1_5 = 3 # Default score + for line in lines: + if any(keyword in line.lower() for keyword in ['rating:', 'score:']): + try: + score_1_5 = int([char for char in line if char.isdigit()][0]) + break + except (IndexError, ValueError): + continue + + + self.score = score_1_5 / 5.0 + self.reason = response_text + self.success = self.score >= self.threshold + + return self.score + + except Exception as e: + self.score = 0.0 + self.reason = f"Error during evaluation: {str(e)}" + self.success = False + self.error = str(e) + return self.score + + async def a_measure(self, tc: LLMTestCase): + """Async version - fallback to synchronous since we don't have async client setup.""" + return self.measure(tc) + + def is_successful(self) -> bool: + """Check if the metric evaluation was successful.""" + return False if getattr(self, "error", None) else self.success + + @property + def __name__(self): + return "Question Standalone" \ No newline at end of file From 46dab55d53f0540ee474559cc129f351120a3d6c Mon Sep 17 00:00:00 2001 From: Kannav02 Date: Wed, 9 Jul 2025 14:07:45 -0400 Subject: [PATCH 4/5] feat: added the script to run the evaluation agent for each QA pair Signed-off-by: Kannav02 --- backend/src/dataset_gen_eval/eval_dataset.py | 38 ++++++++++++++++++++ 1 file changed, 38 insertions(+) create mode 100644 backend/src/dataset_gen_eval/eval_dataset.py diff --git a/backend/src/dataset_gen_eval/eval_dataset.py b/backend/src/dataset_gen_eval/eval_dataset.py new file mode 100644 index 00000000..99f4ea44 --- /dev/null +++ b/backend/src/dataset_gen_eval/eval_dataset.py @@ -0,0 +1,38 @@ +import json +from deepeval.test_case import LLMTestCase +from .quality_agents import ( + GroundednessMetric, + QuestionRelevanceMetric, + QuestionStandaloneMetric +) + + +json_path = "data/generated_qa_pairs_gemini_pro_new.json" + +# Loading questions +with open(json_path, "r") as f: + qa_pairs = json.load(f) + +# Initializing metrics +groundedness_metric = GroundednessMetric() +relevance_metric = QuestionRelevanceMetric() +standalone_metric = QuestionStandaloneMetric() + + +for entry in qa_pairs: + question = entry["question"] + answer = entry["answer"] + context = entry["context"] + + test_case_question = LLMTestCase(input=question,actual_output="", context=[context]) + + groundedness_score = groundedness_metric.measure(test_case_question) + relevance_score = relevance_metric.measure(test_case_question) + standalone_score = standalone_metric.measure(test_case_question) + + print(f"Question: {question}") + print(f" Groundedness: {groundedness_score:.2f} ({groundedness_metric.reason})") + print(f" Relevance: 
{relevance_score:.2f} ({relevance_metric.reason})") + print(f" Standalone: {standalone_score:.2f} ({standalone_metric.reason})") + print("-" * 60) + break From fbb472b62bc80ca00ee0c156829de1332efa78e4 Mon Sep 17 00:00:00 2001 From: Jack Luar Date: Mon, 14 Jul 2025 15:20:42 +0000 Subject: [PATCH 5/5] remove path hacks, fix checks Signed-off-by: Jack Luar --- backend/pyproject.toml | 6 +- backend/requirements-test.txt | 1 + backend/src/dataset_gen_eval/eval_dataset.py | 14 +- .../src/dataset_gen_eval/generate_qa_pairs.py | 153 +++++++++--------- backend/src/dataset_gen_eval/ingest_doc.py | 95 +++++------ .../src/dataset_gen_eval/quality_agents.py | 153 +++++++++--------- 6 files changed, 209 insertions(+), 213 deletions(-) diff --git a/backend/pyproject.toml b/backend/pyproject.toml index 9e26a877..c08b6839 100644 --- a/backend/pyproject.toml +++ b/backend/pyproject.toml @@ -40,6 +40,10 @@ ignore_missing_imports = true module = "transformers.*" ignore_missing_imports = true +[[tool.mypy.overrides]] +module = "deepeval.*" +ignore_missing_imports = true + [tool.ruff] exclude = [ ".bzr", @@ -75,7 +79,7 @@ target-version = "py310" [tool.ruff.lint] select = ["E4", "E7", "E9","E301","E304","E305","E401","E223","E224","E242", "E", "F" ,"N", "W", "C90"] -extend-select = ["D203", "D204"] +extend-select = ["D204"] ignore = ["E501"] preview = true diff --git a/backend/requirements-test.txt b/backend/requirements-test.txt index 5155631f..3928562f 100644 --- a/backend/requirements-test.txt +++ b/backend/requirements-test.txt @@ -6,3 +6,4 @@ types-tqdm==4.66.0.20240417 types-beautifulsoup4==4.12.0.20240511 ruff==0.5.1 pre-commit==3.7.1 +deepeval==3.2.0 diff --git a/backend/src/dataset_gen_eval/eval_dataset.py b/backend/src/dataset_gen_eval/eval_dataset.py index 99f4ea44..0d335929 100644 --- a/backend/src/dataset_gen_eval/eval_dataset.py +++ b/backend/src/dataset_gen_eval/eval_dataset.py @@ -1,9 +1,9 @@ import json -from deepeval.test_case import LLMTestCase +from deepeval.test_case.llm_test_case import LLMTestCase from .quality_agents import ( GroundednessMetric, QuestionRelevanceMetric, - QuestionStandaloneMetric + QuestionStandaloneMetric, ) @@ -23,13 +23,15 @@ question = entry["question"] answer = entry["answer"] context = entry["context"] - - test_case_question = LLMTestCase(input=question,actual_output="", context=[context]) - + + test_case_question = LLMTestCase( + input=question, actual_output="", context=[context] + ) + groundedness_score = groundedness_metric.measure(test_case_question) relevance_score = relevance_metric.measure(test_case_question) standalone_score = standalone_metric.measure(test_case_question) - + print(f"Question: {question}") print(f" Groundedness: {groundedness_score:.2f} ({groundedness_metric.reason})") print(f" Relevance: {relevance_score:.2f} ({relevance_metric.reason})") diff --git a/backend/src/dataset_gen_eval/generate_qa_pairs.py b/backend/src/dataset_gen_eval/generate_qa_pairs.py index 028d4f2d..bd7c5602 100644 --- a/backend/src/dataset_gen_eval/generate_qa_pairs.py +++ b/backend/src/dataset_gen_eval/generate_qa_pairs.py @@ -1,14 +1,9 @@ import json -import sys from pathlib import Path -from typing import List, Dict +from typing import List, Dict, Optional import random -# Add src to path for imports -src_path = Path(__file__).parent.parent -sys.path.insert(0, str(src_path)) - -from src.vectorstores.faiss import FAISSVectorDatabase +from ..vectorstores.faiss import FAISSVectorDatabase from langchain_google_genai import ChatGoogleGenerativeAI from 
langchain.docstore.document import Document from dotenv import load_dotenv @@ -18,10 +13,10 @@ # didn't include command_reference cause didn't index it DOMAINS = [ - "installation_guides", + "installation_guides", "error_messages", "opensta_yosys_klayout", - "general_openroad" + "general_openroad", ] QA_PAIRS_PER_DOMAIN = 10 @@ -44,15 +39,15 @@ Output:::""" -def load_domain_database(domain: str) -> FAISSVectorDatabase: +def load_domain_database(domain: str) -> Optional[FAISSVectorDatabase]: """Load the FAISS vector database for a specific domain.""" print(f"Loading vector database for domain: {domain}") - + vdb = FAISSVectorDatabase( embeddings_type="HF", - embeddings_model_name="sentence-transformers/all-MiniLM-L6-v2" + embeddings_model_name="sentence-transformers/all-MiniLM-L6-v2", ) - + try: vdb.load_db(name=domain) print(f"Successfully loaded {domain} database") @@ -62,101 +57,104 @@ def load_domain_database(domain: str) -> FAISSVectorDatabase: return None -def sample_documents_from_db(vdb: FAISSVectorDatabase, num_samples: int = 5) -> List[Document]: +def sample_documents_from_db( + vdb: FAISSVectorDatabase, num_samples: int = 5 +) -> List[Document]: """Sample random documents from the vector database to use for QA generation.""" try: all_docs = list(vdb.get_documents()) print(f"Total documents in database: {len(all_docs)}") - + # Sample random documents sample_size = min(num_samples, len(all_docs)) sampled_docs = random.sample(all_docs, sample_size) - + print(f"Sampled {len(sampled_docs)} documents") return sampled_docs - + except Exception as e: print(f"Error sampling documents: {e}") return [] -def generate_qa_pairs_for_content(all_docs: List[Document], domain: str, num_qa: int = 5) -> List[Dict[str, str]]: +def generate_qa_pairs_for_content( + all_docs: List[Document], domain: str, num_qa: int = 5 +) -> List[Dict[str, str]]: """Use Gemini to generate QA pairs from the given documents.""" try: # Initialize Gemini model llm = ChatGoogleGenerativeAI( model="gemini-2.5-pro", - temperature=0.3, + temperature=0.3, ) - + print(f"Generating {num_qa} QA pairs for {domain} domain...") - + all_qa_pairs = [] - - + for i in range(num_qa): try: # Sample different documents for each QA pair to get variety - sample_size = min(5, len(all_docs)) + sample_size = min(5, len(all_docs)) sampled_docs = random.sample(all_docs, sample_size) - + # Combine content from this sample - content = "\n\n---DOCUMENT SEPARATOR---\n\n".join([ - doc.page_content for doc in sampled_docs - ]) - - prompt = QA_GENERATION_PROMPT.format( - context=content[:15000] + content = "\n\n---DOCUMENT SEPARATOR---\n\n".join( + [doc.page_content for doc in sampled_docs] ) - - print(f" Generating QA pair {i+1}/{num_qa}...") - + + prompt = QA_GENERATION_PROMPT.format(context=content[:15000]) + + print(f" Generating QA pair {i + 1}/{num_qa}...") + # gemini cost analysis here, langsmith response = llm.invoke(prompt) - - response_text = response.content.strip() - - + + response_content = response.content + response_text = response_content.strip() if isinstance(response_content, str) else str(response_content).strip() + if "Output:::" in response_text: output_section = response_text.split("Output:::")[-1].strip() - - - lines = output_section.split('\n') + + lines = output_section.split("\n") question = "" answer = "" - + for line in lines: line = line.strip() if line.startswith("Factoid question:"): question = line.replace("Factoid question:", "").strip() elif line.startswith("Answer:"): answer = line.replace("Answer:", "").strip() - if 
question and answer: qa_pair = { "question": question, "answer": answer, "domain": domain, - "source": "generated_from_docs", # context source add here - "context": content[:15000] # Add the context used for generation + "source": "generated_from_docs", # context source add here + "context": content[ + :15000 + ], # Add the context used for generation } all_qa_pairs.append(qa_pair) print(f"Generated: {question[:50]}...") else: - print(f"Failed to parse QA pair from response") + print("Failed to parse QA pair from response") print(f"Raw response: {response_text[:200]}...") else: - print(f"No 'Output:::' section found in response") + print("No 'Output:::' section found in response") print(f"Raw response: {response_text[:200]}...") - + except Exception as e: - print(f"Error generating QA pair {i+1}: {e}") + print(f"Error generating QA pair {i + 1}: {e}") continue - - print(f"Successfully generated {len(all_qa_pairs)} QA pairs out of {num_qa} attempts") + + print( + f"Successfully generated {len(all_qa_pairs)} QA pairs out of {num_qa} attempts" + ) return all_qa_pairs - + except Exception as e: print(f"Error in QA generation process: {e}") return [] @@ -164,29 +162,27 @@ def generate_qa_pairs_for_content(all_docs: List[Document], domain: str, num_qa: def process_domain(domain: str, qa_per_domain: int = 10) -> List[Dict[str, str]]: """Process a single domain to generate QA pairs.""" - print(f"\n{'='*50}") + print(f"\n{'=' * 50}") print(f"Processing domain: {domain}") - print(f"{'='*50}") - + print(f"{'=' * 50}") + # Load the vector database vdb = load_domain_database(domain) if not vdb: return [] - + # Sample documents from the database - sampled_docs = sample_documents_from_db(vdb, num_samples=100) + sampled_docs = sample_documents_from_db(vdb, num_samples=100) if not sampled_docs: return [] - + print(f"Will generate QA pairs from pool of {len(sampled_docs)} documents") - + # Generate QA pairs (each QA pair will sample different docs) qa_pairs = generate_qa_pairs_for_content( - all_docs=sampled_docs, - domain=domain, - num_qa=qa_per_domain + all_docs=sampled_docs, domain=domain, num_qa=qa_per_domain ) - + return qa_pairs @@ -194,10 +190,10 @@ def save_qa_pairs(all_qa_pairs: List[Dict[str, str]], output_file: str): """Save the generated QA pairs to a JSON file.""" output_path = Path(output_file) output_path.parent.mkdir(parents=True, exist_ok=True) - - with open(output_path, 'w', encoding='utf-8') as f: + + with open(output_path, "w", encoding="utf-8") as f: json.dump(all_qa_pairs, f, indent=2, ensure_ascii=False) - + print(f"\nSaved {len(all_qa_pairs)} QA pairs to: {output_path}") @@ -206,41 +202,38 @@ def main(): print("Starting QA pair generation...") print(f"Target domains: {DOMAINS}") print(f"QA pairs per domain: {QA_PAIRS_PER_DOMAIN}") - + all_qa_pairs = [] - + for domain in DOMAINS: try: qa_pairs = process_domain(domain, QA_PAIRS_PER_DOMAIN) all_qa_pairs.extend(qa_pairs) print(f"Generated {len(qa_pairs)} QA pairs for {domain}") - + except Exception as e: print(f"Error processing domain {domain}: {e}") continue - + # Save all QA pairs if all_qa_pairs: - output_file = "../backend/data/generated_qa_pairs_gemini_pro_new.json" + output_file = "data/generated_qa_pairs_gemini_pro_new.json" save_qa_pairs(all_qa_pairs, output_file) - - - print(f"{'='*50}") + print(f"{'=' * 50}") print(f"Total QA pairs generated: {len(all_qa_pairs)}") - - + domain_counts = {} for qa in all_qa_pairs: - domain = qa.get('domain', 'unknown') + domain = qa.get("domain", "unknown") domain_counts[domain] = 
domain_counts.get(domain, 0) + 1 - + for domain, count in domain_counts.items(): print(f" {domain}: {count} pairs") - + else: print("No QA pairs were generated!") if __name__ == "__main__": - main() \ No newline at end of file + main() diff --git a/backend/src/dataset_gen_eval/ingest_doc.py b/backend/src/dataset_gen_eval/ingest_doc.py index 49b95be1..ceaffd75 100644 --- a/backend/src/dataset_gen_eval/ingest_doc.py +++ b/backend/src/dataset_gen_eval/ingest_doc.py @@ -1,31 +1,31 @@ from pathlib import Path -import json, uuid -import sys - -src_path = Path(__file__).parent.parent -sys.path.insert(0, str(src_path)) +import json +import uuid +from typing import List from langchain.text_splitter import RecursiveCharacterTextSplitter -from src.vectorstores.faiss import FAISSVectorDatabase -from tools.process_pdf import process_pdf_docs -from tools.process_md import process_md +from ..vectorstores.faiss import FAISSVectorDatabase +from ..tools.process_pdf import process_pdf_docs +from ..tools.process_md import process_md + +DOMAINS = [ + "general_openroad", + # "command_reference" + "installation_guides", + "error_messages", + "opensta_yosys_klayout", +] -DOMAINS = ["general_openroad", - #"command_reference" - "installation_guides", "error_messages", - "opensta_yosys_klayout" - ] # discover files in the raw data dir -def discover_files(domain:str)->list: - root = Path(f"../backend/data/raw/{domain}") +def discover_files(domain: str) -> List[Path]: + root = Path(f"data/raw/{domain}") # getting all the files return list(root.rglob("*.*")) + # document loader function def load_as_documents(path): - - suffix = path.suffix.lower() if suffix == ".pdf": return process_pdf_docs(str(path)) @@ -35,68 +35,74 @@ def load_as_documents(path): print("HTML Files skipped for now") return [] else: - return [] + return [] + # chunking function def chunk(docs, size=700, overlap=70): splitter = RecursiveCharacterTextSplitter( - chunk_size=size, chunk_overlap=overlap, add_start_index=True) + chunk_size=size, chunk_overlap=overlap, add_start_index=True + ) out = [] for d in docs: out.extend(splitter.split_documents([d])) return out + # adding metadata to each chunk def enrich_metadata(chunks, domain, source_path): - for idx, doc in enumerate(chunks): - doc.metadata.update({ - "domain": domain, - "doc_path": str(source_path.relative_to('../backend/data/raw')), - "chunk_id": f"{uuid.uuid4()}", - }) + for doc in chunks: + doc.metadata.update( + { + "domain": domain, + "doc_path": str(source_path.relative_to("data/raw")), + "chunk_id": f"{uuid.uuid4()}", + } + ) return chunks def build_domain_index(domain): - vdb = FAISSVectorDatabase(embeddings_type="HF", - embeddings_model_name="sentence-transformers/all-MiniLM-L6-v2") + vdb = FAISSVectorDatabase( + embeddings_type="HF", + embeddings_model_name="sentence-transformers/all-MiniLM-L6-v2", + ) manifest = [] - + files = discover_files(domain) total_files = len(files) print(f"Processing {total_files} files for domain: {domain}") - + for idx, fp in enumerate(files): - print(f"[{idx+1}/{total_files}] Processing: {fp}") + print(f"[{idx + 1}/{total_files}] Processing: {fp}") - if fp.name.startswith('.'): + if fp.name.startswith("."): print(f"Skipping system file: {fp}") continue - - + docs = load_as_documents(fp) - if not docs: + if not docs: print(f"No documents loaded from: {fp}") continue - + chunks = chunk(docs) - if not chunks: + if not chunks: print(f"No chunks created from: {fp}") continue - + chunks = enrich_metadata(chunks, domain, fp) manifest += [c.metadata for c in 
chunks] - + # Process chunks in smaller batches for better memory management batch_size = 50 for i in range(0, len(chunks), batch_size): - batch = chunks[i:i+batch_size] + batch = chunks[i : i + batch_size] vdb._add_to_db(batch) - print(f" Added batch {i//batch_size + 1} ({len(batch)} chunks)") - - vdb.save_db(name=domain) - Path("../backend/data/manifests").mkdir(exist_ok=True, parents=True) - with open(f"../backend/data/manifests/{domain}.jsonl", "w") as f: + print(f" Added batch {i // batch_size + 1} ({len(batch)} chunks)") + + vdb.save_db(name=domain) + Path("data/manifests").mkdir(exist_ok=True, parents=True) + with open(f"data/manifests/{domain}.jsonl", "w") as f: for row in manifest: f.write(json.dumps(row) + "\n") @@ -105,4 +111,3 @@ def build_domain_index(domain): for dom in DOMAINS: build_domain_index(dom) print("All domain indexes built & manifests written.") - diff --git a/backend/src/dataset_gen_eval/quality_agents.py b/backend/src/dataset_gen_eval/quality_agents.py index 94cbb2e5..7201b66a 100644 --- a/backend/src/dataset_gen_eval/quality_agents.py +++ b/backend/src/dataset_gen_eval/quality_agents.py @@ -1,14 +1,9 @@ -import sys -from pathlib import Path +from typing import Any from dotenv import load_dotenv - -src_path = Path(__file__).parent.parent -sys.path.insert(0, str(src_path)) - from langchain_google_genai import ChatGoogleGenerativeAI -from deepeval.metrics import BaseMetric -from deepeval.test_case import LLMTestCase +from deepeval.metrics.base_metric import BaseMetric +from deepeval.test_case.llm_test_case import LLMTestCase load_dotenv() @@ -87,49 +82,50 @@ def __init__(self, threshold: float = 0.6): self.threshold = threshold self.evaluation_model = "gemini-2.5-pro" self.include_reason = True - - + self.llm = ChatGoogleGenerativeAI( model=self.evaluation_model, temperature=0.3, ) - def measure(self, tc: LLMTestCase) -> float: + def measure(self, test_case: LLMTestCase, *args: Any, **kwargs: Any) -> float: """Synchronous version of the metric evaluation.""" prompt = question_groundedness_critique_prompt.format( - question=tc.input, - context="\n".join(tc.context or []) + question=test_case.input, context="\n".join(test_case.context or []) ) - + try: - response = self.llm.invoke(prompt) - response_text = response.content.strip() - + response_content = response.content + response_text = response_content.strip() if isinstance(response_content, str) else str(response_content).strip() + # if "Total rating:" in response_text: rating_line = response_text.split("Total rating:")[-1].strip() - + score_1_5 = int(rating_line.split()[0]) else: - - lines = response_text.split('\n') - score_1_5 = 3 + lines = response_text.split("\n") + score_1_5 = 3 for line in lines: - if any(keyword in line.lower() for keyword in ['rating:', 'score:']): + if any( + keyword in line.lower() for keyword in ["rating:", "score:"] + ): try: - score_1_5 = int([char for char in line if char.isdigit()][0]) + score_1_5 = int( + [char for char in line if char.isdigit()][0] + ) break except (IndexError, ValueError): continue - + # Convert to 0-1 scale as expected by DeepEval self.score = score_1_5 / 5.0 self.reason = response_text self.success = self.score >= self.threshold - + return self.score - + except Exception as e: self.score = 0.0 self.reason = f"Error during evaluation: {str(e)}" @@ -137,66 +133,65 @@ def measure(self, tc: LLMTestCase) -> float: self.error = str(e) return self.score - async def a_measure(self, tc: LLMTestCase): + async def a_measure(self, test_case: LLMTestCase, *args: Any, 
**kwargs: Any) -> float: """Async version - fallback to synchronous since we don't have async client setup.""" - return self.measure(tc) + return self.measure(test_case, *args, **kwargs) def is_successful(self) -> bool: """Check if the metric evaluation was successful.""" - return False if getattr(self, "error", None) else self.success + return False if getattr(self, "error", None) else bool(getattr(self, "success", False)) - @property - def __name__(self): + def get_metric_name(self) -> str: return "Question Groundedness" class QuestionRelevanceMetric(BaseMetric): """DeepEval metric for evaluating question relevance to ML developers building NLP applications.""" - + def __init__(self, threshold: float = 0.6): self.threshold = threshold self.evaluation_model = "gemini-2.5-pro" self.include_reason = True - - + self.llm = ChatGoogleGenerativeAI( model=self.evaluation_model, temperature=0.3, ) - def measure(self, tc: LLMTestCase) -> float: + def measure(self, test_case: LLMTestCase, *args: Any, **kwargs: Any) -> float: """Synchronous version of the metric evaluation.""" - prompt = question_relevance_critique_prompt.format( - question=tc.input - ) - + prompt = question_relevance_critique_prompt.format(question=test_case.input) + try: - response = self.llm.invoke(prompt) - response_text = response.content.strip() - - + response_content = response.content + response_text = response_content.strip() if isinstance(response_content, str) else str(response_content).strip() + if "Total rating:" in response_text: rating_line = response_text.split("Total rating:")[-1].strip() score_1_5 = int(rating_line.split()[0]) else: - lines = response_text.split('\n') + lines = response_text.split("\n") score_1_5 = 3 # Default score for line in lines: - if any(keyword in line.lower() for keyword in ['rating:', 'score:']): + if any( + keyword in line.lower() for keyword in ["rating:", "score:"] + ): try: - score_1_5 = int([char for char in line if char.isdigit()][0]) + score_1_5 = int( + [char for char in line if char.isdigit()][0] + ) break except (IndexError, ValueError): continue - + # Convert to 0-1 scale as expected by DeepEval self.score = score_1_5 / 5.0 self.reason = response_text self.success = self.score >= self.threshold - + return self.score - + except Exception as e: self.score = 0.0 self.reason = f"Error during evaluation: {str(e)}" @@ -204,68 +199,65 @@ def measure(self, tc: LLMTestCase) -> float: self.error = str(e) return self.score - async def a_measure(self, tc: LLMTestCase): + async def a_measure(self, test_case: LLMTestCase, *args: Any, **kwargs: Any) -> float: """Async version - fallback to synchronous since we don't have async client setup.""" - return self.measure(tc) + return self.measure(test_case, *args, **kwargs) def is_successful(self) -> bool: """Check if the metric evaluation was successful.""" - return False if getattr(self, "error", None) else self.success + return False if getattr(self, "error", None) else bool(getattr(self, "success", False)) - @property - def __name__(self): + def get_metric_name(self) -> str: return "Question Relevance" class QuestionStandaloneMetric(BaseMetric): """DeepEval metric for evaluating question context-independence.""" - + def __init__(self, threshold: float = 0.6): self.threshold = threshold self.evaluation_model = "gemini-2.5-pro" self.include_reason = True - - + self.llm = ChatGoogleGenerativeAI( model=self.evaluation_model, temperature=0.3, ) - def measure(self, tc: LLMTestCase) -> float: + def measure(self, test_case: LLMTestCase, *args: Any, 
**kwargs: Any) -> float: """Synchronous version of the metric evaluation.""" - prompt = question_standalone_critique_prompt.format( - question=tc.input - ) - + prompt = question_standalone_critique_prompt.format(question=test_case.input) + try: - response = self.llm.invoke(prompt) - response_text = response.content.strip() - - + response_content = response.content + response_text = response_content.strip() if isinstance(response_content, str) else str(response_content).strip() + if "Total rating:" in response_text: rating_line = response_text.split("Total rating:")[-1].strip() - + score_1_5 = int(rating_line.split()[0]) else: - - lines = response_text.split('\n') + lines = response_text.split("\n") score_1_5 = 3 # Default score for line in lines: - if any(keyword in line.lower() for keyword in ['rating:', 'score:']): + if any( + keyword in line.lower() for keyword in ["rating:", "score:"] + ): try: - score_1_5 = int([char for char in line if char.isdigit()][0]) + score_1_5 = int( + [char for char in line if char.isdigit()][0] + ) break except (IndexError, ValueError): continue - - + self.score = score_1_5 / 5.0 self.reason = response_text self.success = self.score >= self.threshold - + return self.score - + except Exception as e: self.score = 0.0 self.reason = f"Error during evaluation: {str(e)}" @@ -273,14 +265,13 @@ def measure(self, tc: LLMTestCase) -> float: self.error = str(e) return self.score - async def a_measure(self, tc: LLMTestCase): + async def a_measure(self, test_case: LLMTestCase, *args: Any, **kwargs: Any) -> float: """Async version - fallback to synchronous since we don't have async client setup.""" - return self.measure(tc) + return self.measure(test_case, *args, **kwargs) def is_successful(self) -> bool: """Check if the metric evaluation was successful.""" - return False if getattr(self, "error", None) else self.success + return False if getattr(self, "error", None) else bool(getattr(self, "success", False)) - @property - def __name__(self): - return "Question Standalone" \ No newline at end of file + def get_metric_name(self) -> str: + return "Question Standalone"
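

Note on usage (not part of the patch series): the evaluation loop in eval_dataset.py above scores only the first QA pair before hitting `break`, which works as a smoke test. A minimal sketch of how the same three metrics could be applied across the whole generated dataset and used to filter low-quality questions is given below. It assumes the file sits next to eval_dataset.py (so the relative import resolves) and is run as a module from the backend package root, the same way eval_dataset.py would be once patch 5 removes the sys.path hacks; the filter_qa_pairs helper, the min_score parameter, and the output filename are illustrative assumptions, not code from this PR.

import json
from typing import Dict, List

from deepeval.test_case.llm_test_case import LLMTestCase

from .quality_agents import (
    GroundednessMetric,
    QuestionRelevanceMetric,
    QuestionStandaloneMetric,
)


def filter_qa_pairs(
    qa_pairs: List[Dict[str, str]], min_score: float = 0.6
) -> List[Dict[str, str]]:
    """Keep only QA pairs whose question clears min_score on all three metrics."""
    metrics = [
        GroundednessMetric(),
        QuestionRelevanceMetric(),
        QuestionStandaloneMetric(),
    ]
    kept = []
    for entry in qa_pairs:
        test_case = LLMTestCase(
            input=entry["question"],
            actual_output="",
            context=[entry["context"]],
        )
        # measure() maps the model's 1-5 rating onto a 0-1 scale.
        scores = {m.get_metric_name(): m.measure(test_case) for m in metrics}
        if all(score >= min_score for score in scores.values()):
            kept.append({**entry, "quality_scores": scores})
    return kept


if __name__ == "__main__":
    # Same input path as eval_dataset.py; the filtered output name is hypothetical.
    with open("data/generated_qa_pairs_gemini_pro_new.json") as f:
        pairs = json.load(f)
    filtered = filter_qa_pairs(pairs)
    with open("data/filtered_qa_pairs.json", "w") as f:
        json.dump(filtered, f, indent=2)
    print(f"Kept {len(filtered)} of {len(pairs)} QA pairs")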