diff --git a/backend/pyproject.toml b/backend/pyproject.toml
index 9e26a877..c08b6839 100644
--- a/backend/pyproject.toml
+++ b/backend/pyproject.toml
@@ -40,6 +40,10 @@ ignore_missing_imports = true
 module = "transformers.*"
 ignore_missing_imports = true
 
+[[tool.mypy.overrides]]
+module = "deepeval.*"
+ignore_missing_imports = true
+
 [tool.ruff]
 exclude = [
     ".bzr",
@@ -75,7 +79,7 @@ target-version = "py310"
 
 [tool.ruff.lint]
 select = ["E4", "E7", "E9","E301","E304","E305","E401","E223","E224","E242", "E", "F" ,"N", "W", "C90"]
-extend-select = ["D203", "D204"]
+extend-select = ["D204"]
 ignore = ["E501"]
 
 preview = true
diff --git a/backend/requirements-test.txt b/backend/requirements-test.txt
index 5155631f..3928562f 100644
--- a/backend/requirements-test.txt
+++ b/backend/requirements-test.txt
@@ -6,3 +6,4 @@ types-tqdm==4.66.0.20240417
 types-beautifulsoup4==4.12.0.20240511
 ruff==0.5.1
 pre-commit==3.7.1
+deepeval==3.2.0
diff --git a/backend/src/dataset_gen_eval/eval_dataset.py b/backend/src/dataset_gen_eval/eval_dataset.py
new file mode 100644
index 00000000..0d335929
--- /dev/null
+++ b/backend/src/dataset_gen_eval/eval_dataset.py
@@ -0,0 +1,40 @@
+import json
+from deepeval.test_case.llm_test_case import LLMTestCase
+from .quality_agents import (
+    GroundednessMetric,
+    QuestionRelevanceMetric,
+    QuestionStandaloneMetric,
+)
+
+
+json_path = "data/generated_qa_pairs_gemini_pro_new.json"
+
+# Loading questions
+with open(json_path, "r") as f:
+    qa_pairs = json.load(f)
+
+# Initializing metrics
+groundedness_metric = GroundednessMetric()
+relevance_metric = QuestionRelevanceMetric()
+standalone_metric = QuestionStandaloneMetric()
+
+
+for entry in qa_pairs:
+    question = entry["question"]
+    answer = entry["answer"]
+    context = entry["context"]
+
+    test_case_question = LLMTestCase(
+        input=question, actual_output="", context=[context]
+    )
+
+    groundedness_score = groundedness_metric.measure(test_case_question)
+    relevance_score = relevance_metric.measure(test_case_question)
+    standalone_score = standalone_metric.measure(test_case_question)
+
+    print(f"Question: {question}")
+    print(f"  Groundedness: {groundedness_score:.2f} ({groundedness_metric.reason})")
+    print(f"  Relevance: {relevance_score:.2f} ({relevance_metric.reason})")
+    print(f"  Standalone: {standalone_score:.2f} ({standalone_metric.reason})")
+    print("-" * 60)
+    break
diff --git a/backend/src/dataset_gen_eval/generate_qa_pairs.py b/backend/src/dataset_gen_eval/generate_qa_pairs.py
new file mode 100644
index 00000000..bd7c5602
--- /dev/null
+++ b/backend/src/dataset_gen_eval/generate_qa_pairs.py
@@ -0,0 +1,239 @@
+import json
+from pathlib import Path
+from typing import List, Dict, Optional
+import random
+
+from ..vectorstores.faiss import FAISSVectorDatabase
+from langchain_google_genai import ChatGoogleGenerativeAI
+from langchain.docstore.document import Document
+from dotenv import load_dotenv
+
+load_dotenv()
+
+
+# command_reference is not included because it was not indexed
+DOMAINS = [
+    "installation_guides",
+    "error_messages",
+    "opensta_yosys_klayout",
+    "general_openroad",
+]
+
+QA_PAIRS_PER_DOMAIN = 10
+
+QA_GENERATION_PROMPT = """
+Your task is to write a factoid question and an answer given a context.
+Your factoid question should be answerable with a specific, concise piece of factual information from the context.
+Your factoid question should be formulated in the same style as questions users could ask in a search engine.
+This means that your factoid question MUST NOT mention something like "according to the passage" or "context".
+
+Provide your answer as follows:
+
+Output:::
+Factoid question: (your factoid question)
+Answer: (your answer to the factoid question)
+
+Now here is the context.
+
+Context: {context}\n
+Output:::"""
+
+
+def load_domain_database(domain: str) -> Optional[FAISSVectorDatabase]:
+    """Load the FAISS vector database for a specific domain."""
+    print(f"Loading vector database for domain: {domain}")
+
+    vdb = FAISSVectorDatabase(
+        embeddings_type="HF",
+        embeddings_model_name="sentence-transformers/all-MiniLM-L6-v2",
+    )
+
+    try:
+        vdb.load_db(name=domain)
+        print(f"Successfully loaded {domain} database")
+        return vdb
+    except Exception as e:
+        print(f"Error loading database for {domain}: {e}")
+        return None
+
+
+def sample_documents_from_db(
+    vdb: FAISSVectorDatabase, num_samples: int = 5
+) -> List[Document]:
+    """Sample random documents from the vector database to use for QA generation."""
+    try:
+        all_docs = list(vdb.get_documents())
+        print(f"Total documents in database: {len(all_docs)}")
+
+        # Sample random documents
+        sample_size = min(num_samples, len(all_docs))
+        sampled_docs = random.sample(all_docs, sample_size)
+
+        print(f"Sampled {len(sampled_docs)} documents")
+        return sampled_docs
+
+    except Exception as e:
+        print(f"Error sampling documents: {e}")
+        return []
+
+
+def generate_qa_pairs_for_content(
+    all_docs: List[Document], domain: str, num_qa: int = 5
+) -> List[Dict[str, str]]:
+    """Use Gemini to generate QA pairs from the given documents."""
+    try:
+        # Initialize Gemini model
+        llm = ChatGoogleGenerativeAI(
+            model="gemini-2.5-pro",
+            temperature=0.3,
+        )
+
+        print(f"Generating {num_qa} QA pairs for {domain} domain...")
+
+        all_qa_pairs = []
+
+        for i in range(num_qa):
+            try:
+                # Sample different documents for each QA pair to get variety
+                sample_size = min(5, len(all_docs))
+                sampled_docs = random.sample(all_docs, sample_size)
+
+                # Combine content from this sample
+                content = "\n\n---DOCUMENT SEPARATOR---\n\n".join(
+                    [doc.page_content for doc in sampled_docs]
+                )
+
+                prompt = QA_GENERATION_PROMPT.format(context=content[:15000])
+
+                print(f"  Generating QA pair {i + 1}/{num_qa}...")
+
+                # TODO: add Gemini cost analysis here (e.g. via LangSmith)
+                response = llm.invoke(prompt)
+
+                response_content = response.content
+                response_text = response_content.strip() if isinstance(response_content, str) else str(response_content).strip()
+
+                if "Output:::" in response_text:
+                    output_section = response_text.split("Output:::")[-1].strip()
+
+                    lines = output_section.split("\n")
+                    question = ""
+                    answer = ""
+
+                    for line in lines:
+                        line = line.strip()
+                        if line.startswith("Factoid question:"):
+                            question = line.replace("Factoid question:", "").strip()
+                        elif line.startswith("Answer:"):
+                            answer = line.replace("Answer:", "").strip()
+
+                    if question and answer:
+                        qa_pair = {
+                            "question": question,
+                            "answer": answer,
+                            "domain": domain,
+                            "source": "generated_from_docs",  # TODO: record the source document here
+                            "context": content[
+                                :15000
+                            ],  # Add the context used for generation
+                        }
+                        all_qa_pairs.append(qa_pair)
+                        print(f"Generated: {question[:50]}...")
+                    else:
+                        print("Failed to parse QA pair from response")
+                        print(f"Raw response: {response_text[:200]}...")
+                else:
+                    print("No 'Output:::' section found in response")
+                    print(f"Raw response: {response_text[:200]}...")
+
+            except Exception as e:
+                print(f"Error generating QA pair {i + 1}: {e}")
+                continue
+
+        print(
+            f"Successfully generated {len(all_qa_pairs)} QA pairs out of {num_qa} attempts"
+        )
+        return all_qa_pairs
+
+    except Exception as e:
+        print(f"Error in QA generation process: {e}")
+        return []
+
+
+def process_domain(domain: str, qa_per_domain: int = 10) -> List[Dict[str, str]]:
+    """Process a single domain to generate QA pairs."""
+    print(f"\n{'=' * 50}")
+    print(f"Processing domain: {domain}")
+    print(f"{'=' * 50}")
+
+    # Load the vector database
+    vdb = load_domain_database(domain)
+    if not vdb:
+        return []
+
+    # Sample documents from the database
+    sampled_docs = sample_documents_from_db(vdb, num_samples=100)
+    if not sampled_docs:
+        return []
+
+    print(f"Will generate QA pairs from pool of {len(sampled_docs)} documents")
+
+    # Generate QA pairs (each QA pair will sample different docs)
+    qa_pairs = generate_qa_pairs_for_content(
+        all_docs=sampled_docs, domain=domain, num_qa=qa_per_domain
+    )
+
+    return qa_pairs
+
+
+def save_qa_pairs(all_qa_pairs: List[Dict[str, str]], output_file: str):
+    """Save the generated QA pairs to a JSON file."""
+    output_path = Path(output_file)
+    output_path.parent.mkdir(parents=True, exist_ok=True)
+
+    with open(output_path, "w", encoding="utf-8") as f:
+        json.dump(all_qa_pairs, f, indent=2, ensure_ascii=False)
+
+    print(f"\nSaved {len(all_qa_pairs)} QA pairs to: {output_path}")
+
+
+def main():
+    """Main function to generate QA pairs for all domains."""
+    print("Starting QA pair generation...")
+    print(f"Target domains: {DOMAINS}")
+    print(f"QA pairs per domain: {QA_PAIRS_PER_DOMAIN}")
+
+    all_qa_pairs = []
+
+    for domain in DOMAINS:
+        try:
+            qa_pairs = process_domain(domain, QA_PAIRS_PER_DOMAIN)
+            all_qa_pairs.extend(qa_pairs)
+            print(f"Generated {len(qa_pairs)} QA pairs for {domain}")
+
+        except Exception as e:
+            print(f"Error processing domain {domain}: {e}")
+            continue
+
+    # Save all QA pairs
+    if all_qa_pairs:
+        output_file = "data/generated_qa_pairs_gemini_pro_new.json"
+        save_qa_pairs(all_qa_pairs, output_file)
+
+        print(f"{'=' * 50}")
+        print(f"Total QA pairs generated: {len(all_qa_pairs)}")
+
+        domain_counts = {}
+        for qa in all_qa_pairs:
+            domain = qa.get("domain", "unknown")
+            domain_counts[domain] = domain_counts.get(domain, 0) + 1
+
+        for domain, count in domain_counts.items():
+            print(f"  {domain}: {count} pairs")
+
+    else:
+        print("No QA pairs were generated!")
+
+
+if __name__ == "__main__":
+    main()
diff --git a/backend/src/dataset_gen_eval/ingest_doc.py b/backend/src/dataset_gen_eval/ingest_doc.py
new file mode 100644
index 00000000..ceaffd75
--- /dev/null
+++ b/backend/src/dataset_gen_eval/ingest_doc.py
@@ -0,0 +1,113 @@
+from pathlib import Path
+import json
+import uuid
+from typing import List
+
+from langchain.text_splitter import RecursiveCharacterTextSplitter
+from ..vectorstores.faiss import FAISSVectorDatabase
+from ..tools.process_pdf import process_pdf_docs
+from ..tools.process_md import process_md
+
+DOMAINS = [
+    "general_openroad",
+    # "command_reference"
+    "installation_guides",
+    "error_messages",
+    "opensta_yosys_klayout",
+]
+
+
+# discover files in the raw data dir
+def discover_files(domain: str) -> List[Path]:
+    root = Path(f"data/raw/{domain}")
+    # getting all the files
+    return list(root.rglob("*.*"))
+
+
+# document loader function
+def load_as_documents(path):
+    suffix = path.suffix.lower()
+    if suffix == ".pdf":
+        return process_pdf_docs(str(path))
+    elif suffix in {".md", ".markdown"}:
+        return process_md(str(path.parent), split_text=False)
+    elif suffix == ".html":
+        print("HTML files skipped for now")
+        return []
+    else:
+        return []
+
+
+# chunking function
+def chunk(docs, size=700, overlap=70):
+    splitter = RecursiveCharacterTextSplitter(
+        chunk_size=size, chunk_overlap=overlap, add_start_index=True
+    )
+    out = []
+    for d in docs:
+        out.extend(splitter.split_documents([d]))
+    return out
+
+
+# adding metadata to each chunk
+def enrich_metadata(chunks, domain, source_path):
+    for doc in chunks:
+        doc.metadata.update(
+            {
+                "domain": domain,
+                "doc_path": str(source_path.relative_to("data/raw")),
+                "chunk_id": f"{uuid.uuid4()}",
+            }
+        )
+    return chunks
+
+
+def build_domain_index(domain):
+    vdb = FAISSVectorDatabase(
+        embeddings_type="HF",
+        embeddings_model_name="sentence-transformers/all-MiniLM-L6-v2",
+    )
+    manifest = []
+
+    files = discover_files(domain)
+    total_files = len(files)
+    print(f"Processing {total_files} files for domain: {domain}")
+
+    for idx, fp in enumerate(files):
+        print(f"[{idx + 1}/{total_files}] Processing: {fp}")
+
+        if fp.name.startswith("."):
+            print(f"Skipping system file: {fp}")
+            continue
+
+        docs = load_as_documents(fp)
+        if not docs:
+            print(f"No documents loaded from: {fp}")
+            continue
+
+        chunks = chunk(docs)
+        if not chunks:
+            print(f"No chunks created from: {fp}")
+            continue
+
+        chunks = enrich_metadata(chunks, domain, fp)
+        manifest += [c.metadata for c in chunks]
+
+        # Process chunks in smaller batches for better memory management
+        batch_size = 50
+        for i in range(0, len(chunks), batch_size):
+            batch = chunks[i : i + batch_size]
+            vdb._add_to_db(batch)
+            print(f"  Added batch {i // batch_size + 1} ({len(batch)} chunks)")
+
+    vdb.save_db(name=domain)
+    Path("data/manifests").mkdir(exist_ok=True, parents=True)
+    with open(f"data/manifests/{domain}.jsonl", "w") as f:
+        for row in manifest:
+            f.write(json.dumps(row) + "\n")
+
+
+if __name__ == "__main__":
+    for dom in DOMAINS:
+        build_domain_index(dom)
+    print("All domain indexes built & manifests written.")
diff --git a/backend/src/dataset_gen_eval/quality_agents.py b/backend/src/dataset_gen_eval/quality_agents.py
new file mode 100644
index 00000000..7201b66a
--- /dev/null
+++ b/backend/src/dataset_gen_eval/quality_agents.py
@@ -0,0 +1,277 @@
+from typing import Any
+from dotenv import load_dotenv
+
+from langchain_google_genai import ChatGoogleGenerativeAI
+from deepeval.metrics.base_metric import BaseMetric
+from deepeval.test_case.llm_test_case import LLMTestCase
+
+load_dotenv()
+
+# Groundedness critique prompt template
+question_groundedness_critique_prompt = """
+You are an expert evaluator tasked with assessing the groundedness of a question-answer pair.
+
+Your task is to evaluate whether the given question is well-grounded in the provided context.
+A well-grounded question should:
+1. Be answerable using information from the context
+2. Not require external knowledge beyond the context
+3. Be specific and factual rather than speculative
+4. Have clear supporting evidence in the context
+
+Please analyze the question and context, then provide a rating from 1 to 5:
+- 1: Completely ungrounded - question cannot be answered from context
+- 2: Poorly grounded - question requires significant external knowledge
+- 3: Moderately grounded - question is partially answerable from context
+- 4: Well grounded - question is mostly answerable from context
+- 5: Perfectly grounded - question is completely answerable from context
+
+Question: {question}
+
+Context: {context}
+
+Please provide your evaluation in the following format:
+Analysis: [Your detailed analysis of why the question is or isn't grounded in the context]
+Total rating: [Your rating from 1 to 5]
+"""
+
+# Question relevance critique prompt template
+question_relevance_critique_prompt = """
+You will be given a question.
+Your task is to provide a 'total rating' representing how useful this question can be to machine learning developers building NLP applications with the Hugging Face ecosystem.
+Give your answer on a scale of 1 to 5, where 1 means that the question is not useful at all, and 5 means that the question is extremely useful.
+
+Provide your answer as follows:
+
+Answer:::
+Evaluation: (your rationale for the rating, as a text)
+Total rating: (your rating, as a number between 1 and 5)
+
+You MUST provide values for 'Evaluation:' and 'Total rating:' in your answer.
+
+Now here is the question.
+
+Question: {question}
+Answer::: """
+
+# Question standalone critique prompt template
+question_standalone_critique_prompt = """
+You will be given a question.
+Your task is to provide a 'total rating' representing how context-independent this question is.
+Give your answer on a scale of 1 to 5, where 1 means that the question depends on additional information to be understood, and 5 means that the question makes sense by itself.
+For instance, if the question refers to a particular setting, like 'in the context' or 'in the document', the rating must be 1.
+The questions can contain obscure technical nouns or acronyms like Gradio, Hub, Hugging Face or Space and still be a 5: it must simply be clear to an operator with access to documentation what the question is about.
+
+For instance, "What is the name of the checkpoint from which the ViT model is imported?" should receive a 1, since there is an implicit mention of a context, thus the question is not independent from the context.
+
+Provide your answer as follows:
+
+Answer:::
+Evaluation: (your rationale for the rating, as a text)
+Total rating: (your rating, as a number between 1 and 5)
+
+You MUST provide values for 'Evaluation:' and 'Total rating:' in your answer.
+
+Now here is the question.
+
+Question: {question}
+Answer::: """
+
+
+class GroundednessMetric(BaseMetric):
+    def __init__(self, threshold: float = 0.6):
+        self.threshold = threshold
+        self.evaluation_model = "gemini-2.5-pro"
+        self.include_reason = True
+
+        self.llm = ChatGoogleGenerativeAI(
+            model=self.evaluation_model,
+            temperature=0.3,
+        )
+
+    def measure(self, test_case: LLMTestCase, *args: Any, **kwargs: Any) -> float:
+        """Synchronous version of the metric evaluation."""
+        prompt = question_groundedness_critique_prompt.format(
+            question=test_case.input, context="\n".join(test_case.context or [])
+        )
+
+        try:
+            response = self.llm.invoke(prompt)
+            response_content = response.content
+            response_text = response_content.strip() if isinstance(response_content, str) else str(response_content).strip()
+
+            # Parse the 1-5 rating from the model response
+            if "Total rating:" in response_text:
+                rating_line = response_text.split("Total rating:")[-1].strip()
+
+                score_1_5 = int(rating_line.split()[0])
+            else:
+                lines = response_text.split("\n")
+                score_1_5 = 3
+                for line in lines:
+                    if any(
+                        keyword in line.lower() for keyword in ["rating:", "score:"]
+                    ):
+                        try:
+                            score_1_5 = int(
+                                [char for char in line if char.isdigit()][0]
+                            )
+                            break
+                        except (IndexError, ValueError):
+                            continue
+
+            # Convert to 0-1 scale as expected by DeepEval
+            self.score = score_1_5 / 5.0
+            self.reason = response_text
+            self.success = self.score >= self.threshold
+
+            return self.score
+
+        except Exception as e:
+            self.score = 0.0
+            self.reason = f"Error during evaluation: {str(e)}"
+            self.success = False
+            self.error = str(e)
+            return self.score
+
+    async def a_measure(self, test_case: LLMTestCase, *args: Any, **kwargs: Any) -> float:
+        """Async version - fallback to synchronous since we don't have async client setup."""
+        return self.measure(test_case, *args, **kwargs)
+
+    def is_successful(self) -> bool:
+        """Check if the metric evaluation was successful."""
+        return False if getattr(self, "error", None) else bool(getattr(self, "success", False))
+
+    def get_metric_name(self) -> str:
+        return "Question Groundedness"
+
+
+class QuestionRelevanceMetric(BaseMetric):
+    """DeepEval metric for evaluating question relevance to ML developers building NLP applications."""
+
+    def __init__(self, threshold: float = 0.6):
+        self.threshold = threshold
+        self.evaluation_model = "gemini-2.5-pro"
+        self.include_reason = True
+
+        self.llm = ChatGoogleGenerativeAI(
+            model=self.evaluation_model,
+            temperature=0.3,
+        )
+
+    def measure(self, test_case: LLMTestCase, *args: Any, **kwargs: Any) -> float:
+        """Synchronous version of the metric evaluation."""
+        prompt = question_relevance_critique_prompt.format(question=test_case.input)
+
+        try:
+            response = self.llm.invoke(prompt)
+            response_content = response.content
+            response_text = response_content.strip() if isinstance(response_content, str) else str(response_content).strip()
+
+            if "Total rating:" in response_text:
+                rating_line = response_text.split("Total rating:")[-1].strip()
+                score_1_5 = int(rating_line.split()[0])
+            else:
+                lines = response_text.split("\n")
+                score_1_5 = 3  # Default score
+                for line in lines:
+                    if any(
+                        keyword in line.lower() for keyword in ["rating:", "score:"]
+                    ):
+                        try:
+                            score_1_5 = int(
+                                [char for char in line if char.isdigit()][0]
+                            )
+                            break
+                        except (IndexError, ValueError):
+                            continue
+
+            # Convert to 0-1 scale as expected by DeepEval
+            self.score = score_1_5 / 5.0
+            self.reason = response_text
+            self.success = self.score >= self.threshold
+
+            return self.score
+
+        except Exception as e:
+            self.score = 0.0
+            self.reason = f"Error during evaluation: {str(e)}"
+            self.success = False
+            self.error = str(e)
+            return self.score
+
+    async def a_measure(self, test_case: LLMTestCase, *args: Any, **kwargs: Any) -> float:
+        """Async version - fallback to synchronous since we don't have async client setup."""
+        return self.measure(test_case, *args, **kwargs)
+
+    def is_successful(self) -> bool:
+        """Check if the metric evaluation was successful."""
+        return False if getattr(self, "error", None) else bool(getattr(self, "success", False))
+
+    def get_metric_name(self) -> str:
+        return "Question Relevance"
+
+
+class QuestionStandaloneMetric(BaseMetric):
+    """DeepEval metric for evaluating question context-independence."""
+
+    def __init__(self, threshold: float = 0.6):
+        self.threshold = threshold
+        self.evaluation_model = "gemini-2.5-pro"
+        self.include_reason = True
+
+        self.llm = ChatGoogleGenerativeAI(
+            model=self.evaluation_model,
+            temperature=0.3,
+        )
+
+    def measure(self, test_case: LLMTestCase, *args: Any, **kwargs: Any) -> float:
+        """Synchronous version of the metric evaluation."""
+        prompt = question_standalone_critique_prompt.format(question=test_case.input)
+
+        try:
+            response = self.llm.invoke(prompt)
+            response_content = response.content
+            response_text = response_content.strip() if isinstance(response_content, str) else str(response_content).strip()
+
+            if "Total rating:" in response_text:
+                rating_line = response_text.split("Total rating:")[-1].strip()
+
+                score_1_5 = int(rating_line.split()[0])
+            else:
+                lines = response_text.split("\n")
+                score_1_5 = 3  # Default score
+                for line in lines:
+                    if any(
+                        keyword in line.lower() for keyword in ["rating:", "score:"]
+                    ):
+                        try:
+                            score_1_5 = int(
+                                [char for char in line if char.isdigit()][0]
+                            )
+                            break
+                        except (IndexError, ValueError):
+                            continue
+
+            self.score = score_1_5 / 5.0
+            self.reason = response_text
+            self.success = self.score >= self.threshold
+
+            return self.score
+
+        except Exception as e:
+            self.score = 0.0
+            self.reason = f"Error during evaluation: {str(e)}"
+            self.success = False
+            self.error = str(e)
+            return self.score
+
+    async def a_measure(self, test_case: LLMTestCase, *args: Any, **kwargs: Any) -> float:
+        """Async version - fallback to synchronous since we don't have async client setup."""
+        return self.measure(test_case, *args, **kwargs)
+
+    def is_successful(self) -> bool:
+        """Check if the metric evaluation was successful."""
+        return False if getattr(self, "error", None) else bool(getattr(self, "success", False))
+
+    def get_metric_name(self) -> str:
+        return "Question Standalone"
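Note: eval_dataset.py above scores only the first QA pair (it breaks out of the loop after one entry). Below is a minimal follow-up sketch, not part of the diff, showing how the same metrics could score the full dataset and keep only the pairs that clear every threshold. The output filename is a placeholder, and the 0.6 cut-off simply reuses the default threshold from quality_agents.py; the script would live next to eval_dataset.py and be run the same way so the relative import resolves.

import json
from deepeval.test_case.llm_test_case import LLMTestCase
from .quality_agents import (
    GroundednessMetric,
    QuestionRelevanceMetric,
    QuestionStandaloneMetric,
)

INPUT_PATH = "data/generated_qa_pairs_gemini_pro_new.json"
OUTPUT_PATH = "data/filtered_qa_pairs.json"  # placeholder name, not in the diff

with open(INPUT_PATH, "r") as f:
    qa_pairs = json.load(f)

metrics = [GroundednessMetric(), QuestionRelevanceMetric(), QuestionStandaloneMetric()]

filtered = []
for entry in qa_pairs:
    test_case = LLMTestCase(
        input=entry["question"], actual_output="", context=[entry["context"]]
    )
    # Keep the pair only if every metric meets its own threshold (0.6 by default).
    if all(metric.measure(test_case) >= metric.threshold for metric in metrics):
        filtered.append(entry)

with open(OUTPUT_PATH, "w", encoding="utf-8") as f:
    json.dump(filtered, f, indent=2, ensure_ascii=False)

print(f"Kept {len(filtered)} of {len(qa_pairs)} QA pairs")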