From 287dd04cc1a1a5106818531d8e1e7acd64fc09f3 Mon Sep 17 00:00:00 2001 From: "vaclis.mbp" Date: Wed, 3 Dec 2025 09:48:38 -0800 Subject: [PATCH 1/6] add semantic search service FastAPI service for semantic course search using FAISS and Sentence Transformers. Supports vector similarity search with index persistence to PVC storage. --- apps/semantic-search/Dockerfile | 17 + apps/semantic-search/app/main.py | 493 ++++++++++++++++++++++++++ apps/semantic-search/requirements.txt | 6 + 3 files changed, 516 insertions(+) create mode 100644 apps/semantic-search/Dockerfile create mode 100644 apps/semantic-search/app/main.py create mode 100644 apps/semantic-search/requirements.txt diff --git a/apps/semantic-search/Dockerfile b/apps/semantic-search/Dockerfile new file mode 100644 index 000000000..3c3e0ed36 --- /dev/null +++ b/apps/semantic-search/Dockerfile @@ -0,0 +1,17 @@ +FROM python:3.11-slim + +ENV PYTHONDONTWRITEBYTECODE=1 \ + PYTHONUNBUFFERED=1 + +WORKDIR /app + +RUN apt-get update \ + && apt-get install -y --no-install-recommends build-essential \ + && rm -rf /var/lib/apt/lists/* + +COPY requirements.txt . +RUN pip install --no-cache-dir -r requirements.txt + +COPY app ./app + +CMD ["uvicorn", "app.main:app", "--host", "0.0.0.0", "--port", "8000"] diff --git a/apps/semantic-search/app/main.py b/apps/semantic-search/app/main.py new file mode 100644 index 000000000..e1c3b13e8 --- /dev/null +++ b/apps/semantic-search/app/main.py @@ -0,0 +1,493 @@ +import logging +import os +import pickle +import threading +from dataclasses import dataclass +from datetime import datetime +from pathlib import Path +from typing import Dict, Iterable, List, Optional, Set, Tuple + +import faiss +import numpy as np +import requests +from fastapi import FastAPI, HTTPException, Query +from pydantic import BaseModel, Field +from sentence_transformers import SentenceTransformer + +logger = logging.getLogger("semantic-search") +logging.basicConfig(level=os.getenv("SEMANTIC_SEARCH_LOG_LEVEL", "INFO")) + +# Directory for persisting FAISS indices +INDEX_STORAGE_DIR = Path(os.getenv("INDEX_STORAGE_DIR", "/app/indexes")) +INDEX_STORAGE_DIR.mkdir(parents=True, exist_ok=True) + +COURSE_QUERY = """ +query Catalog($year: Int!, $semester: Semester!) 
{ + catalog(year: $year, semester: $semester) { + courseNumber + subject + number + course { + title + description + } + } +} +""" + +# BACKEND_URL from env is for external access (http://backend:8080) +# But semantic-search needs internal Docker network access (port 5001) +BACKEND_INTERNAL_URL = "http://backend:5001" +DEFAULT_CATALOG_URL = f"{BACKEND_INTERNAL_URL}/api/graphql" + +# Semantic search embedding model options: +# BAAI/bge-base-en-v1.5 (Current - best for retrieval, 109M params) +# BAAI/bge-small-en-v1.5 (Faster, smaller, 33M params) +# BAAI/bge-large-en-v1.5 (Most accurate, slower, 335M params) +# sentence-transformers/all-mpnet-base-v2 (Good general purpose, 110M params) +# sentence-transformers/all-MiniLM-L6-v2 (Fastest, lightweight, 22M params) +MODEL_NAME = "BAAI/bge-base-en-v1.5" +QUERY_PREFIX = "Represent this sentence for searching relevant passages: " +DEFAULT_YEAR_ENV = os.getenv("SEMANTIC_SEARCH_YEAR") +DEFAULT_SEMESTER_ENV = os.getenv("SEMANTIC_SEARCH_SEMESTER") +DEFAULT_ALLOWED_SUBJECTS = { + token.strip().upper() + for token in os.getenv("SEMANTIC_SEARCH_ALLOWED_SUBJECTS", "").split(",") + if token.strip() +} or None + + +def resolve_default_term(year_value: Optional[str], semester_value: Optional[str]) -> Optional[Tuple[int, str]]: + if year_value and semester_value: + try: + return int(year_value), semester_value.strip() + except ValueError as exc: + logger.error("Invalid default term configuration: %s", exc) + return None + + if year_value or semester_value: + logger.error("Both SEMANTIC_SEARCH_YEAR and SEMANTIC_SEARCH_SEMESTER are required to set a default term.") + return None + + +DEFAULT_TERM = resolve_default_term(DEFAULT_YEAR_ENV, DEFAULT_SEMESTER_ENV) + + +@dataclass +class TermIndex: + index: faiss.IndexFlatIP + embeddings: np.ndarray + courses: List[Dict] + course_texts: List[str] + kept_idx: List[int] + last_refreshed: datetime + year: int + semester: str + allowed_subjects: Optional[List[str]] + + +class SemanticSearchEngine: + def __init__(self) -> None: + self.model = SentenceTransformer(MODEL_NAME) + self.catalog_url = DEFAULT_CATALOG_URL + self.default_allowed_subjects = set(DEFAULT_ALLOWED_SUBJECTS) if DEFAULT_ALLOWED_SUBJECTS else None + self._indices: Dict[str, TermIndex] = {} + self._lock = threading.RLock() + + def _get_index_path(self, year: int, semester: str, allowed_subjects: Optional[List[str]]) -> Path: + """Get filesystem path for persisted index.""" + suffix = ",".join(allowed_subjects) if allowed_subjects else "all" + filename = f"{year}_{semester}_{suffix}.index" + return INDEX_STORAGE_DIR / filename + + def _save_index(self, entry: TermIndex) -> None: + """Save FAISS index and metadata to disk.""" + try: + index_path = self._get_index_path(entry.year, entry.semester, entry.allowed_subjects) + + # Save FAISS index + faiss.write_index(entry.index, str(index_path.with_suffix(".faiss"))) + + # Save metadata (everything except the FAISS index) + metadata = { + "embeddings": entry.embeddings, + "courses": entry.courses, + "course_texts": entry.course_texts, + "kept_idx": entry.kept_idx, + "last_refreshed": entry.last_refreshed, + "year": entry.year, + "semester": entry.semester, + "allowed_subjects": entry.allowed_subjects, + } + with open(index_path.with_suffix(".pkl"), "wb") as f: + pickle.dump(metadata, f) + + logger.info("Saved index to %s", index_path) + except Exception as exc: + logger.warning("Failed to save index to disk: %s", exc) + + def _load_index(self, year: int, semester: str, allowed_subjects: Optional[List[str]]) -> 
Optional[TermIndex]: + """Load FAISS index and metadata from disk if available.""" + try: + index_path = self._get_index_path(year, semester, allowed_subjects) + faiss_file = index_path.with_suffix(".faiss") + pkl_file = index_path.with_suffix(".pkl") + + if not faiss_file.exists() or not pkl_file.exists(): + return None + + # Load FAISS index + index = faiss.read_index(str(faiss_file)) + + # Load metadata + with open(pkl_file, "rb") as f: + metadata = pickle.load(f) + + entry = TermIndex( + index=index, + embeddings=metadata["embeddings"], + courses=metadata["courses"], + course_texts=metadata["course_texts"], + kept_idx=metadata["kept_idx"], + last_refreshed=metadata["last_refreshed"], + year=metadata["year"], + semester=metadata["semester"], + allowed_subjects=metadata["allowed_subjects"], + ) + + logger.info( + "Loaded index from disk for %s %s (subjects=%s, size=%d, last_refreshed=%s)", + entry.semester, + entry.year, + "all" if not entry.allowed_subjects else ",".join(sorted(entry.allowed_subjects)), + len(entry.course_texts), + entry.last_refreshed.isoformat(), + ) + return entry + except Exception as exc: + logger.warning("Failed to load index from disk: %s", exc) + return None + + def refresh( + self, year: int, semester: str, allowed_subjects: Optional[Iterable[str]] = None + ) -> TermIndex: + term_semester = semester.strip() + allowed = self._resolve_allowed_subjects(allowed_subjects) + + logger.info( + "Refreshing semantic search index for %s %s (subjects=%s)", + term_semester, + year, + "all" if not allowed else ",".join(sorted(allowed)), + ) + + raw_courses = self._fetch_courses(year, term_semester) + courses = self._deduplicate_courses(raw_courses) + if not courses: + raise RuntimeError("Catalog response did not contain any courses") + + course_texts: List[str] = [] + kept_idx: List[int] = [] + + for i, course in enumerate(courses): + subj = (course.get("subject") or "").strip() + if allowed and subj and subj.upper() not in allowed: + continue + course_texts.append(self._build_course_text(course)) + kept_idx.append(i) + + if not course_texts: + logger.warning("Subject filter removed every course; rebuilding without filter") + course_texts = [self._build_course_text(course) for course in courses] + kept_idx = list(range(len(courses))) + + embeddings = np.asarray(self.model.encode(course_texts, convert_to_numpy=True), dtype="float32") + faiss.normalize_L2(embeddings) + index = faiss.IndexFlatIP(embeddings.shape[1]) + index.add(embeddings) + + entry = TermIndex( + index=index, + embeddings=embeddings, + courses=courses, + course_texts=course_texts, + kept_idx=kept_idx, + last_refreshed=datetime.utcnow(), + year=year, + semester=term_semester, + allowed_subjects=sorted(allowed) if allowed else None, + ) + + with self._lock: + self._indices[self._key(entry.year, entry.semester, entry.allowed_subjects)] = entry + + # Save index to disk for persistence + self._save_index(entry) + + logger.info("Semantic index ready with %d entries", len(course_texts)) + return entry + + def search( + self, + query: str, + year: int, + semester: str, + threshold: float = 0.3, + allowed_subjects: Optional[Iterable[str]] = None, + ) -> Tuple[List[Dict], TermIndex]: + entry = self._get_or_build_index(year, semester, allowed_subjects) + embeddings = entry.embeddings + + # Search top 500 candidates, then filter by threshold + # This balances performance vs completeness + search_k = min(len(embeddings), 500) + if search_k == 0: + return [], entry + + # BGE models work better with instruction prefix for 
queries + prefixed_query = QUERY_PREFIX + query + query_vec = np.asarray(self.model.encode([prefixed_query], convert_to_numpy=True), dtype="float32") + faiss.normalize_L2(query_vec) + sims, idxs = entry.index.search(query_vec, search_k) + + results = [] + for score, local_idx in zip(sims[0], idxs[0]): + # Apply threshold filter + if score < threshold: + continue + + original = entry.courses[entry.kept_idx[local_idx]] + title = ((original.get("course") or {}).get("title") or "") + desc = ((original.get("course") or {}).get("description") or "") + text = entry.course_texts[local_idx] + results.append( + { + "subject": original.get("subject"), + "courseNumber": original.get("courseNumber"), + "title": title, + "description": desc, + "score": float(score), + "text": text, + } + ) + + # Sort by score only - semantic similarity is more accurate than keyword matching + results.sort(key=lambda r: r["score"], reverse=True) + + # Return all results above threshold + return results, entry + + def describe_indices(self) -> List[Dict]: + with self._lock: + entries = list(self._indices.values()) + return [ + { + "year": entry.year, + "semester": entry.semester, + "allowed_subjects": entry.allowed_subjects, + "size": len(entry.course_texts), + "last_refreshed": entry.last_refreshed.isoformat(), + } + for entry in entries + ] + + def _get_or_build_index( + self, year: int, semester: str, allowed_subjects: Optional[Iterable[str]] + ) -> TermIndex: + canonical_semester = semester.strip() + allowed = self._resolve_allowed_subjects(allowed_subjects) + key = self._key(year, canonical_semester, sorted(allowed) if allowed else None) + + with self._lock: + entry = self._indices.get(key) + + if entry: + return entry + + # Try loading from disk before building + loaded = self._load_index(year, canonical_semester, sorted(allowed) if allowed else None) + if loaded: + with self._lock: + self._indices[key] = loaded + return loaded + + return self.refresh(year, canonical_semester, allowed) + + def _key(self, year: int, semester: str, allowed_subjects: Optional[List[str]]) -> str: + suffix = ",".join(allowed_subjects) if allowed_subjects else "__all__" + return f"{year}:{semester}:{suffix}" + + def _resolve_allowed_subjects( + self, allowed_subjects: Optional[Iterable[str]] + ) -> Optional[Set[str]]: + if allowed_subjects: + cleaned = {item.strip().upper() for item in allowed_subjects if item and item.strip()} + if cleaned: + return cleaned + return set(self.default_allowed_subjects) if self.default_allowed_subjects else None + + def _fetch_courses(self, year: int, semester: str) -> List[Dict]: + resp = requests.post( + self.catalog_url, + json={"query": COURSE_QUERY, "variables": {"year": year, "semester": semester}}, + timeout=60, + ) + resp.raise_for_status() + payload = resp.json() + if "errors" in payload: + raise RuntimeError(f"Catalog query returned errors: {payload['errors']}") + return payload.get("data", {}).get("catalog") or [] + + @staticmethod + def _build_course_text(course: Dict) -> str: + subj = (course.get("subject") or "").strip() + num = course.get("number", "") + title = ((course.get("course") or {}).get("title") or "").strip() + desc = ((course.get("course") or {}).get("description") or "").strip() + return f"SUBJECT: {subj} NUMBER: {num}\nTITLE: {title}\nDESCRIPTION: {desc}\n" + + def _deduplicate_courses(self, courses: List[Dict]) -> List[Dict]: + seen = set() + unique: List[Dict] = [] + dropped = 0 + for course in courses: + course_meta = course.get("course") or {} + subject = ( + 
course_meta.get("subject") + or course.get("subject") + or "" + ).strip().upper() + course_number = ( + course_meta.get("number") + or course.get("courseNumber") + or "" + ).strip().upper() + key = (subject, course_number) + if key in seen: + dropped += 1 + continue + seen.add(key) + unique.append(course) + if dropped: + logger.info("Deduplicated catalog entries: removed %d duplicates", dropped) + return unique + + +engine = SemanticSearchEngine() +app = FastAPI() + + +class RefreshRequest(BaseModel): + year: int = Field(..., ge=2000, le=2100) + semester: str + allowed_subjects: Optional[List[str]] = None + + +VALID_SEMESTERS = {"Fall", "Spring", "Summer", "Winter"} + + +def normalize_semester(semester: str) -> str: + """Normalize semester string to capitalized format and validate.""" + normalized = semester.strip().capitalize() + if normalized not in VALID_SEMESTERS: + raise ValueError(f"Invalid semester '{semester}'. Must be one of: {', '.join(VALID_SEMESTERS)}") + return normalized + + +def resolve_term(year: Optional[int], semester: Optional[str]) -> Tuple[int, str]: + if year is not None and semester: + try: + return year, normalize_semester(semester) + except ValueError as exc: + raise HTTPException(status_code=400, detail=str(exc)) from exc + if DEFAULT_TERM: + return DEFAULT_TERM + raise HTTPException(status_code=400, detail="year and semester are required when no default term is configured") + + +@app.on_event("startup") +def build_index() -> None: + if DEFAULT_TERM: + try: + # Try loading from disk first + year, semester = DEFAULT_TERM + loaded = engine._load_index(year, semester, None) + if loaded: + key = engine._key(loaded.year, loaded.semester, loaded.allowed_subjects) + with engine._lock: + engine._indices[key] = loaded + logger.info("Loaded default index from disk on startup") + else: + # Build fresh if not found on disk + engine.refresh(*DEFAULT_TERM) + except Exception as exc: # pragma: no cover - startup diagnostics + logger.exception("Failed to build semantic search index: %s", exc) + raise + else: + logger.info("No default term configured; waiting for first refresh/search request.") + + +@app.get("/health") +def health(): + indexes = engine.describe_indices() + return { + "status": "ok" if indexes else "waiting", + "model": MODEL_NAME, + "default_term": DEFAULT_TERM, + "indexes": indexes, + } + + +@app.post("/refresh") +def refresh_index(payload: RefreshRequest): + try: + entry = engine.refresh(payload.year, payload.semester, payload.allowed_subjects) + except Exception as exc: + logger.exception("Refresh failed: %s", exc) + raise HTTPException(status_code=500, detail=str(exc)) from exc + + return { + "status": "refreshed", + "year": entry.year, + "semester": entry.semester, + "allowed_subjects": entry.allowed_subjects, + "size": len(entry.course_texts), + "last_refreshed": entry.last_refreshed.isoformat(), + } + + +@app.get("/search") +def search( + query: str, + threshold: float = Query(0.3, ge=0.0, le=1.0), + year: Optional[int] = None, + semester: Optional[str] = None, + allowed_subjects: List[str] = Query(default_factory=list), +): + if not query: + raise HTTPException(status_code=400, detail="query parameter is required") + + try: + resolved_year, resolved_semester = resolve_term(year, semester) + results, entry = engine.search( + query, + resolved_year, + resolved_semester, + threshold=threshold, + allowed_subjects=allowed_subjects or None, + ) + except HTTPException: + raise + except RuntimeError as exc: + raise HTTPException(status_code=503, detail=str(exc)) 
from exc + + return { + "query": query, + "threshold": threshold, + "count": len(results), + "year": resolved_year, + "semester": resolved_semester, + "allowed_subjects": entry.allowed_subjects, + "last_refreshed": entry.last_refreshed.isoformat(), + "results": results, + } diff --git a/apps/semantic-search/requirements.txt b/apps/semantic-search/requirements.txt new file mode 100644 index 000000000..0924ccd48 --- /dev/null +++ b/apps/semantic-search/requirements.txt @@ -0,0 +1,6 @@ +fastapi +uvicorn[standard] +sentence-transformers +faiss-cpu +requests +numpy From 26ad4e96dd66679c8304d2e1c6d21d51225c19c8 Mon Sep 17 00:00:00 2001 From: "vaclis.mbp" Date: Wed, 3 Dec 2025 09:48:46 -0800 Subject: [PATCH 2/6] proxy semantic search through backend Add backend routes to proxy semantic search service. Configure SEMANTIC_SEARCH_URL for service communication. --- apps/backend/scripts/prepare-typedefs.js | 2 +- apps/backend/src/bootstrap/loaders/express.ts | 4 + apps/backend/src/config.ts | 2 + .../src/modules/semantic-search/client.ts | 67 +++++++++++ .../src/modules/semantic-search/controller.ts | 50 ++++++++ .../src/modules/semantic-search/routes.ts | 113 ++++++++++++++++++ 6 files changed, 237 insertions(+), 1 deletion(-) create mode 100644 apps/backend/src/modules/semantic-search/client.ts create mode 100644 apps/backend/src/modules/semantic-search/controller.ts create mode 100644 apps/backend/src/modules/semantic-search/routes.ts diff --git a/apps/backend/scripts/prepare-typedefs.js b/apps/backend/scripts/prepare-typedefs.js index 6b7c47668..4e7f4ff27 100644 --- a/apps/backend/scripts/prepare-typedefs.js +++ b/apps/backend/scripts/prepare-typedefs.js @@ -11,7 +11,7 @@ const typedefFiles = fs.readdirSync(sourceDir) .sort(); // Get all module directories from backend/src/modules (excluding non-module directories) -const excludedDirs = ['cache', 'generated-types']; +const excludedDirs = ['cache', 'generated-types', 'semantic-search']; const moduleDirs = fs.readdirSync(modulesDir, { withFileTypes: true }) .filter(dirent => dirent.isDirectory() && !excludedDirs.includes(dirent.name)) .map(dirent => dirent.name) diff --git a/apps/backend/src/bootstrap/loaders/express.ts b/apps/backend/src/bootstrap/loaders/express.ts index bad42a2b1..160ddd0d2 100644 --- a/apps/backend/src/bootstrap/loaders/express.ts +++ b/apps/backend/src/bootstrap/loaders/express.ts @@ -7,6 +7,7 @@ import helmet from "helmet"; import { RedisClientType } from "redis"; import { config } from "../../config"; +import semanticSearchRoutes from "../../modules/semantic-search/routes"; import passportLoader from "./passport"; export default async ( @@ -58,6 +59,9 @@ export default async ( // load authentication passportLoader(app, redis); + // load semantic search routes + app.use("/api/semantic-search", semanticSearchRoutes); + app.use( config.graphqlPath, expressMiddleware(server, { diff --git a/apps/backend/src/config.ts b/apps/backend/src/config.ts index 8d5f8301e..c40283d85 100644 --- a/apps/backend/src/config.ts +++ b/apps/backend/src/config.ts @@ -35,6 +35,7 @@ export interface Config { GOOGLE_CLIENT_ID: string; GOOGLE_CLIENT_SECRET: string; redisUri: string; + semanticSearchUrl: string; } // All your secrets, keys go here @@ -60,4 +61,5 @@ export const config: Config = { GOOGLE_CLIENT_ID: env("GOOGLE_CLIENT_ID"), GOOGLE_CLIENT_SECRET: env("GOOGLE_CLIENT_SECRET"), redisUri: env("REDIS_URI"), + semanticSearchUrl: env("SEMANTIC_SEARCH_URL"), }; diff --git a/apps/backend/src/modules/semantic-search/client.ts 
b/apps/backend/src/modules/semantic-search/client.ts
new file mode 100644
index 000000000..e3a5f5ab6
--- /dev/null
+++ b/apps/backend/src/modules/semantic-search/client.ts
@@ -0,0 +1,67 @@
+import { config } from "../../config";
+
+interface SemanticSearchResult {
+  subject: string;
+  courseNumber: string;
+  title: string;
+  description: string;
+  score: number;
+  text: string;
+}
+
+interface SemanticSearchResponse {
+  query: string;
+  threshold: number;
+  count: number;
+  year: number;
+  semester: string;
+  allowed_subjects: string[] | null;
+  last_refreshed: string;
+  results: SemanticSearchResult[];
+}
+
+export async function searchSemantic(
+  query: string,
+  year: number,
+  semester: string,
+  allowedSubjects?: string[],
+  threshold: number = 0.3
+): Promise<SemanticSearchResponse> {
+  const params = new URLSearchParams({
+    query,
+    threshold: String(threshold),
+    year: String(year),
+    semester,
+  });
+
+  if (allowedSubjects && allowedSubjects.length > 0) {
+    allowedSubjects.forEach((subject) => {
+      params.append("allowed_subjects", subject);
+    });
+  }
+
+  const url = `${config.semanticSearchUrl}/search?${params}`;
+
+  try {
+    const response = await fetch(url);
+
+    if (!response.ok) {
+      throw new Error(`Semantic search failed: ${response.statusText}`);
+    }
+
+    return (await response.json()) as SemanticSearchResponse;
+  } catch (error) {
+    console.error("Semantic search error:", error);
+    // Return empty results on error, gracefully falling back
+    return {
+      query,
+      threshold,
+      count: 0,
+      year,
+      semester,
+      allowed_subjects: allowedSubjects || null,
+      last_refreshed: new Date().toISOString(),
+      results: [],
+    };
+  }
+}
diff --git a/apps/backend/src/modules/semantic-search/controller.ts b/apps/backend/src/modules/semantic-search/controller.ts
new file mode 100644
index 000000000..9d9a16048
--- /dev/null
+++ b/apps/backend/src/modules/semantic-search/controller.ts
@@ -0,0 +1,50 @@
+import { Request, Response } from "express";
+
+import { searchSemantic } from "./client";
+
+/**
+ * Lightweight semantic search endpoint that only returns course identifiers
+ * Frontend will use these to filter the already-loaded catalog
+ */
+export async function searchCourses(req: Request, res: Response) {
+  const { query, year, semester, threshold } = req.query;
+
+  if (!query || typeof query !== "string") {
+    return res.status(400).json({ error: "query parameter is required" });
+  }
+
+  const yearNum = year ? parseInt(year as string, 10) : undefined;
+  const semesterStr = semester as string | undefined;
+  const thresholdNum = threshold ? parseFloat(threshold as string) : 0.3;
+
+  try {
+    const results = await searchSemantic(
+      query,
+      yearNum!,
+      semesterStr!,
+      undefined,
+      thresholdNum
+    );
+
+    // Return lightweight response: only subject + courseNumber + score
+    const courseIds = results.results.map((r) => ({
+      subject: r.subject,
+      courseNumber: r.courseNumber,
+      score: r.score,
+    }));
+
+    return res.json({
+      query,
+      threshold: thresholdNum,
+      results: courseIds,
+      count: courseIds.length,
+    });
+  } catch (error) {
+    console.error("Semantic search error:", error);
+    return res.status(500).json({
+      error: "Semantic search failed",
+      results: [],
+      count: 0,
+    });
+  }
+}
diff --git a/apps/backend/src/modules/semantic-search/routes.ts b/apps/backend/src/modules/semantic-search/routes.ts
new file mode 100644
index 000000000..198ae23f2
--- /dev/null
+++ b/apps/backend/src/modules/semantic-search/routes.ts
@@ -0,0 +1,113 @@
+import { type Response, Router } from "express";
+import type { ParsedQs } from "qs";
+import { RequestInit, fetch } from "undici";
+
+import { config } from "../../config";
+import { searchCourses } from "./controller";
+
+const router = Router();
+const baseUrl = config.semanticSearchUrl.replace(/\/$/, "");
+
+type QueryValue = string | ParsedQs | Array<string | ParsedQs> | undefined;
+
+const asString = (value: QueryValue): string | undefined => {
+  if (!value) return undefined;
+  if (typeof value === "string") return value;
+  if (Array.isArray(value)) {
+    for (const entry of value) {
+      const found = asString(entry as QueryValue);
+      if (found) return found;
+    }
+  }
+  return undefined;
+};
+
+const toStringList = (value: QueryValue): string[] => {
+  if (!value) return [];
+  if (Array.isArray(value)) {
+    const items: string[] = [];
+    for (const entry of value) {
+      items.push(...toStringList(entry as QueryValue));
+    }
+    return items;
+  }
+  return typeof value === "string" && value.length > 0 ? [value] : [];
+};
+
+async function forward(
+  target: string,
+  init: RequestInit,
+  res: Response
+): Promise<void> {
+  try {
+    const response = await fetch(target, init);
+    const contentType = response.headers.get("content-type") ?? "";
+    const raw = await response.text();
+
+    if (contentType.includes("application/json")) {
+      const payload = raw ? JSON.parse(raw) : {};
+      res.status(response.status).json(payload);
+    } else {
+      res.status(response.status).send(raw);
+    }
+  } catch (error) {
+    console.error("Semantic search proxy error:", error);
+    res.status(502).json({
+      error: "Unable to reach semantic search service",
+      details: String(error),
+    });
+  }
+}
+
+router.get("/health", async (_req, res) => {
+  await forward(`${baseUrl}/health`, { method: "GET" }, res);
+});
+
+router.post("/refresh", async (req, res) => {
+  const body = req.body ?? 
{}; + await forward( + `${baseUrl}/refresh`, + { + method: "POST", + headers: { "content-type": "application/json" }, + body: JSON.stringify(body), + }, + res + ); +}); + +// Lightweight endpoint: returns only course identifiers for frontend filtering +router.get("/courses", searchCourses); + +// Full proxy endpoint (kept for backwards compatibility) +router.get("/search", async (req, res) => { + const query = asString(req.query.query); + if (!query || !query.trim()) { + res.status(400).json({ error: "query parameter is required" }); + return; + } + + const params = new URLSearchParams({ query }); + + const topK = asString(req.query.top_k); + if (topK) params.set("top_k", topK); + + const year = asString(req.query.year); + if (year) params.set("year", year); + + const semester = asString(req.query.semester); + if (semester) params.set("semester", semester); + + const allowedSubjects = toStringList(req.query.allowed_subjects); + allowedSubjects.forEach((subject) => + params.append("allowed_subjects", subject) + ); + + await forward( + `${baseUrl}/search?${params.toString()}`, + { method: "GET" }, + res + ); +}); + +export default router; From 7abbfeb2e490b93419a0cf78e6047cdd34e48d0d Mon Sep 17 00:00:00 2001 From: "vaclis.mbp" Date: Wed, 3 Dec 2025 09:48:56 -0800 Subject: [PATCH 3/6] auto-refresh semantic search index via datapuller Add semantic-search-refresh puller to automatically rebuild FAISS indexes when course data updates. --- apps/datapuller/src/main.ts | 2 + .../datapuller/src/pullers/semantic-search.ts | 66 +++++++++++++++++++ apps/datapuller/src/shared/config.ts | 2 + 3 files changed, 70 insertions(+) create mode 100644 apps/datapuller/src/pullers/semantic-search.ts diff --git a/apps/datapuller/src/main.ts b/apps/datapuller/src/main.ts index 360b84c7f..6e76c331d 100644 --- a/apps/datapuller/src/main.ts +++ b/apps/datapuller/src/main.ts @@ -6,6 +6,7 @@ import enrollmentHistoriesPuller from "./pullers/enrollment"; import enrollmentTimeframePuller from "./pullers/enrollment-timeframe"; import gradeDistributionsPuller from "./pullers/grade-distributions"; import sectionsPuller from "./pullers/sections"; +import semanticSearchPuller from "./pullers/semantic-search"; import termsPuller from "./pullers/terms"; import setup from "./shared"; import { Config } from "./shared/config"; @@ -30,6 +31,7 @@ const pullerMap: { "enrollment-timeframe": enrollmentTimeframePuller.syncEnrollmentTimeframe, "terms-all": termsPuller.allTerms, "terms-nearby": termsPuller.nearbyTerms, + "semantic-search-refresh": semanticSearchPuller.refreshSemanticSearch, } as const; const runPuller = async () => { diff --git a/apps/datapuller/src/pullers/semantic-search.ts b/apps/datapuller/src/pullers/semantic-search.ts new file mode 100644 index 000000000..1a366bf09 --- /dev/null +++ b/apps/datapuller/src/pullers/semantic-search.ts @@ -0,0 +1,66 @@ +import { TermModel } from "@repo/common"; + +import { Config } from "../shared/config"; + +const refreshSemanticSearch = async (config: Config) => { + const { log, SEMANTIC_SEARCH_URL } = config; + + log.trace("Refreshing semantic search indices..."); + + // Find all active terms (terms that are currently open or will open soon) + const now = new Date(); + const activeTerms = await TermModel.find({ + endDate: { $gte: now }, + }) + .sort({ startDate: 1 }) + .limit(3) // Refresh current and next 2 terms + .lean(); + + if (activeTerms.length === 0) { + log.info("No active terms found to refresh."); + return; + } + + log.info(`Found ${activeTerms.length} active term(s) to 
refresh.`); + + for (const term of activeTerms) { + try { + const year = term.year; + const semester = term.semester; + + log.trace(`Refreshing index for ${year} ${semester}...`); + + const response = await fetch(`${SEMANTIC_SEARCH_URL}/refresh`, { + method: "POST", + headers: { + "Content-Type": "application/json", + }, + body: JSON.stringify({ + year, + semester, + }), + }); + + if (!response.ok) { + const errorText = await response.text(); + throw new Error( + `Failed to refresh ${year} ${semester}: ${response.status} ${errorText}` + ); + } + + const result = await response.json(); + log.info(`Refreshed ${year} ${semester}: ${result.size} courses indexed`); + } catch (error: any) { + log.error( + `Error refreshing ${term.year} ${term.semester}: ${error.message}` + ); + // Continue with other terms even if one fails + } + } + + log.trace("Semantic search refresh completed."); +}; + +export default { + refreshSemanticSearch, +}; diff --git a/apps/datapuller/src/shared/config.ts b/apps/datapuller/src/shared/config.ts index 1e574cb4d..35964724b 100644 --- a/apps/datapuller/src/shared/config.ts +++ b/apps/datapuller/src/shared/config.ts @@ -32,6 +32,7 @@ export interface Config { WORKGROUP: string; }; BACKEND_URL: string; + SEMANTIC_SEARCH_URL: string; } export function loadConfig(): Config { @@ -64,5 +65,6 @@ export function loadConfig(): Config { WORKGROUP: env("AWS_WORKGROUP"), }, BACKEND_URL: env("BACKEND_URL"), + SEMANTIC_SEARCH_URL: env("SEMANTIC_SEARCH_URL"), }; } From ea05329a06e38d9d37bb22fd529204a71a89f401 Mon Sep 17 00:00:00 2001 From: "vaclis.mbp" Date: Wed, 3 Dec 2025 10:26:37 -0800 Subject: [PATCH 4/6] add k8s deployment for semantic search Configure Kubernetes deployment with PVC for FAISS indexes, daily cronjob for auto-refresh, and docker-compose for local development. --- docker-compose.yml | 12 +++ infra/app/templates/_helpers.tpl | 9 +++ infra/app/templates/backend.yaml | 1 + infra/app/templates/datapuller.yaml | 1 + infra/app/templates/semantic-search.yaml | 97 ++++++++++++++++++++++++ infra/app/values.yaml | 16 ++++ 6 files changed, 136 insertions(+) create mode 100644 infra/app/templates/semantic-search.yaml diff --git a/docker-compose.yml b/docker-compose.yml index 2025314cd..f385869f0 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -20,6 +20,18 @@ services: - ./.env:/backend/apps/backend/.env - ./apps/backend:/backend/apps/backend - ./packages:/backend/packages + semantic-search: + build: + context: ./apps/semantic-search + dockerfile: Dockerfile + networks: + - bt + ports: + - 8000:8000 + restart: always + volumes: + - ./apps/semantic-search/app:/app/app + - ./data/semantic-search/indexes:/app/indexes frontend: build: context: . diff --git a/infra/app/templates/_helpers.tpl b/infra/app/templates/_helpers.tpl index b0839aa5d..a663c8314 100644 --- a/infra/app/templates/_helpers.tpl +++ b/infra/app/templates/_helpers.tpl @@ -35,6 +35,11 @@ app.kubernetes.io/name: cleanup {{ include "bt-app.labels" . }} {{- end -}} +{{- define "bt-app.semanticSearchLabels" -}} +app.kubernetes.io/name: semantic-search +{{ include "bt-app.labels" . 
}} +{{- end -}} + {{- define "bt-app.backendName" -}} {{ .Release.Name }}-backend {{- end -}} @@ -50,3 +55,7 @@ app.kubernetes.io/name: cleanup {{- define "bt-app.datapullerName" -}} {{ .Release.Name }}-datapuller {{- end -}} + +{{- define "bt-app.semanticSearchName" -}} +{{ .Release.Name }}-semantic-search +{{- end -}} diff --git a/infra/app/templates/backend.yaml b/infra/app/templates/backend.yaml index d2d6b2fe0..e27356057 100644 --- a/infra/app/templates/backend.yaml +++ b/infra/app/templates/backend.yaml @@ -50,6 +50,7 @@ data: NODE_ENV: {{ .Values.nodeEnv }} MONGODB_URI: {{ .Values.mongoUri }} REDIS_URI: {{ .Values.redisUri }} + SEMANTIC_SEARCH_URL: {{ printf "http://%s-svc:%d" (include "bt-app.semanticSearchName" .) (.Values.semanticSearch.port | int) | quote }} SIS_CLASS_APP_ID: "_" # TODO: remove from backend SIS_CLASS_APP_KEY: "_" SIS_COURSE_APP_ID: "_" diff --git a/infra/app/templates/datapuller.yaml b/infra/app/templates/datapuller.yaml index adbed57ba..85e2370f5 100644 --- a/infra/app/templates/datapuller.yaml +++ b/infra/app/templates/datapuller.yaml @@ -52,6 +52,7 @@ data: MONGODB_URI: {{ .Values.mongoUri }} TZ: America/Los_Angeles # for tslog BACKEND_URL: {{ printf "http://%s-svc.bt.svc.cluster.local:%v" (include "bt-app.backendName" .) .Values.backend.cacheWarmingPort }} + SEMANTIC_SEARCH_URL: {{ printf "http://%s-svc:%d" (include "bt-app.semanticSearchName" .) (.Values.semanticSearch.port | int) | quote }} --- diff --git a/infra/app/templates/semantic-search.yaml b/infra/app/templates/semantic-search.yaml new file mode 100644 index 000000000..0a4209b8b --- /dev/null +++ b/infra/app/templates/semantic-search.yaml @@ -0,0 +1,97 @@ +apiVersion: apps/v1 +kind: Deployment +metadata: + name: {{ include "bt-app.semanticSearchName" . }} + labels: + {{- include "bt-app.semanticSearchLabels" . | nindent 4 }} +spec: + replicas: {{ .Values.semanticSearch.replicas }} + selector: + matchLabels: + {{- include "bt-app.semanticSearchLabels" . | nindent 6 }} + template: + metadata: + labels: + {{- include "bt-app.semanticSearchLabels" . | nindent 8 }} + spec: + containers: + - name: semantic-search + image: {{ printf "%s/%s:%s" .Values.semanticSearch.image.registry .Values.semanticSearch.image.repository ( toString .Values.semanticSearch.image.tag ) }} + {{- if eq .Values.semanticSearch.image.tag "local" }} + imagePullPolicy: IfNotPresent + {{- else }} + imagePullPolicy: Always + {{- end }} + ports: + - containerPort: {{ .Values.semanticSearch.port }} + envFrom: + - configMapRef: + name: {{ include "bt-app.semanticSearchName" . }}-env + readinessProbe: + httpGet: + path: /health + port: {{ .Values.semanticSearch.port }} + initialDelaySeconds: 10 + periodSeconds: 5 + volumeMounts: + - name: indexes + mountPath: /app/indexes + volumes: + - name: indexes + persistentVolumeClaim: + claimName: {{ include "bt-app.semanticSearchName" . }}-pvc + +--- + +apiVersion: v1 +kind: ConfigMap +metadata: + name: {{ include "bt-app.semanticSearchName" . }}-env + labels: + {{- include "bt-app.semanticSearchLabels" . 
| nindent 4 }} +data: + INDEX_STORAGE_DIR: "/app/indexes" + SEMANTIC_SEARCH_LOG_LEVEL: {{ .Values.semanticSearch.logLevel | quote }} + {{- if .Values.semanticSearch.defaultYear }} + SEMANTIC_SEARCH_YEAR: {{ .Values.semanticSearch.defaultYear | quote }} + {{- end }} + {{- if .Values.semanticSearch.defaultSemester }} + SEMANTIC_SEARCH_SEMESTER: {{ .Values.semanticSearch.defaultSemester | quote }} + {{- end }} + {{- if .Values.semanticSearch.allowedSubjects }} + SEMANTIC_SEARCH_ALLOWED_SUBJECTS: {{ .Values.semanticSearch.allowedSubjects | quote }} + {{- end }} + +--- + +apiVersion: v1 +kind: Service +metadata: + name: {{ include "bt-app.semanticSearchName" . }}-svc + labels: + {{- include "bt-app.semanticSearchLabels" . | nindent 4 }} +spec: + selector: + {{- include "bt-app.semanticSearchLabels" . | nindent 4 }} + ports: + - protocol: TCP + port: {{ .Values.semanticSearch.port }} + targetPort: {{ .Values.semanticSearch.port }} + +--- + +apiVersion: v1 +kind: PersistentVolumeClaim +metadata: + name: {{ include "bt-app.semanticSearchName" . }}-pvc + labels: + {{- include "bt-app.semanticSearchLabels" . | nindent 4 }} +spec: + accessModes: + - ReadWriteOnce + resources: + requests: + storage: {{ .Values.semanticSearch.storage }} + {{- if .Values.semanticSearch.storageClassName }} + storageClassName: {{ .Values.semanticSearch.storageClassName }} + {{- end }} diff --git a/infra/app/values.yaml b/infra/app/values.yaml index 0a2adb6a5..85fb9fd1a 100644 --- a/infra/app/values.yaml +++ b/infra/app/values.yaml @@ -33,6 +33,18 @@ backend: repository: octoberkeleytime/bt-backend tag: prod +semanticSearch: + replicas: 1 + port: 8000 + logLevel: INFO + + image: + registry: docker.io + repository: octoberkeleytime/bt-semantic-search + tag: prod + + storage: 5Gi + datapuller: suspend: false image: @@ -79,3 +91,7 @@ datapuller: enrollment-timeframe: schedule: "40 4 * * *" args: ["--puller=enrollment-timeframe"] + + semantic-search-refresh: + schedule: "10 4 * * *" + args: ["--puller=semantic-search-refresh"] From 81b500d15e43e2eaddb5ce5730f8fe47bb83b1d6 Mon Sep 17 00:00:00 2001 From: "vaclis.mbp" Date: Wed, 3 Dec 2025 10:48:27 -0800 Subject: [PATCH 5/6] frontend support for semantic search button --- .../ClassBrowser/Filters/Filters.module.scss | 5 + .../ClassBrowser/Header/Header.module.scss | 23 +++++ .../components/ClassBrowser/Header/index.tsx | 51 ++++++++++- .../ClassBrowser/List/List.module.scss | 1 + .../components/ClassBrowser/List/index.tsx | 5 +- .../components/ClassBrowser/browserContext.ts | 4 + .../src/components/ClassBrowser/index.tsx | 91 ++++++++++++++++--- 7 files changed, 164 insertions(+), 16 deletions(-) diff --git a/apps/frontend/src/components/ClassBrowser/Filters/Filters.module.scss b/apps/frontend/src/components/ClassBrowser/Filters/Filters.module.scss index 48b88f103..f37e63451 100644 --- a/apps/frontend/src/components/ClassBrowser/Filters/Filters.module.scss +++ b/apps/frontend/src/components/ClassBrowser/Filters/Filters.module.scss @@ -3,6 +3,7 @@ flex-shrink: 0; overflow-y: auto; background-color: var(--foreground-color); + scrollbar-color: var(--label-color) var(--foreground-color); @media (width <= 992px) { width: 384px; @@ -39,6 +40,10 @@ &:hover { color: var(--blue-hover); + + @media (prefers-color-scheme: dark) { + color: var(--blue-400); + } } } diff --git a/apps/frontend/src/components/ClassBrowser/Header/Header.module.scss b/apps/frontend/src/components/ClassBrowser/Header/Header.module.scss index 7ac5d0875..61e2cc37c 100644 --- 
a/apps/frontend/src/components/ClassBrowser/Header/Header.module.scss +++ b/apps/frontend/src/components/ClassBrowser/Header/Header.module.scss @@ -44,6 +44,29 @@ color: var(--paragraph-color); } } + + .sparksButton { + margin: 0 -8px 0 0; + padding: 0; + + &.active { + color: var(--blue-500); + background-color: color-mix(in srgb, var(--blue-500) 10%, transparent); + border-radius: 4px; + } + } + } + + .aiSearchButton { + width: 100%; + background-color: var(--blue-500); + color: white; + border: 1px solid var(--blue-500); + + &:hover { + background-color: var(--blue-600); + border-color: var(--blue-600); + } } .filterButton { diff --git a/apps/frontend/src/components/ClassBrowser/Header/index.tsx b/apps/frontend/src/components/ClassBrowser/Header/index.tsx index 988b1f8a2..c90d90269 100644 --- a/apps/frontend/src/components/ClassBrowser/Header/index.tsx +++ b/apps/frontend/src/components/ClassBrowser/Header/index.tsx @@ -1,7 +1,13 @@ import classNames from "classnames"; -import { Filter, FilterSolid, Search } from "iconoir-react"; +import { + Filter, + FilterSolid, + Search, + Sparks, + SparksSolid, +} from "iconoir-react"; -import { Button } from "@repo/theme"; +import { Button, IconButton } from "@repo/theme"; import useBrowser from "../useBrowser"; import styles from "./Header.module.scss"; @@ -17,8 +23,25 @@ export default function Header() { year, responsive, hasActiveFilters, + aiSearchActive, + setAiSearchActive, + handleSemanticSearch, + semanticLoading, } = useBrowser(); + const handleAiSearchSubmit = () => { + if (aiSearchActive && query.trim()) { + handleSemanticSearch(); + } + }; + + const handleKeyDown = (e: React.KeyboardEvent) => { + if (e.key === "Enter" && aiSearchActive) { + e.preventDefault(); + handleAiSearchSubmit(); + } + }; + return (
       <input
         value={query}
         onChange={(event) => updateQuery(event.target.value)}
+        onKeyDown={handleKeyDown}
         placeholder={`Search ${semester} ${year} classes...`}
         onFocus={() => setExpanded(false)}
         autoFocus
         autoComplete="off"
       />
-      <p className={styles.count}>{classes.length.toLocaleString()}</p>
+      {!aiSearchActive && (
+        <p className={styles.count}>{classes.length.toLocaleString()}</p>
+      )}
+      <IconButton
+        className={classNames(styles.sparksButton, {
+          [styles.active]: aiSearchActive,
+        })}
+        onClick={() => setAiSearchActive(!aiSearchActive)}
+        aria-label="AI Search"
+      >
+        {aiSearchActive ? <SparksSolid /> : <Sparks />}
+      </IconButton>
+      {aiSearchActive && (
+        <Button
+          className={styles.aiSearchButton}
+          onClick={handleAiSearchSubmit}
+          disabled={semanticLoading}
+        >
+          {semanticLoading ? "Searching..." : "AI Search"}
+        </Button>
+      )}
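
The header above calls `handleSemanticSearch` from `useBrowser`; its implementation
lives in `apps/frontend/src/components/ClassBrowser/index.tsx`, which appears in the
diffstat but is not shown in this excerpt. A minimal sketch of what such a handler can
look like against the lightweight `/api/semantic-search/courses` route added in
PATCH 2/6 — the hook name, the `SemanticHit` type, and the state fields below are
illustrative assumptions, not the actual implementation:

    import { useCallback, useState } from "react";

    // Shape of each entry in GET /api/semantic-search/courses (see controller.ts).
    interface SemanticHit {
      subject: string;
      courseNumber: string;
      score: number;
    }

    // Hypothetical hook: fetches semantic matches so the class list can be
    // filtered down to courses whose (subject, courseNumber) appear in `matches`.
    export function useSemanticSearch(
      query: string,
      year: number,
      semester: string
    ) {
      const [semanticLoading, setSemanticLoading] = useState(false);
      const [matches, setMatches] = useState<SemanticHit[] | null>(null);

      const handleSemanticSearch = useCallback(async () => {
        setSemanticLoading(true);
        try {
          const params = new URLSearchParams({
            query,
            year: String(year),
            semester,
          });
          const response = await fetch(`/api/semantic-search/courses?${params}`);
          if (!response.ok) throw new Error(response.statusText);
          const payload = (await response.json()) as { results: SemanticHit[] };
          // Keep scores so the list can be ordered by semantic relevance.
          setMatches(payload.results);
        } catch {
          // On failure, fall back to plain keyword filtering.
          setMatches(null);
        } finally {
          setSemanticLoading(false);
        }
      }, [query, year, semester]);

      return { semanticLoading, matches, handleSemanticSearch };
    }

Returning only (subject, courseNumber, score) mirrors the controller's lightweight
response: the frontend already holds the full catalog, so matches are applied as a
client-side filter rather than re-fetching course data.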