Reviewed patch: JWT login, role-scoped FAISS retrieval, packaging metadata.

Amendments made during review (hunk line counts updated to match):
  * app/main.py: drop the duplicated `from fastapi import Request`; parse the
    bearer token by stripping the scheme prefix; /test-retrieve now uses the
    same Depends(get_current_user) pattern as /chat and maps a missing role
    index (ValueError) to HTTP 404.
  * app/services/auth_service.py: default JWT_ALGORITHM / JWT_EXPIRATION_SECONDS
    so a missing variable no longer raises TypeError at import time.
  * app/services/retriever_service.py: ignore the -1 labels FAISS returns when
    fewer than top_k neighbours exist; flag the pickle.load trust assumption.

Note: blob ids on the "index" lines are those of the originally submitted
revision; regenerate the patch with `git diff` after applying if they matter.

diff --git a/app/main.py b/app/main.py
index 8342c850..93af9d1b 100644
--- a/app/main.py
+++ b/app/main.py
@@ -1,46 +1,67 @@
-from typing import Dict
-
-from fastapi import FastAPI, HTTPException, Depends
-from fastapi.security import HTTPBasic, HTTPBasicCredentials
-
+from fastapi import FastAPI, Depends, HTTPException, Request
+from fastapi.middleware.cors import CORSMiddleware
+from fastapi.security import OAuth2PasswordRequestForm
+from app.services.auth_service import authenticate_user, create_access_token, decode_access_token
+from app.schemas.token import Token
+from app.services.retriever_service import retrieve_similar_docs
 
 app = FastAPI()
-security = HTTPBasic()
 
-# Dummy user database
-users_db: Dict[str, Dict[str, str]] = {
-    "Tony": {"password": "password123", "role": "engineering"},
-    "Bruce": {"password": "securepass", "role": "marketing"},
-    "Sam": {"password": "financepass", "role": "finance"},
-    "Peter": {"password": "pete123", "role": "engineering"},
-    "Sid": {"password": "sidpass123", "role": "marketing"},
-    "Natasha": {"passwoed": "hrpass123", "role": "hr"}
-}
+app.add_middleware(
+    CORSMiddleware,
+    allow_origins=["*"], allow_credentials=True,
+    allow_methods=["*"], allow_headers=["*"]
+)
 
-
-# Authentication dependency
-def authenticate(credentials: HTTPBasicCredentials = Depends(security)):
-    username = credentials.username
-    password = credentials.password
-    user = users_db.get(username)
-    if not user or user["password"] != password:
+@app.post("/login", response_model=Token)
+def login(form_data: OAuth2PasswordRequestForm = Depends()):
+    user = authenticate_user(form_data.username, form_data.password)
+    if not user:
         raise HTTPException(status_code=401, detail="Invalid credentials")
-    return {"username": username, "role": user["role"]}
-
-
-# Login endpoint
-@app.get("/login")
-def login(user=Depends(authenticate)):
-    return {"message": f"Welcome {user['username']}!", "role": user["role"]}
-
-
-# Protected test endpoint
-@app.get("/test")
-def test(user=Depends(authenticate)):
-    return {"message": f"Hello {user['username']}! You can now chat.", "role": user["role"]}
-
+
+    token = create_access_token({
+        "sub": user["username"],
+        "role": user["role"]
+    })
+    return {"access_token": token, "token_type": "bearer"}
+
+def get_current_user(request: Request):
+    """Resolve the caller from the ``Authorization: Bearer <token>`` header.
+
+    Returns the dict produced by ``decode_access_token``; raises a 401
+    ``HTTPException`` when the header is missing/malformed or the token
+    does not decode.
+    """
+    auth_header = request.headers.get("Authorization")
+    if not auth_header or not auth_header.startswith("Bearer "):
+        raise HTTPException(status_code=401, detail="Missing or invalid token")
+
+    # Strip the scheme prefix rather than splitting on spaces so extra
+    # whitespace after "Bearer" cannot yield an empty token.
+    token = auth_header.removeprefix("Bearer ").strip()
+    user_data = decode_access_token(token)
+    if not user_data:
+        raise HTTPException(status_code=401, detail="Invalid token")
+
+    return user_data
 
-# Protected chat endpoint
 @app.post("/chat")
-def query(user=Depends(authenticate), message: str = "Hello"):
-    return "Implement this endpoint."
\ No newline at end of file
+def chat(query: str, user=Depends(get_current_user)):
+    if user["role"] not in ["engineering", "hr", "finance", "marketing"]:
+        raise HTTPException(status_code=403, detail="Access denied for your role.")
+
+    # Dummy role-based logic for now
+    return {
+        "response": f"Hi {user['username']}, you asked: '{query}' (role: {user['role']})"
+    }
+
+
+@app.get("/test-retrieve")
+def test_retrieve(query: str, user=Depends(get_current_user)):
+    """Return the documents most similar to *query* for the caller's role."""
+    try:
+        return retrieve_similar_docs(query, user["role"])
+    except ValueError as exc:
+        # load_faiss_index raises ValueError when the role has no index;
+        # report that as 404 instead of an unhandled 500.
+        raise HTTPException(status_code=404, detail=str(exc)) from exc
diff --git a/app/schemas/token.py b/app/schemas/token.py
new file mode 100644
index 00000000..6b585b9a
--- /dev/null
+++ b/app/schemas/token.py
@@ -0,0 +1,5 @@
+from pydantic import BaseModel
+
+class Token(BaseModel):
+    access_token: str
+    token_type: str
diff --git a/app/schemas/user.py b/app/schemas/user.py
new file mode 100644
index 00000000..e6cba000
--- /dev/null
+++ b/app/schemas/user.py
@@ -0,0 +1,5 @@
+from pydantic import BaseModel
+
+class UserLogin(BaseModel):
+    username: str
+    password: str
diff --git a/app/services/auth_service.py b/app/services/auth_service.py
new file mode 100644
index 00000000..27c08fd6
--- /dev/null
+++ b/app/services/auth_service.py
@@ -0,0 +1,32 @@
+import os
+from datetime import datetime, timedelta, timezone
+from jose import jwt, JWTError
+from dotenv import load_dotenv
+from app.utils.user_db import users_db
+
+load_dotenv(dotenv_path=os.path.join(os.path.dirname(__file__), '..', '.env'))
+
+SECRET_KEY = os.getenv("JWT_SECRET")
+# Default the optional settings: int(None) would otherwise raise TypeError at
+# import time when JWT_EXPIRATION_SECONDS is absent from the environment.
+ALGORITHM = os.getenv("JWT_ALGORITHM", "HS256")
+EXPIRATION = int(os.getenv("JWT_EXPIRATION_SECONDS", "3600"))
+
+def authenticate_user(username: str, password: str):
+    user = users_db.get(username)
+    if user and user["password"] == password:
+        return {"username": username, "role": user["role"]}
+    return None
+
+def create_access_token(data: dict):
+    to_encode = data.copy()
+    expire = datetime.now(timezone.utc) + timedelta(seconds=EXPIRATION)
+    to_encode.update({"exp": expire})
+    return jwt.encode(to_encode, SECRET_KEY, algorithm=ALGORITHM)
+
+def decode_access_token(token: str):
+    try:
+        payload = jwt.decode(token, SECRET_KEY, algorithms=[ALGORITHM])
+        return {"username": payload.get("sub"), "role": payload.get("role")}
+    except JWTError:
+        return None
diff --git a/app/services/llm_service.py b/app/services/llm_service.py
new file mode 100644
index 00000000..e69de29b
diff --git a/app/services/retriever_service.py b/app/services/retriever_service.py
new file mode 100644
index 00000000..28b0cb54
--- /dev/null
+++ b/app/services/retriever_service.py
@@ -0,0 +1,37 @@
+import os
+import pickle
+import faiss
+from sentence_transformers import SentenceTransformer
+import numpy as np
+
+VECTOR_DIR = "vector_data"
+embedding_model = SentenceTransformer("all-MiniLM-L6-v2")
+
+
+def load_faiss_index(role: str):
+    index_path = os.path.join(VECTOR_DIR, f"{role}_index.faiss")
+    docs_path = os.path.join(VECTOR_DIR, f"{role}_docs.pkl")
+
+    if not os.path.exists(index_path) or not os.path.exists(docs_path):
+        raise ValueError(f"No vector index found for role: {role}")
+
+    index = faiss.read_index(index_path)
+    # NOTE(review): pickle.load executes arbitrary code from the file; only
+    # load .pkl files produced by app/services/vector_store.py.
+    with open(docs_path, "rb") as f:
+        documents = pickle.load(f)
+
+    return index, documents
+
+
+def retrieve_similar_docs(query: str, role: str, top_k: int = 3) -> list[str]:
+    """Return up to *top_k* documents from the role's index closest to *query*."""
+    index, documents = load_faiss_index(role)
+    query_vector = embedding_model.encode([query])
+
+    _, indices = index.search(query_vector, top_k)
+    # FAISS pads missing neighbours with -1; without the lower bound that
+    # would silently index documents[-1] and return an unrelated document.
+    results = [documents[i] for i in indices[0] if 0 <= i < len(documents)]
+
+    return results
diff --git a/app/services/vector_store.py b/app/services/vector_store.py
new file mode 100644
index 00000000..be0004a0
--- /dev/null
+++ b/app/services/vector_store.py
@@ -0,0 +1,65 @@
+from sentence_transformers import SentenceTransformer
+import faiss
+import os
+import pickle
+import pandas as pd
+
+embedding_model = SentenceTransformer("all-MiniLM-L6-v2")
+
+VECTOR_DIR = "vector_data"
+RESOURCE_DIR = "resources/data"
+
+
+def load_documents(role: str) -> list[str]:
+    role_path = os.path.join(RESOURCE_DIR, role)
+    documents = []
+
+    if not os.path.exists(role_path):
+        print(f"⚠️ Directory for role '{role}' does not exist: {role_path}")
+        return []
+
+    for filename in os.listdir(role_path):
+        file_path = os.path.join(role_path, filename)
+
+        try:
+            if filename.endswith(".csv"):
+                df = pd.read_csv(file_path)
+                for col in df.select_dtypes(include=[object]):
+                    documents.extend(df[col].dropna().astype(str).tolist())
+
+            elif filename.endswith(".txt") or filename.endswith(".md"):
+                with open(file_path, "r", encoding="utf-8") as f:
+                    text = f.read().strip()
+                    if text:
+                        documents.append(text)
+
+        except Exception as e:
+            print(f"⚠️ Failed to read {file_path}: {e}")
+
+    print(f"✅ Loaded {len(documents)} documents for role: {role}")
+    return documents
+
+
+def build_faiss_index(role: str, documents: list[str]):
+    if not documents:
+        print(f"⚠️ Skipping {role}, no documents found.")
+        return
+
+    vectors = embedding_model.encode(documents, show_progress_bar=True)
+    index = faiss.IndexFlatL2(vectors.shape[1])
+    index.add(vectors)
+
+    os.makedirs(VECTOR_DIR, exist_ok=True)
+    faiss.write_index(index, f"{VECTOR_DIR}/{role}_index.faiss")
+
+    with open(f"{VECTOR_DIR}/{role}_docs.pkl", "wb") as f:
+        pickle.dump(documents, f)
+
+    print(f"✅ FAISS index built and saved for role: {role}")
+
+
+if __name__ == "__main__":
+    roles = ["general", "marketing", "engineering", "finance", "hr"]
+    for role in roles:
+        docs = load_documents(role)
+        build_faiss_index(role, docs)
diff --git a/app/utils/user_db.py b/app/utils/user_db.py
new file mode 100644
index 00000000..c13473b0
--- /dev/null
+++ b/app/utils/user_db.py
@@ -0,0 +1,8 @@
+users_db = {
+    "Tony": {"password": "password123", "role": "engineering"},
+    "Bruce": {"password": "securepass", "role": "marketing"},
+    "Sam": {"password": "financepass", "role": "finance"},
+    "Peter": {"password": "pete123", "role": "engineering"},
+    "Sid": {"password": "sidpass123", "role": "marketing"},
+    "Natasha": {"password": "hrpass123", "role": "hr"},
+}
diff --git a/pyproject.toml b/pyproject.toml
index b85801f5..039f9752 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -1,9 +1,24 @@
 [project]
 name = "ds-rpc-01"
 version = "0.1.0"
-description = "Starter project for the RPC-01: Internal Chatbot with Role Based Access Control"
+description = "RAG-based chatbot with role-based access control"
 readme = "README.md"
 requires-python = ">=3.10"
+
 dependencies = [
     "fastapi[standard]>=0.115.12",
+    "uvicorn>=0.22.0",
+    "python-jose[cryptography]>=3.3.0",
+    "python-dotenv>=1.0.0",
+    "sentence-transformers>=2.2.2",
+    "faiss-cpu>=1.7.4",
+    "pandas>=2.2.2"
 ]
+
+
+[build-system]
+requires = ["setuptools>=61.0", "wheel"]
+build-backend = "setuptools.build_meta"
+
+[tool.setuptools]
+packages = ["app"]
diff --git a/vector_data/engineering_docs.pkl b/vector_data/engineering_docs.pkl
new file mode 100644
index 00000000..16d2e02b
Binary files /dev/null and b/vector_data/engineering_docs.pkl differ
diff --git a/vector_data/engineering_index.faiss b/vector_data/engineering_index.faiss
new file mode 100644
index 00000000..6f10b0c8
Binary files /dev/null and b/vector_data/engineering_index.faiss differ
diff --git a/vector_data/finance_docs.pkl b/vector_data/finance_docs.pkl
new file mode 100644
index 00000000..d0a753ae
Binary files /dev/null and b/vector_data/finance_docs.pkl differ
diff --git a/vector_data/finance_index.faiss b/vector_data/finance_index.faiss
new file mode 100644
index 00000000..a29aec6c
Binary files /dev/null and b/vector_data/finance_index.faiss differ
diff --git a/vector_data/general_docs.pkl b/vector_data/general_docs.pkl
new file mode 100644
index 00000000..e5ee73aa
Binary files /dev/null and b/vector_data/general_docs.pkl differ
diff --git a/vector_data/general_index.faiss b/vector_data/general_index.faiss
new file mode 100644
index 00000000..20609f65
Binary files /dev/null and b/vector_data/general_index.faiss differ
diff --git a/vector_data/hr_docs.pkl b/vector_data/hr_docs.pkl
new file mode 100644
index 00000000..a559a682
Binary files /dev/null and b/vector_data/hr_docs.pkl differ
diff --git a/vector_data/hr_index.faiss b/vector_data/hr_index.faiss
new file mode 100644
index 00000000..a4e6d464
Binary files /dev/null and b/vector_data/hr_index.faiss differ
diff --git a/vector_data/marketing_docs.pkl b/vector_data/marketing_docs.pkl
new file mode 100644
index 00000000..b1cf7440
Binary files /dev/null and b/vector_data/marketing_docs.pkl differ
diff --git a/vector_data/marketing_index.faiss b/vector_data/marketing_index.faiss
new file mode 100644
index 00000000..8831b15a
Binary files /dev/null and b/vector_data/marketing_index.faiss differ