Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -11,3 +11,6 @@
**/*.pyo
**/*.pyd
**/*.c
**/gpt2-*
**/*.env.*
**/llama*
1 change: 1 addition & 0 deletions .jukit/.jukit_info.json
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
{"terminal": "nvimterm"}
71 changes: 54 additions & 17 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -27,27 +27,64 @@ We are proud to represent Lotus Valley International School in this interschool

1. **Clone the repository:**
```bash
git clone https://github.com/JayanAXHF/shiv-nadar-university.git
cd shiv-nadar-university
git clone https://github.com/JayanAXHF/shiv-nadar.git
cd shiv-nadar
```
2. **Install requirements:**
- Make sure you have Python 3.8+ installed.
- Install dependencies:
```bash
pip install -r requirements.txt
```
3. **Run the chatbot:**
```bash
python chatbot.py
```
*(Update the above instructions based on your actual code structure and entry point.)*
2. **Install dependencies:**
The project uses the `uv` package manager for the backend. Run
```bash
uv sync
```
in the backend directory to install the dependencies.

You can install the frontend dependencies by running
```bash
pnpm i
```
in the frontend directory.
3. **Run the backend:**
The backend must be launched from a specific working directory so that the model paths resolve. Starting from the root directory, run
```bash
cd backend/llm
fastapi dev ../main.py
```
Ensure that the `llm` folder contains the trained AI model. Please update your `llm/main.py` file with the path to your model.

4. **Run the frontend:**
Start the frontend by running
```bash
pnpm dev
```
in the frontend directory. You can spin up the database console by running
```bash
pnpm run db:studio
```

## 📂 Structure

- `/src` - Source code for the chatbot
- `/data` - Event data (schedule, FAQs, etc.)
- `/docs` - Documentation
- `/notebooks` - Experimentation and demos
```
.
├── backend/ # Python backend for LLM logic and datasets
│ ├── llm/ # Core LLM scripts, notebooks, data files
│ ├── logs/ # TensorBoard logs
│ ├── results/ # Model checkpoints
│ ├── testing/ # Test scripts and data
│ ├── main.py # Backend entry point
│ └── pyproject.toml # Backend dependencies
├── front_end/ # Next.js frontend app
│ ├── src/ # App pages, components, styles
│ ├── public/ # Static assets (favicon, etc.)
│ ├── package.json # Frontend dependencies
│ └── drizzle.config.ts # DB config (Drizzle ORM)
├── summary/ # LaTeX report with flowcharts & PDF output
│ └── src/ # Main .tex, custom class files, .bib
├── README.md # Project overview
└── indent.log # Log file (optional/debug)
```


## 🤝 Contributing

Expand Down
Binary file added Timeline 1.mov
Binary file not shown.
1 change: 1 addition & 0 deletions backend/.jukit/.jukit_info.json
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
{"terminal": "nvimterm"}
1 change: 1 addition & 0 deletions backend/llm/.jukit/.jukit_info.json
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
{"terminal": "nvimterm"}
1 change: 1 addition & 0 deletions backend/llm/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
from .main import eval_ncert, eval_circular
76 changes: 76 additions & 0 deletions backend/llm/assignments.txt

Large diffs are not rendered by default.

76 changes: 76 additions & 0 deletions backend/llm/datasetinfo.csv

Large diffs are not rendered by default.

238 changes: 238 additions & 0 deletions backend/llm/help2.ipynb
Original file line number Diff line number Diff line change
@@ -0,0 +1,238 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": null,
"id": "32b385c6",
"metadata": {},
"outputs": [],
"source": [
"%pip install torch tiktoken sentencepiece blobfile datasets"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "6b3757b2",
"metadata": {},
"outputs": [],
"source": [
"from huggingface_hub import login\n",
"login(token=\"enter token here\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "45ef6293",
"metadata": {},
"outputs": [],
"source": [
"# Load model directly\n",
"from transformers import AutoTokenizer, AutoModelForCausalLM\n",
"\n",
"tokenizer = AutoTokenizer.from_pretrained(\"openai-community/gpt2-medium\")\n",
"model = AutoModelForCausalLM.from_pretrained(\"openai-community/gpt2-medium\")\n",
"\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "7eb61cc1",
"metadata": {},
"outputs": [],
"source": [
"from transformers import AutoTokenizer, AutoModelForCausalLM, Trainer, TrainingArguments\n",
"from datasets import load_dataset\n",
"import torch\n",
"\n",
"# Load the dataset\n",
"dataset = load_dataset(\"ParthKadam2003/NCERT_Dataset\")\n",
"\n",
"if \"validation\" not in dataset:\n",
" dataset = dataset[\"train\"].train_test_split(test_size=0.1)\n",
" print(\"Dataset split into 90% train and 10% validation.\")\n",
"# Load GPT-2 Medium model and tokenizer\n",
"model_name = \"meta-llama/Llama-3.2-3B-Instruct\"\n",
"tokenizer = AutoTokenizer.from_pretrained(model_name)\n",
"model = AutoModelForCausalLM.from_pretrained(\n",
" model_name,\n",
" device_map=\"auto\", # Use GPU if available\n",
" torch_dtype=torch.float32\n",
")\n",
"tokenizer.pad_token = tokenizer.eos_token\n",
"model.config.pad_token_id = tokenizer.eos_token_id\n",
"# Tokenize the dataset (using Explanation column)\n",
"def tokenize_function(examples):\n",
" tokens = tokenizer(examples[\"Explanation\"], \n",
" truncation=True, \n",
" padding=\"max_length\", \n",
" max_length=128)\n",
" tokens[\"labels\"] = tokens[\"input_ids\"].copy() # Set labels as a copy of input_ids\n",
" return tokens\n",
"\n",
"tokenized_datasets = dataset.map(tokenize_function, batched=True, remove_columns=[\"Explanation\"])\n",
"# Set training arguments\n",
"training_args = TrainingArguments(\n",
" output_dir=\"llama-3_2Instruct-science-finetuned\",\n",
" # evaluation_strategy=\"epoch\",\n",
" per_device_train_batch_size=4, # Adjust for your system\n",
" per_device_eval_batch_size=4,\n",
" learning_rate=5e-5,\n",
" weight_decay=0.01,\n",
" num_train_epochs=3,\n",
" fp16=torch.cuda.is_available(), # Mixed precision training if GPU available\n",
" save_strategy=\"epoch\",\n",
" logging_dir=\"./logs\",\n",
" push_to_hub=False,\n",
")\n",
"\n",
"# Trainer setup\n",
"trainer = Trainer(\n",
" model=model,\n",
" args=training_args,\n",
" train_dataset=tokenized_datasets[\"train\"],\n",
" eval_dataset=tokenized_datasets[\"test\"], # The split test is used as validation\n",
")\n",
"\n",
"# Start training\n",
"trainer.train()\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "410d07d6",
"metadata": {},
"outputs": [],
"source": [
"from transformers import AutoTokenizer, AutoModelForCausalLM\n",
"import torch\n",
"\n",
"# Load the tokenizer (ensure it is saved in the main directory)\n",
"base_model_name = \"gpt2\"\n",
"tokenizer = AutoTokenizer.from_pretrained(base_model_name)\n",
"tokenizer.save_pretrained(\"./llama-3_2Instruct-science-finetuned\") # Save tokenizer to the main directory\n",
"\n",
"# Load the fine-tuned model from the latest checkpoint\n",
"model = AutoModelForCausalLM.from_pretrained(\"./llama-3_2Instruct-science-finetuned/checkpoint-1725\")\n",
"\n",
"# Set the model to evaluation mode\n",
"model.eval()\n",
"model.to(\"cuda\" if torch.cuda.is_available() else \"cpu\")\n",
"\n",
"print(\"GPT-2 Fine-Tuned Model Ready for Testing!\\n\")\n",
"\n",
"# Infinite test loop\n",
"while True:\n",
" user_question = input(\"Ask your science question (or type 'exit' to quit): \")\n",
" if user_question.lower() == \"exit\":\n",
" print(\"Goodbye!\")\n",
" break\n",
"\n",
" # Prepare the input\n",
" input_text = f\"Q: {user_question}\\nA:\"\n",
" input_ids = tokenizer(input_text, return_tensors=\"pt\").input_ids.to(model.device)\n",
"\n",
" # Generate the answer\n",
" with torch.no_grad():\n",
" output_ids = model.generate(\n",
" input_ids, \n",
" max_length=128, \n",
" num_return_sequences=1, \n",
" temperature=0.7, # Controls randomness (0.7 is balanced)\n",
" top_p=0.9, # Nucleus sampling\n",
" do_sample=True, # Enable sampling for creativity\n",
" )\n",
" \n",
" # Decode and display the answer\n",
" answer = tokenizer.decode(output_ids[0], skip_special_tokens=True)\n",
" print(\"\\nAnswer:\", answer.split(\"A:\")[-1].strip(), \"\\n\")\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "f1ac4f8b",
"metadata": {},
"outputs": [
{
"ename": "",
"evalue": "",
"output_type": "error",
"traceback": [
"\u001b[1;31mRunning cells with '.venv (Python 3.13.3)' requires the ipykernel package.\n",
"\u001b[1;31mInstall 'ipykernel' into the Python environment. \n",
"\u001b[1;31mCommand: '/Users/jayansunil/Dev/shiv_nadar/backend/.venv/bin/python -m pip install ipykernel -U --force-reinstall'"
]
}
],
"source": [
"from transformers import AutoTokenizer, AutoModelForCausalLM\n",
"import torch\n",
"\n",
"# Load the tokenizer (ensure it is saved in the main directory)\n",
"base_model_name = \"Qwen/Qwen3-0.6B\"\n",
"tokenizer = AutoTokenizer.from_pretrained(base_model_name)\n",
"tokenizer.save_pretrained(\"./llama-3_2Instruct-science-finetuned\") # Save tokenizer to the main directory\n",
"\n",
"# Load the fine-tuned model from the latest checkpoint\n",
"model = AutoModelForCausalLM.from_pretrained(\"./llama-3_2Instruct-science-finetuned/checkpoint-4992\")\n",
"\n",
"# Set the model to evaluation mode\n",
"model.eval()\n",
"model.to(\"cuda\" if torch.cuda.is_available() else \"cpu\")\n",
"\n",
"print(\"GPT-2 Fine-Tuned Model Ready for Testing!\\n\")\n",
"\n",
"# Infinite test loop\n",
"while True:\n",
" user_question = input(\"Ask your science question (or type 'exit' to quit): \")\n",
" if user_question.lower() == \"exit\":\n",
" print(\"Goodbye!\")\n",
" break\n",
"\n",
" # Prepare the input\n",
" input_text = f\"Q: {user_question}\\nA:\"\n",
" input_ids = tokenizer(input_text, return_tensors=\"pt\").input_ids.to(model.device)\n",
"\n",
" # Generate the answer\n",
" with torch.no_grad():\n",
" output_ids = model.generate(\n",
" input_ids, \n",
" max_length=128, \n",
" num_return_sequences=1, \n",
" temperature=0.7, # Controls randomness (0.7 is balanced)\n",
" top_p=0.9, # Nucleus sampling\n",
" do_sample=True, # Enable sampling for creativity\n",
" )\n",
" \n",
" # Decode and display the answer\n",
" answer = tokenizer.decode(output_ids[0], skip_special_tokens=True)\n",
" print(\"\\nAnswer:\", answer.split(\"A:\")[-1].strip(), \"\\n\")"
]
}
],
"metadata": {
"kernelspec": {
"display_name": ".venv",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.13.3"
}
},
"nbformat": 4,
"nbformat_minor": 5
}
55 changes: 55 additions & 0 deletions backend/llm/llmtorch.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,55 @@
from transformers import AutoTokenizer, AutoModelForCausalLM, Trainer, TrainingArguments
from datasets import load_dataset
import torch

# Load the NCERT 9th-grade science Q&A dataset from the Hugging Face Hub.
dataset = load_dataset("KadamParth/NCERT_Science_9th")

# The hub dataset ships only a "train" split; carve out 10% as a held-out
# validation set so the Trainer below has an eval_dataset.
if "validation" not in dataset:
    dataset = dataset["train"].train_test_split(test_size=0.1)
    print("Dataset split into 90% train and 10% validation.")
# Load the Llama-3.2-1B base model and its tokenizer.
model_name = "meta-llama/Llama-3.2-1B"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    device_map="auto",  # Use GPU if available
    # Half precision on GPU to save memory; full precision on CPU.
    torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32
)
# Llama has no dedicated pad token, so reuse EOS for padding; the model config
# must agree so loss masking and generation behave consistently.
tokenizer.pad_token = tokenizer.eos_token
model.config.pad_token_id = tokenizer.eos_token_id
# Tokenize the dataset (using Explanation column)
def tokenize_function(examples):
    """Tokenize the "Explanation" column into fixed-length model inputs.

    Truncates/pads every example to 128 tokens and mirrors ``input_ids``
    into ``labels``, which is the standard setup for causal-LM fine-tuning
    (the model learns to predict its own input shifted by one).
    """
    encoded = tokenizer(
        examples["Explanation"],
        truncation=True,
        padding="max_length",
        max_length=128,
    )
    # Causal-LM objective: labels are an independent copy of input_ids.
    encoded["labels"] = list(encoded["input_ids"])
    return encoded

# Tokenize in batches and drop the raw text column so only tensor-ready
# fields (input_ids, attention_mask, labels) remain.
tokenized_datasets = dataset.map(tokenize_function, batched=True, remove_columns=["Explanation"])
# Set training arguments.
# NOTE(review): output_dir ".llama-finetuned" creates a *hidden* directory in
# the current working directory — this looks like a typo for
# "./llama-finetuned"; confirm before depending on the checkpoint path.
training_args = TrainingArguments(
    output_dir=".llama-finetuned",
    # evaluation_strategy="epoch",
    per_device_train_batch_size=4,  # Adjust for your system
    per_device_eval_batch_size=4,
    learning_rate=5e-5,
    weight_decay=0.01,
    num_train_epochs=3,
    fp16=torch.cuda.is_available(),  # Mixed precision training if GPU available
    save_strategy="epoch",  # Checkpoint once per epoch.
    logging_dir="./logs",  # TensorBoard logs.
    push_to_hub=False,  # Keep the fine-tuned model local.
)

# Trainer setup — the "test" split produced by train_test_split above serves
# as the validation set.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["test"],  # The split test is used as validation
)

# Start training
trainer.train()
Loading