Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -11,3 +11,6 @@
**/*.pyo
**/*.pyd
**/*.c
**/gpt2-*
**/*.env.*
**/llama*
1 change: 1 addition & 0 deletions .jukit/.jukit_info.json
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
{"terminal": "nvimterm"}
71 changes: 54 additions & 17 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -27,27 +27,64 @@ We are proud to represent Lotus Valley International School in this interschool

1. **Clone the repository:**
```bash
git clone https://github.com/JayanAXHF/shiv-nadar-university.git
cd shiv-nadar-university
git clone https://github.com/JayanAXHF/shiv-nadar.git
cd shiv-nadar
```
2. **Install requirements:**
- Make sure you have Python 3.8+ installed.
- Install dependencies:
```bash
pip install -r requirements.txt
```
3. **Run the chatbot:**
```bash
python chatbot.py
```
*(Update the above instructions based on your actual code structure and entry point.)*
2. **Install dependencies:**
The project uses the `uv` package manager for the backend. Run
```bash
uv sync
```
in the backend directory to install the dependencies.

You can install the frontend dependencies by running
```bash
pnpm i
```
in the frontend directory.
3. **Run the backend:**
The backend must be launched from a specific working directory so that the model paths resolve. Starting from the root directory, run
```bash
cd backend/llm
fastapi dev ../main.py
```
Ensure that the `llm` folder contains the trained AI model. Please update your `llm/main.py` file with the path to your model.

4. **Run the frontend:**
Start the frontend by running
```bash
pnpm dev
```
in the frontend directory. You can spin up the database console by running
```bash
pnpm run db:studio
```

## 📂 Structure

- `/src` - Source code for the chatbot
- `/data` - Event data (schedule, FAQs, etc.)
- `/docs` - Documentation
- `/notebooks` - Experimentation and demos
```
.
├── backend/ # Python backend for LLM logic and datasets
│ ├── llm/ # Core LLM scripts, notebooks, data files
│ ├── logs/ # TensorBoard logs
│ ├── results/ # Model checkpoints
│ ├── testing/ # Test scripts and data
│ ├── main.py # Backend entry point
│ └── pyproject.toml # Backend dependencies
├── front_end/ # Next.js frontend app
│ ├── src/ # App pages, components, styles
│ ├── public/ # Static assets (favicon, etc.)
│ ├── package.json # Frontend dependencies
│ └── drizzle.config.ts # DB config (Drizzle ORM)
├── summary/ # LaTeX report with flowcharts & PDF output
│ └── src/ # Main .tex, custom class files, .bib
├── README.md # Project overview
└── indent.log # Log file (optional/debug)
```


## 🤝 Contributing

Expand Down
Binary file added Timeline 1.mov
Binary file not shown.
1 change: 1 addition & 0 deletions backend/.jukit/.jukit_info.json
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
{"terminal": "nvimterm"}
1 change: 1 addition & 0 deletions backend/llm/.jukit/.jukit_info.json
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
{"terminal": "nvimterm"}
1 change: 1 addition & 0 deletions backend/llm/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
from .main import eval_ncert, eval_circular
76 changes: 76 additions & 0 deletions backend/llm/assignments.txt

Large diffs are not rendered by default.

76 changes: 76 additions & 0 deletions backend/llm/datasetinfo.csv

Large diffs are not rendered by default.

238 changes: 238 additions & 0 deletions backend/llm/help2.ipynb
Original file line number Diff line number Diff line change
@@ -0,0 +1,238 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": null,
"id": "32b385c6",
"metadata": {},
"outputs": [],
"source": [
"%pip install torch tiktoken sentencepiece blobfile datasets"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "6b3757b2",
"metadata": {},
"outputs": [],
"source": [
"from huggingface_hub import login\n",
"login(token=\"enter token here\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "45ef6293",
"metadata": {},
"outputs": [],
"source": [
"# Load model directly\n",
"from transformers import AutoTokenizer, AutoModelForCausalLM\n",
"\n",
"tokenizer = AutoTokenizer.from_pretrained(\"openai-community/gpt2-medium\")\n",
"model = AutoModelForCausalLM.from_pretrained(\"openai-community/gpt2-medium\")\n",
"\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "7eb61cc1",
"metadata": {},
"outputs": [],
"source": [
"from transformers import AutoTokenizer, AutoModelForCausalLM, Trainer, TrainingArguments\n",
"from datasets import load_dataset\n",
"import torch\n",
"\n",
"# Load the dataset\n",
"dataset = load_dataset(\"ParthKadam2003/NCERT_Dataset\")\n",
"\n",
"if \"validation\" not in dataset:\n",
" dataset = dataset[\"train\"].train_test_split(test_size=0.1)\n",
" print(\"Dataset split into 90% train and 10% validation.\")\n",
"# Load GPT-2 Medium model and tokenizer\n",
"model_name = \"meta-llama/Llama-3.2-3B-Instruct\"\n",
"tokenizer = AutoTokenizer.from_pretrained(model_name)\n",
"model = AutoModelForCausalLM.from_pretrained(\n",
" model_name,\n",
" device_map=\"auto\", # Use GPU if available\n",
" torch_dtype=torch.float32\n",
")\n",
"tokenizer.pad_token = tokenizer.eos_token\n",
"model.config.pad_token_id = tokenizer.eos_token_id\n",
"# Tokenize the dataset (using Explanation column)\n",
"def tokenize_function(examples):\n",
" tokens = tokenizer(examples[\"Explanation\"], \n",
" truncation=True, \n",
" padding=\"max_length\", \n",
" max_length=128)\n",
" tokens[\"labels\"] = tokens[\"input_ids\"].copy() # Set labels as a copy of input_ids\n",
" return tokens\n",
"\n",
"tokenized_datasets = dataset.map(tokenize_function, batched=True, remove_columns=[\"Explanation\"])\n",
"# Set training arguments\n",
"training_args = TrainingArguments(\n",
" output_dir=\"llama-3_2Instruct-science-finetuned\",\n",
" # evaluation_strategy=\"epoch\",\n",
" per_device_train_batch_size=4, # Adjust for your system\n",
" per_device_eval_batch_size=4,\n",
" learning_rate=5e-5,\n",
" weight_decay=0.01,\n",
" num_train_epochs=3,\n",
" fp16=torch.cuda.is_available(), # Mixed precision training if GPU available\n",
" save_strategy=\"epoch\",\n",
" logging_dir=\"./logs\",\n",
" push_to_hub=False,\n",
")\n",
"\n",
"# Trainer setup\n",
"trainer = Trainer(\n",
" model=model,\n",
" args=training_args,\n",
" train_dataset=tokenized_datasets[\"train\"],\n",
" eval_dataset=tokenized_datasets[\"test\"], # The split test is used as validation\n",
")\n",
"\n",
"# Start training\n",
"trainer.train()\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "410d07d6",
"metadata": {},
"outputs": [],
"source": [
"from transformers import AutoTokenizer, AutoModelForCausalLM\n",
"import torch\n",
"\n",
"# Load the tokenizer (ensure it is saved in the main directory)\n",
"base_model_name = \"gpt2\"\n",
"tokenizer = AutoTokenizer.from_pretrained(base_model_name)\n",
"tokenizer.save_pretrained(\"./llama-3_2Instruct-science-finetuned\") # Save tokenizer to the main directory\n",
"\n",
"# Load the fine-tuned model from the latest checkpoint\n",
"model = AutoModelForCausalLM.from_pretrained(\"./llama-3_2Instruct-science-finetuned/checkpoint-1725\")\n",
"\n",
"# Set the model to evaluation mode\n",
"model.eval()\n",
"model.to(\"cuda\" if torch.cuda.is_available() else \"cpu\")\n",
"\n",
"print(\"GPT-2 Fine-Tuned Model Ready for Testing!\\n\")\n",
"\n",
"# Infinite test loop\n",
"while True:\n",
" user_question = input(\"Ask your science question (or type 'exit' to quit): \")\n",
" if user_question.lower() == \"exit\":\n",
" print(\"Goodbye!\")\n",
" break\n",
"\n",
" # Prepare the input\n",
" input_text = f\"Q: {user_question}\\nA:\"\n",
" input_ids = tokenizer(input_text, return_tensors=\"pt\").input_ids.to(model.device)\n",
"\n",
" # Generate the answer\n",
" with torch.no_grad():\n",
" output_ids = model.generate(\n",
" input_ids, \n",
" max_length=128, \n",
" num_return_sequences=1, \n",
" temperature=0.7, # Controls randomness (0.7 is balanced)\n",
" top_p=0.9, # Nucleus sampling\n",
" do_sample=True, # Enable sampling for creativity\n",
" )\n",
" \n",
" # Decode and display the answer\n",
" answer = tokenizer.decode(output_ids[0], skip_special_tokens=True)\n",
" print(\"\\nAnswer:\", answer.split(\"A:\")[-1].strip(), \"\\n\")\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "f1ac4f8b",
"metadata": {},
"outputs": [
{
"ename": "",
"evalue": "",
"output_type": "error",
"traceback": [
"\u001b[1;31mRunning cells with '.venv (Python 3.13.3)' requires the ipykernel package.\n",
"\u001b[1;31mInstall 'ipykernel' into the Python environment. \n",
"\u001b[1;31mCommand: '/Users/jayansunil/Dev/shiv_nadar/backend/.venv/bin/python -m pip install ipykernel -U --force-reinstall'"
]
}
],
"source": [
"from transformers import AutoTokenizer, AutoModelForCausalLM\n",
"import torch\n",
"\n",
"# Load the tokenizer (ensure it is saved in the main directory)\n",
"base_model_name = \"Qwen/Qwen3-0.6B\"\n",
"tokenizer = AutoTokenizer.from_pretrained(base_model_name)\n",
"tokenizer.save_pretrained(\"./llama-3_2Instruct-science-finetuned\") # Save tokenizer to the main directory\n",
"\n",
"# Load the fine-tuned model from the latest checkpoint\n",
"model = AutoModelForCausalLM.from_pretrained(\"./llama-3_2Instruct-science-finetuned/checkpoint-4992\")\n",
"\n",
"# Set the model to evaluation mode\n",
"model.eval()\n",
"model.to(\"cuda\" if torch.cuda.is_available() else \"cpu\")\n",
"\n",
"print(\"GPT-2 Fine-Tuned Model Ready for Testing!\\n\")\n",
"\n",
"# Infinite test loop\n",
"while True:\n",
" user_question = input(\"Ask your science question (or type 'exit' to quit): \")\n",
" if user_question.lower() == \"exit\":\n",
" print(\"Goodbye!\")\n",
" break\n",
"\n",
" # Prepare the input\n",
" input_text = f\"Q: {user_question}\\nA:\"\n",
" input_ids = tokenizer(input_text, return_tensors=\"pt\").input_ids.to(model.device)\n",
"\n",
" # Generate the answer\n",
" with torch.no_grad():\n",
" output_ids = model.generate(\n",
" input_ids, \n",
" max_length=128, \n",
" num_return_sequences=1, \n",
" temperature=0.7, # Controls randomness (0.7 is balanced)\n",
" top_p=0.9, # Nucleus sampling\n",
" do_sample=True, # Enable sampling for creativity\n",
" )\n",
" \n",
" # Decode and display the answer\n",
" answer = tokenizer.decode(output_ids[0], skip_special_tokens=True)\n",
" print(\"\\nAnswer:\", answer.split(\"A:\")[-1].strip(), \"\\n\")"
]
}
],
"metadata": {
"kernelspec": {
"display_name": ".venv",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.13.3"
}
},
"nbformat": 4,
"nbformat_minor": 5
}
55 changes: 55 additions & 0 deletions backend/llm/llmtorch.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,55 @@
from transformers import AutoTokenizer, AutoModelForCausalLM, Trainer, TrainingArguments
from datasets import load_dataset
import torch

# Load the NCERT 9th-grade science Q&A dataset from the Hugging Face Hub.
dataset = load_dataset("KadamParth/NCERT_Science_9th")

# The hub dataset ships only a "train" split; carve out 10% as a held-out
# validation set so the Trainer below has an eval_dataset.
if "validation" not in dataset:
    dataset = dataset["train"].train_test_split(test_size=0.1)
    print("Dataset split into 90% train and 10% validation.")
# Load the Llama-3.2-1B base model and its tokenizer.
model_name = "meta-llama/Llama-3.2-1B"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    device_map="auto",  # Use GPU if available
    # Half precision on GPU to save memory; full precision on CPU.
    torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32
)
# Llama has no dedicated pad token, so reuse EOS for padding; the model config
# must agree so loss masking and generation behave consistently.
tokenizer.pad_token = tokenizer.eos_token
model.config.pad_token_id = tokenizer.eos_token_id
# Tokenize the dataset (using Explanation column)
def tokenize_function(examples):
    """Tokenize the "Explanation" column into fixed-length model inputs.

    Truncates/pads every example to 128 tokens and mirrors ``input_ids``
    into ``labels``, which is the standard setup for causal-LM fine-tuning
    (the model learns to predict its own input shifted by one).
    """
    encoded = tokenizer(
        examples["Explanation"],
        truncation=True,
        padding="max_length",
        max_length=128,
    )
    # Causal-LM objective: labels are an independent copy of input_ids.
    encoded["labels"] = list(encoded["input_ids"])
    return encoded

# Tokenize in batches and drop the raw text column so only tensor-ready
# fields (input_ids, attention_mask, labels) remain.
tokenized_datasets = dataset.map(tokenize_function, batched=True, remove_columns=["Explanation"])
# Set training arguments.
# NOTE(review): output_dir ".llama-finetuned" creates a *hidden* directory in
# the current working directory — this looks like a typo for
# "./llama-finetuned"; confirm before depending on the checkpoint path.
training_args = TrainingArguments(
    output_dir=".llama-finetuned",
    # evaluation_strategy="epoch",
    per_device_train_batch_size=4,  # Adjust for your system
    per_device_eval_batch_size=4,
    learning_rate=5e-5,
    weight_decay=0.01,
    num_train_epochs=3,
    fp16=torch.cuda.is_available(),  # Mixed precision training if GPU available
    save_strategy="epoch",  # Checkpoint once per epoch.
    logging_dir="./logs",  # TensorBoard logs.
    push_to_hub=False,  # Keep the fine-tuned model local.
)

# Trainer setup — the "test" split produced by train_test_split above serves
# as the validation set.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["test"],  # The split test is used as validation
)

# Start training
trainer.train()
Loading