tianyi-gu · tianyi-gu · May 27, 2025 · May 21, 2025 · May 27, 2025 · May 27, 2025
diff --git a/.gitignore b/.gitignore
@@ -2,4 +2,45 @@
 /raw_corpus/*.pdf 
 *.pdf
 chunked_corpus/*
-.DS_Store
+.DS_Store
+
+# Django 
+*.log
+*.pot
+*.pyc
+__pycache__/
+local_settings.py
+db.sqlite3
+db.sqlite3-journal
+media
+
+# Python
+*.py[cod]
+*$py.class
+*.so
+.Python
+env/
+build/
+develop-eggs/
+dist/
+downloads/
+eggs/
+.eggs/
+lib/
+lib64/
+parts/
+sdist/
+var/
+*.egg-info/
+.installed.cfg
+*.egg
+
+.env
+.env.local
+
+# Text Corpus (*.txt is extracted corpus text; requirements.txt is re-included below)
+text_corpus/*
+data/text_corpus/*
+data/chunked_corpus/*
+*.txt
+!requirements.txt
diff --git a/Proposal.pdf b/Proposal.pdf
diff --git a/README.md b/README.md
@@ -1,5 +1,58 @@
-# This is ArchiveBot
+# ArchiveBot
 
-This project aims to develop an AI-powered retrieval-augmented generation (RAG) system that  enables intuitive, conversational access to archived materials from the Oliver Wendell Holmes  library. This would be available to students, librarians, faculty, and any others who are curious about  engaging with the school's history in an accessible manner. This project will enable users to leverage  natural language search, in order for users to be able to retrieve summaries and context-rich  information from historical documents.
+ArchiveBot is a Django-based RAG (Retrieval-Augmented Generation) system that enables intuitive, conversational access to archival material. Researchers working with historical documents currently have to sift manually through thousands of pages. Even with digitization and search functionality, this process is both time-consuming and error-prone.
+
+This project aims to alleviate this issue by letting users query the archive in natural language and receive a summary of the most relevant information it contains.
+
+## Setup Instructions
+
+1. Install the required packages:
+   ```
+   pip install -r requirements.txt
+   ```
+
+2. Set up the Django app:
+   ```
+   python manage.py setup_app
+   ```
+
+3. Run the development server:
+   ```
+   python manage.py runserver
+   ```
+
+4. Access the application at http://127.0.0.1:8000/
+
+## Features
+
+- Web scraping of archive materials
+- OCR processing of PDF documents
+- Text chunking (semantic or fixed-size)
+- Embedding generation
+- Interactive chat interface with RAG capabilities
+
+## Project Structure
+
+- `rag_app/`: The main Django application
+  - `models.py`: Database models for pipeline state and chat history
+  - `views.py`: API endpoints and view functions
+  - `pipeline.py`: Core pipeline functionality
+  - `urls.py`: URL routing
+  - `templates/`: HTML templates
+
+## Usage
+
+1. Start by scraping archive materials for specific years
+2. Process the downloaded PDFs with OCR
+3. Chunk the extracted text
+4. Generate embeddings for the chunks
+5. Load a language model
+6. Chat with the system to query the archived materials
+
+## To-dos and Future Steps
+- Filtering based on article type (excluding Eighth Page articles, etc.)
+- Linking to view original article PDF
+- UI testing to determine which parameters users should be able to set
+- Additional weighting based on recency of source material
 
 Developed in the Computer Science 600 Research and Development Class at Phillips Academy. 
diff --git a/archivebot_project/__init__.py b/archivebot_project/__init__.py
diff --git a/archivebot_project/asgi.py b/archivebot_project/asgi.py
@@ -0,0 +1,16 @@
+"""
+ASGI config for archivebot_project project.
+
+It exposes the ASGI callable as a module-level variable named ``application``.
+
+For more information on this file, see
+https://docs.djangoproject.com/en/4.2/howto/deployment/asgi/
+"""
+
+import os
+
+from django.core.asgi import get_asgi_application
+
+# setdefault (not assignment) so a DJANGO_SETTINGS_MODULE already present in
+# the environment takes precedence over this project default.
+os.environ.setdefault('DJANGO_SETTINGS_MODULE', 'archivebot_project.settings')
+
+# Must run after the settings module has been selected above.
+application = get_asgi_application() 
diff --git a/archivebot_project/settings.py b/archivebot_project/settings.py
@@ -0,0 +1,117 @@
+import os
+from pathlib import Path
+import sys
+
+# Build paths inside the project like this: BASE_DIR / 'subdir'.
+# This file sits one package below the repository root, hence two .parent hops.
+BASE_DIR = Path(__file__).resolve().parent.parent
+
+# SECURITY WARNING: don't run with debug turned on in production!
+# Debug stays on by default for local development; export DJANGO_DEBUG=0
+# (or false/no/off) to switch it off without editing this file.
+DEBUG = os.environ.get('DJANGO_DEBUG', '1').strip().lower() in ('1', 'true', 'yes', 'on')
+
+# SECURITY WARNING: keep the secret key used in production secret!
+# The fallback is a public placeholder, so it is only tolerated while DEBUG
+# is on; a non-debug run must supply DJANGO_SECRET_KEY.
+_PLACEHOLDER_SECRET_KEY = 'django-insecure-your-secret-key-here'
+SECRET_KEY = os.environ.get('DJANGO_SECRET_KEY', _PLACEHOLDER_SECRET_KEY)
+if not DEBUG and SECRET_KEY == _PLACEHOLDER_SECRET_KEY:
+    raise RuntimeError(
+        'DJANGO_SECRET_KEY must be set when DJANGO_DEBUG is off.'
+    )
+
+# Comma-separated host names, e.g. DJANGO_ALLOWED_HOSTS="example.org,www.example.org".
+# Empty by default; Django then only serves localhost-style hosts, and only
+# while DEBUG is on.
+ALLOWED_HOSTS = [
+    host.strip()
+    for host in os.environ.get('DJANGO_ALLOWED_HOSTS', '').split(',')
+    if host.strip()
+]
+
+# Application definition
+# Django's contrib apps plus this project's own app, registered through its
+# AppConfig class.
+INSTALLED_APPS = [
+    'django.contrib.admin',
+    'django.contrib.auth',
+    'django.contrib.contenttypes',
+    'django.contrib.sessions',
+    'django.contrib.messages',
+    'django.contrib.staticfiles',
+    'rag_app.apps.RagAppConfig',
+]
+
+# Middleware runs in the order listed. Only Django built-ins are installed;
+# there is no project-specific middleware.
+MIDDLEWARE = [
+    'django.middleware.security.SecurityMiddleware',
+    'django.contrib.sessions.middleware.SessionMiddleware',
+    'django.middleware.common.CommonMiddleware',
+    'django.middleware.csrf.CsrfViewMiddleware',
+    'django.contrib.auth.middleware.AuthenticationMiddleware',
+    'django.contrib.messages.middleware.MessageMiddleware',
+    'django.middleware.clickjacking.XFrameOptionsMiddleware',
+]
+
+# Root URLconf module (archivebot_project/urls.py).
+ROOT_URLCONF = 'archivebot_project.urls'
+
+# Single Django template engine. DIRS is empty, so no project-level template
+# folder is searched; with APP_DIRS on, templates are looked up inside the
+# installed apps.
+TEMPLATES = [
+    {
+        'BACKEND': 'django.template.backends.django.DjangoTemplates',
+        'DIRS': [],
+        'APP_DIRS': True,
+        'OPTIONS': {
+            'context_processors': [
+                'django.template.context_processors.debug',
+                'django.template.context_processors.request',
+                'django.contrib.auth.context_processors.auth',
+                'django.contrib.messages.context_processors.messages',
+            ],
+        },
+    },
+]
+
+# Dotted path to the WSGI callable defined in archivebot_project/wsgi.py.
+WSGI_APPLICATION = 'archivebot_project.wsgi.application'
+
+# Database
+# One SQLite database file stored at the repository root.
+DATABASES = {
+    'default': {
+        'ENGINE': 'django.db.backends.sqlite3',
+        'NAME': BASE_DIR / 'db.sqlite3',
+    }
+}
+
+# Password validation
+# Four of Django's built-in validators, each with its default options
+# (no 'OPTIONS' key is given).
+AUTH_PASSWORD_VALIDATORS = [
+    {
+        'NAME': 'django.contrib.auth.password_validation.UserAttributeSimilarityValidator',
+    },
+    {
+        'NAME': 'django.contrib.auth.password_validation.MinimumLengthValidator',
+    },
+    {
+        'NAME': 'django.contrib.auth.password_validation.CommonPasswordValidator',
+    },
+    {
+        'NAME': 'django.contrib.auth.password_validation.NumericPasswordValidator',
+    },
+]
+
+# Internationalization
+# US English, UTC as the default time zone, timezone-aware datetimes (USE_TZ).
+LANGUAGE_CODE = 'en-us'
+TIME_ZONE = 'UTC'
+USE_I18N = True
+USE_TZ = True
+
+# Corpus locations, all rooted under <BASE_DIR>/data
+DATA_DIR = BASE_DIR / "data"
+RAW_CORPUS_DIR = DATA_DIR / "raw_corpus"
+TEXT_CORPUS_DIR = DATA_DIR / "text_corpus"
+CHUNKED_CORPUS_DIR = DATA_DIR / "chunked_corpus"
+
+# Static files (CSS, JavaScript, Images)
+STATIC_URL = '/static/'
+STATIC_APP_DIR = BASE_DIR / "rag_app" / "static"
+STATICFILES_DIRS = [STATIC_APP_DIR]
+STATIC_ROOT = BASE_DIR / "staticfiles"
+
+# Media files
+MEDIA_URL = '/media/'
+MEDIA_ROOT = BASE_DIR / "media"
+
+# Default primary key field type
+DEFAULT_AUTO_FIELD = 'django.db.models.BigAutoField'
+
+# Make sure the static asset folders and the data directories exist as soon
+# as the settings module is imported (static folders first, then data).
+for directory in [
+    STATIC_APP_DIR,
+    STATIC_APP_DIR / "rag_app" / "css",
+    STATIC_APP_DIR / "rag_app" / "js",
+    STATIC_APP_DIR / "rag_app" / "images",
+    DATA_DIR,
+    RAW_CORPUS_DIR,
+    TEXT_CORPUS_DIR,
+    CHUNKED_CORPUS_DIR,
+]:
+    os.makedirs(directory, exist_ok=True)
diff --git a/archivebot_project/urls.py b/archivebot_project/urls.py
@@ -0,0 +1,7 @@
+"""Root URL configuration for archivebot_project."""
+from django.contrib import admin
+from django.urls import path, include
+
+urlpatterns = [
+    # Django admin site.
+    path('admin/', admin.site.urls),
+    # Everything else is delegated to the rag_app URLconf.
+    path('', include('rag_app.urls')),
+] 
diff --git a/archivebot_project/wsgi.py b/archivebot_project/wsgi.py
@@ -0,0 +1,7 @@
+"""
+WSGI config for archivebot_project project.
+
+It exposes the WSGI callable as a module-level variable named ``application``.
+
+For more information on this file, see
+https://docs.djangoproject.com/en/4.2/howto/deployment/wsgi/
+"""
+
+import os
+
+from django.core.wsgi import get_wsgi_application
+
+# setdefault (not assignment) so a DJANGO_SETTINGS_MODULE already present in
+# the environment takes precedence over this project default.
+os.environ.setdefault('DJANGO_SETTINGS_MODULE', 'archivebot_project.settings')
+
+# Must run after the settings module has been selected above.
+application = get_wsgi_application()