diff --git a/.gitignore b/.gitignore index 3557e5d..06125d6 100644 --- a/.gitignore +++ b/.gitignore @@ -189,3 +189,4 @@ gradle-app.setting /.vs/ node_modules/ +.env diff --git a/GroupFive/__init__.py b/GroupFive/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/GroupFive/admin.py b/GroupFive/admin.py new file mode 100644 index 0000000..8c38f3f --- /dev/null +++ b/GroupFive/admin.py @@ -0,0 +1,3 @@ +from django.contrib import admin + +# Register your models here. diff --git a/GroupFive/apps.py b/GroupFive/apps.py new file mode 100644 index 0000000..8220433 --- /dev/null +++ b/GroupFive/apps.py @@ -0,0 +1,6 @@ +from django.apps import AppConfig + + +class OurApplicationConfig(AppConfig): + default_auto_field = 'django.db.models.BigAutoField' + name = 'GroupFive' diff --git a/GroupFive/dummy_analysis.py b/GroupFive/dummy_analysis.py new file mode 100644 index 0000000..992ba89 --- /dev/null +++ b/GroupFive/dummy_analysis.py @@ -0,0 +1,13 @@ + +def run_dummy(code, language): + + return { + "summary" : "this dummy code is better than yours", + "findings" : [ + { + "severity" : "Minimal", + "description" : "Bad code", + "fix" : "Figure it Out" + } + ] + } \ No newline at end of file diff --git a/GroupFive/migrations/0001_initial.py b/GroupFive/migrations/0001_initial.py new file mode 100644 index 0000000..b8f9b01 --- /dev/null +++ b/GroupFive/migrations/0001_initial.py @@ -0,0 +1,30 @@ +# Generated by Django 5.0.3 on 2026-02-18 09:51 + +import django.db.models.deletion +import uuid +from django.conf import settings +from django.db import migrations, models + + +class Migration(migrations.Migration): + + initial = True + + dependencies = [ + migrations.swappable_dependency(settings.AUTH_USER_MODEL), + ] + + operations = [ + migrations.CreateModel( + name='AnalysisTask', + fields=[ + ('id', models.UUIDField(default=uuid.uuid4, primary_key=True, serialize=False)), + ('input_code', models.TextField()), + ('language', models.CharField(max_length=50)), + 
('status', models.CharField(choices=[('QUEUED', 'Queued'), ('RUNNING', 'Running'), ('COMPLETED', 'Completed'), ('FAILED', 'Failed')], max_length=20)), + ('results', models.JSONField(blank=True, null=True)), + ('created_at', models.DateTimeField(auto_now_add=True)), + ('user', models.ForeignKey(on_delete=django.db.models.deletion.CASCADE, to=settings.AUTH_USER_MODEL)), + ], + ), + ] diff --git a/GroupFive/migrations/__init__.py b/GroupFive/migrations/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/GroupFive/models.py b/GroupFive/models.py new file mode 100644 index 0000000..f6f0ea7 --- /dev/null +++ b/GroupFive/models.py @@ -0,0 +1,21 @@ +#all id related lines are noted and can be deleted or changed if user id is skipped or substituted +import uuid #for user ID +from django.db import models +from django.contrib.auth.models import User + +class AnalysisTask(models.Model): + #potential review request statuses + STATUS_OPT = [ + ("QUEUED", "Queued"), + ("RUNNING", "Running"), + ("COMPLETED", "Completed"), + ("FAILED", "Failed") + ] + + id = models.UUIDField(primary_key=True, default=uuid.uuid4) #more user id + user = models.ForeignKey(User, on_delete=models.CASCADE) #user id/user + input_code = models.TextField() #user provided code + language = models.CharField(max_length=50) #language of user provided code + status = models.CharField(max_length=20, choices=STATUS_OPT) #status of review request + results = models.JSONField(null=True, blank=True) #results of review + created_at = models.DateTimeField(auto_now_add=True) #creation timestamp diff --git a/GroupFive/serializers.py b/GroupFive/serializers.py new file mode 100644 index 0000000..255c15e --- /dev/null +++ b/GroupFive/serializers.py @@ -0,0 +1,7 @@ +#this file uses serializers to define what information we add to our AnalysisTask model from user +from rest_framework import serializers + +class AnalysisRequestSerializer(serializers.Serializer): + code = serializers.CharField() #for input code 
+    #language definition of input code, can be commented out if language distinction added later
+    language = serializers.CharField()
diff --git a/GroupFive/tasks.py b/GroupFive/tasks.py
new file mode 100644
index 0000000..24c186e
--- /dev/null
+++ b/GroupFive/tasks.py
@@ -0,0 +1,22 @@
+#from celery import shared_task #task queue to handle simultaneous requests, making testing annoying for now can readd later when necessary
+from GroupFive.models import AnalysisTask
+from .dummy_analysis import run_dummy
+
+#@shared_task --from celery, readd later
+def run_analysis_async(task_id):
+
+    #instance of analysisTask
+    task = AnalysisTask.objects.get(id=task_id)
+    task.status = "RUNNING" #update status
+    task.save() #save instance task
+
+    try:
+        #call ai api rather than dummy
+        results = run_dummy(task.input_code, task.language)
+
+        task.results = results #store results
+        task.status = "COMPLETED" #update status
+    except Exception: #NOTE(review): was "except Exception(BaseException) as e:" - an exception *instance* in an except clause raises TypeError the moment anything is caught
+        task.status = "FAILED"
+
+    task.save()
\ No newline at end of file
diff --git a/GroupFive/tests.py b/GroupFive/tests.py
new file mode 100644
index 0000000..5f5da37
--- /dev/null
+++ b/GroupFive/tests.py
@@ -0,0 +1,60 @@
+from rest_framework.test import APITestCase
+from django.contrib.auth.models import User
+from rest_framework import status
+from .models import *
+from uuid import uuid4
+
+
+class InitialAnalysisTests(APITestCase):
+
+    def setUp(self):
+        #create user
+        self.User = User.objects.create_user(
+            username="username",
+            password="password"
+        )
+        self.client.login(username="username", password="password")
+
+    def test_create_analysisTask(self):
+
+        response = self.client.post("/api/GroupFive/",{
+            "code" : "print('Hello World')", #code to analyze
+            "language" : "Python" #language of code
+        }, format="json")
+
+        self.assertEqual(response.status_code, status.HTTP_200_OK)
+        self.assertIn("task_id", response.data) #task_id in data
+        self.assertEqual(response.data["status"], "QUEUED")
+
+class 
InitialWorkflowTest(APITestCase): + + def setUp(self): + #create user + self.User = User.objects.create_user( + username="username", + password="password" + ) + self.client.login(username="username", password="password") + + def test_initial_workflow(self): + response = self.client.post("/api/GroupFive/",{ + "code" : "print('Hello Again')", #code to analyze + "language" : "Python" #language of code + }, format="json") + + self.assertEqual(response.status_code, status.HTTP_200_OK) + + task_id = response.data["task_id"] + + task = AnalysisTask.objects.get(id=task_id) + + #confirm that dummy ran + self.assertEqual(task.status, "COMPLETED") + + result_response = self.client.get(f"/api/GroupFive/{task_id}") + + #ensure task endpoint + self.assertEqual(result_response.status_code, 200) + + + diff --git a/GroupFive/views.py b/GroupFive/views.py new file mode 100644 index 0000000..01cfa8e --- /dev/null +++ b/GroupFive/views.py @@ -0,0 +1,42 @@ +#all id related lines are noted and can be deleted or changed if user id is skipped or substituted +from rest_framework.views import APIView +from rest_framework.response import Response +from rest_framework.permissions import IsAuthenticated #for user id +from GroupFive.models import AnalysisTask +from GroupFive.serializers import AnalysisRequestSerializer +from .tasks import run_analysis_async + + +#analysis task endpoint +class AnalysisView(APIView): + permission_classes = [IsAuthenticated] + + def post(self, request): + serializer = AnalysisRequestSerializer(data=request.data) + serializer.is_valid(raise_exception=True) #deserialize, check correct input and format, raises 400 Bad Request on fail + + task = AnalysisTask.objects.create( + user=request.user, #user + input_code=serializer.validated_data["code"], + language=serializer.validated_data["language"], + status="QUEUED" + ) + + run_analysis_async(str(task.id)) + + return Response({ + "task_id": str(task.id), + "status": task.status + }) + +#status endpoint +class 
StatusView(APIView): + permission_classes = [IsAuthenticated] + + def get(self, request, task_id): + task = AnalysisTask.objects.get(id=task_id, user=request.user) #user + + return Response({ + "status": task.status, + "summary": task.results if task.status == "COMPLETED" else None + }) \ No newline at end of file diff --git a/ai_payload.json b/ai_payload.json new file mode 100644 index 0000000..92b3004 --- /dev/null +++ b/ai_payload.json @@ -0,0 +1,103 @@ +{ + "schema": "ai_payload_v1", + "project_id": "121d4d1ff944c1642e8901fe9689b26811561e8437fe55d499cf4a9708c67e7d", + "input_path": "/Users/zhangtingen/Downloads/V/testquery.py", + "input_type": "file", + "pipeline_version": "1.0.0", + "normalized_findings": [ + { + "finding_id": "F-001", + "issue_type": "database_access_heuristic", + "title": "Database Access Pattern Detected", + "severity": "low", + "confidence": "low", + "finding_status": "review_needed", + "source_file": "testquery.py", + "evidence": [ + { + "line": 0, + "category": "database_access", + "snippet": "heuristic" + } + ], + "analysis_limitations": [ + "Heuristic only; line may be 0 when matched on whole file." 
+ ] + }, + { + "finding_id": "F-002", + "issue_type": "sql_execution_review", + "title": "SQL Execution Present — Review for Injection / Unsafe Queries", + "severity": "medium", + "confidence": "medium", + "finding_status": "review_needed", + "source_file": "testquery.py", + "evidence": [ + { + "line": 7, + "category": "sql_execution", + "snippet": "result = conn.execute(text(\"SELECT NOW();\"))" + }, + { + "line": 13, + "category": "sql_execution", + "snippet": "#conn.execute(text(\"INSERT INTO users(email, password_hash) VALUES ('test2@example.com', '1A2B3C');\"))" + }, + { + "line": 14, + "category": "sql_execution", + "snippet": "result = conn.execute(text(\"SELECT * FROM users;\"))" + } + ] + }, + { + "finding_id": "F-003", + "issue_type": "potential_sensitive_data_exposure_via_debug_output", + "title": "Potential Sensitive Data Exposure via Debug Output", + "severity": "medium", + "confidence": "medium", + "finding_status": "review_needed", + "source_file": "testquery.py", + "evidence": [ + { + "line": 8, + "category": "debug_output", + "snippet": "print(\"Connected! Server time:\", result.fetchone()[0])" + }, + { + "line": 10, + "category": "debug_output", + "snippet": "print(\"Connection failed:\", e)" + }, + { + "line": 15, + "category": "debug_output", + "snippet": "print(result.fetchall())" + } + ], + "analysis_limitations": [ + "Cannot determine if printed data contains sensitive fields without data flow analysis." + ] + }, + { + "finding_id": "F-004", + "issue_type": "broad_exception_handling", + "title": "Broad Exception Handling", + "severity": "low", + "confidence": "medium", + "finding_status": "review_needed", + "source_file": "testquery.py", + "evidence": [ + { + "line": 9, + "category": "broad_except", + "snippet": "except Exception as e:" + } + ] + } + ], + "meta": { + "finding_count": 4, + "note": "Full source lives in preprocess output only; fetch by file_id/chunk_id if needed." 
+ } +} \ No newline at end of file diff --git a/config/__init__.py b/config/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/config/asgi.py b/config/asgi.py new file mode 100644 index 0000000..39149a0 --- /dev/null +++ b/config/asgi.py @@ -0,0 +1,16 @@ +""" +ASGI config for config project. + +It exposes the ASGI callable as a module-level variable named ``application``. + +For more information on this file, see +https://docs.djangoproject.com/en/5.0/howto/deployment/asgi/ +""" + +import os + +from django.core.asgi import get_asgi_application + +os.environ.setdefault('DJANGO_SETTINGS_MODULE', 'config.settings') + +application = get_asgi_application() diff --git a/config/settings.py b/config/settings.py new file mode 100644 index 0000000..4a3bd85 --- /dev/null +++ b/config/settings.py @@ -0,0 +1,125 @@ +""" +Django settings for config project. + +Generated by 'django-admin startproject' using Django 5.0.3. + +For more information on this file, see +https://docs.djangoproject.com/en/5.0/topics/settings/ + +For the full list of settings and their values, see +https://docs.djangoproject.com/en/5.0/ref/settings/ +""" + +from pathlib import Path + +# Build paths inside the project like this: BASE_DIR / 'subdir'. +BASE_DIR = Path(__file__).resolve().parent.parent + + +# Quick-start development settings - unsuitable for production +# See https://docs.djangoproject.com/en/5.0/howto/deployment/checklist/ + +# SECURITY WARNING: keep the secret key used in production secret! +SECRET_KEY = 'django-insecure-y+j3zht6sr%!!2fg0&-ek^21&)yc+y+5a*-ly+@16$8$px)a$@' + +# SECURITY WARNING: don't run with debug turned on in production! 
+DEBUG = True + +ALLOWED_HOSTS = [] + + +# Application definition + +INSTALLED_APPS = [ + 'django.contrib.admin', + 'django.contrib.auth', + 'django.contrib.contenttypes', + 'django.contrib.sessions', + 'django.contrib.messages', + 'django.contrib.staticfiles', + 'GroupFive', + 'rest_framework' +] + +MIDDLEWARE = [ + 'django.middleware.security.SecurityMiddleware', + 'django.contrib.sessions.middleware.SessionMiddleware', + 'django.middleware.common.CommonMiddleware', + 'django.middleware.csrf.CsrfViewMiddleware', + 'django.contrib.auth.middleware.AuthenticationMiddleware', + 'django.contrib.messages.middleware.MessageMiddleware', + 'django.middleware.clickjacking.XFrameOptionsMiddleware', +] + +ROOT_URLCONF = 'config.urls' + +TEMPLATES = [ + { + 'BACKEND': 'django.template.backends.django.DjangoTemplates', + 'DIRS': [], + 'APP_DIRS': True, + 'OPTIONS': { + 'context_processors': [ + 'django.template.context_processors.debug', + 'django.template.context_processors.request', + 'django.contrib.auth.context_processors.auth', + 'django.contrib.messages.context_processors.messages', + ], + }, + }, +] + +WSGI_APPLICATION = 'config.wsgi.application' + + +# Database +# https://docs.djangoproject.com/en/5.0/ref/settings/#databases + +DATABASES = { + 'default': { + 'ENGINE': 'django.db.backends.sqlite3', + 'NAME': BASE_DIR / 'db.sqlite3', + } +} + + +# Password validation +# https://docs.djangoproject.com/en/5.0/ref/settings/#auth-password-validators + +AUTH_PASSWORD_VALIDATORS = [ + { + 'NAME': 'django.contrib.auth.password_validation.UserAttributeSimilarityValidator', + }, + { + 'NAME': 'django.contrib.auth.password_validation.MinimumLengthValidator', + }, + { + 'NAME': 'django.contrib.auth.password_validation.CommonPasswordValidator', + }, + { + 'NAME': 'django.contrib.auth.password_validation.NumericPasswordValidator', + }, +] + + +# Internationalization +# https://docs.djangoproject.com/en/5.0/topics/i18n/ + +LANGUAGE_CODE = 'en-us' + +TIME_ZONE = 'UTC' + +USE_I18N = 
True
+
+USE_TZ = True
+
+
+# Static files (CSS, JavaScript, Images)
+# https://docs.djangoproject.com/en/5.0/howto/static-files/
+
+STATIC_URL = 'static/'
+
+# Default primary key field type
+# https://docs.djangoproject.com/en/5.0/ref/settings/#default-auto-field
+
+DEFAULT_AUTO_FIELD = 'django.db.models.BigAutoField'
diff --git a/config/urls.py b/config/urls.py
new file mode 100644
index 0000000..a770eb4
--- /dev/null
+++ b/config/urls.py
@@ -0,0 +1,26 @@
+"""
+URL configuration for config project.
+
+The `urlpatterns` list routes URLs to views. For more information please see:
+    https://docs.djangoproject.com/en/5.0/topics/http/urls/
+Examples:
+Function views
+    1. Add an import: from my_app import views
+    2. Add a URL to urlpatterns: path('', views.home, name='home')
+Class-based views
+    1. Add an import: from other_app.views import Home
+    2. Add a URL to urlpatterns: path('', Home.as_view(), name='home')
+Including another URLconf
+    1. Import the include() function: from django.urls import include, path
+    2. Add a URL to urlpatterns: path('blog/', include('blog.urls'))
+"""
+from django.contrib import admin
+from django.urls import path
+
+from GroupFive.views import AnalysisView, StatusView
+
+urlpatterns = [
+    path('admin/', admin.site.urls),
+    path('api/GroupFive/', AnalysisView.as_view(), name='GroupFive'),
+    path('api/GroupFive/<uuid:task_id>', StatusView.as_view(), name='GroupFive-status'),
+]
diff --git a/config/wsgi.py b/config/wsgi.py
new file mode 100644
index 0000000..c0a9631
--- /dev/null
+++ b/config/wsgi.py
@@ -0,0 +1,16 @@
+"""
+WSGI config for config project.
+
+It exposes the WSGI callable as a module-level variable named ``application``.
+ +For more information on this file, see +https://docs.djangoproject.com/en/5.0/howto/deployment/wsgi/ +""" + +import os + +from django.core.wsgi import get_wsgi_application + +os.environ.setdefault('DJANGO_SETTINGS_MODULE', 'config.settings') + +application = get_wsgi_application() diff --git a/db.py b/db.py new file mode 100644 index 0000000..5afc09b --- /dev/null +++ b/db.py @@ -0,0 +1,21 @@ +import os +from sqlalchemy import create_engine +from sqlalchemy.orm import sessionmaker +from dotenv import load_dotenv + +load_dotenv() + +DB_URL = f"postgresql://" \ + f"{os.getenv('DB_USER')}:" \ + f"{os.getenv('DB_PASS')}@" \ + f"{os.getenv('DB_HOST')}:" \ + f"{os.getenv('DB_PORT')}/" \ + f"{os.getenv('DB_NAME')}" + +engine = create_engine( + DB_URL, + echo=True, + connect_args={"sslmode": "require"} +) + +SessionLocal = sessionmaker(bind=engine) \ No newline at end of file diff --git a/front-end/db_queries.html b/front-end/db_queries.html new file mode 100644 index 0000000..f840170 --- /dev/null +++ b/front-end/db_queries.html @@ -0,0 +1,32 @@ + + + + + + + + + Hello World! + + + + + + + + + + + + +
+
+ + + + + + + + + \ No newline at end of file diff --git a/front-end/index.html b/front-end/index.html index 3a3ade3..f824cb1 100644 --- a/front-end/index.html +++ b/front-end/index.html @@ -1,25 +1,116 @@ - - + AutoPen Dashboard + + + + Hello World! - - + + - - - - - - + + +
+
+

Penetration Testing Dashboard

+
System Status: Active
+
+ +
+
+

Total Scans

+

128

+
+
+

Critical Vulnerabilities

+

12

+
+
+

Medium Vulnerabilities

+

34

+
+
+

Low Vulnerabilities

+

56

+
+
- \ No newline at end of file + +
+
+

Upload Code for Analysis

+ +
+ + +
+ + +
+ +
+ + +
+ +
+ + +
+
+ + + +
+

Recent Scan Results

+ + + + + + + + + + + + + + + + + + + + + + + +
TargetDateRisk LevelStatus
example.com02/14/2026CriticalCompleted
test-server.net02/12/2026LowCompleted
+
+
+ + + diff --git a/front-end/login.html b/front-end/login.html new file mode 100644 index 0000000..425f5a7 --- /dev/null +++ b/front-end/login.html @@ -0,0 +1,30 @@ + + + + Login + + + +

Login

+ +{% if error %} +

{{ error }}

+{% endif %} + +
+ {% csrf_token %} + + +

+ + +

+ + +
+ +

Don't have an account?

+Register + + + \ No newline at end of file diff --git a/front-end/register.html b/front-end/register.html new file mode 100644 index 0000000..18bf32c --- /dev/null +++ b/front-end/register.html @@ -0,0 +1,27 @@ + + + + Register + + + +

Create Account

+ +
+ {% csrf_token %} + + +

+ + +

+ + + +
+ +
+Back to Login + + + \ No newline at end of file diff --git a/front-end/scripts/main.js b/front-end/scripts/main.js index feeebf0..fc76eff 100644 --- a/front-end/scripts/main.js +++ b/front-end/scripts/main.js @@ -6,3 +6,26 @@ function operate(operator) { document.querySelector('#output').innerText = result; }); } + +function loadUsers() { + eel.showUsers()(users => { + document.querySelector('#output').innerText = JSON.stringify(users, null, 2); + }); +} + +function addUsers() { + var email = document.querySelector('#email').value; + var password = document.querySelector('#pass').value; + + eel.addUsers(email, password)(response => { + document.querySelector('#output').innerText = response; + }); +function askGPT() { + const prompt = document.querySelector('#prompt').value; + + document.querySelector('#output').innerText = "Loading..."; + + let newVar = eel.ask_api(prompt)(result => { + document.querySelector('#output').innerText = result; + });}} + diff --git a/front-end/scripts/upload.js b/front-end/scripts/upload.js new file mode 100644 index 0000000..4d33a0b --- /dev/null +++ b/front-end/scripts/upload.js @@ -0,0 +1,14 @@ +const tabButtons = document.querySelectorAll(".tab-btn"); +const tabContents = document.querySelectorAll(".tab-content"); + +tabButtons.forEach(button => { + button.addEventListener("click", () => { + // Remove active state + tabButtons.forEach(btn => btn.classList.remove("active")); + tabContents.forEach(tab => tab.classList.remove("active")); + + // Activate selected tab + button.classList.add("active"); + document.getElementById(button.dataset.tab).classList.add("active"); + }); +}); \ No newline at end of file diff --git a/front-end/styles/style.css b/front-end/styles/style.css index e69de29..8357d12 100644 --- a/front-end/styles/style.css +++ b/front-end/styles/style.css @@ -0,0 +1,245 @@ +/* ================= RESET ================= */ +* { + margin: 0; + padding: 0; + box-sizing: border-box; + font-family: 'Segoe UI', Tahoma, Geneva, 
Verdana, sans-serif; +} + +body { + background: #000000; + color: #e5e7eb; + display: flex; + min-height: 100vh; +} + +/* ================= SIDEBAR ================= */ +.sidebar { + width: 250px; + height: 100vh; + background: rgba(17, 24, 39, 0.8); + padding: 20px; + border-right: 1px solid rgba(168, 85, 247, 0.2); +} + +.sidebar h2 { + color: #e9d5ff; + margin-bottom: 40px; + text-align: center; +} + +.sidebar ul { + list-style: none; +} + +.sidebar ul li { + padding: 15px; + margin: 10px 0; + background: rgba(17, 24, 39, 0.8); + border: 1px solid rgba(168, 85, 247, 0.2); + border-radius: 8px; + cursor: pointer; + transition: 0.25s; +} + +.sidebar ul li:hover { + background: rgba(17, 24, 39, 0.95); +} + +/* ================= MAIN ================= */ +.main { + flex: 1; + padding: 40px; +} + +/* ================= HEADER ================= */ +.header { + display: flex; + justify-content: space-between; + align-items: center; + margin-bottom: 40px; +} + +.header h1 { + color: #ffffff; +} + +.status { + background: rgba(17, 24, 39, 0.8); + padding: 10px 20px; + border-radius: 999px; + border: 1px solid rgba(168, 85, 247, 0.2); +} + +/* ================= UPLOAD SECTION ================= */ +.code-center { + margin-bottom: 50px; +} + +.code-box { + background: rgba(17, 24, 39, 0.8); + border: 1px solid rgba(168, 85, 247, 0.2); + border-radius: 12px; + padding: 30px; + width: 100%; + max-width: 900px; +} + +.code-box h2 { + color: #ffffff; + margin-bottom: 20px; +} + +/* ================= TABS ================= */ +.upload-tabs { + display: flex; + gap: 12px; + margin-bottom: 20px; +} + +.tab-btn { + background: transparent; + border: 1px solid rgba(168, 85, 247, 0.2); + color: #e9d5ff; + padding: 8px 22px; + border-radius: 999px; + cursor: pointer; + transition: 0.25s; +} + +.tab-btn:hover { + background: rgba(168, 85, 247, 0.15); +} + +.tab-btn.active { + background: rgba(168, 85, 247, 0.25); +} + +/* ================= TAB CONTENT ================= */ +.tab-content 
{ + display: none; +} + +.tab-content.active { + display: block; +} + +/* ================= DROP ZONE ================= */ +.drop-zone { + display: block; + padding: 50px; + border: 2px dashed rgba(168, 85, 247, 0.8); + border-radius: 12px; + background: rgba(17, 24, 39, 0.6); + text-align: center; + cursor: pointer; + transition: 0.25s; + width: 80%; + max-width: 600px; + margin: 0 auto; +} + +.drop-zone:hover { + background: rgba(17, 24, 39, 0.9); +} + +.drop-zone p { + font-size: 18px; + margin-bottom: 6px; +} + +.drop-zone span { + font-size: 14px; + color: #c4b5fd; +} + +/* ================= TEXTAREA ================= */ +textarea { + width: 100%; + min-height: 220px; + background: rgba(17, 24, 39, 0.6); + color: #ffffff; + border: 1px solid rgba(168, 85, 247, 0.2); + border-radius: 8px; + padding: 15px; + resize: vertical; +} + +/* ================= BUTTON ================= */ +.scan-btn { + margin-top: 20px; + padding: 10px 30px; + background: rgba(168, 85, 247, 0.85); + color: white; + border: none; + border-radius: 999px; + cursor: pointer; + transition: 0.25s; +} + +.scan-btn:hover { + background: rgba(168, 85, 247, 1); +} + +/* ================= CARDS ================= */ +.cards { + display: grid; + grid-template-columns: repeat(auto-fit, minmax(250px, 1fr)); + gap: 20px; + margin-bottom: 40px; +} + +.card { + background: rgba(17, 24, 39, 0.8); + border: 1px solid rgba(168, 85, 247, 0.2); + padding: 20px; + border-radius: 12px; + transition: 0.25s; +} + +.card:hover { + background: rgba(17, 24, 39, 0.95); +} + +.card h3 { + color: #ffffff; + margin-bottom: 10px; +} + +.card p { + font-size: 28px; + font-weight: bold; +} + +/* ================= TABLE ================= */ +.recent-scans { + background: rgba(17, 24, 39, 0.8); + border: 1px solid rgba(168, 85, 247, 0.2); + padding: 20px; + border-radius: 12px; +} + +table { + width: 100%; + border-collapse: collapse; + margin-top: 15px; +} + +th, td { + padding: 12px; + text-align: left; +} + +th { + color: 
#e9d5ff; + border-bottom: 1px solid rgba(168, 85, 247, 0.2); +} + +tr:hover { + background: rgba(168, 85, 247, 0.05); +} + +/* ================= SEVERITY COLORS ================= */ +.critical { color: #fb7185; font-weight: bold; } +.medium { color: #facc15; font-weight: bold; } +.low { color: #34d399; font-weight: bold; } diff --git a/main.py b/main.py index cedf85d..1aef0c6 100644 --- a/main.py +++ b/main.py @@ -1,16 +1,72 @@ import eel - +from db import engine +from sqlalchemy import text +import bcrypt +import os +from dotenv import load_dotenv +from openai import OpenAI +#sofia +#jacob +#tim +#Nathan +#Sid eel.init('front-end') +try: + with engine.connect() as conn: + # Run a simple query to test + result = conn.execute(text("SELECT NOW();")) + print("Connected! Server time:", result.fetchone()[0]) +except Exception as e: + print("Connection failed:", e) +load_dotenv() +print("API key loaded:", bool(os.getenv("OPENAI_API_KEY"))) +client = OpenAI(api_key=os.getenv("OPENAI_API_KEY")) + +@eel.expose +def ask_api(user_text): + print("ask_api received:", user_text) + + resp = client.chat.completions.create( + model="gpt-4.1-mini", + messages=[{"role": "user", "content": str(user_text)}], + ) + + answer = resp.choices[0].message.content + print("ask_api answer:", answer) + return answer + + + + @eel.expose def add(num1, num2): return int(num1) + int(num2) - @eel.expose def subtract(num1, num2): return int(num1) - int(num2) +@eel.expose +def showUsers(): + with engine.connect() as conn: + result = conn.execute(text("SELECT * FROM users;")) + users = result.fetchall() + return [dict(row._mapping) for row in users] + +@eel.expose +def addUsers(email, password): + #hashing logic here + hashed = bcrypt.hashpw(password.encode(), bcrypt.gensalt()).decode() + + with engine.begin() as conn: # auto-commit + conn.execute( + text("INSERT INTO users (email, password_hash) VALUES (:email, :password)"), + {"email": email, "password": hashed} + ) + return "User added successfully" 
+ -eel.start('index.html', size=(1000, 600)) +if __name__ == "__main__": + eel.start('index.html', size=(1000, 600), mode='safari') diff --git a/manage.py b/manage.py new file mode 100644 index 0000000..8e7ac79 --- /dev/null +++ b/manage.py @@ -0,0 +1,22 @@ +#!/usr/bin/env python +"""Django's command-line utility for administrative tasks.""" +import os +import sys + + +def main(): + """Run administrative tasks.""" + os.environ.setdefault('DJANGO_SETTINGS_MODULE', 'config.settings') + try: + from django.core.management import execute_from_command_line + except ImportError as exc: + raise ImportError( + "Couldn't import Django. Are you sure it's installed and " + "available on your PYTHONPATH environment variable? Did you " + "forget to activate a virtual environment?" + ) from exc + execute_from_command_line(sys.argv) + + +if __name__ == '__main__': + main() diff --git a/preprocess/__init__.py b/preprocess/__init__.py new file mode 100644 index 0000000..3716dcb --- /dev/null +++ b/preprocess/__init__.py @@ -0,0 +1,26 @@ +""" +Backend preprocessing pipeline for secure code review. +Produces structured JSON (project/file/chunk) without frontend. +""" + +from preprocess.pipeline import run_file, run_snippet, PIPELINE_VERSION +from preprocess.normalized_findings import ( + normalize_preprocess_output, + normalize_file_findings, + run_file_with_findings, + export_ai_payload, + run_file_ai_payload, + slim_finding_for_ai, +) + +__all__ = [ + "run_file", + "run_snippet", + "PIPELINE_VERSION", + "normalize_preprocess_output", + "normalize_file_findings", + "run_file_with_findings", + "export_ai_payload", + "run_file_ai_payload", + "slim_finding_for_ai", +] diff --git a/preprocess/chunking.py b/preprocess/chunking.py new file mode 100644 index 0000000..9fd74b7 --- /dev/null +++ b/preprocess/chunking.py @@ -0,0 +1,119 @@ +""" +Build chunks: whole file, or per function/class, or line-based fallback. 
+""" + +from typing import Any, Dict, List + +MAX_LINES_WHOLE_CHUNK = 200 +LINE_CHUNK_SIZE = 120 +LINE_CHUNK_OVERLAP = 10 + + +def lines_to_content(lines: List[str], start: int, end: int) -> str: + """start/end are 1-based inclusive line numbers.""" + if start < 1: + start = 1 + if end > len(lines): + end = len(lines) + return "\n".join(lines[start - 1 : end]) + + +def build_chunks( + file_id: str, + content: str, + language: str, + structure: Dict[str, Any], +) -> List[Dict[str, Any]]: + """ + Returns list of chunk dicts with chunk_id, file_id, start_line, end_line, type, symbol, content. + chunk_id filled by caller after content_hash. + """ + lines = content.splitlines() + n = len(lines) + chunks: List[Dict[str, Any]] = [] + + if language == "python" and structure.get("parse_ok"): + # Prefer function/class spans + spans = [] + for f in structure.get("functions", []): + spans.append(("function", f["name"], f["line"], f.get("end_line", f["line"]))) + for c in structure.get("classes", []): + spans.append(("class", c["name"], c["line"], c.get("end_line", c["line"]))) + spans.sort(key=lambda x: x[2]) + if spans and n > MAX_LINES_WHOLE_CHUNK: + covered = set() + for typ, sym, start, end in spans: + if end < start: + end = start + chunk_content = lines_to_content(lines, start, end) + if not chunk_content.strip(): + continue + chunks.append({ + "file_id": file_id, + "start_line": start, + "end_line": end, + "type": typ, + "symbol": sym, + "content": chunk_content, + }) + for ln in range(start, end + 1): + covered.add(ln) + # Optional: add line-based for uncovered regions — keep simple: if no spans cover whole file, fallback + if not chunks: + pass + if not chunks and n <= MAX_LINES_WHOLE_CHUNK: + chunks.append({ + "file_id": file_id, + "start_line": 1, + "end_line": n, + "type": "file", + "symbol": None, + "content": content, + }) + elif not chunks: + chunks.extend(_line_chunks(file_id, lines)) + else: + if n <= MAX_LINES_WHOLE_CHUNK: + chunks.append({ + "file_id": 
file_id, + "start_line": 1, + "end_line": n, + "type": "file", + "symbol": None, + "content": content, + }) + else: + chunks.extend(_line_chunks(file_id, lines)) + + if not chunks and content.strip(): + chunks.append({ + "file_id": file_id, + "start_line": 1, + "end_line": max(n, 1), + "type": "file", + "symbol": None, + "content": content, + }) + return chunks + + +def _line_chunks(file_id: str, lines: List[str]) -> List[Dict[str, Any]]: + out = [] + n = len(lines) + i = 0 + while i < n: + start = i + 1 + end = min(i + LINE_CHUNK_SIZE, n) + block = "\n".join(lines[i:end]) + out.append({ + "file_id": file_id, + "start_line": start, + "end_line": end, + "type": "lines", + "symbol": None, + "content": block, + }) + i = end - LINE_CHUNK_OVERLAP if end - LINE_CHUNK_OVERLAP > i else end + if i >= n: + break + return out diff --git a/preprocess/filters.py b/preprocess/filters.py new file mode 100644 index 0000000..d828632 --- /dev/null +++ b/preprocess/filters.py @@ -0,0 +1,68 @@ +""" +Skip binary/large/minified paths and irrelevant directories. +""" + +from pathlib import Path +from typing import Optional, Tuple + +SKIP_DIR_NAMES = { + "node_modules", + "venv", + ".venv", + ".git", + "dist", + "build", + "__pycache__", + ".next", + "target", + ".idea", + ".vscode", +} + +BINARY_EXTENSIONS = { + ".png", ".jpg", ".jpeg", ".gif", ".webp", ".ico", ".bmp", + ".pdf", ".zip", ".tar", ".gz", ".7z", ".rar", + ".so", ".dll", ".dylib", ".exe", ".bin", + ".pyc", ".pyo", ".class", ".o", ".a", + ".woff", ".woff2", ".ttf", ".eot", + ".mp3", ".mp4", ".webm", ".avi", +} + +MINIFIED_NAME_SUFFIXES = (".min.js", ".min.css") +GENERATED_NAME_PARTS = ("-generated", "_generated", ".generated.") + + +def should_skip_path(path: Path, max_bytes: int = 512 * 1024) -> Tuple[bool, Optional[str]]: + """ + Returns (skip, reason). reason is None if not skipped. 
+ """ + if not path.exists(): + return True, "not_found" + if path.is_dir(): + if path.name in SKIP_DIR_NAMES: + return True, f"skip_dir:{path.name}" + return False, None + suffix = path.suffix.lower() + if suffix in BINARY_EXTENSIONS: + return True, f"binary_ext:{suffix}" + try: + size = path.stat().st_size + except OSError: + return True, "stat_failed" + if size > max_bytes: + return True, f"too_large:{size}" + name = path.name.lower() + for s in MINIFIED_NAME_SUFFIXES: + if name.endswith(s): + return True, "minified_name" + for part in GENERATED_NAME_PARTS: + if part in name: + return True, "generated_name" + return False, None + + +def path_has_skip_segment(path: Path) -> bool: + for part in path.parts: + if part in SKIP_DIR_NAMES: + return True + return False diff --git a/preprocess/hashing.py b/preprocess/hashing.py new file mode 100644 index 0000000..b7e944c --- /dev/null +++ b/preprocess/hashing.py @@ -0,0 +1,15 @@ +import hashlib +import uuid +from pathlib import Path + + +def sha256_text(s: str) -> str: + return hashlib.sha256(s.encode("utf-8")).hexdigest() + + +def sha256_bytes(b: bytes) -> str: + return hashlib.sha256(b).hexdigest() + + +def new_uuid() -> str: + return str(uuid.uuid4()) diff --git a/preprocess/io_read.py b/preprocess/io_read.py new file mode 100644 index 0000000..6f0862d --- /dev/null +++ b/preprocess/io_read.py @@ -0,0 +1,35 @@ +""" +Read file as text: UTF-8 first, then fallbacks. Detect unreadable/binary. +""" + +from pathlib import Path +from typing import Optional, Tuple + +ENCODING_FALLBACKS = ["utf-8", "utf-8-sig", "latin-1", "cp1252"] + + +def read_text_with_fallback(path: Path) -> Tuple[Optional[str], Optional[str], Optional[str]]: + """ + Returns (text, encoding_used, error). + If unreadable, text is None and error explains why. 
+ """ + raw = path.read_bytes() + if b"\x00" in raw[:8192] and raw[:8192].count(b"\x00") > 2: + return None, None, "likely_binary_null_bytes" + for enc in ENCODING_FALLBACKS: + try: + text = raw.decode(enc) + return text, enc, None + except UnicodeDecodeError: + continue + return None, None, "decode_failed_all_encodings" + + +def is_probably_text(s: str, sample_lines: int = 50) -> bool: + """Heuristic: too many non-printable chars => skip.""" + sample = "\n".join(s.splitlines()[:sample_lines]) + if not sample.strip(): + return True + printable = sum(1 for c in sample if c.isprintable() or c in "\n\r\t") + ratio = printable / max(len(sample), 1) + return ratio >= 0.85 diff --git a/preprocess/language.py b/preprocess/language.py new file mode 100644 index 0000000..0610b53 --- /dev/null +++ b/preprocess/language.py @@ -0,0 +1,105 @@ +""" +Language detection: extension first, then shebang/keywords/patterns. +""" + +import re +from pathlib import Path +from typing import Dict, Optional + +EXT_TO_LANGUAGE = { + ".py": "python", + ".js": "javascript", + ".mjs": "javascript", + ".ts": "typescript", + ".tsx": "typescript", + ".jsx": "javascript", + ".java": "java", + ".go": "go", + ".rb": "ruby", + ".php": "php", + ".cs": "csharp", + ".rs": "rust", + ".c": "c", + ".cpp": "cpp", + ".h": "c", + ".sql": "sql", + ".sh": "shell", + ".bash": "shell", + ".yaml": "yaml", + ".yml": "yaml", + ".json": "json", + ".html": "html", + ".css": "css", +} + +SHEBANG_PATTERN = re.compile(r"^#!\s*/usr/bin/env\s+(\w+)|^#!\s*/.*\b(python|node|ruby)\b", re.MULTILINE) + +KEYWORD_HINTS = [ + (r"\bdef\s+\w+\s*\(", "python"), + (r"\bimport\s+\w+", "python"), + (r"\bfrom\s+\w+\s+import\b", "python"), + (r"\bfunction\s+\w+\s*\(", "javascript"), + (r"\bconst\s+\w+\s*=\s*\(", "javascript"), + (r"\brequire\s*\(", "javascript"), + (r"\bpublic\s+class\s+\w+", "java"), + (r"\bpackage\s+main\b", "go"), + (r"\bfn\s+main\s*\(", "rust"), +] + + +def detect_language(path: Optional[Path], content: str, hint: 
Optional[str] = None) -> Dict: + """ + Returns { "language", "confidence": high|medium|low|unknown, "reason": str }. + """ + reasons = [] + lang_from_ext = None + if path and path.suffix: + lang_from_ext = EXT_TO_LANGUAGE.get(path.suffix.lower()) + if lang_from_ext: + reasons.append(f"extension:{path.suffix}") + + if hint: + h = hint.strip().lower() + if h in ("py", "python"): + lang_from_ext = lang_from_ext or "python" + reasons.append("hint:python") + elif h in ("js", "javascript"): + lang_from_ext = lang_from_ext or "javascript" + reasons.append("hint:javascript") + + lang_from_content = None + first_lines = "\n".join(content.splitlines()[:30]) + m = SHEBANG_PATTERN.search(first_lines) + if m: + g = (m.group(1) or m.group(2) or "").lower() + if "python" in g: + lang_from_content = "python" + elif "node" in g: + lang_from_content = "javascript" + elif "ruby" in g: + lang_from_content = "ruby" + if lang_from_content: + reasons.append("shebang") + + if not lang_from_content: + for pattern, lang in KEYWORD_HINTS: + if re.search(pattern, first_lines): + lang_from_content = lang + reasons.append(f"keyword:{pattern[:20]}") + break + + if lang_from_ext and lang_from_content: + if lang_from_ext == lang_from_content: + return {"language": lang_from_ext, "confidence": "high", "reason": ";".join(reasons)} + return { + "language": lang_from_ext, + "confidence": "medium", + "reason": f"extension_vs_content_conflict;{';'.join(reasons)}", + } + if lang_from_ext: + return {"language": lang_from_ext, "confidence": "medium", "reason": ";".join(reasons)} + if lang_from_content: + return {"language": lang_from_content, "confidence": "medium", "reason": ";".join(reasons)} + if path and path.suffix: + return {"language": "unknown", "confidence": "low", "reason": f"unknown_ext:{path.suffix}"} + return {"language": "unknown", "confidence": "unknown", "reason": "no_extension_no_keywords"} diff --git a/preprocess/normalized_findings.py b/preprocess/normalized_findings.py new file mode 
"""
Normalize preprocess signals into structured findings for the AI / report stage.

No human-review layer here: every finding is emitted with
finding_status == "review_needed"; confidence reflects the heuristic only.
"""

from typing import Any, Dict, List, Optional

# Signal bucket key (as in file_record["signals"]) -> default issue template.
# "category" inside each evidence item is the regex category, e.g.
# broad_except or debug_output.
SIGNAL_BUCKET_META = {
    "error_handling": {
        "issue_type": "broad_exception_handling",
        "title": "Broad Exception Handling",
        "affected_component": ["backend", "error_handling"],
        "confidence": "medium",
        "severity": "low",
        "remediation_keywords": [
            "catch specific exceptions",
            "structured logging",
            "safer exception handling",
        ],
        "possible_impact": [
            "reduced error observability",
            "generic failure handling may hide root causes",
        ],
    },
    "debug_output": {
        "issue_type": "potential_sensitive_data_exposure_via_debug_output",
        "title": "Potential Sensitive Data Exposure via Debug Output",
        "affected_component": ["backend", "logging"],
        "confidence": "medium",
        "severity": "medium",
        "remediation_keywords": [
            "remove debug prints",
            "sanitize logs",
            "avoid dumping full query results",
        ],
        "possible_impact": [
            "internal data may appear in console or logs",
            "debug output could leak in production",
        ],
        "analysis_limitations": [
            "Cannot determine if printed data contains sensitive fields without data flow analysis.",
        ],
    },
    "sql_execution": {
        "issue_type": "sql_execution_review",
        "title": "SQL Execution Present — Review for Injection / Unsafe Queries",
        "affected_component": ["backend", "database"],
        "confidence": "medium",
        "severity": "medium",
        "remediation_keywords": [
            "parameterized queries",
            "prepared statements",
            "avoid string concatenation in SQL",
        ],
        "possible_impact": [
            "SQL injection if input is concatenated into queries",
            "unsafe query patterns",
        ],
    },
    "command_execution": {
        "issue_type": "command_execution_surface",
        "title": "Command Execution or Dynamic Evaluation",
        "affected_component": ["backend", "process"],
        "confidence": "high",
        "severity": "high",
        "remediation_keywords": [
            "avoid eval/exec",
            "sanitize subprocess arguments",
            "use allowlists for shell commands",
        ],
        "possible_impact": [
            "command injection",
            "arbitrary code execution",
        ],
    },
    "file_access": {
        "issue_type": "file_io_review",
        "title": "File Access — Review Path Handling",
        "affected_component": ["backend", "filesystem"],
        "confidence": "low",
        "severity": "low",
        "remediation_keywords": [
            "validate paths",
            "avoid path traversal",
        ],
        "possible_impact": [
            "path traversal if paths are user-controlled",
        ],
    },
    "user_input_sources": {
        "issue_type": "user_input_flow",
        "title": "User-Controlled Input Source",
        "affected_component": ["backend", "input"],
        "confidence": "medium",
        "severity": "medium",
        "remediation_keywords": [
            "validate and sanitize input",
            "use safe APIs",
        ],
        "possible_impact": [
            "injection or logic flaws if input reaches sensitive sinks",
        ],
    },
    "possible_hardcoded_secrets": {
        "issue_type": "possible_hardcoded_secret",
        "title": "Possible Hardcoded Secret",
        "affected_component": ["backend", "secrets"],
        "confidence": "low",
        "severity": "high",
        "remediation_keywords": [
            "use environment variables or secret manager",
            "rotate credentials",
        ],
        "possible_impact": [
            "credential leak if committed or logged",
        ],
        "analysis_limitations": [
            "May be test data or placeholders; verify context.",
        ],
    },
    "auth_related_keywords": {
        "issue_type": "auth_surface_keyword",
        "title": "Auth-Related Keyword Present",
        "affected_component": ["backend", "auth"],
        "confidence": "low",
        "severity": "low",
        "remediation_keywords": [
            "review auth flow",
            "session handling",
        ],
        "possible_impact": [
            "auth logic may need manual review",
        ],
    },
    "database_access": {
        "issue_type": "database_access_heuristic",
        "title": "Database Access Pattern Detected",
        "affected_component": ["backend", "database"],
        "confidence": "low",
        "severity": "low",
        "remediation_keywords": [
            "least privilege",
            "connection pooling security",
        ],
        "possible_impact": [
            "review how connections and queries are used",
        ],
        "analysis_limitations": [
            "Heuristic only; line may be 0 when matched on whole file.",
        ],
    },
    "crypto_usage": {
        "issue_type": "crypto_usage_review",
        "title": "Cryptographic API Usage",
        "affected_component": ["backend", "crypto"],
        "confidence": "low",
        "severity": "medium",
        "remediation_keywords": [
            "use vetted libraries",
            "avoid weak algorithms",
        ],
        "possible_impact": [
            "misuse may weaken security",
        ],
    },
}


def _evidence_from_signal_item(signal_key: str, index: int, item: Dict[str, Any]) -> Dict[str, Any]:
    """
    Convert one raw signal item into an evidence dict.

    *index* is unused today but kept for interface stability.  Match text is
    truncated to 500 chars to bound payload size.
    """
    text = item.get("match", "")
    if len(text) > 500:
        text = text[:500] + "..."
    return {
        "line": item.get("line", 0),
        "signal": signal_key,
        "category": item.get("category", signal_key),
        "text": text,
    }


def _refs_for_bucket(signal_key: str, indices: List[int]) -> List[str]:
    """JSON-pointer-ish references back into file_record['signals']."""
    return [f"signals.{signal_key}[{i}]" for i in indices]


def normalize_file_findings(
    file_record: Dict[str, Any],
    file_index: int = 0,
    starting_f_index: int = 1,
) -> List[Dict[str, Any]]:
    """
    Build normalized findings from a single preprocess file_record.

    One finding per non-empty signal bucket; the finding's evidence list
    contains every dict item in that bucket.  Finding ids are F-001, F-002, …
    starting at *starting_f_index*.
    """
    signals = file_record.get("signals") or {}
    path = file_record.get("path", f"file_{file_index}")
    findings: List[Dict[str, Any]] = []
    f_num = starting_f_index

    for signal_key, items in signals.items():
        if not items:
            continue
        meta = SIGNAL_BUCKET_META.get(signal_key)
        if not meta:
            # Unknown bucket: emit a generic low-confidence finding.
            meta = {
                "issue_type": f"heuristic_{signal_key}",
                "title": f"Signal: {signal_key}",
                "affected_component": ["backend"],
                "confidence": "low",
                "severity": "low",
                "remediation_keywords": ["manual review"],
                "possible_impact": ["pattern matched; context unknown"],
            }

        evidence: List[Dict[str, Any]] = []
        kept_indices: List[int] = []
        for i, item in enumerate(items):
            if not isinstance(item, dict):
                continue
            evidence.append(_evidence_from_signal_item(signal_key, i, item))
            kept_indices.append(i)

        if not evidence:
            continue

        finding = {
            "finding_id": f"F-{f_num:03d}",
            "issue_type": meta["issue_type"],
            "title": meta["title"],
            "affected_component": list(meta["affected_component"]),
            "confidence": meta["confidence"],
            "severity": meta["severity"],
            "finding_status": "review_needed",
            "verification_method": "automated_heuristic",
            "source_file": path,
            "file_id": file_record.get("file_id"),
            "possible_impact": list(meta.get("possible_impact", [])),
            "evidence": evidence,
            "remediation_keywords": list(meta.get("remediation_keywords", [])),
            "source_signal_refs": _refs_for_bucket(signal_key, kept_indices),
        }
        if meta.get("analysis_limitations"):
            finding["analysis_limitations"] = list(meta["analysis_limitations"])
        findings.append(finding)
        f_num += 1

    return findings


def normalize_preprocess_output(preprocess_output: Dict[str, Any]) -> Dict[str, Any]:
    """
    Normalize a full preprocess pipeline output into a findings-only envelope.

    Finding ids are numbered continuously across all files.
    """
    all_findings: List[Dict[str, Any]] = []
    f_num = 1
    for idx, file_record in enumerate(preprocess_output.get("files") or []):
        batch = normalize_file_findings(file_record, file_index=idx, starting_f_index=f_num)
        all_findings.extend(batch)
        f_num += len(batch)

    return {
        "pipeline_version": preprocess_output.get("pipeline_version"),
        "project_id": preprocess_output.get("project_id"),
        "created_at": preprocess_output.get("created_at"),
        "input_type": preprocess_output.get("input_type"),
        "input_path": preprocess_output.get("input_path"),
        "normalized_findings": all_findings,
        "normalized_findings_meta": {
            "count": len(all_findings),
            "generator": "preprocess.normalized_findings",
            "note": "All findings are heuristic; finding_status is review_needed unless overridden downstream.",
        },
    }


def slim_finding_for_ai(
    finding: Dict[str, Any],
    max_evidence_snippet: int = 120,
    include_remediation: bool = False,
) -> Dict[str, Any]:
    """
    Shrink one finding for LLM context: truncate evidence snippets, drop
    remediation lists unless requested, and cap analysis limitations.
    """
    out = {
        "finding_id": finding.get("finding_id"),
        "issue_type": finding.get("issue_type"),
        "title": finding.get("title"),
        "severity": finding.get("severity"),
        "confidence": finding.get("confidence"),
        "finding_status": finding.get("finding_status"),
        "source_file": finding.get("source_file"),
    }
    slim_evidence = []
    for ev in finding.get("evidence") or []:
        text = ev.get("text") or ""
        if len(text) > max_evidence_snippet:
            text = text[:max_evidence_snippet].rstrip() + "..."
        slim_evidence.append({
            "line": ev.get("line"),
            "category": ev.get("category"),
            "snippet": text if text else None,
        })
    out["evidence"] = slim_evidence
    if include_remediation and finding.get("remediation_keywords"):
        out["remediation_keywords"] = finding.get("remediation_keywords")
    if finding.get("analysis_limitations"):
        # Keep at most two, one line each, to save tokens.
        out["analysis_limitations"] = [
            (s[:150] + "...") if len(s) > 150 else s
            for s in finding.get("analysis_limitations", [])[:2]
        ]
    return out


def export_ai_payload(
    preprocess_output: Dict[str, Any],
    max_evidence_snippet: int = 120,
) -> Dict[str, Any]:
    """
    Compact payload for the AI only — no chunk content, no raw signals.
    The full preprocess JSON stays separate; this one fits a token budget.
    """
    normalized = normalize_preprocess_output(preprocess_output)
    slim_findings = [
        slim_finding_for_ai(f, max_evidence_snippet=max_evidence_snippet)
        for f in normalized.get("normalized_findings") or []
    ]
    return {
        "schema": "ai_payload_v1",
        "project_id": preprocess_output.get("project_id"),
        "input_path": preprocess_output.get("input_path"),
        "input_type": preprocess_output.get("input_type"),
        "pipeline_version": preprocess_output.get("pipeline_version"),
        "normalized_findings": slim_findings,
        "meta": {
            "finding_count": len(slim_findings),
            "note": "Full source lives in preprocess output only; fetch by file_id/chunk_id if needed.",
        },
    }


def run_file_ai_payload(path: str, max_evidence_snippet: int = 120) -> Dict[str, Any]:
    """Preprocess *path* then return only the slim AI payload."""
    # Local import avoids a circular dependency with preprocess.pipeline.
    from preprocess.pipeline import run_file

    pre = run_file(path)
    if not pre.get("files"):
        return {
            "schema": "ai_payload_v1",
            "project_id": pre.get("project_id"),
            "input_path": pre.get("input_path"),
            "normalized_findings": [],
            "meta": {"finding_count": 0, "reason": "no files processed"},
        }
    return export_ai_payload(pre, max_evidence_snippet=max_evidence_snippet)


def run_file_with_findings(path: str) -> Dict[str, Any]:
    """Convenience: preprocess *path* then merge normalized findings on top."""
    from preprocess.pipeline import run_file

    pre = run_file(path)
    if not pre.get("files"):
        return {
            **pre,
            "normalized_findings": [],
            "normalized_findings_meta": {"count": 0, "reason": "no files processed"},
        }
    normalized = normalize_preprocess_output(pre)
    # Keep the full preprocess payload plus findings at top level for the AI.
    out = dict(pre)
    out["normalized_findings"] = normalized["normalized_findings"]
    out["normalized_findings_meta"] = normalized["normalized_findings_meta"]
    return out
+""" + +import ast +from typing import Any, Dict, List, Optional + + +def parse_python_structure(source: str) -> Dict[str, Any]: + out: Dict[str, Any] = { + "parse_ok": False, + "imports": [], + "functions": [], + "classes": [], + "calls": [], # simplified: names only, from ast.Call if possible + } + try: + tree = ast.parse(source) + except SyntaxError: + return out + out["parse_ok"] = True + for node in ast.walk(tree): + if isinstance(node, ast.Import): + for alias in node.names: + out["imports"].append({ + "line": node.lineno, + "end_line": getattr(node, "end_lineno", node.lineno), + "name": alias.name, + "alias": alias.asname, + }) + elif isinstance(node, ast.ImportFrom): + mod = node.module or "" + for alias in node.names: + out["imports"].append({ + "line": node.lineno, + "end_line": getattr(node, "end_lineno", node.lineno), + "name": f"{mod}.{alias.name}" if mod else alias.name, + "alias": alias.asname, + }) + elif isinstance(node, ast.FunctionDef) or isinstance(node, ast.AsyncFunctionDef): + end = getattr(node, "end_lineno", node.lineno) + out["functions"].append({ + "name": node.name, + "line": node.lineno, + "end_line": end, + }) + elif isinstance(node, ast.ClassDef): + end = getattr(node, "end_lineno", node.lineno) + out["classes"].append({ + "name": node.name, + "line": node.lineno, + "end_line": end, + }) + elif isinstance(node, ast.Call): + name = _call_name(node) + if name: + out["calls"].append({"line": node.lineno, "name": name}) + return out + + +def _call_name(node: ast.Call) -> Optional[str]: + if isinstance(node.func, ast.Name): + return node.func.id + if isinstance(node.func, ast.Attribute): + parts: List[str] = [] + cur = node.func + while isinstance(cur, ast.Attribute): + parts.append(cur.attr) + cur = cur.value + if isinstance(cur, ast.Name): + parts.append(cur.id) + return ".".join(reversed(parts)) + return None diff --git a/preprocess/pipeline.py b/preprocess/pipeline.py new file mode 100644 index 0000000..dfe9e5f --- /dev/null +++ 
# ============================================================
# preprocess/pipeline.py
# Orchestrates preprocessing: file or snippet -> structured JSON.
# ============================================================

import json
from datetime import datetime, timezone
from pathlib import Path
from typing import Any, Dict, List, Optional, Union

from preprocess.chunking import build_chunks
from preprocess.filters import should_skip_path, path_has_skip_segment
from preprocess.hashing import sha256_text
from preprocess.io_read import read_text_with_fallback, is_probably_text
from preprocess.language import detect_language
from preprocess.parse_python import parse_python_structure
from preprocess.signals import extract_signals, risk_hints_from_signals

PIPELINE_VERSION = "1.0.0"


def _file_id(project_id: str, rel_path: str, content: str) -> str:
    """Stable file id from project, relative path and content hash."""
    return sha256_text(f"{project_id}:{rel_path}:{sha256_text(content)}")


def _chunk_id(file_id: str, start: int, end: int, content: str) -> str:
    """Stable chunk id from owning file, line span and content hash."""
    return sha256_text(f"{file_id}:{start}:{end}:{sha256_text(content)}")


def _skip_result(project_id: str, path: Path, reason: str) -> Dict[str, Any]:
    """Uniform result envelope for a file that was filtered out before analysis."""
    return {
        "pipeline_version": PIPELINE_VERSION,
        "project_id": project_id,
        "created_at": datetime.now(timezone.utc).isoformat(),
        "input_type": "file",
        "input_path": str(path),
        "files": [],
        "files_skipped": [{"path": str(path), "reason": reason}],
        "chunks": [],
    }


def _chunk_records(file_id: str, raw_chunks: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
    """Attach ids, content hashes and per-chunk signals to raw chunk dicts."""
    records: List[Dict[str, Any]] = []
    for ch in raw_chunks:
        chunk_signals = extract_signals(ch["content"])
        records.append({
            "chunk_id": _chunk_id(file_id, ch["start_line"], ch["end_line"], ch["content"]),
            "file_id": file_id,
            "start_line": ch["start_line"],
            "end_line": ch["end_line"],
            "type": ch["type"],
            "symbol": ch["symbol"],
            "content": ch["content"],
            "content_hash": sha256_text(ch["content"]),
            # Drop empty signal buckets to keep the JSON compact.
            "signals": {k: v for k, v in chunk_signals.items() if v},
            "risk_hints": risk_hints_from_signals(chunk_signals),
        })
    return records


def _file_record(
    file_id: str,
    rel_path: str,
    lang_info: Dict[str, Any],
    structure: Dict[str, Any],
    text: str,
    encoding_used: Optional[str],
    byte_size: int,
    signals_full: Dict[str, Any],
) -> Dict[str, Any]:
    """Per-file metadata record: language, structure, signals, hashes."""
    return {
        "file_id": file_id,
        "path": rel_path,
        "language": lang_info["language"],
        "language_confidence": lang_info["confidence"],
        "language_reason": lang_info["reason"],
        "encoding_used": encoding_used,
        "line_count": len(text.splitlines()),
        "byte_size": byte_size,
        "content_hash": sha256_text(text),
        "parse_ok": structure.get("parse_ok", False),
        "imports": structure.get("imports", []),
        "functions": structure.get("functions", []),
        "classes": structure.get("classes", []),
        "calls_sample": structure.get("calls", [])[:50],  # cap to bound payload
        "signals": {k: v for k, v in signals_full.items() if v},
        "risk_hints": risk_hints_from_signals(signals_full),
    }


def run_file(
    path: Union[str, Path],
    language_hint: Optional[str] = None,
    max_file_bytes: int = 512 * 1024,
) -> Dict[str, Any]:
    """
    Preprocess a single file on disk into the structured JSON envelope.

    Applies the skip filters, reads with encoding fallback, detects language,
    parses Python structure, builds chunks and extracts signals.  Filtered or
    unreadable files come back with an empty files list and a skip reason.
    """
    path = Path(path).resolve()
    project_id = sha256_text(str(path))

    skip, reason = should_skip_path(path, max_bytes=max_file_bytes)
    if skip:
        return _skip_result(project_id, path, reason)
    if path_has_skip_segment(path):
        return _skip_result(project_id, path, "path_in_skip_dir")

    text, encoding, err = read_text_with_fallback(path)
    if text is None or err:
        return _skip_result(project_id, path, err or "read_failed")
    if not is_probably_text(text):
        return _skip_result(project_id, path, "not_probably_text")

    rel_path = path.name  # single-file mode: record only the basename
    lang_info = detect_language(path, text, language_hint)
    structure: Dict[str, Any] = {}
    if lang_info["language"] == "python":
        structure = parse_python_structure(text)

    file_id = _file_id(project_id, rel_path, text)
    signals_full = extract_signals(text)
    raw_chunks = build_chunks(file_id, text, lang_info["language"], structure)

    return {
        "pipeline_version": PIPELINE_VERSION,
        "project_id": project_id,
        "created_at": datetime.now(timezone.utc).isoformat(),
        "input_type": "file",
        "input_path": str(path),
        "files": [
            _file_record(
                file_id, rel_path, lang_info, structure, text,
                encoding, path.stat().st_size, signals_full,
            )
        ],
        "files_skipped": [],
        "chunks": _chunk_records(file_id, raw_chunks),
    }


def run_snippet(
    content: str,
    virtual_name: str = "snippet.py",
    language_hint: Optional[str] = None,
) -> Dict[str, Any]:
    """
    Preprocess an in-memory snippet as a single virtual file.

    Same output shape as run_file; project_id is derived from the snippet's
    head and virtual name, and encoding is reported as utf-8.
    """
    path = Path(virtual_name)
    project_id = sha256_text(content[:5000] + virtual_name)
    lang_info = detect_language(path, content, language_hint)
    structure: Dict[str, Any] = {}
    if lang_info["language"] == "python":
        structure = parse_python_structure(content)

    file_id = _file_id(project_id, virtual_name, content)
    signals_full = extract_signals(content)
    raw_chunks = build_chunks(file_id, content, lang_info["language"], structure)

    return {
        "pipeline_version": PIPELINE_VERSION,
        "project_id": project_id,
        "created_at": datetime.now(timezone.utc).isoformat(),
        "input_type": "snippet",
        "input_path": virtual_name,
        "files": [
            _file_record(
                file_id, virtual_name, lang_info, structure, content,
                "utf-8", len(content.encode("utf-8")), signals_full,
            )
        ],
        "files_skipped": [],
        "chunks": _chunk_records(file_id, raw_chunks),
    }


def run_file_to_json(path: Union[str, Path], out_path: Optional[str] = None) -> str:
    """Run the file pipeline and return (optionally also write) pretty JSON."""
    data = run_file(path)
    s = json.dumps(data, ensure_ascii=False, indent=2)
    if out_path:
        Path(out_path).write_text(s, encoding="utf-8")
    return s
"""Regex-based security signals and risk_hints derived from source text."""

import re
from typing import Dict, List, Set

# (regex, signal category) pairs applied line-by-line.
PATTERNS = [
    (r"\bsubprocess\.(run|Popen|call)\b", "command_execution"),
    (r"\bos\.system\s*\(", "command_execution"),
    (r"\bexec\s*\(", "command_execution"),
    (r"\beval\s*\(", "command_execution"),
    (r"\.execute\s*\(\s*[\"']", "sql_execution"),
    (r"\bcursor\.execute\s*\(", "sql_execution"),
    # The alternation must be grouped: the previous form
    # r"\btext\s*\(\s*[\"'].*SELECT|INSERT|UPDATE|DELETE" matched ANY line
    # containing a bare INSERT/UPDATE/DELETE, because `|` binds weaker than
    # concatenation in regex.
    (r"\btext\s*\(\s*[\"'].*(?:SELECT|INSERT|UPDATE|DELETE)", "sql_execution"),
    (r"\bopen\s*\(", "file_access"),
    (r"\bPath\s*\([^)]*\)\.(read|write)", "file_access"),
    (r"\binput\s*\(", "user_input_source"),
    (r"\brequest\.(args|form|json|get)\b", "user_input_source"),
    (r"\bargv\b|\bsys\.argv\b", "user_input_source"),
    (r"\bhashlib\.|bcrypt\.|crypto\.|jwt\.|openssl\b", "crypto_usage"),
    (r"\bprint\s*\(", "debug_output"),
    (r"\bconsole\.log\s*\(", "debug_output"),
    (r"\bexcept\s+Exception\b|\bexcept\s*:", "broad_except"),
    (r"(?i)(password|secret|api_key|apikey|token)\s*=\s*['\"][^'\"]{8,}", "possible_hardcoded_secret"),
    (r"(?i)\b(auth|login|session|oauth|bearer)\b", "auth_keyword"),
]

# Compiled once at import time instead of per scanned line.
_COMPILED = [(re.compile(pattern), category) for pattern, category in PATTERNS]

# Whole-text heuristic for database access (case-insensitive).
_DB_HEURISTIC = re.compile(r"\bengine\.connect\b|\bconnection\b|\bdatabase\b", re.I)

# Maps raw pattern categories to the (pluralized) output keys.
_CATEGORY_TO_KEY = {
    "command_execution": "command_execution",
    "sql_execution": "sql_execution",
    "file_access": "file_access",
    "user_input_source": "user_input_sources",
    "crypto_usage": "crypto_usage",
    "debug_output": "debug_output",
    "broad_except": "error_handling",
    "possible_hardcoded_secret": "possible_hardcoded_secrets",
    "auth_keyword": "auth_related_keywords",
}


def extract_signals(content: str) -> Dict[str, List[Dict]]:
    """Scan ``content`` line-by-line for security-relevant signals.

    Returns a dict with a fixed key set (imports, function_calls,
    user_input_sources, database_access, sql_execution, command_execution,
    file_access, crypto_usage, debug_output, error_handling,
    possible_hardcoded_secrets, auth_related_keywords); each value is a list
    of ``{"line", "match", "category"}`` entries.  NOTE: ``imports`` and
    ``function_calls`` are part of the schema but are NOT populated by this
    regex pass — presumably the structural parser fills them; confirm.
    """
    by_category: Dict[str, List[Dict]] = {
        "imports": [],
        "function_calls": [],
        "user_input_sources": [],
        "database_access": [],
        "sql_execution": [],
        "command_execution": [],
        "file_access": [],
        "crypto_usage": [],
        "debug_output": [],
        "error_handling": [],
        "possible_hardcoded_secrets": [],
        "auth_related_keywords": [],
    }
    for line_no, line in enumerate(content.splitlines(), start=1):
        for regex, category in _COMPILED:
            if regex.search(line):
                key = _CATEGORY_TO_KEY.get(category, category)
                by_category.setdefault(key, []).append({
                    "line": line_no,
                    "match": line.strip()[:200],  # cap stored evidence at 200 chars
                    "category": category,
                })
    # Whole-text DB-access heuristic; line 0 means "no specific line".
    if _DB_HEURISTIC.search(content):
        by_category["database_access"].append(
            {"line": 0, "match": "heuristic", "category": "database_access"}
        )
    return by_category


def risk_hints_from_signals(signals: Dict[str, List]) -> List[str]:
    """Collapse a signals dict into a sorted list of coarse risk hints."""
    hints: Set[str] = set()
    if signals.get("sql_execution"):
        hints.add("sql_execution_present")
    if signals.get("command_execution"):
        hints.add("command_execution_present")
    if signals.get("possible_hardcoded_secrets"):
        hints.add("possible_hardcoded_secret")
    if signals.get("user_input_sources"):
        hints.add("user_input_flow")
    if signals.get("file_access"):
        hints.add("file_io")
    # "broad_except" is kept for callers passing raw categories;
    # extract_signals itself emits the mapped "error_handling" key.
    if signals.get("broad_except") or signals.get("error_handling"):
        hints.add("error_handling_review")
    if signals.get("auth_related_keywords"):
        hints.add("auth_surface")
    return sorted(hints)
#!/usr/bin/env python3
"""Pre-scan: run semgrep + gitleaks over a path and emit a JSON report."""
import argparse
import json
import os
import shutil
import subprocess
import sys
from pathlib import Path


def run_semgrep(target_path: str) -> dict:
    """Run semgrep over ``target_path``.

    Returns ``{"tool": "semgrep", "error": <str|None>, "results": <list>}``.
    Never raises, so the overall report can always be assembled.
    """
    path = Path(target_path).resolve()
    if not path.exists():
        return {"tool": "semgrep", "error": f"path not found: {target_path}", "results": []}

    # Prefer the semgrep console script: invoking `python -m semgrep` is
    # deprecated since semgrep 1.38 and makes newer versions emit a
    # deprecation error instead of scan results (see prescan_report.json).
    semgrep_bin = shutil.which("semgrep")
    launcher = [semgrep_bin] if semgrep_bin else [sys.executable, "-m", "semgrep"]
    cmd = launcher + [
        "scan",
        "--config", "auto",
        "--json",
        "--quiet",
        str(path),
    ]
    try:
        proc = subprocess.run(
            cmd,
            capture_output=True,
            text=True,
            timeout=300,
        )
        stdout = proc.stdout.strip()
        # semgrep exits non-zero when findings exist; only treat it as a
        # failure when there is no JSON on stdout to parse.
        if proc.returncode != 0 and not stdout:
            return {
                "tool": "semgrep",
                "error": proc.stderr.strip() or f"exit code {proc.returncode}",
                "results": [],
            }
        data = json.loads(stdout) if stdout else {}
        # Normalize: "results" is always a list (previously the whole dict
        # could leak through via data.get("results", data)).
        results = data.get("results", []) if isinstance(data, dict) else data
        return {"tool": "semgrep", "error": None, "results": results}
    except FileNotFoundError:
        return {"tool": "semgrep", "error": "semgrep not installed (pip install semgrep)", "results": []}
    except subprocess.TimeoutExpired:
        return {"tool": "semgrep", "error": "timeout", "results": []}
    except json.JSONDecodeError as e:
        return {"tool": "semgrep", "error": str(e), "results": []}
def run_gitleaks(target_path: str) -> dict:
    """Run ``gitleaks detect`` over ``target_path``.

    Returns ``{"tool": "gitleaks", "error": <str|None>, "results": <list>}``.
    Never raises.  NOTE(review): gitleaks scans directories, so a file input
    scans the file's whole parent directory — confirm that scope is intended.
    """
    path = Path(target_path).resolve()
    if not path.exists():
        return {"tool": "gitleaks", "error": f"path not found: {target_path}", "results": []}
    source = str(path) if path.is_dir() else str(path.parent)
    cmd = [
        "gitleaks", "detect",
        "--source", source,
        "--no-git",
        "--report-format", "json",
        # "-" requests the report on stdout — TODO confirm this is supported
        # by the gitleaks version used on this project.
        "--report-path", "-",
    ]
    try:
        proc = subprocess.run(cmd, capture_output=True, text=True, timeout=120)
    except FileNotFoundError:
        return {"tool": "gitleaks", "error": "gitleaks not installed", "results": []}
    except subprocess.TimeoutExpired:
        return {"tool": "gitleaks", "error": "timeout", "results": []}

    # Exit code 0 = clean, 1 = leaks found; anything else is a tool failure
    # (previously misreported as a clean scan with no findings).
    if proc.returncode not in (0, 1):
        return {
            "tool": "gitleaks",
            "error": proc.stderr.strip() or f"exit code {proc.returncode}",
            "results": [],
        }
    raw = proc.stdout.strip()
    if not raw:
        return {"tool": "gitleaks", "error": None, "results": []}
    try:
        data = json.loads(raw)
    except json.JSONDecodeError as exc:
        # Surface the parse failure instead of silently reporting no findings.
        return {"tool": "gitleaks", "error": f"unparseable gitleaks output: {exc}", "results": []}
    results = data if isinstance(data, list) else data.get("findings", data.get("results", []))
    return {"tool": "gitleaks", "error": None, "results": results}
written to: {out_path}")
+
+
+if __name__ == "__main__":
+    main()
diff --git a/prescan_report.json b/prescan_report.json
new file mode 100644
index 0000000..63f87d9
--- /dev/null
+++ b/prescan_report.json
@@ -0,0 +1,13 @@
+{
+  "input_path": "/Users/zhangtingen/Downloads/V/testquery.py",
+  "semgrep": {
+    "tool": "semgrep",
+    "error": "Using `python -m semgrep` to run Semgrep is deprecated as of 1.38.0. Please simply run `semgrep` instead.",
+    "results": []
+  },
+  "gitleaks": {
+    "tool": "gitleaks",
+    "error": "gitleaks not installed",
+    "results": []
+  }
+}
\ No newline at end of file
diff --git a/requirements.txt b/requirements.txt
index e7c13db..aa31d5f 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,5 +1,10 @@
 eel
 pyqrcode
 pyinstaller
 pypng
 autopep8
+psycopg2-binary
+SQLAlchemy
+python-dotenv
+bcrypt
+semgrep
diff --git a/run_preprocess_test.py b/run_preprocess_test.py
new file mode 100644
index 0000000..6709079
--- /dev/null
+++ b/run_preprocess_test.py
@@ -0,0 +1,80 @@
+#!/usr/bin/env python3
+"""
+Run preprocessing and write JSON.
+
+By default writes the LLM-friendly payload only (compact, no chunk content).
+Use --full for complete preprocess output (scanners / cache).
+ +Usage: + python3 run_preprocess_test.py # -> ai_payload.json (LLM) + python3 run_preprocess_test.py -o out.json # LLM payload to out.json + python3 run_preprocess_test.py --full # full preprocess JSON + python3 run_preprocess_test.py --full -o pre.json + python3 run_preprocess_test.py other.py -o x.json +""" + +import argparse +import json +import sys +from pathlib import Path + +ROOT = Path(__file__).resolve().parent +sys.path.insert(0, str(ROOT)) + +from preprocess.pipeline import run_file # noqa: E402 +from preprocess.normalized_findings import run_file_ai_payload # noqa: E402 + +DEFAULT_INPUT = ROOT / "testquery.py" +DEFAULT_OUTPUT_LLM = "ai_payload.json" +DEFAULT_OUTPUT_FULL = "preprocess_output.json" + + +def main(): + parser = argparse.ArgumentParser( + description="Preprocess a file -> JSON (default: LLM payload only)", + ) + parser.add_argument( + "input", + nargs="?", + default=str(DEFAULT_INPUT), + help="File path to preprocess (default: testquery.py)", + ) + parser.add_argument( + "-o", "--output", + default=None, + help="Output JSON path (default: ai_payload.json or preprocess_output.json with --full)", + ) + parser.add_argument( + "--full", + action="store_true", + help="Write full preprocess JSON (includes chunk content; for scanners, not for LLM)", + ) + args = parser.parse_args() + + input_path = Path(args.input) + if not input_path.is_file(): + print(f"Not a file: {input_path}", file=sys.stderr) + sys.exit(1) + + if args.full: + data = run_file(input_path) + out = args.output or DEFAULT_OUTPUT_FULL + else: + data = run_file_ai_payload(input_path) + out = args.output or DEFAULT_OUTPUT_LLM + + out_path = Path(out) + out_path.write_text(json.dumps(data, ensure_ascii=False, indent=2), encoding="utf-8") + print(f"Wrote {out_path}") + print(f" project_id: {data.get('project_id', '')[:16]}...") + + if args.full: + print(f" mode: full preprocess") + print(f" files: {len(data['files'])}, chunks: {len(data['chunks'])}, skipped: 
{len(data['files_skipped'])}") + else: + n = len(data.get("normalized_findings") or []) + print(f" mode: LLM payload ({n} findings, no chunk content)") + + +if __name__ == "__main__": + main() diff --git a/testquery.py b/testquery.py new file mode 100644 index 0000000..18cd9e2 --- /dev/null +++ b/testquery.py @@ -0,0 +1,15 @@ +from sqlalchemy import text +from db import engine + +try: + with engine.connect() as conn: + # Run a simple query to test + result = conn.execute(text("SELECT NOW();")) + print("Connected! Server time:", result.fetchone()[0]) +except Exception as e: + print("Connection failed:", e) + +with engine.connect() as conn: + #conn.execute(text("INSERT INTO users(email, password_hash) VALUES ('test2@example.com', '1A2B3C');")) + result = conn.execute(text("SELECT * FROM users;")) + print(result.fetchall())